From 6606aaefcc8c5eb03d5177eabf341535bee7c568 Mon Sep 17 00:00:00 2001 From: WeebDataHoarder <57538841+WeebDataHoarder@users.noreply.github.com> Date: Sat, 20 Apr 2024 21:17:33 +0200 Subject: [PATCH] Version v3.0.0, support full datataset mode in 64-bit targets, modified api, optimized allocations, full VM run JIT on amd64, optimize AES asm --- README.md | 19 +-- aes/hash.go | 28 +---- aes/hash_amd64.go | 50 ++++++++ aes/hash_generic.go | 36 ++++++ aes/round_amd64.go | 48 +------- asm/aes.go | 11 -- asm/aes_amd64.go | 10 +- asm/aes_amd64.s | 179 ++++++++++++++++++++-------- blake2/generator.go | 46 ++++++++ blake2b.go | 50 -------- cache.go | 63 +++++----- config.go | 4 +- dataset.go | 29 ++++- dataset_full.go | 52 +++++++++ dataset_full_no64.go | 34 ++++++ dataset_light.go | 38 ++++-- go.mod | 2 +- jit_amd64.go | 74 +----------- float.go => math.go | 33 +++++- randomx_test.go | 139 ++++++++++++++++------ register.go | 4 - superscalar.go | 233 +++++-------------------------------- superscalar_instruction.go | 157 +++++++++++++++++++++++++ vm.go | 138 +++++++++++++--------- vm_bytecode_jit_amd64.go | 174 +++++++++++++++++++++++++-- vm_bytecode_jit_amd64.s | 106 ++++++++++++++++- vm_bytecode_jit_generic.go | 7 +- vm_bytecode_native.go | 2 +- vm_instruction.go | 9 +- 29 files changed, 1138 insertions(+), 637 deletions(-) create mode 100644 aes/hash_amd64.go create mode 100644 aes/hash_generic.go delete mode 100644 asm/aes.go create mode 100644 blake2/generator.go delete mode 100644 blake2b.go create mode 100644 dataset_full.go create mode 100644 dataset_full_no64.go rename float.go => math.go (65%) create mode 100644 superscalar_instruction.go diff --git a/README.md b/README.md index 9e7bae9..6e8e73f 100644 --- a/README.md +++ b/README.md @@ -14,17 +14,18 @@ This package implements RandomX without CGO, using only Golang code, native floa All test cases pass properly. +Supports Full mode and Light mode. 
+ For the C++ implementation and design of RandomX, see [github.com/tevador/RandomX](https://github.com/tevador/RandomX) -| Feature | 386 | amd64 | arm | arm64 | mips | mips64 | riscv64 | wasm | -|:----------------------------:|:---:|:-----:|:---:|:-----:|:----:|:------:|:-------:|:----:| -| purego | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| Hardware Float Operations | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | -| Hardware AES Operations | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| Native Superscalar Execution | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | -| Superscalar JIT Execution | ❌ | ✅* | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | -| Native VM Execution | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ❌ | ❌ | -| VM JIT Execution | ❌ | ✅* | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | +| Feature | 386 | amd64 | arm | arm64 | mips | mips64 | riscv64 | wasm | +|:---------------------:|:----------:|:--------------:|:------:|:----------:|:------:|:------:|:-------:|:------:| +| purego | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| Full Mode | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ | +| Float Operations | hw | **hw** | soft | **hw** | soft | soft | soft | soft | +| AES Operations | soft | **hw** | soft | soft | soft | soft | soft | soft | +| Superscalar Execution | native | **native+jit** | native | native | native | native | native | native | +| VM Execution | **native** | **native+jit** | soft | **native** | soft | soft | soft | soft | A pure Golang implementation can be used on platforms without hard float support or via the `purego` build flag manually. diff --git a/aes/hash.go b/aes/hash.go index aa19b1d..0fdf40d 100644 --- a/aes/hash.go +++ b/aes/hash.go @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
package aes import ( - "git.gammaspectra.live/P2Pool/go-randomx/v2/keys" + "git.gammaspectra.live/P2Pool/go-randomx/v3/keys" "unsafe" ) @@ -50,21 +50,7 @@ func HashAes1Rx4(input []byte, output *[64]byte) { if len(input)%64 != 0 { panic("unsupported") } - - // states are copied - states := keys.AesHash1R_State - - for input_ptr := 0; input_ptr < len(input); input_ptr += 64 { - in := (*[4][4]uint32)(unsafe.Pointer(unsafe.SliceData(input[input_ptr:]))) - - aesroundtrip_encdec(&states, in) - } - - aesroundtrip_encdec1(&states, &keys.AesHash1R_XKeys[0]) - - aesroundtrip_encdec1(&states, &keys.AesHash1R_XKeys[1]) - - copy(output[:], (*[64]byte)(unsafe.Pointer(&states))[:]) + hashAes1Rx4(input, output) } // FillAes1Rx4 @@ -81,15 +67,7 @@ func FillAes1Rx4(state *[64]byte, output []byte) { if len(output)%len(state) != 0 { panic("unsupported") } - - // Reference to state without copying - states := (*[4][4]uint32)(unsafe.Pointer(state)) - - for outptr := 0; outptr < len(output); outptr += len(state) { - aesroundtrip_decenc(states, &keys.AesGenerator1R_Keys) - - copy(output[outptr:], state[:]) - } + fillAes1Rx4(state, output) } var fillAes4Rx4Keys0 = [4][4]uint32{ diff --git a/aes/hash_amd64.go b/aes/hash_amd64.go new file mode 100644 index 0000000..b88a0e8 --- /dev/null +++ b/aes/hash_amd64.go @@ -0,0 +1,50 @@ +//go:build amd64 && !purego + +package aes + +import ( + "git.gammaspectra.live/P2Pool/go-randomx/v3/asm" + "git.gammaspectra.live/P2Pool/go-randomx/v3/keys" + "golang.org/x/sys/cpu" + "unsafe" +) + +var supportsAES = cpu.X86.HasAES + +func fillAes1Rx4(state *[64]byte, output []byte) { + // Reference to state without copying + states := (*[4][4]uint32)(unsafe.Pointer(state)) + + if supportsAES { + asm.FillAes1Rx4(states, &keys.AesGenerator1R_Keys, unsafe.SliceData(output), uint64(len(output))) + return + } + + for outptr := 0; outptr < len(output); outptr += len(state) { + aesroundtrip_decenc(states, &keys.AesGenerator1R_Keys) + + copy(output[outptr:], state[:]) + } 
+} + +func hashAes1Rx4(input []byte, output *[64]byte) { + if supportsAES { + asm.HashAes1Rx4(&keys.AesHash1R_State, &keys.AesHash1R_XKeys, output, unsafe.SliceData(input), uint64(len(input))) + return + } + + // states are copied + states := keys.AesHash1R_State + + for input_ptr := 0; input_ptr < len(input); input_ptr += 64 { + in := (*[4][4]uint32)(unsafe.Pointer(unsafe.SliceData(input[input_ptr:]))) + + aesroundtrip_encdec(&states, in) + } + + aesroundtrip_encdec1(&states, &keys.AesHash1R_XKeys[0]) + + aesroundtrip_encdec1(&states, &keys.AesHash1R_XKeys[1]) + + copy(output[:], (*[64]byte)(unsafe.Pointer(&states))[:]) +} diff --git a/aes/hash_generic.go b/aes/hash_generic.go new file mode 100644 index 0000000..6c76e3e --- /dev/null +++ b/aes/hash_generic.go @@ -0,0 +1,36 @@ +//go:build !amd64 || purego + +package aes + +import ( + "git.gammaspectra.live/P2Pool/go-randomx/v3/keys" + "unsafe" +) + +func fillAes1Rx4(state *[64]byte, output []byte) { + // Reference to state without copying + states := (*[4][4]uint32)(unsafe.Pointer(state)) + + for outptr := 0; outptr < len(output); outptr += len(state) { + aesroundtrip_decenc(states, &keys.AesGenerator1R_Keys) + + copy(output[outptr:], state[:]) + } +} + +func hashAes1Rx4(input []byte, output *[64]byte) { + // states are copied + states := keys.AesHash1R_State + + for input_ptr := 0; input_ptr < len(input); input_ptr += 64 { + in := (*[4][4]uint32)(unsafe.Pointer(unsafe.SliceData(input[input_ptr:]))) + + aesroundtrip_encdec(&states, in) + } + + aesroundtrip_encdec1(&states, &keys.AesHash1R_XKeys[0]) + + aesroundtrip_encdec1(&states, &keys.AesHash1R_XKeys[1]) + + copy(output[:], (*[64]byte)(unsafe.Pointer(&states))[:]) +} diff --git a/aes/round_amd64.go b/aes/round_amd64.go index c63f6ff..cd1e5eb 100644 --- a/aes/round_amd64.go +++ b/aes/round_amd64.go @@ -3,52 +3,12 @@ package aes import ( - _ "git.gammaspectra.live/P2Pool/go-randomx/v2/asm" - "golang.org/x/sys/cpu" - _ "unsafe" + 
"git.gammaspectra.live/P2Pool/go-randomx/v3/asm" ) -//go:noescape -//go:linkname hard_aesdec git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesdec -func hard_aesdec(state *[4]uint32, key *[4]uint32) - -//go:noescape -//go:linkname hard_aesenc git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesenc -func hard_aesenc(state *[4]uint32, key *[4]uint32) - -//go:noescape -//go:linkname hard_aesroundtrip_decenc git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesroundtrip_decenc -func hard_aesroundtrip_decenc(states *[4][4]uint32, keys *[4][4]uint32) - -//go:noescape -//go:linkname hard_aesroundtrip_encdec git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesroundtrip_encdec -func hard_aesroundtrip_encdec(states *[4][4]uint32, keys *[4][4]uint32) - -//go:noescape -//go:linkname hard_aesroundtrip_encdec1 git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesroundtrip_encdec1 -func hard_aesroundtrip_encdec1(states *[4][4]uint32, key *[4]uint32) - -var supportsAES = cpu.X86.HasAES - -func aesenc(state *[4]uint32, key *[4]uint32) { - if supportsAES { - hard_aesenc(state, key) - } else { - soft_aesenc(state, key) - } -} - -func aesdec(state *[4]uint32, key *[4]uint32) { - if supportsAES { - hard_aesdec(state, key) - } else { - soft_aesdec(state, key) - } -} - func aesroundtrip_decenc(states *[4][4]uint32, keys *[4][4]uint32) { if supportsAES { - hard_aesroundtrip_decenc(states, keys) + asm.AESRoundTrip_DecEnc(states, keys) } else { soft_aesdec(&states[0], &keys[0]) soft_aesenc(&states[1], &keys[1]) @@ -59,7 +19,7 @@ func aesroundtrip_decenc(states *[4][4]uint32, keys *[4][4]uint32) { func aesroundtrip_encdec(states *[4][4]uint32, keys *[4][4]uint32) { if supportsAES { - hard_aesroundtrip_encdec(states, keys) + asm.AESRoundTrip_EncDec(states, keys) } else { soft_aesenc(&states[0], &keys[0]) soft_aesdec(&states[1], &keys[1]) @@ -70,7 +30,7 @@ func aesroundtrip_encdec(states *[4][4]uint32, keys *[4][4]uint32) { func aesroundtrip_encdec1(states *[4][4]uint32, key *[4]uint32) { if supportsAES 
{ - hard_aesroundtrip_encdec1(states, key) + asm.AESRoundTrip_EncDec1(states, key) } else { soft_aesenc(&states[0], key) soft_aesdec(&states[1], key) diff --git a/asm/aes.go b/asm/aes.go deleted file mode 100644 index 698bc4c..0000000 --- a/asm/aes.go +++ /dev/null @@ -1,11 +0,0 @@ -//go:build amd64 && !purego - -package asm - -func AESRoundEncrypt(state *[4]uint32, key *[4]uint32) { - aesenc(state, key) -} - -func AESRoundDecrypt(state *[4]uint32, key *[4]uint32) { - aesdec(state, key) -} diff --git a/asm/aes_amd64.go b/asm/aes_amd64.go index 3e02388..9bd534e 100644 --- a/asm/aes_amd64.go +++ b/asm/aes_amd64.go @@ -3,16 +3,16 @@ package asm //go:noescape -func aesenc(state *[4]uint32, key *[4]uint32) +func FillAes1Rx4(states *[4][4]uint32, keys *[4][4]uint32, output *byte, outputLen uint64) //go:noescape -func aesdec(state *[4]uint32, key *[4]uint32) +func HashAes1Rx4(initialState *[4][4]uint32, xKeys *[2][4]uint32, output *[64]byte, input *byte, inputLen uint64) //go:noescape -func aesroundtrip_decenc(states *[4][4]uint32, keys *[4][4]uint32) +func AESRoundTrip_DecEnc(states *[4][4]uint32, keys *[4][4]uint32) //go:noescape -func aesroundtrip_encdec(states *[4][4]uint32, keys *[4][4]uint32) +func AESRoundTrip_EncDec(states *[4][4]uint32, keys *[4][4]uint32) //go:noescape -func aesroundtrip_encdec1(states *[4][4]uint32, key *[4]uint32) +func AESRoundTrip_EncDec1(states *[4][4]uint32, key *[4]uint32) diff --git a/asm/aes_amd64.s b/asm/aes_amd64.s index 40ac125..8ed3e69 100644 --- a/asm/aes_amd64.s +++ b/asm/aes_amd64.s @@ -2,92 +2,171 @@ #include "textflag.h" -TEXT ·aesenc(SB),NOSPLIT|NOFRAME,$0-16 - MOVQ state+0(FP), AX - MOVQ key+8(FP), BX - VMOVDQU32 0(AX), X0 - VMOVDQU32 0(BX), X1 - AESENC X1, X0 - VMOVDQU32 X0, 0(AX) +TEXT ·FillAes1Rx4(SB),NOSPLIT|NOFRAME,$0-32 + MOVQ states+0(FP), AX + MOVQ keys+8(FP), BX + MOVQ output+16(FP), CX + MOVQ outputLen+24(FP), DX + + // initial state + VMOVDQU 0(AX), X0 + VMOVDQU 16(AX), X1 + VMOVDQU 32(AX), X2 + VMOVDQU 48(AX), X3 
+ + // keys: X4-X7 + VMOVDQU 0(BX), X4 + VMOVDQU 16(BX), X5 + VMOVDQU 32(BX), X6 + VMOVDQU 48(BX), X7 + +loop: + + AESDEC X4, X0 + AESENC X5, X1 + AESDEC X6, X2 + AESENC X7, X3 + + // store state onto output + VMOVDQU X0, 0(CX) + VMOVDQU X1, 16(CX) + VMOVDQU X2, 32(CX) + VMOVDQU X3, 48(CX) + ADDQ $64, CX + + // outputLen -= 64, continue if not 0 + SUBQ $64, DX + JNE loop + + // offload initial state + VMOVDQU X0, 0(AX) + VMOVDQU X1, 16(AX) + VMOVDQU X2, 32(AX) + VMOVDQU X3, 48(AX) RET -TEXT ·aesdec(SB),NOSPLIT|NOFRAME,$0-16 - MOVQ state+0(FP), AX - MOVQ key+8(FP), BX - VMOVDQU32 0(AX), X0 - VMOVDQU32 0(BX), X1 - AESDEC X1, X0 - VMOVDQU32 X0, 0(AX) + +TEXT ·HashAes1Rx4(SB),NOSPLIT|NOFRAME,$0-40 + MOVQ initialState+0(FP), AX + + // initial state + VMOVDQU 0(AX), X0 + VMOVDQU 16(AX), X1 + VMOVDQU 32(AX), X2 + VMOVDQU 48(AX), X3 + + + MOVQ xKeys+8(FP), AX + MOVQ output+16(FP), BX + MOVQ input+24(FP), CX + MOVQ inputLen+32(FP), DX + +loop: + // input as keys: X4-X7 + VMOVDQU 0(CX), X4 + VMOVDQU 16(CX), X5 + VMOVDQU 32(CX), X6 + VMOVDQU 48(CX), X7 + + AESENC X4, X0 + AESDEC X5, X1 + AESENC X6, X2 + AESDEC X7, X3 + + ADDQ $64, CX + // inputLen -= 64, continue if not 0 + SUBQ $64, DX + JNE loop + + // do encdec1 with both keys! 
+ VMOVDQU 0(AX), X4 + VMOVDQU 16(AX), X5 + + AESENC X4, X0 + AESDEC X4, X1 + AESENC X4, X2 + AESDEC X4, X3 + + AESENC X5, X0 + AESDEC X5, X1 + AESENC X5, X2 + AESDEC X5, X3 + + // offload into output + VMOVDQU X0, 0(BX) + VMOVDQU X1, 16(BX) + VMOVDQU X2, 32(BX) + VMOVDQU X3, 48(BX) RET -TEXT ·aesroundtrip_decenc(SB),NOSPLIT|NOFRAME,$0-16 +TEXT ·AESRoundTrip_DecEnc(SB),NOSPLIT|NOFRAME,$0-16 MOVQ states+0(FP), AX MOVQ keys+8(FP), BX - VMOVDQU32 0(AX), X0 - VMOVDQU32 0(BX), X1 - VMOVDQU32 16(AX), X2 - VMOVDQU32 16(BX), X3 - VMOVDQU32 32(AX), X4 - VMOVDQU32 32(BX), X5 - VMOVDQU32 48(AX), X6 - VMOVDQU32 48(BX), X7 + VMOVDQU 0(AX), X0 + VMOVDQU 0(BX), X1 + VMOVDQU 16(AX), X2 + VMOVDQU 16(BX), X3 + VMOVDQU 32(AX), X4 + VMOVDQU 32(BX), X5 + VMOVDQU 48(AX), X6 + VMOVDQU 48(BX), X7 AESDEC X1, X0 AESENC X3, X2 AESDEC X5, X4 AESENC X7, X6 - VMOVDQU32 X0, 0(AX) - VMOVDQU32 X2, 16(AX) - VMOVDQU32 X4, 32(AX) - VMOVDQU32 X6, 48(AX) + VMOVDQU X0, 0(AX) + VMOVDQU X2, 16(AX) + VMOVDQU X4, 32(AX) + VMOVDQU X6, 48(AX) RET -TEXT ·aesroundtrip_encdec(SB),NOSPLIT|NOFRAME,$0-16 +TEXT ·AESRoundTrip_EncDec(SB),NOSPLIT|NOFRAME,$0-16 MOVQ states+0(FP), AX MOVQ keys+8(FP), BX - VMOVDQU32 0(AX), X0 - VMOVDQU32 0(BX), X1 - VMOVDQU32 16(AX), X2 - VMOVDQU32 16(BX), X3 - VMOVDQU32 32(AX), X4 - VMOVDQU32 32(BX), X5 - VMOVDQU32 48(AX), X6 - VMOVDQU32 48(BX), X7 + VMOVDQU 0(AX), X0 + VMOVDQU 0(BX), X1 + VMOVDQU 16(AX), X2 + VMOVDQU 16(BX), X3 + VMOVDQU 32(AX), X4 + VMOVDQU 32(BX), X5 + VMOVDQU 48(AX), X6 + VMOVDQU 48(BX), X7 AESENC X1, X0 AESDEC X3, X2 AESENC X5, X4 AESDEC X7, X6 - VMOVDQU32 X0, 0(AX) - VMOVDQU32 X2, 16(AX) - VMOVDQU32 X4, 32(AX) - VMOVDQU32 X6, 48(AX) + VMOVDQU X0, 0(AX) + VMOVDQU X2, 16(AX) + VMOVDQU X4, 32(AX) + VMOVDQU X6, 48(AX) RET -TEXT ·aesroundtrip_encdec1(SB),NOSPLIT|NOFRAME,$0-16 +TEXT ·AESRoundTrip_EncDec1(SB),NOSPLIT|NOFRAME,$0-16 MOVQ states+0(FP), AX MOVQ key+8(FP), BX - VMOVDQU32 0(BX), X0 - VMOVDQU32 0(AX), X1 - VMOVDQU32 16(AX), X2 - VMOVDQU32 32(AX), X3 - VMOVDQU32 
48(AX), X4 + VMOVDQU 0(BX), X0 + VMOVDQU 0(AX), X1 + VMOVDQU 16(AX), X2 + VMOVDQU 32(AX), X3 + VMOVDQU 48(AX), X4 AESENC X0, X1 AESDEC X0, X2 AESENC X0, X3 AESDEC X0, X4 - VMOVDQU32 X1, 0(AX) - VMOVDQU32 X2, 16(AX) - VMOVDQU32 X3, 32(AX) - VMOVDQU32 X4, 48(AX) + VMOVDQU X1, 0(AX) + VMOVDQU X2, 16(AX) + VMOVDQU X3, 32(AX) + VMOVDQU X4, 48(AX) RET diff --git a/blake2/generator.go b/blake2/generator.go new file mode 100644 index 0000000..969c1f5 --- /dev/null +++ b/blake2/generator.go @@ -0,0 +1,46 @@ +package blake2 + +import ( + "encoding/binary" + "golang.org/x/crypto/blake2b" +) + +type Generator struct { + state [blake2b.Size]byte + i int +} + +func New(seed []byte, nonce uint32) *Generator { + var state [blake2b.Size]byte + copy(state[:60], seed) + binary.LittleEndian.PutUint32(state[60:], nonce) + g := &Generator{ + i: len(state), + state: state, + } + + return g +} + +func (g *Generator) GetUint32() (v uint32) { + if (g.i + 4) > len(g.state) { + g.reseed() + } + v = binary.LittleEndian.Uint32(g.state[g.i:]) + g.i += 4 + return v +} + +func (g *Generator) GetByte() (v byte) { + if (g.i + 1) > len(g.state) { + g.reseed() + } + v = g.state[g.i] + g.i++ + return v +} + +func (g *Generator) reseed() { + g.state = blake2b.Sum512(g.state[:]) + g.i = 0 +} diff --git a/blake2b.go b/blake2b.go deleted file mode 100644 index 2212470..0000000 --- a/blake2b.go +++ /dev/null @@ -1,50 +0,0 @@ -package randomx - -import ( - "encoding/binary" - "golang.org/x/crypto/blake2b" -) - -type Blake2Generator struct { - data [64]byte - dataindex int - allocRegIndex [8]int - allocRegisters [8]Register -} - -func Init_Blake2Generator(key []byte, nonce uint32) *Blake2Generator { - var b Blake2Generator - b.dataindex = len(b.data) - if len(key) > 60 { - copy(b.data[:], key[0:60]) - } else { - copy(b.data[:], key) - } - binary.LittleEndian.PutUint32(b.data[60:], nonce) - - return &b -} - -func (b *Blake2Generator) checkdata(bytesNeeded int) { - if b.dataindex+bytesNeeded > cap(b.data) { - 
//blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0); - h := blake2b.Sum512(b.data[:]) - copy(b.data[:], h[:]) - b.dataindex = 0 - } - -} - -func (b *Blake2Generator) GetByte() byte { - b.checkdata(1) - ret := b.data[b.dataindex] - b.dataindex++ - return ret -} -func (b *Blake2Generator) GetUint32() uint32 { - b.checkdata(4) - ret := binary.LittleEndian.Uint32(b.data[b.dataindex:]) - b.dataindex += 4 - - return ret -} diff --git a/cache.go b/cache.go index 7bd43e2..edff15a 100644 --- a/cache.go +++ b/cache.go @@ -1,8 +1,9 @@ package randomx import ( - "git.gammaspectra.live/P2Pool/go-randomx/v2/argon2" - "git.gammaspectra.live/P2Pool/go-randomx/v2/keys" + "git.gammaspectra.live/P2Pool/go-randomx/v3/argon2" + "git.gammaspectra.live/P2Pool/go-randomx/v3/blake2" + "git.gammaspectra.live/P2Pool/go-randomx/v3/keys" "runtime" "slices" "unsafe" @@ -15,7 +16,7 @@ func (m *MemoryBlock) GetLine(addr uint64) *RegisterLine { return (*RegisterLine)(unsafe.Pointer(unsafe.SliceData(m[addr : addr+8 : addr+8]))) } -type Randomx_Cache struct { +type Cache struct { Blocks []MemoryBlock Programs [RANDOMX_PROGRAM_COUNT]SuperScalarProgram @@ -25,36 +26,20 @@ type Randomx_Cache struct { Flags uint64 } -func Randomx_alloc_cache(flags uint64) *Randomx_Cache { +func NewCache(flags uint64) *Cache { if flags == RANDOMX_FLAG_DEFAULT { flags = RANDOMX_FLAG_JIT } - return &Randomx_Cache{ + return &Cache{ Flags: flags, } } -func (cache *Randomx_Cache) HasJIT() bool { +func (cache *Cache) HasJIT() bool { return cache.Flags&RANDOMX_FLAG_JIT > 0 && cache.JitPrograms[0] != nil } -func (cache *Randomx_Cache) VM_Initialize() *VM { - - vm := &VM{ - Dataset: &Randomx_DatasetLight{ - Cache: cache, - }, - } - if cache.HasJIT() { - vm.JITProgram = mapProgram(nil, int(RandomXCodeSize)) - if cache.Flags&RANDOMX_FLAG_SECURE == 0 { - mapProgramRWX(vm.JITProgram) - } - } - return vm -} - -func (cache *Randomx_Cache) Close() error { +func (cache *Cache) Close() error { for _, p := range cache.JitPrograms 
{ if p != nil { err := p.Close() @@ -66,10 +51,12 @@ func (cache *Randomx_Cache) Close() error { return nil } -func (cache *Randomx_Cache) Init(key []byte) { - // Lock due to external JIT madness - runtime.LockOSThread() - defer runtime.UnlockOSThread() +func (cache *Cache) Init(key []byte) { + if cache.Flags&RANDOMX_FLAG_JIT > 0 { + // Lock due to external JIT madness + runtime.LockOSThread() + defer runtime.UnlockOSThread() + } kkey := slices.Clone(key) @@ -79,10 +66,11 @@ func (cache *Randomx_Cache) Init(key []byte) { cache.Blocks = memoryBlocks - nonce := uint32(0) //uint32(len(key)) - gen := Init_Blake2Generator(key, nonce) + const nonce uint32 = 0 + + gen := blake2.New(key, nonce) for i := 0; i < 8; i++ { - cache.Programs[i] = Build_SuperScalar_Program(gen) // build a superscalar program + cache.Programs[i] = BuildSuperScalarProgram(gen) // build a superscalar program if cache.Flags&RANDOMX_FLAG_JIT > 0 { cache.JitPrograms[i] = generateSuperscalarCode(cache.Programs[i]) } @@ -93,7 +81,7 @@ func (cache *Randomx_Cache) Init(key []byte) { const Mask = CacheSize/CacheLineSize - 1 // GetMixBlock fetch a 64 byte block in uint64 form -func (cache *Randomx_Cache) GetMixBlock(addr uint64) *RegisterLine { +func (cache *Cache) GetMixBlock(addr uint64) *RegisterLine { addr = (addr & Mask) * CacheLineSize @@ -101,7 +89,7 @@ func (cache *Randomx_Cache) GetMixBlock(addr uint64) *RegisterLine { return cache.Blocks[block].GetLine(addr % 1024) } -func (cache *Randomx_Cache) InitDatasetItem(rl *RegisterLine, itemNumber uint64) { +func (cache *Cache) InitDatasetItem(rl *RegisterLine, itemNumber uint64) { registerValue := itemNumber rl[0] = (itemNumber + 1) * keys.SuperScalar_Constants[0] @@ -129,7 +117,7 @@ func (cache *Randomx_Cache) InitDatasetItem(rl *RegisterLine, itemNumber uint64) } } -func (cache *Randomx_Cache) InitDatasetItemJIT(rl *RegisterLine, itemNumber uint64) { +func (cache *Cache) InitDatasetItemJIT(rl *RegisterLine, itemNumber uint64) { registerValue := 
itemNumber rl[0] = (itemNumber + 1) * keys.SuperScalar_Constants[0] @@ -155,9 +143,12 @@ func (cache *Randomx_Cache) InitDatasetItemJIT(rl *RegisterLine, itemNumber uint } } -func (cache *Randomx_Cache) initDataset(dataset []RegisterLine, startItem, endItem uint64) { - panic("todo") +func (cache *Cache) InitDataset(dataset []RegisterLine, startItem, endItem uint64) { for itemNumber := startItem; itemNumber < endItem; itemNumber, dataset = itemNumber+1, dataset[1:] { - cache.InitDatasetItem(&dataset[0], itemNumber) + if cache.HasJIT() { + cache.InitDatasetItemJIT(&dataset[0], itemNumber) + } else { + cache.InitDatasetItem(&dataset[0], itemNumber) + } } } diff --git a/config.go b/config.go index 92aa9a2..8763883 100644 --- a/config.go +++ b/config.go @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package randomx -import "git.gammaspectra.live/P2Pool/go-randomx/v2/argon2" +import "git.gammaspectra.live/P2Pool/go-randomx/v3/argon2" // see reference configuration.h // Cache size in KiB. Must be a power of 2. @@ -81,7 +81,7 @@ const RANDOMX_JUMP_BITS = 8 // Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16. 
const RANDOMX_JUMP_OFFSET = 8 -const DATASETEXTRAITEMS = RANDOMX_DATASET_EXTRA_SIZE / RANDOMX_DATASET_ITEM_SIZE +const DatasetExtraItems = RANDOMX_DATASET_EXTRA_SIZE / RANDOMX_DATASET_ITEM_SIZE const SuperscalarMaxSize = 3*RANDOMX_SUPERSCALAR_LATENCY + 2 const RANDOMX_DATASET_ITEM_SIZE uint64 = 64 diff --git a/dataset.go b/dataset.go index a642f9d..5b2b3e4 100644 --- a/dataset.go +++ b/dataset.go @@ -1,8 +1,31 @@ package randomx -type Randomx_Dataset interface { - InitDataset(startItem, endItem uint64) - ReadDataset(address uint64, r, cache *RegisterLine) +import "sync" + +type Dataset interface { + InitDataset(startItem, itemCount uint64) + ReadDataset(address uint64, r *RegisterLine) PrefetchDataset(address uint64) Flags() uint64 + Cache() *Cache + Memory() *[DatasetItemCount]RegisterLine +} + +func InitDatasetParallel(dataset Dataset, n int) { + n = max(1, n) + + var wg sync.WaitGroup + for i := uint64(1); i < uint64(n); i++ { + a := (DatasetItemCount * i) / uint64(n) + b := (DatasetItemCount * (i + 1)) / uint64(n) + + wg.Add(1) + go func(a, b uint64) { + defer wg.Done() + dataset.InitDataset(a, b-a) + }(a, b) + } + + dataset.InitDataset(0, DatasetItemCount/uint64(n)) + wg.Wait() } diff --git a/dataset_full.go b/dataset_full.go new file mode 100644 index 0000000..cb348aa --- /dev/null +++ b/dataset_full.go @@ -0,0 +1,52 @@ +//go:build amd64 || arm64 || arm64be || loong64 || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x || sparc64 + +package randomx + +const DatasetSize = RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE + +const DatasetItemCount = DatasetSize / CacheLineSize + +type DatasetFull struct { + cache *Cache + memory [DatasetItemCount]RegisterLine +} + +func NewFullDataset(cache *Cache) *DatasetFull { + return &DatasetFull{ + cache: cache, + } +} + +func (d *DatasetFull) PrefetchDataset(address uint64) { + +} + +func (d *DatasetFull) ReadDataset(address uint64, r *RegisterLine) { + cache := &d.memory[address/CacheLineSize] + + for i 
:= range r { + r[i] ^= cache[i] + } +} + +func (d *DatasetFull) Cache() *Cache { + return d.cache +} + +func (d *DatasetFull) Flags() uint64 { + return d.cache.Flags +} + +func (d *DatasetFull) Memory() *[DatasetItemCount]RegisterLine { + return &d.memory +} + +func (d *DatasetFull) InitDataset(startItem, itemCount uint64) { + if startItem >= DatasetItemCount || itemCount > DatasetItemCount { + panic("out of range") + } + if startItem+itemCount > DatasetItemCount { + panic("out of range") + } + d.cache.InitDataset(d.memory[startItem:startItem+itemCount], startItem, startItem+itemCount) +} diff --git a/dataset_full_no64.go b/dataset_full_no64.go new file mode 100644 index 0000000..ef268ff --- /dev/null +++ b/dataset_full_no64.go @@ -0,0 +1,34 @@ +//go:build !(amd64 || arm64 || arm64be || loong64 || mips64 || mips64le || ppc64 || ppc64le || riscv64 || s390x || sparc64) + +package randomx + +const DatasetSize = RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE + +const DatasetItemCount = DatasetSize / CacheLineSize + +type DatasetFull struct { +} + +func NewFullDataset(cache *Cache) *DatasetFull { + return nil +} + +func (d *DatasetFull) PrefetchDataset(address uint64) { + +} + +func (d *DatasetFull) ReadDataset(address uint64, r *RegisterLine) { + +} + +func (d *DatasetFull) Cache() *Cache { + return nil +} + +func (d *DatasetFull) Flags() uint64 { + return 0 +} + +func (d *DatasetFull) InitDataset(startItem, itemCount uint64) { + +} diff --git a/dataset_light.go b/dataset_light.go index 5a88d92..fc91420 100644 --- a/dataset_light.go +++ b/dataset_light.go @@ -1,19 +1,25 @@ package randomx -type Randomx_DatasetLight struct { - Cache *Randomx_Cache - Memory []uint64 +type DatasetLight struct { + cache *Cache } -func (d *Randomx_DatasetLight) PrefetchDataset(address uint64) { +func NewLightDataset(cache *Cache) *DatasetLight { + return &DatasetLight{ + cache: cache, + } +} + +func (d *DatasetLight) PrefetchDataset(address uint64) { } -func (d 
*Randomx_DatasetLight) ReadDataset(address uint64, r, cache *RegisterLine) { - if d.Cache.HasJIT() { - d.Cache.InitDatasetItemJIT(cache, address/CacheLineSize) +func (d *DatasetLight) ReadDataset(address uint64, r *RegisterLine) { + var cache RegisterLine + if d.cache.HasJIT() { + d.cache.InitDatasetItemJIT(&cache, address/CacheLineSize) } else { - d.Cache.InitDatasetItem(cache, address/CacheLineSize) + d.cache.InitDatasetItem(&cache, address/CacheLineSize) } for i := range r { @@ -21,10 +27,18 @@ func (d *Randomx_DatasetLight) ReadDataset(address uint64, r, cache *RegisterLin } } -func (d *Randomx_DatasetLight) Flags() uint64 { - return d.Cache.Flags +func (d *DatasetLight) Flags() uint64 { + return d.cache.Flags } -func (d *Randomx_DatasetLight) InitDataset(startItem, endItem uint64) { - //d.Cache.initDataset(d.Cache.Programs) +func (d *DatasetLight) Cache() *Cache { + return d.cache +} + +func (d *DatasetLight) Memory() *[DatasetItemCount]RegisterLine { + return nil +} + +func (d *DatasetLight) InitDataset(startItem, itemCount uint64) { + } diff --git a/go.mod b/go.mod index 2899163..59f7938 100644 --- a/go.mod +++ b/go.mod @@ -1,4 +1,4 @@ -module git.gammaspectra.live/P2Pool/go-randomx/v2 +module git.gammaspectra.live/P2Pool/go-randomx/v3 go 1.21 diff --git a/jit_amd64.go b/jit_amd64.go index cc9b4a2..3ffdef0 100644 --- a/jit_amd64.go +++ b/jit_amd64.go @@ -3,9 +3,8 @@ package randomx import ( - "bytes" "encoding/binary" - "git.gammaspectra.live/P2Pool/go-randomx/v2/asm" + "git.gammaspectra.live/P2Pool/go-randomx/v3/asm" ) /* @@ -13,11 +12,11 @@ import ( REGISTER ALLOCATION: ; rax -> temporary - ; rbx -> iteration counter "ic" + ; rbx -> todo: iteration counter "ic" ; rcx -> temporary ; rdx -> temporary ; rsi -> scratchpad pointer - ; rdi -> (not used) + ; rdi -> todo: dataset pointer ; rbp -> (do not use, it's used by Golang sampling) jump target //todo: memory registers "ma" (high 32 bits), "mx" (low 32 bits) ; rsp -> stack pointer ; r8 -> "r0" @@ -128,7 
+127,7 @@ var REX_MOV_MR = []byte{0x4c, 0x89} var REX_XOR_EAX = []byte{0x41, 0x33} var SUB_EBX = []byte{0x83, 0xEB, 0x01} var JNZ = []byte{0x0f, 0x85} -var JMP = 0xe9 +var JMP byte = 0xe9 var REX_XOR_RAX_R64 = []byte{0x49, 0x33} var REX_XCHG = []byte{0x4d, 0x87} @@ -157,6 +156,8 @@ var NOP6 = []byte{0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00} var NOP7 = []byte{0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00} var NOP8 = []byte{0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00} +var NOPX = [][]byte{NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8} + var JMP_ALIGN_PREFIX = [14][]byte{ {}, {0x2E}, @@ -263,66 +264,3 @@ var BranchesWithin32B = func() bool { } return false }() - -/* -;# callee-saved registers - Microsoft x64 calling convention -push rbx -push rbp -push rdi -push rsi -push r12 -push r13 -push r14 -push r15 -sub rsp, 80 -movdqu xmmword ptr [rsp+64], xmm6 -movdqu xmmword ptr [rsp+48], xmm7 -movdqu xmmword ptr [rsp+32], xmm8 -movdqu xmmword ptr [rsp+16], xmm9 -movdqu xmmword ptr [rsp+0], xmm10 -sub rsp, 80 -movdqu xmmword ptr [rsp+64], xmm11 -movdqu xmmword ptr [rsp+48], xmm12 -movdqu xmmword ptr [rsp+32], xmm13 -movdqu xmmword ptr [rsp+16], xmm14 -movdqu xmmword ptr [rsp+0], xmm15 - -;# function arguments -push rcx ;# RegisterFile& registerFile -mov rbp, qword ptr [rdx] ;# "mx", "ma" -mov rdi, qword ptr [rdx+8] ;# uint8_t* dataset -mov rsi, r8 ;# uint8_t* scratchpad -mov rbx, r9 ;# loop counter - -mov rax, rbp -ror rbp, 32 - -;# zero integer registers -xor r8, r8 -xor r9, r9 -xor r10, r10 -xor r11, r11 -xor r12, r12 -xor r13, r13 -xor r14, r14 -xor r15, r15 - -;# load constant registers -lea rcx, [rcx+120] -movapd xmm8, xmmword ptr [rcx+72] -movapd xmm9, xmmword ptr [rcx+88] -movapd xmm10, xmmword ptr [rcx+104] -movapd xmm11, xmmword ptr [rcx+120] - -movapd xmm13, xmmword ptr [mantissaMask] -movapd xmm14, xmmword ptr [exp240] -movapd xmm15, xmmword ptr [scaleMask] -mov rdx, rax -and eax, RANDOMX_SCRATCHPAD_MASK -ror rdx, 32 -and edx, RANDOMX_SCRATCHPAD_MASK -jmp 
rx_program_loop_begin -*/ -var randomx_program_prologue = bytes.Repeat(NOP1, 64) - -var randomx_program_loop_begin = bytes.Repeat(NOP1, 64) diff --git a/float.go b/math.go similarity index 65% rename from float.go rename to math.go index 7724404..dac2995 100644 --- a/float.go +++ b/math.go @@ -1,6 +1,9 @@ package randomx -import "math" +import ( + "math" + "math/bits" +) const ( mantbits64 uint = 52 @@ -41,10 +44,36 @@ func StaticExponent(entropy uint64) uint64 { return exponent } -func EMask(entropy uint64) uint64 { +func ExponentMask(entropy uint64) uint64 { return (entropy & mask22bit) | StaticExponent(entropy) } func Xor(a, b float64) float64 { return math.Float64frombits(math.Float64bits(a) ^ math.Float64bits(b)) } + +func smulh(a, b int64) uint64 { + hi_, _ := bits.Mul64(uint64(a), uint64(b)) + t1 := (a >> 63) & b + t2 := (b >> 63) & a + return uint64(int64(hi_) - t1 - t2) +} + +// reciprocal +// Calculates rcp = 2**x / divisor for highest integer x such that rcp < 2**64. +// divisor must not be 0 or a power of 2 +func reciprocal(divisor uint32) uint64 { + + const p2exp63 = uint64(1) << 63 + + quotient := p2exp63 / uint64(divisor) + remainder := p2exp63 % uint64(divisor) + + shift := bits.Len32(divisor) + + return (quotient << shift) + ((remainder << shift) / uint64(divisor)) +} + +func signExtend2sCompl(x uint32) uint64 { + return uint64(int64(int32(x))) +} diff --git a/randomx_test.go b/randomx_test.go index a94e914..feeb8b9 100644 --- a/randomx_test.go +++ b/randomx_test.go @@ -31,7 +31,9 @@ package randomx import ( "fmt" + "os" "runtime" + "slices" ) import "testing" @@ -47,9 +49,9 @@ var Tests = []struct { {[]byte("test key 001"), []byte("sed do eiusmod tempor incididunt ut labore et dolore magna aliqua"), "e9ff4503201c0c2cca26d285c93ae883f9b1d30c9eb240b820756f2d5a7905fc"}, // test d } -func Test_Randomx(t *testing.T) { +func Test_RandomXLight(t *testing.T) { - c := Randomx_alloc_cache(0) + c := NewCache(0) for ix, tt := range Tests { @@ -62,7 +64,10 @@ 
func Test_Randomx(t *testing.T) { } }() - vm := c.VM_Initialize() + dataset := NewLightDataset(c) + dataset.InitDataset(0, DatasetItemCount) + + vm := NewVM(dataset) defer vm.Close() var output_hash [32]byte @@ -74,57 +79,125 @@ func Test_Randomx(t *testing.T) { } }) } - } -func Benchmark_RandomX(b *testing.B) { +func Test_RandomXFull(t *testing.T) { + if os.Getenv("CI") != "" { + t.Skip("Skipping full mode in CI environment") + } + + c := NewCache(0) + + for ix, tt := range Tests { + + t.Run(string(tt.key)+"_____"+string(tt.input), func(t *testing.T) { + c.Init(tt.key) + defer func() { + err := c.Close() + if err != nil { + t.Error(err) + } + }() + + dataset := NewFullDataset(c) + if dataset == nil { + t.Skip("Skipping full mode in 32-bit environment") + } + InitDatasetParallel(dataset, runtime.NumCPU()) + + vm := NewVM(dataset) + defer vm.Close() + + var output_hash [32]byte + vm.CalculateHash(tt.input, &output_hash) + + actual := fmt.Sprintf("%x", output_hash) + if actual != tt.expected { + t.Errorf("#%d Fib(%v): expected %s, actual %s", ix, tt.key, tt.expected, actual) + } + }) + + // cleanup 2GiB between runs + runtime.GC() + } +} + +var BenchmarkTest = Tests[0] +var BenchmarkCache *Cache +var BenchmarkDatasetLight *DatasetLight +var BenchmarkDatasetFull *DatasetFull + +func TestMain(m *testing.M) { + if slices.Contains(os.Args, "-test.bench") { + //init light and full dataset + BenchmarkCache = NewCache(0) + BenchmarkCache.Init(BenchmarkTest.key) + BenchmarkDatasetLight = NewLightDataset(BenchmarkCache) + BenchmarkDatasetLight.InitDataset(0, DatasetItemCount) + BenchmarkDatasetFull = NewFullDataset(BenchmarkCache) + InitDatasetParallel(BenchmarkDatasetFull, runtime.NumCPU()) + defer BenchmarkCache.Close() + } + os.Exit(m.Run()) +} + +func Benchmark_RandomXLight(b *testing.B) { b.ReportAllocs() - tt := Tests[0] - - c := Randomx_alloc_cache(0) - - c.Init(tt.key) - defer func() { - err := c.Close() - if err != nil { - b.Error(err) - } - }() - - vm := 
c.VM_Initialize() + vm := NewVM(BenchmarkDatasetLight) defer vm.Close() + b.ResetTimer() for i := 0; i < b.N; i++ { var output_hash [32]byte - vm.CalculateHash(tt.input, &output_hash) + vm.CalculateHash(BenchmarkTest.input, &output_hash) runtime.KeepAlive(output_hash) } } -func Benchmark_RandomXParallel(b *testing.B) { +func Benchmark_RandomXFull(b *testing.B) { b.ReportAllocs() - tt := Tests[0] - - c := Randomx_alloc_cache(0) - - c.Init(tt.key) - defer func() { - err := c.Close() - if err != nil { - b.Error(err) - } - }() + vm := NewVM(BenchmarkDatasetFull) + defer vm.Close() b.ResetTimer() + for i := 0; i < b.N; i++ { + var output_hash [32]byte + vm.CalculateHash(BenchmarkTest.input, &output_hash) + runtime.KeepAlive(output_hash) + } +} + +func Benchmark_RandomXLight_Parallel(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + b.RunParallel(func(pb *testing.PB) { var output_hash [32]byte - vm := c.VM_Initialize() + + vm := NewVM(BenchmarkDatasetLight) defer vm.Close() for pb.Next() { - vm.CalculateHash(tt.input, &output_hash) + vm.CalculateHash(BenchmarkTest.input, &output_hash) + runtime.KeepAlive(output_hash) + } + }) +} + +func Benchmark_RandomXFull_Parallel(b *testing.B) { + b.ReportAllocs() + b.ResetTimer() + + b.RunParallel(func(pb *testing.PB) { + var output_hash [32]byte + + vm := NewVM(BenchmarkDatasetFull) + defer vm.Close() + + for pb.Next() { + vm.CalculateHash(BenchmarkTest.input, &output_hash) runtime.KeepAlive(output_hash) } }) diff --git a/register.go b/register.go index 1173adb..31f7916 100644 --- a/register.go +++ b/register.go @@ -24,7 +24,3 @@ const RegisterFileSize = RegistersCount*8 + RegistersCountFloat*2*8*3 func (rf *RegisterFile) Memory() *[RegisterFileSize]byte { return (*[RegisterFileSize]byte)(unsafe.Pointer(rf)) } - -type MemoryRegisters struct { - mx, ma uint64 -} diff --git a/superscalar.go b/superscalar.go index 80dbed0..8cb5753 100644 --- a/superscalar.go +++ b/superscalar.go @@ -29,7 +29,10 @@ USE OF THIS SOFTWARE, EVEN IF 
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package randomx -import "math/bits" +import ( + "git.gammaspectra.live/P2Pool/go-randomx/v3/blake2" + "math/bits" +) type ExecutionPort byte @@ -201,7 +204,7 @@ var buffer3 = []int{4, 9, 3} var buffer4 = []int{4, 4, 4, 4} var buffer5 = []int{3, 3, 10} -var Decoder_To_Instruction_Length = [][]int{ +var decoderToInstructionSize = [][]int{ buffer0, buffer1, buffer2, @@ -258,7 +261,7 @@ func (d DecoderType) String() string { } } -func FetchNextDecoder(ins *Instruction, cycle int, mulcount int, gen *Blake2Generator) DecoderType { +func FetchNextDecoder(ins *Instruction, cycle int, mulcount int, gen *blake2.Generator) DecoderType { if ins.Opcode == S_IMULH_R || ins.Opcode == S_ISMULH_R { return Decoder3310 @@ -295,158 +298,6 @@ func FetchNextDecoder(ins *Instruction, cycle int, mulcount int, gen *Blake2Gene return Decoder484 } -var slot3 = []*Instruction{&ISUB_R, &IXOR_R} // 3 length instruction will be filled with these -var slot3L = []*Instruction{&ISUB_R, &IXOR_R, &IMULH_R, &ISMULH_R} - -var slot4 = []*Instruction{&IROR_C, &IADD_RS} -var slot7 = []*Instruction{&IXOR_C7, &IADD_C7} -var slot8 = []*Instruction{&IXOR_C8, &IADD_C8} -var slot9 = []*Instruction{&IXOR_C9, &IADD_C9} -var slot10 = []*Instruction{&IMUL_RCP} - -// SuperScalarInstruction superscalar program is built with superscalar instructions -type SuperScalarInstruction struct { - Opcode byte - Dst int - Src int - Mod byte - Imm32 uint32 - Imm64 uint64 - OpGroup int - OpGroupPar int - GroupParIsSource int - ins *Instruction - CanReuse bool -} - -func (sins *SuperScalarInstruction) FixSrcReg() { - if sins.Src == 0xff { - sins.Src = sins.Dst - } - -} -func (sins *SuperScalarInstruction) Reset() { - sins.Opcode = 99 - sins.Src = 0xff - sins.Dst = 0xff - sins.CanReuse = false - sins.GroupParIsSource = 0 -} -func create(sins *SuperScalarInstruction, ins *Instruction, gen *Blake2Generator) { - sins.Reset() - sins.ins = ins - sins.OpGroupPar = -1 - sins.Opcode = ins.Opcode - 
- switch ins.Opcode { - case S_ISUB_R: - sins.Mod = 0 - sins.Imm32 = 0 - sins.OpGroup = S_IADD_RS - sins.GroupParIsSource = 1 - case S_IXOR_R: - sins.Mod = 0 - sins.Imm32 = 0 - sins.OpGroup = S_IXOR_R - sins.GroupParIsSource = 1 - case S_IADD_RS: - sins.Mod = gen.GetByte() - // set modshift on Imm32 - sins.Imm32 = uint32((sins.Mod >> 2) % 4) // bits 2-3 - //sins.Imm32 = 0 - sins.OpGroup = S_IADD_RS - sins.GroupParIsSource = 1 - case S_IMUL_R: - sins.Mod = 0 - sins.Imm32 = 0 - sins.OpGroup = S_IMUL_R - sins.GroupParIsSource = 1 - case S_IROR_C: - sins.Mod = 0 - - for sins.Imm32 = 0; sins.Imm32 == 0; { - sins.Imm32 = uint32(gen.GetByte() & 63) - } - - sins.OpGroup = S_IROR_C - sins.OpGroupPar = -1 - case S_IADD_C7, S_IADD_C8, S_IADD_C9: - sins.Mod = 0 - sins.Imm32 = gen.GetUint32() - sins.OpGroup = S_IADD_C7 - sins.OpGroupPar = -1 - case S_IXOR_C7, S_IXOR_C8, S_IXOR_C9: - sins.Mod = 0 - sins.Imm32 = gen.GetUint32() - sins.OpGroup = S_IXOR_C7 - sins.OpGroupPar = -1 - - case S_IMULH_R: - sins.CanReuse = true - sins.Mod = 0 - sins.Imm32 = 0 - sins.OpGroup = S_IMULH_R - sins.OpGroupPar = int(gen.GetUint32()) - case S_ISMULH_R: - sins.CanReuse = true - sins.Mod = 0 - sins.Imm32 = 0 - sins.OpGroup = S_ISMULH_R - sins.OpGroupPar = int(gen.GetUint32()) - - case S_IMUL_RCP: - - sins.Mod = 0 - for { - sins.Imm32 = gen.GetUint32() - if (sins.Imm32&sins.Imm32 - 1) != 0 { - break - } - } - - sins.Imm64 = randomx_reciprocal(sins.Imm32) - - sins.OpGroup = S_IMUL_RCP - - default: - panic("should not occur") - - } - -} -func CreateSuperScalarInstruction(sins *SuperScalarInstruction, gen *Blake2Generator, instruction_len int, decoder_type int, islast, isfirst bool) { - - switch instruction_len { - case 3: - if islast { - create(sins, slot3L[gen.GetByte()&3], gen) - } else { - create(sins, slot3[gen.GetByte()&1], gen) - } - case 4: - //if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions - if decoder_type == int(Decoder4444) && !islast { - create(sins, 
&IMUL_R, gen) - } else { - create(sins, slot4[gen.GetByte()&1], gen) - } - case 7: - create(sins, slot7[gen.GetByte()&1], gen) - - case 8: - create(sins, slot8[gen.GetByte()&1], gen) - - case 9: - create(sins, slot9[gen.GetByte()&1], gen) - case 10: - create(sins, slot10[0], gen) - - default: - panic("should not be possible") - } - -} - type SuperScalarProgram []SuperScalarInstruction func (p SuperScalarProgram) setAddressRegister(addressRegister int) { @@ -460,7 +311,7 @@ func (p SuperScalarProgram) Program() []SuperScalarInstruction { return p[1:] } -func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram { +func BuildSuperScalarProgram(gen *blake2.Generator) SuperScalarProgram { cycle := 0 depcycle := 0 //retire_cycle := 0 @@ -474,12 +325,7 @@ func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram { code_size := 0 program := make(SuperScalarProgram, 1, 512) - preAllocatedRegisters := gen.allocRegIndex[:] - - registers := gen.allocRegisters[:] - for i := range registers { - registers[i] = Register{} - } + var registers [8]Register sins := &SuperScalarInstruction{} sins.ins = &Instruction{Opcode: S_NOP} @@ -508,7 +354,7 @@ func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram { if ports_saturated || program_size >= SuperscalarMaxSize { break } - CreateSuperScalarInstruction(sins, gen, Decoder_To_Instruction_Length[int(decoder)][buffer_index], int(decoder), len(Decoder_To_Instruction_Length[decoder]) == (buffer_index+1), buffer_index == 0) + CreateSuperScalarInstruction(sins, gen, decoderToInstructionSize[decoder][buffer_index], decoder, len(decoderToInstructionSize[decoder]) == (buffer_index+1), buffer_index == 0) macro_op_index = 0 } @@ -529,7 +375,7 @@ func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram { if macro_op_index == sins.ins.SrcOP { // FIXME forward := 0 - for ; forward < LOOK_FORWARD_CYCLES && !sins.SelectSource(preAllocatedRegisters, scheduleCycle, registers, gen); forward++ { + 
for ; forward < LOOK_FORWARD_CYCLES && !sins.SelectSource(scheduleCycle, ®isters, gen); forward++ { scheduleCycle++ cycle++ } @@ -547,7 +393,7 @@ func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram { if macro_op_index == sins.ins.DstOP { // FIXME forward := 0 - for ; forward < LOOK_FORWARD_CYCLES && !sins.SelectDestination(preAllocatedRegisters, scheduleCycle, throwAwayCount > 0, registers, gen); forward++ { + for ; forward < LOOK_FORWARD_CYCLES && !sins.SelectDestination(scheduleCycle, throwAwayCount > 0, ®isters, gen); forward++ { scheduleCycle++ cycle++ } @@ -708,24 +554,24 @@ const RegisterNeedsDisplacement = 5 // RegisterNeedsSib x86 r12 register const RegisterNeedsSib = 4 -func (sins *SuperScalarInstruction) SelectSource(preAllocatedAvailableRegisters []int, cycle int, Registers []Register, gen *Blake2Generator) bool { - available_registers := preAllocatedAvailableRegisters[:0] +func (sins *SuperScalarInstruction) SelectSource(cycle int, registers *[8]Register, gen *blake2.Generator) bool { + availableRegisters := make([]int, 0, 8) - for i := range Registers { - if Registers[i].Latency <= cycle { - available_registers = append(available_registers, i) + for i := range registers { + if registers[i].Latency <= cycle { + availableRegisters = append(availableRegisters, i) } } - if len(available_registers) == 2 && sins.Opcode == S_IADD_RS { - if available_registers[0] == RegisterNeedsDisplacement || available_registers[1] == RegisterNeedsDisplacement { + if len(availableRegisters) == 2 && sins.Opcode == S_IADD_RS { + if availableRegisters[0] == RegisterNeedsDisplacement || availableRegisters[1] == RegisterNeedsDisplacement { sins.Src = RegisterNeedsDisplacement sins.OpGroupPar = sins.Src return true } } - if selectRegister(available_registers, gen, &sins.Src) { + if selectRegister(availableRegisters, gen, &sins.Src) { if sins.GroupParIsSource == 0 { @@ -737,35 +583,35 @@ func (sins *SuperScalarInstruction) 
SelectSource(preAllocatedAvailableRegisters return false } -func (sins *SuperScalarInstruction) SelectDestination(preAllocatedAvailableRegisters []int, cycle int, allowChainedMul bool, Registers []Register, gen *Blake2Generator) bool { - preAllocatedAvailableRegisters = preAllocatedAvailableRegisters[:0] +func (sins *SuperScalarInstruction) SelectDestination(cycle int, allowChainedMul bool, Registers *[8]Register, gen *blake2.Generator) bool { + var availableRegisters = make([]int, 0, 8) for i := range Registers { if Registers[i].Latency <= cycle && (sins.CanReuse || i != sins.Src) && (allowChainedMul || sins.OpGroup != S_IMUL_R || Registers[i].LastOpGroup != S_IMUL_R) && (Registers[i].LastOpGroup != sins.OpGroup || Registers[i].LastOpPar != sins.OpGroupPar) && (sins.Opcode != S_IADD_RS || i != RegisterNeedsDisplacement) { - preAllocatedAvailableRegisters = append(preAllocatedAvailableRegisters, i) + availableRegisters = append(availableRegisters, i) } } - return selectRegister(preAllocatedAvailableRegisters, gen, &sins.Dst) + return selectRegister(availableRegisters, gen, &sins.Dst) } -func selectRegister(available_registers []int, gen *Blake2Generator, reg *int) bool { +func selectRegister(availableRegisters []int, gen *blake2.Generator, reg *int) bool { index := 0 - if len(available_registers) == 0 { + if len(availableRegisters) == 0 { return false } - if len(available_registers) > 1 { + if len(availableRegisters) > 1 { tmp := gen.GetUint32() - index = int(tmp % uint32(len(available_registers))) + index = int(tmp % uint32(len(availableRegisters))) } else { index = 0 } - *reg = available_registers[index] + *reg = availableRegisters[index] return true } @@ -799,26 +645,3 @@ func executeSuperscalar(p []SuperScalarInstruction, r *RegisterLine) { } } - -func smulh(a, b int64) uint64 { - hi_, _ := bits.Mul64(uint64(a), uint64(b)) - t1 := (a >> 63) & b - t2 := (b >> 63) & a - return uint64(int64(hi_) - t1 - t2) -} - -func randomx_reciprocal(divisor uint32) uint64 { - - 
const p2exp63 = uint64(1) << 63 - - quotient := p2exp63 / uint64(divisor) - remainder := p2exp63 % uint64(divisor) - - shift := bits.Len32(divisor) - - return (quotient << shift) + ((remainder << shift) / uint64(divisor)) -} - -func signExtend2sCompl(x uint32) uint64 { - return uint64(int64(int32(x))) -} diff --git a/superscalar_instruction.go b/superscalar_instruction.go new file mode 100644 index 0000000..a5ffbb2 --- /dev/null +++ b/superscalar_instruction.go @@ -0,0 +1,157 @@ +package randomx + +import "git.gammaspectra.live/P2Pool/go-randomx/v3/blake2" + +// SuperScalarInstruction superscalar program is built with superscalar instructions +type SuperScalarInstruction struct { + Opcode byte + Dst int + Src int + Mod byte + Imm32 uint32 + Imm64 uint64 + OpGroup int + OpGroupPar int + GroupParIsSource int + ins *Instruction + CanReuse bool +} + +func (sins *SuperScalarInstruction) FixSrcReg() { + if sins.Src == 0xff { + sins.Src = sins.Dst + } + +} +func (sins *SuperScalarInstruction) Reset() { + sins.Opcode = 99 + sins.Src = 0xff + sins.Dst = 0xff + sins.CanReuse = false + sins.GroupParIsSource = 0 +} + +func createSuperScalarInstruction(sins *SuperScalarInstruction, ins *Instruction, gen *blake2.Generator) { + sins.Reset() + sins.ins = ins + sins.OpGroupPar = -1 + sins.Opcode = ins.Opcode + + switch ins.Opcode { + case S_ISUB_R: + sins.Mod = 0 + sins.Imm32 = 0 + sins.OpGroup = S_IADD_RS + sins.GroupParIsSource = 1 + case S_IXOR_R: + sins.Mod = 0 + sins.Imm32 = 0 + sins.OpGroup = S_IXOR_R + sins.GroupParIsSource = 1 + case S_IADD_RS: + sins.Mod = gen.GetByte() + // set modshift on Imm32 + sins.Imm32 = uint32((sins.Mod >> 2) % 4) // bits 2-3 + //sins.Imm32 = 0 + sins.OpGroup = S_IADD_RS + sins.GroupParIsSource = 1 + case S_IMUL_R: + sins.Mod = 0 + sins.Imm32 = 0 + sins.OpGroup = S_IMUL_R + sins.GroupParIsSource = 1 + case S_IROR_C: + sins.Mod = 0 + + for sins.Imm32 = 0; sins.Imm32 == 0; { + sins.Imm32 = uint32(gen.GetByte() & 63) + } + + sins.OpGroup = S_IROR_C + 
sins.OpGroupPar = -1 + case S_IADD_C7, S_IADD_C8, S_IADD_C9: + sins.Mod = 0 + sins.Imm32 = gen.GetUint32() + sins.OpGroup = S_IADD_C7 + sins.OpGroupPar = -1 + case S_IXOR_C7, S_IXOR_C8, S_IXOR_C9: + sins.Mod = 0 + sins.Imm32 = gen.GetUint32() + sins.OpGroup = S_IXOR_C7 + sins.OpGroupPar = -1 + + case S_IMULH_R: + sins.CanReuse = true + sins.Mod = 0 + sins.Imm32 = 0 + sins.OpGroup = S_IMULH_R + sins.OpGroupPar = int(gen.GetUint32()) + case S_ISMULH_R: + sins.CanReuse = true + sins.Mod = 0 + sins.Imm32 = 0 + sins.OpGroup = S_ISMULH_R + sins.OpGroupPar = int(gen.GetUint32()) + + case S_IMUL_RCP: + + sins.Mod = 0 + for { + sins.Imm32 = gen.GetUint32() + if (sins.Imm32&sins.Imm32 - 1) != 0 { + break + } + } + + sins.Imm64 = reciprocal(sins.Imm32) + + sins.OpGroup = S_IMUL_RCP + + default: + panic("should not occur") + + } + +} + +var slot3 = []*Instruction{&ISUB_R, &IXOR_R} // 3 length instruction will be filled with these +var slot3L = []*Instruction{&ISUB_R, &IXOR_R, &IMULH_R, &ISMULH_R} + +var slot4 = []*Instruction{&IROR_C, &IADD_RS} +var slot7 = []*Instruction{&IXOR_C7, &IADD_C7} +var slot8 = []*Instruction{&IXOR_C8, &IADD_C8} +var slot9 = []*Instruction{&IXOR_C9, &IADD_C9} +var slot10 = []*Instruction{&IMUL_RCP} + +func CreateSuperScalarInstruction(sins *SuperScalarInstruction, gen *blake2.Generator, instructionLen int, decoderType DecoderType, last, first bool) { + + switch instructionLen { + case 3: + if last { + createSuperScalarInstruction(sins, slot3L[gen.GetByte()&3], gen) + } else { + createSuperScalarInstruction(sins, slot3[gen.GetByte()&1], gen) + } + case 4: + //if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions + if decoderType == Decoder4444 && !last { + createSuperScalarInstruction(sins, &IMUL_R, gen) + } else { + createSuperScalarInstruction(sins, slot4[gen.GetByte()&1], gen) + } + case 7: + createSuperScalarInstruction(sins, slot7[gen.GetByte()&1], gen) + + case 8: + createSuperScalarInstruction(sins, 
slot8[gen.GetByte()&1], gen) + + case 9: + createSuperScalarInstruction(sins, slot9[gen.GetByte()&1], gen) + case 10: + createSuperScalarInstruction(sins, slot10[0], gen) + + default: + panic("should not be possible") + } + +} diff --git a/vm.go b/vm.go index 4657825..90c274e 100644 --- a/vm.go +++ b/vm.go @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. package randomx import ( - "git.gammaspectra.live/P2Pool/go-randomx/v2/aes" + "git.gammaspectra.live/P2Pool/go-randomx/v3/aes" "math" "runtime" "unsafe" @@ -45,16 +45,30 @@ type REG struct { type VM struct { ScratchPad ScratchPad - Dataset Randomx_Dataset + Dataset Dataset - JITProgram VMProgramFunc + program ByteCode + jitProgram VMProgramFunc } -// Run calculate hash based on input +func NewVM(dataset Dataset) *VM { + vm := &VM{ + Dataset: dataset, + } + if dataset.Cache().HasJIT() { + vm.jitProgram = mapProgram(nil, int(RandomXCodeSize)) + if dataset.Flags()&RANDOMX_FLAG_SECURE == 0 { + mapProgramRWX(vm.jitProgram) + } + } + return vm +} + +// run calculate hash based on input. Not thread safe. 
// Warning: Underlying callers will run float64 SetRoundingMode directly // It is the caller's responsibility to set and restore the mode to IEEE 754 roundTiesToEven between full executions // Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread is recommended to prevent other goroutines sharing these changes -func (vm *VM) Run(inputHash [64]byte, roundingMode uint8) (reg RegisterFile) { +func (vm *VM) run(inputHash [64]byte, roundingMode uint8) (reg RegisterFile) { reg.FPRC = roundingMode @@ -64,49 +78,64 @@ func (vm *VM) Run(inputHash [64]byte, roundingMode uint8) (reg RegisterFile) { entropy := (*[16]uint64)(unsafe.Pointer(&buffer)) - prog := buffer[len(entropy)*8:] - // do more initialization before we run for i := range entropy[:8] { reg.A[i/2][i%2] = SmallPositiveFloatBits(entropy[i]) } - var mem MemoryRegisters + // memory registers + var ma, mx uint32 - mem.ma = entropy[8] & CacheLineAlignMask - mem.mx = entropy[10] + ma = uint32(entropy[8] & CacheLineAlignMask) + mx = uint32(entropy[10]) addressRegisters := entropy[12] var readReg [4]uint64 - for i := range readReg { readReg[i] = uint64(i*2) + (addressRegisters & 1) addressRegisters >>= 1 } - datasetOffset := (entropy[13] % (DATASETEXTRAITEMS + 1)) * CacheLineSize + datasetOffset := (entropy[13] % (DatasetExtraItems + 1)) * CacheLineSize - eMask := [2]uint64{EMask(entropy[14]), EMask(entropy[15])} + eMask := [2]uint64{ExponentMask(entropy[14]), ExponentMask(entropy[15])} - byteCode := CompileProgramToByteCode(prog) + prog := buffer[len(entropy)*8:] + CompileProgramToByteCode(prog, &vm.program) - spAddr0 := mem.mx - spAddr1 := mem.ma + datasetMemory := vm.Dataset.Memory() - var rlCache RegisterLine + var jitProgram VMProgramFunc - if vm.JITProgram != nil { - if vm.Dataset.Flags()&RANDOMX_FLAG_SECURE > 0 { - mapProgramRW(vm.JITProgram) - byteCode.generateCode(vm.JITProgram) - mapProgramRX(vm.JITProgram) + if vm.jitProgram != nil { + if datasetMemory == nil { + if 
vm.Dataset.Flags()&RANDOMX_FLAG_SECURE > 0 { + mapProgramRW(vm.jitProgram) + jitProgram = vm.program.generateCode(vm.jitProgram, nil) + mapProgramRX(vm.jitProgram) + } else { + jitProgram = vm.program.generateCode(vm.jitProgram, nil) + } } else { - byteCode.generateCode(vm.JITProgram) + // full mode and we have JIT + if vm.Dataset.Flags()&RANDOMX_FLAG_SECURE > 0 { + mapProgramRW(vm.jitProgram) + jitProgram = vm.program.generateCode(vm.jitProgram, &readReg) + mapProgramRX(vm.jitProgram) + } else { + jitProgram = vm.program.generateCode(vm.jitProgram, &readReg) + } + + vm.jitProgram.ExecuteFull(®, &vm.ScratchPad, &datasetMemory[datasetOffset/CacheLineSize], RANDOMX_PROGRAM_ITERATIONS, ma, mx, eMask) + return reg } } + spAddr0 := uint64(mx) + spAddr1 := uint64(ma) + for ic := 0; ic < RANDOMX_PROGRAM_ITERATIONS; ic++ { spMix := reg.R[readReg[0]] ^ reg.R[readReg[1]] @@ -131,22 +160,23 @@ func (vm *VM) Run(inputHash [64]byte, roundingMode uint8) (reg RegisterFile) { reg.E[i][HIGH] = MaskRegisterExponentMantissa(reg.E[i][HIGH], eMask[HIGH]) } - // Run the actual bytecode - if vm.JITProgram != nil { - vm.JITProgram.Execute(®, &vm.ScratchPad, eMask) + // run the actual bytecode + if jitProgram != nil { + // light mode + jitProgram.Execute(®, &vm.ScratchPad, eMask) } else { - byteCode.Execute(®, &vm.ScratchPad, eMask) + vm.program.Execute(®, &vm.ScratchPad, eMask) } - mem.mx ^= reg.R[readReg[2]] ^ reg.R[readReg[3]] - mem.mx &= CacheLineAlignMask + mx ^= uint32(reg.R[readReg[2]] ^ reg.R[readReg[3]]) + mx &= uint32(CacheLineAlignMask) - vm.Dataset.PrefetchDataset(datasetOffset + mem.mx) - // execute diffuser superscalar program to get dataset 64 bytes - vm.Dataset.ReadDataset(datasetOffset+mem.ma, ®.R, &rlCache) + vm.Dataset.PrefetchDataset(datasetOffset + uint64(mx)) + // execute / load output from diffuser superscalar program to get dataset 64 bytes + vm.Dataset.ReadDataset(datasetOffset+uint64(ma), ®.R) // swap the elements - mem.mx, mem.ma = mem.ma, mem.mx + mx, ma = ma, 
mx for i := uint64(0); i < RegistersCount; i++ { vm.ScratchPad.Store64(uint32(spAddr1+8*i), reg.R[i]) @@ -165,17 +195,17 @@ func (vm *VM) Run(inputHash [64]byte, roundingMode uint8) (reg RegisterFile) { } + runtime.KeepAlive(buffer) + return reg } -func (vm *VM) InitScratchpad(seed *[64]byte) { +func (vm *VM) initScratchpad(seed *[64]byte) { vm.ScratchPad.Init(seed) } -func (vm *VM) RunLoops(tempHash [64]byte) RegisterFile { - hash512, _ := blake2b.New512(nil) - +func (vm *VM) runLoops(tempHash [64]byte) RegisterFile { if lockThreadDueToRoundingMode { // Lock thread due to rounding mode flags runtime.LockOSThread() @@ -185,20 +215,16 @@ func (vm *VM) RunLoops(tempHash [64]byte) RegisterFile { roundingMode := uint8(0) for chain := 0; chain < RANDOMX_PROGRAM_COUNT-1; chain++ { - reg := vm.Run(tempHash, roundingMode) + reg := vm.run(tempHash, roundingMode) roundingMode = reg.FPRC - hash512.Reset() - // write R, F, E, A registers - hash512.Write(reg.Memory()[:]) + tempHash = blake2b.Sum512(reg.Memory()[:]) runtime.KeepAlive(reg) - - hash512.Sum(tempHash[:0]) } // final loop executes here - reg := vm.Run(tempHash, roundingMode) + reg := vm.run(tempHash, roundingMode) // always force a restore reg.FPRC = 0xff @@ -208,33 +234,29 @@ func (vm *VM) RunLoops(tempHash [64]byte) RegisterFile { return reg } +// CalculateHash Not thread safe. 
func (vm *VM) CalculateHash(input []byte, output *[32]byte) { tempHash := blake2b.Sum512(input) - vm.InitScratchpad(&tempHash) + vm.initScratchpad(&tempHash) - reg := vm.RunLoops(tempHash) + reg := vm.runLoops(tempHash) // now hash the scratch pad as it will act as register A aes.HashAes1Rx4(vm.ScratchPad[:], &tempHash) - hash256, _ := blake2b.New256(nil) + regMem := reg.Memory() + // write hash onto register A + copy(regMem[RegisterFileSize-RegistersCountFloat*2*8:], tempHash[:]) - hash256.Reset() - - // write R, F, E registers - hash256.Write(reg.Memory()[:RegisterFileSize-RegistersCountFloat*2*8]) + // write R, F, E, A registers + *output = blake2b.Sum256(regMem[:]) runtime.KeepAlive(reg) - - // write register A - hash256.Write(tempHash[:]) - - hash256.Sum(output[:0]) } func (vm *VM) Close() error { - if vm.JITProgram != nil { - return vm.JITProgram.Close() + if vm.jitProgram != nil { + return vm.jitProgram.Close() } return nil } diff --git a/vm_bytecode_jit_amd64.go b/vm_bytecode_jit_amd64.go index 32f68c5..3651735 100644 --- a/vm_bytecode_jit_amd64.go +++ b/vm_bytecode_jit_amd64.go @@ -11,6 +11,114 @@ import ( //go:noescape func vm_run(rf *RegisterFile, pad *ScratchPad, eMask [2]uint64, jmp uintptr) +//go:noescape +func vm_run_full(rf *RegisterFile, pad *ScratchPad, dataset *RegisterLine, iterations, memoryRegisters uint64, eMask [2]uint64, jmp uintptr) + +/* +#define RANDOMX_DATASET_BASE_SIZE 2147483648 +#define RANDOMX_DATASET_BASE_MASK (RANDOMX_DATASET_BASE_SIZE-64) + +mov ecx, ebp ;# ecx = ma +;#and ecx, RANDOMX_DATASET_BASE_MASK +and ecx, 2147483584 +xor r8, qword ptr [rdi+rcx] +ror rbp, 32 ;# swap "ma" and "mx" +xor rbp, rax ;# modify "mx" +mov edx, ebp ;# edx = mx +;#and edx, RANDOMX_DATASET_BASE_MASK +and edx, 2147483584 +prefetchnta byte ptr [rdi+rdx] +xor r9, qword ptr [rdi+rcx+8] +xor r10, qword ptr [rdi+rcx+16] +xor r11, qword ptr [rdi+rcx+24] +xor r12, qword ptr [rdi+rcx+32] +xor r13, qword ptr [rdi+rcx+40] +xor r14, qword ptr [rdi+rcx+48] +xor 
r15, qword ptr [rdi+rcx+56] +*/ +var programReadDataset = []byte{0x89, 0xE9, 0x81, 0xE1, 0xC0, 0xFF, 0xFF, 0x7F, 0x4C, 0x33, 0x04, 0x0F, 0x48, 0xC1, 0xCD, 0x20, 0x48, 0x31, 0xC5, 0x89, 0xEA, 0x81, 0xE2, 0xC0, 0xFF, 0xFF, 0x7F, 0x0F, 0x18, 0x04, 0x17, 0x4C, 0x33, 0x4C, 0x0F, 0x08, 0x4C, 0x33, 0x54, 0x0F, 0x10, 0x4C, 0x33, 0x5C, 0x0F, 0x18, 0x4C, 0x33, 0x64, 0x0F, 0x20, 0x4C, 0x33, 0x6C, 0x0F, 0x28, 0x4C, 0x33, 0x74, 0x0F, 0x30, 0x4C, 0x33, 0x7C, 0x0F, 0x38} + +/* +lea rcx, [rsi+rax] +push rcx +xor r8, qword ptr [rcx+0] +xor r9, qword ptr [rcx+8] +xor r10, qword ptr [rcx+16] +xor r11, qword ptr [rcx+24] +xor r12, qword ptr [rcx+32] +xor r13, qword ptr [rcx+40] +xor r14, qword ptr [rcx+48] +xor r15, qword ptr [rcx+56] +lea rcx, [rsi+rdx] +push rcx +cvtdq2pd xmm0, qword ptr [rcx+0] +cvtdq2pd xmm1, qword ptr [rcx+8] +cvtdq2pd xmm2, qword ptr [rcx+16] +cvtdq2pd xmm3, qword ptr [rcx+24] +cvtdq2pd xmm4, qword ptr [rcx+32] +cvtdq2pd xmm5, qword ptr [rcx+40] +cvtdq2pd xmm6, qword ptr [rcx+48] +cvtdq2pd xmm7, qword ptr [rcx+56] +andps xmm4, xmm13 +andps xmm5, xmm13 +andps xmm6, xmm13 +andps xmm7, xmm13 +orps xmm4, xmm14 +orps xmm5, xmm14 +orps xmm6, xmm14 +orps xmm7, xmm14 +*/ +var programLoopLoad = []byte{0x48, 0x8D, 0x0C, 0x06, 0x51, 0x4C, 0x33, 0x01, 0x4C, 0x33, 0x49, 0x08, 0x4C, 0x33, 0x51, 0x10, 0x4C, 0x33, 0x59, 0x18, 0x4C, 0x33, 0x61, 0x20, 0x4C, 0x33, 0x69, 0x28, 0x4C, 0x33, 0x71, 0x30, 0x4C, 0x33, 0x79, 0x38, 0x48, 0x8D, 0x0C, 0x16, 0x51, 0xF3, 0x0F, 0xE6, 0x01, 0xF3, 0x0F, 0xE6, 0x49, 0x08, 0xF3, 0x0F, 0xE6, 0x51, 0x10, 0xF3, 0x0F, 0xE6, 0x59, 0x18, 0xF3, 0x0F, 0xE6, 0x61, 0x20, 0xF3, 0x0F, 0xE6, 0x69, 0x28, 0xF3, 0x0F, 0xE6, 0x71, 0x30, 0xF3, 0x0F, 0xE6, 0x79, 0x38, 0x41, 0x0F, 0x54, 0xE5, 0x41, 0x0F, 0x54, 0xED, 0x41, 0x0F, 0x54, 0xF5, 0x41, 0x0F, 0x54, 0xFD, 0x41, 0x0F, 0x56, 0xE6, 0x41, 0x0F, 0x56, 0xEE, 0x41, 0x0F, 0x56, 0xF6, 0x41, 0x0F, 0x56, 0xFE} + +/* +pop rcx +mov qword ptr [rcx+0], r8 +mov qword ptr [rcx+8], r9 +mov qword ptr [rcx+16], r10 +mov qword ptr 
[rcx+24], r11 +mov qword ptr [rcx+32], r12 +mov qword ptr [rcx+40], r13 +mov qword ptr [rcx+48], r14 +mov qword ptr [rcx+56], r15 +pop rcx +xorpd xmm0, xmm4 +xorpd xmm1, xmm5 +xorpd xmm2, xmm6 +xorpd xmm3, xmm7 + +movupd xmmword ptr [rcx+0], xmm0 +movupd xmmword ptr [rcx+16], xmm1 +movupd xmmword ptr [rcx+32], xmm2 +movupd xmmword ptr [rcx+48], xmm3 +;#movapd xmmword ptr [rcx+0], xmm0 +;#movapd xmmword ptr [rcx+16], xmm1 +;#movapd xmmword ptr [rcx+32], xmm2 +;#movapd xmmword ptr [rcx+48], xmm3 +*/ +//var programLoopStore = []byte{0x59, 0x4C, 0x89, 0x01, 0x4C, 0x89, 0x49, 0x08, 0x4C, 0x89, 0x51, 0x10, 0x4C, 0x89, 0x59, 0x18, 0x4C, 0x89, 0x61, 0x20, 0x4C, 0x89, 0x69, 0x28, 0x4C, 0x89, 0x71, 0x30, 0x4C, 0x89, 0x79, 0x38, 0x59, 0x66, 0x0F, 0x57, 0xC4, 0x66, 0x0F, 0x57, 0xCD, 0x66, 0x0F, 0x57, 0xD6, 0x66, 0x0F, 0x57, 0xDF, 0x66, 0x0F, 0x29, 0x01, 0x66, 0x0F, 0x29, 0x49, 0x10, 0x66, 0x0F, 0x29, 0x51, 0x20, 0x66, 0x0F, 0x29, 0x59, 0x30} +var programLoopStore = []byte{0x59, 0x4C, 0x89, 0x01, 0x4C, 0x89, 0x49, 0x08, 0x4C, 0x89, 0x51, 0x10, 0x4C, 0x89, 0x59, 0x18, 0x4C, 0x89, 0x61, 0x20, 0x4C, 0x89, 0x69, 0x28, 0x4C, 0x89, 0x71, 0x30, 0x4C, 0x89, 0x79, 0x38, 0x59, 0x66, 0x0F, 0x57, 0xC4, 0x66, 0x0F, 0x57, 0xCD, 0x66, 0x0F, 0x57, 0xD6, 0x66, 0x0F, 0x57, 0xDF, 0x66, 0x0F, 0x11, 0x01, 0x66, 0x0F, 0x11, 0x49, 0x10, 0x66, 0x0F, 0x11, 0x51, 0x20, 0x66, 0x0F, 0x11, 0x59, 0x30} + +/* +#define RANDOMX_SCRATCHPAD_L3 2097152 +#define RANDOMX_SCRATCHPAD_MASK (RANDOMX_SCRATCHPAD_L3-64) +mov rdx, rax +;#and eax, RANDOMX_SCRATCHPAD_MASK +and eax, 2097088 +ror rdx, 32 +;#and edx, RANDOMX_SCRATCHPAD_MASK +and edx, 2097088 +*/ +var programCalculateSpAddrs = []byte{0x48, 0x89, 0xC2, 0x25, 0xC0, 0xFF, 0x1F, 0x00, 0x48, 0xC1, 0xCA, 0x20, 0x81, 0xE2, 0xC0, 0xFF, 0x1F, 0x00} + +func (f VMProgramFunc) ExecuteFull(rf *RegisterFile, pad *ScratchPad, dataset *RegisterLine, iterations uint64, ma, mx uint32, eMask [2]uint64) { + if f == nil { + panic("program is nil") + } + + jmpPtr := 
uintptr(unsafe.Pointer(unsafe.SliceData(f))) + vm_run_full(rf, pad, dataset, iterations, (uint64(ma)<<32)|uint64(mx), eMask, jmpPtr) +} + func (f VMProgramFunc) Execute(rf *RegisterFile, pad *ScratchPad, eMask [2]uint64) { if f == nil { panic("program is nil") @@ -20,15 +128,22 @@ func (f VMProgramFunc) Execute(rf *RegisterFile, pad *ScratchPad, eMask [2]uint6 vm_run(rf, pad, eMask, jmpPtr) } -func (c *ByteCode) generateCode(program []byte) { +func (c *ByteCode) generateCode(program []byte, readReg *[4]uint64) []byte { program = program[:0] + isFullMode := readReg != nil + + if isFullMode { + + program = append(program, programCalculateSpAddrs...) + // prologue + program = append(program, programLoopLoad...) + } + var instructionOffsets [RANDOMX_PROGRAM_SIZE]int32 - var codePos int32 for ix := range c { - instructionOffsets[ix] = codePos - curLen := len(program) + instructionOffsets[ix] = int32(len(program)) instr := &c[ix] switch instr.Opcode { @@ -254,10 +369,10 @@ func (c *ByteCode) generateCode(program []byte) { reg := instr.Dst target := instr.jumpTarget() + 1 - jmpOffset := instructionOffsets[target] - (codePos + 16) + jmpOffset := instructionOffsets[target] - (int32(len(program)) + 16) if BranchesWithin32B { - branchBegin := uint32(codePos + 7) + branchBegin := uint32(int32(len(program)) + 7) branchEnd := branchBegin if jmpOffset >= -128 { branchEnd += 9 @@ -305,8 +420,51 @@ func (c *ByteCode) generateCode(program []byte) { case VM_NOP: program = append(program, NOP1...) } - - codePos += int32(len(program) - curLen) } + + if isFullMode { + // end of prologue + program = append(program, REX_MOV_RR...) + program = append(program, 0xc0+byte(readReg[2])) + program = append(program, REX_XOR_EAX...) + program = append(program, 0xc0+byte(readReg[3])) + + // read dataset + + program = append(program, programReadDataset...) + + // epilogue + program = append(program, REX_MOV_RR64...) 
+ program = append(program, 0xc0+byte(readReg[0])) + program = append(program, REX_XOR_RAX_R64...) + program = append(program, 0xc0+byte(readReg[1])) + //todo: prefetch scratchpad + + program = append(program, programLoopStore...) + + if BranchesWithin32B { + branchBegin := uint32(len(program)) + branchEnd := branchBegin + 9 + + // If the jump crosses or touches 32-byte boundary, align it + if (branchBegin ^ branchEnd) >= 32 { + alignmentSize := 32 - (branchBegin & 31) + if alignmentSize > 8 { + program = append(program, NOPX[alignmentSize-9][:alignmentSize-8]...) + alignmentSize = 8 + } + program = append(program, NOPX[alignmentSize-1][:alignmentSize]...) + } + } + + program = append(program, SUB_EBX...) + program = append(program, JNZ...) + program = binary.LittleEndian.AppendUint32(program, uint32(-len(program)-4)) + //exit otherwise + + } + program = append(program, RET) + + return program } diff --git a/vm_bytecode_jit_amd64.s b/vm_bytecode_jit_amd64.s index 708581c..6fc149f 100644 --- a/vm_bytecode_jit_amd64.s +++ b/vm_bytecode_jit_amd64.s @@ -34,8 +34,6 @@ TEXT ·vm_run(SB),$8-40 VMOVUPD (28*8)(AX), X10 VMOVUPD (30*8)(AX), X11 - //TODO: rest of init - // mantissa mask //VMOVQ $0x00ffffffffffffff, $0x00ffffffffffffff, X13 MOVQ $0x00ffffffffffffff, AX @@ -89,3 +87,107 @@ TEXT ·vm_run(SB),$8-40 // a0-a3 are constant, no need to move RET + + +#define RANDOMX_SCRATCHPAD_L3 2097152 +#define RANDOMX_SCRATCHPAD_MASK (RANDOMX_SCRATCHPAD_L3-64) + +TEXT ·vm_run_full(SB),$32-64 + + // move register file to registers + MOVQ rf+0(FP), AX + + PREFETCHNTA 0(AX) + // r0-r7 + MOVQ (0*8)(AX), R8 + MOVQ (1*8)(AX), R9 + MOVQ (2*8)(AX), R10 + MOVQ (3*8)(AX), R11 + MOVQ (4*8)(AX), R12 + MOVQ (5*8)(AX), R13 + MOVQ (6*8)(AX), R14 + MOVQ (7*8)(AX), R15 + + // f0-f3 + VMOVUPD (8*8)(AX), X0 + VMOVUPD (10*8)(AX), X1 + VMOVUPD (12*8)(AX), X2 + VMOVUPD (14*8)(AX), X3 + // e0-e3 + VMOVUPD (16*8)(AX), X4 + VMOVUPD (18*8)(AX), X5 + VMOVUPD (20*8)(AX), X6 + VMOVUPD (22*8)(AX), X7 + // load 
constants a0-a3
+	VMOVUPD (24*8)(AX), X8
+	VMOVUPD (26*8)(AX), X9
+	VMOVUPD (28*8)(AX), X10
+	VMOVUPD (30*8)(AX), X11
+
+	//TODO: rest of init
+
+	// mantissa mask
+	//VMOVQ $0x00ffffffffffffff, $0x00ffffffffffffff, X13
+	MOVQ $0x00ffffffffffffff, AX
+	VMOVQ AX, X13
+	VPBROADCASTQ X13, X13
+
+	// eMask
+	VMOVDQU64 eMask+40(FP), X14
+
+	// scale mask
+	//VMOVQ $0x80F0000000000000, $0x80F0000000000000, X15
+	MOVQ $0x80F0000000000000, AX
+	VMOVQ AX, X15
+	VPBROADCASTQ X15, X15
+
+	// scratchpad pointer on rsi
+	MOVQ pad+8(FP), SI
+	// dataset pointer on rdi
+	MOVQ dataset+16(FP), DI
+	// iterations on rbx
+	MOVQ iterations+24(FP), BX
+	// ma and mx on rbp TODO: change this
+	MOVQ memoryRegisters+32(FP), BP
+
+	// do ma/mx calcs
+	MOVQ BP, AX
+	RORQ $32, BP
+
+	//AX = spAddr0
+	//DX = spAddr1
+
+	// JIT location
+	MOVQ jmp+56(FP), CX
+	// jump to JIT code
+	// this handles readReg[0-3] and dataset reading, load, stores
+	CALL CX
+
+	// move register file back to registers
+	MOVQ rf+0(FP), AX
+
+	PREFETCHT0 0(AX)
+	// r0-r7
+	MOVQ R8, (0*8)(AX)
+	MOVQ R9, (1*8)(AX)
+	MOVQ R10, (2*8)(AX)
+	MOVQ R11, (3*8)(AX)
+	MOVQ R12, (4*8)(AX)
+	MOVQ R13, (5*8)(AX)
+	MOVQ R14, (6*8)(AX)
+	MOVQ R15, (7*8)(AX)
+
+	// f0-f3
+	VMOVUPD X0, (8*8)(AX)
+	VMOVUPD X1, (10*8)(AX)
+	VMOVUPD X2, (12*8)(AX)
+	VMOVUPD X3, (14*8)(AX)
+	// e0-e3
+	VMOVUPD X4, (16*8)(AX)
+	VMOVUPD X5, (18*8)(AX)
+	VMOVUPD X6, (20*8)(AX)
+	VMOVUPD X7, (22*8)(AX)
+
+	// a0-a3 are constant, no need to move
+
+	RET
diff --git a/vm_bytecode_jit_generic.go b/vm_bytecode_jit_generic.go
index 915c989..a223a9e 100644
--- a/vm_bytecode_jit_generic.go
+++ b/vm_bytecode_jit_generic.go
@@ -2,10 +2,13 @@
 
 package randomx
 
-func (c *ByteCode) generateCode(program []byte) {
-
+func (c *ByteCode) generateCode(program []byte, readReg *[4]uint64) []byte {
+	return nil
 }
 
 func (f VMProgramFunc) Execute(rf *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
 
 }
+func (f VMProgramFunc) ExecuteFull(rf *RegisterFile, pad *ScratchPad, dataset
*RegisterLine, iterations uint64, ma, mx uint32, eMask [2]uint64) {
+
+}
diff --git a/vm_bytecode_native.go b/vm_bytecode_native.go
index dc25404..d278758 100644
--- a/vm_bytecode_native.go
+++ b/vm_bytecode_native.go
@@ -3,7 +3,7 @@
 package randomx
 
 import (
-	"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
+	"git.gammaspectra.live/P2Pool/go-randomx/v3/asm"
 	"math"
 	"math/bits"
 )
diff --git a/vm_instruction.go b/vm_instruction.go
index 46c8d26..dd8f380 100644
--- a/vm_instruction.go
+++ b/vm_instruction.go
@@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 package randomx
 
 import (
-	"git.gammaspectra.live/P2Pool/go-randomx/v2/aes"
+	"git.gammaspectra.live/P2Pool/go-randomx/v3/aes"
 	"unsafe"
 )
 import "encoding/binary"
@@ -63,7 +63,7 @@ func (ins VM_Instruction) Opcode() byte {
 
 // CompileProgramToByteCode this will interpret single vm instruction into executable opcodes
 // reference https://github.com/tevador/RandomX/blob/master/doc/specs.md#52-integer-instructions
-func CompileProgramToByteCode(prog []byte) (bc ByteCode) {
+func CompileProgramToByteCode(prog []byte, bc *ByteCode) {
 	var registerUsage [RegistersCount]int
 
 	for i := range registerUsage {
@@ -194,7 +194,7 @@ func CompileProgramToByteCode(prog []byte) (bc ByteCode) {
 		divisor := instr.IMM()
 		if !isZeroOrPowerOf2(divisor) {
 			ibc.Opcode = VM_IMUL_I
-			ibc.Imm = randomx_reciprocal(divisor)
+			ibc.Imm = reciprocal(divisor)
 			registerUsage[dst] = i
 		} else {
 			ibc.Opcode = VM_NOP
@@ -355,9 +355,6 @@ func CompileProgramToByteCode(prog []byte) (bc ByteCode) {
 		}
 	}
-
-	return bc
-
 }
 
 type ScratchPad [ScratchpadSize]byte