Ensure 16-byte alignment of dataset/scratchpad/register file and use more performant fetch/write SIMD on amd64

DataHoarder 2024-05-02 12:06:38 +02:00
parent 9826b7beb4
commit 9aa3631f37
Signed by: DataHoarder
SSH key fingerprint: SHA256:OLTRf6Fl87G52SiR7sWLGNzlJt4WOX+tfI2yxo0z7xk
6 changed files with 89 additions and 66 deletions

alignment.go (new file, 7 additions)

@@ -0,0 +1,7 @@
+package randomx
+
+func assertAlignedTo16(ptr uintptr) {
+	if ptr&0b1111 != 0 {
+		panic("not aligned to 16")
+	}
+}
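Note: ptr&0b1111 != 0 is the standard power-of-two alignment test: a 16-byte-aligned address has its four low bits clear, so masking them is equivalent to taking ptr%16. A minimal standalone sketch of the equivalence (not part of this commit):

    package main

    import (
    	"fmt"
    	"unsafe"
    )

    func main() {
    	v := new([64]byte)
    	ptr := uintptr(unsafe.Pointer(v))
    	// For a power-of-two alignment N, ptr&(N-1) extracts the remainder
    	// of ptr/N, so the mask test and the modulo test always agree.
    	fmt.Println(ptr&0b1111 == 0, ptr%16 == 0)
    }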


@@ -3,6 +3,7 @@ package randomx
 import (
 	"errors"
 	"sync"
+	"unsafe"
 )

 const DatasetSize = RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE
@@ -31,6 +32,7 @@ func NewDataset(flags Flags) (result *Dataset, err error) {
 	//todo: implement large pages, align allocation
 	alignedMemory := make([]RegisterLine, DatasetItemCount)
+	assertAlignedTo16(uintptr(unsafe.Pointer(unsafe.SliceData(alignedMemory))))
 	//todo: err on not large pages
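Note: the first todo above leaves alignment to Go's allocator and merely asserts it after the fact. The usual manual fix it refers to is to over-allocate and re-slice from the first 16-byte boundary; a sketch of that technique (makeAligned16 is hypothetical, not part of this commit, and assumes Go 1.20+ for unsafe.SliceData/unsafe.Slice):

    // makeAligned16 returns a slice of n RegisterLines whose backing
    // array is guaranteed to start on a 16-byte boundary.
    func makeAligned16(n int) []RegisterLine {
    	const align = 16
    	size := int(unsafe.Sizeof(RegisterLine{}))
    	raw := make([]byte, n*size+align)
    	base := uintptr(unsafe.Pointer(unsafe.SliceData(raw)))
    	off := int((align - base%align) % align)
    	// The returned slice still references raw's allocation, which keeps
    	// the backing memory alive for the garbage collector.
    	return unsafe.Slice((*RegisterLine)(unsafe.Pointer(unsafe.SliceData(raw[off:]))), n)
    }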


@@ -23,9 +23,11 @@ TEXT ·superscalar_run(SB),$0-16
 	CALL AX

 	// todo: not supported by golang
 	// prefetchw BYTE PTR [rsi]
 	// PREFETCHW 0(SI)
-	PREFETCHT0 0(SI)
+	BYTE $0x0F
+	BYTE $0x0D
+	BYTE $0x0E

 	// move registers back to register line
 	MOVQ R8, 0(SI)
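Note: the Go assembler had no PREFETCHW mnemonic, so the instruction is hand-encoded: 0F 0D is the opcode, the /1 opcode extension selects prefetchw, and the ModRM byte picks the base register. A sketch of how the 0x0E here (and the 0x08 used later for [rax]) falls out (helper name is illustrative):

    package main

    import "fmt"

    // modRM packs an x86 ModRM byte: 2-bit mod (addressing mode),
    // 3-bit reg (here the /1 opcode extension), 3-bit rm (base register).
    func modRM(mod, reg, rm byte) byte { return mod<<6 | reg<<3 | rm }

    func main() {
    	// mod=00 (indirect, no displacement), reg=001 (/1 = prefetchw)
    	fmt.Printf("%#02x\n", modRM(0b00, 0b001, 0b110)) // rm=110 (rsi) -> 0x0e
    	fmt.Printf("%#02x\n", modRM(0b00, 0b001, 0b000)) // rm=000 (rax) -> 0x08
    }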

vm.go

@@ -39,7 +39,7 @@ import (
 import "golang.org/x/crypto/blake2b"

 type VM struct {
-	pad ScratchPad
+	pad *ScratchPad

 	flags Flags
@@ -48,7 +48,7 @@ type VM struct {
 	hashState [blake2b.Size]byte

-	registerFile RegisterFile
+	registerFile *RegisterFile

 	AES aes.AES
@@ -92,11 +92,16 @@ func NewVM(flags Flags, cache *Cache, dataset *Dataset) (*VM, error) {
 	}

 	vm := &VM{
-		Cache:   cache,
-		Dataset: dataset,
-		flags:   flags,
+		Cache:        cache,
+		Dataset:      dataset,
+		flags:        flags,
+		pad:          new(ScratchPad),
+		registerFile: new(RegisterFile),
 	}

+	assertAlignedTo16(uintptr(unsafe.Pointer(vm.pad)))
+	assertAlignedTo16(uintptr(unsafe.Pointer(vm.registerFile)))
+
 	if flags.Has(RANDOMX_FLAG_HARD_AES) {
 		vm.AES = aes.NewHardAES()
 	}
@@ -128,7 +133,7 @@ func (vm *VM) run() {
 	// do more initialization before we run

-	reg := &vm.registerFile
+	reg := vm.registerFile
 	reg.Clear()

 	// initialize constant registers
@@ -178,7 +183,7 @@ func (vm *VM) run() {
 		jitProgram = vm.program.generateCode(vm.jitProgram, &readReg)
 	}

-	vm.jitProgram.ExecuteFull(reg, &vm.pad, &vm.Dataset.Memory()[datasetOffset/CacheLineSize], RANDOMX_PROGRAM_ITERATIONS, ma, mx, eMask)
+	vm.jitProgram.ExecuteFull(reg, vm.pad, &vm.Dataset.Memory()[datasetOffset/CacheLineSize], RANDOMX_PROGRAM_ITERATIONS, ma, mx, eMask)
 	return
 }
@@ -215,9 +220,9 @@ func (vm *VM) run() {
 	// run the actual bytecode
 	if jitProgram != nil {
 		// light mode
-		jitProgram.Execute(reg, &vm.pad, eMask)
+		jitProgram.Execute(reg, vm.pad, eMask)
 	} else {
-		vm.program.Execute(reg, &vm.pad, eMask)
+		vm.program.Execute(reg, vm.pad, eMask)
 	}

 	mx ^= uint32(reg.R[readReg[2]] ^ reg.R[readReg[3]])
@@ -271,10 +276,10 @@ func (vm *VM) runLoops() {
 	}

 	// always force a restore before startup
-	ResetRoundingMode(&vm.registerFile)
+	ResetRoundingMode(vm.registerFile)
 	// restore rounding mode at the end
-	defer ResetRoundingMode(&vm.registerFile)
+	defer ResetRoundingMode(vm.registerFile)

 	for chain := 0; chain < RANDOMX_PROGRAM_COUNT-1; chain++ {
 		vm.run()
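Note: moving pad and registerFile behind pointers makes each its own heap allocation. In practice Go's allocator returns at least 16-byte alignment for allocations of this size, but that is runtime behaviour rather than a language guarantee, which is exactly what the assertAlignedTo16 calls defend against. A hypothetical regression test inside the package (not part of this commit):

    func TestVMBuffersAligned(t *testing.T) {
    	for i := 0; i < 1024; i++ {
    		// new(T) goes through the same allocator path NewVM uses
    		pad, rf := new(ScratchPad), new(RegisterFile)
    		if uintptr(unsafe.Pointer(pad))&0b1111 != 0 {
    			t.Fatalf("ScratchPad not 16-byte aligned: %p", pad)
    		}
    		if uintptr(unsafe.Pointer(rf))&0b1111 != 0 {
    			t.Fatalf("RegisterFile not 16-byte aligned: %p", rf)
    		}
    	}
    }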


@@ -86,17 +86,13 @@ xorpd xmm1, xmm5
 xorpd xmm2, xmm6
 xorpd xmm3, xmm7

-movupd xmmword ptr [rcx+0], xmm0
-movupd xmmword ptr [rcx+16], xmm1
-movupd xmmword ptr [rcx+32], xmm2
-movupd xmmword ptr [rcx+48], xmm3
-
-;#movapd xmmword ptr [rcx+0], xmm0
-;#movapd xmmword ptr [rcx+16], xmm1
-;#movapd xmmword ptr [rcx+32], xmm2
-;#movapd xmmword ptr [rcx+48], xmm3
+;# aligned mode
+movapd xmmword ptr [rcx+0], xmm0
+movapd xmmword ptr [rcx+16], xmm1
+movapd xmmword ptr [rcx+32], xmm2
+movapd xmmword ptr [rcx+48], xmm3
 */

-//var programLoopStore = []byte{0x59, 0x4C, 0x89, 0x01, 0x4C, 0x89, 0x49, 0x08, 0x4C, 0x89, 0x51, 0x10, 0x4C, 0x89, 0x59, 0x18, 0x4C, 0x89, 0x61, 0x20, 0x4C, 0x89, 0x69, 0x28, 0x4C, 0x89, 0x71, 0x30, 0x4C, 0x89, 0x79, 0x38, 0x59, 0x66, 0x0F, 0x57, 0xC4, 0x66, 0x0F, 0x57, 0xCD, 0x66, 0x0F, 0x57, 0xD6, 0x66, 0x0F, 0x57, 0xDF, 0x66, 0x0F, 0x29, 0x01, 0x66, 0x0F, 0x29, 0x49, 0x10, 0x66, 0x0F, 0x29, 0x51, 0x20, 0x66, 0x0F, 0x29, 0x59, 0x30}
-var programLoopStore = []byte{0x59, 0x4C, 0x89, 0x01, 0x4C, 0x89, 0x49, 0x08, 0x4C, 0x89, 0x51, 0x10, 0x4C, 0x89, 0x59, 0x18, 0x4C, 0x89, 0x61, 0x20, 0x4C, 0x89, 0x69, 0x28, 0x4C, 0x89, 0x71, 0x30, 0x4C, 0x89, 0x79, 0x38, 0x59, 0x66, 0x0F, 0x57, 0xC4, 0x66, 0x0F, 0x57, 0xCD, 0x66, 0x0F, 0x57, 0xD6, 0x66, 0x0F, 0x57, 0xDF, 0x66, 0x0F, 0x11, 0x01, 0x66, 0x0F, 0x11, 0x49, 0x10, 0x66, 0x0F, 0x11, 0x51, 0x20, 0x66, 0x0F, 0x11, 0x59, 0x30}
+var programLoopStoreAligned = []byte{0x59, 0x4C, 0x89, 0x01, 0x4C, 0x89, 0x49, 0x08, 0x4C, 0x89, 0x51, 0x10, 0x4C, 0x89, 0x59, 0x18, 0x4C, 0x89, 0x61, 0x20, 0x4C, 0x89, 0x69, 0x28, 0x4C, 0x89, 0x71, 0x30, 0x4C, 0x89, 0x79, 0x38, 0x59, 0x66, 0x0F, 0x57, 0xC4, 0x66, 0x0F, 0x57, 0xCD, 0x66, 0x0F, 0x57, 0xD6, 0x66, 0x0F, 0x57, 0xDF, 0x66, 0x0F, 0x29, 0x01, 0x66, 0x0F, 0x29, 0x49, 0x10, 0x66, 0x0F, 0x29, 0x51, 0x20, 0x66, 0x0F, 0x29, 0x59, 0x30}
/*
#define RANDOMX_SCRATCHPAD_L3 2097152
@@ -440,7 +436,7 @@ func (c *ByteCode) generateCode(program []byte, readReg *[4]uint64) []byte {
 	program = append(program, 0xc0+byte(readReg[1]))

 	//todo: prefetch scratchpad

-	program = append(program, programLoopStore...)
+	program = append(program, programLoopStoreAligned...)

 	if BranchesWithin32B {
 		branchBegin := uint32(len(program))

@@ -19,20 +19,20 @@ TEXT ·vm_run(SB),$8-40
 	MOVQ (7*8)(AX), R15

 	// f0-f3
-	VMOVUPD (8*8)(AX), X0
-	VMOVUPD (10*8)(AX), X1
-	VMOVUPD (12*8)(AX), X2
-	VMOVUPD (14*8)(AX), X3
+	VMOVAPD (8*8)(AX), X0
+	VMOVAPD (10*8)(AX), X1
+	VMOVAPD (12*8)(AX), X2
+	VMOVAPD (14*8)(AX), X3
 	// e0-e3
-	VMOVUPD (16*8)(AX), X4
-	VMOVUPD (18*8)(AX), X5
-	VMOVUPD (20*8)(AX), X6
-	VMOVUPD (22*8)(AX), X7
+	VMOVAPD (16*8)(AX), X4
+	VMOVAPD (18*8)(AX), X5
+	VMOVAPD (20*8)(AX), X6
+	VMOVAPD (22*8)(AX), X7
 	// a0-a3
-	VMOVUPD (24*8)(AX), X8
-	VMOVUPD (26*8)(AX), X9
-	VMOVUPD (28*8)(AX), X10
-	VMOVUPD (30*8)(AX), X11
+	VMOVAPD (24*8)(AX), X8
+	VMOVAPD (26*8)(AX), X9
+	VMOVAPD (28*8)(AX), X10
+	VMOVAPD (30*8)(AX), X11

 	// mantissa mask
 	//VMOVQ $0x00ffffffffffffff, $0x00ffffffffffffff, X13
@@ -62,7 +62,12 @@ TEXT ·vm_run(SB),$8-40
 	// move register file back to registers
 	MOVQ rf+0(FP), AX
-	PREFETCHT0 0(AX)
+
+	// prefetchw BYTE PTR [rax]
+	// PREFETCHW 0(AX)
+	BYTE $0x0F
+	BYTE $0x0D
+	BYTE $0x08

 	// r0-r7
 	MOVQ R8, (0*8)(AX)
 	MOVQ R9, (1*8)(AX)
@@ -74,15 +79,15 @@ TEXT ·vm_run(SB),$8-40
 	MOVQ R15, (7*8)(AX)

 	// f0-f3
-	VMOVUPD X0, (8*8)(AX)
-	VMOVUPD X1, (10*8)(AX)
-	VMOVUPD X2, (12*8)(AX)
-	VMOVUPD X3, (14*8)(AX)
+	VMOVAPD X0, (8*8)(AX)
+	VMOVAPD X1, (10*8)(AX)
+	VMOVAPD X2, (12*8)(AX)
+	VMOVAPD X3, (14*8)(AX)
 	// e0-e3
-	VMOVUPD X4, (16*8)(AX)
-	VMOVUPD X5, (18*8)(AX)
-	VMOVUPD X6, (20*8)(AX)
-	VMOVUPD X7, (22*8)(AX)
+	VMOVAPD X4, (16*8)(AX)
+	VMOVAPD X5, (18*8)(AX)
+	VMOVAPD X6, (20*8)(AX)
+	VMOVAPD X7, (22*8)(AX)

 	// a0-a3 are constant, no need to move
@@ -109,20 +114,20 @@ TEXT ·vm_run_full(SB),$32-64
 	MOVQ (7*8)(AX), R15

 	// f0-f3
-	VMOVUPD (8*8)(AX), X0
-	VMOVUPD (10*8)(AX), X1
-	VMOVUPD (12*8)(AX), X2
-	VMOVUPD (14*8)(AX), X3
+	VMOVAPD (8*8)(AX), X0
+	VMOVAPD (10*8)(AX), X1
+	VMOVAPD (12*8)(AX), X2
+	VMOVAPD (14*8)(AX), X3
 	// e0-e3
-	VMOVUPD (16*8)(AX), X4
-	VMOVUPD (18*8)(AX), X5
-	VMOVUPD (20*8)(AX), X6
-	VMOVUPD (22*8)(AX), X7
+	VMOVAPD (16*8)(AX), X4
+	VMOVAPD (18*8)(AX), X5
+	VMOVAPD (20*8)(AX), X6
+	VMOVAPD (22*8)(AX), X7
 	// load constants a0-a3
-	VMOVUPD (24*8)(AX), X8
-	VMOVUPD (26*8)(AX), X9
-	VMOVUPD (28*8)(AX), X10
-	VMOVUPD (30*8)(AX), X11
+	VMOVAPD (24*8)(AX), X8
+	VMOVAPD (26*8)(AX), X9
+	VMOVAPD (28*8)(AX), X10
+	VMOVAPD (30*8)(AX), X11

 	//TODO: rest of init
@@ -166,7 +171,13 @@ TEXT ·vm_run_full(SB),$32-64
 	// move register file back to registers
 	MOVQ rf+0(FP), AX
-	PREFETCHT0 0(AX)
+
+	// prefetchw BYTE PTR [rax]
+	// PREFETCHW 0(AX)
+	BYTE $0x0F
+	BYTE $0x0D
+	BYTE $0x08

 	// r0-r7
 	MOVQ R8, (0*8)(AX)
 	MOVQ R9, (1*8)(AX)
@@ -178,15 +189,15 @@ TEXT ·vm_run_full(SB),$32-64
 	MOVQ R15, (7*8)(AX)

 	// f0-f3
-	VMOVUPD X0, (8*8)(AX)
-	VMOVUPD X1, (10*8)(AX)
-	VMOVUPD X2, (12*8)(AX)
-	VMOVUPD X3, (14*8)(AX)
+	VMOVAPD X0, (8*8)(AX)
+	VMOVAPD X1, (10*8)(AX)
+	VMOVAPD X2, (12*8)(AX)
+	VMOVAPD X3, (14*8)(AX)
 	// e0-e3
-	VMOVUPD X4, (16*8)(AX)
-	VMOVUPD X5, (18*8)(AX)
-	VMOVUPD X6, (20*8)(AX)
-	VMOVUPD X7, (22*8)(AX)
+	VMOVAPD X4, (16*8)(AX)
+	VMOVAPD X5, (18*8)(AX)
+	VMOVAPD X6, (20*8)(AX)
+	VMOVAPD X7, (22*8)(AX)

 	// a0-a3 are constant, no need to move
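Note: unlike VMOVUPD, the VMOVAPD forms fault with #GP if the memory operand is not 16-byte aligned, so the switch only became safe once the allocations above were asserted aligned; the aligned moves also double as a hard runtime check on the RegisterFile address. The (n*8)(AX) offsets imply a layout along these lines (a sketch; field names and element types are illustrative, inferred from the register groups in the comments):

    // 256 bytes total; AX points at the start of this struct, and every
    // 16-byte group stays aligned as long as the struct itself is.
    type registerFileLayout struct {
    	R [8]uint64     // r0-r7 at offsets 0*8 .. 7*8
    	F [4][2]float64 // f0-f3 at offsets 8*8, 10*8, 12*8, 14*8
    	E [4][2]float64 // e0-e3 at offsets 16*8 .. 22*8
    	A [4][2]float64 // a0-a3 at offsets 24*8 .. 30*8
    }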