Ensure 16-byte alignment of the dataset/scratchpad/register file and use more performant fetch/write SIMD instructions on amd64
All checks were successful
continuous-integration/drone/push Build is passing
All checks were successful
continuous-integration/drone/push Build is passing
This commit is contained in:
parent
9826b7beb4
commit
9aa3631f37
7
alignment.go
Normal file
7
alignment.go
Normal file
|
@ -0,0 +1,7 @@
|
|||
package randomx
|
||||
|
||||
// assertAlignedTo16 panics with "not aligned to 16" when any of the low four
// bits of ptr is set, i.e. when ptr is not a multiple of 16. It returns
// normally otherwise.
func assertAlignedTo16(ptr uintptr) {
|
||||
if ptr&0b1111 != 0 {
|
||||
panic("not aligned to 16")
|
||||
}
|
||||
}
|
|
@ -3,6 +3,7 @@ package randomx
|
|||
import (
|
||||
"errors"
|
||||
"sync"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
const DatasetSize = RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE
|
||||
|
@ -31,6 +32,7 @@ func NewDataset(flags Flags) (result *Dataset, err error) {
|
|||
|
||||
//todo: implement large pages, align allocation
|
||||
alignedMemory := make([]RegisterLine, DatasetItemCount)
|
||||
assertAlignedTo16(uintptr(unsafe.Pointer(unsafe.SliceData(alignedMemory))))
|
||||
|
||||
//todo: err on not large pages
|
||||
|
||||
|
|
|
@ -23,9 +23,11 @@ TEXT ·superscalar_run(SB),$0-16
|
|||
CALL AX
|
||||
|
||||
|
||||
// todo: not supported by golang
|
||||
// prefetchw BYTE PTR [rsi]
|
||||
// PREFETCHW 0(SI)
|
||||
PREFETCHT0 0(SI)
|
||||
BYTE $0x0F
|
||||
BYTE $0x0D
|
||||
BYTE $0x0E
|
||||
|
||||
// move registers back to register line
|
||||
MOVQ R8, 0(SI)
|
||||
|
|
27
vm.go
27
vm.go
|
@ -39,7 +39,7 @@ import (
|
|||
import "golang.org/x/crypto/blake2b"
|
||||
|
||||
type VM struct {
|
||||
pad ScratchPad
|
||||
pad *ScratchPad
|
||||
|
||||
flags Flags
|
||||
|
||||
|
@ -48,7 +48,7 @@ type VM struct {
|
|||
|
||||
hashState [blake2b.Size]byte
|
||||
|
||||
registerFile RegisterFile
|
||||
registerFile *RegisterFile
|
||||
|
||||
AES aes.AES
|
||||
|
||||
|
@ -92,11 +92,16 @@ func NewVM(flags Flags, cache *Cache, dataset *Dataset) (*VM, error) {
|
|||
}
|
||||
|
||||
vm := &VM{
|
||||
Cache: cache,
|
||||
Dataset: dataset,
|
||||
flags: flags,
|
||||
Cache: cache,
|
||||
Dataset: dataset,
|
||||
flags: flags,
|
||||
pad: new(ScratchPad),
|
||||
registerFile: new(RegisterFile),
|
||||
}
|
||||
|
||||
assertAlignedTo16(uintptr(unsafe.Pointer(vm.pad)))
|
||||
assertAlignedTo16(uintptr(unsafe.Pointer(vm.registerFile)))
|
||||
|
||||
if flags.Has(RANDOMX_FLAG_HARD_AES) {
|
||||
vm.AES = aes.NewHardAES()
|
||||
}
|
||||
|
@ -128,7 +133,7 @@ func (vm *VM) run() {
|
|||
|
||||
// do more initialization before we run
|
||||
|
||||
reg := &vm.registerFile
|
||||
reg := vm.registerFile
|
||||
reg.Clear()
|
||||
|
||||
// initialize constant registers
|
||||
|
@ -178,7 +183,7 @@ func (vm *VM) run() {
|
|||
jitProgram = vm.program.generateCode(vm.jitProgram, &readReg)
|
||||
}
|
||||
|
||||
vm.jitProgram.ExecuteFull(reg, &vm.pad, &vm.Dataset.Memory()[datasetOffset/CacheLineSize], RANDOMX_PROGRAM_ITERATIONS, ma, mx, eMask)
|
||||
vm.jitProgram.ExecuteFull(reg, vm.pad, &vm.Dataset.Memory()[datasetOffset/CacheLineSize], RANDOMX_PROGRAM_ITERATIONS, ma, mx, eMask)
|
||||
return
|
||||
}
|
||||
}
|
||||
|
@ -215,9 +220,9 @@ func (vm *VM) run() {
|
|||
// run the actual bytecode
|
||||
if jitProgram != nil {
|
||||
// light mode
|
||||
jitProgram.Execute(reg, &vm.pad, eMask)
|
||||
jitProgram.Execute(reg, vm.pad, eMask)
|
||||
} else {
|
||||
vm.program.Execute(reg, &vm.pad, eMask)
|
||||
vm.program.Execute(reg, vm.pad, eMask)
|
||||
}
|
||||
|
||||
mx ^= uint32(reg.R[readReg[2]] ^ reg.R[readReg[3]])
|
||||
|
@ -271,10 +276,10 @@ func (vm *VM) runLoops() {
|
|||
}
|
||||
|
||||
// always force a restore before startup
|
||||
ResetRoundingMode(&vm.registerFile)
|
||||
ResetRoundingMode(vm.registerFile)
|
||||
|
||||
// restore rounding mode at the end
|
||||
defer ResetRoundingMode(&vm.registerFile)
|
||||
defer ResetRoundingMode(vm.registerFile)
|
||||
|
||||
for chain := 0; chain < RANDOMX_PROGRAM_COUNT-1; chain++ {
|
||||
vm.run()
|
||||
|
|
|
@ -86,17 +86,13 @@ xorpd xmm1, xmm5
|
|||
xorpd xmm2, xmm6
|
||||
xorpd xmm3, xmm7
|
||||
|
||||
movupd xmmword ptr [rcx+0], xmm0
|
||||
movupd xmmword ptr [rcx+16], xmm1
|
||||
movupd xmmword ptr [rcx+32], xmm2
|
||||
movupd xmmword ptr [rcx+48], xmm3
|
||||
;#movapd xmmword ptr [rcx+0], xmm0
|
||||
;#movapd xmmword ptr [rcx+16], xmm1
|
||||
;#movapd xmmword ptr [rcx+32], xmm2
|
||||
;#movapd xmmword ptr [rcx+48], xmm3
|
||||
;# aligned mode
|
||||
movapd xmmword ptr [rcx+0], xmm0
|
||||
movapd xmmword ptr [rcx+16], xmm1
|
||||
movapd xmmword ptr [rcx+32], xmm2
|
||||
movapd xmmword ptr [rcx+48], xmm3
|
||||
*/
|
||||
//var programLoopStore = []byte{0x59, 0x4C, 0x89, 0x01, 0x4C, 0x89, 0x49, 0x08, 0x4C, 0x89, 0x51, 0x10, 0x4C, 0x89, 0x59, 0x18, 0x4C, 0x89, 0x61, 0x20, 0x4C, 0x89, 0x69, 0x28, 0x4C, 0x89, 0x71, 0x30, 0x4C, 0x89, 0x79, 0x38, 0x59, 0x66, 0x0F, 0x57, 0xC4, 0x66, 0x0F, 0x57, 0xCD, 0x66, 0x0F, 0x57, 0xD6, 0x66, 0x0F, 0x57, 0xDF, 0x66, 0x0F, 0x29, 0x01, 0x66, 0x0F, 0x29, 0x49, 0x10, 0x66, 0x0F, 0x29, 0x51, 0x20, 0x66, 0x0F, 0x29, 0x59, 0x30}
|
||||
var programLoopStore = []byte{0x59, 0x4C, 0x89, 0x01, 0x4C, 0x89, 0x49, 0x08, 0x4C, 0x89, 0x51, 0x10, 0x4C, 0x89, 0x59, 0x18, 0x4C, 0x89, 0x61, 0x20, 0x4C, 0x89, 0x69, 0x28, 0x4C, 0x89, 0x71, 0x30, 0x4C, 0x89, 0x79, 0x38, 0x59, 0x66, 0x0F, 0x57, 0xC4, 0x66, 0x0F, 0x57, 0xCD, 0x66, 0x0F, 0x57, 0xD6, 0x66, 0x0F, 0x57, 0xDF, 0x66, 0x0F, 0x11, 0x01, 0x66, 0x0F, 0x11, 0x49, 0x10, 0x66, 0x0F, 0x11, 0x51, 0x20, 0x66, 0x0F, 0x11, 0x59, 0x30}
|
||||
var programLoopStoreAligned = []byte{0x59, 0x4C, 0x89, 0x01, 0x4C, 0x89, 0x49, 0x08, 0x4C, 0x89, 0x51, 0x10, 0x4C, 0x89, 0x59, 0x18, 0x4C, 0x89, 0x61, 0x20, 0x4C, 0x89, 0x69, 0x28, 0x4C, 0x89, 0x71, 0x30, 0x4C, 0x89, 0x79, 0x38, 0x59, 0x66, 0x0F, 0x57, 0xC4, 0x66, 0x0F, 0x57, 0xCD, 0x66, 0x0F, 0x57, 0xD6, 0x66, 0x0F, 0x57, 0xDF, 0x66, 0x0F, 0x29, 0x01, 0x66, 0x0F, 0x29, 0x49, 0x10, 0x66, 0x0F, 0x29, 0x51, 0x20, 0x66, 0x0F, 0x29, 0x59, 0x30}
|
||||
|
||||
/*
|
||||
#define RANDOMX_SCRATCHPAD_L3 2097152
|
||||
|
@ -440,7 +436,7 @@ func (c *ByteCode) generateCode(program []byte, readReg *[4]uint64) []byte {
|
|||
program = append(program, 0xc0+byte(readReg[1]))
|
||||
//todo: prefetch scratchpad
|
||||
|
||||
program = append(program, programLoopStore...)
|
||||
program = append(program, programLoopStoreAligned...)
|
||||
|
||||
if BranchesWithin32B {
|
||||
branchBegin := uint32(len(program))
|
||||
|
|
|
@ -19,20 +19,20 @@ TEXT ·vm_run(SB),$8-40
|
|||
MOVQ (7*8)(AX), R15
|
||||
|
||||
// f0-f3
|
||||
VMOVUPD (8*8)(AX), X0
|
||||
VMOVUPD (10*8)(AX), X1
|
||||
VMOVUPD (12*8)(AX), X2
|
||||
VMOVUPD (14*8)(AX), X3
|
||||
VMOVAPD (8*8)(AX), X0
|
||||
VMOVAPD (10*8)(AX), X1
|
||||
VMOVAPD (12*8)(AX), X2
|
||||
VMOVAPD (14*8)(AX), X3
|
||||
// e0-e3
|
||||
VMOVUPD (16*8)(AX), X4
|
||||
VMOVUPD (18*8)(AX), X5
|
||||
VMOVUPD (20*8)(AX), X6
|
||||
VMOVUPD (22*8)(AX), X7
|
||||
VMOVAPD (16*8)(AX), X4
|
||||
VMOVAPD (18*8)(AX), X5
|
||||
VMOVAPD (20*8)(AX), X6
|
||||
VMOVAPD (22*8)(AX), X7
|
||||
// a0-a3
|
||||
VMOVUPD (24*8)(AX), X8
|
||||
VMOVUPD (26*8)(AX), X9
|
||||
VMOVUPD (28*8)(AX), X10
|
||||
VMOVUPD (30*8)(AX), X11
|
||||
VMOVAPD (24*8)(AX), X8
|
||||
VMOVAPD (26*8)(AX), X9
|
||||
VMOVAPD (28*8)(AX), X10
|
||||
VMOVAPD (30*8)(AX), X11
|
||||
|
||||
// mantissa mask
|
||||
//VMOVQ $0x00ffffffffffffff, $0x00ffffffffffffff, X13
|
||||
|
@ -62,7 +62,12 @@ TEXT ·vm_run(SB),$8-40
|
|||
// move register file back to registers
|
||||
MOVQ rf+0(FP), AX
|
||||
|
||||
PREFETCHT0 0(AX)
|
||||
// prefetchw BYTE PTR [rax]
|
||||
// PREFETCHW 0(AX)
|
||||
BYTE $0x0F
|
||||
BYTE $0x0D
|
||||
BYTE $0x08
|
||||
|
||||
// r0-r7
|
||||
MOVQ R8, (0*8)(AX)
|
||||
MOVQ R9, (1*8)(AX)
|
||||
|
@ -74,15 +79,15 @@ TEXT ·vm_run(SB),$8-40
|
|||
MOVQ R15, (7*8)(AX)
|
||||
|
||||
// f0-f3
|
||||
VMOVUPD X0, (8*8)(AX)
|
||||
VMOVUPD X1, (10*8)(AX)
|
||||
VMOVUPD X2, (12*8)(AX)
|
||||
VMOVUPD X3, (14*8)(AX)
|
||||
VMOVAPD X0, (8*8)(AX)
|
||||
VMOVAPD X1, (10*8)(AX)
|
||||
VMOVAPD X2, (12*8)(AX)
|
||||
VMOVAPD X3, (14*8)(AX)
|
||||
// e0-e3
|
||||
VMOVUPD X4, (16*8)(AX)
|
||||
VMOVUPD X5, (18*8)(AX)
|
||||
VMOVUPD X6, (20*8)(AX)
|
||||
VMOVUPD X7, (22*8)(AX)
|
||||
VMOVAPD X4, (16*8)(AX)
|
||||
VMOVAPD X5, (18*8)(AX)
|
||||
VMOVAPD X6, (20*8)(AX)
|
||||
VMOVAPD X7, (22*8)(AX)
|
||||
|
||||
// a0-a3 are constant, no need to move
|
||||
|
||||
|
@ -109,20 +114,20 @@ TEXT ·vm_run_full(SB),$32-64
|
|||
MOVQ (7*8)(AX), R15
|
||||
|
||||
// f0-f3
|
||||
VMOVUPD (8*8)(AX), X0
|
||||
VMOVUPD (10*8)(AX), X1
|
||||
VMOVUPD (12*8)(AX), X2
|
||||
VMOVUPD (14*8)(AX), X3
|
||||
VMOVAPD (8*8)(AX), X0
|
||||
VMOVAPD (10*8)(AX), X1
|
||||
VMOVAPD (12*8)(AX), X2
|
||||
VMOVAPD (14*8)(AX), X3
|
||||
// e0-e3
|
||||
VMOVUPD (16*8)(AX), X4
|
||||
VMOVUPD (18*8)(AX), X5
|
||||
VMOVUPD (20*8)(AX), X6
|
||||
VMOVUPD (22*8)(AX), X7
|
||||
VMOVAPD (16*8)(AX), X4
|
||||
VMOVAPD (18*8)(AX), X5
|
||||
VMOVAPD (20*8)(AX), X6
|
||||
VMOVAPD (22*8)(AX), X7
|
||||
// load constants a0-a3
|
||||
VMOVUPD (24*8)(AX), X8
|
||||
VMOVUPD (26*8)(AX), X9
|
||||
VMOVUPD (28*8)(AX), X10
|
||||
VMOVUPD (30*8)(AX), X11
|
||||
VMOVAPD (24*8)(AX), X8
|
||||
VMOVAPD (26*8)(AX), X9
|
||||
VMOVAPD (28*8)(AX), X10
|
||||
VMOVAPD (30*8)(AX), X11
|
||||
|
||||
//TODO: rest of init
|
||||
|
||||
|
@ -166,7 +171,13 @@ TEXT ·vm_run_full(SB),$32-64
|
|||
// move register file back to registers
|
||||
MOVQ rf+0(FP), AX
|
||||
|
||||
PREFETCHT0 0(AX)
|
||||
|
||||
// prefetchw BYTE PTR [rax]
|
||||
// PREFETCHW 0(AX)
|
||||
BYTE $0x0F
|
||||
BYTE $0x0D
|
||||
BYTE $0x08
|
||||
|
||||
// r0-r7
|
||||
MOVQ R8, (0*8)(AX)
|
||||
MOVQ R9, (1*8)(AX)
|
||||
|
@ -178,15 +189,15 @@ TEXT ·vm_run_full(SB),$32-64
|
|||
MOVQ R15, (7*8)(AX)
|
||||
|
||||
// f0-f3
|
||||
VMOVUPD X0, (8*8)(AX)
|
||||
VMOVUPD X1, (10*8)(AX)
|
||||
VMOVUPD X2, (12*8)(AX)
|
||||
VMOVUPD X3, (14*8)(AX)
|
||||
VMOVAPD X0, (8*8)(AX)
|
||||
VMOVAPD X1, (10*8)(AX)
|
||||
VMOVAPD X2, (12*8)(AX)
|
||||
VMOVAPD X3, (14*8)(AX)
|
||||
// e0-e3
|
||||
VMOVUPD X4, (16*8)(AX)
|
||||
VMOVUPD X5, (18*8)(AX)
|
||||
VMOVUPD X6, (20*8)(AX)
|
||||
VMOVUPD X7, (22*8)(AX)
|
||||
VMOVAPD X4, (16*8)(AX)
|
||||
VMOVAPD X5, (18*8)(AX)
|
||||
VMOVAPD X6, (20*8)(AX)
|
||||
VMOVAPD X7, (22*8)(AX)
|
||||
|
||||
// a0-a3 are constant, no need to move
|
||||
|
||||
|
|
Loading…
Reference in a new issue