Compare commits

...

2 commits

10 changed files with 201 additions and 69 deletions

7
alignment.go Normal file
View file

@ -0,0 +1,7 @@
package randomx
// assertAlignedTo16 panics with "not aligned to 16" unless ptr is a multiple
// of 16. It returns normally for aligned addresses, including zero.
func assertAlignedTo16(ptr uintptr) {
	const alignment = 16

	// Aligned addresses have no remainder modulo the alignment.
	if ptr%alignment == 0 {
		return
	}
	panic("not aligned to 16")
}

View file

@ -3,6 +3,7 @@ package randomx
import (
"errors"
"sync"
"unsafe"
)
const DatasetSize = RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE
@ -31,6 +32,7 @@ func NewDataset(flags Flags) (result *Dataset, err error) {
//todo: implement large pages, align allocation
alignedMemory := make([]RegisterLine, DatasetItemCount)
assertAlignedTo16(uintptr(unsafe.Pointer(unsafe.SliceData(alignedMemory))))
//todo: err on not large pages

View file

@ -23,7 +23,7 @@ func NewHardAES() AES {
return nil
}
func (h hardAES) HashAes1Rx4(input []byte, output *[64]byte) {
func (aes hardAES) HashAes1Rx4(input []byte, output *[64]byte) {
if len(input)%len(output) != 0 {
panic("unsupported")
}
@ -31,7 +31,7 @@ func (h hardAES) HashAes1Rx4(input []byte, output *[64]byte) {
asm.HashAes1Rx4(&keys.AesHash1R_State, &keys.AesHash1R_XKeys, output, unsafe.SliceData(input), uint64(len(input)))
}
func (h hardAES) FillAes1Rx4(state *[64]byte, output []byte) {
func (aes hardAES) FillAes1Rx4(state *[64]byte, output []byte) {
if len(output)%len(state) != 0 {
panic("unsupported")
}
@ -42,7 +42,7 @@ func (h hardAES) FillAes1Rx4(state *[64]byte, output []byte) {
runtime.KeepAlive(state)
}
func (h hardAES) FillAes4Rx4(state [64]byte, output []byte) {
func (aes hardAES) FillAes4Rx4(state [64]byte, output []byte) {
if len(output)%len(state) != 0 {
panic("unsupported")
}
@ -61,3 +61,9 @@ func (h hardAES) FillAes4Rx4(state [64]byte, output []byte) {
copy(output[outptr:], state[:])
}
}
// HashAndFillAes1Rx4 hashes scratchpad into output and then refills scratchpad
// using fillState.
//
// It is currently a composition of HashAes1Rx4 followed by FillAes1Rx4, so the
// scratchpad is walked twice. Both of those calls panic with "unsupported" when
// len(scratchpad) is not a multiple of 64.
func (aes hardAES) HashAndFillAes1Rx4(scratchpad []byte, output *[64]byte, fillState *[64]byte) {
//TODO: replace the two calls below with a fused single-pass implementation
// The hash must run first: it reads the scratchpad contents that the fill overwrites.
aes.HashAes1Rx4(scratchpad, output)
aes.FillAes1Rx4(fillState, scratchpad)
}

View file

@ -28,6 +28,9 @@ type AES interface {
// calls to this function.
FillAes1Rx4(state *[64]byte, output []byte)
// HashAndFillAes1Rx4 hashes scratchpad into output and refills scratchpad from fillState in one sweep
HashAndFillAes1Rx4(scratchpad []byte, output *[64]byte, fillState *[64]byte)
// FillAes4Rx4 used to generate final program
//
// 'state' is copied when calling

View file

@ -67,3 +67,9 @@ func (aes softAES) FillAes4Rx4(state [64]byte, output []byte) {
copy(output[outptr:], state[:])
}
}
// HashAndFillAes1Rx4 hashes scratchpad into output and then refills scratchpad
// using fillState.
//
// It is currently a composition of HashAes1Rx4 followed by FillAes1Rx4, so the
// scratchpad is walked twice.
func (aes softAES) HashAndFillAes1Rx4(scratchpad []byte, output *[64]byte, fillState *[64]byte) {
//TODO: replace the two calls below with a fused single-pass implementation
// The hash must run first: it reads the scratchpad contents that the fill overwrites.
aes.HashAes1Rx4(scratchpad, output)
aes.FillAes1Rx4(fillState, scratchpad)
}

View file

@ -143,6 +143,60 @@ func Test_RandomXLight(t *testing.T) {
}
}
// Test_RandomXBatch exercises the pipelined hashing API
// (CalculateHashFirst / CalculateHashNext / CalculateHashLast) in light mode
// (no dataset) and compares each produced hash with the expected value of the
// corresponding entry in Tests[1:4]. The cache is initialised once, with the
// key of the first of those entries.
func Test_RandomXBatch(t *testing.T) {
	t.Parallel()

	for _, name := range []string{"softaes", "hardaes"} {
		t.Run(name, func(t *testing.T) {
			t.Parallel()

			tFlags, skip := testFlags(t.Name(), 0)
			if skip {
				t.Skip("not supported on this platform")
			}

			cache := NewCache(tFlags)
			if cache == nil {
				t.Fatal("nil cache")
			}
			defer func() {
				if closeErr := cache.Close(); closeErr != nil {
					t.Error(closeErr)
				}
			}()

			vectors := Tests[1:4]
			cache.Init(vectors[0].key)

			vm, err := NewVM(tFlags, cache, nil)
			if err != nil {
				t.Fatal(err)
			}
			defer func() {
				if closeErr := vm.Close(); closeErr != nil {
					t.Error(closeErr)
				}
			}()

			// Each Next/Last call emits the hash of the input submitted by the
			// call before it, so results land one step behind the inputs.
			var hashes [3][RANDOMX_HASH_SIZE]byte
			vm.CalculateHashFirst(vectors[0].input)
			vm.CalculateHashNext(vectors[1].input, &hashes[0])
			vm.CalculateHashNext(vectors[2].input, &hashes[1])
			vm.CalculateHashLast(&hashes[2])

			for i, vector := range vectors {
				actual := hex.EncodeToString(hashes[i][:])
				if actual == vector.expected {
					continue
				}
				t.Errorf("key=%v, input=%v", vector.key, vector.input)
				t.Errorf("expected=%s, actual=%s", vector.expected, actual)
				t.FailNow()
			}
		})
	}
}
func Test_RandomXFull(t *testing.T) {
if testing.Short() {
t.Skip("Skipping full mode with -short")

View file

@ -23,9 +23,11 @@ TEXT ·superscalar_run(SB),$0-16
CALL AX
// todo: not supported by golang
// prefetchw BYTE PTR [rsi]
// PREFETCHW 0(SI)
PREFETCHT0 0(SI)
BYTE $0x0F
BYTE $0x0D
BYTE $0x0E
// move registers back to register line
MOVQ R8, 0(SI)

67
vm.go
View file

@ -39,7 +39,7 @@ import (
import "golang.org/x/crypto/blake2b"
type VM struct {
pad ScratchPad
pad *ScratchPad
flags Flags
@ -48,7 +48,7 @@ type VM struct {
hashState [blake2b.Size]byte
registerFile RegisterFile
registerFile *RegisterFile
AES aes.AES
@ -92,11 +92,16 @@ func NewVM(flags Flags, cache *Cache, dataset *Dataset) (*VM, error) {
}
vm := &VM{
Cache: cache,
Dataset: dataset,
flags: flags,
Cache: cache,
Dataset: dataset,
flags: flags,
pad: new(ScratchPad),
registerFile: new(RegisterFile),
}
assertAlignedTo16(uintptr(unsafe.Pointer(vm.pad)))
assertAlignedTo16(uintptr(unsafe.Pointer(vm.registerFile)))
if flags.Has(RANDOMX_FLAG_HARD_AES) {
vm.AES = aes.NewHardAES()
}
@ -128,7 +133,7 @@ func (vm *VM) run() {
// do more initialization before we run
reg := &vm.registerFile
reg := vm.registerFile
reg.Clear()
// initialize constant registers
@ -178,7 +183,7 @@ func (vm *VM) run() {
jitProgram = vm.program.generateCode(vm.jitProgram, &readReg)
}
vm.jitProgram.ExecuteFull(reg, &vm.pad, &vm.Dataset.Memory()[datasetOffset/CacheLineSize], RANDOMX_PROGRAM_ITERATIONS, ma, mx, eMask)
vm.jitProgram.ExecuteFull(reg, vm.pad, &vm.Dataset.Memory()[datasetOffset/CacheLineSize], RANDOMX_PROGRAM_ITERATIONS, ma, mx, eMask)
return
}
}
@ -215,9 +220,9 @@ func (vm *VM) run() {
// run the actual bytecode
if jitProgram != nil {
// light mode
jitProgram.Execute(reg, &vm.pad, eMask)
jitProgram.Execute(reg, vm.pad, eMask)
} else {
vm.program.Execute(reg, &vm.pad, eMask)
vm.program.Execute(reg, vm.pad, eMask)
}
mx ^= uint32(reg.R[readReg[2]] ^ reg.R[readReg[3]])
@ -271,10 +276,10 @@ func (vm *VM) runLoops() {
}
// always force a restore before startup
ResetRoundingMode(&vm.registerFile)
ResetRoundingMode(vm.registerFile)
// restore rounding mode at the end
defer ResetRoundingMode(&vm.registerFile)
defer ResetRoundingMode(vm.registerFile)
for chain := 0; chain < RANDOMX_PROGRAM_COUNT-1; chain++ {
vm.run()
@ -327,6 +332,46 @@ func (vm *VM) CalculateHash(input []byte, output *[RANDOMX_HASH_SIZE]byte) {
*output = blake2b.Sum256(regMem[:])
}
// CalculateHashFirst will begin a hash calculation.
//
// It stores the BLAKE2b-512 digest of input in vm.hashState and initialises the
// scratchpad from that state. No hash is returned here: the result for this
// input is written by the following CalculateHashNext or CalculateHashLast call.
func (vm *VM) CalculateHashFirst(input []byte) {
vm.hashState = blake2b.Sum512(input)
vm.initScratchpad(&vm.hashState)
}
// CalculateHashNext will output the hash value of the previous input and begin the calculation of the next hash.
//
// nextInput is the input whose hash will be produced by the following
// CalculateHashNext or CalculateHashLast call. output receives the hash of the
// input given to the preceding CalculateHashFirst or CalculateHashNext call.
func (vm *VM) CalculateHashNext(nextInput []byte, output *[RANDOMX_HASH_SIZE]byte) {
	vm.runLoops()

	regMem := vm.registerFile.Memory()

	// Seed state for the next hash. The scratchpad is deliberately not hashed
	// into vm.hashState first: that digest would be overwritten right here,
	// and HashAndFillAes1Rx4 below already hashes the scratchpad.
	vm.hashState = blake2b.Sum512(nextInput)

	// Finish current hash and fill the scratchpad for the next hash at the same time.
	// The scratchpad digest is written straight onto register A (the trailing
	// 64 bytes of the register file); the scratchpad is refilled from vm.hashState.
	vm.AES.HashAndFillAes1Rx4(vm.pad[:], (*[64]byte)(unsafe.Pointer(unsafe.SliceData(regMem[RegisterFileSize-RegistersCountFloat*2*8:]))), &vm.hashState)
	// regMem is reached through an unsafe pointer above; keep it alive until the call is done.
	runtime.KeepAlive(regMem)

	// write R, F, E, A registers
	*output = blake2b.Sum256(regMem[:])
}
// CalculateHashLast will output the hash value of the previous input.
//
// It runs the program loops, hashes the scratchpad into vm.hashState, copies
// that state over register A in the register file, and writes the BLAKE2b-256
// digest of the register file to output. No new calculation is started.
func (vm *VM) CalculateHashLast(output *[RANDOMX_HASH_SIZE]byte) {
	vm.runLoops()

	// The scratchpad digest becomes the contents of register A.
	vm.AES.HashAes1Rx4(vm.pad[:], &vm.hashState)

	// Register A is stored at this offset within the register file.
	registerAOffset := RegisterFileSize - RegistersCountFloat*2*8
	registers := vm.registerFile.Memory()
	copy(registers[registerAOffset:], vm.hashState[:])

	// The final hash covers the R, F, E and A registers.
	*output = blake2b.Sum256(registers[:])
}
// Close Releases all memory occupied by the structure.
func (vm *VM) Close() error {
if vm.jitProgram != nil {

View file

@ -86,17 +86,13 @@ xorpd xmm1, xmm5
xorpd xmm2, xmm6
xorpd xmm3, xmm7
movupd xmmword ptr [rcx+0], xmm0
movupd xmmword ptr [rcx+16], xmm1
movupd xmmword ptr [rcx+32], xmm2
movupd xmmword ptr [rcx+48], xmm3
;#movapd xmmword ptr [rcx+0], xmm0
;#movapd xmmword ptr [rcx+16], xmm1
;#movapd xmmword ptr [rcx+32], xmm2
;#movapd xmmword ptr [rcx+48], xmm3
;# aligned mode
movapd xmmword ptr [rcx+0], xmm0
movapd xmmword ptr [rcx+16], xmm1
movapd xmmword ptr [rcx+32], xmm2
movapd xmmword ptr [rcx+48], xmm3
*/
//var programLoopStore = []byte{0x59, 0x4C, 0x89, 0x01, 0x4C, 0x89, 0x49, 0x08, 0x4C, 0x89, 0x51, 0x10, 0x4C, 0x89, 0x59, 0x18, 0x4C, 0x89, 0x61, 0x20, 0x4C, 0x89, 0x69, 0x28, 0x4C, 0x89, 0x71, 0x30, 0x4C, 0x89, 0x79, 0x38, 0x59, 0x66, 0x0F, 0x57, 0xC4, 0x66, 0x0F, 0x57, 0xCD, 0x66, 0x0F, 0x57, 0xD6, 0x66, 0x0F, 0x57, 0xDF, 0x66, 0x0F, 0x29, 0x01, 0x66, 0x0F, 0x29, 0x49, 0x10, 0x66, 0x0F, 0x29, 0x51, 0x20, 0x66, 0x0F, 0x29, 0x59, 0x30}
var programLoopStore = []byte{0x59, 0x4C, 0x89, 0x01, 0x4C, 0x89, 0x49, 0x08, 0x4C, 0x89, 0x51, 0x10, 0x4C, 0x89, 0x59, 0x18, 0x4C, 0x89, 0x61, 0x20, 0x4C, 0x89, 0x69, 0x28, 0x4C, 0x89, 0x71, 0x30, 0x4C, 0x89, 0x79, 0x38, 0x59, 0x66, 0x0F, 0x57, 0xC4, 0x66, 0x0F, 0x57, 0xCD, 0x66, 0x0F, 0x57, 0xD6, 0x66, 0x0F, 0x57, 0xDF, 0x66, 0x0F, 0x11, 0x01, 0x66, 0x0F, 0x11, 0x49, 0x10, 0x66, 0x0F, 0x11, 0x51, 0x20, 0x66, 0x0F, 0x11, 0x59, 0x30}
var programLoopStoreAligned = []byte{0x59, 0x4C, 0x89, 0x01, 0x4C, 0x89, 0x49, 0x08, 0x4C, 0x89, 0x51, 0x10, 0x4C, 0x89, 0x59, 0x18, 0x4C, 0x89, 0x61, 0x20, 0x4C, 0x89, 0x69, 0x28, 0x4C, 0x89, 0x71, 0x30, 0x4C, 0x89, 0x79, 0x38, 0x59, 0x66, 0x0F, 0x57, 0xC4, 0x66, 0x0F, 0x57, 0xCD, 0x66, 0x0F, 0x57, 0xD6, 0x66, 0x0F, 0x57, 0xDF, 0x66, 0x0F, 0x29, 0x01, 0x66, 0x0F, 0x29, 0x49, 0x10, 0x66, 0x0F, 0x29, 0x51, 0x20, 0x66, 0x0F, 0x29, 0x59, 0x30}
/*
#define RANDOMX_SCRATCHPAD_L3 2097152
@ -440,7 +436,7 @@ func (c *ByteCode) generateCode(program []byte, readReg *[4]uint64) []byte {
program = append(program, 0xc0+byte(readReg[1]))
//todo: prefetch scratchpad
program = append(program, programLoopStore...)
program = append(program, programLoopStoreAligned...)
if BranchesWithin32B {
branchBegin := uint32(len(program))

View file

@ -19,20 +19,20 @@ TEXT ·vm_run(SB),$8-40
MOVQ (7*8)(AX), R15
// f0-f3
VMOVUPD (8*8)(AX), X0
VMOVUPD (10*8)(AX), X1
VMOVUPD (12*8)(AX), X2
VMOVUPD (14*8)(AX), X3
VMOVAPD (8*8)(AX), X0
VMOVAPD (10*8)(AX), X1
VMOVAPD (12*8)(AX), X2
VMOVAPD (14*8)(AX), X3
// e0-e3
VMOVUPD (16*8)(AX), X4
VMOVUPD (18*8)(AX), X5
VMOVUPD (20*8)(AX), X6
VMOVUPD (22*8)(AX), X7
VMOVAPD (16*8)(AX), X4
VMOVAPD (18*8)(AX), X5
VMOVAPD (20*8)(AX), X6
VMOVAPD (22*8)(AX), X7
// a0-a3
VMOVUPD (24*8)(AX), X8
VMOVUPD (26*8)(AX), X9
VMOVUPD (28*8)(AX), X10
VMOVUPD (30*8)(AX), X11
VMOVAPD (24*8)(AX), X8
VMOVAPD (26*8)(AX), X9
VMOVAPD (28*8)(AX), X10
VMOVAPD (30*8)(AX), X11
// mantissa mask
//VMOVQ $0x00ffffffffffffff, $0x00ffffffffffffff, X13
@ -62,7 +62,12 @@ TEXT ·vm_run(SB),$8-40
// move register file back to registers
MOVQ rf+0(FP), AX
PREFETCHT0 0(AX)
// prefetchw BYTE PTR [rax]
// PREFETCHW 0(AX)
BYTE $0x0F
BYTE $0x0D
BYTE $0x08
// r0-r7
MOVQ R8, (0*8)(AX)
MOVQ R9, (1*8)(AX)
@ -74,15 +79,15 @@ TEXT ·vm_run(SB),$8-40
MOVQ R15, (7*8)(AX)
// f0-f3
VMOVUPD X0, (8*8)(AX)
VMOVUPD X1, (10*8)(AX)
VMOVUPD X2, (12*8)(AX)
VMOVUPD X3, (14*8)(AX)
VMOVAPD X0, (8*8)(AX)
VMOVAPD X1, (10*8)(AX)
VMOVAPD X2, (12*8)(AX)
VMOVAPD X3, (14*8)(AX)
// e0-e3
VMOVUPD X4, (16*8)(AX)
VMOVUPD X5, (18*8)(AX)
VMOVUPD X6, (20*8)(AX)
VMOVUPD X7, (22*8)(AX)
VMOVAPD X4, (16*8)(AX)
VMOVAPD X5, (18*8)(AX)
VMOVAPD X6, (20*8)(AX)
VMOVAPD X7, (22*8)(AX)
// a0-a3 are constant, no need to move
@ -109,20 +114,20 @@ TEXT ·vm_run_full(SB),$32-64
MOVQ (7*8)(AX), R15
// f0-f3
VMOVUPD (8*8)(AX), X0
VMOVUPD (10*8)(AX), X1
VMOVUPD (12*8)(AX), X2
VMOVUPD (14*8)(AX), X3
VMOVAPD (8*8)(AX), X0
VMOVAPD (10*8)(AX), X1
VMOVAPD (12*8)(AX), X2
VMOVAPD (14*8)(AX), X3
// e0-e3
VMOVUPD (16*8)(AX), X4
VMOVUPD (18*8)(AX), X5
VMOVUPD (20*8)(AX), X6
VMOVUPD (22*8)(AX), X7
VMOVAPD (16*8)(AX), X4
VMOVAPD (18*8)(AX), X5
VMOVAPD (20*8)(AX), X6
VMOVAPD (22*8)(AX), X7
// load constants a0-a3
VMOVUPD (24*8)(AX), X8
VMOVUPD (26*8)(AX), X9
VMOVUPD (28*8)(AX), X10
VMOVUPD (30*8)(AX), X11
VMOVAPD (24*8)(AX), X8
VMOVAPD (26*8)(AX), X9
VMOVAPD (28*8)(AX), X10
VMOVAPD (30*8)(AX), X11
//TODO: rest of init
@ -166,7 +171,13 @@ TEXT ·vm_run_full(SB),$32-64
// move register file back to registers
MOVQ rf+0(FP), AX
PREFETCHT0 0(AX)
// prefetchw BYTE PTR [rax]
// PREFETCHW 0(AX)
BYTE $0x0F
BYTE $0x0D
BYTE $0x08
// r0-r7
MOVQ R8, (0*8)(AX)
MOVQ R9, (1*8)(AX)
@ -178,15 +189,15 @@ TEXT ·vm_run_full(SB),$32-64
MOVQ R15, (7*8)(AX)
// f0-f3
VMOVUPD X0, (8*8)(AX)
VMOVUPD X1, (10*8)(AX)
VMOVUPD X2, (12*8)(AX)
VMOVUPD X3, (14*8)(AX)
VMOVAPD X0, (8*8)(AX)
VMOVAPD X1, (10*8)(AX)
VMOVAPD X2, (12*8)(AX)
VMOVAPD X3, (14*8)(AX)
// e0-e3
VMOVUPD X4, (16*8)(AX)
VMOVUPD X5, (18*8)(AX)
VMOVUPD X6, (20*8)(AX)
VMOVUPD X7, (22*8)(AX)
VMOVAPD X4, (16*8)(AX)
VMOVAPD X5, (18*8)(AX)
VMOVAPD X6, (20*8)(AX)
VMOVAPD X7, (22*8)(AX)
// a0-a3 are constant, no need to move