amd64: Implemented VM JIT
All checks were successful
continuous-integration/drone/push Build is passing

This commit is contained in:
DataHoarder 2024-04-18 12:09:05 +02:00
parent d72726b0fe
commit d20dd880ce
Signed by: DataHoarder
SSH key fingerprint: SHA256:OLTRf6Fl87G52SiR7sWLGNzlJt4WOX+tfI2yxo0z7xk
26 changed files with 849 additions and 103 deletions

View file

@ -8,21 +8,22 @@ This package implements RandomX without CGO, using only Golang code, pure float6
All test cases pass properly.
Uses minimal Go assembly due to having to set rounding mode natively. Native hard float can be added with supporting rounding mode under _asm_.
JIT is supported on a few platforms but can be hard-disabled via the `disable_jit` build flag, or at runtime.
A pure Golang implementation can be used on platforms without hard float support or via the `purego` build flag manually.
| Platform | Supported | Hard Float | SuperScalar JIT | Notes |
|:-----------:|:---------:|:----------:|:---------------:|:----------------:|
| **386** | ✅ | ✅ | ❌ | |
| **amd64** | ✅ | ✅ | ✅* | JIT only on Unix |
| **arm** | ✅* | ❌ | ❌ | |
| **arm64** | ✅ | ✅ | ❌ | |
| **mips** | ✅* | ❌ | ❌ | |
| **mips64** | ✅* | ❌ | ❌ | |
| **riscv64** | ✅* | ❌ | ❌ | |
| **wasm** | ✅* | ❌ | ❌ | |
| Platform | Hard Float | Hard AES | JIT | Native | purego | Notes |
|:-----------:|:----------:|:--------:|:---:|:------:|:------:|:----------------:|
| **386** | ✅ | ❌ | ❌ | ✅ | ✅ | |
| **amd64** | ✅ | ✅ | ✅* | ✅ | ✅ | JIT only on Unix |
| **arm** | ❌ | ❌ | ❌ | ❌ | ✅ | |
| **arm64** | ✅ | ❌ | ❌ | ✅ | ✅ | |
| **mips** | ❌ | ❌ | ❌ | ❌ | ✅ | |
| **mips64** | ❌ | ❌ | ❌ | ❌ | ✅ | |
| **riscv64** | ❌ | ❌ | ❌ | ❌ | ✅ | |
| **wasm** | ❌ | ❌ | ❌ | ❌ | ✅ | |
* these platforms only support software floating point / purego and will not be performant.
Any platform without hard float support (which falls back to soft float via [softfloat64](https://git.gammaspectra.live/P2Pool/softfloat64)) will be very slow.
Native hard float support can be added for a platform by implementing rounding mode control under _asm_.

View file

@ -8,18 +8,23 @@ import (
_ "unsafe"
)
// hard_aesdec applies one AESDEC round to state using key, in place.
// It is bound to the assembly routine asm.aesdec.
//
//go:noescape
//go:linkname hard_aesdec git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesdec
func hard_aesdec(state *[4]uint32, key *[4]uint32)
// hard_aesenc applies one AESENC round to state using key, in place.
// It is bound to the assembly routine asm.aesenc.
//
//go:noescape
//go:linkname hard_aesenc git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesenc
func hard_aesenc(state *[4]uint32, key *[4]uint32)
// hard_aesroundtrip_decenc updates the four states in place with their
// matching keys: AESDEC on states 0 and 2, AESENC on states 1 and 3.
//
//go:noescape
//go:linkname hard_aesroundtrip_decenc git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesroundtrip_decenc
func hard_aesroundtrip_decenc(states *[4][4]uint32, keys *[4][4]uint32)
// hard_aesroundtrip_encdec updates the four states in place with their
// matching keys: AESENC on states 0 and 2, AESDEC on states 1 and 3.
//
//go:noescape
//go:linkname hard_aesroundtrip_encdec git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesroundtrip_encdec
func hard_aesroundtrip_encdec(states *[4][4]uint32, keys *[4][4]uint32)
// hard_aesroundtrip_encdec1 is like hard_aesroundtrip_encdec but uses the
// single key for all four states.
//
//go:noescape
//go:linkname hard_aesroundtrip_encdec1 git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesroundtrip_encdec1
func hard_aesroundtrip_encdec1(states *[4][4]uint32, key *[4]uint32)
@ -45,10 +50,10 @@ func aesroundtrip_decenc(states *[4][4]uint32, keys *[4][4]uint32) {
if supportsAES {
hard_aesroundtrip_decenc(states, keys)
} else {
aesdec(&states[0], &keys[0])
aesenc(&states[1], &keys[1])
aesdec(&states[2], &keys[2])
aesenc(&states[3], &keys[3])
soft_aesdec(&states[0], &keys[0])
soft_aesenc(&states[1], &keys[1])
soft_aesdec(&states[2], &keys[2])
soft_aesenc(&states[3], &keys[3])
}
}
@ -56,10 +61,10 @@ func aesroundtrip_encdec(states *[4][4]uint32, keys *[4][4]uint32) {
if supportsAES {
hard_aesroundtrip_encdec(states, keys)
} else {
aesenc(&states[0], &keys[0])
aesdec(&states[1], &keys[1])
aesenc(&states[2], &keys[2])
aesdec(&states[3], &keys[3])
soft_aesenc(&states[0], &keys[0])
soft_aesdec(&states[1], &keys[1])
soft_aesenc(&states[2], &keys[2])
soft_aesdec(&states[3], &keys[3])
}
}
@ -67,9 +72,9 @@ func aesroundtrip_encdec1(states *[4][4]uint32, key *[4]uint32) {
if supportsAES {
hard_aesroundtrip_encdec1(states, key)
} else {
aesenc(&states[0], key)
aesdec(&states[1], key)
aesenc(&states[2], key)
aesdec(&states[3], key)
soft_aesenc(&states[0], key)
soft_aesdec(&states[1], key)
soft_aesenc(&states[2], key)
soft_aesdec(&states[3], key)
}
}

View file

@ -5,43 +5,43 @@
TEXT ·aesenc(SB),NOSPLIT|NOFRAME,$0-16
MOVQ state+0(FP), AX
MOVQ key+8(FP), BX
MOVUPS 0(AX), X0
MOVUPS 0(BX), X1
VMOVDQU32 0(AX), X0
VMOVDQU32 0(BX), X1
AESENC X1, X0
MOVUPS X0, 0(AX)
VMOVDQU32 X0, 0(AX)
RET
TEXT ·aesdec(SB),NOSPLIT|NOFRAME,$0-16
MOVQ state+0(FP), AX
MOVQ key+8(FP), BX
MOVUPS 0(AX), X0
MOVUPS 0(BX), X1
VMOVDQU32 0(AX), X0
VMOVDQU32 0(BX), X1
AESDEC X1, X0
MOVUPS X0, 0(AX)
VMOVDQU32 X0, 0(AX)
RET
TEXT ·aesroundtrip_decenc(SB),NOSPLIT|NOFRAME,$0-16
MOVQ states+0(FP), AX
MOVQ keys+8(FP), BX
MOVUPS 0(AX), X0
MOVUPS 0(BX), X1
MOVUPS 16(AX), X2
MOVUPS 16(BX), X3
MOVUPS 32(AX), X4
MOVUPS 32(BX), X5
MOVUPS 48(AX), X6
MOVUPS 48(BX), X7
VMOVDQU32 0(AX), X0
VMOVDQU32 0(BX), X1
VMOVDQU32 16(AX), X2
VMOVDQU32 16(BX), X3
VMOVDQU32 32(AX), X4
VMOVDQU32 32(BX), X5
VMOVDQU32 48(AX), X6
VMOVDQU32 48(BX), X7
AESDEC X1, X0
AESENC X3, X2
AESDEC X5, X4
AESENC X7, X6
MOVUPS X0, 0(AX)
MOVUPS X2, 16(AX)
MOVUPS X4, 32(AX)
MOVUPS X6, 48(AX)
VMOVDQU32 X0, 0(AX)
VMOVDQU32 X2, 16(AX)
VMOVDQU32 X4, 32(AX)
VMOVDQU32 X6, 48(AX)
RET
@ -49,24 +49,24 @@ TEXT ·aesroundtrip_encdec(SB),NOSPLIT|NOFRAME,$0-16
MOVQ states+0(FP), AX
MOVQ keys+8(FP), BX
MOVUPS 0(AX), X0
MOVUPS 0(BX), X1
MOVUPS 16(AX), X2
MOVUPS 16(BX), X3
MOVUPS 32(AX), X4
MOVUPS 32(BX), X5
MOVUPS 48(AX), X6
MOVUPS 48(BX), X7
VMOVDQU32 0(AX), X0
VMOVDQU32 0(BX), X1
VMOVDQU32 16(AX), X2
VMOVDQU32 16(BX), X3
VMOVDQU32 32(AX), X4
VMOVDQU32 32(BX), X5
VMOVDQU32 48(AX), X6
VMOVDQU32 48(BX), X7
AESENC X1, X0
AESDEC X3, X2
AESENC X5, X4
AESDEC X7, X6
MOVUPS X0, 0(AX)
MOVUPS X2, 16(AX)
MOVUPS X4, 32(AX)
MOVUPS X6, 48(AX)
VMOVDQU32 X0, 0(AX)
VMOVDQU32 X2, 16(AX)
VMOVDQU32 X4, 32(AX)
VMOVDQU32 X6, 48(AX)
RET
@ -74,20 +74,20 @@ TEXT ·aesroundtrip_encdec1(SB),NOSPLIT|NOFRAME,$0-16
MOVQ states+0(FP), AX
MOVQ key+8(FP), BX
MOVUPS 0(BX), X0
MOVUPS 0(AX), X1
MOVUPS 16(AX), X2
MOVUPS 32(AX), X3
MOVUPS 48(AX), X4
VMOVDQU32 0(BX), X0
VMOVDQU32 0(AX), X1
VMOVDQU32 16(AX), X2
VMOVDQU32 32(AX), X3
VMOVDQU32 48(AX), X4
AESENC X0, X1
AESDEC X0, X2
AESENC X0, X3
AESDEC X0, X4
MOVUPS X1, 0(AX)
MOVUPS X2, 16(AX)
MOVUPS X3, 32(AX)
MOVUPS X4, 48(AX)
VMOVDQU32 X1, 0(AX)
VMOVDQU32 X2, 16(AX)
VMOVDQU32 X3, 32(AX)
VMOVDQU32 X4, 48(AX)
RET

7
asm/cpuid_amd64.go Normal file
View file

@ -0,0 +1,7 @@
//go:build amd64 && !purego
package asm
// Cpuid executes the CPUID instruction with EAX=op and ECX=0 and returns the
// resulting EAX, EBX, ECX and EDX values. Implemented in assembly.
func Cpuid(op uint32) (eax, ebx, ecx, edx uint32)
// Cpuidex executes the CPUID instruction with EAX=op and ECX=op2 and returns
// the resulting EAX, EBX, ECX and EDX values. Implemented in assembly.
func Cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
// Xgetbv executes the XGETBV instruction with ECX=index and returns the
// resulting EAX and EDX values. Implemented in assembly.
func Xgetbv(index uint32) (eax, edx uint32)

34
asm/cpuid_amd64.s Normal file
View file

@ -0,0 +1,34 @@
//go:build amd64 && !purego
#include "textflag.h"
// func Cpuid(op uint32) (eax, ebx, ecx, edx uint32)
// Runs CPUID for leaf op with the sub-leaf (ECX) cleared and stores the four
// result registers into the return slots.
TEXT ·Cpuid(SB), 7, $0
XORQ CX, CX // sub-leaf = 0
MOVL op+0(FP), AX // leaf
CPUID
MOVL AX, eax+8(FP)
MOVL BX, ebx+12(FP)
MOVL CX, ecx+16(FP)
MOVL DX, edx+20(FP)
RET
// func Cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
// Runs CPUID for leaf op and sub-leaf op2 and stores the four result
// registers into the return slots.
TEXT ·Cpuidex(SB), 7, $0
MOVL op+0(FP), AX // leaf
MOVL op2+4(FP), CX // sub-leaf
CPUID
MOVL AX, eax+8(FP)
MOVL BX, ebx+12(FP)
MOVL CX, ecx+16(FP)
MOVL DX, edx+20(FP)
RET
// func Xgetbv(index uint32) (eax, edx uint32)
// Runs XGETBV with ECX=index and stores EAX and EDX into the return slots.
TEXT ·Xgetbv(SB), 7, $0
MOVL index+0(FP), CX
BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV, emitted as raw opcode bytes
MOVL AX, eax+8(FP)
MOVL DX, edx+12(FP)
RET

View file

@ -40,11 +40,18 @@ func (cache *Randomx_Cache) HasJIT() bool {
func (cache *Randomx_Cache) VM_Initialize() *VM {
return &VM{
vm := &VM{
Dataset: &Randomx_DatasetLight{
Cache: cache,
},
}
if cache.HasJIT() {
vm.JITProgram = mapProgram(nil, int(RandomXCodeSize))
if cache.Flags&RANDOMX_FLAG_SECURE == 0 {
mapProgramRWX(vm.JITProgram)
}
}
return vm
}
func (cache *Randomx_Cache) Close() error {

View file

@ -106,8 +106,18 @@ const CONDITIONOFFSET = RANDOMX_JUMP_OFFSET
const CONDITIONMASK = (1 << RANDOMX_JUMP_BITS) - 1
const STOREL3CONDITION = 14
const RANDOMX_FLAG_DEFAULT = uint64(0)
const RANDOMX_FLAG_JIT = uint64(1 << iota)
const RANDOMX_FLAG_DEFAULT = 0
// RandomX flag bits. The values mirror the randomx_flags enum of the
// reference RandomX API: each flag is a single bit, except
// RANDOMX_FLAG_ARGON2, which is the mask covering both Argon2
// implementation-selection bits (SSSE3 | AVX2 = 96) rather than a bit of
// its own.
const (
	RANDOMX_FLAG_LARGE_PAGES = 1 << iota
	RANDOMX_FLAG_HARD_AES
	RANDOMX_FLAG_FULL_MEM
	RANDOMX_FLAG_JIT
	RANDOMX_FLAG_SECURE
	RANDOMX_FLAG_ARGON2_SSSE3
	RANDOMX_FLAG_ARGON2_AVX2
	RANDOMX_FLAG_ARGON2 = RANDOMX_FLAG_ARGON2_SSSE3 | RANDOMX_FLAG_ARGON2_AVX2
)
func isZeroOrPowerOf2(x uint32) bool {
return (x & (x - 1)) == 0

View file

@ -4,4 +4,5 @@ type Randomx_Dataset interface {
InitDataset(startItem, endItem uint64)
ReadDataset(address uint64, r, cache *RegisterLine)
PrefetchDataset(address uint64)
Flags() uint64
}

View file

@ -21,6 +21,10 @@ func (d *Randomx_DatasetLight) ReadDataset(address uint64, r, cache *RegisterLin
}
}
// Flags reports the flags of the cache backing this light dataset.
func (d *Randomx_DatasetLight) Flags() uint64 {
	flags := d.Cache.Flags
	return flags
}
// InitDataset currently does nothing for the light dataset: both arguments
// are ignored and the only statement in the body is commented out.
func (d *Randomx_DatasetLight) InitDataset(startItem, endItem uint64) {
//d.Cache.initDataset(d.Cache.Programs)
}

View file

@ -1,3 +1,5 @@
package randomx
// SuperScalarProgramFunc is the byte slice holding a generated SuperScalar program.
type SuperScalarProgramFunc []byte
// VMProgramFunc is the byte slice holding a generated (JIT) VM program.
type VMProgramFunc []byte

View file

@ -5,3 +5,24 @@ package randomx
// Close is a no-op in this implementation and always returns nil.
func (f SuperScalarProgramFunc) Close() error {
return nil
}
// Close is a no-op in this implementation and always returns nil.
func (f VMProgramFunc) Close() error {
return nil
}
// mapProgram is a no-op in this implementation: it ignores both arguments
// and returns nil, so no program memory is ever mapped.
func mapProgram(program []byte, size int) []byte {
return nil
}
// mapProgramRW is a no-op in this implementation.
func mapProgramRW(execFunc []byte) {
}
// mapProgramRX is a no-op in this implementation.
func mapProgramRX(execFunc []byte) {
}
// mapProgramRWX insecure!
// It is a no-op in this implementation.
func mapProgramRWX(execFunc []byte) {
}

View file

@ -9,10 +9,56 @@ import (
// Close unmaps the memory backing f with unix.Munmap and returns its error.
func (f SuperScalarProgramFunc) Close() error {
return unix.Munmap(f)
}
// Close unmaps the memory backing f with unix.Munmap and returns its error.
func (f VMProgramFunc) Close() error {
return unix.Munmap(f)
}
func mapProgram(program []byte) []byte {
// Write only
execFunc, err := unix.Mmap(-1, 0, len(program), unix.PROT_WRITE, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS)
// mapProgramRW switches the protection of execFunc to read+write.
// If mprotect fails, the function panics with that error; while the panic
// unwinds the region is unmapped, and a failing unmap panics in turn.
func mapProgramRW(execFunc []byte) {
	if err := unix.Mprotect(execFunc, unix.PROT_READ|unix.PROT_WRITE); err != nil {
		// release the mapping on the way out of the panic
		defer func() {
			if unmapErr := unix.Munmap(execFunc); unmapErr != nil {
				panic(unmapErr)
			}
		}()
		panic(err)
	}
}
// mapProgramRX switches the protection of execFunc to read+execute.
// If mprotect fails, the function panics with that error; while the panic
// unwinds the region is unmapped, and a failing unmap panics in turn.
func mapProgramRX(execFunc []byte) {
	if err := unix.Mprotect(execFunc, unix.PROT_READ|unix.PROT_EXEC); err != nil {
		// release the mapping on the way out of the panic
		defer func() {
			if unmapErr := unix.Munmap(execFunc); unmapErr != nil {
				panic(unmapErr)
			}
		}()
		panic(err)
	}
}
// mapProgramRWX insecure!
// It switches the protection of execFunc to read+write+execute at once.
// If mprotect fails, the function panics with that error; while the panic
// unwinds the region is unmapped, and a failing unmap panics in turn.
func mapProgramRWX(execFunc []byte) {
	if err := unix.Mprotect(execFunc, unix.PROT_READ|unix.PROT_WRITE|unix.PROT_EXEC); err != nil {
		// release the mapping on the way out of the panic
		defer func() {
			if unmapErr := unix.Munmap(execFunc); unmapErr != nil {
				panic(unmapErr)
			}
		}()
		panic(err)
	}
}
func mapProgram(program []byte, size int) []byte {
// Read and Write only
execFunc, err := unix.Mmap(-1, 0, max(size, len(program)), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS)
if err != nil {
panic(err)
}

View file

@ -2,6 +2,12 @@
package randomx
import (
"bytes"
"encoding/binary"
"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
)
/*
REGISTER ALLOCATION:
@ -11,7 +17,7 @@ package randomx
; rcx -> temporary
; rdx -> temporary
; rsi -> scratchpad pointer
; rdi -> return address // dataset pointer
; rdi -> (not used)
; rbp -> (do not use, it's used by Golang sampling) jump target //todo: memory registers "ma" (high 32 bits), "mx" (low 32 bits)
; rsp -> stack pointer
; r8 -> "r0"
@ -134,7 +140,7 @@ var CALL = 0xe8
var REX_ADD_I = []byte{0x49, 0x81}
var REX_TEST = []byte{0x49, 0xF7}
var JZ = []byte{0x0f, 0x84}
var JZ_SHORT = 0x74
var JZ_SHORT byte = 0x74
var RET byte = 0xc3
@ -151,6 +157,172 @@ var NOP6 = []byte{0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00}
var NOP7 = []byte{0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00}
var NOP8 = []byte{0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}
// JMP_ALIGN_PREFIX holds padding byte sequences indexed by their length:
// entry n is exactly n bytes long (0 through 13). Entries up to 8 consist
// only of 0x2E prefix bytes; longer entries start with a NOP encoding
// followed by eight 0x2E bytes. generateCode emits one of these before a
// CBRANCH sequence when it needs to shift the branch for alignment.
var JMP_ALIGN_PREFIX = [14][]byte{
{},
{0x2E},
{0x2E, 0x2E},
{0x2E, 0x2E, 0x2E},
{0x2E, 0x2E, 0x2E, 0x2E},
{0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x66, 0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x0F, 0x1F, 0x40, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
{0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
}
// genSIB packs scale, index and base into a single SIB byte:
// scale in bits 7-6, index in bits 5-3 and base in bits 2-0.
// The combined value is truncated to 8 bits.
func genSIB(scale, index, base int) byte {
	sib := scale << 6
	sib |= index << 3
	sib |= base
	return byte(sib)
}
// genAddressReg appends the address computation for a memory operand to buf
// and returns the extended slice. The emitted sequence is:
//
//	LEA_32, a ModRM byte derived from instr.Src, an optional SIB byte (0x24)
//	when instr.Src is RegisterNeedsSib, the low 32 bits of instr.Imm,
//	then an AND of the result with instr.MemMask.
//
// When rax is true the EAX forms are used (ModRM base 0x80, AND_EAX_I);
// otherwise the ECX forms are used (ModRM base 0x88, AND_ECX_I).
func genAddressReg(buf []byte, instr *ByteCodeInstruction, rax bool) []byte {
	modRM := 0x80 + instr.Src
	if !rax {
		modRM += 8
	}

	buf = append(buf, LEA_32...)
	buf = append(buf, modRM)
	if instr.Src == RegisterNeedsSib {
		buf = append(buf, 0x24)
	}
	buf = binary.LittleEndian.AppendUint32(buf, uint32(instr.Imm))

	// mask the computed address
	if rax {
		buf = append(buf, AND_EAX_I)
	} else {
		buf = append(buf, AND_ECX_I...)
	}
	return binary.LittleEndian.AppendUint32(buf, instr.MemMask)
}
// valAsString serialises values as consecutive little-endian 32-bit words
// and returns the bytes up to (not including) the first zero byte found.
// If no word contains a zero byte, all 4*len(values) bytes are returned.
func valAsString(values ...uint32) []byte {
	out := make([]byte, 4*len(values))
	for idx, word := range values {
		base := idx * 4
		chunk := out[base : base+4]
		// store the whole word first, then look for a terminator in it
		for k := range chunk {
			chunk[k] = byte(word >> (8 * uint(k)))
		}
		for k, b := range chunk {
			if b == 0 {
				return out[:base+k]
			}
		}
	}
	return out
}
// familyModel decodes the CPU family, model and stepping from EAX of CPUID
// leaf 1. It returns zeros when maxFunctionId shows leaf 1 is unavailable.
//
// The extended family (bits 27:20) is added only when the base family is
// 0xF. The extended model (bits 19:16, shifted into the high nibble) is
// added when the base family is 0x6 or 0xF.
func familyModel(maxFunctionId uint32) (family, model, stepping int) {
	if maxFunctionId < 0x1 {
		return 0, 0, 0
	}
	signature, _, _, _ := asm.Cpuid(1)

	baseFamily := int((signature >> 8) & 0xf)
	family = baseFamily
	model = int((signature >> 4) & 0xf)
	stepping = int(signature & 0xf)

	switch baseFamily {
	case 0xf:
		// extended family and extended model both apply
		family += int((signature >> 20) & 0xff)
		model += int((signature >> 12) & 0xf0)
	case 0x6:
		// Intel family 6 uses the extended model only
		model += int((signature >> 12) & 0xf0)
	}
	return family, model, stepping
}
// BranchesWithin32B is computed once at package initialisation. It is true
// only on "GenuineIntel" family 6 CPUs whose model/stepping is listed as
// affected by the Intel JCC erratum; generateCode then pads conditional
// branches so they do not cross or touch a 32-byte boundary.
var BranchesWithin32B = func() bool {
	maxID, ebx, ecx, edx := asm.Cpuid(0)
	// the vendor string is laid out in EBX, EDX, ECX order
	if string(valAsString(ebx, edx, ecx)) != "GenuineIntel" {
		return false
	}
	family, model, stepping := familyModel(maxID)
	if family != 6 {
		return false
	}
	// Intel JCC erratum mitigation
	// Affected CPU models and stepping numbers are taken from https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
	switch model {
	case 0x4E, 0x5E:
		return stepping == 0x3
	case 0x55:
		return stepping == 0x4 || stepping == 0x7
	case 0x8E:
		return stepping >= 0x9 && stepping <= 0xC
	case 0x9E:
		return stepping >= 0x9 && stepping <= 0xD
	case 0xA6:
		return stepping == 0x0
	case 0xAE:
		return stepping == 0xA
	}
	return false
}()
/*
;# callee-saved registers - Microsoft x64 calling convention
push rbx
push rbp
push rdi
push rsi
push r12
push r13
push r14
push r15
sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm6
movdqu xmmword ptr [rsp+48], xmm7
movdqu xmmword ptr [rsp+32], xmm8
movdqu xmmword ptr [rsp+16], xmm9
movdqu xmmword ptr [rsp+0], xmm10
sub rsp, 80
movdqu xmmword ptr [rsp+64], xmm11
movdqu xmmword ptr [rsp+48], xmm12
movdqu xmmword ptr [rsp+32], xmm13
movdqu xmmword ptr [rsp+16], xmm14
movdqu xmmword ptr [rsp+0], xmm15
;# function arguments
push rcx ;# RegisterFile& registerFile
mov rbp, qword ptr [rdx] ;# "mx", "ma"
mov rdi, qword ptr [rdx+8] ;# uint8_t* dataset
mov rsi, r8 ;# uint8_t* scratchpad
mov rbx, r9 ;# loop counter
mov rax, rbp
ror rbp, 32
;# zero integer registers
xor r8, r8
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
;# load constant registers
lea rcx, [rcx+120]
movapd xmm8, xmmword ptr [rcx+72]
movapd xmm9, xmmword ptr [rcx+88]
movapd xmm10, xmmword ptr [rcx+104]
movapd xmm11, xmmword ptr [rcx+120]
movapd xmm13, xmmword ptr [mantissaMask]
movapd xmm14, xmmword ptr [exp240]
movapd xmm15, xmmword ptr [scaleMask]
mov rdx, rax
and eax, RANDOMX_SCRATCHPAD_MASK
ror rdx, 32
and edx, RANDOMX_SCRATCHPAD_MASK
jmp rx_program_loop_begin
*/
// randomx_program_prologue is currently 64 repetitions of NOP1.
// NOTE(review): presumably a placeholder for the reference prologue quoted
// in the comment above — confirm.
var randomx_program_prologue = bytes.Repeat(NOP1, 64)
// randomx_program_loop_begin is currently 64 repetitions of NOP1.
// NOTE(review): presumably a placeholder as well — confirm.
var randomx_program_loop_begin = bytes.Repeat(NOP1, 64)

5
jit_generic.go Normal file
View file

@ -0,0 +1,5 @@
//go:build !unix || !amd64 || disable_jit || purego
package randomx
// RandomXCodeSize is zero in builds selected by this file's constraint
// (non-unix, non-amd64, disable_jit or purego).
var RandomXCodeSize uint64 = 0

View file

@ -63,6 +63,7 @@ func Test_Randomx(t *testing.T) {
}()
vm := c.VM_Initialize()
defer vm.Close()
var output_hash [32]byte
vm.CalculateHash(tt.input, &output_hash)
@ -92,6 +93,7 @@ func Benchmark_RandomX(b *testing.B) {
}()
vm := c.VM_Initialize()
defer vm.Close()
b.ResetTimer()
for i := 0; i < b.N; i++ {
var output_hash [32]byte
@ -119,6 +121,7 @@ func Benchmark_RandomXParallel(b *testing.B) {
b.RunParallel(func(pb *testing.PB) {
var output_hash [32]byte
vm := c.VM_Initialize()
defer vm.Close()
for pb.Next() {
vm.CalculateHash(tt.input, &output_hash)

View file

@ -702,7 +702,10 @@ type Register struct {
//RegisterNeedsSib = 4; //x86 r12 register
}
// RegisterNeedsDisplacement x86 r13 register
// Code generators compare a register index against this value to decide
// whether a displacement must be emitted with the instruction.
const RegisterNeedsDisplacement = 5
// RegisterNeedsSib x86 r12 register
// Code generators compare a register index against this value to decide
// whether an extra SIB byte must be emitted with the instruction.
const RegisterNeedsSib = 4
func (sins *SuperScalarInstruction) SelectSource(preAllocatedAvailableRegisters []int, cycle int, Registers []Register, gen *Blake2Generator) bool {

View file

@ -4,7 +4,6 @@ package randomx
import (
"encoding/binary"
"runtime"
"unsafe"
)
@ -17,21 +16,6 @@ func (f SuperScalarProgramFunc) Execute(rf uintptr) {
}
superscalar_run(rf, uintptr(unsafe.Pointer(unsafe.SliceData(f))))
return
var reservedStackHack [8 * 8]byte
for i := range reservedStackHack {
reservedStackHack[i] = uint8(i)
}
memoryPtr := &f
fun := *(*func(v uintptr))(unsafe.Pointer(&memoryPtr))
fun(rf)
for i := range reservedStackHack {
reservedStackHack[i] = uint8(-i)
}
runtime.KeepAlive(reservedStackHack)
}
// generateSuperscalarCode
@ -106,5 +90,5 @@ func generateSuperscalarCode(scalarProgram SuperScalarProgram) SuperScalarProgra
program = append(program, RET)
return mapProgram(program)
return mapProgram(program, len(program))
}

View file

@ -25,6 +25,7 @@ TEXT ·superscalar_run(SB),$0-16
// todo: not supported by golang
// PREFETCHW 0(SI)
PREFETCHT0 0(SI)
// move registers back to register line
MOVQ R8, 0(SI)

30
vm.go
View file

@ -46,6 +46,8 @@ type VM struct {
ScratchPad ScratchPad
Dataset Randomx_Dataset
JITProgram VMProgramFunc
}
// Run calculate hash based on input
@ -95,6 +97,16 @@ func (vm *VM) Run(inputHash [64]byte, roundingMode uint8) (reg RegisterFile) {
var rlCache RegisterLine
if vm.JITProgram != nil {
if vm.Dataset.Flags()&RANDOMX_FLAG_SECURE > 0 {
mapProgramRW(vm.JITProgram)
byteCode.generateCode(vm.JITProgram)
mapProgramRX(vm.JITProgram)
} else {
byteCode.generateCode(vm.JITProgram)
}
}
for ic := 0; ic < RANDOMX_PROGRAM_ITERATIONS; ic++ {
spMix := reg.R[readReg[0]] ^ reg.R[readReg[1]]
@ -120,7 +132,11 @@ func (vm *VM) Run(inputHash [64]byte, roundingMode uint8) (reg RegisterFile) {
}
// Run the actual bytecode
byteCode.Execute(&reg, &vm.ScratchPad, eMask)
if vm.JITProgram != nil {
vm.JITProgram.Execute(&reg, &vm.ScratchPad, eMask)
} else {
byteCode.Execute(&reg, &vm.ScratchPad, eMask)
}
mem.mx ^= reg.R[readReg[2]] ^ reg.R[readReg[3]]
mem.mx &= CacheLineAlignMask
@ -183,9 +199,10 @@ func (vm *VM) RunLoops(tempHash [64]byte) RegisterFile {
// final loop executes here
reg := vm.Run(tempHash, roundingMode)
roundingMode = reg.FPRC
// always force a restore
reg.FPRC = 0xff
//restore rounding mode
// restore rounding mode to 0
SetRoundingMode(&reg, 0)
return reg
@ -214,3 +231,10 @@ func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
hash256.Sum(output[:0])
}
// Close releases the JIT program owned by the VM, if there is one, and
// returns the error from closing it. A VM without a JIT program returns nil.
func (vm *VM) Close() error {
	if vm.JITProgram == nil {
		return nil
	}
	return vm.JITProgram.Close()
}

View file

@ -31,7 +31,7 @@ type ByteCodeInstruction struct {
}
func (i ByteCodeInstruction) jumpTarget() int {
return int(int16((uint16(i.ImmB) << 8) | uint16(i.Dst)))
return int(int16((uint16(i.ImmB) << 8) | uint16(i.Src)))
}
func (i ByteCodeInstruction) getScratchpadAddress(ptr uint64) uint32 {

312
vm_bytecode_jit_amd64.go Normal file
View file

@ -0,0 +1,312 @@
//go:build unix && amd64 && !disable_jit && !purego
package randomx
import (
"encoding/binary"
"math/bits"
"unsafe"
)
// vm_run is implemented in assembly. It is called by VMProgramFunc.Execute
// with the register file, the scratchpad, the eMask pair and the address of
// the generated program (jmp).
//
//go:noescape
func vm_run(rf *RegisterFile, pad *ScratchPad, eMask [2]uint64, jmp uintptr)
// Execute runs the generated program stored in f by handing the address of
// its first byte to vm_run, together with the register file, scratchpad and
// eMask. It panics if f is nil.
func (f VMProgramFunc) Execute(rf *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
	if f == nil {
		panic("program is nil")
	}
	vm_run(rf, pad, eMask, uintptr(unsafe.Pointer(unsafe.SliceData(f))))
}
// generateCode translates the bytecode c into amd64 machine code.
//
// The code is appended to program after resetting its length to zero, so it
// is written into program's existing backing array, and it ends with a RET.
// Nothing is returned: the caller keeps using the slice it passed in. If the
// generated code were to exceed cap(program), append would reallocate and
// the output would no longer land in the caller's buffer.
//
// instructionOffsets records the code offset at which each bytecode
// instruction starts, so that CBRANCH can compute relative jump distances
// to earlier instructions.
func (c *ByteCode) generateCode(program []byte) {
	program = program[:0]

	var instructionOffsets [RANDOMX_PROGRAM_SIZE]int32
	var codePos int32

	for ix := range c {
		instructionOffsets[ix] = codePos
		curLen := len(program)

		instr := &c[ix]
		switch instr.Opcode {

		case VM_IADD_RS:
			program = append(program, REX_LEA...)
			if instr.Dst == RegisterNeedsDisplacement {
				program = append(program, 0xac)
			} else {
				program = append(program, 0x04+8*instr.Dst)
			}
			program = append(program, genSIB(int(instr.ImmB), int(instr.Src), int(instr.Dst)))
			if instr.Dst == RegisterNeedsDisplacement {
				program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
			}

		case VM_IADD_M:
			program = genAddressReg(program, instr, true)
			program = append(program, REX_ADD_RM...)
			program = append(program, 0x04+8*instr.Dst)
			program = append(program, 0x06)
		case VM_IADD_MZ:
			program = append(program, REX_ADD_RM...)
			program = append(program, 0x86+8*instr.Dst)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))

		case VM_ISUB_R:
			program = append(program, REX_SUB_RR...)
			program = append(program, 0xc0+8*instr.Dst+instr.Src)
		case VM_ISUB_I:
			program = append(program, REX_81...)
			program = append(program, 0xe8+instr.Dst)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))

		case VM_ISUB_M:
			program = genAddressReg(program, instr, true)
			program = append(program, REX_SUB_RM...)
			program = append(program, 0x04+8*instr.Dst)
			program = append(program, 0x06)
		case VM_ISUB_MZ:
			program = append(program, REX_SUB_RM...)
			program = append(program, 0x86+8*instr.Dst)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))

		case VM_IMUL_R:
			program = append(program, REX_IMUL_RR...)
			program = append(program, 0xc0+8*instr.Dst+instr.Src)
		case VM_IMUL_I:
			// also handles imul_rcp, with 64-bit special
			if bits.Len64(instr.Imm) > 32 {
				program = append(program, MOV_RAX_I...)
				program = binary.LittleEndian.AppendUint64(program, instr.Imm)
				program = append(program, REX_IMUL_RM...)
				program = append(program, 0xc0+8*instr.Dst)
			} else {
				program = append(program, REX_IMUL_RRI...)
				program = append(program, 0xc0+9*instr.Dst)
				program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
			}

		case VM_IMUL_M:
			program = genAddressReg(program, instr, true)
			program = append(program, REX_IMUL_RM...)
			program = append(program, 0x04+8*instr.Dst)
			program = append(program, 0x06)
		case VM_IMUL_MZ:
			program = append(program, REX_IMUL_RM...)
			program = append(program, 0x86+8*instr.Dst)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))

		case VM_IMULH_R:
			program = append(program, REX_MOV_RR64...)
			program = append(program, 0xc0+instr.Dst)
			program = append(program, REX_MUL_R...)
			program = append(program, 0xe0+instr.Src)
			program = append(program, REX_MOV_R64R...)
			program = append(program, 0xc2+8*instr.Dst)
		case VM_IMULH_M:
			program = genAddressReg(program, instr, false)
			program = append(program, REX_MOV_RR64...)
			program = append(program, 0xc0+instr.Dst)
			program = append(program, REX_MUL_MEM...)
			program = append(program, REX_MOV_R64R...)
			program = append(program, 0xc2+8*instr.Dst)
		case VM_IMULH_MZ:
			program = append(program, REX_MOV_RR64...)
			program = append(program, 0xc0+instr.Dst)
			program = append(program, REX_MUL_M...)
			program = append(program, 0xa6)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
			program = append(program, REX_MOV_R64R...)
			program = append(program, 0xc2+8*instr.Dst)

		case VM_ISMULH_R:
			program = append(program, REX_MOV_RR64...)
			program = append(program, 0xc0+instr.Dst)
			program = append(program, REX_MUL_R...)
			program = append(program, 0xe8+instr.Src)
			program = append(program, REX_MOV_R64R...)
			program = append(program, 0xc2+8*instr.Dst)
		case VM_ISMULH_M:
			program = genAddressReg(program, instr, false)
			program = append(program, REX_MOV_RR64...)
			program = append(program, 0xc0+instr.Dst)
			program = append(program, REX_IMUL_MEM...)
			program = append(program, REX_MOV_R64R...)
			program = append(program, 0xc2+8*instr.Dst)
		case VM_ISMULH_MZ:
			program = append(program, REX_MOV_RR64...)
			program = append(program, 0xc0+instr.Dst)
			program = append(program, REX_MUL_M...)
			program = append(program, 0xae)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
			program = append(program, REX_MOV_R64R...)
			program = append(program, 0xc2+8*instr.Dst)

		case VM_INEG_R:
			program = append(program, REX_NEG...)
			program = append(program, 0xd8+instr.Dst)

		case VM_IXOR_R:
			program = append(program, REX_XOR_RR...)
			program = append(program, 0xc0+8*instr.Dst+instr.Src)
		case VM_IXOR_I:
			program = append(program, REX_XOR_RI...)
			program = append(program, 0xf0+instr.Dst)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))

		case VM_IXOR_M:
			program = genAddressReg(program, instr, true)
			program = append(program, REX_XOR_RM...)
			program = append(program, 0x04+8*instr.Dst)
			program = append(program, 0x06)
		case VM_IXOR_MZ:
			program = append(program, REX_XOR_RM...)
			program = append(program, 0x86+8*instr.Dst)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))

		case VM_IROR_R:
			program = append(program, REX_MOV_RR...)
			program = append(program, 0xc8+instr.Src)
			program = append(program, REX_ROT_CL...)
			program = append(program, 0xc8+instr.Dst)
		case VM_IROR_I:
			program = append(program, REX_ROT_I8...)
			program = append(program, 0xc8+instr.Dst)
			program = append(program, byte(instr.Imm&63))

		case VM_IROL_R:
			program = append(program, REX_MOV_RR...)
			program = append(program, 0xc8+instr.Src)
			program = append(program, REX_ROT_CL...)
			program = append(program, 0xc0+instr.Dst)
		case VM_IROL_I:
			program = append(program, REX_ROT_I8...)
			program = append(program, 0xc0+instr.Dst)
			program = append(program, byte(instr.Imm&63))

		case VM_ISWAP_R:
			program = append(program, REX_XCHG...)
			program = append(program, 0xc0+instr.Src+8*instr.Dst)

		case VM_FSWAP_RF:
			program = append(program, SHUFPD...)
			program = append(program, 0xc0+9*instr.Dst)
			program = append(program, 1)
		case VM_FSWAP_RE:
			program = append(program, SHUFPD...)
			program = append(program, 0xc0+9*(instr.Dst+RegistersCountFloat))
			program = append(program, 1)

		case VM_FADD_R:
			program = append(program, REX_ADDPD...)
			program = append(program, 0xc0+instr.Src+8*instr.Dst)
		case VM_FADD_M:
			program = genAddressReg(program, instr, true)
			program = append(program, REX_CVTDQ2PD_XMM12...)
			program = append(program, REX_ADDPD...)
			program = append(program, 0xc4+8*instr.Dst)

		case VM_FSUB_R:
			program = append(program, REX_SUBPD...)
			program = append(program, 0xc0+instr.Src+8*instr.Dst)
		case VM_FSUB_M:
			program = genAddressReg(program, instr, true)
			program = append(program, REX_CVTDQ2PD_XMM12...)
			program = append(program, REX_SUBPD...)
			program = append(program, 0xc4+8*instr.Dst)

		case VM_FSCAL_R:
			program = append(program, REX_XORPS...)
			program = append(program, 0xc7+8*instr.Dst)

		case VM_FMUL_R:
			program = append(program, REX_MULPD...)
			program = append(program, 0xe0+instr.Src+8*instr.Dst)

		case VM_FDIV_M:
			program = genAddressReg(program, instr, true)
			program = append(program, REX_CVTDQ2PD_XMM12...)
			program = append(program, REX_ANDPS_XMM12...)
			program = append(program, REX_DIVPD...)
			program = append(program, 0xe4+8*instr.Dst)

		case VM_FSQRT_R:
			program = append(program, SQRTPD...)
			program = append(program, 0xe4+9*instr.Dst)

		case VM_CFROUND:
			program = append(program, REX_MOV_RR64...)
			program = append(program, 0xc0+instr.Src)
			rotate := byte((13 - instr.Imm) & 63)
			if rotate != 0 {
				program = append(program, ROL_RAX...)
				program = append(program, rotate)
			}
			program = append(program, AND_OR_MOV_LDMXCSR...)

		case VM_CBRANCH:
			reg := instr.Dst
			target := instr.jumpTarget() + 1

			// Offset relative to the end of the short-form sequence:
			// add (7 bytes) + test (7 bytes) + jz rel8 (2 bytes) = 16 bytes.
			jmpOffset := instructionOffsets[target] - (codePos + 16)

			if BranchesWithin32B {
				branchBegin := uint32(codePos + 7)
				branchEnd := branchBegin
				if jmpOffset >= -128 {
					branchEnd += 9
				} else {
					branchEnd += 13
				}
				// If the jump crosses or touches 32-byte boundary, align it
				if (branchBegin ^ branchEnd) >= 32 {
					alignmentSize := 32 - (branchBegin & 31)
					// The padding is emitted ahead of the branch sequence, so
					// the backwards jump has to cover that many extra bytes.
					jmpOffset -= int32(alignmentSize)
					program = append(program, JMP_ALIGN_PREFIX[alignmentSize]...)
				}
			}

			program = append(program, REX_ADD_I...)
			program = append(program, 0xc0+reg)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))

			program = append(program, REX_TEST...)
			program = append(program, 0xc0+reg)
			program = binary.LittleEndian.AppendUint32(program, instr.MemMask)

			if jmpOffset >= -128 {
				program = append(program, JZ_SHORT)
				program = append(program, byte(jmpOffset))
			} else {
				// the near form is 4 bytes longer than the short form
				program = append(program, JZ...)
				program = binary.LittleEndian.AppendUint32(program, uint32(jmpOffset-4))
			}

		case VM_ISTORE:
			//genAddressRegDst
			program = append(program, LEA_32...)
			program = append(program, 0x80+instr.Dst)
			if instr.Dst == RegisterNeedsSib {
				program = append(program, 0x24)
			}
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
			program = append(program, AND_EAX_I)
			program = binary.LittleEndian.AppendUint32(program, instr.MemMask)

			program = append(program, REX_MOV_MR...)
			program = append(program, 0x04+8*instr.Src)
			program = append(program, 0x06)

		case VM_NOP:
			program = append(program, NOP1...)
		}

		// advance by however many bytes this instruction emitted
		codePos += int32(len(program) - curLen)
	}
	program = append(program, RET)
}

91
vm_bytecode_jit_amd64.s Normal file
View file

@ -0,0 +1,91 @@
//go:build unix && amd64 && !disable_jit && !purego
#include "textflag.h"
// func vm_run(rf *RegisterFile, pad *ScratchPad, eMask [2]uint64, jmp uintptr)
//
// Loads the register file into CPU registers, sets up the constant mask
// registers and the scratchpad pointer, calls the generated code at jmp and
// writes the integer, f and e registers back to the register file.
//
// Register use: R8-R15 = r0-r7, X0-X3 = f0-f3, X4-X7 = e0-e3, X8-X11 = a0-a3,
// X13 = mantissa mask, X14 = eMask, X15 = scale mask, SI = scratchpad.
TEXT ·vm_run(SB),$8-40
	// move register file to registers
	MOVQ rf+0(FP), AX
	PREFETCHNTA 0(AX)
	// r0-r7
	MOVQ (0*8)(AX), R8
	MOVQ (1*8)(AX), R9
	MOVQ (2*8)(AX), R10
	MOVQ (3*8)(AX), R11
	MOVQ (4*8)(AX), R12
	MOVQ (5*8)(AX), R13
	MOVQ (6*8)(AX), R14
	MOVQ (7*8)(AX), R15
	// f0-f3
	VMOVUPD (8*8)(AX), X0
	VMOVUPD (10*8)(AX), X1
	VMOVUPD (12*8)(AX), X2
	VMOVUPD (14*8)(AX), X3
	// e0-e3
	VMOVUPD (16*8)(AX), X4
	VMOVUPD (18*8)(AX), X5
	VMOVUPD (20*8)(AX), X6
	VMOVUPD (22*8)(AX), X7
	// a0-a3
	VMOVUPD (24*8)(AX), X8
	VMOVUPD (26*8)(AX), X9
	VMOVUPD (28*8)(AX), X10
	VMOVUPD (30*8)(AX), X11
	//TODO: rest of init
	// mantissa mask
	//VMOVQ $0x00ffffffffffffff, $0x00ffffffffffffff, X13
	MOVQ $0x00ffffffffffffff, AX
	VMOVQ AX, X13
	VPBROADCASTQ X13, X13
	// eMask
	// VMOVDQU (VEX encoded) instead of VMOVDQU64: the latter exists only in
	// EVEX form and would require AVX-512, unlike everything else here.
	VMOVDQU eMask+16(FP), X14
	// scale mask
	//VMOVQ $0x80F0000000000000, $0x80F0000000000000, X15
	MOVQ $0x80F0000000000000, AX
	VMOVQ AX, X15
	VPBROADCASTQ X15, X15
	// scratchpad pointer
	MOVQ pad+8(FP), SI
	// JIT location
	MOVQ jmp+32(FP), AX
	// jump to JIT code
	CALL AX
	// move registers back to register file
	MOVQ rf+0(FP), AX
	PREFETCHT0 0(AX)
	// r0-r7
	MOVQ R8, (0*8)(AX)
	MOVQ R9, (1*8)(AX)
	MOVQ R10, (2*8)(AX)
	MOVQ R11, (3*8)(AX)
	MOVQ R12, (4*8)(AX)
	MOVQ R13, (5*8)(AX)
	MOVQ R14, (6*8)(AX)
	MOVQ R15, (7*8)(AX)
	// f0-f3
	VMOVUPD X0, (8*8)(AX)
	VMOVUPD X1, (10*8)(AX)
	VMOVUPD X2, (12*8)(AX)
	VMOVUPD X3, (14*8)(AX)
	// e0-e3
	VMOVUPD X4, (16*8)(AX)
	VMOVUPD X5, (18*8)(AX)
	VMOVUPD X6, (20*8)(AX)
	VMOVUPD X7, (22*8)(AX)
	// a0-a3 are constant, no need to move
	RET

View file

@ -0,0 +1,11 @@
//go:build !unix || !amd64 || disable_jit || purego
package randomx
// generateCode is a no-op in builds selected by this file's constraint
// (non-unix, non-amd64, disable_jit or purego): no code is generated.
func (c *ByteCode) generateCode(program []byte) {
}
// Execute is a no-op in builds selected by this file's constraint
// (non-unix, non-amd64, disable_jit or purego): all arguments are ignored.
func (f VMProgramFunc) Execute(rf *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
}

View file

@ -13,7 +13,7 @@ import (
// It is the caller's responsibility to set and restore the mode to softfloat64.RoundingModeToNearest between full executions
// Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread is recommended to prevent other goroutines sharing these changes
func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
for pc := 0; pc < len(c); pc++ {
i := &c[pc]
switch i.Opcode {
case VM_NOP: // we do nothing
@ -111,8 +111,8 @@ func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
SetRoundingMode(f, uint8(tmp))
case VM_CBRANCH:
f.R[i.Src] += i.Imm
if (f.R[i.Src] & uint64(i.MemMask)) == 0 {
f.R[i.Dst] += i.Imm
if (f.R[i.Dst] & uint64(i.MemMask)) == 0 {
pc = i.jumpTarget()
}
case VM_ISTORE:

View file

@ -12,7 +12,7 @@ import (
// It is the caller's responsibility to set and restore the mode to IEEE 754 roundTiesToEven between full executions
// Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread is recommended to prevent other goroutines sharing these changes
func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
for pc := 0; pc < len(c); pc++ {
i := &c[pc]
switch i.Opcode {
case VM_NOP: // we do nothing
@ -110,8 +110,8 @@ func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
SetRoundingMode(f, uint8(tmp))
case VM_CBRANCH:
f.R[i.Src] += i.Imm
if (f.R[i.Src] & uint64(i.MemMask)) == 0 {
f.R[i.Dst] += i.Imm
if (f.R[i.Dst] & uint64(i.MemMask)) == 0 {
pc = i.jumpTarget()
}
case VM_ISTORE:

View file

@ -70,7 +70,7 @@ func CompileProgramToByteCode(prog []byte) (bc ByteCode) {
registerUsage[i] = -1
}
for i := 0; i < RANDOMX_PROGRAM_SIZE; i++ {
for i := 0; i < len(bc); i++ {
instr := VM_Instruction(prog[i*8:])
ibc := &bc[i]
@ -312,10 +312,12 @@ func CompileProgramToByteCode(prog []byte) (bc ByteCode) {
case 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238: //25 // CBRANCH and CFROUND are interchanged
ibc.Opcode = VM_CBRANCH
ibc.Src = instr.Dst() % RegistersCount
//TODO:??? it's +1 on other
ibc.Dst = instr.Dst() % RegistersCount
target := uint16(int16(registerUsage[ibc.Src]))
ibc.Dst = uint8(target)
target := uint16(int16(registerUsage[ibc.Dst]))
// set target!
ibc.Src = uint8(target)
ibc.ImmB = uint8(target >> 8)
shift := uint64(instr.Mod()>>4) + CONDITIONOFFSET