General cleanup, improve load32 and dataset execution

This commit is contained in:
DataHoarder 2024-04-12 23:51:55 +02:00
parent 244cff31f9
commit 1bb1da8bbc
Signed by: DataHoarder
SSH key fingerprint: SHA256:OLTRf6Fl87G52SiR7sWLGNzlJt4WOX+tfI2yxo0z7xk
8 changed files with 117 additions and 178 deletions

View file

@ -30,7 +30,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package aes
import (
"encoding/binary"
"git.gammaspectra.live/P2Pool/go-randomx/v2/keys"
"unsafe"
)
@ -48,20 +47,20 @@ import (
//
// Hashing throughput: >20 GiB/s per CPU core with hardware AES
func HashAes1Rx4(input []byte, output *[64]byte) {
if len(input)%64 != 0 {
panic("unsupported")
}
// states are copied
states := keys.AesHash1R_State
var in [4][4]uint32
for input_ptr := 0; input_ptr < len(input); input_ptr += 64 {
for i := 0; i < 63; i += 4 { // load 64 bytes
in[i/16][(i%16)/4] = binary.LittleEndian.Uint32(input[input_ptr+i:])
}
in := (*[4][4]uint32)(unsafe.Pointer(unsafe.SliceData(input[input_ptr:])))
soft_aesenc(&states[0], &in[0])
soft_aesdec(&states[1], &in[1])
soft_aesenc(&states[2], &in[2])
soft_aesdec(&states[3], &in[3])
}
soft_aesenc(&states[0], &keys.AesHash1R_XKeys[0])
@ -74,11 +73,7 @@ func HashAes1Rx4(input []byte, output *[64]byte) {
soft_aesenc(&states[2], &keys.AesHash1R_XKeys[1])
soft_aesdec(&states[3], &keys.AesHash1R_XKeys[1])
// write back to state
for i := 0; i < 63; i += 4 {
binary.LittleEndian.PutUint32(output[i:], states[i/16][(i%16)/4])
}
copy(output[:], (*[64]byte)(unsafe.Pointer(&states))[:])
}
// FillAes1Rx4
@ -110,15 +105,17 @@ func FillAes1Rx4(state *[64]byte, output []byte) {
}
// FillAes4Rx4 used to generate final program
func FillAes4Rx4(state *[64]byte, output []byte) {
var states [4][4]uint32
for i := 0; i < 63; i += 4 {
states[i/16][(i%16)/4] = binary.LittleEndian.Uint32(state[i:])
func FillAes4Rx4(state [64]byte, output []byte) {
if len(output)%len(state) != 0 {
panic("unsupported")
}
outptr := 0
for ; outptr < len(output); outptr += 64 {
// state is copied on caller
// Copy state
states := (*[4][4]uint32)(unsafe.Pointer(&state))
for outptr := 0; outptr < len(output); outptr += len(state) {
soft_aesdec(&states[0], &keys.AesGenerator4R_Keys[0])
soft_aesenc(&states[1], &keys.AesGenerator4R_Keys[0])
soft_aesdec(&states[2], &keys.AesGenerator4R_Keys[4])
@ -139,11 +136,7 @@ func FillAes4Rx4(state *[64]byte, output []byte) {
soft_aesdec(&states[2], &keys.AesGenerator4R_Keys[7])
soft_aesenc(&states[3], &keys.AesGenerator4R_Keys[7])
// store bytes to output buffer
for i := 0; i < 63; i += 4 {
binary.LittleEndian.PutUint32(output[outptr+i:], states[i/16][(i%16)/4])
}
copy(output[outptr:], state[:])
}
}

View file

@ -34,6 +34,10 @@ func Randomx_alloc_cache(flags uint64) *Randomx_Cache {
}
}
// HasJIT reports whether JIT execution is available for this cache: the
// RANDOMX_FLAG_JIT bit must be set in cache.Flags and the first slot of
// cache.JitPrograms must be non-nil.
func (cache *Randomx_Cache) HasJIT() bool {
	jitRequested := cache.Flags&RANDOMX_FLAG_JIT > 0
	if !jitRequested {
		// Flag not set: the program slots are not consulted at all.
		return false
	}
	return cache.JitPrograms[0] != nil
}
func (cache *Randomx_Cache) VM_Initialize() *VM {
return &VM{
@ -102,34 +106,45 @@ func (cache *Randomx_Cache) InitDatasetItem(rl *RegisterLine, itemNumber uint64)
rl[6] = rl[0] ^ keys.SuperScalar_Constants[6]
rl[7] = rl[0] ^ keys.SuperScalar_Constants[7]
if cache.JitPrograms[0] != nil {
for i := 0; i < RANDOMX_CACHE_ACCESSES; i++ {
mix := cache.GetMixBlock(registerValue)
for i := 0; i < RANDOMX_CACHE_ACCESSES; i++ {
mix := cache.GetMixBlock(registerValue)
cache.JitPrograms[i].Execute(rl)
program := cache.Programs[i]
for q := range rl {
rl[q] ^= mix[q]
}
registerValue = rl[cache.Programs[i].AddressRegister()]
executeSuperscalar(program.Program(), rl)
for q := range rl {
rl[q] ^= mix[q]
}
} else {
for i := 0; i < RANDOMX_CACHE_ACCESSES; i++ {
mix := cache.GetMixBlock(registerValue)
program := cache.Programs[i]
registerValue = rl[program.AddressRegister()]
executeSuperscalar(program.Program(), rl)
}
}
for q := range rl {
rl[q] ^= mix[q]
}
// InitDatasetItemJIT computes the dataset item for itemNumber into rl using the
// entries of cache.JitPrograms.
//
// rl is fully overwritten: rl[0] is seeded from itemNumber and
// keys.SuperScalar_Constants[0], and rl[1] through rl[7] are rl[0] XORed with
// the constant of the same index. Each of the RANDOMX_CACHE_ACCESSES rounds
// then fetches the mix block selected by the current register value, runs
// cache.JitPrograms[i] on rl, XORs the mix block into rl, and picks the next
// register value from the register named by cache.Programs[i].AddressRegister().
//
// Every cache.JitPrograms[i] is executed unconditionally, so callers are
// expected to check HasJIT first.
func (cache *Randomx_Cache) InitDatasetItemJIT(rl *RegisterLine, itemNumber uint64) {
	// The first mix block is selected by the item number itself; it must not
	// be replaced by a register read before the first GetMixBlock call.
	registerValue := itemNumber

	rl[0] = (itemNumber + 1) * keys.SuperScalar_Constants[0]
	for q := 1; q <= 7; q++ {
		rl[q] = rl[0] ^ keys.SuperScalar_Constants[q]
	}

	for i := 0; i < RANDOMX_CACHE_ACCESSES; i++ {
		// Fetch the mix block before executing: it depends on the register
		// value chosen at the end of the previous round.
		mix := cache.GetMixBlock(registerValue)

		cache.JitPrograms[i].Execute(rl)

		for q := range rl {
			rl[q] ^= mix[q]
		}

		// The next address is read after the mix block has been folded in.
		registerValue = rl[cache.Programs[i].AddressRegister()]
	}
}

View file

@ -85,6 +85,7 @@ const ScratchpadL1Mask16 = (ScratchpadL1/2 - 1) * 16
// ScratchpadL2Mask16 is (ScratchpadL2/2 - 1) * 16, so it is always a multiple of 16.
// NOTE(review): presumably ANDed with an address to keep it 16-byte aligned
// inside L2 — confirm at the use sites.
const ScratchpadL2Mask16 = (ScratchpadL2/2 - 1) * 16
// ScratchpadL3Mask is (ScratchpadL3 - 1) * 8, so it is always a multiple of 8.
const ScratchpadL3Mask = (ScratchpadL3 - 1) * 8
// ScratchpadL3Mask64 is (ScratchpadL3/8 - 1) * 64, so it is always a multiple
// of 64. VM.Run ANDs spAddr0 with it on every iteration.
const ScratchpadL3Mask64 = (ScratchpadL3/8 - 1) * 64
// CONDITIONOFFSET is an alias for RANDOMX_JUMP_OFFSET.
const CONDITIONOFFSET = RANDOMX_JUMP_OFFSET
// CONDITIONMASK has the low RANDOMX_JUMP_BITS bits set and all others clear.
const CONDITIONMASK = ((1 << RANDOMX_JUMP_BITS) - 1)
// NOTE(review): the meaning of 14 is not visible here; presumably the
// threshold used when choosing the L3 mask for ISTORE — confirm against the
// bytecode compiler.
const STOREL3CONDITION = 14

View file

@ -10,7 +10,11 @@ func (d *Randomx_DatasetLight) PrefetchDataset(address uint64) {
}
func (d *Randomx_DatasetLight) ReadDataset(address uint64, r, cache *RegisterLine) {
d.Cache.InitDatasetItem(cache, address/CacheLineSize)
if d.Cache.HasJIT() {
d.Cache.InitDatasetItemJIT(cache, address/CacheLineSize)
} else {
d.Cache.InitDatasetItem(cache, address/CacheLineSize)
}
for i := range r {
r[i] ^= cache[i]

View file

@ -8,6 +8,9 @@ import (
)
func (f ProgramFunc) Execute(rl *RegisterLine) {
if f == nil {
panic("program is nil")
}
memoryPtr := &f
fun := *(*func(rl *RegisterLine))(unsafe.Pointer(&memoryPtr))

45
vm.go
View file

@ -44,10 +44,10 @@ type REG struct {
}
type VM struct {
State_start [64]byte
buffer [RANDOMX_PROGRAM_SIZE*8 + 16*8]byte // first 128 bytes are entropy below rest are program bytes
Prog []byte
ScratchPad [ScratchpadSize]byte
StateStart [64]byte
buffer [RANDOMX_PROGRAM_SIZE*8 + 16*8]byte // first 128 bytes are entropy below rest are program bytes
Prog []byte
ScratchPad [ScratchpadSize]byte
ByteCode [RANDOMX_PROGRAM_SIZE]InstructionByteCode
@ -71,8 +71,8 @@ func MaskRegisterExponentMantissa(f float64, mode uint64) float64 {
}
// Config holds per-program values that VM.Run derives from vm.entropy.
//
// NOTE(review): as shown, eMask is declared twice and both the scalar
// readReg0..readReg3 fields and the readReg array are present. This looks like
// the before/after lines of a diff rendered without +/- markers — confirm which
// set the real file keeps.
type Config struct {
// eMask[LOW] and eMask[HIGH] are set from getFloatMask in Run and passed to
// MaskRegisterExponentMantissa.
eMask [2]uint64
// Indices into vm.reg.r; Run sets them to 0, 2, 4 and 6 plus one bit each
// taken from vm.entropy[12].
readReg0, readReg1, readReg2, readReg3 uint64
eMask [2]uint64
// Array form of the same four register indices: readReg[i] is i*2 plus one
// bit of vm.entropy[12].
readReg [4]uint64
}
type REGISTER_FILE struct {
@ -82,15 +82,14 @@ type REGISTER_FILE struct {
a [4][2]float64
}
// MemoryRegisters holds the mx and ma address values that VM.Run seeds from
// vm.entropy and updates on every program iteration.
//
// NOTE(review): as shown, mx and ma are declared twice; this looks like the
// before/after lines of a diff rendered without +/- markers — confirm which
// declarations the real file keeps.
type MemoryRegisters struct {
mx, ma uint64 // presumably mirrors a C++ declaration "addr_t mx, ma;" — confirm
mempry uint64 // presumably mirrors "uint8_t* memory = nullptr;"; not referenced in the code shown
mx, ma uint64
}
// LOW and HIGH index the two elements of a [2]float64 register pair
// (vm.reg.f[i], vm.reg.e[i], ibc.fdst, ibc.fsrc) and of Config.eMask.
const LOW = 0
const HIGH = 1
// calculate hash based on input
func (vm *VM) Run(input_hash *[64]byte) {
func (vm *VM) Run(input_hash [64]byte) {
//fmt.Printf("%x \n", input_hash)
@ -112,14 +111,13 @@ func (vm *VM) Run(input_hash *[64]byte) {
vm.mem.ma = vm.entropy[8] & CacheLineAlignMask
vm.mem.mx = vm.entropy[10]
addressRegisters := vm.entropy[12]
vm.config.readReg0 = 0 + (addressRegisters & 1)
addressRegisters >>= 1
vm.config.readReg1 = 2 + (addressRegisters & 1)
addressRegisters >>= 1
vm.config.readReg2 = 4 + (addressRegisters & 1)
addressRegisters >>= 1
vm.config.readReg3 = 6 + (addressRegisters & 1)
for i := range vm.config.readReg {
vm.config.readReg[i] = uint64(i*2) + (addressRegisters & 1)
addressRegisters >>= 1
}
vm.datasetOffset = (vm.entropy[13] % (DATASETEXTRAITEMS + 1)) * CacheLineSize
vm.config.eMask[LOW] = getFloatMask(vm.entropy[14])
vm.config.eMask[HIGH] = getFloatMask(vm.entropy[15])
@ -134,7 +132,7 @@ func (vm *VM) Run(input_hash *[64]byte) {
var rlCache RegisterLine
for ic := 0; ic < RANDOMX_PROGRAM_ITERATIONS; ic++ {
spMix := vm.reg.r[vm.config.readReg0] ^ vm.reg.r[vm.config.readReg1]
spMix := vm.reg.r[vm.config.readReg[0]] ^ vm.reg.r[vm.config.readReg[1]]
spAddr0 ^= spMix
spAddr0 &= ScratchpadL3Mask64
@ -146,21 +144,20 @@ func (vm *VM) Run(input_hash *[64]byte) {
}
for i := uint64(0); i < REGISTERCOUNTFLT; i++ {
vm.reg.f[i][LOW] = vm.Load32F(spAddr1 + 8*i)
vm.reg.f[i][HIGH] = vm.Load32F(spAddr1 + 8*i + 4)
vm.reg.f[i] = vm.Load32FA(spAddr1 + 8*i)
}
for i := uint64(0); i < REGISTERCOUNTFLT; i++ {
vm.reg.e[i][LOW] = vm.Load32F(spAddr1 + 8*(i+REGISTERCOUNTFLT))
vm.reg.e[i][HIGH] = vm.Load32F(spAddr1 + 8*(i+REGISTERCOUNTFLT) + 4)
vm.reg.e[i] = vm.Load32FA(spAddr1 + 8*(i+REGISTERCOUNTFLT))
vm.reg.e[i][LOW] = MaskRegisterExponentMantissa(vm.reg.e[i][LOW], vm.config.eMask[LOW])
vm.reg.e[i][HIGH] = MaskRegisterExponentMantissa(vm.reg.e[i][HIGH], vm.config.eMask[HIGH])
}
// todo: pass register file directly!
vm.InterpretByteCode()
vm.mem.mx ^= vm.reg.r[vm.config.readReg2] ^ vm.reg.r[vm.config.readReg3]
vm.mem.mx ^= vm.reg.r[vm.config.readReg[2]] ^ vm.reg.r[vm.config.readReg[3]]
vm.mem.mx &= CacheLineAlignMask
vm.Dataset.PrefetchDataset(vm.datasetOffset + vm.mem.mx)
@ -214,7 +211,7 @@ func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
hash512, _ := blake2b.New512(nil)
for chain := 0; chain < RANDOMX_PROGRAM_COUNT-1; chain++ {
vm.Run(&tempHash)
vm.Run(tempHash)
hash512.Reset()
for i := range vm.reg.r {
@ -247,7 +244,7 @@ func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
}
// final loop executes here
vm.Run(&tempHash)
vm.Run(tempHash)
// now hash the scratch pad and place into register a
aes.HashAes1Rx4(vm.ScratchPad[:], &tempHash)

View file

@ -30,11 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package randomx
import (
"fmt"
"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
"math"
"math/bits"
"unsafe"
)
import "math"
import "math/bits"
import "encoding/binary"
//reference https://github.com/tevador/RandomX/blob/master/doc/specs.md#51-instruction-encoding
@ -156,11 +156,11 @@ func (vm *VM) Compile_TO_Bytecode() {
ibc.idst = &vm.reg.r[dst]
if dst != RegisterNeedsDisplacement {
ibc.isrc = &vm.reg.r[src]
ibc.shift = uint16((instr.Mod() >> 2) % 4)
ibc.shift = (instr.Mod() >> 2) % 4
ibc.imm = 0
} else {
ibc.isrc = &vm.reg.r[src]
ibc.shift = uint16((instr.Mod() >> 2) % 4)
ibc.shift = (instr.Mod() >> 2) % 4
ibc.imm = signExtend2sCompl(instr.IMM())
}
registerUsage[dst] = i
@ -534,7 +534,7 @@ type InstructionByteCode struct {
simm int64
Opcode VM_Instruction_Type
target int16
shift uint16
shift uint8
memMask uint32
/*
union {
@ -563,179 +563,105 @@ func (ibc *InstructionByteCode) getScratchpadAddress() uint64 {
return (*ibc.isrc + ibc.imm) & uint64(ibc.memMask)
}
// Load64 decodes the 8 bytes of vm.ScratchPad starting at addr as a
// little-endian uint64. The slice expression and binary.LittleEndian.Uint64
// together panic if fewer than 8 bytes remain from addr.
func (vm *VM) Load64(addr uint64) uint64 {
return binary.LittleEndian.Uint64(vm.ScratchPad[addr:])
}
func (vm *VM) Load32(addr uint64) uint32 {
return binary.LittleEndian.Uint32(vm.ScratchPad[addr:])
func (ibc *InstructionByteCode) getScratchpadDestAddress() uint64 {
return (*ibc.idst + ibc.imm) & uint64(ibc.memMask)
}
func (vm *VM) Load32F(addr uint64) float64 {
return float64(int32(vm.Load32(addr)))
func (vm *VM) Load64(addr uint64) uint64 {
return *(*uint64)(unsafe.Pointer(&vm.ScratchPad[addr]))
}
// Load32 returns the 4 bytes of vm.ScratchPad starting at addr as a uint32.
//
// The bytes are reinterpreted in place through unsafe.Pointer, so the value is
// in the host's byte order and only the index addr itself is bounds-checked.
func (vm *VM) Load32(addr uint64) uint32 {
	word := (*uint32)(unsafe.Pointer(&vm.ScratchPad[addr]))
	return *word
}
// Load32F reads two consecutive int32 values from vm.ScratchPad starting at
// addr and returns each converted to float64: lo is element LOW of the pair,
// hi is element HIGH.
//
// The 8 bytes are reinterpreted in place through unsafe.Pointer, so they are
// read in the host's byte order and only the index addr itself is
// bounds-checked.
func (vm *VM) Load32F(addr uint64) (lo, hi float64) {
	pair := (*[2]int32)(unsafe.Pointer(&vm.ScratchPad[addr]))
	lo = float64(pair[LOW])
	hi = float64(pair[HIGH])
	return lo, hi
}
// Load32FA is the array-returning form of Load32F: it reads two consecutive
// int32 values from vm.ScratchPad starting at addr and returns them converted
// to float64, element LOW of the pair first and element HIGH second.
//
// The 8 bytes are reinterpreted in place through unsafe.Pointer, so they are
// read in the host's byte order and only the index addr itself is
// bounds-checked.
func (vm *VM) Load32FA(addr uint64) [2]float64 {
	pair := (*[2]int32)(unsafe.Pointer(&vm.ScratchPad[addr]))
	var out [2]float64
	out[0] = float64(pair[LOW])
	out[1] = float64(pair[HIGH])
	return out
}
// InterpretByteCode executes the compiled program in vm.ByteCode once,
// starting at index 0 and dispatching on each instruction's Opcode. Integer
// instructions operate through the ibc.idst/ibc.isrc pointers, floating-point
// instructions through the ibc.fdst/ibc.fsrc pairs, and memory operands are
// read from or written to vm.ScratchPad. An opcode without a case panics with
// "instruction not implemented".
//
// NOTE(review): as shown, several cases appear in two forms (duplicate
// `case VM_IMUL_R` and `case VM_FSCAL_R` labels, two assignments under
// VM_ISMULH_*, both the single-value and the two-value use of Load32F). This
// looks like the before/after lines of a diff rendered without +/- markers —
// confirm against the real file before relying on this listing.
func (vm *VM) InterpretByteCode() {
for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
ibc := &vm.ByteCode[pc]
//fmt.Printf("PCLOOP %d opcode %d %s dst %d src %d\n",pc,ibc.Opcode, Names[ibc.Opcode], ibc.dst, ibc.src)
switch ibc.Opcode {
case VM_IADD_RS:
*ibc.idst += (*ibc.isrc << ibc.shift) + ibc.imm
//panic("VM_IADD_RS")
case VM_IADD_M:
*ibc.idst += vm.Load64(ibc.getScratchpadAddress())
//panic("VM_IADD_M")
case VM_ISUB_R:
*ibc.idst -= *ibc.isrc
//panic("VM_ISUB_R")
case VM_ISUB_M:
*ibc.idst -= vm.Load64(ibc.getScratchpadAddress())
//panic("VM_ISUB_M")
case VM_IMUL_R: // also handles imul_rcp
case VM_IMUL_R:
// also handles imul_rcp
*ibc.idst *= *ibc.isrc
//panic("VM_IMUL_R")
case VM_IMUL_M:
*ibc.idst *= vm.Load64(ibc.getScratchpadAddress())
//panic("VM_IMUL_M")
case VM_IMULH_R:
// bits.Mul64 returns (hi, lo); only the high 64 bits of the product are kept.
*ibc.idst, _ = bits.Mul64(*ibc.idst, *ibc.isrc)
// panic("VM_IMULH_R")
case VM_IMULH_M:
*ibc.idst, _ = bits.Mul64(*ibc.idst, vm.Load64(ibc.getScratchpadAddress()))
// fmt.Printf("%x \n",*ibc.idst )
// panic("VM_IMULH_M")
case VM_ISMULH_R:
*ibc.idst = uint64(smulh(int64(*ibc.idst), int64(*ibc.isrc)))
// fmt.Printf("dst %x\n", *ibc.idst)
// panic("VM_ISMULH_R")
*ibc.idst = smulh(int64(*ibc.idst), int64(*ibc.isrc))
case VM_ISMULH_M:
*ibc.idst = uint64(smulh(int64(*ibc.idst), int64(vm.Load64(ibc.getScratchpadAddress()))))
//fmt.Printf("%x \n",*ibc.idst )
// panic("VM_ISMULH_M")
*ibc.idst = smulh(int64(*ibc.idst), int64(vm.Load64(ibc.getScratchpadAddress())))
case VM_INEG_R:
*ibc.idst = (^(*ibc.idst)) + 1 // two's-complement negation
//panic("VM_INEG_R")
case VM_IXOR_R:
*ibc.idst ^= *ibc.isrc
case VM_IXOR_M:
*ibc.idst ^= vm.Load64(ibc.getScratchpadAddress())
//panic("VM_IXOR_M")
case VM_IROR_R:
// A negative count makes bits.RotateLeft64 rotate right.
*ibc.idst = bits.RotateLeft64(*ibc.idst, 0-int(*ibc.isrc&63))
//panic("VM_IROR_R")
case VM_IROL_R:
*ibc.idst = bits.RotateLeft64(*ibc.idst, int(*ibc.isrc&63))
case VM_ISWAP_R:
*ibc.idst, *ibc.isrc = *ibc.isrc, *ibc.idst
//fmt.Printf("%x %x\n",*ibc.idst, *ibc.isrc )
//panic("VM_ISWAP_R")
case VM_FSWAP_R:
//TODO: could be F+E
ibc.fdst[HIGH], ibc.fdst[LOW] = ibc.fdst[LOW], ibc.fdst[HIGH]
// fmt.Printf("%+v \n",ibc.fdst )
// panic("VM_FSWAP_R")
case VM_FADD_R:
ibc.fdst[LOW] += ibc.fsrc[LOW]
ibc.fdst[HIGH] += ibc.fsrc[HIGH]
//panic("VM_FADD_R")
case VM_FADD_M:
ibc.fdst[LOW] += vm.Load32F(ibc.getScratchpadAddress() + 0)
ibc.fdst[HIGH] += vm.Load32F(ibc.getScratchpadAddress() + 4)
//panic("VM_FADD_M")
lo, hi := vm.Load32F(ibc.getScratchpadAddress())
ibc.fdst[LOW] += lo
ibc.fdst[HIGH] += hi
case VM_FSUB_R:
ibc.fdst[LOW] -= ibc.fsrc[LOW]
ibc.fdst[HIGH] -= ibc.fsrc[HIGH]
//fmt.Printf("fdst float %+v\n", ibc.fdst )
//panic("VM_FSUB_R")
case VM_FSUB_M:
ibc.fdst[LOW] -= vm.Load32F(ibc.getScratchpadAddress() + 0)
ibc.fdst[HIGH] -= vm.Load32F(ibc.getScratchpadAddress() + 4)
//panic("VM_FSUB_M")
case VM_FSCAL_R: // not dependent on rounding modes
//mask := math.Float64frombits(0x80F0000000000000)
lo, hi := vm.Load32F(ibc.getScratchpadAddress())
ibc.fdst[LOW] -= lo
ibc.fdst[HIGH] -= hi
case VM_FSCAL_R:
// not dependent on rounding modes
// XOR with 0x80F0000000000000 flips bit 63 (the sign) and bits 52-55 of the
// float64 bit pattern.
ibc.fdst[LOW] = math.Float64frombits(math.Float64bits(ibc.fdst[LOW]) ^ 0x80F0000000000000)
ibc.fdst[HIGH] = math.Float64frombits(math.Float64bits(ibc.fdst[HIGH]) ^ 0x80F0000000000000)
//fmt.Printf("fdst float %+v\n", ibc.fdst )
//panic("VM_FSCA_M")
case VM_FMUL_R:
ibc.fdst[LOW] *= ibc.fsrc[LOW]
ibc.fdst[HIGH] *= ibc.fsrc[HIGH]
//panic("VM_FMUL_R")
case VM_FDIV_M:
ibc.fdst[LOW] /= MaskRegisterExponentMantissa(vm.Load32F(ibc.getScratchpadAddress()+0), vm.config.eMask[LOW])
ibc.fdst[HIGH] /= MaskRegisterExponentMantissa(vm.Load32F(ibc.getScratchpadAddress()+4), vm.config.eMask[HIGH])
//panic("VM_FDIV_M")
lo, hi := vm.Load32F(ibc.getScratchpadAddress())
ibc.fdst[LOW] /= MaskRegisterExponentMantissa(lo, vm.config.eMask[LOW])
ibc.fdst[HIGH] /= MaskRegisterExponentMantissa(hi, vm.config.eMask[HIGH])
case VM_FSQRT_R:
ibc.fdst[LOW] = math.Sqrt(ibc.fdst[LOW])
ibc.fdst[HIGH] = math.Sqrt(ibc.fdst[HIGH])
// panic("VM_FSQRT")
case VM_CBRANCH:
//fmt.Printf("pc %d src %x imm %x\n",pc ,*ibc.isrc, ibc.imm)
*ibc.isrc += ibc.imm
//fmt.Printf("pc %d\n",pc)
// When the masked bits are all zero, jump: pc is set to target and the
// loop's pc++ then resumes execution at target+1.
if (*ibc.isrc & uint64(ibc.memMask)) == 0 {
pc = int(ibc.target)
}
// fmt.Printf("pc %d\n",pc)
//panic("VM_CBRANCH")
case VM_CFROUND:
// The two low bits of the right-rotated source select the rounding mode.
tmp := (bits.RotateLeft64(*ibc.isrc, 0-int(ibc.imm))) % 4 // rotate right
asm.SetRoundingMode(asm.RoundingMode(tmp))
//panic("round not implemented")
//panic("VM_CFROUND")
case VM_ISTORE:
// Stores *ibc.isrc as 8 little-endian bytes at the masked address idst+imm.
binary.LittleEndian.PutUint64(vm.ScratchPad[(*ibc.idst+ibc.imm)&uint64(ibc.memMask):], *ibc.isrc)
//panic("VM_ISTOREM")
case VM_NOP: // we do nothing
default:
panic("instruction not implemented")
}
/*fmt.Printf("REGS ")
for j := 0; j <7;j++ {
fmt.Printf("%16x, " , vm.reg.r[j])
}
fmt.Printf("\n")
*/
}
}
// umm888_ is not read anywhere in the code shown.
// NOTE(review): presumably it exists only to keep the "fmt" import referenced
// while the debug Printf calls are commented out — confirm before removing.
var umm888_ = fmt.Sprintf("")