Compare commits

...

2 commits

Author SHA1 Message Date
DataHoarder e4866b5bfd
Use direct register and scratchpad under bytecode execution
All checks were successful
continuous-integration/drone/push Build is passing
2024-04-15 02:14:01 +02:00
DataHoarder b72f79a653
Remove zero register from vm bytecode 2024-04-14 15:43:54 +02:00
6 changed files with 315 additions and 346 deletions

205
bytecode.go Normal file
View file

@ -0,0 +1,205 @@
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
"math"
"math/bits"
)
// ByteCodeInstruction is one pre-decoded RandomX instruction. The compiler
// packs differently-typed operands into the shared fields below, so the
// meaning of each field depends on Opcode (see the original C union kept in
// the comment at the bottom of the struct).
type ByteCodeInstruction struct {
	// dst and src are register indices. For VM_CBRANCH, dst instead carries
	// the low byte of the 16-bit jump target (see jumpTarget).
	dst, src byte
	// immB is the shift amount for VM_IADD_RS, or the high byte of the
	// jump target for VM_CBRANCH.
	immB uint8
	Opcode ByteCodeInstructionOp
	// memMask masks scratchpad addresses for memory operands; for
	// VM_CBRANCH it is the branch condition mask instead.
	memMask uint32
	// imm is the sign-extended immediate. For the *_MZ opcodes it holds a
	// pre-masked scratchpad address computed at compile time.
	imm uint64
	/*
	union {
	int_reg_t* idst;
	rx_vec_f128* fdst;
	};
	union {
	int_reg_t* isrc;
	rx_vec_f128* fsrc;
	};
	union {
	uint64_t imm;
	int64_t simm;
	};
	InstructionType type;
	union {
	int16_t target;
	uint16_t shift;
	};
	uint32_t memMask;
	*/
}
// jumpTarget reconstructs the signed 16-bit branch target that the compiler
// split across dst (low byte) and immB (high byte).
func (i ByteCodeInstruction) jumpTarget() int {
	raw := uint16(i.immB)<<8 | uint16(i.dst)
	return int(int16(raw))
}
// getScratchpadAddress computes the scratchpad offset of a memory operand:
// (ptr + imm) truncated to 32 bits and clipped to the level mask.
func (i ByteCodeInstruction) getScratchpadAddress(ptr uint64) uint32 {
	addr := ptr + i.imm
	return uint32(addr) & i.memMask
}
// getScratchpadZeroAddress is getScratchpadAddress with a zero register
// operand; only the immediate contributes to the address.
func (i ByteCodeInstruction) getScratchpadZeroAddress() uint32 {
	return i.memMask & uint32(i.imm)
}
// ByteCode is a compiled RandomX program: a fixed-size array of pre-decoded
// instructions run sequentially by Execute.
type ByteCode [RANDOMX_PROGRAM_SIZE]ByteCodeInstruction

// Execute interprets the whole program against register file f and the given
// scratchpad, returning the resulting register file. f is passed and
// returned by value, so the caller's registers are only updated through the
// return value; the scratchpad is read and written in place.
//
// eMask holds the two per-lane masks (LOW, HIGH) applied to the memory
// operand of VM_FDIV_M.
func (c *ByteCode) Execute(f RegisterFile, pad *ScratchPad, eMask [2]uint64) RegisterFile {
	for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
		ibc := &c[pc]
		switch ibc.Opcode {
		case VM_IADD_RS:
			// immB carries the shift amount for this opcode.
			f.r[ibc.dst] += (f.r[ibc.src] << ibc.immB) + ibc.imm
		case VM_IADD_M:
			f.r[ibc.dst] += pad.Load64(ibc.getScratchpadAddress(f.r[ibc.src]))
		case VM_IADD_MZ:
			// *_MZ variants: imm already holds the masked scratchpad
			// address, computed at compile time.
			f.r[ibc.dst] += pad.Load64(uint32(ibc.imm))
		case VM_ISUB_R:
			f.r[ibc.dst] -= f.r[ibc.src]
		case VM_ISUB_I:
			f.r[ibc.dst] -= ibc.imm
		case VM_ISUB_M:
			f.r[ibc.dst] -= pad.Load64(ibc.getScratchpadAddress(f.r[ibc.src]))
		case VM_ISUB_MZ:
			f.r[ibc.dst] -= pad.Load64(uint32(ibc.imm))
		case VM_IMUL_R:
			f.r[ibc.dst] *= f.r[ibc.src]
		case VM_IMUL_I:
			// also handles imul_rcp
			f.r[ibc.dst] *= ibc.imm
		case VM_IMUL_M:
			f.r[ibc.dst] *= pad.Load64(ibc.getScratchpadAddress(f.r[ibc.src]))
		case VM_IMUL_MZ:
			f.r[ibc.dst] *= pad.Load64(uint32(ibc.imm))
		case VM_IMULH_R:
			// keep the high 64 bits of the unsigned 128-bit product
			f.r[ibc.dst], _ = bits.Mul64(f.r[ibc.dst], f.r[ibc.src])
		case VM_IMULH_M:
			f.r[ibc.dst], _ = bits.Mul64(f.r[ibc.dst], pad.Load64(ibc.getScratchpadAddress(f.r[ibc.src])))
		case VM_IMULH_MZ:
			f.r[ibc.dst], _ = bits.Mul64(f.r[ibc.dst], pad.Load64(uint32(ibc.imm)))
		case VM_ISMULH_R:
			// high 64 bits of the signed 128-bit product
			f.r[ibc.dst] = smulh(int64(f.r[ibc.dst]), int64(f.r[ibc.src]))
		case VM_ISMULH_M:
			f.r[ibc.dst] = smulh(int64(f.r[ibc.dst]), int64(pad.Load64(ibc.getScratchpadAddress(f.r[ibc.src]))))
		case VM_ISMULH_MZ:
			f.r[ibc.dst] = smulh(int64(f.r[ibc.dst]), int64(pad.Load64(uint32(ibc.imm))))
		case VM_INEG_R:
			//f.r[ibc.dst] = (^(f.r[ibc.dst])) + 1 // 2's complement negative
			f.r[ibc.dst] = -f.r[ibc.dst]
		case VM_IXOR_R:
			f.r[ibc.dst] ^= f.r[ibc.src]
		case VM_IXOR_I:
			f.r[ibc.dst] ^= ibc.imm
		case VM_IXOR_M:
			f.r[ibc.dst] ^= pad.Load64(ibc.getScratchpadAddress(f.r[ibc.src]))
		case VM_IXOR_MZ:
			f.r[ibc.dst] ^= pad.Load64(uint32(ibc.imm))
		case VM_IROR_R:
			// rotate right = rotate left by the negated count
			f.r[ibc.dst] = bits.RotateLeft64(f.r[ibc.dst], 0-int(f.r[ibc.src]&63))
		case VM_IROR_I:
			//todo: can merge into VM_IROL_I
			f.r[ibc.dst] = bits.RotateLeft64(f.r[ibc.dst], 0-int(ibc.imm&63))
		case VM_IROL_R:
			f.r[ibc.dst] = bits.RotateLeft64(f.r[ibc.dst], int(f.r[ibc.src]&63))
		case VM_IROL_I:
			f.r[ibc.dst] = bits.RotateLeft64(f.r[ibc.dst], int(ibc.imm&63))
		case VM_ISWAP_R:
			f.r[ibc.dst], f.r[ibc.src] = f.r[ibc.src], f.r[ibc.dst]
		case VM_FSWAP_RF:
			// swap the two lanes of an f register
			f.f[ibc.dst][HIGH], f.f[ibc.dst][LOW] = f.f[ibc.dst][LOW], f.f[ibc.dst][HIGH]
		case VM_FSWAP_RE:
			// swap the two lanes of an e register
			f.e[ibc.dst][HIGH], f.e[ibc.dst][LOW] = f.e[ibc.dst][LOW], f.e[ibc.dst][HIGH]
		case VM_FADD_R:
			f.f[ibc.dst][LOW] += f.a[ibc.src][LOW]
			f.f[ibc.dst][HIGH] += f.a[ibc.src][HIGH]
		case VM_FADD_M:
			lo, hi := pad.Load32F(ibc.getScratchpadAddress(f.r[ibc.src]))
			f.f[ibc.dst][LOW] += lo
			f.f[ibc.dst][HIGH] += hi
		case VM_FSUB_R:
			f.f[ibc.dst][LOW] -= f.a[ibc.src][LOW]
			f.f[ibc.dst][HIGH] -= f.a[ibc.src][HIGH]
		case VM_FSUB_M:
			lo, hi := pad.Load32F(ibc.getScratchpadAddress(f.r[ibc.src]))
			f.f[ibc.dst][LOW] -= lo
			f.f[ibc.dst][HIGH] -= hi
		case VM_FSCAL_R:
			// not dependent on rounding modes: pure sign/exponent bit flip
			f.f[ibc.dst][LOW] = math.Float64frombits(math.Float64bits(f.f[ibc.dst][LOW]) ^ 0x80F0000000000000)
			f.f[ibc.dst][HIGH] = math.Float64frombits(math.Float64bits(f.f[ibc.dst][HIGH]) ^ 0x80F0000000000000)
		case VM_FMUL_R:
			f.e[ibc.dst][LOW] *= f.a[ibc.src][LOW]
			f.e[ibc.dst][HIGH] *= f.a[ibc.src][HIGH]
		case VM_FDIV_M:
			lo, hi := pad.Load32F(ibc.getScratchpadAddress(f.r[ibc.src]))
			f.e[ibc.dst][LOW] /= MaskRegisterExponentMantissa(lo, eMask[LOW])
			f.e[ibc.dst][HIGH] /= MaskRegisterExponentMantissa(hi, eMask[HIGH])
		case VM_FSQRT_R:
			f.e[ibc.dst][LOW] = math.Sqrt(f.e[ibc.dst][LOW])
			f.e[ibc.dst][HIGH] = math.Sqrt(f.e[ibc.dst][HIGH])
		case VM_CBRANCH:
			// memMask holds the condition mask; the jump target was packed
			// into dst/immB at compile time (see jumpTarget).
			f.r[ibc.src] += ibc.imm
			if (f.r[ibc.src] & uint64(ibc.memMask)) == 0 {
				pc = ibc.jumpTarget()
			}
		case VM_CFROUND:
			tmp := (bits.RotateLeft64(f.r[ibc.src], 0-int(ibc.imm))) % 4 // rotate right
			asm.SetRoundingMode(asm.RoundingMode(tmp))
		case VM_ISTORE:
			pad.Store64(ibc.getScratchpadAddress(f.r[ibc.dst]), f.r[ibc.src])
		case VM_NOP: // we do nothing
		}
	}
	return f
}
// ByteCodeInstructionOp identifies the operation of a pre-decoded
// ByteCodeInstruction; see ByteCode.Execute for the semantics of each.
type ByteCodeInstructionOp int

// NOTE: values come from iota, so the order below is significant.
const (
	VM_NOP = ByteCodeInstructionOp(iota)
	// integer ops; *_I variants use the immediate as source, *_MZ variants
	// use a pre-masked scratchpad address stored in imm.
	VM_IADD_RS
	VM_IADD_M
	VM_IADD_MZ
	VM_ISUB_R
	VM_ISUB_I
	VM_ISUB_M
	VM_ISUB_MZ
	VM_IMUL_R
	VM_IMUL_I
	VM_IMUL_M
	VM_IMUL_MZ
	VM_IMULH_R
	VM_IMULH_M
	VM_IMULH_MZ
	VM_ISMULH_R
	VM_ISMULH_M
	VM_ISMULH_MZ
	VM_INEG_R
	VM_IXOR_R
	VM_IXOR_I
	VM_IXOR_M
	VM_IXOR_MZ
	VM_IROR_R
	VM_IROR_I
	VM_IROL_R
	VM_IROL_I
	VM_ISWAP_R
	// floating-point ops; _RF targets the f registers, _RE the e registers.
	VM_FSWAP_RF
	VM_FSWAP_RE
	VM_FADD_R
	VM_FADD_M
	VM_FSUB_R
	VM_FSUB_M
	VM_FSCAL_R
	VM_FMUL_R
	VM_FDIV_M
	VM_FSQRT_R
	// control and store ops
	VM_CBRANCH
	VM_CFROUND
	VM_ISTORE
)

View file

@ -87,12 +87,9 @@ const ScratchpadL3Mask = (ScratchpadL3 - 1) * 8
const ScratchpadL3Mask64 = (ScratchpadL3/8 - 1) * 64
const CONDITIONOFFSET = RANDOMX_JUMP_OFFSET
const CONDITIONMASK = ((1 << RANDOMX_JUMP_BITS) - 1)
const CONDITIONMASK = (1 << RANDOMX_JUMP_BITS) - 1
const STOREL3CONDITION = 14
const REGISTERSCOUNT = 8
const REGISTERCOUNTFLT = 4
const mantissaSize = 52
const exponentSize = 11
const mantissaMask = (uint64(1) << mantissaSize) - 1

View file

@ -1,3 +1,17 @@
package randomx
type RegisterLine [REGISTERSCOUNT]uint64
const RegistersCount = 8
const RegistersCountFloat = 4
type RegisterLine [RegistersCount]uint64
type RegisterFile struct {
r RegisterLine
f [RegistersCountFloat][2]float64
e [RegistersCountFloat][2]float64
a [RegistersCountFloat][2]float64
}
type MemoryRegisters struct {
mx, ma uint64
}

View file

@ -85,8 +85,8 @@ func generateSuperscalarCode(scalarProgram SuperScalarProgram) ProgramFunc {
for i := range p {
instr := &p[i]
dst := instr.Dst_Reg % REGISTERSCOUNT
src := instr.Src_Reg % REGISTERSCOUNT
dst := instr.Dst_Reg % RegistersCount
src := instr.Src_Reg % RegistersCount
switch instr.Opcode {
case S_ISUB_R:

46
vm.go
View file

@ -47,15 +47,15 @@ type VM struct {
StateStart [64]byte
buffer [RANDOMX_PROGRAM_SIZE*8 + 16*8]byte // first 128 bytes are entropy below rest are program bytes
Prog []byte
ScratchPad [ScratchpadSize]byte
ScratchPad ScratchPad
ByteCode [RANDOMX_PROGRAM_SIZE]InstructionByteCode
ByteCode ByteCode
// program configuration see program.hpp
entropy [16]uint64
reg REGISTER_FILE // the register file
reg RegisterFile // the register file
mem MemoryRegisters
config Config // configuration
datasetOffset uint64
@ -75,16 +75,6 @@ type Config struct {
readReg [4]uint64
}
type REGISTER_FILE struct {
r RegisterLine
f [4][2]float64
e [4][2]float64
a [4][2]float64
}
type MemoryRegisters struct {
mx, ma uint64
}
const LOW = 0
const HIGH = 1
@ -120,7 +110,7 @@ func (vm *VM) Run(input_hash [64]byte) {
vm.config.eMask[LOW] = getFloatMask(vm.entropy[14])
vm.config.eMask[HIGH] = getFloatMask(vm.entropy[15])
vm.Compile_TO_Bytecode()
vm.CompileToBytecode()
spAddr0 := vm.mem.mx
spAddr1 := vm.mem.ma
@ -135,23 +125,23 @@ func (vm *VM) Run(input_hash [64]byte) {
spAddr1 ^= spMix >> 32
spAddr1 &= ScratchpadL3Mask64
for i := uint64(0); i < REGISTERSCOUNT; i++ {
vm.reg.r[i] ^= vm.Load64(spAddr0 + 8*i)
for i := uint64(0); i < RegistersCount; i++ {
vm.reg.r[i] ^= vm.ScratchPad.Load64(uint32(spAddr0 + 8*i))
}
for i := uint64(0); i < REGISTERCOUNTFLT; i++ {
vm.reg.f[i] = vm.Load32FA(spAddr1 + 8*i)
for i := uint64(0); i < RegistersCountFloat; i++ {
vm.reg.f[i] = vm.ScratchPad.Load32FA(uint32(spAddr1 + 8*i))
}
for i := uint64(0); i < REGISTERCOUNTFLT; i++ {
vm.reg.e[i] = vm.Load32FA(spAddr1 + 8*(i+REGISTERCOUNTFLT))
for i := uint64(0); i < RegistersCountFloat; i++ {
vm.reg.e[i] = vm.ScratchPad.Load32FA(uint32(spAddr1 + 8*(i+RegistersCountFloat)))
vm.reg.e[i][LOW] = MaskRegisterExponentMantissa(vm.reg.e[i][LOW], vm.config.eMask[LOW])
vm.reg.e[i][HIGH] = MaskRegisterExponentMantissa(vm.reg.e[i][HIGH], vm.config.eMask[HIGH])
}
// todo: pass register file directly!
vm.InterpretByteCode()
vm.reg = vm.ByteCode.Execute(vm.reg, &vm.ScratchPad, vm.config.eMask)
vm.mem.mx ^= vm.reg.r[vm.config.readReg[2]] ^ vm.reg.r[vm.config.readReg[3]]
vm.mem.mx &= CacheLineAlignMask
@ -163,16 +153,16 @@ func (vm *VM) Run(input_hash [64]byte) {
// swap the elements
vm.mem.mx, vm.mem.ma = vm.mem.ma, vm.mem.mx
for i := uint64(0); i < REGISTERSCOUNT; i++ {
binary.LittleEndian.PutUint64(vm.ScratchPad[spAddr1+8*i:], vm.reg.r[i])
for i := uint64(0); i < RegistersCount; i++ {
vm.ScratchPad.Store64(uint32(spAddr1+8*i), vm.reg.r[i])
}
for i := uint64(0); i < REGISTERCOUNTFLT; i++ {
for i := uint64(0); i < RegistersCountFloat; i++ {
vm.reg.f[i][LOW] = math.Float64frombits(math.Float64bits(vm.reg.f[i][LOW]) ^ math.Float64bits(vm.reg.e[i][LOW]))
vm.reg.f[i][HIGH] = math.Float64frombits(math.Float64bits(vm.reg.f[i][HIGH]) ^ math.Float64bits(vm.reg.e[i][HIGH]))
binary.LittleEndian.PutUint64(vm.ScratchPad[spAddr0+16*i:], math.Float64bits(vm.reg.f[i][LOW]))
binary.LittleEndian.PutUint64(vm.ScratchPad[spAddr0+16*i+8:], math.Float64bits(vm.reg.f[i][HIGH]))
vm.ScratchPad.Store64(uint32(spAddr0+16*i), math.Float64bits(vm.reg.f[i][LOW]))
vm.ScratchPad.Store64(uint32(spAddr0+16*i+8), math.Float64bits(vm.reg.f[i][HIGH]))
}
spAddr0 = 0
@ -183,9 +173,7 @@ func (vm *VM) Run(input_hash [64]byte) {
}
func (vm *VM) InitScratchpad(seed *[64]byte) {
// calculate and fill scratchpad
clear(vm.ScratchPad[:])
aes.FillAes1Rx4(seed, vm.ScratchPad[:])
vm.ScratchPad.Init(seed)
}
func (vm *VM) CalculateHash(input []byte, output *[32]byte) {

View file

@ -30,17 +30,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
"math"
"math/bits"
"git.gammaspectra.live/P2Pool/go-randomx/v2/aes"
"unsafe"
)
import "encoding/binary"
//reference https://github.com/tevador/RandomX/blob/master/doc/specs.md#51-instruction-encoding
var Zero uint64 = 0
// since go does not have union, use byte array
type VM_Instruction []byte // it is hardcode 8 bytes
@ -60,80 +56,11 @@ func (ins VM_Instruction) Opcode() byte {
return ins[0]
}
type VM_Instruction_Type int
const (
VM_IADD_RS VM_Instruction_Type = 0
VM_IADD_M VM_Instruction_Type = 1
VM_ISUB_R VM_Instruction_Type = 2
VM_ISUB_M VM_Instruction_Type = 3
VM_IMUL_R VM_Instruction_Type = 4
VM_IMUL_M VM_Instruction_Type = 5
VM_IMULH_R VM_Instruction_Type = 6
VM_IMULH_M VM_Instruction_Type = 7
VM_ISMULH_R VM_Instruction_Type = 8
VM_ISMULH_M VM_Instruction_Type = 9
VM_IMUL_RCP VM_Instruction_Type = 10
VM_INEG_R VM_Instruction_Type = 11
VM_IXOR_R VM_Instruction_Type = 12
VM_IXOR_M VM_Instruction_Type = 13
VM_IROR_R VM_Instruction_Type = 14
VM_IROL_R VM_Instruction_Type = 15
VM_ISWAP_R VM_Instruction_Type = 16
VM_FSWAP_R VM_Instruction_Type = 17
VM_FADD_R VM_Instruction_Type = 18
VM_FADD_M VM_Instruction_Type = 19
VM_FSUB_R VM_Instruction_Type = 20
VM_FSUB_M VM_Instruction_Type = 21
VM_FSCAL_R VM_Instruction_Type = 22
VM_FMUL_R VM_Instruction_Type = 23
VM_FDIV_M VM_Instruction_Type = 24
VM_FSQRT_R VM_Instruction_Type = 25
VM_CBRANCH VM_Instruction_Type = 26
VM_CFROUND VM_Instruction_Type = 27
VM_ISTORE VM_Instruction_Type = 28
VM_NOP VM_Instruction_Type = 29
)
var Names = map[VM_Instruction_Type]string{
VM_IADD_RS: "VM_IADD_RS",
VM_IADD_M: "VM_IADD_M",
VM_ISUB_R: "VM_ISUB_R",
VM_ISUB_M: "VM_ISUB_M",
VM_IMUL_R: "VM_IMUL_R",
VM_IMUL_M: "VM_IMUL_M",
VM_IMULH_R: "VM_IMULH_R",
VM_IMULH_M: "VM_IMULH_M",
VM_ISMULH_R: "VM_ISMULH_R",
VM_ISMULH_M: "VM_ISMULH_M",
VM_IMUL_RCP: "VM_IMUL_RCP",
VM_INEG_R: "VM_INEG_R",
VM_IXOR_R: "VM_IXOR_R",
VM_IXOR_M: "VM_IXOR_M",
VM_IROR_R: "VM_IROR_R",
VM_IROL_R: "VM_IROL_R",
VM_ISWAP_R: "VM_ISWAP_R",
VM_FSWAP_R: "VM_FSWAP_R",
VM_FADD_R: "VM_FADD_R",
VM_FADD_M: "VM_FADD_M",
VM_FSUB_R: "VM_FSUB_R",
VM_FSUB_M: "VM_FSUB_M",
VM_FSCAL_R: "VM_FSCAL_R",
VM_FMUL_R: "VM_FMUL_R",
VM_FDIV_M: "VM_FDIV_M",
VM_FSQRT_R: "VM_FSQRT_R",
VM_CBRANCH: "VM_CBRANCH",
VM_CFROUND: "VM_CFROUND",
VM_ISTORE: "VM_ISTORE",
VM_NOP: "VM_NOP",
}
// this will interpret single vm instruction
// CompileToBytecode this will interpret single vm instruction
// reference https://github.com/tevador/RandomX/blob/master/doc/specs.md#52-integer-instructions
func (vm *VM) Compile_TO_Bytecode() {
func (vm *VM) CompileToBytecode() {
var registerUsage [REGISTERSCOUNT]int
var registerUsage [RegistersCount]int
for i := range registerUsage {
registerUsage[i] = -1
}
@ -143,146 +70,126 @@ func (vm *VM) Compile_TO_Bytecode() {
ibc := &vm.ByteCode[i]
opcode := instr.Opcode()
dst := instr.Dst() % REGISTERSCOUNT // bit shift optimization
src := instr.Src() % REGISTERSCOUNT
dst := instr.Dst() % RegistersCount // bit shift optimization
src := instr.Src() % RegistersCount
ibc.dst = dst
ibc.src = src
switch opcode {
case 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15: // 16 frequency
ibc.Opcode = VM_IADD_RS
ibc.idst = &vm.reg.r[dst]
if dst != RegisterNeedsDisplacement {
ibc.isrc = &vm.reg.r[src]
ibc.shift = (instr.Mod() >> 2) % 4
//shift
ibc.immB = (instr.Mod() >> 2) % 4
ibc.imm = 0
} else {
ibc.isrc = &vm.reg.r[src]
ibc.shift = (instr.Mod() >> 2) % 4
//shift
ibc.immB = (instr.Mod() >> 2) % 4
ibc.imm = signExtend2sCompl(instr.IMM())
}
registerUsage[dst] = i
case 16, 17, 18, 19, 20, 21, 22: // 7
ibc.Opcode = VM_IADD_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
}
} else {
ibc.isrc = &Zero
ibc.Opcode = VM_IADD_MZ
ibc.memMask = ScratchpadL3Mask
ibc.imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38: // 16
ibc.Opcode = VM_ISUB_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
if src == dst {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
ibc.Opcode = VM_ISUB_I
}
registerUsage[dst] = i
case 39, 40, 41, 42, 43, 44, 45: // 7
ibc.Opcode = VM_ISUB_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
}
} else {
ibc.isrc = &Zero
ibc.Opcode = VM_ISUB_MZ
ibc.memMask = ScratchpadL3Mask
ibc.imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61: // 16
ibc.Opcode = VM_IMUL_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
if src == dst {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
ibc.Opcode = VM_IMUL_I
}
registerUsage[dst] = i
case 62, 63, 64, 65: //4
ibc.Opcode = VM_IMUL_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
}
} else {
ibc.isrc = &Zero
ibc.Opcode = VM_IMUL_MZ
ibc.memMask = ScratchpadL3Mask
ibc.imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 66, 67, 68, 69: //4
ibc.Opcode = VM_IMULH_R
ibc.idst = &vm.reg.r[dst]
ibc.isrc = &vm.reg.r[src]
registerUsage[dst] = i
case 70: //1
ibc.Opcode = VM_IMULH_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
}
} else {
ibc.isrc = &Zero
ibc.Opcode = VM_IMULH_MZ
ibc.memMask = ScratchpadL3Mask
ibc.imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 71, 72, 73, 74: //4
ibc.Opcode = VM_ISMULH_R
ibc.idst = &vm.reg.r[dst]
ibc.isrc = &vm.reg.r[src]
registerUsage[dst] = i
case 75: //1
ibc.Opcode = VM_ISMULH_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
}
} else {
ibc.isrc = &Zero
ibc.Opcode = VM_ISMULH_MZ
ibc.memMask = ScratchpadL3Mask
ibc.imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 76, 77, 78, 79, 80, 81, 82, 83: // 8
divisor := instr.IMM()
if !isZeroOrPowerOf2(divisor) {
ibc.Opcode = VM_IMUL_R
ibc.idst = &vm.reg.r[dst]
ibc.Opcode = VM_IMUL_I
ibc.imm = randomx_reciprocal(divisor)
ibc.isrc = &ibc.imm
registerUsage[dst] = i
} else {
ibc.Opcode = VM_NOP
@ -290,66 +197,49 @@ func (vm *VM) Compile_TO_Bytecode() {
case 84, 85: //2
ibc.Opcode = VM_INEG_R
ibc.idst = &vm.reg.r[dst]
registerUsage[dst] = i
case 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100: //15
ibc.Opcode = VM_IXOR_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
if src == dst {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
ibc.Opcode = VM_IXOR_I
}
registerUsage[dst] = i
case 101, 102, 103, 104, 105: //5
ibc.Opcode = VM_IXOR_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
}
} else {
ibc.isrc = &Zero
ibc.Opcode = VM_IXOR_MZ
ibc.memMask = ScratchpadL3Mask
ibc.imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 106, 107, 108, 109, 110, 111, 112, 113: //8
ibc.Opcode = VM_IROR_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
if src == dst {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
ibc.Opcode = VM_IROR_I
}
registerUsage[dst] = i
case 114, 115: // 2 IROL_R
ibc.Opcode = VM_IROL_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
if src == dst {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
ibc.Opcode = VM_IROL_I
}
registerUsage[dst] = i
case 116, 117, 118, 119: //4
if src != dst {
ibc.Opcode = VM_ISWAP_R
ibc.idst = &vm.reg.r[dst]
ibc.isrc = &vm.reg.r[src]
registerUsage[dst] = i
registerUsage[src] = i
} else {
@ -359,24 +249,21 @@ func (vm *VM) Compile_TO_Bytecode() {
// below are floating point instructions
case 120, 121, 122, 123: // 4
ibc.Opcode = VM_FSWAP_R
if dst < REGISTERCOUNTFLT {
ibc.fdst = &vm.reg.f[dst]
//ibc.Opcode = VM_FSWAP_R
if dst < RegistersCountFloat {
ibc.Opcode = VM_FSWAP_RF
} else {
ibc.fdst = &vm.reg.e[dst-REGISTERCOUNTFLT]
ibc.Opcode = VM_FSWAP_RE
ibc.dst = dst - RegistersCountFloat
}
case 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139: //16
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
src := instr.Src() % REGISTERCOUNTFLT
ibc.dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.src = instr.Src() % RegistersCountFloat
ibc.Opcode = VM_FADD_R
ibc.fdst = &vm.reg.f[dst]
ibc.fsrc = &vm.reg.a[src]
case 140, 141, 142, 143, 144: //5
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
ibc.dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FADD_M
ibc.fdst = &vm.reg.f[dst]
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
} else {
@ -385,16 +272,12 @@ func (vm *VM) Compile_TO_Bytecode() {
ibc.imm = signExtend2sCompl(instr.IMM())
case 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160: //16
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
src := instr.Src() % REGISTERCOUNTFLT
ibc.dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.src = instr.Src() % RegistersCountFloat
ibc.Opcode = VM_FSUB_R
ibc.fdst = &vm.reg.f[dst]
ibc.fsrc = &vm.reg.a[src]
case 161, 162, 163, 164, 165: //5
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
ibc.dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FSUB_M
ibc.fdst = &vm.reg.f[dst]
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
} else {
@ -403,20 +286,15 @@ func (vm *VM) Compile_TO_Bytecode() {
ibc.imm = signExtend2sCompl(instr.IMM())
case 166, 167, 168, 169, 170, 171: //6
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
ibc.dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FSCAL_R
ibc.fdst = &vm.reg.f[dst]
case 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203: //32
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
src := instr.Src() % REGISTERCOUNTFLT
ibc.dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.src = instr.Src() % RegistersCountFloat
ibc.Opcode = VM_FMUL_R
ibc.fdst = &vm.reg.e[dst]
ibc.fsrc = &vm.reg.a[src]
case 204, 205, 206, 207: //4
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
ibc.dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FDIV_M
ibc.fdst = &vm.reg.e[dst]
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
} else {
@ -424,15 +302,17 @@ func (vm *VM) Compile_TO_Bytecode() {
}
ibc.imm = signExtend2sCompl(instr.IMM())
case 208, 209, 210, 211, 212, 213: //6
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
ibc.dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FSQRT_R
ibc.fdst = &vm.reg.e[dst]
case 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238: //25 // CBRANCH and CFROUND are interchanged
ibc.Opcode = VM_CBRANCH
reg := instr.Dst() % REGISTERSCOUNT
ibc.isrc = &vm.reg.r[reg]
ibc.target = int16(registerUsage[reg])
ibc.src = instr.Dst() % RegistersCount
target := uint16(int16(registerUsage[ibc.src]))
ibc.dst = uint8(target)
ibc.immB = uint8(target >> 8)
shift := uint64(instr.Mod()>>4) + CONDITIONOFFSET
//conditionmask := CONDITIONMASK << shift
ibc.imm = signExtend2sCompl(instr.IMM()) | (uint64(1) << shift)
@ -441,19 +321,16 @@ func (vm *VM) Compile_TO_Bytecode() {
}
ibc.memMask = CONDITIONMASK << shift
for j := 0; j < REGISTERSCOUNT; j++ {
for j := 0; j < RegistersCount; j++ {
registerUsage[j] = i
}
case 239: //1
ibc.Opcode = VM_CFROUND
ibc.isrc = &vm.reg.r[src]
ibc.imm = uint64(instr.IMM() & 63)
case 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255: //16
ibc.Opcode = VM_ISTORE
ibc.idst = &vm.reg.r[dst]
ibc.isrc = &vm.reg.r[src]
ibc.imm = signExtend2sCompl(instr.IMM())
if (instr.Mod() >> 4) < STOREL3CONDITION {
if (instr.Mod() % 4) != 0 {
@ -474,142 +351,30 @@ func (vm *VM) Compile_TO_Bytecode() {
}
type InstructionByteCode struct {
dst, src byte
idst, isrc *uint64
fdst, fsrc *[2]float64
imm uint64
simm int64
Opcode VM_Instruction_Type
target int16
shift uint8
memMask uint32
/*
union {
int_reg_t* idst;
rx_vec_f128* fdst;
};
union {
int_reg_t* isrc;
rx_vec_f128* fsrc;
};
union {
uint64_t imm;
int64_t simm;
};
InstructionType type;
union {
int16_t target;
uint16_t shift;
};
uint32_t memMask;
*/
type ScratchPad [ScratchpadSize]byte
func (pad *ScratchPad) Init(seed *[64]byte) {
// calculate and fill scratchpad
clear(pad[:])
aes.FillAes1Rx4(seed, pad[:])
}
func (pad *ScratchPad) Store64(addr uint32, val uint64) {
*(*uint64)(unsafe.Pointer(&pad[addr])) = val
//binary.LittleEndian.PutUint64(pad[addr:], val)
}
func (pad *ScratchPad) Load64(addr uint32) uint64 {
return *(*uint64)(unsafe.Pointer(&pad[addr]))
}
func (pad *ScratchPad) Load32(addr uint32) uint32 {
return *(*uint32)(unsafe.Pointer(&pad[addr]))
}
func (ibc *InstructionByteCode) getScratchpadAddress() uint64 {
return (*ibc.isrc + ibc.imm) & uint64(ibc.memMask)
}
func (ibc *InstructionByteCode) getScratchpadDestAddress() uint64 {
return (*ibc.idst + ibc.imm) & uint64(ibc.memMask)
}
func (vm *VM) Load64(addr uint64) uint64 {
return *(*uint64)(unsafe.Pointer(&vm.ScratchPad[addr]))
}
func (vm *VM) Load32(addr uint64) uint32 {
return *(*uint32)(unsafe.Pointer(&vm.ScratchPad[addr]))
}
func (vm *VM) Load32F(addr uint64) (lo, hi float64) {
a := *(*[2]int32)(unsafe.Pointer(&vm.ScratchPad[addr]))
func (pad *ScratchPad) Load32F(addr uint32) (lo, hi float64) {
a := *(*[2]int32)(unsafe.Pointer(&pad[addr]))
return float64(a[LOW]), float64(a[HIGH])
}
func (vm *VM) Load32FA(addr uint64) [2]float64 {
a := *(*[2]int32)(unsafe.Pointer(&vm.ScratchPad[addr]))
func (pad *ScratchPad) Load32FA(addr uint32) [2]float64 {
a := *(*[2]int32)(unsafe.Pointer(&pad[addr]))
return [2]float64{float64(a[LOW]), float64(a[HIGH])}
}
func (vm *VM) InterpretByteCode() {
for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
ibc := &vm.ByteCode[pc]
switch ibc.Opcode {
case VM_IADD_RS:
*ibc.idst += (*ibc.isrc << ibc.shift) + ibc.imm
case VM_IADD_M:
*ibc.idst += vm.Load64(ibc.getScratchpadAddress())
case VM_ISUB_R:
*ibc.idst -= *ibc.isrc
case VM_ISUB_M:
*ibc.idst -= vm.Load64(ibc.getScratchpadAddress())
case VM_IMUL_R:
// also handles imul_rcp
*ibc.idst *= *ibc.isrc
case VM_IMUL_M:
*ibc.idst *= vm.Load64(ibc.getScratchpadAddress())
case VM_IMULH_R:
*ibc.idst, _ = bits.Mul64(*ibc.idst, *ibc.isrc)
case VM_IMULH_M:
*ibc.idst, _ = bits.Mul64(*ibc.idst, vm.Load64(ibc.getScratchpadAddress()))
case VM_ISMULH_R:
*ibc.idst = smulh(int64(*ibc.idst), int64(*ibc.isrc))
case VM_ISMULH_M:
*ibc.idst = smulh(int64(*ibc.idst), int64(vm.Load64(ibc.getScratchpadAddress())))
case VM_INEG_R:
*ibc.idst = (^(*ibc.idst)) + 1 // 2's complement negative
case VM_IXOR_R:
*ibc.idst ^= *ibc.isrc
case VM_IXOR_M:
*ibc.idst ^= vm.Load64(ibc.getScratchpadAddress())
case VM_IROR_R:
*ibc.idst = bits.RotateLeft64(*ibc.idst, 0-int(*ibc.isrc&63))
case VM_IROL_R:
*ibc.idst = bits.RotateLeft64(*ibc.idst, int(*ibc.isrc&63))
case VM_ISWAP_R:
*ibc.idst, *ibc.isrc = *ibc.isrc, *ibc.idst
case VM_FSWAP_R:
ibc.fdst[HIGH], ibc.fdst[LOW] = ibc.fdst[LOW], ibc.fdst[HIGH]
case VM_FADD_R:
ibc.fdst[LOW] += ibc.fsrc[LOW]
ibc.fdst[HIGH] += ibc.fsrc[HIGH]
case VM_FADD_M:
lo, hi := vm.Load32F(ibc.getScratchpadAddress())
ibc.fdst[LOW] += lo
ibc.fdst[HIGH] += hi
case VM_FSUB_R:
ibc.fdst[LOW] -= ibc.fsrc[LOW]
ibc.fdst[HIGH] -= ibc.fsrc[HIGH]
case VM_FSUB_M:
lo, hi := vm.Load32F(ibc.getScratchpadAddress())
ibc.fdst[LOW] -= lo
ibc.fdst[HIGH] -= hi
case VM_FSCAL_R:
// no dependent on rounding modes
ibc.fdst[LOW] = math.Float64frombits(math.Float64bits(ibc.fdst[LOW]) ^ 0x80F0000000000000)
ibc.fdst[HIGH] = math.Float64frombits(math.Float64bits(ibc.fdst[HIGH]) ^ 0x80F0000000000000)
case VM_FMUL_R:
ibc.fdst[LOW] *= ibc.fsrc[LOW]
ibc.fdst[HIGH] *= ibc.fsrc[HIGH]
case VM_FDIV_M:
lo, hi := vm.Load32F(ibc.getScratchpadAddress())
ibc.fdst[LOW] /= MaskRegisterExponentMantissa(lo, vm.config.eMask[LOW])
ibc.fdst[HIGH] /= MaskRegisterExponentMantissa(hi, vm.config.eMask[HIGH])
case VM_FSQRT_R:
ibc.fdst[LOW] = math.Sqrt(ibc.fdst[LOW])
ibc.fdst[HIGH] = math.Sqrt(ibc.fdst[HIGH])
case VM_CBRANCH:
*ibc.isrc += ibc.imm
if (*ibc.isrc & uint64(ibc.memMask)) == 0 {
pc = int(ibc.target)
}
case VM_CFROUND:
tmp := (bits.RotateLeft64(*ibc.isrc, 0-int(ibc.imm))) % 4 // rotate right
asm.SetRoundingMode(asm.RoundingMode(tmp))
case VM_ISTORE:
binary.LittleEndian.PutUint64(vm.ScratchPad[(*ibc.idst+ibc.imm)&uint64(ibc.memMask):], *ibc.isrc)
case VM_NOP: // we do nothing
}
}
}