Compare commits

...

2 commits

Author       SHA1        Message                                                             Date
DataHoarder  432590f930  Move argon2 / float packages to their own folders, cleanup vm Run  2024-04-15 04:14:15 +02:00
DataHoarder  5b9b3c3565  Use direct register and scratchpad under bytecode execution        2024-04-15 02:22:04 +02:00
13 changed files with 599 additions and 537 deletions


@@ -1,58 +0,0 @@
package randomx
import "golang.org/x/crypto/blake2b"
import (
_ "golang.org/x/crypto/argon2"
_ "unsafe"
)
// see reference configuration.h
// Cache size in KiB. Must be a power of 2.
const RANDOMX_ARGON_MEMORY = 262144
// Number of Argon2d iterations for Cache initialization.
const RANDOMX_ARGON_ITERATIONS = 3
// Number of parallel lanes for Cache initialization.
const RANDOMX_ARGON_LANES = 1
// Argon2d salt
const RANDOMX_ARGON_SALT = "RandomX\x03"
const ArgonSaltSize uint32 = 8 //sizeof("" RANDOMX_ARGON_SALT) - 1
const ArgonBlockSize uint32 = 1024
type argonBlock [128]uint64
const syncPoints = 4
//go:linkname argon2_initHash golang.org/x/crypto/argon2.initHash
func argon2_initHash(password, salt, key, data []byte, time, memory, threads, keyLen uint32, mode int) [blake2b.Size + 8]byte
//go:linkname argon2_initBlocks golang.org/x/crypto/argon2.initBlocks
func argon2_initBlocks(h0 *[blake2b.Size + 8]byte, memory, threads uint32) []argonBlock
//go:linkname argon2_processBlocks golang.org/x/crypto/argon2.processBlocks
func argon2_processBlocks(B []argonBlock, time, memory, threads uint32, mode int)
// argon2_buildBlocks is taken from golang.org/x/crypto/argon2.deriveKey, without the final deriveKey call
func argon2_buildBlocks(password, salt, secret, data []byte, time, memory uint32, threads uint8, keyLen uint32) []argonBlock {
if time < 1 {
panic("argon2: number of rounds too small")
}
if threads < 1 {
panic("argon2: parallelism degree too low")
}
const mode = 0 /* argon2d */
h0 := argon2_initHash(password, salt, secret, data, time, memory, uint32(threads), keyLen, mode)
memory = memory / (syncPoints * uint32(threads)) * (syncPoints * uint32(threads))
if memory < 2*syncPoints*uint32(threads) {
memory = 2 * syncPoints * uint32(threads)
}
B := argon2_initBlocks(&h0, memory, uint32(threads))
argon2_processBlocks(B, time, memory, uint32(threads), mode)
return B
}

argon2/argon2.go (new file, 44 lines)

@@ -0,0 +1,44 @@
package argon2
import "golang.org/x/crypto/blake2b"
import (
_ "golang.org/x/crypto/argon2"
_ "unsafe"
)
const BlockSize uint32 = 1024
type Block [BlockSize / 8]uint64
const syncPoints = 4
//go:linkname initHash golang.org/x/crypto/argon2.initHash
func initHash(password, salt, key, data []byte, time, memory, threads, keyLen uint32, mode int) [blake2b.Size + 8]byte
//go:linkname initBlocks golang.org/x/crypto/argon2.initBlocks
func initBlocks(h0 *[blake2b.Size + 8]byte, memory, threads uint32) []Block
//go:linkname processBlocks golang.org/x/crypto/argon2.processBlocks
func processBlocks(B []Block, time, memory, threads uint32, mode int)
// BuildBlocks is taken from golang.org/x/crypto/argon2.deriveKey, without the final deriveKey call
func BuildBlocks(password, salt, secret, data []byte, time, memory uint32, threads uint8, keyLen uint32) []Block {
if time < 1 {
panic("argon2: number of rounds too small")
}
if threads < 1 {
panic("argon2: parallelism degree too low")
}
const mode = 0 /* argon2d */
h0 := initHash(password, salt, secret, data, time, memory, uint32(threads), keyLen, mode)
memory = memory / (syncPoints * uint32(threads)) * (syncPoints * uint32(threads))
if memory < 2*syncPoints*uint32(threads) {
memory = 2 * syncPoints * uint32(threads)
}
B := initBlocks(&h0, memory, uint32(threads))
processBlocks(B, time, memory, uint32(threads), mode)
return B
}
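BuildBlocks keeps Argon2d's block-filling phase but skips the final extraction step, which is exactly what RandomX cache initialization needs. A minimal usage sketch, assuming the module import path shown above and the reference parameters from config.go (the key value is hypothetical; the reference memory setting allocates 256 MiB):

package main

import "git.gammaspectra.live/P2Pool/go-randomx/v2/argon2"

func main() {
    // Reference configuration: 262144 KiB memory, 3 iterations, 1 lane,
    // salt "RandomX\x03". Yields 262144 blocks of 1 KiB each.
    key := []byte("test key 000") // hypothetical cache key
    blocks := argon2.BuildBlocks(key, []byte("RandomX\x03"), []byte{}, []byte{}, 3, 262144, 1, 0)
    _ = blocks
}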


@@ -1,14 +1,7 @@
package asm
type RoundingMode uint8
import "git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
const (
RoundingModeToNearest = RoundingMode(iota)
RoundingModeToNegative
RoundingModeToPositive
RoundingModeToZero
)
func SetRoundingMode(mode RoundingMode) {
func SetRoundingMode(mode softfloat.RoundingMode) {
setRoundingMode(uint8(mode))
}


@@ -1,190 +0,0 @@
package randomx
import (
"encoding/binary"
"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
"math"
"math/bits"
)
type ByteCodeInstruction struct {
dst, src byte
idst, isrc *uint64
fdst, fsrc *[2]float64
imm uint64
simm int64
Opcode ByteCodeInstructionOp
target int16
shift uint8
memMask uint32
/*
union {
int_reg_t* idst;
rx_vec_f128* fdst;
};
union {
int_reg_t* isrc;
rx_vec_f128* fsrc;
};
union {
uint64_t imm;
int64_t simm;
};
InstructionType type;
union {
int16_t target;
uint16_t shift;
};
uint32_t memMask;
*/
}
func (i ByteCodeInstruction) getScratchpadSrcAddress() uint64 {
return (*i.isrc + i.imm) & uint64(i.memMask)
}
func (i ByteCodeInstruction) getScratchpadZeroAddress() uint64 {
return i.imm & uint64(i.memMask)
}
func (i ByteCodeInstruction) getScratchpadDestAddress() uint64 {
return (*i.idst + i.imm) & uint64(i.memMask)
}
type ByteCode [RANDOMX_PROGRAM_SIZE]ByteCodeInstruction
func (c *ByteCode) Interpret(vm *VM) {
for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
ibc := c[pc]
switch ibc.Opcode {
case VM_IADD_RS:
*ibc.idst += (*ibc.isrc << ibc.shift) + ibc.imm
case VM_IADD_M:
*ibc.idst += vm.Load64(ibc.getScratchpadSrcAddress())
case VM_IADD_MZ:
*ibc.idst += vm.Load64(ibc.getScratchpadZeroAddress())
case VM_ISUB_R:
*ibc.idst -= *ibc.isrc
case VM_ISUB_M:
*ibc.idst -= vm.Load64(ibc.getScratchpadSrcAddress())
case VM_ISUB_MZ:
*ibc.idst -= vm.Load64(ibc.getScratchpadZeroAddress())
case VM_IMUL_R:
// also handles imul_rcp
*ibc.idst *= *ibc.isrc
case VM_IMUL_M:
*ibc.idst *= vm.Load64(ibc.getScratchpadSrcAddress())
case VM_IMUL_MZ:
*ibc.idst *= vm.Load64(ibc.getScratchpadZeroAddress())
case VM_IMULH_R:
*ibc.idst, _ = bits.Mul64(*ibc.idst, *ibc.isrc)
case VM_IMULH_M:
*ibc.idst, _ = bits.Mul64(*ibc.idst, vm.Load64(ibc.getScratchpadSrcAddress()))
case VM_IMULH_MZ:
*ibc.idst, _ = bits.Mul64(*ibc.idst, vm.Load64(ibc.getScratchpadZeroAddress()))
case VM_ISMULH_R:
*ibc.idst = smulh(int64(*ibc.idst), int64(*ibc.isrc))
case VM_ISMULH_M:
*ibc.idst = smulh(int64(*ibc.idst), int64(vm.Load64(ibc.getScratchpadSrcAddress())))
case VM_ISMULH_MZ:
*ibc.idst = smulh(int64(*ibc.idst), int64(vm.Load64(ibc.getScratchpadZeroAddress())))
case VM_INEG_R:
*ibc.idst = (^(*ibc.idst)) + 1 // 2's complement negative
case VM_IXOR_R:
*ibc.idst ^= *ibc.isrc
case VM_IXOR_M:
*ibc.idst ^= vm.Load64(ibc.getScratchpadSrcAddress())
case VM_IXOR_MZ:
*ibc.idst ^= vm.Load64(ibc.getScratchpadZeroAddress())
case VM_IROR_R:
*ibc.idst = bits.RotateLeft64(*ibc.idst, 0-int(*ibc.isrc&63))
case VM_IROL_R:
*ibc.idst = bits.RotateLeft64(*ibc.idst, int(*ibc.isrc&63))
case VM_ISWAP_R:
*ibc.idst, *ibc.isrc = *ibc.isrc, *ibc.idst
case VM_FSWAP_R:
ibc.fdst[HIGH], ibc.fdst[LOW] = ibc.fdst[LOW], ibc.fdst[HIGH]
case VM_FADD_R:
ibc.fdst[LOW] += ibc.fsrc[LOW]
ibc.fdst[HIGH] += ibc.fsrc[HIGH]
case VM_FADD_M:
lo, hi := vm.Load32F(ibc.getScratchpadSrcAddress())
ibc.fdst[LOW] += lo
ibc.fdst[HIGH] += hi
case VM_FSUB_R:
ibc.fdst[LOW] -= ibc.fsrc[LOW]
ibc.fdst[HIGH] -= ibc.fsrc[HIGH]
case VM_FSUB_M:
lo, hi := vm.Load32F(ibc.getScratchpadSrcAddress())
ibc.fdst[LOW] -= lo
ibc.fdst[HIGH] -= hi
case VM_FSCAL_R:
// not dependent on rounding mode
ibc.fdst[LOW] = math.Float64frombits(math.Float64bits(ibc.fdst[LOW]) ^ 0x80F0000000000000)
ibc.fdst[HIGH] = math.Float64frombits(math.Float64bits(ibc.fdst[HIGH]) ^ 0x80F0000000000000)
case VM_FMUL_R:
ibc.fdst[LOW] *= ibc.fsrc[LOW]
ibc.fdst[HIGH] *= ibc.fsrc[HIGH]
case VM_FDIV_M:
lo, hi := vm.Load32F(ibc.getScratchpadSrcAddress())
ibc.fdst[LOW] /= MaskRegisterExponentMantissa(lo, vm.config.eMask[LOW])
ibc.fdst[HIGH] /= MaskRegisterExponentMantissa(hi, vm.config.eMask[HIGH])
case VM_FSQRT_R:
ibc.fdst[LOW] = math.Sqrt(ibc.fdst[LOW])
ibc.fdst[HIGH] = math.Sqrt(ibc.fdst[HIGH])
case VM_CBRANCH:
*ibc.isrc += ibc.imm
if (*ibc.isrc & uint64(ibc.memMask)) == 0 {
pc = int(ibc.target)
}
case VM_CFROUND:
tmp := (bits.RotateLeft64(*ibc.isrc, 0-int(ibc.imm))) % 4 // rotate right
asm.SetRoundingMode(asm.RoundingMode(tmp))
case VM_ISTORE:
binary.LittleEndian.PutUint64(vm.ScratchPad[(*ibc.idst+ibc.imm)&uint64(ibc.memMask):], *ibc.isrc)
case VM_NOP: // we do nothing
}
}
}
type ByteCodeInstructionOp int
const (
VM_NOP = ByteCodeInstructionOp(iota)
VM_IADD_RS
VM_IADD_M
VM_IADD_MZ
VM_ISUB_R
VM_ISUB_M
VM_ISUB_MZ
VM_IMUL_R
VM_IMUL_M
VM_IMUL_MZ
VM_IMULH_R
VM_IMULH_M
VM_IMULH_MZ
VM_ISMULH_R
VM_ISMULH_M
VM_ISMULH_MZ
VM_IMUL_RCP
VM_INEG_R
VM_IXOR_R
VM_IXOR_M
VM_IXOR_MZ
VM_IROR_R
VM_IROL_R
VM_ISWAP_R
VM_FSWAP_R
VM_FADD_R
VM_FADD_M
VM_FSUB_R
VM_FSUB_M
VM_FSCAL_R
VM_FMUL_R
VM_FDIV_M
VM_FSQRT_R
VM_CBRANCH
VM_CFROUND
VM_ISTORE
)


@@ -1,6 +1,7 @@
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/argon2"
"git.gammaspectra.live/P2Pool/go-randomx/v2/keys"
"runtime"
"slices"
@@ -66,9 +67,9 @@ func (cache *Randomx_Cache) Init(key []byte) {
kkey := slices.Clone(key)
argonBlocks := argon2_buildBlocks(kkey, []byte(RANDOMX_ARGON_SALT), []byte{}, []byte{}, RANDOMX_ARGON_ITERATIONS, RANDOMX_ARGON_MEMORY, RANDOMX_ARGON_LANES, 0)
argonBlocks := argon2.BuildBlocks(kkey, []byte(RANDOMX_ARGON_SALT), []byte{}, []byte{}, RANDOMX_ARGON_ITERATIONS, RANDOMX_ARGON_MEMORY, RANDOMX_ARGON_LANES, 0)
memoryBlocks := unsafe.Slice((*MemoryBlock)(unsafe.Pointer(unsafe.SliceData(argonBlocks))), int(unsafe.Sizeof(argonBlock{}))/int(unsafe.Sizeof(MemoryBlock{}))*len(argonBlocks))
memoryBlocks := unsafe.Slice((*MemoryBlock)(unsafe.Pointer(unsafe.SliceData(argonBlocks))), int(unsafe.Sizeof(argon2.Block{}))/int(unsafe.Sizeof(MemoryBlock{}))*len(argonBlocks))
cache.Blocks = memoryBlocks
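The unsafe.Slice call above reinterprets the 1 KiB Argon2 blocks as cache-line-sized MemoryBlock values without copying. A standalone sketch of the same pattern, assuming MemoryBlock is a 64-byte [8]uint64 (so each Block maps to exactly 16 of them):

package main

import (
    "fmt"
    "unsafe"
)

type Block [128]uint64     // 1 KiB, as in package argon2
type MemoryBlock [8]uint64 // assumed cache-line-sized layout

func main() {
    blocks := make([]Block, 4)
    ratio := int(unsafe.Sizeof(Block{})) / int(unsafe.Sizeof(MemoryBlock{})) // 16
    mem := unsafe.Slice((*MemoryBlock)(unsafe.Pointer(unsafe.SliceData(blocks))), ratio*len(blocks))
    fmt.Println(len(mem)) // 64, same backing memory as blocks
}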


@@ -29,6 +29,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package randomx
import "git.gammaspectra.live/P2Pool/go-randomx/v2/argon2"
// see reference configuration.h
// Cache size in KiB. Must be a power of 2.
const RANDOMX_ARGON_MEMORY = 262144
// Number of Argon2d iterations for Cache initialization.
const RANDOMX_ARGON_ITERATIONS = 3
// Number of parallel lanes for Cache initialization.
const RANDOMX_ARGON_LANES = 1
// Argon2d salt
const RANDOMX_ARGON_SALT = "RandomX\x03"
const ArgonSaltSize uint32 = 8 //sizeof("" RANDOMX_ARGON_SALT) - 1
// Number of random Cache accesses per Dataset item. Minimum is 2.
const RANDOMX_CACHE_ACCESSES = 8
@@ -74,7 +90,7 @@ const ScratchpadSize uint32 = RANDOMX_SCRATCHPAD_L3
const CacheLineAlignMask = (RANDOMX_DATASET_BASE_SIZE - 1) & (^(CacheLineSize - 1))
const CacheSize uint64 = RANDOMX_ARGON_MEMORY * uint64(ArgonBlockSize)
const CacheSize uint64 = RANDOMX_ARGON_MEMORY * uint64(argon2.BlockSize)
const ScratchpadL1 = RANDOMX_SCRATCHPAD_L1 / 8
const ScratchpadL2 = RANDOMX_SCRATCHPAD_L2 / 8
@@ -87,19 +103,9 @@ const ScratchpadL3Mask = (ScratchpadL3 - 1) * 8
const ScratchpadL3Mask64 = (ScratchpadL3/8 - 1) * 64
const CONDITIONOFFSET = RANDOMX_JUMP_OFFSET
const CONDITIONMASK = ((1 << RANDOMX_JUMP_BITS) - 1)
const CONDITIONMASK = (1 << RANDOMX_JUMP_BITS) - 1
const STOREL3CONDITION = 14
const mantissaSize = 52
const exponentSize = 11
const mantissaMask = (uint64(1) << mantissaSize) - 1
const exponentMask = (uint64(1) << exponentSize) - 1
const exponentBias = 1023
const dynamicExponentBits = 4
const staticExponentBits = 4
const constExponentBits uint64 = 0x300
const dynamicMantissaMask = (uint64(1) << (mantissaSize + dynamicExponentBits)) - 1
const RANDOMX_FLAG_DEFAULT = uint64(0)
const RANDOMX_FLAG_JIT = uint64(1 << iota)


@@ -1,15 +1,33 @@
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
"git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
)
const RegistersCount = 8
const RegistersCountFloat = 4
const LOW = 0
const HIGH = 1
type RegisterLine [RegistersCount]uint64
type RegisterFile struct {
r RegisterLine
f [RegistersCountFloat][2]float64
e [RegistersCountFloat][2]float64
a [RegistersCountFloat][2]float64
R RegisterLine
F [RegistersCountFloat][2]float64
E [RegistersCountFloat][2]float64
A [RegistersCountFloat][2]float64
FPRC softfloat.RoundingMode
}
func (f *RegisterFile) SetRoundingMode(mode softfloat.RoundingMode) {
if f.FPRC == mode {
return
}
f.FPRC = mode
asm.SetRoundingMode(mode)
}
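FPRC caches the last rounding mode set, so repeated calls with an unchanged mode skip the comparatively expensive hardware update. An illustrative sketch (exampleRounding is a hypothetical helper, written as if inside package randomx):

func exampleRounding(rf *RegisterFile) {
    rf.SetRoundingMode(softfloat.RoundingModeToZero)    // differs from FPRC, hits asm.SetRoundingMode
    rf.SetRoundingMode(softfloat.RoundingModeToZero)    // matches FPRC, returns early
    rf.SetRoundingMode(softfloat.RoundingModeToNearest) // differs again, hits asm.SetRoundingMode
}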
type MemoryRegisters struct {

softfloat/const.go (new file, 37 lines)

@@ -0,0 +1,37 @@
package softfloat
const (
mantbits64 uint = 52
expbits64 uint = 11
bias64 = -1<<(expbits64-1) + 1
nan64 uint64 = (1<<expbits64-1)<<mantbits64 + 1<<(mantbits64-1) // quiet NaN, 0 payload
inf64 uint64 = (1<<expbits64 - 1) << mantbits64
neg64 uint64 = 1 << (expbits64 + mantbits64)
)
const mantissaMask = (uint64(1) << mantbits64) - 1
const exponentMask = (uint64(1) << expbits64) - 1
const exponentBias = 1023
const dynamicExponentBits = 4
const staticExponentBits = 4
const constExponentBits uint64 = 0x300
const dynamicMantissaMask = (uint64(1) << (mantbits64 + dynamicExponentBits)) - 1
const mask22bit = (uint64(1) << 22) - 1
type RoundingMode uint8
const (
// RoundingModeToNearest IEEE 754 roundTiesToEven
RoundingModeToNearest = RoundingMode(iota)
// RoundingModeToNegative IEEE 754 roundTowardNegative
RoundingModeToNegative
// RoundingModeToPositive IEEE 754 roundTowardPositive
RoundingModeToPositive
// RoundingModeToZero IEEE 754 roundTowardZero
RoundingModeToZero
)
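For reference, the bit-level constants above work out to the familiar IEEE-754 double patterns; a quick standalone check, assuming only the standard library:

package main

import (
    "fmt"
    "math"
)

func main() {
    const inf64 = uint64(0x7FF0000000000000) // (1<<expbits64 - 1) << mantbits64
    const nan64 = uint64(0x7FF8000000000000) // inf64 plus the quiet bit 1<<(mantbits64-1)
    const neg64 = uint64(0x8000000000000000) // sign bit, 1 << (expbits64 + mantbits64)
    fmt.Println(math.IsInf(math.Float64frombits(inf64), 1)) // true
    fmt.Println(math.IsNaN(math.Float64frombits(nan64)))    // true
    fmt.Println(math.Signbit(math.Float64frombits(neg64)))  // true
}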

softfloat/funcs.go (new file, 35 lines)

@@ -0,0 +1,35 @@
package softfloat
import "math"
func MaskRegisterExponentMantissa(f float64, mode uint64) float64 {
return math.Float64frombits((math.Float64bits(f) & dynamicMantissaMask) | mode)
}
func ScaleNegate(f float64) float64 {
return math.Float64frombits(math.Float64bits(f) ^ 0x80F0000000000000)
}
func SmallPositiveFloatBits(entropy uint64) float64 {
exponent := entropy >> 59 //0..31
mantissa := entropy & mantissaMask
exponent += exponentBias
exponent &= exponentMask
exponent = exponent << mantbits64
return math.Float64frombits(exponent | mantissa)
}
func StaticExponent(entropy uint64) uint64 {
exponent := constExponentBits
exponent |= (entropy >> (64 - staticExponentBits)) << dynamicExponentBits
exponent <<= mantbits64
return exponent
}
func EMask(entropy uint64) uint64 {
return (entropy & mask22bit) | StaticExponent(entropy)
}
func Xor(a, b float64) float64 {
return math.Float64frombits(math.Float64bits(a) ^ math.Float64bits(b))
}
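SmallPositiveFloatBits takes the top 5 entropy bits as an unbiased exponent 0..31 and the low 52 bits as the mantissa, so the result always falls in [1, 2^32). A worked standalone example:

package main

import (
    "fmt"
    "math"
)

func main() {
    entropy := uint64(0xF800000000000000)              // top 5 bits = 31, mantissa bits = 0
    exponent := (entropy>>59 + 1023) & ((1 << 11) - 1) // biased exponent 1054
    bits := exponent<<52 | (entropy & ((1 << 52) - 1))
    fmt.Println(math.Float64frombits(bits)) // 2.147483648e+09, i.e. 2^31
}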

softfloat/softfloat.go (new file, 27 lines)

@@ -0,0 +1,27 @@
package softfloat
import (
_ "runtime"
_ "unsafe"
)
//go:linkname funpack64 runtime.funpack64
func funpack64(f uint64) (sign, mant uint64, exp int, inf, nan bool)
//go:linkname fpack64 runtime.fpack64
func fpack64(sign, mant uint64, exp int, trunc uint64) uint64
//go:linkname fadd64 runtime.fadd64
func fadd64(f, g uint64) uint64
//go:linkname fsub64 runtime.fsub64
func fsub64(f, g uint64) uint64
//go:linkname fneg64 runtime.fneg64
func fneg64(f uint64) uint64
//go:linkname fmul64 runtime.fmul64
func fmul64(f, g uint64) uint64
//go:linkname fdiv64 runtime.fdiv64
func fdiv64(f, g uint64) uint64
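These declarations borrow the Go runtime's software floating-point routines via //go:linkname: the blank unsafe import licenses the directive, and bodyless declarations like these typically also need an empty .s file in the package to satisfy the compiler. A hedged sketch of wrapping one of them (softAdd is an illustrative name, not part of this package; assumes math is imported):

// Inside package softfloat: fadd64 operates on raw IEEE-754 bit patterns,
// so float64 values are converted at the boundary.
func softAdd(a, b float64) float64 {
    return math.Float64frombits(fadd64(math.Float64bits(a), math.Float64bits(b)))
}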

vm.go (188 lines changed)

@@ -31,9 +31,10 @@ package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/aes"
"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
"git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
"math"
"runtime"
"unsafe"
)
import "encoding/binary"
import "golang.org/x/crypto/blake2b"
@@ -45,17 +46,10 @@ type REG struct {
type VM struct {
StateStart [64]byte
buffer [RANDOMX_PROGRAM_SIZE*8 + 16*8]byte // first 128 bytes are entropy, the rest are program bytes
Prog []byte
ScratchPad [ScratchpadSize]byte
ScratchPad ScratchPad
ByteCode ByteCode
// program configuration see program.hpp
entropy [16]uint64
reg RegisterFile // the register file
mem MemoryRegisters
config Config // configuration
datasetOffset uint64
@@ -66,51 +60,47 @@
}
func MaskRegisterExponentMantissa(f float64, mode uint64) float64 {
return math.Float64frombits((math.Float64bits(f) & dynamicMantissaMask) | mode)
}
type Config struct {
eMask [2]uint64
readReg [4]uint64
}
const LOW = 0
const HIGH = 1
// Run calculates a hash based on the input
// Warning: Underlying calls will invoke asm.SetRoundingMode directly
// It is the caller's responsibility to set and restore the mode to softfloat.RoundingModeToNearest between full executions
// Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread are recommended to prevent other goroutines from sharing these changes
func (vm *VM) Run(inputHash [64]byte, roundingMode softfloat.RoundingMode) (reg RegisterFile) {
// calculate hash based on input
func (vm *VM) Run(input_hash [64]byte) {
reg.FPRC = roundingMode
aes.FillAes4Rx4(input_hash, vm.buffer[:])
// buffer: first 128 bytes are entropy, the rest are program bytes
var buffer [16*8 + RANDOMX_PROGRAM_SIZE*8]byte
aes.FillAes4Rx4(inputHash, buffer[:])
for i := range vm.entropy {
vm.entropy[i] = binary.LittleEndian.Uint64(vm.buffer[i*8:])
}
entropy := (*[16]uint64)(unsafe.Pointer(&buffer))
vm.Prog = vm.buffer[len(vm.entropy)*8:]
clear(vm.reg.r[:])
prog := buffer[len(entropy)*8:]
// do more initialization before we run
for i := range vm.entropy[:8] {
vm.reg.a[i/2][i%2] = math.Float64frombits(getSmallPositiveFloatBits(vm.entropy[i]))
for i := range entropy[:8] {
reg.A[i/2][i%2] = softfloat.SmallPositiveFloatBits(entropy[i])
}
vm.mem.ma = vm.entropy[8] & CacheLineAlignMask
vm.mem.mx = vm.entropy[10]
vm.mem.ma = entropy[8] & CacheLineAlignMask
vm.mem.mx = entropy[10]
addressRegisters := vm.entropy[12]
addressRegisters := entropy[12]
for i := range vm.config.readReg {
vm.config.readReg[i] = uint64(i*2) + (addressRegisters & 1)
addressRegisters >>= 1
}
vm.datasetOffset = (vm.entropy[13] % (DATASETEXTRAITEMS + 1)) * CacheLineSize
vm.config.eMask[LOW] = getFloatMask(vm.entropy[14])
vm.config.eMask[HIGH] = getFloatMask(vm.entropy[15])
vm.datasetOffset = (entropy[13] % (DATASETEXTRAITEMS + 1)) * CacheLineSize
vm.config.eMask[LOW] = softfloat.EMask(entropy[14])
vm.config.eMask[HIGH] = softfloat.EMask(entropy[15])
vm.CompileToBytecode()
vm.ByteCode = CompileProgramToByteCode(prog)
spAddr0 := vm.mem.mx
spAddr1 := vm.mem.ma
@@ -118,51 +108,52 @@ func (vm *VM) Run(input_hash [64]byte) {
var rlCache RegisterLine
for ic := 0; ic < RANDOMX_PROGRAM_ITERATIONS; ic++ {
spMix := vm.reg.r[vm.config.readReg[0]] ^ vm.reg.r[vm.config.readReg[1]]
spMix := reg.R[vm.config.readReg[0]] ^ reg.R[vm.config.readReg[1]]
spAddr0 ^= spMix
spAddr0 &= ScratchpadL3Mask64
spAddr1 ^= spMix >> 32
spAddr1 &= ScratchpadL3Mask64
//TODO: optimize these loads!
for i := uint64(0); i < RegistersCount; i++ {
vm.reg.r[i] ^= vm.Load64(spAddr0 + 8*i)
reg.R[i] ^= vm.ScratchPad.Load64(uint32(spAddr0 + 8*i))
}
for i := uint64(0); i < RegistersCountFloat; i++ {
vm.reg.f[i] = vm.Load32FA(spAddr1 + 8*i)
reg.F[i] = vm.ScratchPad.Load32FA(uint32(spAddr1 + 8*i))
}
for i := uint64(0); i < RegistersCountFloat; i++ {
vm.reg.e[i] = vm.Load32FA(spAddr1 + 8*(i+RegistersCountFloat))
reg.E[i] = vm.ScratchPad.Load32FA(uint32(spAddr1 + 8*(i+RegistersCountFloat)))
vm.reg.e[i][LOW] = MaskRegisterExponentMantissa(vm.reg.e[i][LOW], vm.config.eMask[LOW])
vm.reg.e[i][HIGH] = MaskRegisterExponentMantissa(vm.reg.e[i][HIGH], vm.config.eMask[HIGH])
reg.E[i][LOW] = softfloat.MaskRegisterExponentMantissa(reg.E[i][LOW], vm.config.eMask[LOW])
reg.E[i][HIGH] = softfloat.MaskRegisterExponentMantissa(reg.E[i][HIGH], vm.config.eMask[HIGH])
}
// todo: pass register file directly!
vm.ByteCode.Interpret(vm)
// Run the actual bytecode
vm.ByteCode.Execute(&reg, &vm.ScratchPad, vm.config.eMask)
vm.mem.mx ^= vm.reg.r[vm.config.readReg[2]] ^ vm.reg.r[vm.config.readReg[3]]
vm.mem.mx ^= reg.R[vm.config.readReg[2]] ^ reg.R[vm.config.readReg[3]]
vm.mem.mx &= CacheLineAlignMask
vm.Dataset.PrefetchDataset(vm.datasetOffset + vm.mem.mx)
// execute diffuser superscalar program to get dataset 64 bytes
vm.Dataset.ReadDataset(vm.datasetOffset+vm.mem.ma, &vm.reg.r, &rlCache)
vm.Dataset.ReadDataset(vm.datasetOffset+vm.mem.ma, &reg.R, &rlCache)
// swap the elements
vm.mem.mx, vm.mem.ma = vm.mem.ma, vm.mem.mx
for i := uint64(0); i < RegistersCount; i++ {
binary.LittleEndian.PutUint64(vm.ScratchPad[spAddr1+8*i:], vm.reg.r[i])
vm.ScratchPad.Store64(uint32(spAddr1+8*i), reg.R[i])
}
for i := uint64(0); i < RegistersCountFloat; i++ {
vm.reg.f[i][LOW] = math.Float64frombits(math.Float64bits(vm.reg.f[i][LOW]) ^ math.Float64bits(vm.reg.e[i][LOW]))
vm.reg.f[i][HIGH] = math.Float64frombits(math.Float64bits(vm.reg.f[i][HIGH]) ^ math.Float64bits(vm.reg.e[i][HIGH]))
reg.F[i][LOW] = softfloat.Xor(reg.F[i][LOW], reg.E[i][LOW])
reg.F[i][HIGH] = softfloat.Xor(reg.F[i][HIGH], reg.E[i][HIGH])
binary.LittleEndian.PutUint64(vm.ScratchPad[spAddr0+16*i:], math.Float64bits(vm.reg.f[i][LOW]))
binary.LittleEndian.PutUint64(vm.ScratchPad[spAddr0+16*i+8:], math.Float64bits(vm.reg.f[i][HIGH]))
vm.ScratchPad.Store64(uint32(spAddr0+16*i), math.Float64bits(reg.F[i][LOW]))
vm.ScratchPad.Store64(uint32(spAddr0+16*i+8), math.Float64bits(reg.F[i][HIGH]))
}
spAddr0 = 0
@@ -170,58 +161,52 @@ func (vm *VM) Run(input_hash [64]byte) {
}
return reg
}
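The doc comment above shifts the rounding-mode bookkeeping to the caller; a minimal sketch of the expected calling pattern (runOnce is a hypothetical helper inside package randomx, mirroring the locking and restore code removed from CalculateHash below):

func runOnce(vm *VM, seed [64]byte) RegisterFile {
    runtime.LockOSThread() // rounding mode is per-thread state
    defer runtime.UnlockOSThread()
    // restore the rounding mode Go itself expects before returning
    defer asm.SetRoundingMode(softfloat.RoundingModeToNearest)
    return vm.Run(seed, softfloat.RoundingModeToNearest)
}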
func (vm *VM) InitScratchpad(seed *[64]byte) {
// calculate and fill scratchpad
clear(vm.ScratchPad[:])
aes.FillAes1Rx4(seed, vm.ScratchPad[:])
vm.ScratchPad.Init(seed)
}
func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
func (vm *VM) RunLoops(tempHash [64]byte) RegisterFile {
var buf [8]byte
hash512, _ := blake2b.New512(nil)
// Lock thread due to rounding mode flags
runtime.LockOSThread()
defer runtime.UnlockOSThread()
//restore rounding mode to golang expected one
defer asm.SetRoundingMode(asm.RoundingModeToNearest)
// reset rounding mode if new hash being calculated
asm.SetRoundingMode(asm.RoundingModeToNearest)
tempHash := blake2b.Sum512(input)
vm.InitScratchpad(&tempHash)
hash512, _ := blake2b.New512(nil)
roundingMode := softfloat.RoundingModeToNearest
for chain := 0; chain < RANDOMX_PROGRAM_COUNT-1; chain++ {
vm.Run(tempHash)
reg := vm.Run(tempHash, roundingMode)
roundingMode = reg.FPRC
hash512.Reset()
for i := range vm.reg.r {
binary.LittleEndian.PutUint64(buf[:], vm.reg.r[i])
for i := range reg.R {
binary.LittleEndian.PutUint64(buf[:], reg.R[i])
hash512.Write(buf[:])
}
for i := range vm.reg.f {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][LOW]))
for i := range reg.F {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.F[i][LOW]))
hash512.Write(buf[:])
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][HIGH]))
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.F[i][HIGH]))
hash512.Write(buf[:])
}
for i := range vm.reg.e {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][LOW]))
for i := range reg.E {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.E[i][LOW]))
hash512.Write(buf[:])
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][HIGH]))
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.E[i][HIGH]))
hash512.Write(buf[:])
}
for i := range vm.reg.a {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.a[i][LOW]))
for i := range reg.A {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.A[i][LOW]))
hash512.Write(buf[:])
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.a[i][HIGH]))
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.A[i][HIGH]))
hash512.Write(buf[:])
}
@@ -229,7 +214,22 @@ func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
}
// final loop executes here
vm.Run(tempHash)
reg := vm.Run(tempHash, roundingMode)
roundingMode = reg.FPRC
reg.SetRoundingMode(softfloat.RoundingModeToNearest)
return reg
}
func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
var buf [8]byte
tempHash := blake2b.Sum512(input)
vm.InitScratchpad(&tempHash)
reg := vm.RunLoops(tempHash)
// now hash the scratch pad and place into register a
aes.HashAes1Rx4(vm.ScratchPad[:], &tempHash)
@@ -238,22 +238,22 @@ func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
hash256.Reset()
for i := range vm.reg.r {
binary.LittleEndian.PutUint64(buf[:], vm.reg.r[i])
for i := range reg.R {
binary.LittleEndian.PutUint64(buf[:], reg.R[i])
hash256.Write(buf[:])
}
for i := range vm.reg.f {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][LOW]))
for i := range reg.F {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.F[i][LOW]))
hash256.Write(buf[:])
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][HIGH]))
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.F[i][HIGH]))
hash256.Write(buf[:])
}
for i := range vm.reg.e {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][LOW]))
for i := range reg.E {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.E[i][LOW]))
hash256.Write(buf[:])
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][HIGH]))
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.E[i][HIGH]))
hash256.Write(buf[:])
}
@@ -262,25 +262,3 @@ func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
hash256.Sum(output[:0])
}
const mask22bit = (uint64(1) << 22) - 1
func getSmallPositiveFloatBits(entropy uint64) uint64 {
exponent := entropy >> 59 //0..31
mantissa := entropy & mantissaMask
exponent += exponentBias
exponent &= exponentMask
exponent = exponent << mantissaSize
return exponent | mantissa
}
func getStaticExponent(entropy uint64) uint64 {
exponent := constExponentBits
exponent |= (entropy >> (64 - staticExponentBits)) << dynamicExponentBits
exponent <<= mantissaSize
return exponent
}
func getFloatMask(entropy uint64) uint64 {
return (entropy & mask22bit) | getStaticExponent(entropy)
}

vm_bytecode.go (new file, 207 lines)

@@ -0,0 +1,207 @@
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
"math"
"math/bits"
)
type ByteCodeInstruction struct {
Dst, Src byte
ImmB uint8
Opcode ByteCodeInstructionOp
MemMask uint32
Imm uint64
/*
union {
int_reg_t* idst;
rx_vec_f128* fdst;
};
union {
int_reg_t* isrc;
rx_vec_f128* fsrc;
};
union {
uint64_t imm;
int64_t simm;
};
InstructionType type;
union {
int16_t target;
uint16_t shift;
};
uint32_t memMask;
*/
}
func (i ByteCodeInstruction) jumpTarget() int {
return int(int16((uint16(i.ImmB) << 8) | uint16(i.Dst)))
}
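The branch target is a signed 16-bit program counter packed into the Dst and ImmB bytes by the compiler (see the VM_CBRANCH case further down). A standalone round-trip check, assuming the -1 initialization that the reference compiler uses for registerUsage:

package main

import "fmt"

func main() {
    target := int16(-1) // "jump to program start": pc = -1, then pc++ resumes at 0
    packed := uint16(target)
    dst, immB := uint8(packed), uint8(packed>>8)
    decoded := int(int16(uint16(immB)<<8 | uint16(dst))) // mirrors jumpTarget above
    fmt.Println(decoded)                                 // -1
}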
func (i ByteCodeInstruction) getScratchpadAddress(ptr uint64) uint32 {
return uint32(ptr+i.Imm) & i.MemMask
}
func (i ByteCodeInstruction) getScratchpadZeroAddress() uint32 {
return uint32(i.Imm) & i.MemMask
}
type ByteCode [RANDOMX_PROGRAM_SIZE]ByteCodeInstruction
// Execute runs a RandomX program with the given register file and scratchpad
// Warning: This will call asm.SetRoundingMode directly
// It is the caller's responsibility to set and restore the mode to softfloat.RoundingModeToNearest between full executions
// Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread are recommended to prevent other goroutines from sharing these changes
func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
i := &c[pc]
switch i.Opcode {
case VM_NOP: // we do nothing
case VM_IADD_RS:
f.R[i.Dst] += (f.R[i.Src] << i.ImmB) + i.Imm
case VM_IADD_M:
f.R[i.Dst] += pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
case VM_IADD_MZ:
f.R[i.Dst] += pad.Load64(uint32(i.Imm))
case VM_ISUB_R:
f.R[i.Dst] -= f.R[i.Src]
case VM_ISUB_I:
f.R[i.Dst] -= i.Imm
case VM_ISUB_M:
f.R[i.Dst] -= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
case VM_ISUB_MZ:
f.R[i.Dst] -= pad.Load64(uint32(i.Imm))
case VM_IMUL_R:
f.R[i.Dst] *= f.R[i.Src]
case VM_IMUL_I:
// also handles imul_rcp
f.R[i.Dst] *= i.Imm
case VM_IMUL_M:
f.R[i.Dst] *= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
case VM_IMUL_MZ:
f.R[i.Dst] *= pad.Load64(uint32(i.Imm))
case VM_IMULH_R:
f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], f.R[i.Src])
case VM_IMULH_M:
f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(i.getScratchpadAddress(f.R[i.Src])))
case VM_IMULH_MZ:
f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(uint32(i.Imm)))
case VM_ISMULH_R:
f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(f.R[i.Src]))
case VM_ISMULH_M:
f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(i.getScratchpadAddress(f.R[i.Src]))))
case VM_ISMULH_MZ:
f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(uint32(i.Imm))))
case VM_INEG_R:
f.R[i.Dst] = -f.R[i.Dst]
case VM_IXOR_R:
f.R[i.Dst] ^= f.R[i.Src]
case VM_IXOR_I:
f.R[i.Dst] ^= i.Imm
case VM_IXOR_M:
f.R[i.Dst] ^= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
case VM_IXOR_MZ:
f.R[i.Dst] ^= pad.Load64(uint32(i.Imm))
case VM_IROR_R:
f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(f.R[i.Src]&63))
case VM_IROR_I:
//todo: can merge into VM_IROL_I
f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(i.Imm&63))
case VM_IROL_R:
f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(f.R[i.Src]&63))
case VM_IROL_I:
f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(i.Imm&63))
case VM_ISWAP_R:
f.R[i.Dst], f.R[i.Src] = f.R[i.Src], f.R[i.Dst]
case VM_FSWAP_RF:
f.F[i.Dst][HIGH], f.F[i.Dst][LOW] = f.F[i.Dst][LOW], f.F[i.Dst][HIGH]
case VM_FSWAP_RE:
f.E[i.Dst][HIGH], f.E[i.Dst][LOW] = f.E[i.Dst][LOW], f.E[i.Dst][HIGH]
case VM_FADD_R:
f.F[i.Dst][LOW] += f.A[i.Src][LOW]
f.F[i.Dst][HIGH] += f.A[i.Src][HIGH]
case VM_FADD_M:
lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
f.F[i.Dst][LOW] += lo
f.F[i.Dst][HIGH] += hi
case VM_FSUB_R:
f.F[i.Dst][LOW] -= f.A[i.Src][LOW]
f.F[i.Dst][HIGH] -= f.A[i.Src][HIGH]
case VM_FSUB_M:
lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
f.F[i.Dst][LOW] -= lo
f.F[i.Dst][HIGH] -= hi
case VM_FSCAL_R:
// not dependent on rounding mode
f.F[i.Dst][LOW] = softfloat.ScaleNegate(f.F[i.Dst][LOW])
f.F[i.Dst][HIGH] = softfloat.ScaleNegate(f.F[i.Dst][HIGH])
case VM_FMUL_R:
f.E[i.Dst][LOW] *= f.A[i.Src][LOW]
f.E[i.Dst][HIGH] *= f.A[i.Src][HIGH]
case VM_FDIV_M:
lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
f.E[i.Dst][LOW] /= softfloat.MaskRegisterExponentMantissa(lo, eMask[LOW])
f.E[i.Dst][HIGH] /= softfloat.MaskRegisterExponentMantissa(hi, eMask[HIGH])
case VM_FSQRT_R:
f.E[i.Dst][LOW] = math.Sqrt(f.E[i.Dst][LOW])
f.E[i.Dst][HIGH] = math.Sqrt(f.E[i.Dst][HIGH])
case VM_CBRANCH:
f.R[i.Src] += i.Imm
if (f.R[i.Src] & uint64(i.MemMask)) == 0 {
pc = i.jumpTarget()
}
case VM_CFROUND:
tmp := (bits.RotateLeft64(f.R[i.Src], 0-int(i.Imm))) % 4 // rotate right
f.SetRoundingMode(softfloat.RoundingMode(tmp))
case VM_ISTORE:
pad.Store64(i.getScratchpadAddress(f.R[i.Dst]), f.R[i.Src])
}
}
}
type ByteCodeInstructionOp int
const (
VM_NOP = ByteCodeInstructionOp(iota)
VM_IADD_RS
VM_IADD_M
VM_IADD_MZ
VM_ISUB_R
VM_ISUB_I
VM_ISUB_M
VM_ISUB_MZ
VM_IMUL_R
VM_IMUL_I
VM_IMUL_M
VM_IMUL_MZ
VM_IMULH_R
VM_IMULH_M
VM_IMULH_MZ
VM_ISMULH_R
VM_ISMULH_M
VM_ISMULH_MZ
VM_INEG_R
VM_IXOR_R
VM_IXOR_I
VM_IXOR_M
VM_IXOR_MZ
VM_IROR_R
VM_IROR_I
VM_IROL_R
VM_IROL_I
VM_ISWAP_R
VM_FSWAP_RF
VM_FSWAP_RE
VM_FADD_R
VM_FADD_M
VM_FSUB_R
VM_FSUB_M
VM_FSCAL_R
VM_FMUL_R
VM_FDIV_M
VM_FSQRT_R
VM_CBRANCH
VM_CFROUND
VM_ISTORE
)


@@ -30,14 +30,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/aes"
"unsafe"
)
import "encoding/binary"
//reference https://github.com/tevador/RandomX/blob/master/doc/specs.md#51-instruction-encoding
// since go does not have union, use byte array
type VM_Instruction []byte // it is hardcode 8 bytes
// VM_Instruction uses a byte array since go does not have unions
type VM_Instruction [8]byte // hardcoded to 8 bytes
func (ins VM_Instruction) IMM() uint32 {
return binary.LittleEndian.Uint32(ins[4:])
@@ -55,9 +56,9 @@ func (ins VM_Instruction) Opcode() byte {
return ins[0]
}
// CompileToBytecode this will interpret single vm instruction
// CompileProgramToByteCode interprets each VM instruction into executable opcodes
// reference https://github.com/tevador/RandomX/blob/master/doc/specs.md#52-integer-instructions
func (vm *VM) CompileToBytecode() {
func CompileProgramToByteCode(prog []byte) (bc ByteCode) {
var registerUsage [RegistersCount]int
for i := range registerUsage {
@@ -65,150 +66,130 @@ func (vm *VM) CompileToBytecode() {
}
for i := 0; i < RANDOMX_PROGRAM_SIZE; i++ {
instr := VM_Instruction(vm.Prog[i*8:])
ibc := &vm.ByteCode[i]
instr := VM_Instruction(prog[i*8:])
ibc := &bc[i]
opcode := instr.Opcode()
dst := instr.Dst() % RegistersCount // bit shift optimization
src := instr.Src() % RegistersCount
ibc.dst = dst
ibc.src = src
ibc.Dst = dst
ibc.Src = src
switch opcode {
case 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15: // 16 frequency
ibc.Opcode = VM_IADD_RS
ibc.idst = &vm.reg.r[dst]
if dst != RegisterNeedsDisplacement {
ibc.isrc = &vm.reg.r[src]
ibc.shift = (instr.Mod() >> 2) % 4
ibc.imm = 0
//shift
ibc.ImmB = (instr.Mod() >> 2) % 4
ibc.Imm = 0
} else {
ibc.isrc = &vm.reg.r[src]
ibc.shift = (instr.Mod() >> 2) % 4
ibc.imm = signExtend2sCompl(instr.IMM())
//shift
ibc.ImmB = (instr.Mod() >> 2) % 4
ibc.Imm = signExtend2sCompl(instr.IMM())
}
registerUsage[dst] = i
case 16, 17, 18, 19, 20, 21, 22: // 7
ibc.Opcode = VM_IADD_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = signExtend2sCompl(instr.IMM())
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
} else {
ibc.Opcode = VM_IADD_MZ
ibc.memMask = ScratchpadL3Mask
ibc.MemMask = ScratchpadL3Mask
ibc.Imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38: // 16
ibc.Opcode = VM_ISUB_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
if src == dst {
ibc.Imm = signExtend2sCompl(instr.IMM())
ibc.Opcode = VM_ISUB_I
}
registerUsage[dst] = i
case 39, 40, 41, 42, 43, 44, 45: // 7
ibc.Opcode = VM_ISUB_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = signExtend2sCompl(instr.IMM())
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
} else {
ibc.Opcode = VM_ISUB_MZ
ibc.memMask = ScratchpadL3Mask
ibc.MemMask = ScratchpadL3Mask
ibc.Imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61: // 16
ibc.Opcode = VM_IMUL_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
if src == dst {
ibc.Imm = signExtend2sCompl(instr.IMM())
ibc.Opcode = VM_IMUL_I
}
registerUsage[dst] = i
case 62, 63, 64, 65: //4
ibc.Opcode = VM_IMUL_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = signExtend2sCompl(instr.IMM())
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
} else {
ibc.Opcode = VM_IMUL_MZ
ibc.memMask = ScratchpadL3Mask
ibc.MemMask = ScratchpadL3Mask
ibc.Imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 66, 67, 68, 69: //4
ibc.Opcode = VM_IMULH_R
ibc.idst = &vm.reg.r[dst]
ibc.isrc = &vm.reg.r[src]
registerUsage[dst] = i
case 70: //1
ibc.Opcode = VM_IMULH_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = signExtend2sCompl(instr.IMM())
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
} else {
ibc.Opcode = VM_IMULH_MZ
ibc.memMask = ScratchpadL3Mask
ibc.MemMask = ScratchpadL3Mask
ibc.Imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 71, 72, 73, 74: //4
ibc.Opcode = VM_ISMULH_R
ibc.idst = &vm.reg.r[dst]
ibc.isrc = &vm.reg.r[src]
registerUsage[dst] = i
case 75: //1
ibc.Opcode = VM_ISMULH_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = signExtend2sCompl(instr.IMM())
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
} else {
ibc.Opcode = VM_ISMULH_MZ
ibc.memMask = ScratchpadL3Mask
ibc.MemMask = ScratchpadL3Mask
ibc.Imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 76, 77, 78, 79, 80, 81, 82, 83: // 8
divisor := instr.IMM()
if !isZeroOrPowerOf2(divisor) {
ibc.Opcode = VM_IMUL_R
ibc.idst = &vm.reg.r[dst]
ibc.imm = randomx_reciprocal(divisor)
ibc.isrc = &ibc.imm
ibc.Opcode = VM_IMUL_I
ibc.Imm = randomx_reciprocal(divisor)
registerUsage[dst] = i
} else {
ibc.Opcode = VM_NOP
@@ -216,66 +197,49 @@ func (vm *VM) CompileToBytecode() {
case 84, 85: //2
ibc.Opcode = VM_INEG_R
ibc.idst = &vm.reg.r[dst]
registerUsage[dst] = i
case 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100: //15
ibc.Opcode = VM_IXOR_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
if src == dst {
ibc.Imm = signExtend2sCompl(instr.IMM())
ibc.Opcode = VM_IXOR_I
}
registerUsage[dst] = i
case 101, 102, 103, 104, 105: //5
ibc.Opcode = VM_IXOR_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = signExtend2sCompl(instr.IMM())
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
} else {
ibc.Opcode = VM_IXOR_MZ
ibc.memMask = ScratchpadL3Mask
ibc.MemMask = ScratchpadL3Mask
ibc.Imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 106, 107, 108, 109, 110, 111, 112, 113: //8
ibc.Opcode = VM_IROR_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
if src == dst {
ibc.Imm = signExtend2sCompl(instr.IMM())
ibc.Opcode = VM_IROR_I
}
registerUsage[dst] = i
case 114, 115: // 2 IROL_R
ibc.Opcode = VM_IROL_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
if src == dst {
ibc.Imm = signExtend2sCompl(instr.IMM())
ibc.Opcode = VM_IROL_I
}
registerUsage[dst] = i
case 116, 117, 118, 119: //4
if src != dst {
ibc.Opcode = VM_ISWAP_R
ibc.idst = &vm.reg.r[dst]
ibc.isrc = &vm.reg.r[src]
registerUsage[dst] = i
registerUsage[src] = i
} else {
@@ -285,87 +249,77 @@ func (vm *VM) CompileToBytecode() {
// below are floating point instructions
case 120, 121, 122, 123: // 4
ibc.Opcode = VM_FSWAP_R
//ibc.Opcode = VM_FSWAP_R
if dst < RegistersCountFloat {
ibc.fdst = &vm.reg.f[dst]
ibc.Opcode = VM_FSWAP_RF
} else {
ibc.fdst = &vm.reg.e[dst-RegistersCountFloat]
ibc.Opcode = VM_FSWAP_RE
ibc.Dst = dst - RegistersCountFloat
}
case 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139: //16
dst := instr.Dst() % RegistersCountFloat // bit shift optimization
src := instr.Src() % RegistersCountFloat
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Src = instr.Src() % RegistersCountFloat
ibc.Opcode = VM_FADD_R
ibc.fdst = &vm.reg.f[dst]
ibc.fsrc = &vm.reg.a[src]
case 140, 141, 142, 143, 144: //5
dst := instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FADD_M
ibc.fdst = &vm.reg.f[dst]
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = signExtend2sCompl(instr.IMM())
case 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160: //16
dst := instr.Dst() % RegistersCountFloat // bit shift optimization
src := instr.Src() % RegistersCountFloat
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Src = instr.Src() % RegistersCountFloat
ibc.Opcode = VM_FSUB_R
ibc.fdst = &vm.reg.f[dst]
ibc.fsrc = &vm.reg.a[src]
case 161, 162, 163, 164, 165: //5
dst := instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FSUB_M
ibc.fdst = &vm.reg.f[dst]
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = signExtend2sCompl(instr.IMM())
case 166, 167, 168, 169, 170, 171: //6
dst := instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FSCAL_R
ibc.fdst = &vm.reg.f[dst]
case 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203: //32
dst := instr.Dst() % RegistersCountFloat // bit shift optimization
src := instr.Src() % RegistersCountFloat
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Src = instr.Src() % RegistersCountFloat
ibc.Opcode = VM_FMUL_R
ibc.fdst = &vm.reg.e[dst]
ibc.fsrc = &vm.reg.a[src]
case 204, 205, 206, 207: //4
dst := instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FDIV_M
ibc.fdst = &vm.reg.e[dst]
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = signExtend2sCompl(instr.IMM())
case 208, 209, 210, 211, 212, 213: //6
dst := instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FSQRT_R
ibc.fdst = &vm.reg.e[dst]
case 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238: //25 // CBRANCH and CFROUND are interchanged
ibc.Opcode = VM_CBRANCH
reg := instr.Dst() % RegistersCount
ibc.isrc = &vm.reg.r[reg]
ibc.target = int16(registerUsage[reg])
ibc.Src = instr.Dst() % RegistersCount
target := uint16(int16(registerUsage[ibc.Src]))
ibc.Dst = uint8(target)
ibc.ImmB = uint8(target >> 8)
shift := uint64(instr.Mod()>>4) + CONDITIONOFFSET
//conditionmask := CONDITIONMASK << shift
ibc.imm = signExtend2sCompl(instr.IMM()) | (uint64(1) << shift)
ibc.Imm = signExtend2sCompl(instr.IMM()) | (uint64(1) << shift)
if CONDITIONOFFSET > 0 || shift > 0 {
ibc.imm &= (^(uint64(1) << (shift - 1)))
ibc.Imm &= ^(uint64(1) << (shift - 1))
}
ibc.memMask = CONDITIONMASK << shift
ibc.MemMask = CONDITIONMASK << shift
for j := 0; j < RegistersCount; j++ {
registerUsage[j] = i
@@ -373,23 +327,20 @@ func (vm *VM) CompileToBytecode() {
case 239: //1
ibc.Opcode = VM_CFROUND
ibc.isrc = &vm.reg.r[src]
ibc.imm = uint64(instr.IMM() & 63)
ibc.Imm = uint64(instr.IMM() & 63)
case 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255: //16
ibc.Opcode = VM_ISTORE
ibc.idst = &vm.reg.r[dst]
ibc.isrc = &vm.reg.r[src]
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = signExtend2sCompl(instr.IMM())
if (instr.Mod() >> 4) < STOREL3CONDITION {
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
} else {
ibc.memMask = ScratchpadL3Mask
ibc.MemMask = ScratchpadL3Mask
}
default:
@@ -398,21 +349,34 @@ func (vm *VM) CompileToBytecode() {
}
}
return bc
}
func (vm *VM) Load64(addr uint64) uint64 {
return *(*uint64)(unsafe.Pointer(&vm.ScratchPad[addr]))
type ScratchPad [ScratchpadSize]byte
func (pad *ScratchPad) Init(seed *[64]byte) {
// calculate and fill scratchpad
clear(pad[:])
aes.FillAes1Rx4(seed, pad[:])
}
func (vm *VM) Load32(addr uint64) uint32 {
return *(*uint32)(unsafe.Pointer(&vm.ScratchPad[addr]))
func (pad *ScratchPad) Store64(addr uint32, val uint64) {
*(*uint64)(unsafe.Pointer(&pad[addr])) = val
//binary.LittleEndian.PutUint64(pad[addr:], val)
}
func (pad *ScratchPad) Load64(addr uint32) uint64 {
return *(*uint64)(unsafe.Pointer(&pad[addr]))
}
func (pad *ScratchPad) Load32(addr uint32) uint32 {
return *(*uint32)(unsafe.Pointer(&pad[addr]))
}
func (vm *VM) Load32F(addr uint64) (lo, hi float64) {
a := *(*[2]int32)(unsafe.Pointer(&vm.ScratchPad[addr]))
func (pad *ScratchPad) Load32F(addr uint32) (lo, hi float64) {
a := *(*[2]int32)(unsafe.Pointer(&pad[addr]))
return float64(a[LOW]), float64(a[HIGH])
}
func (vm *VM) Load32FA(addr uint64) [2]float64 {
a := *(*[2]int32)(unsafe.Pointer(&vm.ScratchPad[addr]))
func (pad *ScratchPad) Load32FA(addr uint32) [2]float64 {
a := *(*[2]int32)(unsafe.Pointer(&pad[addr]))
return [2]float64{float64(a[LOW]), float64(a[HIGH])}
}
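The pointer casts above assume a little-endian host; the commented-out encoding/binary form in Store64 is the portable equivalent. A standalone illustration:

package main

import (
    "encoding/binary"
    "fmt"
    "unsafe"
)

func main() {
    var pad [16]byte
    binary.LittleEndian.PutUint64(pad[0:], 0xdeadbeef)
    fast := *(*uint64)(unsafe.Pointer(&pad[0])) // valid only on little-endian hosts
    portable := binary.LittleEndian.Uint64(pad[0:])
    fmt.Println(fast == portable) // true on amd64/arm64
}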