Move argon2 / float packages to their own folders, cleanup vm Run

2024-04-15 03:05:21 +02:00 · 2024-04-15 03:05:21 +02:00 · 432590f930
parent 5b9b3c3565
commit 432590f930
13 changed files with 480 additions and 392 deletions
--- a/argon2.go
+++ b/argon2.go
@ -1,58 +0,0 @@
 package randomx
 import "golang.org/x/crypto/blake2b"
 import (
 	_ "golang.org/x/crypto/argon2"
 	_ "unsafe"
 )
 // see reference configuration.h
 // Cache size in KiB. Must be a power of 2.
 const RANDOMX_ARGON_MEMORY = 262144
 // Number of Argon2d iterations for Cache initialization.
 const RANDOMX_ARGON_ITERATIONS = 3
 // Number of parallel lanes for Cache initialization.
 const RANDOMX_ARGON_LANES = 1
 // Argon2d salt
 const RANDOMX_ARGON_SALT = "RandomX\x03"
 const ArgonSaltSize uint32 = 8 //sizeof("" RANDOMX_ARGON_SALT) - 1
 const ArgonBlockSize uint32 = 1024
 type argonBlock [128]uint64
 const syncPoints = 4
 //go:linkname argon2_initHash golang.org/x/crypto/argon2.initHash
 func argon2_initHash(password, salt, key, data []byte, time, memory, threads, keyLen uint32, mode int) [blake2b.Size + 8]byte
 //go:linkname argon2_initBlocks golang.org/x/crypto/argon2.initBlocks
 func argon2_initBlocks(h0 *[blake2b.Size + 8]byte, memory, threads uint32) []argonBlock
 //go:linkname argon2_processBlocks golang.org/x/crypto/argon2.processBlocks
 func argon2_processBlocks(B []argonBlock, time, memory, threads uint32, mode int)
 // argon2_buildBlocks From golang.org/x/crypto/argon2.deriveKey without last deriveKey call
 func argon2_buildBlocks(password, salt, secret, data []byte, time, memory uint32, threads uint8, keyLen uint32) []argonBlock {
 	if time < 1 {
 		panic("argon2: number of rounds too small")
 	}
 	if threads < 1 {
 		panic("argon2: parallelism degree too low")
 	}
 	const mode = 0 /* argon2d */
 	h0 := argon2_initHash(password, salt, secret, data, time, memory, uint32(threads), keyLen, mode)
 	memory = memory / (syncPoints * uint32(threads)) * (syncPoints * uint32(threads))
 	if memory < 2*syncPoints*uint32(threads) {
 		memory = 2 * syncPoints * uint32(threads)
 	}
 	B := argon2_initBlocks(&h0, memory, uint32(threads))
 	argon2_processBlocks(B, time, memory, uint32(threads), mode)
 	return B
 }
--- a/argon2/argon2.go
+++ b/argon2/argon2.go
@ -0,0 +1,44 @@
 package argon2
 import "golang.org/x/crypto/blake2b"
 import (
 	_ "golang.org/x/crypto/argon2"
 	_ "unsafe"
 )
 const BlockSize uint32 = 1024
 type Block [BlockSize / 8]uint64
 const syncPoints = 4
 //go:linkname initHash golang.org/x/crypto/argon2.initHash
 func initHash(password, salt, key, data []byte, time, memory, threads, keyLen uint32, mode int) [blake2b.Size + 8]byte
 //go:linkname initBlocks golang.org/x/crypto/argon2.initBlocks
 func initBlocks(h0 *[blake2b.Size + 8]byte, memory, threads uint32) []Block
 //go:linkname processBlocks golang.org/x/crypto/argon2.processBlocks
 func processBlocks(B []Block, time, memory, threads uint32, mode int)
 // BuildBlocks fills and returns the Argon2d memory blocks for the given inputs.
 // It mirrors golang.org/x/crypto/argon2.deriveKey but stops before the final
 // key extraction, so keyLen is only fed into the initial hash.
 // It panics when time or threads is zero.
 func BuildBlocks(password, salt, secret, data []byte, time, memory uint32, threads uint8, keyLen uint32) []Block {
 	if time == 0 {
 		panic("argon2: number of rounds too small")
 	}
 	if threads == 0 {
 		panic("argon2: parallelism degree too low")
 	}
 	const argon2d = 0
 	lanes := uint32(threads)
 	// The initial hash is computed over the memory size as requested, before rounding.
 	seed := initHash(password, salt, secret, data, time, memory, lanes, keyLen, argon2d)
 	// Round memory down to a multiple of syncPoints*lanes, but never below twice that.
 	segment := syncPoints * lanes
 	memory -= memory % segment
 	if floor := 2 * segment; memory < floor {
 		memory = floor
 	}
 	blocks := initBlocks(&seed, memory, lanes)
 	processBlocks(blocks, time, memory, lanes, argon2d)
 	return blocks
 }
--- a/asm/round.go
+++ b/asm/round.go
@ -1,14 +1,7 @@
 package asm
-type RoundingMode uint8
+import "git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
-const (
+func SetRoundingMode(mode softfloat.RoundingMode) {
 	RoundingModeToNearest = RoundingMode(iota)
 	RoundingModeToNegative
 	RoundingModeToPositive
 	RoundingModeToZero
 )
 func SetRoundingMode(mode RoundingMode) {
 	setRoundingMode(uint8(mode))
 }
--- a/bytecode.go
+++ b/bytecode.go
@ -1,205 +0,0 @@
 package randomx
 import (
 	"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
 	"math"
 	"math/bits"
 )
 type ByteCodeInstruction struct {
 	Dst, Src byte
 	ImmB     uint8
 	Opcode   ByteCodeInstructionOp
 	MemMask  uint32
 	Imm      uint64
 	/*
 		union {
 			int_reg_t* idst;
 			rx_vec_f128* fdst;
 		};
 		union {
 			int_reg_t* isrc;
 			rx_vec_f128* fsrc;
 		};
 		union {
 			uint64_t imm;
 			int64_t simm;
 		};
 		InstructionType type;
 		union {
 			int16_t target;
 			uint16_t shift;
 		};
 		uint32_t memMask;
 	*/
 }
 func (i ByteCodeInstruction) jumpTarget() int {
 	return int(int16((uint16(i.ImmB) << 8) | uint16(i.Dst)))
 }
 func (i ByteCodeInstruction) getScratchpadAddress(ptr uint64) uint32 {
 	return uint32(ptr+i.Imm) & i.MemMask
 }
 func (i ByteCodeInstruction) getScratchpadZeroAddress() uint32 {
 	return uint32(i.Imm) & i.MemMask
 }
 type ByteCode [RANDOMX_PROGRAM_SIZE]ByteCodeInstruction
 func (c *ByteCode) Execute(f RegisterFile, pad *ScratchPad, eMask [2]uint64) RegisterFile {
 	for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
 		i := &c[pc]
 		switch i.Opcode {
 		case VM_IADD_RS:
 			f.r[i.Dst] += (f.r[i.Src] << i.ImmB) + i.Imm
 		case VM_IADD_M:
 			f.r[i.Dst] += pad.Load64(i.getScratchpadAddress(f.r[i.Src]))
 		case VM_IADD_MZ:
 			f.r[i.Dst] += pad.Load64(uint32(i.Imm))
 		case VM_ISUB_R:
 			f.r[i.Dst] -= f.r[i.Src]
 		case VM_ISUB_I:
 			f.r[i.Dst] -= i.Imm
 		case VM_ISUB_M:
 			f.r[i.Dst] -= pad.Load64(i.getScratchpadAddress(f.r[i.Src]))
 		case VM_ISUB_MZ:
 			f.r[i.Dst] -= pad.Load64(uint32(i.Imm))
 		case VM_IMUL_R:
 			f.r[i.Dst] *= f.r[i.Src]
 		case VM_IMUL_I:
 			// also handles imul_rcp
 			f.r[i.Dst] *= i.Imm
 		case VM_IMUL_M:
 			f.r[i.Dst] *= pad.Load64(i.getScratchpadAddress(f.r[i.Src]))
 		case VM_IMUL_MZ:
 			f.r[i.Dst] *= pad.Load64(uint32(i.Imm))
 		case VM_IMULH_R:
 			f.r[i.Dst], _ = bits.Mul64(f.r[i.Dst], f.r[i.Src])
 		case VM_IMULH_M:
 			f.r[i.Dst], _ = bits.Mul64(f.r[i.Dst], pad.Load64(i.getScratchpadAddress(f.r[i.Src])))
 		case VM_IMULH_MZ:
 			f.r[i.Dst], _ = bits.Mul64(f.r[i.Dst], pad.Load64(uint32(i.Imm)))
 		case VM_ISMULH_R:
 			f.r[i.Dst] = smulh(int64(f.r[i.Dst]), int64(f.r[i.Src]))
 		case VM_ISMULH_M:
 			f.r[i.Dst] = smulh(int64(f.r[i.Dst]), int64(pad.Load64(i.getScratchpadAddress(f.r[i.Src]))))
 		case VM_ISMULH_MZ:
 			f.r[i.Dst] = smulh(int64(f.r[i.Dst]), int64(pad.Load64(uint32(i.Imm))))
 		case VM_INEG_R:
 			//f.r[i.Dst] = (^(f.r[i.Dst])) + 1 // 2's complement negative
 			f.r[i.Dst] = -f.r[i.Dst]
 		case VM_IXOR_R:
 			f.r[i.Dst] ^= f.r[i.Src]
 		case VM_IXOR_I:
 			f.r[i.Dst] ^= i.Imm
 		case VM_IXOR_M:
 			f.r[i.Dst] ^= pad.Load64(i.getScratchpadAddress(f.r[i.Src]))
 		case VM_IXOR_MZ:
 			f.r[i.Dst] ^= pad.Load64(uint32(i.Imm))
 		case VM_IROR_R:
 			f.r[i.Dst] = bits.RotateLeft64(f.r[i.Dst], 0-int(f.r[i.Src]&63))
 		case VM_IROR_I:
 			//todo: can merge into VM_IROL_I
 			f.r[i.Dst] = bits.RotateLeft64(f.r[i.Dst], 0-int(i.Imm&63))
 		case VM_IROL_R:
 			f.r[i.Dst] = bits.RotateLeft64(f.r[i.Dst], int(f.r[i.Src]&63))
 		case VM_IROL_I:
 			f.r[i.Dst] = bits.RotateLeft64(f.r[i.Dst], int(i.Imm&63))
 		case VM_ISWAP_R:
 			f.r[i.Dst], f.r[i.Src] = f.r[i.Src], f.r[i.Dst]
 		case VM_FSWAP_RF:
 			f.f[i.Dst][HIGH], f.f[i.Dst][LOW] = f.f[i.Dst][LOW], f.f[i.Dst][HIGH]
 		case VM_FSWAP_RE:
 			f.e[i.Dst][HIGH], f.e[i.Dst][LOW] = f.e[i.Dst][LOW], f.e[i.Dst][HIGH]
 		case VM_FADD_R:
 			f.f[i.Dst][LOW] += f.a[i.Src][LOW]
 			f.f[i.Dst][HIGH] += f.a[i.Src][HIGH]
 		case VM_FADD_M:
 			lo, hi := pad.Load32F(i.getScratchpadAddress(f.r[i.Src]))
 			f.f[i.Dst][LOW] += lo
 			f.f[i.Dst][HIGH] += hi
 		case VM_FSUB_R:
 			f.f[i.Dst][LOW] -= f.a[i.Src][LOW]
 			f.f[i.Dst][HIGH] -= f.a[i.Src][HIGH]
 		case VM_FSUB_M:
 			lo, hi := pad.Load32F(i.getScratchpadAddress(f.r[i.Src]))
 			f.f[i.Dst][LOW] -= lo
 			f.f[i.Dst][HIGH] -= hi
 		case VM_FSCAL_R:
 			// no dependent on rounding modes
 			f.f[i.Dst][LOW] = math.Float64frombits(math.Float64bits(f.f[i.Dst][LOW]) ^ 0x80F0000000000000)
 			f.f[i.Dst][HIGH] = math.Float64frombits(math.Float64bits(f.f[i.Dst][HIGH]) ^ 0x80F0000000000000)
 		case VM_FMUL_R:
 			f.e[i.Dst][LOW] *= f.a[i.Src][LOW]
 			f.e[i.Dst][HIGH] *= f.a[i.Src][HIGH]
 		case VM_FDIV_M:
 			lo, hi := pad.Load32F(i.getScratchpadAddress(f.r[i.Src]))
 			f.e[i.Dst][LOW] /= MaskRegisterExponentMantissa(lo, eMask[LOW])
 			f.e[i.Dst][HIGH] /= MaskRegisterExponentMantissa(hi, eMask[HIGH])
 		case VM_FSQRT_R:
 			f.e[i.Dst][LOW] = math.Sqrt(f.e[i.Dst][LOW])
 			f.e[i.Dst][HIGH] = math.Sqrt(f.e[i.Dst][HIGH])
 		case VM_CBRANCH:
 			f.r[i.Src] += i.Imm
 			if (f.r[i.Src] & uint64(i.MemMask)) == 0 {
 				pc = i.jumpTarget()
 			}
 		case VM_CFROUND:
 			tmp := (bits.RotateLeft64(f.r[i.Src], 0-int(i.Imm))) % 4 // rotate right
 			asm.SetRoundingMode(asm.RoundingMode(tmp))
 		case VM_ISTORE:
 			pad.Store64(i.getScratchpadAddress(f.r[i.Dst]), f.r[i.Src])
 		case VM_NOP: // we do nothing
 		}
 	}
 	return f
 }
 type ByteCodeInstructionOp int
 const (
 	VM_NOP = ByteCodeInstructionOp(iota)
 	VM_IADD_RS
 	VM_IADD_M
 	VM_IADD_MZ
 	VM_ISUB_R
 	VM_ISUB_I
 	VM_ISUB_M
 	VM_ISUB_MZ
 	VM_IMUL_R
 	VM_IMUL_I
 	VM_IMUL_M
 	VM_IMUL_MZ
 	VM_IMULH_R
 	VM_IMULH_M
 	VM_IMULH_MZ
 	VM_ISMULH_R
 	VM_ISMULH_M
 	VM_ISMULH_MZ
 	VM_INEG_R
 	VM_IXOR_R
 	VM_IXOR_I
 	VM_IXOR_M
 	VM_IXOR_MZ
 	VM_IROR_R
 	VM_IROR_I
 	VM_IROL_R
 	VM_IROL_I
 	VM_ISWAP_R
 	VM_FSWAP_RF
 	VM_FSWAP_RE
 	VM_FADD_R
 	VM_FADD_M
 	VM_FSUB_R
 	VM_FSUB_M
 	VM_FSCAL_R
 	VM_FMUL_R
 	VM_FDIV_M
 	VM_FSQRT_R
 	VM_CBRANCH
 	VM_CFROUND
 	VM_ISTORE
 )
--- a/cache.go
+++ b/cache.go
@ -1,6 +1,7 @@
 package randomx
 import (
 	"git.gammaspectra.live/P2Pool/go-randomx/v2/argon2"
 	"git.gammaspectra.live/P2Pool/go-randomx/v2/keys"
 	"runtime"
 	"slices"
@ -66,9 +67,9 @@ func (cache *Randomx_Cache) Init(key []byte) {
 	kkey := slices.Clone(key)
-	argonBlocks := argon2_buildBlocks(kkey, []byte(RANDOMX_ARGON_SALT), []byte{}, []byte{}, RANDOMX_ARGON_ITERATIONS, RANDOMX_ARGON_MEMORY, RANDOMX_ARGON_LANES, 0)
+	argonBlocks := argon2.BuildBlocks(kkey, []byte(RANDOMX_ARGON_SALT), []byte{}, []byte{}, RANDOMX_ARGON_ITERATIONS, RANDOMX_ARGON_MEMORY, RANDOMX_ARGON_LANES, 0)
-	memoryBlocks := unsafe.Slice((*MemoryBlock)(unsafe.Pointer(unsafe.SliceData(argonBlocks))), int(unsafe.Sizeof(argonBlock{}))/int(unsafe.Sizeof(MemoryBlock{}))*len(argonBlocks))
+	memoryBlocks := unsafe.Slice((*MemoryBlock)(unsafe.Pointer(unsafe.SliceData(argonBlocks))), int(unsafe.Sizeof(argon2.Block{}))/int(unsafe.Sizeof(MemoryBlock{}))*len(argonBlocks))
 	cache.Blocks = memoryBlocks
--- a/config.go
+++ b/config.go
@ -29,6 +29,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 package randomx
 import "git.gammaspectra.live/P2Pool/go-randomx/v2/argon2"
 // see reference configuration.h
 // Cache size in KiB. Must be a power of 2.
 const RANDOMX_ARGON_MEMORY = 262144
 // Number of Argon2d iterations for Cache initialization.
 const RANDOMX_ARGON_ITERATIONS = 3
 // Number of parallel lanes for Cache initialization.
 const RANDOMX_ARGON_LANES = 1
 // Argon2d salt
 const RANDOMX_ARGON_SALT = "RandomX\x03"
 const ArgonSaltSize uint32 = 8 //sizeof("" RANDOMX_ARGON_SALT) - 1
 // Number of random Cache accesses per Dataset item. Minimum is 2.
 const RANDOMX_CACHE_ACCESSES = 8
@ -74,7 +90,7 @@ const ScratchpadSize uint32 = RANDOMX_SCRATCHPAD_L3
 const CacheLineAlignMask = (RANDOMX_DATASET_BASE_SIZE - 1) & (^(CacheLineSize - 1))
-const CacheSize uint64 = RANDOMX_ARGON_MEMORY * uint64(ArgonBlockSize)
+const CacheSize uint64 = RANDOMX_ARGON_MEMORY * uint64(argon2.BlockSize)
 const ScratchpadL1 = RANDOMX_SCRATCHPAD_L1 / 8
 const ScratchpadL2 = RANDOMX_SCRATCHPAD_L2 / 8
@ -90,16 +106,6 @@ const CONDITIONOFFSET = RANDOMX_JUMP_OFFSET
 const CONDITIONMASK = (1 << RANDOMX_JUMP_BITS) - 1
 const STOREL3CONDITION = 14
 const mantissaSize = 52
 const exponentSize = 11
 const mantissaMask = (uint64(1) << mantissaSize) - 1
 const exponentMask = (uint64(1) << exponentSize) - 1
 const exponentBias = 1023
 const dynamicExponentBits = 4
 const staticExponentBits = 4
 const constExponentBits uint64 = 0x300
 const dynamicMantissaMask = (uint64(1) << (mantissaSize + dynamicExponentBits)) - 1
 const RANDOMX_FLAG_DEFAULT = uint64(0)
 const RANDOMX_FLAG_JIT = uint64(1 << iota)
--- a/register.go
+++ b/register.go
@ -1,5 +1,10 @@
 package randomx
 import (
 	"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
 	"git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
 )
 const RegistersCount = 8
 const RegistersCountFloat = 4
@ -9,10 +14,20 @@ const HIGH = 1
 type RegisterLine [RegistersCount]uint64
 type RegisterFile struct {
-	r RegisterLine
+	R RegisterLine
-	f [RegistersCountFloat][2]float64
+	F [RegistersCountFloat][2]float64
-	e [RegistersCountFloat][2]float64
+	E [RegistersCountFloat][2]float64
-	a [RegistersCountFloat][2]float64
+	A [RegistersCountFloat][2]float64
 	FPRC softfloat.RoundingMode
 }
 // SetRoundingMode records mode in FPRC and forwards it to asm.SetRoundingMode.
 // Nothing happens when FPRC already equals mode.
 func (f *RegisterFile) SetRoundingMode(mode softfloat.RoundingMode) {
 	if f.FPRC != mode {
 		f.FPRC = mode
 		asm.SetRoundingMode(mode)
 	}
 }
 type MemoryRegisters struct {
--- a/softfloat/const.go
+++ b/softfloat/const.go
@ -0,0 +1,37 @@
 package softfloat
 const (
 	mantbits64 uint = 52
 	expbits64  uint = 11
 	bias64          = -1<<(expbits64-1) + 1
 	nan64 uint64 = (1<<expbits64-1)<<mantbits64 + 1<<(mantbits64-1) // quiet NaN, 0 payload
 	inf64 uint64 = (1<<expbits64 - 1) << mantbits64
 	neg64 uint64 = 1 << (expbits64 + mantbits64)
 )
 const mantissaMask = (uint64(1) << mantbits64) - 1
 const exponentMask = (uint64(1) << expbits64) - 1
 const exponentBias = 1023
 const dynamicExponentBits = 4
 const staticExponentBits = 4
 const constExponentBits uint64 = 0x300
 const dynamicMantissaMask = (uint64(1) << (mantbits64 + dynamicExponentBits)) - 1
 const mask22bit = (uint64(1) << 22) - 1
 type RoundingMode uint8
 const (
 	// RoundingModeToNearest IEEE 754 roundTiesToEven
 	RoundingModeToNearest = RoundingMode(iota)
 	// RoundingModeToNegative IEEE 754 roundTowardNegative
 	RoundingModeToNegative
 	// RoundingModeToPositive IEEE 754 roundTowardPositive
 	RoundingModeToPositive
 	// RoundingModeToZero IEEE 754 roundTowardZero
 	RoundingModeToZero
 )
--- a/softfloat/funcs.go
+++ b/softfloat/funcs.go
@ -0,0 +1,35 @@
 package softfloat
 import "math"
 // MaskRegisterExponentMantissa keeps the bits of f selected by dynamicMantissaMask
 // and ORs mode into the result.
 func MaskRegisterExponentMantissa(f float64, mode uint64) float64 {
 	kept := math.Float64bits(f) & dynamicMantissaMask
 	return math.Float64frombits(kept | mode)
 }
 // ScaleNegate flips the sign bit and bits 52-55 (the four lowest exponent bits)
 // of f. It is a pure bit operation, so applying it twice returns the input.
 func ScaleNegate(f float64) float64 {
 	const flipMask uint64 = 0x80F0000000000000
 	raw := math.Float64bits(f)
 	return math.Float64frombits(raw ^ flipMask)
 }
 func SmallPositiveFloatBits(entropy uint64) float64 {
 	exponent := entropy >> 59 //0..31
 	mantissa := entropy & mantissaMask
 	exponent += exponentBias
 	exponent &= exponentMask
 	exponent = exponent << mantbits64
 	return math.Float64frombits(exponent | mantissa)
 }
 func StaticExponent(entropy uint64) uint64 {
 	exponent := constExponentBits
 	exponent |= (entropy >> (64 - staticExponentBits)) << dynamicExponentBits
 	exponent <<= mantbits64
 	return exponent
 }
 // EMask returns the low 22 bits of entropy ORed with StaticExponent(entropy).
 func EMask(entropy uint64) uint64 {
 	fraction := entropy & mask22bit
 	return StaticExponent(entropy) | fraction
 }
 // Xor returns the float64 whose bit pattern is the bitwise XOR of the bit
 // patterns of a and b.
 func Xor(a, b float64) float64 {
 	mixed := math.Float64bits(a) ^ math.Float64bits(b)
 	return math.Float64frombits(mixed)
 }
--- a/softfloat/softfloat.go
+++ b/softfloat/softfloat.go
@ -0,0 +1,27 @@
 package softfloat
 import (
 	_ "runtime"
 	_ "unsafe"
 )
 //go:linkname funpack64 runtime.funpack64
 func funpack64(f uint64) (sign, mant uint64, exp int, inf, nan bool)
 //go:linkname fpack64 runtime.fpack64
 func fpack64(sign, mant uint64, exp int, trunc uint64) uint64
 //go:linkname fadd64 runtime.fadd64
 func fadd64(f, g uint64) uint64
 //go:linkname fsub64 runtime.fsub64
 func fsub64(f, g uint64) uint64
 //go:linkname fneg64 runtime.fneg64
 func fneg64(f uint64) uint64
 // fmul64 is the runtime's software float64 multiplication; it takes both
 // operands as raw IEEE 754 bit patterns and returns the product's bit pattern.
 // The prototype must match runtime.fmul64 exactly (two operands), otherwise
 // calls through this linkname pass a mismatched argument frame.
 //go:linkname fmul64 runtime.fmul64
 func fmul64(f, g uint64) uint64
 // fdiv64 is the runtime's software float64 division (f / g) on raw IEEE 754
 // bit patterns. As with fmul64, the prototype must carry both operands.
 //go:linkname fdiv64 runtime.fdiv64
 func fdiv64(f, g uint64) uint64
--- a/vm.go
+++ b/vm.go
@ -31,9 +31,10 @@ package randomx
 import (
 	"git.gammaspectra.live/P2Pool/go-randomx/v2/aes"
-	"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
+	"git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
 	"math"
 	"runtime"
 	"unsafe"
 )
 import "encoding/binary"
 import "golang.org/x/crypto/blake2b"
@ -45,17 +46,10 @@ type REG struct {
 type VM struct {
 	StateStart [64]byte
 	buffer     [RANDOMX_PROGRAM_SIZE*8 + 16*8]byte // first 128 bytes are entropy below rest are program bytes
 	Prog       []byte
 	ScratchPad ScratchPad
 	ByteCode ByteCode
 	// program configuration  see program.hpp
 	entropy [16]uint64
 	reg           RegisterFile // the register file
 	mem           MemoryRegisters
 	config        Config // configuration
 	datasetOffset uint64
@ -66,48 +60,47 @@ type VM struct {
 }
 func MaskRegisterExponentMantissa(f float64, mode uint64) float64 {
 	return math.Float64frombits((math.Float64bits(f) & dynamicMantissaMask) | mode)
 }
 // Config holds the per-program settings that Run derives from the program entropy.
 type Config struct {
 	// eMask holds the LOW and HIGH masks produced by softfloat.EMask; they are
 	// applied to the E registers and passed to ByteCode.Execute.
 	eMask   [2]uint64
 	// readReg holds four integer-register indices used when mixing the
 	// scratchpad addresses and the mx memory register.
 	readReg [4]uint64
 }
 // Run calculate hash based on input
-func (vm *VM) Run(inputHash [64]byte) {
+// Warning: code invoked by Run (ByteCode.Execute) changes the rounding mode through asm.SetRoundingMode.
 // It is the caller's responsibility to set and restore the mode to softfloat.RoundingModeToNearest between full executions.
 // Additionally, calling runtime.LockOSThread with a deferred runtime.UnlockOSThread is recommended so that other goroutines do not share these changes.
 func (vm *VM) Run(inputHash [64]byte, roundingMode softfloat.RoundingMode) (reg RegisterFile) {
-	aes.FillAes4Rx4(inputHash, vm.buffer[:])
+	reg.FPRC = roundingMode
-	for i := range vm.entropy {
+	// buffer first 128 bytes are entropy below rest are program bytes
-		vm.entropy[i] = binary.LittleEndian.Uint64(vm.buffer[i*8:])
+	var buffer [16*8 + RANDOMX_PROGRAM_SIZE*8]byte
-	}
+	aes.FillAes4Rx4(inputHash, buffer[:])
-	vm.Prog = vm.buffer[len(vm.entropy)*8:]
+	entropy := (*[16]uint64)(unsafe.Pointer(&buffer))
-	clear(vm.reg.r[:])
+	prog := buffer[len(entropy)*8:]
 	// do more initialization before we run
-	for i := range vm.entropy[:8] {
+	for i := range entropy[:8] {
-		vm.reg.a[i/2][i%2] = math.Float64frombits(getSmallPositiveFloatBits(vm.entropy[i]))
+		reg.A[i/2][i%2] = softfloat.SmallPositiveFloatBits(entropy[i])
 	}
-	vm.mem.ma = vm.entropy[8] & CacheLineAlignMask
+	vm.mem.ma = entropy[8] & CacheLineAlignMask
-	vm.mem.mx = vm.entropy[10]
+	vm.mem.mx = entropy[10]
-	addressRegisters := vm.entropy[12]
+	addressRegisters := entropy[12]
 	for i := range vm.config.readReg {
 		vm.config.readReg[i] = uint64(i*2) + (addressRegisters & 1)
 		addressRegisters >>= 1
 	}
-	vm.datasetOffset = (vm.entropy[13] % (DATASETEXTRAITEMS + 1)) * CacheLineSize
+	vm.datasetOffset = (entropy[13] % (DATASETEXTRAITEMS + 1)) * CacheLineSize
-	vm.config.eMask[LOW] = getFloatMask(vm.entropy[14])
+	vm.config.eMask[LOW] = softfloat.EMask(entropy[14])
-	vm.config.eMask[HIGH] = getFloatMask(vm.entropy[15])
+	vm.config.eMask[HIGH] = softfloat.EMask(entropy[15])
-	vm.CompileToBytecode()
+	vm.ByteCode = CompileProgramToByteCode(prog)
 	spAddr0 := vm.mem.mx
 	spAddr1 := vm.mem.ma
@ -115,50 +108,52 @@ func (vm *VM) Run(inputHash [64]byte) {
 	var rlCache RegisterLine
 	for ic := 0; ic < RANDOMX_PROGRAM_ITERATIONS; ic++ {
-		spMix := vm.reg.r[vm.config.readReg[0]] ^ vm.reg.r[vm.config.readReg[1]]
+		spMix := reg.R[vm.config.readReg[0]] ^ reg.R[vm.config.readReg[1]]
 		spAddr0 ^= spMix
 		spAddr0 &= ScratchpadL3Mask64
 		spAddr1 ^= spMix >> 32
 		spAddr1 &= ScratchpadL3Mask64
 		//TODO: optimize these loads!
 		for i := uint64(0); i < RegistersCount; i++ {
-			vm.reg.r[i] ^= vm.ScratchPad.Load64(uint32(spAddr0 + 8*i))
+			reg.R[i] ^= vm.ScratchPad.Load64(uint32(spAddr0 + 8*i))
 		}
 		for i := uint64(0); i < RegistersCountFloat; i++ {
-			vm.reg.f[i] = vm.ScratchPad.Load32FA(uint32(spAddr1 + 8*i))
+			reg.F[i] = vm.ScratchPad.Load32FA(uint32(spAddr1 + 8*i))
 		}
 		for i := uint64(0); i < RegistersCountFloat; i++ {
-			vm.reg.e[i] = vm.ScratchPad.Load32FA(uint32(spAddr1 + 8*(i+RegistersCountFloat)))
+			reg.E[i] = vm.ScratchPad.Load32FA(uint32(spAddr1 + 8*(i+RegistersCountFloat)))
-			vm.reg.e[i][LOW] = MaskRegisterExponentMantissa(vm.reg.e[i][LOW], vm.config.eMask[LOW])
+			reg.E[i][LOW] = softfloat.MaskRegisterExponentMantissa(reg.E[i][LOW], vm.config.eMask[LOW])
-			vm.reg.e[i][HIGH] = MaskRegisterExponentMantissa(vm.reg.e[i][HIGH], vm.config.eMask[HIGH])
+			reg.E[i][HIGH] = softfloat.MaskRegisterExponentMantissa(reg.E[i][HIGH], vm.config.eMask[HIGH])
 		}
-		vm.reg = vm.ByteCode.Execute(vm.reg, &vm.ScratchPad, vm.config.eMask)
+		// Run the actual bytecode
 		vm.ByteCode.Execute(&reg, &vm.ScratchPad, vm.config.eMask)
-		vm.mem.mx ^= vm.reg.r[vm.config.readReg[2]] ^ vm.reg.r[vm.config.readReg[3]]
+		vm.mem.mx ^= reg.R[vm.config.readReg[2]] ^ reg.R[vm.config.readReg[3]]
 		vm.mem.mx &= CacheLineAlignMask
 		vm.Dataset.PrefetchDataset(vm.datasetOffset + vm.mem.mx)
 		// execute diffuser superscalar program to get dataset 64 bytes
-		vm.Dataset.ReadDataset(vm.datasetOffset+vm.mem.ma, &vm.reg.r, &rlCache)
+		vm.Dataset.ReadDataset(vm.datasetOffset+vm.mem.ma, &reg.R, &rlCache)
 		// swap the elements
 		vm.mem.mx, vm.mem.ma = vm.mem.ma, vm.mem.mx
 		for i := uint64(0); i < RegistersCount; i++ {
-			vm.ScratchPad.Store64(uint32(spAddr1+8*i), vm.reg.r[i])
+			vm.ScratchPad.Store64(uint32(spAddr1+8*i), reg.R[i])
 		}
 		for i := uint64(0); i < RegistersCountFloat; i++ {
-			vm.reg.f[i][LOW] = math.Float64frombits(math.Float64bits(vm.reg.f[i][LOW]) ^ math.Float64bits(vm.reg.e[i][LOW]))
+			reg.F[i][LOW] = softfloat.Xor(reg.F[i][LOW], reg.E[i][LOW])
-			vm.reg.f[i][HIGH] = math.Float64frombits(math.Float64bits(vm.reg.f[i][HIGH]) ^ math.Float64bits(vm.reg.e[i][HIGH]))
+			reg.F[i][HIGH] = softfloat.Xor(reg.F[i][HIGH], reg.E[i][HIGH])
-			vm.ScratchPad.Store64(uint32(spAddr0+16*i), math.Float64bits(vm.reg.f[i][LOW]))
+			vm.ScratchPad.Store64(uint32(spAddr0+16*i), math.Float64bits(reg.F[i][LOW]))
-			vm.ScratchPad.Store64(uint32(spAddr0+16*i+8), math.Float64bits(vm.reg.f[i][HIGH]))
+			vm.ScratchPad.Store64(uint32(spAddr0+16*i+8), math.Float64bits(reg.F[i][HIGH]))
 		}
 		spAddr0 = 0
@ -166,56 +161,52 @@ func (vm *VM) Run(inputHash [64]byte) {
 	}
 	return reg
 }
 // InitScratchpad initializes the VM's scratchpad from the given 64-byte seed
 // by delegating to ScratchPad.Init.
 func (vm *VM) InitScratchpad(seed *[64]byte) {
 	vm.ScratchPad.Init(seed)
 }
-func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
+func (vm *VM) RunLoops(tempHash [64]byte) RegisterFile {
 	var buf [8]byte
 	hash512, _ := blake2b.New512(nil)
 	// Lock thread due to rounding mode flags
 	runtime.LockOSThread()
 	defer runtime.UnlockOSThread()
 	//restore rounding mode to golang expected one
 	defer asm.SetRoundingMode(asm.RoundingModeToNearest)
-	// reset rounding mode if new hash being calculated
+	roundingMode := softfloat.RoundingModeToNearest
 	asm.SetRoundingMode(asm.RoundingModeToNearest)
 	tempHash := blake2b.Sum512(input)
 	vm.InitScratchpad(&tempHash)
 	hash512, _ := blake2b.New512(nil)
 	for chain := 0; chain < RANDOMX_PROGRAM_COUNT-1; chain++ {
-		vm.Run(tempHash)
+		reg := vm.Run(tempHash, roundingMode)
 		roundingMode = reg.FPRC
 		hash512.Reset()
-		for i := range vm.reg.r {
+		for i := range reg.R {
-			binary.LittleEndian.PutUint64(buf[:], vm.reg.r[i])
+			binary.LittleEndian.PutUint64(buf[:], reg.R[i])
 			hash512.Write(buf[:])
 		}
-		for i := range vm.reg.f {
+		for i := range reg.F {
-			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][LOW]))
+			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.F[i][LOW]))
 			hash512.Write(buf[:])
-			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][HIGH]))
+			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.F[i][HIGH]))
 			hash512.Write(buf[:])
 		}
-		for i := range vm.reg.e {
+		for i := range reg.E {
-			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][LOW]))
+			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.E[i][LOW]))
 			hash512.Write(buf[:])
-			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][HIGH]))
+			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.E[i][HIGH]))
 			hash512.Write(buf[:])
 		}
-		for i := range vm.reg.a {
+		for i := range reg.A {
-			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.a[i][LOW]))
+			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.A[i][LOW]))
 			hash512.Write(buf[:])
-			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.a[i][HIGH]))
+			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.A[i][HIGH]))
 			hash512.Write(buf[:])
 		}
@ -223,7 +214,22 @@ func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
 	}
 	// final loop executes here
-	vm.Run(tempHash)
+	reg := vm.Run(tempHash, roundingMode)
 	roundingMode = reg.FPRC
 	reg.SetRoundingMode(softfloat.RoundingModeToNearest)
 	return reg
 }
 func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
 	var buf [8]byte
 	tempHash := blake2b.Sum512(input)
 	vm.InitScratchpad(&tempHash)
 	reg := vm.RunLoops(tempHash)
 	// now hash the scratch pad and place into register a
 	aes.HashAes1Rx4(vm.ScratchPad[:], &tempHash)
@ -232,22 +238,22 @@ func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
 	hash256.Reset()
-	for i := range vm.reg.r {
+	for i := range reg.R {
-		binary.LittleEndian.PutUint64(buf[:], vm.reg.r[i])
+		binary.LittleEndian.PutUint64(buf[:], reg.R[i])
 		hash256.Write(buf[:])
 	}
-	for i := range vm.reg.f {
+	for i := range reg.F {
-		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][LOW]))
+		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.F[i][LOW]))
 		hash256.Write(buf[:])
-		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][HIGH]))
+		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.F[i][HIGH]))
 		hash256.Write(buf[:])
 	}
-	for i := range vm.reg.e {
+	for i := range reg.E {
-		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][LOW]))
+		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.E[i][LOW]))
 		hash256.Write(buf[:])
-		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][HIGH]))
+		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.E[i][HIGH]))
 		hash256.Write(buf[:])
 	}
@ -256,25 +262,3 @@ func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
 	hash256.Sum(output[:0])
 }
 const mask22bit = (uint64(1) << 22) - 1
 func getSmallPositiveFloatBits(entropy uint64) uint64 {
 	exponent := entropy >> 59 //0..31
 	mantissa := entropy & mantissaMask
 	exponent += exponentBias
 	exponent &= exponentMask
 	exponent = exponent << mantissaSize
 	return exponent | mantissa
 }
 func getStaticExponent(entropy uint64) uint64 {
 	exponent := constExponentBits
 	exponent |= (entropy >> (64 - staticExponentBits)) << dynamicExponentBits
 	exponent <<= mantissaSize
 	return exponent
 }
 func getFloatMask(entropy uint64) uint64 {
 	return (entropy & mask22bit) | getStaticExponent(entropy)
 }
--- a/vm_bytecode.go
+++ b/vm_bytecode.go
@ -0,0 +1,207 @@
 package randomx
 import (
 	"git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
 	"math"
 	"math/bits"
 )
 // ByteCodeInstruction is one decoded instruction as consumed by ByteCode.Execute.
 type ByteCodeInstruction struct {
 	// Dst and Src index into the register file. For branches Dst doubles as
 	// the low byte of the jump target (see jumpTarget).
 	Dst, Src byte
 	// ImmB is the shift amount of VM_IADD_RS and the high byte of the jump target.
 	ImmB     uint8
 	// Opcode selects the operation executed for this instruction.
 	Opcode   ByteCodeInstructionOp
 	// MemMask is ANDed with computed scratchpad addresses; VM_CBRANCH uses it
 	// as the condition mask.
 	MemMask  uint32
 	// Imm is the 64-bit immediate operand.
 	Imm      uint64
 	// The C-style layout below is presumably the equivalent struct from the
 	// reference implementation, kept for comparison — TODO confirm.
 	/*
 		union {
 			int_reg_t* idst;
 			rx_vec_f128* fdst;
 		};
 		union {
 			int_reg_t* isrc;
 			rx_vec_f128* fsrc;
 		};
 		union {
 			uint64_t imm;
 			int64_t simm;
 		};
 		InstructionType type;
 		union {
 			int16_t target;
 			uint16_t shift;
 		};
 		uint32_t memMask;
 	*/
 }
 // jumpTarget decodes the signed 16-bit branch target stored with ImmB as the
 // high byte and Dst as the low byte.
 func (i ByteCodeInstruction) jumpTarget() int {
 	hi := uint16(i.ImmB) << 8
 	lo := uint16(i.Dst)
 	return int(int16(hi | lo))
 }
 // getScratchpadAddress adds the immediate to ptr, truncates the sum to 32 bits
 // and applies MemMask.
 func (i ByteCodeInstruction) getScratchpadAddress(ptr uint64) uint32 {
 	addr := uint32(ptr + i.Imm)
 	return addr & i.MemMask
 }
 // getScratchpadZeroAddress returns the immediate truncated to 32 bits with
 // MemMask applied, i.e. the address used when no register contributes.
 func (i ByteCodeInstruction) getScratchpadZeroAddress() uint32 {
 	addr := uint32(i.Imm)
 	return addr & i.MemMask
 }
 type ByteCode [RANDOMX_PROGRAM_SIZE]ByteCodeInstruction
 // Execute runs the compiled RandomX program once, from instruction 0 to the end,
 // against the given register file and scratchpad.
 //
 // f is updated in place. pad is read by the memory-operand instructions and
 // written by VM_ISTORE. eMask supplies the LOW/HIGH masks applied to the
 // divisor loaded by VM_FDIV_M.
 //
 // Warning: VM_CFROUND calls f.SetRoundingMode, which changes the rounding mode.
 // It is the caller's responsibility to set and restore the mode to softfloat.RoundingModeToNearest between full executions.
 // Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread is recommended to prevent other goroutines sharing these changes.
 func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
 	for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
 		i := &c[pc]
 		switch i.Opcode {
 		case VM_NOP: // we do nothing
 		case VM_IADD_RS:
 			f.R[i.Dst] += (f.R[i.Src] << i.ImmB) + i.Imm
 		case VM_IADD_M:
 			f.R[i.Dst] += pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
 		case VM_IADD_MZ:
 			f.R[i.Dst] += pad.Load64(uint32(i.Imm))
 		case VM_ISUB_R:
 			f.R[i.Dst] -= f.R[i.Src]
 		case VM_ISUB_I:
 			f.R[i.Dst] -= i.Imm
 		case VM_ISUB_M:
 			f.R[i.Dst] -= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
 		case VM_ISUB_MZ:
 			f.R[i.Dst] -= pad.Load64(uint32(i.Imm))
 		case VM_IMUL_R:
 			f.R[i.Dst] *= f.R[i.Src]
 		case VM_IMUL_I:
 			// also handles imul_rcp
 			f.R[i.Dst] *= i.Imm
 		case VM_IMUL_M:
 			f.R[i.Dst] *= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
 		case VM_IMUL_MZ:
 			f.R[i.Dst] *= pad.Load64(uint32(i.Imm))
 		case VM_IMULH_R:
 			// bits.Mul64 returns (hi, lo); only the high 64 bits are kept
 			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], f.R[i.Src])
 		case VM_IMULH_M:
 			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(i.getScratchpadAddress(f.R[i.Src])))
 		case VM_IMULH_MZ:
 			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(uint32(i.Imm)))
 		case VM_ISMULH_R:
 			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(f.R[i.Src]))
 		case VM_ISMULH_M:
 			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(i.getScratchpadAddress(f.R[i.Src]))))
 		case VM_ISMULH_MZ:
 			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(uint32(i.Imm))))
 		case VM_INEG_R:
 			f.R[i.Dst] = -f.R[i.Dst]
 		case VM_IXOR_R:
 			f.R[i.Dst] ^= f.R[i.Src]
 		case VM_IXOR_I:
 			f.R[i.Dst] ^= i.Imm
 		case VM_IXOR_M:
 			f.R[i.Dst] ^= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
 		case VM_IXOR_MZ:
 			f.R[i.Dst] ^= pad.Load64(uint32(i.Imm))
 		case VM_IROR_R:
 			// rotate right is expressed as a rotate left by a negative count
 			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(f.R[i.Src]&63))
 		case VM_IROR_I:
 			//todo: can merge into VM_IROL_I
 			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(i.Imm&63))
 		case VM_IROL_R:
 			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(f.R[i.Src]&63))
 		case VM_IROL_I:
 			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(i.Imm&63))
 		case VM_ISWAP_R:
 			f.R[i.Dst], f.R[i.Src] = f.R[i.Src], f.R[i.Dst]
 		case VM_FSWAP_RF:
 			f.F[i.Dst][HIGH], f.F[i.Dst][LOW] = f.F[i.Dst][LOW], f.F[i.Dst][HIGH]
 		case VM_FSWAP_RE:
 			f.E[i.Dst][HIGH], f.E[i.Dst][LOW] = f.E[i.Dst][LOW], f.E[i.Dst][HIGH]
 		case VM_FADD_R:
 			f.F[i.Dst][LOW] += f.A[i.Src][LOW]
 			f.F[i.Dst][HIGH] += f.A[i.Src][HIGH]
 		case VM_FADD_M:
 			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
 			f.F[i.Dst][LOW] += lo
 			f.F[i.Dst][HIGH] += hi
 		case VM_FSUB_R:
 			f.F[i.Dst][LOW] -= f.A[i.Src][LOW]
 			f.F[i.Dst][HIGH] -= f.A[i.Src][HIGH]
 		case VM_FSUB_M:
 			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
 			f.F[i.Dst][LOW] -= lo
 			f.F[i.Dst][HIGH] -= hi
 		case VM_FSCAL_R:
 			// pure bit manipulation, so it does not depend on the rounding mode
 			f.F[i.Dst][LOW] = softfloat.ScaleNegate(f.F[i.Dst][LOW])
 			f.F[i.Dst][HIGH] = softfloat.ScaleNegate(f.F[i.Dst][HIGH])
 		case VM_FMUL_R:
 			f.E[i.Dst][LOW] *= f.A[i.Src][LOW]
 			f.E[i.Dst][HIGH] *= f.A[i.Src][HIGH]
 		case VM_FDIV_M:
 			// the loaded divisor is masked with eMask before dividing
 			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
 			f.E[i.Dst][LOW] /= softfloat.MaskRegisterExponentMantissa(lo, eMask[LOW])
 			f.E[i.Dst][HIGH] /= softfloat.MaskRegisterExponentMantissa(hi, eMask[HIGH])
 		case VM_FSQRT_R:
 			f.E[i.Dst][LOW] = math.Sqrt(f.E[i.Dst][LOW])
 			f.E[i.Dst][HIGH] = math.Sqrt(f.E[i.Dst][HIGH])
 		case VM_CBRANCH:
 			f.R[i.Src] += i.Imm
 			if (f.R[i.Src] & uint64(i.MemMask)) == 0 {
 				// the loop's pc++ still runs, so execution resumes at jumpTarget()+1
 				pc = i.jumpTarget()
 			}
 		case VM_CFROUND:
 			// the two lowest bits of the rotated source register select the rounding mode
 			tmp := (bits.RotateLeft64(f.R[i.Src], 0-int(i.Imm))) % 4 // rotate right
 			f.SetRoundingMode(softfloat.RoundingMode(tmp))
 		case VM_ISTORE:
 			pad.Store64(i.getScratchpadAddress(f.R[i.Dst]), f.R[i.Src])
 		}
 	}
 }
 // ByteCodeInstructionOp identifies the operation performed by a ByteCodeInstruction.
 type ByteCodeInstructionOp int
 // Opcodes dispatched by ByteCode.Execute. VM_NOP is the zero value; the
 // remaining values are assigned sequentially by iota.
 const (
 	VM_NOP = ByteCodeInstructionOp(iota)
 	VM_IADD_RS
 	VM_IADD_M
 	VM_IADD_MZ
 	VM_ISUB_R
 	VM_ISUB_I
 	VM_ISUB_M
 	VM_ISUB_MZ
 	VM_IMUL_R
 	VM_IMUL_I
 	VM_IMUL_M
 	VM_IMUL_MZ
 	VM_IMULH_R
 	VM_IMULH_M
 	VM_IMULH_MZ
 	VM_ISMULH_R
 	VM_ISMULH_M
 	VM_ISMULH_MZ
 	VM_INEG_R
 	VM_IXOR_R
 	VM_IXOR_I
 	VM_IXOR_M
 	VM_IXOR_MZ
 	VM_IROR_R
 	VM_IROR_I
 	VM_IROL_R
 	VM_IROL_I
 	VM_ISWAP_R
 	VM_FSWAP_RF
 	VM_FSWAP_RE
 	VM_FADD_R
 	VM_FADD_M
 	VM_FSUB_R
 	VM_FSUB_M
 	VM_FSCAL_R
 	VM_FMUL_R
 	VM_FDIV_M
 	VM_FSQRT_R
 	VM_CBRANCH
 	VM_CFROUND
 	VM_ISTORE
 )
--- a/vm_instruction.go
+++ b/vm_instruction.go
@ -37,8 +37,8 @@ import "encoding/binary"
 //reference https://github.com/tevador/RandomX/blob/master/doc/specs.md#51-instruction-encoding
-// since go does not have union, use byte array
+// VM_Instruction is one raw encoded instruction; since Go has no unions, the fields are decoded from a byte array.
-type VM_Instruction []byte // it is hardcode 8 bytes
+type VM_Instruction [8]byte // it is hardcode 8 bytes
 func (ins VM_Instruction) IMM() uint32 {
 	return binary.LittleEndian.Uint32(ins[4:])
@ -56,9 +56,9 @@ func (ins VM_Instruction) Opcode() byte {
 	return ins[0]
 }
-// CompileToBytecode this will interpret single vm instruction
+// CompileProgramToByteCode translates each encoded VM instruction in prog into its executable ByteCode form.
 // reference https://github.com/tevador/RandomX/blob/master/doc/specs.md#52-integer-instructions
-func (vm *VM) CompileToBytecode() {
+func CompileProgramToByteCode(prog []byte) (bc ByteCode) {
 	var registerUsage [RegistersCount]int
 	for i := range registerUsage {
@ -66,8 +66,8 @@ func (vm *VM) CompileToBytecode() {
 	}
 	for i := 0; i < RANDOMX_PROGRAM_SIZE; i++ {
-		instr := VM_Instruction(vm.Prog[i*8:])
+		instr := VM_Instruction(prog[i*8:])
-		ibc := &vm.ByteCode[i]
+		ibc := &bc[i]
 		opcode := instr.Opcode()
 		dst := instr.Dst() % RegistersCount // bit shift optimization
@ -317,7 +317,7 @@ func (vm *VM) CompileToBytecode() {
 			//conditionmask := CONDITIONMASK << shift
 			ibc.Imm = signExtend2sCompl(instr.IMM()) | (uint64(1) << shift)
 			if CONDITIONOFFSET > 0 || shift > 0 {
-				ibc.Imm &= (^(uint64(1) << (shift - 1)))
+				ibc.Imm &= ^(uint64(1) << (shift - 1))
 			}
 			ibc.MemMask = CONDITIONMASK << shift
@ -349,6 +349,8 @@ func (vm *VM) CompileToBytecode() {
 		}
 	}
 	return bc
 }
 type ScratchPad [ScratchpadSize]byte