From 432590f93006e12aa959074df168c815fd518704 Mon Sep 17 00:00:00 2001
From: WeebDataHoarder <57538841+WeebDataHoarder@users.noreply.github.com>
Date: Mon, 15 Apr 2024 03:05:21 +0200
Subject: [PATCH] Move argon2 / float packages to their own folders, cleanup
 vm Run

---
 argon2.go              |  58 ------------
 argon2/argon2.go       |  44 +++++++++
 asm/round.go           |  11 +--
 bytecode.go            | 205 ----------------------------------
 cache.go               |   5 +-
 config.go              |  28 +++---
 register.go            |  23 ++++-
 softfloat/const.go     |  37 ++++++++
 softfloat/funcs.go     |  35 +++++++
 softfloat/softfloat.go |  27 ++++++
 vm.go                  | 176 ++++++++++++++++------------------
 vm_bytecode.go         | 207 +++++++++++++++++++++++++++++++++++++++++
 vm_instruction.go      |  16 ++--
 13 files changed, 480 insertions(+), 392 deletions(-)
 delete mode 100644 argon2.go
 create mode 100644 argon2/argon2.go
 delete mode 100644 bytecode.go
 create mode 100644 softfloat/const.go
 create mode 100644 softfloat/funcs.go
 create mode 100644 softfloat/softfloat.go
 create mode 100644 vm_bytecode.go

diff --git a/argon2.go b/argon2.go
deleted file mode 100644
index 863320a..0000000
--- a/argon2.go
+++ /dev/null
@@ -1,58 +0,0 @@
-package randomx
-
-import "golang.org/x/crypto/blake2b"
-
-import (
-	_ "golang.org/x/crypto/argon2"
-	_ "unsafe"
-)
-
-// see reference configuration.h
-// Cache size in KiB. Must be a power of 2.
-const RANDOMX_ARGON_MEMORY = 262144
-
-// Number of Argon2d iterations for Cache initialization.
-const RANDOMX_ARGON_ITERATIONS = 3
-
-// Number of parallel lanes for Cache initialization.
-const RANDOMX_ARGON_LANES = 1
-
-// Argon2d salt
-const RANDOMX_ARGON_SALT = "RandomX\x03"
-const ArgonSaltSize uint32 = 8 //sizeof("" RANDOMX_ARGON_SALT) - 1
-
-const ArgonBlockSize uint32 = 1024
-
-type argonBlock [128]uint64
-
-const syncPoints = 4
-
-//go:linkname argon2_initHash golang.org/x/crypto/argon2.initHash
-func argon2_initHash(password, salt, key, data []byte, time, memory, threads, keyLen uint32, mode int) [blake2b.Size + 8]byte
-
-//go:linkname argon2_initBlocks golang.org/x/crypto/argon2.initBlocks
-func argon2_initBlocks(h0 *[blake2b.Size + 8]byte, memory, threads uint32) []argonBlock
-
-//go:linkname argon2_processBlocks golang.org/x/crypto/argon2.processBlocks
-func argon2_processBlocks(B []argonBlock, time, memory, threads uint32, mode int)
-
-// argon2_buildBlocks From golang.org/x/crypto/argon2.deriveKey without last deriveKey call
-func argon2_buildBlocks(password, salt, secret, data []byte, time, memory uint32, threads uint8, keyLen uint32) []argonBlock {
-	if time < 1 {
-		panic("argon2: number of rounds too small")
-	}
-	if threads < 1 {
-		panic("argon2: parallelism degree too low")
-	}
-	const mode = 0 /* argon2d */
-	h0 := argon2_initHash(password, salt, secret, data, time, memory, uint32(threads), keyLen, mode)
-
-	memory = memory / (syncPoints * uint32(threads)) * (syncPoints * uint32(threads))
-	if memory < 2*syncPoints*uint32(threads) {
-		memory = 2 * syncPoints * uint32(threads)
-	}
-	B := argon2_initBlocks(&h0, memory, uint32(threads))
-	argon2_processBlocks(B, time, memory, uint32(threads), mode)
-
-	return B
-}
diff --git a/argon2/argon2.go b/argon2/argon2.go
new file mode 100644
index 0000000..bd962e4
--- /dev/null
+++ b/argon2/argon2.go
@@ -0,0 +1,44 @@
+package argon2
+
+import "golang.org/x/crypto/blake2b"
+
+import (
+	_ "golang.org/x/crypto/argon2"
+	_ "unsafe"
+)
+
+const BlockSize uint32 = 1024
+
+type Block [BlockSize / 8]uint64
+
+const syncPoints = 4
+
+//go:linkname initHash golang.org/x/crypto/argon2.initHash
+func initHash(password, salt, key, data []byte, time, memory, threads, keyLen uint32, mode int) [blake2b.Size + 8]byte
+
+//go:linkname initBlocks golang.org/x/crypto/argon2.initBlocks
+func initBlocks(h0 *[blake2b.Size + 8]byte, memory, threads uint32) []Block
+
+//go:linkname processBlocks golang.org/x/crypto/argon2.processBlocks
+func processBlocks(B []Block, time, memory, threads uint32, mode int)
+
+// BuildBlocks is golang.org/x/crypto/argon2.deriveKey without the final deriveKey call
+func BuildBlocks(password, salt, secret, data []byte, time, memory uint32, threads uint8, keyLen uint32) []Block {
+	if time < 1 {
+		panic("argon2: number of rounds too small")
+	}
+	if threads < 1 {
+		panic("argon2: parallelism degree too low")
+	}
+	const mode = 0 /* argon2d */
+	h0 := initHash(password, salt, secret, data, time, memory, uint32(threads), keyLen, mode)
+
+	memory = memory / (syncPoints * uint32(threads)) * (syncPoints * uint32(threads))
+	if memory < 2*syncPoints*uint32(threads) {
+		memory = 2 * syncPoints * uint32(threads)
+	}
+	B := initBlocks(&h0, memory, uint32(threads))
+	processBlocks(B, time, memory, uint32(threads), mode)
+
+	return B
+}
diff --git a/asm/round.go b/asm/round.go
index 659fc76..cc9acbc 100644
--- a/asm/round.go
+++ b/asm/round.go
@@ -1,14 +1,7 @@
 package asm
 
-type RoundingMode uint8
+import "git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
 
-const (
-	RoundingModeToNearest = RoundingMode(iota)
-	RoundingModeToNegative
-	RoundingModeToPositive
-	RoundingModeToZero
-)
-
-func SetRoundingMode(mode RoundingMode) {
+func SetRoundingMode(mode softfloat.RoundingMode) {
 	setRoundingMode(uint8(mode))
 }
diff --git a/bytecode.go b/bytecode.go
deleted file mode 100644
index 4878e07..0000000
--- a/bytecode.go
+++ /dev/null
@@ -1,205 +0,0 @@
-package randomx
-
-import (
-	"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
-	"math"
-	"math/bits"
-)
-
-type ByteCodeInstruction struct {
-	Dst, Src byte
-	ImmB     uint8
-	Opcode   ByteCodeInstructionOp
-	MemMask  uint32
-	Imm      uint64
-	/*
-		union {
-			int_reg_t* idst;
-			rx_vec_f128* fdst;
-		};
-		union {
-			int_reg_t* isrc;
-			rx_vec_f128* fsrc;
-		};
-		union {
-			uint64_t imm;
-			int64_t simm;
-		};
-		InstructionType type;
-		union {
-			int16_t target;
-			uint16_t shift;
-		};
-		uint32_t memMask;
-	*/
-
-}
-
-func (i ByteCodeInstruction) jumpTarget() int {
-	return int(int16((uint16(i.ImmB) << 8) | uint16(i.Dst)))
-}
-
-func (i ByteCodeInstruction) getScratchpadAddress(ptr uint64) uint32 {
-	return uint32(ptr+i.Imm) & i.MemMask
-}
-
-func (i ByteCodeInstruction) getScratchpadZeroAddress() uint32 {
-	return uint32(i.Imm) & i.MemMask
-}
-
-type ByteCode [RANDOMX_PROGRAM_SIZE]ByteCodeInstruction
-
-func (c *ByteCode) Execute(f RegisterFile, pad *ScratchPad, eMask [2]uint64) RegisterFile {
-	for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
-		i := &c[pc]
-		switch i.Opcode {
-		case VM_IADD_RS:
-			f.r[i.Dst] += (f.r[i.Src] << i.ImmB) + i.Imm
-		case VM_IADD_M:
-			f.r[i.Dst] += pad.Load64(i.getScratchpadAddress(f.r[i.Src]))
-		case VM_IADD_MZ:
-			f.r[i.Dst] += pad.Load64(uint32(i.Imm))
-		case VM_ISUB_R:
-			f.r[i.Dst] -= f.r[i.Src]
-		case VM_ISUB_I:
-			f.r[i.Dst] -= i.Imm
-		case VM_ISUB_M:
-			f.r[i.Dst] -= pad.Load64(i.getScratchpadAddress(f.r[i.Src]))
-		case VM_ISUB_MZ:
-			f.r[i.Dst] -= pad.Load64(uint32(i.Imm))
-		case VM_IMUL_R:
-			f.r[i.Dst] *= f.r[i.Src]
-		case VM_IMUL_I:
-			// also handles imul_rcp
-			f.r[i.Dst] *= i.Imm
-		case VM_IMUL_M:
-			f.r[i.Dst] *= pad.Load64(i.getScratchpadAddress(f.r[i.Src]))
-		case VM_IMUL_MZ:
-			f.r[i.Dst] *= pad.Load64(uint32(i.Imm))
-		case VM_IMULH_R:
-			f.r[i.Dst], _ = bits.Mul64(f.r[i.Dst], f.r[i.Src])
-		case VM_IMULH_M:
-			f.r[i.Dst], _ = bits.Mul64(f.r[i.Dst], pad.Load64(i.getScratchpadAddress(f.r[i.Src])))
-		case VM_IMULH_MZ:
-			f.r[i.Dst], _ = bits.Mul64(f.r[i.Dst], pad.Load64(uint32(i.Imm)))
-		case VM_ISMULH_R:
-			f.r[i.Dst] = smulh(int64(f.r[i.Dst]), int64(f.r[i.Src]))
-		case VM_ISMULH_M:
-			f.r[i.Dst] = smulh(int64(f.r[i.Dst]), int64(pad.Load64(i.getScratchpadAddress(f.r[i.Src]))))
-		case VM_ISMULH_MZ:
-			f.r[i.Dst] = smulh(int64(f.r[i.Dst]), int64(pad.Load64(uint32(i.Imm))))
-		case VM_INEG_R:
-			//f.r[i.Dst] = (^(f.r[i.Dst])) + 1 // 2's complement negative
-			f.r[i.Dst] = -f.r[i.Dst]
-		case VM_IXOR_R:
-			f.r[i.Dst] ^= f.r[i.Src]
-		case VM_IXOR_I:
-			f.r[i.Dst] ^= i.Imm
-		case VM_IXOR_M:
-			f.r[i.Dst] ^= pad.Load64(i.getScratchpadAddress(f.r[i.Src]))
-		case VM_IXOR_MZ:
-			f.r[i.Dst] ^= pad.Load64(uint32(i.Imm))
-		case VM_IROR_R:
-			f.r[i.Dst] = bits.RotateLeft64(f.r[i.Dst], 0-int(f.r[i.Src]&63))
-		case VM_IROR_I:
-			//todo: can merge into VM_IROL_I
-			f.r[i.Dst] = bits.RotateLeft64(f.r[i.Dst], 0-int(i.Imm&63))
-		case VM_IROL_R:
-			f.r[i.Dst] = bits.RotateLeft64(f.r[i.Dst], int(f.r[i.Src]&63))
-		case VM_IROL_I:
-			f.r[i.Dst] = bits.RotateLeft64(f.r[i.Dst], int(i.Imm&63))
-		case VM_ISWAP_R:
-			f.r[i.Dst], f.r[i.Src] = f.r[i.Src], f.r[i.Dst]
-		case VM_FSWAP_RF:
-			f.f[i.Dst][HIGH], f.f[i.Dst][LOW] = f.f[i.Dst][LOW], f.f[i.Dst][HIGH]
-		case VM_FSWAP_RE:
-			f.e[i.Dst][HIGH], f.e[i.Dst][LOW] = f.e[i.Dst][LOW], f.e[i.Dst][HIGH]
-		case VM_FADD_R:
-			f.f[i.Dst][LOW] += f.a[i.Src][LOW]
-			f.f[i.Dst][HIGH] += f.a[i.Src][HIGH]
-		case VM_FADD_M:
-			lo, hi := pad.Load32F(i.getScratchpadAddress(f.r[i.Src]))
-			f.f[i.Dst][LOW] += lo
-			f.f[i.Dst][HIGH] += hi
-		case VM_FSUB_R:
-			f.f[i.Dst][LOW] -= f.a[i.Src][LOW]
-			f.f[i.Dst][HIGH] -= f.a[i.Src][HIGH]
-		case VM_FSUB_M:
-			lo, hi := pad.Load32F(i.getScratchpadAddress(f.r[i.Src]))
-			f.f[i.Dst][LOW] -= lo
-			f.f[i.Dst][HIGH] -= hi
-		case VM_FSCAL_R:
-			// no dependent on rounding modes
-			f.f[i.Dst][LOW] = math.Float64frombits(math.Float64bits(f.f[i.Dst][LOW]) ^ 0x80F0000000000000)
-			f.f[i.Dst][HIGH] = math.Float64frombits(math.Float64bits(f.f[i.Dst][HIGH]) ^ 0x80F0000000000000)
-		case VM_FMUL_R:
-			f.e[i.Dst][LOW] *= f.a[i.Src][LOW]
-			f.e[i.Dst][HIGH] *= f.a[i.Src][HIGH]
-		case VM_FDIV_M:
-			lo, hi := pad.Load32F(i.getScratchpadAddress(f.r[i.Src]))
-			f.e[i.Dst][LOW] /= MaskRegisterExponentMantissa(lo, eMask[LOW])
-			f.e[i.Dst][HIGH] /= MaskRegisterExponentMantissa(hi, eMask[HIGH])
-		case VM_FSQRT_R:
-			f.e[i.Dst][LOW] = math.Sqrt(f.e[i.Dst][LOW])
-			f.e[i.Dst][HIGH] = math.Sqrt(f.e[i.Dst][HIGH])
-		case VM_CBRANCH:
-			f.r[i.Src] += i.Imm
-			if (f.r[i.Src] & uint64(i.MemMask)) == 0 {
-				pc = i.jumpTarget()
-			}
-		case VM_CFROUND:
-			tmp := (bits.RotateLeft64(f.r[i.Src], 0-int(i.Imm))) % 4 // rotate right
-			asm.SetRoundingMode(asm.RoundingMode(tmp))
-		case VM_ISTORE:
-			pad.Store64(i.getScratchpadAddress(f.r[i.Dst]), f.r[i.Src])
-		case VM_NOP: // we do nothing
-		}
-	}
-	return f
-}
-
-type ByteCodeInstructionOp int
-
-const (
-	VM_NOP = ByteCodeInstructionOp(iota)
-	VM_IADD_RS
-	VM_IADD_M
-	VM_IADD_MZ
-	VM_ISUB_R
-	VM_ISUB_I
-	VM_ISUB_M
-	VM_ISUB_MZ
-	VM_IMUL_R
-	VM_IMUL_I
-	VM_IMUL_M
-	VM_IMUL_MZ
-	VM_IMULH_R
-	VM_IMULH_M
-	VM_IMULH_MZ
-	VM_ISMULH_R
-	VM_ISMULH_M
-	VM_ISMULH_MZ
-	VM_INEG_R
-	VM_IXOR_R
-	VM_IXOR_I
-	VM_IXOR_M
-	VM_IXOR_MZ
-	VM_IROR_R
-	VM_IROR_I
-	VM_IROL_R
-	VM_IROL_I
-	VM_ISWAP_R
-	VM_FSWAP_RF
-	VM_FSWAP_RE
-	VM_FADD_R
-	VM_FADD_M
-	VM_FSUB_R
-	VM_FSUB_M
-	VM_FSCAL_R
-	VM_FMUL_R
-	VM_FDIV_M
-	VM_FSQRT_R
-	VM_CBRANCH
-	VM_CFROUND
-	VM_ISTORE
-)
diff --git a/cache.go b/cache.go
index bdf3eb8..8d88ee2 100644
--- a/cache.go
+++ b/cache.go
@@ -1,6 +1,7 @@
 package randomx
 
 import (
+	"git.gammaspectra.live/P2Pool/go-randomx/v2/argon2"
 	"git.gammaspectra.live/P2Pool/go-randomx/v2/keys"
 	"runtime"
 	"slices"
@@ -66,9 +67,9 @@ func (cache *Randomx_Cache) Init(key []byte) {
 
 	kkey := slices.Clone(key)
 
-	argonBlocks := argon2_buildBlocks(kkey, []byte(RANDOMX_ARGON_SALT), []byte{}, []byte{}, RANDOMX_ARGON_ITERATIONS, RANDOMX_ARGON_MEMORY, RANDOMX_ARGON_LANES, 0)
+	argonBlocks := argon2.BuildBlocks(kkey, []byte(RANDOMX_ARGON_SALT), []byte{}, []byte{}, RANDOMX_ARGON_ITERATIONS, RANDOMX_ARGON_MEMORY, RANDOMX_ARGON_LANES, 0)
 
-	memoryBlocks := unsafe.Slice((*MemoryBlock)(unsafe.Pointer(unsafe.SliceData(argonBlocks))), int(unsafe.Sizeof(argonBlock{}))/int(unsafe.Sizeof(MemoryBlock{}))*len(argonBlocks))
+	memoryBlocks := unsafe.Slice((*MemoryBlock)(unsafe.Pointer(unsafe.SliceData(argonBlocks))), int(unsafe.Sizeof(argon2.Block{}))/int(unsafe.Sizeof(MemoryBlock{}))*len(argonBlocks))
 
 	cache.Blocks = memoryBlocks
 
diff --git a/config.go b/config.go
index f53241b..7fe7c20 100644
--- a/config.go
+++ b/config.go
@@ -29,6 +29,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 package randomx
 
+import "git.gammaspectra.live/P2Pool/go-randomx/v2/argon2"
+
+// see reference configuration.h
+// Cache size in KiB. Must be a power of 2.
+const RANDOMX_ARGON_MEMORY = 262144
+
+// Number of Argon2d iterations for Cache initialization.
+const RANDOMX_ARGON_ITERATIONS = 3
+
+// Number of parallel lanes for Cache initialization.
+const RANDOMX_ARGON_LANES = 1
+
+// Argon2d salt
+const RANDOMX_ARGON_SALT = "RandomX\x03"
+const ArgonSaltSize uint32 = 8 //sizeof("" RANDOMX_ARGON_SALT) - 1
+
 // Number of random Cache accesses per Dataset item. Minimum is 2.
 const RANDOMX_CACHE_ACCESSES = 8
 
@@ -74,7 +90,7 @@ const ScratchpadSize uint32 = RANDOMX_SCRATCHPAD_L3
 
 const CacheLineAlignMask = (RANDOMX_DATASET_BASE_SIZE - 1) & (^(CacheLineSize - 1))
 
-const CacheSize uint64 = RANDOMX_ARGON_MEMORY * uint64(ArgonBlockSize)
+const CacheSize uint64 = RANDOMX_ARGON_MEMORY * uint64(argon2.BlockSize)
 
 const ScratchpadL1 = RANDOMX_SCRATCHPAD_L1 / 8
 const ScratchpadL2 = RANDOMX_SCRATCHPAD_L2 / 8
@@ -90,16 +106,6 @@ const CONDITIONOFFSET = RANDOMX_JUMP_OFFSET
 const CONDITIONMASK = (1 << RANDOMX_JUMP_BITS) - 1
 const STOREL3CONDITION = 14
 
-const mantissaSize = 52
-const exponentSize = 11
-const mantissaMask = (uint64(1) << mantissaSize) - 1
-const exponentMask = (uint64(1) << exponentSize) - 1
-const exponentBias = 1023
-const dynamicExponentBits = 4
-const staticExponentBits = 4
-const constExponentBits uint64 = 0x300
-const dynamicMantissaMask = (uint64(1) << (mantissaSize + dynamicExponentBits)) - 1
-
 const RANDOMX_FLAG_DEFAULT = uint64(0)
 const RANDOMX_FLAG_JIT = uint64(1 << iota)
 
diff --git a/register.go b/register.go
index e7ef1b1..fe9f85b 100644
--- a/register.go
+++ b/register.go
@@ -1,5 +1,10 @@
 package randomx
 
+import (
+	"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
+	"git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
+)
+
 const RegistersCount = 8
 const RegistersCountFloat = 4
 
@@ -9,10 +14,20 @@ const HIGH = 1
 type RegisterLine [RegistersCount]uint64
 
 type RegisterFile struct {
-	r RegisterLine
-	f [RegistersCountFloat][2]float64
-	e [RegistersCountFloat][2]float64
-	a [RegistersCountFloat][2]float64
+	R RegisterLine
+	F [RegistersCountFloat][2]float64
+	E [RegistersCountFloat][2]float64
+	A [RegistersCountFloat][2]float64
+
+	FPRC softfloat.RoundingMode
+}
+
+func (f *RegisterFile) SetRoundingMode(mode softfloat.RoundingMode) {
+	if f.FPRC == mode {
+		return
+	}
+	f.FPRC = mode
+	asm.SetRoundingMode(mode)
 }
 
 type MemoryRegisters struct {
diff --git a/softfloat/const.go b/softfloat/const.go
new file mode 100644
index 0000000..fe15bbf
--- /dev/null
+++ b/softfloat/const.go
@@ -0,0 +1,37 @@
+package softfloat
+
+const (
+	mantbits64 uint = 52
+	expbits64  uint = 11
+	bias64          = -1<<(expbits64-1) + 1
+
+	nan64 uint64 = (1<<expbits64-1)<<mantbits64 + 1<<(mantbits64-1) // quiet NaN, 0 payload
+	inf64 uint64 = (1<<expbits64 - 1) << mantbits64
+	neg64 uint64 = 1 << (expbits64 + mantbits64)
+)
+
+// masks and exponent constants for float register manipulation,
+// moved here from config.go
+const (
+	mantissaMask = (uint64(1) << mantbits64) - 1
+	exponentMask = (uint64(1) << expbits64) - 1
+	exponentBias = 1023
+
+	dynamicExponentBits = 4
+	staticExponentBits  = 4
+	constExponentBits   uint64 = 0x300
+	dynamicMantissaMask = (uint64(1) << (mantbits64 + dynamicExponentBits)) - 1
+)
+
+const mask22bit = (uint64(1) << 22) - 1
+
+// RoundingMode values match the hardware FPU rounding control field,
+// moved here from package asm
+type RoundingMode uint8
+
+const (
+	RoundingModeToNearest = RoundingMode(iota)
+	RoundingModeToNegative
+	RoundingModeToPositive
+	RoundingModeToZero
+)
diff --git a/softfloat/funcs.go b/softfloat/funcs.go
new file mode 100644
--- /dev/null
+++ b/softfloat/funcs.go
@@ -0,0 +1,35 @@
+package softfloat
+
+import "math"
+
+func MaskRegisterExponentMantissa(f float64, mode uint64) float64 {
+	return math.Float64frombits((math.Float64bits(f) & dynamicMantissaMask) | mode)
+}
+
+func ScaleNegate(f float64) float64 {
+	return math.Float64frombits(math.Float64bits(f) ^ 0x80F0000000000000)
+}
+
+func SmallPositiveFloatBits(entropy uint64) float64 {
+	exponent := entropy >> 59 //0..31
+	mantissa := entropy & mantissaMask
+	exponent += exponentBias
+	exponent &= exponentMask
+	exponent = exponent << mantbits64
+	return math.Float64frombits(exponent | mantissa)
+}
+
+func StaticExponent(entropy uint64) uint64 {
+	exponent := constExponentBits
+	exponent |= (entropy >> (64 - staticExponentBits)) << dynamicExponentBits
+	exponent <<= mantbits64
+	return exponent
+}
+
+func EMask(entropy uint64) uint64 {
+	return (entropy & mask22bit) | StaticExponent(entropy)
+}
+
+func Xor(a, b float64) float64 {
+	return math.Float64frombits(math.Float64bits(a) ^ math.Float64bits(b))
+}
diff --git a/softfloat/softfloat.go b/softfloat/softfloat.go
new file mode 100644
index 0000000..4424c82
--- /dev/null
+++ b/softfloat/softfloat.go
@@ -0,0 +1,27 @@
+package softfloat
+
+import (
+	_ "runtime"
+	_ "unsafe"
+)
+
+//go:linkname funpack64 runtime.funpack64
+func funpack64(f uint64) (sign, mant uint64, exp int, inf, nan bool)
+
+//go:linkname fpack64 runtime.fpack64
+func fpack64(sign, mant uint64, exp int, trunc uint64) uint64
+
+//go:linkname fadd64 runtime.fadd64
+func fadd64(f, g uint64) uint64
+
+//go:linkname fsub64 runtime.fsub64
+func fsub64(f, g uint64) uint64
+
+//go:linkname fneg64 runtime.fneg64
+func fneg64(f uint64) uint64
+
+//go:linkname fmul64 runtime.fmul64
+func fmul64(f, g uint64) uint64
+
+//go:linkname fdiv64 runtime.fdiv64
+func fdiv64(f, g uint64) uint64
diff --git a/vm.go b/vm.go
index 6a5145c..a51eb3c 100644
--- a/vm.go
+++ b/vm.go
@@ -31,9 +31,10 @@ package randomx
 
 import (
 	"git.gammaspectra.live/P2Pool/go-randomx/v2/aes"
-	"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
+	"git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
 	"math"
 	"runtime"
+	"unsafe"
 )
 import "encoding/binary"
 import "golang.org/x/crypto/blake2b"
@@ -45,17 +46,10 @@ type REG struct {
 
 type VM struct {
 	StateStart [64]byte
-	buffer     [RANDOMX_PROGRAM_SIZE*8 + 16*8]byte // first 128 bytes are entropy below rest are program bytes
-	Prog       []byte
 	ScratchPad ScratchPad
 	ByteCode   ByteCode
 
-	// program configuration see program.hpp
-
-	entropy [16]uint64
-
-	reg RegisterFile // the register file
 	mem           MemoryRegisters
 	config        Config // configuration
 	datasetOffset uint64
 
@@ -66,48 +60,47 @@
 }
 
-func MaskRegisterExponentMantissa(f float64, mode uint64) float64 {
-	return math.Float64frombits((math.Float64bits(f) & dynamicMantissaMask) | mode)
-}
-
 type Config struct {
 	eMask   [2]uint64
 	readReg [4]uint64
 }
 
 // Run calculate hash based on input
-func (vm *VM) Run(inputHash [64]byte) {
+// Warning: underlying calls will run asm.SetRoundingMode directly
+// It is the caller's responsibility to set and restore the mode to softfloat.RoundingModeToNearest between full executions
+// Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread are recommended to prevent other goroutines sharing these changes
+func (vm *VM) Run(inputHash [64]byte, roundingMode softfloat.RoundingMode) (reg RegisterFile) {
 
-	aes.FillAes4Rx4(inputHash, vm.buffer[:])
+	reg.FPRC = roundingMode
 
-	for i := range vm.entropy {
-		vm.entropy[i] = binary.LittleEndian.Uint64(vm.buffer[i*8:])
-	}
+	// buffer: first 128 bytes are entropy, the rest are program bytes
+	var buffer [16*8 + RANDOMX_PROGRAM_SIZE*8]byte
+	aes.FillAes4Rx4(inputHash, buffer[:])
 
-	vm.Prog = vm.buffer[len(vm.entropy)*8:]
+	entropy := (*[16]uint64)(unsafe.Pointer(&buffer))
 
-	clear(vm.reg.r[:])
+	prog := buffer[len(entropy)*8:]
 
 	// do more initialization before we run
 
-	for i := range vm.entropy[:8] {
-		vm.reg.a[i/2][i%2] = math.Float64frombits(getSmallPositiveFloatBits(vm.entropy[i]))
+	for i := range entropy[:8] {
+		reg.A[i/2][i%2] = softfloat.SmallPositiveFloatBits(entropy[i])
 	}
 
-	vm.mem.ma = vm.entropy[8] & CacheLineAlignMask
-	vm.mem.mx = vm.entropy[10]
+	vm.mem.ma = entropy[8] & CacheLineAlignMask
+	vm.mem.mx = entropy[10]
 
-	addressRegisters := vm.entropy[12]
+	addressRegisters := entropy[12]
 	for i := range vm.config.readReg {
 		vm.config.readReg[i] = uint64(i*2) + (addressRegisters & 1)
 		addressRegisters >>= 1
 	}
 
-	vm.datasetOffset = (vm.entropy[13] % (DATASETEXTRAITEMS + 1)) * CacheLineSize
-	vm.config.eMask[LOW] = getFloatMask(vm.entropy[14])
-	vm.config.eMask[HIGH] = getFloatMask(vm.entropy[15])
+	vm.datasetOffset = (entropy[13] % (DATASETEXTRAITEMS + 1)) * CacheLineSize
+	vm.config.eMask[LOW] = softfloat.EMask(entropy[14])
+	vm.config.eMask[HIGH] = softfloat.EMask(entropy[15])
 
-	vm.CompileToBytecode()
+	vm.ByteCode = CompileProgramToByteCode(prog)
 
 	spAddr0 := vm.mem.mx
 	spAddr1 := vm.mem.ma
@@ -115,50 +108,52 @@
 	var rlCache RegisterLine
 
 	for ic := 0; ic < RANDOMX_PROGRAM_ITERATIONS; ic++ {
-		spMix := vm.reg.r[vm.config.readReg[0]] ^ vm.reg.r[vm.config.readReg[1]]
+		spMix := reg.R[vm.config.readReg[0]] ^ reg.R[vm.config.readReg[1]]
 
 		spAddr0 ^= spMix
 		spAddr0 &= ScratchpadL3Mask64
 		spAddr1 ^= spMix >> 32
 		spAddr1 &= ScratchpadL3Mask64
 
+		//TODO: optimize these loads!
 		for i := uint64(0); i < RegistersCount; i++ {
-			vm.reg.r[i] ^= vm.ScratchPad.Load64(uint32(spAddr0 + 8*i))
+			reg.R[i] ^= vm.ScratchPad.Load64(uint32(spAddr0 + 8*i))
 		}
 
 		for i := uint64(0); i < RegistersCountFloat; i++ {
-			vm.reg.f[i] = vm.ScratchPad.Load32FA(uint32(spAddr1 + 8*i))
+			reg.F[i] = vm.ScratchPad.Load32FA(uint32(spAddr1 + 8*i))
 		}
 
 		for i := uint64(0); i < RegistersCountFloat; i++ {
-			vm.reg.e[i] = vm.ScratchPad.Load32FA(uint32(spAddr1 + 8*(i+RegistersCountFloat)))
+			reg.E[i] = vm.ScratchPad.Load32FA(uint32(spAddr1 + 8*(i+RegistersCountFloat)))
 
-			vm.reg.e[i][LOW] = MaskRegisterExponentMantissa(vm.reg.e[i][LOW], vm.config.eMask[LOW])
-			vm.reg.e[i][HIGH] = MaskRegisterExponentMantissa(vm.reg.e[i][HIGH], vm.config.eMask[HIGH])
+			reg.E[i][LOW] = softfloat.MaskRegisterExponentMantissa(reg.E[i][LOW], vm.config.eMask[LOW])
+			reg.E[i][HIGH] = softfloat.MaskRegisterExponentMantissa(reg.E[i][HIGH], vm.config.eMask[HIGH])
 		}
 
-		vm.reg = vm.ByteCode.Execute(vm.reg, &vm.ScratchPad, vm.config.eMask)
+		// Run the actual bytecode
+		vm.ByteCode.Execute(&reg, &vm.ScratchPad, vm.config.eMask)
 
-		vm.mem.mx ^= vm.reg.r[vm.config.readReg[2]] ^ vm.reg.r[vm.config.readReg[3]]
+		vm.mem.mx ^= reg.R[vm.config.readReg[2]] ^ reg.R[vm.config.readReg[3]]
 		vm.mem.mx &= CacheLineAlignMask
 
 		vm.Dataset.PrefetchDataset(vm.datasetOffset + vm.mem.mx)
 		// execute diffuser superscalar program to get dataset 64 bytes
-		vm.Dataset.ReadDataset(vm.datasetOffset+vm.mem.ma, &vm.reg.r, &rlCache)
+		vm.Dataset.ReadDataset(vm.datasetOffset+vm.mem.ma, &reg.R, &rlCache)
 
 		// swap the elements
 		vm.mem.mx, vm.mem.ma = vm.mem.ma, vm.mem.mx
 
 		for i := uint64(0); i < RegistersCount; i++ {
-			vm.ScratchPad.Store64(uint32(spAddr1+8*i), vm.reg.r[i])
+			vm.ScratchPad.Store64(uint32(spAddr1+8*i), reg.R[i])
 		}
 
 		for i := uint64(0); i < RegistersCountFloat; i++ {
-			vm.reg.f[i][LOW] = math.Float64frombits(math.Float64bits(vm.reg.f[i][LOW]) ^ math.Float64bits(vm.reg.e[i][LOW]))
-			vm.reg.f[i][HIGH] = math.Float64frombits(math.Float64bits(vm.reg.f[i][HIGH]) ^ math.Float64bits(vm.reg.e[i][HIGH]))
+			reg.F[i][LOW] = softfloat.Xor(reg.F[i][LOW], reg.E[i][LOW])
+			reg.F[i][HIGH] = softfloat.Xor(reg.F[i][HIGH], reg.E[i][HIGH])
 
-			vm.ScratchPad.Store64(uint32(spAddr0+16*i), math.Float64bits(vm.reg.f[i][LOW]))
-			vm.ScratchPad.Store64(uint32(spAddr0+16*i+8), math.Float64bits(vm.reg.f[i][HIGH]))
+			vm.ScratchPad.Store64(uint32(spAddr0+16*i), math.Float64bits(reg.F[i][LOW]))
+			vm.ScratchPad.Store64(uint32(spAddr0+16*i+8), math.Float64bits(reg.F[i][HIGH]))
 		}
 
 		spAddr0 = 0
@@ -166,56 +161,52 @@
 	}
 
+	return reg
+
 }
 
 func (vm *VM) InitScratchpad(seed *[64]byte) {
 	vm.ScratchPad.Init(seed)
 }
 
-func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
+func (vm *VM) RunLoops(tempHash [64]byte) RegisterFile {
+	var buf [8]byte
+
+	hash512, _ := blake2b.New512(nil)
 
 	// Lock thread due to rounding mode flags
 	runtime.LockOSThread()
 	defer runtime.UnlockOSThread()
 
-	//restore rounding mode to golang expected one
-	defer asm.SetRoundingMode(asm.RoundingModeToNearest)
-
-	// reset rounding mode if new hash being calculated
-	asm.SetRoundingMode(asm.RoundingModeToNearest)
-
-	tempHash := blake2b.Sum512(input)
-
-	vm.InitScratchpad(&tempHash)
-
-	hash512, _ := blake2b.New512(nil)
+	roundingMode := softfloat.RoundingModeToNearest
 
 	for chain := 0; chain < RANDOMX_PROGRAM_COUNT-1; chain++ {
-		vm.Run(tempHash)
+		reg := vm.Run(tempHash, roundingMode)
+		roundingMode = reg.FPRC
 
 		hash512.Reset()
-		for i := range vm.reg.r {
-			binary.LittleEndian.PutUint64(buf[:], vm.reg.r[i])
+		for i := range reg.R {
+			binary.LittleEndian.PutUint64(buf[:], reg.R[i])
 			hash512.Write(buf[:])
 		}
-		for i := range vm.reg.f {
-			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][LOW]))
+		for i := range reg.F {
+			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.F[i][LOW]))
 			hash512.Write(buf[:])
-			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][HIGH]))
+			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.F[i][HIGH]))
 			hash512.Write(buf[:])
 		}
-		for i := range vm.reg.e {
-			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][LOW]))
+		for i := range reg.E {
+			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.E[i][LOW]))
 			hash512.Write(buf[:])
-			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][HIGH]))
+			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.E[i][HIGH]))
 			hash512.Write(buf[:])
 		}
-		for i := range vm.reg.a {
-			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.a[i][LOW]))
+		for i := range reg.A {
+			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.A[i][LOW]))
 			hash512.Write(buf[:])
-			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.a[i][HIGH]))
+			binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.A[i][HIGH]))
 			hash512.Write(buf[:])
 		}
 
@@ -223,7 +214,22 @@ func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
 	}
 
 	// final loop executes here
-	vm.Run(tempHash)
+	reg := vm.Run(tempHash, roundingMode)
+	roundingMode = reg.FPRC
+
+	reg.SetRoundingMode(softfloat.RoundingModeToNearest)
+
+	return reg
+}
+
+func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
+	var buf [8]byte
+
+	tempHash := blake2b.Sum512(input)
+
+	vm.InitScratchpad(&tempHash)
+
+	reg := vm.RunLoops(tempHash)
 
 	// now hash the scratch pad and place into register a
 	aes.HashAes1Rx4(vm.ScratchPad[:], &tempHash)
@@ -232,22 +238,22 @@
 	hash256.Reset()
 
-	for i := range vm.reg.r {
-		binary.LittleEndian.PutUint64(buf[:], vm.reg.r[i])
+	for i := range reg.R {
+		binary.LittleEndian.PutUint64(buf[:], reg.R[i])
 		hash256.Write(buf[:])
 	}
 
-	for i := range vm.reg.f {
-		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][LOW]))
+	for i := range reg.F {
+		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.F[i][LOW]))
 		hash256.Write(buf[:])
-		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][HIGH]))
+		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.F[i][HIGH]))
 		hash256.Write(buf[:])
 	}
 
-	for i := range vm.reg.e {
-		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][LOW]))
+	for i := range reg.E {
+		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.E[i][LOW]))
 		hash256.Write(buf[:])
-		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][HIGH]))
+		binary.LittleEndian.PutUint64(buf[:], math.Float64bits(reg.E[i][HIGH]))
 		hash256.Write(buf[:])
 	}
 
@@ -256,25 +262,3 @@
 	hash256.Sum(output[:0])
 
 }
-
-const mask22bit = (uint64(1) << 22) - 1
-
-func getSmallPositiveFloatBits(entropy uint64) uint64 {
-	exponent := entropy >> 59 //0..31
-	mantissa := entropy & mantissaMask
-	exponent += exponentBias
-	exponent &= exponentMask
-	exponent = exponent << mantissaSize
-	return exponent | mantissa
-}
-
-func getStaticExponent(entropy uint64) uint64 {
-	exponent := constExponentBits
-	exponent |= (entropy >> (64 - staticExponentBits)) << dynamicExponentBits
-	exponent <<= mantissaSize
-	return exponent
-}
-
-func getFloatMask(entropy uint64) uint64 {
-	return (entropy & mask22bit) | getStaticExponent(entropy)
-}
diff --git a/vm_bytecode.go b/vm_bytecode.go
new file mode 100644
index 0000000..9c582fd
--- /dev/null
+++ b/vm_bytecode.go
@@ -0,0 +1,207 @@
+package randomx
+
+import (
+	"git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
+	"math"
+	"math/bits"
+)
+
+type ByteCodeInstruction struct {
+	Dst, Src byte
+	ImmB     uint8
+	Opcode   ByteCodeInstructionOp
+	MemMask  uint32
+	Imm      uint64
+	/*
+		union {
+			int_reg_t* idst;
+			rx_vec_f128* fdst;
+		};
+		union {
+			int_reg_t* isrc;
+			rx_vec_f128* fsrc;
+		};
+		union {
+			uint64_t imm;
+			int64_t simm;
+		};
+		InstructionType type;
+		union {
+			int16_t target;
+			uint16_t shift;
+		};
+		uint32_t memMask;
+	*/
+
+}
+
+func (i ByteCodeInstruction) jumpTarget() int {
+	return int(int16((uint16(i.ImmB) << 8) | uint16(i.Dst)))
+}
+
+func (i ByteCodeInstruction) getScratchpadAddress(ptr uint64) uint32 {
+	return uint32(ptr+i.Imm) & i.MemMask
+}
+
+func (i ByteCodeInstruction) getScratchpadZeroAddress() uint32 {
+	return uint32(i.Imm) & i.MemMask
+}
+
+type ByteCode [RANDOMX_PROGRAM_SIZE]ByteCodeInstruction
+
+// Execute runs a RandomX program with the given register file and scratchpad
+// Warning: this will call asm.SetRoundingMode directly
+// It is the caller's responsibility to set and restore the mode to softfloat.RoundingModeToNearest between full executions
+// Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread are recommended to prevent other goroutines sharing these changes
+func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
+	for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
+		i := &c[pc]
+		switch i.Opcode {
+		case VM_NOP: // we do nothing
+		case VM_IADD_RS:
+			f.R[i.Dst] += (f.R[i.Src] << i.ImmB) + i.Imm
+		case VM_IADD_M:
+			f.R[i.Dst] += pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
+		case VM_IADD_MZ:
+			f.R[i.Dst] += pad.Load64(uint32(i.Imm))
+		case VM_ISUB_R:
+			f.R[i.Dst] -= f.R[i.Src]
+		case VM_ISUB_I:
+			f.R[i.Dst] -= i.Imm
+		case VM_ISUB_M:
+			f.R[i.Dst] -= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
+		case VM_ISUB_MZ:
+			f.R[i.Dst] -= pad.Load64(uint32(i.Imm))
+		case VM_IMUL_R:
+			f.R[i.Dst] *= f.R[i.Src]
+		case VM_IMUL_I:
+			// also handles imul_rcp
+			f.R[i.Dst] *= i.Imm
+		case VM_IMUL_M:
+			f.R[i.Dst] *= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
+		case VM_IMUL_MZ:
+			f.R[i.Dst] *= pad.Load64(uint32(i.Imm))
+		case VM_IMULH_R:
+			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], f.R[i.Src])
+		case VM_IMULH_M:
+			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(i.getScratchpadAddress(f.R[i.Src])))
+		case VM_IMULH_MZ:
+			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(uint32(i.Imm)))
+		case VM_ISMULH_R:
+			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(f.R[i.Src]))
+		case VM_ISMULH_M:
+			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(i.getScratchpadAddress(f.R[i.Src]))))
+		case VM_ISMULH_MZ:
+			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(uint32(i.Imm))))
+		case VM_INEG_R:
+			f.R[i.Dst] = -f.R[i.Dst]
+		case VM_IXOR_R:
+			f.R[i.Dst] ^= f.R[i.Src]
+		case VM_IXOR_I:
+			f.R[i.Dst] ^= i.Imm
+		case VM_IXOR_M:
+			f.R[i.Dst] ^= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
+		case VM_IXOR_MZ:
+			f.R[i.Dst] ^= pad.Load64(uint32(i.Imm))
+		case VM_IROR_R:
+			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(f.R[i.Src]&63))
+		case VM_IROR_I:
+			//todo: can merge into VM_IROL_I
+			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(i.Imm&63))
+		case VM_IROL_R:
+			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(f.R[i.Src]&63))
+		case VM_IROL_I:
+			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(i.Imm&63))
+		case VM_ISWAP_R:
+			f.R[i.Dst], f.R[i.Src] = f.R[i.Src], f.R[i.Dst]
+		case VM_FSWAP_RF:
+			f.F[i.Dst][HIGH], f.F[i.Dst][LOW] = f.F[i.Dst][LOW], f.F[i.Dst][HIGH]
+		case VM_FSWAP_RE:
+			f.E[i.Dst][HIGH], f.E[i.Dst][LOW] = f.E[i.Dst][LOW], f.E[i.Dst][HIGH]
+		case VM_FADD_R:
+			f.F[i.Dst][LOW] += f.A[i.Src][LOW]
+			f.F[i.Dst][HIGH] += f.A[i.Src][HIGH]
+		case VM_FADD_M:
+			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
+			f.F[i.Dst][LOW] += lo
+			f.F[i.Dst][HIGH] += hi
+		case VM_FSUB_R:
+			f.F[i.Dst][LOW] -= f.A[i.Src][LOW]
+			f.F[i.Dst][HIGH] -= f.A[i.Src][HIGH]
+		case VM_FSUB_M:
+			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
+			f.F[i.Dst][LOW] -= lo
+			f.F[i.Dst][HIGH] -= hi
+		case VM_FSCAL_R:
+			// not dependent on rounding modes
+			f.F[i.Dst][LOW] = softfloat.ScaleNegate(f.F[i.Dst][LOW])
+			f.F[i.Dst][HIGH] = softfloat.ScaleNegate(f.F[i.Dst][HIGH])
+		case VM_FMUL_R:
+			f.E[i.Dst][LOW] *= f.A[i.Src][LOW]
+			f.E[i.Dst][HIGH] *= f.A[i.Src][HIGH]
+		case VM_FDIV_M:
+			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
+			f.E[i.Dst][LOW] /= softfloat.MaskRegisterExponentMantissa(lo, eMask[LOW])
+			f.E[i.Dst][HIGH] /= softfloat.MaskRegisterExponentMantissa(hi, eMask[HIGH])
+		case VM_FSQRT_R:
+			f.E[i.Dst][LOW] = math.Sqrt(f.E[i.Dst][LOW])
+			f.E[i.Dst][HIGH] = math.Sqrt(f.E[i.Dst][HIGH])
+		case VM_CBRANCH:
+			f.R[i.Src] += i.Imm
+			if (f.R[i.Src] & uint64(i.MemMask)) == 0 {
+				pc = i.jumpTarget()
+			}
+		case VM_CFROUND:
+			tmp := (bits.RotateLeft64(f.R[i.Src], 0-int(i.Imm))) % 4 // rotate right
+			f.SetRoundingMode(softfloat.RoundingMode(tmp))
+		case VM_ISTORE:
+			pad.Store64(i.getScratchpadAddress(f.R[i.Dst]), f.R[i.Src])
+		}
+	}
+}
+
+type ByteCodeInstructionOp int
+
+const (
+	VM_NOP = ByteCodeInstructionOp(iota)
+	VM_IADD_RS
+	VM_IADD_M
+	VM_IADD_MZ
+	VM_ISUB_R
+	VM_ISUB_I
+	VM_ISUB_M
+	VM_ISUB_MZ
+	VM_IMUL_R
+	VM_IMUL_I
+	VM_IMUL_M
+	VM_IMUL_MZ
+	VM_IMULH_R
+	VM_IMULH_M
+	VM_IMULH_MZ
+	VM_ISMULH_R
+	VM_ISMULH_M
+	VM_ISMULH_MZ
+	VM_INEG_R
+	VM_IXOR_R
+	VM_IXOR_I
+	VM_IXOR_M
+	VM_IXOR_MZ
+	VM_IROR_R
+	VM_IROR_I
+	VM_IROL_R
+	VM_IROL_I
+	VM_ISWAP_R
+	VM_FSWAP_RF
+	VM_FSWAP_RE
+	VM_FADD_R
+	VM_FADD_M
+	VM_FSUB_R
+	VM_FSUB_M
+	VM_FSCAL_R
+	VM_FMUL_R
+	VM_FDIV_M
+	VM_FSQRT_R
+	VM_CBRANCH
+	VM_CFROUND
+	VM_ISTORE
+)
diff --git a/vm_instruction.go b/vm_instruction.go
index 8de191a..f7d1315 100644
--- a/vm_instruction.go
+++ b/vm_instruction.go
@@ -37,8 +37,8 @@ import "encoding/binary"
 
 //reference https://github.com/tevador/RandomX/blob/master/doc/specs.md#51-instruction-encoding
 
-// since go does not have union, use byte array
-type VM_Instruction []byte // it is hardcode 8 bytes
+// VM_Instruction: since Go does not have unions, use a byte array
+type VM_Instruction [8]byte // it is hardcoded to 8 bytes
 
 func (ins VM_Instruction) IMM() uint32 {
 	return binary.LittleEndian.Uint32(ins[4:])
@@ -56,9 +56,9 @@ func (ins VM_Instruction) Opcode() byte {
 	return ins[0]
 }
 
-// CompileToBytecode this will interpret single vm instruction
+// CompileProgramToByteCode interprets the program's VM instructions into executable opcodes
 // reference https://github.com/tevador/RandomX/blob/master/doc/specs.md#52-integer-instructions
-func (vm *VM) CompileToBytecode() {
+func CompileProgramToByteCode(prog []byte) (bc ByteCode) {
 
 	var registerUsage [RegistersCount]int
 	for i := range registerUsage {
@@ -66,8 +66,8 @@
 
 	for i := 0; i < RANDOMX_PROGRAM_SIZE; i++ {
-		instr := VM_Instruction(vm.Prog[i*8:])
-		ibc := &vm.ByteCode[i]
+		instr := VM_Instruction(prog[i*8:])
+		ibc := &bc[i]
 		opcode := instr.Opcode()
 
 		dst := instr.Dst() % RegistersCount // bit shift optimization
 
@@ -317,7 +317,7 @@
 			//conditionmask := CONDITIONMASK << shift
 			ibc.Imm = signExtend2sCompl(instr.IMM()) | (uint64(1) << shift)
 			if CONDITIONOFFSET > 0 || shift > 0 {
-				ibc.Imm &= (^(uint64(1) << (shift - 1)))
+				ibc.Imm &= ^(uint64(1) << (shift - 1))
 			}
 			ibc.MemMask = CONDITIONMASK << shift
 
@@ -349,6 +349,8 @@
 		}
 	}
 
+	return bc
+
 }
 
 type ScratchPad [ScratchpadSize]byte
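
--
Usage sketch (not part of the patch): after this change, Run no longer restores
the FPU state itself; CalculateHash (via RunLoops) pins the OS thread and
resets the rounding mode internally, and a caller driving Run directly has to
do the same, per the new doc comments. This sketch assumes an
already-initialized *randomx.VM (cache/dataset setup is outside this patch):

	package main

	import (
		"runtime"

		randomx "git.gammaspectra.live/P2Pool/go-randomx/v2"
		"git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
	)

	func runOnce(vm *randomx.VM, seed [64]byte) randomx.RegisterFile {
		// Run mutates the OS thread's FPU rounding mode, so pin the
		// goroutine to keep the change from leaking to other goroutines.
		runtime.LockOSThread()
		defer runtime.UnlockOSThread()

		vm.InitScratchpad(&seed)
		reg := vm.Run(seed, softfloat.RoundingModeToNearest)

		// Restore the rounding mode Go itself expects before unlocking.
		reg.SetRoundingMode(softfloat.RoundingModeToNearest)
		return reg
	}

For plain hashing, the public entry point already does all of this internally:

	var output [32]byte
	vm.CalculateHash(input, &output)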
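
A similar sketch of the new argon2 package API as cache.Init uses it: the
Argon2d blocks are built once, then reinterpreted in place as the cache's own
block type. MemoryBlock below is a hypothetical stand-in for the repo's real
type, which this patch does not show; only the flat uint64 layout matters:

	package main

	import (
		"unsafe"

		"git.gammaspectra.live/P2Pool/go-randomx/v2/argon2"
	)

	// hypothetical stand-in; any flat uint64 array layout works the same way
	type MemoryBlock [64]uint64

	func buildCacheBlocks(key []byte) []MemoryBlock {
		// RandomX Argon2d parameters from config.go: 262144 KiB memory,
		// 3 iterations, 1 lane, salt "RandomX\x03", no output key.
		blocks := argon2.BuildBlocks(key, []byte("RandomX\x03"), []byte{}, []byte{}, 3, 262144, 1, 0)

		// Reinterpret []argon2.Block as []MemoryBlock without copying,
		// scaling the length by the ratio of the two block sizes.
		return unsafe.Slice((*MemoryBlock)(unsafe.Pointer(unsafe.SliceData(blocks))),
			int(unsafe.Sizeof(argon2.Block{}))/int(unsafe.Sizeof(MemoryBlock{}))*len(blocks))
	}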