Include softfloat64 and allow for purego implementation
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing

This commit is contained in:
DataHoarder 2024-04-17 05:56:05 +02:00
parent 432590f930
commit aab8f99dd4
Signed by: DataHoarder
SSH key fingerprint: SHA256:OLTRf6Fl87G52SiR7sWLGNzlJt4WOX+tfI2yxo0z7xk
27 changed files with 464 additions and 246 deletions

View file

@ -1,7 +1,7 @@
---
kind: pipeline
type: docker
name: from-source-amd64
name: go-amd64-asm-jit
platform:
os: linux
arch: amd64
@ -28,7 +28,61 @@ steps:
---
kind: pipeline
type: docker
name: from-source-386
name: go-amd64-asm
platform:
os: linux
arch: amd64
environment:
GOPROXY: direct
GOARCH: amd64
GOAMD64: v3
GOOS: linux
GOTRACEBACK: 2
GOEXPERIMENT: "cgocheck2,newinliner"
CGO_ENABLED: "0"
workspace:
path: /drone/src
steps:
- name: test
image: golang:1.22-alpine3.19
commands:
- apk update
- apk add --no-cache git
- go test -tags disable_jit -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -v .
---
kind: pipeline
type: docker
name: go-amd64-purego
platform:
os: linux
arch: amd64
environment:
GOPROXY: direct
GOARCH: amd64
GOAMD64: v3
GOOS: linux
GOTRACEBACK: 2
GOEXPERIMENT: "cgocheck2,newinliner"
CGO_ENABLED: "0"
workspace:
path: /drone/src
steps:
- name: test
image: golang:1.22-alpine3.19
commands:
- apk update
- apk add --no-cache git
- go test -tags purego -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -v .
---
kind: pipeline
type: docker
name: go-386-asm
platform:
os: linux
arch: amd64
@ -55,7 +109,34 @@ steps:
---
kind: pipeline
type: docker
name: from-source-arm64
name: go-386-purego
platform:
os: linux
arch: amd64
environment:
GOPROXY: direct
GOARCH: 386
GO386: sse2
GOOS: linux
GOTRACEBACK: 2
GOEXPERIMENT: "cgocheck2,newinliner"
CGO_ENABLED: "0"
workspace:
path: /drone/src
steps:
- name: test
image: golang:1.22-alpine3.19
commands:
- apk update
- apk add --no-cache git
- go test -tags purego -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -v .
---
kind: pipeline
type: docker
name: go-arm64-asm
platform:
os: linux
arch: arm64
@ -78,4 +159,30 @@ steps:
- apk update
- apk add --no-cache git
- go test -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -v .
---
kind: pipeline
type: docker
name: go-arm64-purego
platform:
os: linux
arch: arm64
environment:
GOPROXY: direct
GOARCH: arm64
GOOS: linux
GOTRACEBACK: 2
GOEXPERIMENT: "cgocheck2,newinliner"
CGO_ENABLED: "0"
workspace:
path: /drone/src
steps:
- name: test
image: golang:1.22-alpine3.19
commands:
- apk update
- apk add --no-cache git
- go test -tags purego -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -v .
...

View file

@ -4,21 +4,25 @@ Fork from [git.dero.io/DERO_Foundation/RandomX](https://git.dero.io/DERO_Foundat
Original code failed RandomX testcases and was implemented using big.Float.
This package implements RandomX without CGO, using only Golang code, pure float64 ops and two small assembly sections to implement CFROUND modes.
This package implements RandomX without CGO, using only Golang code, pure float64 ops and two small assembly sections to implement CFROUND modes, with optional soft float implementation.
All test cases pass properly.
Uses minimal Go assembly due to having to set rounding mode natively. Support can be added with supporting rounding mode under _asm_.
Uses minimal Go assembly due to having to set the rounding mode natively. Native hard float support can be added for additional platforms by implementing rounding mode control under _asm_.
JIT is supported on a few platforms but can be hard-disabled via the `disable_jit` build flag, or at runtime.
| Platform | Supported | SuperScalar JIT | Notes |
|:-----------:|:---------:|:---------------:|:----------------:|
| **386** | ✅ | ❌ | |
| **amd64** | ✅ | ✅* | JIT only on Unix |
| **arm** | ❌ | - | |
| **arm64** | ✅ | ❌ | |
| **mips** | ❌ | - | |
| **mips64** | ❌ | - | |
| **riscv64** | ❌ | - | |
| **wasm** | ❌ | - | |
A pure Golang implementation can be used on platforms without hard float support or via the `purego` build flag manually.
| Platform | Supported | Hard Float | SuperScalar JIT | Notes |
|:-----------:|:---------:|:----------:|:---------------:|:----------------:|
| **386** | ✅ | ✅ | ❌ | |
| **amd64** | ✅ | ✅ | ✅* | JIT only on Unix |
| **arm** | ✅* | ❌ | ❌ | |
| **arm64** | ✅ | ✅ | ❌ | |
| **mips** | ✅* | ❌ | ❌ | |
| **mips64** | ✅* | ❌ | ❌ | |
| **riscv64** | ✅* | ❌ | ❌ | |
| **wasm** | ✅* | ❌ | ❌ | |
* these platforms only support software floating point / purego and will not be performant.

View file

@ -1,7 +1,5 @@
package asm
import "git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
func SetRoundingMode(mode softfloat.RoundingMode) {
// SetRoundingMode applies the given RandomX rounding mode to the native FPU
// control register via the platform-specific setRoundingMode implementation.
// The type parameter accepts both uint64- and uint8-based mode values; only
// the low byte is used (RandomX modes are 0-3).
func SetRoundingMode[T ~uint64 | ~uint8](mode T) {
	setRoundingMode(uint8(mode))
}

View file

@ -1,4 +1,4 @@
//go:build 386
//go:build 386 && !purego
package asm

View file

@ -1,3 +1,5 @@
//go:build 386 && !purego
#include "textflag.h"
// stmxcsr reads the MXCSR control and status register.

View file

@ -1,4 +1,4 @@
//go:build amd64
//go:build amd64 && !purego
package asm

View file

@ -1,3 +1,5 @@
//go:build amd64 && !purego
#include "textflag.h"
// stmxcsr reads the MXCSR control and status register.

View file

@ -1,4 +1,4 @@
//go:build arm64
//go:build arm64 && !purego
package asm

View file

@ -1,3 +1,5 @@
//go:build arm64 && !purego
#include "textflag.h"
TEXT ·getFPCR(SB),NOSPLIT,$0-8

View file

@ -1,4 +1,4 @@
//go:build !arm64 && !amd64 && !386
//go:build (!arm64 && !amd64 && !386) || purego
package asm

View file

@ -1,4 +1,4 @@
//go:build !unix || disable_jit
//go:build !unix || disable_jit || purego
package randomx
@ -7,5 +7,5 @@ func (f ProgramFunc) Execute(rl *RegisterLine) {
}
func (f ProgramFunc) Close() error {
return nil
}

View file

@ -1,4 +1,4 @@
//go:build unix && !disable_jit
//go:build unix && !disable_jit && !purego
package randomx

View file

@ -1,7 +1,22 @@
package softfloat
package randomx
import "math"
// IEEE 754 binary64 layout constants and the RandomX register masks derived
// from them.
const (
	mantbits64 uint = 52 // mantissa (fraction) width in bits
	expbits64  uint = 11 // exponent width in bits

	mantissaMask = (uint64(1) << mantbits64) - 1
	exponentMask = (uint64(1) << expbits64) - 1
	exponentBias = 1023

	dynamicExponentBits = 4
	staticExponentBits  = 4

	constExponentBits uint64 = 0x300

	// dynamicMantissaMask keeps the mantissa plus the dynamic exponent bits.
	dynamicMantissaMask = (uint64(1) << (mantbits64 + dynamicExponentBits)) - 1

	mask22bit = (uint64(1) << 22) - 1
)

// MaskRegisterExponentMantissa keeps only the mantissa and dynamic exponent
// bits of f, then ORs in the per-VM exponent mask given by mode. Used when
// loading E-group register values.
func MaskRegisterExponentMantissa(f float64, mode uint64) float64 {
	raw := math.Float64bits(f)
	raw &= dynamicMantissaMask
	raw |= mode
	return math.Float64frombits(raw)
}

2
go.mod
View file

@ -5,3 +5,5 @@ go 1.21
require golang.org/x/crypto v0.22.0
require golang.org/x/sys v0.19.0
require git.gammaspectra.live/P2Pool/softfloat64 v1.0.0

2
go.sum
View file

@ -1,3 +1,5 @@
git.gammaspectra.live/P2Pool/softfloat64 v1.0.0 h1:XqxDpowntpV8gvBzG9bMC8VVzxZJT/YEk7BfwmaCamU=
git.gammaspectra.live/P2Pool/softfloat64 v1.0.0/go.mod h1:ZhnGqXOS6F6aJpiiT38Cvk5eHoBNqjkKfp3w3AcnomA=
golang.org/x/crypto v0.22.0 h1:g1v0xeRhjcugydODzvb3mEM9SQ0HGp9s/nh3COQ/C30=
golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M=
golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o=

View file

@ -1,10 +1,5 @@
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
"git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
)
const RegistersCount = 8
const RegistersCountFloat = 4
@ -19,15 +14,7 @@ type RegisterFile struct {
E [RegistersCountFloat][2]float64
A [RegistersCountFloat][2]float64
FPRC softfloat.RoundingMode
}
func (f *RegisterFile) SetRoundingMode(mode softfloat.RoundingMode) {
if f.FPRC == mode {
return
}
f.FPRC = mode
asm.SetRoundingMode(mode)
FPRC uint8
}
type MemoryRegisters struct {

View file

@ -1,37 +0,0 @@
package softfloat
const (
mantbits64 uint = 52
expbits64 uint = 11
bias64 = -1<<(expbits64-1) + 1
nan64 uint64 = (1<<expbits64-1)<<mantbits64 + 1<<(mantbits64-1) // quiet NaN, 0 payload
inf64 uint64 = (1<<expbits64 - 1) << mantbits64
neg64 uint64 = 1 << (expbits64 + mantbits64)
)
const mantissaMask = (uint64(1) << mantbits64) - 1
const exponentMask = (uint64(1) << expbits64) - 1
const exponentBias = 1023
const dynamicExponentBits = 4
const staticExponentBits = 4
const constExponentBits uint64 = 0x300
const dynamicMantissaMask = (uint64(1) << (mantbits64 + dynamicExponentBits)) - 1
const mask22bit = (uint64(1) << 22) - 1
type RoundingMode uint8
const (
// RoundingModeToNearest IEEE 754 roundTiesToEven
RoundingModeToNearest = RoundingMode(iota)
// RoundingModeToNegative IEEE 754 roundTowardNegative
RoundingModeToNegative
// RoundingModeToPositive IEEE 754 roundTowardPositive
RoundingModeToPositive
// RoundingModeToZero IEEE 754 roundTowardZero
RoundingModeToZero
)

View file

@ -1,27 +0,0 @@
package softfloat
import (
_ "runtime"
_ "unsafe"
)
//go:linkname funpack64 runtime.funpack64
func funpack64(f uint64) (sign, mant uint64, exp int, inf, nan bool)
//go:linkname fpack64 runtime.fpack64
func fpack64(sign, mant uint64, exp int, trunc uint64) uint64
//go:linkname fadd64 runtime.fadd64
func fadd64(f, g uint64) uint64
//go:linkname fsub64 runtime.fsub64
func fsub64(f, g uint64) uint64
//go:linkname fneg64 runtime.fneg64
func fneg64(f uint64) uint64
//go:linkname fmul64 runtime.fmul64
func fmul64(f uint64) uint64
//go:linkname fdiv64 runtime.fdiv64
func fdiv64(f uint64) uint64

View file

@ -1,4 +1,4 @@
//go:build unix && amd64 && !disable_jit
//go:build unix && amd64 && !disable_jit && !purego
package randomx

View file

@ -1,4 +1,4 @@
//go:build !unix || !amd64 || disable_jit
//go:build !unix || !amd64 || purego || disable_jit
package randomx

24
vm.go
View file

@ -31,7 +31,6 @@ package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/aes"
"git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
"math"
"runtime"
"unsafe"
@ -67,9 +66,9 @@ type Config struct {
// Run calculate hash based on input
// Warning: Underlying callers will run asm.SetRoundingMode directly
// It is the caller's responsibility to set and restore the mode to softfloat.RoundingModeToNearest between full executions
// It is the caller's responsibility to set and restore the mode to softfloat64.RoundingModeToNearest between full executions
// Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread is recommended to prevent other goroutines sharing these changes
func (vm *VM) Run(inputHash [64]byte, roundingMode softfloat.RoundingMode) (reg RegisterFile) {
func (vm *VM) Run(inputHash [64]byte, roundingMode uint8) (reg RegisterFile) {
reg.FPRC = roundingMode
@ -84,7 +83,7 @@ func (vm *VM) Run(inputHash [64]byte, roundingMode softfloat.RoundingMode) (reg
// do more initialization before we run
for i := range entropy[:8] {
reg.A[i/2][i%2] = softfloat.SmallPositiveFloatBits(entropy[i])
reg.A[i/2][i%2] = SmallPositiveFloatBits(entropy[i])
}
vm.mem.ma = entropy[8] & CacheLineAlignMask
@ -97,8 +96,8 @@ func (vm *VM) Run(inputHash [64]byte, roundingMode softfloat.RoundingMode) (reg
}
vm.datasetOffset = (entropy[13] % (DATASETEXTRAITEMS + 1)) * CacheLineSize
vm.config.eMask[LOW] = softfloat.EMask(entropy[14])
vm.config.eMask[HIGH] = softfloat.EMask(entropy[15])
vm.config.eMask[LOW] = EMask(entropy[14])
vm.config.eMask[HIGH] = EMask(entropy[15])
vm.ByteCode = CompileProgramToByteCode(prog)
@ -127,8 +126,8 @@ func (vm *VM) Run(inputHash [64]byte, roundingMode softfloat.RoundingMode) (reg
for i := uint64(0); i < RegistersCountFloat; i++ {
reg.E[i] = vm.ScratchPad.Load32FA(uint32(spAddr1 + 8*(i+RegistersCountFloat)))
reg.E[i][LOW] = softfloat.MaskRegisterExponentMantissa(reg.E[i][LOW], vm.config.eMask[LOW])
reg.E[i][HIGH] = softfloat.MaskRegisterExponentMantissa(reg.E[i][HIGH], vm.config.eMask[HIGH])
reg.E[i][LOW] = MaskRegisterExponentMantissa(reg.E[i][LOW], vm.config.eMask[LOW])
reg.E[i][HIGH] = MaskRegisterExponentMantissa(reg.E[i][HIGH], vm.config.eMask[HIGH])
}
// Run the actual bytecode
@ -149,8 +148,8 @@ func (vm *VM) Run(inputHash [64]byte, roundingMode softfloat.RoundingMode) (reg
}
for i := uint64(0); i < RegistersCountFloat; i++ {
reg.F[i][LOW] = softfloat.Xor(reg.F[i][LOW], reg.E[i][LOW])
reg.F[i][HIGH] = softfloat.Xor(reg.F[i][HIGH], reg.E[i][HIGH])
reg.F[i][LOW] = Xor(reg.F[i][LOW], reg.E[i][LOW])
reg.F[i][HIGH] = Xor(reg.F[i][HIGH], reg.E[i][HIGH])
vm.ScratchPad.Store64(uint32(spAddr0+16*i), math.Float64bits(reg.F[i][LOW]))
vm.ScratchPad.Store64(uint32(spAddr0+16*i+8), math.Float64bits(reg.F[i][HIGH]))
@ -178,7 +177,7 @@ func (vm *VM) RunLoops(tempHash [64]byte) RegisterFile {
runtime.LockOSThread()
defer runtime.UnlockOSThread()
roundingMode := softfloat.RoundingModeToNearest
roundingMode := uint8(0)
for chain := 0; chain < RANDOMX_PROGRAM_COUNT-1; chain++ {
reg := vm.Run(tempHash, roundingMode)
@ -217,7 +216,8 @@ func (vm *VM) RunLoops(tempHash [64]byte) RegisterFile {
reg := vm.Run(tempHash, roundingMode)
roundingMode = reg.FPRC
reg.SetRoundingMode(softfloat.RoundingModeToNearest)
//restore rounding mode
vm.ByteCode.SetRoundingMode(&reg, 0)
return reg
}

View file

@ -1,11 +1,5 @@
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/softfloat"
"math"
"math/bits"
)
type ByteCodeInstruction struct {
Dst, Src byte
ImmB uint8
@ -49,117 +43,6 @@ func (i ByteCodeInstruction) getScratchpadZeroAddress() uint32 {
type ByteCode [RANDOMX_PROGRAM_SIZE]ByteCodeInstruction
// Execute Runs a RandomX program with the given register file and scratchpad
// Warning: This will call asm.SetRoundingMode directly
// It is the caller's responsibility to set and restore the mode to softfloat.RoundingModeToNearest between full executions
// Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread is recommended to prevent other goroutines sharing these changes
func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
i := &c[pc]
switch i.Opcode {
case VM_NOP: // we do nothing
case VM_IADD_RS:
f.R[i.Dst] += (f.R[i.Src] << i.ImmB) + i.Imm
case VM_IADD_M:
f.R[i.Dst] += pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
case VM_IADD_MZ:
f.R[i.Dst] += pad.Load64(uint32(i.Imm))
case VM_ISUB_R:
f.R[i.Dst] -= f.R[i.Src]
case VM_ISUB_I:
f.R[i.Dst] -= i.Imm
case VM_ISUB_M:
f.R[i.Dst] -= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
case VM_ISUB_MZ:
f.R[i.Dst] -= pad.Load64(uint32(i.Imm))
case VM_IMUL_R:
f.R[i.Dst] *= f.R[i.Src]
case VM_IMUL_I:
// also handles imul_rcp
f.R[i.Dst] *= i.Imm
case VM_IMUL_M:
f.R[i.Dst] *= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
case VM_IMUL_MZ:
f.R[i.Dst] *= pad.Load64(uint32(i.Imm))
case VM_IMULH_R:
f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], f.R[i.Src])
case VM_IMULH_M:
f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(i.getScratchpadAddress(f.R[i.Src])))
case VM_IMULH_MZ:
f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(uint32(i.Imm)))
case VM_ISMULH_R:
f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(f.R[i.Src]))
case VM_ISMULH_M:
f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(i.getScratchpadAddress(f.R[i.Src]))))
case VM_ISMULH_MZ:
f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(uint32(i.Imm))))
case VM_INEG_R:
f.R[i.Dst] = -f.R[i.Dst]
case VM_IXOR_R:
f.R[i.Dst] ^= f.R[i.Src]
case VM_IXOR_I:
f.R[i.Dst] ^= i.Imm
case VM_IXOR_M:
f.R[i.Dst] ^= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
case VM_IXOR_MZ:
f.R[i.Dst] ^= pad.Load64(uint32(i.Imm))
case VM_IROR_R:
f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(f.R[i.Src]&63))
case VM_IROR_I:
//todo: can merge into VM_IROL_I
f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(i.Imm&63))
case VM_IROL_R:
f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(f.R[i.Src]&63))
case VM_IROL_I:
f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(i.Imm&63))
case VM_ISWAP_R:
f.R[i.Dst], f.R[i.Src] = f.R[i.Src], f.R[i.Dst]
case VM_FSWAP_RF:
f.F[i.Dst][HIGH], f.F[i.Dst][LOW] = f.F[i.Dst][LOW], f.F[i.Dst][HIGH]
case VM_FSWAP_RE:
f.E[i.Dst][HIGH], f.E[i.Dst][LOW] = f.E[i.Dst][LOW], f.E[i.Dst][HIGH]
case VM_FADD_R:
f.F[i.Dst][LOW] += f.A[i.Src][LOW]
f.F[i.Dst][HIGH] += f.A[i.Src][HIGH]
case VM_FADD_M:
lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
f.F[i.Dst][LOW] += lo
f.F[i.Dst][HIGH] += hi
case VM_FSUB_R:
f.F[i.Dst][LOW] -= f.A[i.Src][LOW]
f.F[i.Dst][HIGH] -= f.A[i.Src][HIGH]
case VM_FSUB_M:
lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
f.F[i.Dst][LOW] -= lo
f.F[i.Dst][HIGH] -= hi
case VM_FSCAL_R:
// no dependent on rounding modes
f.F[i.Dst][LOW] = softfloat.ScaleNegate(f.F[i.Dst][LOW])
f.F[i.Dst][HIGH] = softfloat.ScaleNegate(f.F[i.Dst][HIGH])
case VM_FMUL_R:
f.E[i.Dst][LOW] *= f.A[i.Src][LOW]
f.E[i.Dst][HIGH] *= f.A[i.Src][HIGH]
case VM_FDIV_M:
lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
f.E[i.Dst][LOW] /= softfloat.MaskRegisterExponentMantissa(lo, eMask[LOW])
f.E[i.Dst][HIGH] /= softfloat.MaskRegisterExponentMantissa(hi, eMask[HIGH])
case VM_FSQRT_R:
f.E[i.Dst][LOW] = math.Sqrt(f.E[i.Dst][LOW])
f.E[i.Dst][HIGH] = math.Sqrt(f.E[i.Dst][HIGH])
case VM_CBRANCH:
f.R[i.Src] += i.Imm
if (f.R[i.Src] & uint64(i.MemMask)) == 0 {
pc = i.jumpTarget()
}
case VM_CFROUND:
tmp := (bits.RotateLeft64(f.R[i.Src], 0-int(i.Imm))) % 4 // rotate right
f.SetRoundingMode(softfloat.RoundingMode(tmp))
case VM_ISTORE:
pad.Store64(i.getScratchpadAddress(f.R[i.Dst]), f.R[i.Src])
}
}
}
type ByteCodeInstructionOp int
const (
@ -201,7 +84,7 @@ const (
VM_FMUL_R
VM_FDIV_M
VM_FSQRT_R
VM_CBRANCH
VM_CFROUND
VM_CBRANCH
VM_ISTORE
)

130
vm_bytecode_native.go Normal file
View file

@ -0,0 +1,130 @@
//go:build (arm64 || amd64 || 386) && !purego
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
"math"
"math/bits"
)
// Execute Runs a RandomX program with the given register file and scratchpad
// Warning: This will call asm.SetRoundingMode directly
// It is the caller's responsibility to set and restore the mode to softfloat64.RoundingModeToNearest between full executions
// Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread is recommended to prevent other goroutines sharing these changes
func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
	for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
		i := &c[pc]
		switch i.Opcode {
		case VM_NOP: // we do nothing

		// Integer instructions. _M variants load the operand from the
		// scratchpad at a register-derived address, _MZ from a fixed
		// immediate address, and _I use the immediate value directly.
		case VM_IADD_RS:
			f.R[i.Dst] += (f.R[i.Src] << i.ImmB) + i.Imm
		case VM_IADD_M:
			f.R[i.Dst] += pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_IADD_MZ:
			f.R[i.Dst] += pad.Load64(uint32(i.Imm))
		case VM_ISUB_R:
			f.R[i.Dst] -= f.R[i.Src]
		case VM_ISUB_I:
			f.R[i.Dst] -= i.Imm
		case VM_ISUB_M:
			f.R[i.Dst] -= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_ISUB_MZ:
			f.R[i.Dst] -= pad.Load64(uint32(i.Imm))
		case VM_IMUL_R:
			f.R[i.Dst] *= f.R[i.Src]
		case VM_IMUL_I:
			// also handles imul_rcp
			f.R[i.Dst] *= i.Imm
		case VM_IMUL_M:
			f.R[i.Dst] *= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_IMUL_MZ:
			f.R[i.Dst] *= pad.Load64(uint32(i.Imm))

		// IMULH/ISMULH keep the high 64 bits of the full 128-bit product
		// (unsigned via bits.Mul64, signed via smulh).
		case VM_IMULH_R:
			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], f.R[i.Src])
		case VM_IMULH_M:
			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(i.getScratchpadAddress(f.R[i.Src])))
		case VM_IMULH_MZ:
			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(uint32(i.Imm)))
		case VM_ISMULH_R:
			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(f.R[i.Src]))
		case VM_ISMULH_M:
			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(i.getScratchpadAddress(f.R[i.Src]))))
		case VM_ISMULH_MZ:
			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(uint32(i.Imm))))
		case VM_INEG_R:
			f.R[i.Dst] = -f.R[i.Dst]
		case VM_IXOR_R:
			f.R[i.Dst] ^= f.R[i.Src]
		case VM_IXOR_I:
			f.R[i.Dst] ^= i.Imm
		case VM_IXOR_M:
			f.R[i.Dst] ^= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_IXOR_MZ:
			f.R[i.Dst] ^= pad.Load64(uint32(i.Imm))
		case VM_IROR_R:
			// rotate right expressed as a negative left rotation
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(f.R[i.Src]&63))
		case VM_IROR_I:
			//todo: can merge into VM_IROL_I
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(i.Imm&63))
		case VM_IROL_R:
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(f.R[i.Src]&63))
		case VM_IROL_I:
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(i.Imm&63))
		case VM_ISWAP_R:
			f.R[i.Dst], f.R[i.Src] = f.R[i.Src], f.R[i.Dst]

		// Floating point instructions use native float64 arithmetic; the
		// rounding mode last applied through SetRoundingMode is in effect.
		case VM_FSWAP_RF:
			f.F[i.Dst][HIGH], f.F[i.Dst][LOW] = f.F[i.Dst][LOW], f.F[i.Dst][HIGH]
		case VM_FSWAP_RE:
			f.E[i.Dst][HIGH], f.E[i.Dst][LOW] = f.E[i.Dst][LOW], f.E[i.Dst][HIGH]
		case VM_FADD_R:
			f.F[i.Dst][LOW] += f.A[i.Src][LOW]
			f.F[i.Dst][HIGH] += f.A[i.Src][HIGH]
		case VM_FADD_M:
			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
			f.F[i.Dst][LOW] += lo
			f.F[i.Dst][HIGH] += hi
		case VM_FSUB_R:
			f.F[i.Dst][LOW] -= f.A[i.Src][LOW]
			f.F[i.Dst][HIGH] -= f.A[i.Src][HIGH]
		case VM_FSUB_M:
			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
			f.F[i.Dst][LOW] -= lo
			f.F[i.Dst][HIGH] -= hi
		case VM_FSCAL_R:
			// not dependent on rounding modes
			f.F[i.Dst][LOW] = ScaleNegate(f.F[i.Dst][LOW])
			f.F[i.Dst][HIGH] = ScaleNegate(f.F[i.Dst][HIGH])
		case VM_FMUL_R:
			f.E[i.Dst][LOW] *= f.A[i.Src][LOW]
			f.E[i.Dst][HIGH] *= f.A[i.Src][HIGH]
		case VM_FDIV_M:
			// divisor exponent/mantissa is masked with the per-VM eMask
			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
			f.E[i.Dst][LOW] /= MaskRegisterExponentMantissa(lo, eMask[LOW])
			f.E[i.Dst][HIGH] /= MaskRegisterExponentMantissa(hi, eMask[HIGH])
		case VM_FSQRT_R:
			f.E[i.Dst][LOW] = math.Sqrt(f.E[i.Dst][LOW])
			f.E[i.Dst][HIGH] = math.Sqrt(f.E[i.Dst][HIGH])
		case VM_CFROUND:
			tmp := (bits.RotateLeft64(f.R[i.Src], 0-int(i.Imm))) % 4 // rotate right
			c.SetRoundingMode(f, uint8(tmp))
		case VM_CBRANCH:
			// conditional branch: taken when the masked bits are zero
			f.R[i.Src] += i.Imm
			if (f.R[i.Src] & uint64(i.MemMask)) == 0 {
				pc = i.jumpTarget()
			}
		case VM_ISTORE:
			pad.Store64(i.getScratchpadAddress(f.R[i.Dst]), f.R[i.Src])
		}
	}
}
// SetRoundingMode updates the rounding mode cached in the register file and,
// only when it actually changed, applies it to the native FPU via the asm
// package (avoids redundant control-register writes).
func (c *ByteCode) SetRoundingMode(f *RegisterFile, mode uint8) {
	if f.FPRC != mode {
		f.FPRC = mode
		asm.SetRoundingMode(mode)
	}
}

125
vm_bytecode_purego.go Normal file
View file

@ -0,0 +1,125 @@
//go:build (!arm64 && !amd64 && !386) || purego
package randomx
import (
"git.gammaspectra.live/P2Pool/softfloat64"
"math/bits"
)
// Execute Runs a RandomX program with the given register file and scratchpad
// Warning: This only updates the rounding mode tracked in the register file; each softfloat64 operation reads it per call, and no native FPU state is changed
// It is the caller's responsibility to set and restore the mode to softfloat64.RoundingModeToNearest between full executions
// Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread is recommended to prevent other goroutines sharing these changes
func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
	for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
		i := &c[pc]
		switch i.Opcode {
		case VM_NOP: // we do nothing

		// Integer instructions. _M variants load the operand from the
		// scratchpad at a register-derived address, _MZ from a fixed
		// immediate address, and _I use the immediate value directly.
		case VM_IADD_RS:
			f.R[i.Dst] += (f.R[i.Src] << i.ImmB) + i.Imm
		case VM_IADD_M:
			f.R[i.Dst] += pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_IADD_MZ:
			f.R[i.Dst] += pad.Load64(uint32(i.Imm))
		case VM_ISUB_R:
			f.R[i.Dst] -= f.R[i.Src]
		case VM_ISUB_I:
			f.R[i.Dst] -= i.Imm
		case VM_ISUB_M:
			f.R[i.Dst] -= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_ISUB_MZ:
			f.R[i.Dst] -= pad.Load64(uint32(i.Imm))
		case VM_IMUL_R:
			f.R[i.Dst] *= f.R[i.Src]
		case VM_IMUL_I:
			// also handles imul_rcp
			f.R[i.Dst] *= i.Imm
		case VM_IMUL_M:
			f.R[i.Dst] *= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_IMUL_MZ:
			f.R[i.Dst] *= pad.Load64(uint32(i.Imm))

		// IMULH/ISMULH keep the high 64 bits of the full 128-bit product
		// (unsigned via bits.Mul64, signed via smulh).
		case VM_IMULH_R:
			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], f.R[i.Src])
		case VM_IMULH_M:
			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(i.getScratchpadAddress(f.R[i.Src])))
		case VM_IMULH_MZ:
			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(uint32(i.Imm)))
		case VM_ISMULH_R:
			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(f.R[i.Src]))
		case VM_ISMULH_M:
			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(i.getScratchpadAddress(f.R[i.Src]))))
		case VM_ISMULH_MZ:
			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(uint32(i.Imm))))
		case VM_INEG_R:
			f.R[i.Dst] = -f.R[i.Dst]
		case VM_IXOR_R:
			f.R[i.Dst] ^= f.R[i.Src]
		case VM_IXOR_I:
			f.R[i.Dst] ^= i.Imm
		case VM_IXOR_M:
			f.R[i.Dst] ^= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_IXOR_MZ:
			f.R[i.Dst] ^= pad.Load64(uint32(i.Imm))
		case VM_IROR_R:
			// rotate right expressed as a negative left rotation
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(f.R[i.Src]&63))
		case VM_IROR_I:
			//todo: can merge into VM_IROL_I
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(i.Imm&63))
		case VM_IROL_R:
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(f.R[i.Src]&63))
		case VM_IROL_I:
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(i.Imm&63))
		case VM_ISWAP_R:
			f.R[i.Dst], f.R[i.Src] = f.R[i.Src], f.R[i.Dst]

		// Floating point instructions use the softfloat64 package; the
		// current rounding mode (f.FPRC) is passed explicitly on every
		// operation instead of relying on native FPU state.
		case VM_FSWAP_RF:
			f.F[i.Dst][HIGH], f.F[i.Dst][LOW] = f.F[i.Dst][LOW], f.F[i.Dst][HIGH]
		case VM_FSWAP_RE:
			f.E[i.Dst][HIGH], f.E[i.Dst][LOW] = f.E[i.Dst][LOW], f.E[i.Dst][HIGH]
		case VM_FADD_R:
			f.F[i.Dst][LOW] = softfloat64.Add(f.F[i.Dst][LOW], f.A[i.Src][LOW], softfloat64.RoundingMode(f.FPRC))
			f.F[i.Dst][HIGH] = softfloat64.Add(f.F[i.Dst][HIGH], f.A[i.Src][HIGH], softfloat64.RoundingMode(f.FPRC))
		case VM_FADD_M:
			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
			f.F[i.Dst][LOW] = softfloat64.Add(f.F[i.Dst][LOW], lo, softfloat64.RoundingMode(f.FPRC))
			f.F[i.Dst][HIGH] = softfloat64.Add(f.F[i.Dst][HIGH], hi, softfloat64.RoundingMode(f.FPRC))
		case VM_FSUB_R:
			f.F[i.Dst][LOW] = softfloat64.Sub(f.F[i.Dst][LOW], f.A[i.Src][LOW], softfloat64.RoundingMode(f.FPRC))
			f.F[i.Dst][HIGH] = softfloat64.Sub(f.F[i.Dst][HIGH], f.A[i.Src][HIGH], softfloat64.RoundingMode(f.FPRC))
		case VM_FSUB_M:
			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
			f.F[i.Dst][LOW] = softfloat64.Sub(f.F[i.Dst][LOW], lo, softfloat64.RoundingMode(f.FPRC))
			f.F[i.Dst][HIGH] = softfloat64.Sub(f.F[i.Dst][HIGH], hi, softfloat64.RoundingMode(f.FPRC))
		case VM_FSCAL_R:
			// not dependent on rounding modes
			f.F[i.Dst][LOW] = ScaleNegate(f.F[i.Dst][LOW])
			f.F[i.Dst][HIGH] = ScaleNegate(f.F[i.Dst][HIGH])
		case VM_FMUL_R:
			f.E[i.Dst][LOW] = softfloat64.Mul(f.E[i.Dst][LOW], f.A[i.Src][LOW], softfloat64.RoundingMode(f.FPRC))
			f.E[i.Dst][HIGH] = softfloat64.Mul(f.E[i.Dst][HIGH], f.A[i.Src][HIGH], softfloat64.RoundingMode(f.FPRC))
		case VM_FDIV_M:
			// divisor exponent/mantissa is masked with the per-VM eMask
			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
			f.E[i.Dst][LOW] = softfloat64.Div(f.E[i.Dst][LOW], MaskRegisterExponentMantissa(lo, eMask[LOW]), softfloat64.RoundingMode(f.FPRC))
			f.E[i.Dst][HIGH] = softfloat64.Div(f.E[i.Dst][HIGH], MaskRegisterExponentMantissa(hi, eMask[HIGH]), softfloat64.RoundingMode(f.FPRC))
		case VM_FSQRT_R:
			f.E[i.Dst][LOW] = softfloat64.Sqrt(f.E[i.Dst][LOW], softfloat64.RoundingMode(f.FPRC))
			f.E[i.Dst][HIGH] = softfloat64.Sqrt(f.E[i.Dst][HIGH], softfloat64.RoundingMode(f.FPRC))
		case VM_CFROUND:
			tmp := (bits.RotateLeft64(f.R[i.Src], 0-int(i.Imm))) % 4 // rotate right
			c.SetRoundingMode(f, uint8(tmp))
		case VM_CBRANCH:
			// conditional branch: taken when the masked bits are zero
			f.R[i.Src] += i.Imm
			if (f.R[i.Src] & uint64(i.MemMask)) == 0 {
				pc = i.jumpTarget()
			}
		case VM_ISTORE:
			pad.Store64(i.getScratchpadAddress(f.R[i.Dst]), f.R[i.Src])
		}
	}
}
// SetRoundingMode records the rounding mode in the register file.
// In the purego interpreter every softfloat64 operation receives f.FPRC
// explicitly, so no native FPU control register needs to be touched.
func (c *ByteCode) SetRoundingMode(f *RegisterFile, mode uint8) {
	f.FPRC = mode
}

View file

@ -370,13 +370,3 @@ func (pad *ScratchPad) Load64(addr uint32) uint64 {
func (pad *ScratchPad) Load32(addr uint32) uint32 {
return *(*uint32)(unsafe.Pointer(&pad[addr]))
}
func (pad *ScratchPad) Load32F(addr uint32) (lo, hi float64) {
a := *(*[2]int32)(unsafe.Pointer(&pad[addr]))
return float64(a[LOW]), float64(a[HIGH])
}
func (pad *ScratchPad) Load32FA(addr uint32) [2]float64 {
a := *(*[2]int32)(unsafe.Pointer(&pad[addr]))
return [2]float64{float64(a[LOW]), float64(a[HIGH])}
}

15
vm_instruction_native.go Normal file
View file

@ -0,0 +1,15 @@
//go:build (arm64 || amd64 || 386) && !purego
package randomx
import "unsafe"
// Load32F reads a pair of packed int32 values at addr and widens them to
// float64 using native integer-to-float conversion.
func (pad *ScratchPad) Load32F(addr uint32) (lo, hi float64) {
	pair := *(*[2]int32)(unsafe.Pointer(&pad[addr]))
	lo = float64(pair[LOW])
	hi = float64(pair[HIGH])
	return lo, hi
}
// Load32FA reads a pair of packed int32 values at addr and returns them
// widened to float64 as a two-element array.
func (pad *ScratchPad) Load32FA(addr uint32) [2]float64 {
	values := *(*[2]int32)(unsafe.Pointer(&pad[addr]))
	return [2]float64{float64(values[LOW]), float64(values[HIGH])}
}

18
vm_instruction_purego.go Normal file
View file

@ -0,0 +1,18 @@
//go:build (!arm64 && !amd64 && !386) || purego
package randomx
import (
"git.gammaspectra.live/P2Pool/softfloat64"
"unsafe"
)
// Load32F reads a pair of packed int32 values at addr and widens them to
// float64 via the softfloat64 conversion routine (no native float ops).
func (pad *ScratchPad) Load32F(addr uint32) (lo, hi float64) {
	pair := *(*[2]int32)(unsafe.Pointer(&pad[addr]))
	lo = softfloat64.Int32ToFloat64(pair[LOW])
	hi = softfloat64.Int32ToFloat64(pair[HIGH])
	return lo, hi
}
// Load32FA reads a pair of packed int32 values at addr and returns them as a
// two-element float64 array, converted via softfloat64 (no native float ops).
func (pad *ScratchPad) Load32FA(addr uint32) [2]float64 {
	values := *(*[2]int32)(unsafe.Pointer(&pad[addr]))
	return [2]float64{softfloat64.Int32ToFloat64(values[LOW]), softfloat64.Int32ToFloat64(values[HIGH])}
}