Working RandomX with minimal assembly for rounding mode (AMD64,ARM64)

2024-04-11 07:44:12 +02:00 · 2024-04-11 07:44:12 +02:00 · b207b994b3
parent 2a14ddf070
commit b207b994b3
13 changed files with 196 additions and 279 deletions
--- a/.drone.yml
+++ b/.drone.yml
@ -0,0 +1,54 @@
+---
+kind: pipeline
+type: docker
+name: from-source-amd64
+platform:
+  os: linux
+  arch: amd64
+
+environment:
+  GOPROXY: direct
+  GOARCH: amd64
+  GOAMD64: v3
+  GOOS: linux
+  GOTRACEBACK: 2
+  GOEXPERIMENT: "cgocheck2,newinliner"
+  CGO_ENABLED: "0"
+
+workspace:
+  path: /drone/src
+
+steps:
+  - name: test
+    image: golang:1.22-alpine3.19
+    commands:
+      - apk update
+      - apk add --no-cache git
+      - go test -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -v .
+---
+kind: pipeline
+type: docker
+name: from-source-arm64
+platform:
+  os: linux
+  arch: arm64
+
+environment:
+  GOPROXY: direct
+  GOARCH: arm64
+  GOOS: linux
+  GOTRACEBACK: 2
+  GOEXPERIMENT: "cgocheck2,newinliner"
+  CGO_ENABLED: "0"
+
+workspace:
+  path: /drone/src
+
+steps:
+  - name: test
+    image: golang:1.22-alpine3.19
+    commands:
+      - apk update
+      - apk add --no-cache git
+      - go test -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -v .
+...
--- a/2
+++ b/2
@ -1,4 +1,4 @@
-Copyright (c) 2019 DERO Foundation. All rights reserved.
+Copyright (c) 2024 WeebDataHoarder, DERO Foundation. All rights reserved.

 Redistribution and use in source and binary forms, with or without modification,
 are permitted provided that the following conditions are met:
--- a/example/example.go
+++ b/example/example.go
@ -1,95 +0,0 @@
-//go:build ignore
-// +build ignore
-
-/*
-Copyright (c) 2019 DERO Foundation. All rights reserved.
-
-Redistribution and use in source and binary forms, with or without modification,
-are permitted provided that the following conditions are met:
-
-1. Redistributions of source code must retain the above copyright notice,
-this list of conditions and the following disclaimer.
-
-2. Redistributions in binary form must reproduce the above copyright notice,
-this list of conditions and the following disclaimer in the documentation
-and/or other materials provided with the distribution.
-
-3. Neither the name of the copyright holder nor the names of its contributors
-may be used to endorse or promote products derived from this software without
-specific prior written permission.
-
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-*/
-
-package main
-
-import "git.gammaspectra.live/P2Pool/go-randomx"
-import "fmt"
-
-func main() {
-	c := randomx.Randomx_alloc_cache(0)
-
-	key := []byte("RandomX example key\x00")
-	myinput := []byte("RandomX example input\x00")
-
-	c.Randomx_init_cache(key)
-
-	nonce := uint32(0) //uint32(len(key))
-	gen := randomx.Init_Blake2Generator(key, nonce)
-	for i := 0; i < 8; i++ {
-		c.Programs[i] = randomx.Build_SuperScalar_Program(gen) // build a superscalar program
-	}
-
-	vm := c.VM_Initialize()
-
-	_ = fmt.Sprintf("t")
-
-	var output_hash [32]byte
-	vm.CalculateHash(myinput, output_hash[:])
-
-	fmt.Printf("final output hash %x\n", output_hash)
-
-	vm.CalculateHash(myinput, output_hash[:])
-
-	fmt.Printf("final output hash %x\n", output_hash)
-
-	/*
-	   fmt.Printf("cache blocks %d block size %d %+v\n", len(c.Blocks), len(c.Blocks[0]), c.Blocks[0])
-
-	   register_value := uint64(0x70c13c)
-	   mask := randomx.CacheSize / randomx.CacheLineSize - 1;
-
-	   address :=  (register_value&mask)*   randomx.CacheLineSize
-
-
-	   var block [8]uint64
-
-	   c.GetBlock(address,block[:])
-
-	   for i := range block{
-	   	fmt.Printf("%d %16x\n", i, block[i])
-	   }
-
-	   //block := address / 1024
-
-	   //index_within_block := (address % 1024) / 8
-
-	   //fmt.Printf("mask %x address %x  block %d index_within_block %d  data %16x\n",mask, address, block, index_within_block,c.Blocks[block][index_within_block])
-
-	   /*
-	   for i := range c.Blocks[block]{
-	   	fmt.Printf("%3d %16x\n", i,c.Blocks[block][i])
-	   }
-	*/
-	//c.InitDatasetItem(nil,0x70c13c)
-
-}
--- a/fpu/round.go
+++ b/fpu/round.go
@ -0,0 +1,14 @@
+package fpu
+
+// RoundingMode selects the direction in which the CPU rounds floating-point
+// results. The numeric values are significant: the VM converts the two-bit
+// CFROUND operand straight into a RoundingMode, and the architecture-specific
+// setRoundingMode implementations derive the register bits from the value.
+type RoundingMode uint8
+
+const (
+	// RoundingModeToNearest (0) rounds to the nearest value, ties to even.
+	RoundingModeToNearest = RoundingMode(iota)
+	// RoundingModeToNegative (1) rounds toward negative infinity.
+	RoundingModeToNegative
+	// RoundingModeToPositive (2) rounds toward positive infinity.
+	RoundingModeToPositive
+	// RoundingModeToZero (3) rounds toward zero.
+	RoundingModeToZero
+)
+
+// SetRoundingMode passes mode to the architecture-specific setRoundingMode,
+// which rewrites the rounding bits of the CPU floating-point control register
+// (MXCSR on amd64, FPCR on arm64).
+//
+// NOTE(review): the caller visible in this change (VM.CalculateHash) pins the
+// goroutine with runtime.LockOSThread before calling this and restores
+// RoundingModeToNearest on exit; other callers presumably need to do the same.
+func SetRoundingMode(mode RoundingMode) {
+	setRoundingMode(uint8(mode))
+}
--- a/fpu/round_amd64.go
+++ b/fpu/round_amd64.go
@ -0,0 +1,17 @@
+//go:build amd64
+// +build amd64
+
+package fpu
+
+// stmxcsr reads the MXCSR control and status register and stores its 32-bit
+// value at *addr. It has no Go body; it is implemented in round_amd64.s.
+func stmxcsr(addr *uint32)
+
+// ldmxcsr writes the 32-bit value found at *addr to the MXCSR control and
+// status register. It has no Go body; it is implemented in round_amd64.s.
+func ldmxcsr(addr *uint32)
+
+// setRoundingMode replaces the two rounding-control bits of MXCSR (bits 13-14)
+// with the low two bits of mode and leaves every other MXCSR bit as it was.
+func setRoundingMode(mode uint8) {
+	const (
+		rcShift = 13
+		rcMask  = uint32(3) << rcShift // 0x6000
+	)
+
+	var mxcsr uint32
+	stmxcsr(&mxcsr)
+	mxcsr &^= rcMask
+	mxcsr |= (uint32(mode) << rcShift) & rcMask
+	ldmxcsr(&mxcsr)
+}
--- a/fpu/round_amd64.s
+++ b/fpu/round_amd64.s
@ -0,0 +1,13 @@
+#include "textflag.h"
+
+// stmxcsr reads the MXCSR control and status register.
+// Go declaration: func stmxcsr(addr *uint32)
+TEXT ·stmxcsr(SB),NOSPLIT|NOFRAME,$0-8
+	MOVQ addr+0(FP), SI // SI = addr
+	STMXCSR (SI) // store MXCSR at *addr
+	RET
+
+// ldmxcsr writes to the MXCSR control and status register.
+// Go declaration: func ldmxcsr(addr *uint32)
+TEXT ·ldmxcsr(SB),NOSPLIT|NOFRAME,$0-8
+	MOVQ addr+0(FP), SI // SI = addr
+	LDMXCSR (SI) // load MXCSR from *addr
+	RET
--- a/fpu/round_arm64.go
+++ b/fpu/round_arm64.go
@ -0,0 +1,24 @@
+//go:build arm64
+// +build arm64
+
+package fpu
+
+// getFPCR returns the value of the FPCR register.
+// It has no Go body; it is implemented in round_arm64.s.
+func getFPCR() (value uint32)
+
+// setFPCR writes value to the FPCR register.
+// It has no Go body; it is implemented in round_arm64.s.
+func setFPCR(value uint32)
+
+// setRoundingMode programs the RMode field of FPCR (bits 22-23) from the low
+// two bits of mode and leaves every other FPCR bit untouched.
+//
+// mode uses the RoundingMode encoding (0 nearest, 1 toward negative infinity,
+// 2 toward positive infinity, 3 toward zero). FPCR encodes the two directed
+// infinity modes the other way round, so 1 and 2 are exchanged before the
+// write. mode is reduced to two bits before the exchange so that an
+// out-of-range value selects the same direction as the amd64 implementation,
+// which also keeps only the low two bits.
+func setRoundingMode(mode uint8) {
+	const (
+		rmodeShift = 22
+		rmodeMask  = uint32(3) << rmodeShift // 0x00C00000
+	)
+
+	mode &= 3
+	switch mode {
+	case 1: // toward negative infinity is RMode 0b10
+		mode = 2
+	case 2: // toward positive infinity is RMode 0b01
+		mode = 1
+	}
+
+	fpcr := getFPCR()
+	fpcr = (fpcr &^ rmodeMask) | (uint32(mode) << rmodeShift)
+	setFPCR(fpcr)
+}
--- a/fpu/round_arm64.s
+++ b/fpu/round_arm64.s
@ -0,0 +1,11 @@
+#include "textflag.h"
+
+// getFPCR returns the low 32 bits of the FPCR register.
+// Go declaration: func getFPCR() (value uint32)
+// The only argument/result is a 4-byte uint32, hence the $0-4 frame spec.
+TEXT ·getFPCR(SB),NOSPLIT,$0-4
+	MOVD FPCR, R1
+	MOVW R1, value+0(FP) // 32-bit store: the result slot is only 4 bytes wide
+	RET
+
+// setFPCR writes value to the FPCR register.
+// Go declaration: func setFPCR(value uint32)
+// The only argument is a 4-byte uint32, hence the $0-4 frame spec.
+TEXT ·setFPCR(SB),NOSPLIT,$0-4
+	MOVWU value+0(FP), R1 // zero-extending 32-bit load: do not read past the uint32 argument
+	MOVD R1, FPCR
+	RET
--- a/go.mod
+++ b/go.mod
@ -4,4 +4,7 @@ go 1.22

 require golang.org/x/crypto v0.22.0

-require golang.org/x/sys v0.19.0 // indirect
+require (
+	github.com/klauspost/cpuid/v2 v2.2.7 // indirect
+	golang.org/x/sys v0.19.0 // indirect
+)
--- a/go.sum
+++ b/go.sum
@ -1,8 +1,11 @@
+github.com/klauspost/cpuid/v2 v2.2.7 h1:ZWSB3igEs+d0qvnxR/ZBzXVmxkgt8DdzP6m9pfuVLDM=
+github.com/klauspost/cpuid/v2 v2.2.7/go.mod h1:Lcz8mBdAVJIBVzewtcLocK12l3Y+JytZYpaMropDUws=
 golang.org/x/crypto v0.1.0 h1:MDRAIl0xIo9Io2xV565hzXHw3zVseKrJKodhohM5CjU=
 golang.org/x/crypto v0.1.0/go.mod h1:RecgLatLF4+eUMCP1PoPZQb+cVrJcOPbHkTkbkB9sbw=
 golang.org/x/crypto v0.22.0 h1:g1v0xeRhjcugydODzvb3mEM9SQ0HGp9s/nh3COQ/C30=
 golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M=
 golang.org/x/sys v0.1.0 h1:kunALQeHf1/185U1i0GOB/fy1IPRDDpuoOOqRReG57U=
 golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
+golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
 golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o=
 golang.org/x/sys v0.19.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
--- a/randomx_test.go
+++ b/randomx_test.go
@ -50,22 +50,24 @@ func Test_Randomx(t *testing.T) {

 	for ix, tt := range Tests {

-		c.Randomx_init_cache(tt.key)
+		t.Run(string(tt.key)+"_____"+string(tt.input), func(t *testing.T) {
+			c.Randomx_init_cache(tt.key)

-		nonce := uint32(0) //uint32(len(key))
-		gen := Init_Blake2Generator(tt.key, nonce)
-		for i := 0; i < 8; i++ {
-			c.Programs[i] = Build_SuperScalar_Program(gen) // build a superscalar program
-		}
-		vm := c.VM_Initialize()
+			nonce := uint32(0) //uint32(len(key))
+			gen := Init_Blake2Generator(tt.key, nonce)
+			for i := 0; i < 8; i++ {
+				c.Programs[i] = Build_SuperScalar_Program(gen) // build a superscalar program
+			}
+			vm := c.VM_Initialize()

-		var output_hash [32]byte
-		vm.CalculateHash(tt.input, output_hash[:])
+			var output_hash [32]byte
+			vm.CalculateHash(tt.input, output_hash[:])

-		actual := fmt.Sprintf("%x", output_hash)
-		if actual != tt.expected {
-			t.Errorf("#%d Fib(%d): expected %s, actual %s", ix, tt.key, tt.expected, actual)
-		}
+			actual := fmt.Sprintf("%x", output_hash)
+			if actual != tt.expected {
+				t.Errorf("#%d Fib(%v): expected %s, actual %s", ix, tt.key, tt.expected, actual)
+			}
+		})
 	}

 }
--- a/vm.go
+++ b/vm.go
@ -29,8 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 package randomx

-import "math"
-import "math/big"
+import (
+	"git.gammaspectra.live/P2Pool/go-randomx/fpu"
+	"math"
+	"runtime"
+)
 import "math/bits"
 import "encoding/binary"
 import "golang.org/x/crypto/blake2b"
@ -57,10 +60,6 @@ type VM struct {
 	config        Config // configuration
 	datasetOffset uint64

-	RoundingMode big.RoundingMode
-
-	fresult, fdst, fsrc *big.Float
-
 	Cache *Randomx_Cache // randomx cache

 }
@ -78,12 +77,12 @@ func SubnormalsToZero(f float64, _ ...any) float64 {
 }

 func MaskRegisterExponentMantissa(f float64, mode uint64) float64 {
-	return SubnormalsToZero(math.Float64frombits((math.Float64bits(f) & dynamicMantissaMask) | mode))
+	return math.Float64frombits((math.Float64bits(f) & dynamicMantissaMask) | mode)
 }

 func (cache *Randomx_Cache) VM_Initialize() *VM {

-	return &VM{Cache: cache, RoundingMode: big.ToNearestEven, fresult: &big.Float{}, fdst: &big.Float{}, fsrc: &big.Float{}} //// setup the cache
+	return &VM{Cache: cache} //// setup the cache
 }

 type Config struct {
@ -240,7 +239,14 @@ func (vm *VM) Run(input_hash []byte) {
 func (vm *VM) CalculateHash(input []byte, output []byte) {
 	var buf [8]byte

-	vm.RoundingMode = big.ToNearestEven // reset rounding mode if new hash eing calculated
+	// Lock thread due to rounding mode flags
+	runtime.LockOSThread()
+	defer runtime.UnlockOSThread()
+	//restore rounding mode to golang expected one
+	defer fpu.SetRoundingMode(fpu.RoundingModeToNearest)
+
+	// reset rounding mode if new hash being calculated
+	fpu.SetRoundingMode(fpu.RoundingModeToNearest)

 	input_hash := blake2b.Sum512(input)

--- a/vm_instruction.go
+++ b/vm_instruction.go
@ -29,9 +29,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 package randomx

-import "fmt"
+import (
+	"fmt"
+	"git.gammaspectra.live/P2Pool/go-randomx/fpu"
+)
 import "math"
-import "math/big"
 import "math/bits"
 import "encoding/binary"

@ -570,7 +572,7 @@ func (vm *VM) Load32(addr uint64) uint32 {
 }

 func (vm *VM) Load32F(addr uint64) float64 {
-	return SubnormalsToZero(float64(int32(vm.Load32(addr))))
+	return float64(int32(vm.Load32(addr)))
 }

 func (vm *VM) InterpretByteCode() {
@ -650,179 +652,52 @@ func (vm *VM) InterpretByteCode() {
 			//fmt.Printf("%x  %x\n",*ibc.idst, *ibc.isrc )
 			//panic("VM_ISWAP_R")
 		case VM_FSWAP_R:
+			//TODO: could be F+E

-			ibc.fdst[HIGH], ibc.fdst[LOW] = SubnormalsToZero(ibc.fdst[LOW]), SubnormalsToZero(ibc.fdst[HIGH])
+			ibc.fdst[HIGH], ibc.fdst[LOW] = ibc.fdst[LOW], ibc.fdst[HIGH]
 		//	fmt.Printf("%+v \n",ibc.fdst )
 		//	panic("VM_FSWAP_R")
 		case VM_FADD_R:
-			//ibc.fdst[LOW] += ibc.fsrc[LOW]
-			//ibc.fdst[HIGH] += ibc.fsrc[HIGH]
-
-			vm.fresult.SetMode(vm.RoundingMode)
-			vm.fdst.SetPrec(0)
-			vm.fdst.SetFloat64(SubnormalsToZero(ibc.fdst[LOW]))
-			vm.fsrc.SetPrec(0)
-			vm.fsrc.SetFloat64(SubnormalsToZero(ibc.fsrc[LOW]))
-			vm.fresult.Add(vm.fdst, vm.fsrc)
-			ibc.fdst[LOW] = SubnormalsToZero(vm.fresult.Float64())
-
-			vm.fresult.SetMode(vm.RoundingMode)
-			vm.fdst.SetPrec(0)
-			vm.fdst.SetFloat64(SubnormalsToZero(ibc.fdst[HIGH]))
-			vm.fsrc.SetPrec(0)
-			vm.fsrc.SetFloat64(SubnormalsToZero(ibc.fsrc[HIGH]))
-			vm.fresult.Add(vm.fdst, vm.fsrc)
-			ibc.fdst[HIGH] = SubnormalsToZero(vm.fresult.Float64())
-
-			//ibc.fdst[LOW] = ApplyRoundingMode(ibc.fdst[LOW]+ibc.fsrc[LOW], vm.RoundingMode)
-			//ibc.fdst[HIGH] = ApplyRoundingMode(ibc.fdst[HIGH]+ibc.fsrc[HIGH], vm.RoundingMode)
+			ibc.fdst[LOW] += ibc.fsrc[LOW]
+			ibc.fdst[HIGH] += ibc.fsrc[HIGH]

 			//panic("VM_FADD_R")
 		case VM_FADD_M:
-			//ibc.fdst[LOW] += float64(unsigned32ToSigned2sCompl(vm.Load32(ibc.getScratchpadAddress()+0)))
-			//ibc.fdst[HIGH] += float64(unsigned32ToSigned2sCompl(vm.Load32(ibc.getScratchpadAddress()+4)))
-
-			vm.fresult.SetMode(vm.RoundingMode)
-			vm.fdst.SetPrec(0)
-			vm.fdst.SetFloat64(SubnormalsToZero(ibc.fdst[LOW]))
-			vm.fsrc.SetPrec(0)
-			vm.fsrc.SetFloat64(vm.Load32F(ibc.getScratchpadAddress() + 0))
-			vm.fresult.Add(vm.fdst, vm.fsrc)
-			ibc.fdst[LOW] = SubnormalsToZero(vm.fresult.Float64())
-
-			vm.fresult.SetMode(vm.RoundingMode)
-			vm.fdst.SetPrec(0)
-			vm.fdst.SetFloat64(SubnormalsToZero(ibc.fdst[HIGH]))
-			vm.fsrc.SetPrec(0)
-			vm.fsrc.SetFloat64(vm.Load32F(ibc.getScratchpadAddress() + 4))
-			vm.fresult.Add(vm.fdst, vm.fsrc)
-			ibc.fdst[HIGH] = SubnormalsToZero(vm.fresult.Float64())
-
-			//ibc.fdst[LOW] = ApplyRoundingMode(ibc.fdst[LOW]+vm.Load32F(ibc.getScratchpadAddress()+0), vm.RoundingMode)
-			//ibc.fdst[HIGH] = ApplyRoundingMode(ibc.fdst[HIGH]+vm.Load32F(ibc.getScratchpadAddress()+4), vm.RoundingMode)
+			ibc.fdst[LOW] += vm.Load32F(ibc.getScratchpadAddress() + 0)
+			ibc.fdst[HIGH] += vm.Load32F(ibc.getScratchpadAddress() + 4)

 			//panic("VM_FADD_M")
 		case VM_FSUB_R:
-			//fmt.Printf("Rounding mode %d\n", vm.RoundingMode)
-			//ibc.fdst[LOW] -= ibc.fsrc[LOW]
-			//ibc.fdst[HIGH] -= ibc.fsrc[HIGH]
-
-			vm.fresult.SetMode(vm.RoundingMode)
-			vm.fdst.SetPrec(0)
-			vm.fdst.SetFloat64(SubnormalsToZero(ibc.fdst[LOW]))
-			vm.fsrc.SetPrec(0)
-			vm.fsrc.SetFloat64(SubnormalsToZero(ibc.fsrc[LOW]))
-			vm.fresult.Sub(vm.fdst, vm.fsrc)
-			ibc.fdst[LOW] = SubnormalsToZero(vm.fresult.Float64())
-
-			vm.fresult.SetMode(vm.RoundingMode)
-			vm.fdst.SetPrec(0)
-			vm.fdst.SetFloat64(SubnormalsToZero(ibc.fdst[HIGH]))
-			vm.fsrc.SetPrec(0)
-			vm.fsrc.SetFloat64(SubnormalsToZero(ibc.fsrc[HIGH]))
-			vm.fresult.Sub(vm.fdst, vm.fsrc)
-			ibc.fdst[HIGH] = SubnormalsToZero(vm.fresult.Float64())
-
-			//ibc.fdst[LOW] = ApplyRoundingMode(ibc.fdst[LOW]-ibc.fsrc[LOW], vm.RoundingMode)
-			//ibc.fdst[HIGH] = ApplyRoundingMode(ibc.fdst[HIGH]-ibc.fsrc[HIGH], vm.RoundingMode)
+			ibc.fdst[LOW] -= ibc.fsrc[LOW]
+			ibc.fdst[HIGH] -= ibc.fsrc[HIGH]

 			//fmt.Printf("fdst float %+v\n", ibc.fdst  )
 			//panic("VM_FSUB_R")
 		case VM_FSUB_M:
-			//ibc.fdst[LOW] -= float64(unsigned32ToSigned2sCompl(vm.Load32(ibc.getScratchpadAddress()+0)))
-			//ibc.fdst[HIGH] -= float64(unsigned32ToSigned2sCompl(vm.Load32(ibc.getScratchpadAddress()+4)))
-
-			vm.fresult.SetMode(vm.RoundingMode)
-			vm.fdst.SetPrec(0)
-			vm.fdst.SetFloat64(SubnormalsToZero(ibc.fdst[LOW]))
-			vm.fsrc.SetPrec(0)
-			vm.fsrc.SetFloat64(vm.Load32F(ibc.getScratchpadAddress() + 0))
-			vm.fresult.Sub(vm.fdst, vm.fsrc)
-			ibc.fdst[LOW] = SubnormalsToZero(vm.fresult.Float64())
-
-			vm.fresult.SetMode(vm.RoundingMode)
-			vm.fdst.SetPrec(0)
-			vm.fdst.SetFloat64(SubnormalsToZero(ibc.fdst[HIGH]))
-			vm.fsrc.SetPrec(0)
-			vm.fsrc.SetFloat64(vm.Load32F(ibc.getScratchpadAddress() + 4))
-			vm.fresult.Sub(vm.fdst, vm.fsrc)
-			ibc.fdst[HIGH] = SubnormalsToZero(vm.fresult.Float64())
-
-			//ibc.fdst[LOW] = ApplyRoundingMode(ibc.fdst[LOW]-vm.Load32F(ibc.getScratchpadAddress()+0), vm.RoundingMode)
-			//ibc.fdst[HIGH] = ApplyRoundingMode(ibc.fdst[HIGH]-vm.Load32F(ibc.getScratchpadAddress()+4), vm.RoundingMode)
+			ibc.fdst[LOW] -= vm.Load32F(ibc.getScratchpadAddress() + 0)
+			ibc.fdst[HIGH] -= vm.Load32F(ibc.getScratchpadAddress() + 4)

 			//panic("VM_FSUB_M")
 		case VM_FSCAL_R: // no dependent on rounding modes
 			//mask := math.Float64frombits(0x80F0000000000000)
-			ibc.fdst[LOW] = SubnormalsToZero(math.Float64frombits(math.Float64bits(SubnormalsToZero(ibc.fdst[LOW])) ^ 0x80F0000000000000))
-			ibc.fdst[HIGH] = SubnormalsToZero(math.Float64frombits(math.Float64bits(SubnormalsToZero(ibc.fdst[HIGH])) ^ 0x80F0000000000000))
+			ibc.fdst[LOW] = math.Float64frombits(math.Float64bits(ibc.fdst[LOW]) ^ 0x80F0000000000000)
+			ibc.fdst[HIGH] = math.Float64frombits(math.Float64bits(ibc.fdst[HIGH]) ^ 0x80F0000000000000)

 			//fmt.Printf("fdst float %+v\n", ibc.fdst  )
 			//panic("VM_FSCA_M")
 		case VM_FMUL_R:
+			ibc.fdst[LOW] *= ibc.fsrc[LOW]
+			ibc.fdst[HIGH] *= ibc.fsrc[HIGH]

-			vm.fresult.SetMode(vm.RoundingMode)
-			vm.fdst.SetPrec(0)
-			vm.fdst.SetFloat64(SubnormalsToZero(ibc.fdst[LOW]))
-			vm.fsrc.SetPrec(0)
-			vm.fsrc.SetFloat64(SubnormalsToZero(ibc.fsrc[LOW]))
-			vm.fresult.Mul(vm.fdst, vm.fsrc)
-			ibc.fdst[LOW] = SubnormalsToZero(vm.fresult.Float64())
-
-			vm.fresult.SetMode(vm.RoundingMode)
-			vm.fdst.SetPrec(0)
-			vm.fdst.SetFloat64(SubnormalsToZero(ibc.fdst[HIGH]))
-			vm.fsrc.SetPrec(0)
-			vm.fsrc.SetFloat64(SubnormalsToZero(ibc.fsrc[HIGH]))
-			vm.fresult.Mul(vm.fdst, vm.fsrc)
-			ibc.fdst[HIGH] = SubnormalsToZero(vm.fresult.Float64())
-
-			//panic("VM_FMUK_M")
+			//panic("VM_FMUL_R")
 		case VM_FDIV_M:
-
-			//ibc.fdst[LOW] = ApplyRoundingMode(ibc.fdst[LOW]/MaskRegisterExponentMantissa(vm.Load32F(ibc.getScratchpadAddress()+0), vm.config.eMask[LOW]), vm.RoundingMode)
-			//ibc.fdst[HIGH] = ApplyRoundingMode(ibc.fdst[HIGH]/MaskRegisterExponentMantissa(vm.Load32F(ibc.getScratchpadAddress()+4), vm.config.eMask[HIGH]), vm.RoundingMode)
-
-			//ibc.fdst[LOW] /= lo
-			//ibc.fdst[HIGH] /= high
-
-			vm.fresult.SetMode(vm.RoundingMode)
-			vm.fdst.SetPrec(0)
-			vm.fdst.SetFloat64(SubnormalsToZero(ibc.fdst[LOW]))
-			vm.fsrc.SetPrec(0)
-			vm.fsrc.SetFloat64(MaskRegisterExponentMantissa(vm.Load32F(ibc.getScratchpadAddress()+0), vm.config.eMask[LOW]))
-			vm.fresult.Quo(vm.fdst, vm.fsrc)
-			ibc.fdst[LOW] = SubnormalsToZero(vm.fresult.Float64())
-
-			vm.fresult.SetMode(vm.RoundingMode)
-			vm.fdst.SetPrec(0)
-			vm.fdst.SetFloat64(SubnormalsToZero(ibc.fdst[HIGH]))
-			vm.fsrc.SetPrec(0)
-			vm.fsrc.SetFloat64(MaskRegisterExponentMantissa(vm.Load32F(ibc.getScratchpadAddress()+4), vm.config.eMask[HIGH]))
-			vm.fresult.Quo(vm.fdst, vm.fsrc)
-			ibc.fdst[HIGH] = SubnormalsToZero(vm.fresult.Float64())
+			ibc.fdst[LOW] /= MaskRegisterExponentMantissa(vm.Load32F(ibc.getScratchpadAddress()+0), vm.config.eMask[LOW])
+			ibc.fdst[HIGH] /= MaskRegisterExponentMantissa(vm.Load32F(ibc.getScratchpadAddress()+4), vm.config.eMask[HIGH])

 			//panic("VM_FDIV_M")
 		case VM_FSQRT_R:
-			// ibc.fdst[LOW] = math.Sqrt(ibc.fdst[LOW])
-			// ibc.fdst[HIGH] = math.Sqrt(ibc.fdst[HIGH])
-
-			vm.fresult.SetMode(vm.RoundingMode)
-			vm.fdst.SetPrec(0)
-			vm.fdst.SetFloat64(SubnormalsToZero(ibc.fdst[LOW]))
-			vm.fdst.SetMode(vm.RoundingMode)
-			vm.fresult.Sqrt(vm.fdst)
-			ibc.fdst[LOW] = SubnormalsToZero(vm.fresult.Float64())
-
-			vm.fresult.SetMode(vm.RoundingMode)
-			vm.fdst.SetPrec(0)
-			vm.fdst.SetFloat64(SubnormalsToZero(ibc.fdst[HIGH]))
-			vm.fdst.SetMode(vm.RoundingMode)
-			vm.fresult.Sqrt(vm.fdst)
-			ibc.fdst[HIGH] = SubnormalsToZero(vm.fresult.Float64())
-
-			//ibc.fdst[LOW] = ApplyRoundingMode(math.Sqrt(ibc.fdst[LOW]), vm.RoundingMode)
-			//ibc.fdst[HIGH] = ApplyRoundingMode(math.Sqrt(ibc.fdst[HIGH]), vm.RoundingMode)
+			ibc.fdst[LOW] = math.Sqrt(ibc.fdst[LOW])
+			ibc.fdst[HIGH] = math.Sqrt(ibc.fdst[HIGH])

 			// panic("VM_FSQRT")
 		case VM_CBRANCH:
@ -839,17 +714,7 @@ func (vm *VM) InterpretByteCode() {
 		case VM_CFROUND:

 			tmp := (bits.RotateLeft64(*ibc.isrc, 0-int(ibc.imm))) % 4 // rotate right
-			switch tmp {
-			case 0:
-				vm.RoundingMode = big.ToNearestEven // RoundToNearest
-			case 1:
-				vm.RoundingMode = big.ToNegativeInf // RoundDown
-			case 2:
-				vm.RoundingMode = big.ToPositiveInf // RoundUp
-			case 3:
-				vm.RoundingMode = big.ToZero // RoundToZero
-
-			}
+			fpu.SetRoundingMode(fpu.RoundingMode(tmp))

 			//panic("round not implemented")
 			//panic("VM_CFROUND")