General cleanup, improve load32 and dataset execution

2024-04-12 23:51:55 +02:00 · 2024-04-12 23:51:55 +02:00 · 1bb1da8bbc
parent 244cff31f9
commit 1bb1da8bbc
8 changed files with 117 additions and 178 deletions
--- a/aes/aes_const.go
+++ b/aes/aes_const.go
--- a/aes/aes_hash.go
+++ b/aes/aes_hash.go
@@ -30,7 +30,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 package aes

 import (
-	"encoding/binary"
 	"git.gammaspectra.live/P2Pool/go-randomx/v2/keys"
 	"unsafe"
 )
@@ -48,20 +47,20 @@ import (
 //
 //	Hashing throughput: >20 GiB/s per CPU core with hardware AES
 func HashAes1Rx4(input []byte, output *[64]byte) {
+	if len(input)%64 != 0 {
+		panic("unsupported")
+	}

+	// states are copied
 	states := keys.AesHash1R_State

-	var in [4][4]uint32
 	for input_ptr := 0; input_ptr < len(input); input_ptr += 64 {
-		for i := 0; i < 63; i += 4 { // load 64 bytes
-			in[i/16][(i%16)/4] = binary.LittleEndian.Uint32(input[input_ptr+i:])
-		}
+		in := (*[4][4]uint32)(unsafe.Pointer(unsafe.SliceData(input[input_ptr:])))

 		soft_aesenc(&states[0], &in[0])
 		soft_aesdec(&states[1], &in[1])
 		soft_aesenc(&states[2], &in[2])
 		soft_aesdec(&states[3], &in[3])
-
 	}

 	soft_aesenc(&states[0], &keys.AesHash1R_XKeys[0])
@@ -74,11 +73,7 @@ func HashAes1Rx4(input []byte, output *[64]byte) {
 	soft_aesenc(&states[2], &keys.AesHash1R_XKeys[1])
 	soft_aesdec(&states[3], &keys.AesHash1R_XKeys[1])

-	// write back to state
-	for i := 0; i < 63; i += 4 {
-		binary.LittleEndian.PutUint32(output[i:], states[i/16][(i%16)/4])
-	}
-
+	copy(output[:], (*[64]byte)(unsafe.Pointer(&states))[:])
 }

 // FillAes1Rx4
@@ -110,15 +105,17 @@ func FillAes1Rx4(state *[64]byte, output []byte) {
 }

 // FillAes4Rx4 used to generate final program
-func FillAes4Rx4(state *[64]byte, output []byte) {
-
-	var states [4][4]uint32
-	for i := 0; i < 63; i += 4 {
-		states[i/16][(i%16)/4] = binary.LittleEndian.Uint32(state[i:])
+func FillAes4Rx4(state [64]byte, output []byte) {
+	if len(output)%len(state) != 0 {
+		panic("unsupported")
 	}

-	outptr := 0
-	for ; outptr < len(output); outptr += 64 {
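+	// state is passed by value, so the caller's array is left untouched
+
+	// View the local state in place as a [4][4]uint32 (pointer cast, no extra copy)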
+	states := (*[4][4]uint32)(unsafe.Pointer(&state))
+
+	for outptr := 0; outptr < len(output); outptr += len(state) {
 		soft_aesdec(&states[0], &keys.AesGenerator4R_Keys[0])
 		soft_aesenc(&states[1], &keys.AesGenerator4R_Keys[0])
 		soft_aesdec(&states[2], &keys.AesGenerator4R_Keys[4])
@@ -139,11 +136,7 @@ func FillAes4Rx4(state *[64]byte, output []byte) {
 		soft_aesdec(&states[2], &keys.AesGenerator4R_Keys[7])
 		soft_aesenc(&states[3], &keys.AesGenerator4R_Keys[7])

-		// store bytes to output buffer
-		for i := 0; i < 63; i += 4 {
-			binary.LittleEndian.PutUint32(output[outptr+i:], states[i/16][(i%16)/4])
-		}
-
+		copy(output[outptr:], state[:])
 	}

 }
--- a/cache.go
+++ b/cache.go
@@ -34,6 +34,10 @@ func Randomx_alloc_cache(flags uint64) *Randomx_Cache {
 	}
 }

+func (cache *Randomx_Cache) HasJIT() bool {
+	return cache.Flags&RANDOMX_FLAG_JIT > 0 && cache.JitPrograms[0] != nil
+}
+
 func (cache *Randomx_Cache) VM_Initialize() *VM {

 	return &VM{
@@ -102,34 +106,45 @@ func (cache *Randomx_Cache) InitDatasetItem(rl *RegisterLine, itemNumber uint64)
 	rl[6] = rl[0] ^ keys.SuperScalar_Constants[6]
 	rl[7] = rl[0] ^ keys.SuperScalar_Constants[7]

-	if cache.JitPrograms[0] != nil {
-		for i := 0; i < RANDOMX_CACHE_ACCESSES; i++ {
-			mix := cache.GetMixBlock(registerValue)
+	for i := 0; i < RANDOMX_CACHE_ACCESSES; i++ {
+		mix := cache.GetMixBlock(registerValue)

-			cache.JitPrograms[i].Execute(rl)
+		program := cache.Programs[i]

-			for q := range rl {
-				rl[q] ^= mix[q]
-			}
-
-			registerValue = rl[cache.Programs[i].AddressRegister()]
+		executeSuperscalar(program.Program(), rl)

+		for q := range rl {
+			rl[q] ^= mix[q]
 		}
-	} else {
-		for i := 0; i < RANDOMX_CACHE_ACCESSES; i++ {
-			mix := cache.GetMixBlock(registerValue)

-			program := cache.Programs[i]
+		registerValue = rl[program.AddressRegister()]

-			executeSuperscalar(program.Program(), rl)
+	}
+}

-			for q := range rl {
-				rl[q] ^= mix[q]
-			}
+func (cache *Randomx_Cache) InitDatasetItemJIT(rl *RegisterLine, itemNumber uint64) {
+	registerValue := itemNumber

-			registerValue = rl[program.AddressRegister()]
+	rl[0] = (itemNumber + 1) * keys.SuperScalar_Constants[0]
+	rl[1] = rl[0] ^ keys.SuperScalar_Constants[1]
+	rl[2] = rl[0] ^ keys.SuperScalar_Constants[2]
+	rl[3] = rl[0] ^ keys.SuperScalar_Constants[3]
+	rl[4] = rl[0] ^ keys.SuperScalar_Constants[4]
+	rl[5] = rl[0] ^ keys.SuperScalar_Constants[5]
+	rl[6] = rl[0] ^ keys.SuperScalar_Constants[6]
+	rl[7] = rl[0] ^ keys.SuperScalar_Constants[7]

+	for i := 0; i < RANDOMX_CACHE_ACCESSES; i++ {
+		mix := cache.GetMixBlock(registerValue)
+
+		cache.JitPrograms[i].Execute(rl)
+
+		for q := range rl {
+			rl[q] ^= mix[q]
 		}
+
+		registerValue = rl[cache.Programs[i].AddressRegister()]
+
 	}
 }

--- a/config.go
+++ b/config.go
@@ -85,6 +85,7 @@ const ScratchpadL1Mask16 = (ScratchpadL1/2 - 1) * 16
 const ScratchpadL2Mask16 = (ScratchpadL2/2 - 1) * 16
 const ScratchpadL3Mask = (ScratchpadL3 - 1) * 8
 const ScratchpadL3Mask64 = (ScratchpadL3/8 - 1) * 64
+
 const CONDITIONOFFSET = RANDOMX_JUMP_OFFSET
 const CONDITIONMASK = ((1 << RANDOMX_JUMP_BITS) - 1)
 const STOREL3CONDITION = 14
--- a/dataset_light.go
+++ b/dataset_light.go
@@ -10,7 +10,11 @@ func (d *Randomx_DatasetLight) PrefetchDataset(address uint64) {
 }

 func (d *Randomx_DatasetLight) ReadDataset(address uint64, r, cache *RegisterLine) {
-	d.Cache.InitDatasetItem(cache, address/CacheLineSize)
+	if d.Cache.HasJIT() {
+		d.Cache.InitDatasetItemJIT(cache, address/CacheLineSize)
+	} else {
+		d.Cache.InitDatasetItem(cache, address/CacheLineSize)
+	}

 	for i := range r {
 		r[i] ^= cache[i]
--- a/exec_mmap_unix.go
+++ b/exec_mmap_unix.go
@@ -8,6 +8,9 @@ import (
 )

 func (f ProgramFunc) Execute(rl *RegisterLine) {
+	if f == nil {
+		panic("program is nil")
+	}
 	memoryPtr := &f
 	fun := *(*func(rl *RegisterLine))(unsafe.Pointer(&memoryPtr))

--- a/vm.go
+++ b/vm.go
@@ -44,10 +44,10 @@ type REG struct {
 }

 type VM struct {
-	State_start [64]byte
-	buffer      [RANDOMX_PROGRAM_SIZE*8 + 16*8]byte // first 128 bytes are entropy below rest are program bytes
-	Prog        []byte
-	ScratchPad  [ScratchpadSize]byte
+	StateStart [64]byte
+	buffer     [RANDOMX_PROGRAM_SIZE*8 + 16*8]byte // first 128 bytes are entropy below rest are program bytes
+	Prog       []byte
+	ScratchPad [ScratchpadSize]byte

 	ByteCode [RANDOMX_PROGRAM_SIZE]InstructionByteCode

@@ -71,8 +71,8 @@ func MaskRegisterExponentMantissa(f float64, mode uint64) float64 {
 }

 type Config struct {
-	eMask                                  [2]uint64
-	readReg0, readReg1, readReg2, readReg3 uint64
+	eMask   [2]uint64
+	readReg [4]uint64
 }

 type REGISTER_FILE struct {
@@ -82,15 +82,14 @@ type REGISTER_FILE struct {
 	a [4][2]float64
 }
 type MemoryRegisters struct {
-	mx, ma uint64 //addr_t mx, ma;
-	mempry uint64 //	uint8_t* memory = nullptr;
+	mx, ma uint64
 }

 const LOW = 0
 const HIGH = 1

 // calculate hash based on input
-func (vm *VM) Run(input_hash *[64]byte) {
+func (vm *VM) Run(input_hash [64]byte) {

 	//fmt.Printf("%x \n", input_hash)

@@ -112,14 +111,13 @@ func (vm *VM) Run(input_hash *[64]byte) {

 	vm.mem.ma = vm.entropy[8] & CacheLineAlignMask
 	vm.mem.mx = vm.entropy[10]
+
 	addressRegisters := vm.entropy[12]
-	vm.config.readReg0 = 0 + (addressRegisters & 1)
-	addressRegisters >>= 1
-	vm.config.readReg1 = 2 + (addressRegisters & 1)
-	addressRegisters >>= 1
-	vm.config.readReg2 = 4 + (addressRegisters & 1)
-	addressRegisters >>= 1
-	vm.config.readReg3 = 6 + (addressRegisters & 1)
+	for i := range vm.config.readReg {
+		vm.config.readReg[i] = uint64(i*2) + (addressRegisters & 1)
+		addressRegisters >>= 1
+	}
+
 	vm.datasetOffset = (vm.entropy[13] % (DATASETEXTRAITEMS + 1)) * CacheLineSize
 	vm.config.eMask[LOW] = getFloatMask(vm.entropy[14])
 	vm.config.eMask[HIGH] = getFloatMask(vm.entropy[15])
@@ -134,7 +132,7 @@ func (vm *VM) Run(input_hash *[64]byte) {
 	var rlCache RegisterLine

 	for ic := 0; ic < RANDOMX_PROGRAM_ITERATIONS; ic++ {
-		spMix := vm.reg.r[vm.config.readReg0] ^ vm.reg.r[vm.config.readReg1]
+		spMix := vm.reg.r[vm.config.readReg[0]] ^ vm.reg.r[vm.config.readReg[1]]

 		spAddr0 ^= spMix
 		spAddr0 &= ScratchpadL3Mask64
@@ -146,21 +144,20 @@ func (vm *VM) Run(input_hash *[64]byte) {
 		}

 		for i := uint64(0); i < REGISTERCOUNTFLT; i++ {
-			vm.reg.f[i][LOW] = vm.Load32F(spAddr1 + 8*i)
-			vm.reg.f[i][HIGH] = vm.Load32F(spAddr1 + 8*i + 4)
+			vm.reg.f[i] = vm.Load32FA(spAddr1 + 8*i)
 		}

 		for i := uint64(0); i < REGISTERCOUNTFLT; i++ {
-			vm.reg.e[i][LOW] = vm.Load32F(spAddr1 + 8*(i+REGISTERCOUNTFLT))
-			vm.reg.e[i][HIGH] = vm.Load32F(spAddr1 + 8*(i+REGISTERCOUNTFLT) + 4)
+			vm.reg.e[i] = vm.Load32FA(spAddr1 + 8*(i+REGISTERCOUNTFLT))

 			vm.reg.e[i][LOW] = MaskRegisterExponentMantissa(vm.reg.e[i][LOW], vm.config.eMask[LOW])
 			vm.reg.e[i][HIGH] = MaskRegisterExponentMantissa(vm.reg.e[i][HIGH], vm.config.eMask[HIGH])
 		}

+		// todo: pass register file directly!
 		vm.InterpretByteCode()

-		vm.mem.mx ^= vm.reg.r[vm.config.readReg2] ^ vm.reg.r[vm.config.readReg3]
+		vm.mem.mx ^= vm.reg.r[vm.config.readReg[2]] ^ vm.reg.r[vm.config.readReg[3]]
 		vm.mem.mx &= CacheLineAlignMask

 		vm.Dataset.PrefetchDataset(vm.datasetOffset + vm.mem.mx)
@@ -214,7 +211,7 @@ func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
 	hash512, _ := blake2b.New512(nil)

 	for chain := 0; chain < RANDOMX_PROGRAM_COUNT-1; chain++ {
-		vm.Run(&tempHash)
+		vm.Run(tempHash)

 		hash512.Reset()
 		for i := range vm.reg.r {
@@ -247,7 +244,7 @@ func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
 	}

 	// final loop executes here
-	vm.Run(&tempHash)
+	vm.Run(tempHash)

 	// now hash the scratch pad and place into register a
 	aes.HashAes1Rx4(vm.ScratchPad[:], &tempHash)
--- a/vm_instruction.go
+++ b/vm_instruction.go
@@ -30,11 +30,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 package randomx

 import (
-	"fmt"
 	"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
+	"math"
+	"math/bits"
+	"unsafe"
 )
-import "math"
-import "math/bits"
 import "encoding/binary"

 //reference https://github.com/tevador/RandomX/blob/master/doc/specs.md#51-instruction-encoding
@@ -156,11 +156,11 @@ func (vm *VM) Compile_TO_Bytecode() {
 			ibc.idst = &vm.reg.r[dst]
 			if dst != RegisterNeedsDisplacement {
 				ibc.isrc = &vm.reg.r[src]
-				ibc.shift = uint16((instr.Mod() >> 2) % 4)
+				ibc.shift = (instr.Mod() >> 2) % 4
 				ibc.imm = 0
 			} else {
 				ibc.isrc = &vm.reg.r[src]
-				ibc.shift = uint16((instr.Mod() >> 2) % 4)
+				ibc.shift = (instr.Mod() >> 2) % 4
 				ibc.imm = signExtend2sCompl(instr.IMM())
 			}
 			registerUsage[dst] = i
@@ -534,7 +534,7 @@ type InstructionByteCode struct {
 	simm       int64
 	Opcode     VM_Instruction_Type
 	target     int16
-	shift      uint16
+	shift      uint8
 	memMask    uint32
 	/*
 		union {
@@ -563,179 +563,105 @@ func (ibc *InstructionByteCode) getScratchpadAddress() uint64 {
 	return (*ibc.isrc + ibc.imm) & uint64(ibc.memMask)
 }

-func (vm *VM) Load64(addr uint64) uint64 {
-	return binary.LittleEndian.Uint64(vm.ScratchPad[addr:])
-}
-func (vm *VM) Load32(addr uint64) uint32 {
-	return binary.LittleEndian.Uint32(vm.ScratchPad[addr:])
+func (ibc *InstructionByteCode) getScratchpadDestAddress() uint64 {
+	return (*ibc.idst + ibc.imm) & uint64(ibc.memMask)
 }

-func (vm *VM) Load32F(addr uint64) float64 {
-	return float64(int32(vm.Load32(addr)))
+func (vm *VM) Load64(addr uint64) uint64 {
+	return *(*uint64)(unsafe.Pointer(&vm.ScratchPad[addr]))
+}
+func (vm *VM) Load32(addr uint64) uint32 {
+	return *(*uint32)(unsafe.Pointer(&vm.ScratchPad[addr]))
+}
+
+func (vm *VM) Load32F(addr uint64) (lo, hi float64) {
+	a := *(*[2]int32)(unsafe.Pointer(&vm.ScratchPad[addr]))
+	return float64(a[LOW]), float64(a[HIGH])
+}
+
+func (vm *VM) Load32FA(addr uint64) [2]float64 {
+	a := *(*[2]int32)(unsafe.Pointer(&vm.ScratchPad[addr]))
+	return [2]float64{float64(a[LOW]), float64(a[HIGH])}
 }

 func (vm *VM) InterpretByteCode() {
-
 	for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
-
 		ibc := &vm.ByteCode[pc]
-		//fmt.Printf("PCLOOP %d opcode %d  %s  dst %d src %d\n",pc,ibc.Opcode, Names[ibc.Opcode], ibc.dst, ibc.src)
-
 		switch ibc.Opcode {
 		case VM_IADD_RS:
-
 			*ibc.idst += (*ibc.isrc << ibc.shift) + ibc.imm
-
-			//panic("VM_IADD_RS")
 		case VM_IADD_M:
 			*ibc.idst += vm.Load64(ibc.getScratchpadAddress())
-
-			//panic("VM_IADD_M")
 		case VM_ISUB_R:
 			*ibc.idst -= *ibc.isrc
-
-			//panic("VM_ISUB_R")
-
 		case VM_ISUB_M:
-
 			*ibc.idst -= vm.Load64(ibc.getScratchpadAddress())
-
-			//panic("VM_ISUB_M")
-		case VM_IMUL_R: // also handles imul_rcp
-
+		case VM_IMUL_R:
+			// also handles imul_rcp
 			*ibc.idst *= *ibc.isrc
-
-			//panic("VM_IMUL_R")
 		case VM_IMUL_M:
 			*ibc.idst *= vm.Load64(ibc.getScratchpadAddress())
-
-			//panic("VM_IMUL_M")
 		case VM_IMULH_R:
-
 			*ibc.idst, _ = bits.Mul64(*ibc.idst, *ibc.isrc)
-
-			// panic("VM_IMULH_R")
 		case VM_IMULH_M:
 			*ibc.idst, _ = bits.Mul64(*ibc.idst, vm.Load64(ibc.getScratchpadAddress()))
-			// fmt.Printf("%x \n",*ibc.idst )
-			// panic("VM_IMULH_M")
 		case VM_ISMULH_R:
-			*ibc.idst = uint64(smulh(int64(*ibc.idst), int64(*ibc.isrc)))
-			// fmt.Printf("dst %x\n", *ibc.idst)
-			// panic("VM_ISMULH_R")
+			*ibc.idst = smulh(int64(*ibc.idst), int64(*ibc.isrc))
 		case VM_ISMULH_M:
-			*ibc.idst = uint64(smulh(int64(*ibc.idst), int64(vm.Load64(ibc.getScratchpadAddress()))))
-			//fmt.Printf("%x \n",*ibc.idst )
-			// panic("VM_ISMULH_M")
+			*ibc.idst = smulh(int64(*ibc.idst), int64(vm.Load64(ibc.getScratchpadAddress())))
 		case VM_INEG_R:
 			*ibc.idst = (^(*ibc.idst)) + 1 // 2's complement negative
-
-			//panic("VM_INEG_R")
 		case VM_IXOR_R:
 			*ibc.idst ^= *ibc.isrc
-
 		case VM_IXOR_M:
 			*ibc.idst ^= vm.Load64(ibc.getScratchpadAddress())
-
-			//panic("VM_IXOR_M")
 		case VM_IROR_R:
 			*ibc.idst = bits.RotateLeft64(*ibc.idst, 0-int(*ibc.isrc&63))
-
-			//panic("VM_IROR_R")
-
 		case VM_IROL_R:
 			*ibc.idst = bits.RotateLeft64(*ibc.idst, int(*ibc.isrc&63))
-
 		case VM_ISWAP_R:
 			*ibc.idst, *ibc.isrc = *ibc.isrc, *ibc.idst
-			//fmt.Printf("%x  %x\n",*ibc.idst, *ibc.isrc )
-			//panic("VM_ISWAP_R")
 		case VM_FSWAP_R:
-			//TODO: could be F+E
-
 			ibc.fdst[HIGH], ibc.fdst[LOW] = ibc.fdst[LOW], ibc.fdst[HIGH]
-		//	fmt.Printf("%+v \n",ibc.fdst )
-		//	panic("VM_FSWAP_R")
 		case VM_FADD_R:
 			ibc.fdst[LOW] += ibc.fsrc[LOW]
 			ibc.fdst[HIGH] += ibc.fsrc[HIGH]
-
-			//panic("VM_FADD_R")
 		case VM_FADD_M:
-			ibc.fdst[LOW] += vm.Load32F(ibc.getScratchpadAddress() + 0)
-			ibc.fdst[HIGH] += vm.Load32F(ibc.getScratchpadAddress() + 4)
-
-			//panic("VM_FADD_M")
+			lo, hi := vm.Load32F(ibc.getScratchpadAddress())
+			ibc.fdst[LOW] += lo
+			ibc.fdst[HIGH] += hi
 		case VM_FSUB_R:
 			ibc.fdst[LOW] -= ibc.fsrc[LOW]
 			ibc.fdst[HIGH] -= ibc.fsrc[HIGH]
-
-			//fmt.Printf("fdst float %+v\n", ibc.fdst  )
-			//panic("VM_FSUB_R")
 		case VM_FSUB_M:
-			ibc.fdst[LOW] -= vm.Load32F(ibc.getScratchpadAddress() + 0)
-			ibc.fdst[HIGH] -= vm.Load32F(ibc.getScratchpadAddress() + 4)
-
-			//panic("VM_FSUB_M")
-		case VM_FSCAL_R: // no dependent on rounding modes
-			//mask := math.Float64frombits(0x80F0000000000000)
+			lo, hi := vm.Load32F(ibc.getScratchpadAddress())
+			ibc.fdst[LOW] -= lo
+			ibc.fdst[HIGH] -= hi
+		case VM_FSCAL_R:
+			// not dependent on rounding modes
 			ibc.fdst[LOW] = math.Float64frombits(math.Float64bits(ibc.fdst[LOW]) ^ 0x80F0000000000000)
 			ibc.fdst[HIGH] = math.Float64frombits(math.Float64bits(ibc.fdst[HIGH]) ^ 0x80F0000000000000)
-
-			//fmt.Printf("fdst float %+v\n", ibc.fdst  )
-			//panic("VM_FSCA_M")
 		case VM_FMUL_R:
 			ibc.fdst[LOW] *= ibc.fsrc[LOW]
 			ibc.fdst[HIGH] *= ibc.fsrc[HIGH]
-
-			//panic("VM_FMUL_R")
 		case VM_FDIV_M:
-			ibc.fdst[LOW] /= MaskRegisterExponentMantissa(vm.Load32F(ibc.getScratchpadAddress()+0), vm.config.eMask[LOW])
-			ibc.fdst[HIGH] /= MaskRegisterExponentMantissa(vm.Load32F(ibc.getScratchpadAddress()+4), vm.config.eMask[HIGH])
-
-			//panic("VM_FDIV_M")
+			lo, hi := vm.Load32F(ibc.getScratchpadAddress())
+			ibc.fdst[LOW] /= MaskRegisterExponentMantissa(lo, vm.config.eMask[LOW])
+			ibc.fdst[HIGH] /= MaskRegisterExponentMantissa(hi, vm.config.eMask[HIGH])
 		case VM_FSQRT_R:
 			ibc.fdst[LOW] = math.Sqrt(ibc.fdst[LOW])
 			ibc.fdst[HIGH] = math.Sqrt(ibc.fdst[HIGH])
-
-			// panic("VM_FSQRT")
 		case VM_CBRANCH:
-			//fmt.Printf("pc %d  src  %x   imm %x\n",pc ,*ibc.isrc,  ibc.imm)
 			*ibc.isrc += ibc.imm
-			//fmt.Printf("pc %d\n",pc)
 			if (*ibc.isrc & uint64(ibc.memMask)) == 0 {
 				pc = int(ibc.target)
-
 			}
-
-			// fmt.Printf("pc %d\n",pc)
-			//panic("VM_CBRANCH")
 		case VM_CFROUND:
-
 			tmp := (bits.RotateLeft64(*ibc.isrc, 0-int(ibc.imm))) % 4 // rotate right
 			asm.SetRoundingMode(asm.RoundingMode(tmp))
-
-			//panic("round not implemented")
-			//panic("VM_CFROUND")
 		case VM_ISTORE:
 			binary.LittleEndian.PutUint64(vm.ScratchPad[(*ibc.idst+ibc.imm)&uint64(ibc.memMask):], *ibc.isrc)
-
-			//panic("VM_ISTOREM")
-
 		case VM_NOP: // we do nothing
-
-		default:
-			panic("instruction not implemented")
-
 		}
-		/*fmt.Printf("REGS ")
-		for j := 0; j <7;j++ {
-			fmt.Printf("%16x, " , vm.reg.r[j])
-		}
-		fmt.Printf("\n")
-		*/
-
 	}
 }
-
-var umm888_ = fmt.Sprintf("")