From d20dd880cebca048011076865d3db44ec771df71 Mon Sep 17 00:00:00 2001 From: WeebDataHoarder <57538841+WeebDataHoarder@users.noreply.github.com> Date: Thu, 18 Apr 2024 12:09:05 +0200 Subject: [PATCH] amd64: Implemented VM JIT --- README.md | 27 ++-- aes/round_amd64.go | 29 ++-- asm/aes_amd64.s | 78 +++++----- asm/cpuid_amd64.go | 7 + asm/cpuid_amd64.s | 34 ++++ cache.go | 9 +- config.go | 14 +- dataset.go | 1 + dataset_light.go | 4 + exec.go | 2 + exec_generic.go | 21 +++ exec_mmap_unix.go | 52 ++++++- jit_amd64.go | 176 ++++++++++++++++++++- jit_generic.go | 5 + randomx_test.go | 3 + superscalar.go | 3 + superscalar_jit_amd64.go | 18 +-- superscalar_jit_amd64.s | 1 + vm.go | 30 +++- vm_bytecode.go | 2 +- vm_bytecode_jit_amd64.go | 312 +++++++++++++++++++++++++++++++++++++ vm_bytecode_jit_amd64.s | 91 +++++++++++ vm_bytecode_jit_generic.go | 11 ++ vm_bytecode_native.go | 6 +- vm_bytecode_purego.go | 6 +- vm_instruction.go | 10 +- 26 files changed, 849 insertions(+), 103 deletions(-) create mode 100644 asm/cpuid_amd64.go create mode 100644 asm/cpuid_amd64.s create mode 100644 jit_generic.go create mode 100644 vm_bytecode_jit_amd64.go create mode 100644 vm_bytecode_jit_amd64.s create mode 100644 vm_bytecode_jit_generic.go diff --git a/README.md b/README.md index 884dd3f..8d2eace 100644 --- a/README.md +++ b/README.md @@ -8,21 +8,22 @@ This package implements RandomX without CGO, using only Golang code, pure float6 All test cases pass properly. -Uses minimal Go assembly due to having to set rounding mode natively. Native hard float can be added with supporting rounding mode under _asm_. - JIT is supported on a few platforms but can be hard-disabled via the `disable_jit` build flag, or at runtime. A pure Golang implementation can be used on platforms without hard float support or via the `purego` build flag manually. -| Platform | Supported | Hard Float | SuperScalar JIT | Notes | -|:-----------:|:---------:|:----------:|:---------------:|:----------------:| -| **386** | ✅ | ✅ | ❌ | | -| **amd64** | ✅ | ✅ | ✅* | JIT only on Unix | -| **arm** | ✅* | ❌ | ❌ | | -| **arm64** | ✅ | ✅ | ❌ | | -| **mips** | ✅* | ❌ | ❌ | | -| **mips64** | ✅* | ❌ | ❌ | | -| **riscv64** | ✅* | ❌ | ❌ | | -| **wasm** | ✅* | ❌ | ❌ | | +| Platform | Hard Float | Hard AES | JIT | Native | purego | Notes | +|:-----------:|:----------:|:--------:|:---:|:------:|:------:|:----------------:| +| **386** | ✅ | ❌ | ❌ | ✅ | ✅ | | +| **amd64** | ✅ | ✅ | ✅* | ✅ | ✅ | JIT only on Unix | +| **arm** | ❌ | ❌ | ❌ | ❌ | ✅ | | +| **arm64** | ✅ | ❌ | ❌ | ✅ | ✅ | | +| **mips** | ❌ | ❌ | ❌ | ❌ | ✅ | | +| **mips64** | ❌ | ❌ | ❌ | ❌ | ✅ | | +| **riscv64** | ❌ | ❌ | ❌ | ❌ | ✅ | | +| **wasm** | ❌ | ❌ | ❌ | ❌ | ✅ | | -* these platforms only support software floating point / purego and will not be performant. \ No newline at end of file + + Any platform with no hard float support (soft float via [softfloat64](https://git.gammaspectra.live/P2Pool/softfloat64)) will be vastly slower. + + Native hard float, with rounding mode support, can be added under _asm_. 
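For orientation, a minimal usage sketch of the API this patch touches, pieced together from the tests further down. The cache constructor and key-initialization names (`Randomx_alloc_cache`, `Init`) are assumptions not shown in this diff; `VM_Initialize`, `CalculateHash`, and the new `Close` are.

package main

import (
	"fmt"

	randomx "git.gammaspectra.live/P2Pool/go-randomx/v2"
)

func main() {
	// RANDOMX_FLAG_JIT enables the new VM JIT on supported platforms; adding
	// RANDOMX_FLAG_SECURE keeps W^X by re-protecting the JIT buffer around each
	// code generation (see cache.go / vm.go in this patch).
	// Randomx_alloc_cache and Init are assumed entry points, not shown here.
	c := randomx.Randomx_alloc_cache(randomx.RANDOMX_FLAG_JIT | randomx.RANDOMX_FLAG_SECURE)
	defer c.Close()
	c.Init([]byte("test key 000"))

	vm := c.VM_Initialize()
	defer vm.Close() // releases the mmap'd JIT buffer added by this patch

	var out [32]byte
	vm.CalculateHash([]byte("This is a test"), &out)
	fmt.Printf("%x\n", out)
}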
\ No newline at end of file diff --git a/aes/round_amd64.go b/aes/round_amd64.go index 8363096..c63f6ff 100644 --- a/aes/round_amd64.go +++ b/aes/round_amd64.go @@ -8,18 +8,23 @@ import ( _ "unsafe" ) +//go:noescape //go:linkname hard_aesdec git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesdec func hard_aesdec(state *[4]uint32, key *[4]uint32) +//go:noescape //go:linkname hard_aesenc git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesenc func hard_aesenc(state *[4]uint32, key *[4]uint32) +//go:noescape //go:linkname hard_aesroundtrip_decenc git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesroundtrip_decenc func hard_aesroundtrip_decenc(states *[4][4]uint32, keys *[4][4]uint32) +//go:noescape //go:linkname hard_aesroundtrip_encdec git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesroundtrip_encdec func hard_aesroundtrip_encdec(states *[4][4]uint32, keys *[4][4]uint32) +//go:noescape //go:linkname hard_aesroundtrip_encdec1 git.gammaspectra.live/P2Pool/go-randomx/v2/asm.aesroundtrip_encdec1 func hard_aesroundtrip_encdec1(states *[4][4]uint32, key *[4]uint32) @@ -45,10 +50,10 @@ func aesroundtrip_decenc(states *[4][4]uint32, keys *[4][4]uint32) { if supportsAES { hard_aesroundtrip_decenc(states, keys) } else { - aesdec(&states[0], &keys[0]) - aesenc(&states[1], &keys[1]) - aesdec(&states[2], &keys[2]) - aesenc(&states[3], &keys[3]) + soft_aesdec(&states[0], &keys[0]) + soft_aesenc(&states[1], &keys[1]) + soft_aesdec(&states[2], &keys[2]) + soft_aesenc(&states[3], &keys[3]) } } @@ -56,10 +61,10 @@ func aesroundtrip_encdec(states *[4][4]uint32, keys *[4][4]uint32) { if supportsAES { hard_aesroundtrip_encdec(states, keys) } else { - aesenc(&states[0], &keys[0]) - aesdec(&states[1], &keys[1]) - aesenc(&states[2], &keys[2]) - aesdec(&states[3], &keys[3]) + soft_aesenc(&states[0], &keys[0]) + soft_aesdec(&states[1], &keys[1]) + soft_aesenc(&states[2], &keys[2]) + soft_aesdec(&states[3], &keys[3]) } } @@ -67,9 +72,9 @@ func aesroundtrip_encdec1(states *[4][4]uint32, key *[4]uint32) { if supportsAES { hard_aesroundtrip_encdec1(states, key) } else { - aesenc(&states[0], key) - aesdec(&states[1], key) - aesenc(&states[2], key) - aesdec(&states[3], key) + soft_aesenc(&states[0], key) + soft_aesdec(&states[1], key) + soft_aesenc(&states[2], key) + soft_aesdec(&states[3], key) } } diff --git a/asm/aes_amd64.s b/asm/aes_amd64.s index 3c161fe..40ac125 100644 --- a/asm/aes_amd64.s +++ b/asm/aes_amd64.s @@ -5,43 +5,43 @@ TEXT ·aesenc(SB),NOSPLIT|NOFRAME,$0-16 MOVQ state+0(FP), AX MOVQ key+8(FP), BX - MOVUPS 0(AX), X0 - MOVUPS 0(BX), X1 + VMOVDQU32 0(AX), X0 + VMOVDQU32 0(BX), X1 AESENC X1, X0 - MOVUPS X0, 0(AX) + VMOVDQU32 X0, 0(AX) RET TEXT ·aesdec(SB),NOSPLIT|NOFRAME,$0-16 MOVQ state+0(FP), AX MOVQ key+8(FP), BX - MOVUPS 0(AX), X0 - MOVUPS 0(BX), X1 + VMOVDQU32 0(AX), X0 + VMOVDQU32 0(BX), X1 AESDEC X1, X0 - MOVUPS X0, 0(AX) + VMOVDQU32 X0, 0(AX) RET TEXT ·aesroundtrip_decenc(SB),NOSPLIT|NOFRAME,$0-16 MOVQ states+0(FP), AX MOVQ keys+8(FP), BX - MOVUPS 0(AX), X0 - MOVUPS 0(BX), X1 - MOVUPS 16(AX), X2 - MOVUPS 16(BX), X3 - MOVUPS 32(AX), X4 - MOVUPS 32(BX), X5 - MOVUPS 48(AX), X6 - MOVUPS 48(BX), X7 + VMOVDQU32 0(AX), X0 + VMOVDQU32 0(BX), X1 + VMOVDQU32 16(AX), X2 + VMOVDQU32 16(BX), X3 + VMOVDQU32 32(AX), X4 + VMOVDQU32 32(BX), X5 + VMOVDQU32 48(AX), X6 + VMOVDQU32 48(BX), X7 AESDEC X1, X0 AESENC X3, X2 AESDEC X5, X4 AESENC X7, X6 - MOVUPS X0, 0(AX) - MOVUPS X2, 16(AX) - MOVUPS X4, 32(AX) - MOVUPS X6, 48(AX) + VMOVDQU32 X0, 0(AX) + VMOVDQU32 X2, 16(AX) + VMOVDQU32 X4, 32(AX) + VMOVDQU32 X6, 48(AX) RET @@ 
-49,24 +49,24 @@ TEXT ·aesroundtrip_encdec(SB),NOSPLIT|NOFRAME,$0-16 MOVQ states+0(FP), AX MOVQ keys+8(FP), BX - MOVUPS 0(AX), X0 - MOVUPS 0(BX), X1 - MOVUPS 16(AX), X2 - MOVUPS 16(BX), X3 - MOVUPS 32(AX), X4 - MOVUPS 32(BX), X5 - MOVUPS 48(AX), X6 - MOVUPS 48(BX), X7 + VMOVDQU32 0(AX), X0 + VMOVDQU32 0(BX), X1 + VMOVDQU32 16(AX), X2 + VMOVDQU32 16(BX), X3 + VMOVDQU32 32(AX), X4 + VMOVDQU32 32(BX), X5 + VMOVDQU32 48(AX), X6 + VMOVDQU32 48(BX), X7 AESENC X1, X0 AESDEC X3, X2 AESENC X5, X4 AESDEC X7, X6 - MOVUPS X0, 0(AX) - MOVUPS X2, 16(AX) - MOVUPS X4, 32(AX) - MOVUPS X6, 48(AX) + VMOVDQU32 X0, 0(AX) + VMOVDQU32 X2, 16(AX) + VMOVDQU32 X4, 32(AX) + VMOVDQU32 X6, 48(AX) RET @@ -74,20 +74,20 @@ TEXT ·aesroundtrip_encdec1(SB),NOSPLIT|NOFRAME,$0-16 MOVQ states+0(FP), AX MOVQ key+8(FP), BX - MOVUPS 0(BX), X0 - MOVUPS 0(AX), X1 - MOVUPS 16(AX), X2 - MOVUPS 32(AX), X3 - MOVUPS 48(AX), X4 + VMOVDQU32 0(BX), X0 + VMOVDQU32 0(AX), X1 + VMOVDQU32 16(AX), X2 + VMOVDQU32 32(AX), X3 + VMOVDQU32 48(AX), X4 AESENC X0, X1 AESDEC X0, X2 AESENC X0, X3 AESDEC X0, X4 - MOVUPS X1, 0(AX) - MOVUPS X2, 16(AX) - MOVUPS X3, 32(AX) - MOVUPS X4, 48(AX) + VMOVDQU32 X1, 0(AX) + VMOVDQU32 X2, 16(AX) + VMOVDQU32 X3, 32(AX) + VMOVDQU32 X4, 48(AX) RET diff --git a/asm/cpuid_amd64.go b/asm/cpuid_amd64.go new file mode 100644 index 0000000..494e896 --- /dev/null +++ b/asm/cpuid_amd64.go @@ -0,0 +1,7 @@ +//go:build amd64 && !purego + +package asm + +func Cpuid(op uint32) (eax, ebx, ecx, edx uint32) +func Cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +func Xgetbv(index uint32) (eax, edx uint32) diff --git a/asm/cpuid_amd64.s b/asm/cpuid_amd64.s new file mode 100644 index 0000000..aa5073d --- /dev/null +++ b/asm/cpuid_amd64.s @@ -0,0 +1,34 @@ +//go:build amd64 && !purego + +#include "textflag.h" + +// func Cpuid(op uint32) (eax, ebx, ecx, edx uint32) +TEXT ·Cpuid(SB), 7, $0 + XORQ CX, CX + MOVL op+0(FP), AX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + + +// func Cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) +TEXT ·Cpuidex(SB), 7, $0 + MOVL op+0(FP), AX + MOVL op2+4(FP), CX + CPUID + MOVL AX, eax+8(FP) + MOVL BX, ebx+12(FP) + MOVL CX, ecx+16(FP) + MOVL DX, edx+20(FP) + RET + +// func xgetbv(index uint32) (eax, edx uint32) +TEXT ·Xgetbv(SB), 7, $0 + MOVL index+0(FP), CX + BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV + MOVL AX, eax+8(FP) + MOVL DX, edx+12(FP) + RET diff --git a/cache.go b/cache.go index 67b65cf..7bd43e2 100644 --- a/cache.go +++ b/cache.go @@ -40,11 +40,18 @@ func (cache *Randomx_Cache) HasJIT() bool { func (cache *Randomx_Cache) VM_Initialize() *VM { - return &VM{ + vm := &VM{ Dataset: &Randomx_DatasetLight{ Cache: cache, }, } + if cache.HasJIT() { + vm.JITProgram = mapProgram(nil, int(RandomXCodeSize)) + if cache.Flags&RANDOMX_FLAG_SECURE == 0 { + mapProgramRWX(vm.JITProgram) + } + } + return vm } func (cache *Randomx_Cache) Close() error { diff --git a/config.go b/config.go index 7fe7c20..92aa9a2 100644 --- a/config.go +++ b/config.go @@ -106,8 +106,18 @@ const CONDITIONOFFSET = RANDOMX_JUMP_OFFSET const CONDITIONMASK = (1 << RANDOMX_JUMP_BITS) - 1 const STOREL3CONDITION = 14 -const RANDOMX_FLAG_DEFAULT = uint64(0) -const RANDOMX_FLAG_JIT = uint64(1 << iota) +const RANDOMX_FLAG_DEFAULT = 0 + +const ( + RANDOMX_FLAG_LARGE_PAGES = 1 << iota + RANDOMX_FLAG_HARD_AES + RANDOMX_FLAG_FULL_MEM + RANDOMX_FLAG_JIT + RANDOMX_FLAG_SECURE + RANDOMX_FLAG_ARGON2_SSSE3 + RANDOMX_FLAG_ARGON2_AVX2 + RANDOMX_FLAG_ARGON2 +) func isZeroOrPowerOf2(x uint32) bool { 
return (x & (x - 1)) == 0 diff --git a/dataset.go b/dataset.go index 3614177..a642f9d 100644 --- a/dataset.go +++ b/dataset.go @@ -4,4 +4,5 @@ type Randomx_Dataset interface { InitDataset(startItem, endItem uint64) ReadDataset(address uint64, r, cache *RegisterLine) PrefetchDataset(address uint64) + Flags() uint64 } diff --git a/dataset_light.go b/dataset_light.go index 21cb681..5a88d92 100644 --- a/dataset_light.go +++ b/dataset_light.go @@ -21,6 +21,10 @@ func (d *Randomx_DatasetLight) ReadDataset(address uint64, r, cache *RegisterLin } } +func (d *Randomx_DatasetLight) Flags() uint64 { + return d.Cache.Flags +} + func (d *Randomx_DatasetLight) InitDataset(startItem, endItem uint64) { //d.Cache.initDataset(d.Cache.Programs) } diff --git a/exec.go b/exec.go index c10b7b5..be707f3 100644 --- a/exec.go +++ b/exec.go @@ -1,3 +1,5 @@ package randomx type SuperScalarProgramFunc []byte + +type VMProgramFunc []byte diff --git a/exec_generic.go b/exec_generic.go index 00d14aa..b9ab294 100644 --- a/exec_generic.go +++ b/exec_generic.go @@ -5,3 +5,24 @@ package randomx func (f SuperScalarProgramFunc) Close() error { return nil } + +func (f VMProgramFunc) Close() error { + return nil +} + +func mapProgram(program []byte, size int) []byte { + return nil +} + +func mapProgramRW(execFunc []byte) { + +} + +func mapProgramRX(execFunc []byte) { + +} + +// mapProgramRWX insecure! +func mapProgramRWX(execFunc []byte) { + +} diff --git a/exec_mmap_unix.go b/exec_mmap_unix.go index 5471cd5..f20a95f 100644 --- a/exec_mmap_unix.go +++ b/exec_mmap_unix.go @@ -9,10 +9,56 @@ import ( func (f SuperScalarProgramFunc) Close() error { return unix.Munmap(f) } +func (f VMProgramFunc) Close() error { + return unix.Munmap(f) +} -func mapProgram(program []byte) []byte { - // Write only - execFunc, err := unix.Mmap(-1, 0, len(program), unix.PROT_WRITE, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS) +func mapProgramRW(execFunc []byte) { + err := unix.Mprotect(execFunc, unix.PROT_READ|unix.PROT_WRITE) + if err != nil { + defer func() { + // unmap if we err + err := unix.Munmap(execFunc) + if err != nil { + panic(err) + } + }() + panic(err) + } +} + +func mapProgramRX(execFunc []byte) { + err := unix.Mprotect(execFunc, unix.PROT_READ|unix.PROT_EXEC) + if err != nil { + defer func() { + // unmap if we err + err := unix.Munmap(execFunc) + if err != nil { + panic(err) + } + }() + panic(err) + } +} + +// mapProgramRWX insecure! 
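The mprotect helpers above implement the usual W^X JIT lifecycle. A condensed standalone sketch of that flow, using only the x/sys/unix calls this file already imports (illustration, not part of the patch):

package main

import "golang.org/x/sys/unix"

// writeThenExec mirrors mapProgram + mapProgramRX: allocate anonymous RW
// memory, emit code into it, then flip it to RX before it is ever called.
func writeThenExec(code []byte) []byte {
	buf, err := unix.Mmap(-1, 0, len(code),
		unix.PROT_READ|unix.PROT_WRITE,
		unix.MAP_PRIVATE|unix.MAP_ANONYMOUS)
	if err != nil {
		panic(err)
	}
	copy(buf, code)
	// never writable and executable at the same time
	if err := unix.Mprotect(buf, unix.PROT_READ|unix.PROT_EXEC); err != nil {
		_ = unix.Munmap(buf)
		panic(err)
	}
	return buf // caller executes, then unix.Munmap(buf) to release
}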
+func mapProgramRWX(execFunc []byte) { + err := unix.Mprotect(execFunc, unix.PROT_READ|unix.PROT_WRITE|unix.PROT_EXEC) + if err != nil { + defer func() { + // unmap if we err + err := unix.Munmap(execFunc) + if err != nil { + panic(err) + } + }() + panic(err) + } +} + +func mapProgram(program []byte, size int) []byte { + // Read and Write only + execFunc, err := unix.Mmap(-1, 0, max(size, len(program)), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS) if err != nil { panic(err) } diff --git a/jit_amd64.go b/jit_amd64.go index 96fe7c7..cc9b4a2 100644 --- a/jit_amd64.go +++ b/jit_amd64.go @@ -2,6 +2,12 @@ package randomx +import ( + "bytes" + "encoding/binary" + "git.gammaspectra.live/P2Pool/go-randomx/v2/asm" +) + /* REGISTER ALLOCATION: @@ -11,7 +17,7 @@ package randomx ; rcx -> temporary ; rdx -> temporary ; rsi -> scratchpad pointer - ; rdi -> return address // dataset pointer + ; rdi -> (not used) ; rbp -> (do not use, it's used by Golang sampling) jump target //todo: memory registers "ma" (high 32 bits), "mx" (low 32 bits) ; rsp -> stack pointer ; r8 -> "r0" @@ -134,7 +140,7 @@ var CALL = 0xe8 var REX_ADD_I = []byte{0x49, 0x81} var REX_TEST = []byte{0x49, 0xF7} var JZ = []byte{0x0f, 0x84} -var JZ_SHORT = 0x74 +var JZ_SHORT byte = 0x74 var RET byte = 0xc3 @@ -151,6 +157,172 @@ var NOP6 = []byte{0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00} var NOP7 = []byte{0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00} var NOP8 = []byte{0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00} +var JMP_ALIGN_PREFIX = [14][]byte{ + {}, + {0x2E}, + {0x2E, 0x2E}, + {0x2E, 0x2E, 0x2E}, + {0x2E, 0x2E, 0x2E, 0x2E}, + {0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x66, 0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x0F, 0x1F, 0x40, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, + {0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E}, +} + func genSIB(scale, index, base int) byte { return byte((scale << 6) | (index << 3) | base) } +func genAddressReg(buf []byte, instr *ByteCodeInstruction, rax bool) []byte { + buf = append(buf, LEA_32...) + if rax { + buf = append(buf, 0x80+instr.Src+0) + } else { + buf = append(buf, 0x80+instr.Src+8) + } + if instr.Src == RegisterNeedsSib { + buf = append(buf, 0x24) + } + buf = binary.LittleEndian.AppendUint32(buf, uint32(instr.Imm)) + if rax { + buf = append(buf, AND_EAX_I) + } else { + buf = append(buf, AND_ECX_I...) + } + buf = binary.LittleEndian.AppendUint32(buf, instr.MemMask) + return buf +} + +func valAsString(values ...uint32) []byte { + r := make([]byte, 4*len(values)) + for i, v := range values { + dst := r[i*4:] + dst[0] = byte(v & 0xff) + dst[1] = byte((v >> 8) & 0xff) + dst[2] = byte((v >> 16) & 0xff) + dst[3] = byte((v >> 24) & 0xff) + switch { + case dst[0] == 0: + return r[:i*4] + case dst[1] == 0: + return r[:i*4+1] + case dst[2] == 0: + return r[:i*4+2] + case dst[3] == 0: + return r[:i*4+3] + } + } + return r +} + +func familyModel(maxFunctionId uint32) (family, model, stepping int) { + if maxFunctionId < 0x1 { + return 0, 0, 0 + } + eax, _, _, _ := asm.Cpuid(1) + // If BaseFamily[3:0] is less than Fh then ExtendedFamily[7:0] is reserved and Family is equal to BaseFamily[3:0]. 
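`Cpuid`/`Xgetbv` follow the standard amd64 feature-detection recipe. A sketch of how they can gate hard AES and OS-saved AVX state — the bit positions are from the Intel/AMD manuals; the helper itself is illustrative, not part of this patch:

package asm_example // illustrative only

import "git.gammaspectra.live/P2Pool/go-randomx/v2/asm"

func detect() (hasAES, hasOSAVX bool) {
	_, _, ecx, _ := asm.Cpuid(1)
	hasAES = ecx&(1<<25) != 0 // CPUID.1:ECX.AESNI[bit 25]
	// AVX needs CPU support *and* the OS saving YMM state via XSAVE
	if ecx&(1<<27) != 0 && ecx&(1<<28) != 0 { // OSXSAVE, AVX
		eax, _ := asm.Xgetbv(0)
		hasOSAVX = eax&0x6 == 0x6 // XMM+YMM state enabled in XCR0
	}
	return
}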
+ family = int((eax >> 8) & 0xf) + extFam := family == 0x6 // Intel is 0x6, needs extended model. + if family == 0xf { + // Add ExtFamily + family += int((eax >> 20) & 0xff) + extFam = true + } + // If BaseFamily[3:0] is less than 0Fh then ExtendedModel[3:0] is reserved and Model is equal to BaseModel[3:0]. + model = int((eax >> 4) & 0xf) + if extFam { + // Add ExtModel + model += int((eax >> 12) & 0xf0) + } + stepping = int(eax & 0xf) + return family, model, stepping +} + +var BranchesWithin32B = func() bool { + a, b, c, d := asm.Cpuid(0) + v := string(valAsString(b, d, c)) + + if v == "GenuineIntel" { + family, model, stepping := familyModel(a) + + // Intel JCC erratum mitigation + if family == 6 { + // Affected CPU models and stepping numbers are taken from https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf + return ((model == 0x4E) && (stepping == 0x3)) || + ((model == 0x55) && ((stepping == 0x4) || (stepping == 0x7))) || + ((model == 0x5E) && (stepping == 0x3)) || + ((model == 0x8E) && (stepping >= 0x9) && (stepping <= 0xC)) || + ((model == 0x9E) && (stepping >= 0x9) && (stepping <= 0xD)) || + ((model == 0xA6) && (stepping == 0x0)) || + ((model == 0xAE) && (stepping == 0xA)) + } + } + return false +}() + +/* +;# callee-saved registers - Microsoft x64 calling convention +push rbx +push rbp +push rdi +push rsi +push r12 +push r13 +push r14 +push r15 +sub rsp, 80 +movdqu xmmword ptr [rsp+64], xmm6 +movdqu xmmword ptr [rsp+48], xmm7 +movdqu xmmword ptr [rsp+32], xmm8 +movdqu xmmword ptr [rsp+16], xmm9 +movdqu xmmword ptr [rsp+0], xmm10 +sub rsp, 80 +movdqu xmmword ptr [rsp+64], xmm11 +movdqu xmmword ptr [rsp+48], xmm12 +movdqu xmmword ptr [rsp+32], xmm13 +movdqu xmmword ptr [rsp+16], xmm14 +movdqu xmmword ptr [rsp+0], xmm15 + +;# function arguments +push rcx ;# RegisterFile& registerFile +mov rbp, qword ptr [rdx] ;# "mx", "ma" +mov rdi, qword ptr [rdx+8] ;# uint8_t* dataset +mov rsi, r8 ;# uint8_t* scratchpad +mov rbx, r9 ;# loop counter + +mov rax, rbp +ror rbp, 32 + +;# zero integer registers +xor r8, r8 +xor r9, r9 +xor r10, r10 +xor r11, r11 +xor r12, r12 +xor r13, r13 +xor r14, r14 +xor r15, r15 + +;# load constant registers +lea rcx, [rcx+120] +movapd xmm8, xmmword ptr [rcx+72] +movapd xmm9, xmmword ptr [rcx+88] +movapd xmm10, xmmword ptr [rcx+104] +movapd xmm11, xmmword ptr [rcx+120] + +movapd xmm13, xmmword ptr [mantissaMask] +movapd xmm14, xmmword ptr [exp240] +movapd xmm15, xmmword ptr [scaleMask] +mov rdx, rax +and eax, RANDOMX_SCRATCHPAD_MASK +ror rdx, 32 +and edx, RANDOMX_SCRATCHPAD_MASK +jmp rx_program_loop_begin +*/ +var randomx_program_prologue = bytes.Repeat(NOP1, 64) + +var randomx_program_loop_begin = bytes.Repeat(NOP1, 64) diff --git a/jit_generic.go b/jit_generic.go new file mode 100644 index 0000000..e84154f --- /dev/null +++ b/jit_generic.go @@ -0,0 +1,5 @@ +//go:build !unix || !amd64 || disable_jit || purego + +package randomx + +var RandomXCodeSize uint64 = 0 diff --git a/randomx_test.go b/randomx_test.go index 2988bb9..a94e914 100644 --- a/randomx_test.go +++ b/randomx_test.go @@ -63,6 +63,7 @@ func Test_Randomx(t *testing.T) { }() vm := c.VM_Initialize() + defer vm.Close() var output_hash [32]byte vm.CalculateHash(tt.input, &output_hash) @@ -92,6 +93,7 @@ func Benchmark_RandomX(b *testing.B) { }() vm := c.VM_Initialize() + defer vm.Close() b.ResetTimer() for i := 0; i < b.N; i++ { var output_hash [32]byte @@ -119,6 +121,7 @@ func Benchmark_RandomXParallel(b *testing.B) { b.RunParallel(func(pb 
*testing.PB) { var output_hash [32]byte vm := c.VM_Initialize() + defer vm.Close() for pb.Next() { vm.CalculateHash(tt.input, &output_hash) diff --git a/superscalar.go b/superscalar.go index 6cd9c0b..5b83032 100644 --- a/superscalar.go +++ b/superscalar.go @@ -702,7 +702,10 @@ type Register struct { //RegisterNeedsSib = 4; //x86 r12 register } +// RegisterNeedsDisplacement x86 r13 register const RegisterNeedsDisplacement = 5 + +// RegisterNeedsSib x86 r12 register const RegisterNeedsSib = 4 func (sins *SuperScalarInstruction) SelectSource(preAllocatedAvailableRegisters []int, cycle int, Registers []Register, gen *Blake2Generator) bool { diff --git a/superscalar_jit_amd64.go b/superscalar_jit_amd64.go index 562532d..c1f2111 100644 --- a/superscalar_jit_amd64.go +++ b/superscalar_jit_amd64.go @@ -4,7 +4,6 @@ package randomx import ( "encoding/binary" - "runtime" "unsafe" ) @@ -17,21 +16,6 @@ func (f SuperScalarProgramFunc) Execute(rf uintptr) { } superscalar_run(rf, uintptr(unsafe.Pointer(unsafe.SliceData(f)))) - return - - var reservedStackHack [8 * 8]byte - for i := range reservedStackHack { - reservedStackHack[i] = uint8(i) - } - - memoryPtr := &f - fun := *(*func(v uintptr))(unsafe.Pointer(&memoryPtr)) - fun(rf) - - for i := range reservedStackHack { - reservedStackHack[i] = uint8(-i) - } - runtime.KeepAlive(reservedStackHack) } // generateSuperscalarCode @@ -106,5 +90,5 @@ func generateSuperscalarCode(scalarProgram SuperScalarProgram) SuperScalarProgra program = append(program, RET) - return mapProgram(program) + return mapProgram(program, len(program)) } diff --git a/superscalar_jit_amd64.s b/superscalar_jit_amd64.s index 2b078aa..397cef6 100644 --- a/superscalar_jit_amd64.s +++ b/superscalar_jit_amd64.s @@ -25,6 +25,7 @@ TEXT ·superscalar_run(SB),$0-16 // todo: not supported by golang // PREFETCHW 0(SI) + PREFETCHT0 0(SI) // move registers back to register line MOVQ R8, 0(SI) diff --git a/vm.go b/vm.go index a6c6b54..4657825 100644 --- a/vm.go +++ b/vm.go @@ -46,6 +46,8 @@ type VM struct { ScratchPad ScratchPad Dataset Randomx_Dataset + + JITProgram VMProgramFunc } // Run calculate hash based on input @@ -95,6 +97,16 @@ func (vm *VM) Run(inputHash [64]byte, roundingMode uint8) (reg RegisterFile) { var rlCache RegisterLine + if vm.JITProgram != nil { + if vm.Dataset.Flags()&RANDOMX_FLAG_SECURE > 0 { + mapProgramRW(vm.JITProgram) + byteCode.generateCode(vm.JITProgram) + mapProgramRX(vm.JITProgram) + } else { + byteCode.generateCode(vm.JITProgram) + } + } + for ic := 0; ic < RANDOMX_PROGRAM_ITERATIONS; ic++ { spMix := reg.R[readReg[0]] ^ reg.R[readReg[1]] @@ -120,7 +132,11 @@ func (vm *VM) Run(inputHash [64]byte, roundingMode uint8) (reg RegisterFile) { } // Run the actual bytecode - byteCode.Execute(®, &vm.ScratchPad, eMask) + if vm.JITProgram != nil { + vm.JITProgram.Execute(®, &vm.ScratchPad, eMask) + } else { + byteCode.Execute(®, &vm.ScratchPad, eMask) + } mem.mx ^= reg.R[readReg[2]] ^ reg.R[readReg[3]] mem.mx &= CacheLineAlignMask @@ -183,9 +199,10 @@ func (vm *VM) RunLoops(tempHash [64]byte) RegisterFile { // final loop executes here reg := vm.Run(tempHash, roundingMode) - roundingMode = reg.FPRC + // always force a restore + reg.FPRC = 0xff - //restore rounding mode + // restore rounding mode to 0 SetRoundingMode(®, 0) return reg @@ -214,3 +231,10 @@ func (vm *VM) CalculateHash(input []byte, output *[32]byte) { hash256.Sum(output[:0]) } + +func (vm *VM) Close() error { + if vm.JITProgram != nil { + return vm.JITProgram.Close() + } + return nil +} diff --git a/vm_bytecode.go 
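The `reg.FPRC = 0xff` above only forces a restore if `SetRoundingMode` skips redundant updates; that is an assumption about its implementation (not shown in this diff), sketched here:

// Assumed shape of SetRoundingMode: an invalid sentinel like 0xff can never
// equal a requested mode (0-3), so the following restore call always reaches
// the hardware and re-establishes round-to-nearest.
func SetRoundingMode(f *RegisterFile, mode uint8) {
	if f.FPRC == mode {
		return // rounding mode already active; skip the MXCSR/FPCR write
	}
	f.FPRC = mode
	setRoundingModeHW(mode) // hypothetical hardware write (LDMXCSR on amd64)
}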
b/vm_bytecode.go index 556dd3b..8865a5e 100644 --- a/vm_bytecode.go +++ b/vm_bytecode.go @@ -31,7 +31,7 @@ type ByteCodeInstruction struct { } func (i ByteCodeInstruction) jumpTarget() int { - return int(int16((uint16(i.ImmB) << 8) | uint16(i.Dst))) + return int(int16((uint16(i.ImmB) << 8) | uint16(i.Src))) } func (i ByteCodeInstruction) getScratchpadAddress(ptr uint64) uint32 { diff --git a/vm_bytecode_jit_amd64.go b/vm_bytecode_jit_amd64.go new file mode 100644 index 0000000..32f68c5 --- /dev/null +++ b/vm_bytecode_jit_amd64.go @@ -0,0 +1,312 @@ +//go:build unix && amd64 && !disable_jit && !purego + +package randomx + +import ( + "encoding/binary" + "math/bits" + "unsafe" +) + +//go:noescape +func vm_run(rf *RegisterFile, pad *ScratchPad, eMask [2]uint64, jmp uintptr) + +func (f VMProgramFunc) Execute(rf *RegisterFile, pad *ScratchPad, eMask [2]uint64) { + if f == nil { + panic("program is nil") + } + + jmpPtr := uintptr(unsafe.Pointer(unsafe.SliceData(f))) + vm_run(rf, pad, eMask, jmpPtr) +} + +func (c *ByteCode) generateCode(program []byte) { + program = program[:0] + + var instructionOffsets [RANDOMX_PROGRAM_SIZE]int32 + var codePos int32 + + for ix := range c { + instructionOffsets[ix] = codePos + curLen := len(program) + + instr := &c[ix] + switch instr.Opcode { + + case VM_IADD_RS: + program = append(program, REX_LEA...) + if instr.Dst == RegisterNeedsDisplacement { + program = append(program, 0xac) + } else { + program = append(program, 0x04+8*instr.Dst) + } + program = append(program, genSIB(int(instr.ImmB), int(instr.Src), int(instr.Dst))) + if instr.Dst == RegisterNeedsDisplacement { + program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm)) + } + + case VM_IADD_M: + program = genAddressReg(program, instr, true) + program = append(program, REX_ADD_RM...) + program = append(program, 0x04+8*instr.Dst) + program = append(program, 0x06) + case VM_IADD_MZ: + program = append(program, REX_ADD_RM...) + program = append(program, 0x86+8*instr.Dst) + program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm)) + + case VM_ISUB_R: + program = append(program, REX_SUB_RR...) + program = append(program, 0xc0+8*instr.Dst+instr.Src) + case VM_ISUB_I: + program = append(program, REX_81...) + program = append(program, 0xe8+instr.Dst) + program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm)) + + case VM_ISUB_M: + program = genAddressReg(program, instr, true) + program = append(program, REX_SUB_RM...) + program = append(program, 0x04+8*instr.Dst) + program = append(program, 0x06) + case VM_ISUB_MZ: + program = append(program, REX_SUB_RM...) + program = append(program, 0x86+8*instr.Dst) + program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm)) + + case VM_IMUL_R: + program = append(program, REX_IMUL_RR...) + program = append(program, 0xc0+8*instr.Dst+instr.Src) + case VM_IMUL_I: + // also handles imul_rcp, with 64-bit special + if bits.Len64(instr.Imm) > 32 { + program = append(program, MOV_RAX_I...) + program = binary.LittleEndian.AppendUint64(program, instr.Imm) + program = append(program, REX_IMUL_RM...) + program = append(program, 0xc0+8*instr.Dst) + } else { + program = append(program, REX_IMUL_RRI...) + program = append(program, 0xc0+9*instr.Dst) + program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm)) + } + + case VM_IMUL_M: + program = genAddressReg(program, instr, true) + program = append(program, REX_IMUL_RM...) 
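Aside on the recurring `0xc0+8*instr.Dst+instr.Src` arithmetic in the generator: it is the x86 ModRM byte for register-direct operands. A tiny helper makes the encoding explicit (illustrative; the generator keeps the inlined form):

// modRM packs the x86 ModRM byte: mod (2 bits) | reg (3 bits) | rm (3 bits).
// mod=0b11 selects register-direct operands, so modRM(3, dst, src) equals
// 0xc0 + 8*dst + src, matching the byte arithmetic used throughout generateCode.
func modRM(mod, reg, rm byte) byte {
	return mod<<6 | reg<<3 | rm
}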
+ program = append(program, 0x04+8*instr.Dst) + program = append(program, 0x06) + case VM_IMUL_MZ: + program = append(program, REX_IMUL_RM...) + program = append(program, 0x86+8*instr.Dst) + program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm)) + + case VM_IMULH_R: + program = append(program, REX_MOV_RR64...) + program = append(program, 0xc0+instr.Dst) + program = append(program, REX_MUL_R...) + program = append(program, 0xe0+instr.Src) + program = append(program, REX_MOV_R64R...) + program = append(program, 0xc2+8*instr.Dst) + + case VM_IMULH_M: + program = genAddressReg(program, instr, false) + program = append(program, REX_MOV_RR64...) + program = append(program, 0xc0+instr.Dst) + program = append(program, REX_MUL_MEM...) + program = append(program, REX_MOV_R64R...) + program = append(program, 0xc2+8*instr.Dst) + case VM_IMULH_MZ: + program = append(program, REX_MOV_RR64...) + program = append(program, 0xc0+instr.Dst) + program = append(program, REX_MUL_M...) + program = append(program, 0xa6) + program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm)) + program = append(program, REX_MOV_R64R...) + program = append(program, 0xc2+8*instr.Dst) + + case VM_ISMULH_R: + program = append(program, REX_MOV_RR64...) + program = append(program, 0xc0+instr.Dst) + program = append(program, REX_MUL_R...) + program = append(program, 0xe8+instr.Src) + program = append(program, REX_MOV_R64R...) + program = append(program, 0xc2+8*instr.Dst) + + case VM_ISMULH_M: + program = genAddressReg(program, instr, false) + program = append(program, REX_MOV_RR64...) + program = append(program, 0xc0+instr.Dst) + program = append(program, REX_IMUL_MEM...) + program = append(program, REX_MOV_R64R...) + program = append(program, 0xc2+8*instr.Dst) + case VM_ISMULH_MZ: + program = append(program, REX_MOV_RR64...) + program = append(program, 0xc0+instr.Dst) + program = append(program, REX_MUL_M...) + program = append(program, 0xae) + program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm)) + program = append(program, REX_MOV_R64R...) + program = append(program, 0xc2+8*instr.Dst) + + case VM_INEG_R: + program = append(program, REX_NEG...) + program = append(program, 0xd8+instr.Dst) + + case VM_IXOR_R: + program = append(program, REX_XOR_RR...) + program = append(program, 0xc0+8*instr.Dst+instr.Src) + case VM_IXOR_I: + program = append(program, REX_XOR_RI...) + program = append(program, 0xf0+instr.Dst) + program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm)) + + case VM_IXOR_M: + program = genAddressReg(program, instr, true) + program = append(program, REX_XOR_RM...) + program = append(program, 0x04+8*instr.Dst) + program = append(program, 0x06) + case VM_IXOR_MZ: + program = append(program, REX_XOR_RM...) + program = append(program, 0x86+8*instr.Dst) + program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm)) + + case VM_IROR_R: + program = append(program, REX_MOV_RR...) + program = append(program, 0xc8+instr.Src) + program = append(program, REX_ROT_CL...) + program = append(program, 0xc8+instr.Dst) + case VM_IROR_I: + program = append(program, REX_ROT_I8...) + program = append(program, 0xc8+instr.Dst) + program = append(program, byte(instr.Imm&63)) + + case VM_IROL_R: + program = append(program, REX_MOV_RR...) + program = append(program, 0xc8+instr.Src) + program = append(program, REX_ROT_CL...) + program = append(program, 0xc0+instr.Dst) + case VM_IROL_I: + program = append(program, REX_ROT_I8...) 
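The IMULH/ISMULH sequences above keep the high 64 bits of a 128-bit product (what one-operand MUL/IMUL leaves in RDX). The portable equivalents, as the interpreter paths compute them, look like this (sketch; the helper names are illustrative):

package mulh_example

import "math/bits"

// high 64 bits of the unsigned 128-bit product — MUL's RDX result
func umulh(a, b uint64) uint64 {
	hi, _ := bits.Mul64(a, b)
	return hi
}

// high 64 bits of the signed 128-bit product — IMUL's RDX result,
// derived from the unsigned product by the standard correction terms
func smulh(a, b int64) uint64 {
	hi, _ := bits.Mul64(uint64(a), uint64(b))
	if a < 0 {
		hi -= uint64(b)
	}
	if b < 0 {
		hi -= uint64(a)
	}
	return hi
}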
+ program = append(program, 0xc0+instr.Dst) + program = append(program, byte(instr.Imm&63)) + + case VM_ISWAP_R: + program = append(program, REX_XCHG...) + program = append(program, 0xc0+instr.Src+8*instr.Dst) + + case VM_FSWAP_RF: + program = append(program, SHUFPD...) + program = append(program, 0xc0+9*instr.Dst) + program = append(program, 1) + case VM_FSWAP_RE: + program = append(program, SHUFPD...) + program = append(program, 0xc0+9*(instr.Dst+RegistersCountFloat)) + program = append(program, 1) + + case VM_FADD_R: + program = append(program, REX_ADDPD...) + program = append(program, 0xc0+instr.Src+8*instr.Dst) + + case VM_FADD_M: + program = genAddressReg(program, instr, true) + program = append(program, REX_CVTDQ2PD_XMM12...) + program = append(program, REX_ADDPD...) + program = append(program, 0xc4+8*instr.Dst) + + case VM_FSUB_R: + program = append(program, REX_SUBPD...) + program = append(program, 0xc0+instr.Src+8*instr.Dst) + + case VM_FSUB_M: + program = genAddressReg(program, instr, true) + program = append(program, REX_CVTDQ2PD_XMM12...) + program = append(program, REX_SUBPD...) + program = append(program, 0xc4+8*instr.Dst) + + case VM_FSCAL_R: + program = append(program, REX_XORPS...) + program = append(program, 0xc7+8*instr.Dst) + + case VM_FMUL_R: + program = append(program, REX_MULPD...) + program = append(program, 0xe0+instr.Src+8*instr.Dst) + + case VM_FDIV_M: + program = genAddressReg(program, instr, true) + program = append(program, REX_CVTDQ2PD_XMM12...) + program = append(program, REX_ANDPS_XMM12...) + program = append(program, REX_DIVPD...) + program = append(program, 0xe4+8*instr.Dst) + + case VM_FSQRT_R: + program = append(program, SQRTPD...) + program = append(program, 0xe4+9*instr.Dst) + + case VM_CFROUND: + program = append(program, REX_MOV_RR64...) + program = append(program, 0xc0+instr.Src) + rotate := byte((13 - instr.Imm) & 63) + if rotate != 0 { + program = append(program, ROL_RAX...) + program = append(program, rotate) + } + program = append(program, AND_OR_MOV_LDMXCSR...) + case VM_CBRANCH: + reg := instr.Dst + target := instr.jumpTarget() + 1 + + jmpOffset := instructionOffsets[target] - (codePos + 16) + + if BranchesWithin32B { + branchBegin := uint32(codePos + 7) + branchEnd := branchBegin + if jmpOffset >= -128 { + branchEnd += 9 + } else { + branchEnd += 13 + } + // If the jump crosses or touches 32-byte boundary, align it + if (branchBegin ^ branchEnd) >= 32 { + alignmentSize := 32 - (branchBegin & 31) + // the branch sequence is at most 13 bytes, so alignmentSize (1..13) indexes JMP_ALIGN_PREFIX safely + + program = append(program, JMP_ALIGN_PREFIX[alignmentSize]...) + } + } + program = append(program, REX_ADD_I...) + program = append(program, 0xc0+reg) + program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm)) + + program = append(program, REX_TEST...) + program = append(program, 0xc0+reg) + program = binary.LittleEndian.AppendUint32(program, instr.MemMask) + + if jmpOffset >= -128 { + program = append(program, JZ_SHORT) + program = append(program, byte(jmpOffset)) + } else { + program = append(program, JZ...) + program = binary.LittleEndian.AppendUint32(program, uint32(jmpOffset-4)) + } + + case VM_ISTORE: + //genAddressRegDst + program = append(program, LEA_32...) 
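On the `(branchBegin ^ branchEnd) >= 32` test above: two code offsets share one 32-byte block exactly when they agree on every bit above bit 4, so their XOR stays below 32 — the cheap way to detect a macro-fused branch crossing or touching a 32-byte boundary, which is the JCC erratum condition. Illustrative restatement:

// crosses32B reports whether a branch spanning [begin, end] crosses or
// touches a 32-byte boundary; offsets within one block differ only in bits 0-4.
func crosses32B(begin, end uint32) bool {
	return (begin ^ end) >= 32
}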
+ program = append(program, 0x80+instr.Dst) + if instr.Dst == RegisterNeedsSib { + program = append(program, 0x24) + } + program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm)) + program = append(program, AND_EAX_I) + program = binary.LittleEndian.AppendUint32(program, instr.MemMask) + + program = append(program, REX_MOV_MR...) + program = append(program, 0x04+8*instr.Src) + program = append(program, 0x06) + case VM_NOP: + program = append(program, NOP1...) + } + + codePos += int32(len(program) - curLen) + } + program = append(program, RET) +} diff --git a/vm_bytecode_jit_amd64.s b/vm_bytecode_jit_amd64.s new file mode 100644 index 0000000..708581c --- /dev/null +++ b/vm_bytecode_jit_amd64.s @@ -0,0 +1,91 @@ +//go:build unix && amd64 && !disable_jit && !purego + +#include "textflag.h" + +TEXT ·vm_run(SB),$8-40 + + // move register file to registers + MOVQ rf+0(FP), AX + + PREFETCHNTA 0(AX) + // r0-r7 + MOVQ (0*8)(AX), R8 + MOVQ (1*8)(AX), R9 + MOVQ (2*8)(AX), R10 + MOVQ (3*8)(AX), R11 + MOVQ (4*8)(AX), R12 + MOVQ (5*8)(AX), R13 + MOVQ (6*8)(AX), R14 + MOVQ (7*8)(AX), R15 + + // f0-f3 + VMOVUPD (8*8)(AX), X0 + VMOVUPD (10*8)(AX), X1 + VMOVUPD (12*8)(AX), X2 + VMOVUPD (14*8)(AX), X3 + // e0-e3 + VMOVUPD (16*8)(AX), X4 + VMOVUPD (18*8)(AX), X5 + VMOVUPD (20*8)(AX), X6 + VMOVUPD (22*8)(AX), X7 + // a0-a3 + VMOVUPD (24*8)(AX), X8 + VMOVUPD (26*8)(AX), X9 + VMOVUPD (28*8)(AX), X10 + VMOVUPD (30*8)(AX), X11 + + //TODO: rest of init + + // mantissa mask + //VMOVQ $0x00ffffffffffffff, $0x00ffffffffffffff, X13 + MOVQ $0x00ffffffffffffff, AX + VMOVQ AX, X13 + VPBROADCASTQ X13, X13 + + // eMask + VMOVDQU64 eMask+16(FP), X14 + + // scale mask + //VMOVQ $0x80F0000000000000, $0x80F0000000000000, X15 + MOVQ $0x80F0000000000000, AX + VMOVQ AX, X15 + VPBROADCASTQ X15, X15 + + // scratchpad pointer + MOVQ pad+8(FP), SI + + // JIT location + MOVQ jmp+32(FP), AX + + // jump to JIT code + CALL AX + + + // move register file back to registers + MOVQ rf+0(FP), AX + + PREFETCHT0 0(AX) + // r0-r7 + MOVQ R8, (0*8)(AX) + MOVQ R9, (1*8)(AX) + MOVQ R10, (2*8)(AX) + MOVQ R11, (3*8)(AX) + MOVQ R12, (4*8)(AX) + MOVQ R13, (5*8)(AX) + MOVQ R14, (6*8)(AX) + MOVQ R15, (7*8)(AX) + + // f0-f3 + VMOVUPD X0, (8*8)(AX) + VMOVUPD X1, (10*8)(AX) + VMOVUPD X2, (12*8)(AX) + VMOVUPD X3, (14*8)(AX) + // e0-e3 + VMOVUPD X4, (16*8)(AX) + VMOVUPD X5, (18*8)(AX) + VMOVUPD X6, (20*8)(AX) + VMOVUPD X7, (22*8)(AX) + + // a0-a3 are constant, no need to move + + RET diff --git a/vm_bytecode_jit_generic.go b/vm_bytecode_jit_generic.go new file mode 100644 index 0000000..915c989 --- /dev/null +++ b/vm_bytecode_jit_generic.go @@ -0,0 +1,11 @@ +//go:build !unix || !amd64 || disable_jit || purego + +package randomx + +func (c *ByteCode) generateCode(program []byte) { + +} + +func (f VMProgramFunc) Execute(rf *RegisterFile, pad *ScratchPad, eMask [2]uint64) { + +} diff --git a/vm_bytecode_native.go b/vm_bytecode_native.go index 8c5a148..dc25404 100644 --- a/vm_bytecode_native.go +++ b/vm_bytecode_native.go @@ -13,7 +13,7 @@ import ( // It is the caller's responsibility to set and restore the mode to softfloat64.RoundingModeToNearest between full executions // Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread is recommended to prevent other goroutines sharing these changes func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) { - for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ { + for pc := 0; pc < len(c); pc++ { i := &c[pc] switch i.Opcode { case VM_NOP: // we do nothing @@ -111,8 +111,8 @@ func (c 
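The loads and stores in `vm_run` above hard-code the register file layout. Inferred from those 8-byte offsets, the Go struct must look like the following sketch — the field names are assumptions; only the offsets are fixed by the assembly:

// inferred layout of the 256 bytes of registers that vm_run reads and writes:
type registerFileLayout struct {
	R [8]uint64     // +0:   r0-r7, one GP register per 8 bytes
	F [4][2]float64 // +64:  f0-f3, 16-byte float pairs
	E [4][2]float64 // +128: e0-e3
	A [4][2]float64 // +192: a0-a3 (constant per program; not stored back)
	// FPRC and any further fields are assumed to live past +256,
	// untouched by this routine.
}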
*ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) { SetRoundingMode(f, uint8(tmp)) case VM_CBRANCH: - f.R[i.Src] += i.Imm - if (f.R[i.Src] & uint64(i.MemMask)) == 0 { + f.R[i.Dst] += i.Imm + if (f.R[i.Dst] & uint64(i.MemMask)) == 0 { pc = i.jumpTarget() } case VM_ISTORE: diff --git a/vm_bytecode_purego.go b/vm_bytecode_purego.go index 65b5722..d78582e 100644 --- a/vm_bytecode_purego.go +++ b/vm_bytecode_purego.go @@ -12,7 +12,7 @@ import ( // It is the caller's responsibility to set and restore the mode to IEEE 754 roundTiesToEven between full executions // Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread is recommended to prevent other goroutines sharing these changes func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) { - for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ { + for pc := 0; pc < len(c); pc++ { i := &c[pc] switch i.Opcode { case VM_NOP: // we do nothing @@ -110,8 +110,8 @@ func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) { SetRoundingMode(f, uint8(tmp)) case VM_CBRANCH: - f.R[i.Src] += i.Imm - if (f.R[i.Src] & uint64(i.MemMask)) == 0 { + f.R[i.Dst] += i.Imm + if (f.R[i.Dst] & uint64(i.MemMask)) == 0 { pc = i.jumpTarget() } case VM_ISTORE: diff --git a/vm_instruction.go b/vm_instruction.go index 66192b7..46c8d26 100644 --- a/vm_instruction.go +++ b/vm_instruction.go @@ -70,7 +70,7 @@ func CompileProgramToByteCode(prog []byte) (bc ByteCode) { registerUsage[i] = -1 } - for i := 0; i < RANDOMX_PROGRAM_SIZE; i++ { + for i := 0; i < len(bc); i++ { instr := VM_Instruction(prog[i*8:]) ibc := &bc[i] @@ -312,10 +312,12 @@ func CompileProgramToByteCode(prog []byte) (bc ByteCode) { case 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238: //25 // CBRANCH and CFROUND are interchanged ibc.Opcode = VM_CBRANCH - ibc.Src = instr.Dst() % RegistersCount + //TODO:??? it's +1 on other + ibc.Dst = instr.Dst() % RegistersCount - target := uint16(int16(registerUsage[ibc.Src])) - ibc.Dst = uint8(target) + target := uint16(int16(registerUsage[ibc.Dst])) + // set target! + ibc.Src = uint8(target) ibc.ImmB = uint8(target >> 8) shift := uint64(instr.Mod()>>4) + CONDITIONOFFSET
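Tying the two CBRANCH fixes together (jumpTarget in vm_bytecode.go and the compiler above): after this patch the 16-bit jump target is split across `Src` (low byte) and `ImmB` (high byte), with `Dst` holding the branch register. A sketch of the round-trip as established by the diff:

// packTarget stores a signed 16-bit target the way CompileProgramToByteCode does
func packTarget(ibc *ByteCodeInstruction, target int16) {
	ibc.Src = uint8(uint16(target))       // low byte
	ibc.ImmB = uint8(uint16(target) >> 8) // high byte
}

// jumpTarget recovers it, matching the fixed method in vm_bytecode.go
func jumpTarget(ibc *ByteCodeInstruction) int {
	return int(int16(uint16(ibc.ImmB)<<8 | uint16(ibc.Src)))
}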