Compare commits

...

30 commits

Author SHA1 Message Date
DataHoarder b0265950b6
Disable large page testing on 32-bit platforms
All checks were successful
continuous-integration/drone/push Build is passing
2024-05-02 16:28:38 +02:00
DataHoarder c41d6c8080
Support large pages, implement aligned / paged / large paged allocators
Some checks failed
continuous-integration/drone/push Build is failing
2024-05-02 16:18:50 +02:00
DataHoarder 9aa3631f37
Ensure 16-byte alignment of dataset/scratchpad/register file and use more performant fetch/write SIMD on amd64
All checks were successful
continuous-integration/drone/push Build is passing
2024-05-02 12:06:38 +02:00
DataHoarder 9826b7beb4
Added partial hash and fill AES for First/Next/Last hashing modes in VM 2024-05-02 11:42:23 +02:00
DataHoarder acfff4a4ad
Add hard float support for arm platform, add tests
All checks were successful
continuous-integration/drone/push Build is passing
2024-05-02 04:16:52 +02:00
DataHoarder a458a18f07
Added CalculateCommitment api for RandomX v2 hashes, added further testing
All checks were successful
continuous-integration/drone/push Build is passing
2024-05-02 03:46:03 +02:00
DataHoarder cceea5b0ba
Simplify amd64 / 386 rounding mode set 2024-05-02 03:00:26 +02:00
DataHoarder 8b063bde61
Match functionality / API with upstream randomx
All checks were successful
continuous-integration/drone/push Build is passing
2024-05-02 02:25:17 +02:00
DataHoarder c50cbc56b5
Reduce heap allocations under VM 2024-05-01 16:58:49 +02:00
DataHoarder 1d83de4880
Split hard/soft AES implementations 2024-05-01 16:25:35 +02:00
DataHoarder 25b7fc4cc0
Move internal packages to internal directory 2024-05-01 11:36:43 +02:00
DataHoarder 3f70ec75be
Remove unused functions on cpuid_amd64
All checks were successful
continuous-integration/drone/push Build is passing
2024-05-01 07:36:26 +02:00
DataHoarder 55d6161f6e
Version v3.1.0, implement generic NewDataset and GetFlags
Some checks are pending
continuous-integration/drone/tag Build is passing
continuous-integration/drone/push Build is running
2024-04-23 14:36:43 +02:00
DataHoarder 36f1a90a20
Version v3.0.0, support full dataset mode in 64-bit targets, modified api, optimized allocations, full VM run JIT on amd64, optimize AES asm
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing
2024-04-23 04:33:42 +02:00
DataHoarder 4903cd7407
Cleanup readme, superscalar 2024-04-20 20:22:05 +02:00
DataHoarder d20dd880ce
amd64: Implemented VM JIT
All checks were successful
continuous-integration/drone/push Build is passing
2024-04-20 19:53:47 +02:00
DataHoarder d72726b0fe
Added wasm testing to CI
All checks were successful
continuous-integration/drone/push Build is passing
2024-04-19 18:33:50 +02:00
DataHoarder 34cfab4176
redo JIT superscalar to include less custom assembly 2024-04-19 17:53:43 +02:00
DataHoarder a71d8f6a2e
allow lock-free vm execution in soft float mode 2024-04-18 12:08:49 +02:00
DataHoarder 14a10f544f
Support x86_64 aesenc/aesdec and roundtrip mode
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing
2024-04-18 11:38:55 +02:00
DataHoarder ef069318b9
fix purego bytecode / rounding mode calls
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing
2024-04-18 09:06:53 +02:00
DataHoarder 80f473de54
General cleanup of jit / VM / mmap usage
Some checks failed
continuous-integration/drone/push Build is failing
continuous-integration/drone/tag Build is failing
2024-04-18 07:57:15 +02:00
DataHoarder fe253fb825
cleanup vm_instruction IMM with sign extension 2024-04-18 07:11:51 +02:00
DataHoarder 699ce02f2d
hash register file memory at once instead on loop calls 2024-04-17 09:53:24 +02:00
DataHoarder b35751462b
hack: reserve stack on JIT call 2024-04-17 09:40:54 +02:00
DataHoarder 1ce9bff7d3
Initialize AES S-Box directly
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing
2024-04-17 06:45:08 +02:00
DataHoarder aab8f99dd4
Include softfloat64 and allow for purego implementation
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing
2024-04-17 06:04:29 +02:00
DataHoarder 432590f930
Move argon2 / float packages to their own folders, cleanup vm Run
All checks were successful
continuous-integration/drone/push Build is passing
2024-04-15 04:14:15 +02:00
DataHoarder 5b9b3c3565
Use direct register and scratchpad under bytecode execution 2024-04-15 02:22:04 +02:00
DataHoarder b72f79a653
Remove zero register from vm bytecode 2024-04-14 15:43:54 +02:00
79 changed files with 4413 additions and 1691 deletions

View file

@ -1,7 +1,7 @@
---
kind: pipeline
type: docker
name: from-source-amd64
name: go-amd64-asm-jit
platform:
os: linux
arch: amd64
@ -24,11 +24,65 @@ steps:
commands:
- apk update
- apk add --no-cache git
- go test -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -v .
- go test -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -short -v .
---
kind: pipeline
type: docker
name: from-source-386
name: go-amd64-asm
platform:
os: linux
arch: amd64
environment:
GOPROXY: direct
GOARCH: amd64
GOAMD64: v3
GOOS: linux
GOTRACEBACK: 2
GOEXPERIMENT: "cgocheck2,newinliner"
CGO_ENABLED: "0"
workspace:
path: /drone/src
steps:
- name: test
image: golang:1.22-alpine3.19
commands:
- apk update
- apk add --no-cache git
- go test -tags disable_jit -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -short -v .
---
kind: pipeline
type: docker
name: go-amd64-purego
platform:
os: linux
arch: amd64
environment:
GOPROXY: direct
GOARCH: amd64
GOAMD64: v3
GOOS: linux
GOTRACEBACK: 2
GOEXPERIMENT: "cgocheck2,newinliner"
CGO_ENABLED: "0"
workspace:
path: /drone/src
steps:
- name: test
image: golang:1.22-alpine3.19
commands:
- apk update
- apk add --no-cache git
- go test -tags purego -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -short -v .
---
kind: pipeline
type: docker
name: go-386-asm
platform:
os: linux
arch: amd64
@ -51,11 +105,38 @@ steps:
commands:
- apk update
- apk add --no-cache git
- go test -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -v .
- go test -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -short -v .
---
kind: pipeline
type: docker
name: from-source-arm64
name: go-386-purego
platform:
os: linux
arch: amd64
environment:
GOPROXY: direct
GOARCH: 386
GO386: sse2
GOOS: linux
GOTRACEBACK: 2
GOEXPERIMENT: "cgocheck2,newinliner"
CGO_ENABLED: "0"
workspace:
path: /drone/src
steps:
- name: test
image: golang:1.22-alpine3.19
commands:
- apk update
- apk add --no-cache git
- go test -tags purego -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -short -v .
---
kind: pipeline
type: docker
name: go-arm64-asm
platform:
os: linux
arch: arm64
@ -77,5 +158,113 @@ steps:
commands:
- apk update
- apk add --no-cache git
- go test -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -v .
- go test -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -short -v .
---
kind: pipeline
type: docker
name: go-arm64-purego
platform:
os: linux
arch: arm64
environment:
GOPROXY: direct
GOARCH: arm64
GOOS: linux
GOTRACEBACK: 2
GOEXPERIMENT: "cgocheck2,newinliner"
CGO_ENABLED: "0"
workspace:
path: /drone/src
steps:
- name: test
image: golang:1.22-alpine3.19
commands:
- apk update
- apk add --no-cache git
- go test -tags purego -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -short -v .
-
---
kind: pipeline
type: docker
name: go-arm-asm
platform:
os: linux
arch: arm64
environment:
GOPROXY: direct
GOARCH: arm
GOARM: 7
GOOS: linux
GOTRACEBACK: 2
GOEXPERIMENT: "cgocheck2,newinliner"
CGO_ENABLED: "0"
workspace:
path: /drone/src
steps:
- name: test
image: golang:1.22-alpine3.19
commands:
- apk update
- apk add --no-cache git
- go test -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -short -v .
---
kind: pipeline
type: docker
name: go-arm-purego
platform:
os: linux
arch: arm64
environment:
GOPROXY: direct
GOARCH: arm
GOARM: 7
GOOS: linux
GOTRACEBACK: 2
GOEXPERIMENT: "cgocheck2,newinliner"
CGO_ENABLED: "0"
workspace:
path: /drone/src
steps:
- name: test
image: golang:1.22-alpine3.19
commands:
- apk update
- apk add --no-cache git
- go test -tags purego -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -short -v .
---
kind: pipeline
type: docker
name: go-wasm-purego
platform:
os: linux
arch: arm64
environment:
GOPROXY: direct
GOARCH: wasm
GOOS: wasip1
GOTRACEBACK: 2
GOEXPERIMENT: "cgocheck2,newinliner"
CGO_ENABLED: "0"
workspace:
path: /drone/src
steps:
- name: test
image: golang:1.22-alpine3.19
commands:
- apk update
- apk add --no-cache git bash
- apk add --no-cache wasmtime --repository=https://dl-cdn.alpinelinux.org/alpine/edge/testing
- PATH=$PATH:$(go env GOROOT)/misc/wasm go test -tags purego -p 1 -failfast -timeout 20m -cover -gcflags=-d=checkptr -short -v .
...

View file

@ -1,24 +1,41 @@
# RandomX (Golang Implementation)
RandomX is a proof-of-work (PoW) algorithm that is optimized for general-purpose CPUs.
RandomX uses random code execution (hence the name) together with several memory-hard techniques to minimize the efficiency advantage of specialized hardware.
---
Fork from [git.dero.io/DERO_Foundation/RandomX](https://git.dero.io/DERO_Foundation/RandomX). Also related, their [Analysis of RandomX writeup](https://medium.com/deroproject/analysis-of-randomx-dde9dfe9bbc6).
Original code failed RandomX testcases and was implemented using big.Float.
This package implements RandomX without CGO, using only Golang code, pure float64 ops and two small assembly sections to implement CFROUND modes.
---
This package implements RandomX without CGO, using only Golang code, native float64 ops, some assembly, but with optional soft float _purego_ implementation.
All test cases pass properly.
Uses minimal Go assembly due to having to set rounding mode natively. Support can be added with supporting rounding mode under _asm_.
Supports Full mode and Light mode.
JIT is supported on a few platforms but can be hard-disabled via the `disable_jit` build flag, or at runtime.
For the C++ implementation and design of RandomX, see [github.com/tevador/RandomX](https://github.com/tevador/RandomX)
| Platform | Supported | SuperScalar JIT | Notes |
|:-----------:|:---------:|:---------------:|:----------------:|
| **386** | ✅ | ❌ | |
| **amd64** | ✅ | ✅* | JIT only on Unix |
| **arm** | ❌ | - | |
| **arm64** | ✅ | ❌ | |
| **mips** | ❌ | - | |
| **mips64** | ❌ | - | |
| **riscv64** | ❌ | - | |
| **wasm** | ❌ | - | |
| Feature | 386 | amd64 | arm | arm64 | mips | mips64 | riscv64 | wasm |
|:---------------------:|:-----------:|:------------:|:-----------:|:-----------:|:-----------:|:-----------:|:-----------:|:-----------:|
| purego | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| Full Mode | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ✅ | ❌ |
| Float Operations | **hw** | **hw** | **hw** | **hw** | soft | soft | soft | soft |
| AES Operations | soft | **hw** | soft | soft | soft | soft | soft | soft |
| Superscalar Execution | interpreter | **compiler** | interpreter | interpreter | interpreter | interpreter | interpreter | interpreter |
| VM Execution | interpreter | **compiler** | interpreter | interpreter | soft | soft | soft | soft |
A pure Golang implementation can be used on platforms without hard float support or via the `purego` build tag manually.
[TinyGo](https://github.com/tinygo-org/tinygo) is supported under the `purego` build tag.
Any platform with no hard float support or when enabled manually will use soft float, using [softfloat64](https://git.gammaspectra.live/P2Pool/softfloat64). This will be very slow.
Full mode is NOT recommended in 32-bit systems and is unsupported, although depending on system it might be able to run. You might want to manually run `runtime.GC()` if cleaning up dataset to free memory.
Native hard float can be added with supporting rounding mode under _asm_.
JIT only supported under Unix systems (Linux, *BSD, macOS), and can be hard-disabled via the `disable_jit` build flag, or at runtime.

View file

@ -1,145 +0,0 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package aes implements AES encryption (formerly Rijndael), as defined in
// U.S. Federal Information Processing Standards Publication 197.
//
// The AES operations in this package are not implemented using constant-time algorithms.
// An exception is when running on systems with enabled hardware support for AES
// that makes these operations constant-time. Examples include amd64 systems using AES-NI
// extensions and s390x systems using Message-Security-Assist extensions.
// On such systems, when the result of NewCipher is passed to cipher.NewGCM,
// the GHASH operation used by GCM is also constant-time.
package aes
import (
"math/bits"
)
// Multiply b and c as GF(2) polynomials modulo poly
func mul(b, c uint32) uint32 {
i := b
j := c
s := uint32(0)
for k := uint32(1); k < 0x100 && j != 0; k <<= 1 {
// Invariant: k == 1<<n, i == b * xⁿ
if j&k != 0 {
// s += i in GF(2); xor in binary
s ^= i
j ^= k // turn off bit to end loop early
}
// i *= x in GF(2) modulo the polynomial
i <<= 1
if i&0x100 != 0 {
i ^= poly
}
}
return s
}
// This file contains AES constants - 8720 bytes of initialized data.
// https://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
// AES is based on the mathematical behavior of binary polynomials
// (polynomials over GF(2)) modulo the irreducible polynomial x⁸ + x⁴ + x³ + x + 1.
// Addition of these binary polynomials corresponds to binary xor.
// Reducing mod poly corresponds to binary xor with poly every
// time a 0x100 bit appears.
const poly = 1<<8 | 1<<4 | 1<<3 | 1<<1 | 1<<0 // x⁸ + x⁴ + x³ + x + 1
// Powers of x mod poly in GF(2).
var powx = [16]byte{
0x01,
0x02,
0x04,
0x08,
0x10,
0x20,
0x40,
0x80,
0x1b,
0x36,
0x6c,
0xd8,
0xab,
0x4d,
0x9a,
0x2f,
}
// FIPS-197 Figure 7. S-box substitution values in hexadecimal format.
var sbox0 = [256]byte{
0x63, 0x7c, 0x77, 0x7b, 0xf2, 0x6b, 0x6f, 0xc5, 0x30, 0x01, 0x67, 0x2b, 0xfe, 0xd7, 0xab, 0x76,
0xca, 0x82, 0xc9, 0x7d, 0xfa, 0x59, 0x47, 0xf0, 0xad, 0xd4, 0xa2, 0xaf, 0x9c, 0xa4, 0x72, 0xc0,
0xb7, 0xfd, 0x93, 0x26, 0x36, 0x3f, 0xf7, 0xcc, 0x34, 0xa5, 0xe5, 0xf1, 0x71, 0xd8, 0x31, 0x15,
0x04, 0xc7, 0x23, 0xc3, 0x18, 0x96, 0x05, 0x9a, 0x07, 0x12, 0x80, 0xe2, 0xeb, 0x27, 0xb2, 0x75,
0x09, 0x83, 0x2c, 0x1a, 0x1b, 0x6e, 0x5a, 0xa0, 0x52, 0x3b, 0xd6, 0xb3, 0x29, 0xe3, 0x2f, 0x84,
0x53, 0xd1, 0x00, 0xed, 0x20, 0xfc, 0xb1, 0x5b, 0x6a, 0xcb, 0xbe, 0x39, 0x4a, 0x4c, 0x58, 0xcf,
0xd0, 0xef, 0xaa, 0xfb, 0x43, 0x4d, 0x33, 0x85, 0x45, 0xf9, 0x02, 0x7f, 0x50, 0x3c, 0x9f, 0xa8,
0x51, 0xa3, 0x40, 0x8f, 0x92, 0x9d, 0x38, 0xf5, 0xbc, 0xb6, 0xda, 0x21, 0x10, 0xff, 0xf3, 0xd2,
0xcd, 0x0c, 0x13, 0xec, 0x5f, 0x97, 0x44, 0x17, 0xc4, 0xa7, 0x7e, 0x3d, 0x64, 0x5d, 0x19, 0x73,
0x60, 0x81, 0x4f, 0xdc, 0x22, 0x2a, 0x90, 0x88, 0x46, 0xee, 0xb8, 0x14, 0xde, 0x5e, 0x0b, 0xdb,
0xe0, 0x32, 0x3a, 0x0a, 0x49, 0x06, 0x24, 0x5c, 0xc2, 0xd3, 0xac, 0x62, 0x91, 0x95, 0xe4, 0x79,
0xe7, 0xc8, 0x37, 0x6d, 0x8d, 0xd5, 0x4e, 0xa9, 0x6c, 0x56, 0xf4, 0xea, 0x65, 0x7a, 0xae, 0x08,
0xba, 0x78, 0x25, 0x2e, 0x1c, 0xa6, 0xb4, 0xc6, 0xe8, 0xdd, 0x74, 0x1f, 0x4b, 0xbd, 0x8b, 0x8a,
0x70, 0x3e, 0xb5, 0x66, 0x48, 0x03, 0xf6, 0x0e, 0x61, 0x35, 0x57, 0xb9, 0x86, 0xc1, 0x1d, 0x9e,
0xe1, 0xf8, 0x98, 0x11, 0x69, 0xd9, 0x8e, 0x94, 0x9b, 0x1e, 0x87, 0xe9, 0xce, 0x55, 0x28, 0xdf,
0x8c, 0xa1, 0x89, 0x0d, 0xbf, 0xe6, 0x42, 0x68, 0x41, 0x99, 0x2d, 0x0f, 0xb0, 0x54, 0xbb, 0x16,
}
// FIPS-197 Figure 14. Inverse S-box substitution values in hexadecimal format.
var sbox1 = [256]byte{
0x52, 0x09, 0x6a, 0xd5, 0x30, 0x36, 0xa5, 0x38, 0xbf, 0x40, 0xa3, 0x9e, 0x81, 0xf3, 0xd7, 0xfb,
0x7c, 0xe3, 0x39, 0x82, 0x9b, 0x2f, 0xff, 0x87, 0x34, 0x8e, 0x43, 0x44, 0xc4, 0xde, 0xe9, 0xcb,
0x54, 0x7b, 0x94, 0x32, 0xa6, 0xc2, 0x23, 0x3d, 0xee, 0x4c, 0x95, 0x0b, 0x42, 0xfa, 0xc3, 0x4e,
0x08, 0x2e, 0xa1, 0x66, 0x28, 0xd9, 0x24, 0xb2, 0x76, 0x5b, 0xa2, 0x49, 0x6d, 0x8b, 0xd1, 0x25,
0x72, 0xf8, 0xf6, 0x64, 0x86, 0x68, 0x98, 0x16, 0xd4, 0xa4, 0x5c, 0xcc, 0x5d, 0x65, 0xb6, 0x92,
0x6c, 0x70, 0x48, 0x50, 0xfd, 0xed, 0xb9, 0xda, 0x5e, 0x15, 0x46, 0x57, 0xa7, 0x8d, 0x9d, 0x84,
0x90, 0xd8, 0xab, 0x00, 0x8c, 0xbc, 0xd3, 0x0a, 0xf7, 0xe4, 0x58, 0x05, 0xb8, 0xb3, 0x45, 0x06,
0xd0, 0x2c, 0x1e, 0x8f, 0xca, 0x3f, 0x0f, 0x02, 0xc1, 0xaf, 0xbd, 0x03, 0x01, 0x13, 0x8a, 0x6b,
0x3a, 0x91, 0x11, 0x41, 0x4f, 0x67, 0xdc, 0xea, 0x97, 0xf2, 0xcf, 0xce, 0xf0, 0xb4, 0xe6, 0x73,
0x96, 0xac, 0x74, 0x22, 0xe7, 0xad, 0x35, 0x85, 0xe2, 0xf9, 0x37, 0xe8, 0x1c, 0x75, 0xdf, 0x6e,
0x47, 0xf1, 0x1a, 0x71, 0x1d, 0x29, 0xc5, 0x89, 0x6f, 0xb7, 0x62, 0x0e, 0xaa, 0x18, 0xbe, 0x1b,
0xfc, 0x56, 0x3e, 0x4b, 0xc6, 0xd2, 0x79, 0x20, 0x9a, 0xdb, 0xc0, 0xfe, 0x78, 0xcd, 0x5a, 0xf4,
0x1f, 0xdd, 0xa8, 0x33, 0x88, 0x07, 0xc7, 0x31, 0xb1, 0x12, 0x10, 0x59, 0x27, 0x80, 0xec, 0x5f,
0x60, 0x51, 0x7f, 0xa9, 0x19, 0xb5, 0x4a, 0x0d, 0x2d, 0xe5, 0x7a, 0x9f, 0x93, 0xc9, 0x9c, 0xef,
0xa0, 0xe0, 0x3b, 0x4d, 0xae, 0x2a, 0xf5, 0xb0, 0xc8, 0xeb, 0xbb, 0x3c, 0x83, 0x53, 0x99, 0x61,
0x17, 0x2b, 0x04, 0x7e, 0xba, 0x77, 0xd6, 0x26, 0xe1, 0x69, 0x14, 0x63, 0x55, 0x21, 0x0c, 0x7d,
}
// Lookup tables for encryption.
var encLut = func() (te [4][256]uint32) {
for i := 0; i < 256; i++ {
s := uint32(sbox0[i])
s2 := mul(s, 2)
s3 := mul(s, 3)
w := s2<<24 | s<<16 | s<<8 | s3
for j := 0; j < 4; j++ {
te[j][i] = bits.ReverseBytes32(w)
w = w<<24 | w>>8
}
}
return te
}()
// Lookup tables for decryption.
var decLut = func() (td [4][256]uint32) {
for i := 0; i < 256; i++ {
s := uint32(sbox1[i])
s9 := mul(s, 0x9)
sb := mul(s, 0xb)
sd := mul(s, 0xd)
se := mul(s, 0xe)
w := se<<24 | s9<<16 | sd<<8 | sb
for j := 0; j < 4; j++ {
td[j][i] = bits.ReverseBytes32(w)
w = w<<24 | w>>8
}
}
return td
}()

View file

@ -1,142 +0,0 @@
/*
Copyright (c) 2019 DERO Foundation. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package aes
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/keys"
"unsafe"
)
// HashAes1Rx4
//
// Calculate a 512-bit hash of 'input' using 4 lanes of AES.
// The input is treated as a set of round keys for the encryption
// of the initial state.
//
// 'inputSize' must be a multiple of 64.
//
// For a 2 MiB input, this has the same security as 32768-round
// AES encryption.
//
// Hashing throughput: >20 GiB/s per CPU core with hardware AES
func HashAes1Rx4(input []byte, output *[64]byte) {
if len(input)%64 != 0 {
panic("unsupported")
}
// states are copied
states := keys.AesHash1R_State
for input_ptr := 0; input_ptr < len(input); input_ptr += 64 {
in := (*[4][4]uint32)(unsafe.Pointer(unsafe.SliceData(input[input_ptr:])))
soft_aesenc(&states[0], &in[0])
soft_aesdec(&states[1], &in[1])
soft_aesenc(&states[2], &in[2])
soft_aesdec(&states[3], &in[3])
}
soft_aesenc(&states[0], &keys.AesHash1R_XKeys[0])
soft_aesdec(&states[1], &keys.AesHash1R_XKeys[0])
soft_aesenc(&states[2], &keys.AesHash1R_XKeys[0])
soft_aesdec(&states[3], &keys.AesHash1R_XKeys[0])
soft_aesenc(&states[0], &keys.AesHash1R_XKeys[1])
soft_aesdec(&states[1], &keys.AesHash1R_XKeys[1])
soft_aesenc(&states[2], &keys.AesHash1R_XKeys[1])
soft_aesdec(&states[3], &keys.AesHash1R_XKeys[1])
copy(output[:], (*[64]byte)(unsafe.Pointer(&states))[:])
}
// FillAes1Rx4
//
// Fill 'output' with pseudorandom data based on 512-bit 'state'.
// The state is encrypted using a single AES round per 16 bytes of output
// in 4 lanes.
//
// 'output' size must be a multiple of 64.
//
// The modified state is written back to 'state' to allow multiple
// calls to this function.
func FillAes1Rx4(state *[64]byte, output []byte) {
if len(output)%len(state) != 0 {
panic("unsupported")
}
// Reference to state without copying
states := (*[4][4]uint32)(unsafe.Pointer(state))
for outptr := 0; outptr < len(output); outptr += len(state) {
soft_aesdec(&states[0], &keys.AesGenerator1R_Keys[0])
soft_aesenc(&states[1], &keys.AesGenerator1R_Keys[1])
soft_aesdec(&states[2], &keys.AesGenerator1R_Keys[2])
soft_aesenc(&states[3], &keys.AesGenerator1R_Keys[3])
copy(output[outptr:], state[:])
}
}
// FillAes4Rx4 used to generate final program
func FillAes4Rx4(state [64]byte, output []byte) {
if len(output)%len(state) != 0 {
panic("unsupported")
}
// state is copied on caller
// Copy state
states := (*[4][4]uint32)(unsafe.Pointer(&state))
for outptr := 0; outptr < len(output); outptr += len(state) {
soft_aesdec(&states[0], &keys.AesGenerator4R_Keys[0])
soft_aesenc(&states[1], &keys.AesGenerator4R_Keys[0])
soft_aesdec(&states[2], &keys.AesGenerator4R_Keys[4])
soft_aesenc(&states[3], &keys.AesGenerator4R_Keys[4])
soft_aesdec(&states[0], &keys.AesGenerator4R_Keys[1])
soft_aesenc(&states[1], &keys.AesGenerator4R_Keys[1])
soft_aesdec(&states[2], &keys.AesGenerator4R_Keys[5])
soft_aesenc(&states[3], &keys.AesGenerator4R_Keys[5])
soft_aesdec(&states[0], &keys.AesGenerator4R_Keys[2])
soft_aesenc(&states[1], &keys.AesGenerator4R_Keys[2])
soft_aesdec(&states[2], &keys.AesGenerator4R_Keys[6])
soft_aesenc(&states[3], &keys.AesGenerator4R_Keys[6])
soft_aesdec(&states[0], &keys.AesGenerator4R_Keys[3])
soft_aesenc(&states[1], &keys.AesGenerator4R_Keys[3])
soft_aesdec(&states[2], &keys.AesGenerator4R_Keys[7])
soft_aesenc(&states[3], &keys.AesGenerator4R_Keys[7])
copy(output[outptr:], state[:])
}
}

View file

@ -1,58 +0,0 @@
package randomx
import "golang.org/x/crypto/blake2b"
import (
_ "golang.org/x/crypto/argon2"
_ "unsafe"
)
// see reference configuration.h
// Cache size in KiB. Must be a power of 2.
const RANDOMX_ARGON_MEMORY = 262144
// Number of Argon2d iterations for Cache initialization.
const RANDOMX_ARGON_ITERATIONS = 3
// Number of parallel lanes for Cache initialization.
const RANDOMX_ARGON_LANES = 1
// Argon2d salt
const RANDOMX_ARGON_SALT = "RandomX\x03"
const ArgonSaltSize uint32 = 8 //sizeof("" RANDOMX_ARGON_SALT) - 1
const ArgonBlockSize uint32 = 1024
type argonBlock [128]uint64
const syncPoints = 4
//go:linkname argon2_initHash golang.org/x/crypto/argon2.initHash
func argon2_initHash(password, salt, key, data []byte, time, memory, threads, keyLen uint32, mode int) [blake2b.Size + 8]byte
//go:linkname argon2_initBlocks golang.org/x/crypto/argon2.initBlocks
func argon2_initBlocks(h0 *[blake2b.Size + 8]byte, memory, threads uint32) []argonBlock
//go:linkname argon2_processBlocks golang.org/x/crypto/argon2.processBlocks
func argon2_processBlocks(B []argonBlock, time, memory, threads uint32, mode int)
// argon2_buildBlocks From golang.org/x/crypto/argon2.deriveKey without last deriveKey call
func argon2_buildBlocks(password, salt, secret, data []byte, time, memory uint32, threads uint8, keyLen uint32) []argonBlock {
if time < 1 {
panic("argon2: number of rounds too small")
}
if threads < 1 {
panic("argon2: parallelism degree too low")
}
const mode = 0 /* argon2d */
h0 := argon2_initHash(password, salt, secret, data, time, memory, uint32(threads), keyLen, mode)
memory = memory / (syncPoints * uint32(threads)) * (syncPoints * uint32(threads))
if memory < 2*syncPoints*uint32(threads) {
memory = 2 * syncPoints * uint32(threads)
}
B := argon2_initBlocks(&h0, memory, uint32(threads))
argon2_processBlocks(B, time, memory, uint32(threads), mode)
return B
}

View file

@ -1,14 +0,0 @@
package asm
type RoundingMode uint8
const (
RoundingModeToNearest = RoundingMode(iota)
RoundingModeToNegative
RoundingModeToPositive
RoundingModeToZero
)
func SetRoundingMode(mode RoundingMode) {
setRoundingMode(uint8(mode))
}

View file

@ -1,20 +0,0 @@
//go:build 386
package asm
// stmxcsr reads the MXCSR control and status register.
//
//go:noescape
func stmxcsr(addr *uint32)
// ldmxcsr writes to the MXCSR control and status register.
//
//go:noescape
func ldmxcsr(addr *uint32)
func setRoundingMode(mode uint8) {
var csr uint32
stmxcsr(&csr)
csr = (csr & (^uint32(0x6000))) | ((uint32(mode) & 3) << 13)
ldmxcsr(&csr)
}

View file

@ -1,13 +0,0 @@
#include "textflag.h"
// stmxcsr reads the MXCSR control and status register.
TEXT ·stmxcsr(SB),NOSPLIT|NOFRAME,$0-4
MOVL addr+0(FP), SI
STMXCSR (SI)
RET
// ldmxcsr writes to the MXCSR control and status register.
TEXT ·ldmxcsr(SB),NOSPLIT|NOFRAME,$0-4
MOVL addr+0(FP), SI
LDMXCSR (SI)
RET

View file

@ -1,20 +0,0 @@
//go:build amd64
package asm
// stmxcsr reads the MXCSR control and status register.
//
//go:noescape
func stmxcsr(addr *uint32)
// ldmxcsr writes to the MXCSR control and status register.
//
//go:noescape
func ldmxcsr(addr *uint32)
func setRoundingMode(mode uint8) {
var csr uint32
stmxcsr(&csr)
csr = (csr & (^uint32(0x6000))) | ((uint32(mode) & 3) << 13)
ldmxcsr(&csr)
}

View file

@ -1,13 +0,0 @@
#include "textflag.h"
// stmxcsr reads the MXCSR control and status register.
TEXT ·stmxcsr(SB),NOSPLIT|NOFRAME,$0-8
MOVQ addr+0(FP), SI
STMXCSR (SI)
RET
// ldmxcsr writes to the MXCSR control and status register.
TEXT ·ldmxcsr(SB),NOSPLIT|NOFRAME,$0-8
MOVQ addr+0(FP), SI
LDMXCSR (SI)
RET

View file

@ -1,50 +0,0 @@
package randomx
import (
"encoding/binary"
"golang.org/x/crypto/blake2b"
)
type Blake2Generator struct {
data [64]byte
dataindex int
allocRegIndex [8]int
allocRegisters [8]Register
}
func Init_Blake2Generator(key []byte, nonce uint32) *Blake2Generator {
var b Blake2Generator
b.dataindex = len(b.data)
if len(key) > 60 {
copy(b.data[:], key[0:60])
} else {
copy(b.data[:], key)
}
binary.LittleEndian.PutUint32(b.data[60:], nonce)
return &b
}
func (b *Blake2Generator) checkdata(bytesNeeded int) {
if b.dataindex+bytesNeeded > cap(b.data) {
//blake2b(data, sizeof(data), data, sizeof(data), nullptr, 0);
h := blake2b.Sum512(b.data[:])
copy(b.data[:], h[:])
b.dataindex = 0
}
}
func (b *Blake2Generator) GetByte() byte {
b.checkdata(1)
ret := b.data[b.dataindex]
b.dataindex++
return ret
}
func (b *Blake2Generator) GetUint32() uint32 {
b.checkdata(4)
ret := binary.LittleEndian.Uint32(b.data[b.dataindex:])
b.dataindex += 4
return ret
}

219
cache.go
View file

@ -1,54 +1,83 @@
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/keys"
"errors"
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/argon2"
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/blake2"
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/keys"
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/memory"
"runtime"
"slices"
"unsafe"
)
type MemoryBlock [128]uint64
type MemoryBlock [argon2.BlockSize / 8]uint64
func (m *MemoryBlock) GetLine(addr uint64) *RegisterLine {
addr >>= 3
//[addr : addr+8 : addr+8]
return (*RegisterLine)(unsafe.Add(unsafe.Pointer(m), addr*8))
return (*RegisterLine)(unsafe.Pointer(unsafe.SliceData(m[addr : addr+8 : addr+8])))
}
type Randomx_Cache struct {
Blocks []MemoryBlock
type Cache struct {
blocks *[RANDOMX_ARGON_MEMORY]MemoryBlock
Programs [RANDOMX_PROGRAM_COUNT]SuperScalarProgram
programs [RANDOMX_PROGRAM_COUNT]SuperScalarProgram
JitPrograms [RANDOMX_PROGRAM_COUNT]ProgramFunc
jitPrograms [RANDOMX_PROGRAM_COUNT]SuperScalarProgramFunc
Flags uint64
flags Flags
}
func Randomx_alloc_cache(flags uint64) *Randomx_Cache {
if flags == RANDOMX_FLAG_DEFAULT {
flags = RANDOMX_FLAG_JIT
}
return &Randomx_Cache{
Flags: flags,
// NewCache Creates a randomx_cache structure and allocates memory for RandomX Cache.
// *
// * @param flags is any combination of these 2 flags (each flag can be set or not set):
// * RANDOMX_FLAG_LARGE_PAGES - allocate memory in large pages
// * RANDOMX_FLAG_JIT - create cache structure with JIT compilation support; this makes
// * subsequent Dataset initialization faster
// * Optionally, one of these two flags may be selected:
// * RANDOMX_FLAG_ARGON2_SSSE3 - optimized Argon2 for CPUs with the SSSE3 instruction set
// * makes subsequent cache initialization faster
// * RANDOMX_FLAG_ARGON2_AVX2 - optimized Argon2 for CPUs with the AVX2 instruction set
// * makes subsequent cache initialization faster
// *
// * @return Pointer to an allocated randomx_cache structure.
// * Returns NULL if:
// * (1) memory allocation fails
// * (2) the RANDOMX_FLAG_JIT is set and JIT compilation is not supported on the current platform
// * (3) an invalid or unsupported RANDOMX_FLAG_ARGON2 value is set
// */
func NewCache(flags Flags) (c *Cache, err error) {
var blocks *[RANDOMX_ARGON_MEMORY]MemoryBlock
if flags.Has(RANDOMX_FLAG_LARGE_PAGES) {
if largePageAllocator == nil {
return nil, errors.New("huge pages not supported")
}
blocks, err = memory.Allocate[[RANDOMX_ARGON_MEMORY]MemoryBlock](largePageAllocator)
if err != nil {
return nil, err
}
} else {
blocks, err = memory.Allocate[[RANDOMX_ARGON_MEMORY]MemoryBlock](cacheLineAlignedAllocator)
if err != nil {
return nil, err
}
}
return &Cache{
flags: flags,
blocks: blocks,
}, nil
}
func (cache *Randomx_Cache) HasJIT() bool {
return cache.Flags&RANDOMX_FLAG_JIT > 0 && cache.JitPrograms[0] != nil
func (c *Cache) hasInitializedJIT() bool {
return c.flags.HasJIT() && c.jitPrograms[0] != nil
}
func (cache *Randomx_Cache) VM_Initialize() *VM {
return &VM{
Dataset: &Randomx_DatasetLight{
Cache: cache,
},
}
}
func (cache *Randomx_Cache) Close() error {
for _, p := range cache.JitPrograms {
// Close Releases all memory occupied by the Cache structure.
func (c *Cache) Close() error {
for _, p := range c.jitPrograms {
if p != nil {
err := p.Close()
if err != nil {
@ -56,45 +85,63 @@ func (cache *Randomx_Cache) Close() error {
}
}
}
return nil
if c.flags.Has(RANDOMX_FLAG_LARGE_PAGES) {
return memory.Free(largePageAllocator, c.blocks)
} else {
return memory.Free(cacheLineAlignedAllocator, c.blocks)
}
}
func (cache *Randomx_Cache) Init(key []byte) {
// Lock due to external JIT madness
runtime.LockOSThread()
defer runtime.UnlockOSThread()
// Init Initializes the cache memory and SuperscalarHash using the provided key value.
// Does nothing if called again with the same key value.
func (c *Cache) Init(key []byte) {
//TODO: cache key and do not regenerate
kkey := slices.Clone(key)
argonBlocks := unsafe.Slice((*argon2.Block)(unsafe.Pointer(c.blocks)), len(c.blocks))
argonBlocks := argon2_buildBlocks(kkey, []byte(RANDOMX_ARGON_SALT), []byte{}, []byte{}, RANDOMX_ARGON_ITERATIONS, RANDOMX_ARGON_MEMORY, RANDOMX_ARGON_LANES, 0)
argon2.BuildBlocks(argonBlocks, key, []byte(RANDOMX_ARGON_SALT), RANDOMX_ARGON_ITERATIONS, RANDOMX_ARGON_MEMORY, RANDOMX_ARGON_LANES)
memoryBlocks := unsafe.Slice((*MemoryBlock)(unsafe.Pointer(unsafe.SliceData(argonBlocks))), int(unsafe.Sizeof(argonBlock{}))/int(unsafe.Sizeof(MemoryBlock{}))*len(argonBlocks))
const nonce uint32 = 0
cache.Blocks = memoryBlocks
gen := blake2.New(key, nonce)
for i := range c.programs {
// build a superscalar program
prog := BuildSuperScalarProgram(gen)
nonce := uint32(0) //uint32(len(key))
gen := Init_Blake2Generator(key, nonce)
for i := 0; i < 8; i++ {
cache.Programs[i] = Build_SuperScalar_Program(gen) // build a superscalar program
if cache.Flags&RANDOMX_FLAG_JIT > 0 {
cache.JitPrograms[i] = generateSuperscalarCode(cache.Programs[i])
if c.flags.HasJIT() {
c.jitPrograms[i] = generateSuperscalarCode(prog)
// fallback if can't compile program
if c.jitPrograms[i] == nil {
c.programs[i] = prog
} else if err := memory.PageReadExecute(c.jitPrograms[i]); err != nil {
c.programs[i] = prog
} else {
c.programs[i] = SuperScalarProgram{prog[0]}
}
} else {
c.programs[i] = prog
}
}
}
// GetMixBlock fetch a 64 byte block in uint64 form
func (cache *Randomx_Cache) GetMixBlock(addr uint64) *RegisterLine {
const Mask = CacheSize/CacheLineSize - 1
mask := CacheSize/CacheLineSize - 1
// getMixBlock fetch a 64 byte block in uint64 form
func (c *Cache) getMixBlock(addr uint64) *RegisterLine {
addr = (addr & mask) * CacheLineSize
addr = (addr & Mask) * CacheLineSize
block := addr / 1024
return cache.Blocks[block].GetLine(addr % 1024)
return c.blocks[block].GetLine(addr % 1024)
}
func (cache *Randomx_Cache) InitDatasetItem(rl *RegisterLine, itemNumber uint64) {
func (c *Cache) GetMemory() *[RANDOMX_ARGON_MEMORY]MemoryBlock {
return c.blocks
}
func (c *Cache) initDataset(rl *RegisterLine, itemNumber uint64) {
registerValue := itemNumber
rl[0] = (itemNumber + 1) * keys.SuperScalar_Constants[0]
@ -106,51 +153,45 @@ func (cache *Randomx_Cache) InitDatasetItem(rl *RegisterLine, itemNumber uint64)
rl[6] = rl[0] ^ keys.SuperScalar_Constants[6]
rl[7] = rl[0] ^ keys.SuperScalar_Constants[7]
for i := 0; i < RANDOMX_CACHE_ACCESSES; i++ {
mix := cache.GetMixBlock(registerValue)
program := cache.Programs[i]
executeSuperscalar(program.Program(), rl)
for q := range rl {
rl[q] ^= mix[q]
if c.hasInitializedJIT() {
if c.flags.HasJIT() {
// Lock due to external JIT madness
runtime.LockOSThread()
defer runtime.UnlockOSThread()
}
registerValue = rl[program.AddressRegister()]
for i := 0; i < RANDOMX_CACHE_ACCESSES; i++ {
mix := c.getMixBlock(registerValue)
c.jitPrograms[i].Execute(uintptr(unsafe.Pointer(rl)))
for q := range rl {
rl[q] ^= mix[q]
}
registerValue = rl[c.programs[i].AddressRegister()]
}
} else {
for i := 0; i < RANDOMX_CACHE_ACCESSES; i++ {
mix := c.getMixBlock(registerValue)
program := c.programs[i]
executeSuperscalar(program.Program(), rl)
for q := range rl {
rl[q] ^= mix[q]
}
registerValue = rl[program.AddressRegister()]
}
}
}
func (cache *Randomx_Cache) InitDatasetItemJIT(rl *RegisterLine, itemNumber uint64) {
registerValue := itemNumber
rl[0] = (itemNumber + 1) * keys.SuperScalar_Constants[0]
rl[1] = rl[0] ^ keys.SuperScalar_Constants[1]
rl[2] = rl[0] ^ keys.SuperScalar_Constants[2]
rl[3] = rl[0] ^ keys.SuperScalar_Constants[3]
rl[4] = rl[0] ^ keys.SuperScalar_Constants[4]
rl[5] = rl[0] ^ keys.SuperScalar_Constants[5]
rl[6] = rl[0] ^ keys.SuperScalar_Constants[6]
rl[7] = rl[0] ^ keys.SuperScalar_Constants[7]
for i := 0; i < RANDOMX_CACHE_ACCESSES; i++ {
mix := cache.GetMixBlock(registerValue)
cache.JitPrograms[i].Execute(rl)
for q := range rl {
rl[q] ^= mix[q]
}
registerValue = rl[cache.Programs[i].AddressRegister()]
}
}
func (cache *Randomx_Cache) initDataset(dataset []RegisterLine, startItem, endItem uint64) {
panic("todo")
func (c *Cache) datasetInit(dataset []RegisterLine, startItem, endItem uint64) {
for itemNumber := startItem; itemNumber < endItem; itemNumber, dataset = itemNumber+1, dataset[1:] {
cache.InitDatasetItem(&dataset[0], itemNumber)
c.initDataset(&dataset[0], itemNumber)
}
}

101
cache_test.go Normal file
View file

@ -0,0 +1,101 @@
package randomx
import "testing"
// Test_Cache_Init checks known 64-bit words of the Argon2-filled cache
// memory after initializing with a reference test key.
func Test_Cache_Init(t *testing.T) {
	t.Parallel()

	c, err := NewCache(GetFlags())
	if err != nil {
		t.Fatal(err)
	}
	defer c.Close()

	c.Init(Tests[1].key)
	mem := c.GetMemory()

	cases := []struct {
		index int
		value uint64
	}{
		{0, 0x191e0e1d23c02186},
		{1568413, 0xf1b62fe6210bf8b1},
		{33554431, 0x1f47f056d05cd99b},
	}

	for i, tc := range cases {
		// Each memory block holds 128 uint64 words.
		got := mem[tc.index/128][tc.index%128]
		if got != tc.value {
			t.Errorf("i=%d, index=%d", i, tc.index)
			t.Errorf("expected=%016x, actual=%016x", tc.value, got)
		}
	}
}
// Test_Cache_InitDataset checks the first word of selected dataset items
// against reference values, once via the interpreter (JIT disabled) and once
// via the superscalar JIT compiler (skipped where unsupported).
func Test_Cache_InitDataset(t *testing.T) {
	t.Parallel()
	// Reference dataset item values; index is the dataset item number.
	var tests = []struct {
		index int
		value uint64
	}{
		{0, 0x680588a85ae222db},
		{10000000, 0x7943a1f6186ffb72},
		{20000000, 0x9035244d718095e1},
		{30000000, 0x145a5091f7853099},
	}
	t.Run("interpreter", func(t *testing.T) {
		t.Parallel()
		flags := GetFlags()
		// Force the interpreter path by clearing the JIT flag.
		flags &^= RANDOMX_FLAG_JIT
		cache, err := NewCache(flags)
		if err != nil {
			t.Fatal(err)
		}
		defer cache.Close()
		cache.Init(Tests[1].key)
		var datasetItem RegisterLine
		for i, tt := range tests {
			cache.initDataset(&datasetItem, uint64(tt.index))
			// Only the first register of the item is checked here.
			if datasetItem[0] != tt.value {
				t.Errorf("i=%d, index=%d", i, tt.index)
				t.Errorf("expected=%016x, actual=%016x", tt.value, datasetItem[0])
			}
		}
	})
	t.Run("compiler", func(t *testing.T) {
		t.Parallel()
		flags := GetFlags()
		flags |= RANDOMX_FLAG_JIT
		// HasJIT also checks platform support, not just the flag.
		if !flags.HasJIT() {
			t.Skip("not supported on this platform")
		}
		cache, err := NewCache(flags)
		if err != nil {
			t.Fatal(err)
		}
		defer cache.Close()
		cache.Init(Tests[1].key)
		// The JIT may fail to compile at Init time; fall back to skipping.
		if !cache.hasInitializedJIT() {
			t.Skip("not supported on this platform")
		}
		var datasetItem RegisterLine
		for i, tt := range tests {
			cache.initDataset(&datasetItem, uint64(tt.index))
			if datasetItem[0] != tt.value {
				t.Errorf("i=%d, index=%d", i, tt.index)
				t.Errorf("expected=%016x, actual=%016x", tt.value, datasetItem[0])
			}
		}
	})
}

15
commitment.go Normal file
View file

@ -0,0 +1,15 @@
package randomx
import "golang.org/x/crypto/blake2b"
// CalculateCommitment Calculate a RandomX commitment from a RandomX hash and its input.
//
// The commitment is the BLAKE2b-256 digest of the original input followed by
// the RandomX hash of that input. hashIn and hashOut may alias the same array,
// since hashIn is fully consumed before hashOut is written.
func CalculateCommitment(input []byte, hashIn, hashOut *[RANDOMX_HASH_SIZE]byte) {
	h, err := blake2b.New(RANDOMX_HASH_SIZE, nil)
	if err != nil {
		// blake2b.New only errors on an invalid digest size or key.
		panic(err)
	}
	_, _ = h.Write(input)
	_, _ = h.Write(hashIn[:])
	// Sum appends into hashOut's backing array via the zero-length slice.
	h.Sum(hashOut[:0])
}

41
commitment_test.go Normal file
View file

@ -0,0 +1,41 @@
package randomx
import (
"encoding/hex"
"testing"
)
// Test_CalculateCommitment checks a RandomX v2 commitment against a
// reference hex vector, computing the RandomX hash first and then the
// commitment over the same buffer (hashIn and hashOut alias here).
func Test_CalculateCommitment(t *testing.T) {
	t.Parallel()
	cache, err := NewCache(GetFlags())
	if err != nil {
		t.Fatal(err)
	}
	defer cache.Close()
	test := Tests[1]
	cache.Init(test.key)
	// Light mode: no dataset is passed to the VM.
	vm, err := NewVM(GetFlags(), cache, nil)
	if err != nil {
		t.Fatal(err)
	}
	defer vm.Close()
	var outputHash [RANDOMX_HASH_SIZE]byte
	vm.CalculateHash(test.input, &outputHash)
	// Commitment is computed in place over the hash output buffer.
	CalculateCommitment(test.input, &outputHash, &outputHash)
	outputHex := hex.EncodeToString(outputHash[:])
	expected := "d53ccf348b75291b7be76f0a7ac8208bbced734b912f6fca60539ab6f86be919"
	if expected != outputHex {
		t.Errorf("key=%v, input=%v", test.key, test.input)
		t.Errorf("expected=%s, actual=%s", expected, outputHex)
		t.FailNow()
	}
}

View file

@ -29,6 +29,25 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/argon2"
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/memory"
)
// see reference configuration.h
// Cache size in KiB. Must be a power of 2.
const RANDOMX_ARGON_MEMORY = 262144
// Number of Argon2d iterations for Cache initialization.
const RANDOMX_ARGON_ITERATIONS = 3
// Number of parallel lanes for Cache initialization.
const RANDOMX_ARGON_LANES = 1
// Argon2d salt
const RANDOMX_ARGON_SALT = "RandomX\x03"
const ArgonSaltSize uint32 = 8 //sizeof("" RANDOMX_ARGON_SALT) - 1
// Number of random Cache accesses per Dataset item. Minimum is 2.
const RANDOMX_CACHE_ACCESSES = 8
@ -65,7 +84,9 @@ const RANDOMX_JUMP_BITS = 8
// Jump condition mask offset in bits. The sum of RANDOMX_JUMP_BITS and RANDOMX_JUMP_OFFSET must not exceed 16.
const RANDOMX_JUMP_OFFSET = 8
const DATASETEXTRAITEMS = RANDOMX_DATASET_EXTRA_SIZE / RANDOMX_DATASET_ITEM_SIZE
const RANDOMX_HASH_SIZE = 32
const DatasetExtraItems = RANDOMX_DATASET_EXTRA_SIZE / RANDOMX_DATASET_ITEM_SIZE
const SuperscalarMaxSize = 3*RANDOMX_SUPERSCALAR_LATENCY + 2
const RANDOMX_DATASET_ITEM_SIZE uint64 = 64
@ -74,7 +95,7 @@ const ScratchpadSize uint32 = RANDOMX_SCRATCHPAD_L3
const CacheLineAlignMask = (RANDOMX_DATASET_BASE_SIZE - 1) & (^(CacheLineSize - 1))
const CacheSize uint64 = RANDOMX_ARGON_MEMORY * uint64(ArgonBlockSize)
const CacheSize uint64 = RANDOMX_ARGON_MEMORY * uint64(argon2.BlockSize)
const ScratchpadL1 = RANDOMX_SCRATCHPAD_L1 / 8
const ScratchpadL2 = RANDOMX_SCRATCHPAD_L2 / 8
@ -87,25 +108,13 @@ const ScratchpadL3Mask = (ScratchpadL3 - 1) * 8
const ScratchpadL3Mask64 = (ScratchpadL3/8 - 1) * 64
const CONDITIONOFFSET = RANDOMX_JUMP_OFFSET
const CONDITIONMASK = ((1 << RANDOMX_JUMP_BITS) - 1)
const CONDITIONMASK = (1 << RANDOMX_JUMP_BITS) - 1
const STOREL3CONDITION = 14
const REGISTERSCOUNT = 8
const REGISTERCOUNTFLT = 4
const mantissaSize = 52
const exponentSize = 11
const mantissaMask = (uint64(1) << mantissaSize) - 1
const exponentMask = (uint64(1) << exponentSize) - 1
const exponentBias = 1023
const dynamicExponentBits = 4
const staticExponentBits = 4
const constExponentBits uint64 = 0x300
const dynamicMantissaMask = (uint64(1) << (mantissaSize + dynamicExponentBits)) - 1
const RANDOMX_FLAG_DEFAULT = uint64(0)
const RANDOMX_FLAG_JIT = uint64(1 << iota)
// isZeroOrPowerOf2 reports whether x is zero or an exact power of two.
// A power of two has a single set bit, so clearing the lowest set bit
// (x & (x-1)) leaves zero; zero trivially satisfies the same test.
func isZeroOrPowerOf2(x uint32) bool {
	return (x & (x - 1)) == 0
}
// Package-level allocators shared by Cache, Dataset and JIT code paths.
// largePageAllocator may be nil on platforms without large-page support —
// callers check for nil before use (see NewDataset).
var largePageAllocator = memory.NewLargePageAllocator()

// pageAllocator provides page-granular allocations for JIT program buffers.
var pageAllocator = memory.NewPageAllocator()

// cacheLineAlignedAllocator guarantees CacheLineSize alignment for
// cache/dataset memory so SIMD fetch/write paths stay aligned.
var cacheLineAlignedAllocator = memory.NewAlignedAllocator(CacheLineSize)

View file

@ -1,7 +1,111 @@
package randomx
type Randomx_Dataset interface {
InitDataset(startItem, endItem uint64)
ReadDataset(address uint64, r, cache *RegisterLine)
PrefetchDataset(address uint64)
import (
"errors"
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/memory"
"sync"
)
const DatasetSize = RANDOMX_DATASET_BASE_SIZE + RANDOMX_DATASET_EXTRA_SIZE
const DatasetItemCount = DatasetSize / CacheLineSize
// Dataset holds the fully expanded RandomX dataset used in full (fast) mode.
type Dataset struct {
	// memory is the cache-line-aligned backing store, one RegisterLine per
	// dataset item (DatasetItemCount entries).
	memory []RegisterLine
	// flags records allocation options so Close frees with the same allocator.
	flags Flags
}
// NewDataset Creates a randomx_dataset structure and allocates memory for RandomX Dataset.
// Only one flag is supported (can be set or not set): RANDOMX_FLAG_LARGE_PAGES - allocate memory in large pages
// Returns nil if allocation fails
func NewDataset(flags Flags) (result *Dataset, err error) {
	defer func() {
		//catch too large memory allocation or unable to allocate, for example on 32-bit targets or out of memory
		if r := recover(); r != nil {
			// Convert the panic into the function's error return; named
			// results let the deferred closure override both values.
			result = nil
			if e, ok := r.(error); ok && e != nil {
				err = e
			} else {
				err = errors.New("out of memory")
			}
		}
	}()

	var alignedMemory []RegisterLine
	if flags.Has(RANDOMX_FLAG_LARGE_PAGES) {
		// largePageAllocator is nil on platforms without large-page support.
		if largePageAllocator == nil {
			return nil, errors.New("huge pages not supported")
		}
		alignedMemory, err = memory.AllocateSlice[RegisterLine](largePageAllocator, DatasetItemCount)
		if err != nil {
			return nil, err
		}
	} else {
		alignedMemory, err = memory.AllocateSlice[RegisterLine](cacheLineAlignedAllocator, DatasetItemCount)
		if err != nil {
			return nil, err
		}
	}

	return &Dataset{
		memory: alignedMemory,
		flags:  flags,
	}, nil
}
// prefetchDataset is a memory prefetch hint for the item at the given byte
// address. It is a no-op in this implementation — presumably a placeholder
// matching the upstream randomx_prefetch API; confirm against the VM caller.
func (d *Dataset) prefetchDataset(address uint64) {

}
// readDataset XORs the dataset cache line located at the given byte address
// into the caller's register line r.
func (d *Dataset) readDataset(address uint64, r *RegisterLine) {
	line := &d.memory[address/CacheLineSize]
	for i := range r {
		r[i] ^= line[i]
	}
}
// Memory Returns a pointer to the internal memory buffer of the dataset structure.
// The size of the internal memory buffer is DatasetItemCount * RANDOMX_DATASET_ITEM_SIZE.
// The returned slice aliases the dataset's backing store; it is invalidated by Close.
func (d *Dataset) Memory() []RegisterLine {
	return d.memory
}
// InitDataset fills itemCount dataset items starting at startItem using the
// provided (already initialized) cache. Panics if the requested range does
// not fit inside the dataset.
func (d *Dataset) InitDataset(cache *Cache, startItem, itemCount uint64) {
	endItem := startItem + itemCount
	if startItem >= DatasetItemCount || itemCount > DatasetItemCount || endItem > DatasetItemCount {
		panic("out of range")
	}

	cache.datasetInit(d.memory[startItem:endItem], startItem, endItem)
}
// Close releases the memory backing the dataset, using the same allocator
// that NewDataset selected for this flag set. The dataset must not be used
// after Close.
func (d *Dataset) Close() error {
	if d.flags.Has(RANDOMX_FLAG_LARGE_PAGES) {
		return memory.FreeSlice(largePageAllocator, d.memory)
	}
	return memory.FreeSlice(cacheLineAlignedAllocator, d.memory)
}
// InitDatasetParallel fills the whole dataset using n goroutines (including
// the calling one). Item ranges are disjoint, so no synchronization beyond
// the WaitGroup is needed.
func (d *Dataset) InitDatasetParallel(cache *Cache, n int) {
	n = max(1, n)
	var wg sync.WaitGroup
	// Workers 1..n-1 each take the slice [count*i/n, count*(i+1)/n);
	// integer division spreads any remainder across the ranges.
	for i := uint64(1); i < uint64(n); i++ {
		a := (DatasetItemCount * i) / uint64(n)
		b := (DatasetItemCount * (i + 1)) / uint64(n)

		wg.Add(1)
		go func(a, b uint64) {
			defer wg.Done()
			d.InitDataset(cache, a, b-a)
		}(a, b)
	}

	// The calling goroutine handles the first range [0, count/n).
	d.InitDataset(cache, 0, DatasetItemCount/uint64(n))
	wg.Wait()
}

View file

@ -1,26 +0,0 @@
package randomx
type Randomx_DatasetLight struct {
Cache *Randomx_Cache
Memory []uint64
}
func (d *Randomx_DatasetLight) PrefetchDataset(address uint64) {
}
func (d *Randomx_DatasetLight) ReadDataset(address uint64, r, cache *RegisterLine) {
if d.Cache.HasJIT() {
d.Cache.InitDatasetItemJIT(cache, address/CacheLineSize)
} else {
d.Cache.InitDatasetItem(cache, address/CacheLineSize)
}
for i := range r {
r[i] ^= cache[i]
}
}
func (d *Randomx_DatasetLight) InitDataset(startItem, endItem uint64) {
//d.Cache.initDataset(d.Cache.Programs)
}

14
exec.go
View file

@ -1,3 +1,15 @@
package randomx
type ProgramFunc []byte
import "git.gammaspectra.live/P2Pool/go-randomx/v3/internal/memory"
// SuperScalarProgramFunc is page-allocated executable machine code for a
// compiled SuperscalarHash program.
type SuperScalarProgramFunc []byte

// VMProgramFunc is page-allocated executable machine code for a compiled
// VM program.
type VMProgramFunc []byte

// Close releases the executable pages backing the program.
func (f SuperScalarProgramFunc) Close() error {
	return memory.FreeSlice(pageAllocator, f)
}

// Close releases the executable pages backing the program.
func (f VMProgramFunc) Close() error {
	return memory.FreeSlice(pageAllocator, f)
}

View file

@ -1,11 +0,0 @@
//go:build !unix || disable_jit
package randomx
func (f ProgramFunc) Execute(rl *RegisterLine) {
}
func (f ProgramFunc) Close() error {
}

View file

@ -1,50 +0,0 @@
//go:build unix && !disable_jit
package randomx
import (
"golang.org/x/sys/unix"
"unsafe"
)
func (f ProgramFunc) Execute(rl *RegisterLine) {
if f == nil {
panic("program is nil")
}
memoryPtr := &f
fun := *(*func(rl *RegisterLine))(unsafe.Pointer(&memoryPtr))
fun(rl)
}
func (f ProgramFunc) Close() error {
return unix.Munmap(f)
}
func mapProgram(program []byte) ProgramFunc {
// Write only
execFunc, err := unix.Mmap(-1, 0, len(program), unix.PROT_WRITE, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS)
if err != nil {
panic(err)
}
// Introduce machine code into the memory region
copy(execFunc, program)
// uphold W^X
// Read and Exec only
err = unix.Mprotect(execFunc, unix.PROT_READ|unix.PROT_EXEC)
if err != nil {
defer func() {
// unmap if we err
err := unix.Munmap(execFunc)
if err != nil {
panic(err)
}
}()
panic(err)
}
return execFunc
}

68
flags.go Normal file
View file

@ -0,0 +1,68 @@
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/aes"
"golang.org/x/sys/cpu"
"runtime"
)
// Flags is a bit set of RandomX configuration options (RANDOMX_FLAG_*).
type Flags uint64

// Has reports whether every bit in flags is set in f.
func (f Flags) Has(flags Flags) bool {
	return f&flags == flags
}

// HasJIT reports whether the JIT flag is set and the current platform
// actually supports JIT compilation (supportsJIT is platform-defined).
func (f Flags) HasJIT() bool {
	return f.Has(RANDOMX_FLAG_JIT) && supportsJIT
}
const RANDOMX_FLAG_DEFAULT Flags = 0
const (
// RANDOMX_FLAG_LARGE_PAGES Select large page allocation for dataset
RANDOMX_FLAG_LARGE_PAGES = Flags(1 << iota)
// RANDOMX_FLAG_HARD_AES Selects between hardware or software AES
RANDOMX_FLAG_HARD_AES
// RANDOMX_FLAG_FULL_MEM Selects between full or light mode dataset
RANDOMX_FLAG_FULL_MEM
// RANDOMX_FLAG_JIT Enables JIT features
RANDOMX_FLAG_JIT
// RANDOMX_FLAG_SECURE Enables W^X for JIT code
RANDOMX_FLAG_SECURE
RANDOMX_FLAG_ARGON2_SSSE3
RANDOMX_FLAG_ARGON2_AVX2
RANDOMX_FLAG_ARGON2 = RANDOMX_FLAG_ARGON2_AVX2 | RANDOMX_FLAG_ARGON2_SSSE3
)
// GetFlags The recommended flags to be used on the current machine.
// Does not include:
// * RANDOMX_FLAG_LARGE_PAGES
// * RANDOMX_FLAG_FULL_MEM
// * RANDOMX_FLAG_SECURE
// These flags must be added manually if desired.
//
// On OpenBSD RANDOMX_FLAG_SECURE is enabled by default in JIT mode as W^X is enforced by the OS.
func GetFlags() Flags {
	f := RANDOMX_FLAG_DEFAULT

	if runtime.GOARCH == "amd64" {
		f |= RANDOMX_FLAG_JIT
		if aes.HasHardAESImplementation && cpu.X86.HasAES {
			f |= RANDOMX_FLAG_HARD_AES
		}
		if cpu.X86.HasSSSE3 {
			f |= RANDOMX_FLAG_ARGON2_SSSE3
		}
		if cpu.X86.HasAVX2 {
			f |= RANDOMX_FLAG_ARGON2_AVX2
		}
	}

	// Platforms that enforce W^X need SECURE mode for JIT code pages.
	switch runtime.GOOS {
	case "openbsd", "netbsd":
		f |= RANDOMX_FLAG_SECURE
	case "darwin", "ios":
		if runtime.GOARCH == "arm64" {
			f |= RANDOMX_FLAG_SECURE
		}
	}

	return f
}

4
go.mod
View file

@ -1,7 +1,9 @@
module git.gammaspectra.live/P2Pool/go-randomx/v2
module git.gammaspectra.live/P2Pool/go-randomx/v3
go 1.21
require golang.org/x/crypto v0.22.0
require golang.org/x/sys v0.19.0
require git.gammaspectra.live/P2Pool/softfloat64 v1.0.0

2
go.sum
View file

@ -1,3 +1,5 @@
git.gammaspectra.live/P2Pool/softfloat64 v1.0.0 h1:XqxDpowntpV8gvBzG9bMC8VVzxZJT/YEk7BfwmaCamU=
git.gammaspectra.live/P2Pool/softfloat64 v1.0.0/go.mod h1:ZhnGqXOS6F6aJpiiT38Cvk5eHoBNqjkKfp3w3AcnomA=
golang.org/x/crypto v0.22.0 h1:g1v0xeRhjcugydODzvb3mEM9SQ0HGp9s/nh3COQ/C30=
golang.org/x/crypto v0.22.0/go.mod h1:vr6Su+7cTlO45qkww3VDJlzDn0ctJvRgYbC2NvXHt+M=
golang.org/x/sys v0.19.0 h1:q5f1RH2jigJ1MoAWp2KTp3gm5zAGFUTarQZ5U386+4o=

127
internal/aes/const.go Normal file
View file

@ -0,0 +1,127 @@
// Copyright 2009 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package aes implements AES encryption (formerly Rijndael), as defined in
// U.S. Federal Information Processing Standards Publication 197.
//
// The AES operations in this package are not implemented using constant-time algorithms.
// An exception is when running on systems with enabled hardware support for AES
// that makes these operations constant-time. Examples include amd64 systems using AES-NI
// extensions and s390x systems using Message-Security-Assist extensions.
// On such systems, when the result of NewCipher is passed to cipher.NewGCM,
// the GHASH operation used by GCM is also constant-time.
package aes
import (
"bytes"
"math/bits"
)
// This file generates AES constants - 8720 bytes of initialized data.
// https://csrc.nist.gov/publications/fips/fips197/fips-197.pdf
// AES is based on the mathematical behavior of binary polynomials
// (polynomials over GF(2)) modulo the irreducible polynomial x⁸ + x⁴ + x³ + x + 1.
// Addition of these binary polynomials corresponds to binary xor.
// Reducing mod poly corresponds to binary xor with poly every
// time a 0x100 bit appears.
const poly = 1<<8 | 1<<4 | 1<<3 | 1<<1 | 1<<0 // x⁸ + x⁴ + x³ + x + 1

// mul multiplies b and c as GF(2) polynomials modulo poly
// (carry-less multiply with reduction, as used by AES).
func mul(b, c uint32) uint32 {
	var product uint32
	a, rem := b, c
	for bit := uint32(1); bit < 0x100 && rem != 0; bit <<= 1 {
		// Invariant: bit == 1<<n, a == b * xⁿ (mod poly)
		if rem&bit != 0 {
			product ^= a // addition in GF(2) is xor
			rem ^= bit   // clear the bit so the loop can end early
		}
		// a *= x in GF(2), reducing modulo the polynomial when a
		// 0x100 bit appears
		a <<= 1
		if a&0x100 != 0 {
			a ^= poly
		}
	}
	return product
}
// sbox0 FIPS-197 Figure 7. S-box substitution values generation
//
// p walks all 255 non-zero field elements by repeated multiplication by 3
// (a generator of GF(2⁸)*), while q tracks its multiplicative inverse by
// repeated division by 3, so sbox[p] can be computed from the affine
// transformation of q without a full inverse table.
var sbox0 = func() (sbox [256]byte) {
	var p, q uint8 = 1, 1
	for {
		/* multiply p by 3 */
		if p&0x80 != 0 {
			// reduce modulo the AES polynomial (0x11b, low byte 0x1b)
			p ^= (p << 1) ^ 0x1b
		} else {
			p ^= p << 1
		}

		/* divide q by 3 (equals multiplication by 0xf6) */
		q ^= q << 1
		q ^= q << 2
		q ^= q << 4
		if q&0x80 != 0 {
			q ^= 0x09
		}

		/* compute the affine transformation */
		xformed := q ^ bits.RotateLeft8(q, 1) ^ bits.RotateLeft8(q, 2) ^ bits.RotateLeft8(q, 3) ^ bits.RotateLeft8(q, 4)

		sbox[p] = xformed ^ 0x63

		// p returns to 1 after cycling through every non-zero element.
		if p == 1 {
			break
		}
	}

	/* 0 is a special case since it has no inverse */
	sbox[0] = 0x63
	return sbox
}()
// sbox1 FIPS-197 Figure 14. Inverse S-box substitution values generation
//
// Built by inverting sbox0: isbox[i] is the position of byte value i
// inside sbox0 (sbox0 is a permutation, so every value is found).
var sbox1 = func() (isbox [256]byte) {
	for i := 0; i < len(isbox); i++ {
		isbox[i] = uint8(bytes.IndexByte(sbox0[:], uint8(i)))
	}
	return isbox
}()
// encLut Lookup tables for encryption.
//
// te[j][i] is the combined SubBytes+MixColumns contribution of input byte i
// in column position j; the four tables are byte rotations of each other.
var encLut = func() (te [4][256]uint32) {
	for i := 0; i < 256; i++ {
		s := uint32(sbox0[i])
		s2 := mul(s, 2)
		s3 := mul(s, 3)
		// MixColumns row (2, 1, 1, 3) packed into one word.
		w := s2<<24 | s<<16 | s<<8 | s3
		for j := 0; j < 4; j++ {
			// Stored little-endian so table lookups match the in-memory
			// state layout used by the soft AES round functions.
			te[j][i] = bits.ReverseBytes32(w)
			w = w<<24 | w>>8
		}
	}
	return te
}()
// decLut Lookup tables for decryption.
//
// td[j][i] is the combined InvSubBytes+InvMixColumns contribution of input
// byte i in column position j; the four tables are byte rotations of each other.
var decLut = func() (td [4][256]uint32) {
	for i := 0; i < 256; i++ {
		s := uint32(sbox1[i])
		s9 := mul(s, 0x9)
		sb := mul(s, 0xb)
		sd := mul(s, 0xd)
		se := mul(s, 0xe)
		// InvMixColumns row (e, 9, d, b) packed into one word.
		w := se<<24 | s9<<16 | sd<<8 | sb
		for j := 0; j < 4; j++ {
			// Little-endian storage to match the soft AES state layout.
			td[j][i] = bits.ReverseBytes32(w)
			w = w<<24 | w>>8
		}
	}
	return td
}()

View file

@ -0,0 +1,69 @@
//go:build amd64 && !purego
package aes
import (
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/asm"
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/keys"
"golang.org/x/sys/cpu"
"runtime"
"unsafe"
)
// HasHardAESImplementation reports at compile time that this build (amd64,
// not purego) ships a hardware AES implementation.
const HasHardAESImplementation = true

// hardAES implements the AES interface using AES-NI via hand-written assembly.
type hardAES struct {
}

// NewHardAES returns the hardware AES implementation, or nil when the CPU
// lacks the AES-NI instruction set at runtime.
func NewHardAES() AES {
	if cpu.X86.HasAES {
		return hardAES{}
	}
	return nil
}
// HashAes1Rx4 computes the 512-bit AES hash of input into output using the
// assembly AES-NI routine. len(input) must be a multiple of 64.
// NOTE(review): the asm loop body runs before its length check — assumes
// len(input) > 0; confirm against callers.
func (aes hardAES) HashAes1Rx4(input []byte, output *[64]byte) {
	if len(input)%len(output) != 0 {
		panic("unsupported")
	}

	asm.HashAes1Rx4(&keys.AesHash1R_State, &keys.AesHash1R_XKeys, output, unsafe.SliceData(input), uint64(len(input)))
}
// FillAes1Rx4 fills output with pseudorandom data derived from state using
// one AES round per 16 bytes, writing the final state back into state.
// len(output) must be a multiple of 64.
func (aes hardAES) FillAes1Rx4(state *[64]byte, output []byte) {
	if len(output)%len(state) != 0 {
		panic("unsupported")
	}

	// Reference to state without copying
	states := (*[4][4]uint32)(unsafe.Pointer(state))

	asm.FillAes1Rx4(states, &keys.AesGenerator1R_Keys, unsafe.SliceData(output), uint64(len(output)))
	// Keep state alive across the unsafe.Pointer round-trip into assembly.
	runtime.KeepAlive(state)
}
// FillAes4Rx4 fills output with pseudorandom data using four AES rounds per
// 16 bytes of output. state is passed by value, so the caller's copy is not
// modified. len(output) must be a multiple of 64.
func (aes hardAES) FillAes4Rx4(state [64]byte, output []byte) {
	if len(output)%len(state) != 0 {
		panic("unsupported")
	}

	// state is copied on caller

	// Copy state
	states := (*[4][4]uint32)(unsafe.Pointer(&state))

	for outptr := 0; outptr < len(output); outptr += len(state) {
		// Four chained round trips with the fixed generator keys.
		asm.AESRoundTrip_DecEnc(states, &fillAes4Rx4Keys0)
		asm.AESRoundTrip_DecEnc(states, &fillAes4Rx4Keys1)
		asm.AESRoundTrip_DecEnc(states, &fillAes4Rx4Keys2)
		asm.AESRoundTrip_DecEnc(states, &fillAes4Rx4Keys3)
		copy(output[outptr:], state[:])
	}
}
// HashAndFillAes1Rx4 hashes the scratchpad into output and refills the
// scratchpad from fillState. Currently implemented as two separate passes
// rather than a fused single sweep.
func (aes hardAES) HashAndFillAes1Rx4(scratchpad []byte, output *[64]byte, fillState *[64]byte) {
	//TODO: fuse hash and fill into a single pass over the scratchpad
	aes.HashAes1Rx4(scratchpad, output)
	aes.FillAes1Rx4(fillState, scratchpad)
}

View file

@ -0,0 +1,9 @@
//go:build !amd64 || purego
package aes
// HasHardAESImplementation reports at compile time that this build
// (non-amd64 or purego) has no hardware AES implementation.
const HasHardAESImplementation = false

// NewHardAES always returns nil on this build; callers fall back to soft AES.
func NewHardAES() AES {
	return nil
}

59
internal/aes/hash.go Normal file
View file

@ -0,0 +1,59 @@
/*
Copyright (c) 2019 DERO Foundation. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package aes
import (
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/keys"
)
// fillAes4Rx4Keys0..3 are the fixed round-key sets for FillAes4Rx4.
// Each set feeds a four-lane round trip: lanes 0-1 use generator keys 0-3
// and lanes 2-3 use generator keys 4-7, one key index per round.
var fillAes4Rx4Keys0 = [4][4]uint32{
	keys.AesGenerator4R_Keys[0],
	keys.AesGenerator4R_Keys[0],
	keys.AesGenerator4R_Keys[4],
	keys.AesGenerator4R_Keys[4],
}

var fillAes4Rx4Keys1 = [4][4]uint32{
	keys.AesGenerator4R_Keys[1],
	keys.AesGenerator4R_Keys[1],
	keys.AesGenerator4R_Keys[5],
	keys.AesGenerator4R_Keys[5],
}

var fillAes4Rx4Keys2 = [4][4]uint32{
	keys.AesGenerator4R_Keys[2],
	keys.AesGenerator4R_Keys[2],
	keys.AesGenerator4R_Keys[6],
	keys.AesGenerator4R_Keys[6],
}

var fillAes4Rx4Keys3 = [4][4]uint32{
	keys.AesGenerator4R_Keys[3],
	keys.AesGenerator4R_Keys[3],
	keys.AesGenerator4R_Keys[7],
	keys.AesGenerator4R_Keys[7],
}

38
internal/aes/impl.go Normal file
View file

@ -0,0 +1,38 @@
package aes
// AES is the common interface of the hardware (AES-NI) and software AES
// implementations used by the VM; select one via NewHardAES / NewSoftAES.
type AES interface {
	// HashAes1Rx4
	//
	// Calculate a 512-bit hash of 'input' using 4 lanes of AES.
	// The input is treated as a set of round keys for the encryption
	// of the initial state.
	//
	// 'input' size must be a multiple of 64.
	//
	// For a 2 MiB input, this has the same security as 32768-round
	// AES encryption.
	//
	// Hashing throughput: >20 GiB/s per CPU core with hardware AES
	HashAes1Rx4(input []byte, output *[64]byte)

	// FillAes1Rx4
	//
	// Fill 'output' with pseudorandom data based on 512-bit 'state'.
	// The state is encrypted using a single AES round per 16 bytes of output
	// in 4 lanes.
	//
	// 'output' size must be a multiple of 64.
	//
	// The modified state is written back to 'state' to allow multiple
	// calls to this function.
	FillAes1Rx4(state *[64]byte, output []byte)

	// HashAndFillAes1Rx4 Hashes and fills scratchpad and output in one sweep
	HashAndFillAes1Rx4(scratchpad []byte, output *[64]byte, fillState *[64]byte)

	// FillAes4Rx4 used to generate final program
	//
	// 'state' is copied when calling
	FillAes4Rx4(state [64]byte, output []byte)
}

View file

@ -29,3 +29,24 @@ func soft_aesdec(state *[4]uint32, key *[4]uint32) {
state[2] = key[2] ^ td0[uint8(s2)] ^ td1[uint8(s1>>8)] ^ td2[uint8(s0>>16)] ^ td3[uint8(s3>>24)]
state[3] = key[3] ^ td0[uint8(s3)] ^ td1[uint8(s2>>8)] ^ td2[uint8(s1>>16)] ^ td3[uint8(s0>>24)]
}
// soft_aesroundtrip_decenc applies one AES round to each of the four lanes,
// alternating decrypt/encrypt per lane with a per-lane key (generator pattern).
func soft_aesroundtrip_decenc(states *[4][4]uint32, keys *[4][4]uint32) {
	soft_aesdec(&states[0], &keys[0])
	soft_aesenc(&states[1], &keys[1])
	soft_aesdec(&states[2], &keys[2])
	soft_aesenc(&states[3], &keys[3])
}

// soft_aesroundtrip_encdec is the mirror pattern of decenc: encrypt/decrypt
// alternation per lane with a per-lane key (hash absorb pattern).
func soft_aesroundtrip_encdec(states *[4][4]uint32, keys *[4][4]uint32) {
	soft_aesenc(&states[0], &keys[0])
	soft_aesdec(&states[1], &keys[1])
	soft_aesenc(&states[2], &keys[2])
	soft_aesdec(&states[3], &keys[3])
}

// soft_aesroundtrip_encdec1 applies the encdec alternation with a single
// shared key across all four lanes (hash finalization pattern).
func soft_aesroundtrip_encdec1(states *[4][4]uint32, key *[4]uint32) {
	soft_aesenc(&states[0], key)
	soft_aesdec(&states[1], key)
	soft_aesenc(&states[2], key)
	soft_aesdec(&states[3], key)
}

75
internal/aes/soft.go Normal file
View file

@ -0,0 +1,75 @@
package aes
import (
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/keys"
"runtime"
"unsafe"
)
// softAES implements the AES interface in pure Go using lookup tables;
// used when no hardware AES is available.
type softAES struct {
}

// NewSoftAES returns the software AES implementation (always available).
func NewSoftAES() AES {
	return softAES{}
}
// HashAes1Rx4 computes the 512-bit AES hash of input into output in pure Go.
// The output buffer doubles as the working state (reinterpreted as 4x4
// uint32 lanes via unsafe), so no extra allocation is needed.
// len(input) must be a multiple of 64.
func (aes softAES) HashAes1Rx4(input []byte, output *[64]byte) {
	if len(input)%len(output) != 0 {
		panic("unsupported")
	}

	// states are copied
	states := (*[4][4]uint32)(unsafe.Pointer(output))
	*states = keys.AesHash1R_State

	// Absorb each 64-byte chunk of input as round keys.
	for input_ptr := 0; input_ptr < len(input); input_ptr += 64 {
		in := (*[4][4]uint32)(unsafe.Pointer(unsafe.SliceData(input[input_ptr:])))

		soft_aesroundtrip_encdec(states, in)
	}

	// Two finalization rounds with the fixed extra keys.
	soft_aesroundtrip_encdec1(states, &keys.AesHash1R_XKeys[0])

	soft_aesroundtrip_encdec1(states, &keys.AesHash1R_XKeys[1])
	runtime.KeepAlive(output)
}
// FillAes1Rx4 fills output with pseudorandom data derived from state using
// one AES round trip per 64 bytes; the evolving state is written back into
// state for chained calls. len(output) must be a multiple of 64.
func (aes softAES) FillAes1Rx4(state *[64]byte, output []byte) {
	if len(output)%len(state) != 0 {
		panic("unsupported")
	}

	// Reference to state without copying
	states := (*[4][4]uint32)(unsafe.Pointer(state))

	for outptr := 0; outptr < len(output); outptr += len(state) {
		soft_aesroundtrip_decenc(states, &keys.AesGenerator1R_Keys)

		copy(output[outptr:], state[:])
	}
}
// FillAes4Rx4 fills output using four AES round trips per 64 bytes. state is
// received by value, so the caller's array is untouched.
// len(output) must be a multiple of 64.
func (aes softAES) FillAes4Rx4(state [64]byte, output []byte) {
	if len(output)%len(state) != 0 {
		panic("unsupported")
	}

	// state is copied on caller

	// Copy state
	states := (*[4][4]uint32)(unsafe.Pointer(&state))

	for outptr := 0; outptr < len(output); outptr += len(state) {
		// Four chained round trips with the fixed generator key sets.
		soft_aesroundtrip_decenc(states, &fillAes4Rx4Keys0)

		soft_aesroundtrip_decenc(states, &fillAes4Rx4Keys1)

		soft_aesroundtrip_decenc(states, &fillAes4Rx4Keys2)

		soft_aesroundtrip_decenc(states, &fillAes4Rx4Keys3)

		copy(output[outptr:], state[:])
	}
}
// HashAndFillAes1Rx4 hashes the scratchpad into output and refills it from
// fillState. Implemented as two sequential passes rather than one fused sweep.
func (aes softAES) HashAndFillAes1Rx4(scratchpad []byte, output *[64]byte, fillState *[64]byte) {
	//TODO: fuse hash and fill into a single pass over the scratchpad
	aes.HashAes1Rx4(scratchpad, output)
	aes.FillAes1Rx4(fillState, scratchpad)
}

76
internal/argon2/argon2.go Normal file
View file

@ -0,0 +1,76 @@
package argon2
import (
"encoding/binary"
"golang.org/x/crypto/blake2b"
)
import (
_ "golang.org/x/crypto/argon2"
_ "unsafe"
)
// BlockSize is the Argon2 block size in bytes.
const BlockSize uint32 = 1024

// Block is one Argon2 memory block, stored as 128 little-endian uint64 words.
type Block [BlockSize / 8]uint64

// syncPoints is the number of Argon2 synchronization slices per pass.
const syncPoints = 4
//go:linkname initHash golang.org/x/crypto/argon2.initHash
func initHash(password, salt, key, data []byte, time, memory, threads, keyLen uint32, mode int) [blake2b.Size + 8]byte
//go:linkname processBlocks golang.org/x/crypto/argon2.processBlocks
func processBlocks(B []Block, time, memory, threads uint32, mode int)
//go:linkname blake2bHash golang.org/x/crypto/argon2.blake2bHash
func blake2bHash(out []byte, in []byte)
// initBlocks From golang.org/x/crypto/argon2.initBlocks with external memory allocation
//
// Derives the first two blocks of each lane from the Argon2 seed hash h0,
// writing into caller-provided B instead of allocating. h0 is mutated
// in place (block index and lane counters appended past the digest).
func initBlocks(B []Block, h0 *[blake2b.Size + 8]byte, memory, threads uint32) {
	var block0 [1024]byte
	// Caller-provided memory may be reused; reset before filling.
	clear(B)
	for lane := uint32(0); lane < threads; lane++ {
		j := lane * (memory / threads)
		binary.LittleEndian.PutUint32(h0[blake2b.Size+4:], lane)

		// Block 0 of the lane: H'(h0 || 0 || lane).
		binary.LittleEndian.PutUint32(h0[blake2b.Size:], 0)
		blake2bHash(block0[:], h0[:])
		for i := range B[j+0] {
			B[j+0][i] = binary.LittleEndian.Uint64(block0[i*8:])
		}

		// Block 1 of the lane: H'(h0 || 1 || lane).
		binary.LittleEndian.PutUint32(h0[blake2b.Size:], 1)
		blake2bHash(block0[:], h0[:])
		for i := range B[j+1] {
			B[j+1][i] = binary.LittleEndian.Uint64(block0[i*8:])
		}
	}
}
// BuildBlocks From golang.org/x/crypto/argon2.deriveKey without last deriveKey call and external memory allocation
//
// Runs Argon2d block filling over the caller-provided memory B (len(B) must
// equal 'memory'), leaving the raw block matrix in B instead of extracting a
// key. Panics on invalid parameters, matching x/crypto/argon2 behavior.
func BuildBlocks(B []Block, password, salt []byte, time, memory uint32, threads uint8) {
	if time < 1 {
		panic("argon2: number of rounds too small")
	}
	if threads < 1 {
		panic("argon2: parallelism degree too low")
	}

	if len(B) != int(memory) {
		panic("argon2: invalid block size")
	}

	const mode = 0 /* argon2d */
	const keyLen = 0

	h0 := initHash(password, salt, nil, nil, time, memory, uint32(threads), keyLen, mode)

	// Round memory down to a multiple of syncPoints*threads, with a
	// lower bound of 2 blocks per slice-lane (same as upstream argon2).
	memory = memory / (syncPoints * uint32(threads)) * (syncPoints * uint32(threads))
	if memory < 2*syncPoints*uint32(threads) {
		memory = 2 * syncPoints * uint32(threads)
	}

	initBlocks(B, &h0, memory, uint32(threads))
	processBlocks(B, time, memory, uint32(threads), mode)
}

18
internal/asm/aes_amd64.go Normal file
View file

@ -0,0 +1,18 @@
//go:build amd64 && !purego
package asm
// FillAes1Rx4 runs one AES round trip per 64 bytes of output (assembly).
//
//go:noescape
func FillAes1Rx4(states *[4][4]uint32, keys *[4][4]uint32, output *byte, outputLen uint64)

// HashAes1Rx4 absorbs input as AES round keys into a 4-lane state (assembly).
//
//go:noescape
func HashAes1Rx4(initialState *[4][4]uint32, xKeys *[2][4]uint32, output *[64]byte, input *byte, inputLen uint64)

// AESRoundTrip_DecEnc applies dec/enc/dec/enc rounds across the 4 lanes (assembly).
//
//go:noescape
func AESRoundTrip_DecEnc(states *[4][4]uint32, keys *[4][4]uint32)

// AESRoundTrip_EncDec applies enc/dec/enc/dec rounds across the 4 lanes (assembly).
//
//go:noescape
func AESRoundTrip_EncDec(states *[4][4]uint32, keys *[4][4]uint32)

// AESRoundTrip_EncDec1 applies enc/dec/enc/dec rounds with one shared key (assembly).
//
//go:noescape
func AESRoundTrip_EncDec1(states *[4][4]uint32, key *[4]uint32)

172
internal/asm/aes_amd64.s Normal file
View file

@ -0,0 +1,172 @@
//go:build amd64 && !purego
#include "textflag.h"
// func FillAes1Rx4(states *[4][4]uint32, keys *[4][4]uint32, output *byte, outputLen uint64)
// One AES round per lane per 64-byte chunk; keys stay resident in X4-X7.
// Assumes outputLen > 0 and a multiple of 64 (checked by the Go caller).
TEXT ·FillAes1Rx4(SB),NOSPLIT|NOFRAME,$0-32
	MOVQ states+0(FP), AX
	MOVQ keys+8(FP), BX
	MOVQ output+16(FP), CX
	MOVQ outputLen+24(FP), DX

	// initial state
	VMOVDQU 0(AX), X0
	VMOVDQU 16(AX), X1
	VMOVDQU 32(AX), X2
	VMOVDQU 48(AX), X3

	// keys: X4-X7
	VMOVDQU 0(BX), X4
	VMOVDQU 16(BX), X5
	VMOVDQU 32(BX), X6
	VMOVDQU 48(BX), X7

loop:
	// dec/enc alternation per lane matches soft_aesroundtrip_decenc
	AESDEC X4, X0
	AESENC X5, X1
	AESDEC X6, X2
	AESENC X7, X3

	// store state onto output
	VMOVDQU X0, 0(CX)
	VMOVDQU X1, 16(CX)
	VMOVDQU X2, 32(CX)
	VMOVDQU X3, 48(CX)

	ADDQ $64, CX

	// outputLen -= 64, continue if not 0
	SUBQ $64, DX
	JNE loop

	// offload initial state
	VMOVDQU X0, 0(AX)
	VMOVDQU X1, 16(AX)
	VMOVDQU X2, 32(AX)
	VMOVDQU X3, 48(AX)

	RET
// func HashAes1Rx4(initialState *[4][4]uint32, xKeys *[2][4]uint32, output *[64]byte, input *byte, inputLen uint64)
// Absorbs each 64-byte input chunk as round keys (enc/dec alternation per
// lane), then finalizes with two shared extra keys.
// Assumes inputLen > 0 and a multiple of 64 (checked by the Go caller).
TEXT ·HashAes1Rx4(SB),NOSPLIT|NOFRAME,$0-40
	MOVQ initialState+0(FP), AX
	// initial state
	VMOVDQU 0(AX), X0
	VMOVDQU 16(AX), X1
	VMOVDQU 32(AX), X2
	VMOVDQU 48(AX), X3

	MOVQ xKeys+8(FP), AX
	MOVQ output+16(FP), BX
	MOVQ input+24(FP), CX
	MOVQ inputLen+32(FP), DX

loop:
	// input as keys: X4-X7
	VMOVDQU 0(CX), X4
	VMOVDQU 16(CX), X5
	VMOVDQU 32(CX), X6
	VMOVDQU 48(CX), X7

	AESENC X4, X0
	AESDEC X5, X1
	AESENC X6, X2
	AESDEC X7, X3

	ADDQ $64, CX

	// inputLen -= 64, continue if not 0
	SUBQ $64, DX
	JNE loop

	// do encdec1 with both keys!
	VMOVDQU 0(AX), X4
	VMOVDQU 16(AX), X5

	// finalization round 1: shared key X4 across all lanes
	AESENC X4, X0
	AESDEC X4, X1
	AESENC X4, X2
	AESDEC X4, X3

	// finalization round 2: shared key X5 across all lanes
	AESENC X5, X0
	AESDEC X5, X1
	AESENC X5, X2
	AESDEC X5, X3

	// offload into output
	VMOVDQU X0, 0(BX)
	VMOVDQU X1, 16(BX)
	VMOVDQU X2, 32(BX)
	VMOVDQU X3, 48(BX)

	RET
// func AESRoundTrip_DecEnc(states *[4][4]uint32, keys *[4][4]uint32)
//
// One in-place AES round per 16-byte column of states: AESDEC on
// columns 0/2, AESENC on columns 1/3, keyed column-for-column from keys.
TEXT ·AESRoundTrip_DecEnc(SB),NOSPLIT|NOFRAME,$0-16
	MOVQ states+0(FP), AX
	MOVQ keys+8(FP), BX

	// interleave state/key column loads
	VMOVDQU 0(AX), X0
	VMOVDQU 0(BX), X1
	VMOVDQU 16(AX), X2
	VMOVDQU 16(BX), X3
	VMOVDQU 32(AX), X4
	VMOVDQU 32(BX), X5
	VMOVDQU 48(AX), X6
	VMOVDQU 48(BX), X7

	AESDEC X1, X0
	AESENC X3, X2
	AESDEC X5, X4
	AESENC X7, X6

	// write the advanced columns back in place
	VMOVDQU X0, 0(AX)
	VMOVDQU X2, 16(AX)
	VMOVDQU X4, 32(AX)
	VMOVDQU X6, 48(AX)

	RET
// func AESRoundTrip_EncDec(states *[4][4]uint32, keys *[4][4]uint32)
//
// One in-place AES round per 16-byte column of states: AESENC on
// columns 0/2, AESDEC on columns 1/3, keyed column-for-column from keys.
TEXT ·AESRoundTrip_EncDec(SB),NOSPLIT|NOFRAME,$0-16
	MOVQ states+0(FP), AX
	MOVQ keys+8(FP), BX

	// interleave state/key column loads
	VMOVDQU 0(AX), X0
	VMOVDQU 0(BX), X1
	VMOVDQU 16(AX), X2
	VMOVDQU 16(BX), X3
	VMOVDQU 32(AX), X4
	VMOVDQU 32(BX), X5
	VMOVDQU 48(AX), X6
	VMOVDQU 48(BX), X7

	AESENC X1, X0
	AESDEC X3, X2
	AESENC X5, X4
	AESDEC X7, X6

	// write the advanced columns back in place
	VMOVDQU X0, 0(AX)
	VMOVDQU X2, 16(AX)
	VMOVDQU X4, 32(AX)
	VMOVDQU X6, 48(AX)

	RET
// func AESRoundTrip_EncDec1(states *[4][4]uint32, key *[4]uint32)
//
// One in-place AES round per 16-byte column of states, using the single
// 16-byte key for every column: AESENC on columns 0/2, AESDEC on 1/3.
TEXT ·AESRoundTrip_EncDec1(SB),NOSPLIT|NOFRAME,$0-16
	MOVQ states+0(FP), AX
	MOVQ key+8(FP), BX

	// X0 holds the shared round key; X1-X4 the four state columns
	VMOVDQU 0(BX), X0
	VMOVDQU 0(AX), X1
	VMOVDQU 16(AX), X2
	VMOVDQU 32(AX), X3
	VMOVDQU 48(AX), X4

	AESENC X0, X1
	AESDEC X0, X2
	AESENC X0, X3
	AESDEC X0, X4

	// write the advanced columns back in place
	VMOVDQU X1, 0(AX)
	VMOVDQU X2, 16(AX)
	VMOVDQU X3, 32(AX)
	VMOVDQU X4, 48(AX)

	RET

11
internal/asm/aes_noasm.go Normal file
View file

@ -0,0 +1,11 @@
//go:build !amd64 || purego
package asm
// AESRoundEncrypt would perform a single hardware AES encryption round
// on state with key; there is no software fallback in this package.
// NOTE(review): names differ from the amd64 declarations
// (AESRoundTrip_*) — confirm which symbols callers actually reference.
func AESRoundEncrypt(state *[4]uint32, key *[4]uint32) {
	panic("not implemented")
}

// AESRoundDecrypt would perform a single hardware AES decryption round
// on state with key; there is no software fallback in this package.
func AESRoundDecrypt(state *[4]uint32, key *[4]uint32) {
	panic("not implemented")
}

View file

@ -0,0 +1,5 @@
//go:build amd64 && !purego
package asm
func Cpuid(op uint32) (eax, ebx, ecx, edx uint32)

View file

@ -0,0 +1,15 @@
//go:build amd64 && !purego
#include "textflag.h"
// func Cpuid(op uint32) (eax, ebx, ecx, edx uint32)
// Fix: replace the legacy numeric text flag "7" with NOSPLIT (textflag.h
// is already included) and declare the 24-byte argument/result size so
// go vet's asmdecl pass can validate the FP offsets below.
TEXT ·Cpuid(SB), NOSPLIT, $0-24
	// Zero the sub-leaf index so leaves with ECX-dependent output
	// (e.g. 4, 7, 0xB) report sub-leaf 0 deterministically.
	XORQ CX, CX
	MOVL op+0(FP), AX
	CPUID
	MOVL AX, eax+8(FP)
	MOVL BX, ebx+12(FP)
	MOVL CX, ecx+16(FP)
	MOVL DX, edx+20(FP)
	RET

5
internal/asm/round.go Normal file
View file

@ -0,0 +1,5 @@
package asm
func SetRoundingMode[T ~uint64 | ~uint8](mode T) {
setRoundingMode(uint8(mode))
}

View file

@ -0,0 +1,6 @@
//go:build 386 && !purego
package asm
// setRoundingMode writes the low two bits of mode into the MXCSR
// rounding-control field. Implemented in round_386.s.
//
//go:noescape
func setRoundingMode(mode uint8)

21
internal/asm/round_386.s Normal file
View file

@ -0,0 +1,21 @@
//go:build 386 && !purego
#include "textflag.h"
// func setRoundingMode(mode uint8)
//
// Replaces the MXCSR rounding-control field (bits 13-14) with mode,
// preserving all other MXCSR state.
TEXT ·setRoundingMode(SB),NOSPLIT|NOFRAME,$4-1
	// Fix: the Go declaration names this argument "mode"; referencing it
	// as "addr" fails go vet's asmdecl check.
	MOVB mode+0(FP), AX
	ANDL $3, AX
	// rotate the 2-bit mode into the MXCSR RC position (bits 13-14)
	ROLL $13, AX
	// make a scratch stack slot and capture the current MXCSR
	PUSHL AX
	STMXCSR 0(SP)
	// clear the old rounding-control bits, merge in the new mode
	ANDL $~0x6000, 0(SP)
	ORL AX, 0(SP)
	// load the updated MXCSR
	LDMXCSR 0(SP)
	POPL AX
	RET

View file

@ -0,0 +1,6 @@
//go:build amd64 && !purego
package asm
// setRoundingMode writes the low two bits of mode into the MXCSR
// rounding-control field. Implemented in round_amd64.s.
//
//go:noescape
func setRoundingMode(mode uint8)

View file

@ -0,0 +1,21 @@
//go:build amd64 && !purego
#include "textflag.h"
// func setRoundingMode(mode uint8)
//
// Replaces the MXCSR rounding-control field (bits 13-14) with mode,
// preserving all other MXCSR state.
TEXT ·setRoundingMode(SB),NOSPLIT|NOFRAME,$8-1
	// Fix: the Go declaration names this argument "mode"; referencing it
	// as "addr" fails go vet's asmdecl check.
	MOVB mode+0(FP), AX
	ANDQ $3, AX
	// rotate the 2-bit mode into the MXCSR RC position (bits 13-14)
	ROLQ $13, AX
	// make a scratch stack slot and capture the current MXCSR
	PUSHQ AX
	STMXCSR 0(SP)
	// clear the old rounding-control bits, merge in the new mode
	ANDL $~0x6000, 0(SP)
	ORL AX, 0(SP)
	// load the updated MXCSR
	LDMXCSR 0(SP)
	POPQ AX
	RET

23
internal/asm/round_arm.go Normal file
View file

@ -0,0 +1,23 @@
//go:build (arm.6 || arm.7) && !purego
package asm
// getFPSCR returns the current value of the FPSCR register.
// Implemented in round_arm.s.
func getFPSCR() (value uint32)

// setFPSCR writes value to the FPSCR register.
// Implemented in round_arm.s.
func setFPSCR(value uint32)

// setRoundingMode maps the x86/MXCSR-style rounding mode used by the
// rest of this package onto the ARM FPSCR RMode field (bits 22-23).
func setRoundingMode(mode uint8) {
	switch mode {
	// The directed modes are swapped between the two encodings:
	// MXCSR uses 1 = toward -inf, 2 = toward +inf, while FPSCR RMode
	// uses 1 = toward +inf, 2 = toward -inf.
	case 1:
		mode = 2
	case 2:
		mode = 1
	}
	fpscr := getFPSCR()
	// clear bits 22-23 (0x00C00000) and insert the new mode there
	fpscr = (fpscr & (^uint32(0x0C00000))) | ((uint32(mode) & 3) << 22)
	setFPSCR(fpscr)
}

13
internal/asm/round_arm.s Normal file
View file

@ -0,0 +1,13 @@
//go:build (arm.6 || arm.7) && !purego
#include "textflag.h"
// func getFPSCR() (value uint32)
// Reads the VFP FPSCR register via a raw VMRS encoding (the assembler
// has no mnemonic for it here).
TEXT ·getFPSCR(SB),NOSPLIT,$0-4
	WORD $0xeef1ba10 // vmrs r11, fpscr
	MOVW R11, value+0(FP)
	RET

// func setFPSCR(value uint32)
// Writes the VFP FPSCR register via a raw VMSR encoding.
TEXT ·setFPSCR(SB),NOSPLIT,$0-4
	MOVW value+0(FP), R11
	WORD $0xeee1ba10 // vmsr fpscr, r11
	RET

View file

@ -1,4 +1,4 @@
//go:build arm64
//go:build arm64 && !purego
package asm

View file

@ -1,3 +1,5 @@
//go:build arm64 && !purego
#include "textflag.h"
TEXT ·getFPCR(SB),NOSPLIT,$0-8

View file

@ -1,4 +1,4 @@
//go:build !arm64 && !amd64 && !386
//go:build (!arm64 && !(arm.6 || arm.7) && !amd64 && !386) || purego
package asm

View file

@ -0,0 +1,46 @@
package blake2
import (
"encoding/binary"
"golang.org/x/crypto/blake2b"
)
// Generator is a deterministic pseudo-random byte/uint32 stream derived
// from a seed and nonce by repeatedly hashing a 64-byte internal state
// with BLAKE2b-512.
type Generator struct {
	state [blake2b.Size]byte
	i     int
}

// New creates a Generator from up to 60 bytes of seed plus a 32-bit
// little-endian nonce stored in the last 4 state bytes. The read index
// starts exhausted, so the first read performs the initial BLAKE2b pass.
func New(seed []byte, nonce uint32) *Generator {
	g := &Generator{i: blake2b.Size}
	copy(g.state[:60], seed)
	binary.LittleEndian.PutUint32(g.state[60:], nonce)
	return g
}

// GetUint32 returns the next little-endian uint32 from the stream,
// rehashing first when fewer than 4 bytes remain (leftover bytes are
// discarded).
func (g *Generator) GetUint32() (v uint32) {
	if len(g.state)-g.i < 4 {
		g.reseed()
	}
	v = binary.LittleEndian.Uint32(g.state[g.i:])
	g.i += 4
	return v
}

// GetByte returns the next byte from the stream, rehashing first when
// the state is exhausted.
func (g *Generator) GetByte() (v byte) {
	if g.i >= len(g.state) {
		g.reseed()
	}
	v = g.state[g.i]
	g.i++
	return v
}

// reseed replaces the state with its BLAKE2b-512 digest and rewinds the
// read index.
func (g *Generator) reseed() {
	g.state = blake2b.Sum512(g.state[:])
	g.i = 0
}

View file

@ -0,0 +1,32 @@
package memory
import "unsafe"
// AlignedAllocator hands out byte slices whose backing pointer is
// aligned to a power-of-two boundary; the value is the alignment itself.
type AlignedAllocator uint64

// NewAlignedAllocator returns an Allocator producing allocations aligned
// to alignment bytes. alignment must be zero or a power of two.
func NewAlignedAllocator(alignment uint64) Allocator {
	if !isZeroOrPowerOf2(alignment) {
		panic("alignment must be a power of 2")
	}
	return AlignedAllocator(alignment)
}

// AllocMemory returns a size-byte slice aligned to the allocator's
// boundary.
func (a AlignedAllocator) AllocMemory(size uint64) ([]byte, error) {
	if a <= 4 {
		// Small alignments are already satisfied by Go slice
		// allocations; fast path.
		return make([]byte, size, max(size, uint64(a))), nil
	}
	// Over-allocate by the alignment, then reslice at the first
	// aligned offset of the backing array.
	buf := make([]byte, size+uint64(a))
	base := uint64(uintptr(unsafe.Pointer(unsafe.SliceData(buf))))
	off := uint64(a) - (base & (uint64(a) - 1))
	if off == uint64(a) {
		// backing pointer is already aligned
		return buf[:size:size], nil
	}
	return buf[off : off+size : off+size], nil
}

// FreeMemory is a no-op: the garbage collector reclaims the slice.
func (a AlignedAllocator) FreeMemory(memory []byte) error {
	return nil
}

45
internal/memory/alloc.go Normal file
View file

@ -0,0 +1,45 @@
package memory
import (
"unsafe"
)
// Allocator abstracts raw memory allocation strategies (GC heap,
// aligned, page-backed, large-page).
type Allocator interface {
	AllocMemory(size uint64) ([]byte, error)
	FreeMemory(memory []byte) error
}

// Allocate obtains storage for a single value of type T from a and
// reinterprets it as *T.
func Allocate[T any](a Allocator) (*T, error) {
	var zero T
	buf, err := a.AllocMemory(uint64(unsafe.Sizeof(zero)))
	if err != nil {
		return nil, err
	}
	return (*T)(unsafe.Pointer(unsafe.SliceData(buf))), nil
}

// Free releases a value previously obtained via Allocate, handing its
// byte view back to the allocator.
func Free[T any](a Allocator, v *T) error {
	var zero T
	return a.FreeMemory(unsafe.Slice((*byte)(unsafe.Pointer(v)), uint64(unsafe.Sizeof(zero))))
}

// AllocateSlice obtains storage for size elements of type T from a.
func AllocateSlice[T any, T2 ~int | ~uint64 | ~uint32](a Allocator, size T2) ([]T, error) {
	var zero T
	buf, err := a.AllocMemory(uint64(unsafe.Sizeof(zero)) * uint64(size))
	if err != nil {
		return nil, err
	}
	return unsafe.Slice((*T)(unsafe.Pointer(unsafe.SliceData(buf))), size), nil
}

// FreeSlice releases a slice previously obtained via AllocateSlice.
func FreeSlice[T any](a Allocator, v []T) error {
	var zero T
	return a.FreeMemory(unsafe.Slice((*byte)(unsafe.Pointer(unsafe.SliceData(v))), uint64(unsafe.Sizeof(zero))*uint64(len(v))))
}

// isZeroOrPowerOf2 reports whether x is 0 or an exact power of two.
func isZeroOrPowerOf2(x uint64) bool {
	return x&(x-1) == 0
}

View file

@ -0,0 +1,45 @@
//go:build freebsd && !purego
package memory
import (
"golang.org/x/sys/unix"
)
// LargePageAllocator allocates anonymous memory while requesting FreeBSD
// superpage alignment via mmap.
type LargePageAllocator struct {
}

// NewLargePageAllocator returns an allocator that requests
// superpage-aligned anonymous mappings.
func NewLargePageAllocator() Allocator {
	return LargePageAllocator{}
}

/*
 * Request specific alignment (n == log2 of the desired alignment).
 *
 * MAP_ALIGNED_SUPER requests optimal superpage alignment, but does
 * not enforce a specific alignment.
 */
//#define MAP_ALIGNED(n) ((n) << MAP_ALIGNMENT_SHIFT)
//#define MAP_ALIGNMENT_SHIFT 24
//#define MAP_ALIGNMENT_MASK MAP_ALIGNED(0xff)
//#define MAP_ALIGNED_SUPER MAP_ALIGNED(1) /* align on a superpage */

// MAP_ALIGNED_SUPER mirrors FreeBSD's MAP_ALIGNED(1) flag above; the
// kernel treats it as a hint, not a guarantee.
const MAP_ALIGNED_SUPER = 1 << 24

// AllocMemory maps size bytes of private anonymous read-write memory
// with superpage alignment requested.
func (a LargePageAllocator) AllocMemory(size uint64) ([]byte, error) {
	memory, err := unix.Mmap(-1, 0, int(size), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS|MAP_ALIGNED_SUPER)
	if err != nil {
		return nil, err
	}
	return memory, nil
}

// FreeMemory unmaps memory previously returned by AllocMemory; a nil
// slice is a no-op.
func (a LargePageAllocator) FreeMemory(memory []byte) error {
	if memory == nil {
		return nil
	}
	return unix.Munmap(memory)
}

View file

@ -0,0 +1,10 @@
//go:build openbsd || netbsd || dragonfly || darwin || ios || !unix || purego
package memory
// LargePageNoMemoryErr is always nil here: large pages are unsupported
// on this platform, so there is no allocation-failure error to match.
var LargePageNoMemoryErr error

// NewLargePageAllocator Not supported in platform; returns nil so
// callers can detect the missing capability.
func NewLargePageAllocator() Allocator {
	return nil
}

View file

@ -0,0 +1,31 @@
//go:build unix && !(freebsd || openbsd || netbsd || dragonfly || darwin || ios) && !purego
package memory
import (
"golang.org/x/sys/unix"
)
// LargePageAllocator allocates anonymous memory backed by huge pages
// (MAP_HUGETLB).
type LargePageAllocator struct {
}

// NewLargePageAllocator returns an allocator backed by anonymous
// huge-page mappings.
func NewLargePageAllocator() Allocator {
	return LargePageAllocator{}
}

// AllocMemory maps size bytes of private anonymous read-write huge-page
// memory. MAP_POPULATE pre-faults the pages so the mapping is usable
// immediately; insufficient huge-page reservation surfaces as an mmap
// error here rather than a fault later.
func (a LargePageAllocator) AllocMemory(size uint64) ([]byte, error) {
	memory, err := unix.Mmap(-1, 0, int(size), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS|unix.MAP_HUGETLB|unix.MAP_POPULATE)
	if err != nil {
		return nil, err
	}
	return memory, nil
}

// FreeMemory unmaps memory previously returned by AllocMemory; a nil
// slice is a no-op.
func (a LargePageAllocator) FreeMemory(memory []byte) error {
	if memory == nil {
		return nil
	}
	return unix.Munmap(memory)
}

View file

@ -0,0 +1,22 @@
//go:build !unix || purego
package memory
// PageNoMemoryErr is always nil here: page allocation is unsupported on
// this platform, so there is no allocation-failure error to match.
var PageNoMemoryErr error

// NewPageAllocator returns nil: page-based allocation is not supported
// on this platform.
func NewPageAllocator() Allocator {
	return nil
}

// PageReadWrite is unsupported on this platform and always panics.
func PageReadWrite(memory []byte) error {
	panic("not supported")
}

// PageReadExecute is unsupported on this platform and always panics.
func PageReadExecute(memory []byte) error {
	panic("not supported")
}

// PageReadWriteExecute Insecure! Unsupported on this platform; always
// panics.
func PageReadWriteExecute(memory []byte) error {
	panic("not supported")
}

View file

@ -0,0 +1,46 @@
//go:build unix && !purego
package memory
import (
"golang.org/x/sys/unix"
)
// PageNoMemoryErr is the error mmap reports when memory cannot be
// allocated; callers use errors.Is against it to detect that case.
var PageNoMemoryErr = unix.ENOMEM

// PageAllocator allocates whole-page anonymous memory via mmap so its
// protection can later be changed with the Page* helpers below.
type PageAllocator struct {
}

// NewPageAllocator returns a page-backed Allocator.
func NewPageAllocator() Allocator {
	return PageAllocator{}
}

// AllocMemory maps size bytes of private anonymous read-write memory.
func (a PageAllocator) AllocMemory(size uint64) ([]byte, error) {
	memory, err := unix.Mmap(-1, 0, int(size), unix.PROT_READ|unix.PROT_WRITE, unix.MAP_PRIVATE|unix.MAP_ANONYMOUS)
	if err != nil {
		return nil, err
	}
	return memory, nil
}

// FreeMemory unmaps memory previously returned by AllocMemory; a nil
// slice is a no-op.
func (a PageAllocator) FreeMemory(memory []byte) error {
	if memory == nil {
		return nil
	}
	return unix.Munmap(memory)
}

// PageReadWrite marks memory readable and writable (e.g. before JIT
// code is written).
func PageReadWrite(memory []byte) error {
	return unix.Mprotect(memory, unix.PROT_READ|unix.PROT_WRITE)
}

// PageReadExecute marks memory readable and executable (W^X execution
// of generated code).
func PageReadExecute(memory []byte) error {
	return unix.Mprotect(memory, unix.PROT_READ|unix.PROT_EXEC)
}

// PageReadWriteExecute Insecure! Marks memory simultaneously writable
// and executable.
func PageReadWriteExecute(memory []byte) error {
	return unix.Mprotect(memory, unix.PROT_READ|unix.PROT_WRITE|unix.PROT_EXEC)
}

268
jit_amd64.go Normal file
View file

@ -0,0 +1,268 @@
//go:build unix && amd64 && !disable_jit && !purego
package randomx
import (
"encoding/binary"
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/asm"
)
const supportsJIT = true
/*
REGISTER ALLOCATION:
; rax -> temporary
; rbx -> todo: iteration counter "ic"
; rcx -> temporary
; rdx -> temporary
; rsi -> scratchpad pointer
; rdi -> todo: dataset pointer
; rbp -> (do not use, it's used by Golang sampling) jump target //todo: memory registers "ma" (high 32 bits), "mx" (low 32 bits)
; rsp -> stack pointer
; r8 -> "r0"
; r9 -> "r1"
; r10 -> "r2"
; r11 -> "r3"
; r12 -> "r4"
; r13 -> "r5"
; r14 -> "r6"
; r15 -> "r7"
; xmm0 -> "f0"
; xmm1 -> "f1"
; xmm2 -> "f2"
; xmm3 -> "f3"
; xmm4 -> "e0"
; xmm5 -> "e1"
; xmm6 -> "e2"
; xmm7 -> "e3"
; xmm8 -> "a0"
; xmm9 -> "a1"
; xmm10 -> "a2"
; xmm11 -> "a3"
; xmm12 -> temporary
; xmm13 -> E 'and' mask = 0x00ffffffffffffff00ffffffffffffff
; xmm14 -> E 'or' mask = 0x3*00000000******3*00000000******
; xmm15 -> scale mask = 0x81f000000000000081f0000000000000
*/
// Sizing constants for the generated JIT code buffer.
const MaxRandomXInstrCodeSize = 32 //FDIV_M requires up to 32 bytes of x86 code
const MaxSuperscalarInstrSize = 14 //IMUL_RCP requires 14 bytes of x86 code
const SuperscalarProgramHeader = 128 //overhead per superscalar program
const CodeAlign = 4096 //align code size to a multiple of 4 KiB
const ReserveCodeSize = CodeAlign //function prologue/epilogue + reserve
// alignSize rounds pos up to the next multiple of align (align must be
// non-zero).
//
// Fix: pos == 0 previously evaluated (pos-1), which underflows for the
// unsigned type parameters and produced a huge (overflowing) result;
// zero now correctly aligns to zero.
func alignSize[T ~uintptr | ~uint32 | ~uint64 | ~int64 | ~int32 | ~int](pos, align T) T {
	if pos == 0 {
		return 0
	}
	return ((pos-1)/align + 1) * align
}
// Page-aligned sizes of the two regions of the JIT code buffer: the
// per-program RandomX code and the superscalar (dataset init) code.
var RandomXCodeSize = alignSize[uint64](ReserveCodeSize+MaxRandomXInstrCodeSize*RANDOMX_PROGRAM_SIZE, CodeAlign)
var SuperscalarSize = alignSize[uint64](ReserveCodeSize+(SuperscalarProgramHeader+MaxSuperscalarInstrSize*SuperscalarMaxSize)*RANDOMX_CACHE_ACCESSES, CodeAlign)

// CodeSize is the total buffer size covering both regions.
var CodeSize = uint32(RandomXCodeSize + SuperscalarSize)

// superScalarHashOffset is the byte offset where superscalar hash code
// begins inside the buffer (immediately after the program region).
var superScalarHashOffset = int32(RandomXCodeSize)
// x86-64 machine-code fragments used by the JIT compiler. Each value is
// an opcode stem (REX prefix + opcode bytes); the emitter appends
// ModRM/SIB bytes and immediates as needed. Names follow the mnemonic
// plus operand kinds (R = register, M = memory, I = immediate).

// integer ALU / move stems
var REX_ADD_RR = []byte{0x4d, 0x03}
var REX_ADD_RM = []byte{0x4c, 0x03}
var REX_SUB_RR = []byte{0x4d, 0x2b}
var REX_SUB_RM = []byte{0x4c, 0x2b}
var REX_MOV_RR = []byte{0x41, 0x8b}
var REX_MOV_RR64 = []byte{0x49, 0x8b}
var REX_MOV_R64R = []byte{0x4c, 0x8b}

// multiply stems
var REX_IMUL_RR = []byte{0x4d, 0x0f, 0xaf}
var REX_IMUL_RRI = []byte{0x4d, 0x69}
var REX_IMUL_RM = []byte{0x4c, 0x0f, 0xaf}
var REX_MUL_R = []byte{0x49, 0xf7}
var REX_MUL_M = []byte{0x48, 0xf7}
var REX_81 = []byte{0x49, 0x81}
var AND_EAX_I byte = 0x25
var MOV_EAX_I byte = 0xb8
var MOV_RAX_I = []byte{0x48, 0xb8}
var MOV_RCX_I = []byte{0x48, 0xb9}
var REX_LEA = []byte{0x4f, 0x8d}
var REX_MUL_MEM = []byte{0x48, 0xf7, 0x24, 0x0e}
var REX_IMUL_MEM = []byte{0x48, 0xf7, 0x2c, 0x0e}
var REX_SHR_RAX = []byte{0x48, 0xc1, 0xe8}
var RAX_ADD_SBB_1 = []byte{0x48, 0x83, 0xC0, 0x01, 0x48, 0x83, 0xD8, 0x00}
var MUL_RCX = []byte{0x48, 0xf7, 0xe1}
var REX_SHR_RDX = []byte{0x48, 0xc1, 0xea}
var REX_SH = []byte{0x49, 0xc1}
var MOV_RCX_RAX_SAR_RCX_63 = []byte{0x48, 0x89, 0xc1, 0x48, 0xc1, 0xf9, 0x3f}
var AND_ECX_I = []byte{0x81, 0xe1}
var ADD_RAX_RCX = []byte{0x48, 0x01, 0xC8}
var SAR_RAX_I8 = []byte{0x48, 0xC1, 0xF8}
var NEG_RAX = []byte{0x48, 0xF7, 0xD8}
var ADD_R_RAX = []byte{0x4C, 0x03}
var XOR_EAX_EAX = []byte{0x33, 0xC0}
var ADD_RDX_R = []byte{0x4c, 0x01}
var SUB_RDX_R = []byte{0x4c, 0x29}
var SAR_RDX_I8 = []byte{0x48, 0xC1, 0xFA}
var TEST_RDX_RDX = []byte{0x48, 0x85, 0xD2}
var SETS_AL_ADD_RDX_RAX = []byte{0x0F, 0x98, 0xC0, 0x48, 0x03, 0xD0}
var REX_NEG = []byte{0x49, 0xF7}
var REX_XOR_RR = []byte{0x4D, 0x33}
var REX_XOR_RI = []byte{0x49, 0x81}
var REX_XOR_RM = []byte{0x4c, 0x33}
var REX_ROT_CL = []byte{0x49, 0xd3}
var REX_ROT_I8 = []byte{0x49, 0xc1}

// SSE/SSE2 floating-point stems (operate on xmm registers)
var SHUFPD = []byte{0x66, 0x0f, 0xc6}
var REX_ADDPD = []byte{0x66, 0x41, 0x0f, 0x58}
var REX_CVTDQ2PD_XMM12 = []byte{0xf3, 0x44, 0x0f, 0xe6, 0x24, 0x06}
var REX_SUBPD = []byte{0x66, 0x41, 0x0f, 0x5c}
var REX_XORPS = []byte{0x41, 0x0f, 0x57}
var REX_MULPD = []byte{0x66, 0x41, 0x0f, 0x59}
var REX_MAXPD = []byte{0x66, 0x41, 0x0f, 0x5f}
var REX_DIVPD = []byte{0x66, 0x41, 0x0f, 0x5e}
var SQRTPD = []byte{0x66, 0x0f, 0x51}
var AND_OR_MOV_LDMXCSR = []byte{0x25, 0x00, 0x60, 0x00, 0x00, 0x0D, 0xC0, 0x9F, 0x00, 0x00, 0x50, 0x0F, 0xAE, 0x14, 0x24, 0x58}
var ROL_RAX = []byte{0x48, 0xc1, 0xc0}
var XOR_ECX_ECX = []byte{0x33, 0xC9}
var REX_CMP_R32I = []byte{0x41, 0x81}
var REX_CMP_M32I = []byte{0x81, 0x3c, 0x06}
var MOVAPD = []byte{0x66, 0x0f, 0x29}
var REX_MOV_MR = []byte{0x4c, 0x89}
var REX_XOR_EAX = []byte{0x41, 0x33}

// control-flow fragments
var SUB_EBX = []byte{0x83, 0xEB, 0x01}
var JNZ = []byte{0x0f, 0x85}
var JMP byte = 0xe9
var REX_XOR_RAX_R64 = []byte{0x49, 0x33}
var REX_XCHG = []byte{0x4d, 0x87}
var REX_ANDPS_XMM12 = []byte{0x45, 0x0F, 0x54, 0xE5, 0x45, 0x0F, 0x56, 0xE6}
var REX_PADD = []byte{0x66, 0x44, 0x0f}
var PADD_OPCODES = []byte{0xfc, 0xfd, 0xfe, 0xd4}
// NOTE(review): CALL is an untyped int unlike JMP/JZ_SHORT/RET (byte) —
// confirm whether callers rely on the int type before normalizing.
var CALL = 0xe8
var REX_ADD_I = []byte{0x49, 0x81}
var REX_TEST = []byte{0x49, 0xF7}
var JZ = []byte{0x0f, 0x84}
var JZ_SHORT byte = 0x74
var RET byte = 0xc3
var LEA_32 = []byte{0x41, 0x8d}
var MOVNTI = []byte{0x4c, 0x0f, 0xc3}
var ADD_EBX_I = []byte{0x81, 0xc3}

// multi-byte NOP padding, indexed by length-1 via NOPX
var NOP1 = []byte{0x90}
var NOP2 = []byte{0x66, 0x90}
var NOP3 = []byte{0x66, 0x66, 0x90}
var NOP4 = []byte{0x0F, 0x1F, 0x40, 0x00}
var NOP5 = []byte{0x0F, 0x1F, 0x44, 0x00, 0x00}
var NOP6 = []byte{0x66, 0x0F, 0x1F, 0x44, 0x00, 0x00}
var NOP7 = []byte{0x0F, 0x1F, 0x80, 0x00, 0x00, 0x00, 0x00}
var NOP8 = []byte{0x0F, 0x1F, 0x84, 0x00, 0x00, 0x00, 0x00, 0x00}
var NOPX = [][]byte{NOP1, NOP2, NOP3, NOP4, NOP5, NOP6, NOP7, NOP8}

// JMP_ALIGN_PREFIX[n] is an n-byte padding sequence (0x2E CS-override
// prefixes, optionally led by a NOP) placed before a jump; presumably
// used to keep jumps from crossing 32-byte boundaries (see
// BranchesWithin32B) — confirm against the emitter.
var JMP_ALIGN_PREFIX = [14][]byte{
	{},
	{0x2E},
	{0x2E, 0x2E},
	{0x2E, 0x2E, 0x2E},
	{0x2E, 0x2E, 0x2E, 0x2E},
	{0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x66, 0x66, 0x90, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x0F, 0x1F, 0x40, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
	{0x0F, 0x1F, 0x44, 0x00, 0x00, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E},
}
// genSIB encodes an x86 SIB byte from its three fields: scale in bits
// 6-7, index in bits 3-5 and base in bits 0-2.
func genSIB(scale, index, base int) byte {
	sib := scale<<6 | index<<3 | base
	return byte(sib)
}
// genAddressReg appends code computing a masked scratchpad address from
// instr: a 32-bit LEA of [src + Imm] followed by an AND with MemMask.
// The address is built in eax when rax is true, otherwise in ecx.
func genAddressReg(buf []byte, instr *ByteCodeInstruction, rax bool) []byte {
	buf = append(buf, LEA_32...)
	// ModRM: 0x80 selects disp32 addressing; +0/+8 picks eax vs ecx as
	// the destination register field
	if rax {
		buf = append(buf, 0x80+instr.Src+0)
	} else {
		buf = append(buf, 0x80+instr.Src+8)
	}
	// source registers whose encoding collides with the SIB escape need
	// an explicit SIB byte (0x24)
	if instr.Src == RegisterNeedsSib {
		buf = append(buf, 0x24)
	}
	buf = binary.LittleEndian.AppendUint32(buf, uint32(instr.Imm))
	// mask the computed address into the scratchpad range
	if rax {
		buf = append(buf, AND_EAX_I)
	} else {
		buf = append(buf, AND_ECX_I...)
	}
	buf = binary.LittleEndian.AppendUint32(buf, instr.MemMask)
	return buf
}
// valAsString reinterprets the given 32-bit words as a little-endian
// byte string, truncated at the first NUL byte. Used to decode CPUID
// vendor strings.
func valAsString(values ...uint32) []byte {
	out := make([]byte, 4*len(values))
	for i, v := range values {
		word := out[i*4 : i*4+4]
		binary.LittleEndian.PutUint32(word, v)
		for j, b := range word {
			if b == 0 {
				return out[:i*4+j]
			}
		}
	}
	return out
}
// familyModel decodes the CPU family, model and stepping fields from
// CPUID leaf 1 (EAX). Returns zeros when leaf 1 is unavailable
// (maxFunctionId from leaf 0 is below 1).
func familyModel(maxFunctionId uint32) (family, model, stepping int) {
	if maxFunctionId < 0x1 {
		return 0, 0, 0
	}
	eax, _, _, _ := asm.Cpuid(1)
	// If BaseFamily[3:0] is less than Fh then ExtendedFamily[7:0] is reserved and Family is equal to BaseFamily[3:0].
	family = int((eax >> 8) & 0xf)
	extFam := family == 0x6 // Intel is 0x6, needs extended model.
	if family == 0xf {
		// Add ExtFamily (bits 20-27)
		family += int((eax >> 20) & 0xff)
		extFam = true
	}
	// If BaseFamily[3:0] is less than 0Fh then ExtendedModel[3:0] is reserved and Model is equal to BaseModel[3:0].
	model = int((eax >> 4) & 0xf)
	if extFam {
		// Add ExtModel (bits 16-19) shifted into the high nibble
		model += int((eax >> 12) & 0xf0)
	}
	stepping = int(eax & 0xf)
	return family, model, stepping
}
// BranchesWithin32B is true on Intel CPUs affected by the JCC erratum,
// where jump instructions crossing a 32-byte boundary can be mispredicted;
// the JIT then pads branches to avoid those boundaries. Detected once at
// startup from CPUID vendor/family/model/stepping.
var BranchesWithin32B = func() bool {
	a, b, c, d := asm.Cpuid(0)
	// vendor string is EBX:EDX:ECX, e.g. "GenuineIntel"
	v := string(valAsString(b, d, c))
	if v == "GenuineIntel" {
		family, model, stepping := familyModel(a)
		// Intel JCC erratum mitigation
		if family == 6 {
			// Affected CPU models and stepping numbers are taken from https://www.intel.com/content/dam/support/us/en/documents/processors/mitigations-jump-conditional-code-erratum.pdf
			return ((model == 0x4E) && (stepping == 0x3)) ||
				((model == 0x55) && ((stepping == 0x4) || (stepping == 0x7))) ||
				((model == 0x5E) && (stepping == 0x3)) ||
				((model == 0x8E) && (stepping >= 0x9) && (stepping <= 0xC)) ||
				((model == 0x9E) && (stepping >= 0x9) && (stepping <= 0xD)) ||
				((model == 0xA6) && (stepping == 0x0)) ||
				((model == 0xAE) && (stepping == 0xA))
		}
	}
	return false
}()

7
jit_generic.go Normal file
View file

@ -0,0 +1,7 @@
//go:build !unix || !amd64 || disable_jit || purego
package randomx
// supportsJIT is false on platforms/builds without a JIT backend
// (non-unix, non-amd64, disable_jit or purego).
const supportsJIT = false

// RandomXCodeSize is zero because no JIT code buffer is generated here.
var RandomXCodeSize uint64 = 0

79
math.go Normal file
View file

@ -0,0 +1,79 @@
package randomx
import (
"math"
"math/bits"
)
// IEEE-754 double-precision field sizes.
const (
	mantbits64 uint = 52
	expbits64  uint = 11
)

// Field masks and bias for IEEE-754 doubles.
const mantissaMask = (uint64(1) << mantbits64) - 1
const exponentMask = (uint64(1) << expbits64) - 1
const exponentBias = 1023

// RandomX group-E register exponent layout.
const dynamicExponentBits = 4
const staticExponentBits = 4
const constExponentBits uint64 = 0x300
const dynamicMantissaMask = (uint64(1) << (mantbits64 + dynamicExponentBits)) - 1

const mask22bit = (uint64(1) << 22) - 1

// MaskRegisterExponentMantissa keeps the mantissa and low dynamic
// exponent bits of f, then ORs in the precomputed exponent/sign mask
// mode.
func MaskRegisterExponentMantissa(f float64, mode uint64) float64 {
	masked := math.Float64bits(f)&dynamicMantissaMask | mode
	return math.Float64frombits(masked)
}

// ScaleNegate flips the sign bit and XORs the top exponent bits of f
// with the RandomX scale constant 0x80F0000000000000.
func ScaleNegate(f float64) float64 {
	const scaleMask = 0x80F0000000000000
	return math.Float64frombits(scaleMask ^ math.Float64bits(f))
}

// SmallPositiveFloatBits builds a positive double from entropy: the top
// 5 bits (0..31) select the unbiased exponent, the low 52 bits the
// mantissa, giving a value in [1, 2^32).
func SmallPositiveFloatBits(entropy uint64) float64 {
	exp := (entropy>>59 + exponentBias) & exponentMask
	return math.Float64frombits(exp<<mantbits64 | entropy&mantissaMask)
}

// StaticExponent derives the fixed exponent bits of a group-E register
// mask from the top staticExponentBits of entropy.
func StaticExponent(entropy uint64) uint64 {
	exp := constExponentBits | (entropy>>(64-staticExponentBits))<<dynamicExponentBits
	return exp << mantbits64
}

// ExponentMask combines the low 22 entropy bits with the static
// exponent to form the group-E OR mask.
func ExponentMask(entropy uint64) uint64 {
	return entropy&mask22bit | StaticExponent(entropy)
}

// Xor returns the bitwise XOR of the raw representations of a and b.
func Xor(a, b float64) float64 {
	return math.Float64frombits(math.Float64bits(a) ^ math.Float64bits(b))
}

// smulh returns the upper 64 bits of the signed 128-bit product a*b.
func smulh(a, b int64) uint64 {
	hi, _ := bits.Mul64(uint64(a), uint64(b))
	// correct the unsigned high word for negative operands
	if a < 0 {
		hi -= uint64(b)
	}
	if b < 0 {
		hi -= uint64(a)
	}
	return hi
}

// reciprocal
// Calculates rcp = 2**x / divisor for highest integer x such that rcp < 2**64.
// divisor must not be 0 or a power of 2
func reciprocal(divisor uint32) uint64 {
	const p2exp63 = uint64(1) << 63
	d := uint64(divisor)
	quotient, remainder := p2exp63/d, p2exp63%d
	shift := bits.Len32(divisor)
	return quotient<<shift + (remainder<<shift)/d
}

// signExtend2sCompl widens x to 64 bits, treating it as a signed 32-bit
// two's-complement value.
func signExtend2sCompl(x uint32) uint64 {
	return uint64(int64(int32(x)))
}

28
math_test.go Normal file
View file

@ -0,0 +1,28 @@
package randomx
import "testing"
// TestReciprocal checks reciprocal against known-good values matching
// the reference RandomX implementation.
func TestReciprocal(t *testing.T) {
	t.Parallel()

	// a is the divisor, b the expected 2**x/a result
	var tests = []struct {
		a uint32
		b uint64
	}{
		{3, 12297829382473034410},
		{13, 11351842506898185609},
		{33, 17887751829051686415},
		{65537, 18446462603027742720},
		{15000001, 10316166306300415204},
		{3845182035, 10302264209224146340},
		{0xffffffff, 9223372039002259456},
	}

	for i, tt := range tests {
		r := reciprocal(tt.a)
		if r != tt.b {
			t.Errorf("i=%d, a=%d", i, tt.a)
			t.Errorf("expected=%016x, actual=%016x", tt.b, r)
		}
	}
}

View file

@ -30,31 +30,95 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package randomx
import (
"fmt"
"encoding/hex"
"errors"
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/aes"
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/memory"
"os"
"runtime"
"slices"
"strings"
"unsafe"
)
import "testing"
var Tests = []struct {
key []byte // key
input []byte // input
expected string // expected result
}{
{[]byte("RandomX example key\x00"), []byte("RandomX example input\x00"), "8a48e5f9db45ab79d9080574c4d81954fe6ac63842214aff73c244b26330b7c9"},
{[]byte("test key 000"), []byte("This is a test"), "639183aae1bf4c9a35884cb46b09cad9175f04efd7684e7262a0ac1c2f0b4e3f"}, // test a
{[]byte("test key 000"), []byte("Lorem ipsum dolor sit amet"), "300a0adb47603dedb42228ccb2b211104f4da45af709cd7547cd049e9489c969"}, // test b
{[]byte("test key 000"), []byte("sed do eiusmod tempor incididunt ut labore et dolore magna aliqua"), "c36d4ed4191e617309867ed66a443be4075014e2b061bcdaf9ce7b721d2b77a8"}, // test c
{[]byte("test key 001"), []byte("sed do eiusmod tempor incididunt ut labore et dolore magna aliqua"), "e9ff4503201c0c2cca26d285c93ae883f9b1d30c9eb240b820756f2d5a7905fc"}, // test d
type testdata struct {
name string
key []byte
input []byte
// expected result, in hex
expected string
}
func Test_Randomx(t *testing.T) {
func mustHex(str string) []byte {
b, err := hex.DecodeString(str)
if err != nil {
panic(err)
}
return b
}
c := Randomx_alloc_cache(0)
var Tests = []testdata{
{"example", []byte("RandomX example key\x00"), []byte("RandomX example input\x00"), "8a48e5f9db45ab79d9080574c4d81954fe6ac63842214aff73c244b26330b7c9"},
{"test_a", []byte("test key 000"), []byte("This is a test"), "639183aae1bf4c9a35884cb46b09cad9175f04efd7684e7262a0ac1c2f0b4e3f"},
{"test_b", []byte("test key 000"), []byte("Lorem ipsum dolor sit amet"), "300a0adb47603dedb42228ccb2b211104f4da45af709cd7547cd049e9489c969"},
{"test_c", []byte("test key 000"), []byte("sed do eiusmod tempor incididunt ut labore et dolore magna aliqua"), "c36d4ed4191e617309867ed66a443be4075014e2b061bcdaf9ce7b721d2b77a8"},
{"test_d", []byte("test key 001"), []byte("sed do eiusmod tempor incididunt ut labore et dolore magna aliqua"), "e9ff4503201c0c2cca26d285c93ae883f9b1d30c9eb240b820756f2d5a7905fc"},
{"test_e", []byte("test key 001"), mustHex("0b0b98bea7e805e0010a2126d287a2a0cc833d312cb786385a7c2f9de69d25537f584a9bc9977b00000000666fd8753bf61a8631f12984e3fd44f4014eca629276817b56f32e9b68bd82f416"), "c56414121acda1713c2f2a819d8ae38aed7c80c35c2a769298d34f03833cd5f1"},
}
for ix, tt := range Tests {
func testFlags(name string, flags Flags) (f Flags, skip bool) {
flags |= GetFlags()
flags &^= RANDOMX_FLAG_LARGE_PAGES
t.Run(string(tt.key)+"_____"+string(tt.input), func(t *testing.T) {
c.Init(tt.key)
nn := strings.Split(name, "/")
switch nn[len(nn)-1] {
case "interpreter":
flags &^= RANDOMX_FLAG_JIT
case "compiler":
flags |= RANDOMX_FLAG_JIT
if !flags.HasJIT() {
return flags, true
}
case "softaes":
flags &^= RANDOMX_FLAG_HARD_AES
case "hardaes":
flags |= RANDOMX_FLAG_HARD_AES
if aes.NewHardAES() == nil {
return flags, true
}
case "largepages":
flags |= RANDOMX_FLAG_LARGE_PAGES
if largePageAllocator == nil {
return flags, true
}
if unsafe.Sizeof(uint(0)) < 8 {
//not 64-bit platforms
return flags, true
}
}
return flags, false
}
func Test_RandomXLight(t *testing.T) {
t.Parallel()
for _, n := range []string{"interpreter", "compiler", "softaes", "hardaes", "largepages"} {
t.Run(n, func(t *testing.T) {
t.Parallel()
tFlags, skip := testFlags(t.Name(), 0)
if skip {
t.Skip("not supported on this platform")
}
c, err := NewCache(tFlags)
if err != nil {
if tFlags.Has(RANDOMX_FLAG_LARGE_PAGES) && errors.Is(err, memory.PageNoMemoryErr) {
t.Skip("cannot allocate memory")
}
t.Fatal(err)
}
defer func() {
err := c.Close()
if err != nil {
@ -62,66 +126,282 @@ func Test_Randomx(t *testing.T) {
}
}()
vm := c.VM_Initialize()
for _, test := range Tests {
t.Run(test.name, func(t *testing.T) {
c.Init(test.key)
var output_hash [32]byte
vm.CalculateHash(tt.input, &output_hash)
vm, err := NewVM(tFlags, c, nil)
if err != nil {
t.Fatal(err)
}
defer func() {
err := vm.Close()
if err != nil {
t.Error(err)
}
}()
actual := fmt.Sprintf("%x", output_hash)
if actual != tt.expected {
t.Errorf("#%d Fib(%v): expected %s, actual %s", ix, tt.key, tt.expected, actual)
var outputHash [RANDOMX_HASH_SIZE]byte
vm.CalculateHash(test.input, &outputHash)
outputHex := hex.EncodeToString(outputHash[:])
if outputHex != test.expected {
t.Errorf("key=%v, input=%v", test.key, test.input)
t.Errorf("expected=%s, actual=%s", test.expected, outputHex)
t.FailNow()
}
})
}
})
}
}
func Test_RandomXBatch(t *testing.T) {
t.Parallel()
for _, n := range []string{"softaes", "hardaes"} {
t.Run(n, func(t *testing.T) {
t.Parallel()
tFlags, skip := testFlags(t.Name(), 0)
if skip {
t.Skip("not supported on this platform")
}
c, err := NewCache(tFlags)
if tFlags.Has(RANDOMX_FLAG_LARGE_PAGES) && errors.Is(err, memory.PageNoMemoryErr) {
t.Skip("cannot allocate memory")
}
if err != nil {
t.Fatal(err)
}
defer func() {
err := c.Close()
if err != nil {
t.Error(err)
}
}()
tests := Tests[1:4]
c.Init(tests[0].key)
vm, err := NewVM(tFlags, c, nil)
if err != nil {
t.Fatal(err)
}
defer func() {
err := vm.Close()
if err != nil {
t.Error(err)
}
}()
var outputHash [3][RANDOMX_HASH_SIZE]byte
vm.CalculateHashFirst(tests[0].input)
vm.CalculateHashNext(tests[1].input, &outputHash[0])
vm.CalculateHashNext(tests[2].input, &outputHash[1])
vm.CalculateHashLast(&outputHash[2])
for i, test := range tests {
outputHex := hex.EncodeToString(outputHash[i][:])
if outputHex != test.expected {
t.Errorf("key=%v, input=%v", test.key, test.input)
t.Errorf("expected=%s, actual=%s", test.expected, outputHex)
t.FailNow()
}
}
})
}
}
func Benchmark_RandomX(b *testing.B) {
func Test_RandomXFull(t *testing.T) {
if testing.Short() {
t.Skip("Skipping full mode with -short")
}
if os.Getenv("CI") != "" {
t.Skip("Skipping full mode in CI environment")
}
for _, n := range []string{"interpreter", "compiler", "softaes", "hardaes", "largepages"} {
t.Run(n, func(t *testing.T) {
tFlags, skip := testFlags(t.Name(), RANDOMX_FLAG_FULL_MEM)
if skip {
t.Skip("not supported on this platform")
}
c, err := NewCache(tFlags)
if tFlags.Has(RANDOMX_FLAG_LARGE_PAGES) && errors.Is(err, memory.PageNoMemoryErr) {
t.Skip("cannot allocate memory")
}
if err != nil {
t.Fatal(err)
}
defer func() {
err := c.Close()
if err != nil {
t.Error(err)
}
}()
dataset, err := NewDataset(tFlags)
if err != nil {
t.Fatal(err)
}
defer func() {
err := dataset.Close()
if err != nil {
t.Error(err)
}
}()
for _, test := range Tests {
t.Run(test.name, func(t *testing.T) {
c.Init(test.key)
dataset.InitDatasetParallel(c, runtime.NumCPU())
vm, err := NewVM(tFlags, nil, dataset)
if err != nil {
t.Fatal(err)
}
defer func() {
err := vm.Close()
if err != nil {
t.Error(err)
}
}()
var outputHash [RANDOMX_HASH_SIZE]byte
vm.CalculateHash(test.input, &outputHash)
outputHex := hex.EncodeToString(outputHash[:])
if outputHex != test.expected {
t.Errorf("key=%v, input=%v", test.key, test.input)
t.Errorf("expected=%s, actual=%s", test.expected, outputHex)
t.FailNow()
}
})
// cleanup between runs
runtime.GC()
}
})
// cleanup 2 GiB between runs
runtime.GC()
}
}
var BenchmarkTest = Tests[0]
var BenchmarkCache *Cache
var BenchmarkDataset *Dataset
var BenchmarkFlags = GetFlags()
func TestMain(m *testing.M) {
if slices.Contains(os.Args, "-test.bench") {
flags := GetFlags()
flags |= RANDOMX_FLAG_FULL_MEM
var err error
//init light and full dataset
BenchmarkCache, err = NewCache(flags | RANDOMX_FLAG_LARGE_PAGES)
if err != nil {
BenchmarkCache, err = NewCache(flags)
if err != nil {
panic(err)
}
}
defer BenchmarkCache.Close()
BenchmarkCache.Init(BenchmarkTest.key)
BenchmarkDataset, err = NewDataset(flags | RANDOMX_FLAG_FULL_MEM | RANDOMX_FLAG_LARGE_PAGES)
if err != nil {
BenchmarkDataset, err = NewDataset(flags | RANDOMX_FLAG_FULL_MEM)
if err != nil {
panic(err)
}
}
defer BenchmarkDataset.Close()
BenchmarkDataset.InitDatasetParallel(BenchmarkCache, runtime.NumCPU())
}
os.Exit(m.Run())
}
func Benchmark_RandomXLight(b *testing.B) {
b.ReportAllocs()
tt := Tests[0]
vm, err := NewVM(BenchmarkFlags, BenchmarkCache, nil)
if err != nil {
b.Fatal(err)
}
defer vm.Close()
c := Randomx_alloc_cache(0)
c.Init(tt.key)
defer func() {
err := c.Close()
if err != nil {
b.Error(err)
}
}()
vm := c.VM_Initialize()
b.ResetTimer()
for i := 0; i < b.N; i++ {
var output_hash [32]byte
vm.CalculateHash(tt.input, &output_hash)
vm.CalculateHash(BenchmarkTest.input, &output_hash)
runtime.KeepAlive(output_hash)
}
}
func Benchmark_RandomXParallel(b *testing.B) {
func Benchmark_RandomXFull(b *testing.B) {
b.ReportAllocs()
tt := Tests[0]
c := Randomx_alloc_cache(0)
c.Init(tt.key)
defer func() {
err := c.Close()
if err != nil {
b.Error(err)
}
}()
vm, err := NewVM(BenchmarkFlags|RANDOMX_FLAG_FULL_MEM, nil, BenchmarkDataset)
if err != nil {
b.Fatal(err)
}
defer vm.Close()
b.ResetTimer()
for i := 0; i < b.N; i++ {
var output_hash [32]byte
vm.CalculateHash(BenchmarkTest.input, &output_hash)
runtime.KeepAlive(output_hash)
}
}
func Benchmark_RandomXLight_Parallel(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
var output_hash [32]byte
vm := c.VM_Initialize()
vm, err := NewVM(BenchmarkFlags, BenchmarkCache, nil)
if err != nil {
b.Fatal(err)
}
defer vm.Close()
for pb.Next() {
vm.CalculateHash(tt.input, &output_hash)
vm.CalculateHash(BenchmarkTest.input, &output_hash)
runtime.KeepAlive(output_hash)
}
})
}
func Benchmark_RandomXFull_Parallel(b *testing.B) {
b.ReportAllocs()
b.ResetTimer()
b.RunParallel(func(pb *testing.PB) {
var output_hash [32]byte
vm, err := NewVM(BenchmarkFlags|RANDOMX_FLAG_FULL_MEM, nil, BenchmarkDataset)
if err != nil {
b.Fatal(err)
}
defer vm.Close()
for pb.Next() {
vm.CalculateHash(BenchmarkTest.input, &output_hash)
runtime.KeepAlive(output_hash)
}
})

View file

@ -1,3 +1,29 @@
package randomx
import "unsafe"

// RegistersCount is the number of 64-bit integer registers (r0-r7).
const RegistersCount = 8

// RegistersCountFloat is the number of 128-bit floating point register
// groups (each holding two float64 lanes) in each of F, E and A.
const RegistersCountFloat = 4

// LOW and HIGH index the two float64 lanes of a floating point register group.
const LOW = 0
const HIGH = 1

// RegisterLine is one line of integer registers r0-r7.
// (The stale duplicate declaration using the removed REGISTERSCOUNT
// constant has been dropped; it redeclared the same type.)
type RegisterLine [RegistersCount]uint64

// RegisterFile is the RandomX register file: integer registers R and
// floating point register groups F, E, A, plus the rounding-mode byte FPRC.
type RegisterFile struct {
	R RegisterLine
	F [RegistersCountFloat][2]float64
	E [RegistersCountFloat][2]float64
	A [RegistersCountFloat][2]float64

	FPRC uint8
}

// RegisterFileSize is the byte size of the R/F/E/A registers.
// FPRC is intentionally excluded: Memory/Clear only cover the first
// RegisterFileSize bytes.
const RegisterFileSize = RegistersCount*8 + RegistersCountFloat*2*8*3

// Memory returns the R/F/E/A registers as a flat fixed-size byte view.
func (rf *RegisterFile) Memory() *[RegisterFileSize]byte {
	return (*[RegisterFileSize]byte)(unsafe.Pointer(rf))
}

// Clear zeroes R, F, E and A; FPRC is left untouched.
func (rf *RegisterFile) Clear() {
	clear(rf.Memory()[:])
}

View file

@ -29,7 +29,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package randomx
import "math/bits"
import (
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/blake2"
"math/bits"
)
type ExecutionPort byte
@ -201,7 +204,7 @@ var buffer3 = []int{4, 9, 3}
var buffer4 = []int{4, 4, 4, 4}
var buffer5 = []int{3, 3, 10}
var Decoder_To_Instruction_Length = [][]int{
var decoderToInstructionSize = [][]int{
buffer0,
buffer1,
buffer2,
@ -258,7 +261,7 @@ func (d DecoderType) String() string {
}
}
func FetchNextDecoder(ins *Instruction, cycle int, mulcount int, gen *Blake2Generator) DecoderType {
func FetchNextDecoder(ins *Instruction, cycle int, mulcount int, gen *blake2.Generator) DecoderType {
if ins.Opcode == S_IMULH_R || ins.Opcode == S_ISMULH_R {
return Decoder3310
@ -295,172 +298,20 @@ func FetchNextDecoder(ins *Instruction, cycle int, mulcount int, gen *Blake2Gene
return Decoder484
}
var slot3 = []*Instruction{&ISUB_R, &IXOR_R} // 3 length instruction will be filled with these
var slot3L = []*Instruction{&ISUB_R, &IXOR_R, &IMULH_R, &ISMULH_R}
var slot4 = []*Instruction{&IROR_C, &IADD_RS}
var slot7 = []*Instruction{&IXOR_C7, &IADD_C7}
var slot8 = []*Instruction{&IXOR_C8, &IADD_C8}
var slot9 = []*Instruction{&IXOR_C9, &IADD_C9}
var slot10 = []*Instruction{&IMUL_RCP}
// SuperScalarInstruction superscalar program is built with superscalar instructions
// SuperScalarInstruction superscalar program is built with superscalar instructions
type SuperScalarInstruction struct {
	Opcode           byte         // S_* superscalar opcode
	Dst_Reg          int          // destination register index; -1 until selected (see Reset)
	Src_Reg          int          // source register index; -1 until selected (see Reset)
	Mod              byte         // mod byte drawn from the generator (only IADD_RS sets it non-zero)
	Imm32            uint32       // 32-bit immediate (shift amount, constant, or RCP divisor)
	Type             int
	OpGroup          int          // scheduling group used to reject trivially optimizable chains
	OpGroupPar       int          // group parameter: a register index or a random tag
	GroupParIsSource int          // non-zero: OpGroupPar tracks the selected source register
	ins              *Instruction // macro-op template this instruction was created from
	CanReuse         bool         // destination may equal source (set for IMULH_R/ISMULH_R)
}
// FixSrcReg falls back to the destination register as the source when no
// source was ever selected (Src_Reg still negative after Reset).
func (sins *SuperScalarInstruction) FixSrcReg() {
	if sins.Src_Reg < 0 {
		sins.Src_Reg = sins.Dst_Reg
	}
}
// Reset returns the instruction to its pre-selection state: an invalid
// opcode (99), unselected (-1) source/destination registers, and cleared
// reuse/group flags.
func (sins *SuperScalarInstruction) Reset() {
	sins.Opcode = 99
	sins.Src_Reg = -1
	sins.Dst_Reg = -1
	sins.CanReuse = false
	sins.GroupParIsSource = 0
}
// create initializes sins from the instruction template ins, drawing
// immediates and mod bits from the Blake2 generator and assigning the
// scheduling group (OpGroup/OpGroupPar) used to avoid optimizable chains.
func create(sins *SuperScalarInstruction, ins *Instruction, gen *Blake2Generator) {
	sins.Reset()
	sins.ins = ins
	sins.OpGroupPar = -1
	sins.Opcode = ins.Opcode

	switch ins.Opcode {
	case S_ISUB_R:
		sins.Mod = 0
		sins.Imm32 = 0
		// grouped with IADD_RS — presumably so add/sub chains share one
		// group; confirm against the reference implementation
		sins.OpGroup = S_IADD_RS
		sins.GroupParIsSource = 1
	case S_IXOR_R:
		sins.Mod = 0
		sins.Imm32 = 0
		sins.OpGroup = S_IXOR_R
		sins.GroupParIsSource = 1
	case S_IADD_RS:
		sins.Mod = gen.GetByte()
		// set modshift on Imm32
		sins.Imm32 = uint32((sins.Mod >> 2) % 4) // bits 2-3
		//sins.Imm32 = 0
		sins.OpGroup = S_IADD_RS
		sins.GroupParIsSource = 1
	case S_IMUL_R:
		sins.Mod = 0
		sins.Imm32 = 0
		sins.OpGroup = S_IMUL_R
		sins.GroupParIsSource = 1
	case S_IROR_C:
		sins.Mod = 0
		// rotate amount must be non-zero
		for sins.Imm32 = 0; sins.Imm32 == 0; {
			sins.Imm32 = uint32(gen.GetByte() & 63)
		}
		sins.OpGroup = S_IROR_C
		sins.OpGroupPar = -1
	case S_IADD_C7, S_IADD_C8, S_IADD_C9:
		sins.Mod = 0
		sins.Imm32 = gen.GetUint32()
		sins.OpGroup = S_IADD_C7
		sins.OpGroupPar = -1
	case S_IXOR_C7, S_IXOR_C8, S_IXOR_C9:
		sins.Mod = 0
		sins.Imm32 = gen.GetUint32()
		sins.OpGroup = S_IXOR_C7
		sins.OpGroupPar = -1
	case S_IMULH_R:
		sins.CanReuse = true
		sins.Mod = 0
		sins.Imm32 = 0
		sins.OpGroup = S_IMULH_R
		sins.OpGroupPar = int(gen.GetUint32())
	case S_ISMULH_R:
		sins.CanReuse = true
		sins.Mod = 0
		sins.Imm32 = 0
		sins.OpGroup = S_ISMULH_R
		sins.OpGroupPar = int(gen.GetUint32())
	case S_IMUL_RCP:
		sins.Mod = 0
		for {
			sins.Imm32 = gen.GetUint32()
			// NOTE(review): Go precedence parses this as (Imm32&Imm32)-1, i.e.
			// Imm32-1, so the loop accepts 0 and powers of two. The intended
			// zero-or-power-of-two rejection is Imm32&(Imm32-1) != 0 — confirm
			// against the reference RandomX superscalar generator.
			if (sins.Imm32&sins.Imm32 - 1) != 0 {
				break
			}
		}
		sins.OpGroup = S_IMUL_RCP
	default:
		panic("should not occur")
	}
}
func CreateSuperScalarInstruction(sins *SuperScalarInstruction, gen *Blake2Generator, instruction_len int, decoder_type int, islast, isfirst bool) {
switch instruction_len {
case 3:
if islast {
create(sins, slot3L[gen.GetByte()&3], gen)
} else {
create(sins, slot3[gen.GetByte()&1], gen)
}
case 4:
//if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions
if decoder_type == int(Decoder4444) && !islast {
create(sins, &IMUL_R, gen)
} else {
create(sins, slot4[gen.GetByte()&1], gen)
}
case 7:
create(sins, slot7[gen.GetByte()&1], gen)
case 8:
create(sins, slot8[gen.GetByte()&1], gen)
case 9:
create(sins, slot9[gen.GetByte()&1], gen)
case 10:
create(sins, slot10[0], gen)
default:
panic("should not be possible")
}
}
type SuperScalarProgram []SuperScalarInstruction
func (p SuperScalarProgram) setAddressRegister(addressRegister int) {
p[0].Dst_Reg = addressRegister
func (p SuperScalarProgram) setAddressRegister(addressRegister uint8) {
p[0].Dst = addressRegister
}
func (p SuperScalarProgram) AddressRegister() int {
return p[0].Dst_Reg
func (p SuperScalarProgram) AddressRegister() uint8 {
return p[0].Dst
}
func (p SuperScalarProgram) Program() []SuperScalarInstruction {
return p[1:]
}
func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram {
func BuildSuperScalarProgram(gen *blake2.Generator) SuperScalarProgram {
cycle := 0
depcycle := 0
//retire_cycle := 0
@ -474,12 +325,7 @@ func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram {
code_size := 0
program := make(SuperScalarProgram, 1, 512)
preAllocatedRegisters := gen.allocRegIndex[:]
registers := gen.allocRegisters[:]
for i := range registers {
registers[i] = Register{}
}
var registers [8]Register
sins := &SuperScalarInstruction{}
sins.ins = &Instruction{Opcode: S_NOP}
@ -508,7 +354,7 @@ func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram {
if ports_saturated || program_size >= SuperscalarMaxSize {
break
}
CreateSuperScalarInstruction(sins, gen, Decoder_To_Instruction_Length[int(decoder)][buffer_index], int(decoder), len(Decoder_To_Instruction_Length[decoder]) == (buffer_index+1), buffer_index == 0)
CreateSuperScalarInstruction(sins, gen, decoderToInstructionSize[decoder][buffer_index], decoder, len(decoderToInstructionSize[decoder]) == (buffer_index+1), buffer_index == 0)
macro_op_index = 0
}
@ -529,7 +375,7 @@ func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram {
if macro_op_index == sins.ins.SrcOP { // FIXME
forward := 0
for ; forward < LOOK_FORWARD_CYCLES && !sins.SelectSource(preAllocatedRegisters, scheduleCycle, registers, gen); forward++ {
for ; forward < LOOK_FORWARD_CYCLES && !sins.SelectSource(scheduleCycle, &registers, gen); forward++ {
scheduleCycle++
cycle++
}
@ -547,7 +393,7 @@ func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram {
if macro_op_index == sins.ins.DstOP { // FIXME
forward := 0
for ; forward < LOOK_FORWARD_CYCLES && !sins.SelectDestination(preAllocatedRegisters, scheduleCycle, throwAwayCount > 0, registers, gen); forward++ {
for ; forward < LOOK_FORWARD_CYCLES && !sins.SelectDestination(scheduleCycle, throwAwayCount > 0, &registers, gen); forward++ {
scheduleCycle++
cycle++
}
@ -569,9 +415,9 @@ func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram {
depcycle = scheduleCycle + mop.GetLatency() // calculate when will the result be ready
if macro_op_index == sins.ins.ResultOP { // fix me
registers[sins.Dst_Reg].Latency = depcycle
registers[sins.Dst_Reg].LastOpGroup = sins.OpGroup
registers[sins.Dst_Reg].LastOpPar = sins.OpGroupPar
registers[sins.Dst].Latency = depcycle
registers[sins.Dst].LastOpGroup = sins.OpGroup
registers[sins.Dst].LastOpPar = sins.OpGroupPar
}
@ -609,12 +455,12 @@ func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram {
if i == 0 {
continue
}
lastdst := asic_latencies[program[i].Dst_Reg] + 1
lastdst := asic_latencies[program[i].Dst] + 1
lastsrc := 0
if program[i].Dst_Reg != program[i].Src_Reg {
lastsrc = asic_latencies[program[i].Src_Reg] + 1
if program[i].Dst != program[i].Src {
lastsrc = asic_latencies[program[i].Src] + 1
}
asic_latencies[program[i].Dst_Reg] = max(lastdst, lastsrc)
asic_latencies[program[i].Dst] = max(lastdst, lastsrc)
}
asic_latency_max := 0
@ -628,7 +474,7 @@ func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram {
}
// Set AddressRegister hack
program.setAddressRegister(address_reg)
program.setAddressRegister(uint8(address_reg))
return program
}
@ -702,122 +548,101 @@ type Register struct {
//RegisterNeedsSib = 4; //x86 r12 register
}
// RegisterNeedsDisplacement x86 r13 register.
// In x86-64 ModRM addressing, r13 as a base always requires an explicit
// displacement byte; SelectSource special-cases this register for IADD_RS.
const RegisterNeedsDisplacement = 5

// RegisterNeedsSib x86 r12 register.
// In x86-64 ModRM addressing, r12 as a base always requires a SIB byte.
const RegisterNeedsSib = 4
func (sins *SuperScalarInstruction) SelectSource(preAllocatedAvailableRegisters []int, cycle int, Registers []Register, gen *Blake2Generator) bool {
available_registers := preAllocatedAvailableRegisters[:0]
func (sins *SuperScalarInstruction) SelectSource(cycle int, registers *[8]Register, gen *blake2.Generator) bool {
availableRegisters := make([]uint8, 0, 8)
for i := range Registers {
if Registers[i].Latency <= cycle {
available_registers = append(available_registers, i)
for i := range registers {
if registers[i].Latency <= cycle {
availableRegisters = append(availableRegisters, uint8(i))
}
}
if len(available_registers) == 2 && sins.Opcode == S_IADD_RS {
if available_registers[0] == RegisterNeedsDisplacement || available_registers[1] == RegisterNeedsDisplacement {
sins.Src_Reg = RegisterNeedsDisplacement
sins.OpGroupPar = sins.Src_Reg
if len(availableRegisters) == 2 && sins.Opcode == S_IADD_RS {
if availableRegisters[0] == RegisterNeedsDisplacement || availableRegisters[1] == RegisterNeedsDisplacement {
sins.Src = RegisterNeedsDisplacement
sins.OpGroupPar = int(sins.Src)
return true
}
}
if selectRegister(available_registers, gen, &sins.Src_Reg) {
if selectRegister(availableRegisters, gen, &sins.Src) {
if sins.GroupParIsSource == 0 {
} else {
sins.OpGroupPar = sins.Src_Reg
sins.OpGroupPar = int(sins.Src)
}
return true
}
return false
}
func (sins *SuperScalarInstruction) SelectDestination(preAllocatedAvailableRegisters []int, cycle int, allowChainedMul bool, Registers []Register, gen *Blake2Generator) bool {
preAllocatedAvailableRegisters = preAllocatedAvailableRegisters[:0]
func (sins *SuperScalarInstruction) SelectDestination(cycle int, allowChainedMul bool, Registers *[8]Register, gen *blake2.Generator) bool {
var availableRegisters = make([]uint8, 0, 8)
for i := range Registers {
if Registers[i].Latency <= cycle && (sins.CanReuse || i != sins.Src_Reg) &&
if Registers[i].Latency <= cycle && (sins.CanReuse || uint8(i) != sins.Src) &&
(allowChainedMul || sins.OpGroup != S_IMUL_R || Registers[i].LastOpGroup != S_IMUL_R) &&
(Registers[i].LastOpGroup != sins.OpGroup || Registers[i].LastOpPar != sins.OpGroupPar) &&
(sins.Opcode != S_IADD_RS || i != RegisterNeedsDisplacement) {
preAllocatedAvailableRegisters = append(preAllocatedAvailableRegisters, i)
availableRegisters = append(availableRegisters, uint8(i))
}
}
return selectRegister(preAllocatedAvailableRegisters, gen, &sins.Dst_Reg)
return selectRegister(availableRegisters, gen, &sins.Dst)
}
func selectRegister(available_registers []int, gen *Blake2Generator, reg *int) bool {
func selectRegister(availableRegisters []uint8, gen *blake2.Generator, reg *uint8) bool {
index := 0
if len(available_registers) == 0 {
if len(availableRegisters) == 0 {
return false
}
if len(available_registers) > 1 {
if len(availableRegisters) > 1 {
tmp := gen.GetUint32()
index = int(tmp % uint32(len(available_registers)))
index = int(tmp % uint32(len(availableRegisters)))
} else {
index = 0
}
*reg = available_registers[index]
*reg = availableRegisters[index]
return true
}
// Mask is the number of 64-byte cache lines in the RandomX cache minus one —
// presumably used to wrap cache item indices; confirm at call sites.
const Mask = CacheSize/CacheLineSize - 1
// executeSuperscalar interprets the superscalar program p against the
// register line r. This is the non-JIT path; IMUL_RCP uses the reciprocal
// precomputed in Imm64. (Interleaved old/new diff lines collapsed into the
// current Dst/Src field names.)
func executeSuperscalar(p []SuperScalarInstruction, r *RegisterLine) {
	//TODO: produce around (14 * 8 * 8) = 896 different opcodes with hardcoded registers
	for i := range p {
		ins := &p[i]
		switch ins.Opcode {
		case S_ISUB_R:
			r[ins.Dst] -= r[ins.Src]
		case S_IXOR_R:
			r[ins.Dst] ^= r[ins.Src]
		case S_IADD_RS:
			// Imm32 carries the shift amount (mod bits 2-3)
			r[ins.Dst] += r[ins.Src] << ins.Imm32
		case S_IMUL_R:
			r[ins.Dst] *= r[ins.Src]
		case S_IROR_C:
			// rotate right by Imm32 == rotate left by -Imm32
			r[ins.Dst] = bits.RotateLeft64(r[ins.Dst], 0-int(ins.Imm32))
		case S_IADD_C7, S_IADD_C8, S_IADD_C9:
			r[ins.Dst] += signExtend2sCompl(ins.Imm32)
		case S_IXOR_C7, S_IXOR_C8, S_IXOR_C9:
			r[ins.Dst] ^= signExtend2sCompl(ins.Imm32)
		case S_IMULH_R:
			// unsigned high 64 bits of the 128-bit product
			r[ins.Dst], _ = bits.Mul64(r[ins.Dst], r[ins.Src])
		case S_ISMULH_R:
			// signed high 64 bits of the 128-bit product
			r[ins.Dst] = smulh(int64(r[ins.Dst]), int64(r[ins.Src]))
		case S_IMUL_RCP:
			r[ins.Dst] *= ins.Imm64
		}
	}
}
// smulh returns the high 64 bits of the signed 128-bit product a*b.
// It derives the signed high half from the unsigned one: subtract b when
// a is negative and a when b is negative (two's-complement correction).
func smulh(a, b int64) uint64 {
	hi, _ := bits.Mul64(uint64(a), uint64(b))
	correction := (a>>63)&b + (b>>63)&a
	return uint64(int64(hi) - correction)
}
// randomx_reciprocal computes the RandomX fixed-point reciprocal of divisor:
// the largest 64-bit multiplier m such that m*divisor fits the 2^(64+shift)
// scale, built from 2^63/divisor and its remainder. divisor must be non-zero.
func randomx_reciprocal(divisor uint32) uint64 {
	const p2exp63 = uint64(1) << 63
	d := uint64(divisor)
	q, r := p2exp63/d, p2exp63%d
	// shift by the bit length of the divisor to maximize precision
	shift := uint32(bits.Len32(divisor))
	return q<<shift + (r<<shift)/d
}
// signExtend2sCompl widens x to 64 bits, replicating its two's-complement
// sign bit through the upper half.
func signExtend2sCompl(x uint32) uint64 {
	if x&0x80000000 != 0 {
		return uint64(x) | 0xFFFFFFFF00000000
	}
	return uint64(x)
}

View file

@ -1,152 +0,0 @@
//go:build unix && amd64 && !disable_jit
package randomx
import (
"encoding/binary"
)
var REX_SUB_RR = []byte{0x4d, 0x2b}
var REX_MOV_RR64 = []byte{0x49, 0x8b}
var REX_MOV_R64R = []byte{0x4c, 0x8b}
var REX_IMUL_RR = []byte{0x4d, 0x0f, 0xaf}
var REX_IMUL_RM = []byte{0x4c, 0x0f, 0xaf}
var REX_MUL_R = []byte{0x49, 0xf7}
var REX_81 = []byte{0x49, 0x81}
var MOV_RAX_I = []byte{0x48, 0xb8}
var REX_LEA = []byte{0x4f, 0x8d}
var REX_XOR_RR = []byte{0x4D, 0x33}
var REX_XOR_RI = []byte{0x49, 0x81}
var REX_ROT_I8 = []byte{0x49, 0xc1}
// genSIB packs an x86 SIB byte: scale in bits 7-6, index in bits 5-3,
// base in bits 2-0.
func genSIB(scale, index, base int) byte {
	sib := scale << 6
	sib |= index << 3
	sib |= base
	return byte(sib)
}
/*
push rbp
push rbx
push rsi
push r12
push r13
push r14
push r15
mov rbp,rsp
sub rsp,(0x8*7)
mov rsi, rax; # register dataset
prefetchnta byte ptr [rsi]
mov r8, qword ptr [rsi+0]
mov r9, qword ptr [rsi+8]
mov r10, qword ptr [rsi+16]
mov r11, qword ptr [rsi+24]
mov r12, qword ptr [rsi+32]
mov r13, qword ptr [rsi+40]
mov r14, qword ptr [rsi+48]
mov r15, qword ptr [rsi+56]
*/
var codeInitBlock = []byte{0x55, 0x53, 0x56, 0x41, 0x54, 0x41, 0x55, 0x41, 0x56, 0x41, 0x57, 0x48, 0x89, 0xE5, 0x48, 0x83, 0xEC, 0x38, 0x48, 0x89, 0xC6, 0x0F, 0x18, 0x06, 0x4C, 0x8B, 0x06, 0x4C, 0x8B, 0x4E, 0x08, 0x4C, 0x8B, 0x56, 0x10, 0x4C, 0x8B, 0x5E, 0x18, 0x4C, 0x8B, 0x66, 0x20, 0x4C, 0x8B, 0x6E, 0x28, 0x4C, 0x8B, 0x76, 0x30, 0x4C, 0x8B, 0x7E, 0x38}
/*
prefetchw byte ptr [rsi]
mov qword ptr [rsi+0], r8
mov qword ptr [rsi+8], r9
mov qword ptr [rsi+16], r10
mov qword ptr [rsi+24], r11
mov qword ptr [rsi+32], r12
mov qword ptr [rsi+40], r13
mov qword ptr [rsi+48], r14
mov qword ptr [rsi+56], r15
add rsp,(0x8*7)
pop r15
pop r14
pop r13
pop r12
pop rsi
pop rbx
pop rbp
ret
*/
var codeRetBlock = []byte{0x0F, 0x0D, 0x0E, 0x4C, 0x89, 0x06, 0x4C, 0x89, 0x4E, 0x08, 0x4C, 0x89, 0x56, 0x10, 0x4C, 0x89, 0x5E, 0x18, 0x4C, 0x89, 0x66, 0x20, 0x4C, 0x89, 0x6E, 0x28, 0x4C, 0x89, 0x76, 0x30, 0x4C, 0x89, 0x7E, 0x38, 0x48, 0x83, 0xC4, 0x38, 0x41, 0x5F, 0x41, 0x5E, 0x41, 0x5D, 0x41, 0x5C, 0x5E, 0x5B, 0x5D, 0xC3}
// generateSuperscalarCode assembles the superscalar program into native
// amd64 machine code: a prologue that loads the register line into R8-R15
// (codeInitBlock), one instruction encoding per superscalar op, and an
// epilogue that stores R8-R15 back and returns (codeRetBlock). The result
// is mapped into executable memory via mapProgram.
func generateSuperscalarCode(scalarProgram SuperScalarProgram) ProgramFunc {
	var program []byte
	program = append(program, codeInitBlock...)
	p := scalarProgram.Program()
	for i := range p {
		instr := &p[i]
		// mask register indices into 0-7 to map onto the R8-R15 encodings
		dst := instr.Dst_Reg % REGISTERSCOUNT
		src := instr.Src_Reg % REGISTERSCOUNT
		switch instr.Opcode {
		case S_ISUB_R:
			program = append(program, REX_SUB_RR...)
			program = append(program, byte(0xc0+8*dst+src))
		case S_IXOR_R:
			program = append(program, REX_XOR_RR...)
			program = append(program, byte(0xc0+8*dst+src))
		case S_IADD_RS:
			// lea dst, [dst+src*scale] — Imm32 carries the scale bits
			program = append(program, REX_LEA...)
			program = append(program,
				byte(0x04+8*dst),
				genSIB(int(instr.Imm32), src, dst),
			)
		case S_IMUL_R:
			program = append(program, REX_IMUL_RR...)
			program = append(program, byte(0xc0+8*dst+src))
		case S_IROR_C:
			// ror dst, imm8 (amount masked to 0-63)
			program = append(program, REX_ROT_I8...)
			program = append(program,
				byte(0xc8+dst),
				byte(instr.Imm32&63),
			)
		case S_IADD_C7, S_IADD_C8, S_IADD_C9:
			program = append(program, REX_81...)
			program = append(program, byte(0xc0+dst))
			program = binary.LittleEndian.AppendUint32(program, instr.Imm32)
			//TODO: align NOP on C8/C9
		case S_IXOR_C7, S_IXOR_C8, S_IXOR_C9:
			program = append(program, REX_XOR_RI...)
			program = append(program, byte(0xf0+dst))
			program = binary.LittleEndian.AppendUint32(program, instr.Imm32)
			//TODO: align NOP on C8/C9
		case S_IMULH_R:
			// mov rax, dst; mul src; mov dst, rdx — unsigned high half
			program = append(program, REX_MOV_RR64...)
			program = append(program, byte(0xc0+dst))
			program = append(program, REX_MUL_R...)
			program = append(program, byte(0xe0+src))
			program = append(program, REX_MOV_R64R...)
			program = append(program, byte(0xc2+8*dst))
		case S_ISMULH_R:
			// mov rax, dst; imul src; mov dst, rdx — signed high half
			program = append(program, REX_MOV_RR64...)
			program = append(program, byte(0xc0+dst))
			program = append(program, REX_MUL_R...)
			program = append(program, byte(0xe8+src))
			program = append(program, REX_MOV_R64R...)
			program = append(program, byte(0xc2+8*dst))
		case S_IMUL_RCP:
			// mov rax, imm64 (reciprocal computed at emit time); imul dst, rax
			program = append(program, MOV_RAX_I...)
			program = binary.LittleEndian.AppendUint64(program, randomx_reciprocal(instr.Imm32))
			program = append(program, REX_IMUL_RM...)
			program = append(program, byte(0xc0+8*instr.Dst_Reg))
		default:
			panic("unreachable")
		}
	}
	program = append(program, codeRetBlock...)
	return mapProgram(program)
}

157
superscalar_instruction.go Normal file
View file

@ -0,0 +1,157 @@
package randomx
import "git.gammaspectra.live/P2Pool/go-randomx/v3/internal/blake2"
// SuperScalarInstruction superscalar program is built with superscalar instructions
type SuperScalarInstruction struct {
	Opcode byte   // S_* superscalar opcode
	Dst    uint8  // destination register index; 0xff sentinel until selected (see Reset)
	Src    uint8  // source register index; 0xff sentinel until selected (see Reset)
	Mod    byte   // mod byte drawn from the generator (only IADD_RS sets it non-zero)
	Imm32  uint32 // 32-bit immediate (shift amount, constant, or RCP divisor)
	Imm64  uint64 // precomputed reciprocal for IMUL_RCP

	OpGroup          int // scheduling group used to reject trivially optimizable chains
	OpGroupPar       int // group parameter: a register index or a random tag
	GroupParIsSource int // non-zero: OpGroupPar tracks the selected source register

	ins      *Instruction // macro-op template this instruction was created from
	CanReuse bool         // destination may equal source (set for IMULH_R/ISMULH_R)
}
// FixSrcReg falls back to the destination register as the source when no
// source was ever selected (Src still holds the 0xff sentinel set by Reset).
func (sins *SuperScalarInstruction) FixSrcReg() {
	if sins.Src == 0xff {
		sins.Src = sins.Dst
	}
}
// Reset returns the instruction to its pre-selection state: an invalid
// opcode (99), sentinel (0xff) source/destination registers, and cleared
// reuse/group flags.
func (sins *SuperScalarInstruction) Reset() {
	sins.Opcode = 99
	sins.Src = 0xff
	sins.Dst = 0xff
	sins.CanReuse = false
	sins.GroupParIsSource = 0
}
// createSuperScalarInstruction initializes sins from the instruction
// template ins, drawing immediates and mod bits from the Blake2 generator
// and assigning the scheduling group (OpGroup/OpGroupPar) used to avoid
// optimizable chains.
func createSuperScalarInstruction(sins *SuperScalarInstruction, ins *Instruction, gen *blake2.Generator) {
	sins.Reset()
	sins.ins = ins
	sins.OpGroupPar = -1
	sins.Opcode = ins.Opcode

	switch ins.Opcode {
	case S_ISUB_R:
		sins.Mod = 0
		sins.Imm32 = 0
		// grouped with IADD_RS so add/sub chains share one scheduling group
		sins.OpGroup = S_IADD_RS
		sins.GroupParIsSource = 1
	case S_IXOR_R:
		sins.Mod = 0
		sins.Imm32 = 0
		sins.OpGroup = S_IXOR_R
		sins.GroupParIsSource = 1
	case S_IADD_RS:
		sins.Mod = gen.GetByte()
		// set modshift on Imm32
		sins.Imm32 = uint32((sins.Mod >> 2) % 4) // bits 2-3
		sins.OpGroup = S_IADD_RS
		sins.GroupParIsSource = 1
	case S_IMUL_R:
		sins.Mod = 0
		sins.Imm32 = 0
		sins.OpGroup = S_IMUL_R
		sins.GroupParIsSource = 1
	case S_IROR_C:
		sins.Mod = 0
		// rotate amount must be non-zero
		for sins.Imm32 = 0; sins.Imm32 == 0; {
			sins.Imm32 = uint32(gen.GetByte() & 63)
		}
		sins.OpGroup = S_IROR_C
		sins.OpGroupPar = -1
	case S_IADD_C7, S_IADD_C8, S_IADD_C9:
		sins.Mod = 0
		sins.Imm32 = gen.GetUint32()
		sins.OpGroup = S_IADD_C7
		sins.OpGroupPar = -1
	case S_IXOR_C7, S_IXOR_C8, S_IXOR_C9:
		sins.Mod = 0
		sins.Imm32 = gen.GetUint32()
		sins.OpGroup = S_IXOR_C7
		sins.OpGroupPar = -1
	case S_IMULH_R:
		sins.CanReuse = true
		sins.Mod = 0
		sins.Imm32 = 0
		sins.OpGroup = S_IMULH_R
		sins.OpGroupPar = int(gen.GetUint32())
	case S_ISMULH_R:
		sins.CanReuse = true
		sins.Mod = 0
		sins.Imm32 = 0
		sins.OpGroup = S_ISMULH_R
		sins.OpGroupPar = int(gen.GetUint32())
	case S_IMUL_RCP:
		sins.Mod = 0
		// The multiplier must be neither zero (reciprocal would divide by
		// zero) nor a power of two, matching the reference generator.
		// FIX: the previous condition `(sins.Imm32&sins.Imm32 - 1) != 0`
		// parsed as `(Imm32-1) != 0` under Go operator precedence (& binds
		// tighter than -), accepting 0 and powers of two.
		for {
			sins.Imm32 = gen.GetUint32()
			if sins.Imm32&(sins.Imm32-1) != 0 {
				break
			}
		}
		sins.Imm64 = reciprocal(sins.Imm32)
		sins.OpGroup = S_IMUL_RCP
	default:
		panic("should not occur")
	}
}
// Candidate instruction pools per macro-op slot size; a slot of N bytes is
// filled by picking one of these templates from generator output.
var slot3 = []*Instruction{&ISUB_R, &IXOR_R} // 3 length instruction will be filled with these
var slot3L = []*Instruction{&ISUB_R, &IXOR_R, &IMULH_R, &ISMULH_R} // 3-byte slot in last position may also issue high multiplies
var slot4 = []*Instruction{&IROR_C, &IADD_RS}
var slot7 = []*Instruction{&IXOR_C7, &IADD_C7}
var slot8 = []*Instruction{&IXOR_C8, &IADD_C8}
var slot9 = []*Instruction{&IXOR_C9, &IADD_C9}
var slot10 = []*Instruction{&IMUL_RCP}
// CreateSuperScalarInstruction picks and initializes the superscalar
// instruction that fills a macro-op slot of instructionLen bytes.
// last/first flag whether the slot is the last/first of its decoder buffer:
// the last 3-byte slot may also issue high multiplies (slot3L), and the
// 4-4-4-4 decoder issues IMUL_R for every slot except its last.
func CreateSuperScalarInstruction(sins *SuperScalarInstruction, gen *blake2.Generator, instructionLen int, decoderType DecoderType, last, first bool) {
	switch instructionLen {
	case 3:
		if last {
			createSuperScalarInstruction(sins, slot3L[gen.GetByte()&3], gen)
		} else {
			createSuperScalarInstruction(sins, slot3[gen.GetByte()&1], gen)
		}
	case 4:
		//if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions
		if decoderType == Decoder4444 && !last {
			createSuperScalarInstruction(sins, &IMUL_R, gen)
		} else {
			createSuperScalarInstruction(sins, slot4[gen.GetByte()&1], gen)
		}
	case 7:
		createSuperScalarInstruction(sins, slot7[gen.GetByte()&1], gen)
	case 8:
		createSuperScalarInstruction(sins, slot8[gen.GetByte()&1], gen)
	case 9:
		createSuperScalarInstruction(sins, slot9[gen.GetByte()&1], gen)
	case 10:
		createSuperScalarInstruction(sins, slot10[0], gen)
	default:
		panic("should not be possible")
	}
}

101
superscalar_jit_amd64.go Normal file
View file

@ -0,0 +1,101 @@
//go:build unix && amd64 && !disable_jit && !purego
package randomx
import (
"encoding/binary"
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/memory"
"unsafe"
)
//go:noescape
func superscalar_run(rf, jmp uintptr)
// Execute runs the JIT-compiled superscalar program. rf is the address of
// the RegisterLine to operate on; the machine code itself is the byte slice
// f, whose data pointer is passed to the assembly trampoline.
// Panics if the program is nil.
func (f SuperScalarProgramFunc) Execute(rf uintptr) {
	if f == nil {
		panic("program is nil")
	}
	superscalar_run(rf, uintptr(unsafe.Pointer(unsafe.SliceData(f))))
}
// generateSuperscalarCode assembles the superscalar program into native
// amd64 machine code. The emitted code assumes the register line r0-r7
// already lives in R8-R15 (loaded/stored by the superscalar_run trampoline)
// and ends with a RET. Returns nil if executable page memory cannot be
// allocated.
func generateSuperscalarCode(scalarProgram SuperScalarProgram) SuperScalarProgramFunc {
	var program []byte

	p := scalarProgram.Program()
	for i := range p {
		instr := &p[i]

		// mask register indices into 0-7 to map onto the R8-R15 encodings
		dst := instr.Dst % RegistersCount
		src := instr.Src % RegistersCount

		switch instr.Opcode {
		case S_ISUB_R:
			program = append(program, REX_SUB_RR...)
			program = append(program, byte(0xc0+8*dst+src))
		case S_IXOR_R:
			program = append(program, REX_XOR_RR...)
			program = append(program, byte(0xc0+8*dst+src))
		case S_IADD_RS:
			// lea dst, [dst+src*scale] — Imm32 carries the scale bits
			program = append(program, REX_LEA...)
			program = append(program,
				byte(0x04+8*dst),
				genSIB(int(instr.Imm32), int(src), int(dst)),
			)
		case S_IMUL_R:
			program = append(program, REX_IMUL_RR...)
			program = append(program, byte(0xc0+8*dst+src))
		case S_IROR_C:
			// ror dst, imm8 (amount masked to 0-63)
			program = append(program, REX_ROT_I8...)
			program = append(program,
				byte(0xc8+dst),
				byte(instr.Imm32&63),
			)
		case S_IADD_C7, S_IADD_C8, S_IADD_C9:
			program = append(program, REX_81...)
			program = append(program, byte(0xc0+dst))
			program = binary.LittleEndian.AppendUint32(program, instr.Imm32)
			//TODO: align NOP on C8/C9
		case S_IXOR_C7, S_IXOR_C8, S_IXOR_C9:
			program = append(program, REX_XOR_RI...)
			program = append(program, byte(0xf0+dst))
			program = binary.LittleEndian.AppendUint32(program, instr.Imm32)
			//TODO: align NOP on C8/C9
		case S_IMULH_R:
			// mov rax, dst; mul src; mov dst, rdx — unsigned high half
			program = append(program, REX_MOV_RR64...)
			program = append(program, byte(0xc0+dst))
			program = append(program, REX_MUL_R...)
			program = append(program, byte(0xe0+src))
			program = append(program, REX_MOV_R64R...)
			program = append(program, byte(0xc2+8*dst))
		case S_ISMULH_R:
			// mov rax, dst; imul src; mov dst, rdx — signed high half
			program = append(program, REX_MOV_RR64...)
			program = append(program, byte(0xc0+dst))
			program = append(program, REX_MUL_R...)
			program = append(program, byte(0xe8+src))
			program = append(program, REX_MOV_R64R...)
			program = append(program, byte(0xc2+8*dst))
		case S_IMUL_RCP:
			// mov rax, imm64 (reciprocal precomputed in Imm64); imul dst, rax
			program = append(program, MOV_RAX_I...)
			program = binary.LittleEndian.AppendUint64(program, instr.Imm64)
			program = append(program, REX_IMUL_RM...)
			program = append(program, byte(0xc0+8*instr.Dst))
		default:
			panic("unreachable")
		}
	}
	program = append(program, RET)

	pagedMemory, err := memory.AllocateSlice[byte](pageAllocator, len(program))
	if err != nil {
		return nil
	}
	copy(pagedMemory, program)

	return pagedMemory
}

42
superscalar_jit_amd64.s Normal file
View file

@ -0,0 +1,42 @@
//go:build unix && amd64 && !disable_jit && !purego
#include "textflag.h"
// superscalar_run(rf, jmp uintptr): trampoline for the JIT-compiled
// superscalar program. Loads the 8×uint64 register line at rf into R8-R15,
// CALLs the generated code at jmp (which ends in RET), then stores R8-R15
// back to rf.
TEXT ·superscalar_run(SB),$0-16
	MOVQ rf+0(FP), SI
	PREFETCHNTA 0(SI)
	// move register line to registers
	MOVQ 0(SI), R8
	MOVQ 8(SI), R9
	MOVQ 16(SI), R10
	MOVQ 24(SI), R11
	MOVQ 32(SI), R12
	MOVQ 40(SI), R13
	MOVQ 48(SI), R14
	MOVQ 56(SI), R15

	MOVQ jmp+8(FP), AX
	// jump to JIT code
	CALL AX

	// prefetchw BYTE PTR [rsi] — emitted as raw bytes; the Go assembler
	// has no PREFETCHW mnemonic here
	// PREFETCHW 0(SI)
	BYTE $0x0F
	BYTE $0x0D
	BYTE $0x0E

	// move registers back to register line
	MOVQ R8, 0(SI)
	MOVQ R9, 8(SI)
	MOVQ R10, 16(SI)
	MOVQ R11, 24(SI)
	MOVQ R12, 32(SI)
	MOVQ R13, 40(SI)
	MOVQ R14, 48(SI)
	MOVQ R15, 56(SI)

	RET

View file

@ -1,8 +1,12 @@
//go:build !unix || !amd64 || disable_jit
//go:build !unix || !amd64 || purego || disable_jit
package randomx
func (f SuperScalarProgramFunc) Execute(rf uintptr) {
}
// generateSuperscalarCode
func generateSuperscalarCode(scalarProgram SuperScalarProgram) ProgramFunc {
func generateSuperscalarCode(scalarProgram SuperScalarProgram) SuperScalarProgramFunc {
return nil
}

487
vm.go
View file

@ -30,267 +30,384 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/aes"
"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
"errors"
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/aes"
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/memory"
"math"
"runtime"
"unsafe"
)
import "encoding/binary"
import "golang.org/x/crypto/blake2b"
type REG struct {
Hi uint64
Lo uint64
}
type VM struct {
StateStart [64]byte
buffer [RANDOMX_PROGRAM_SIZE*8 + 16*8]byte // first 128 bytes are entropy below rest are program bytes
Prog []byte
ScratchPad [ScratchpadSize]byte
pad *ScratchPad
ByteCode [RANDOMX_PROGRAM_SIZE]InstructionByteCode
flags Flags
// program configuration see program.hpp
// buffer first 128 bytes are entropy below rest are program bytes
buffer [16*8 + RANDOMX_PROGRAM_SIZE*8]byte
entropy [16]uint64
hashState [blake2b.Size]byte
reg REGISTER_FILE // the register file
mem MemoryRegisters
config Config // configuration
datasetOffset uint64
registerFile *RegisterFile
Dataset Randomx_Dataset
AES aes.AES
Cache *Randomx_Cache // randomx cache
Cache *Cache
Dataset *Dataset
program ByteCode
jitProgram VMProgramFunc
}
// MaskRegisterExponentMantissa keeps only the dynamic mantissa bits of f
// (selected by dynamicMantissaMask) and ORs in the fixed sign/exponent bits
// from mode; used to mask the E register group after each scratchpad load.
func MaskRegisterExponentMantissa(f float64, mode uint64) float64 {
	return math.Float64frombits((math.Float64bits(f) & dynamicMantissaMask) | mode)
}
type Config struct {
eMask [2]uint64
readReg [4]uint64
}
type REGISTER_FILE struct {
r RegisterLine
f [4][2]float64
e [4][2]float64
a [4][2]float64
}
type MemoryRegisters struct {
mx, ma uint64
}
const LOW = 0
const HIGH = 1
// calculate hash based on input
func (vm *VM) Run(input_hash [64]byte) {
aes.FillAes4Rx4(input_hash, vm.buffer[:])
for i := range vm.entropy {
vm.entropy[i] = binary.LittleEndian.Uint64(vm.buffer[i*8:])
// NewVM Creates and initializes a RandomX virtual machine.
// *
// * @param flags is any combination of these 5 flags (each flag can be set or not set):
// * RANDOMX_FLAG_LARGE_PAGES - allocate scratchpad memory in large pages
// * RANDOMX_FLAG_HARD_AES - virtual machine will use hardware accelerated AES
// * RANDOMX_FLAG_FULL_MEM - virtual machine will use the full dataset
// * RANDOMX_FLAG_JIT - virtual machine will use a JIT compiler
// * RANDOMX_FLAG_SECURE - when combined with RANDOMX_FLAG_JIT, the JIT pages are never
// * writable and executable at the same time (W^X policy)
// * The numeric values of the first 4 flags are ordered so that a higher value will provide
// * faster hash calculation and a lower numeric value will provide higher portability.
// * Using RANDOMX_FLAG_DEFAULT (all flags not set) works on all platforms, but is the slowest.
// * @param cache is a pointer to an initialized randomx_cache structure. Can be
// * NULL if RANDOMX_FLAG_FULL_MEM is set.
// * @param dataset is a pointer to a randomx_dataset structure. Can be NULL
// * if RANDOMX_FLAG_FULL_MEM is not set.
// *
// * @return Pointer to an initialized randomx_vm structure.
// * Returns NULL if:
// * (1) Scratchpad memory allocation fails.
// * (2) The requested initialization flags are not supported on the current platform.
// * (3) cache parameter is NULL and RANDOMX_FLAG_FULL_MEM is not set
// * (4) dataset parameter is NULL and RANDOMX_FLAG_FULL_MEM is set
// */
// NewVM creates and initializes a RandomX virtual machine (see the flag
// documentation above). cache may be nil only with RANDOMX_FLAG_FULL_MEM;
// dataset may be nil only without it.
func NewVM(flags Flags, cache *Cache, dataset *Dataset) (*VM, error) {
	if cache == nil && !flags.Has(RANDOMX_FLAG_FULL_MEM) {
		return nil, errors.New("nil cache in light mode")
	}
	if dataset == nil && flags.Has(RANDOMX_FLAG_FULL_MEM) {
		return nil, errors.New("nil dataset in full mode")
	}

	// Allocate scratchpad and register file cache-line aligned.
	// FIX: the previous code discarded these allocations (`_ = pad`) and
	// filled the struct with plain new(...) values, losing the alignment
	// guarantee — confirm against the aligned-allocator change upstream.
	pad, err := memory.Allocate[ScratchPad](cacheLineAlignedAllocator)
	if err != nil {
		return nil, err
	}
	registerFile, err := memory.Allocate[RegisterFile](cacheLineAlignedAllocator)
	if err != nil {
		return nil, err
	}

	vm := &VM{
		Cache:        cache,
		Dataset:      dataset,
		flags:        flags,
		pad:          pad,
		registerFile: registerFile,
	}

	if flags.Has(RANDOMX_FLAG_HARD_AES) {
		vm.AES = aes.NewHardAES()
	}
	// fallback to software AES when hardware AES is unavailable or not requested
	if vm.AES == nil {
		vm.AES = aes.NewSoftAES()
	}

	if flags.HasJIT() {
		vm.jitProgram, err = memory.AllocateSlice[byte](pageAllocator, int(RandomXCodeSize))
		if err != nil {
			return nil, err
		}
		if !flags.Has(RANDOMX_FLAG_SECURE) {
			// without W^X, map the JIT buffer read-write-execute once up front
			err = memory.PageReadWriteExecute(vm.jitProgram)
			if err != nil {
				vm.jitProgram.Close()
				return nil, err
			}
		}
	}

	return vm, nil
}
// run calculate hash based on input. Not thread safe.
// Warning: Underlying callers will run float64 SetRoundingMode directly
// It is the caller's responsibility to set and restore the mode to IEEE 754 roundTiesToEven between full executions
// Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread is recommended to prevent other goroutines sharing these changes
func (vm *VM) run() {
// buffer first 128 bytes are entropy below rest are program bytes
vm.AES.FillAes4Rx4(vm.hashState, vm.buffer[:])
entropy := (*[16]uint64)(unsafe.Pointer(&vm.buffer))
// do more initialization before we run
for i := range vm.entropy[:8] {
vm.reg.a[i/2][i%2] = math.Float64frombits(getSmallPositiveFloatBits(vm.entropy[i]))
reg := vm.registerFile
reg.Clear()
// initialize constant registers
for i := range entropy[:8] {
reg.A[i/2][i%2] = SmallPositiveFloatBits(entropy[i])
}
vm.mem.ma = vm.entropy[8] & CacheLineAlignMask
vm.mem.mx = vm.entropy[10]
// memory registers
var ma, mx uint32
addressRegisters := vm.entropy[12]
for i := range vm.config.readReg {
vm.config.readReg[i] = uint64(i*2) + (addressRegisters & 1)
ma = uint32(entropy[8] & CacheLineAlignMask)
mx = uint32(entropy[10])
addressRegisters := entropy[12]
var readReg [4]uint64
for i := range readReg {
readReg[i] = uint64(i*2) + (addressRegisters & 1)
addressRegisters >>= 1
}
vm.datasetOffset = (vm.entropy[13] % (DATASETEXTRAITEMS + 1)) * CacheLineSize
vm.config.eMask[LOW] = getFloatMask(vm.entropy[14])
vm.config.eMask[HIGH] = getFloatMask(vm.entropy[15])
datasetOffset := (entropy[13] % (DatasetExtraItems + 1)) * CacheLineSize
vm.Compile_TO_Bytecode()
eMask := [2]uint64{ExponentMask(entropy[14]), ExponentMask(entropy[15])}
spAddr0 := vm.mem.mx
spAddr1 := vm.mem.ma
prog := vm.buffer[len(entropy)*8:]
CompileProgramToByteCode(prog, &vm.program)
var jitProgram VMProgramFunc
if vm.jitProgram != nil {
if vm.Dataset == nil { //light mode
if vm.flags.Has(RANDOMX_FLAG_SECURE) {
err := memory.PageReadWrite(vm.jitProgram)
if err != nil {
panic(err)
}
jitProgram = vm.program.generateCode(vm.jitProgram, nil)
err = memory.PageReadExecute(vm.jitProgram)
if err != nil {
panic(err)
}
} else {
jitProgram = vm.program.generateCode(vm.jitProgram, nil)
}
} else {
// full mode and we have JIT
if vm.flags.Has(RANDOMX_FLAG_SECURE) {
err := memory.PageReadWrite(vm.jitProgram)
if err != nil {
panic(err)
}
jitProgram = vm.program.generateCode(vm.jitProgram, &readReg)
err = memory.PageReadExecute(vm.jitProgram)
if err != nil {
panic(err)
}
} else {
jitProgram = vm.program.generateCode(vm.jitProgram, &readReg)
}
vm.jitProgram.ExecuteFull(reg, vm.pad, &vm.Dataset.Memory()[datasetOffset/CacheLineSize], RANDOMX_PROGRAM_ITERATIONS, ma, mx, eMask)
return
}
}
spAddr0 := uint64(mx)
spAddr1 := uint64(ma)
var rlCache RegisterLine
for ic := 0; ic < RANDOMX_PROGRAM_ITERATIONS; ic++ {
spMix := vm.reg.r[vm.config.readReg[0]] ^ vm.reg.r[vm.config.readReg[1]]
spMix := reg.R[readReg[0]] ^ reg.R[readReg[1]]
spAddr0 ^= spMix
spAddr0 &= ScratchpadL3Mask64
spAddr1 ^= spMix >> 32
spAddr1 &= ScratchpadL3Mask64
for i := uint64(0); i < REGISTERSCOUNT; i++ {
vm.reg.r[i] ^= vm.Load64(spAddr0 + 8*i)
//TODO: optimize these loads!
for i := uint64(0); i < RegistersCount; i++ {
reg.R[i] ^= vm.pad.Load64(uint32(spAddr0 + 8*i))
}
for i := uint64(0); i < REGISTERCOUNTFLT; i++ {
vm.reg.f[i] = vm.Load32FA(spAddr1 + 8*i)
for i := uint64(0); i < RegistersCountFloat; i++ {
reg.F[i] = vm.pad.Load32FA(uint32(spAddr1 + 8*i))
}
for i := uint64(0); i < REGISTERCOUNTFLT; i++ {
vm.reg.e[i] = vm.Load32FA(spAddr1 + 8*(i+REGISTERCOUNTFLT))
for i := uint64(0); i < RegistersCountFloat; i++ {
reg.E[i] = vm.pad.Load32FA(uint32(spAddr1 + 8*(i+RegistersCountFloat)))
vm.reg.e[i][LOW] = MaskRegisterExponentMantissa(vm.reg.e[i][LOW], vm.config.eMask[LOW])
vm.reg.e[i][HIGH] = MaskRegisterExponentMantissa(vm.reg.e[i][HIGH], vm.config.eMask[HIGH])
reg.E[i][LOW] = MaskRegisterExponentMantissa(reg.E[i][LOW], eMask[LOW])
reg.E[i][HIGH] = MaskRegisterExponentMantissa(reg.E[i][HIGH], eMask[HIGH])
}
// todo: pass register file directly!
vm.InterpretByteCode()
// run the actual bytecode
if jitProgram != nil {
// light mode
jitProgram.Execute(reg, vm.pad, eMask)
} else {
vm.program.Execute(reg, vm.pad, eMask)
}
vm.mem.mx ^= vm.reg.r[vm.config.readReg[2]] ^ vm.reg.r[vm.config.readReg[3]]
vm.mem.mx &= CacheLineAlignMask
mx ^= uint32(reg.R[readReg[2]] ^ reg.R[readReg[3]])
mx &= uint32(CacheLineAlignMask)
vm.Dataset.PrefetchDataset(vm.datasetOffset + vm.mem.mx)
// execute diffuser superscalar program to get dataset 64 bytes
vm.Dataset.ReadDataset(vm.datasetOffset+vm.mem.ma, &vm.reg.r, &rlCache)
if vm.Dataset != nil {
// full mode
vm.Dataset.prefetchDataset(datasetOffset + uint64(mx))
// load output from superscalar program to get dataset 64 bytes
vm.Dataset.readDataset(datasetOffset+uint64(ma), &reg.R)
} else {
// light mode
// execute output from superscalar program to get dataset 64 bytes
vm.Cache.initDataset(&rlCache, (datasetOffset+uint64(ma))/CacheLineSize)
for i := range reg.R {
reg.R[i] ^= rlCache[i]
}
}
// swap the elements
vm.mem.mx, vm.mem.ma = vm.mem.ma, vm.mem.mx
mx, ma = ma, mx
for i := uint64(0); i < REGISTERSCOUNT; i++ {
binary.LittleEndian.PutUint64(vm.ScratchPad[spAddr1+8*i:], vm.reg.r[i])
for i := uint64(0); i < RegistersCount; i++ {
vm.pad.Store64(uint32(spAddr1+8*i), reg.R[i])
}
for i := uint64(0); i < REGISTERCOUNTFLT; i++ {
vm.reg.f[i][LOW] = math.Float64frombits(math.Float64bits(vm.reg.f[i][LOW]) ^ math.Float64bits(vm.reg.e[i][LOW]))
vm.reg.f[i][HIGH] = math.Float64frombits(math.Float64bits(vm.reg.f[i][HIGH]) ^ math.Float64bits(vm.reg.e[i][HIGH]))
for i := uint64(0); i < RegistersCountFloat; i++ {
reg.F[i][LOW] = Xor(reg.F[i][LOW], reg.E[i][LOW])
reg.F[i][HIGH] = Xor(reg.F[i][HIGH], reg.E[i][HIGH])
binary.LittleEndian.PutUint64(vm.ScratchPad[spAddr0+16*i:], math.Float64bits(vm.reg.f[i][LOW]))
binary.LittleEndian.PutUint64(vm.ScratchPad[spAddr0+16*i+8:], math.Float64bits(vm.reg.f[i][HIGH]))
vm.pad.Store64(uint32(spAddr0+16*i), math.Float64bits(reg.F[i][LOW]))
vm.pad.Store64(uint32(spAddr0+16*i+8), math.Float64bits(reg.F[i][HIGH]))
}
spAddr0 = 0
spAddr1 = 0
}
}
func (vm *VM) InitScratchpad(seed *[64]byte) {
// calculate and fill scratchpad
clear(vm.ScratchPad[:])
aes.FillAes1Rx4(seed, vm.ScratchPad[:])
func (vm *VM) initScratchpad(seed *[64]byte) {
clear(vm.pad[:])
vm.AES.FillAes1Rx4(seed, vm.pad[:])
}
func (vm *VM) CalculateHash(input []byte, output *[32]byte) {
var buf [8]byte
func (vm *VM) runLoops() {
if lockThreadDueToRoundingMode {
// Lock thread due to rounding mode flags
runtime.LockOSThread()
defer runtime.UnlockOSThread()
}
// Lock thread due to rounding mode flags
runtime.LockOSThread()
defer runtime.UnlockOSThread()
//restore rounding mode to golang expected one
defer asm.SetRoundingMode(asm.RoundingModeToNearest)
// always force a restore before startup
ResetRoundingMode(vm.registerFile)
// reset rounding mode if new hash being calculated
asm.SetRoundingMode(asm.RoundingModeToNearest)
tempHash := blake2b.Sum512(input)
vm.InitScratchpad(&tempHash)
hash512, _ := blake2b.New512(nil)
// restore rounding mode at the end
defer ResetRoundingMode(vm.registerFile)
for chain := 0; chain < RANDOMX_PROGRAM_COUNT-1; chain++ {
vm.Run(tempHash)
vm.run()
hash512.Reset()
for i := range vm.reg.r {
binary.LittleEndian.PutUint64(buf[:], vm.reg.r[i])
hash512.Write(buf[:])
}
for i := range vm.reg.f {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][LOW]))
hash512.Write(buf[:])
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][HIGH]))
hash512.Write(buf[:])
}
for i := range vm.reg.e {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][LOW]))
hash512.Write(buf[:])
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][HIGH]))
hash512.Write(buf[:])
}
for i := range vm.reg.a {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.a[i][LOW]))
hash512.Write(buf[:])
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.a[i][HIGH]))
hash512.Write(buf[:])
}
hash512.Sum(tempHash[:0])
// write R, F, E, A registers
vm.hashState = blake2b.Sum512(vm.registerFile.Memory()[:])
}
// final loop executes here
vm.Run(tempHash)
vm.run()
}
// now hash the scratch pad and place into register a
aes.HashAes1Rx4(vm.ScratchPad[:], &tempHash)
hash256, _ := blake2b.New256(nil)
hash256.Reset()
for i := range vm.reg.r {
binary.LittleEndian.PutUint64(buf[:], vm.reg.r[i])
hash256.Write(buf[:])
// SetCache Reinitializes a virtual machine with a new Cache.
// This function should be called anytime the Cache is reinitialized with a new key.
// Does nothing if called with a Cache containing the same key value as already set.
// VM must be initialized without RANDOMX_FLAG_FULL_MEM.
func (vm *VM) SetCache(cache *Cache) {
if vm.flags.Has(RANDOMX_FLAG_FULL_MEM) {
panic("unsupported")
}
vm.Cache = cache
//todo
}
for i := range vm.reg.f {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][LOW]))
hash256.Write(buf[:])
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.f[i][HIGH]))
hash256.Write(buf[:])
// SetDataset Reinitializes a virtual machine with a new Dataset.
// VM must be initialized with RANDOMX_FLAG_FULL_MEM.
func (vm *VM) SetDataset(dataset *Dataset) {
if !vm.flags.Has(RANDOMX_FLAG_FULL_MEM) {
panic("unsupported")
}
vm.Dataset = dataset
}
for i := range vm.reg.e {
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][LOW]))
hash256.Write(buf[:])
binary.LittleEndian.PutUint64(buf[:], math.Float64bits(vm.reg.e[i][HIGH]))
hash256.Write(buf[:])
// CalculateHash Calculates a RandomX hash value.
func (vm *VM) CalculateHash(input []byte, output *[RANDOMX_HASH_SIZE]byte) {
vm.hashState = blake2b.Sum512(input)
vm.initScratchpad(&vm.hashState)
vm.runLoops()
// now hash the scratch pad as it will act as register A
vm.AES.HashAes1Rx4(vm.pad[:], &vm.hashState)
regMem := vm.registerFile.Memory()
// write hash onto register A
copy(regMem[RegisterFileSize-RegistersCountFloat*2*8:], vm.hashState[:])
// write R, F, E, A registers
*output = blake2b.Sum256(regMem[:])
}
// CalculateHashFirst will begin a hash calculation.
func (vm *VM) CalculateHashFirst(input []byte) {
vm.hashState = blake2b.Sum512(input)
vm.initScratchpad(&vm.hashState)
}
// CalculateHashNext will output the hash value of the previous input and begin the calculation of the next hash.
func (vm *VM) CalculateHashNext(nextInput []byte, output *[RANDOMX_HASH_SIZE]byte) {
vm.runLoops()
// now hash the scratch pad as it will act as register A
vm.AES.HashAes1Rx4(vm.pad[:], &vm.hashState)
// Finish current hash and fill the scratchpad for the next hash at the same time
regMem := vm.registerFile.Memory()
vm.hashState = blake2b.Sum512(nextInput)
// write hash onto register A
vm.AES.HashAndFillAes1Rx4(vm.pad[:], (*[64]byte)(unsafe.Pointer(unsafe.SliceData(regMem[RegisterFileSize-RegistersCountFloat*2*8:]))), &vm.hashState)
runtime.KeepAlive(regMem)
// write R, F, E, A registers
*output = blake2b.Sum256(regMem[:])
}
// CalculateHashLast will output the hash value of the previous input.
func (vm *VM) CalculateHashLast(output *[RANDOMX_HASH_SIZE]byte) {
vm.runLoops()
// now hash the scratch pad as it will act as register A
vm.AES.HashAes1Rx4(vm.pad[:], &vm.hashState)
regMem := vm.registerFile.Memory()
// write hash onto register A
copy(regMem[RegisterFileSize-RegistersCountFloat*2*8:], vm.hashState[:])
// write R, F, E, A registers
*output = blake2b.Sum256(regMem[:])
}
// Close Releases all memory occupied by the structure.
func (vm *VM) Close() error {
memory.Free(cacheLineAlignedAllocator, vm.pad)
memory.Free(cacheLineAlignedAllocator, vm.registerFile)
if vm.jitProgram != nil {
return vm.jitProgram.Close()
}
// copy tempHash as it first copied to register and then hashed
hash256.Write(tempHash[:])
hash256.Sum(output[:0])
}
const mask22bit = (uint64(1) << 22) - 1
func getSmallPositiveFloatBits(entropy uint64) uint64 {
exponent := entropy >> 59 //0..31
mantissa := entropy & mantissaMask
exponent += exponentBias
exponent &= exponentMask
exponent = exponent << mantissaSize
return exponent | mantissa
}
func getStaticExponent(entropy uint64) uint64 {
exponent := constExponentBits
exponent |= (entropy >> (64 - staticExponentBits)) << dynamicExponentBits
exponent <<= mantissaSize
return exponent
}
func getFloatMask(entropy uint64) uint64 {
return (entropy & mask22bit) | getStaticExponent(entropy)
return nil
}

91
vm_bytecode.go Normal file
View file

@ -0,0 +1,91 @@
package randomx
// ByteCodeInstruction is one decoded RandomX instruction in its
// interpreter/JIT-friendly form: operand registers, pre-computed masks and
// the sign-extended immediate are all resolved at compile time.
type ByteCodeInstruction struct {
	Dst, Src byte
	ImmB     uint8
	Opcode   ByteCodeInstructionOp
	MemMask  uint32
	Imm      uint64
	EMask    uint64
}

// jumpTarget reconstructs the signed 16-bit branch target that is packed
// into ImmB (high byte) and Src (low byte).
func (i ByteCodeInstruction) jumpTarget() int {
	packed := uint16(i.Src) | (uint16(i.ImmB) << 8)
	return int(int16(packed))
}

// getScratchpadAddress returns the masked scratchpad offset for a
// register-relative memory operand (ptr + Imm, truncated and masked).
func (i ByteCodeInstruction) getScratchpadAddress(ptr uint64) uint32 {
	addr := ptr + i.Imm
	return uint32(addr) & i.MemMask
}

// getScratchpadZeroAddress returns the masked scratchpad offset for an
// absolute (zero-base) memory operand.
func (i ByteCodeInstruction) getScratchpadZeroAddress() uint32 {
	return i.MemMask & uint32(i.Imm)
}
// ByteCode is a fully decoded RandomX program: one ByteCodeInstruction per
// original program instruction, in program order.
type ByteCode [RANDOMX_PROGRAM_SIZE]ByteCodeInstruction

// ByteCodeInstructionOp identifies the concrete operation of a decoded
// instruction. The *_I variants carry an immediate operand and the *_MZ
// variants address the scratchpad with a constant (zero-base) offset.
// NOTE: the iota ordering below is relied upon only as a closed enum; do not
// reorder without checking every switch over this type.
type ByteCodeInstructionOp int

const (
	VM_NOP = ByteCodeInstructionOp(iota)
	// integer arithmetic
	VM_IADD_RS
	VM_IADD_M
	VM_IADD_MZ
	VM_ISUB_R
	VM_ISUB_I
	VM_ISUB_M
	VM_ISUB_MZ
	VM_IMUL_R
	VM_IMUL_I
	VM_IMUL_M
	VM_IMUL_MZ
	// high-half multiplies (unsigned / signed)
	VM_IMULH_R
	VM_IMULH_M
	VM_IMULH_MZ
	VM_ISMULH_R
	VM_ISMULH_M
	VM_ISMULH_MZ
	VM_INEG_R
	// bitwise / rotate
	VM_IXOR_R
	VM_IXOR_I
	VM_IXOR_M
	VM_IXOR_MZ
	VM_IROR_R
	VM_IROR_I
	VM_IROL_R
	VM_IROL_I
	VM_ISWAP_R
	// floating point (F group additive, E group multiplicative)
	VM_FSWAP_RF
	VM_FSWAP_RE
	VM_FADD_R
	VM_FADD_M
	VM_FSUB_R
	VM_FSUB_M
	VM_FSCAL_R
	VM_FMUL_R
	VM_FDIV_M
	VM_FSQRT_R
	// control / store
	VM_CFROUND
	VM_CBRANCH
	VM_ISTORE
)

466
vm_bytecode_jit_amd64.go Normal file
View file

@ -0,0 +1,466 @@
//go:build unix && amd64 && !disable_jit && !purego
package randomx
import (
"encoding/binary"
"math/bits"
"unsafe"
)
// vm_run executes a JIT-compiled program loop body (light mode): registers are
// loaded from rf, the generated code at jmp is called, and results are written
// back to rf. Implemented in vm_bytecode_jit_amd64.s.
//
//go:noescape
func vm_run(rf *RegisterFile, pad *ScratchPad, eMask [2]uint64, jmp uintptr)

// vm_run_full executes all program iterations in full (dataset) mode:
// memoryRegisters packs ma in the high and mx in the low 32 bits.
// Implemented in vm_bytecode_jit_amd64.s.
//
//go:noescape
func vm_run_full(rf *RegisterFile, pad *ScratchPad, dataset *RegisterLine, iterations, memoryRegisters uint64, eMask [2]uint64, jmp uintptr)
/*
#define RANDOMX_DATASET_BASE_SIZE 2147483648
#define RANDOMX_DATASET_BASE_MASK (RANDOMX_DATASET_BASE_SIZE-64)
mov ecx, ebp ;# ecx = ma
;#and ecx, RANDOMX_DATASET_BASE_MASK
and ecx, 2147483584
xor r8, qword ptr [rdi+rcx]
ror rbp, 32 ;# swap "ma" and "mx"
xor rbp, rax ;# modify "mx"
mov edx, ebp ;# edx = mx
;#and edx, RANDOMX_DATASET_BASE_MASK
and edx, 2147483584
prefetchnta byte ptr [rdi+rdx]
xor r9, qword ptr [rdi+rcx+8]
xor r10, qword ptr [rdi+rcx+16]
xor r11, qword ptr [rdi+rcx+24]
xor r12, qword ptr [rdi+rcx+32]
xor r13, qword ptr [rdi+rcx+40]
xor r14, qword ptr [rdi+rcx+48]
xor r15, qword ptr [rdi+rcx+56]
*/
var programReadDataset = []byte{0x89, 0xE9, 0x81, 0xE1, 0xC0, 0xFF, 0xFF, 0x7F, 0x4C, 0x33, 0x04, 0x0F, 0x48, 0xC1, 0xCD, 0x20, 0x48, 0x31, 0xC5, 0x89, 0xEA, 0x81, 0xE2, 0xC0, 0xFF, 0xFF, 0x7F, 0x0F, 0x18, 0x04, 0x17, 0x4C, 0x33, 0x4C, 0x0F, 0x08, 0x4C, 0x33, 0x54, 0x0F, 0x10, 0x4C, 0x33, 0x5C, 0x0F, 0x18, 0x4C, 0x33, 0x64, 0x0F, 0x20, 0x4C, 0x33, 0x6C, 0x0F, 0x28, 0x4C, 0x33, 0x74, 0x0F, 0x30, 0x4C, 0x33, 0x7C, 0x0F, 0x38}
/*
lea rcx, [rsi+rax]
push rcx
xor r8, qword ptr [rcx+0]
xor r9, qword ptr [rcx+8]
xor r10, qword ptr [rcx+16]
xor r11, qword ptr [rcx+24]
xor r12, qword ptr [rcx+32]
xor r13, qword ptr [rcx+40]
xor r14, qword ptr [rcx+48]
xor r15, qword ptr [rcx+56]
lea rcx, [rsi+rdx]
push rcx
cvtdq2pd xmm0, qword ptr [rcx+0]
cvtdq2pd xmm1, qword ptr [rcx+8]
cvtdq2pd xmm2, qword ptr [rcx+16]
cvtdq2pd xmm3, qword ptr [rcx+24]
cvtdq2pd xmm4, qword ptr [rcx+32]
cvtdq2pd xmm5, qword ptr [rcx+40]
cvtdq2pd xmm6, qword ptr [rcx+48]
cvtdq2pd xmm7, qword ptr [rcx+56]
andps xmm4, xmm13
andps xmm5, xmm13
andps xmm6, xmm13
andps xmm7, xmm13
orps xmm4, xmm14
orps xmm5, xmm14
orps xmm6, xmm14
orps xmm7, xmm14
*/
var programLoopLoad = []byte{0x48, 0x8D, 0x0C, 0x06, 0x51, 0x4C, 0x33, 0x01, 0x4C, 0x33, 0x49, 0x08, 0x4C, 0x33, 0x51, 0x10, 0x4C, 0x33, 0x59, 0x18, 0x4C, 0x33, 0x61, 0x20, 0x4C, 0x33, 0x69, 0x28, 0x4C, 0x33, 0x71, 0x30, 0x4C, 0x33, 0x79, 0x38, 0x48, 0x8D, 0x0C, 0x16, 0x51, 0xF3, 0x0F, 0xE6, 0x01, 0xF3, 0x0F, 0xE6, 0x49, 0x08, 0xF3, 0x0F, 0xE6, 0x51, 0x10, 0xF3, 0x0F, 0xE6, 0x59, 0x18, 0xF3, 0x0F, 0xE6, 0x61, 0x20, 0xF3, 0x0F, 0xE6, 0x69, 0x28, 0xF3, 0x0F, 0xE6, 0x71, 0x30, 0xF3, 0x0F, 0xE6, 0x79, 0x38, 0x41, 0x0F, 0x54, 0xE5, 0x41, 0x0F, 0x54, 0xED, 0x41, 0x0F, 0x54, 0xF5, 0x41, 0x0F, 0x54, 0xFD, 0x41, 0x0F, 0x56, 0xE6, 0x41, 0x0F, 0x56, 0xEE, 0x41, 0x0F, 0x56, 0xF6, 0x41, 0x0F, 0x56, 0xFE}
/*
pop rcx
mov qword ptr [rcx+0], r8
mov qword ptr [rcx+8], r9
mov qword ptr [rcx+16], r10
mov qword ptr [rcx+24], r11
mov qword ptr [rcx+32], r12
mov qword ptr [rcx+40], r13
mov qword ptr [rcx+48], r14
mov qword ptr [rcx+56], r15
pop rcx
xorpd xmm0, xmm4
xorpd xmm1, xmm5
xorpd xmm2, xmm6
xorpd xmm3, xmm7
;# aligned mode
movapd xmmword ptr [rcx+0], xmm0
movapd xmmword ptr [rcx+16], xmm1
movapd xmmword ptr [rcx+32], xmm2
movapd xmmword ptr [rcx+48], xmm3
*/
var programLoopStoreAligned = []byte{0x59, 0x4C, 0x89, 0x01, 0x4C, 0x89, 0x49, 0x08, 0x4C, 0x89, 0x51, 0x10, 0x4C, 0x89, 0x59, 0x18, 0x4C, 0x89, 0x61, 0x20, 0x4C, 0x89, 0x69, 0x28, 0x4C, 0x89, 0x71, 0x30, 0x4C, 0x89, 0x79, 0x38, 0x59, 0x66, 0x0F, 0x57, 0xC4, 0x66, 0x0F, 0x57, 0xCD, 0x66, 0x0F, 0x57, 0xD6, 0x66, 0x0F, 0x57, 0xDF, 0x66, 0x0F, 0x29, 0x01, 0x66, 0x0F, 0x29, 0x49, 0x10, 0x66, 0x0F, 0x29, 0x51, 0x20, 0x66, 0x0F, 0x29, 0x59, 0x30}
/*
#define RANDOMX_SCRATCHPAD_L3 2097152
#define RANDOMX_SCRATCHPAD_MASK (RANDOMX_SCRATCHPAD_L3-64)
mov rdx, rax
;#and eax, RANDOMX_SCRATCHPAD_MASK
and eax, 2097088
ror rdx, 32
;#and edx, RANDOMX_SCRATCHPAD_MASK
and edx, 2097088
*/
var programCalculateSpAddrs = []byte{0x48, 0x89, 0xC2, 0x25, 0xC0, 0xFF, 0x1F, 0x00, 0x48, 0xC1, 0xCA, 0x20, 0x81, 0xE2, 0xC0, 0xFF, 0x1F, 0x00}
// ExecuteFull runs the JIT-compiled program in full (dataset) mode for the
// given number of iterations, reading dataset cache lines and updating the
// register file and scratchpad in place. Panics if no program was generated.
func (f VMProgramFunc) ExecuteFull(rf *RegisterFile, pad *ScratchPad, dataset *RegisterLine, iterations uint64, ma, mx uint32, eMask [2]uint64) {
	if f == nil {
		panic("program is nil")
	}
	// ma goes in the high 32 bits, mx in the low, as vm_run_full expects.
	memoryRegisters := (uint64(ma) << 32) | uint64(mx)
	entry := uintptr(unsafe.Pointer(unsafe.SliceData(f)))
	vm_run_full(rf, pad, dataset, iterations, memoryRegisters, eMask, entry)
}
// Execute runs a single JIT-compiled loop body (light mode) against the given
// register file and scratchpad. Panics if no program was generated.
func (f VMProgramFunc) Execute(rf *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
	if f == nil {
		panic("program is nil")
	}
	entry := uintptr(unsafe.Pointer(unsafe.SliceData(f)))
	vm_run(rf, pad, eMask, entry)
}
// generateCode translates the decoded program into amd64 machine code,
// appending into (and reusing the storage of) the caller-supplied buffer.
// When readReg is non-nil the program is compiled in full mode: a
// per-iteration prologue (scratchpad address calculation + register loads),
// a dataset-read sequence and a loop epilogue are emitted around the
// instruction bodies, and the generated code loops RBX times.
// When readReg is nil only the instruction bodies plus a final RET are
// emitted (light mode; the Go side drives the loop).
// Returns the filled buffer.
func (c *ByteCode) generateCode(program []byte, readReg *[4]uint64) []byte {
	program = program[:0]
	isFullMode := readReg != nil
	if isFullMode {
		program = append(program, programCalculateSpAddrs...)
		// prologue
		program = append(program, programLoopLoad...)
	}
	// byte offset of each emitted instruction, needed to resolve CBRANCH
	// targets (branches only ever jump backwards to an already-emitted offset)
	var instructionOffsets [RANDOMX_PROGRAM_SIZE]int32
	for ix := range c {
		instructionOffsets[ix] = int32(len(program))
		instr := &c[ix]
		switch instr.Opcode {
		case VM_IADD_RS:
			// lea dst, [dst + src<<shift (+ disp32)]
			program = append(program, REX_LEA...)
			if instr.Dst == RegisterNeedsDisplacement {
				program = append(program, 0xac)
			} else {
				program = append(program, 0x04+8*instr.Dst)
			}
			program = append(program, genSIB(int(instr.ImmB), int(instr.Src), int(instr.Dst)))
			if instr.Dst == RegisterNeedsDisplacement {
				program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
			}
		case VM_IADD_M:
			program = genAddressReg(program, instr, true)
			program = append(program, REX_ADD_RM...)
			program = append(program, 0x04+8*instr.Dst)
			program = append(program, 0x06)
		case VM_IADD_MZ:
			// constant scratchpad offset: disp32 addressing off RSI
			program = append(program, REX_ADD_RM...)
			program = append(program, 0x86+8*instr.Dst)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
		case VM_ISUB_R:
			program = append(program, REX_SUB_RR...)
			program = append(program, 0xc0+8*instr.Dst+instr.Src)
		case VM_ISUB_I:
			program = append(program, REX_81...)
			program = append(program, 0xe8+instr.Dst)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
		case VM_ISUB_M:
			program = genAddressReg(program, instr, true)
			program = append(program, REX_SUB_RM...)
			program = append(program, 0x04+8*instr.Dst)
			program = append(program, 0x06)
		case VM_ISUB_MZ:
			program = append(program, REX_SUB_RM...)
			program = append(program, 0x86+8*instr.Dst)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
		case VM_IMUL_R:
			program = append(program, REX_IMUL_RR...)
			program = append(program, 0xc0+8*instr.Dst+instr.Src)
		case VM_IMUL_I:
			// also handles imul_rcp, with 64-bit special
			if bits.Len64(instr.Imm) > 32 {
				// immediate does not fit in 32 bits: load into RAX first
				program = append(program, MOV_RAX_I...)
				program = binary.LittleEndian.AppendUint64(program, instr.Imm)
				program = append(program, REX_IMUL_RM...)
				program = append(program, 0xc0+8*instr.Dst)
			} else {
				program = append(program, REX_IMUL_RRI...)
				program = append(program, 0xc0+9*instr.Dst)
				program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
			}
		case VM_IMUL_M:
			program = genAddressReg(program, instr, true)
			program = append(program, REX_IMUL_RM...)
			program = append(program, 0x04+8*instr.Dst)
			program = append(program, 0x06)
		case VM_IMUL_MZ:
			program = append(program, REX_IMUL_RM...)
			program = append(program, 0x86+8*instr.Dst)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
		case VM_IMULH_R:
			// unsigned high-half multiply via RAX/RDX: mov rax,dst; mul src; mov dst,rdx
			program = append(program, REX_MOV_RR64...)
			program = append(program, 0xc0+instr.Dst)
			program = append(program, REX_MUL_R...)
			program = append(program, 0xe0+instr.Src)
			program = append(program, REX_MOV_R64R...)
			program = append(program, 0xc2+8*instr.Dst)
		case VM_IMULH_M:
			program = genAddressReg(program, instr, false)
			program = append(program, REX_MOV_RR64...)
			program = append(program, 0xc0+instr.Dst)
			program = append(program, REX_MUL_MEM...)
			program = append(program, REX_MOV_R64R...)
			program = append(program, 0xc2+8*instr.Dst)
		case VM_IMULH_MZ:
			program = append(program, REX_MOV_RR64...)
			program = append(program, 0xc0+instr.Dst)
			program = append(program, REX_MUL_M...)
			program = append(program, 0xa6)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
			program = append(program, REX_MOV_R64R...)
			program = append(program, 0xc2+8*instr.Dst)
		case VM_ISMULH_R:
			// signed high-half multiply (imul)
			program = append(program, REX_MOV_RR64...)
			program = append(program, 0xc0+instr.Dst)
			program = append(program, REX_MUL_R...)
			program = append(program, 0xe8+instr.Src)
			program = append(program, REX_MOV_R64R...)
			program = append(program, 0xc2+8*instr.Dst)
		case VM_ISMULH_M:
			program = genAddressReg(program, instr, false)
			program = append(program, REX_MOV_RR64...)
			program = append(program, 0xc0+instr.Dst)
			program = append(program, REX_IMUL_MEM...)
			program = append(program, REX_MOV_R64R...)
			program = append(program, 0xc2+8*instr.Dst)
		case VM_ISMULH_MZ:
			program = append(program, REX_MOV_RR64...)
			program = append(program, 0xc0+instr.Dst)
			program = append(program, REX_MUL_M...)
			program = append(program, 0xae)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
			program = append(program, REX_MOV_R64R...)
			program = append(program, 0xc2+8*instr.Dst)
		case VM_INEG_R:
			program = append(program, REX_NEG...)
			program = append(program, 0xd8+instr.Dst)
		case VM_IXOR_R:
			program = append(program, REX_XOR_RR...)
			program = append(program, 0xc0+8*instr.Dst+instr.Src)
		case VM_IXOR_I:
			program = append(program, REX_XOR_RI...)
			program = append(program, 0xf0+instr.Dst)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
		case VM_IXOR_M:
			program = genAddressReg(program, instr, true)
			program = append(program, REX_XOR_RM...)
			program = append(program, 0x04+8*instr.Dst)
			program = append(program, 0x06)
		case VM_IXOR_MZ:
			program = append(program, REX_XOR_RM...)
			program = append(program, 0x86+8*instr.Dst)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
		case VM_IROR_R:
			// rotate count must be in CL: mov ecx,src; ror dst,cl
			program = append(program, REX_MOV_RR...)
			program = append(program, 0xc8+instr.Src)
			program = append(program, REX_ROT_CL...)
			program = append(program, 0xc8+instr.Dst)
		case VM_IROR_I:
			program = append(program, REX_ROT_I8...)
			program = append(program, 0xc8+instr.Dst)
			program = append(program, byte(instr.Imm&63))
		case VM_IROL_R:
			program = append(program, REX_MOV_RR...)
			program = append(program, 0xc8+instr.Src)
			program = append(program, REX_ROT_CL...)
			program = append(program, 0xc0+instr.Dst)
		case VM_IROL_I:
			program = append(program, REX_ROT_I8...)
			program = append(program, 0xc0+instr.Dst)
			program = append(program, byte(instr.Imm&63))
		case VM_ISWAP_R:
			program = append(program, REX_XCHG...)
			program = append(program, 0xc0+instr.Src+8*instr.Dst)
		case VM_FSWAP_RF:
			// swap high/low lanes of an F register via shufpd imm 1
			program = append(program, SHUFPD...)
			program = append(program, 0xc0+9*instr.Dst)
			program = append(program, 1)
		case VM_FSWAP_RE:
			// E registers live in the xmm slots after the F group
			program = append(program, SHUFPD...)
			program = append(program, 0xc0+9*(instr.Dst+RegistersCountFloat))
			program = append(program, 1)
		case VM_FADD_R:
			program = append(program, REX_ADDPD...)
			program = append(program, 0xc0+instr.Src+8*instr.Dst)
		case VM_FADD_M:
			// load 2x int32 from scratchpad, convert to double, then add
			program = genAddressReg(program, instr, true)
			program = append(program, REX_CVTDQ2PD_XMM12...)
			program = append(program, REX_ADDPD...)
			program = append(program, 0xc4+8*instr.Dst)
		case VM_FSUB_R:
			program = append(program, REX_SUBPD...)
			program = append(program, 0xc0+instr.Src+8*instr.Dst)
		case VM_FSUB_M:
			program = genAddressReg(program, instr, true)
			program = append(program, REX_CVTDQ2PD_XMM12...)
			program = append(program, REX_SUBPD...)
			program = append(program, 0xc4+8*instr.Dst)
		case VM_FSCAL_R:
			// xor with the scale mask held in xmm15
			program = append(program, REX_XORPS...)
			program = append(program, 0xc7+8*instr.Dst)
		case VM_FMUL_R:
			program = append(program, REX_MULPD...)
			program = append(program, 0xe0+instr.Src+8*instr.Dst)
		case VM_FDIV_M:
			// convert, apply exponent/mantissa mask (xmm13/xmm14), then divide
			program = genAddressReg(program, instr, true)
			program = append(program, REX_CVTDQ2PD_XMM12...)
			program = append(program, REX_ANDPS_XMM12...)
			program = append(program, REX_DIVPD...)
			program = append(program, 0xe4+8*instr.Dst)
		case VM_FSQRT_R:
			program = append(program, SQRTPD...)
			program = append(program, 0xe4+9*instr.Dst)
		case VM_CFROUND:
			// rotate src so the two mode bits land in position, then ldmxcsr
			program = append(program, REX_MOV_RR64...)
			program = append(program, 0xc0+instr.Src)
			rotate := byte((13 - instr.Imm) & 63)
			if rotate != 0 {
				program = append(program, ROL_RAX...)
				program = append(program, rotate)
			}
			program = append(program, AND_OR_MOV_LDMXCSR...)
		case VM_CBRANCH:
			reg := instr.Dst
			target := instr.jumpTarget() + 1
			// backward displacement from the end of the 16-byte add/test pair
			jmpOffset := instructionOffsets[target] - (int32(len(program)) + 16)
			if BranchesWithin32B {
				// Intel JCC erratum mitigation: keep the fused test+jz from
				// crossing a 32-byte boundary
				branchBegin := uint32(int32(len(program)) + 7)
				branchEnd := branchBegin
				if jmpOffset >= -128 {
					branchEnd += 9
				} else {
					branchEnd += 13
				}
				// If the jump crosses or touches 32-byte boundary, align it
				if (branchBegin ^ branchEnd) >= 32 {
					alignmentSize := 32 - (branchBegin & 31)
					// NOTE(review): this zeroes alignmentSize, so
					// JMP_ALIGN_PREFIX[0] (no padding) is always emitted and
					// the alignment above is effectively disabled — confirm
					// against upstream jit_compiler_x86, which pads by
					// alignmentSize here.
					alignmentSize -= alignmentSize
					program = append(program, JMP_ALIGN_PREFIX[alignmentSize]...)
				}
			}
			program = append(program, REX_ADD_I...)
			program = append(program, 0xc0+reg)
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
			program = append(program, REX_TEST...)
			program = append(program, 0xc0+reg)
			program = binary.LittleEndian.AppendUint32(program, instr.MemMask)
			if jmpOffset >= -128 {
				// short jump (rel8)
				program = append(program, JZ_SHORT)
				program = append(program, byte(jmpOffset))
			} else {
				// near jump (rel32); -4 accounts for the wider encoding
				program = append(program, JZ...)
				program = binary.LittleEndian.AppendUint32(program, uint32(jmpOffset-4))
			}
		case VM_ISTORE:
			//genAddressRegDst
			program = append(program, LEA_32...)
			program = append(program, 0x80+instr.Dst)
			if instr.Dst == RegisterNeedsSib {
				program = append(program, 0x24)
			}
			program = binary.LittleEndian.AppendUint32(program, uint32(instr.Imm))
			program = append(program, AND_EAX_I)
			program = binary.LittleEndian.AppendUint32(program, instr.MemMask)
			program = append(program, REX_MOV_MR...)
			program = append(program, 0x04+8*instr.Src)
			program = append(program, 0x06)
		case VM_NOP:
			program = append(program, NOP1...)
		}
	}
	if isFullMode {
		// end of prologue: mix readReg[2]^readReg[3] into eax for mx
		program = append(program, REX_MOV_RR...)
		program = append(program, 0xc0+byte(readReg[2]))
		program = append(program, REX_XOR_EAX...)
		program = append(program, 0xc0+byte(readReg[3]))
		// read dataset
		program = append(program, programReadDataset...)
		// epilogue: mix readReg[0]^readReg[1] into rax for spMix
		program = append(program, REX_MOV_RR64...)
		program = append(program, 0xc0+byte(readReg[0]))
		program = append(program, REX_XOR_RAX_R64...)
		program = append(program, 0xc0+byte(readReg[1]))
		//todo: prefetch scratchpad
		program = append(program, programLoopStoreAligned...)
		if BranchesWithin32B {
			// align the loop back-edge (sub ebx / jnz) away from a 32-byte
			// boundary (JCC erratum)
			branchBegin := uint32(len(program))
			branchEnd := branchBegin + 9
			// If the jump crosses or touches 32-byte boundary, align it
			if (branchBegin ^ branchEnd) >= 32 {
				alignmentSize := 32 - (branchBegin & 31)
				if alignmentSize > 8 {
					program = append(program, NOPX[alignmentSize-9][:alignmentSize-8]...)
					alignmentSize = 8
				}
				program = append(program, NOPX[alignmentSize-1][:alignmentSize]...)
			}
		}
		// decrement iteration counter and loop back to offset 0
		program = append(program, SUB_EBX...)
		program = append(program, JNZ...)
		program = binary.LittleEndian.AppendUint32(program, uint32(-len(program)-4))
		//exit otherwise
	}
	program = append(program, RET)
	return program
}

204
vm_bytecode_jit_amd64.s Normal file
View file

@ -0,0 +1,204 @@
//go:build unix && amd64 && !disable_jit && !purego
#include "textflag.h"
TEXT ·vm_run(SB),$8-40
// move register file to registers
MOVQ rf+0(FP), AX
PREFETCHNTA 0(AX)
// r0-r7
MOVQ (0*8)(AX), R8
MOVQ (1*8)(AX), R9
MOVQ (2*8)(AX), R10
MOVQ (3*8)(AX), R11
MOVQ (4*8)(AX), R12
MOVQ (5*8)(AX), R13
MOVQ (6*8)(AX), R14
MOVQ (7*8)(AX), R15
// f0-f3
VMOVAPD (8*8)(AX), X0
VMOVAPD (10*8)(AX), X1
VMOVAPD (12*8)(AX), X2
VMOVAPD (14*8)(AX), X3
// e0-e3
VMOVAPD (16*8)(AX), X4
VMOVAPD (18*8)(AX), X5
VMOVAPD (20*8)(AX), X6
VMOVAPD (22*8)(AX), X7
// a0-a3
VMOVAPD (24*8)(AX), X8
VMOVAPD (26*8)(AX), X9
VMOVAPD (28*8)(AX), X10
VMOVAPD (30*8)(AX), X11
// mantissa mask
//VMOVQ $0x00ffffffffffffff, $0x00ffffffffffffff, X13
MOVQ $0x00ffffffffffffff, AX
VMOVQ AX, X13
VPBROADCASTQ X13, X13
// eMask
VMOVDQU64 eMask+16(FP), X14
// scale mask
//VMOVQ $0x80F0000000000000, $0x80F0000000000000, X15
MOVQ $0x80F0000000000000, AX
VMOVQ AX, X15
VPBROADCASTQ X15, X15
// scratchpad pointer
MOVQ pad+8(FP), SI
// JIT location
MOVQ jmp+32(FP), AX
// jump to JIT code
CALL AX
// move register file back to registers
MOVQ rf+0(FP), AX
// prefetchw BYTE PTR [rax]
// PREFETCHW 0(AX)
BYTE $0x0F
BYTE $0x0D
BYTE $0x08
// r0-r7
MOVQ R8, (0*8)(AX)
MOVQ R9, (1*8)(AX)
MOVQ R10, (2*8)(AX)
MOVQ R11, (3*8)(AX)
MOVQ R12, (4*8)(AX)
MOVQ R13, (5*8)(AX)
MOVQ R14, (6*8)(AX)
MOVQ R15, (7*8)(AX)
// f0-f3
VMOVAPD X0, (8*8)(AX)
VMOVAPD X1, (10*8)(AX)
VMOVAPD X2, (12*8)(AX)
VMOVAPD X3, (14*8)(AX)
// e0-e3
VMOVAPD X4, (16*8)(AX)
VMOVAPD X5, (18*8)(AX)
VMOVAPD X6, (20*8)(AX)
VMOVAPD X7, (22*8)(AX)
// a0-a3 are constant, no need to move
RET
#define RANDOMX_SCRATCHPAD_L3 2097152
#define RANDOMX_SCRATCHPAD_MASK (RANDOMX_SCRATCHPAD_L3-64)
TEXT ·vm_run_full(SB),$32-64
// move register file to registers
MOVQ rf+0(FP), AX
PREFETCHNTA 0(AX)
// r0-r7
MOVQ (0*8)(AX), R8
MOVQ (1*8)(AX), R9
MOVQ (2*8)(AX), R10
MOVQ (3*8)(AX), R11
MOVQ (4*8)(AX), R12
MOVQ (5*8)(AX), R13
MOVQ (6*8)(AX), R14
MOVQ (7*8)(AX), R15
// f0-f3
VMOVAPD (8*8)(AX), X0
VMOVAPD (10*8)(AX), X1
VMOVAPD (12*8)(AX), X2
VMOVAPD (14*8)(AX), X3
// e0-e3
VMOVAPD (16*8)(AX), X4
VMOVAPD (18*8)(AX), X5
VMOVAPD (20*8)(AX), X6
VMOVAPD (22*8)(AX), X7
// load constants a0-a3
VMOVAPD (24*8)(AX), X8
VMOVAPD (26*8)(AX), X9
VMOVAPD (28*8)(AX), X10
VMOVAPD (30*8)(AX), X11
//TODO: rest of init
// mantissa mask
//VMOVQ $0x00ffffffffffffff, $0x00ffffffffffffff, X13
MOVQ $0x00ffffffffffffff, AX
VMOVQ AX, X13
VPBROADCASTQ X13, X13
// eMask
VMOVDQU64 eMask+40(FP), X14
// scale mask
//VMOVQ $0x80F0000000000000, $0x80F0000000000000, X15
MOVQ $0x80F0000000000000, AX
VMOVQ AX, X15
VPBROADCASTQ X15, X15
// scratchpad pointer on rsi
MOVQ pad+8(FP), SI
// dataset pointer on rdi
MOVQ dataset+16(FP), DI
// iterations on rbx
MOVQ iterations+24(FP), BX
// ma and mx on rbp TODO: change this
MOVQ memoryRegisters+32(FP), BP
// do ma/mx calcs
MOVQ BP, AX
RORQ $32, BP
//AX = spAddr0
//DX = spAddr1
// JIT location
MOVQ jmp+56(FP), CX
// jump to JIT code
// this handles readReg[0-3] and dataset reading, load, stores
CALL CX
// move register file back to registers
MOVQ rf+0(FP), AX
// prefetchw BYTE PTR [rax]
// PREFETCHW 0(AX)
BYTE $0x0F
BYTE $0x0D
BYTE $0x08
// r0-r7
MOVQ R8, (0*8)(AX)
MOVQ R9, (1*8)(AX)
MOVQ R10, (2*8)(AX)
MOVQ R11, (3*8)(AX)
MOVQ R12, (4*8)(AX)
MOVQ R13, (5*8)(AX)
MOVQ R14, (6*8)(AX)
MOVQ R15, (7*8)(AX)
// f0-f3
VMOVAPD X0, (8*8)(AX)
VMOVAPD X1, (10*8)(AX)
VMOVAPD X2, (12*8)(AX)
VMOVAPD X3, (14*8)(AX)
// e0-e3
VMOVAPD X4, (16*8)(AX)
VMOVAPD X5, (18*8)(AX)
VMOVAPD X6, (20*8)(AX)
VMOVAPD X7, (22*8)(AX)
// a0-a3 are constant, no need to move
RET

View file

@ -0,0 +1,14 @@
//go:build !unix || !amd64 || disable_jit || purego
package randomx
// generateCode is a stub for platforms without a JIT backend; the nil return
// signals callers that no native program is available.
func (c *ByteCode) generateCode(program []byte, readReg *[4]uint64) []byte {
	return nil
}
// Execute is a no-op stub for platforms without a JIT backend; generateCode
// returns nil there, so this is never reached with a real program.
func (f VMProgramFunc) Execute(rf *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
}
// ExecuteFull is a no-op stub for platforms without a JIT backend; generateCode
// returns nil there, so this is never reached with a real program.
func (f VMProgramFunc) ExecuteFull(rf *RegisterFile, pad *ScratchPad, dataset *RegisterLine, iterations uint64, ma, mx uint32, eMask [2]uint64) {
}

137
vm_bytecode_native.go Normal file
View file

@ -0,0 +1,137 @@
//go:build (arm64 || arm.6 || arm.7 || amd64 || 386) && !purego
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v3/internal/asm"
"math"
"math/bits"
)
// Execute Runs a RandomX program with the given register file and scratchpad
// Warning: This will call asm.SetRoundingMode directly
// It is the caller's responsibility to set and restore the mode to softfloat64.RoundingModeToNearest between full executions
// Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread is recommended to prevent other goroutines sharing these changes
func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
	// straightforward interpreter: one switch dispatch per instruction;
	// pc is mutated by VM_CBRANCH to implement backward jumps
	for pc := 0; pc < len(c); pc++ {
		i := &c[pc]
		switch i.Opcode {
		case VM_NOP: // we do nothing
		case VM_IADD_RS:
			f.R[i.Dst] += (f.R[i.Src] << i.ImmB) + i.Imm
		case VM_IADD_M:
			f.R[i.Dst] += pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_IADD_MZ:
			f.R[i.Dst] += pad.Load64(uint32(i.Imm))
		case VM_ISUB_R:
			f.R[i.Dst] -= f.R[i.Src]
		case VM_ISUB_I:
			f.R[i.Dst] -= i.Imm
		case VM_ISUB_M:
			f.R[i.Dst] -= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_ISUB_MZ:
			f.R[i.Dst] -= pad.Load64(uint32(i.Imm))
		case VM_IMUL_R:
			f.R[i.Dst] *= f.R[i.Src]
		case VM_IMUL_I:
			// also handles imul_rcp
			f.R[i.Dst] *= i.Imm
		case VM_IMUL_M:
			f.R[i.Dst] *= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_IMUL_MZ:
			f.R[i.Dst] *= pad.Load64(uint32(i.Imm))
		case VM_IMULH_R:
			// keep only the high 64 bits of the 128-bit product
			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], f.R[i.Src])
		case VM_IMULH_M:
			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(i.getScratchpadAddress(f.R[i.Src])))
		case VM_IMULH_MZ:
			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(uint32(i.Imm)))
		case VM_ISMULH_R:
			// signed high-half multiply
			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(f.R[i.Src]))
		case VM_ISMULH_M:
			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(i.getScratchpadAddress(f.R[i.Src]))))
		case VM_ISMULH_MZ:
			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(uint32(i.Imm))))
		case VM_INEG_R:
			f.R[i.Dst] = -f.R[i.Dst]
		case VM_IXOR_R:
			f.R[i.Dst] ^= f.R[i.Src]
		case VM_IXOR_I:
			f.R[i.Dst] ^= i.Imm
		case VM_IXOR_M:
			f.R[i.Dst] ^= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_IXOR_MZ:
			f.R[i.Dst] ^= pad.Load64(uint32(i.Imm))
		case VM_IROR_R:
			// rotate right = rotate left by negated count
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(f.R[i.Src]&63))
		case VM_IROR_I:
			//todo: can merge into VM_IROL_I
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(i.Imm&63))
		case VM_IROL_R:
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(f.R[i.Src]&63))
		case VM_IROL_I:
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(i.Imm&63))
		case VM_ISWAP_R:
			f.R[i.Dst], f.R[i.Src] = f.R[i.Src], f.R[i.Dst]
		case VM_FSWAP_RF:
			f.F[i.Dst][HIGH], f.F[i.Dst][LOW] = f.F[i.Dst][LOW], f.F[i.Dst][HIGH]
		case VM_FSWAP_RE:
			f.E[i.Dst][HIGH], f.E[i.Dst][LOW] = f.E[i.Dst][LOW], f.E[i.Dst][HIGH]
		case VM_FADD_R:
			f.F[i.Dst][LOW] += f.A[i.Src][LOW]
			f.F[i.Dst][HIGH] += f.A[i.Src][HIGH]
		case VM_FADD_M:
			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
			f.F[i.Dst][LOW] += lo
			f.F[i.Dst][HIGH] += hi
		case VM_FSUB_R:
			f.F[i.Dst][LOW] -= f.A[i.Src][LOW]
			f.F[i.Dst][HIGH] -= f.A[i.Src][HIGH]
		case VM_FSUB_M:
			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
			f.F[i.Dst][LOW] -= lo
			f.F[i.Dst][HIGH] -= hi
		case VM_FSCAL_R:
			// no dependent on rounding modes
			f.F[i.Dst][LOW] = ScaleNegate(f.F[i.Dst][LOW])
			f.F[i.Dst][HIGH] = ScaleNegate(f.F[i.Dst][HIGH])
		case VM_FMUL_R:
			f.E[i.Dst][LOW] *= f.A[i.Src][LOW]
			f.E[i.Dst][HIGH] *= f.A[i.Src][HIGH]
		case VM_FDIV_M:
			// divisor exponent/mantissa is masked to keep it in a safe range
			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
			f.E[i.Dst][LOW] /= MaskRegisterExponentMantissa(lo, eMask[LOW])
			f.E[i.Dst][HIGH] /= MaskRegisterExponentMantissa(hi, eMask[HIGH])
		case VM_FSQRT_R:
			f.E[i.Dst][LOW] = math.Sqrt(f.E[i.Dst][LOW])
			f.E[i.Dst][HIGH] = math.Sqrt(f.E[i.Dst][HIGH])
		case VM_CFROUND:
			tmp := (bits.RotateLeft64(f.R[i.Src], 0-int(i.Imm))) % 4 // rotate right
			SetRoundingMode(f, uint8(tmp))
		case VM_CBRANCH:
			// add immediate, then branch back if the masked condition bits are zero
			f.R[i.Dst] += i.Imm
			if (f.R[i.Dst] & uint64(i.MemMask)) == 0 {
				pc = i.jumpTarget()
			}
		case VM_ISTORE:
			pad.Store64(i.getScratchpadAddress(f.R[i.Dst]), f.R[i.Src])
		}
	}
}
// lockThreadDueToRoundingMode reports that this build mutates per-thread FPU
// state, so callers should pin the goroutine to an OS thread while executing.
const lockThreadDueToRoundingMode = true

// SetRoundingMode caches the requested rounding mode in the register file and,
// only when it actually changed, pushes it to the hardware FPU control state.
func SetRoundingMode(f *RegisterFile, mode uint8) {
	if f.FPRC != mode {
		f.FPRC = mode
		asm.SetRoundingMode(mode)
	}
}

// ResetRoundingMode restores round-to-nearest. The hardware call is made
// unconditionally, refreshing FPU state even when the cached mode already
// reads zero.
func ResetRoundingMode(f *RegisterFile) {
	f.FPRC = 0
	asm.SetRoundingMode(0)
}

131
vm_bytecode_purego.go Normal file
View file

@ -0,0 +1,131 @@
//go:build (!arm64 && !(arm.6 || arm.7) && !amd64 && !386) || purego
package randomx
import (
"git.gammaspectra.live/P2Pool/softfloat64"
"math/bits"
)
// Execute runs a RandomX program with the given register file and scratchpad.
// This soft-float build performs every rounding-sensitive float operation via
// softfloat64, using the rounding mode cached in f.FPRC, so no hardware FPU
// state is touched.
// Warning: This will call float64 SetRoundingMode directly
// It is the caller's responsibility to set and restore the mode to IEEE 754 roundTiesToEven between full executions
// Additionally, runtime.LockOSThread and defer runtime.UnlockOSThread is recommended to prevent other goroutines sharing these changes
func (c *ByteCode) Execute(f *RegisterFile, pad *ScratchPad, eMask [2]uint64) {
	for pc := 0; pc < len(c); pc++ {
		i := &c[pc]
		switch i.Opcode {
		case VM_NOP: // we do nothing
		case VM_IADD_RS:
			// dst += (src << shift) + imm; ImmB carries the shift amount
			f.R[i.Dst] += (f.R[i.Src] << i.ImmB) + i.Imm
		case VM_IADD_M:
			f.R[i.Dst] += pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_IADD_MZ:
			// *_MZ variants: src == dst at compile time, absolute address precomputed into Imm
			f.R[i.Dst] += pad.Load64(uint32(i.Imm))
		case VM_ISUB_R:
			f.R[i.Dst] -= f.R[i.Src]
		case VM_ISUB_I:
			// *_I variants: src == dst, operand folded into the immediate
			f.R[i.Dst] -= i.Imm
		case VM_ISUB_M:
			f.R[i.Dst] -= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_ISUB_MZ:
			f.R[i.Dst] -= pad.Load64(uint32(i.Imm))
		case VM_IMUL_R:
			f.R[i.Dst] *= f.R[i.Src]
		case VM_IMUL_I:
			// also handles imul_rcp
			f.R[i.Dst] *= i.Imm
		case VM_IMUL_M:
			f.R[i.Dst] *= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_IMUL_MZ:
			f.R[i.Dst] *= pad.Load64(uint32(i.Imm))
		case VM_IMULH_R:
			// bits.Mul64 returns (hi, lo); dst keeps the high 64 bits of the unsigned product
			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], f.R[i.Src])
		case VM_IMULH_M:
			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(i.getScratchpadAddress(f.R[i.Src])))
		case VM_IMULH_MZ:
			f.R[i.Dst], _ = bits.Mul64(f.R[i.Dst], pad.Load64(uint32(i.Imm)))
		case VM_ISMULH_R:
			// signed high-half multiply
			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(f.R[i.Src]))
		case VM_ISMULH_M:
			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(i.getScratchpadAddress(f.R[i.Src]))))
		case VM_ISMULH_MZ:
			f.R[i.Dst] = smulh(int64(f.R[i.Dst]), int64(pad.Load64(uint32(i.Imm))))
		case VM_INEG_R:
			// two's-complement negation
			f.R[i.Dst] = -f.R[i.Dst]
		case VM_IXOR_R:
			f.R[i.Dst] ^= f.R[i.Src]
		case VM_IXOR_I:
			f.R[i.Dst] ^= i.Imm
		case VM_IXOR_M:
			f.R[i.Dst] ^= pad.Load64(i.getScratchpadAddress(f.R[i.Src]))
		case VM_IXOR_MZ:
			f.R[i.Dst] ^= pad.Load64(uint32(i.Imm))
		case VM_IROR_R:
			// rotate right == rotate left by negated count (count masked to 0..63)
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(f.R[i.Src]&63))
		case VM_IROR_I:
			//todo: can merge into VM_IROL_I
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], 0-int(i.Imm&63))
		case VM_IROL_R:
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(f.R[i.Src]&63))
		case VM_IROL_I:
			f.R[i.Dst] = bits.RotateLeft64(f.R[i.Dst], int(i.Imm&63))
		case VM_ISWAP_R:
			f.R[i.Dst], f.R[i.Src] = f.R[i.Src], f.R[i.Dst]
		case VM_FSWAP_RF:
			// swap the two halves of an F-group register
			f.F[i.Dst][HIGH], f.F[i.Dst][LOW] = f.F[i.Dst][LOW], f.F[i.Dst][HIGH]
		case VM_FSWAP_RE:
			f.E[i.Dst][HIGH], f.E[i.Dst][LOW] = f.E[i.Dst][LOW], f.E[i.Dst][HIGH]
		case VM_FADD_R:
			// soft-float add honoring the cached rounding mode; register operand from the A group
			f.F[i.Dst][LOW] = softfloat64.Add(f.F[i.Dst][LOW], f.A[i.Src][LOW], softfloat64.RoundingMode(f.FPRC))
			f.F[i.Dst][HIGH] = softfloat64.Add(f.F[i.Dst][HIGH], f.A[i.Src][HIGH], softfloat64.RoundingMode(f.FPRC))
		case VM_FADD_M:
			// memory operand: two 32-bit signed ints converted to float64
			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
			f.F[i.Dst][LOW] = softfloat64.Add(f.F[i.Dst][LOW], lo, softfloat64.RoundingMode(f.FPRC))
			f.F[i.Dst][HIGH] = softfloat64.Add(f.F[i.Dst][HIGH], hi, softfloat64.RoundingMode(f.FPRC))
		case VM_FSUB_R:
			f.F[i.Dst][LOW] = softfloat64.Sub(f.F[i.Dst][LOW], f.A[i.Src][LOW], softfloat64.RoundingMode(f.FPRC))
			f.F[i.Dst][HIGH] = softfloat64.Sub(f.F[i.Dst][HIGH], f.A[i.Src][HIGH], softfloat64.RoundingMode(f.FPRC))
		case VM_FSUB_M:
			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
			f.F[i.Dst][LOW] = softfloat64.Sub(f.F[i.Dst][LOW], lo, softfloat64.RoundingMode(f.FPRC))
			f.F[i.Dst][HIGH] = softfloat64.Sub(f.F[i.Dst][HIGH], hi, softfloat64.RoundingMode(f.FPRC))
		case VM_FSCAL_R:
			// not dependent on rounding modes: pure bit manipulation of the exponent/sign
			f.F[i.Dst][LOW] = ScaleNegate(f.F[i.Dst][LOW])
			f.F[i.Dst][HIGH] = ScaleNegate(f.F[i.Dst][HIGH])
		case VM_FMUL_R:
			f.E[i.Dst][LOW] = softfloat64.Mul(f.E[i.Dst][LOW], f.A[i.Src][LOW], softfloat64.RoundingMode(f.FPRC))
			f.E[i.Dst][HIGH] = softfloat64.Mul(f.E[i.Dst][HIGH], f.A[i.Src][HIGH], softfloat64.RoundingMode(f.FPRC))
		case VM_FDIV_M:
			// divisor is masked so E-group values stay in a safe exponent range (never zero/NaN)
			lo, hi := pad.Load32F(i.getScratchpadAddress(f.R[i.Src]))
			f.E[i.Dst][LOW] = softfloat64.Div(f.E[i.Dst][LOW], MaskRegisterExponentMantissa(lo, eMask[LOW]), softfloat64.RoundingMode(f.FPRC))
			f.E[i.Dst][HIGH] = softfloat64.Div(f.E[i.Dst][HIGH], MaskRegisterExponentMantissa(hi, eMask[HIGH]), softfloat64.RoundingMode(f.FPRC))
		case VM_FSQRT_R:
			f.E[i.Dst][LOW] = softfloat64.Sqrt(f.E[i.Dst][LOW], softfloat64.RoundingMode(f.FPRC))
			f.E[i.Dst][HIGH] = softfloat64.Sqrt(f.E[i.Dst][HIGH], softfloat64.RoundingMode(f.FPRC))
		case VM_CFROUND:
			// low two bits of the rotated source select the new rounding mode
			tmp := (bits.RotateLeft64(f.R[i.Src], 0-int(i.Imm))) % 4 // rotate right
			SetRoundingMode(f, uint8(tmp))
		case VM_CBRANCH:
			// add the immediate, then branch when all condition-mask bits are zero;
			// the loop's pc++ then resumes execution after the stored jump target
			f.R[i.Dst] += i.Imm
			if (f.R[i.Dst] & uint64(i.MemMask)) == 0 {
				pc = i.jumpTarget()
			}
		case VM_ISTORE:
			// note: store address is formed from the DESTINATION register
			pad.Store64(i.getScratchpadAddress(f.R[i.Dst]), f.R[i.Src])
		}
	}
}
// lockThreadDueToRoundingMode is false here: the soft-float build keeps the
// rounding mode in the register file rather than in per-thread FPU state, so
// no OS-thread pinning is required for correctness.
const lockThreadDueToRoundingMode = false

// SetRoundingMode records the requested rounding mode in the register file;
// soft-float operations read it from f.FPRC on every call.
func SetRoundingMode(f *RegisterFile, mode uint8) {
	f.FPRC = mode
}

// ResetRoundingMode restores IEEE 754 roundTiesToEven (mode 0).
func ResetRoundingMode(f *RegisterFile) {
	SetRoundingMode(f, 0)
}

View file

@ -30,23 +30,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
package randomx
import (
"git.gammaspectra.live/P2Pool/go-randomx/v2/asm"
"math"
"math/bits"
"unsafe"
)
import "encoding/binary"
//reference https://github.com/tevador/RandomX/blob/master/doc/specs.md#51-instruction-encoding
var Zero uint64 = 0
// since go does not have union, use byte array
type VM_Instruction []byte // it is hardcode 8 bytes
// VM_Instruction since go does not have union, use byte array
type VM_Instruction [8]byte // it is hardcode 8 bytes
func (ins VM_Instruction) IMM() uint32 {
return binary.LittleEndian.Uint32(ins[4:])
}
func (ins VM_Instruction) IMM64() uint64 {
return signExtend2sCompl(ins.IMM())
}
func (ins VM_Instruction) Mod() byte {
return ins[3]
}
@ -60,229 +60,140 @@ func (ins VM_Instruction) Opcode() byte {
return ins[0]
}
type VM_Instruction_Type int
const (
VM_IADD_RS VM_Instruction_Type = 0
VM_IADD_M VM_Instruction_Type = 1
VM_ISUB_R VM_Instruction_Type = 2
VM_ISUB_M VM_Instruction_Type = 3
VM_IMUL_R VM_Instruction_Type = 4
VM_IMUL_M VM_Instruction_Type = 5
VM_IMULH_R VM_Instruction_Type = 6
VM_IMULH_M VM_Instruction_Type = 7
VM_ISMULH_R VM_Instruction_Type = 8
VM_ISMULH_M VM_Instruction_Type = 9
VM_IMUL_RCP VM_Instruction_Type = 10
VM_INEG_R VM_Instruction_Type = 11
VM_IXOR_R VM_Instruction_Type = 12
VM_IXOR_M VM_Instruction_Type = 13
VM_IROR_R VM_Instruction_Type = 14
VM_IROL_R VM_Instruction_Type = 15
VM_ISWAP_R VM_Instruction_Type = 16
VM_FSWAP_R VM_Instruction_Type = 17
VM_FADD_R VM_Instruction_Type = 18
VM_FADD_M VM_Instruction_Type = 19
VM_FSUB_R VM_Instruction_Type = 20
VM_FSUB_M VM_Instruction_Type = 21
VM_FSCAL_R VM_Instruction_Type = 22
VM_FMUL_R VM_Instruction_Type = 23
VM_FDIV_M VM_Instruction_Type = 24
VM_FSQRT_R VM_Instruction_Type = 25
VM_CBRANCH VM_Instruction_Type = 26
VM_CFROUND VM_Instruction_Type = 27
VM_ISTORE VM_Instruction_Type = 28
VM_NOP VM_Instruction_Type = 29
)
var Names = map[VM_Instruction_Type]string{
VM_IADD_RS: "VM_IADD_RS",
VM_IADD_M: "VM_IADD_M",
VM_ISUB_R: "VM_ISUB_R",
VM_ISUB_M: "VM_ISUB_M",
VM_IMUL_R: "VM_IMUL_R",
VM_IMUL_M: "VM_IMUL_M",
VM_IMULH_R: "VM_IMULH_R",
VM_IMULH_M: "VM_IMULH_M",
VM_ISMULH_R: "VM_ISMULH_R",
VM_ISMULH_M: "VM_ISMULH_M",
VM_IMUL_RCP: "VM_IMUL_RCP",
VM_INEG_R: "VM_INEG_R",
VM_IXOR_R: "VM_IXOR_R",
VM_IXOR_M: "VM_IXOR_M",
VM_IROR_R: "VM_IROR_R",
VM_IROL_R: "VM_IROL_R",
VM_ISWAP_R: "VM_ISWAP_R",
VM_FSWAP_R: "VM_FSWAP_R",
VM_FADD_R: "VM_FADD_R",
VM_FADD_M: "VM_FADD_M",
VM_FSUB_R: "VM_FSUB_R",
VM_FSUB_M: "VM_FSUB_M",
VM_FSCAL_R: "VM_FSCAL_R",
VM_FMUL_R: "VM_FMUL_R",
VM_FDIV_M: "VM_FDIV_M",
VM_FSQRT_R: "VM_FSQRT_R",
VM_CBRANCH: "VM_CBRANCH",
VM_CFROUND: "VM_CFROUND",
VM_ISTORE: "VM_ISTORE",
VM_NOP: "VM_NOP",
}
// this will interpret single vm instruction
// CompileProgramToByteCode this will interpret single vm instruction into executable opcodes
// reference https://github.com/tevador/RandomX/blob/master/doc/specs.md#52-integer-instructions
func (vm *VM) Compile_TO_Bytecode() {
func CompileProgramToByteCode(prog []byte, bc *ByteCode) {
var registerUsage [REGISTERSCOUNT]int
var registerUsage [RegistersCount]int
for i := range registerUsage {
registerUsage[i] = -1
}
for i := 0; i < RANDOMX_PROGRAM_SIZE; i++ {
instr := VM_Instruction(vm.Prog[i*8:])
ibc := &vm.ByteCode[i]
for i := 0; i < len(bc); i++ {
instr := VM_Instruction(prog[i*8:])
ibc := &bc[i]
opcode := instr.Opcode()
dst := instr.Dst() % REGISTERSCOUNT // bit shift optimization
src := instr.Src() % REGISTERSCOUNT
ibc.dst = dst
ibc.src = src
dst := instr.Dst() % RegistersCount // bit shift optimization
src := instr.Src() % RegistersCount
ibc.Dst = dst
ibc.Src = src
switch opcode {
case 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15: // 16 frequency
ibc.Opcode = VM_IADD_RS
ibc.idst = &vm.reg.r[dst]
if dst != RegisterNeedsDisplacement {
ibc.isrc = &vm.reg.r[src]
ibc.shift = (instr.Mod() >> 2) % 4
ibc.imm = 0
//shift
ibc.ImmB = (instr.Mod() >> 2) % 4
ibc.Imm = 0
} else {
ibc.isrc = &vm.reg.r[src]
ibc.shift = (instr.Mod() >> 2) % 4
ibc.imm = signExtend2sCompl(instr.IMM())
//shift
ibc.ImmB = (instr.Mod() >> 2) % 4
ibc.Imm = instr.IMM64()
}
registerUsage[dst] = i
case 16, 17, 18, 19, 20, 21, 22: // 7
ibc.Opcode = VM_IADD_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = instr.IMM64()
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
} else {
ibc.isrc = &Zero
ibc.memMask = ScratchpadL3Mask
ibc.Opcode = VM_IADD_MZ
ibc.MemMask = ScratchpadL3Mask
ibc.Imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38: // 16
ibc.Opcode = VM_ISUB_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
if src == dst {
ibc.Imm = instr.IMM64()
ibc.Opcode = VM_ISUB_I
}
registerUsage[dst] = i
case 39, 40, 41, 42, 43, 44, 45: // 7
ibc.Opcode = VM_ISUB_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = instr.IMM64()
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
} else {
ibc.isrc = &Zero
ibc.memMask = ScratchpadL3Mask
ibc.Opcode = VM_ISUB_MZ
ibc.MemMask = ScratchpadL3Mask
ibc.Imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61: // 16
ibc.Opcode = VM_IMUL_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
if src == dst {
ibc.Imm = instr.IMM64()
ibc.Opcode = VM_IMUL_I
}
registerUsage[dst] = i
case 62, 63, 64, 65: //4
ibc.Opcode = VM_IMUL_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = instr.IMM64()
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
} else {
ibc.isrc = &Zero
ibc.memMask = ScratchpadL3Mask
ibc.Opcode = VM_IMUL_MZ
ibc.MemMask = ScratchpadL3Mask
ibc.Imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 66, 67, 68, 69: //4
ibc.Opcode = VM_IMULH_R
ibc.idst = &vm.reg.r[dst]
ibc.isrc = &vm.reg.r[src]
registerUsage[dst] = i
case 70: //1
ibc.Opcode = VM_IMULH_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = instr.IMM64()
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
} else {
ibc.isrc = &Zero
ibc.memMask = ScratchpadL3Mask
ibc.Opcode = VM_IMULH_MZ
ibc.MemMask = ScratchpadL3Mask
ibc.Imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 71, 72, 73, 74: //4
ibc.Opcode = VM_ISMULH_R
ibc.idst = &vm.reg.r[dst]
ibc.isrc = &vm.reg.r[src]
registerUsage[dst] = i
case 75: //1
ibc.Opcode = VM_ISMULH_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = instr.IMM64()
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
} else {
ibc.isrc = &Zero
ibc.memMask = ScratchpadL3Mask
ibc.Opcode = VM_ISMULH_MZ
ibc.MemMask = ScratchpadL3Mask
ibc.Imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 76, 77, 78, 79, 80, 81, 82, 83: // 8
divisor := instr.IMM()
if !isZeroOrPowerOf2(divisor) {
ibc.Opcode = VM_IMUL_R
ibc.idst = &vm.reg.r[dst]
ibc.imm = randomx_reciprocal(divisor)
ibc.isrc = &ibc.imm
ibc.Opcode = VM_IMUL_I
ibc.Imm = reciprocal(divisor)
registerUsage[dst] = i
} else {
ibc.Opcode = VM_NOP
@ -290,66 +201,49 @@ func (vm *VM) Compile_TO_Bytecode() {
case 84, 85: //2
ibc.Opcode = VM_INEG_R
ibc.idst = &vm.reg.r[dst]
registerUsage[dst] = i
case 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100: //15
ibc.Opcode = VM_IXOR_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
if src == dst {
ibc.Imm = instr.IMM64()
ibc.Opcode = VM_IXOR_I
}
registerUsage[dst] = i
case 101, 102, 103, 104, 105: //5
ibc.Opcode = VM_IXOR_M
ibc.idst = &vm.reg.r[dst]
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = instr.IMM64()
if src != dst {
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
} else {
ibc.isrc = &Zero
ibc.memMask = ScratchpadL3Mask
ibc.Opcode = VM_IXOR_MZ
ibc.MemMask = ScratchpadL3Mask
ibc.Imm = uint64(ibc.getScratchpadZeroAddress())
}
registerUsage[dst] = i
case 106, 107, 108, 109, 110, 111, 112, 113: //8
ibc.Opcode = VM_IROR_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
if src == dst {
ibc.Imm = instr.IMM64()
ibc.Opcode = VM_IROR_I
}
registerUsage[dst] = i
case 114, 115: // 2 IROL_R
ibc.Opcode = VM_IROL_R
ibc.idst = &vm.reg.r[dst]
if src != dst {
ibc.isrc = &vm.reg.r[src]
} else {
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.isrc = &ibc.imm // we are pointing within bytecode
if src == dst {
ibc.Imm = instr.IMM64()
ibc.Opcode = VM_IROL_I
}
registerUsage[dst] = i
case 116, 117, 118, 119: //4
if src != dst {
ibc.Opcode = VM_ISWAP_R
ibc.idst = &vm.reg.r[dst]
ibc.isrc = &vm.reg.r[src]
registerUsage[dst] = i
registerUsage[src] = i
} else {
@ -359,111 +253,100 @@ func (vm *VM) Compile_TO_Bytecode() {
// below are floating point instructions
case 120, 121, 122, 123: // 4
ibc.Opcode = VM_FSWAP_R
if dst < REGISTERCOUNTFLT {
ibc.fdst = &vm.reg.f[dst]
//ibc.Opcode = VM_FSWAP_R
if dst < RegistersCountFloat {
ibc.Opcode = VM_FSWAP_RF
} else {
ibc.fdst = &vm.reg.e[dst-REGISTERCOUNTFLT]
ibc.Opcode = VM_FSWAP_RE
ibc.Dst = dst - RegistersCountFloat
}
case 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139: //16
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
src := instr.Src() % REGISTERCOUNTFLT
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Src = instr.Src() % RegistersCountFloat
ibc.Opcode = VM_FADD_R
ibc.fdst = &vm.reg.f[dst]
ibc.fsrc = &vm.reg.a[src]
case 140, 141, 142, 143, 144: //5
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FADD_M
ibc.fdst = &vm.reg.f[dst]
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = instr.IMM64()
case 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160: //16
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
src := instr.Src() % REGISTERCOUNTFLT
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Src = instr.Src() % RegistersCountFloat
ibc.Opcode = VM_FSUB_R
ibc.fdst = &vm.reg.f[dst]
ibc.fsrc = &vm.reg.a[src]
case 161, 162, 163, 164, 165: //5
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FSUB_M
ibc.fdst = &vm.reg.f[dst]
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = instr.IMM64()
case 166, 167, 168, 169, 170, 171: //6
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FSCAL_R
ibc.fdst = &vm.reg.f[dst]
case 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203: //32
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
src := instr.Src() % REGISTERCOUNTFLT
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Src = instr.Src() % RegistersCountFloat
ibc.Opcode = VM_FMUL_R
ibc.fdst = &vm.reg.e[dst]
ibc.fsrc = &vm.reg.a[src]
case 204, 205, 206, 207: //4
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FDIV_M
ibc.fdst = &vm.reg.e[dst]
ibc.isrc = &vm.reg.r[src]
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = instr.IMM64()
case 208, 209, 210, 211, 212, 213: //6
dst := instr.Dst() % REGISTERCOUNTFLT // bit shift optimization
ibc.Dst = instr.Dst() % RegistersCountFloat // bit shift optimization
ibc.Opcode = VM_FSQRT_R
ibc.fdst = &vm.reg.e[dst]
case 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238: //25 // CBRANCH and CFROUND are interchanged
ibc.Opcode = VM_CBRANCH
reg := instr.Dst() % REGISTERSCOUNT
ibc.isrc = &vm.reg.r[reg]
ibc.target = int16(registerUsage[reg])
//TODO:??? it's +1 on other
ibc.Dst = instr.Dst() % RegistersCount
target := uint16(int16(registerUsage[ibc.Dst]))
// set target!
ibc.Src = uint8(target)
ibc.ImmB = uint8(target >> 8)
shift := uint64(instr.Mod()>>4) + CONDITIONOFFSET
//conditionmask := CONDITIONMASK << shift
ibc.imm = signExtend2sCompl(instr.IMM()) | (uint64(1) << shift)
ibc.Imm = instr.IMM64() | (uint64(1) << shift)
if CONDITIONOFFSET > 0 || shift > 0 {
ibc.imm &= (^(uint64(1) << (shift - 1)))
ibc.Imm &= ^(uint64(1) << (shift - 1))
}
ibc.memMask = CONDITIONMASK << shift
ibc.MemMask = CONDITIONMASK << shift
for j := 0; j < REGISTERSCOUNT; j++ {
for j := 0; j < RegistersCount; j++ {
registerUsage[j] = i
}
case 239: //1
ibc.Opcode = VM_CFROUND
ibc.isrc = &vm.reg.r[src]
ibc.imm = uint64(instr.IMM() & 63)
ibc.Imm = uint64(instr.IMM() & 63)
case 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255: //16
ibc.Opcode = VM_ISTORE
ibc.idst = &vm.reg.r[dst]
ibc.isrc = &vm.reg.r[src]
ibc.imm = signExtend2sCompl(instr.IMM())
ibc.Imm = instr.IMM64()
if (instr.Mod() >> 4) < STOREL3CONDITION {
if (instr.Mod() % 4) != 0 {
ibc.memMask = ScratchpadL1Mask
ibc.MemMask = ScratchpadL1Mask
} else {
ibc.memMask = ScratchpadL2Mask
ibc.MemMask = ScratchpadL2Mask
}
} else {
ibc.memMask = ScratchpadL3Mask
ibc.MemMask = ScratchpadL3Mask
}
default:
@ -471,145 +354,18 @@ func (vm *VM) Compile_TO_Bytecode() {
}
}
}
type InstructionByteCode struct {
dst, src byte
idst, isrc *uint64
fdst, fsrc *[2]float64
imm uint64
simm int64
Opcode VM_Instruction_Type
target int16
shift uint8
memMask uint32
/*
union {
int_reg_t* idst;
rx_vec_f128* fdst;
};
union {
int_reg_t* isrc;
rx_vec_f128* fsrc;
};
union {
uint64_t imm;
int64_t simm;
};
InstructionType type;
union {
int16_t target;
uint16_t shift;
};
uint32_t memMask;
*/
type ScratchPad [ScratchpadSize]byte
// Store64 writes val into the scratchpad at byte offset addr via a direct
// unsafe memory write, i.e. in the host's native byte order.
// NOTE(review): this assumes a little-endian host (matching upstream RandomX)
// and an in-bounds addr — confirm; the commented line is the portable
// little-endian equivalent.
func (pad *ScratchPad) Store64(addr uint32, val uint64) {
	*(*uint64)(unsafe.Pointer(&pad[addr])) = val
	//binary.LittleEndian.PutUint64(pad[addr:], val)
}
func (ibc *InstructionByteCode) getScratchpadAddress() uint64 {
return (*ibc.isrc + ibc.imm) & uint64(ibc.memMask)
// Load64 reads a 64-bit word from the scratchpad at byte offset addr via a
// direct unsafe load (host-native byte order; counterpart of Store64).
// NOTE(review): assumes addr is in-bounds for an 8-byte read — confirm callers
// always pass masked addresses.
func (pad *ScratchPad) Load64(addr uint32) uint64 {
	return *(*uint64)(unsafe.Pointer(&pad[addr]))
}
func (ibc *InstructionByteCode) getScratchpadDestAddress() uint64 {
return (*ibc.idst + ibc.imm) & uint64(ibc.memMask)
}
func (vm *VM) Load64(addr uint64) uint64 {
return *(*uint64)(unsafe.Pointer(&vm.ScratchPad[addr]))
}
func (vm *VM) Load32(addr uint64) uint32 {
return *(*uint32)(unsafe.Pointer(&vm.ScratchPad[addr]))
}
func (vm *VM) Load32F(addr uint64) (lo, hi float64) {
a := *(*[2]int32)(unsafe.Pointer(&vm.ScratchPad[addr]))
return float64(a[LOW]), float64(a[HIGH])
}
func (vm *VM) Load32FA(addr uint64) [2]float64 {
a := *(*[2]int32)(unsafe.Pointer(&vm.ScratchPad[addr]))
return [2]float64{float64(a[LOW]), float64(a[HIGH])}
}
func (vm *VM) InterpretByteCode() {
for pc := 0; pc < RANDOMX_PROGRAM_SIZE; pc++ {
ibc := &vm.ByteCode[pc]
switch ibc.Opcode {
case VM_IADD_RS:
*ibc.idst += (*ibc.isrc << ibc.shift) + ibc.imm
case VM_IADD_M:
*ibc.idst += vm.Load64(ibc.getScratchpadAddress())
case VM_ISUB_R:
*ibc.idst -= *ibc.isrc
case VM_ISUB_M:
*ibc.idst -= vm.Load64(ibc.getScratchpadAddress())
case VM_IMUL_R:
// also handles imul_rcp
*ibc.idst *= *ibc.isrc
case VM_IMUL_M:
*ibc.idst *= vm.Load64(ibc.getScratchpadAddress())
case VM_IMULH_R:
*ibc.idst, _ = bits.Mul64(*ibc.idst, *ibc.isrc)
case VM_IMULH_M:
*ibc.idst, _ = bits.Mul64(*ibc.idst, vm.Load64(ibc.getScratchpadAddress()))
case VM_ISMULH_R:
*ibc.idst = smulh(int64(*ibc.idst), int64(*ibc.isrc))
case VM_ISMULH_M:
*ibc.idst = smulh(int64(*ibc.idst), int64(vm.Load64(ibc.getScratchpadAddress())))
case VM_INEG_R:
*ibc.idst = (^(*ibc.idst)) + 1 // 2's complement negative
case VM_IXOR_R:
*ibc.idst ^= *ibc.isrc
case VM_IXOR_M:
*ibc.idst ^= vm.Load64(ibc.getScratchpadAddress())
case VM_IROR_R:
*ibc.idst = bits.RotateLeft64(*ibc.idst, 0-int(*ibc.isrc&63))
case VM_IROL_R:
*ibc.idst = bits.RotateLeft64(*ibc.idst, int(*ibc.isrc&63))
case VM_ISWAP_R:
*ibc.idst, *ibc.isrc = *ibc.isrc, *ibc.idst
case VM_FSWAP_R:
ibc.fdst[HIGH], ibc.fdst[LOW] = ibc.fdst[LOW], ibc.fdst[HIGH]
case VM_FADD_R:
ibc.fdst[LOW] += ibc.fsrc[LOW]
ibc.fdst[HIGH] += ibc.fsrc[HIGH]
case VM_FADD_M:
lo, hi := vm.Load32F(ibc.getScratchpadAddress())
ibc.fdst[LOW] += lo
ibc.fdst[HIGH] += hi
case VM_FSUB_R:
ibc.fdst[LOW] -= ibc.fsrc[LOW]
ibc.fdst[HIGH] -= ibc.fsrc[HIGH]
case VM_FSUB_M:
lo, hi := vm.Load32F(ibc.getScratchpadAddress())
ibc.fdst[LOW] -= lo
ibc.fdst[HIGH] -= hi
case VM_FSCAL_R:
// no dependent on rounding modes
ibc.fdst[LOW] = math.Float64frombits(math.Float64bits(ibc.fdst[LOW]) ^ 0x80F0000000000000)
ibc.fdst[HIGH] = math.Float64frombits(math.Float64bits(ibc.fdst[HIGH]) ^ 0x80F0000000000000)
case VM_FMUL_R:
ibc.fdst[LOW] *= ibc.fsrc[LOW]
ibc.fdst[HIGH] *= ibc.fsrc[HIGH]
case VM_FDIV_M:
lo, hi := vm.Load32F(ibc.getScratchpadAddress())
ibc.fdst[LOW] /= MaskRegisterExponentMantissa(lo, vm.config.eMask[LOW])
ibc.fdst[HIGH] /= MaskRegisterExponentMantissa(hi, vm.config.eMask[HIGH])
case VM_FSQRT_R:
ibc.fdst[LOW] = math.Sqrt(ibc.fdst[LOW])
ibc.fdst[HIGH] = math.Sqrt(ibc.fdst[HIGH])
case VM_CBRANCH:
*ibc.isrc += ibc.imm
if (*ibc.isrc & uint64(ibc.memMask)) == 0 {
pc = int(ibc.target)
}
case VM_CFROUND:
tmp := (bits.RotateLeft64(*ibc.isrc, 0-int(ibc.imm))) % 4 // rotate right
asm.SetRoundingMode(asm.RoundingMode(tmp))
case VM_ISTORE:
binary.LittleEndian.PutUint64(vm.ScratchPad[(*ibc.idst+ibc.imm)&uint64(ibc.memMask):], *ibc.isrc)
case VM_NOP: // we do nothing
}
}
// Load32 reads a 32-bit word from the scratchpad at byte offset addr via a
// direct unsafe load (host-native byte order).
func (pad *ScratchPad) Load32(addr uint32) uint32 {
	return *(*uint32)(unsafe.Pointer(&pad[addr]))
}

15
vm_instruction_native.go Normal file
View file

@ -0,0 +1,15 @@
//go:build (arm64 || amd64 || 386) && !purego
package randomx
import "unsafe"
// Load32F loads two consecutive signed 32-bit integers from the scratchpad at
// addr and returns them converted to float64 as the (low, high) halves of a
// float register operand. Uses the hardware int-to-float conversion; the
// purego build performs the same conversion through softfloat64.
func (pad *ScratchPad) Load32F(addr uint32) (lo, hi float64) {
	a := *(*[2]int32)(unsafe.Pointer(&pad[addr]))
	return float64(a[LOW]), float64(a[HIGH])
}
// Load32FA loads two consecutive signed 32-bit scratchpad words at addr and
// returns them as a [2]float64 pair — the same conversion as Load32F,
// packaged as an array.
func (pad *ScratchPad) Load32FA(addr uint32) [2]float64 {
	lo, hi := pad.Load32F(addr)
	return [2]float64{lo, hi}
}

18
vm_instruction_purego.go Normal file
View file

@ -0,0 +1,18 @@
//go:build (!arm64 && !amd64 && !386) || purego
package randomx
import (
"git.gammaspectra.live/P2Pool/softfloat64"
"unsafe"
)
// Load32F loads two consecutive signed 32-bit integers from the scratchpad at
// addr and returns them converted to float64 as the (low, high) halves of a
// float register operand. The conversion goes through softfloat64 so results
// are bit-identical on hosts without IEEE 754 hardware semantics.
func (pad *ScratchPad) Load32F(addr uint32) (lo, hi float64) {
	a := *(*[2]int32)(unsafe.Pointer(&pad[addr]))
	return softfloat64.Int32ToFloat64(a[LOW]), softfloat64.Int32ToFloat64(a[HIGH])
}
// Load32FA loads two consecutive signed 32-bit scratchpad words at addr and
// returns them as a [2]float64 pair — the same soft-float conversion as
// Load32F, packaged as an array.
func (pad *ScratchPad) Load32FA(addr uint32) [2]float64 {
	lo, hi := pad.Load32F(addr)
	return [2]float64{lo, hi}
}