Cleanup readme, superscalar

This commit is contained in:
DataHoarder 2024-04-20 20:22:05 +02:00
parent d20dd880ce
commit 4903cd7407
Signed by: DataHoarder
SSH key fingerprint: SHA256:OLTRf6Fl87G52SiR7sWLGNzlJt4WOX+tfI2yxo0z7xk
3 changed files with 60 additions and 53 deletions

View file

@@ -1,29 +1,36 @@
 # RandomX (Golang Implementation)
 
----
+RandomX is a proof-of-work (PoW) algorithm that is optimized for general-purpose CPUs.
+RandomX uses random code execution (hence the name) together with several memory-hard techniques to minimize the efficiency advantage of specialized hardware.
 
 Fork from [git.dero.io/DERO_Foundation/RandomX](https://git.dero.io/DERO_Foundation/RandomX). Also related, their [Analysis of RandomX writeup](https://medium.com/deroproject/analysis-of-randomx-dde9dfe9bbc6).
 
 Original code failed RandomX testcases and was implemented using big.Float.
 
-This package implements RandomX without CGO, using only Golang code, pure float64 ops and two small assembly sections to implement CFROUND modes, with optional soft float implementation.
+---
+
+This package implements RandomX without CGO, using only Golang code, native float64 ops, some assembly, but with optional soft float _purego_ implementation.
 
 All test cases pass properly.
 
-JIT is supported on a few platforms but can be hard-disabled via the `disable_jit` build flag, or at runtime.
+For the C++ implementation and design of RandomX, see [github.com/tevador/RandomX](https://github.com/tevador/RandomX)
+
+| Feature                      | 386 | amd64 | arm | arm64 | mips | mips64 | riscv64 | wasm |
+|:----------------------------:|:---:|:-----:|:---:|:-----:|:----:|:------:|:-------:|:----:|
+| purego                       | ✅  | ✅    | ✅  | ✅    | ✅   | ✅     | ✅      | ✅   |
+| Hardware Float Operations    | ✅  | ✅    | ❌  | ✅    | ❌   | ❌     | ❌      | ❌   |
+| Hardware AES Operations      | ❌  | ✅    | ❌  | ❌    | ❌   | ❌     | ❌      | ❌   |
+| Native Superscalar Execution | ✅  | ✅    | ✅  | ✅    | ✅   | ✅     | ✅      | ✅   |
+| Superscalar JIT Execution    | ❌  | ✅*   | ❌  | ❌    | ❌   | ❌     | ❌      | ❌   |
+| Native VM Execution          | ✅  | ✅    | ❌  | ✅    | ❌   | ❌     | ❌      | ❌   |
+| VM JIT Execution             | ❌  | ✅*   | ❌  | ❌    | ❌   | ❌     | ❌      | ❌   |
 
 A pure Golang implementation can be used on platforms without hard float support or via the `purego` build flag manually.
 
-| Platform    | Hard Float | Hard AES | JIT | Native | purego | Notes            |
-|:-----------:|:----------:|:--------:|:---:|:------:|:------:|:----------------:|
-| **386**     | ✅         | ❌       | ❌  | ✅     | ✅     |                  |
-| **amd64**   | ✅         | ✅       | ✅* | ✅     | ✅     | JIT only on Unix |
-| **arm**     | ❌         | ❌       | ❌  | ❌     | ✅     |                  |
-| **arm64**   | ✅         | ❌       | ❌  | ✅     | ✅     |                  |
-| **mips**    | ❌         | ❌       | ❌  | ❌     | ✅     |                  |
-| **mips64**  | ❌         | ❌       | ❌  | ❌     | ✅     |                  |
-| **riscv64** | ❌         | ❌       | ❌  | ❌     | ✅     |                  |
-| **wasm**    | ❌         | ❌       | ❌  | ❌     | ✅     |                  |
-
-Any platform with no hard float support (soft float using [softfloat64](git.gammaspectra.live/P2Pool/softfloat64)) will be vastly slow.
+Any platform with no hard float support or when enabled manually will use soft float, using [softfloat64](https://git.gammaspectra.live/P2Pool/softfloat64). This will be very slow.
 
 Native hard float can be added with supporting rounding mode under _asm_.
+
+JIT only supported under Unix systems (Linux, *BSD, macOS), and can be hard-disabled via the `disable_jit` build flag, or at runtime.
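Note on the build flags mentioned above: selection between the hardware path, the soft-float `purego` path and the JIT is typically wired through Go build constraints plus a runtime switch. The sketch below only illustrates that mechanism; the identifiers (`jitEnabled`, `DisableJIT`) are assumptions, not this package's actual API.

```go
// Hedged sketch of a build-time tag combined with a runtime switch; the names
// here are hypothetical and do not claim to match this repository's code.
package sketch

// A file guarded with `//go:build disable_jit` would flip this default to
// false at build time, mirroring the behaviour described in the README.
var jitEnabled = true

// DisableJIT turns JIT compilation off at runtime for VMs created afterwards.
func DisableJIT() { jitEnabled = false }

// useJIT reports whether a newly created VM should compile programs to native
// code instead of interpreting them.
func useJIT() bool { return jitEnabled }
```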

View file

@@ -307,11 +307,11 @@ var slot10 = []*Instruction{&IMUL_RCP}
 // SuperScalarInstruction superscalar program is built with superscalar instructions
 type SuperScalarInstruction struct {
     Opcode byte
-    Dst_Reg int
-    Src_Reg int
+    Dst int
+    Src int
     Mod byte
     Imm32 uint32
-    Type int
+    Imm64 uint64
     OpGroup int
     OpGroupPar int
     GroupParIsSource int
@@ -320,17 +320,15 @@ type SuperScalarInstruction struct {
 }
 
 func (sins *SuperScalarInstruction) FixSrcReg() {
-    if sins.Src_Reg >= 0 {
-        // do nothing
-    } else {
-        sins.Src_Reg = sins.Dst_Reg
+    if sins.Src == 0xff {
+        sins.Src = sins.Dst
     }
 }
 
 func (sins *SuperScalarInstruction) Reset() {
     sins.Opcode = 99
-    sins.Src_Reg = -1
-    sins.Dst_Reg = -1
+    sins.Src = 0xff
+    sins.Dst = 0xff
     sins.CanReuse = false
     sins.GroupParIsSource = 0
 }
@@ -406,6 +404,8 @@ func create(sins *SuperScalarInstruction, ins *Instruction, gen *Blake2Generator
             }
         }
 
+        sins.Imm64 = randomx_reciprocal(sins.Imm32)
+
         sins.OpGroup = S_IMUL_RCP
 
     default:
@@ -450,11 +450,11 @@ func CreateSuperScalarInstruction(sins *SuperScalarInstruction, gen *Blake2Gener
 type SuperScalarProgram []SuperScalarInstruction
 
 func (p SuperScalarProgram) setAddressRegister(addressRegister int) {
-    p[0].Dst_Reg = addressRegister
+    p[0].Dst = addressRegister
 }
 
 func (p SuperScalarProgram) AddressRegister() int {
-    return p[0].Dst_Reg
+    return p[0].Dst
 }
 
 func (p SuperScalarProgram) Program() []SuperScalarInstruction {
     return p[1:]
@@ -569,9 +569,9 @@ func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram {
             depcycle = scheduleCycle + mop.GetLatency() // calculate when will the result be ready
 
             if macro_op_index == sins.ins.ResultOP { // fix me
-                registers[sins.Dst_Reg].Latency = depcycle
-                registers[sins.Dst_Reg].LastOpGroup = sins.OpGroup
-                registers[sins.Dst_Reg].LastOpPar = sins.OpGroupPar
+                registers[sins.Dst].Latency = depcycle
+                registers[sins.Dst].LastOpGroup = sins.OpGroup
+                registers[sins.Dst].LastOpPar = sins.OpGroupPar
             }
@@ -609,12 +609,12 @@ func Build_SuperScalar_Program(gen *Blake2Generator) SuperScalarProgram {
         if i == 0 {
             continue
         }
-        lastdst := asic_latencies[program[i].Dst_Reg] + 1
+        lastdst := asic_latencies[program[i].Dst] + 1
         lastsrc := 0
-        if program[i].Dst_Reg != program[i].Src_Reg {
-            lastsrc = asic_latencies[program[i].Src_Reg] + 1
+        if program[i].Dst != program[i].Src {
+            lastsrc = asic_latencies[program[i].Src] + 1
         }
-        asic_latencies[program[i].Dst_Reg] = max(lastdst, lastsrc)
+        asic_latencies[program[i].Dst] = max(lastdst, lastsrc)
     }
 
     asic_latency_max := 0
@@ -719,18 +719,18 @@ func (sins *SuperScalarInstruction) SelectSource(preAllocatedAvailableRegisters
     if len(available_registers) == 2 && sins.Opcode == S_IADD_RS {
         if available_registers[0] == RegisterNeedsDisplacement || available_registers[1] == RegisterNeedsDisplacement {
-            sins.Src_Reg = RegisterNeedsDisplacement
-            sins.OpGroupPar = sins.Src_Reg
+            sins.Src = RegisterNeedsDisplacement
+            sins.OpGroupPar = sins.Src
             return true
         }
     }
 
-    if selectRegister(available_registers, gen, &sins.Src_Reg) {
+    if selectRegister(available_registers, gen, &sins.Src) {
         if sins.GroupParIsSource == 0 {
 
         } else {
-            sins.OpGroupPar = sins.Src_Reg
+            sins.OpGroupPar = sins.Src
         }
         return true
     }
@@ -741,7 +741,7 @@ func (sins *SuperScalarInstruction) SelectDestination(preAllocatedAvailableRegis
     preAllocatedAvailableRegisters = preAllocatedAvailableRegisters[:0]
 
     for i := range Registers {
-        if Registers[i].Latency <= cycle && (sins.CanReuse || i != sins.Src_Reg) &&
+        if Registers[i].Latency <= cycle && (sins.CanReuse || i != sins.Src) &&
             (allowChainedMul || sins.OpGroup != S_IMUL_R || Registers[i].LastOpGroup != S_IMUL_R) &&
             (Registers[i].LastOpGroup != sins.OpGroup || Registers[i].LastOpPar != sins.OpGroupPar) &&
             (sins.Opcode != S_IADD_RS || i != RegisterNeedsDisplacement) {
@@ -749,7 +749,7 @@ func (sins *SuperScalarInstruction) SelectDestination(preAllocatedAvailableRegis
         }
     }
 
-    return selectRegister(preAllocatedAvailableRegisters, gen, &sins.Dst_Reg)
+    return selectRegister(preAllocatedAvailableRegisters, gen, &sins.Dst)
 }
 
 func selectRegister(available_registers []int, gen *Blake2Generator, reg *int) bool {
@@ -776,25 +776,25 @@ func executeSuperscalar(p []SuperScalarInstruction, r *RegisterLine) {
         ins := &p[i]
         switch ins.Opcode {
         case S_ISUB_R:
-            r[ins.Dst_Reg] -= r[ins.Src_Reg]
+            r[ins.Dst] -= r[ins.Src]
         case S_IXOR_R:
-            r[ins.Dst_Reg] ^= r[ins.Src_Reg]
+            r[ins.Dst] ^= r[ins.Src]
         case S_IADD_RS:
-            r[ins.Dst_Reg] += r[ins.Src_Reg] << ins.Imm32
+            r[ins.Dst] += r[ins.Src] << ins.Imm32
         case S_IMUL_R:
-            r[ins.Dst_Reg] *= r[ins.Src_Reg]
+            r[ins.Dst] *= r[ins.Src]
         case S_IROR_C:
-            r[ins.Dst_Reg] = bits.RotateLeft64(r[ins.Dst_Reg], 0-int(ins.Imm32))
+            r[ins.Dst] = bits.RotateLeft64(r[ins.Dst], 0-int(ins.Imm32))
         case S_IADD_C7, S_IADD_C8, S_IADD_C9:
-            r[ins.Dst_Reg] += signExtend2sCompl(ins.Imm32)
+            r[ins.Dst] += signExtend2sCompl(ins.Imm32)
         case S_IXOR_C7, S_IXOR_C8, S_IXOR_C9:
-            r[ins.Dst_Reg] ^= signExtend2sCompl(ins.Imm32)
+            r[ins.Dst] ^= signExtend2sCompl(ins.Imm32)
         case S_IMULH_R:
-            r[ins.Dst_Reg], _ = bits.Mul64(r[ins.Dst_Reg], r[ins.Src_Reg])
+            r[ins.Dst], _ = bits.Mul64(r[ins.Dst], r[ins.Src])
         case S_ISMULH_R:
-            r[ins.Dst_Reg] = smulh(int64(r[ins.Dst_Reg]), int64(r[ins.Src_Reg]))
+            r[ins.Dst] = smulh(int64(r[ins.Dst]), int64(r[ins.Src]))
         case S_IMUL_RCP:
-            r[ins.Dst_Reg] *= randomx_reciprocal(ins.Imm32)
+            r[ins.Dst] *= ins.Imm64
         }
     }
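The new `Imm64` field stores `randomx_reciprocal(sins.Imm32)` once at program-generation time, so the interpreter above (and the JIT below) multiplies by a precomputed constant instead of recomputing the reciprocal on every `IMUL_RCP`. For reference, here is a minimal Go sketch of the reciprocal as the RandomX reference code defines it, floor(2^x / divisor) for the largest x that keeps the result below 2^64; it mirrors the reference algorithm and may differ cosmetically from the repository's own `randomx_reciprocal`:

```go
package main

import (
	"fmt"
	"math/bits"
)

// reciprocal returns floor(2^x / divisor) for the largest x such that the
// result still fits in 64 bits. The superscalar generator never emits
// IMUL_RCP for a divisor of zero or a power of two, so those are excluded.
func reciprocal(divisor uint32) uint64 {
	d := uint64(divisor)
	const p2exp63 = uint64(1) << 63
	quotient, remainder := p2exp63/d, p2exp63%d
	for shift := 0; shift < bits.Len64(d); shift++ {
		if remainder >= d-remainder {
			quotient, remainder = quotient*2+1, remainder*2-d
		} else {
			quotient, remainder = quotient*2, remainder*2
		}
	}
	return quotient
}

func main() {
	// floor(2^65 / 3): multiplying by this constant (keeping the low 64 bits)
	// is how IMUL_RCP emulates division by the 32-bit immediate.
	fmt.Println(reciprocal(3)) // 12297829382473034410
}
```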

View file

@@ -27,8 +27,8 @@ func generateSuperscalarCode(scalarProgram SuperScalarProgram) SuperScalarProgra
 
     for i := range p {
         instr := &p[i]
-        dst := instr.Dst_Reg % RegistersCount
-        src := instr.Src_Reg % RegistersCount
+        dst := instr.Dst % RegistersCount
+        src := instr.Src % RegistersCount
 
         switch instr.Opcode {
         case S_ISUB_R:
@@ -80,9 +80,9 @@ func generateSuperscalarCode(scalarProgram SuperScalarProgram) SuperScalarProgra
             program = append(program, byte(0xc2+8*dst))
         case S_IMUL_RCP:
             program = append(program, MOV_RAX_I...)
-            program = binary.LittleEndian.AppendUint64(program, randomx_reciprocal(instr.Imm32))
+            program = binary.LittleEndian.AppendUint64(program, instr.Imm64)
             program = append(program, REX_IMUL_RM...)
-            program = append(program, byte(0xc0+8*instr.Dst_Reg))
+            program = append(program, byte(0xc0+8*instr.Dst))
         default:
             panic("unreachable")
         }
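For the `S_IMUL_RCP` case above, the emitted machine code is `mov rax, imm64` followed by a register-direct `imul dst, rax`: `byte(0xc0+8*instr.Dst)` is the x86 ModRM byte with mod=11, the destination register in the reg field, and r/m=0 selecting RAX (assuming, as the surrounding code suggests, that `MOV_RAX_I` and `REX_IMUL_RM` hold the corresponding prefix/opcode bytes). A tiny sketch of that byte arithmetic:

```go
package main

import "fmt"

// modRM builds an x86 ModRM byte for register-direct operands (mod = 11).
// reg and rm are 3-bit register indices; indices above 7 need REX bits.
func modRM(reg, rm byte) byte {
	return 0xC0 | reg<<3 | rm
}

func main() {
	dst := byte(5)
	// Matches byte(0xc0+8*dst) in generateSuperscalarCode: destination in the
	// reg field, RAX (index 0) as the r/m operand of IMUL r64, r/m64.
	fmt.Printf("%#x == %#x\n", modRM(dst, 0), 0xc0+8*dst)
}
```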