// go-randomx/superscalar.go
/*
Copyright (c) 2019 DERO Foundation. All rights reserved.
Redistribution and use in source and binary forms, with or without modification,
are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
package randomx
import "fmt"
import "math/bits"
type ExecutionPort byte
const (
Null ExecutionPort = iota
P0 = 1
P1 = 2
P5 = 4
P01 = P0 | P1
P05 = P0 | P5
P015 = P0 | P1 | P5
)
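// A MacroOP describes one x86 macro-operation: its encoded size in bytes,
// its latency in cycles, and the one or two micro-ops (UOP1/UOP2) it splits
// into. Dependent marks a macro-op whose scheduling must wait for the result
// of the previous macro-op of the same instruction.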
type MacroOP struct {
Name string
Size int
Latency int
UOP1 ExecutionPort
UOP2 ExecutionPort
Dependent bool
}
func (m *MacroOP) GetSize() int {
return m.Size
}
func (m *MacroOP) GetLatency() int {
return m.Latency
}
func (m *MacroOP) GetUOP1() ExecutionPort {
return m.UOP1
}
func (m *MacroOP) GetUOP2() ExecutionPort {
return m.UOP2
}
func (m *MacroOP) IsSimple() bool {
return m.UOP2 == Null
}
func (m *MacroOP) IsEliminated() bool {
return m.UOP1 == Null
}
func (m *MacroOP) IsDependent() bool {
return m.Dependent
}
// 3 byte instructions
var M_NOP = MacroOP{"NOP", 0, 0, Null, Null, false}
var M_Add_rr = MacroOP{"add r,r", 3, 1, P015, Null, false}
var M_Sub_rr = MacroOP{"sub r,r", 3, 1, P015, Null, false}
var M_Xor_rr = MacroOP{"xor r,r", 3, 1, P015, Null, false}
var M_Imul_r = MacroOP{"imul r", 3, 4, P1, P5, false}
var M_Mul_r = MacroOP{"mul r", 3, 4, P1, P5, false}
var M_Mov_rr = MacroOP{"mov r,r", 3, 0, Null, Null, false}
// dependent version: the current instruction depends on the result of the previous one, so latency is 1 lower
var M_Imul_r_dependent = MacroOP{"imul r", 3, 3, P1, Null, true}
// Size: 4 bytes
var M_Lea_SIB = MacroOP{"lea r,r+r*s", 4, 1, P01, Null, false}
var M_Imul_rr = MacroOP{"imul r,r", 4, 3, P1, Null, false}
var M_Ror_ri = MacroOP{"ror r,i", 4, 1, P05, Null, false}
// Size: 7 bytes (can be optionally padded with nop to 8 or 9 bytes)
var M_Add_ri = MacroOP{"add r,i", 7, 1, P015, Null, false}
var M_Xor_ri = MacroOP{"xor r,i", 7, 1, P015, Null, false}
// Size: 10 bytes
var M_Mov_ri64 = MacroOP{"mov rax,i64", 10, 1, P015, Null, false}
// unused macro-ops are not implemented
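// An Instruction is a template for a superscalar instruction. Simple
// instructions consist of a single macro-op (UOP); composite ones list their
// macro-ops in UOP_Array. SrcOP, DstOP and ResultOP are the macro-op indices
// at which the source register is read, the destination register is selected,
// and the result is retired, respectively.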
type Instruction struct {
Name string
Opcode byte
UOP MacroOP
SrcOP int
ResultOP int
DstOP int
UOP_Array []MacroOP
}
func (ins *Instruction) GetUOPCount() int {
if len(ins.UOP_Array) != 0 {
return len(ins.UOP_Array)
} else {
if ins.Name == "NOP" { // NOP is assumed to contribute zero macro-ops
return 0
}
return 1
}
}
func (ins *Instruction) GetSize() int {
if len(ins.UOP_Array) != 0 {
sum_size := 0
for i := range ins.UOP_Array {
sum_size += ins.UOP_Array[i].GetSize()
}
return sum_size
} else {
return ins.UOP.GetSize()
}
}
func (ins *Instruction) IsSimple() bool {
return ins.GetSize() == 1
}
func (ins *Instruction) GetLatency() int {
if len(ins.UOP_Array) != 0 {
sum := 0
for i := range ins.UOP_Array {
sum += ins.UOP_Array[i].GetLatency()
}
return sum
} else {
return ins.UOP.GetLatency()
}
}
const (
S_INVALID int = -1
S_ISUB_R = 0
S_IXOR_R = 1
S_IADD_RS = 2
S_IMUL_R = 3
S_IROR_C = 4
S_IADD_C7 = 5
S_IXOR_C7 = 6
S_IADD_C8 = 7
S_IXOR_C8 = 8
S_IADD_C9 = 9
S_IXOR_C9 = 10
S_IMULH_R = 11
S_ISMULH_R = 12
S_IMUL_RCP = 13
)
var Opcode_To_String = map[int]string{S_INVALID: "INVALID",
S_ISUB_R: "ISUB_R",
S_IXOR_R: "IXOR_R",
S_IADD_RS: "IADD_RS",
S_IMUL_R: "IMUL_R",
S_IROR_C: "IROR_C",
S_IADD_C7: "IADD_C7",
S_IXOR_C7: "IXOR_C7",
S_IADD_C8: "IADD_C8",
S_IXOR_C8: "IXOR_C8",
S_IADD_C9: "IADD_C9",
S_IXOR_C9: "IXOR_C9",
S_IMULH_R: "IMULH_R",
S_ISMULH_R: "ISMULH_R",
S_IMUL_RCP: "IMUL_RCP",
}
// SrcOP/DstOP are used to select registers
var ISUB_R = Instruction{Name: "ISUB_R", Opcode: S_ISUB_R, UOP: M_Sub_rr, SrcOP: 0}
var IXOR_R = Instruction{Name: "IXOR_R", Opcode: S_IXOR_R, UOP: M_Xor_rr, SrcOP: 0}
var IADD_RS = Instruction{Name: "IADD_RS", Opcode: S_IADD_RS, UOP: M_Lea_SIB, SrcOP: 0}
var IMUL_R = Instruction{Name: "IMUL_R", Opcode: S_IMUL_R, UOP: M_Imul_rr, SrcOP: 0}
var IROR_C = Instruction{Name: "IROR_C", Opcode: S_IROR_C, UOP: M_Ror_ri, SrcOP: -1}
var IADD_C7 = Instruction{Name: "IADD_C7", Opcode: S_IADD_C7, UOP: M_Add_ri, SrcOP: -1}
var IXOR_C7 = Instruction{Name: "IXOR_C7", Opcode: S_IXOR_C7, UOP: M_Xor_ri, SrcOP: -1}
var IADD_C8 = Instruction{Name: "IADD_C8", Opcode: S_IADD_C8, UOP: M_Add_ri, SrcOP: -1}
var IXOR_C8 = Instruction{Name: "IXOR_C8", Opcode: S_IXOR_C8, UOP: M_Xor_ri, SrcOP: -1}
var IADD_C9 = Instruction{Name: "IADD_C9", Opcode: S_IADD_C9, UOP: M_Add_ri, SrcOP: -1}
var IXOR_C9 = Instruction{Name: "IXOR_C9", Opcode: S_IXOR_C9, UOP: M_Xor_ri, SrcOP: -1}
var IMULH_R = Instruction{Name: "IMULH_R", Opcode: S_IMULH_R, UOP_Array: []MacroOP{M_Mov_rr, M_Mul_r, M_Mov_rr}, ResultOP: 1, DstOP: 0, SrcOP: 1}
var ISMULH_R = Instruction{Name: "ISMULH_R", Opcode: S_ISMULH_R, UOP_Array: []MacroOP{M_Mov_rr, M_Imul_r, M_Mov_rr}, ResultOP: 1, DstOP: 0, SrcOP: 1}
var IMUL_RCP = Instruction{Name: "IMUL_RCP", Opcode: S_IMUL_RCP, UOP_Array: []MacroOP{M_Mov_ri64, M_Imul_r_dependent}, ResultOP: 1, DstOP: 1, SrcOP: -1}
var INOP = Instruction{Name: "NOP", UOP: M_NOP}
// how random 16 bytes are split into instructions
var buffer0 = []int{4, 8, 4}
var buffer1 = []int{7, 3, 3, 3}
var buffer2 = []int{3, 7, 3, 3}
var buffer3 = []int{4, 9, 3}
var buffer4 = []int{4, 4, 4, 4}
var buffer5 = []int{3, 3, 10}
var Decoder_To_Instruction_Length = [][]int{{4, 8, 4},
{7, 3, 3, 3},
{3, 7, 3, 3},
{4, 9, 3},
{4, 4, 4, 4},
{3, 3, 10}}
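// Each decoder configuration splits one 16-byte fetch window into macro-op
// slots (the per-row lengths above always sum to 16 bytes). The DecoderType
// values below index into Decoder_To_Instruction_Length.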
type DecoderType int
const Decoder484 DecoderType = 0
const Decoder7333 DecoderType = 1
const Decoder3733 DecoderType = 2
const Decoder493 DecoderType = 3
const Decoder4444 DecoderType = 4
const Decoder3310 DecoderType = 5
func (d DecoderType) GetSize() int {
switch d {
case Decoder484:
return 3
case Decoder7333:
return 4
case Decoder3733:
return 4
case Decoder493:
return 3
case Decoder4444:
return 4
case Decoder3310:
return 3
default:
panic("unknown decoder")
}
}
func (d DecoderType) String() string {
switch d {
case Decoder484:
return "Decoder484"
case Decoder7333:
return "Decoder7333"
case Decoder3733:
return "Decoder3733"
case Decoder493:
return "Decoder493"
case Decoder4444:
return "Decoder4444"
case Decoder3310:
return "Decoder3310"
default:
panic("unknown decoder")
}
}
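// FetchNextDecoder picks the decoder configuration for the next fetch window:
// after IMULH_R/ISMULH_R the 3-3-10 buffer is forced, a lagging multiplication
// count forces 4-4-4-4 to keep the multiplication port busy, IMUL_RCP forces a
// buffer starting with a 4-byte slot (4-8-4 or 4-9-3), and otherwise one of
// four configurations is chosen at random from the generator.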
func FetchNextDecoder(ins *Instruction, cycle int, mulcount int, gen *Blake2Generator) DecoderType {
if ins.Name == IMULH_R.Name || ins.Name == ISMULH_R.Name {
return Decoder3310
}
// make sure the multiplication port stays saturated; if the number of multiplications is less than the number of cycles, a 4-4-4-4 decoder is returned
if mulcount < (cycle + 1) {
return Decoder4444
}
if ins.Name == IMUL_RCP.Name {
if gen.GetByte()&1 == 1 {
return Decoder484
} else {
return Decoder493
}
}
// otherwise, select a decoder at random
rnd_byte := gen.GetByte()
switch rnd_byte & 3 {
case 0:
return Decoder484
case 1:
return Decoder7333
case 2:
return Decoder3733
case 3:
return Decoder493
}
panic("can never reach")
return Decoder484
}
var slot3 = []*Instruction{&ISUB_R, &IXOR_R} // 3 length instruction will be filled with these
var slot3L = []*Instruction{&ISUB_R, &IXOR_R, &IMULH_R, &ISMULH_R}
var slot4 = []*Instruction{&IROR_C, &IADD_RS}
var slot7 = []*Instruction{&IXOR_C7, &IADD_C7}
var slot8 = []*Instruction{&IXOR_C8, &IADD_C8}
var slot9 = []*Instruction{&IXOR_C9, &IADD_C9}
var slot10 = []*Instruction{&IMUL_RCP}
// a superscalar program is built from superscalar instructions
type SuperScalarInstruction struct {
Opcode byte
Dst_Reg int
Src_Reg int
Mod byte
Imm32 uint32
Type int
Name string
OpGroup int
OpGroupPar int
GroupParIsSource int
ins *Instruction
CanReuse bool
}
func (sins SuperScalarInstruction) String() string {
result := fmt.Sprintf("; %10s %2d ", sins.Name, sins.Opcode)
result += fmt.Sprintf("dst r%d ", sins.Dst_Reg)
if sins.Src_Reg >= 0 {
result += fmt.Sprintf("src r%d ", sins.Src_Reg)
} else {
result += fmt.Sprintf("src r%d ", sins.Dst_Reg)
}
result += fmt.Sprintf("Mod %08x ", sins.Mod)
result += fmt.Sprintf("Imm %08x ", sins.Imm32)
return result
}
func (sins *SuperScalarInstruction) FixSrcReg() {
if sins.Src_Reg >= 0 {
// do nothing
} else {
sins.Src_Reg = sins.Dst_Reg
}
}
func (sins *SuperScalarInstruction) Reset() {
sins.Opcode = 99
sins.Src_Reg = -1
sins.Dst_Reg = -1
sins.CanReuse = false
sins.GroupParIsSource = 0
}
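// create fills a SuperScalarInstruction from an Instruction template, drawing
// any Mod byte or immediate from the Blake2 generator. For IMUL_RCP the
// immediate is re-drawn until it is neither zero nor a power of two, since
// randomx_reciprocal is undefined for those divisors.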
func create(sins *SuperScalarInstruction, ins *Instruction, gen *Blake2Generator) {
sins.Reset()
sins.ins = ins
sins.Name = ins.Name
sins.OpGroupPar = -1
sins.Opcode = ins.Opcode
switch ins.Name {
case ISUB_R.Name:
//fmt.Printf("%s \n", ins.Name)
sins.Name = ins.Name
sins.Mod = 0
sins.Imm32 = 0
sins.OpGroup = S_IADD_RS
sins.GroupParIsSource = 1
case IXOR_R.Name:
//fmt.Printf("%s \n", ins.Name)
sins.Name = ins.Name
sins.Mod = 0
sins.Imm32 = 0
sins.OpGroup = S_IXOR_R
sins.GroupParIsSource = 1
case IADD_RS.Name:
//fmt.Printf("q %s \n", ins.Name)
sins.Name = ins.Name
sins.Mod = gen.GetByte()
sins.Imm32 = uint32((sins.Mod & 0b1100) >> 2) // bits 2-3
//sins.Imm32 = 0
sins.OpGroup = S_IADD_RS
sins.GroupParIsSource = 1
case IMUL_R.Name:
//fmt.Printf("%s \n", ins.Name)
sins.Name = ins.Name
sins.Mod = 0
sins.Imm32 = 0
sins.OpGroup = S_IMUL_R
sins.GroupParIsSource = 1
case IROR_C.Name:
//fmt.Printf("%s \n", ins.Name)
sins.Name = ins.Name
sins.Mod = 0
for sins.Imm32 = 0; sins.Imm32 == 0; {
sins.Imm32 = uint32(gen.GetByte() & 63)
}
sins.OpGroup = S_IROR_C
sins.OpGroupPar = -1
case IADD_C7.Name, IADD_C8.Name, IADD_C9.Name:
//fmt.Printf("%s \n", ins.Name)
sins.Name = ins.Name
sins.Mod = 0
sins.Imm32 = gen.GetUint32()
sins.OpGroup = S_IADD_C7
sins.OpGroupPar = -1
case IXOR_C7.Name, IXOR_C8.Name, IXOR_C9.Name:
//fmt.Printf("%s \n", ins.Name)
sins.Name = ins.Name
sins.Mod = 0
sins.Imm32 = gen.GetUint32()
sins.OpGroup = S_IXOR_C7
sins.OpGroupPar = -1
case IMULH_R.Name:
//fmt.Printf("%s \n", ins.Name)
sins.Name = ins.Name
sins.CanReuse = true
sins.Mod = 0
sins.Imm32 = 0
sins.OpGroup = S_IMULH_R
sins.OpGroupPar = int(gen.GetUint32())
case ISMULH_R.Name:
//fmt.Printf("%s \n", ins.Name)
sins.Name = ins.Name
sins.CanReuse = true
sins.Mod = 0
sins.Imm32 = 0
sins.OpGroup = S_ISMULH_R
sins.OpGroupPar = int(gen.GetUint32())
case IMUL_RCP.Name:
//fmt.Printf("%s \n", ins.Name)
sins.Name = ins.Name
sins.Mod = 0
for {
sins.Imm32 = gen.GetUint32()
if (sins.Imm32 & (sins.Imm32 - 1)) != 0 { // accept only values that are neither zero nor a power of two
break
}
}
sins.OpGroup = S_IMUL_RCP
default:
fmt.Printf("%s \n", ins.Name)
panic("should not occur")
}
}
func CreateSuperScalarInstruction(sins *SuperScalarInstruction, gen *Blake2Generator, instruction_len int, decoder_type int, islast, isfirst bool) {
//fmt.Printf("instruction len %d\n", instruction_len)
switch instruction_len {
case 3:
if islast {
create(sins, slot3L[gen.GetByte()&3], gen)
} else {
create(sins, slot3[gen.GetByte()&1], gen)
}
case 4:
//if this is the 4-4-4-4 buffer, issue multiplications as the first 3 instructions
if decoder_type == int(Decoder4444) && !islast {
create(sins, &IMUL_R, gen)
} else {
create(sins, slot4[gen.GetByte()&1], gen)
}
case 7:
create(sins, slot7[gen.GetByte()&1], gen)
case 8:
//fmt.Printf("creating 8\n")
create(sins, slot8[gen.GetByte()&1], gen)
case 9:
create(sins, slot9[gen.GetByte()&1], gen)
case 10:
create(sins, slot10[0], gen)
default:
panic("should not be possible")
}
}
type SuperScalarProgram struct {
Ins []SuperScalarInstruction // all instructions of program
AddressReg int
}
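// Build_SuperScalar_Program generates a superscalar program by simulating
// instruction decoding and port scheduling on the reference CPU for up to
// RANDOMX_SUPERSCALAR_LATENCY cycles. The register whose result has the
// highest estimated ASIC latency is recorded as AddressReg and later selects
// the next cache block during dataset initialization.
//
// A minimal usage sketch (the Blake2Generator constructor lives elsewhere in
// this package, so the name used here is only illustrative):
//
//   gen := newBlake2Generator(seed) // hypothetical constructor
//   prog := Build_SuperScalar_Program(gen)
//   var r [8]uint64
//   prog.executeSuperscalar_nocache(r[:])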
func Build_SuperScalar_Program(gen *Blake2Generator) *SuperScalarProgram {
cycle := 0
depcycle := 0
//retire_cycle := 0
mulcount := 0
ports_saturated := false
program_size := 0
//current_instruction := INOP
macro_op_index := 0
macro_op_count := 0
throwAwayCount := 0
code_size := 0
var program SuperScalarProgram
preAllocatedRegisters := gen.allocRegIndex[:]
registers := gen.allocRegisters[:]
for i := range registers {
registers[i] = Register{}
}
sins := &SuperScalarInstruction{}
sins.ins = &Instruction{Name: "NOP"}
portbusy := make([][]int, CYCLE_MAP_SIZE)
for i := range portbusy {
portbusy[i] = make([]int, 3)
}
done := 0
for decode_cycle := 0; decode_cycle < RANDOMX_SUPERSCALAR_LATENCY && !ports_saturated && program_size < SuperscalarMaxSize; decode_cycle++ {
decoder := FetchNextDecoder(sins.ins, decode_cycle, mulcount, gen)
//fmt.Printf("; ------------- fetch cycle %d (%s)\n", cycle, decoder)
/* for i := range portbusy {
for j := range portbusy[i]{
portbusy[i][j]=false
}
}*/
buffer_index := 0
for buffer_index < decoder.GetSize() { // generate instructions for the current decoder
top_cycle := cycle
//fmt.Printf("macro_op_index %d current_instruction %s actual instruction uop %d\n", macro_op_index, current_instruction.Name, sins.ins.GetUOPCount())
if macro_op_index >= sins.ins.GetUOPCount() {
if ports_saturated || program_size >= SuperscalarMaxSize {
//panic("breaking off") program built successfully
break
}
CreateSuperScalarInstruction(sins, gen, Decoder_To_Instruction_Length[int(decoder)][buffer_index], int(decoder), len(Decoder_To_Instruction_Length[decoder]) == (buffer_index+1), buffer_index == 0)
macro_op_index = 0
}
mop := sins.ins.UOP
if sins.ins.GetUOPCount() != 1 {
mop = sins.ins.UOP_Array[macro_op_index]
}
//fmt.Printf("MOP name %s depcycle %d\n", mop.Name, depcycle)
//calculate the earliest cycle when this macro-op (all of its uOPs) can be scheduled for execution
scheduleCycle := ScheduleMop(&mop, portbusy, cycle, depcycle, false)
if scheduleCycle < 0 {
//fmt.Printf("Unable to map operation %s to execution port (cycle %d)", mop.Name, cycle)
//__debugbreak();
ports_saturated = true
break
}
//fmt.Printf("scheduleCycle %d\n", scheduleCycle)
if macro_op_index == sins.ins.SrcOP { // FIXME
forward := 0
for ; forward < LOOK_FORWARD_CYCLES && !sins.SelectSource(preAllocatedRegisters, scheduleCycle, registers, gen); forward++ {
//fmt.Printf(";src STALL at cycle %d\n", cycle)
scheduleCycle++
cycle++
}
if forward == LOOK_FORWARD_CYCLES {
if throwAwayCount < MAX_THROWAWAY_COUNT {
throwAwayCount++
macro_op_index = sins.ins.GetUOPCount()
//fmt.Printf(";throwAway %s\n", sins.Name)
continue
}
//fmt.Printf("aborting at cycle %d source registers not available", cycle)
break
}
//fmt.Printf("; src = r%d\n", sins.Src_Reg)
}
if macro_op_index == sins.ins.DstOP { // FIXME
forward := 0
for ; forward < LOOK_FORWARD_CYCLES && !sins.SelectDestination(preAllocatedRegisters, scheduleCycle, throwAwayCount > 0, registers, gen); forward++ {
//fmt.Printf(";dst STALL at cycle %d\n", cycle)
scheduleCycle++
cycle++
}
if forward == LOOK_FORWARD_CYCLES {
if throwAwayCount < MAX_THROWAWAY_COUNT {
throwAwayCount++
macro_op_index = sins.ins.GetUOPCount()
//fmt.Printf(";throwAway %s\n", sins.Name)
continue
}
//fmt.Printf("aborting at cycle %d destination registers not available", cycle)
break
}
//fmt.Printf("; dst = r%d\n", sins.Dst_Reg)
}
throwAwayCount = 0
// recalculate when the instruction can be scheduled based on operand availability
scheduleCycle = ScheduleMop(&mop, portbusy, scheduleCycle, scheduleCycle, true)
depcycle = scheduleCycle + mop.GetLatency() // calculate when will the result be ready
if macro_op_index == sins.ins.ResultOP { // fix me
//retire_cycle = depcycle
//fmt.Printf("; RETIRED at cycle %d Dst_Reg %d\n", retire_cycle, sins.Dst_Reg)
registers[sins.Dst_Reg].Latency = depcycle
registers[sins.Dst_Reg].LastOpGroup = sins.OpGroup
registers[sins.Dst_Reg].LastOpPar = sins.OpGroupPar
}
code_size += mop.GetSize()
buffer_index++
macro_op_index++
macro_op_count++
// terminating condition for 99% case
if scheduleCycle >= RANDOMX_SUPERSCALAR_LATENCY {
ports_saturated = true
}
cycle = top_cycle
// when all uops of the current instruction have been issued, add the instruction to the superscalar program
if macro_op_index >= sins.ins.GetUOPCount() {
sins.FixSrcReg() // fix src register once and for all
program.Ins = append(program.Ins, *sins)
program_size++ // count emitted instructions against SuperscalarMaxSize
if sins.ins.Name == "IMUL_R" || sins.ins.Name == "IMULH_R" || sins.ins.Name == "ISMULH_R" || sins.ins.Name == "IMUL_RCP" {
mulcount++
}
}
done++
// if done >= 20 {break}
}
cycle++
}
/*
for i := range program.Ins {
fmt.Printf("%d %s\n", i, program.Ins[i].String())
}
*/
var asic_latencies [8]int
for i := range program.Ins {
//fmt.Printf("%d %s\n",i ,program[i].String() )
lastdst := asic_latencies[program.Ins[i].Dst_Reg] + 1
lastsrc := 0
if program.Ins[i].Dst_Reg != program.Ins[i].Src_Reg {
lastsrc = asic_latencies[program.Ins[i].Src_Reg] + 1
}
asic_latencies[program.Ins[i].Dst_Reg] = Max(lastdst, lastsrc)
}
asic_latency_max := 0
address_reg := 0
for i := range asic_latencies {
//fmt.Printf("latency[%d] %d\n", i, asic_latencies[i])
if asic_latencies[i] > asic_latency_max {
asic_latency_max = asic_latencies[i]
address_reg = i
}
}
program.AddressReg = address_reg
//fmt.Printf("address_reg %d\n", address_reg)
return &program
}
const CYCLE_MAP_SIZE int = RANDOMX_SUPERSCALAR_LATENCY + 4
const LOOK_FORWARD_CYCLES int = 4
const MAX_THROWAWAY_COUNT int = 256
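// portbusy is a CYCLE_MAP_SIZE x 3 matrix: one row per simulated cycle and
// one column per execution port (index 0 = P0, 1 = P1, 2 = P5). A non-zero
// entry records the uop that reserved that port for that cycle.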
// schedule the uop as early as possible
func ScheduleUop(uop ExecutionPort, portbusy [][]int, cycle int, commit bool) int {
//cycle++
for ; cycle < CYCLE_MAP_SIZE; cycle++ { // cycle is passed by value, so the caller's copy is unchanged on return
//fmt.Printf("port busy %+v\n", portbusy[cycle])
//fmt.Printf("current cycle %d portbusy %+v commit %+v\n", cycle, portbusy[cycle], commit)
if (uop&P5) != 0 && portbusy[cycle][2] == 0 {
if commit {
//fmt.Printf("; P5 at cycle %d\n", cycle)
portbusy[cycle][2] = int(uop)
}
//fmt.Printf("P5 available\n")
return cycle
}
if (uop&P0) != 0 && portbusy[cycle][0] == 0 {
if commit {
//fmt.Printf("; P0 at cycle %d\n", cycle)
portbusy[cycle][0] = int(uop)
}
//fmt.Printf("P0 available\n")
return cycle
}
if (uop&P1) != 0 && portbusy[cycle][1] == 0 {
if commit {
//fmt.Printf("; P1 at cycle %d\n", cycle)
portbusy[cycle][1] = int(uop)
}
//fmt.Printf("P1 available\n")
return cycle
}
}
return -1
}
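// ScheduleMop finds the earliest cycle at which a macro-op can execute.
// Eliminated macro-ops (register moves) take no port, simple ones need a slot
// for a single uop, and fused ones must place both of their uops in the same
// cycle, which is why the search only accepts cycle1 == cycle2.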
func ScheduleMop(mop *MacroOP, portbusy [][]int, cycle int, depcycle int, commit bool) int {
if mop.IsDependent() {
//fmt.Printf("dependent\n")
cycle = Max(cycle, depcycle)
}
if mop.IsEliminated() {
if commit {
//fmt.Printf("; (eliminated)\n")
}
return cycle
} else if mop.IsSimple() {
//fmt.Printf("simple 1\n")
return ScheduleUop(mop.GetUOP1(), portbusy, cycle, commit)
} else {
for ; cycle < CYCLE_MAP_SIZE; cycle++ { // cycle is passed by value, so the caller's copy is unchanged on return
cycle1 := ScheduleUop(mop.GetUOP1(), portbusy, cycle, false)
cycle2 := ScheduleUop(mop.GetUOP2(), portbusy, cycle, false)
if cycle1 == cycle2 {
if commit {
ScheduleUop(mop.GetUOP1(), portbusy, cycle, true)
ScheduleUop(mop.GetUOP2(), portbusy, cycle, true)
}
return cycle1
}
}
}
return -1
}
// Max returns the larger of x or y.
func Max(x, y int) int {
if x < y {
return y
}
return x
}
type Register struct {
Value uint64
Latency int
LastOpGroup int
LastOpPar int // -1 = immediate, 0-7 = register
Status int // can be RegisterNeedsDisplacement (5, x86 r13) or RegisterNeedsSib (4, x86 r12)
}
const RegisterNeedsDisplacement = 5
const RegisterNeedsSib = 4
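// SelectSource picks a source register whose value is ready by the given
// cycle. The IADD_RS special case prefers register 5 (RegisterNeedsDisplacement,
// x86 r13) as the source when only two registers are free, because that
// register can never be chosen as an IADD_RS destination.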
func (sins *SuperScalarInstruction) SelectSource(preAllocatedAvailableRegisters []int, cycle int, Registers []Register, gen *Blake2Generator) bool {
available_registers := preAllocatedAvailableRegisters[:0]
for i := range Registers {
//fmt.Printf("\nchecking s reg %d latency %d cycle %d", i, Registers[i].Latency, cycle)
if Registers[i].Latency <= cycle {
available_registers = append(available_registers, i)
//fmt.Printf("available")
}
}
if len(available_registers) == 2 && sins.Name == "IADD_RS" {
if available_registers[0] == RegisterNeedsDisplacement || available_registers[1] == RegisterNeedsDisplacement {
sins.Src_Reg = RegisterNeedsDisplacement
sins.OpGroupPar = sins.Src_Reg
return true
}
}
if selectRegister(available_registers, gen, &sins.Src_Reg) {
if sins.GroupParIsSource != 0 {
sins.OpGroupPar = sins.Src_Reg
}
return true
}
return false
}
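// SelectDestination picks a destination register that is ready by the given
// cycle, is not the source (unless the instruction allows reuse), does not
// chain multiplications of the same group unless explicitly allowed, was not
// just written by the same op group with the same group parameter, and is
// never RegisterNeedsDisplacement for IADD_RS.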
func (sins *SuperScalarInstruction) SelectDestination(preAllocatedAvailableRegisters []int, cycle int, allowChainedMul bool, Registers []Register, gen *Blake2Generator) bool {
preAllocatedAvailableRegisters = preAllocatedAvailableRegisters[:0]
for i := range Registers {
if Registers[i].Latency <= cycle && (sins.CanReuse || i != sins.Src_Reg) &&
(allowChainedMul || sins.OpGroup != S_IMUL_R || Registers[i].LastOpGroup != S_IMUL_R) &&
(Registers[i].LastOpGroup != sins.OpGroup || Registers[i].LastOpPar != sins.OpGroupPar) &&
(sins.Opcode != S_IADD_RS || i != RegisterNeedsDisplacement) {
preAllocatedAvailableRegisters = append(preAllocatedAvailableRegisters, i)
}
}
return selectRegister(preAllocatedAvailableRegisters, gen, &sins.Dst_Reg)
}
func selectRegister(available_registers []int, gen *Blake2Generator, reg *int) bool {
index := 0
if len(available_registers) == 0 {
return false
}
if len(available_registers) > 1 {
tmp := gen.GetUint32()
// fmt.Printf("GetUint32 %d len %d \n", tmp,uint32(len(available_registers)))
index = int(tmp % uint32(len(available_registers)))
} else {
index = 0
}
//fmt.Printf("reg index %d\n", index)
*reg = available_registers[index] // availableRegisters[index];
return true
}
const Mask = CacheSize/CacheLineSize - 1
func getMixBlock(register_value uint64, memory []byte) uint64 {
return (register_value & Mask) * CacheLineSize // mask to a cache-line index, then scale to a byte offset
}
const superscalarMul0 uint64 = 6364136223846793005
const superscalarAdd1 uint64 = 9298411001130361340
const superscalarAdd2 uint64 = 12065312585734608966
const superscalarAdd3 uint64 = 9306329213124626780
const superscalarAdd4 uint64 = 5281919268842080866
const superscalarAdd5 uint64 = 10536153434571861004
const superscalarAdd6 uint64 = 3398623926847679864
const superscalarAdd7 uint64 = 9549104520008361294
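// InitDatasetItem computes one 64-byte dataset item: the eight registers are
// seeded from the item number using superscalarMul0 and the superscalarAdd
// constants, then each of the RANDOMX_CACHE_ACCESSES rounds runs one
// superscalar program and XORs the registers with a cache block selected by
// the previous round's AddressReg register.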
func (cache *Randomx_Cache) InitDatasetItem(out []uint64, itemnumber uint64) {
var rl_array, mix_array [8]uint64
rl := rl_array
mix_block := mix_array[:]
register_value := itemnumber
rl[0] = (itemnumber + 1) * superscalarMul0
rl[1] = rl[0] ^ superscalarAdd1
rl[2] = rl[0] ^ superscalarAdd2
rl[3] = rl[0] ^ superscalarAdd3
rl[4] = rl[0] ^ superscalarAdd4
rl[5] = rl[0] ^ superscalarAdd5
rl[6] = rl[0] ^ superscalarAdd6
rl[7] = rl[0] ^ superscalarAdd7
for i := 0; i < RANDOMX_CACHE_ACCESSES; i++ {
//mix_block_index := getMixBlock(register_value,nil)
cache.Programs[i].executeSuperscalar_nocache(rl[:])
cache.GetBlock(register_value, mix_block)
for q := range rl {
// fmt.Printf("%d rl[%d] %16x mix %16x\n",i, q,rl[q], mix_block[q])
rl[q] ^= mix_block[q]
}
register_value = rl[cache.Programs[i].AddressReg]
// fmt.Printf("%d\n",i)
}
for q := range rl {
out[q] = rl[q]
}
}
func (cache *Randomx_Cache) initDataset(start_item, end_item uint64) {
for itemnumber := start_item; itemnumber < end_item; itemnumber++ {
cache.InitDatasetItem(nil, itemnumber)
// dataset_index += CacheLineSize
//fmt.Printf("exiting dataset item\n")
break
}
}
// execute the superscalar program
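// The program is interpreted over the 8-entry register file r. Note that
// IROR_C is implemented with bits.RotateLeft64 and a negative count, which
// performs a right rotation, and IMULH_R/ISMULH_R keep only the high 64 bits
// of the 128-bit product.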
func (p *SuperScalarProgram) executeSuperscalar_nocache(r []uint64) {
_ = r[7] // bounds check hint to compiler; see golang.org/issue/14808
for i := range p.Ins {
ins := &p.Ins[i]
switch ins.Opcode {
case S_ISUB_R:
r[ins.Dst_Reg] -= r[ins.Src_Reg]
case S_IXOR_R:
r[ins.Dst_Reg] ^= r[ins.Src_Reg]
case S_IADD_RS:
r[ins.Dst_Reg] += r[ins.Src_Reg] << ins.Imm32
case S_IMUL_R:
r[ins.Dst_Reg] *= r[ins.Src_Reg]
case S_IROR_C:
r[ins.Dst_Reg] = bits.RotateLeft64(r[ins.Dst_Reg], 0-int(ins.Imm32))
case S_IADD_C7, S_IADD_C8, S_IADD_C9:
r[ins.Dst_Reg] += signExtend2sCompl(ins.Imm32)
case S_IXOR_C7, S_IXOR_C8, S_IXOR_C9:
r[ins.Dst_Reg] ^= signExtend2sCompl(ins.Imm32)
case S_IMULH_R:
r[ins.Dst_Reg], _ = bits.Mul64(r[ins.Dst_Reg], r[ins.Src_Reg])
case S_ISMULH_R:
r[ins.Dst_Reg] = smulh(int64(r[ins.Dst_Reg]), int64(r[ins.Src_Reg]))
case S_IMUL_RCP:
r[ins.Dst_Reg] *= randomx_reciprocal(ins.Imm32)
}
}
}
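// smulh returns the high 64 bits of the signed 128-bit product a*b. It uses
// the unsigned bits.Mul64 and then corrects the high half: for each negative
// operand, the other operand must be subtracted from the unsigned high half.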
func smulh(a, b int64) uint64 {
hi_, _ := bits.Mul64(uint64(a), uint64(b))
hi := int64(hi_)
if a < 0 {
hi -= b
}
if b < 0 {
hi -= a
}
return uint64(hi)
}
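// randomx_reciprocal returns floor(2^(63+n) / divisor), where n is the bit
// length of the divisor; for a divisor that is neither zero nor a power of
// two the result always has its most significant bit set. Worked example:
// divisor = 3 has bit length 2, quotient = 2^63/3, remainder = 2, so the
// result is floor(2^65/3) = 12297829382473034410 (0xAAAAAAAAAAAAAAAA).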
func randomx_reciprocal(divisor uint32) uint64 {
const p2exp63 uint64 = uint64(1) << 63
quotient := p2exp63 / uint64(divisor)
remainder := p2exp63 % uint64(divisor)
shift := uint32(bits.Len32(divisor))
return (quotient << shift) + ((remainder << shift) / uint64(divisor))
}
func signExtend2sCompl(x uint32) uint64 {
return uint64(int64(int32(x)))
/*
if -1 == (^0) {
return
} else if x > math.MaxInt32 {
return uint64(x) | 0xffffffff00000000
} else {
return uint64(x)
}
*/
}