edwards25519: rewrite amd64 assembly with avo
Mapping almost 1:1 from generic Go lines to Avo lines.

Again a little faster, as a treat. (Probably due to better pipelining of the reduction chains.)

name                    old time/op  new time/op  delta
Add-4                   7.87ns ± 2%  7.93ns ± 1%    ~     (p=0.065 n=10+9)
Mul-4                   18.9ns ± 2%  18.4ns ± 1%  -2.20%  (p=0.000 n=10+9)
Mul32-4                 7.22ns ± 1%  7.19ns ± 1%    ~     (p=0.128 n=10+9)
BasepointMul-4          21.4µs ± 1%  21.1µs ± 1%  -1.54%  (p=0.000 n=10+10)
ScalarMul-4             67.3µs ± 2%  67.2µs ± 1%    ~     (p=0.579 n=10+10)
VartimeDoubleBaseMul-4  62.5µs ± 2%  62.9µs ± 1%    ~     (p=0.436 n=10+10)
MultiscalarMulSize8-4    246µs ± 0%   246µs ± 1%    ~     (p=0.631 n=10+10)
This commit is contained in:
parent
8eb02eb997
commit
c882e8e8ab
296
asm/fe_amd64_asm.go
Normal file
296
asm/fe_amd64_asm.go
Normal file
|
@ -0,0 +1,296 @@
|
|||
// Copyright (c) 2021 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
. "github.com/mmcloughlin/avo/build"
|
||||
. "github.com/mmcloughlin/avo/gotypes"
|
||||
. "github.com/mmcloughlin/avo/operand"
|
||||
. "github.com/mmcloughlin/avo/reg"
|
||||
|
||||
_ "filippo.io/edwards25519"
|
||||
)
|
||||
|
||||
//go:generate go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg edwards25519
|
||||
|
||||
// main drives the avo generator. It declares the target package and the build
// constraint for the generated files, emits the feMul and feSquare routines
// (in that order), and finally calls Generate to write the output.
func main() {
|
||||
Package("filippo.io/edwards25519")
|
||||
ConstraintExpr("amd64,gc,!purego")
|
||||
feMul()
|
||||
feSquare()
|
||||
Generate()
|
||||
}
|
||||
|
||||
type namedComponent struct {
|
||||
Component
|
||||
name string
|
||||
}
|
||||
|
||||
func (c namedComponent) String() string { return c.name }
|
||||
|
||||
type uint128 struct {
|
||||
name string
|
||||
hi, lo GPVirtual
|
||||
}
|
||||
|
||||
func (c uint128) String() string { return c.name }
|
||||
|
||||
func feSquare() {
|
||||
TEXT("feSquare", NOSPLIT, "func(out, a *fieldElement)")
|
||||
Doc("feSquare sets out = a * a. It works like feSquareGeneric.")
|
||||
Pragma("noescape")
|
||||
|
||||
a := Dereference(Param("a"))
|
||||
l0 := namedComponent{a.Field("l0"), "l0"}
|
||||
l1 := namedComponent{a.Field("l1"), "l1"}
|
||||
l2 := namedComponent{a.Field("l2"), "l2"}
|
||||
l3 := namedComponent{a.Field("l3"), "l3"}
|
||||
l4 := namedComponent{a.Field("l4"), "l4"}
|
||||
|
||||
// r0 = l0×l0 + 19×2×(l1×l4 + l2×l3)
|
||||
r0 := uint128{"r0", GP64(), GP64()}
|
||||
mul64(r0, 1, l0, l0)
|
||||
addMul64(r0, 38, l1, l4)
|
||||
addMul64(r0, 38, l2, l3)
|
||||
|
||||
// r1 = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3
|
||||
r1 := uint128{"r1", GP64(), GP64()}
|
||||
mul64(r1, 2, l0, l1)
|
||||
addMul64(r1, 38, l2, l4)
|
||||
addMul64(r1, 19, l3, l3)
|
||||
|
||||
// r2 = = 2×l0×l2 + l1×l1 + 19×2×l3×l4
|
||||
r2 := uint128{"r2", GP64(), GP64()}
|
||||
mul64(r2, 2, l0, l2)
|
||||
addMul64(r2, 1, l1, l1)
|
||||
addMul64(r2, 38, l3, l4)
|
||||
|
||||
// r3 = = 2×l0×l3 + 2×l1×l2 + 19×l4×l4
|
||||
r3 := uint128{"r3", GP64(), GP64()}
|
||||
mul64(r3, 2, l0, l3)
|
||||
addMul64(r3, 2, l1, l2)
|
||||
addMul64(r3, 19, l4, l4)
|
||||
|
||||
// r4 = = 2×l0×l4 + 2×l1×l3 + l2×l2
|
||||
r4 := uint128{"r4", GP64(), GP64()}
|
||||
mul64(r4, 2, l0, l4)
|
||||
addMul64(r4, 2, l1, l3)
|
||||
addMul64(r4, 1, l2, l2)
|
||||
|
||||
Comment("First reduction chain")
|
||||
maskLow51Bits := GP64()
|
||||
MOVQ(Imm((1<<51)-1), maskLow51Bits)
|
||||
c0, r0lo := shiftRightBy51(&r0)
|
||||
c1, r1lo := shiftRightBy51(&r1)
|
||||
c2, r2lo := shiftRightBy51(&r2)
|
||||
c3, r3lo := shiftRightBy51(&r3)
|
||||
c4, r4lo := shiftRightBy51(&r4)
|
||||
maskAndAdd(r0lo, maskLow51Bits, c4, 19)
|
||||
maskAndAdd(r1lo, maskLow51Bits, c0, 1)
|
||||
maskAndAdd(r2lo, maskLow51Bits, c1, 1)
|
||||
maskAndAdd(r3lo, maskLow51Bits, c2, 1)
|
||||
maskAndAdd(r4lo, maskLow51Bits, c3, 1)
|
||||
|
||||
Comment("Second reduction chain (carryPropagate)")
|
||||
// c0 = r0 >> 51
|
||||
MOVQ(r0lo, c0)
|
||||
SHRQ(Imm(51), c0)
|
||||
// c1 = r1 >> 51
|
||||
MOVQ(r1lo, c1)
|
||||
SHRQ(Imm(51), c1)
|
||||
// c2 = r2 >> 51
|
||||
MOVQ(r2lo, c2)
|
||||
SHRQ(Imm(51), c2)
|
||||
// c3 = r3 >> 51
|
||||
MOVQ(r3lo, c3)
|
||||
SHRQ(Imm(51), c3)
|
||||
// c4 = r4 >> 51
|
||||
MOVQ(r4lo, c4)
|
||||
SHRQ(Imm(51), c4)
|
||||
maskAndAdd(r0lo, maskLow51Bits, c4, 19)
|
||||
maskAndAdd(r1lo, maskLow51Bits, c0, 1)
|
||||
maskAndAdd(r2lo, maskLow51Bits, c1, 1)
|
||||
maskAndAdd(r3lo, maskLow51Bits, c2, 1)
|
||||
maskAndAdd(r4lo, maskLow51Bits, c3, 1)
|
||||
|
||||
Comment("Store output")
|
||||
out := Dereference(Param("out"))
|
||||
Store(r0lo, out.Field("l0"))
|
||||
Store(r1lo, out.Field("l1"))
|
||||
Store(r2lo, out.Field("l2"))
|
||||
Store(r3lo, out.Field("l3"))
|
||||
Store(r4lo, out.Field("l4"))
|
||||
|
||||
RET()
|
||||
}
|
||||
|
||||
func feMul() {
|
||||
TEXT("feMul", NOSPLIT, "func(out, a, b *fieldElement)")
|
||||
Doc("feMul sets out = a * b. It works like feMulGeneric.")
|
||||
Pragma("noescape")
|
||||
|
||||
a := Dereference(Param("a"))
|
||||
a0 := namedComponent{a.Field("l0"), "a0"}
|
||||
a1 := namedComponent{a.Field("l1"), "a1"}
|
||||
a2 := namedComponent{a.Field("l2"), "a2"}
|
||||
a3 := namedComponent{a.Field("l3"), "a3"}
|
||||
a4 := namedComponent{a.Field("l4"), "a4"}
|
||||
|
||||
b := Dereference(Param("b"))
|
||||
b0 := namedComponent{b.Field("l0"), "b0"}
|
||||
b1 := namedComponent{b.Field("l1"), "b1"}
|
||||
b2 := namedComponent{b.Field("l2"), "b2"}
|
||||
b3 := namedComponent{b.Field("l3"), "b3"}
|
||||
b4 := namedComponent{b.Field("l4"), "b4"}
|
||||
|
||||
// r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1)
|
||||
r0 := uint128{"r0", GP64(), GP64()}
|
||||
mul64(r0, 1, a0, b0)
|
||||
addMul64(r0, 19, a1, b4)
|
||||
addMul64(r0, 19, a2, b3)
|
||||
addMul64(r0, 19, a3, b2)
|
||||
addMul64(r0, 19, a4, b1)
|
||||
|
||||
// r1 = a0×b1 + a1×b0 + 19×(a2×b4 + a3×b3 + a4×b2)
|
||||
r1 := uint128{"r1", GP64(), GP64()}
|
||||
mul64(r1, 1, a0, b1)
|
||||
addMul64(r1, 1, a1, b0)
|
||||
addMul64(r1, 19, a2, b4)
|
||||
addMul64(r1, 19, a3, b3)
|
||||
addMul64(r1, 19, a4, b2)
|
||||
|
||||
// r2 = a0×b2 + a1×b1 + a2×b0 + 19×(a3×b4 + a4×b3)
|
||||
r2 := uint128{"r2", GP64(), GP64()}
|
||||
mul64(r2, 1, a0, b2)
|
||||
addMul64(r2, 1, a1, b1)
|
||||
addMul64(r2, 1, a2, b0)
|
||||
addMul64(r2, 19, a3, b4)
|
||||
addMul64(r2, 19, a4, b3)
|
||||
|
||||
// r3 = a0×b3 + a1×b2 + a2×b1 + a3×b0 + 19×a4×b4
|
||||
r3 := uint128{"r3", GP64(), GP64()}
|
||||
mul64(r3, 1, a0, b3)
|
||||
addMul64(r3, 1, a1, b2)
|
||||
addMul64(r3, 1, a2, b1)
|
||||
addMul64(r3, 1, a3, b0)
|
||||
addMul64(r3, 19, a4, b4)
|
||||
|
||||
// r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0
|
||||
r4 := uint128{"r4", GP64(), GP64()}
|
||||
mul64(r4, 1, a0, b4)
|
||||
addMul64(r4, 1, a1, b3)
|
||||
addMul64(r4, 1, a2, b2)
|
||||
addMul64(r4, 1, a3, b1)
|
||||
addMul64(r4, 1, a4, b0)
|
||||
|
||||
Comment("First reduction chain")
|
||||
maskLow51Bits := GP64()
|
||||
MOVQ(Imm((1<<51)-1), maskLow51Bits)
|
||||
c0, r0lo := shiftRightBy51(&r0)
|
||||
c1, r1lo := shiftRightBy51(&r1)
|
||||
c2, r2lo := shiftRightBy51(&r2)
|
||||
c3, r3lo := shiftRightBy51(&r3)
|
||||
c4, r4lo := shiftRightBy51(&r4)
|
||||
maskAndAdd(r0lo, maskLow51Bits, c4, 19)
|
||||
maskAndAdd(r1lo, maskLow51Bits, c0, 1)
|
||||
maskAndAdd(r2lo, maskLow51Bits, c1, 1)
|
||||
maskAndAdd(r3lo, maskLow51Bits, c2, 1)
|
||||
maskAndAdd(r4lo, maskLow51Bits, c3, 1)
|
||||
|
||||
Comment("Second reduction chain (carryPropagate)")
|
||||
// c0 = r0 >> 51
|
||||
MOVQ(r0lo, c0)
|
||||
SHRQ(Imm(51), c0)
|
||||
// c1 = r1 >> 51
|
||||
MOVQ(r1lo, c1)
|
||||
SHRQ(Imm(51), c1)
|
||||
// c2 = r2 >> 51
|
||||
MOVQ(r2lo, c2)
|
||||
SHRQ(Imm(51), c2)
|
||||
// c3 = r3 >> 51
|
||||
MOVQ(r3lo, c3)
|
||||
SHRQ(Imm(51), c3)
|
||||
// c4 = r4 >> 51
|
||||
MOVQ(r4lo, c4)
|
||||
SHRQ(Imm(51), c4)
|
||||
maskAndAdd(r0lo, maskLow51Bits, c4, 19)
|
||||
maskAndAdd(r1lo, maskLow51Bits, c0, 1)
|
||||
maskAndAdd(r2lo, maskLow51Bits, c1, 1)
|
||||
maskAndAdd(r3lo, maskLow51Bits, c2, 1)
|
||||
maskAndAdd(r4lo, maskLow51Bits, c3, 1)
|
||||
|
||||
Comment("Store output")
|
||||
out := Dereference(Param("out"))
|
||||
Store(r0lo, out.Field("l0"))
|
||||
Store(r1lo, out.Field("l1"))
|
||||
Store(r2lo, out.Field("l2"))
|
||||
Store(r3lo, out.Field("l3"))
|
||||
Store(r4lo, out.Field("l4"))
|
||||
|
||||
RET()
|
||||
}
|
||||
|
||||
// mul64 sets r to i * aX * bX.
|
||||
func mul64(r uint128, i int, aX, bX namedComponent) {
|
||||
switch i {
|
||||
case 1:
|
||||
Comment(fmt.Sprintf("%s = %s×%s", r, aX, bX))
|
||||
Load(aX, RAX)
|
||||
case 2:
|
||||
Comment(fmt.Sprintf("%s = 2×%s×%s", r, aX, bX))
|
||||
Load(aX, RAX)
|
||||
SHLQ(Imm(1), RAX)
|
||||
default:
|
||||
panic("unsupported i value")
|
||||
}
|
||||
MULQ(mustAddr(bX)) // RDX, RAX = RAX * bX
|
||||
MOVQ(RAX, r.lo)
|
||||
MOVQ(RDX, r.hi)
|
||||
}
|
||||
|
||||
// addMul64 sets r to r + i * aX * bX.
|
||||
func addMul64(r uint128, i uint64, aX, bX namedComponent) {
|
||||
switch i {
|
||||
case 1:
|
||||
Comment(fmt.Sprintf("%s += %s×%s", r, aX, bX))
|
||||
Load(aX, RAX)
|
||||
default:
|
||||
Comment(fmt.Sprintf("%s += %d×%s×%s", r, i, aX, bX))
|
||||
IMUL3Q(Imm(i), Load(aX, GP64()), RAX)
|
||||
}
|
||||
MULQ(mustAddr(bX)) // RDX, RAX = RAX * bX
|
||||
ADDQ(RAX, r.lo)
|
||||
ADCQ(RDX, r.hi)
|
||||
}
|
||||
|
||||
// shiftRightBy51 returns r >> 51 and r.lo.
|
||||
//
|
||||
// After this function is called, the uint128 may not be used anymore.
|
||||
func shiftRightBy51(r *uint128) (out, lo GPVirtual) {
|
||||
out = r.hi
|
||||
lo = r.lo
|
||||
SHLQ(Imm(64-51), r.lo, r.hi)
|
||||
r.lo, r.hi = nil, nil // make sure the uint128 is unusable
|
||||
return
|
||||
}
|
||||
|
||||
// maskAndAdd sets r = r&mask + c*i.
|
||||
func maskAndAdd(r, mask, c GPVirtual, i uint64) {
|
||||
ANDQ(mask, r)
|
||||
if i != 1 {
|
||||
IMUL3Q(Imm(i), c, c)
|
||||
}
|
||||
ADDQ(c, r)
|
||||
}
|
||||
|
||||
func mustAddr(c Component) Op {
|
||||
b, err := c.Resolve()
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
return b.Addr
|
||||
}
|
10
asm/go.mod
Normal file
10
asm/go.mod
Normal file
|
@ -0,0 +1,10 @@
|
|||
module filippo.io/edwards25519/asm
|
||||
|
||||
go 1.16
|
||||
|
||||
require (
|
||||
filippo.io/edwards25519 v0.0.0
|
||||
github.com/mmcloughlin/avo v0.0.0-20210104032911-599bdd1269f4
|
||||
)
|
||||
|
||||
replace filippo.io/edwards25519 => ../
|
28
asm/go.sum
Normal file
28
asm/go.sum
Normal file
|
@ -0,0 +1,28 @@
|
|||
github.com/mmcloughlin/avo v0.0.0-20210104032911-599bdd1269f4 h1:ExoghBBFY7A3RzgkAOq0XbHs9zaT/bHq7xysgyp3z3Q=
|
||||
github.com/mmcloughlin/avo v0.0.0-20210104032911-599bdd1269f4/go.mod h1:6aKT4zZIrpGqB3RpFU14ByCSSyKY6LfJz4J/JJChHfI=
|
||||
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
|
||||
golang.org/x/arch v0.0.0-20201008161808-52c3e6f60cff/go.mod h1:flIaEI6LNU6xOCD5PaJvn9wGP0agmIOqjrtsKGRguv4=
|
||||
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
|
||||
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
|
||||
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
|
||||
golang.org/x/mod v0.3.0 h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4=
|
||||
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
|
||||
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
|
||||
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
|
||||
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
|
||||
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
|
||||
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
|
||||
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
|
||||
golang.org/x/tools v0.0.0-20201105001634-bc3cf281b174 h1:0rx0F4EjJNbxTuzWe0KjKcIzs+3VEb/Mrs/d1ciNz1c=
|
||||
golang.org/x/tools v0.0.0-20201105001634-bc3cf281b174/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
|
||||
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
|
||||
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
|
||||
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=
|
14
fe_amd64.go
14
fe_amd64.go
|
@ -1,17 +1,13 @@
|
|||
// Copyright (c) 2017 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg edwards25519. DO NOT EDIT.
|
||||
|
||||
// +build amd64,gc,!purego
|
||||
|
||||
package edwards25519
|
||||
|
||||
// feMul sets out = a * b. It works like feMulGeneric.
|
||||
//go:noescape
|
||||
func feMul(out, a, b *fieldElement)
|
||||
func feMul(out *fieldElement, a *fieldElement, b *fieldElement)
|
||||
|
||||
// feSquare sets out = a * a. It works like feSquareGeneric.
|
||||
//go:noescape
|
||||
func feSquare(out, x *fieldElement)
|
||||
|
||||
func (v *fieldElement) carryPropagate() *fieldElement {
|
||||
return v.carryPropagateGeneric()
|
||||
}
|
||||
func feSquare(out *fieldElement, a *fieldElement)
|
||||
|
|
612
fe_amd64.s
612
fe_amd64.s
|
@ -1,348 +1,378 @@
|
|||
// Copyright (c) 2017 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg edwards25519. DO NOT EDIT.
|
||||
|
||||
// +build amd64,gc,!purego
|
||||
|
||||
// func feMul(out, a, b *fieldElement)
|
||||
TEXT ·feMul(SB),$0-24
|
||||
// Based on assembly generated by PeachPy. Equivalent to the Go in
|
||||
// feMulGeneric, which was originally based on the amd64-51-30k
|
||||
// assembly in SUPERCOP.
|
||||
#include "textflag.h"
|
||||
|
||||
MOVQ a+8(FP), BX
|
||||
MOVQ b+16(FP), CX
|
||||
// func feMul(out *fieldElement, a *fieldElement, b *fieldElement)
|
||||
TEXT ·feMul(SB), NOSPLIT, $0-24
|
||||
MOVQ a+8(FP), CX
|
||||
MOVQ b+16(FP), BX
|
||||
|
||||
// Calculate r0
|
||||
MOVQ 0(BX), AX // rax <-- x0
|
||||
MULQ 0(CX) // rdx, rax <-- x0*y0
|
||||
MOVQ AX, SI // r00 = rax
|
||||
MOVQ DX, DI // r01 = rdx
|
||||
// r0 = a0×b0
|
||||
MOVQ (CX), AX
|
||||
MULQ (BX)
|
||||
MOVQ AX, SI
|
||||
MOVQ DX, BP
|
||||
|
||||
MOVQ 8(BX), DX // rdx <-- x1
|
||||
IMUL3Q $19, DX, AX // rax <-- x1*19
|
||||
MULQ 32(CX) // rdx, rax <-- x1_19*y4
|
||||
ADDQ AX, SI // r00 += rax
|
||||
ADCQ DX, DI // r01 += rdx
|
||||
// r0 += 19×a1×b4
|
||||
MOVQ 8(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 32(BX)
|
||||
ADDQ AX, SI
|
||||
ADCQ DX, BP
|
||||
|
||||
MOVQ 16(BX), DX // rdx <-- x2
|
||||
IMUL3Q $19, DX, AX // rax <-- x2*19
|
||||
MULQ 24(CX) // rdx, rax <-- x2_19*y3
|
||||
ADDQ AX, SI // r00 += rax
|
||||
ADCQ DX, DI // r01 += rdx
|
||||
// r0 += 19×a2×b3
|
||||
MOVQ 16(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 24(BX)
|
||||
ADDQ AX, SI
|
||||
ADCQ DX, BP
|
||||
|
||||
MOVQ 24(BX), DX // rdx <-- x3
|
||||
IMUL3Q $19, DX, AX // rax <-- x3*19
|
||||
MULQ 16(CX) // rdx, rax <-- x3_19 * y2
|
||||
ADDQ AX, SI // r00 += rax
|
||||
ADCQ DX, DI // r01 += rdx
|
||||
// r0 += 19×a3×b2
|
||||
MOVQ 24(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 16(BX)
|
||||
ADDQ AX, SI
|
||||
ADCQ DX, BP
|
||||
|
||||
MOVQ 32(BX), DX // rdx <-- x4
|
||||
IMUL3Q $19, DX, AX // rax <-- x4*19
|
||||
MULQ 8(CX) // rdx rax <-- x4_19*y1
|
||||
ADDQ AX, SI // r00 += rax
|
||||
ADCQ DX, DI // r01 += rdx
|
||||
// r0 += 19×a4×b1
|
||||
MOVQ 32(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 8(BX)
|
||||
ADDQ AX, SI
|
||||
ADCQ DX, BP
|
||||
|
||||
// Calculate r1
|
||||
MOVQ 0(BX), AX
|
||||
MULQ 8(CX)
|
||||
MOVQ AX, R8 // r10
|
||||
MOVQ DX, R9 // r11
|
||||
// r1 = a0×b1
|
||||
MOVQ (CX), AX
|
||||
MULQ 8(BX)
|
||||
MOVQ AX, R8
|
||||
MOVQ DX, DI
|
||||
|
||||
MOVQ 8(BX), AX
|
||||
MULQ 0(CX)
|
||||
// r1 += a1×b0
|
||||
MOVQ 8(CX), AX
|
||||
MULQ (BX)
|
||||
ADDQ AX, R8
|
||||
ADCQ DX, DI
|
||||
|
||||
// r1 += 19×a2×b4
|
||||
MOVQ 16(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 32(BX)
|
||||
ADDQ AX, R8
|
||||
ADCQ DX, DI
|
||||
|
||||
// r1 += 19×a3×b3
|
||||
MOVQ 24(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 24(BX)
|
||||
ADDQ AX, R8
|
||||
ADCQ DX, DI
|
||||
|
||||
// r1 += 19×a4×b2
|
||||
MOVQ 32(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 16(BX)
|
||||
ADDQ AX, R8
|
||||
ADCQ DX, DI
|
||||
|
||||
// r2 = a0×b2
|
||||
MOVQ (CX), AX
|
||||
MULQ 16(BX)
|
||||
MOVQ AX, R10
|
||||
MOVQ DX, R9
|
||||
|
||||
// r2 += a1×b1
|
||||
MOVQ 8(CX), AX
|
||||
MULQ 8(BX)
|
||||
ADDQ AX, R10
|
||||
ADCQ DX, R9
|
||||
|
||||
MOVQ 16(BX), DX
|
||||
IMUL3Q $19, DX, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, R8
|
||||
// r2 += a2×b0
|
||||
MOVQ 16(CX), AX
|
||||
MULQ (BX)
|
||||
ADDQ AX, R10
|
||||
ADCQ DX, R9
|
||||
|
||||
MOVQ 24(BX), DX
|
||||
IMUL3Q $19, DX, AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, R8
|
||||
ADCQ DX, R9
|
||||
// r2 += 19×a3×b4
|
||||
MOVQ 24(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 32(BX)
|
||||
ADDQ AX, R10
|
||||
ADCQ DX, R9
|
||||
|
||||
MOVQ 32(BX), DX
|
||||
IMUL3Q $19, DX, AX
|
||||
MULQ 16(CX)
|
||||
ADDQ AX, R8
|
||||
ADCQ DX, R9
|
||||
// r2 += 19×a4×b3
|
||||
MOVQ 32(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 24(BX)
|
||||
ADDQ AX, R10
|
||||
ADCQ DX, R9
|
||||
|
||||
// Calculate r2
|
||||
MOVQ 0(BX), AX
|
||||
MULQ 16(CX)
|
||||
MOVQ AX, R10 // r20
|
||||
MOVQ DX, R11 // r21
|
||||
// r3 = a0×b3
|
||||
MOVQ (CX), AX
|
||||
MULQ 24(BX)
|
||||
MOVQ AX, R12
|
||||
MOVQ DX, R11
|
||||
|
||||
MOVQ 8(BX), AX
|
||||
MULQ 8(CX)
|
||||
ADDQ AX, R10
|
||||
// r3 += a1×b2
|
||||
MOVQ 8(CX), AX
|
||||
MULQ 16(BX)
|
||||
ADDQ AX, R12
|
||||
ADCQ DX, R11
|
||||
|
||||
MOVQ 16(BX), AX
|
||||
MULQ 0(CX)
|
||||
ADDQ AX, R10
|
||||
// r3 += a2×b1
|
||||
MOVQ 16(CX), AX
|
||||
MULQ 8(BX)
|
||||
ADDQ AX, R12
|
||||
ADCQ DX, R11
|
||||
|
||||
MOVQ 24(BX), DX
|
||||
IMUL3Q $19, DX, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, R10
|
||||
// r3 += a3×b0
|
||||
MOVQ 24(CX), AX
|
||||
MULQ (BX)
|
||||
ADDQ AX, R12
|
||||
ADCQ DX, R11
|
||||
|
||||
MOVQ 32(BX), DX
|
||||
IMUL3Q $19, DX, AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, R10
|
||||
ADCQ DX, R11
|
||||
// r3 += 19×a4×b4
|
||||
MOVQ 32(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 32(BX)
|
||||
ADDQ AX, R12
|
||||
ADCQ DX, R11
|
||||
|
||||
// Calculate r3
|
||||
MOVQ 0(BX), AX
|
||||
MULQ 24(CX)
|
||||
MOVQ AX, R12 // r30
|
||||
MOVQ DX, R13 // r31
|
||||
// r4 = a0×b4
|
||||
MOVQ (CX), AX
|
||||
MULQ 32(BX)
|
||||
MOVQ AX, R14
|
||||
MOVQ DX, R13
|
||||
|
||||
MOVQ 8(BX), AX
|
||||
MULQ 16(CX)
|
||||
ADDQ AX, R12
|
||||
// r4 += a1×b3
|
||||
MOVQ 8(CX), AX
|
||||
MULQ 24(BX)
|
||||
ADDQ AX, R14
|
||||
ADCQ DX, R13
|
||||
|
||||
MOVQ 16(BX), AX
|
||||
MULQ 8(CX)
|
||||
ADDQ AX, R12
|
||||
// r4 += a2×b2
|
||||
MOVQ 16(CX), AX
|
||||
MULQ 16(BX)
|
||||
ADDQ AX, R14
|
||||
ADCQ DX, R13
|
||||
|
||||
MOVQ 24(BX), AX
|
||||
MULQ 0(CX)
|
||||
ADDQ AX, R12
|
||||
// r4 += a3×b1
|
||||
MOVQ 24(CX), AX
|
||||
MULQ 8(BX)
|
||||
ADDQ AX, R14
|
||||
ADCQ DX, R13
|
||||
|
||||
MOVQ 32(BX), DX
|
||||
IMUL3Q $19, DX, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, R12
|
||||
// r4 += a4×b0
|
||||
MOVQ 32(CX), AX
|
||||
MULQ (BX)
|
||||
ADDQ AX, R14
|
||||
ADCQ DX, R13
|
||||
|
||||
// Calculate r4
|
||||
MOVQ 0(BX), AX
|
||||
MULQ 32(CX)
|
||||
MOVQ AX, R14 // r40
|
||||
MOVQ DX, R15 // r41
|
||||
// First reduction chain
|
||||
MOVQ $0x0007ffffffffffff, AX
|
||||
SHLQ $0x0d, SI, BP
|
||||
SHLQ $0x0d, R8, DI
|
||||
SHLQ $0x0d, R10, R9
|
||||
SHLQ $0x0d, R12, R11
|
||||
SHLQ $0x0d, R14, R13
|
||||
ANDQ AX, SI
|
||||
IMUL3Q $0x13, R13, R13
|
||||
ADDQ R13, SI
|
||||
ANDQ AX, R8
|
||||
ADDQ BP, R8
|
||||
ANDQ AX, R10
|
||||
ADDQ DI, R10
|
||||
ANDQ AX, R12
|
||||
ADDQ R9, R12
|
||||
ANDQ AX, R14
|
||||
ADDQ R11, R14
|
||||
|
||||
MOVQ 8(BX), AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, R14
|
||||
ADCQ DX, R15
|
||||
// Second reduction chain (carryPropagate)
|
||||
MOVQ SI, BP
|
||||
SHRQ $0x33, BP
|
||||
MOVQ R8, DI
|
||||
SHRQ $0x33, DI
|
||||
MOVQ R10, R9
|
||||
SHRQ $0x33, R9
|
||||
MOVQ R12, R11
|
||||
SHRQ $0x33, R11
|
||||
MOVQ R14, R13
|
||||
SHRQ $0x33, R13
|
||||
ANDQ AX, SI
|
||||
IMUL3Q $0x13, R13, R13
|
||||
ADDQ R13, SI
|
||||
ANDQ AX, R8
|
||||
ADDQ BP, R8
|
||||
ANDQ AX, R10
|
||||
ADDQ DI, R10
|
||||
ANDQ AX, R12
|
||||
ADDQ R9, R12
|
||||
ANDQ AX, R14
|
||||
ADDQ R11, R14
|
||||
|
||||
MOVQ 16(BX), AX
|
||||
MULQ 16(CX)
|
||||
ADDQ AX, R14
|
||||
ADCQ DX, R15
|
||||
|
||||
MOVQ 24(BX), AX
|
||||
MULQ 8(CX)
|
||||
ADDQ AX, R14
|
||||
ADCQ DX, R15
|
||||
|
||||
MOVQ 32(BX), AX
|
||||
MULQ 0(CX)
|
||||
ADDQ AX, R14
|
||||
ADCQ DX, R15
|
||||
|
||||
|
||||
MOVQ $2251799813685247, AX // (1<<51) - 1
|
||||
SHLQ $13, SI, DI // r01 = shld with r00
|
||||
ANDQ AX, SI // r00 &= mask51
|
||||
SHLQ $13, R8, R9 // r11 = shld with r10
|
||||
ANDQ AX, R8 // r10 &= mask51
|
||||
ADDQ DI, R8 // r10 += r01
|
||||
SHLQ $13, R10, R11 // r21 = shld with r20
|
||||
ANDQ AX, R10 // r20 &= mask51
|
||||
ADDQ R9, R10 // r20 += r11
|
||||
SHLQ $13, R12, R13 // r31 = shld with r30
|
||||
ANDQ AX, R12 // r30 &= mask51
|
||||
ADDQ R11, R12 // r30 += r21
|
||||
SHLQ $13, R14, R15 // r41 = shld with r40
|
||||
ANDQ AX, R14 // r40 &= mask51
|
||||
ADDQ R13, R14 // r40 += r31
|
||||
IMUL3Q $19, R15, R15 // r41 = r41*19
|
||||
ADDQ R15, SI // r00 += r41
|
||||
|
||||
MOVQ SI, DX // rdx <-- r00
|
||||
SHRQ $51, DX // rdx <-- r00 >> 51
|
||||
ADDQ DX, R8 // r10 += r00 >> 51
|
||||
MOVQ R8, DX // rdx <-- r10
|
||||
SHRQ $51, DX // rdx <-- r10 >> 51
|
||||
ANDQ AX, SI // r00 &= mask51
|
||||
ADDQ DX, R10 // r20 += r10 >> 51
|
||||
MOVQ R10, DX // rdx <-- r20
|
||||
SHRQ $51, DX // rdx <-- r20 >> 51
|
||||
ANDQ AX, R8 // r10 &= mask51
|
||||
ADDQ DX, R12 // r30 += r20 >> 51
|
||||
MOVQ R12, DX // rdx <-- r30
|
||||
SHRQ $51, DX // rdx <-- r30 >> 51
|
||||
ANDQ AX, R10 // r20 &= mask51
|
||||
ADDQ DX, R14 // r40 += r30 >> 51
|
||||
MOVQ R14, DX // rdx <-- r40
|
||||
SHRQ $51, DX // rdx <-- r40 >> 51
|
||||
ANDQ AX, R12 // r30 &= mask51
|
||||
IMUL3Q $19, DX, DX // rdx <-- (r40 >> 51) * 19
|
||||
ADDQ DX, SI // r00 += (r40 >> 51) *19
|
||||
ANDQ AX, R14 // r40 &= mask51
|
||||
|
||||
MOVQ out+0(FP), DI
|
||||
MOVQ SI, 0(DI)
|
||||
MOVQ R8, 8(DI)
|
||||
MOVQ R10, 16(DI)
|
||||
MOVQ R12, 24(DI)
|
||||
MOVQ R14, 32(DI)
|
||||
// Store output
|
||||
MOVQ out+0(FP), AX
|
||||
MOVQ SI, (AX)
|
||||
MOVQ R8, 8(AX)
|
||||
MOVQ R10, 16(AX)
|
||||
MOVQ R12, 24(AX)
|
||||
MOVQ R14, 32(AX)
|
||||
RET
|
||||
|
||||
// func feSquare(out, x *fieldElement)
|
||||
TEXT ·feSquare(SB),4,$0-16
|
||||
MOVQ out+0(FP), DI
|
||||
MOVQ x+8(FP), SI
|
||||
// func feSquare(out *fieldElement, a *fieldElement)
|
||||
TEXT ·feSquare(SB), NOSPLIT, $0-16
|
||||
MOVQ a+8(FP), CX
|
||||
|
||||
// r0 = x0*x0 + x1*38*x4 + x2*38*x3
|
||||
MOVQ 0(SI), AX
|
||||
MULQ 0(SI)
|
||||
MOVQ AX, CX // r00
|
||||
MOVQ DX, R8 // r01
|
||||
// r0 = l0×l0
|
||||
MOVQ (CX), AX
|
||||
MULQ (CX)
|
||||
MOVQ AX, BP
|
||||
MOVQ DX, BX
|
||||
|
||||
MOVQ 8(SI), DX
|
||||
IMUL3Q $38, DX, AX
|
||||
MULQ 32(SI)
|
||||
ADDQ AX, CX
|
||||
ADCQ DX, R8
|
||||
// r0 += 38×l1×l4
|
||||
MOVQ 8(CX), AX
|
||||
IMUL3Q $0x26, AX, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, BP
|
||||
ADCQ DX, BX
|
||||
|
||||
MOVQ 16(SI), DX
|
||||
IMUL3Q $38, DX, AX
|
||||
MULQ 24(SI)
|
||||
ADDQ AX, CX
|
||||
ADCQ DX, R8
|
||||
// r0 += 38×l2×l3
|
||||
MOVQ 16(CX), AX
|
||||
IMUL3Q $0x26, AX, AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, BP
|
||||
ADCQ DX, BX
|
||||
|
||||
// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
|
||||
MOVQ 0(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 8(SI)
|
||||
MOVQ AX, R9 // r10
|
||||
MOVQ DX, R10 // r11
|
||||
// r1 = 2×l0×l1
|
||||
MOVQ (CX), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 8(CX)
|
||||
MOVQ AX, DI
|
||||
MOVQ DX, SI
|
||||
|
||||
MOVQ 16(SI), DX
|
||||
IMUL3Q $38, DX, AX
|
||||
MULQ 32(SI)
|
||||
ADDQ AX, R9
|
||||
ADCQ DX, R10
|
||||
// r1 += 38×l2×l4
|
||||
MOVQ 16(CX), AX
|
||||
IMUL3Q $0x26, AX, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, DI
|
||||
ADCQ DX, SI
|
||||
|
||||
MOVQ 24(SI), DX
|
||||
IMUL3Q $19, DX, AX
|
||||
MULQ 24(SI)
|
||||
ADDQ AX, R9
|
||||
ADCQ DX, R10
|
||||
// r1 += 19×l3×l3
|
||||
MOVQ 24(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, DI
|
||||
ADCQ DX, SI
|
||||
|
||||
// r2 = x0*2*x2 + x1*x1 + x3*38*x4
|
||||
MOVQ 0(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 16(SI)
|
||||
MOVQ AX, R11 // r20
|
||||
MOVQ DX, R12 // r21
|
||||
// r2 = 2×l0×l2
|
||||
MOVQ (CX), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 16(CX)
|
||||
MOVQ AX, R9
|
||||
MOVQ DX, R8
|
||||
|
||||
MOVQ 8(SI), AX
|
||||
MULQ 8(SI)
|
||||
ADDQ AX, R11
|
||||
ADCQ DX, R12
|
||||
// r2 += l1×l1
|
||||
MOVQ 8(CX), AX
|
||||
MULQ 8(CX)
|
||||
ADDQ AX, R9
|
||||
ADCQ DX, R8
|
||||
|
||||
MOVQ 24(SI), DX
|
||||
IMUL3Q $38, DX, AX
|
||||
MULQ 32(SI)
|
||||
ADDQ AX, R11
|
||||
ADCQ DX, R12
|
||||
// r2 += 38×l3×l4
|
||||
MOVQ 24(CX), AX
|
||||
IMUL3Q $0x26, AX, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, R9
|
||||
ADCQ DX, R8
|
||||
|
||||
// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
|
||||
MOVQ 0(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 24(SI)
|
||||
MOVQ AX, R13 // r30
|
||||
MOVQ DX, R14 // r31
|
||||
// r3 = 2×l0×l3
|
||||
MOVQ (CX), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 24(CX)
|
||||
MOVQ AX, R11
|
||||
MOVQ DX, R10
|
||||
|
||||
MOVQ 8(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 16(SI)
|
||||
ADDQ AX, R13
|
||||
ADCQ DX, R14
|
||||
// r3 += 2×l1×l2
|
||||
MOVQ 8(CX), AX
|
||||
IMUL3Q $0x02, AX, AX
|
||||
MULQ 16(CX)
|
||||
ADDQ AX, R11
|
||||
ADCQ DX, R10
|
||||
|
||||
MOVQ 32(SI), DX
|
||||
IMUL3Q $19, DX, AX
|
||||
MULQ 32(SI)
|
||||
ADDQ AX, R13
|
||||
ADCQ DX, R14
|
||||
// r3 += 19×l4×l4
|
||||
MOVQ 32(CX), AX
|
||||
IMUL3Q $0x13, AX, AX
|
||||
MULQ 32(CX)
|
||||
ADDQ AX, R11
|
||||
ADCQ DX, R10
|
||||
|
||||
// r4 = x0*2*x4 + x1*2*x3 + x2*x2
|
||||
MOVQ 0(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 32(SI)
|
||||
MOVQ AX, R15 // r40
|
||||
MOVQ DX, BX // r41
|
||||
// r4 = 2×l0×l4
|
||||
MOVQ (CX), AX
|
||||
SHLQ $0x01, AX
|
||||
MULQ 32(CX)
|
||||
MOVQ AX, R13
|
||||
MOVQ DX, R12
|
||||
|
||||
MOVQ 8(SI), AX
|
||||
SHLQ $1, AX
|
||||
MULQ 24(SI)
|
||||
ADDQ AX, R15
|
||||
ADCQ DX, BX
|
||||
// r4 += 2×l1×l3
|
||||
MOVQ 8(CX), AX
|
||||
IMUL3Q $0x02, AX, AX
|
||||
MULQ 24(CX)
|
||||
ADDQ AX, R13
|
||||
ADCQ DX, R12
|
||||
|
||||
MOVQ 16(SI), AX
|
||||
MULQ 16(SI)
|
||||
ADDQ AX, R15
|
||||
ADCQ DX, BX
|
||||
// r4 += l2×l2
|
||||
MOVQ 16(CX), AX
|
||||
MULQ 16(CX)
|
||||
ADDQ AX, R13
|
||||
ADCQ DX, R12
|
||||
|
||||
// Reduce
|
||||
MOVQ $2251799813685247, AX // (1<<51) - 1
|
||||
SHLQ $13, CX, R8 // r01 = shld with r00
|
||||
ANDQ AX, CX // r00 &= mask51
|
||||
SHLQ $13, R9, R10 // r11 = shld with r10
|
||||
ANDQ AX, R9 // r10 &= mask51
|
||||
ADDQ R8, R9 // r10 += r01
|
||||
SHLQ $13, R11, R12 // r21 = shld with r20
|
||||
ANDQ AX, R11 // r20 &= mask51
|
||||
ADDQ R10, R11 // r20 += r11
|
||||
SHLQ $13, R13, R14 // r31 = shld with r30
|
||||
ANDQ AX, R13 // r30 &= mask51
|
||||
ADDQ R12, R13 // r30 += r21
|
||||
SHLQ $13, R15, BX // r41 = shld with r40
|
||||
ANDQ AX, R15 // r40 &= mask51
|
||||
ADDQ R14, R15 // r40 += r31
|
||||
IMUL3Q $19, BX, DX // r41 = r41*19
|
||||
ADDQ DX, CX // r00 += r41
|
||||
// First reduction chain
|
||||
MOVQ $0x0007ffffffffffff, AX
|
||||
SHLQ $0x0d, BP, BX
|
||||
SHLQ $0x0d, DI, SI
|
||||
SHLQ $0x0d, R9, R8
|
||||
SHLQ $0x0d, R11, R10
|
||||
SHLQ $0x0d, R13, R12
|
||||
ANDQ AX, BP
|
||||
IMUL3Q $0x13, R12, R12
|
||||
ADDQ R12, BP
|
||||
ANDQ AX, DI
|
||||
ADDQ BX, DI
|
||||
ANDQ AX, R9
|
||||
ADDQ SI, R9
|
||||
ANDQ AX, R11
|
||||
ADDQ R8, R11
|
||||
ANDQ AX, R13
|
||||
ADDQ R10, R13
|
||||
|
||||
MOVQ CX, DX // rdx <-- r00
|
||||
SHRQ $51, DX // rdx <-- r00 >> 51
|
||||
ADDQ DX, R9 // r10 += r00 >> 51
|
||||
MOVQ R9, DX // rdx <-- r10
|
||||
SHRQ $51, DX // rdx <-- r10 >> 51
|
||||
ANDQ AX, CX // r00 &= mask51
|
||||
ADDQ DX, R11 // r20 += r10 >> 51
|
||||
MOVQ R11, DX // rdx <-- r20
|
||||
SHRQ $51, DX // rdx <-- r20 >> 51
|
||||
ANDQ AX, R9 // r10 &= mask51
|
||||
ADDQ DX, R13 // r30 += r20 >> 51
|
||||
MOVQ R13, DX // rdx <-- r30
|
||||
SHRQ $51, DX // rdx <-- r30 >> 51
|
||||
ANDQ AX, R11 // r20 &= mask51
|
||||
ADDQ DX, R15 // r40 += r30 >> 51
|
||||
MOVQ R15, DX // rdx <-- r40
|
||||
SHRQ $51, DX // rdx <-- r40 >> 51
|
||||
ANDQ AX, R13 // r30 &= mask51
|
||||
IMUL3Q $19, DX, DX // rdx <-- (r40 >> 51) * 19
|
||||
ADDQ DX, CX // r00 += (r40 >> 51) *19
|
||||
ANDQ AX, R15 // r40 &= mask51
|
||||
// Second reduction chain (carryPropagate)
|
||||
MOVQ BP, BX
|
||||
SHRQ $0x33, BX
|
||||
MOVQ DI, SI
|
||||
SHRQ $0x33, SI
|
||||
MOVQ R9, R8
|
||||
SHRQ $0x33, R8
|
||||
MOVQ R11, R10
|
||||
SHRQ $0x33, R10
|
||||
MOVQ R13, R12
|
||||
SHRQ $0x33, R12
|
||||
ANDQ AX, BP
|
||||
IMUL3Q $0x13, R12, R12
|
||||
ADDQ R12, BP
|
||||
ANDQ AX, DI
|
||||
ADDQ BX, DI
|
||||
ANDQ AX, R9
|
||||
ADDQ SI, R9
|
||||
ANDQ AX, R11
|
||||
ADDQ R8, R11
|
||||
ANDQ AX, R13
|
||||
ADDQ R10, R13
|
||||
|
||||
MOVQ CX, 0(DI)
|
||||
MOVQ R9, 8(DI)
|
||||
MOVQ R11, 16(DI)
|
||||
MOVQ R13, 24(DI)
|
||||
MOVQ R15, 32(DI)
|
||||
RET
|
||||
// Store output
|
||||
MOVQ out+0(FP), AX
|
||||
MOVQ BP, (AX)
|
||||
MOVQ DI, 8(AX)
|
||||
MOVQ R9, 16(AX)
|
||||
MOVQ R11, 24(AX)
|
||||
MOVQ R13, 32(AX)
|
||||
RET
|
||||
|
|
|
@ -2,14 +2,10 @@
|
|||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !amd64,!arm64 !gc purego
|
||||
// +build !amd64 !gc purego
|
||||
|
||||
package edwards25519
|
||||
|
||||
func feMul(v, x, y *fieldElement) { feMulGeneric(v, x, y) }
|
||||
|
||||
func feSquare(v, x *fieldElement) { feSquareGeneric(v, x) }
|
||||
|
||||
func (v *fieldElement) carryPropagate() *fieldElement {
|
||||
return v.carryPropagateGeneric()
|
||||
}
|
|
@ -6,10 +6,6 @@
|
|||
|
||||
package edwards25519
|
||||
|
||||
func feMul(v, x, y *fieldElement) { feMulGeneric(v, x, y) }
|
||||
|
||||
func feSquare(v, x *fieldElement) { feSquareGeneric(v, x) }
|
||||
|
||||
//go:noescape
|
||||
func carryPropagate(v *fieldElement)
|
||||
|
||||
|
|
11
fe_arm64_noasm.go
Normal file
11
fe_arm64_noasm.go
Normal file
|
@ -0,0 +1,11 @@
|
|||
// Copyright (c) 2021 The Go Authors. All rights reserved.
|
||||
// Use of this source code is governed by a BSD-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
// +build !arm64 !gc purego
|
||||
|
||||
package edwards25519
|
||||
|
||||
// carryPropagate is the fallback selected by this file's build constraint
// (!arm64, !gc, or purego). It forwards to carryPropagateGeneric and returns
// its result.
func (v *fieldElement) carryPropagate() *fieldElement {
|
||||
return v.carryPropagateGeneric()
|
||||
}
|
54
fe_test.go
54
fe_test.go
|
@ -26,15 +26,13 @@ func (v fieldElement) String() string {
|
|||
var quickCheckConfig1024 = &quick.Config{MaxCountScale: 1 << 10}
|
||||
|
||||
func generateFieldElement(rand *mathrand.Rand) fieldElement {
|
||||
// Generation strategy: generate random limb values of [52, 51, 51, 51, 51]
|
||||
// bits, like the ones returned by lightReduce.
|
||||
const maskLow52Bits = (1 << 52) - 1
|
||||
return fieldElement{
|
||||
rand.Uint64() & maskLow52Bits,
|
||||
rand.Uint64() & maskLow51Bits,
|
||||
rand.Uint64() & maskLow51Bits,
|
||||
rand.Uint64() & maskLow51Bits,
|
||||
rand.Uint64() & maskLow51Bits,
|
||||
rand.Uint64() & maskLow52Bits,
|
||||
rand.Uint64() & maskLow52Bits,
|
||||
rand.Uint64() & maskLow52Bits,
|
||||
rand.Uint64() & maskLow52Bits,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -524,3 +522,47 @@ func TestCarryPropagate(t *testing.T) {
|
|||
t.Errorf("failed for {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff}")
|
||||
}
|
||||
}
|
||||
|
||||
func TestFeSquare(t *testing.T) {
|
||||
asmLikeGeneric := func(a fieldElement) bool {
|
||||
t1 := a
|
||||
t2 := a
|
||||
|
||||
feSquareGeneric(&t1, &t1)
|
||||
feSquare(&t2, &t2)
|
||||
|
||||
if t1 != t2 {
|
||||
t.Logf("got: %#v,\nexpected: %#v", t1, t2)
|
||||
}
|
||||
|
||||
return t1 == t2 && isInBounds(&t2)
|
||||
}
|
||||
|
||||
if err := quick.Check(asmLikeGeneric, quickCheckConfig1024); err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestFeMul(t *testing.T) {
|
||||
asmLikeGeneric := func(a, b fieldElement) bool {
|
||||
a1 := a
|
||||
a2 := a
|
||||
b1 := b
|
||||
b2 := b
|
||||
|
||||
feMulGeneric(&a1, &a1, &b1)
|
||||
feMul(&a2, &a2, &b2)
|
||||
|
||||
if a1 != a2 || b1 != b2 {
|
||||
t.Logf("got: %#v,\nexpected: %#v", a1, a2)
|
||||
t.Logf("got: %#v,\nexpected: %#v", b1, b2)
|
||||
}
|
||||
|
||||
return a1 == a2 && isInBounds(&a2) &&
|
||||
b1 == b2 && isInBounds(&b2)
|
||||
}
|
||||
|
||||
if err := quick.Check(asmLikeGeneric, quickCheckConfig1024); err != nil {
|
||||
t.Error(err)
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue