edwards25519: rewrite amd64 assembly with avo

Mapping almost 1:1 from generic Go lines to Avo lines.

Again a little faster, as a treat. (Probably due to better pipelining of
the reduction chains.)

name                    old time/op  new time/op  delta
Add-4                   7.87ns ± 2%  7.93ns ± 1%    ~     (p=0.065 n=10+9)
Mul-4                   18.9ns ± 2%  18.4ns ± 1%  -2.20%  (p=0.000 n=10+9)
Mul32-4                 7.22ns ± 1%  7.19ns ± 1%    ~     (p=0.128 n=10+9)
BasepointMul-4          21.4µs ± 1%  21.1µs ± 1%  -1.54%  (p=0.000 n=10+10)
ScalarMul-4             67.3µs ± 2%  67.2µs ± 1%    ~     (p=0.579 n=10+10)
VartimeDoubleBaseMul-4  62.5µs ± 2%  62.9µs ± 1%    ~     (p=0.436 n=10+10)
MultiscalarMulSize8-4    246µs ± 0%   246µs ± 1%    ~     (p=0.631 n=10+10)
This commit is contained in:
Filippo Valsorda 2021-03-31 23:15:38 +02:00 committed by Filippo Valsorda
parent 8eb02eb997
commit c882e8e8ab
9 changed files with 720 additions and 315 deletions

296
asm/fe_amd64_asm.go Normal file
View file

@ -0,0 +1,296 @@
// Copyright (c) 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main
import (
"fmt"
. "github.com/mmcloughlin/avo/build"
. "github.com/mmcloughlin/avo/gotypes"
. "github.com/mmcloughlin/avo/operand"
. "github.com/mmcloughlin/avo/reg"
_ "filippo.io/edwards25519"
)
//go:generate go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg edwards25519
// main configures the avo build for the edwards25519 package, emits the
// assembly routines in a fixed order, and writes the generated output.
func main() {
	Package("filippo.io/edwards25519")
	ConstraintExpr("amd64,gc,!purego")

	// The order of this list is the order of the functions in the output.
	emitters := []func(){feMul, feSquare}
	for _, emit := range emitters {
		emit()
	}

	Generate()
}
// namedComponent pairs a Component with a short human-readable name. The name
// is what String returns, so the value can be passed straight to fmt verbs
// such as %s when building the comments emitted alongside the instructions.
type namedComponent struct {
Component
name string
}
// String returns the component's short name.
func (c namedComponent) String() string { return c.name }
// uint128 is a named pair of virtual 64-bit registers holding the high and
// low halves of a 128-bit accumulator.
type uint128 struct {
name string
hi, lo GPVirtual
}
// String returns the accumulator's name, for use in emitted comments.
func (c uint128) String() string { return c.name }
// feSquare emits the instruction stream for the assembly function feSquare.
//
// It builds five 128-bit accumulators r0..r4 from the limbs l0..l4 of a, runs
// two reduction chains over them, and stores the five resulting limbs to out.
// The order of the calls below is the order of the generated instructions.
func feSquare() {
TEXT("feSquare", NOSPLIT, "func(out, a *fieldElement)")
Doc("feSquare sets out = a * a. It works like feSquareGeneric.")
Pragma("noescape")
// Give each limb of a a short name; the names appear in the emitted comments.
a := Dereference(Param("a"))
l0 := namedComponent{a.Field("l0"), "l0"}
l1 := namedComponent{a.Field("l1"), "l1"}
l2 := namedComponent{a.Field("l2"), "l2"}
l3 := namedComponent{a.Field("l3"), "l3"}
l4 := namedComponent{a.Field("l4"), "l4"}
// r0 = l0×l0 + 19×2×(l1×l4 + l2×l3)
r0 := uint128{"r0", GP64(), GP64()}
mul64(r0, 1, l0, l0)
addMul64(r0, 38, l1, l4)
addMul64(r0, 38, l2, l3)
// r1 = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3
r1 := uint128{"r1", GP64(), GP64()}
mul64(r1, 2, l0, l1)
addMul64(r1, 38, l2, l4)
addMul64(r1, 19, l3, l3)
// r2 = 2×l0×l2 + l1×l1 + 19×2×l3×l4
r2 := uint128{"r2", GP64(), GP64()}
mul64(r2, 2, l0, l2)
addMul64(r2, 1, l1, l1)
addMul64(r2, 38, l3, l4)
// r3 = 2×l0×l3 + 2×l1×l2 + 19×l4×l4
r3 := uint128{"r3", GP64(), GP64()}
mul64(r3, 2, l0, l3)
addMul64(r3, 2, l1, l2)
addMul64(r3, 19, l4, l4)
// r4 = 2×l0×l4 + 2×l1×l3 + l2×l2
r4 := uint128{"r4", GP64(), GP64()}
mul64(r4, 2, l0, l4)
addMul64(r4, 2, l1, l3)
addMul64(r4, 1, l2, l2)
Comment("First reduction chain")
maskLow51Bits := GP64()
MOVQ(Imm((1<<51)-1), maskLow51Bits)
// Split each accumulator into a carry (rN >> 51, left in the former high
// register) and its low word. The uint128 values are unusable afterwards.
c0, r0lo := shiftRightBy51(&r0)
c1, r1lo := shiftRightBy51(&r1)
c2, r2lo := shiftRightBy51(&r2)
c3, r3lo := shiftRightBy51(&r3)
c4, r4lo := shiftRightBy51(&r4)
// Mask every low word to 51 bits and add the carry from the limb below.
// The carry out of the top limb goes into limb 0 multiplied by 19.
maskAndAdd(r0lo, maskLow51Bits, c4, 19)
maskAndAdd(r1lo, maskLow51Bits, c0, 1)
maskAndAdd(r2lo, maskLow51Bits, c1, 1)
maskAndAdd(r3lo, maskLow51Bits, c2, 1)
maskAndAdd(r4lo, maskLow51Bits, c3, 1)
Comment("Second reduction chain (carryPropagate)")
// The carry registers are reused. All five carries are taken from the limbs
// before any of the limbs is masked again.
// c0 = r0 >> 51
MOVQ(r0lo, c0)
SHRQ(Imm(51), c0)
// c1 = r1 >> 51
MOVQ(r1lo, c1)
SHRQ(Imm(51), c1)
// c2 = r2 >> 51
MOVQ(r2lo, c2)
SHRQ(Imm(51), c2)
// c3 = r3 >> 51
MOVQ(r3lo, c3)
SHRQ(Imm(51), c3)
// c4 = r4 >> 51
MOVQ(r4lo, c4)
SHRQ(Imm(51), c4)
maskAndAdd(r0lo, maskLow51Bits, c4, 19)
maskAndAdd(r1lo, maskLow51Bits, c0, 1)
maskAndAdd(r2lo, maskLow51Bits, c1, 1)
maskAndAdd(r3lo, maskLow51Bits, c2, 1)
maskAndAdd(r4lo, maskLow51Bits, c3, 1)
Comment("Store output")
out := Dereference(Param("out"))
Store(r0lo, out.Field("l0"))
Store(r1lo, out.Field("l1"))
Store(r2lo, out.Field("l2"))
Store(r3lo, out.Field("l3"))
Store(r4lo, out.Field("l4"))
RET()
}
// feMul emits the instruction stream for the assembly function feMul.
//
// It builds five 128-bit accumulators r0..r4 from the limbs a0..a4 of a and
// b0..b4 of b, runs two reduction chains over them, and stores the five
// resulting limbs to out. The order of the calls below is the order of the
// generated instructions.
func feMul() {
TEXT("feMul", NOSPLIT, "func(out, a, b *fieldElement)")
Doc("feMul sets out = a * b. It works like feMulGeneric.")
Pragma("noescape")
// Give each limb of a and b a short name; the names appear in the emitted
// comments.
a := Dereference(Param("a"))
a0 := namedComponent{a.Field("l0"), "a0"}
a1 := namedComponent{a.Field("l1"), "a1"}
a2 := namedComponent{a.Field("l2"), "a2"}
a3 := namedComponent{a.Field("l3"), "a3"}
a4 := namedComponent{a.Field("l4"), "a4"}
b := Dereference(Param("b"))
b0 := namedComponent{b.Field("l0"), "b0"}
b1 := namedComponent{b.Field("l1"), "b1"}
b2 := namedComponent{b.Field("l2"), "b2"}
b3 := namedComponent{b.Field("l3"), "b3"}
b4 := namedComponent{b.Field("l4"), "b4"}
// r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1)
r0 := uint128{"r0", GP64(), GP64()}
mul64(r0, 1, a0, b0)
addMul64(r0, 19, a1, b4)
addMul64(r0, 19, a2, b3)
addMul64(r0, 19, a3, b2)
addMul64(r0, 19, a4, b1)
// r1 = a0×b1 + a1×b0 + 19×(a2×b4 + a3×b3 + a4×b2)
r1 := uint128{"r1", GP64(), GP64()}
mul64(r1, 1, a0, b1)
addMul64(r1, 1, a1, b0)
addMul64(r1, 19, a2, b4)
addMul64(r1, 19, a3, b3)
addMul64(r1, 19, a4, b2)
// r2 = a0×b2 + a1×b1 + a2×b0 + 19×(a3×b4 + a4×b3)
r2 := uint128{"r2", GP64(), GP64()}
mul64(r2, 1, a0, b2)
addMul64(r2, 1, a1, b1)
addMul64(r2, 1, a2, b0)
addMul64(r2, 19, a3, b4)
addMul64(r2, 19, a4, b3)
// r3 = a0×b3 + a1×b2 + a2×b1 + a3×b0 + 19×a4×b4
r3 := uint128{"r3", GP64(), GP64()}
mul64(r3, 1, a0, b3)
addMul64(r3, 1, a1, b2)
addMul64(r3, 1, a2, b1)
addMul64(r3, 1, a3, b0)
addMul64(r3, 19, a4, b4)
// r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0
r4 := uint128{"r4", GP64(), GP64()}
mul64(r4, 1, a0, b4)
addMul64(r4, 1, a1, b3)
addMul64(r4, 1, a2, b2)
addMul64(r4, 1, a3, b1)
addMul64(r4, 1, a4, b0)
Comment("First reduction chain")
maskLow51Bits := GP64()
MOVQ(Imm((1<<51)-1), maskLow51Bits)
// Split each accumulator into a carry (rN >> 51, left in the former high
// register) and its low word. The uint128 values are unusable afterwards.
c0, r0lo := shiftRightBy51(&r0)
c1, r1lo := shiftRightBy51(&r1)
c2, r2lo := shiftRightBy51(&r2)
c3, r3lo := shiftRightBy51(&r3)
c4, r4lo := shiftRightBy51(&r4)
// Mask every low word to 51 bits and add the carry from the limb below.
// The carry out of the top limb goes into limb 0 multiplied by 19.
maskAndAdd(r0lo, maskLow51Bits, c4, 19)
maskAndAdd(r1lo, maskLow51Bits, c0, 1)
maskAndAdd(r2lo, maskLow51Bits, c1, 1)
maskAndAdd(r3lo, maskLow51Bits, c2, 1)
maskAndAdd(r4lo, maskLow51Bits, c3, 1)
Comment("Second reduction chain (carryPropagate)")
// The carry registers are reused. All five carries are taken from the limbs
// before any of the limbs is masked again.
// c0 = r0 >> 51
MOVQ(r0lo, c0)
SHRQ(Imm(51), c0)
// c1 = r1 >> 51
MOVQ(r1lo, c1)
SHRQ(Imm(51), c1)
// c2 = r2 >> 51
MOVQ(r2lo, c2)
SHRQ(Imm(51), c2)
// c3 = r3 >> 51
MOVQ(r3lo, c3)
SHRQ(Imm(51), c3)
// c4 = r4 >> 51
MOVQ(r4lo, c4)
SHRQ(Imm(51), c4)
maskAndAdd(r0lo, maskLow51Bits, c4, 19)
maskAndAdd(r1lo, maskLow51Bits, c0, 1)
maskAndAdd(r2lo, maskLow51Bits, c1, 1)
maskAndAdd(r3lo, maskLow51Bits, c2, 1)
maskAndAdd(r4lo, maskLow51Bits, c3, 1)
Comment("Store output")
out := Dereference(Param("out"))
Store(r0lo, out.Field("l0"))
Store(r1lo, out.Field("l1"))
Store(r2lo, out.Field("l2"))
Store(r3lo, out.Field("l3"))
Store(r4lo, out.Field("l4"))
RET()
}
// mul64 emits code that sets r to i * aX * bX, where i must be 1 or 2.
//
// The product is formed in RDX:RAX by a one-operand MULQ and then copied
// into r. Any other value of i panics before anything is emitted.
func mul64(r uint128, i int, aX, bX namedComponent) {
	var header string
	switch i {
	case 1:
		header = fmt.Sprintf("%s = %s×%s", r, aX, bX)
	case 2:
		header = fmt.Sprintf("%s = 2×%s×%s", r, aX, bX)
	default:
		panic("unsupported i value")
	}

	Comment(header)
	Load(aX, RAX)
	if i == 2 {
		// Double the multiplicand with a shift.
		SHLQ(Imm(1), RAX)
	}

	MULQ(mustAddr(bX)) // RDX, RAX = RAX * bX
	MOVQ(RAX, r.lo)
	MOVQ(RDX, r.hi)
}
// addMul64 emits code that sets r to r + i * aX * bX.
//
// When i is 1 the limb aX is loaded directly into RAX. Otherwise it is loaded
// into a fresh virtual register and multiplied by the immediate i into RAX.
// The 128-bit product RDX:RAX is then added to r with carry.
func addMul64(r uint128, i uint64, aX, bX namedComponent) {
	if i == 1 {
		Comment(fmt.Sprintf("%s += %s×%s", r, aX, bX))
		Load(aX, RAX)
	} else {
		Comment(fmt.Sprintf("%s += %d×%s×%s", r, i, aX, bX))
		factor := Imm(i)
		scratch := GP64()
		limb := Load(aX, scratch)
		IMUL3Q(factor, limb, RAX)
	}

	MULQ(mustAddr(bX)) // RDX, RAX = RAX * bX
	ADDQ(RAX, r.lo)
	ADCQ(RDX, r.hi)
}
// shiftRightBy51 emits a double-register left shift by 64-51 bits that leaves
// r >> 51 in the register that held r.hi. It returns that register as out,
// together with the untouched low register as lo.
//
// The fields of r are cleared afterwards, so the uint128 must not be used
// again once this function has been called.
func shiftRightBy51(r *uint128) (out, lo GPVirtual) {
	high, low := r.hi, r.lo
	SHLQ(Imm(64-51), low, high)

	// Make sure the uint128 is unusable from here on.
	r.hi, r.lo = nil, nil

	return high, low
}
// maskAndAdd emits code that sets r = r&mask + c*i.
//
// When i is not 1, c itself is overwritten with c*i before being added.
func maskAndAdd(r, mask, c GPVirtual, i uint64) {
	ANDQ(mask, r)

	switch i {
	case 1:
		// No scaling needed; c is added as is.
	default:
		IMUL3Q(Imm(i), c, c)
	}

	ADDQ(c, r)
}
// mustAddr resolves c and returns the Addr of the result as an operand. It
// panics if the component cannot be resolved.
func mustAddr(c Component) Op {
	resolved, resolveErr := c.Resolve()
	if resolveErr != nil {
		panic(resolveErr)
	}

	return resolved.Addr
}

10
asm/go.mod Normal file
View file

@ -0,0 +1,10 @@
module filippo.io/edwards25519/asm
go 1.16
require (
filippo.io/edwards25519 v0.0.0
github.com/mmcloughlin/avo v0.0.0-20210104032911-599bdd1269f4
)
replace filippo.io/edwards25519 => ../

28
asm/go.sum Normal file
View file

@ -0,0 +1,28 @@
github.com/mmcloughlin/avo v0.0.0-20210104032911-599bdd1269f4 h1:ExoghBBFY7A3RzgkAOq0XbHs9zaT/bHq7xysgyp3z3Q=
github.com/mmcloughlin/avo v0.0.0-20210104032911-599bdd1269f4/go.mod h1:6aKT4zZIrpGqB3RpFU14ByCSSyKY6LfJz4J/JJChHfI=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
golang.org/x/arch v0.0.0-20201008161808-52c3e6f60cff/go.mod h1:flIaEI6LNU6xOCD5PaJvn9wGP0agmIOqjrtsKGRguv4=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto=
golang.org/x/mod v0.3.0 h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4=
golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA=
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM=
golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20201105001634-bc3cf281b174 h1:0rx0F4EjJNbxTuzWe0KjKcIzs+3VEb/Mrs/d1ciNz1c=
golang.org/x/tools v0.0.0-20201105001634-bc3cf281b174/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE=
golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4=

View file

@ -1,17 +1,13 @@
// Copyright (c) 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg edwards25519. DO NOT EDIT.
// +build amd64,gc,!purego
package edwards25519
// feMul sets out = a * b. It works like feMulGeneric.
//go:noescape
func feMul(out, a, b *fieldElement)
func feMul(out *fieldElement, a *fieldElement, b *fieldElement)
// feSquare sets out = a * a. It works like feSquareGeneric.
//go:noescape
func feSquare(out, x *fieldElement)
func (v *fieldElement) carryPropagate() *fieldElement {
return v.carryPropagateGeneric()
}
func feSquare(out *fieldElement, a *fieldElement)

View file

@ -1,348 +1,378 @@
// Copyright (c) 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg edwards25519. DO NOT EDIT.
// +build amd64,gc,!purego
// func feMul(out, a, b *fieldElement)
TEXT ·feMul(SB),$0-24
// Based on assembly generated by PeachPy. Equivalent to the Go in
// feMulGeneric, which was originally based on the amd64-51-30k
// assembly in SUPERCOP.
#include "textflag.h"
MOVQ a+8(FP), BX
MOVQ b+16(FP), CX
// func feMul(out *fieldElement, a *fieldElement, b *fieldElement)
TEXT ·feMul(SB), NOSPLIT, $0-24
MOVQ a+8(FP), CX
MOVQ b+16(FP), BX
// Calculate r0
MOVQ 0(BX), AX // rax <-- x0
MULQ 0(CX) // rdx, rax <-- x0*y0
MOVQ AX, SI // r00 = rax
MOVQ DX, DI // r01 = rdx
// r0 = a0×b0
MOVQ (CX), AX
MULQ (BX)
MOVQ AX, SI
MOVQ DX, BP
MOVQ 8(BX), DX // rdx <-- x1
IMUL3Q $19, DX, AX // rax <-- x1*19
MULQ 32(CX) // rdx, rax <-- x1_19*y4
ADDQ AX, SI // r00 += rax
ADCQ DX, DI // r01 += rdx
// r0 += 19×a1×b4
MOVQ 8(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 32(BX)
ADDQ AX, SI
ADCQ DX, BP
MOVQ 16(BX), DX // rdx <-- x2
IMUL3Q $19, DX, AX // rax <-- x2*19
MULQ 24(CX) // rdx, rax <-- x2_19*y3
ADDQ AX, SI // r00 += rax
ADCQ DX, DI // r01 += rdx
// r0 += 19×a2×b3
MOVQ 16(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 24(BX)
ADDQ AX, SI
ADCQ DX, BP
MOVQ 24(BX), DX // rdx <-- x3
IMUL3Q $19, DX, AX // rax <-- x3*19
MULQ 16(CX) // rdx, rax <-- x3_19 * y2
ADDQ AX, SI // r00 += rax
ADCQ DX, DI // r01 += rdx
// r0 += 19×a3×b2
MOVQ 24(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 16(BX)
ADDQ AX, SI
ADCQ DX, BP
MOVQ 32(BX), DX // rdx <-- x4
IMUL3Q $19, DX, AX // rax <-- x4*19
MULQ 8(CX) // rdx rax <-- x4_19*y1
ADDQ AX, SI // r00 += rax
ADCQ DX, DI // r01 += rdx
// r0 += 19×a4×b1
MOVQ 32(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 8(BX)
ADDQ AX, SI
ADCQ DX, BP
// Calculate r1
MOVQ 0(BX), AX
MULQ 8(CX)
MOVQ AX, R8 // r10
MOVQ DX, R9 // r11
// r1 = a0×b1
MOVQ (CX), AX
MULQ 8(BX)
MOVQ AX, R8
MOVQ DX, DI
MOVQ 8(BX), AX
MULQ 0(CX)
// r1 += a1×b0
MOVQ 8(CX), AX
MULQ (BX)
ADDQ AX, R8
ADCQ DX, DI
// r1 += 19×a2×b4
MOVQ 16(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 32(BX)
ADDQ AX, R8
ADCQ DX, DI
// r1 += 19×a3×b3
MOVQ 24(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 24(BX)
ADDQ AX, R8
ADCQ DX, DI
// r1 += 19×a4×b2
MOVQ 32(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 16(BX)
ADDQ AX, R8
ADCQ DX, DI
// r2 = a0×b2
MOVQ (CX), AX
MULQ 16(BX)
MOVQ AX, R10
MOVQ DX, R9
// r2 += a1×b1
MOVQ 8(CX), AX
MULQ 8(BX)
ADDQ AX, R10
ADCQ DX, R9
MOVQ 16(BX), DX
IMUL3Q $19, DX, AX
MULQ 32(CX)
ADDQ AX, R8
// r2 += a2×b0
MOVQ 16(CX), AX
MULQ (BX)
ADDQ AX, R10
ADCQ DX, R9
MOVQ 24(BX), DX
IMUL3Q $19, DX, AX
MULQ 24(CX)
ADDQ AX, R8
ADCQ DX, R9
// r2 += 19×a3×b4
MOVQ 24(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 32(BX)
ADDQ AX, R10
ADCQ DX, R9
MOVQ 32(BX), DX
IMUL3Q $19, DX, AX
MULQ 16(CX)
ADDQ AX, R8
ADCQ DX, R9
// r2 += 19×a4×b3
MOVQ 32(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 24(BX)
ADDQ AX, R10
ADCQ DX, R9
// Calculate r2
MOVQ 0(BX), AX
MULQ 16(CX)
MOVQ AX, R10 // r20
MOVQ DX, R11 // r21
// r3 = a0×b3
MOVQ (CX), AX
MULQ 24(BX)
MOVQ AX, R12
MOVQ DX, R11
MOVQ 8(BX), AX
MULQ 8(CX)
ADDQ AX, R10
// r3 += a1×b2
MOVQ 8(CX), AX
MULQ 16(BX)
ADDQ AX, R12
ADCQ DX, R11
MOVQ 16(BX), AX
MULQ 0(CX)
ADDQ AX, R10
// r3 += a2×b1
MOVQ 16(CX), AX
MULQ 8(BX)
ADDQ AX, R12
ADCQ DX, R11
MOVQ 24(BX), DX
IMUL3Q $19, DX, AX
MULQ 32(CX)
ADDQ AX, R10
// r3 += a3×b0
MOVQ 24(CX), AX
MULQ (BX)
ADDQ AX, R12
ADCQ DX, R11
MOVQ 32(BX), DX
IMUL3Q $19, DX, AX
MULQ 24(CX)
ADDQ AX, R10
ADCQ DX, R11
// r3 += 19×a4×b4
MOVQ 32(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 32(BX)
ADDQ AX, R12
ADCQ DX, R11
// Calculate r3
MOVQ 0(BX), AX
MULQ 24(CX)
MOVQ AX, R12 // r30
MOVQ DX, R13 // r31
// r4 = a0×b4
MOVQ (CX), AX
MULQ 32(BX)
MOVQ AX, R14
MOVQ DX, R13
MOVQ 8(BX), AX
MULQ 16(CX)
ADDQ AX, R12
// r4 += a1×b3
MOVQ 8(CX), AX
MULQ 24(BX)
ADDQ AX, R14
ADCQ DX, R13
MOVQ 16(BX), AX
MULQ 8(CX)
ADDQ AX, R12
// r4 += a2×b2
MOVQ 16(CX), AX
MULQ 16(BX)
ADDQ AX, R14
ADCQ DX, R13
MOVQ 24(BX), AX
MULQ 0(CX)
ADDQ AX, R12
// r4 += a3×b1
MOVQ 24(CX), AX
MULQ 8(BX)
ADDQ AX, R14
ADCQ DX, R13
MOVQ 32(BX), DX
IMUL3Q $19, DX, AX
MULQ 32(CX)
ADDQ AX, R12
// r4 += a4×b0
MOVQ 32(CX), AX
MULQ (BX)
ADDQ AX, R14
ADCQ DX, R13
// Calculate r4
MOVQ 0(BX), AX
MULQ 32(CX)
MOVQ AX, R14 // r40
MOVQ DX, R15 // r41
// First reduction chain
MOVQ $0x0007ffffffffffff, AX
SHLQ $0x0d, SI, BP
SHLQ $0x0d, R8, DI
SHLQ $0x0d, R10, R9
SHLQ $0x0d, R12, R11
SHLQ $0x0d, R14, R13
ANDQ AX, SI
IMUL3Q $0x13, R13, R13
ADDQ R13, SI
ANDQ AX, R8
ADDQ BP, R8
ANDQ AX, R10
ADDQ DI, R10
ANDQ AX, R12
ADDQ R9, R12
ANDQ AX, R14
ADDQ R11, R14
MOVQ 8(BX), AX
MULQ 24(CX)
ADDQ AX, R14
ADCQ DX, R15
// Second reduction chain (carryPropagate)
MOVQ SI, BP
SHRQ $0x33, BP
MOVQ R8, DI
SHRQ $0x33, DI
MOVQ R10, R9
SHRQ $0x33, R9
MOVQ R12, R11
SHRQ $0x33, R11
MOVQ R14, R13
SHRQ $0x33, R13
ANDQ AX, SI
IMUL3Q $0x13, R13, R13
ADDQ R13, SI
ANDQ AX, R8
ADDQ BP, R8
ANDQ AX, R10
ADDQ DI, R10
ANDQ AX, R12
ADDQ R9, R12
ANDQ AX, R14
ADDQ R11, R14
MOVQ 16(BX), AX
MULQ 16(CX)
ADDQ AX, R14
ADCQ DX, R15
MOVQ 24(BX), AX
MULQ 8(CX)
ADDQ AX, R14
ADCQ DX, R15
MOVQ 32(BX), AX
MULQ 0(CX)
ADDQ AX, R14
ADCQ DX, R15
MOVQ $2251799813685247, AX // (1<<51) - 1
SHLQ $13, SI, DI // r01 = shld with r00
ANDQ AX, SI // r00 &= mask51
SHLQ $13, R8, R9 // r11 = shld with r10
ANDQ AX, R8 // r10 &= mask51
ADDQ DI, R8 // r10 += r01
SHLQ $13, R10, R11 // r21 = shld with r20
ANDQ AX, R10 // r20 &= mask51
ADDQ R9, R10 // r20 += r11
SHLQ $13, R12, R13 // r31 = shld with r30
ANDQ AX, R12 // r30 &= mask51
ADDQ R11, R12 // r30 += r21
SHLQ $13, R14, R15 // r41 = shld with r40
ANDQ AX, R14 // r40 &= mask51
ADDQ R13, R14 // r40 += r31
IMUL3Q $19, R15, R15 // r41 = r41*19
ADDQ R15, SI // r00 += r41
MOVQ SI, DX // rdx <-- r00
SHRQ $51, DX // rdx <-- r00 >> 51
ADDQ DX, R8 // r10 += r00 >> 51
MOVQ R8, DX // rdx <-- r10
SHRQ $51, DX // rdx <-- r10 >> 51
ANDQ AX, SI // r00 &= mask51
ADDQ DX, R10 // r20 += r10 >> 51
MOVQ R10, DX // rdx <-- r20
SHRQ $51, DX // rdx <-- r20 >> 51
ANDQ AX, R8 // r10 &= mask51
ADDQ DX, R12 // r30 += r20 >> 51
MOVQ R12, DX // rdx <-- r30
SHRQ $51, DX // rdx <-- r30 >> 51
ANDQ AX, R10 // r20 &= mask51
ADDQ DX, R14 // r40 += r30 >> 51
MOVQ R14, DX // rdx <-- r40
SHRQ $51, DX // rdx <-- r40 >> 51
ANDQ AX, R12 // r30 &= mask51
IMUL3Q $19, DX, DX // rdx <-- (r40 >> 51) * 19
ADDQ DX, SI // r00 += (r40 >> 51) *19
ANDQ AX, R14 // r40 &= mask51
MOVQ out+0(FP), DI
MOVQ SI, 0(DI)
MOVQ R8, 8(DI)
MOVQ R10, 16(DI)
MOVQ R12, 24(DI)
MOVQ R14, 32(DI)
// Store output
MOVQ out+0(FP), AX
MOVQ SI, (AX)
MOVQ R8, 8(AX)
MOVQ R10, 16(AX)
MOVQ R12, 24(AX)
MOVQ R14, 32(AX)
RET
// func feSquare(out, x *fieldElement)
TEXT ·feSquare(SB),4,$0-16
MOVQ out+0(FP), DI
MOVQ x+8(FP), SI
// func feSquare(out *fieldElement, a *fieldElement)
TEXT ·feSquare(SB), NOSPLIT, $0-16
MOVQ a+8(FP), CX
// r0 = x0*x0 + x1*38*x4 + x2*38*x3
MOVQ 0(SI), AX
MULQ 0(SI)
MOVQ AX, CX // r00
MOVQ DX, R8 // r01
// r0 = l0×l0
MOVQ (CX), AX
MULQ (CX)
MOVQ AX, BP
MOVQ DX, BX
MOVQ 8(SI), DX
IMUL3Q $38, DX, AX
MULQ 32(SI)
ADDQ AX, CX
ADCQ DX, R8
// r0 += 38×l1×l4
MOVQ 8(CX), AX
IMUL3Q $0x26, AX, AX
MULQ 32(CX)
ADDQ AX, BP
ADCQ DX, BX
MOVQ 16(SI), DX
IMUL3Q $38, DX, AX
MULQ 24(SI)
ADDQ AX, CX
ADCQ DX, R8
// r0 += 38×l2×l3
MOVQ 16(CX), AX
IMUL3Q $0x26, AX, AX
MULQ 24(CX)
ADDQ AX, BP
ADCQ DX, BX
// r1 = x0*2*x1 + x2*38*x4 + x3*19*x3
MOVQ 0(SI), AX
SHLQ $1, AX
MULQ 8(SI)
MOVQ AX, R9 // r10
MOVQ DX, R10 // r11
// r1 = 2×l0×l1
MOVQ (CX), AX
SHLQ $0x01, AX
MULQ 8(CX)
MOVQ AX, DI
MOVQ DX, SI
MOVQ 16(SI), DX
IMUL3Q $38, DX, AX
MULQ 32(SI)
ADDQ AX, R9
ADCQ DX, R10
// r1 += 38×l2×l4
MOVQ 16(CX), AX
IMUL3Q $0x26, AX, AX
MULQ 32(CX)
ADDQ AX, DI
ADCQ DX, SI
MOVQ 24(SI), DX
IMUL3Q $19, DX, AX
MULQ 24(SI)
ADDQ AX, R9
ADCQ DX, R10
// r1 += 19×l3×l3
MOVQ 24(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 24(CX)
ADDQ AX, DI
ADCQ DX, SI
// r2 = x0*2*x2 + x1*x1 + x3*38*x4
MOVQ 0(SI), AX
SHLQ $1, AX
MULQ 16(SI)
MOVQ AX, R11 // r20
MOVQ DX, R12 // r21
// r2 = 2×l0×l2
MOVQ (CX), AX
SHLQ $0x01, AX
MULQ 16(CX)
MOVQ AX, R9
MOVQ DX, R8
MOVQ 8(SI), AX
MULQ 8(SI)
ADDQ AX, R11
ADCQ DX, R12
// r2 += l1×l1
MOVQ 8(CX), AX
MULQ 8(CX)
ADDQ AX, R9
ADCQ DX, R8
MOVQ 24(SI), DX
IMUL3Q $38, DX, AX
MULQ 32(SI)
ADDQ AX, R11
ADCQ DX, R12
// r2 += 38×l3×l4
MOVQ 24(CX), AX
IMUL3Q $0x26, AX, AX
MULQ 32(CX)
ADDQ AX, R9
ADCQ DX, R8
// r3 = x0*2*x3 + x1*2*x2 + x4*19*x4
MOVQ 0(SI), AX
SHLQ $1, AX
MULQ 24(SI)
MOVQ AX, R13 // r30
MOVQ DX, R14 // r31
// r3 = 2×l0×l3
MOVQ (CX), AX
SHLQ $0x01, AX
MULQ 24(CX)
MOVQ AX, R11
MOVQ DX, R10
MOVQ 8(SI), AX
SHLQ $1, AX
MULQ 16(SI)
ADDQ AX, R13
ADCQ DX, R14
// r3 += 2×l1×l2
MOVQ 8(CX), AX
IMUL3Q $0x02, AX, AX
MULQ 16(CX)
ADDQ AX, R11
ADCQ DX, R10
MOVQ 32(SI), DX
IMUL3Q $19, DX, AX
MULQ 32(SI)
ADDQ AX, R13
ADCQ DX, R14
// r3 += 19×l4×l4
MOVQ 32(CX), AX
IMUL3Q $0x13, AX, AX
MULQ 32(CX)
ADDQ AX, R11
ADCQ DX, R10
// r4 = x0*2*x4 + x1*2*x3 + x2*x2
MOVQ 0(SI), AX
SHLQ $1, AX
MULQ 32(SI)
MOVQ AX, R15 // r40
MOVQ DX, BX // r41
// r4 = 2×l0×l4
MOVQ (CX), AX
SHLQ $0x01, AX
MULQ 32(CX)
MOVQ AX, R13
MOVQ DX, R12
MOVQ 8(SI), AX
SHLQ $1, AX
MULQ 24(SI)
ADDQ AX, R15
ADCQ DX, BX
// r4 += 2×l1×l3
MOVQ 8(CX), AX
IMUL3Q $0x02, AX, AX
MULQ 24(CX)
ADDQ AX, R13
ADCQ DX, R12
MOVQ 16(SI), AX
MULQ 16(SI)
ADDQ AX, R15
ADCQ DX, BX
// r4 += l2×l2
MOVQ 16(CX), AX
MULQ 16(CX)
ADDQ AX, R13
ADCQ DX, R12
// Reduce
MOVQ $2251799813685247, AX // (1<<51) - 1
SHLQ $13, CX, R8 // r01 = shld with r00
ANDQ AX, CX // r00 &= mask51
SHLQ $13, R9, R10 // r11 = shld with r10
ANDQ AX, R9 // r10 &= mask51
ADDQ R8, R9 // r10 += r01
SHLQ $13, R11, R12 // r21 = shld with r20
ANDQ AX, R11 // r20 &= mask51
ADDQ R10, R11 // r20 += r11
SHLQ $13, R13, R14 // r31 = shld with r30
ANDQ AX, R13 // r30 &= mask51
ADDQ R12, R13 // r30 += r21
SHLQ $13, R15, BX // r41 = shld with r40
ANDQ AX, R15 // r40 &= mask51
ADDQ R14, R15 // r40 += r31
IMUL3Q $19, BX, DX // r41 = r41*19
ADDQ DX, CX // r00 += r41
// First reduction chain
MOVQ $0x0007ffffffffffff, AX
SHLQ $0x0d, BP, BX
SHLQ $0x0d, DI, SI
SHLQ $0x0d, R9, R8
SHLQ $0x0d, R11, R10
SHLQ $0x0d, R13, R12
ANDQ AX, BP
IMUL3Q $0x13, R12, R12
ADDQ R12, BP
ANDQ AX, DI
ADDQ BX, DI
ANDQ AX, R9
ADDQ SI, R9
ANDQ AX, R11
ADDQ R8, R11
ANDQ AX, R13
ADDQ R10, R13
MOVQ CX, DX // rdx <-- r00
SHRQ $51, DX // rdx <-- r00 >> 51
ADDQ DX, R9 // r10 += r00 >> 51
MOVQ R9, DX // rdx <-- r10
SHRQ $51, DX // rdx <-- r10 >> 51
ANDQ AX, CX // r00 &= mask51
ADDQ DX, R11 // r20 += r10 >> 51
MOVQ R11, DX // rdx <-- r20
SHRQ $51, DX // rdx <-- r20 >> 51
ANDQ AX, R9 // r10 &= mask51
ADDQ DX, R13 // r30 += r20 >> 51
MOVQ R13, DX // rdx <-- r30
SHRQ $51, DX // rdx <-- r30 >> 51
ANDQ AX, R11 // r20 &= mask51
ADDQ DX, R15 // r40 += r30 >> 51
MOVQ R15, DX // rdx <-- r40
SHRQ $51, DX // rdx <-- r40 >> 51
ANDQ AX, R13 // r30 &= mask51
IMUL3Q $19, DX, DX // rdx <-- (r40 >> 51) * 19
ADDQ DX, CX // r00 += (r40 >> 51) *19
ANDQ AX, R15 // r40 &= mask51
// Second reduction chain (carryPropagate)
MOVQ BP, BX
SHRQ $0x33, BX
MOVQ DI, SI
SHRQ $0x33, SI
MOVQ R9, R8
SHRQ $0x33, R8
MOVQ R11, R10
SHRQ $0x33, R10
MOVQ R13, R12
SHRQ $0x33, R12
ANDQ AX, BP
IMUL3Q $0x13, R12, R12
ADDQ R12, BP
ANDQ AX, DI
ADDQ BX, DI
ANDQ AX, R9
ADDQ SI, R9
ANDQ AX, R11
ADDQ R8, R11
ANDQ AX, R13
ADDQ R10, R13
MOVQ CX, 0(DI)
MOVQ R9, 8(DI)
MOVQ R11, 16(DI)
MOVQ R13, 24(DI)
MOVQ R15, 32(DI)
RET
// Store output
MOVQ out+0(FP), AX
MOVQ BP, (AX)
MOVQ DI, 8(AX)
MOVQ R9, 16(AX)
MOVQ R11, 24(AX)
MOVQ R13, 32(AX)
RET

View file

@ -2,14 +2,10 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !amd64,!arm64 !gc purego
// +build !amd64 !gc purego
package edwards25519
func feMul(v, x, y *fieldElement) { feMulGeneric(v, x, y) }
func feSquare(v, x *fieldElement) { feSquareGeneric(v, x) }
func (v *fieldElement) carryPropagate() *fieldElement {
return v.carryPropagateGeneric()
}

View file

@ -6,10 +6,6 @@
package edwards25519
func feMul(v, x, y *fieldElement) { feMulGeneric(v, x, y) }
func feSquare(v, x *fieldElement) { feSquareGeneric(v, x) }
//go:noescape
func carryPropagate(v *fieldElement)

11
fe_arm64_noasm.go Normal file
View file

@ -0,0 +1,11 @@
// Copyright (c) 2021 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !arm64 !gc purego
package edwards25519
// carryPropagate is the fallback used when this file's build constraints
// apply: it delegates to carryPropagateGeneric and returns its result.
func (v *fieldElement) carryPropagate() *fieldElement {
return v.carryPropagateGeneric()
}

View file

@ -26,15 +26,13 @@ func (v fieldElement) String() string {
var quickCheckConfig1024 = &quick.Config{MaxCountScale: 1 << 10}
func generateFieldElement(rand *mathrand.Rand) fieldElement {
// Generation strategy: generate random limb values of [52, 51, 51, 51, 51]
// bits, like the ones returned by lightReduce.
const maskLow52Bits = (1 << 52) - 1
return fieldElement{
rand.Uint64() & maskLow52Bits,
rand.Uint64() & maskLow51Bits,
rand.Uint64() & maskLow51Bits,
rand.Uint64() & maskLow51Bits,
rand.Uint64() & maskLow51Bits,
rand.Uint64() & maskLow52Bits,
rand.Uint64() & maskLow52Bits,
rand.Uint64() & maskLow52Bits,
rand.Uint64() & maskLow52Bits,
}
}
@ -524,3 +522,47 @@ func TestCarryPropagate(t *testing.T) {
t.Errorf("failed for {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff}")
}
}
// TestFeSquare checks, on randomly generated field elements, that feSquare
// produces exactly the same limbs as feSquareGeneric when squaring in place,
// and that the result satisfies isInBounds.
func TestFeSquare(t *testing.T) {
	asmLikeGeneric := func(a fieldElement) bool {
		want := a
		got := a

		// Both calls use the same variable as input and output, so the
		// aliased case is what gets exercised.
		feSquareGeneric(&want, &want)
		feSquare(&got, &got)

		if want != got {
			// feSquare is the implementation under test; the generic
			// version is the reference.
			t.Logf("got: %#v,\nexpected: %#v", got, want)
		}

		return want == got && isInBounds(&got)
	}

	if err := quick.Check(asmLikeGeneric, quickCheckConfig1024); err != nil {
		t.Error(err)
	}
}
// TestFeMul checks, on randomly generated pairs of field elements, that feMul
// produces exactly the same limbs as feMulGeneric when the output aliases the
// first input, that the second input is left identical in both runs, and that
// the values satisfy isInBounds.
func TestFeMul(t *testing.T) {
	asmLikeGeneric := func(a, b fieldElement) bool {
		wantA := a
		gotA := a
		wantB := b
		gotB := b

		// The output aliases the first operand in both calls.
		feMulGeneric(&wantA, &wantA, &wantB)
		feMul(&gotA, &gotA, &gotB)

		if wantA != gotA || wantB != gotB {
			// feMul is the implementation under test; the generic version
			// is the reference.
			t.Logf("got: %#v,\nexpected: %#v", gotA, wantA)
			t.Logf("got: %#v,\nexpected: %#v", gotB, wantB)
		}

		return wantA == gotA && isInBounds(&gotA) &&
			wantB == gotB && isInBounds(&gotB)
	}

	if err := quick.Check(asmLikeGeneric, quickCheckConfig1024); err != nil {
		t.Error(err)
	}
}