diff --git a/asm/fe_amd64_asm.go b/asm/fe_amd64_asm.go new file mode 100644 index 0000000..5f88d89 --- /dev/null +++ b/asm/fe_amd64_asm.go @@ -0,0 +1,296 @@ +// Copyright (c) 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package main + +import ( + "fmt" + + . "github.com/mmcloughlin/avo/build" + . "github.com/mmcloughlin/avo/gotypes" + . "github.com/mmcloughlin/avo/operand" + . "github.com/mmcloughlin/avo/reg" + + _ "filippo.io/edwards25519" +) + +//go:generate go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg edwards25519 + +func main() { + Package("filippo.io/edwards25519") + ConstraintExpr("amd64,gc,!purego") + feMul() + feSquare() + Generate() +} + +type namedComponent struct { + Component + name string +} + +func (c namedComponent) String() string { return c.name } + +type uint128 struct { + name string + hi, lo GPVirtual +} + +func (c uint128) String() string { return c.name } + +func feSquare() { + TEXT("feSquare", NOSPLIT, "func(out, a *fieldElement)") + Doc("feSquare sets out = a * a. It works like feSquareGeneric.") + Pragma("noescape") + + a := Dereference(Param("a")) + l0 := namedComponent{a.Field("l0"), "l0"} + l1 := namedComponent{a.Field("l1"), "l1"} + l2 := namedComponent{a.Field("l2"), "l2"} + l3 := namedComponent{a.Field("l3"), "l3"} + l4 := namedComponent{a.Field("l4"), "l4"} + + // r0 = l0×l0 + 19×2×(l1×l4 + l2×l3) + r0 := uint128{"r0", GP64(), GP64()} + mul64(r0, 1, l0, l0) + addMul64(r0, 38, l1, l4) + addMul64(r0, 38, l2, l3) + + // r1 = 2×l0×l1 + 19×2×l2×l4 + 19×l3×l3 + r1 := uint128{"r1", GP64(), GP64()} + mul64(r1, 2, l0, l1) + addMul64(r1, 38, l2, l4) + addMul64(r1, 19, l3, l3) + + // r2 = = 2×l0×l2 + l1×l1 + 19×2×l3×l4 + r2 := uint128{"r2", GP64(), GP64()} + mul64(r2, 2, l0, l2) + addMul64(r2, 1, l1, l1) + addMul64(r2, 38, l3, l4) + + // r3 = = 2×l0×l3 + 2×l1×l2 + 19×l4×l4 + r3 := uint128{"r3", GP64(), GP64()} + mul64(r3, 2, l0, l3) + addMul64(r3, 2, l1, l2) + addMul64(r3, 19, l4, l4) + + // r4 = = 2×l0×l4 + 2×l1×l3 + l2×l2 + r4 := uint128{"r4", GP64(), GP64()} + mul64(r4, 2, l0, l4) + addMul64(r4, 2, l1, l3) + addMul64(r4, 1, l2, l2) + + Comment("First reduction chain") + maskLow51Bits := GP64() + MOVQ(Imm((1<<51)-1), maskLow51Bits) + c0, r0lo := shiftRightBy51(&r0) + c1, r1lo := shiftRightBy51(&r1) + c2, r2lo := shiftRightBy51(&r2) + c3, r3lo := shiftRightBy51(&r3) + c4, r4lo := shiftRightBy51(&r4) + maskAndAdd(r0lo, maskLow51Bits, c4, 19) + maskAndAdd(r1lo, maskLow51Bits, c0, 1) + maskAndAdd(r2lo, maskLow51Bits, c1, 1) + maskAndAdd(r3lo, maskLow51Bits, c2, 1) + maskAndAdd(r4lo, maskLow51Bits, c3, 1) + + Comment("Second reduction chain (carryPropagate)") + // c0 = r0 >> 51 + MOVQ(r0lo, c0) + SHRQ(Imm(51), c0) + // c1 = r1 >> 51 + MOVQ(r1lo, c1) + SHRQ(Imm(51), c1) + // c2 = r2 >> 51 + MOVQ(r2lo, c2) + SHRQ(Imm(51), c2) + // c3 = r3 >> 51 + MOVQ(r3lo, c3) + SHRQ(Imm(51), c3) + // c4 = r4 >> 51 + MOVQ(r4lo, c4) + SHRQ(Imm(51), c4) + maskAndAdd(r0lo, maskLow51Bits, c4, 19) + maskAndAdd(r1lo, maskLow51Bits, c0, 1) + maskAndAdd(r2lo, maskLow51Bits, c1, 1) + maskAndAdd(r3lo, maskLow51Bits, c2, 1) + maskAndAdd(r4lo, maskLow51Bits, c3, 1) + + Comment("Store output") + out := Dereference(Param("out")) + Store(r0lo, out.Field("l0")) + Store(r1lo, out.Field("l1")) + Store(r2lo, out.Field("l2")) + Store(r3lo, out.Field("l3")) + Store(r4lo, out.Field("l4")) + + RET() +} + +func feMul() { + TEXT("feMul", NOSPLIT, "func(out, a, b *fieldElement)") + 
Doc("feMul sets out = a * b. It works like feMulGeneric.") + Pragma("noescape") + + a := Dereference(Param("a")) + a0 := namedComponent{a.Field("l0"), "a0"} + a1 := namedComponent{a.Field("l1"), "a1"} + a2 := namedComponent{a.Field("l2"), "a2"} + a3 := namedComponent{a.Field("l3"), "a3"} + a4 := namedComponent{a.Field("l4"), "a4"} + + b := Dereference(Param("b")) + b0 := namedComponent{b.Field("l0"), "b0"} + b1 := namedComponent{b.Field("l1"), "b1"} + b2 := namedComponent{b.Field("l2"), "b2"} + b3 := namedComponent{b.Field("l3"), "b3"} + b4 := namedComponent{b.Field("l4"), "b4"} + + // r0 = a0×b0 + 19×(a1×b4 + a2×b3 + a3×b2 + a4×b1) + r0 := uint128{"r0", GP64(), GP64()} + mul64(r0, 1, a0, b0) + addMul64(r0, 19, a1, b4) + addMul64(r0, 19, a2, b3) + addMul64(r0, 19, a3, b2) + addMul64(r0, 19, a4, b1) + + // r1 = a0×b1 + a1×b0 + 19×(a2×b4 + a3×b3 + a4×b2) + r1 := uint128{"r1", GP64(), GP64()} + mul64(r1, 1, a0, b1) + addMul64(r1, 1, a1, b0) + addMul64(r1, 19, a2, b4) + addMul64(r1, 19, a3, b3) + addMul64(r1, 19, a4, b2) + + // r2 = a0×b2 + a1×b1 + a2×b0 + 19×(a3×b4 + a4×b3) + r2 := uint128{"r2", GP64(), GP64()} + mul64(r2, 1, a0, b2) + addMul64(r2, 1, a1, b1) + addMul64(r2, 1, a2, b0) + addMul64(r2, 19, a3, b4) + addMul64(r2, 19, a4, b3) + + // r3 = a0×b3 + a1×b2 + a2×b1 + a3×b0 + 19×a4×b4 + r3 := uint128{"r3", GP64(), GP64()} + mul64(r3, 1, a0, b3) + addMul64(r3, 1, a1, b2) + addMul64(r3, 1, a2, b1) + addMul64(r3, 1, a3, b0) + addMul64(r3, 19, a4, b4) + + // r4 = a0×b4 + a1×b3 + a2×b2 + a3×b1 + a4×b0 + r4 := uint128{"r4", GP64(), GP64()} + mul64(r4, 1, a0, b4) + addMul64(r4, 1, a1, b3) + addMul64(r4, 1, a2, b2) + addMul64(r4, 1, a3, b1) + addMul64(r4, 1, a4, b0) + + Comment("First reduction chain") + maskLow51Bits := GP64() + MOVQ(Imm((1<<51)-1), maskLow51Bits) + c0, r0lo := shiftRightBy51(&r0) + c1, r1lo := shiftRightBy51(&r1) + c2, r2lo := shiftRightBy51(&r2) + c3, r3lo := shiftRightBy51(&r3) + c4, r4lo := shiftRightBy51(&r4) + maskAndAdd(r0lo, maskLow51Bits, c4, 19) + maskAndAdd(r1lo, maskLow51Bits, c0, 1) + maskAndAdd(r2lo, maskLow51Bits, c1, 1) + maskAndAdd(r3lo, maskLow51Bits, c2, 1) + maskAndAdd(r4lo, maskLow51Bits, c3, 1) + + Comment("Second reduction chain (carryPropagate)") + // c0 = r0 >> 51 + MOVQ(r0lo, c0) + SHRQ(Imm(51), c0) + // c1 = r1 >> 51 + MOVQ(r1lo, c1) + SHRQ(Imm(51), c1) + // c2 = r2 >> 51 + MOVQ(r2lo, c2) + SHRQ(Imm(51), c2) + // c3 = r3 >> 51 + MOVQ(r3lo, c3) + SHRQ(Imm(51), c3) + // c4 = r4 >> 51 + MOVQ(r4lo, c4) + SHRQ(Imm(51), c4) + maskAndAdd(r0lo, maskLow51Bits, c4, 19) + maskAndAdd(r1lo, maskLow51Bits, c0, 1) + maskAndAdd(r2lo, maskLow51Bits, c1, 1) + maskAndAdd(r3lo, maskLow51Bits, c2, 1) + maskAndAdd(r4lo, maskLow51Bits, c3, 1) + + Comment("Store output") + out := Dereference(Param("out")) + Store(r0lo, out.Field("l0")) + Store(r1lo, out.Field("l1")) + Store(r2lo, out.Field("l2")) + Store(r3lo, out.Field("l3")) + Store(r4lo, out.Field("l4")) + + RET() +} + +// mul64 sets r to i * aX * bX. +func mul64(r uint128, i int, aX, bX namedComponent) { + switch i { + case 1: + Comment(fmt.Sprintf("%s = %s×%s", r, aX, bX)) + Load(aX, RAX) + case 2: + Comment(fmt.Sprintf("%s = 2×%s×%s", r, aX, bX)) + Load(aX, RAX) + SHLQ(Imm(1), RAX) + default: + panic("unsupported i value") + } + MULQ(mustAddr(bX)) // RDX, RAX = RAX * bX + MOVQ(RAX, r.lo) + MOVQ(RDX, r.hi) +} + +// addMul64 sets r to r + i * aX * bX. 
+func addMul64(r uint128, i uint64, aX, bX namedComponent) { + switch i { + case 1: + Comment(fmt.Sprintf("%s += %s×%s", r, aX, bX)) + Load(aX, RAX) + default: + Comment(fmt.Sprintf("%s += %d×%s×%s", r, i, aX, bX)) + IMUL3Q(Imm(i), Load(aX, GP64()), RAX) + } + MULQ(mustAddr(bX)) // RDX, RAX = RAX * bX + ADDQ(RAX, r.lo) + ADCQ(RDX, r.hi) +} + +// shiftRightBy51 returns r >> 51 and r.lo. +// +// After this function is called, the uint128 may not be used anymore. +func shiftRightBy51(r *uint128) (out, lo GPVirtual) { + out = r.hi + lo = r.lo + SHLQ(Imm(64-51), r.lo, r.hi) + r.lo, r.hi = nil, nil // make sure the uint128 is unusable + return +} + +// maskAndAdd sets r = r&mask + c*i. +func maskAndAdd(r, mask, c GPVirtual, i uint64) { + ANDQ(mask, r) + if i != 1 { + IMUL3Q(Imm(i), c, c) + } + ADDQ(c, r) +} + +func mustAddr(c Component) Op { + b, err := c.Resolve() + if err != nil { + panic(err) + } + return b.Addr +} diff --git a/asm/go.mod b/asm/go.mod new file mode 100644 index 0000000..5c81220 --- /dev/null +++ b/asm/go.mod @@ -0,0 +1,10 @@ +module filippo.io/edwards25519/asm + +go 1.16 + +require ( + filippo.io/edwards25519 v0.0.0 + github.com/mmcloughlin/avo v0.0.0-20210104032911-599bdd1269f4 +) + +replace filippo.io/edwards25519 => ../ diff --git a/asm/go.sum b/asm/go.sum new file mode 100644 index 0000000..9a1fd78 --- /dev/null +++ b/asm/go.sum @@ -0,0 +1,28 @@ +github.com/mmcloughlin/avo v0.0.0-20210104032911-599bdd1269f4 h1:ExoghBBFY7A3RzgkAOq0XbHs9zaT/bHq7xysgyp3z3Q= +github.com/mmcloughlin/avo v0.0.0-20210104032911-599bdd1269f4/go.mod h1:6aKT4zZIrpGqB3RpFU14ByCSSyKY6LfJz4J/JJChHfI= +github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= +golang.org/x/arch v0.0.0-20201008161808-52c3e6f60cff/go.mod h1:flIaEI6LNU6xOCD5PaJvn9wGP0agmIOqjrtsKGRguv4= +golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= +golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI= +golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= +golang.org/x/mod v0.3.0 h1:RM4zey1++hCTbCVQfnWeKs9/IEsaBLA8vTkd0WVtmH4= +golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg= +golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= +golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU= +golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sync v0.0.0-20201020160332-67f06af15bc9/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= +golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= +golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= +golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= +golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= +golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 
+golang.org/x/tools v0.0.0-20201105001634-bc3cf281b174 h1:0rx0F4EjJNbxTuzWe0KjKcIzs+3VEb/Mrs/d1ciNz1c= +golang.org/x/tools v0.0.0-20201105001634-bc3cf281b174/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1 h1:go1bK/D/BFZV2I8cIQd1NKEZ+0owSTG1fDTci4IqFcE= +golang.org/x/xerrors v0.0.0-20200804184101-5ec99f83aff1/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= +rsc.io/pdf v0.1.1/go.mod h1:n8OzWcQ6Sp37PL01nO98y4iUCRdTGarVfzxY20ICaU4= diff --git a/fe_amd64.go b/fe_amd64.go index 9402b15..939a570 100644 --- a/fe_amd64.go +++ b/fe_amd64.go @@ -1,17 +1,13 @@ -// Copyright (c) 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg edwards25519. DO NOT EDIT. // +build amd64,gc,!purego package edwards25519 +// feMul sets out = a * b. It works like feMulGeneric. //go:noescape -func feMul(out, a, b *fieldElement) +func feMul(out *fieldElement, a *fieldElement, b *fieldElement) +// feSquare sets out = a * a. It works like feSquareGeneric. //go:noescape -func feSquare(out, x *fieldElement) - -func (v *fieldElement) carryPropagate() *fieldElement { - return v.carryPropagateGeneric() -} +func feSquare(out *fieldElement, a *fieldElement) diff --git a/fe_amd64.s b/fe_amd64.s index b164949..f5eea04 100644 --- a/fe_amd64.s +++ b/fe_amd64.s @@ -1,348 +1,378 @@ -// Copyright (c) 2017 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. +// Code generated by command: go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg edwards25519. DO NOT EDIT. // +build amd64,gc,!purego -// func feMul(out, a, b *fieldElement) -TEXT ·feMul(SB),$0-24 - // Based on assembly generated by PeachPy. Equivalent to the Go in - // feMulGeneric, which was originally based on the amd64-51-30k - // assembly in SUPERCOP. 
+#include "textflag.h" - MOVQ a+8(FP), BX - MOVQ b+16(FP), CX +// func feMul(out *fieldElement, a *fieldElement, b *fieldElement) +TEXT ·feMul(SB), NOSPLIT, $0-24 + MOVQ a+8(FP), CX + MOVQ b+16(FP), BX - // Calculate r0 - MOVQ 0(BX), AX // rax <-- x0 - MULQ 0(CX) // rdx, rax <-- x0*y0 - MOVQ AX, SI // r00 = rax - MOVQ DX, DI // r01 = rdx + // r0 = a0×b0 + MOVQ (CX), AX + MULQ (BX) + MOVQ AX, SI + MOVQ DX, BP - MOVQ 8(BX), DX // rdx <-- x1 - IMUL3Q $19, DX, AX // rax <-- x1*19 - MULQ 32(CX) // rdx, rax <-- x1_19*y4 - ADDQ AX, SI // r00 += rax - ADCQ DX, DI // r01 += rdx + // r0 += 19×a1×b4 + MOVQ 8(CX), AX + IMUL3Q $0x13, AX, AX + MULQ 32(BX) + ADDQ AX, SI + ADCQ DX, BP - MOVQ 16(BX), DX // rdx <-- x2 - IMUL3Q $19, DX, AX // rax <-- x2*19 - MULQ 24(CX) // rdx, rax <-- x2_19*y3 - ADDQ AX, SI // r00 += rax - ADCQ DX, DI // r01 += rdx + // r0 += 19×a2×b3 + MOVQ 16(CX), AX + IMUL3Q $0x13, AX, AX + MULQ 24(BX) + ADDQ AX, SI + ADCQ DX, BP - MOVQ 24(BX), DX // rdx <-- x3 - IMUL3Q $19, DX, AX // rax <-- x3*19 - MULQ 16(CX) // rdx, rax <-- x3_19 * y2 - ADDQ AX, SI // r00 += rax - ADCQ DX, DI // r01 += rdx + // r0 += 19×a3×b2 + MOVQ 24(CX), AX + IMUL3Q $0x13, AX, AX + MULQ 16(BX) + ADDQ AX, SI + ADCQ DX, BP - MOVQ 32(BX), DX // rdx <-- x4 - IMUL3Q $19, DX, AX // rax <-- x4*19 - MULQ 8(CX) // rdx rax <-- x4_19*y1 - ADDQ AX, SI // r00 += rax - ADCQ DX, DI // r01 += rdx + // r0 += 19×a4×b1 + MOVQ 32(CX), AX + IMUL3Q $0x13, AX, AX + MULQ 8(BX) + ADDQ AX, SI + ADCQ DX, BP - // Calculate r1 - MOVQ 0(BX), AX - MULQ 8(CX) - MOVQ AX, R8 // r10 - MOVQ DX, R9 // r11 + // r1 = a0×b1 + MOVQ (CX), AX + MULQ 8(BX) + MOVQ AX, R8 + MOVQ DX, DI - MOVQ 8(BX), AX - MULQ 0(CX) + // r1 += a1×b0 + MOVQ 8(CX), AX + MULQ (BX) ADDQ AX, R8 + ADCQ DX, DI + + // r1 += 19×a2×b4 + MOVQ 16(CX), AX + IMUL3Q $0x13, AX, AX + MULQ 32(BX) + ADDQ AX, R8 + ADCQ DX, DI + + // r1 += 19×a3×b3 + MOVQ 24(CX), AX + IMUL3Q $0x13, AX, AX + MULQ 24(BX) + ADDQ AX, R8 + ADCQ DX, DI + + // r1 += 19×a4×b2 + MOVQ 32(CX), AX + IMUL3Q $0x13, AX, AX + MULQ 16(BX) + ADDQ AX, R8 + ADCQ DX, DI + + // r2 = a0×b2 + MOVQ (CX), AX + MULQ 16(BX) + MOVQ AX, R10 + MOVQ DX, R9 + + // r2 += a1×b1 + MOVQ 8(CX), AX + MULQ 8(BX) + ADDQ AX, R10 ADCQ DX, R9 - MOVQ 16(BX), DX - IMUL3Q $19, DX, AX - MULQ 32(CX) - ADDQ AX, R8 + // r2 += a2×b0 + MOVQ 16(CX), AX + MULQ (BX) + ADDQ AX, R10 ADCQ DX, R9 - MOVQ 24(BX), DX - IMUL3Q $19, DX, AX - MULQ 24(CX) - ADDQ AX, R8 - ADCQ DX, R9 + // r2 += 19×a3×b4 + MOVQ 24(CX), AX + IMUL3Q $0x13, AX, AX + MULQ 32(BX) + ADDQ AX, R10 + ADCQ DX, R9 - MOVQ 32(BX), DX - IMUL3Q $19, DX, AX - MULQ 16(CX) - ADDQ AX, R8 - ADCQ DX, R9 + // r2 += 19×a4×b3 + MOVQ 32(CX), AX + IMUL3Q $0x13, AX, AX + MULQ 24(BX) + ADDQ AX, R10 + ADCQ DX, R9 - // Calculate r2 - MOVQ 0(BX), AX - MULQ 16(CX) - MOVQ AX, R10 // r20 - MOVQ DX, R11 // r21 + // r3 = a0×b3 + MOVQ (CX), AX + MULQ 24(BX) + MOVQ AX, R12 + MOVQ DX, R11 - MOVQ 8(BX), AX - MULQ 8(CX) - ADDQ AX, R10 + // r3 += a1×b2 + MOVQ 8(CX), AX + MULQ 16(BX) + ADDQ AX, R12 ADCQ DX, R11 - MOVQ 16(BX), AX - MULQ 0(CX) - ADDQ AX, R10 + // r3 += a2×b1 + MOVQ 16(CX), AX + MULQ 8(BX) + ADDQ AX, R12 ADCQ DX, R11 - MOVQ 24(BX), DX - IMUL3Q $19, DX, AX - MULQ 32(CX) - ADDQ AX, R10 + // r3 += a3×b0 + MOVQ 24(CX), AX + MULQ (BX) + ADDQ AX, R12 ADCQ DX, R11 - MOVQ 32(BX), DX - IMUL3Q $19, DX, AX - MULQ 24(CX) - ADDQ AX, R10 - ADCQ DX, R11 + // r3 += 19×a4×b4 + MOVQ 32(CX), AX + IMUL3Q $0x13, AX, AX + MULQ 32(BX) + ADDQ AX, R12 + ADCQ DX, R11 - // Calculate r3 - MOVQ 0(BX), AX - MULQ 24(CX) - MOVQ AX, R12 // r30 - MOVQ DX, R13 
// r31 + // r4 = a0×b4 + MOVQ (CX), AX + MULQ 32(BX) + MOVQ AX, R14 + MOVQ DX, R13 - MOVQ 8(BX), AX - MULQ 16(CX) - ADDQ AX, R12 + // r4 += a1×b3 + MOVQ 8(CX), AX + MULQ 24(BX) + ADDQ AX, R14 ADCQ DX, R13 - MOVQ 16(BX), AX - MULQ 8(CX) - ADDQ AX, R12 + // r4 += a2×b2 + MOVQ 16(CX), AX + MULQ 16(BX) + ADDQ AX, R14 ADCQ DX, R13 - MOVQ 24(BX), AX - MULQ 0(CX) - ADDQ AX, R12 + // r4 += a3×b1 + MOVQ 24(CX), AX + MULQ 8(BX) + ADDQ AX, R14 ADCQ DX, R13 - MOVQ 32(BX), DX - IMUL3Q $19, DX, AX - MULQ 32(CX) - ADDQ AX, R12 + // r4 += a4×b0 + MOVQ 32(CX), AX + MULQ (BX) + ADDQ AX, R14 ADCQ DX, R13 - // Calculate r4 - MOVQ 0(BX), AX - MULQ 32(CX) - MOVQ AX, R14 // r40 - MOVQ DX, R15 // r41 + // First reduction chain + MOVQ $0x0007ffffffffffff, AX + SHLQ $0x0d, SI, BP + SHLQ $0x0d, R8, DI + SHLQ $0x0d, R10, R9 + SHLQ $0x0d, R12, R11 + SHLQ $0x0d, R14, R13 + ANDQ AX, SI + IMUL3Q $0x13, R13, R13 + ADDQ R13, SI + ANDQ AX, R8 + ADDQ BP, R8 + ANDQ AX, R10 + ADDQ DI, R10 + ANDQ AX, R12 + ADDQ R9, R12 + ANDQ AX, R14 + ADDQ R11, R14 - MOVQ 8(BX), AX - MULQ 24(CX) - ADDQ AX, R14 - ADCQ DX, R15 + // Second reduction chain (carryPropagate) + MOVQ SI, BP + SHRQ $0x33, BP + MOVQ R8, DI + SHRQ $0x33, DI + MOVQ R10, R9 + SHRQ $0x33, R9 + MOVQ R12, R11 + SHRQ $0x33, R11 + MOVQ R14, R13 + SHRQ $0x33, R13 + ANDQ AX, SI + IMUL3Q $0x13, R13, R13 + ADDQ R13, SI + ANDQ AX, R8 + ADDQ BP, R8 + ANDQ AX, R10 + ADDQ DI, R10 + ANDQ AX, R12 + ADDQ R9, R12 + ANDQ AX, R14 + ADDQ R11, R14 - MOVQ 16(BX), AX - MULQ 16(CX) - ADDQ AX, R14 - ADCQ DX, R15 - - MOVQ 24(BX), AX - MULQ 8(CX) - ADDQ AX, R14 - ADCQ DX, R15 - - MOVQ 32(BX), AX - MULQ 0(CX) - ADDQ AX, R14 - ADCQ DX, R15 - - - MOVQ $2251799813685247, AX // (1<<51) - 1 - SHLQ $13, SI, DI // r01 = shld with r00 - ANDQ AX, SI // r00 &= mask51 - SHLQ $13, R8, R9 // r11 = shld with r10 - ANDQ AX, R8 // r10 &= mask51 - ADDQ DI, R8 // r10 += r01 - SHLQ $13, R10, R11 // r21 = shld with r20 - ANDQ AX, R10 // r20 &= mask51 - ADDQ R9, R10 // r20 += r11 - SHLQ $13, R12, R13 // r31 = shld with r30 - ANDQ AX, R12 // r30 &= mask51 - ADDQ R11, R12 // r30 += r21 - SHLQ $13, R14, R15 // r41 = shld with r40 - ANDQ AX, R14 // r40 &= mask51 - ADDQ R13, R14 // r40 += r31 - IMUL3Q $19, R15, R15 // r41 = r41*19 - ADDQ R15, SI // r00 += r41 - - MOVQ SI, DX // rdx <-- r00 - SHRQ $51, DX // rdx <-- r00 >> 51 - ADDQ DX, R8 // r10 += r00 >> 51 - MOVQ R8, DX // rdx <-- r10 - SHRQ $51, DX // rdx <-- r10 >> 51 - ANDQ AX, SI // r00 &= mask51 - ADDQ DX, R10 // r20 += r10 >> 51 - MOVQ R10, DX // rdx <-- r20 - SHRQ $51, DX // rdx <-- r20 >> 51 - ANDQ AX, R8 // r10 &= mask51 - ADDQ DX, R12 // r30 += r20 >> 51 - MOVQ R12, DX // rdx <-- r30 - SHRQ $51, DX // rdx <-- r30 >> 51 - ANDQ AX, R10 // r20 &= mask51 - ADDQ DX, R14 // r40 += r30 >> 51 - MOVQ R14, DX // rdx <-- r40 - SHRQ $51, DX // rdx <-- r40 >> 51 - ANDQ AX, R12 // r30 &= mask51 - IMUL3Q $19, DX, DX // rdx <-- (r40 >> 51) * 19 - ADDQ DX, SI // r00 += (r40 >> 51) *19 - ANDQ AX, R14 // r40 &= mask51 - - MOVQ out+0(FP), DI - MOVQ SI, 0(DI) - MOVQ R8, 8(DI) - MOVQ R10, 16(DI) - MOVQ R12, 24(DI) - MOVQ R14, 32(DI) + // Store output + MOVQ out+0(FP), AX + MOVQ SI, (AX) + MOVQ R8, 8(AX) + MOVQ R10, 16(AX) + MOVQ R12, 24(AX) + MOVQ R14, 32(AX) RET -// func feSquare(out, x *fieldElement) -TEXT ·feSquare(SB),4,$0-16 - MOVQ out+0(FP), DI - MOVQ x+8(FP), SI +// func feSquare(out *fieldElement, a *fieldElement) +TEXT ·feSquare(SB), NOSPLIT, $0-16 + MOVQ a+8(FP), CX - // r0 = x0*x0 + x1*38*x4 + x2*38*x3 - MOVQ 0(SI), AX - MULQ 0(SI) - MOVQ AX, CX // r00 - MOVQ DX, R8 // r01 + 
// r0 = l0×l0 + MOVQ (CX), AX + MULQ (CX) + MOVQ AX, BP + MOVQ DX, BX - MOVQ 8(SI), DX - IMUL3Q $38, DX, AX - MULQ 32(SI) - ADDQ AX, CX - ADCQ DX, R8 + // r0 += 38×l1×l4 + MOVQ 8(CX), AX + IMUL3Q $0x26, AX, AX + MULQ 32(CX) + ADDQ AX, BP + ADCQ DX, BX - MOVQ 16(SI), DX - IMUL3Q $38, DX, AX - MULQ 24(SI) - ADDQ AX, CX - ADCQ DX, R8 + // r0 += 38×l2×l3 + MOVQ 16(CX), AX + IMUL3Q $0x26, AX, AX + MULQ 24(CX) + ADDQ AX, BP + ADCQ DX, BX - // r1 = x0*2*x1 + x2*38*x4 + x3*19*x3 - MOVQ 0(SI), AX - SHLQ $1, AX - MULQ 8(SI) - MOVQ AX, R9 // r10 - MOVQ DX, R10 // r11 + // r1 = 2×l0×l1 + MOVQ (CX), AX + SHLQ $0x01, AX + MULQ 8(CX) + MOVQ AX, DI + MOVQ DX, SI - MOVQ 16(SI), DX - IMUL3Q $38, DX, AX - MULQ 32(SI) - ADDQ AX, R9 - ADCQ DX, R10 + // r1 += 38×l2×l4 + MOVQ 16(CX), AX + IMUL3Q $0x26, AX, AX + MULQ 32(CX) + ADDQ AX, DI + ADCQ DX, SI - MOVQ 24(SI), DX - IMUL3Q $19, DX, AX - MULQ 24(SI) - ADDQ AX, R9 - ADCQ DX, R10 + // r1 += 19×l3×l3 + MOVQ 24(CX), AX + IMUL3Q $0x13, AX, AX + MULQ 24(CX) + ADDQ AX, DI + ADCQ DX, SI - // r2 = x0*2*x2 + x1*x1 + x3*38*x4 - MOVQ 0(SI), AX - SHLQ $1, AX - MULQ 16(SI) - MOVQ AX, R11 // r20 - MOVQ DX, R12 // r21 + // r2 = 2×l0×l2 + MOVQ (CX), AX + SHLQ $0x01, AX + MULQ 16(CX) + MOVQ AX, R9 + MOVQ DX, R8 - MOVQ 8(SI), AX - MULQ 8(SI) - ADDQ AX, R11 - ADCQ DX, R12 + // r2 += l1×l1 + MOVQ 8(CX), AX + MULQ 8(CX) + ADDQ AX, R9 + ADCQ DX, R8 - MOVQ 24(SI), DX - IMUL3Q $38, DX, AX - MULQ 32(SI) - ADDQ AX, R11 - ADCQ DX, R12 + // r2 += 38×l3×l4 + MOVQ 24(CX), AX + IMUL3Q $0x26, AX, AX + MULQ 32(CX) + ADDQ AX, R9 + ADCQ DX, R8 - // r3 = x0*2*x3 + x1*2*x2 + x4*19*x4 - MOVQ 0(SI), AX - SHLQ $1, AX - MULQ 24(SI) - MOVQ AX, R13 // r30 - MOVQ DX, R14 // r31 + // r3 = 2×l0×l3 + MOVQ (CX), AX + SHLQ $0x01, AX + MULQ 24(CX) + MOVQ AX, R11 + MOVQ DX, R10 - MOVQ 8(SI), AX - SHLQ $1, AX - MULQ 16(SI) - ADDQ AX, R13 - ADCQ DX, R14 + // r3 += 2×l1×l2 + MOVQ 8(CX), AX + IMUL3Q $0x02, AX, AX + MULQ 16(CX) + ADDQ AX, R11 + ADCQ DX, R10 - MOVQ 32(SI), DX - IMUL3Q $19, DX, AX - MULQ 32(SI) - ADDQ AX, R13 - ADCQ DX, R14 + // r3 += 19×l4×l4 + MOVQ 32(CX), AX + IMUL3Q $0x13, AX, AX + MULQ 32(CX) + ADDQ AX, R11 + ADCQ DX, R10 - // r4 = x0*2*x4 + x1*2*x3 + x2*x2 - MOVQ 0(SI), AX - SHLQ $1, AX - MULQ 32(SI) - MOVQ AX, R15 // r40 - MOVQ DX, BX // r41 + // r4 = 2×l0×l4 + MOVQ (CX), AX + SHLQ $0x01, AX + MULQ 32(CX) + MOVQ AX, R13 + MOVQ DX, R12 - MOVQ 8(SI), AX - SHLQ $1, AX - MULQ 24(SI) - ADDQ AX, R15 - ADCQ DX, BX + // r4 += 2×l1×l3 + MOVQ 8(CX), AX + IMUL3Q $0x02, AX, AX + MULQ 24(CX) + ADDQ AX, R13 + ADCQ DX, R12 - MOVQ 16(SI), AX - MULQ 16(SI) - ADDQ AX, R15 - ADCQ DX, BX + // r4 += l2×l2 + MOVQ 16(CX), AX + MULQ 16(CX) + ADDQ AX, R13 + ADCQ DX, R12 - // Reduce - MOVQ $2251799813685247, AX // (1<<51) - 1 - SHLQ $13, CX, R8 // r01 = shld with r00 - ANDQ AX, CX // r00 &= mask51 - SHLQ $13, R9, R10 // r11 = shld with r10 - ANDQ AX, R9 // r10 &= mask51 - ADDQ R8, R9 // r10 += r01 - SHLQ $13, R11, R12 // r21 = shld with r20 - ANDQ AX, R11 // r20 &= mask51 - ADDQ R10, R11 // r20 += r11 - SHLQ $13, R13, R14 // r31 = shld with r30 - ANDQ AX, R13 // r30 &= mask51 - ADDQ R12, R13 // r30 += r21 - SHLQ $13, R15, BX // r41 = shld with r40 - ANDQ AX, R15 // r40 &= mask51 - ADDQ R14, R15 // r40 += r31 - IMUL3Q $19, BX, DX // r41 = r41*19 - ADDQ DX, CX // r00 += r41 + // First reduction chain + MOVQ $0x0007ffffffffffff, AX + SHLQ $0x0d, BP, BX + SHLQ $0x0d, DI, SI + SHLQ $0x0d, R9, R8 + SHLQ $0x0d, R11, R10 + SHLQ $0x0d, R13, R12 + ANDQ AX, BP + IMUL3Q $0x13, R12, R12 + ADDQ R12, BP + ANDQ AX, DI + ADDQ BX, DI + 
ANDQ AX, R9 + ADDQ SI, R9 + ANDQ AX, R11 + ADDQ R8, R11 + ANDQ AX, R13 + ADDQ R10, R13 - MOVQ CX, DX // rdx <-- r00 - SHRQ $51, DX // rdx <-- r00 >> 51 - ADDQ DX, R9 // r10 += r00 >> 51 - MOVQ R9, DX // rdx <-- r10 - SHRQ $51, DX // rdx <-- r10 >> 51 - ANDQ AX, CX // r00 &= mask51 - ADDQ DX, R11 // r20 += r10 >> 51 - MOVQ R11, DX // rdx <-- r20 - SHRQ $51, DX // rdx <-- r20 >> 51 - ANDQ AX, R9 // r10 &= mask51 - ADDQ DX, R13 // r30 += r20 >> 51 - MOVQ R13, DX // rdx <-- r30 - SHRQ $51, DX // rdx <-- r30 >> 51 - ANDQ AX, R11 // r20 &= mask51 - ADDQ DX, R15 // r40 += r30 >> 51 - MOVQ R15, DX // rdx <-- r40 - SHRQ $51, DX // rdx <-- r40 >> 51 - ANDQ AX, R13 // r30 &= mask51 - IMUL3Q $19, DX, DX // rdx <-- (r40 >> 51) * 19 - ADDQ DX, CX // r00 += (r40 >> 51) *19 - ANDQ AX, R15 // r40 &= mask51 + // Second reduction chain (carryPropagate) + MOVQ BP, BX + SHRQ $0x33, BX + MOVQ DI, SI + SHRQ $0x33, SI + MOVQ R9, R8 + SHRQ $0x33, R8 + MOVQ R11, R10 + SHRQ $0x33, R10 + MOVQ R13, R12 + SHRQ $0x33, R12 + ANDQ AX, BP + IMUL3Q $0x13, R12, R12 + ADDQ R12, BP + ANDQ AX, DI + ADDQ BX, DI + ANDQ AX, R9 + ADDQ SI, R9 + ANDQ AX, R11 + ADDQ R8, R11 + ANDQ AX, R13 + ADDQ R10, R13 - MOVQ CX, 0(DI) - MOVQ R9, 8(DI) - MOVQ R11, 16(DI) - MOVQ R13, 24(DI) - MOVQ R15, 32(DI) - RET + // Store output + MOVQ out+0(FP), AX + MOVQ BP, (AX) + MOVQ DI, 8(AX) + MOVQ R9, 16(AX) + MOVQ R11, 24(AX) + MOVQ R13, 32(AX) + RET diff --git a/fe_noasm.go b/fe_amd64_noasm.go similarity index 70% rename from fe_noasm.go rename to fe_amd64_noasm.go index 59c7bdd..53e702b 100644 --- a/fe_noasm.go +++ b/fe_amd64_noasm.go @@ -2,14 +2,10 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!arm64 !gc purego +// +build !amd64 !gc purego package edwards25519 func feMul(v, x, y *fieldElement) { feMulGeneric(v, x, y) } func feSquare(v, x *fieldElement) { feSquareGeneric(v, x) } - -func (v *fieldElement) carryPropagate() *fieldElement { - return v.carryPropagateGeneric() -} diff --git a/fe_arm64.go b/fe_arm64.go index 96cf368..f613ad5 100644 --- a/fe_arm64.go +++ b/fe_arm64.go @@ -6,10 +6,6 @@ package edwards25519 -func feMul(v, x, y *fieldElement) { feMulGeneric(v, x, y) } - -func feSquare(v, x *fieldElement) { feSquareGeneric(v, x) } - //go:noescape func carryPropagate(v *fieldElement) diff --git a/fe_arm64_noasm.go b/fe_arm64_noasm.go new file mode 100644 index 0000000..1a1df8a --- /dev/null +++ b/fe_arm64_noasm.go @@ -0,0 +1,11 @@ +// Copyright (c) 2021 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !arm64 !gc purego + +package edwards25519 + +func (v *fieldElement) carryPropagate() *fieldElement { + return v.carryPropagateGeneric() +} diff --git a/fe_test.go b/fe_test.go index 52bbe93..95a3a8e 100644 --- a/fe_test.go +++ b/fe_test.go @@ -26,15 +26,13 @@ func (v fieldElement) String() string { var quickCheckConfig1024 = &quick.Config{MaxCountScale: 1 << 10} func generateFieldElement(rand *mathrand.Rand) fieldElement { - // Generation strategy: generate random limb values of [52, 51, 51, 51, 51] - // bits, like the ones returned by lightReduce. 
const maskLow52Bits = (1 << 52) - 1 return fieldElement{ rand.Uint64() & maskLow52Bits, - rand.Uint64() & maskLow51Bits, - rand.Uint64() & maskLow51Bits, - rand.Uint64() & maskLow51Bits, - rand.Uint64() & maskLow51Bits, + rand.Uint64() & maskLow52Bits, + rand.Uint64() & maskLow52Bits, + rand.Uint64() & maskLow52Bits, + rand.Uint64() & maskLow52Bits, } } @@ -524,3 +522,47 @@ func TestCarryPropagate(t *testing.T) { t.Errorf("failed for {0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff}") } } + +func TestFeSquare(t *testing.T) { + asmLikeGeneric := func(a fieldElement) bool { + t1 := a + t2 := a + + feSquareGeneric(&t1, &t1) + feSquare(&t2, &t2) + + if t1 != t2 { + t.Logf("got: %#v,\nexpected: %#v", t1, t2) + } + + return t1 == t2 && isInBounds(&t2) + } + + if err := quick.Check(asmLikeGeneric, quickCheckConfig1024); err != nil { + t.Error(err) + } +} + +func TestFeMul(t *testing.T) { + asmLikeGeneric := func(a, b fieldElement) bool { + a1 := a + a2 := a + b1 := b + b2 := b + + feMulGeneric(&a1, &a1, &b1) + feMul(&a2, &a2, &b2) + + if a1 != a2 || b1 != b2 { + t.Logf("got: %#v,\nexpected: %#v", a1, a2) + t.Logf("got: %#v,\nexpected: %#v", b1, b2) + } + + return a1 == a2 && isInBounds(&a2) && + b1 == b2 && isInBounds(&b2) + } + + if err := quick.Check(asmLikeGeneric, quickCheckConfig1024); err != nil { + t.Error(err) + } +}
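
Note on regeneration and module layout: fe_amd64.s and fe_amd64.go are now generated output and carry "DO NOT EDIT" headers, so changes belong in asm/fe_amd64_asm.go; running go generate inside asm/ (which expands to go run fe_amd64_asm.go -out ../fe_amd64.s -stubs ../fe_amd64.go -pkg edwards25519, per the //go:generate directive) rewrites both files. The separate asm/go.mod with its replace filippo.io/edwards25519 => ../ directive, together with the blank import of the package in the generator, lets avo resolve the fieldElement struct layout for Dereference(Param("a")).Field("l0") while keeping avo out of the main module's dependency graph.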
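
For readers following the generated mul64/addMul64/shiftRightBy51/maskAndAdd sequences: the field is GF(2^255 - 19) with five 51-bit limbs, so any product term that spills past limb 4 wraps around with a factor of 19 (hence the 0x13 and 0x26 immediates), each r_i is accumulated as a 128-bit value, the first reduction chain folds those accumulators down to 64-bit limbs, and the second chain (carryPropagate) carries them back into the loose 52-bit bounds the rest of the package expects (see the 52-bit masks in generateFieldElement and isInBounds). Below is a minimal pure-Go sketch of those primitives using math/bits; it is illustrative only, not the package's feMulGeneric, and all names in it are local to the sketch.

package main

import (
	"fmt"
	"math/bits"
)

type uint128 struct{ lo, hi uint64 }

// mul64 returns a×b as a 128-bit value, the job of the generated MULQ sequences.
func mul64(a, b uint64) uint128 {
	hi, lo := bits.Mul64(a, b)
	return uint128{lo, hi}
}

// addMul64 returns v + a×b, the job of the generated MULQ/ADDQ/ADCQ sequences.
func addMul64(v uint128, a, b uint64) uint128 {
	hi, lo := bits.Mul64(a, b)
	lo, carry := bits.Add64(lo, v.lo, 0)
	hi, _ = bits.Add64(hi, v.hi, carry)
	return uint128{lo, hi}
}

// shiftRightBy51 returns v >> 51 truncated to 64 bits. The assembly gets the
// same result with SHLQ $13, lo, hi: shifting the low word's top 13 bits into
// the high word, since 64-51 = 13.
func shiftRightBy51(v uint128) uint64 {
	return v.hi<<(64-51) | v.lo>>51
}

func main() {
	const maskLow51Bits = (1 << 51) - 1

	// One slice of feMul's r0 = a0×b0 + 19×(a1×b4 + ...): cross terms that wrap
	// past limb 4 are pre-multiplied by 19, exactly like the IMUL3Q $0x13 above.
	a0, a1 := uint64(3), uint64(7)
	b0, b4 := uint64(5), uint64(11)
	r0 := mul64(a0, b0)
	r0 = addMul64(r0, 19*a1, b4)

	// One step of the reduction chains: keep the low 51 bits of r0 and carry
	// the rest into the next limb (or back into r0 scaled by 19 when the carry
	// comes out of r4).
	carry := shiftRightBy51(r0)
	l0 := r0.lo & maskLow51Bits
	fmt.Println(l0, carry)
}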
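
The new TestFeMul and TestFeSquare only check that the assembly matches the generic code. To also see the speedup it buys, a benchmark along these lines could sit next to fe_test.go; it is a hypothetical sketch, not part of this diff, and assumes only the unexported names already shown here (fieldElement, feMul, feMulGeneric).

package edwards25519

import "testing"

// BenchmarkFeMulAsmVsGeneric compares the generated assembly against the
// pure-Go fallback on the same inputs.
func BenchmarkFeMulAsmVsGeneric(b *testing.B) {
	x := fieldElement{1, 2, 3, 4, 5}
	y := fieldElement{5, 4, 3, 2, 1}
	var out fieldElement

	b.Run("asm", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			feMul(&out, &x, &y)
		}
	})
	b.Run("generic", func(b *testing.B) {
		for i := 0; i < b.N; i++ {
			feMulGeneric(&out, &x, &y)
		}
	})
}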