diff options
author | Andrew Bonventre <andybons@golang.org> | 2018-06-28 01:41:22 +0000 |
---|---|---|
committer | Andrew Bonventre <andybons@golang.org> | 2018-06-28 01:45:22 +0000 |
commit | ed333353a02721dc002c0b7a7c3ef5eb99166dfb (patch) | |
tree | b963fdb45afa268fdddcbc45530916ae66deb08b | |
parent | 11f1fab4dfe59f09f322b6493a2b9c5d0ae99bfa (diff) | |
download | go-ed333353a02721dc002c0b7a7c3ef5eb99166dfb.tar.gz go-ed333353a02721dc002c0b7a7c3ef5eb99166dfb.zip |
Revert "crypto/elliptic: implement P256 for arm64"
This reverts commit 0246915fbfcc41870173b7f016dc7fa9437bbc13.
Reason for revert: Broke darwin/arm64 builds.
Change-Id: Iead935d345c4776c0f823f4c152e02bdda308401
Reviewed-on: https://go-review.googlesource.com/121375
Reviewed-by: Andrew Bonventre <andybons@golang.org>
-rw-r--r-- | src/crypto/elliptic/p256.go | 2 | ||||
-rw-r--r-- | src/crypto/elliptic/p256_amd64.go (renamed from src/crypto/elliptic/p256_asm.go) | 2 | ||||
-rw-r--r-- | src/crypto/elliptic/p256_asm_arm64.s | 1522 | ||||
-rw-r--r-- | src/crypto/elliptic/p256_generic.go | 2 |
4 files changed, 3 insertions, 1525 deletions
diff --git a/src/crypto/elliptic/p256.go b/src/crypto/elliptic/p256.go index 80e123a734..bb9757355a 100644 --- a/src/crypto/elliptic/p256.go +++ b/src/crypto/elliptic/p256.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!arm64 +// +build !amd64 package elliptic diff --git a/src/crypto/elliptic/p256_asm.go b/src/crypto/elliptic/p256_amd64.go index 6cf7742e1b..30eb33a0d4 100644 --- a/src/crypto/elliptic/p256_asm.go +++ b/src/crypto/elliptic/p256_amd64.go @@ -10,7 +10,7 @@ // https://link.springer.com/article/10.1007%2Fs13389-014-0090-x // https://eprint.iacr.org/2013/816.pdf -// +build amd64 arm64 +// +build amd64 package elliptic diff --git a/src/crypto/elliptic/p256_asm_arm64.s b/src/crypto/elliptic/p256_asm_arm64.s deleted file mode 100644 index bc54ec04d2..0000000000 --- a/src/crypto/elliptic/p256_asm_arm64.s +++ /dev/null @@ -1,1522 +0,0 @@ -// Copyright 2018 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// This file contains constant-time, 64-bit assembly implementation of -// P256. The optimizations performed here are described in detail in: -// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with -// 256-bit primes" -// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x -// https://eprint.iacr.org/2013/816.pdf - -#include "textflag.h" - -#define res_ptr R0 -#define a_ptr R1 -#define b_ptr R2 - -#define acc0 R3 -#define acc1 R4 -#define acc2 R5 -#define acc3 R6 - -#define acc4 R7 -#define acc5 R8 -#define acc6 R9 -#define acc7 R10 -#define t0 R11 -#define t1 R12 -#define t2 R13 -#define t3 R14 -#define const0 R15 -#define const1 R16 - -#define hlp0 R17 -#define hlp1 R18 - -#define x0 R19 -#define x1 R20 -#define x2 R21 -#define x3 R22 -#define y0 R23 -#define y1 R24 -#define y2 R25 -#define y3 R26 - -#define const2 t2 -#define const3 t3 - -DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff -DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001 -DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f -DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551 -DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84 -DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff -DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000 -DATA p256one<>+0x00(SB)/8, $0x0000000000000001 -DATA p256one<>+0x08(SB)/8, $0xffffffff00000000 -DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff -DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe -GLOBL p256const0<>(SB), 8, $8 -GLOBL p256const1<>(SB), 8, $8 -GLOBL p256ordK0<>(SB), 8, $8 -GLOBL p256ord<>(SB), 8, $32 -GLOBL p256one<>(SB), 8, $32 - -/* ---------------------------------------*/ -// func p256LittleToBig(res []byte, in []uint64) -TEXT ·p256LittleToBig(SB),NOSPLIT,$0 - JMP ·p256BigToLittle(SB) -/* ---------------------------------------*/ -// func p256BigToLittle(res []uint64, in []byte) -TEXT ·p256BigToLittle(SB),NOSPLIT,$0 - MOVD res+0(FP), res_ptr - MOVD in+24(FP), a_ptr - - LDP 0*16(a_ptr), (acc0, acc1) - LDP 1*16(a_ptr), (acc2, acc3) - - REV acc0, acc0 - REV acc1, acc1 - REV acc2, acc2 - REV acc3, acc3 - - STP (acc3, acc2), 0*16(res_ptr) - STP (acc1, acc0), 1*16(res_ptr) - RET -/* ---------------------------------------*/ -// func p256MovCond(res, a, b []uint64, cond int) -// If cond == 0 res=b, else res=a -TEXT ·p256MovCond(SB),NOSPLIT,$0 - MOVD res+0(FP), res_ptr - MOVD a+24(FP), a_ptr - MOVD b+48(FP), b_ptr - MOVD cond+72(FP), R3 - - // Two remarks: - // 1) Will want to revisit NEON, when support is better - // 2) CSEL might not be constant time on all ARM processors - LDP 0*16(a_ptr), (R4, R5) - LDP 1*16(a_ptr), (R6, R7) - LDP 2*16(a_ptr), (R8, R9) - LDP 3*16(a_ptr), (R10, R11) - LDP 4*16(a_ptr), (R12, R13) - LDP 5*16(a_ptr), (R14, R15) - - LDP 0*16(b_ptr), (R16, R17) - LDP 1*16(b_ptr), (R18, R19) - LDP 2*16(b_ptr), (R20, R21) - LDP 3*16(b_ptr), (R22, R23) - LDP 4*16(b_ptr), (R24, R25) - LDP 5*16(b_ptr), (R26, R27) - - CMP $0, R3 - CSEL EQ, R16, R4, R4 - CSEL EQ, R17, R5, R5 - CSEL EQ, R18, R6, R6 - CSEL EQ, R19, R7, R7 - CSEL EQ, R20, R8, R8 - CSEL EQ, R21, R9, R9 - CSEL EQ, R22, R10, R10 - CSEL EQ, R23, R11, R11 - CSEL EQ, R24, R12, R12 - CSEL EQ, R25, R13, R13 - CSEL EQ, R26, R14, R14 - CSEL EQ, R27, R15, R15 - - STP (R4, R5), 0*16(res_ptr) - STP (R6, R7), 1*16(res_ptr) - STP (R8, R9), 2*16(res_ptr) - STP (R10, R11), 3*16(res_ptr) - STP (R12, R13), 4*16(res_ptr) - STP (R14, R15), 5*16(res_ptr) - - RET -/* ---------------------------------------*/ -// func p256NegCond(val []uint64, cond int) -TEXT ·p256NegCond(SB),NOSPLIT,$0 - MOVD val+0(FP), a_ptr - MOVD cond+24(FP), hlp0 - MOVD a_ptr, res_ptr - // acc = poly - MOVD $-1, acc0 - MOVD p256const0<>(SB), acc1 - MOVD $0, acc2 - MOVD p256const1<>(SB), acc3 - // Load the original value - LDP 0*16(a_ptr), (t0, t1) - LDP 1*16(a_ptr), (t2, t3) - // Speculatively subtract - SUBS t0, acc0 - SBCS t1, acc1 - SBCS t2, acc2 - SBC t3, acc3 - // If condition is 0, keep original value - CMP $0, hlp0 - CSEL EQ, t0, acc0, acc0 - CSEL EQ, t1, acc1, acc1 - CSEL EQ, t2, acc2, acc2 - CSEL EQ, t3, acc3, acc3 - // Store result - STP (acc0, acc1), 0*16(res_ptr) - STP (acc2, acc3), 1*16(res_ptr) - - RET -/* ---------------------------------------*/ -// func p256Sqr(res, in []uint64, n int) -TEXT ·p256Sqr(SB),NOSPLIT,$0 - MOVD res+0(FP), res_ptr - MOVD in+24(FP), a_ptr - MOVD n+48(FP), b_ptr - - MOVD p256const0<>(SB), const0 - MOVD p256const1<>(SB), const1 - - LDP 0*16(a_ptr), (x0, x1) - LDP 1*16(a_ptr), (x2, x3) - -sqrLoop: - SUB $1, b_ptr - CALL p256SqrInternal(SB) - MOVD y0, x0 - MOVD y1, x1 - MOVD y2, x2 - MOVD y3, x3 - CBNZ b_ptr, sqrLoop - - STP (y0, y1), 0*16(res_ptr) - STP (y2, y3), 1*16(res_ptr) - RET -/* ---------------------------------------*/ -// func p256Mul(res, in1, in2 []uint64) -TEXT ·p256Mul(SB),NOSPLIT,$0 - MOVD res+0(FP), res_ptr - MOVD in1+24(FP), a_ptr - MOVD in2+48(FP), b_ptr - - MOVD p256const0<>(SB), const0 - MOVD p256const1<>(SB), const1 - - LDP 0*16(a_ptr), (x0, x1) - LDP 1*16(a_ptr), (x2, x3) - - LDP 0*16(b_ptr), (y0, y1) - LDP 1*16(b_ptr), (y2, y3) - - CALL p256MulInternal(SB) - - STP (y0, y1), 0*16(res_ptr) - STP (y2, y3), 1*16(res_ptr) - RET -/* ---------------------------------------*/ -// func p256FromMont(res, in []uint64) -TEXT ·p256FromMont(SB),NOSPLIT,$0 - MOVD res+0(FP), res_ptr - MOVD in+24(FP), a_ptr - - MOVD p256const0<>(SB), const0 - MOVD p256const1<>(SB), const1 - - LDP 0*16(a_ptr), (acc0, acc1) - LDP 1*16(a_ptr), (acc2, acc3) - // Only reduce, no multiplications are needed - // First reduction step - ADDS acc0<<32, acc1, acc1 - LSR $32, acc0, t0 - MUL acc0, const1, t1 - UMULH acc0, const1, acc0 - ADCS t0, acc2 - ADCS t1, acc3 - ADC $0, acc0 - // Second reduction step - ADDS acc1<<32, acc2, acc2 - LSR $32, acc1, t0 - MUL acc1, const1, t1 - UMULH acc1, const1, acc1 - ADCS t0, acc3 - ADCS t1, acc0 - ADC $0, acc1 - // Third reduction step - ADDS acc2<<32, acc3, acc3 - LSR $32, acc2, t0 - MUL acc2, const1, t1 - UMULH acc2, const1, acc2 - ADCS t0, acc0 - ADCS t1, acc1 - ADC $0, acc2 - // Last reduction step - ADDS acc3<<32, acc0, acc0 - LSR $32, acc3, t0 - MUL acc3, const1, t1 - UMULH acc3, const1, acc3 - ADCS t0, acc1 - ADCS t1, acc2 - ADC $0, acc3 - - SUBS $-1, acc0, t0 - SBCS const0, acc1, t1 - SBCS $0, acc2, t2 - SBCS const1, acc3, t3 - - CSEL CS, t0, acc0, acc0 - CSEL CS, t1, acc1, acc1 - CSEL CS, t2, acc2, acc2 - CSEL CS, t3, acc3, acc3 - - STP (acc0, acc1), 0*16(res_ptr) - STP (acc2, acc3), 1*16(res_ptr) - - RET -/* ---------------------------------------*/ -// Constant time point access to arbitrary point table. -// Indexed from 1 to 15, with -1 offset -// (index 0 is implicitly point at infinity) -// func p256Select(point, table []uint64, idx int) -TEXT ·p256Select(SB),NOSPLIT,$0 - MOVD idx+48(FP), const0 - MOVD table+24(FP), b_ptr - MOVD point+0(FP), res_ptr - - EOR x0, x0, x0 - EOR x1, x1, x1 - EOR x2, x2, x2 - EOR x3, x3, x3 - EOR y0, y0, y0 - EOR y1, y1, y1 - EOR y2, y2, y2 - EOR y3, y3, y3 - EOR t0, t0, t0 - EOR t1, t1, t1 - EOR t2, t2, t2 - EOR t3, t3, t3 - - MOVD $0, const1 - -loop_select: - ADD $1, const1 - CMP const0, const1 - LDP.P 16(b_ptr), (acc0, acc1) - CSEL EQ, acc0, x0, x0 - CSEL EQ, acc1, x1, x1 - LDP.P 16(b_ptr), (acc2, acc3) - CSEL EQ, acc2, x2, x2 - CSEL EQ, acc3, x3, x3 - LDP.P 16(b_ptr), (acc4, acc5) - CSEL EQ, acc4, y0, y0 - CSEL EQ, acc5, y1, y1 - LDP.P 16(b_ptr), (acc6, acc7) - CSEL EQ, acc6, y2, y2 - CSEL EQ, acc7, y3, y3 - LDP.P 16(b_ptr), (acc0, acc1) - CSEL EQ, acc0, t0, t0 - CSEL EQ, acc1, t1, t1 - LDP.P 16(b_ptr), (acc2, acc3) - CSEL EQ, acc2, t2, t2 - CSEL EQ, acc3, t3, t3 - - CMP $16, const1 - BNE loop_select - - STP (x0, x1), 0*16(res_ptr) - STP (x2, x3), 1*16(res_ptr) - STP (y0, y1), 2*16(res_ptr) - STP (y2, y3), 3*16(res_ptr) - STP (t0, t1), 4*16(res_ptr) - STP (t2, t3), 5*16(res_ptr) - RET -/* ---------------------------------------*/ -// Constant time point access to base point table. -// func p256SelectBase(point, table []uint64, idx int) -TEXT ·p256SelectBase(SB),NOSPLIT,$0 - MOVD idx+48(FP), t0 - MOVD table+24(FP), t1 - MOVD point+0(FP), res_ptr - - EOR x0, x0, x0 - EOR x1, x1, x1 - EOR x2, x2, x2 - EOR x3, x3, x3 - EOR y0, y0, y0 - EOR y1, y1, y1 - EOR y2, y2, y2 - EOR y3, y3, y3 - - MOVD $0, t2 - -loop_select: - ADD $1, t2 - CMP t0, t2 - LDP.P 16(t1), (acc0, acc1) - CSEL EQ, acc0, x0, x0 - CSEL EQ, acc1, x1, x1 - LDP.P 16(t1), (acc2, acc3) - CSEL EQ, acc2, x2, x2 - CSEL EQ, acc3, x3, x3 - LDP.P 16(t1), (acc4, acc5) - CSEL EQ, acc4, y0, y0 - CSEL EQ, acc5, y1, y1 - LDP.P 16(t1), (acc6, acc7) - CSEL EQ, acc6, y2, y2 - CSEL EQ, acc7, y3, y3 - - CMP $32, t2 - BNE loop_select - - STP (x0, x1), 0*16(res_ptr) - STP (x2, x3), 1*16(res_ptr) - STP (y0, y1), 2*16(res_ptr) - STP (y2, y3), 3*16(res_ptr) - RET -/* ---------------------------------------*/ -// func p256OrdSqr(res, in []uint64, n int) -TEXT ·p256OrdSqr(SB),NOSPLIT,$0 - MOVD res+0(FP), res_ptr - MOVD in+24(FP), a_ptr - MOVD n+48(FP), b_ptr - - MOVD p256ordK0<>(SB), hlp1 - LDP p256ord<>+0x00(SB), (const0, const1) - LDP p256ord<>+0x10(SB), (const2, const3) - - LDP 0*16(a_ptr), (x0, x1) - LDP 1*16(a_ptr), (x2, x3) - -ordSqrLoop: - SUB $1, b_ptr - - // x[1:] * x[0] - MUL x0, x1, acc1 - UMULH x0, x1, acc2 - - MUL x0, x2, t0 - ADDS t0, acc2, acc2 - UMULH x0, x2, acc3 - - MUL x0, x3, t0 - ADCS t0, acc3, acc3 - UMULH x0, x3, acc4 - ADC $0, acc4, acc4 - // x[2:] * x[1] - MUL x1, x2, t0 - ADDS t0, acc3 - UMULH x1, x2, t1 - ADCS t1, acc4 - ADC $0, ZR, acc5 - - MUL x1, x3, t0 - ADDS t0, acc4 - UMULH x1, x3, t1 - ADC t1, acc5 - // x[3] * x[2] - MUL x2, x3, t0 - ADDS t0, acc5 - UMULH x2, x3, acc6 - ADC $0, acc6 - - MOVD $0, acc7 - // *2 - ADDS acc1, acc1 - ADCS acc2, acc2 - ADCS acc3, acc3 - ADCS acc4, acc4 - ADCS acc5, acc5 - ADCS acc6, acc6 - ADC $0, acc7 - // Missing products - MUL x0, x0, acc0 - UMULH x0, x0, t0 - ADDS t0, acc1, acc1 - - MUL x1, x1, t0 - ADCS t0, acc2, acc2 - UMULH x1, x1, t1 - ADCS t1, acc3, acc3 - - MUL x2, x2, t0 - ADCS t0, acc4, acc4 - UMULH x2, x2, t1 - ADCS t1, acc5, acc5 - - MUL x3, x3, t0 - ADCS t0, acc6, acc6 - UMULH x3, x3, t1 - ADC t1, acc7, acc7 - // First reduction step - MUL acc0, hlp1, hlp0 - - MUL const0, hlp1, t0 - ADDS t0, acc0, acc0 - UMULH const0, hlp0, t1 - - MUL const1, hlp0, t0 - ADCS t0, acc1, acc1 - UMULH const1, hlp0, y0 - - MUL const2, hlp0, t0 - ADCS t0, acc2, acc2 - UMULH const2, hlp0, acc0 - - MUL const3, hlp0, t0 - ADCS t0, acc3, acc3 - - UMULH const3, hlp0, hlp0 - ADC $0, hlp0 - - ADDS t1, acc1, acc1 - ADCS y0, acc2, acc2 - ADCS acc0, acc3, acc3 - ADC $0, hlp0, acc0 - // Second reduction step - MUL acc1, hlp1, hlp0 - - MUL const0, hlp1, t0 - ADDS t0, acc1, acc1 - UMULH const0, hlp0, t1 - - MUL const1, hlp0, t0 - ADCS t0, acc2, acc2 - UMULH const1, hlp0, y0 - - MUL const2, hlp0, t0 - ADCS t0, acc3, acc3 - UMULH const2, hlp0, acc1 - - MUL const3, hlp0, t0 - ADCS t0, acc0, acc0 - - UMULH const3, hlp0, hlp0 - ADC $0, hlp0 - - ADDS t1, acc2, acc2 - ADCS y0, acc3, acc3 - ADCS acc1, acc0, acc0 - ADC $0, hlp0, acc1 - // Third reduction step - MUL acc2, hlp1, hlp0 - - MUL const0, hlp1, t0 - ADDS t0, acc2, acc2 - UMULH const0, hlp0, t1 - - MUL const1, hlp0, t0 - ADCS t0, acc3, acc3 - UMULH const1, hlp0, y0 - - MUL const2, hlp0, t0 - ADCS t0, acc0, acc0 - UMULH const2, hlp0, acc2 - - MUL const3, hlp0, t0 - ADCS t0, acc1, acc1 - - UMULH const3, hlp0, hlp0 - ADC $0, hlp0 - - ADDS t1, acc3, acc3 - ADCS y0, acc0, acc0 - ADCS acc2, acc1, acc1 - ADC $0, hlp0, acc2 - - // Last reduction step - MUL acc3, hlp1, hlp0 - - MUL const0, hlp1, t0 - ADDS t0, acc3, acc3 - UMULH const0, hlp0, t1 - - MUL const1, hlp0, t0 - ADCS t0, acc0, acc0 - UMULH const1, hlp0, y0 - - MUL const2, hlp0, t0 - ADCS t0, acc1, acc1 - UMULH const2, hlp0, acc3 - - MUL const3, hlp0, t0 - ADCS t0, acc2, acc2 - - UMULH const3, hlp0, hlp0 - ADC $0, acc7 - - ADDS t1, acc0, acc0 - ADCS y0, acc1, acc1 - ADCS acc3, acc2, acc2 - ADC $0, hlp0, acc3 - - ADDS acc4, acc0, acc0 - ADCS acc5, acc1, acc1 - ADCS acc6, acc2, acc2 - ADCS acc7, acc3, acc3 - ADC $0, ZR, acc4 - - SUBS const0, acc0, y0 - SBCS const1, acc1, y1 - SBCS const2, acc2, y2 - SBCS const3, acc3, y3 - SBCS $0, acc4, acc4 - - CSEL CS, y0, acc0, x0 - CSEL CS, y1, acc1, x1 - CSEL CS, y2, acc2, x2 - CSEL CS, y3, acc3, x3 - - CBNZ b_ptr, ordSqrLoop - - STP (x0, x1), 0*16(res_ptr) - STP (x2, x3), 1*16(res_ptr) - - RET -/* ---------------------------------------*/ -// func p256OrdMul(res, in1, in2 []uint64) -TEXT ·p256OrdMul(SB),NOSPLIT,$0 - MOVD res+0(FP), res_ptr - MOVD in1+24(FP), a_ptr - MOVD in2+48(FP), b_ptr - - MOVD p256ordK0<>(SB), hlp1 - LDP p256ord<>+0x00(SB), (const0, const1) - LDP p256ord<>+0x10(SB), (const2, const3) - - LDP 0*16(a_ptr), (x0, x1) - LDP 1*16(a_ptr), (x2, x3) - LDP 0*16(b_ptr), (y0, y1) - LDP 1*16(b_ptr), (y2, y3) - - // y[0] * x - MUL y0, x0, acc0 - UMULH y0, x0, acc1 - - MUL y0, x1, t0 - ADDS t0, acc1 - UMULH y0, x1, acc2 - - MUL y0, x2, t0 - ADCS t0, acc2 - UMULH y0, x2, acc3 - - MUL y0, x3, t0 - ADCS t0, acc3 - UMULH y0, x3, acc4 - ADC $0, acc4 - // First reduction step - MUL acc0, hlp1, hlp0 - - MUL const0, hlp1, t0 - ADDS t0, acc0, acc0 - UMULH const0, hlp0, t1 - - MUL const1, hlp0, t0 - ADCS t0, acc1, acc1 - UMULH const1, hlp0, y0 - - MUL const2, hlp0, t0 - ADCS t0, acc2, acc2 - UMULH const2, hlp0, acc0 - - MUL const3, hlp0, t0 - ADCS t0, acc3, acc3 - - UMULH const3, hlp0, hlp0 - ADC $0, acc4 - - ADDS t1, acc1, acc1 - ADCS y0, acc2, acc2 - ADCS acc0, acc3, acc3 - ADC $0, hlp0, acc0 - // y[1] * x - MUL y1, x0, t0 - ADDS t0, acc1 - UMULH y1, x0, t1 - - MUL y1, x1, t0 - ADCS t0, acc2 - UMULH y1, x1, hlp0 - - MUL y1, x2, t0 - ADCS t0, acc3 - UMULH y1, x2, y0 - - MUL y1, x3, t0 - ADCS t0, acc4 - UMULH y1, x3, y1 - ADC $0, ZR, acc5 - - ADDS t1, acc2 - ADCS hlp0, acc3 - ADCS y0, acc4 - ADC y1, acc5 - // Second reduction step - MUL acc1, hlp1, hlp0 - - MUL const0, hlp1, t0 - ADDS t0, acc1, acc1 - UMULH const0, hlp0, t1 - - MUL const1, hlp0, t0 - ADCS t0, acc2, acc2 - UMULH const1, hlp0, y0 - - MUL const2, hlp0, t0 - ADCS t0, acc3, acc3 - UMULH const2, hlp0, acc1 - - MUL const3, hlp0, t0 - ADCS t0, acc0, acc0 - - UMULH const3, hlp0, hlp0 - ADC $0, acc5 - - ADDS t1, acc2, acc2 - ADCS y0, acc3, acc3 - ADCS acc1, acc0, acc0 - ADC $0, hlp0, acc1 - // y[2] * x - MUL y2, x0, t0 - ADDS t0, acc2 - UMULH y2, x0, t1 - - MUL y2, x1, t0 - ADCS t0, acc3 - UMULH y2, x1, hlp0 - - MUL y2, x2, t0 - ADCS t0, acc4 - UMULH y2, x2, y0 - - MUL y2, x3, t0 - ADCS t0, acc5 - UMULH y2, x3, y1 - ADC $0, ZR, acc6 - - ADDS t1, acc3 - ADCS hlp0, acc4 - ADCS y0, acc5 - ADC y1, acc6 - // Third reduction step - MUL acc2, hlp1, hlp0 - - MUL const0, hlp1, t0 - ADDS t0, acc2, acc2 - UMULH const0, hlp0, t1 - - MUL const1, hlp0, t0 - ADCS t0, acc3, acc3 - UMULH const1, hlp0, y0 - - MUL const2, hlp0, t0 - ADCS t0, acc0, acc0 - UMULH const2, hlp0, acc2 - - MUL const3, hlp0, t0 - ADCS t0, acc1, acc1 - - UMULH const3, hlp0, hlp0 - ADC $0, acc6 - - ADDS t1, acc3, acc3 - ADCS y0, acc0, acc0 - ADCS acc2, acc1, acc1 - ADC $0, hlp0, acc2 - // y[3] * x - MUL y3, x0, t0 - ADDS t0, acc3 - UMULH y3, x0, t1 - - MUL y3, x1, t0 - ADCS t0, acc4 - UMULH y3, x1, hlp0 - - MUL y3, x2, t0 - ADCS t0, acc5 - UMULH y3, x2, y0 - - MUL y3, x3, t0 - ADCS t0, acc6 - UMULH y3, x3, y1 - ADC $0, ZR, acc7 - - ADDS t1, acc4 - ADCS hlp0, acc5 - ADCS y0, acc6 - ADC y1, acc7 - // Last reduction step - MUL acc3, hlp1, hlp0 - - MUL const0, hlp1, t0 - ADDS t0, acc3, acc3 - UMULH const0, hlp0, t1 - - MUL const1, hlp0, t0 - ADCS t0, acc0, acc0 - UMULH const1, hlp0, y0 - - MUL const2, hlp0, t0 - ADCS t0, acc1, acc1 - UMULH const2, hlp0, acc3 - - MUL const3, hlp0, t0 - ADCS t0, acc2, acc2 - - UMULH const3, hlp0, hlp0 - ADC $0, acc7 - - ADDS t1, acc0, acc0 - ADCS y0, acc1, acc1 - ADCS acc3, acc2, acc2 - ADC $0, hlp0, acc3 - - ADDS acc4, acc0, acc0 - ADCS acc5, acc1, acc1 - ADCS acc6, acc2, acc2 - ADCS acc7, acc3, acc3 - ADC $0, ZR, acc4 - - SUBS const0, acc0, t0 - SBCS const1, acc1, t1 - SBCS const2, acc2, t2 - SBCS const3, acc3, t3 - SBCS $0, acc4, acc4 - - CSEL CS, t0, acc0, acc0 - CSEL CS, t1, acc1, acc1 - CSEL CS, t2, acc2, acc2 - CSEL CS, t3, acc3, acc3 - - STP (acc0, acc1), 0*16(res_ptr) - STP (acc2, acc3), 1*16(res_ptr) - - RET -/* ---------------------------------------*/ -TEXT p256SubInternal(SB),NOSPLIT,$0 - SUBS x0, y0, acc0 - SBCS x1, y1, acc1 - SBCS x2, y2, acc2 - SBCS x3, y3, acc3 - SBC $0, ZR, t0 - - ADDS $-1, acc0, acc4 - ADCS const0, acc1, acc5 - ADCS $0, acc2, acc6 - ADC const1, acc3, acc7 - - ANDS $1, t0 - CSEL EQ, acc0, acc4, x0 - CSEL EQ, acc1, acc5, x1 - CSEL EQ, acc2, acc6, x2 - CSEL EQ, acc3, acc7, x3 - - RET -/* ---------------------------------------*/ -TEXT p256SqrInternal(SB),NOSPLIT,$0 - // x[1:] * x[0] - MUL x0, x1, acc1 - UMULH x0, x1, acc2 - - MUL x0, x2, t0 - ADDS t0, acc2, acc2 - UMULH x0, x2, acc3 - - MUL x0, x3, t0 - ADCS t0, acc3, acc3 - UMULH x0, x3, acc4 - ADC $0, acc4, acc4 - // x[2:] * x[1] - MUL x1, x2, t0 - ADDS t0, acc3 - UMULH x1, x2, t1 - ADCS t1, acc4 - ADC $0, ZR, acc5 - - MUL x1, x3, t0 - ADDS t0, acc4 - UMULH x1, x3, t1 - ADC t1, acc5 - // x[3] * x[2] - MUL x2, x3, t0 - ADDS t0, acc5 - UMULH x2, x3, acc6 - ADC $0, acc6 - - MOVD $0, acc7 - // *2 - ADDS acc1, acc1 - ADCS acc2, acc2 - ADCS acc3, acc3 - ADCS acc4, acc4 - ADCS acc5, acc5 - ADCS acc6, acc6 - ADC $0, acc7 - // Missing products - MUL x0, x0, acc0 - UMULH x0, x0, t0 - ADDS t0, acc1, acc1 - - MUL x1, x1, t0 - ADCS t0, acc2, acc2 - UMULH x1, x1, t1 - ADCS t1, acc3, acc3 - - MUL x2, x2, t0 - ADCS t0, acc4, acc4 - UMULH x2, x2, t1 - ADCS t1, acc5, acc5 - - MUL x3, x3, t0 - ADCS t0, acc6, acc6 - UMULH x3, x3, t1 - ADCS t1, acc7, acc7 - // First reduction step - ADDS acc0<<32, acc1, acc1 - LSR $32, acc0, t0 - MUL acc0, const1, t1 - UMULH acc0, const1, acc0 - ADCS t0, acc2, acc2 - ADCS t1, acc3, acc3 - ADC $0, acc0, acc0 - // Second reduction step - ADDS acc1<<32, acc2, acc2 - LSR $32, acc1, t0 - MUL acc1, const1, t1 - UMULH acc1, const1, acc1 - ADCS t0, acc3, acc3 - ADCS t1, acc0, acc0 - ADC $0, acc1, acc1 - // Third reduction step - ADDS acc2<<32, acc3, acc3 - LSR $32, acc2, t0 - MUL acc2, const1, t1 - UMULH acc2, const1, acc2 - ADCS t0, acc0, acc0 - ADCS t1, acc1, acc1 - ADC $0, acc2, acc2 - // Last reduction step - ADDS acc3<<32, acc0, acc0 - LSR $32, acc3, t0 - MUL acc3, const1, t1 - UMULH acc3, const1, acc3 - ADCS t0, acc1, acc1 - ADCS t1, acc2, acc2 - ADC $0, acc3, acc3 - // Add bits [511:256] of the sqr result - ADDS acc4, acc0, acc0 - ADCS acc5, acc1, acc1 - ADCS acc6, acc2, acc2 - ADCS acc7, acc3, acc3 - ADC $0, ZR, acc4 - - SUBS $-1, acc0, t0 - SBCS const0, acc1, t1 - SBCS $0, acc2, t2 - SBCS const1, acc3, t3 - SBCS $0, acc4, acc4 - - CSEL CS, t0, acc0, y0 - CSEL CS, t1, acc1, y1 - CSEL CS, t2, acc2, y2 - CSEL CS, t3, acc3, y3 - RET -/* ---------------------------------------*/ -TEXT p256MulInternal(SB),NOSPLIT,$0 - // y[0] * x - MUL y0, x0, acc0 - UMULH y0, x0, acc1 - - MUL y0, x1, t0 - ADDS t0, acc1 - UMULH y0, x1, acc2 - - MUL y0, x2, t0 - ADCS t0, acc2 - UMULH y0, x2, acc3 - - MUL y0, x3, t0 - ADCS t0, acc3 - UMULH y0, x3, acc4 - ADC $0, acc4 - // First reduction step - ADDS acc0<<32, acc1, acc1 - LSR $32, acc0, t0 - MUL acc0, const1, t1 - UMULH acc0, const1, acc0 - ADCS t0, acc2 - ADCS t1, acc3 - ADC $0, acc0 - // y[1] * x - MUL y1, x0, t0 - ADDS t0, acc1 - UMULH y1, x0, t1 - - MUL y1, x1, t0 - ADCS t0, acc2 - UMULH y1, x1, t2 - - MUL y1, x2, t0 - ADCS t0, acc3 - UMULH y1, x2, t3 - - MUL y1, x3, t0 - ADCS t0, acc4 - UMULH y1, x3, hlp0 - ADC $0, ZR, acc5 - - ADDS t1, acc2 - ADCS t2, acc3 - ADCS t3, acc4 - ADC hlp0, acc5 - // Second reduction step - ADDS acc1<<32, acc2, acc2 - LSR $32, acc1, t0 - MUL acc1, const1, t1 - UMULH acc1, const1, acc1 - ADCS t0, acc3 - ADCS t1, acc0 - ADC $0, acc1 - // y[2] * x - MUL y2, x0, t0 - ADDS t0, acc2 - UMULH y2, x0, t1 - - MUL y2, x1, t0 - ADCS t0, acc3 - UMULH y2, x1, t2 - - MUL y2, x2, t0 - ADCS t0, acc4 - UMULH y2, x2, t3 - - MUL y2, x3, t0 - ADCS t0, acc5 - UMULH y2, x3, hlp0 - ADC $0, ZR, acc6 - - ADDS t1, acc3 - ADCS t2, acc4 - ADCS t3, acc5 - ADC hlp0, acc6 - // Third reduction step - ADDS acc2<<32, acc3, acc3 - LSR $32, acc2, t0 - MUL acc2, const1, t1 - UMULH acc2, const1, acc2 - ADCS t0, acc0 - ADCS t1, acc1 - ADC $0, acc2 - // y[3] * x - MUL y3, x0, t0 - ADDS t0, acc3 - UMULH y3, x0, t1 - - MUL y3, x1, t0 - ADCS t0, acc4 - UMULH y3, x1, t2 - - MUL y3, x2, t0 - ADCS t0, acc5 - UMULH y3, x2, t3 - - MUL y3, x3, t0 - ADCS t0, acc6 - UMULH y3, x3, hlp0 - ADC $0, ZR, acc7 - - ADDS t1, acc4 - ADCS t2, acc5 - ADCS t3, acc6 - ADC hlp0, acc7 - // Last reduction step - ADDS acc3<<32, acc0, acc0 - LSR $32, acc3, t0 - MUL acc3, const1, t1 - UMULH acc3, const1, acc3 - ADCS t0, acc1 - ADCS t1, acc2 - ADC $0, acc3 - // Add bits [511:256] of the mul result - ADDS acc4, acc0, acc0 - ADCS acc5, acc1, acc1 - ADCS acc6, acc2, acc2 - ADCS acc7, acc3, acc3 - ADC $0, ZR, acc4 - - SUBS $-1, acc0, t0 - SBCS const0, acc1, t1 - SBCS $0, acc2, t2 - SBCS const1, acc3, t3 - SBCS $0, acc4, acc4 - - CSEL CS, t0, acc0, y0 - CSEL CS, t1, acc1, y1 - CSEL CS, t2, acc2, y2 - CSEL CS, t3, acc3, y3 - RET -/* ---------------------------------------*/ -#define p256MulBy2Inline \ - ADDS y0, y0, x0; \ - ADCS y1, y1, x1; \ - ADCS y2, y2, x2; \ - ADCS y3, y3, x3; \ - ADC $0, ZR, hlp0; \ - SUBS $-1, x0, t0; \ - SBCS const0, x1, t1;\ - SBCS $0, x2, t2; \ - SBCS const1, x3, t3;\ - SBCS $0, hlp0, hlp0;\ - CSEL CC, x0, t0, x0;\ - CSEL CC, x1, t1, x1;\ - CSEL CC, x2, t2, x2;\ - CSEL CC, x3, t3, x3; -/* ---------------------------------------*/ -#define x1in(off) (off)(a_ptr) -#define y1in(off) (off + 32)(a_ptr) -#define z1in(off) (off + 64)(a_ptr) -#define x2in(off) (off)(b_ptr) -#define z2in(off) (off + 64)(b_ptr) -#define x3out(off) (off)(res_ptr) -#define y3out(off) (off + 32)(res_ptr) -#define z3out(off) (off + 64)(res_ptr) -#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3) -#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3) -#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16) -#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16) -/* ---------------------------------------*/ -#define y2in(off) (32*0 + 8 + off)(RSP) -#define s2(off) (32*1 + 8 + off)(RSP) -#define z1sqr(off) (32*2 + 8 + off)(RSP) -#define h(off) (32*3 + 8 + off)(RSP) -#define r(off) (32*4 + 8 + off)(RSP) -#define hsqr(off) (32*5 + 8 + off)(RSP) -#define rsqr(off) (32*6 + 8 + off)(RSP) -#define hcub(off) (32*7 + 8 + off)(RSP) - -#define z2sqr(off) (32*8 + 8 + off)(RSP) -#define s1(off) (32*9 + 8 + off)(RSP) -#define u1(off) (32*10 + 8 + off)(RSP) -#define u2(off) (32*11 + 8 + off)(RSP) - -// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int) -TEXT ·p256PointAddAffineAsm(SB),0,$264-96 - MOVD res+0(FP), res_ptr - MOVD in1+24(FP), a_ptr - MOVD in2+48(FP), b_ptr - MOVD sign+72(FP), hlp0 - MOVD sel+80(FP), hlp1 - MOVD zero+88(FP), t2 - - MOVD $1, t0 - CMP $0, t2 - CSEL EQ, ZR, t0, t2 - CMP $0, hlp1 - CSEL EQ, ZR, t0, hlp1 - - MOVD p256const0<>(SB), const0 - MOVD p256const1<>(SB), const1 - EOR t2<<1, hlp1 - - // Negate y2in based on sign - LDP 2*16(b_ptr), (y0, y1) - LDP 3*16(b_ptr), (y2, y3) - MOVD $-1, acc0 - - SUBS y0, acc0, acc0 - SBCS y1, const0, acc1 - SBCS y2, ZR, acc2 - SBCS y3, const1, acc3 - SBC $0, ZR, t0 - - ADDS $-1, acc0, acc4 - ADCS const0, acc1, acc5 - ADCS $0, acc2, acc6 - ADCS const1, acc3, acc7 - ADC $0, t0, t0 - - CMP $0, t0 - CSEL EQ, acc4, acc0, acc0 - CSEL EQ, acc5, acc1, acc1 - CSEL EQ, acc6, acc2, acc2 - CSEL EQ, acc7, acc3, acc3 - // If condition is 0, keep original value - CMP $0, hlp0 - CSEL EQ, y0, acc0, y0 - CSEL EQ, y1, acc1, y1 - CSEL EQ, y2, acc2, y2 - CSEL EQ, y3, acc3, y3 - // Store result - STy(y2in) - // Begin point add - LDx(z1in) - CALL p256SqrInternal(SB) // z1ˆ2 - STy(z1sqr) - - LDx(x2in) - CALL p256MulInternal(SB) // x2 * z1ˆ2 - - LDx(x1in) - CALL p256SubInternal(SB) // h = u2 - u1 - STx(h) - - LDy(z1in) - CALL p256MulInternal(SB) // z3 = h * z1 - - LDP 4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1 - LDP 5*16(a_ptr), (acc2, acc3) - ANDS $1, hlp1, ZR - CSEL EQ, acc0, y0, y0 - CSEL EQ, acc1, y1, y1 - CSEL EQ, acc2, y2, y2 - CSEL EQ, acc3, y3, y3 - LDP p256one<>+0x00(SB), (acc0, acc1) - LDP p256one<>+0x10(SB), (acc2, acc3) - ANDS $2, hlp1, ZR // iff select[1] == 0, z3 = 1 - CSEL EQ, acc0, y0, y0 - CSEL EQ, acc1, y1, y1 - CSEL EQ, acc2, y2, y2 - CSEL EQ, acc3, y3, y3 - LDx(z1in) - STy(z3out) - - LDy(z1sqr) - CALL p256MulInternal(SB) // z1 ^ 3 - - LDx(y2in) - CALL p256MulInternal(SB) // s2 = y2 * z1ˆ3 - STy(s2) - - LDx(y1in) - CALL p256SubInternal(SB) // r = s2 - s1 - STx(r) - - CALL p256SqrInternal(SB) // rsqr = rˆ2 - STy (rsqr) - - LDx(h) - CALL p256SqrInternal(SB) // hsqr = hˆ2 - STy(hsqr) - - CALL p256MulInternal(SB) // hcub = hˆ3 - STy(hcub) - - LDx(y1in) - CALL p256MulInternal(SB) // y1 * hˆ3 - STy(s2) - - LDP hsqr(0*8), (x0, x1) - LDP hsqr(2*8), (x2, x3) - LDP 0*16(a_ptr), (y0, y1) - LDP 1*16(a_ptr), (y2, y3) - CALL p256MulInternal(SB) // u1 * hˆ2 - STP (y0, y1), h(0*8) - STP (y2, y3), h(2*8) - - p256MulBy2Inline // u1 * hˆ2 * 2, inline - - LDy(rsqr) - CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 - - MOVD x0, y0 - MOVD x1, y1 - MOVD x2, y2 - MOVD x3, y3 - LDx(hcub) - CALL p256SubInternal(SB) - - LDP 0*16(a_ptr), (acc0, acc1) - LDP 1*16(a_ptr), (acc2, acc3) - ANDS $1, hlp1, ZR // iff select[0] == 0, x3 = x1 - CSEL EQ, acc0, x0, x0 - CSEL EQ, acc1, x1, x1 - CSEL EQ, acc2, x2, x2 - CSEL EQ, acc3, x3, x3 - LDP 0*16(b_ptr), (acc0, acc1) - LDP 1*16(b_ptr), (acc2, acc3) - ANDS $2, hlp1, ZR // iff select[1] == 0, x3 = x2 - CSEL EQ, acc0, x0, x0 - CSEL EQ, acc1, x1, x1 - CSEL EQ, acc2, x2, x2 - CSEL EQ, acc3, x3, x3 - STP (x0, x1), 0*16(res_ptr) - STP (x2, x3), 1*16(res_ptr) - - LDP h(0*8), (y0, y1) - LDP h(2*8), (y2, y3) - CALL p256SubInternal(SB) - - LDP r(0*8), (y0, y1) - LDP r(2*8), (y2, y3) - CALL p256MulInternal(SB) - - LDP s2(0*8), (x0, x1) - LDP s2(2*8), (x2, x3) - CALL p256SubInternal(SB) - LDP 2*16(a_ptr), (acc0, acc1) - LDP 3*16(a_ptr), (acc2, acc3) - ANDS $1, hlp1, ZR // iff select[0] == 0, y3 = y1 - CSEL EQ, acc0, x0, x0 - CSEL EQ, acc1, x1, x1 - CSEL EQ, acc2, x2, x2 - CSEL EQ, acc3, x3, x3 - LDP y2in(0*8), (acc0, acc1) - LDP y2in(2*8), (acc2, acc3) - ANDS $2, hlp1, ZR // iff select[1] == 0, y3 = y2 - CSEL EQ, acc0, x0, x0 - CSEL EQ, acc1, x1, x1 - CSEL EQ, acc2, x2, x2 - CSEL EQ, acc3, x3, x3 - STP (x0, x1), 2*16(res_ptr) - STP (x2, x3), 3*16(res_ptr) - - RET - -#define p256AddInline \ - ADDS y0, x0, x0; \ - ADCS y1, x1, x1; \ - ADCS y2, x2, x2; \ - ADCS y3, x3, x3; \ - ADC $0, ZR, hlp0; \ - SUBS $-1, x0, t0; \ - SBCS const0, x1, t1;\ - SBCS $0, x2, t2; \ - SBCS const1, x3, t3;\ - SBCS $0, hlp0, hlp0;\ - CSEL CC, x0, t0, x0;\ - CSEL CC, x1, t1, x1;\ - CSEL CC, x2, t2, x2;\ - CSEL CC, x3, t3, x3; - -#define s(off) (32*0 + 8 + off)(RSP) -#define m(off) (32*1 + 8 + off)(RSP) -#define zsqr(off) (32*2 + 8 + off)(RSP) -#define tmp(off) (32*3 + 8 + off)(RSP) - -//func p256PointDoubleAsm(res, in []uint64) -TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-48 - MOVD res+0(FP), res_ptr - MOVD in+24(FP), a_ptr - - MOVD p256const0<>(SB), const0 - MOVD p256const1<>(SB), const1 - - // Begin point double - LDP 4*16(a_ptr), (x0, x1) - LDP 5*16(a_ptr), (x2, x3) - CALL p256SqrInternal(SB) - STP (y0, y1), zsqr(0*8) - STP (y2, y3), zsqr(2*8) - - LDP 0*16(a_ptr), (x0, x1) - LDP 1*16(a_ptr), (x2, x3) - p256AddInline - STx(m) - - LDx(z1in) - LDy(y1in) - CALL p256MulInternal(SB) - p256MulBy2Inline - STx(z3out) - - LDy(x1in) - LDx(zsqr) - CALL p256SubInternal(SB) - LDy(m) - CALL p256MulInternal(SB) - - // Multiply by 3 - p256MulBy2Inline - p256AddInline - STx(m) - - LDy(y1in) - p256MulBy2Inline - CALL p256SqrInternal(SB) - STy(s) - MOVD y0, x0 - MOVD y1, x1 - MOVD y2, x2 - MOVD y3, x3 - CALL p256SqrInternal(SB) - - // Divide by 2 - ADDS $-1, y0, t0 - ADCS const0, y1, t1 - ADCS $0, y2, t2 - ADCS const1, y3, t3 - ADC $0, ZR, hlp0 - - ANDS $1, y0, ZR - CSEL EQ, y0, t0, t0 - CSEL EQ, y1, t1, t1 - CSEL EQ, y2, t2, t2 - CSEL EQ, y3, t3, t3 - AND y0, hlp0, hlp0 - - EXTR $1, t0, t1, y0 - EXTR $1, t1, t2, y1 - EXTR $1, t2, t3, y2 - EXTR $1, t3, hlp0, y3 - STy(y3out) - - LDx(x1in) - LDy(s) - CALL p256MulInternal(SB) - STy(s) - p256MulBy2Inline - STx(tmp) - - LDx(m) - CALL p256SqrInternal(SB) - LDx(tmp) - CALL p256SubInternal(SB) - - STx(x3out) - - LDy(s) - CALL p256SubInternal(SB) - - LDy(m) - CALL p256MulInternal(SB) - - LDx(y3out) - CALL p256SubInternal(SB) - STx(y3out) - RET -/* ---------------------------------------*/ -#undef y2in -#define y2in(off) (off + 32)(b_ptr) -//func p256PointAddAsm(res, in1, in2 []uint64) int -TEXT ·p256PointAddAsm(SB),0,$392-80 - // See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl - // Move input to stack in order to free registers - MOVD res+0(FP), res_ptr - MOVD in1+24(FP), a_ptr - MOVD in2+48(FP), b_ptr - - MOVD p256const0<>(SB), const0 - MOVD p256const1<>(SB), const1 - - // Begin point add - LDx(z2in) - CALL p256SqrInternal(SB) // z2^2 - STy(z2sqr) - - CALL p256MulInternal(SB) // z2^3 - - LDx(y1in) - CALL p256MulInternal(SB) // s1 = z2ˆ3*y1 - STy(s1) - - LDx(z1in) - CALL p256SqrInternal(SB) // z1^2 - STy(z1sqr) - - CALL p256MulInternal(SB) // z1^3 - - LDx(y2in) - CALL p256MulInternal(SB) // s2 = z1ˆ3*y2 - - LDx(s1) - CALL p256SubInternal(SB) // r = s2 - s1 - STx(r) - - MOVD $1, t2 - ORR x0, x1, t0 // Check if zero mod p256 - ORR x2, x3, t1 - ORR t1, t0, t0 - CMP $0, t0 - CSEL EQ, t2, ZR, hlp1 - - EOR $-1, x0, t0 - EOR const0, x1, t1 - EOR const1, x3, t3 - - ORR t0, t1, t0 - ORR x2, t3, t1 - ORR t1, t0, t0 - CMP $0, t0 - CSEL EQ, t2, hlp1, hlp1 - - LDx(z2sqr) - LDy(x1in) - CALL p256MulInternal(SB) // u1 = x1 * z2ˆ2 - STy(u1) - - LDx(z1sqr) - LDy(x2in) - CALL p256MulInternal(SB) // u2 = x2 * z1ˆ2 - STy(u2) - - LDx(u1) - CALL p256SubInternal(SB) // h = u2 - u1 - STx(h) - - MOVD $1, t2 - ORR x0, x1, t0 // Check if zero mod p256 - ORR x2, x3, t1 - ORR t1, t0, t0 - CMP $0, t0 - CSEL EQ, t2, ZR, hlp0 - - EOR $-1, x0, t0 - EOR const0, x1, t1 - EOR const1, x3, t3 - - ORR t0, t1, t0 - ORR x2, t3, t1 - ORR t1, t0, t0 - CMP $0, t0 - CSEL EQ, t2, hlp0, hlp0 - - AND hlp0, hlp1, hlp1 - - LDx(r) - CALL p256SqrInternal(SB) // rsqr = rˆ2 - STy(rsqr) - - LDx(h) - CALL p256SqrInternal(SB) // hsqr = hˆ2 - STy(hsqr) - - LDx(h) - CALL p256MulInternal(SB) // hcub = hˆ3 - STy(hcub) - - LDx(s1) - CALL p256MulInternal(SB) - STy(s2) - - LDx(z1in) - LDy(z2in) - CALL p256MulInternal(SB) // z1 * z2 - LDx(h) - CALL p256MulInternal(SB) // z1 * z2 * h - STy(z3out) - - LDx(hsqr) - LDy(u1) - CALL p256MulInternal(SB) // hˆ2 * u1 - STy(u2) - - p256MulBy2Inline // u1 * hˆ2 * 2, inline - LDy(rsqr) - CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2 - - MOVD x0, y0 - MOVD x1, y1 - MOVD x2, y2 - MOVD x3, y3 - LDx(hcub) - CALL p256SubInternal(SB) - STx(x3out) - - LDy(u2) - CALL p256SubInternal(SB) - - LDy(r) - CALL p256MulInternal(SB) - - LDx(s2) - CALL p256SubInternal(SB) - STx(y3out) - - MOVD hlp1, R0 - MOVD R0, ret+72(FP) - - RET diff --git a/src/crypto/elliptic/p256_generic.go b/src/crypto/elliptic/p256_generic.go index 9427331d52..9963fcafdd 100644 --- a/src/crypto/elliptic/p256_generic.go +++ b/src/crypto/elliptic/p256_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!s390x,!arm64 +// +build !amd64,!s390x package elliptic |