about summary refs log tree commit diff
diff options
context:
space:
mode:
author Andrew Bonventre <andybons@golang.org> 2018-06-28 01:41:22 +0000
committer Andrew Bonventre <andybons@golang.org> 2018-06-28 01:45:22 +0000
commit ed333353a02721dc002c0b7a7c3ef5eb99166dfb (patch)
tree b963fdb45afa268fdddcbc45530916ae66deb08b
parent 11f1fab4dfe59f09f322b6493a2b9c5d0ae99bfa (diff)
download go-ed333353a02721dc002c0b7a7c3ef5eb99166dfb.tar.gz
go-ed333353a02721dc002c0b7a7c3ef5eb99166dfb.zip
Revert "crypto/elliptic: implement P256 for arm64"

This reverts commit 0246915fbfcc41870173b7f016dc7fa9437bbc13.

Reason for revert: Broke darwin/arm64 builds.

Change-Id: Iead935d345c4776c0f823f4c152e02bdda308401
Reviewed-on: https://go-review.googlesource.com/121375
Reviewed-by: Andrew Bonventre <andybons@golang.org>
-rw-r--r-- src/crypto/elliptic/p256.go 2
-rw-r--r-- src/crypto/elliptic/p256_amd64.go (renamed from src/crypto/elliptic/p256_asm.go) 2
-rw-r--r-- src/crypto/elliptic/p256_asm_arm64.s 1522
-rw-r--r-- src/crypto/elliptic/p256_generic.go 2
4 files changed, 3 insertions, 1525 deletions
diff --git a/src/crypto/elliptic/p256.go b/src/crypto/elliptic/p256.go
index 80e123a734..bb9757355a 100644
--- a/src/crypto/elliptic/p256.go
+++ b/src/crypto/elliptic/p256.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64,!arm64
+// +build !amd64
package elliptic
diff --git a/src/crypto/elliptic/p256_asm.go b/src/crypto/elliptic/p256_amd64.go
index 6cf7742e1b..30eb33a0d4 100644
--- a/src/crypto/elliptic/p256_asm.go
+++ b/src/crypto/elliptic/p256_amd64.go
@@ -10,7 +10,7 @@
// https://link.springer.com/article/10.1007%2Fs13389-014-0090-x
// https://eprint.iacr.org/2013/816.pdf
-// +build amd64 arm64
+// +build amd64
package elliptic
diff --git a/src/crypto/elliptic/p256_asm_arm64.s b/src/crypto/elliptic/p256_asm_arm64.s
deleted file mode 100644
index bc54ec04d2..0000000000
--- a/src/crypto/elliptic/p256_asm_arm64.s
+++ /dev/null
@@ -1,1522 +0,0 @@
-// Copyright 2018 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This file contains constant-time, 64-bit assembly implementation of
-// P256. The optimizations performed here are described in detail in:
-// S.Gueron and V.Krasnov, "Fast prime field elliptic-curve cryptography with
-// 256-bit primes"
-// http://link.springer.com/article/10.1007%2Fs13389-014-0090-x
-// https://eprint.iacr.org/2013/816.pdf
-
-#include "textflag.h"
-
-#define res_ptr R0
-#define a_ptr R1
-#define b_ptr R2
-
-#define acc0 R3
-#define acc1 R4
-#define acc2 R5
-#define acc3 R6
-
-#define acc4 R7
-#define acc5 R8
-#define acc6 R9
-#define acc7 R10
-#define t0 R11
-#define t1 R12
-#define t2 R13
-#define t3 R14
-#define const0 R15
-#define const1 R16
-
-#define hlp0 R17
-#define hlp1 R18
-
-#define x0 R19
-#define x1 R20
-#define x2 R21
-#define x3 R22
-#define y0 R23
-#define y1 R24
-#define y2 R25
-#define y3 R26
-
-#define const2 t2
-#define const3 t3
-
-DATA p256const0<>+0x00(SB)/8, $0x00000000ffffffff
-DATA p256const1<>+0x00(SB)/8, $0xffffffff00000001
-DATA p256ordK0<>+0x00(SB)/8, $0xccd1c8aaee00bc4f
-DATA p256ord<>+0x00(SB)/8, $0xf3b9cac2fc632551
-DATA p256ord<>+0x08(SB)/8, $0xbce6faada7179e84
-DATA p256ord<>+0x10(SB)/8, $0xffffffffffffffff
-DATA p256ord<>+0x18(SB)/8, $0xffffffff00000000
-DATA p256one<>+0x00(SB)/8, $0x0000000000000001
-DATA p256one<>+0x08(SB)/8, $0xffffffff00000000
-DATA p256one<>+0x10(SB)/8, $0xffffffffffffffff
-DATA p256one<>+0x18(SB)/8, $0x00000000fffffffe
-GLOBL p256const0<>(SB), 8, $8
-GLOBL p256const1<>(SB), 8, $8
-GLOBL p256ordK0<>(SB), 8, $8
-GLOBL p256ord<>(SB), 8, $32
-GLOBL p256one<>(SB), 8, $32
-
-/* ---------------------------------------*/
-// func p256LittleToBig(res []byte, in []uint64)
-TEXT ·p256LittleToBig(SB),NOSPLIT,$0
- JMP ·p256BigToLittle(SB)
-/* ---------------------------------------*/
-// func p256BigToLittle(res []uint64, in []byte)
-TEXT ·p256BigToLittle(SB),NOSPLIT,$0
- MOVD res+0(FP), res_ptr
- MOVD in+24(FP), a_ptr
-
- LDP 0*16(a_ptr), (acc0, acc1)
- LDP 1*16(a_ptr), (acc2, acc3)
-
- REV acc0, acc0
- REV acc1, acc1
- REV acc2, acc2
- REV acc3, acc3
-
- STP (acc3, acc2), 0*16(res_ptr)
- STP (acc1, acc0), 1*16(res_ptr)
- RET
-/* ---------------------------------------*/
-// func p256MovCond(res, a, b []uint64, cond int)
-// If cond == 0 res=b, else res=a
-TEXT ·p256MovCond(SB),NOSPLIT,$0
- MOVD res+0(FP), res_ptr
- MOVD a+24(FP), a_ptr
- MOVD b+48(FP), b_ptr
- MOVD cond+72(FP), R3
-
- // Two remarks:
- // 1) Will want to revisit NEON, when support is better
- // 2) CSEL might not be constant time on all ARM processors
- LDP 0*16(a_ptr), (R4, R5)
- LDP 1*16(a_ptr), (R6, R7)
- LDP 2*16(a_ptr), (R8, R9)
- LDP 3*16(a_ptr), (R10, R11)
- LDP 4*16(a_ptr), (R12, R13)
- LDP 5*16(a_ptr), (R14, R15)
-
- LDP 0*16(b_ptr), (R16, R17)
- LDP 1*16(b_ptr), (R18, R19)
- LDP 2*16(b_ptr), (R20, R21)
- LDP 3*16(b_ptr), (R22, R23)
- LDP 4*16(b_ptr), (R24, R25)
- LDP 5*16(b_ptr), (R26, R27)
-
- CMP $0, R3
- CSEL EQ, R16, R4, R4
- CSEL EQ, R17, R5, R5
- CSEL EQ, R18, R6, R6
- CSEL EQ, R19, R7, R7
- CSEL EQ, R20, R8, R8
- CSEL EQ, R21, R9, R9
- CSEL EQ, R22, R10, R10
- CSEL EQ, R23, R11, R11
- CSEL EQ, R24, R12, R12
- CSEL EQ, R25, R13, R13
- CSEL EQ, R26, R14, R14
- CSEL EQ, R27, R15, R15
-
- STP (R4, R5), 0*16(res_ptr)
- STP (R6, R7), 1*16(res_ptr)
- STP (R8, R9), 2*16(res_ptr)
- STP (R10, R11), 3*16(res_ptr)
- STP (R12, R13), 4*16(res_ptr)
- STP (R14, R15), 5*16(res_ptr)
-
- RET
-/* ---------------------------------------*/
-// func p256NegCond(val []uint64, cond int)
-TEXT ·p256NegCond(SB),NOSPLIT,$0
- MOVD val+0(FP), a_ptr
- MOVD cond+24(FP), hlp0
- MOVD a_ptr, res_ptr
- // acc = poly
- MOVD $-1, acc0
- MOVD p256const0<>(SB), acc1
- MOVD $0, acc2
- MOVD p256const1<>(SB), acc3
- // Load the original value
- LDP 0*16(a_ptr), (t0, t1)
- LDP 1*16(a_ptr), (t2, t3)
- // Speculatively subtract
- SUBS t0, acc0
- SBCS t1, acc1
- SBCS t2, acc2
- SBC t3, acc3
- // If condition is 0, keep original value
- CMP $0, hlp0
- CSEL EQ, t0, acc0, acc0
- CSEL EQ, t1, acc1, acc1
- CSEL EQ, t2, acc2, acc2
- CSEL EQ, t3, acc3, acc3
- // Store result
- STP (acc0, acc1), 0*16(res_ptr)
- STP (acc2, acc3), 1*16(res_ptr)
-
- RET
-/* ---------------------------------------*/
-// func p256Sqr(res, in []uint64, n int)
-TEXT ·p256Sqr(SB),NOSPLIT,$0
- MOVD res+0(FP), res_ptr
- MOVD in+24(FP), a_ptr
- MOVD n+48(FP), b_ptr
-
- MOVD p256const0<>(SB), const0
- MOVD p256const1<>(SB), const1
-
- LDP 0*16(a_ptr), (x0, x1)
- LDP 1*16(a_ptr), (x2, x3)
-
-sqrLoop:
- SUB $1, b_ptr
- CALL p256SqrInternal(SB)
- MOVD y0, x0
- MOVD y1, x1
- MOVD y2, x2
- MOVD y3, x3
- CBNZ b_ptr, sqrLoop
-
- STP (y0, y1), 0*16(res_ptr)
- STP (y2, y3), 1*16(res_ptr)
- RET
-/* ---------------------------------------*/
-// func p256Mul(res, in1, in2 []uint64)
-TEXT ·p256Mul(SB),NOSPLIT,$0
- MOVD res+0(FP), res_ptr
- MOVD in1+24(FP), a_ptr
- MOVD in2+48(FP), b_ptr
-
- MOVD p256const0<>(SB), const0
- MOVD p256const1<>(SB), const1
-
- LDP 0*16(a_ptr), (x0, x1)
- LDP 1*16(a_ptr), (x2, x3)
-
- LDP 0*16(b_ptr), (y0, y1)
- LDP 1*16(b_ptr), (y2, y3)
-
- CALL p256MulInternal(SB)
-
- STP (y0, y1), 0*16(res_ptr)
- STP (y2, y3), 1*16(res_ptr)
- RET
-/* ---------------------------------------*/
-// func p256FromMont(res, in []uint64)
-TEXT ·p256FromMont(SB),NOSPLIT,$0
- MOVD res+0(FP), res_ptr
- MOVD in+24(FP), a_ptr
-
- MOVD p256const0<>(SB), const0
- MOVD p256const1<>(SB), const1
-
- LDP 0*16(a_ptr), (acc0, acc1)
- LDP 1*16(a_ptr), (acc2, acc3)
- // Only reduce, no multiplications are needed
- // First reduction step
- ADDS acc0<<32, acc1, acc1
- LSR $32, acc0, t0
- MUL acc0, const1, t1
- UMULH acc0, const1, acc0
- ADCS t0, acc2
- ADCS t1, acc3
- ADC $0, acc0
- // Second reduction step
- ADDS acc1<<32, acc2, acc2
- LSR $32, acc1, t0
- MUL acc1, const1, t1
- UMULH acc1, const1, acc1
- ADCS t0, acc3
- ADCS t1, acc0
- ADC $0, acc1
- // Third reduction step
- ADDS acc2<<32, acc3, acc3
- LSR $32, acc2, t0
- MUL acc2, const1, t1
- UMULH acc2, const1, acc2
- ADCS t0, acc0
- ADCS t1, acc1
- ADC $0, acc2
- // Last reduction step
- ADDS acc3<<32, acc0, acc0
- LSR $32, acc3, t0
- MUL acc3, const1, t1
- UMULH acc3, const1, acc3
- ADCS t0, acc1
- ADCS t1, acc2
- ADC $0, acc3
-
- SUBS $-1, acc0, t0
- SBCS const0, acc1, t1
- SBCS $0, acc2, t2
- SBCS const1, acc3, t3
-
- CSEL CS, t0, acc0, acc0
- CSEL CS, t1, acc1, acc1
- CSEL CS, t2, acc2, acc2
- CSEL CS, t3, acc3, acc3
-
- STP (acc0, acc1), 0*16(res_ptr)
- STP (acc2, acc3), 1*16(res_ptr)
-
- RET
-/* ---------------------------------------*/
-// Constant time point access to arbitrary point table.
-// Indexed from 1 to 15, with -1 offset
-// (index 0 is implicitly point at infinity)
-// func p256Select(point, table []uint64, idx int)
-TEXT ·p256Select(SB),NOSPLIT,$0
- MOVD idx+48(FP), const0
- MOVD table+24(FP), b_ptr
- MOVD point+0(FP), res_ptr
-
- EOR x0, x0, x0
- EOR x1, x1, x1
- EOR x2, x2, x2
- EOR x3, x3, x3
- EOR y0, y0, y0
- EOR y1, y1, y1
- EOR y2, y2, y2
- EOR y3, y3, y3
- EOR t0, t0, t0
- EOR t1, t1, t1
- EOR t2, t2, t2
- EOR t3, t3, t3
-
- MOVD $0, const1
-
-loop_select:
- ADD $1, const1
- CMP const0, const1
- LDP.P 16(b_ptr), (acc0, acc1)
- CSEL EQ, acc0, x0, x0
- CSEL EQ, acc1, x1, x1
- LDP.P 16(b_ptr), (acc2, acc3)
- CSEL EQ, acc2, x2, x2
- CSEL EQ, acc3, x3, x3
- LDP.P 16(b_ptr), (acc4, acc5)
- CSEL EQ, acc4, y0, y0
- CSEL EQ, acc5, y1, y1
- LDP.P 16(b_ptr), (acc6, acc7)
- CSEL EQ, acc6, y2, y2
- CSEL EQ, acc7, y3, y3
- LDP.P 16(b_ptr), (acc0, acc1)
- CSEL EQ, acc0, t0, t0
- CSEL EQ, acc1, t1, t1
- LDP.P 16(b_ptr), (acc2, acc3)
- CSEL EQ, acc2, t2, t2
- CSEL EQ, acc3, t3, t3
-
- CMP $16, const1
- BNE loop_select
-
- STP (x0, x1), 0*16(res_ptr)
- STP (x2, x3), 1*16(res_ptr)
- STP (y0, y1), 2*16(res_ptr)
- STP (y2, y3), 3*16(res_ptr)
- STP (t0, t1), 4*16(res_ptr)
- STP (t2, t3), 5*16(res_ptr)
- RET
-/* ---------------------------------------*/
-// Constant time point access to base point table.
-// func p256SelectBase(point, table []uint64, idx int)
-TEXT ·p256SelectBase(SB),NOSPLIT,$0
- MOVD idx+48(FP), t0
- MOVD table+24(FP), t1
- MOVD point+0(FP), res_ptr
-
- EOR x0, x0, x0
- EOR x1, x1, x1
- EOR x2, x2, x2
- EOR x3, x3, x3
- EOR y0, y0, y0
- EOR y1, y1, y1
- EOR y2, y2, y2
- EOR y3, y3, y3
-
- MOVD $0, t2
-
-loop_select:
- ADD $1, t2
- CMP t0, t2
- LDP.P 16(t1), (acc0, acc1)
- CSEL EQ, acc0, x0, x0
- CSEL EQ, acc1, x1, x1
- LDP.P 16(t1), (acc2, acc3)
- CSEL EQ, acc2, x2, x2
- CSEL EQ, acc3, x3, x3
- LDP.P 16(t1), (acc4, acc5)
- CSEL EQ, acc4, y0, y0
- CSEL EQ, acc5, y1, y1
- LDP.P 16(t1), (acc6, acc7)
- CSEL EQ, acc6, y2, y2
- CSEL EQ, acc7, y3, y3
-
- CMP $32, t2
- BNE loop_select
-
- STP (x0, x1), 0*16(res_ptr)
- STP (x2, x3), 1*16(res_ptr)
- STP (y0, y1), 2*16(res_ptr)
- STP (y2, y3), 3*16(res_ptr)
- RET
-/* ---------------------------------------*/
-// func p256OrdSqr(res, in []uint64, n int)
-TEXT ·p256OrdSqr(SB),NOSPLIT,$0
- MOVD res+0(FP), res_ptr
- MOVD in+24(FP), a_ptr
- MOVD n+48(FP), b_ptr
-
- MOVD p256ordK0<>(SB), hlp1
- LDP p256ord<>+0x00(SB), (const0, const1)
- LDP p256ord<>+0x10(SB), (const2, const3)
-
- LDP 0*16(a_ptr), (x0, x1)
- LDP 1*16(a_ptr), (x2, x3)
-
-ordSqrLoop:
- SUB $1, b_ptr
-
- // x[1:] * x[0]
- MUL x0, x1, acc1
- UMULH x0, x1, acc2
-
- MUL x0, x2, t0
- ADDS t0, acc2, acc2
- UMULH x0, x2, acc3
-
- MUL x0, x3, t0
- ADCS t0, acc3, acc3
- UMULH x0, x3, acc4
- ADC $0, acc4, acc4
- // x[2:] * x[1]
- MUL x1, x2, t0
- ADDS t0, acc3
- UMULH x1, x2, t1
- ADCS t1, acc4
- ADC $0, ZR, acc5
-
- MUL x1, x3, t0
- ADDS t0, acc4
- UMULH x1, x3, t1
- ADC t1, acc5
- // x[3] * x[2]
- MUL x2, x3, t0
- ADDS t0, acc5
- UMULH x2, x3, acc6
- ADC $0, acc6
-
- MOVD $0, acc7
- // *2
- ADDS acc1, acc1
- ADCS acc2, acc2
- ADCS acc3, acc3
- ADCS acc4, acc4
- ADCS acc5, acc5
- ADCS acc6, acc6
- ADC $0, acc7
- // Missing products
- MUL x0, x0, acc0
- UMULH x0, x0, t0
- ADDS t0, acc1, acc1
-
- MUL x1, x1, t0
- ADCS t0, acc2, acc2
- UMULH x1, x1, t1
- ADCS t1, acc3, acc3
-
- MUL x2, x2, t0
- ADCS t0, acc4, acc4
- UMULH x2, x2, t1
- ADCS t1, acc5, acc5
-
- MUL x3, x3, t0
- ADCS t0, acc6, acc6
- UMULH x3, x3, t1
- ADC t1, acc7, acc7
- // First reduction step
- MUL acc0, hlp1, hlp0
-
- MUL const0, hlp1, t0
- ADDS t0, acc0, acc0
- UMULH const0, hlp0, t1
-
- MUL const1, hlp0, t0
- ADCS t0, acc1, acc1
- UMULH const1, hlp0, y0
-
- MUL const2, hlp0, t0
- ADCS t0, acc2, acc2
- UMULH const2, hlp0, acc0
-
- MUL const3, hlp0, t0
- ADCS t0, acc3, acc3
-
- UMULH const3, hlp0, hlp0
- ADC $0, hlp0
-
- ADDS t1, acc1, acc1
- ADCS y0, acc2, acc2
- ADCS acc0, acc3, acc3
- ADC $0, hlp0, acc0
- // Second reduction step
- MUL acc1, hlp1, hlp0
-
- MUL const0, hlp1, t0
- ADDS t0, acc1, acc1
- UMULH const0, hlp0, t1
-
- MUL const1, hlp0, t0
- ADCS t0, acc2, acc2
- UMULH const1, hlp0, y0
-
- MUL const2, hlp0, t0
- ADCS t0, acc3, acc3
- UMULH const2, hlp0, acc1
-
- MUL const3, hlp0, t0
- ADCS t0, acc0, acc0
-
- UMULH const3, hlp0, hlp0
- ADC $0, hlp0
-
- ADDS t1, acc2, acc2
- ADCS y0, acc3, acc3
- ADCS acc1, acc0, acc0
- ADC $0, hlp0, acc1
- // Third reduction step
- MUL acc2, hlp1, hlp0
-
- MUL const0, hlp1, t0
- ADDS t0, acc2, acc2
- UMULH const0, hlp0, t1
-
- MUL const1, hlp0, t0
- ADCS t0, acc3, acc3
- UMULH const1, hlp0, y0
-
- MUL const2, hlp0, t0
- ADCS t0, acc0, acc0
- UMULH const2, hlp0, acc2
-
- MUL const3, hlp0, t0
- ADCS t0, acc1, acc1
-
- UMULH const3, hlp0, hlp0
- ADC $0, hlp0
-
- ADDS t1, acc3, acc3
- ADCS y0, acc0, acc0
- ADCS acc2, acc1, acc1
- ADC $0, hlp0, acc2
-
- // Last reduction step
- MUL acc3, hlp1, hlp0
-
- MUL const0, hlp1, t0
- ADDS t0, acc3, acc3
- UMULH const0, hlp0, t1
-
- MUL const1, hlp0, t0
- ADCS t0, acc0, acc0
- UMULH const1, hlp0, y0
-
- MUL const2, hlp0, t0
- ADCS t0, acc1, acc1
- UMULH const2, hlp0, acc3
-
- MUL const3, hlp0, t0
- ADCS t0, acc2, acc2
-
- UMULH const3, hlp0, hlp0
- ADC $0, acc7
-
- ADDS t1, acc0, acc0
- ADCS y0, acc1, acc1
- ADCS acc3, acc2, acc2
- ADC $0, hlp0, acc3
-
- ADDS acc4, acc0, acc0
- ADCS acc5, acc1, acc1
- ADCS acc6, acc2, acc2
- ADCS acc7, acc3, acc3
- ADC $0, ZR, acc4
-
- SUBS const0, acc0, y0
- SBCS const1, acc1, y1
- SBCS const2, acc2, y2
- SBCS const3, acc3, y3
- SBCS $0, acc4, acc4
-
- CSEL CS, y0, acc0, x0
- CSEL CS, y1, acc1, x1
- CSEL CS, y2, acc2, x2
- CSEL CS, y3, acc3, x3
-
- CBNZ b_ptr, ordSqrLoop
-
- STP (x0, x1), 0*16(res_ptr)
- STP (x2, x3), 1*16(res_ptr)
-
- RET
-/* ---------------------------------------*/
-// func p256OrdMul(res, in1, in2 []uint64)
-TEXT ·p256OrdMul(SB),NOSPLIT,$0
- MOVD res+0(FP), res_ptr
- MOVD in1+24(FP), a_ptr
- MOVD in2+48(FP), b_ptr
-
- MOVD p256ordK0<>(SB), hlp1
- LDP p256ord<>+0x00(SB), (const0, const1)
- LDP p256ord<>+0x10(SB), (const2, const3)
-
- LDP 0*16(a_ptr), (x0, x1)
- LDP 1*16(a_ptr), (x2, x3)
- LDP 0*16(b_ptr), (y0, y1)
- LDP 1*16(b_ptr), (y2, y3)
-
- // y[0] * x
- MUL y0, x0, acc0
- UMULH y0, x0, acc1
-
- MUL y0, x1, t0
- ADDS t0, acc1
- UMULH y0, x1, acc2
-
- MUL y0, x2, t0
- ADCS t0, acc2
- UMULH y0, x2, acc3
-
- MUL y0, x3, t0
- ADCS t0, acc3
- UMULH y0, x3, acc4
- ADC $0, acc4
- // First reduction step
- MUL acc0, hlp1, hlp0
-
- MUL const0, hlp1, t0
- ADDS t0, acc0, acc0
- UMULH const0, hlp0, t1
-
- MUL const1, hlp0, t0
- ADCS t0, acc1, acc1
- UMULH const1, hlp0, y0
-
- MUL const2, hlp0, t0
- ADCS t0, acc2, acc2
- UMULH const2, hlp0, acc0
-
- MUL const3, hlp0, t0
- ADCS t0, acc3, acc3
-
- UMULH const3, hlp0, hlp0
- ADC $0, acc4
-
- ADDS t1, acc1, acc1
- ADCS y0, acc2, acc2
- ADCS acc0, acc3, acc3
- ADC $0, hlp0, acc0
- // y[1] * x
- MUL y1, x0, t0
- ADDS t0, acc1
- UMULH y1, x0, t1
-
- MUL y1, x1, t0
- ADCS t0, acc2
- UMULH y1, x1, hlp0
-
- MUL y1, x2, t0
- ADCS t0, acc3
- UMULH y1, x2, y0
-
- MUL y1, x3, t0
- ADCS t0, acc4
- UMULH y1, x3, y1
- ADC $0, ZR, acc5
-
- ADDS t1, acc2
- ADCS hlp0, acc3
- ADCS y0, acc4
- ADC y1, acc5
- // Second reduction step
- MUL acc1, hlp1, hlp0
-
- MUL const0, hlp1, t0
- ADDS t0, acc1, acc1
- UMULH const0, hlp0, t1
-
- MUL const1, hlp0, t0
- ADCS t0, acc2, acc2
- UMULH const1, hlp0, y0
-
- MUL const2, hlp0, t0
- ADCS t0, acc3, acc3
- UMULH const2, hlp0, acc1
-
- MUL const3, hlp0, t0
- ADCS t0, acc0, acc0
-
- UMULH const3, hlp0, hlp0
- ADC $0, acc5
-
- ADDS t1, acc2, acc2
- ADCS y0, acc3, acc3
- ADCS acc1, acc0, acc0
- ADC $0, hlp0, acc1
- // y[2] * x
- MUL y2, x0, t0
- ADDS t0, acc2
- UMULH y2, x0, t1
-
- MUL y2, x1, t0
- ADCS t0, acc3
- UMULH y2, x1, hlp0
-
- MUL y2, x2, t0
- ADCS t0, acc4
- UMULH y2, x2, y0
-
- MUL y2, x3, t0
- ADCS t0, acc5
- UMULH y2, x3, y1
- ADC $0, ZR, acc6
-
- ADDS t1, acc3
- ADCS hlp0, acc4
- ADCS y0, acc5
- ADC y1, acc6
- // Third reduction step
- MUL acc2, hlp1, hlp0
-
- MUL const0, hlp1, t0
- ADDS t0, acc2, acc2
- UMULH const0, hlp0, t1
-
- MUL const1, hlp0, t0
- ADCS t0, acc3, acc3
- UMULH const1, hlp0, y0
-
- MUL const2, hlp0, t0
- ADCS t0, acc0, acc0
- UMULH const2, hlp0, acc2
-
- MUL const3, hlp0, t0
- ADCS t0, acc1, acc1
-
- UMULH const3, hlp0, hlp0
- ADC $0, acc6
-
- ADDS t1, acc3, acc3
- ADCS y0, acc0, acc0
- ADCS acc2, acc1, acc1
- ADC $0, hlp0, acc2
- // y[3] * x
- MUL y3, x0, t0
- ADDS t0, acc3
- UMULH y3, x0, t1
-
- MUL y3, x1, t0
- ADCS t0, acc4
- UMULH y3, x1, hlp0
-
- MUL y3, x2, t0
- ADCS t0, acc5
- UMULH y3, x2, y0
-
- MUL y3, x3, t0
- ADCS t0, acc6
- UMULH y3, x3, y1
- ADC $0, ZR, acc7
-
- ADDS t1, acc4
- ADCS hlp0, acc5
- ADCS y0, acc6
- ADC y1, acc7
- // Last reduction step
- MUL acc3, hlp1, hlp0
-
- MUL const0, hlp1, t0
- ADDS t0, acc3, acc3
- UMULH const0, hlp0, t1
-
- MUL const1, hlp0, t0
- ADCS t0, acc0, acc0
- UMULH const1, hlp0, y0
-
- MUL const2, hlp0, t0
- ADCS t0, acc1, acc1
- UMULH const2, hlp0, acc3
-
- MUL const3, hlp0, t0
- ADCS t0, acc2, acc2
-
- UMULH const3, hlp0, hlp0
- ADC $0, acc7
-
- ADDS t1, acc0, acc0
- ADCS y0, acc1, acc1
- ADCS acc3, acc2, acc2
- ADC $0, hlp0, acc3
-
- ADDS acc4, acc0, acc0
- ADCS acc5, acc1, acc1
- ADCS acc6, acc2, acc2
- ADCS acc7, acc3, acc3
- ADC $0, ZR, acc4
-
- SUBS const0, acc0, t0
- SBCS const1, acc1, t1
- SBCS const2, acc2, t2
- SBCS const3, acc3, t3
- SBCS $0, acc4, acc4
-
- CSEL CS, t0, acc0, acc0
- CSEL CS, t1, acc1, acc1
- CSEL CS, t2, acc2, acc2
- CSEL CS, t3, acc3, acc3
-
- STP (acc0, acc1), 0*16(res_ptr)
- STP (acc2, acc3), 1*16(res_ptr)
-
- RET
-/* ---------------------------------------*/
-TEXT p256SubInternal(SB),NOSPLIT,$0
- SUBS x0, y0, acc0
- SBCS x1, y1, acc1
- SBCS x2, y2, acc2
- SBCS x3, y3, acc3
- SBC $0, ZR, t0
-
- ADDS $-1, acc0, acc4
- ADCS const0, acc1, acc5
- ADCS $0, acc2, acc6
- ADC const1, acc3, acc7
-
- ANDS $1, t0
- CSEL EQ, acc0, acc4, x0
- CSEL EQ, acc1, acc5, x1
- CSEL EQ, acc2, acc6, x2
- CSEL EQ, acc3, acc7, x3
-
- RET
-/* ---------------------------------------*/
-TEXT p256SqrInternal(SB),NOSPLIT,$0
- // x[1:] * x[0]
- MUL x0, x1, acc1
- UMULH x0, x1, acc2
-
- MUL x0, x2, t0
- ADDS t0, acc2, acc2
- UMULH x0, x2, acc3
-
- MUL x0, x3, t0
- ADCS t0, acc3, acc3
- UMULH x0, x3, acc4
- ADC $0, acc4, acc4
- // x[2:] * x[1]
- MUL x1, x2, t0
- ADDS t0, acc3
- UMULH x1, x2, t1
- ADCS t1, acc4
- ADC $0, ZR, acc5
-
- MUL x1, x3, t0
- ADDS t0, acc4
- UMULH x1, x3, t1
- ADC t1, acc5
- // x[3] * x[2]
- MUL x2, x3, t0
- ADDS t0, acc5
- UMULH x2, x3, acc6
- ADC $0, acc6
-
- MOVD $0, acc7
- // *2
- ADDS acc1, acc1
- ADCS acc2, acc2
- ADCS acc3, acc3
- ADCS acc4, acc4
- ADCS acc5, acc5
- ADCS acc6, acc6
- ADC $0, acc7
- // Missing products
- MUL x0, x0, acc0
- UMULH x0, x0, t0
- ADDS t0, acc1, acc1
-
- MUL x1, x1, t0
- ADCS t0, acc2, acc2
- UMULH x1, x1, t1
- ADCS t1, acc3, acc3
-
- MUL x2, x2, t0
- ADCS t0, acc4, acc4
- UMULH x2, x2, t1
- ADCS t1, acc5, acc5
-
- MUL x3, x3, t0
- ADCS t0, acc6, acc6
- UMULH x3, x3, t1
- ADCS t1, acc7, acc7
- // First reduction step
- ADDS acc0<<32, acc1, acc1
- LSR $32, acc0, t0
- MUL acc0, const1, t1
- UMULH acc0, const1, acc0
- ADCS t0, acc2, acc2
- ADCS t1, acc3, acc3
- ADC $0, acc0, acc0
- // Second reduction step
- ADDS acc1<<32, acc2, acc2
- LSR $32, acc1, t0
- MUL acc1, const1, t1
- UMULH acc1, const1, acc1
- ADCS t0, acc3, acc3
- ADCS t1, acc0, acc0
- ADC $0, acc1, acc1
- // Third reduction step
- ADDS acc2<<32, acc3, acc3
- LSR $32, acc2, t0
- MUL acc2, const1, t1
- UMULH acc2, const1, acc2
- ADCS t0, acc0, acc0
- ADCS t1, acc1, acc1
- ADC $0, acc2, acc2
- // Last reduction step
- ADDS acc3<<32, acc0, acc0
- LSR $32, acc3, t0
- MUL acc3, const1, t1
- UMULH acc3, const1, acc3
- ADCS t0, acc1, acc1
- ADCS t1, acc2, acc2
- ADC $0, acc3, acc3
- // Add bits [511:256] of the sqr result
- ADDS acc4, acc0, acc0
- ADCS acc5, acc1, acc1
- ADCS acc6, acc2, acc2
- ADCS acc7, acc3, acc3
- ADC $0, ZR, acc4
-
- SUBS $-1, acc0, t0
- SBCS const0, acc1, t1
- SBCS $0, acc2, t2
- SBCS const1, acc3, t3
- SBCS $0, acc4, acc4
-
- CSEL CS, t0, acc0, y0
- CSEL CS, t1, acc1, y1
- CSEL CS, t2, acc2, y2
- CSEL CS, t3, acc3, y3
- RET
-/* ---------------------------------------*/
-TEXT p256MulInternal(SB),NOSPLIT,$0
- // y[0] * x
- MUL y0, x0, acc0
- UMULH y0, x0, acc1
-
- MUL y0, x1, t0
- ADDS t0, acc1
- UMULH y0, x1, acc2
-
- MUL y0, x2, t0
- ADCS t0, acc2
- UMULH y0, x2, acc3
-
- MUL y0, x3, t0
- ADCS t0, acc3
- UMULH y0, x3, acc4
- ADC $0, acc4
- // First reduction step
- ADDS acc0<<32, acc1, acc1
- LSR $32, acc0, t0
- MUL acc0, const1, t1
- UMULH acc0, const1, acc0
- ADCS t0, acc2
- ADCS t1, acc3
- ADC $0, acc0
- // y[1] * x
- MUL y1, x0, t0
- ADDS t0, acc1
- UMULH y1, x0, t1
-
- MUL y1, x1, t0
- ADCS t0, acc2
- UMULH y1, x1, t2
-
- MUL y1, x2, t0
- ADCS t0, acc3
- UMULH y1, x2, t3
-
- MUL y1, x3, t0
- ADCS t0, acc4
- UMULH y1, x3, hlp0
- ADC $0, ZR, acc5
-
- ADDS t1, acc2
- ADCS t2, acc3
- ADCS t3, acc4
- ADC hlp0, acc5
- // Second reduction step
- ADDS acc1<<32, acc2, acc2
- LSR $32, acc1, t0
- MUL acc1, const1, t1
- UMULH acc1, const1, acc1
- ADCS t0, acc3
- ADCS t1, acc0
- ADC $0, acc1
- // y[2] * x
- MUL y2, x0, t0
- ADDS t0, acc2
- UMULH y2, x0, t1
-
- MUL y2, x1, t0
- ADCS t0, acc3
- UMULH y2, x1, t2
-
- MUL y2, x2, t0
- ADCS t0, acc4
- UMULH y2, x2, t3
-
- MUL y2, x3, t0
- ADCS t0, acc5
- UMULH y2, x3, hlp0
- ADC $0, ZR, acc6
-
- ADDS t1, acc3
- ADCS t2, acc4
- ADCS t3, acc5
- ADC hlp0, acc6
- // Third reduction step
- ADDS acc2<<32, acc3, acc3
- LSR $32, acc2, t0
- MUL acc2, const1, t1
- UMULH acc2, const1, acc2
- ADCS t0, acc0
- ADCS t1, acc1
- ADC $0, acc2
- // y[3] * x
- MUL y3, x0, t0
- ADDS t0, acc3
- UMULH y3, x0, t1
-
- MUL y3, x1, t0
- ADCS t0, acc4
- UMULH y3, x1, t2
-
- MUL y3, x2, t0
- ADCS t0, acc5
- UMULH y3, x2, t3
-
- MUL y3, x3, t0
- ADCS t0, acc6
- UMULH y3, x3, hlp0
- ADC $0, ZR, acc7
-
- ADDS t1, acc4
- ADCS t2, acc5
- ADCS t3, acc6
- ADC hlp0, acc7
- // Last reduction step
- ADDS acc3<<32, acc0, acc0
- LSR $32, acc3, t0
- MUL acc3, const1, t1
- UMULH acc3, const1, acc3
- ADCS t0, acc1
- ADCS t1, acc2
- ADC $0, acc3
- // Add bits [511:256] of the mul result
- ADDS acc4, acc0, acc0
- ADCS acc5, acc1, acc1
- ADCS acc6, acc2, acc2
- ADCS acc7, acc3, acc3
- ADC $0, ZR, acc4
-
- SUBS $-1, acc0, t0
- SBCS const0, acc1, t1
- SBCS $0, acc2, t2
- SBCS const1, acc3, t3
- SBCS $0, acc4, acc4
-
- CSEL CS, t0, acc0, y0
- CSEL CS, t1, acc1, y1
- CSEL CS, t2, acc2, y2
- CSEL CS, t3, acc3, y3
- RET
-/* ---------------------------------------*/
-#define p256MulBy2Inline \
- ADDS y0, y0, x0; \
- ADCS y1, y1, x1; \
- ADCS y2, y2, x2; \
- ADCS y3, y3, x3; \
- ADC $0, ZR, hlp0; \
- SUBS $-1, x0, t0; \
- SBCS const0, x1, t1;\
- SBCS $0, x2, t2; \
- SBCS const1, x3, t3;\
- SBCS $0, hlp0, hlp0;\
- CSEL CC, x0, t0, x0;\
- CSEL CC, x1, t1, x1;\
- CSEL CC, x2, t2, x2;\
- CSEL CC, x3, t3, x3;
-/* ---------------------------------------*/
-#define x1in(off) (off)(a_ptr)
-#define y1in(off) (off + 32)(a_ptr)
-#define z1in(off) (off + 64)(a_ptr)
-#define x2in(off) (off)(b_ptr)
-#define z2in(off) (off + 64)(b_ptr)
-#define x3out(off) (off)(res_ptr)
-#define y3out(off) (off + 32)(res_ptr)
-#define z3out(off) (off + 64)(res_ptr)
-#define LDx(src) LDP src(0), (x0, x1); LDP src(16), (x2, x3)
-#define LDy(src) LDP src(0), (y0, y1); LDP src(16), (y2, y3)
-#define STx(src) STP (x0, x1), src(0); STP (x2, x3), src(16)
-#define STy(src) STP (y0, y1), src(0); STP (y2, y3), src(16)
-/* ---------------------------------------*/
-#define y2in(off) (32*0 + 8 + off)(RSP)
-#define s2(off) (32*1 + 8 + off)(RSP)
-#define z1sqr(off) (32*2 + 8 + off)(RSP)
-#define h(off) (32*3 + 8 + off)(RSP)
-#define r(off) (32*4 + 8 + off)(RSP)
-#define hsqr(off) (32*5 + 8 + off)(RSP)
-#define rsqr(off) (32*6 + 8 + off)(RSP)
-#define hcub(off) (32*7 + 8 + off)(RSP)
-
-#define z2sqr(off) (32*8 + 8 + off)(RSP)
-#define s1(off) (32*9 + 8 + off)(RSP)
-#define u1(off) (32*10 + 8 + off)(RSP)
-#define u2(off) (32*11 + 8 + off)(RSP)
-
-// func p256PointAddAffineAsm(res, in1, in2 []uint64, sign, sel, zero int)
-TEXT ·p256PointAddAffineAsm(SB),0,$264-96
- MOVD res+0(FP), res_ptr
- MOVD in1+24(FP), a_ptr
- MOVD in2+48(FP), b_ptr
- MOVD sign+72(FP), hlp0
- MOVD sel+80(FP), hlp1
- MOVD zero+88(FP), t2
-
- MOVD $1, t0
- CMP $0, t2
- CSEL EQ, ZR, t0, t2
- CMP $0, hlp1
- CSEL EQ, ZR, t0, hlp1
-
- MOVD p256const0<>(SB), const0
- MOVD p256const1<>(SB), const1
- EOR t2<<1, hlp1
-
- // Negate y2in based on sign
- LDP 2*16(b_ptr), (y0, y1)
- LDP 3*16(b_ptr), (y2, y3)
- MOVD $-1, acc0
-
- SUBS y0, acc0, acc0
- SBCS y1, const0, acc1
- SBCS y2, ZR, acc2
- SBCS y3, const1, acc3
- SBC $0, ZR, t0
-
- ADDS $-1, acc0, acc4
- ADCS const0, acc1, acc5
- ADCS $0, acc2, acc6
- ADCS const1, acc3, acc7
- ADC $0, t0, t0
-
- CMP $0, t0
- CSEL EQ, acc4, acc0, acc0
- CSEL EQ, acc5, acc1, acc1
- CSEL EQ, acc6, acc2, acc2
- CSEL EQ, acc7, acc3, acc3
- // If condition is 0, keep original value
- CMP $0, hlp0
- CSEL EQ, y0, acc0, y0
- CSEL EQ, y1, acc1, y1
- CSEL EQ, y2, acc2, y2
- CSEL EQ, y3, acc3, y3
- // Store result
- STy(y2in)
- // Begin point add
- LDx(z1in)
- CALL p256SqrInternal(SB) // z1ˆ2
- STy(z1sqr)
-
- LDx(x2in)
- CALL p256MulInternal(SB) // x2 * z1ˆ2
-
- LDx(x1in)
- CALL p256SubInternal(SB) // h = u2 - u1
- STx(h)
-
- LDy(z1in)
- CALL p256MulInternal(SB) // z3 = h * z1
-
- LDP 4*16(a_ptr), (acc0, acc1)// iff select[0] == 0, z3 = z1
- LDP 5*16(a_ptr), (acc2, acc3)
- ANDS $1, hlp1, ZR
- CSEL EQ, acc0, y0, y0
- CSEL EQ, acc1, y1, y1
- CSEL EQ, acc2, y2, y2
- CSEL EQ, acc3, y3, y3
- LDP p256one<>+0x00(SB), (acc0, acc1)
- LDP p256one<>+0x10(SB), (acc2, acc3)
- ANDS $2, hlp1, ZR // iff select[1] == 0, z3 = 1
- CSEL EQ, acc0, y0, y0
- CSEL EQ, acc1, y1, y1
- CSEL EQ, acc2, y2, y2
- CSEL EQ, acc3, y3, y3
- LDx(z1in)
- STy(z3out)
-
- LDy(z1sqr)
- CALL p256MulInternal(SB) // z1 ^ 3
-
- LDx(y2in)
- CALL p256MulInternal(SB) // s2 = y2 * z1ˆ3
- STy(s2)
-
- LDx(y1in)
- CALL p256SubInternal(SB) // r = s2 - s1
- STx(r)
-
- CALL p256SqrInternal(SB) // rsqr = rˆ2
- STy (rsqr)
-
- LDx(h)
- CALL p256SqrInternal(SB) // hsqr = hˆ2
- STy(hsqr)
-
- CALL p256MulInternal(SB) // hcub = hˆ3
- STy(hcub)
-
- LDx(y1in)
- CALL p256MulInternal(SB) // y1 * hˆ3
- STy(s2)
-
- LDP hsqr(0*8), (x0, x1)
- LDP hsqr(2*8), (x2, x3)
- LDP 0*16(a_ptr), (y0, y1)
- LDP 1*16(a_ptr), (y2, y3)
- CALL p256MulInternal(SB) // u1 * hˆ2
- STP (y0, y1), h(0*8)
- STP (y2, y3), h(2*8)
-
- p256MulBy2Inline // u1 * hˆ2 * 2, inline
-
- LDy(rsqr)
- CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2
-
- MOVD x0, y0
- MOVD x1, y1
- MOVD x2, y2
- MOVD x3, y3
- LDx(hcub)
- CALL p256SubInternal(SB)
-
- LDP 0*16(a_ptr), (acc0, acc1)
- LDP 1*16(a_ptr), (acc2, acc3)
- ANDS $1, hlp1, ZR // iff select[0] == 0, x3 = x1
- CSEL EQ, acc0, x0, x0
- CSEL EQ, acc1, x1, x1
- CSEL EQ, acc2, x2, x2
- CSEL EQ, acc3, x3, x3
- LDP 0*16(b_ptr), (acc0, acc1)
- LDP 1*16(b_ptr), (acc2, acc3)
- ANDS $2, hlp1, ZR // iff select[1] == 0, x3 = x2
- CSEL EQ, acc0, x0, x0
- CSEL EQ, acc1, x1, x1
- CSEL EQ, acc2, x2, x2
- CSEL EQ, acc3, x3, x3
- STP (x0, x1), 0*16(res_ptr)
- STP (x2, x3), 1*16(res_ptr)
-
- LDP h(0*8), (y0, y1)
- LDP h(2*8), (y2, y3)
- CALL p256SubInternal(SB)
-
- LDP r(0*8), (y0, y1)
- LDP r(2*8), (y2, y3)
- CALL p256MulInternal(SB)
-
- LDP s2(0*8), (x0, x1)
- LDP s2(2*8), (x2, x3)
- CALL p256SubInternal(SB)
- LDP 2*16(a_ptr), (acc0, acc1)
- LDP 3*16(a_ptr), (acc2, acc3)
- ANDS $1, hlp1, ZR // iff select[0] == 0, y3 = y1
- CSEL EQ, acc0, x0, x0
- CSEL EQ, acc1, x1, x1
- CSEL EQ, acc2, x2, x2
- CSEL EQ, acc3, x3, x3
- LDP y2in(0*8), (acc0, acc1)
- LDP y2in(2*8), (acc2, acc3)
- ANDS $2, hlp1, ZR // iff select[1] == 0, y3 = y2
- CSEL EQ, acc0, x0, x0
- CSEL EQ, acc1, x1, x1
- CSEL EQ, acc2, x2, x2
- CSEL EQ, acc3, x3, x3
- STP (x0, x1), 2*16(res_ptr)
- STP (x2, x3), 3*16(res_ptr)
-
- RET
-
-#define p256AddInline \
- ADDS y0, x0, x0; \
- ADCS y1, x1, x1; \
- ADCS y2, x2, x2; \
- ADCS y3, x3, x3; \
- ADC $0, ZR, hlp0; \
- SUBS $-1, x0, t0; \
- SBCS const0, x1, t1;\
- SBCS $0, x2, t2; \
- SBCS const1, x3, t3;\
- SBCS $0, hlp0, hlp0;\
- CSEL CC, x0, t0, x0;\
- CSEL CC, x1, t1, x1;\
- CSEL CC, x2, t2, x2;\
- CSEL CC, x3, t3, x3;
-
-#define s(off) (32*0 + 8 + off)(RSP)
-#define m(off) (32*1 + 8 + off)(RSP)
-#define zsqr(off) (32*2 + 8 + off)(RSP)
-#define tmp(off) (32*3 + 8 + off)(RSP)
-
-//func p256PointDoubleAsm(res, in []uint64)
-TEXT ·p256PointDoubleAsm(SB),NOSPLIT,$136-48
- MOVD res+0(FP), res_ptr
- MOVD in+24(FP), a_ptr
-
- MOVD p256const0<>(SB), const0
- MOVD p256const1<>(SB), const1
-
- // Begin point double
- LDP 4*16(a_ptr), (x0, x1)
- LDP 5*16(a_ptr), (x2, x3)
- CALL p256SqrInternal(SB)
- STP (y0, y1), zsqr(0*8)
- STP (y2, y3), zsqr(2*8)
-
- LDP 0*16(a_ptr), (x0, x1)
- LDP 1*16(a_ptr), (x2, x3)
- p256AddInline
- STx(m)
-
- LDx(z1in)
- LDy(y1in)
- CALL p256MulInternal(SB)
- p256MulBy2Inline
- STx(z3out)
-
- LDy(x1in)
- LDx(zsqr)
- CALL p256SubInternal(SB)
- LDy(m)
- CALL p256MulInternal(SB)
-
- // Multiply by 3
- p256MulBy2Inline
- p256AddInline
- STx(m)
-
- LDy(y1in)
- p256MulBy2Inline
- CALL p256SqrInternal(SB)
- STy(s)
- MOVD y0, x0
- MOVD y1, x1
- MOVD y2, x2
- MOVD y3, x3
- CALL p256SqrInternal(SB)
-
- // Divide by 2
- ADDS $-1, y0, t0
- ADCS const0, y1, t1
- ADCS $0, y2, t2
- ADCS const1, y3, t3
- ADC $0, ZR, hlp0
-
- ANDS $1, y0, ZR
- CSEL EQ, y0, t0, t0
- CSEL EQ, y1, t1, t1
- CSEL EQ, y2, t2, t2
- CSEL EQ, y3, t3, t3
- AND y0, hlp0, hlp0
-
- EXTR $1, t0, t1, y0
- EXTR $1, t1, t2, y1
- EXTR $1, t2, t3, y2
- EXTR $1, t3, hlp0, y3
- STy(y3out)
-
- LDx(x1in)
- LDy(s)
- CALL p256MulInternal(SB)
- STy(s)
- p256MulBy2Inline
- STx(tmp)
-
- LDx(m)
- CALL p256SqrInternal(SB)
- LDx(tmp)
- CALL p256SubInternal(SB)
-
- STx(x3out)
-
- LDy(s)
- CALL p256SubInternal(SB)
-
- LDy(m)
- CALL p256MulInternal(SB)
-
- LDx(y3out)
- CALL p256SubInternal(SB)
- STx(y3out)
- RET
-/* ---------------------------------------*/
-#undef y2in
-#define y2in(off) (off + 32)(b_ptr)
-//func p256PointAddAsm(res, in1, in2 []uint64) int
-TEXT ·p256PointAddAsm(SB),0,$392-80
- // See https://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl
- // Move input to stack in order to free registers
- MOVD res+0(FP), res_ptr
- MOVD in1+24(FP), a_ptr
- MOVD in2+48(FP), b_ptr
-
- MOVD p256const0<>(SB), const0
- MOVD p256const1<>(SB), const1
-
- // Begin point add
- LDx(z2in)
- CALL p256SqrInternal(SB) // z2^2
- STy(z2sqr)
-
- CALL p256MulInternal(SB) // z2^3
-
- LDx(y1in)
- CALL p256MulInternal(SB) // s1 = z2ˆ3*y1
- STy(s1)
-
- LDx(z1in)
- CALL p256SqrInternal(SB) // z1^2
- STy(z1sqr)
-
- CALL p256MulInternal(SB) // z1^3
-
- LDx(y2in)
- CALL p256MulInternal(SB) // s2 = z1ˆ3*y2
-
- LDx(s1)
- CALL p256SubInternal(SB) // r = s2 - s1
- STx(r)
-
- MOVD $1, t2
- ORR x0, x1, t0 // Check if zero mod p256
- ORR x2, x3, t1
- ORR t1, t0, t0
- CMP $0, t0
- CSEL EQ, t2, ZR, hlp1
-
- EOR $-1, x0, t0
- EOR const0, x1, t1
- EOR const1, x3, t3
-
- ORR t0, t1, t0
- ORR x2, t3, t1
- ORR t1, t0, t0
- CMP $0, t0
- CSEL EQ, t2, hlp1, hlp1
-
- LDx(z2sqr)
- LDy(x1in)
- CALL p256MulInternal(SB) // u1 = x1 * z2ˆ2
- STy(u1)
-
- LDx(z1sqr)
- LDy(x2in)
- CALL p256MulInternal(SB) // u2 = x2 * z1ˆ2
- STy(u2)
-
- LDx(u1)
- CALL p256SubInternal(SB) // h = u2 - u1
- STx(h)
-
- MOVD $1, t2
- ORR x0, x1, t0 // Check if zero mod p256
- ORR x2, x3, t1
- ORR t1, t0, t0
- CMP $0, t0
- CSEL EQ, t2, ZR, hlp0
-
- EOR $-1, x0, t0
- EOR const0, x1, t1
- EOR const1, x3, t3
-
- ORR t0, t1, t0
- ORR x2, t3, t1
- ORR t1, t0, t0
- CMP $0, t0
- CSEL EQ, t2, hlp0, hlp0
-
- AND hlp0, hlp1, hlp1
-
- LDx(r)
- CALL p256SqrInternal(SB) // rsqr = rˆ2
- STy(rsqr)
-
- LDx(h)
- CALL p256SqrInternal(SB) // hsqr = hˆ2
- STy(hsqr)
-
- LDx(h)
- CALL p256MulInternal(SB) // hcub = hˆ3
- STy(hcub)
-
- LDx(s1)
- CALL p256MulInternal(SB)
- STy(s2)
-
- LDx(z1in)
- LDy(z2in)
- CALL p256MulInternal(SB) // z1 * z2
- LDx(h)
- CALL p256MulInternal(SB) // z1 * z2 * h
- STy(z3out)
-
- LDx(hsqr)
- LDy(u1)
- CALL p256MulInternal(SB) // hˆ2 * u1
- STy(u2)
-
- p256MulBy2Inline // u1 * hˆ2 * 2, inline
- LDy(rsqr)
- CALL p256SubInternal(SB) // rˆ2 - u1 * hˆ2 * 2
-
- MOVD x0, y0
- MOVD x1, y1
- MOVD x2, y2
- MOVD x3, y3
- LDx(hcub)
- CALL p256SubInternal(SB)
- STx(x3out)
-
- LDy(u2)
- CALL p256SubInternal(SB)
-
- LDy(r)
- CALL p256MulInternal(SB)
-
- LDx(s2)
- CALL p256SubInternal(SB)
- STx(y3out)
-
- MOVD hlp1, R0
- MOVD R0, ret+72(FP)
-
- RET
diff --git a/src/crypto/elliptic/p256_generic.go b/src/crypto/elliptic/p256_generic.go
index 9427331d52..9963fcafdd 100644
--- a/src/crypto/elliptic/p256_generic.go
+++ b/src/crypto/elliptic/p256_generic.go
@@ -2,7 +2,7 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64,!s390x,!arm64
+// +build !amd64,!s390x
package elliptic