diff options
-rw-r--r-- | src/crypto/elliptic/p256.go | 28 | ||||
-rw-r--r-- | src/crypto/elliptic/p256_asm_s390x.s | 2201 | ||||
-rw-r--r-- | src/crypto/elliptic/p256_generic.go | 16 | ||||
-rw-r--r-- | src/crypto/elliptic/p256_s390x.go | 513 |
4 files changed, 2746 insertions, 12 deletions
diff --git a/src/crypto/elliptic/p256.go b/src/crypto/elliptic/p256.go index 05a3311b29..bbf0087e66 100644 --- a/src/crypto/elliptic/p256.go +++ b/src/crypto/elliptic/p256.go @@ -17,7 +17,8 @@ type p256Curve struct { } var ( - p256 p256Curve + p256Params *CurveParams + // RInverse contains 1/R mod p - the inverse of the Montgomery constant // (2**257). p256RInverse *big.Int @@ -25,15 +26,18 @@ var ( func initP256() { // See FIPS 186-3, section D.2.3 - p256.CurveParams = &CurveParams{Name: "P-256"} - p256.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10) - p256.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10) - p256.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16) - p256.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16) - p256.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16) - p256.BitSize = 256 + p256Params = &CurveParams{Name: "P-256"} + p256Params.P, _ = new(big.Int).SetString("115792089210356248762697446949407573530086143415290314195533631308867097853951", 10) + p256Params.N, _ = new(big.Int).SetString("115792089210356248762697446949407573529996955224135760342422259061068512044369", 10) + p256Params.B, _ = new(big.Int).SetString("5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b", 16) + p256Params.Gx, _ = new(big.Int).SetString("6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296", 16) + p256Params.Gy, _ = new(big.Int).SetString("4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5", 16) + p256Params.BitSize = 256 p256RInverse, _ = new(big.Int).SetString("7fffffff00000001fffffffe8000000100000000ffffffff0000000180000000", 16) + + // Arch-specific initialization, i.e. let a platform dynamically pick a P256 implementation + initP256Arch() } func (curve p256Curve) Params() *CurveParams { @@ -47,8 +51,8 @@ func p256GetScalar(out *[32]byte, in []byte) { n := new(big.Int).SetBytes(in) var scalarBytes []byte - if n.Cmp(p256.N) >= 0 { - n.Mod(n, p256.N) + if n.Cmp(p256Params.N) >= 0 { + n.Mod(n, p256Params.N) scalarBytes = n.Bytes() } else { scalarBytes = in @@ -1143,7 +1147,7 @@ func p256ScalarMult(xOut, yOut, zOut, x, y *[p256Limbs]uint32, scalar *[32]uint8 // p256FromBig sets out = R*in. func p256FromBig(out *[p256Limbs]uint32, in *big.Int) { tmp := new(big.Int).Lsh(in, 257) - tmp.Mod(tmp, p256.P) + tmp.Mod(tmp, p256Params.P) for i := 0; i < p256Limbs; i++ { if bits := tmp.Bits(); len(bits) > 0 { @@ -1183,6 +1187,6 @@ func p256ToBig(in *[p256Limbs]uint32) *big.Int { } result.Mul(result, p256RInverse) - result.Mod(result, p256.P) + result.Mod(result, p256Params.P) return result } diff --git a/src/crypto/elliptic/p256_asm_s390x.s b/src/crypto/elliptic/p256_asm_s390x.s new file mode 100644 index 0000000000..96b59be23f --- /dev/null +++ b/src/crypto/elliptic/p256_asm_s390x.s @@ -0,0 +1,2201 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +DATA p256ordK0<>+0x00(SB)/4, $0xee00bc4f +DATA p256ord<>+0x00(SB)/8, $0xffffffff00000000 +DATA p256ord<>+0x08(SB)/8, $0xffffffffffffffff +DATA p256ord<>+0x10(SB)/8, $0xbce6faada7179e84 +DATA p256ord<>+0x18(SB)/8, $0xf3b9cac2fc632551 +DATA p256<>+0x00(SB)/8, $0xffffffff00000001 // P256 +DATA p256<>+0x08(SB)/8, $0x0000000000000000 // P256 +DATA p256<>+0x10(SB)/8, $0x00000000ffffffff // P256 +DATA p256<>+0x18(SB)/8, $0xffffffffffffffff // P256 +DATA p256<>+0x20(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256<>+0x28(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256<>+0x30(SB)/8, $0x0000000010111213 // SEL 0 d1 d0 0 +DATA p256<>+0x38(SB)/8, $0x1415161700000000 // SEL 0 d1 d0 0 +DATA p256<>+0x40(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256<>+0x48(SB)/8, $0x18191a1b1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256mul<>+0x00(SB)/8, $0xffffffff00000001 // P256 +DATA p256mul<>+0x08(SB)/8, $0x0000000000000000 // P256 +DATA p256mul<>+0x10(SB)/8, $0x00000000ffffffff // P256 +DATA p256mul<>+0x18(SB)/8, $0xffffffffffffffff // P256 +DATA p256mul<>+0x20(SB)/8, $0x1c1d1e1f00000000 // SEL d0 0 0 d0 +DATA p256mul<>+0x28(SB)/8, $0x000000001c1d1e1f // SEL d0 0 0 d0 +DATA p256mul<>+0x30(SB)/8, $0x0001020304050607 // SEL d0 0 d1 d0 +DATA p256mul<>+0x38(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL d0 0 d1 d0 +DATA p256mul<>+0x40(SB)/8, $0x040506071c1d1e1f // SEL 0 d1 d0 d1 +DATA p256mul<>+0x48(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL 0 d1 d0 d1 +DATA p256mul<>+0x50(SB)/8, $0x0405060704050607 // SEL 0 0 d1 d0 +DATA p256mul<>+0x58(SB)/8, $0x1c1d1e1f0c0d0e0f // SEL 0 0 d1 d0 +DATA p256mul<>+0x60(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256mul<>+0x68(SB)/8, $0x0c0d0e0f1c1d1e1f // SEL d1 d0 d1 d0 +DATA p256mul<>+0x70(SB)/8, $0x141516170c0d0e0f // SEL 0 d1 d0 0 +DATA p256mul<>+0x78(SB)/8, $0x1c1d1e1f14151617 // SEL 0 d1 d0 0 +DATA p256mul<>+0x80(SB)/8, $0x00000000fffffffe // (1*2^256)%P256 +DATA p256mul<>+0x88(SB)/8, $0xffffffffffffffff // (1*2^256)%P256 +DATA p256mul<>+0x90(SB)/8, $0xffffffff00000000 // (1*2^256)%P256 +DATA p256mul<>+0x98(SB)/8, $0x0000000000000001 // (1*2^256)%P256 +GLOBL p256ordK0<>(SB), 8, $4 +GLOBL p256ord<>(SB), 8, $32 +GLOBL p256<>(SB), 8, $80 +GLOBL p256mul<>(SB), 8, $160 + +// func hasVectorFacility() bool +TEXT ·hasVectorFacility(SB), NOSPLIT, $24-1 + MOVD $x-24(SP), R1 + XC $24, 0(R1), 0(R1) // clear the storage + MOVD $2, R0 // R0 is the number of double words stored -1 + WORD $0xB2B01000 // STFLE 0(R1) + XOR R0, R0 // reset the value of R0 + MOVBZ z-8(SP), R1 + AND $0x40, R1 + BEQ novector + +vectorinstalled: + // check if the vector instruction has been enabled + VLEIB $0, $0xF, V16 + VLGVB $0, V16, R1 + CMPBNE R1, $0xF, novector + MOVB $1, ret+0(FP) // have vx + RET + +novector: + MOVB $0, ret+0(FP) // no vx + RET + +// --------------------------------------- +// iff cond == 1 val <- -val +// func p256NegCond(val *p256Point, cond int) +#define P1ptr R1 +#define CPOOL R4 + +#define Y1L V0 +#define Y1H V1 +#define T1L V2 +#define T1H V3 + +#define PL V30 +#define PH V31 + +#define ZER V4 +#define SEL1 V5 +#define CAR1 V6 +TEXT ·p256NegCond(SB), NOSPLIT, $0 + MOVD val+0(FP), P1ptr + + MOVD $p256mul<>+0x00(SB), CPOOL + VL 16(CPOOL), PL + VL 0(CPOOL), PH + + VL 32(P1ptr), Y1H + VL 48(P1ptr), Y1L + + VLREPG cond+8(FP), SEL1 + VZERO ZER + VCEQG SEL1, ZER, SEL1 + + VSCBIQ Y1L, PL, CAR1 + VSQ Y1L, PL, T1L + VSBIQ PH, Y1H, CAR1, T1H + + VSEL Y1L, T1L, SEL1, Y1L + VSEL Y1H, T1H, SEL1, Y1H + + VST Y1H, 32(P1ptr) + VST Y1L, 48(P1ptr) + RET + +#undef P1ptr +#undef CPOOL +#undef Y1L +#undef Y1H +#undef T1L +#undef T1H +#undef PL +#undef PH +#undef ZER +#undef SEL1 +#undef CAR1 + +// --------------------------------------- +// if cond == 0 res <- b; else res <- a +// func p256MovCond(res, a, b *p256Point, cond int) +#define P3ptr R1 +#define P1ptr R2 +#define P2ptr R3 + +#define X1L V0 +#define X1H V1 +#define Y1L V2 +#define Y1H V3 +#define Z1L V4 +#define Z1H V5 +#define X2L V6 +#define X2H V7 +#define Y2L V8 +#define Y2H V9 +#define Z2L V10 +#define Z2H V11 + +#define ZER V18 +#define SEL1 V19 +TEXT ·p256MovCond(SB), NOSPLIT, $0 + MOVD res+0(FP), P3ptr + MOVD a+8(FP), P1ptr + MOVD b+16(FP), P2ptr + VLREPG cond+24(FP), SEL1 + VZERO ZER + VCEQG SEL1, ZER, SEL1 + + VL 0(P1ptr), X1H + VL 16(P1ptr), X1L + VL 32(P1ptr), Y1H + VL 48(P1ptr), Y1L + VL 64(P1ptr), Z1H + VL 80(P1ptr), Z1L + + VL 0(P2ptr), X2H + VL 16(P2ptr), X2L + VL 32(P2ptr), Y2H + VL 48(P2ptr), Y2L + VL 64(P2ptr), Z2H + VL 80(P2ptr), Z2L + + VSEL X2L, X1L, SEL1, X1L + VSEL X2H, X1H, SEL1, X1H + VSEL Y2L, Y1L, SEL1, Y1L + VSEL Y2H, Y1H, SEL1, Y1H + VSEL Z2L, Z1L, SEL1, Z1L + VSEL Z2H, Z1H, SEL1, Z1H + + VST X1H, 0(P3ptr) + VST X1L, 16(P3ptr) + VST Y1H, 32(P3ptr) + VST Y1L, 48(P3ptr) + VST Z1H, 64(P3ptr) + VST Z1L, 80(P3ptr) + + RET + +#undef P3ptr +#undef P1ptr +#undef P2ptr +#undef X1L +#undef X1H +#undef Y1L +#undef Y1H +#undef Z1L +#undef Z1H +#undef X2L +#undef X2H +#undef Y2L +#undef Y2H +#undef Z2L +#undef Z2H +#undef ZER +#undef SEL1 + +// --------------------------------------- +// Constant time table access +// Indexed from 1 to 15, with -1 offset +// (index 0 is implicitly point at infinity) +// func p256Select(point *p256Point, table []p256Point, idx int) +#define P3ptr R1 +#define P1ptr R2 +#define COUNT R4 + +#define X1L V0 +#define X1H V1 +#define Y1L V2 +#define Y1H V3 +#define Z1L V4 +#define Z1H V5 +#define X2L V6 +#define X2H V7 +#define Y2L V8 +#define Y2H V9 +#define Z2L V10 +#define Z2H V11 + +#define ONE V18 +#define IDX V19 +#define SEL1 V20 +#define SEL2 V21 +TEXT ·p256Select(SB), NOSPLIT, $0 + MOVD point+0(FP), P3ptr + MOVD table+8(FP), P1ptr + VLREPB idx+(32+7)(FP), IDX + VREPIB $1, ONE + VREPIB $1, SEL2 + MOVD $1, COUNT + + VZERO X1H + VZERO X1L + VZERO Y1H + VZERO Y1L + VZERO Z1H + VZERO Z1L + +loop_select: + VL 0(P1ptr), X2H + VL 16(P1ptr), X2L + VL 32(P1ptr), Y2H + VL 48(P1ptr), Y2L + VL 64(P1ptr), Z2H + VL 80(P1ptr), Z2L + + VCEQG SEL2, IDX, SEL1 + + VSEL X2L, X1L, SEL1, X1L + VSEL X2H, X1H, SEL1, X1H + VSEL Y2L, Y1L, SEL1, Y1L + VSEL Y2H, Y1H, SEL1, Y1H + VSEL Z2L, Z1L, SEL1, Z1L + VSEL Z2H, Z1H, SEL1, Z1H + + VAB SEL2, ONE, SEL2 + ADDW $1, COUNT + ADD $96, P1ptr + CMPW COUNT, $17 + BLT loop_select + + VST X1H, 0(P3ptr) + VST X1L, 16(P3ptr) + VST Y1H, 32(P3ptr) + VST Y1L, 48(P3ptr) + VST Z1H, 64(P3ptr) + VST Z1L, 80(P3ptr) + RET + +#undef P3ptr +#undef P1ptr +#undef COUNT +#undef X1L +#undef X1H +#undef Y1L +#undef Y1H +#undef Z1L +#undef Z1H +#undef X2L +#undef X2H +#undef Y2L +#undef Y2H +#undef Z2L +#undef Z2H +#undef ONE +#undef IDX +#undef SEL1 +#undef SEL2 + +// --------------------------------------- +// Constant time table access +// Indexed from 1 to 15, with -1 offset +// (index 0 is implicitly point at infinity) +// func p256SelectBase(point *p256Point, table []p256Point, idx int) +#define P3ptr R1 +#define P1ptr R2 +#define COUNT R4 + +#define X1L V0 +#define X1H V1 +#define Y1L V2 +#define Y1H V3 +#define Z1L V4 +#define Z1H V5 +#define X2L V6 +#define X2H V7 +#define Y2L V8 +#define Y2H V9 +#define Z2L V10 +#define Z2H V11 + +#define ONE V18 +#define IDX V19 +#define SEL1 V20 +#define SEL2 V21 +TEXT ·p256SelectBase(SB), NOSPLIT, $0 + MOVD point+0(FP), P3ptr + MOVD table+8(FP), P1ptr + VLREPB idx+(32+7)(FP), IDX + VREPIB $1, ONE + VREPIB $1, SEL2 + MOVD $1, COUNT + + VZERO X1H + VZERO X1L + VZERO Y1H + VZERO Y1L + VZERO Z1H + VZERO Z1L + +loop_select: + VL 0(P1ptr), X2H + VL 16(P1ptr), X2L + VL 32(P1ptr), Y2H + VL 48(P1ptr), Y2L + VL 64(P1ptr), Z2H + VL 80(P1ptr), Z2L + + VCEQG SEL2, IDX, SEL1 + + VSEL X2L, X1L, SEL1, X1L + VSEL X2H, X1H, SEL1, X1H + VSEL Y2L, Y1L, SEL1, Y1L + VSEL Y2H, Y1H, SEL1, Y1H + VSEL Z2L, Z1L, SEL1, Z1L + VSEL Z2H, Z1H, SEL1, Z1H + + VAB SEL2, ONE, SEL2 + ADDW $1, COUNT + ADD $96, P1ptr + CMPW COUNT, $65 + BLT loop_select + + VST X1H, 0(P3ptr) + VST X1L, 16(P3ptr) + VST Y1H, 32(P3ptr) + VST Y1L, 48(P3ptr) + VST Z1H, 64(P3ptr) + VST Z1L, 80(P3ptr) + RET + +#undef P3ptr +#undef P1ptr +#undef COUNT +#undef X1L +#undef X1H +#undef Y1L +#undef Y1H +#undef Z1L +#undef Z1H +#undef X2L +#undef X2H +#undef Y2L +#undef Y2H +#undef Z2L +#undef Z2H +#undef ONE +#undef IDX +#undef SEL1 +#undef SEL2 + +// --------------------------------------- +// func p256FromMont(res, in []byte) +#define res_ptr R1 +#define x_ptr R2 +#define CPOOL R4 + +#define T0 V0 +#define T1 V1 +#define T2 V2 +#define TT0 V3 +#define TT1 V4 + +#define ZER V6 +#define SEL1 V7 +#define SEL2 V8 +#define CAR1 V9 +#define CAR2 V10 +#define RED1 V11 +#define RED2 V12 +#define PL V13 +#define PH V14 + +TEXT ·p256FromMont(SB), NOSPLIT, $0 + MOVD res+0(FP), res_ptr + MOVD in+24(FP), x_ptr + + VZERO T2 + VZERO ZER + MOVD $p256<>+0x00(SB), CPOOL + VL 16(CPOOL), PL + VL 0(CPOOL), PH + VL 48(CPOOL), SEL2 + VL 64(CPOOL), SEL1 + + VL (1*16)(x_ptr), T0 + VL (0*16)(x_ptr), T1 + + // First round + VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 + VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 + VSQ RED1, RED2, RED2 // Guaranteed not to underflow + + VSLDB $8, T1, T0, T0 + VSLDB $8, T2, T1, T1 + + VACCQ T0, RED1, CAR1 + VAQ T0, RED1, T0 + VACCCQ T1, RED2, CAR1, CAR2 + VACQ T1, RED2, CAR1, T1 + VAQ T2, CAR2, T2 + + // Second round + VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 + VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 + VSQ RED1, RED2, RED2 // Guaranteed not to underflow + + VSLDB $8, T1, T0, T0 + VSLDB $8, T2, T1, T1 + + VACCQ T0, RED1, CAR1 + VAQ T0, RED1, T0 + VACCCQ T1, RED2, CAR1, CAR2 + VACQ T1, RED2, CAR1, T1 + VAQ T2, CAR2, T2 + + // Third round + VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 + VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 + VSQ RED1, RED2, RED2 // Guaranteed not to underflow + + VSLDB $8, T1, T0, T0 + VSLDB $8, T2, T1, T1 + + VACCQ T0, RED1, CAR1 + VAQ T0, RED1, T0 + VACCCQ T1, RED2, CAR1, CAR2 + VACQ T1, RED2, CAR1, T1 + VAQ T2, CAR2, T2 + + // Last round + VPERM T1, T0, SEL1, RED2 // d1 d0 d1 d0 + VPERM ZER, RED2, SEL2, RED1 // 0 d1 d0 0 + VSQ RED1, RED2, RED2 // Guaranteed not to underflow + + VSLDB $8, T1, T0, T0 + VSLDB $8, T2, T1, T1 + + VACCQ T0, RED1, CAR1 + VAQ T0, RED1, T0 + VACCCQ T1, RED2, CAR1, CAR2 + VACQ T1, RED2, CAR1, T1 + VAQ T2, CAR2, T2 + + // --------------------------------------------------- + + VSCBIQ PL, T0, CAR1 + VSQ PL, T0, TT0 + VSBCBIQ T1, PH, CAR1, CAR2 + VSBIQ T1, PH, CAR1, TT1 + VSBIQ T2, ZER, CAR2, T2 + + // what output to use, TT1||TT0 or T1||T0? + VSEL T0, TT0, T2, T0 + VSEL T1, TT1, T2, T1 + + VST T0, (1*16)(res_ptr) + VST T1, (0*16)(res_ptr) + RET + +#undef res_ptr +#undef x_ptr +#undef CPOOL +#undef T0 +#undef T1 +#undef T2 +#undef TT0 +#undef TT1 +#undef ZER +#undef SEL1 +#undef SEL2 +#undef CAR1 +#undef CAR2 +#undef RED1 +#undef RED2 +#undef PL +#undef PH + +// --------------------------------------- +// func p256OrdMul(res, in1, in2 []byte) +#define res_ptr R1 +#define x_ptr R2 +#define y_ptr R3 +#define X0 V0 +#define X1 V1 +#define Y0 V2 +#define Y1 V3 +#define M0 V4 +#define M1 V5 +#define T0 V6 +#define T1 V7 +#define T2 V8 +#define YDIG V9 + +#define ADD1 V16 +#define ADD1H V17 +#define ADD2 V18 +#define ADD2H V19 +#define RED1 V20 +#define RED1H V21 +#define RED2 V22 +#define RED2H V23 +#define CAR1 V24 +#define CAR1M V25 + +#define MK0 V30 +#define K0 V31 +TEXT ·p256OrdMul(SB), NOSPLIT, $0 + MOVD res+0(FP), res_ptr + MOVD in1+24(FP), x_ptr + MOVD in2+48(FP), y_ptr + + VZERO T2 + MOVD $p256ordK0<>+0x00(SB), R4 + + // VLEF $3, 0(R4), K0 + WORD $0xE7F40000 + BYTE $0x38 + BYTE $0x03 + MOVD $p256ord<>+0x00(SB), R4 + VL 16(R4), M0 + VL 0(R4), M1 + + VL (1*16)(x_ptr), X0 + VL (0*16)(x_ptr), X1 + VL (1*16)(y_ptr), Y0 + VL (0*16)(y_ptr), Y1 + + // ---------------------------------------------------------------------------/ + VREPF $3, Y0, YDIG + VMLF X0, YDIG, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMLF X1, YDIG, ADD2 + VMLHF X0, YDIG, ADD1H + VMLHF X1, YDIG, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- +/* * + * ---+--------+--------+ + * T2| T1 | T0 | + * ---+--------+--------+ + * *(add)* + * +--------+--------+ + * | X1 | X0 | + * +--------+--------+ + * *(mul)* + * +--------+--------+ + * | YDIG | YDIG | + * +--------+--------+ + * *(add)* + * +--------+--------+ + * | M1 | M0 | + * +--------+--------+ + * *(mul)* + * +--------+--------+ + * | MK0 | MK0 | + * +--------+--------+ + * + * --------------------- + * + * +--------+--------+ + * | ADD2 | ADD1 | + * +--------+--------+ + * +--------+--------+ + * | ADD2H | ADD1H | + * +--------+--------+ + * +--------+--------+ + * | RED2 | RED1 | + * +--------+--------+ + * +--------+--------+ + * | RED2H | RED1H | + * +--------+--------+ + */ + VREPF $2, Y0, YDIG + VMALF X0, YDIG, T0, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMALF X1, YDIG, T1, ADD2 + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- + VREPF $1, Y0, YDIG + VMALF X0, YDIG, T0, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMALF X1, YDIG, T1, ADD2 + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- + VREPF $0, Y0, YDIG + VMALF X0, YDIG, T0, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMALF X1, YDIG, T1, ADD2 + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- + VREPF $3, Y1, YDIG + VMALF X0, YDIG, T0, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMALF X1, YDIG, T1, ADD2 + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- + VREPF $2, Y1, YDIG + VMALF X0, YDIG, T0, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMALF X1, YDIG, T1, ADD2 + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- + VREPF $1, Y1, YDIG + VMALF X0, YDIG, T0, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMALF X1, YDIG, T1, ADD2 + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- + VREPF $0, Y1, YDIG + VMALF X0, YDIG, T0, ADD1 + VMLF ADD1, K0, MK0 + VREPF $3, MK0, MK0 + + VMALF X1, YDIG, T1, ADD2 + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + + VMALF M0, MK0, ADD1, RED1 + VMALHF M0, MK0, ADD1, RED1H + VMALF M1, MK0, ADD2, RED2 + VMALHF M1, MK0, ADD2, RED2H + + VSLDB $12, RED2, RED1, RED1 + VSLDB $12, T2, RED2, RED2 + + VACCQ RED1, ADD1H, CAR1 + VAQ RED1, ADD1H, T0 + VACCQ RED1H, T0, CAR1M + VAQ RED1H, T0, T0 + + // << ready for next MK0 + + VACQ RED2, ADD2H, CAR1, T1 + VACCCQ RED2, ADD2H, CAR1, CAR1 + VACCCQ RED2H, T1, CAR1M, T2 + VACQ RED2H, T1, CAR1M, T1 + VAQ CAR1, T2, T2 + + // --------------------------------------------------- + + VZERO RED1 + VSCBIQ M0, T0, CAR1 + VSQ M0, T0, ADD1 + VSBCBIQ T1, M1, CAR1, CAR1M + VSBIQ T1, M1, CAR1, ADD2 + VSBIQ T2, RED1, CAR1M, T2 + + // what output to use, ADD2||ADD1 or T1||T0? + VSEL T0, ADD1, T2, T0 + VSEL T1, ADD2, T2, T1 + + VST T0, (1*16)(res_ptr) + VST T1, (0*16)(res_ptr) + RET + +#undef res_ptr +#undef x_ptr +#undef y_ptr +#undef X0 +#undef X1 +#undef Y0 +#undef Y1 +#undef M0 +#undef M1 +#undef T0 +#undef T1 +#undef T2 +#undef YDIG + +#undef ADD1 +#undef ADD1H +#undef ADD2 +#undef ADD2H +#undef RED1 +#undef RED1H +#undef RED2 +#undef RED2H +#undef CAR1 +#undef CAR1M + +#undef MK0 +#undef K0 + +// --------------------------------------- +// p256MulInternal +// V0-V3,V30,V31 - Not Modified +// V4-V15 - Volatile + +#define CPOOL R4 + +// Parameters +#define X0 V0 // Not modified +#define X1 V1 // Not modified +#define Y0 V2 // Not modified +#define Y1 V3 // Not modified +#define T0 V4 +#define T1 V5 +#define P0 V30 // Not modified +#define P1 V31 // Not modified + +// Temporaries +#define YDIG V6 // Overloaded with CAR2, ZER +#define ADD1H V7 // Overloaded with ADD3H +#define ADD2H V8 // Overloaded with ADD4H +#define ADD3 V9 // Overloaded with SEL2,SEL5 +#define ADD4 V10 // Overloaded with SEL3,SEL6 +#define RED1 V11 // Overloaded with CAR2 +#define RED2 V12 +#define RED3 V13 // Overloaded with SEL1 +#define T2 V14 +// Overloaded temporaries +#define ADD1 V4 // Overloaded with T0 +#define ADD2 V5 // Overloaded with T1 +#define ADD3H V7 // Overloaded with ADD1H +#define ADD4H V8 // Overloaded with ADD2H +#define ZER V6 // Overloaded with YDIG, CAR2 +#define CAR1 V6 // Overloaded with YDIG, ZER +#define CAR2 V11 // Overloaded with RED1 +// Constant Selects +#define SEL1 V13 // Overloaded with RED3 +#define SEL2 V9 // Overloaded with ADD3,SEL5 +#define SEL3 V10 // Overloaded with ADD4,SEL6 +#define SEL4 V6 // Overloaded with YDIG,CAR2,ZER +#define SEL5 V9 // Overloaded with ADD3,SEL2 +#define SEL6 V10 // Overloaded with ADD4,SEL3 + +/* * + * To follow the flow of bits, for your own sanity a stiff drink, need you shall. + * Of a single round, a 'helpful' picture, here is. Meaning, column position has. + * With you, SIMD be... + * + * +--------+--------+ + * +--------| RED2 | RED1 | + * | +--------+--------+ + * | ---+--------+--------+ + * | +---- T2| T1 | T0 |--+ + * | | ---+--------+--------+ | + * | | | + * | | ======================= | + * | | | + * | | +--------+--------+<-+ + * | +-------| ADD2 | ADD1 |--|-----+ + * | | +--------+--------+ | | + * | | +--------+--------+<---+ | + * | | | ADD2H | ADD1H |--+ | + * | | +--------+--------+ | | + * | | +--------+--------+<-+ | + * | | | ADD4 | ADD3 |--|-+ | + * | | +--------+--------+ | | | + * | | +--------+--------+<---+ | | + * | | | ADD4H | ADD3H |------|-+ |(+vzero) + * | | +--------+--------+ | | V + * | | ------------------------ | | +--------+ + * | | | | | RED3 | [d0 0 0 d0] + * | | | | +--------+ + * | +---->+--------+--------+ | | | + * (T2[1w]||ADD2[4w]||ADD1[3w]) +--------| T1 | T0 | | | | + * | +--------+--------+ | | | + * +---->---+--------+--------+ | | | + * T2| T1 | T0 |----+ | | + * ---+--------+--------+ | | | + * ---+--------+--------+<---+ | | + * +--- T2| T1 | T0 |----------+ + * | ---+--------+--------+ | | + * | +--------+--------+<-------------+ + * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0] + * | +--------+--------+ | | | + * | +--------+<----------------------+ + * | | RED3 |--------------+ | [0 0 d1 d0] + * | +--------+ | | + * +--->+--------+--------+ | | + * | T1 | T0 |--------+ + * +--------+--------+ | | + * --------------------------- | | + * | | + * +--------+--------+<----+ | + * | RED2 | RED1 | | + * +--------+--------+ | + * ---+--------+--------+<-------+ + * T2| T1 | T0 | (H1P-H1P-H00RRAY!) + * ---+--------+--------+ + * + * *Mi obra de arte de siglo XXI @vpaprots + * + * + * First group is special, doesnt get the two inputs: + * +--------+--------+<-+ + * +-------| ADD2 | ADD1 |--|-----+ + * | +--------+--------+ | | + * | +--------+--------+<---+ | + * | | ADD2H | ADD1H |--+ | + * | +--------+--------+ | | + * | +--------+--------+<-+ | + * | | ADD4 | ADD3 |--|-+ | + * | +--------+--------+ | | | + * | +--------+--------+<---+ | | + * | | ADD4H | ADD3H |------|-+ |(+vzero) + * | +--------+--------+ | | V + * | ------------------------ | | +--------+ + * | | | | RED3 | [d0 0 0 d0] + * | | | +--------+ + * +---->+--------+--------+ | | | + * (T2[1w]||ADD2[4w]||ADD1[3w]) | T1 | T0 |----+ | | + * +--------+--------+ | | | + * ---+--------+--------+<---+ | | + * +--- T2| T1 | T0 |----------+ + * | ---+--------+--------+ | | + * | +--------+--------+<-------------+ + * | | RED2 | RED1 |-----+ | | [0 d1 d0 d1] [d0 0 d1 d0] + * | +--------+--------+ | | | + * | +--------+<----------------------+ + * | | RED3 |--------------+ | [0 0 d1 d0] + * | +--------+ | | + * +--->+--------+--------+ | | + * | T1 | T0 |--------+ + * +--------+--------+ | | + * --------------------------- | | + * | | + * +--------+--------+<----+ | + * | RED2 | RED1 | | + * +--------+--------+ | + * ---+--------+--------+<-------+ + * T2| T1 | T0 | (H1P-H1P-H00RRAY!) + * ---+--------+--------+ + * + * Last 'group' needs to RED2||RED1 shifted less + */ +TEXT p256MulInternal<>(SB), NOSPLIT, $0-0 + VL 32(CPOOL), SEL1 + VL 48(CPOOL), SEL2 + VL 64(CPOOL), SEL3 + VL 80(CPOOL), SEL4 + + // --------------------------------------------------- + + VREPF $3, Y0, YDIG + VMLHF X0, YDIG, ADD1H + VMLHF X1, YDIG, ADD2H + VMLF X0, YDIG, ADD1 + VMLF X1, YDIG, ADD2 + + VREPF $2, Y0, YDIG + VMALF X0, YDIG, ADD1H, ADD3 + VMALF X1, YDIG, ADD2H, ADD4 + VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free + VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free + + VZERO ZER + VL 32(CPOOL), SEL1 + VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] + + VSLDB $12, ADD2, ADD1, T0 // ADD1 Free + VSLDB $12, ZER, ADD2, T1 // ADD2 Free + + VACCQ T0, ADD3, CAR1 + VAQ T0, ADD3, T0 // ADD3 Free + VACCCQ T1, ADD4, CAR1, T2 + VACQ T1, ADD4, CAR1, T1 // ADD4 Free + + VL 48(CPOOL), SEL2 + VL 64(CPOOL), SEL3 + VL 80(CPOOL), SEL4 + VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] + VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] + VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] + VSQ RED3, RED2, RED2 // Guaranteed not to underflow + + VSLDB $12, T1, T0, T0 + VSLDB $12, T2, T1, T1 + + VACCQ T0, ADD3H, CAR1 + VAQ T0, ADD3H, T0 + VACCCQ T1, ADD4H, CAR1, T2 + VACQ T1, ADD4H, CAR1, T1 + + // --------------------------------------------------- + + VREPF $1, Y0, YDIG + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + VMALF X0, YDIG, T0, ADD1 // T0 Free->ADD1 + VMALF X1, YDIG, T1, ADD2 // T1 Free->ADD2 + + VREPF $0, Y0, YDIG + VMALF X0, YDIG, ADD1H, ADD3 + VMALF X1, YDIG, ADD2H, ADD4 + VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free->ADD3H + VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free->ADD4H , YDIG Free->ZER + + VZERO ZER + VL 32(CPOOL), SEL1 + VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] + + VSLDB $12, ADD2, ADD1, T0 // ADD1 Free->T0 + VSLDB $12, T2, ADD2, T1 // ADD2 Free->T1, T2 Free + + VACCQ T0, RED1, CAR1 + VAQ T0, RED1, T0 + VACCCQ T1, RED2, CAR1, T2 + VACQ T1, RED2, CAR1, T1 + + VACCQ T0, ADD3, CAR1 + VAQ T0, ADD3, T0 + VACCCQ T1, ADD4, CAR1, CAR2 + VACQ T1, ADD4, CAR1, T1 + VAQ T2, CAR2, T2 + + VL 48(CPOOL), SEL2 + VL 64(CPOOL), SEL3 + VL 80(CPOOL), SEL4 + VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] + VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] + VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] + VSQ RED3, RED2, RED2 // Guaranteed not to underflow + + VSLDB $12, T1, T0, T0 + VSLDB $12, T2, T1, T1 + + VACCQ T0, ADD3H, CAR1 + VAQ T0, ADD3H, T0 + VACCCQ T1, ADD4H, CAR1, T2 + VACQ T1, ADD4H, CAR1, T1 + + // --------------------------------------------------- + + VREPF $3, Y1, YDIG + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + VMALF X0, YDIG, T0, ADD1 + VMALF X1, YDIG, T1, ADD2 + + VREPF $2, Y1, YDIG + VMALF X0, YDIG, ADD1H, ADD3 + VMALF X1, YDIG, ADD2H, ADD4 + VMALHF X0, YDIG, ADD1H, ADD3H // ADD1H Free + VMALHF X1, YDIG, ADD2H, ADD4H // ADD2H Free + + VZERO ZER + VL 32(CPOOL), SEL1 + VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] + + VSLDB $12, ADD2, ADD1, T0 // ADD1 Free + VSLDB $12, T2, ADD2, T1 // ADD2 Free + + VACCQ T0, RED1, CAR1 + VAQ T0, RED1, T0 + VACCCQ T1, RED2, CAR1, T2 + VACQ T1, RED2, CAR1, T1 + + VACCQ T0, ADD3, CAR1 + VAQ T0, ADD3, T0 + VACCCQ T1, ADD4, CAR1, CAR2 + VACQ T1, ADD4, CAR1, T1 + VAQ T2, CAR2, T2 + + VL 48(CPOOL), SEL2 + VL 64(CPOOL), SEL3 + VL 80(CPOOL), SEL4 + VPERM RED3, T0, SEL2, RED1 // [d0 0 d1 d0] + VPERM RED3, T0, SEL3, RED2 // [ 0 d1 d0 d1] + VPERM RED3, T0, SEL4, RED3 // [ 0 0 d1 d0] + VSQ RED3, RED2, RED2 // Guaranteed not to underflow + + VSLDB $12, T1, T0, T0 + VSLDB $12, T2, T1, T1 + + VACCQ T0, ADD3H, CAR1 + VAQ T0, ADD3H, T0 + VACCCQ T1, ADD4H, CAR1, T2 + VACQ T1, ADD4H, CAR1, T1 + + // --------------------------------------------------- + + VREPF $1, Y1, YDIG + VMALHF X0, YDIG, T0, ADD1H + VMALHF X1, YDIG, T1, ADD2H + VMALF X0, YDIG, T0, ADD1 + VMALF X1, YDIG, T1, ADD2 + + VREPF $0, Y1, YDIG + VMALF X0, YDIG, ADD1H, ADD3 + VMALF X1, YDIG, ADD2H, ADD4 + VMALHF X0, YDIG, ADD1H, ADD3H + VMALHF X1, YDIG, ADD2H, ADD4H + + VZERO ZER + VL 32(CPOOL), SEL1 + VPERM ZER, ADD1, SEL1, RED3 // [d0 0 0 d0] + + VSLDB $12, ADD2, ADD1, T0 + VSLDB $12, T2, ADD2, T1 + + VACCQ T0, RED1, CAR1 + VAQ T0, RED1, T0 + VACCCQ T1, RED2, CAR1, T2 + VACQ T1, RED2, CAR1, T1 + + VACCQ T0, ADD3, CAR1 + VAQ T0, ADD3, T0 + VACCCQ T1, ADD4, CAR1, CAR2 + VACQ T1, ADD4, CAR1, T1 + VAQ T2, CAR2, T2 + + VL 96(CPOOL), SEL5 + VL 112(CPOOL), SEL6 + VPERM T0, RED3, SEL5, RED2 // [d1 d0 d1 d0] + VPERM T0, RED3, SEL6, RED1 // [ 0 d1 d0 0] + VSQ RED1, RED2, RED2 // Guaranteed not to underflow + + VSLDB $12, T1, T0, T0 + VSLDB $12, T2, T1, T1 + + VACCQ T0, ADD3H, CAR1 + VAQ T0, ADD3H, T0 + VACCCQ T1, ADD4H, CAR1, T2 + VACQ T1, ADD4H, CAR1, T1 + + VACCQ T0, RED1, CAR1 + VAQ T0, RED1, T0 + VACCCQ T1, RED2, CAR1, CAR2 + VACQ T1, RED2, CAR1, T1 + VAQ T2, CAR2, T2 + + // --------------------------------------------------- + + VZERO RED3 + VSCBIQ P0, T0, CAR1 + VSQ P0, T0, ADD1H + VSBCBIQ T1, P1, CAR1, CAR2 + VSBIQ T1, P1, CAR1, ADD2H + VSBIQ T2, RED3, CAR2, T2 + + // what output to use, ADD2H||ADD1H or T1||T0? + VSEL T0, ADD1H, T2, T0 + VSEL T1, ADD2H, T2, T1 + RET + +#undef CPOOL + +#undef X0 +#undef X1 +#undef Y0 +#undef Y1 +#undef T0 +#undef T1 +#undef P0 +#undef P1 + +#undef SEL1 +#undef SEL2 +#undef SEL3 +#undef SEL4 +#undef SEL5 +#undef SEL6 + +#undef YDIG +#undef ADD1H +#undef ADD2H +#undef ADD3 +#undef ADD4 +#undef RED1 +#undef RED2 +#undef RED3 +#undef T2 +#undef ADD1 +#undef ADD2 +#undef ADD3H +#undef ADD4H +#undef ZER +#undef CAR1 +#undef CAR2 + +#define p256SubInternal(T1, T0, X1, X0, Y1, Y0) \ + VZERO ZER \ + VSCBIQ Y0, X0, CAR1 \ + VSQ Y0, X0, T0 \ + VSBCBIQ X1, Y1, CAR1, SEL1 \ + VSBIQ X1, Y1, CAR1, T1 \ + VSQ SEL1, ZER, SEL1 \ + \ + VACCQ T0, PL, CAR1 \ + VAQ T0, PL, TT0 \ + VACQ T1, PH, CAR1, TT1 \ + \ + VSEL T0, TT0, SEL1, T0 \ + VSEL T1, TT1, SEL1, T1 \ + +#define p256AddInternal(T1, T0, X1, X0, Y1, Y0) \ + VACCQ X0, Y0, CAR1 \ + VAQ X0, Y0, T0 \ + VACCCQ X1, Y1, CAR1, T2 \ + VACQ X1, Y1, CAR1, T1 \ + \ + VZERO ZER \ + VSCBIQ PL, T0, CAR1 \ + VSQ PL, T0, TT0 \ + VSBCBIQ T1, PH, CAR1, CAR2 \ + VSBIQ T1, PH, CAR1, TT1 \ + VSBIQ T2, ZER, CAR2, SEL1 \ + \ + VSEL T0, TT0, SEL1, T0 \ + VSEL T1, TT1, SEL1, T1 + +#define p256HalfInternal(T1, T0, X1, X0) \ + VZERO ZER \ + VSBIQ ZER, ZER, X0, SEL1 \ + \ + VACCQ X0, PL, CAR1 \ + VAQ X0, PL, T0 \ + VACCCQ X1, PH, CAR1, T2 \ + VACQ X1, PH, CAR1, T1 \ + \ + VSEL X0, T0, SEL1, T0 \ + VSEL X1, T1, SEL1, T1 \ + VSEL ZER, T2, SEL1, T2 \ + \ + VSLDB $15, T2, ZER, TT1 \ + VSLDB $15, T1, ZER, TT0 \ + VREPIB $1, SEL1 \ + VSRL SEL1, T0, T0 \ + VSRL SEL1, T1, T1 \ + VREPIB $7, SEL1 \ + VSL SEL1, TT0, TT0 \ + VSL SEL1, TT1, TT1 \ + VO T0, TT0, T0 \ + VO T1, TT1, T1 + +// --------------------------------------- +// func p256MulAsm(res, in1, in2 []byte) +#define res_ptr R1 +#define x_ptr R2 +#define y_ptr R3 +#define CPOOL R4 + +// Parameters +#define X0 V0 +#define X1 V1 +#define Y0 V2 +#define Y1 V3 +#define T0 V4 +#define T1 V5 + +// Constants +#define P0 V30 +#define P1 V31 +TEXT ·p256MulAsm(SB), NOSPLIT, $0 + MOVD res+0(FP), res_ptr + MOVD in1+24(FP), x_ptr + MOVD in2+48(FP), y_ptr + + VL (1*16)(x_ptr), X0 + VL (0*16)(x_ptr), X1 + VL (1*16)(y_ptr), Y0 + VL (0*16)(y_ptr), Y1 + + MOVD $p256mul<>+0x00(SB), CPOOL + VL 16(CPOOL), P0 + VL 0(CPOOL), P1 + + CALL p256MulInternal<>(SB) + + VST T0, (1*16)(res_ptr) + VST T1, (0*16)(res_ptr) + RET + +#undef res_ptr +#undef x_ptr +#undef y_ptr +#undef CPOOL + +#undef X0 +#undef X1 +#undef Y0 +#undef Y1 +#undef T0 +#undef T1 +#undef P0 +#undef P1 + +// Point add with P2 being affine point +// If sign == 1 -> P2 = -P2 +// If sel == 0 -> P3 = P1 +// if zero == 0 -> P3 = P2 +// p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int) +#define P3ptr R1 +#define P1ptr R2 +#define P2ptr R3 +#define CPOOL R4 + +// Temporaries in REGs +#define Y2L V15 +#define Y2H V16 +#define T1L V17 +#define T1H V18 +#define T2L V19 +#define T2H V20 +#define T3L V21 +#define T3H V22 +#define T4L V23 +#define T4H V24 + +// Temps for Sub and Add +#define TT0 V11 +#define TT1 V12 +#define T2 V13 + +// p256MulAsm Parameters +#define X0 V0 +#define X1 V1 +#define Y0 V2 +#define Y1 V3 +#define T0 V4 +#define T1 V5 + +#define PL V30 +#define PH V31 + +// Names for zero/sel selects +#define X1L V0 +#define X1H V1 +#define Y1L V2 // p256MulAsmParmY +#define Y1H V3 // p256MulAsmParmY +#define Z1L V4 +#define Z1H V5 +#define X2L V0 +#define X2H V1 +#define Z2L V4 +#define Z2H V5 +#define X3L V17 // T1L +#define X3H V18 // T1H +#define Y3L V21 // T3L +#define Y3H V22 // T3H +#define Z3L V28 +#define Z3H V29 + +#define ZER V6 +#define SEL1 V7 +#define CAR1 V8 +#define CAR2 V9 +/* * + * Three operand formula: + * Source: 2004 Hankerson–Menezes–Vanstone, page 91. + * T1 = Z1² + * T2 = T1*Z1 + * T1 = T1*X2 + * T2 = T2*Y2 + * T1 = T1-X1 + * T2 = T2-Y1 + * Z3 = Z1*T1 + * T3 = T1² + * T4 = T3*T1 + * T3 = T3*X1 + * T1 = 2*T3 + * X3 = T2² + * X3 = X3-T1 + * X3 = X3-T4 + * T3 = T3-X3 + * T3 = T3*T2 + * T4 = T4*Y1 + * Y3 = T3-T4 + + * Three operand formulas, but with MulInternal X,Y used to store temps +X=Z1; Y=Z1; MUL;T- // T1 = Z1² T1 +X=T ; Y- ; MUL;T2=T // T2 = T1*Z1 T1 T2 +X- ; Y=X2; MUL;T1=T // T1 = T1*X2 T1 T2 +X=T2; Y=Y2; MUL;T- // T2 = T2*Y2 T1 T2 +SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2 +SUB(Y<T1-X1) // T1 = T1-X1 T1 T2 +X=Z1; Y- ; MUL;Z3:=T// Z3 = Z1*T1 T2 +X=Y; Y- ; MUL;X=T // T3 = T1*T1 T2 +X- ; Y- ; MUL;T4=T // T4 = T3*T1 T2 T4 +X- ; Y=X1; MUL;T3=T // T3 = T3*X1 T2 T3 T4 +ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4 +X=T2; Y=T2; MUL;T- // X3 = T2*T2 T1 T2 T3 T4 +SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 +SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4 +SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4 +X- ; Y- ; MUL;T3=T // T3 = T3*T2 T2 T3 T4 +X=T4; Y=Y1; MUL;T- // T4 = T4*Y1 T3 T4 +SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 + + */ +TEXT ·p256PointAddAffineAsm(SB), NOSPLIT, $0 + MOVD P3+0(FP), P3ptr + MOVD P1+8(FP), P1ptr + MOVD P2+16(FP), P2ptr + + MOVD $p256mul<>+0x00(SB), CPOOL + VL 16(CPOOL), PL + VL 0(CPOOL), PH + + // if (sign == 1) { + // Y2 = fromBig(new(big.Int).Mod(new(big.Int).Sub(p256.P, new(big.Int).SetBytes(Y2)), p256.P)) // Y2 = P-Y2 + // } + + VL 32(P2ptr), Y2H + VL 48(P2ptr), Y2L + + VLREPG sign+24(FP), SEL1 + VZERO ZER + VCEQG SEL1, ZER, SEL1 + + VSCBIQ Y2L, PL, CAR1 + VSQ Y2L, PL, T1L + VSBIQ PH, Y2H, CAR1, T1H + + VSEL Y2L, T1L, SEL1, Y2L + VSEL Y2H, T1H, SEL1, Y2H + +/* * + * Three operand formula: + * Source: 2004 Hankerson–Menezes–Vanstone, page 91. + */ + // X=Z1; Y=Z1; MUL; T- // T1 = Z1² T1 + VL 64(P1ptr), X1 // Z1H + VL 80(P1ptr), X0 // Z1L + VLR X0, Y0 + VLR X1, Y1 + CALL p256MulInternal<>(SB) + + // X=T ; Y- ; MUL; T2=T // T2 = T1*Z1 T1 T2 + VLR T0, X0 + VLR T1, X1 + CALL p256MulInternal<>(SB) + VLR T0, T2L + VLR T1, T2H + + // X- ; Y=X2; MUL; T1=T // T1 = T1*X2 T1 T2 + VL 0(P2ptr), Y1 // X2H + VL 16(P2ptr), Y0 // X2L + CALL p256MulInternal<>(SB) + VLR T0, T1L + VLR T1, T1H + + // X=T2; Y=Y2; MUL; T- // T2 = T2*Y2 T1 T2 + VLR T2L, X0 + VLR T2H, X1 + VLR Y2L, Y0 + VLR Y2H, Y1 + CALL p256MulInternal<>(SB) + + // SUB(T2<T-Y1) // T2 = T2-Y1 T1 T2 + VL 32(P1ptr), Y1H + VL 48(P1ptr), Y1L + p256SubInternal(T2H,T2L,T1,T0,Y1H,Y1L) + + // SUB(Y<T1-X1) // T1 = T1-X1 T1 T2 + VL 0(P1ptr), X1H + VL 16(P1ptr), X1L + p256SubInternal(Y1,Y0,T1H,T1L,X1H,X1L) + + // X=Z1; Y- ; MUL; Z3:=T// Z3 = Z1*T1 T2 + VL 64(P1ptr), X1 // Z1H + VL 80(P1ptr), X0 // Z1L + CALL p256MulInternal<>(SB) + + // VST T1, 64(P3ptr) + // VST T0, 80(P3ptr) + VLR T0, Z3L + VLR T1, Z3H + + // X=Y; Y- ; MUL; X=T // T3 = T1*T1 T2 + VLR Y0, X0 + VLR Y1, X1 + CALL p256MulInternal<>(SB) + VLR T0, X0 + VLR T1, X1 + + // X- ; Y- ; MUL; T4=T // T4 = T3*T1 T2 T4 + CALL p256MulInternal<>(SB) + VLR T0, T4L + VLR T1, T4H + + // X- ; Y=X1; MUL; T3=T // T3 = T3*X1 T2 T3 T4 + VL 0(P1ptr), Y1 // X1H + VL 16(P1ptr), Y0 // X1L + CALL p256MulInternal<>(SB) + VLR T0, T3L + VLR T1, T3H + + // ADD(T1<T+T) // T1 = T3+T3 T1 T2 T3 T4 + p256AddInternal(T1H,T1L, T1,T0,T1,T0) + + // X=T2; Y=T2; MUL; T- // X3 = T2*T2 T1 T2 T3 T4 + VLR T2L, X0 + VLR T2H, X1 + VLR T2L, Y0 + VLR T2H, Y1 + CALL p256MulInternal<>(SB) + + // SUB(T<T-T1) // X3 = X3-T1 T1 T2 T3 T4 (T1 = X3) + p256SubInternal(T1,T0,T1,T0,T1H,T1L) + + // SUB(T<T-T4) X3:=T // X3 = X3-T4 T2 T3 T4 + p256SubInternal(T1,T0,T1,T0,T4H,T4L) + VLR T0, X3L + VLR T1, X3H + + // SUB(X<T3-T) // T3 = T3-X3 T2 T3 T4 + p256SubInternal(X1,X0,T3H,T3L,T1,T0) + + // X- ; Y- ; MUL; T3=T // T3 = T3*T2 T2 T3 T4 + CALL p256MulInternal<>(SB) + VLR T0, T3L + VLR T1, T3H + + // X=T4; Y=Y1; MUL; T- // T4 = T4*Y1 T3 T4 + VLR T4L, X0 + VLR T4H, X1 + VL 32(P1ptr), Y1 // Y1H + VL 48(P1ptr), Y0 // Y1L + CALL p256MulInternal<>(SB) + + // SUB(T<T3-T) Y3:=T // Y3 = T3-T4 T3 T4 (T3 = Y3) + p256SubInternal(Y3H,Y3L,T3H,T3L,T1,T0) + + // if (sel == 0) { + // copy(P3.x[:], X1) + // copy(P3.y[:], Y1) + // copy(P3.z[:], Z1) + // } + + VL 0(P1ptr), X1H + VL 16(P1ptr), X1L + + // Y1 already loaded, left over from addition + VL 64(P1ptr), Z1H + VL 80(P1ptr), Z1L + + VLREPG sel+32(FP), SEL1 + VZERO ZER + VCEQG SEL1, ZER, SEL1 + + VSEL X1L, X3L, SEL1, X3L + VSEL X1H, X3H, SEL1, X3H + VSEL Y1L, Y3L, SEL1, Y3L + VSEL Y1H, Y3H, SEL1, Y3H + VSEL Z1L, Z3L, SEL1, Z3L + VSEL Z1H, Z3H, SEL1, Z3H + + // if (zero == 0) { + // copy(P3.x[:], X2) + // copy(P3.y[:], Y2) + // copy(P3.z[:], []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + // 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}) //(p256.z*2^256)%p + // } + VL 0(P2ptr), X2H + VL 16(P2ptr), X2L + + // Y2 already loaded + VL 128(CPOOL), Z2H + VL 144(CPOOL), Z2L + + VLREPG zero+40(FP), SEL1 + VZERO ZER + VCEQG SEL1, ZER, SEL1 + + VSEL X2L, X3L, SEL1, X3L + VSEL X2H, X3H, SEL1, X3H + VSEL Y2L, Y3L, SEL1, Y3L + VSEL Y2H, Y3H, SEL1, Y3H + VSEL Z2L, Z3L, SEL1, Z3L + VSEL Z2H, Z3H, SEL1, Z3H + + // All done, store out the result!!! + VST X3H, 0(P3ptr) + VST X3L, 16(P3ptr) + VST Y3H, 32(P3ptr) + VST Y3L, 48(P3ptr) + VST Z3H, 64(P3ptr) + VST Z3L, 80(P3ptr) + + RET + +#undef P3ptr +#undef P1ptr +#undef P2ptr +#undef CPOOL + +#undef Y2L +#undef Y2H +#undef T1L +#undef T1H +#undef T2L +#undef T2H +#undef T3L +#undef T3H +#undef T4L +#undef T4H + +#undef TT0 +#undef TT1 +#undef T2 + +#undef X0 +#undef X1 +#undef Y0 +#undef Y1 +#undef T0 +#undef T1 + +#undef PL +#undef PH + +#undef X1L +#undef X1H +#undef Y1L +#undef Y1H +#undef Z1L +#undef Z1H +#undef X2L +#undef X2H +#undef Z2L +#undef Z2H +#undef X3L +#undef X3H +#undef Y3L +#undef Y3H +#undef Z3L +#undef Z3H + +#undef ZER +#undef SEL1 +#undef CAR1 +#undef CAR2 + +// p256PointDoubleAsm(P3, P1 *p256Point) +// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl +// http://www.hyperelliptic.org/EFD/g1p/auto-shortw.html +// http://www.hyperelliptic.org/EFD/g1p/auto-shortw-projective-3.html +#define P3ptr R1 +#define P1ptr R2 +#define CPOOL R4 + +// Temporaries in REGs +#define X3L V15 +#define X3H V16 +#define Y3L V17 +#define Y3H V18 +#define T1L V19 +#define T1H V20 +#define T2L V21 +#define T2H V22 +#define T3L V23 +#define T3H V24 + +#define X1L V6 +#define X1H V7 +#define Y1L V8 +#define Y1H V9 +#define Z1L V10 +#define Z1H V11 + +// Temps for Sub and Add +#define TT0 V11 +#define TT1 V12 +#define T2 V13 + +// p256MulAsm Parameters +#define X0 V0 +#define X1 V1 +#define Y0 V2 +#define Y1 V3 +#define T0 V4 +#define T1 V5 + +#define PL V30 +#define PH V31 + +#define Z3L V23 +#define Z3H V24 + +#define ZER V26 +#define SEL1 V27 +#define CAR1 V28 +#define CAR2 V29 +/* + * http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2004-hmv + * Cost: 4M + 4S + 1*half + 5add + 2*2 + 1*3. + * Source: 2004 Hankerson–Menezes–Vanstone, page 91. + * A = 3(X₁-Z₁²)×(X₁+Z₁²) + * B = 2Y₁ + * Z₃ = B×Z₁ + * C = B² + * D = C×X₁ + * X₃ = A²-2D + * Y₃ = (D-X₃)×A-C²/2 + * + * Three-operand formula: + * T1 = Z1² + * T2 = X1-T1 + * T1 = X1+T1 + * T2 = T2*T1 + * T2 = 3*T2 + * Y3 = 2*Y1 + * Z3 = Y3*Z1 + * Y3 = Y3² + * T3 = Y3*X1 + * Y3 = Y3² + * Y3 = half*Y3 + * X3 = T2² + * T1 = 2*T3 + * X3 = X3-T1 + * T1 = T3-X3 + * T1 = T1*T2 + * Y3 = T1-Y3 + */ + +TEXT ·p256PointDoubleAsm(SB), NOSPLIT, $0 + MOVD P3+0(FP), P3ptr + MOVD P1+8(FP), P1ptr + + MOVD $p256mul<>+0x00(SB), CPOOL + VL 16(CPOOL), PL + VL 0(CPOOL), PH + + // X=Z1; Y=Z1; MUL; T- // T1 = Z1² + VL 64(P1ptr), X1 // Z1H + VL 80(P1ptr), X0 // Z1L + VLR X0, Y0 + VLR X1, Y1 + CALL p256MulInternal<>(SB) + + // SUB(X<X1-T) // T2 = X1-T1 + VL 0(P1ptr), X1H + VL 16(P1ptr), X1L + p256SubInternal(X1,X0,X1H,X1L,T1,T0) + + // ADD(Y<X1+T) // T1 = X1+T1 + p256AddInternal(Y1,Y0,X1H,X1L,T1,T0) + + // X- ; Y- ; MUL; T- // T2 = T2*T1 + CALL p256MulInternal<>(SB) + + // ADD(T2<T+T); ADD(T2<T2+T) // T2 = 3*T2 + p256AddInternal(T2H,T2L,T1,T0,T1,T0) + p256AddInternal(T2H,T2L,T2H,T2L,T1,T0) + + // ADD(X<Y1+Y1) // Y3 = 2*Y1 + VL 32(P1ptr), Y1H + VL 48(P1ptr), Y1L + p256AddInternal(X1,X0,Y1H,Y1L,Y1H,Y1L) + + // X- ; Y=Z1; MUL; Z3:=T // Z3 = Y3*Z1 + VL 64(P1ptr), Y1 // Z1H + VL 80(P1ptr), Y0 // Z1L + CALL p256MulInternal<>(SB) + VST T1, 64(P3ptr) + VST T0, 80(P3ptr) + + // X- ; Y=X ; MUL; T- // Y3 = Y3² + VLR X0, Y0 + VLR X1, Y1 + CALL p256MulInternal<>(SB) + + // X=T ; Y=X1; MUL; T3=T // T3 = Y3*X1 + VLR T0, X0 + VLR T1, X1 + VL 0(P1ptr), Y1 + VL 16(P1ptr), Y0 + CALL p256MulInternal<>(SB) + VLR T0, T3L + VLR T1, T3H + + // X- ; Y=X ; MUL; T- // Y3 = Y3² + VLR X0, Y0 + VLR X1, Y1 + CALL p256MulInternal<>(SB) + + // HAL(Y3<T) // Y3 = half*Y3 + p256HalfInternal(Y3H,Y3L, T1,T0) + + // X=T2; Y=T2; MUL; T- // X3 = T2² + VLR T2L, X0 + VLR T2H, X1 + VLR T2L, Y0 + VLR T2H, Y1 + CALL p256MulInternal<>(SB) + + // ADD(T1<T3+T3) // T1 = 2*T3 + p256AddInternal(T1H,T1L,T3H,T3L,T3H,T3L) + + // SUB(X3<T-T1) X3:=X3 // X3 = X3-T1 + p256SubInternal(X3H,X3L,T1,T0,T1H,T1L) + VST X3H, 0(P3ptr) + VST X3L, 16(P3ptr) + + // SUB(X<T3-X3) // T1 = T3-X3 + p256SubInternal(X1,X0,T3H,T3L,X3H,X3L) + + // X- ; Y- ; MUL; T- // T1 = T1*T2 + CALL p256MulInternal<>(SB) + + // SUB(Y3<T-Y3) // Y3 = T1-Y3 + p256SubInternal(Y3H,Y3L,T1,T0,Y3H,Y3L) + + VST Y3H, 32(P3ptr) + VST Y3L, 48(P3ptr) + RET + +#undef P3ptr +#undef P1ptr +#undef CPOOL +#undef X3L +#undef X3H +#undef Y3L +#undef Y3H +#undef T1L +#undef T1H +#undef T2L +#undef T2H +#undef T3L +#undef T3H +#undef X1L +#undef X1H +#undef Y1L +#undef Y1H +#undef Z1L +#undef Z1H +#undef TT0 +#undef TT1 +#undef T2 +#undef X0 +#undef X1 +#undef Y0 +#undef Y1 +#undef T0 +#undef T1 +#undef PL +#undef PH +#undef Z3L +#undef Z3H +#undef ZER +#undef SEL1 +#undef CAR1 +#undef CAR2 + +// p256PointAddAsm(P3, P1, P2 *p256Point) +#define P3ptr R1 +#define P1ptr R2 +#define P2ptr R3 +#define CPOOL R4 + +// Temporaries in REGs +#define T1L V16 +#define T1H V17 +#define T2L V18 +#define T2H V19 +#define U1L V20 +#define U1H V21 +#define S1L V22 +#define S1H V23 +#define HL V24 +#define HH V25 +#define RL V26 +#define RH V27 + +// Temps for Sub and Add +#define ZER V6 +#define SEL1 V7 +#define CAR1 V8 +#define CAR2 V9 +#define TT0 V11 +#define TT1 V12 +#define T2 V13 + +// p256MulAsm Parameters +#define X0 V0 +#define X1 V1 +#define Y0 V2 +#define Y1 V3 +#define T0 V4 +#define T1 V5 + +#define PL V30 +#define PH V31 +/* + * https://choucroutage.com/Papers/SideChannelAttacks/ctrsa-2011-brown.pdf "Software Implementation of the NIST Elliptic Curves Over Prime Fields" + * + * A = X₁×Z₂² + * B = Y₁×Z₂³ + * C = X₂×Z₁²-A + * D = Y₂×Z₁³-B + * X₃ = D² - 2A×C² - C³ + * Y₃ = D×(A×C² - X₃) - B×C³ + * Z₃ = Z₁×Z₂×C + * + * Three-operand formula (adopted): http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-1998-cmo-2 + * Temp storage: T1,T2,U1,H,Z3=X3=Y3,S1,R + * + * T1 = Z1*Z1 + * T2 = Z2*Z2 + * U1 = X1*T2 + * H = X2*T1 + * H = H-U1 + * Z3 = Z1*Z2 + * Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array + * + * S1 = Z2*T2 + * S1 = Y1*S1 + * R = Z1*T1 + * R = Y2*R + * R = R-S1 + * + * T1 = H*H + * T2 = H*T1 + * U1 = U1*T1 + * + * X3 = R*R + * X3 = X3-T2 + * T1 = 2*U1 + * X3 = X3-T1 << store-out X3 result reg + * + * T2 = S1*T2 + * Y3 = U1-X3 + * Y3 = R*Y3 + * Y3 = Y3-T2 << store-out Y3 result reg + + // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1 + // X- ; Y=T ; MUL; R=T // R = Z1*T1 + // X=X2; Y- ; MUL; H=T // H = X2*T1 + // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2 + // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2 + // X=X1; Y- ; MUL; U1=T // U1 = X1*T2 + // SUB(H<H-T) // H = H-U1 + // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2 + // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H << store-out Z3 result reg.. could override Z1, if slices have same backing array + // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1 + // X=Y2; Y=R ; MUL; T- // R = Y2*R + // SUB(R<T-S1) // R = R-S1 + // X=H ; Y=H ; MUL; T- // T1 = H*H + // X- ; Y=T ; MUL; T2=T // T2 = H*T1 + // X=U1; Y- ; MUL; U1=T // U1 = U1*T1 + // X=R ; Y=R ; MUL; T- // X3 = R*R + // SUB(T<T-T2) // X3 = X3-T2 + // ADD(X<U1+U1) // T1 = 2*U1 + // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg + // SUB(Y<U1-T) // Y3 = U1-X3 + // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3 + // X=S1; Y=T2; MUL; T- // T2 = S1*T2 + // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg + */ +TEXT ·p256PointAddAsm(SB), NOSPLIT, $0 + MOVD P3+0(FP), P3ptr + MOVD P1+8(FP), P1ptr + MOVD P2+16(FP), P2ptr + + MOVD $p256mul<>+0x00(SB), CPOOL + VL 16(CPOOL), PL + VL 0(CPOOL), PH + + // X=Z1; Y=Z1; MUL; T- // T1 = Z1*Z1 + VL 64(P1ptr), X1 // Z1H + VL 80(P1ptr), X0 // Z1L + VLR X0, Y0 + VLR X1, Y1 + CALL p256MulInternal<>(SB) + + // X- ; Y=T ; MUL; R=T // R = Z1*T1 + VLR T0, Y0 + VLR T1, Y1 + CALL p256MulInternal<>(SB) + VLR T0, RL + VLR T1, RH + + // X=X2; Y- ; MUL; H=T // H = X2*T1 + VL 0(P2ptr), X1 // X2H + VL 16(P2ptr), X0 // X2L + CALL p256MulInternal<>(SB) + VLR T0, HL + VLR T1, HH + + // X=Z2; Y=Z2; MUL; T- // T2 = Z2*Z2 + VL 64(P2ptr), X1 // Z2H + VL 80(P2ptr), X0 // Z2L + VLR X0, Y0 + VLR X1, Y1 + CALL p256MulInternal<>(SB) + + // X- ; Y=T ; MUL; S1=T // S1 = Z2*T2 + VLR T0, Y0 + VLR T1, Y1 + CALL p256MulInternal<>(SB) + VLR T0, S1L + VLR T1, S1H + + // X=X1; Y- ; MUL; U1=T // U1 = X1*T2 + VL 0(P1ptr), X1 // X1H + VL 16(P1ptr), X0 // X1L + CALL p256MulInternal<>(SB) + VLR T0, U1L + VLR T1, U1H + + // SUB(H<H-T) // H = H-U1 + p256SubInternal(HH,HL,HH,HL,T1,T0) + + // X=Z1; Y=Z2; MUL; T- // Z3 = Z1*Z2 + VL 64(P1ptr), X1 // Z1H + VL 80(P1ptr), X0 // Z1L + VL 64(P2ptr), Y1 // Z2H + VL 80(P2ptr), Y0 // Z2L + CALL p256MulInternal<>(SB) + + // X=T ; Y=H ; MUL; Z3:=T// Z3 = Z3*H + VLR T0, X0 + VLR T1, X1 + VLR HL, Y0 + VLR HH, Y1 + CALL p256MulInternal<>(SB) + VST T1, 64(P3ptr) + VST T0, 80(P3ptr) + + // X=Y1; Y=S1; MUL; S1=T // S1 = Y1*S1 + VL 32(P1ptr), X1 + VL 48(P1ptr), X0 + VLR S1L, Y0 + VLR S1H, Y1 + CALL p256MulInternal<>(SB) + VLR T0, S1L + VLR T1, S1H + + // X=Y2; Y=R ; MUL; T- // R = Y2*R + VL 32(P2ptr), X1 + VL 48(P2ptr), X0 + VLR RL, Y0 + VLR RH, Y1 + CALL p256MulInternal<>(SB) + + // SUB(R<T-S1) // R = T-S1 + p256SubInternal(RH,RL,T1,T0,S1H,S1L) + + // X=H ; Y=H ; MUL; T- // T1 = H*H + VLR HL, X0 + VLR HH, X1 + VLR HL, Y0 + VLR HH, Y1 + CALL p256MulInternal<>(SB) + + // X- ; Y=T ; MUL; T2=T // T2 = H*T1 + VLR T0, Y0 + VLR T1, Y1 + CALL p256MulInternal<>(SB) + VLR T0, T2L + VLR T1, T2H + + // X=U1; Y- ; MUL; U1=T // U1 = U1*T1 + VLR U1L, X0 + VLR U1H, X1 + CALL p256MulInternal<>(SB) + VLR T0, U1L + VLR T1, U1H + + // X=R ; Y=R ; MUL; T- // X3 = R*R + VLR RL, X0 + VLR RH, X1 + VLR RL, Y0 + VLR RH, Y1 + CALL p256MulInternal<>(SB) + + // SUB(T<T-T2) // X3 = X3-T2 + p256SubInternal(T1,T0,T1,T0,T2H,T2L) + + // ADD(X<U1+U1) // T1 = 2*U1 + p256AddInternal(X1,X0,U1H,U1L,U1H,U1L) + + // SUB(T<T-X) X3:=T // X3 = X3-T1 << store-out X3 result reg + p256SubInternal(T1,T0,T1,T0,X1,X0) + VST T1, 0(P3ptr) + VST T0, 16(P3ptr) + + // SUB(Y<U1-T) // Y3 = U1-X3 + p256SubInternal(Y1,Y0,U1H,U1L,T1,T0) + + // X=R ; Y- ; MUL; U1=T // Y3 = R*Y3 + VLR RL, X0 + VLR RH, X1 + CALL p256MulInternal<>(SB) + VLR T0, U1L + VLR T1, U1H + + // X=S1; Y=T2; MUL; T- // T2 = S1*T2 + VLR S1L, X0 + VLR S1H, X1 + VLR T2L, Y0 + VLR T2H, Y1 + CALL p256MulInternal<>(SB) + + // SUB(T<U1-T); Y3:=T // Y3 = Y3-T2 << store-out Y3 result reg + p256SubInternal(T1,T0,U1H,U1L,T1,T0) + VST T1, 32(P3ptr) + VST T0, 48(P3ptr) + + RET diff --git a/src/crypto/elliptic/p256_generic.go b/src/crypto/elliptic/p256_generic.go new file mode 100644 index 0000000000..9963fcafdd --- /dev/null +++ b/src/crypto/elliptic/p256_generic.go @@ -0,0 +1,16 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build !amd64,!s390x + +package elliptic + +var ( + p256 p256Curve +) + +func initP256Arch() { + // Use pure Go implementation. + p256 = p256Curve{p256Params} +} diff --git a/src/crypto/elliptic/p256_s390x.go b/src/crypto/elliptic/p256_s390x.go new file mode 100644 index 0000000000..2ed4c0b9e7 --- /dev/null +++ b/src/crypto/elliptic/p256_s390x.go @@ -0,0 +1,513 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// +build s390x + +package elliptic + +import ( + "math/big" +) + +type p256CurveFast struct { + *CurveParams +} + +type p256Point struct { + x [32]byte + y [32]byte + z [32]byte +} + +var ( + p256 Curve + p256PreFast *[37][64]p256Point +) + +// hasVectorFacility reports whether the machine has the z/Architecture +// vector facility installed and enabled. +func hasVectorFacility() bool + +var hasVX = hasVectorFacility() + +func initP256Arch() { + if hasVX { + p256 = p256CurveFast{p256Params} + initTable() + return + } + + // No vector support, use pure Go implementation. + p256 = p256Curve{p256Params} + return +} + +func (curve p256CurveFast) Params() *CurveParams { + return curve.CurveParams +} + +// Functions implemented in p256_asm_s390x.s +// Montgomery multiplication modulo P256 +func p256MulAsm(res, in1, in2 []byte) + +// Montgomery square modulo P256 +func p256Sqr(res, in []byte) { + p256MulAsm(res, in, in) +} + +// Montgomery multiplication by 1 +func p256FromMont(res, in []byte) + +// iff cond == 1 val <- -val +func p256NegCond(val *p256Point, cond int) + +// if cond == 0 res <- b; else res <- a +func p256MovCond(res, a, b *p256Point, cond int) + +// Constant time table access +func p256Select(point *p256Point, table []p256Point, idx int) +func p256SelectBase(point *p256Point, table []p256Point, idx int) + +// Montgomery multiplication modulo Ord(G) +func p256OrdMul(res, in1, in2 []byte) + +// Montgomery square modulo Ord(G), repeated n times +func p256OrdSqr(res, in []byte, n int) { + copy(res, in) + for i := 0; i < n; i += 1 { + p256OrdMul(res, res, res) + } +} + +// Point add with P2 being affine point +// If sign == 1 -> P2 = -P2 +// If sel == 0 -> P3 = P1 +// if zero == 0 -> P3 = P2 +func p256PointAddAffineAsm(P3, P1, P2 *p256Point, sign, sel, zero int) + +// Point add +func p256PointAddAsm(P3, P1, P2 *p256Point) +func p256PointDoubleAsm(P3, P1 *p256Point) + +func (curve p256CurveFast) Inverse(k *big.Int) *big.Int { + if k.Cmp(p256Params.N) >= 0 { + // This should never happen. + reducedK := new(big.Int).Mod(k, p256Params.N) + k = reducedK + } + + // table will store precomputed powers of x. The 32 bytes at index + // i store x^(i+1). + var table [15][32]byte + + x := fromBig(k) + // This code operates in the Montgomery domain where R = 2^256 mod n + // and n is the order of the scalar field. (See initP256 for the + // value.) Elements in the Montgomery domain take the form a×R and + // multiplication of x and y in the calculates (x × y × R^-1) mod n. RR + // is R×R mod n thus the Montgomery multiplication x and RR gives x×R, + // i.e. converts x into the Montgomery domain. Stored in BigEndian form + RR := []byte{0x66, 0xe1, 0x2d, 0x94, 0xf3, 0xd9, 0x56, 0x20, 0x28, 0x45, 0xb2, 0x39, 0x2b, 0x6b, 0xec, 0x59, + 0x46, 0x99, 0x79, 0x9c, 0x49, 0xbd, 0x6f, 0xa6, 0x83, 0x24, 0x4c, 0x95, 0xbe, 0x79, 0xee, 0xa2} + + p256OrdMul(table[0][:], x, RR) + + // Prepare the table, no need in constant time access, because the + // power is not a secret. (Entry 0 is never used.) + for i := 2; i < 16; i += 2 { + p256OrdSqr(table[i-1][:], table[(i/2)-1][:], 1) + p256OrdMul(table[i][:], table[i-1][:], table[0][:]) + } + + copy(x, table[14][:]) // f + + p256OrdSqr(x[0:32], x[0:32], 4) + p256OrdMul(x[0:32], x[0:32], table[14][:]) // ff + t := make([]byte, 32) + copy(t, x) + + p256OrdSqr(x, x, 8) + p256OrdMul(x, x, t) // ffff + copy(t, x) + + p256OrdSqr(x, x, 16) + p256OrdMul(x, x, t) // ffffffff + copy(t, x) + + p256OrdSqr(x, x, 64) // ffffffff0000000000000000 + p256OrdMul(x, x, t) // ffffffff00000000ffffffff + p256OrdSqr(x, x, 32) // ffffffff00000000ffffffff00000000 + p256OrdMul(x, x, t) // ffffffff00000000ffffffffffffffff + + // Remaining 32 windows + expLo := [32]byte{0xb, 0xc, 0xe, 0x6, 0xf, 0xa, 0xa, 0xd, 0xa, 0x7, 0x1, 0x7, 0x9, 0xe, 0x8, 0x4, + 0xf, 0x3, 0xb, 0x9, 0xc, 0xa, 0xc, 0x2, 0xf, 0xc, 0x6, 0x3, 0x2, 0x5, 0x4, 0xf} + for i := 0; i < 32; i++ { + p256OrdSqr(x, x, 4) + p256OrdMul(x, x, table[expLo[i]-1][:]) + } + + // Multiplying by one in the Montgomery domain converts a Montgomery + // value out of the domain. + one := []byte{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1} + p256OrdMul(x, x, one) + + return new(big.Int).SetBytes(x) +} + +// fromBig converts a *big.Int into a format used by this code. +func fromBig(big *big.Int) []byte { + // This could be done a lot more efficiently... + res := big.Bytes() + if 32 == len(res) { + return res + } + t := make([]byte, 32) + offset := 32 - len(res) + for i := len(res) - 1; i >= 0; i-- { + t[i+offset] = res[i] + } + return t +} + +// p256GetMultiplier makes sure byte array will have 32 byte elements, If the scalar +// is equal or greater than the order of the group, it's reduced modulo that order. +func p256GetMultiplier(in []byte) []byte { + n := new(big.Int).SetBytes(in) + + if n.Cmp(p256Params.N) >= 0 { + n.Mod(n, p256Params.N) + } + return fromBig(n) +} + +// p256MulAsm operates in a Montgomery domain with R = 2^256 mod p, where p is the +// underlying field of the curve. (See initP256 for the value.) Thus rr here is +// R×R mod p. See comment in Inverse about how this is used. +var rr = []byte{0x00, 0x00, 0x00, 0x04, 0xff, 0xff, 0xff, 0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xfe, + 0xff, 0xff, 0xff, 0xfb, 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x03} + +// (This is one, in the Montgomery domain.) +var one = []byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01} + +func maybeReduceModP(in *big.Int) *big.Int { + if in.Cmp(p256Params.P) < 0 { + return in + } + return new(big.Int).Mod(in, p256Params.P) +} + +func (curve p256CurveFast) CombinedMult(bigX, bigY *big.Int, baseScalar, scalar []byte) (x, y *big.Int) { + var r1, r2 p256Point + r1.p256BaseMult(p256GetMultiplier(baseScalar)) + + copy(r2.x[:], fromBig(maybeReduceModP(bigX))) + copy(r2.y[:], fromBig(maybeReduceModP(bigY))) + copy(r2.z[:], one) + p256MulAsm(r2.x[:], r2.x[:], rr[:]) + p256MulAsm(r2.y[:], r2.y[:], rr[:]) + + r2.p256ScalarMult(p256GetMultiplier(scalar)) + p256PointAddAsm(&r1, &r1, &r2) + return r1.p256PointToAffine() +} + +func (curve p256CurveFast) ScalarBaseMult(scalar []byte) (x, y *big.Int) { + var r p256Point + r.p256BaseMult(p256GetMultiplier(scalar)) + return r.p256PointToAffine() +} + +func (curve p256CurveFast) ScalarMult(bigX, bigY *big.Int, scalar []byte) (x, y *big.Int) { + var r p256Point + copy(r.x[:], fromBig(maybeReduceModP(bigX))) + copy(r.y[:], fromBig(maybeReduceModP(bigY))) + copy(r.z[:], one) + p256MulAsm(r.x[:], r.x[:], rr[:]) + p256MulAsm(r.y[:], r.y[:], rr[:]) + r.p256ScalarMult(p256GetMultiplier(scalar)) + return r.p256PointToAffine() +} + +func (p *p256Point) p256PointToAffine() (x, y *big.Int) { + zInv := make([]byte, 32) + zInvSq := make([]byte, 32) + + p256Inverse(zInv, p.z[:]) + p256Sqr(zInvSq, zInv) + p256MulAsm(zInv, zInv, zInvSq) + + p256MulAsm(zInvSq, p.x[:], zInvSq) + p256MulAsm(zInv, p.y[:], zInv) + + p256FromMont(zInvSq, zInvSq) + p256FromMont(zInv, zInv) + + return new(big.Int).SetBytes(zInvSq), new(big.Int).SetBytes(zInv) +} + +// p256Inverse sets out to in^-1 mod p. +func p256Inverse(out, in []byte) { + var stack [6 * 32]byte + p2 := stack[32*0 : 32*0+32] + p4 := stack[32*1 : 32*1+32] + p8 := stack[32*2 : 32*2+32] + p16 := stack[32*3 : 32*3+32] + p32 := stack[32*4 : 32*4+32] + + p256Sqr(out, in) + p256MulAsm(p2, out, in) // 3*p + + p256Sqr(out, p2) + p256Sqr(out, out) + p256MulAsm(p4, out, p2) // f*p + + p256Sqr(out, p4) + p256Sqr(out, out) + p256Sqr(out, out) + p256Sqr(out, out) + p256MulAsm(p8, out, p4) // ff*p + + p256Sqr(out, p8) + + for i := 0; i < 7; i++ { + p256Sqr(out, out) + } + p256MulAsm(p16, out, p8) // ffff*p + + p256Sqr(out, p16) + for i := 0; i < 15; i++ { + p256Sqr(out, out) + } + p256MulAsm(p32, out, p16) // ffffffff*p + + p256Sqr(out, p32) + + for i := 0; i < 31; i++ { + p256Sqr(out, out) + } + p256MulAsm(out, out, in) + + for i := 0; i < 32*4; i++ { + p256Sqr(out, out) + } + p256MulAsm(out, out, p32) + + for i := 0; i < 32; i++ { + p256Sqr(out, out) + } + p256MulAsm(out, out, p32) + + for i := 0; i < 16; i++ { + p256Sqr(out, out) + } + p256MulAsm(out, out, p16) + + for i := 0; i < 8; i++ { + p256Sqr(out, out) + } + p256MulAsm(out, out, p8) + + p256Sqr(out, out) + p256Sqr(out, out) + p256Sqr(out, out) + p256Sqr(out, out) + p256MulAsm(out, out, p4) + + p256Sqr(out, out) + p256Sqr(out, out) + p256MulAsm(out, out, p2) + + p256Sqr(out, out) + p256Sqr(out, out) + p256MulAsm(out, out, in) +} + +func boothW5(in uint) (int, int) { + var s uint = ^((in >> 5) - 1) + var d uint = (1 << 6) - in - 1 + d = (d & s) | (in & (^s)) + d = (d >> 1) + (d & 1) + return int(d), int(s & 1) +} + +func boothW7(in uint) (int, int) { + var s uint = ^((in >> 7) - 1) + var d uint = (1 << 8) - in - 1 + d = (d & s) | (in & (^s)) + d = (d >> 1) + (d & 1) + return int(d), int(s & 1) +} + +func initTable() { + p256PreFast = new([37][64]p256Point) //z coordinate not used + basePoint := p256Point{ + x: [32]byte{0x18, 0x90, 0x5f, 0x76, 0xa5, 0x37, 0x55, 0xc6, 0x79, 0xfb, 0x73, 0x2b, 0x77, 0x62, 0x25, 0x10, + 0x75, 0xba, 0x95, 0xfc, 0x5f, 0xed, 0xb6, 0x01, 0x79, 0xe7, 0x30, 0xd4, 0x18, 0xa9, 0x14, 0x3c}, //(p256.x*2^256)%p + y: [32]byte{0x85, 0x71, 0xff, 0x18, 0x25, 0x88, 0x5d, 0x85, 0xd2, 0xe8, 0x86, 0x88, 0xdd, 0x21, 0xf3, 0x25, + 0x8b, 0x4a, 0xb8, 0xe4, 0xba, 0x19, 0xe4, 0x5c, 0xdd, 0xf2, 0x53, 0x57, 0xce, 0x95, 0x56, 0x0a}, //(p256.y*2^256)%p + z: [32]byte{0x00, 0x00, 0x00, 0x00, 0xff, 0xff, 0xff, 0xfe, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01}, //(p256.z*2^256)%p + } + + t1 := new(p256Point) + t2 := new(p256Point) + *t2 = basePoint + + zInv := make([]byte, 32) + zInvSq := make([]byte, 32) + for j := 0; j < 64; j++ { + *t1 = *t2 + for i := 0; i < 37; i++ { + // The window size is 7 so we need to double 7 times. + if i != 0 { + for k := 0; k < 7; k++ { + p256PointDoubleAsm(t1, t1) + } + } + // Convert the point to affine form. (Its values are + // still in Montgomery form however.) + p256Inverse(zInv, t1.z[:]) + p256Sqr(zInvSq, zInv) + p256MulAsm(zInv, zInv, zInvSq) + + p256MulAsm(t1.x[:], t1.x[:], zInvSq) + p256MulAsm(t1.y[:], t1.y[:], zInv) + + copy(t1.z[:], basePoint.z[:]) + // Update the table entry + copy(p256PreFast[i][j].x[:], t1.x[:]) + copy(p256PreFast[i][j].y[:], t1.y[:]) + } + if j == 0 { + p256PointDoubleAsm(t2, &basePoint) + } else { + p256PointAddAsm(t2, t2, &basePoint) + } + } +} + +func (p *p256Point) p256BaseMult(scalar []byte) { + wvalue := (uint(scalar[31]) << 1) & 0xff + sel, sign := boothW7(uint(wvalue)) + p256SelectBase(p, p256PreFast[0][:], sel) + p256NegCond(p, sign) + + copy(p.z[:], one[:]) + var t0 p256Point + + copy(t0.z[:], one[:]) + + index := uint(6) + zero := sel + + for i := 1; i < 37; i++ { + if index < 247 { + wvalue = ((uint(scalar[31-index/8]) >> (index % 8)) + (uint(scalar[31-index/8-1]) << (8 - (index % 8)))) & 0xff + } else { + wvalue = (uint(scalar[31-index/8]) >> (index % 8)) & 0xff + } + index += 7 + sel, sign = boothW7(uint(wvalue)) + p256SelectBase(&t0, p256PreFast[i][:], sel) + p256PointAddAffineAsm(p, p, &t0, sign, sel, zero) + zero |= sel + } +} + +func (p *p256Point) p256ScalarMult(scalar []byte) { + // precomp is a table of precomputed points that stores powers of p + // from p^1 to p^16. + var precomp [16]p256Point + var t0, t1, t2, t3 p256Point + + // Prepare the table + *&precomp[0] = *p + + p256PointDoubleAsm(&t0, p) + p256PointDoubleAsm(&t1, &t0) + p256PointDoubleAsm(&t2, &t1) + p256PointDoubleAsm(&t3, &t2) + *&precomp[1] = t0 // 2 + *&precomp[3] = t1 // 4 + *&precomp[7] = t2 // 8 + *&precomp[15] = t3 // 16 + + p256PointAddAsm(&t0, &t0, p) + p256PointAddAsm(&t1, &t1, p) + p256PointAddAsm(&t2, &t2, p) + *&precomp[2] = t0 // 3 + *&precomp[4] = t1 // 5 + *&precomp[8] = t2 // 9 + + p256PointDoubleAsm(&t0, &t0) + p256PointDoubleAsm(&t1, &t1) + *&precomp[5] = t0 // 6 + *&precomp[9] = t1 // 10 + + p256PointAddAsm(&t2, &t0, p) + p256PointAddAsm(&t1, &t1, p) + *&precomp[6] = t2 // 7 + *&precomp[10] = t1 // 11 + + p256PointDoubleAsm(&t0, &t0) + p256PointDoubleAsm(&t2, &t2) + *&precomp[11] = t0 // 12 + *&precomp[13] = t2 // 14 + + p256PointAddAsm(&t0, &t0, p) + p256PointAddAsm(&t2, &t2, p) + *&precomp[12] = t0 // 13 + *&precomp[14] = t2 // 15 + + // Start scanning the window from top bit + index := uint(254) + var sel, sign int + + wvalue := (uint(scalar[31-index/8]) >> (index % 8)) & 0x3f + sel, _ = boothW5(uint(wvalue)) + p256Select(p, precomp[:], sel) + zero := sel + + for index > 4 { + index -= 5 + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + + if index < 247 { + wvalue = ((uint(scalar[31-index/8]) >> (index % 8)) + (uint(scalar[31-index/8-1]) << (8 - (index % 8)))) & 0x3f + } else { + wvalue = (uint(scalar[31-index/8]) >> (index % 8)) & 0x3f + } + + sel, sign = boothW5(uint(wvalue)) + + p256Select(&t0, precomp[:], sel) + p256NegCond(&t0, sign) + p256PointAddAsm(&t1, p, &t0) + p256MovCond(&t1, &t1, p, sel) + p256MovCond(p, &t1, &t0, zero) + zero |= sel + } + + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + p256PointDoubleAsm(p, p) + + wvalue = (uint(scalar[31]) << 1) & 0x3f + sel, sign = boothW5(uint(wvalue)) + + p256Select(&t0, precomp[:], sel) + p256NegCond(&t0, sign) + p256PointAddAsm(&t1, p, &t0) + p256MovCond(&t1, &t1, p, sel) + p256MovCond(p, &t1, &t0, zero) +} |