aboutsummaryrefslogtreecommitdiff
path: root/src/math/big/arith_arm64.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/math/big/arith_arm64.s')
-rw-r--r--src/math/big/arith_arm64.s113
1 files changed, 93 insertions, 20 deletions
diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s
index 18e513e2c3..da6e408e19 100644
--- a/src/math/big/arith_arm64.s
+++ b/src/math/big/arith_arm64.s
@@ -109,13 +109,59 @@ done:
MOVD R0, c+72(FP)
RET
+#define vwOneOp(instr, op1) \ // process one word: z[i] = x[i] <instr> op1 (R1 walks x, R3 walks z in addVW/subVW)
+ MOVD.P 8(R1), R4; \ // load the next word through R1 into R4, post-incrementing R1 by 8
+ instr op1, R4; \ // R4 = R4 <instr> op1; the call sites pass ADDS/ADCS/SUBS/SBCS here
+ MOVD.P R4, 8(R3); // store R4 through R3, post-incrementing R3 by 8
+
+// handle the first 1~4 elements before starting iteration in addVW/subVW; the call sites pass len%4 as counter, and a counter of 0 runs all four steps and falls through past the macro
+#define vwPreIter(instr1, instr2, counter, target) \
+ vwOneOp(instr1, R2); \ // element 0: combine with R2 (loaded from y by addVW/subVW); instr1 is ADDS/SUBS at the call sites
+ SUB $1, counter; \ // plain SUB rather than SUBS, so the carry flag from the step above is kept
+ CBZ counter, target; \ // counter started at 1: one element was enough
+ vwOneOp(instr2, $0); \ // element 1: operand is $0, so only the carry/borrow is applied
+ SUB $1, counter; \
+ CBZ counter, target; \ // counter started at 2
+ vwOneOp(instr2, $0); \ // element 2
+ SUB $1, counter; \
+ CBZ counter, target; \ // counter started at 3
+ vwOneOp(instr2, $0); // element 3: reached only when counter started at 0 (it never hits zero on the way down)
+
+// do one iteration of add or sub in addVW/subVW: four words per iteration, branching to exit when counter is zero
+#define vwOneIter(instr, counter, exit) \
+ CBZ counter, exit; \ // careful not to touch the carry flag
+ LDP.P 32(R1), (R4, R5); \ // load words 0-1 into R4/R5 and advance R1 by 32 bytes (all four words)
+ LDP -16(R1), (R6, R7); \ // words 2-3 now sit 16 bytes behind the advanced R1
+ instr $0, R4, R8; \ // R8 = R4 <instr> 0; call sites pass ADCS/SBCS so the carry chains through all four
+ instr $0, R5, R9; \
+ instr $0, R6, R10; \
+ instr $0, R7, R11; \
+ STP.P (R8, R9), 32(R3); \ // store words 0-1 and advance R3 by 32 bytes
+ STP (R10, R11), -16(R3); \ // store words 2-3 relative to the advanced R3
+ SUB $4, counter; // four words consumed; plain SUB, not SUBS, so the carry reaches the next iteration
+
+// do one iteration of copy in addVW/subVW: move four words unchanged, branching to exit when counter is zero
+#define vwOneIterCopy(counter, exit) \
+ CBZ counter, exit; \ // nothing left to copy
+ LDP.P 32(R1), (R4, R5); \ // load words 0-1 and advance R1 by 32 bytes (all four words)
+ LDP -16(R1), (R6, R7); \ // words 2-3 sit 16 bytes behind the advanced R1
+ STP.P (R4, R5), 32(R3); \ // store words 0-1 as loaded and advance R3 by 32 bytes
+ STP (R6, R7), -16(R3); \ // store words 2-3 relative to the advanced R3
+ SUB $4, counter; // four words copied
// func addVW(z, x []Word, y Word) (c Word)
+// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
+// and switches to copy if we are done with carries. The copying is skipped as well
+// if 'x' and 'z' happen to share the same underlying storage.
+// The overhead of the checking and branching is visible when 'z' is small (~5%),
+// so set a threshold of 32, and leave the small-sized path entirely untouched.
TEXT ·addVW(SB),NOSPLIT,$0
MOVD z+0(FP), R3
MOVD z_len+8(FP), R0
MOVD x+24(FP), R1
MOVD y+48(FP), R2
+ CMP $32, R0
+ BGE large // large-sized 'z' and 'x'
CBZ R0, len0 // the length of z is 0
MOVD.P 8(R1), R4
ADDS R2, R4 // z[0] = x[0] + y, set carry
@@ -135,29 +181,46 @@ two: // do it twice
STP.P (R8, R9), 16(R3)
SUB $2, R0
loop: // do four times per round
- CBZ R0, len1 // careful not to touch the carry flag
- LDP.P 32(R1), (R4, R5)
- LDP -16(R1), (R6, R7)
- ADCS $0, R4, R8
- ADCS $0, R5, R9
- ADCS $0, R6, R10
- ADCS $0, R7, R11
- STP.P (R8, R9), 32(R3)
- STP (R10, R11), -16(R3)
- SUB $4, R0
+ vwOneIter(ADCS, R0, len1)
B loop
len1:
CSET HS, R2 // extract carry flag
len0:
MOVD R2, c+56(FP)
+done:
RET
+large:
+ AND $0x3, R0, R10
+ AND $~0x3, R0
+ // unroll the first 1~4 elements to avoid saving the carry flag in each step;
+ // when len%4 == 0 all 4 were unrolled, so take them off R0 (already a multiple of 4)
+ vwPreIter(ADDS, ADCS, R10, add4)
+ SUB $4, R0
+add4:
+ BCC copy
+ vwOneIter(ADCS, R0, len1)
+ B add4
+copy:
+ MOVD ZR, c+56(FP)
+ CMP R1, R3
+ BEQ done
+copy_4: // no carry flag, copy the rest
+ vwOneIterCopy(R0, done)
+ B copy_4
// func subVW(z, x []Word, y Word) (c Word)
+// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
+// and switches to copy if we are done with carries. The copying is skipped as well
+// if 'x' and 'z' happen to share the same underlying storage.
+// The overhead of the checking and branching is visible when 'z' is small (~5%),
+// so set a threshold of 32, and leave the small-sized path entirely untouched.
TEXT ·subVW(SB),NOSPLIT,$0
MOVD z+0(FP), R3
MOVD z_len+8(FP), R0
MOVD x+24(FP), R1
MOVD y+48(FP), R2
+ CMP $32, R0
+ BGE large // large-sized 'z' and 'x'
CBZ R0, len0 // the length of z is 0
MOVD.P 8(R1), R4
SUBS R2, R4 // z[0] = x[0] - y, set carry
@@ -177,22 +240,32 @@ two: // do it twice
STP.P (R8, R9), 16(R3)
SUB $2, R0
loop: // do four times per round
- CBZ R0, len1 // careful not to touch the carry flag
- LDP.P 32(R1), (R4, R5)
- LDP -16(R1), (R6, R7)
- SBCS $0, R4, R8
- SBCS $0, R5, R9
- SBCS $0, R6, R10
- SBCS $0, R7, R11
- STP.P (R8, R9), 32(R3)
- STP (R10, R11), -16(R3)
- SUB $4, R0
+ vwOneIter(SBCS, R0, len1)
B loop
len1:
CSET LO, R2 // extract carry flag
len0:
MOVD R2, c+56(FP)
+done:
RET
+large:
+ AND $0x3, R0, R10
+ AND $~0x3, R0
+ // unroll the first 1~4 elements to avoid saving the carry flag in each step;
+ // when len%4 == 0 all 4 were unrolled, so take them off R0 (already a multiple of 4)
+ vwPreIter(SUBS, SBCS, R10, sub4)
+ SUB $4, R0
+sub4:
+ BCS copy
+ vwOneIter(SBCS, R0, len1)
+ B sub4
+copy:
+ MOVD ZR, c+56(FP)
+ CMP R1, R3
+ BEQ done
+copy_4: // no carry flag, copy the rest
+ vwOneIterCopy(R0, done)
+ B copy_4
// func shlVU(z, x []Word, s uint) (c Word)
// This implementation handles the shift operation from the high word to the low word,