diff options
Diffstat (limited to 'src/math/big/arith_arm64.s')
-rw-r--r-- | src/math/big/arith_arm64.s | 113 |
1 files changed, 93 insertions, 20 deletions
diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s index 18e513e2c3..da6e408e19 100644 --- a/src/math/big/arith_arm64.s +++ b/src/math/big/arith_arm64.s @@ -109,13 +109,59 @@ done: MOVD R0, c+72(FP) RET +#define vwOneOp(instr, op1) \ + MOVD.P 8(R1), R4; \ + instr op1, R4; \ + MOVD.P R4, 8(R3); + +// handle the first 1~4 elements before starting iteration in addVW/subVW: +// instr1 is applied to the first word with R2 (y in addVW/subVW), instr2 to the following words with $0. +// It branches to target once counter reaches zero and falls through after the fourth element; +// with counter == 0 on entry all four elements are handled. +// Each element goes through vwOneOp, which loads one word via R1, applies the +// instruction to R4 and stores R4 via R3, post-incrementing both pointers by 8. +#define vwPreIter(instr1, instr2, counter, target) \ + vwOneOp(instr1, R2); \ + SUB $1, counter; \ + CBZ counter, target; \ + vwOneOp(instr2, $0); \ + SUB $1, counter; \ + CBZ counter, target; \ + vwOneOp(instr2, $0); \ + SUB $1, counter; \ + CBZ counter, target; \ + vwOneOp(instr2, $0); + +// do one iteration of add or sub in addVW/subVW: +// branch to exit if counter is zero, otherwise apply instr with $0 to four words of x (R1), +// store the results to z (R3) and decrease counter by 4 +#define vwOneIter(instr, counter, exit) \ + CBZ counter, exit; \ // careful not to touch the carry flag + LDP.P 32(R1), (R4, R5); \ + LDP -16(R1), (R6, R7); \ + instr $0, R4, R8; \ + instr $0, R5, R9; \ + instr $0, R6, R10; \ + instr $0, R7, R11; \ + STP.P (R8, R9), 32(R3); \ + STP (R10, R11), -16(R3); \ + SUB $4, counter; + +// do one iteration of copy in addVW/subVW: +// branch to exit if counter is zero, otherwise copy four words from x (R1) to z (R3) +// unchanged and decrease counter by 4 +#define vwOneIterCopy(counter, exit) \ + CBZ counter, exit; \ + LDP.P 32(R1), (R4, R5); \ + LDP -16(R1), (R6, R7); \ + STP.P (R4, R5), 32(R3); \ + STP (R6, R7), -16(R3); \ + SUB $4, counter; // func addVW(z, x []Word, y Word) (c Word) +// The 'large' branch handles large 'z'. It checks the carry flag on every iteration +// and switches to copy if we are done with carries. The copying is skipped as well +// if 'x' and 'z' happen to share the same underlying storage. +// The overhead of the checking and branching is visible when 'z' is small (~5%), +// so a threshold of 32 is set, and the small-sized part is left entirely untouched. 
TEXT ·addVW(SB),NOSPLIT,$0 MOVD z+0(FP), R3 MOVD z_len+8(FP), R0 MOVD x+24(FP), R1 MOVD y+48(FP), R2 + CMP $32, R0 + BGE large // len(z) >= 32: large-sized 'z' and 'x' CBZ R0, len0 // the length of z is 0 MOVD.P 8(R1), R4 ADDS R2, R4 // z[0] = x[0] + y, set carry @@ -135,29 +181,46 @@ two: // do it twice STP.P (R8, R9), 16(R3) SUB $2, R0 loop: // do four times per round - CBZ R0, len1 // careful not to touch the carry flag - LDP.P 32(R1), (R4, R5) - LDP -16(R1), (R6, R7) - ADCS $0, R4, R8 - ADCS $0, R5, R9 - ADCS $0, R6, R10 - ADCS $0, R7, R11 - STP.P (R8, R9), 32(R3) - STP (R10, R11), -16(R3) - SUB $4, R0 + vwOneIter(ADCS, R0, len1) B loop len1: CSET HS, R2 // extract carry flag len0: MOVD R2, c+56(FP) +done: RET +large: + AND $0x3, R0, R10 // R10 = len(z) % 4 + AND $~0x3, R0 // R0 = len(z) rounded down to a multiple of 4 + // unrolling for the first 1~4 elements to avoid saving the carry + // flag in each step, adjust R0 if we unrolled 4 elements (len(z) % 4 == 0, i.e. vwPreIter fell through) + vwPreIter(ADDS, ADCS, R10, add4) + SUB $4, R0 +add4: + BCC copy // carry clear: nothing left to propagate, the rest is a plain copy + vwOneIter(ADCS, R0, len1) + B add4 +copy: + MOVD ZR, c+56(FP) // carry is clear on this path, so c = 0 + CMP R1, R3 + BEQ done // x and z pointers are equal (shared storage): skip the copy +copy_4: // no carry flag, copy the rest + vwOneIterCopy(R0, done) + B copy_4 // func subVW(z, x []Word, y Word) (c Word) +// The 'large' branch handles large 'z'. It checks the carry flag on every iteration +// and switches to copy if we are done with carries. The copying is skipped as well +// if 'x' and 'z' happen to share the same underlying storage. +// The overhead of the checking and branching is visible when 'z' is small (~5%), +// so a threshold of 32 is set, and the small-sized part is left entirely untouched. 
TEXT ·subVW(SB),NOSPLIT,$0 MOVD z+0(FP), R3 MOVD z_len+8(FP), R0 MOVD x+24(FP), R1 MOVD y+48(FP), R2 + CMP $32, R0 + BGE large // len(z) >= 32: large-sized 'z' and 'x' CBZ R0, len0 // the length of z is 0 MOVD.P 8(R1), R4 SUBS R2, R4 // z[0] = x[0] - y, set carry @@ -177,22 +240,32 @@ two: // do it twice STP.P (R8, R9), 16(R3) SUB $2, R0 loop: // do four times per round - CBZ R0, len1 // careful not to touch the carry flag - LDP.P 32(R1), (R4, R5) - LDP -16(R1), (R6, R7) - SBCS $0, R4, R8 - SBCS $0, R5, R9 - SBCS $0, R6, R10 - SBCS $0, R7, R11 - STP.P (R8, R9), 32(R3) - STP (R10, R11), -16(R3) - SUB $4, R0 + vwOneIter(SBCS, R0, len1) B loop len1: CSET LO, R2 // extract carry flag len0: MOVD R2, c+56(FP) +done: RET +large: + AND $0x3, R0, R10 // R10 = len(z) % 4 + AND $~0x3, R0 // R0 = len(z) rounded down to a multiple of 4 + // unrolling for the first 1~4 elements to avoid saving the carry + // flag in each step, adjust R0 if we unrolled 4 elements (len(z) % 4 == 0, i.e. vwPreIter fell through) + vwPreIter(SUBS, SBCS, R10, sub4) + SUB $4, R0 +sub4: + BCS copy // carry set: no borrow left to propagate (a borrow is reported as LO, i.e. carry clear, at len1) + vwOneIter(SBCS, R0, len1) + B sub4 +copy: + MOVD ZR, c+56(FP) // no borrow on this path, so c = 0 + CMP R1, R3 + BEQ done // x and z pointers are equal (shared storage): skip the copy +copy_4: // no borrow pending, copy the rest + vwOneIterCopy(R0, done) + B copy_4 // func shlVU(z, x []Word, s uint) (c Word) // This implementation handles the shift operation from the high word to the low word, |