1 files changed, 93 insertions, 20 deletions
diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s
index 18e513e2c3..da6e408e19 100644
--- a/src/math/big/arith_arm64.s
+++ b/src/math/big/arith_arm64.s
@@ -109,13 +109,59 @@ done:
 	MOVD	R0, c+72(FP)
 	RET
 
+#define vwOneOp(instr, op1)				\	// process one word: load from (R1), apply instr with op1, store to (R3)
+	MOVD.P	8(R1), R4;				\	// R4 = next source word; R1 is post-incremented by 8
+	instr	op1, R4;				\	// R4 = R4 <instr> op1, updated in place
+	MOVD.P	R4, 8(R3);	// store the result; R3 is post-incremented by 8
+
+// handle the first 1~4 elements before starting iteration in addVW/subVW
+#define vwPreIter(instr1, instr2, counter, target)	\	// counter: elements to peel off (0 ends up handling all 4); target: label to continue at
+	vwOneOp(instr1, R2);				\	// first element is combined with R2; callers pass ADDS/SUBS here to start the carry chain
+	SUB	$1, counter;				\	// plain SUB (not SUBS), so the carry from the op above is preserved
+	CBZ	counter, target;			\	// requested number of elements done: leave with the carry flag still live
+	vwOneOp(instr2, $0);				\	// remaining elements only fold in the carry; callers pass ADCS/SBCS here
+	SUB	$1, counter;				\
+	CBZ	counter, target;			\
+	vwOneOp(instr2, $0);				\
+	SUB	$1, counter;				\
+	CBZ	counter, target;			\
+	vwOneOp(instr2, $0);	// fourth element; falls through to whatever follows the macro invocation
+
+// do one iteration (four words) of add or sub in addVW/subVW, or jump to exit once counter is zero
+#define vwOneIter(instr, counter, exit)	\
+	CBZ	counter, exit;		\	// careful not to touch the carry flag
+	LDP.P	32(R1), (R4, R5);	\	// load words 0 and 1; R1 is post-incremented past all four words
+	LDP	-16(R1), (R6, R7);	\	// load words 2 and 3 relative to the already advanced R1
+	instr	$0, R4, R8;		\	// $0 operand: each result is the source word adjusted only by the incoming carry
+	instr	$0, R5, R9;		\
+	instr	$0, R6, R10;		\
+	instr	$0, R7, R11;		\
+	STP.P	(R8, R9), 32(R3);	\	// store words 0 and 1; R3 is post-incremented past all four words
+	STP	(R10, R11), -16(R3);	\	// store words 2 and 3 relative to the already advanced R3
+	SUB	$4, counter;	// four words consumed; plain SUB keeps the carry for the next iteration
+
+// do one iteration (four words) of copy in addVW/subVW, or jump to exit once counter is zero
+#define vwOneIterCopy(counter, exit)			\
+	CBZ	counter, exit;				\	// nothing left to copy
+	LDP.P	32(R1), (R4, R5);			\	// load words 0 and 1; R1 is post-incremented past all four words
+	LDP	-16(R1), (R6, R7);			\	// load words 2 and 3 relative to the already advanced R1
+	STP.P	(R4, R5), 32(R3);			\	// store the words unchanged; R3 is post-incremented past all four words
+	STP	(R6, R7), -16(R3);			\	// store words 2 and 3 relative to the already advanced R3
+	SUB	$4, counter;	// four words copied
 
 // func addVW(z, x []Word, y Word) (c Word)
+// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
+// and switches to copy if we are done with carries. The copying is skipped as well
+// if 'x' and 'z' happen to share the same underlying storage.
+// The overhead of the checking and branching is visible when 'z' is small (~5%),
+// so set a threshold of 32, and leave the small-sized path entirely untouched.
 TEXT ·addVW(SB),NOSPLIT,$0
 	MOVD	z+0(FP), R3
 	MOVD	z_len+8(FP), R0
 	MOVD	x+24(FP), R1
 	MOVD	y+48(FP), R2
+	CMP	$32, R0
+	BGE	large		// large-sized 'z' and 'x'
 	CBZ	R0, len0	// the length of z is 0
 	MOVD.P	8(R1), R4
 	ADDS	R2, R4		// z[0] = x[0] + y, set carry
@@ -135,29 +181,46 @@ two:				// do it twice
 	STP.P	(R8, R9), 16(R3)
 	SUB	$2, R0
 loop:				// do four times per round
-	CBZ	R0, len1	// careful not to touch the carry flag
-	LDP.P	32(R1), (R4, R5)
-	LDP	-16(R1), (R6, R7)
-	ADCS	$0, R4, R8
-	ADCS	$0, R5, R9
-	ADCS	$0, R6, R10
-	ADCS	$0, R7, R11
-	STP.P	(R8, R9), 32(R3)
-	STP	(R10, R11), -16(R3)
-	SUB	$4, R0
+	vwOneIter(ADCS, R0, len1)
 	B	loop
 len1:
 	CSET	HS, R2		// extract carry flag
 len0:
 	MOVD	R2, c+56(FP)
+done:
 	RET
+large:
+	AND	$0x3, R0, R10
+	AND	$~0x3, R0
+	// unrolling for the first 1~4 elements to avoid saving the carry
+	// flag in each step, adjust $R0 if we unrolled 4 elements
+	vwPreIter(ADDS, ADCS, R10, add4)
+	SUB	$4, R0
+add4:
+	BCC	copy
+	vwOneIter(ADCS, R0, len1)
+	B	add4
+copy:
+	MOVD	ZR, c+56(FP)
+	CMP	R1, R3
+	BEQ	done
+copy_4:				// no carry flag, copy the rest
+	vwOneIterCopy(R0, done)
+	B	copy_4
 
 // func subVW(z, x []Word, y Word) (c Word)
+// The 'large' branch handles large 'z'. It checks the carry flag on every iteration
+// and switches to copy if we are done with carries. The copying is skipped as well
+// if 'x' and 'z' happen to share the same underlying storage.
+// The overhead of the checking and branching is visible when 'z' is small (~5%),
+// so set a threshold of 32, and leave the small-sized path entirely untouched.
 TEXT ·subVW(SB),NOSPLIT,$0
 	MOVD	z+0(FP), R3
 	MOVD	z_len+8(FP), R0
 	MOVD	x+24(FP), R1
 	MOVD	y+48(FP), R2
+	CMP	$32, R0
+	BGE	large		// large-sized 'z' and 'x'
 	CBZ	R0, len0	// the length of z is 0
 	MOVD.P	8(R1), R4
 	SUBS	R2, R4		// z[0] = x[0] - y, set carry
@@ -177,22 +240,32 @@ two:				// do it twice
 	STP.P	(R8, R9), 16(R3)
 	SUB	$2, R0
 loop:				// do four times per round
-	CBZ	R0, len1	// careful not to touch the carry flag
-	LDP.P	32(R1), (R4, R5)
-	LDP	-16(R1), (R6, R7)
-	SBCS	$0, R4, R8
-	SBCS	$0, R5, R9
-	SBCS	$0, R6, R10
-	SBCS	$0, R7, R11
-	STP.P	(R8, R9), 32(R3)
-	STP	(R10, R11), -16(R3)
-	SUB	$4, R0
+	vwOneIter(SBCS, R0, len1)
 	B	loop
 len1:
 	CSET	LO, R2		// extract carry flag
 len0:
 	MOVD	R2, c+56(FP)
+done:
 	RET
+large:
+	AND	$0x3, R0, R10
+	AND	$~0x3, R0
+	// unrolling for the first 1~4 elements to avoid saving the carry
+	// flag in each step, adjust $R0 if we unrolled 4 elements
+	vwPreIter(SUBS, SBCS, R10, sub4)
+	SUB	$4, R0
+sub4:
+	BCS	copy
+	vwOneIter(SBCS, R0, len1)
+	B	sub4
+copy:
+	MOVD	ZR, c+56(FP)
+	CMP	R1, R3
+	BEQ	done
+copy_4:				// no carry flag, copy the rest
+	vwOneIterCopy(R0, done)
+	B	copy_4
 
 // func shlVU(z, x []Word, s uint) (c Word)
 // This implementation handles the shift operation from the high word to the low word,