aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRuss Cox <rsc@golang.org>2010-10-26 21:10:17 -0700
committerRuss Cox <rsc@golang.org>2010-10-26 21:10:17 -0700
commit705c0382e8f8437558a099a92a1d48674a252ddb (patch)
tree6269cea41ea2f9052fbd2f92d6be10538a3b1b09
parenta8abb64a7165a2d673449306adf0aa68686d9091 (diff)
downloadgo-705c0382e8f8437558a099a92a1d48674a252ddb.tar.gz
go-705c0382e8f8437558a099a92a1d48674a252ddb.zip
big: arm assembly, faster software mulWW, divWW
Reduces time spent running crypto/rsa test by 65%. Fixes #1227. R=gri, PeterGo CC=golang-dev https://golang.org/cl/2743041
-rw-r--r--src/pkg/big/arith.go228
-rw-r--r--src/pkg/big/arith_arm.s286
-rw-r--r--src/pkg/big/arith_test.go1
3 files changed, 326 insertions, 189 deletions
diff --git a/src/pkg/big/arith.go b/src/pkg/big/arith.go
index 29966c7bc5..df3808f5e4 100644
--- a/src/pkg/big/arith.go
+++ b/src/pkg/big/arith.go
@@ -56,161 +56,29 @@ func subWW_g(x, y, c Word) (z1, z0 Word) {
// z1<<_W + z0 = x*y
+// Adapted from Warren, Hacker's Delight, p. 132.
func mulWW_g(x, y Word) (z1, z0 Word) {
- // Split x and y into 2 halfWords each, multiply
- // the halfWords separately while avoiding overflow,
- // and return the product as 2 Words.
-
- if x < y {
- x, y = y, x
- }
-
- if x < _B2 {
- // y < _B2 because y <= x
- // sub-digits of x and y are (0, x) and (0, y)
- // z = z[0] = x*y
- z0 = x * y
- return
- }
-
- if y < _B2 {
- // sub-digits of x and y are (x1, x0) and (0, y)
- // x = (x1*_B2 + x0)
- // y = (y1*_B2 + y0)
- x1, x0 := x>>_W2, x&_M2
-
- // x*y = t2*_B2*_B2 + t1*_B2 + t0
- t0 := x0 * y
- t1 := x1 * y
-
- // compute result digits but avoid overflow
- // z = z[1]*_B + z[0] = x*y
- z0 = t1<<_W2 + t0
- z1 = (t1 + t0>>_W2) >> _W2
- return
- }
-
- // general case
- // sub-digits of x and y are (x1, x0) and (y1, y0)
- // x = (x1*_B2 + x0)
- // y = (y1*_B2 + y0)
- x1, x0 := x>>_W2, x&_M2
- y1, y0 := y>>_W2, y&_M2
-
- // x*y = t2*_B2*_B2 + t1*_B2 + t0
- t0 := x0 * y0
- // t1 := x1*y0 + x0*y1;
- var c Word
- t1 := x1 * y0
- t1a := t1
- t1 += x0 * y1
- if t1 < t1a {
- c++
- }
- t2 := x1*y1 + c*_B2
-
- // compute result digits but avoid overflow
- // z = z[1]*_B + z[0] = x*y
- // This may overflow, but that's ok because we also sum t1 and t0 above
- // and we take care of the overflow there.
- z0 = t1<<_W2 + t0
-
- // z1 = t2 + (t1 + t0>>_W2)>>_W2;
- var c3 Word
- z1 = t1 + t0>>_W2
- if z1 < t1 {
- c3++
- }
- z1 >>= _W2
- z1 += c3 * _B2
- z1 += t2
+ x0 := x & _M2
+ x1 := x >> _W2
+ y0 := y & _M2
+ y1 := y >> _W2
+ w0 := x0 * y0
+ t := x1*y0 + w0>>_W2
+ w1 := t & _M2
+ w2 := t >> _W2
+ w1 += x0 * y1
+ z1 = x1*y1 + w2 + w1>>_W2
+ z0 = x * y
return
}
// z1<<_W + z0 = x*y + c
func mulAddWWW_g(x, y, c Word) (z1, z0 Word) {
- // Split x and y into 2 halfWords each, multiply
- // the halfWords separately while avoiding overflow,
- // and return the product as 2 Words.
-
- // TODO(gri) Should implement special cases for faster execution.
-
- // general case
- // sub-digits of x, y, and c are (x1, x0), (y1, y0), (c1, c0)
- // x = (x1*_B2 + x0)
- // y = (y1*_B2 + y0)
- x1, x0 := x>>_W2, x&_M2
- y1, y0 := y>>_W2, y&_M2
- c1, c0 := c>>_W2, c&_M2
-
- // x*y + c = t2*_B2*_B2 + t1*_B2 + t0
- // (1<<32-1)^2 == 1<<64 - 1<<33 + 1, so there's space to add c0 in here.
- t0 := x0*y0 + c0
-
- // t1 := x1*y0 + x0*y1 + c1;
- var c2 Word // extra carry
- t1 := x1*y0 + c1
- t1a := t1
- t1 += x0 * y1
- if t1 < t1a { // If the number got smaller then we overflowed.
- c2++
+ z1, zz0 := mulWW(x, y)
+ if z0 = zz0 + c; z0 < zz0 {
+ z1++
}
-
- t2 := x1*y1 + c2*_B2
-
- // compute result digits but avoid overflow
- // z = z[1]*_B + z[0] = x*y
- // z0 = t1<<_W2 + t0;
- // This may overflow, but that's ok because we also sum t1 and t0 below
- // and we take care of the overflow there.
- z0 = t1<<_W2 + t0
-
- var c3 Word
- z1 = t1 + t0>>_W2
- if z1 < t1 {
- c3++
- }
- z1 >>= _W2
- z1 += t2 + c3*_B2
-
- return
-}
-
-
-// q = (x1<<_W + x0 - r)/y
-// The most significant bit of y must be 1.
-func divStep(x1, x0, y Word) (q, r Word) {
- d1, d0 := y>>_W2, y&_M2
- q1, r1 := x1/d1, x1%d1
- m := q1 * d0
- r1 = r1*_B2 | x0>>_W2
- if r1 < m {
- q1--
- r1 += y
- if r1 >= y && r1 < m {
- q1--
- r1 += y
- }
- }
- r1 -= m
-
- r0 := r1 % d1
- q0 := r1 / d1
- m = q0 * d0
- r0 = r0*_B2 | x0&_M2
- if r0 < m {
- q0--
- r0 += y
- if r0 >= y && r0 < m {
- q0--
- r0 += y
- }
- }
- r0 -= m
-
- q = q1*_B2 | q0
- r = r0
return
}
@@ -241,46 +109,48 @@ func leadingZeros(x Word) uint {
}
-// q = (x1<<_W + x0 - r)/y
-func divWW_g(x1, x0, y Word) (q, r Word) {
- if x1 == 0 {
- q, r = x0/y, x0%y
- return
+// q = (u1<<_W + u0 - r)/y
+// Adapted from Warren, Hacker's Delight, p. 152.
+func divWW_g(u1, u0, v Word) (q, r Word) {
+ if u1 >= v {
+ return 1<<_W - 1, 1<<_W - 1
}
- var q0, q1 Word
- z := leadingZeros(y)
- if y > x1 {
- if z != 0 {
- y <<= z
- x1 = (x1 << z) | (x0 >> (_W - z))
- x0 <<= z
- }
- q0, x0 = divStep(x1, x0, y)
- q1 = 0
- } else {
- if z == 0 {
- x1 -= y
- q1 = 1
- } else {
- z1 := _W - z
- y <<= z
- x2 := x1 >> z1
- x1 = (x1 << z) | (x0 >> z1)
- x0 <<= z
- q1, x1 = divStep(x2, x1, y)
- }
+ s := leadingZeros(v)
+ v <<= s
+
+ vn1 := v >> _W2
+ vn0 := v & _M2
+ un32 := u1<<s | u0>>(_W-s)
+ un10 := u0 << s
+ un1 := un10 >> _W2
+ un0 := un10 & _M2
+ q1 := un32 / vn1
+ rhat := un32 - q1*vn1
- q0, x0 = divStep(x1, x0, y)
+again1:
+ if q1 >= _B2 || q1*vn0 > _B2*rhat+un1 {
+ q1--
+ rhat += vn1
+ if rhat < _B2 {
+ goto again1
+ }
}
- r = x0 >> z
+ un21 := un32*_B2 + un1 - q1*v
+ q0 := un21 / vn1
+ rhat = un21 - q0*vn1
- if q1 != 0 {
- panic("div out of range")
+again2:
+ if q0 >= _B2 || q0*vn0 > _B2*rhat+un0 {
+ q0--
+ rhat += vn1
+ if rhat < _B2 {
+ goto again2
+ }
}
- return q0, r
+ return q1*_B2 + q0, (un21*_B2 + un0 - q0*v) >> s
}
diff --git a/src/pkg/big/arith_arm.s b/src/pkg/big/arith_arm.s
index 6ab8e989ed..e4a9a962cf 100644
--- a/src/pkg/big/arith_arm.s
+++ b/src/pkg/big/arith_arm.s
@@ -5,36 +5,302 @@
// This file provides fast assembly versions for the elementary
// arithmetic operations on vectors implemented in arith.go.
-// TODO(gri) Implement these routines.
+#define CFLAG 29 // bit position of carry flag
+
+// func addVV(z, x, y []Word) (c Word)
TEXT ·addVV(SB),7,$0
- B ·addVV_g(SB)
+ MOVW $0, R0
+ MOVW z+0(FP), R1
+ MOVW x+12(FP), R2
+ MOVW y+24(FP), R3
+ MOVW n+4(FP), R4
+ MOVW R4<<2, R4
+ ADD R1, R4
+ B E1
+L1:
+ MOVW.P 4(R2), R5
+ MOVW.P 4(R3), R6
+ MOVW R0, CPSR
+ ADC.S R6, R5
+ MOVW.P R5, 4(R1)
+ MOVW CPSR, R0
+E1:
+ CMP R1, R4
+ BNE L1
+
+ MOVW R0>>CFLAG, R0
+ AND $1, R0
+ MOVW R0, c+36(FP)
+ RET
+
+// func subVV(z, x, y []Word) (c Word)
+// (same as addVV except for SBC instead of ADC and label names)
TEXT ·subVV(SB),7,$0
- B ·subVV_g(SB)
+ MOVW $(1<<CFLAG), R0
+ MOVW z+0(FP), R1
+ MOVW x+12(FP), R2
+ MOVW y+24(FP), R3
+ MOVW n+4(FP), R4
+ MOVW R4<<2, R4
+ ADD R1, R4
+ B E2
+L2:
+ MOVW.P 4(R2), R5
+ MOVW.P 4(R3), R6
+ MOVW R0, CPSR
+ SBC.S R6, R5
+ MOVW.P R5, 4(R1)
+ MOVW CPSR, R0
+E2:
+ CMP R1, R4
+ BNE L2
+
+ MOVW R0>>CFLAG, R0
+ AND $1, R0
+ EOR $1, R0
+ MOVW R0, c+36(FP)
+ RET
+
+// func addVW(z, x []Word, y Word) (c Word)
TEXT ·addVW(SB),7,$0
- B ·addVW_g(SB)
+ MOVW z+0(FP), R1
+ MOVW x+12(FP), R2
+ MOVW y+24(FP), R3
+ MOVW n+4(FP), R4
+ MOVW R4<<2, R4
+ ADD R1, R4
+ CMP R1, R4
+ BNE L3a
+ MOVW R3, c+28(FP)
+ RET
+L3a:
+ MOVW.P 4(R2), R5
+ ADD.S R3, R5
+ MOVW.P R5, 4(R1)
+ MOVW CPSR, R0
+ B E3
+L3:
+ MOVW.P 4(R2), R5
+ MOVW R0, CPSR
+ ADC.S $0, R5
+ MOVW.P R5, 4(R1)
+ MOVW CPSR, R0
+E3:
+ CMP R1, R4
+ BNE L3
+
+ MOVW R0>>CFLAG, R0
+ AND $1, R0
+ MOVW R0, c+28(FP)
+ RET
+
TEXT ·subVW(SB),7,$0
- B ·subVW_g(SB)
+ MOVW z+0(FP), R1
+ MOVW x+12(FP), R2
+ MOVW y+24(FP), R3
+ MOVW n+4(FP), R4
+ MOVW R4<<2, R4
+ ADD R1, R4
+ CMP R1, R4
+ BNE L4a
+ MOVW R3, c+28(FP)
+ RET
+L4a:
+ MOVW.P 4(R2), R5
+ SUB.S R3, R5
+ MOVW.P R5, 4(R1)
+ MOVW CPSR, R0
+ B E4
+L4:
+ MOVW.P 4(R2), R5
+ MOVW R0, CPSR
+ SBC.S $0, R5
+ MOVW.P R5, 4(R1)
+ MOVW CPSR, R0
+E4:
+ CMP R1, R4
+ BNE L4
+
+ MOVW R0>>CFLAG, R0
+ AND $1, R0
+ EOR $1, R0
+ MOVW R0, c+28(FP)
+ RET
+
+// func shlVW(z, x []Word, s Word) (c Word)
TEXT ·shlVW(SB),7,$0
- B ·shlVW_g(SB)
+ MOVW n+4(FP), R5
+ CMP $0, R5
+ BEQ X7
+
+ MOVW z+0(FP), R1
+ MOVW x+12(FP), R2
+ MOVW R5<<2, R5
+ ADD R5, R2
+ ADD R1, R5
+ MOVW s+24(FP), R3
+ CMP $0, R3 // shift 0 is special
+ BEQ Y7
+ ADD $4, R1 // stop one word early
+ MOVW $32, R4
+ SUB R3, R4
+ MOVW $0, R7
+
+ MOVW.W -4(R2), R6
+ MOVW R6<<R3, R7
+ MOVW R6>>R4, R6
+ MOVW R6, c+28(FP)
+ B E7
+
+L7:
+ MOVW.W -4(R2), R6
+ ORR R6>>R4, R7
+ MOVW.W R7, -4(R5)
+ MOVW R6<<R3, R7
+E7:
+ CMP R1, R5
+ BNE L7
+
+ MOVW R7, -4(R5)
+ RET
+
+Y7: // copy loop, because shift 0 == shift 32
+ MOVW.W -4(R2), R6
+ MOVW.W R6, -4(R5)
+ CMP R1, R5
+ BNE Y7
+
+X7:
+ MOVW $0, R1
+ MOVW R1, c+28(FP)
+ RET
+
TEXT ·shrVW(SB),7,$0
- B ·shrVW_g(SB)
+ MOVW n+4(FP), R5
+ CMP $0, R5
+ BEQ X6
+
+ MOVW z+0(FP), R1
+ MOVW x+12(FP), R2
+ MOVW R5<<2, R5
+ ADD R1, R5
+ MOVW s+24(FP), R3
+ CMP $0, R3 // shift 0 is special
+ BEQ Y6
+ SUB $4, R5 // stop one word early
+ MOVW $32, R4
+ SUB R3, R4
+ MOVW $0, R7
+
+ // first word
+ MOVW.P 4(R2), R6
+ MOVW R6>>R3, R7
+ MOVW R6<<R4, R6
+ MOVW R6, c+28(FP)
+ B E6
+
+ // word loop
+L6:
+ MOVW.P 4(R2), R6
+ ORR R6<<R4, R7
+ MOVW.P R7, 4(R1)
+ MOVW R6>>R3, R7
+E6:
+ CMP R1, R5
+ BNE L6
+
+ MOVW R7, 0(R1)
+ RET
+
+Y6: // copy loop, because shift 0 == shift 32
+ MOVW.P 4(R2), R6
+ MOVW.P R6, 4(R1)
+ CMP R1, R5
+ BNE Y6
+
+X6:
+ MOVW $0, R1
+ MOVW R1, c+28(FP)
+ RET
+
TEXT ·mulAddVWW(SB),7,$0
- B ·mulAddVWW_g(SB)
+ MOVW $0, R0
+ MOVW z+0(FP), R1
+ MOVW x+12(FP), R2
+ MOVW y+24(FP), R3
+ MOVW r+28(FP), R4
+ MOVW n+4(FP), R5
+ MOVW R5<<2, R5
+ ADD R1, R5
+ B E8
+
+ // word loop
+L8:
+ MOVW.P 4(R2), R6
+ MULLU R6, R3, (R7, R6)
+ ADD.S R4, R6
+ ADC R0, R7
+ MOVW.P R6, 4(R1)
+ MOVW R7, R4
+E8:
+ CMP R1, R5
+ BNE L8
+
+ MOVW R4, c+32(FP)
+ RET
+
TEXT ·addMulVVW(SB),7,$0
- B ·addMulVVW_g(SB)
+ MOVW $0, R0
+ MOVW z+0(FP), R1
+ MOVW x+12(FP), R2
+ MOVW y+24(FP), R3
+ MOVW n+4(FP), R5
+ MOVW R5<<2, R5
+ ADD R1, R5
+ MOVW $0, R4
+ B E9
+
+ // word loop
+L9:
+ MOVW.P 4(R2), R6
+ MULLU R6, R3, (R7, R6)
+ ADD.S R4, R6
+ ADC R0, R7
+ MOVW 0(R1), R4
+ ADD.S R4, R6
+ ADC R0, R7
+ MOVW.P R6, 4(R1)
+ MOVW R7, R4
+E9:
+ CMP R1, R5
+ BNE L9
+
+ MOVW R4, c+28(FP)
+ RET
+
TEXT ·divWVW(SB),7,$0
+ // ARM has no multiword division, so use portable code.
B ·divWVW_g(SB)
+
TEXT ·divWW(SB),7,$0
+ // ARM has no multiword division, so use portable code.
B ·divWW_g(SB)
+
+// func mulWW(x, y Word) (z1, z0 Word)
TEXT ·mulWW(SB),7,$0
- B ·mulWW_g(SB)
+ MOVW x+0(FP), R1
+ MOVW y+4(FP), R2
+ MULLU R1, R2, (R4, R3)
+ MOVW R4, z1+8(FP)
+ MOVW R3, z0+12(FP)
+ RET
diff --git a/src/pkg/big/arith_test.go b/src/pkg/big/arith_test.go
index 5765b89d17..934b302df0 100644
--- a/src/pkg/big/arith_test.go
+++ b/src/pkg/big/arith_test.go
@@ -116,6 +116,7 @@ type argVW struct {
var sumVW = []argVW{
{},
+ {nil, nil, 2, 2},
{nat{0}, nat{0}, 0, 0},
{nat{1}, nat{0}, 1, 0},
{nat{1}, nat{1}, 0, 0},