diff options
Diffstat (limited to 'src/math/big')
-rw-r--r-- | src/math/big/arith_arm64.s | 113 | ||||
-rw-r--r-- | src/math/big/arith_test.go | 106 | ||||
-rw-r--r-- | src/math/big/example_test.go | 7 | ||||
-rw-r--r-- | src/math/big/float.go | 7 |
4 files changed, 205 insertions, 28 deletions
diff --git a/src/math/big/arith_arm64.s b/src/math/big/arith_arm64.s index 18e513e2c3..da6e408e19 100644 --- a/src/math/big/arith_arm64.s +++ b/src/math/big/arith_arm64.s @@ -109,13 +109,59 @@ done: MOVD R0, c+72(FP) RET +#define vwOneOp(instr, op1) \ + MOVD.P 8(R1), R4; \ + instr op1, R4; \ + MOVD.P R4, 8(R3); + +// handle the first 1~4 elements before starting iteration in addVW/subVW +#define vwPreIter(instr1, instr2, counter, target) \ + vwOneOp(instr1, R2); \ + SUB $1, counter; \ + CBZ counter, target; \ + vwOneOp(instr2, $0); \ + SUB $1, counter; \ + CBZ counter, target; \ + vwOneOp(instr2, $0); \ + SUB $1, counter; \ + CBZ counter, target; \ + vwOneOp(instr2, $0); + +// do one iteration of add or sub in addVW/subVW +#define vwOneIter(instr, counter, exit) \ + CBZ counter, exit; \ // careful not to touch the carry flag + LDP.P 32(R1), (R4, R5); \ + LDP -16(R1), (R6, R7); \ + instr $0, R4, R8; \ + instr $0, R5, R9; \ + instr $0, R6, R10; \ + instr $0, R7, R11; \ + STP.P (R8, R9), 32(R3); \ + STP (R10, R11), -16(R3); \ + SUB $4, counter; + +// do one iteration of copy in addVW/subVW +#define vwOneIterCopy(counter, exit) \ + CBZ counter, exit; \ + LDP.P 32(R1), (R4, R5); \ + LDP -16(R1), (R6, R7); \ + STP.P (R4, R5), 32(R3); \ + STP (R6, R7), -16(R3); \ + SUB $4, counter; // func addVW(z, x []Word, y Word) (c Word) +// The 'large' branch handles large 'z'. It checks the carry flag on every iteration +// and switches to copy if we are done with carries. The copying is skipped as well +// if 'x' and 'z' happen to share the same underlying storage. +// The overhead of the checking and branching is visible when 'z' are small (~5%), +// so set a threshold of 32, and remain the small-sized part entirely untouched. TEXT ·addVW(SB),NOSPLIT,$0 MOVD z+0(FP), R3 MOVD z_len+8(FP), R0 MOVD x+24(FP), R1 MOVD y+48(FP), R2 + CMP $32, R0 + BGE large // large-sized 'z' and 'x' CBZ R0, len0 // the length of z is 0 MOVD.P 8(R1), R4 ADDS R2, R4 // z[0] = x[0] + y, set carry @@ -135,29 +181,46 @@ two: // do it twice STP.P (R8, R9), 16(R3) SUB $2, R0 loop: // do four times per round - CBZ R0, len1 // careful not to touch the carry flag - LDP.P 32(R1), (R4, R5) - LDP -16(R1), (R6, R7) - ADCS $0, R4, R8 - ADCS $0, R5, R9 - ADCS $0, R6, R10 - ADCS $0, R7, R11 - STP.P (R8, R9), 32(R3) - STP (R10, R11), -16(R3) - SUB $4, R0 + vwOneIter(ADCS, R0, len1) B loop len1: CSET HS, R2 // extract carry flag len0: MOVD R2, c+56(FP) +done: RET +large: + AND $0x3, R0, R10 + AND $~0x3, R0 + // unrolling for the first 1~4 elements to avoid saving the carry + // flag in each step, adjust $R0 if we unrolled 4 elements + vwPreIter(ADDS, ADCS, R10, add4) + SUB $4, R0 +add4: + BCC copy + vwOneIter(ADCS, R0, len1) + B add4 +copy: + MOVD ZR, c+56(FP) + CMP R1, R3 + BEQ done +copy_4: // no carry flag, copy the rest + vwOneIterCopy(R0, done) + B copy_4 // func subVW(z, x []Word, y Word) (c Word) +// The 'large' branch handles large 'z'. It checks the carry flag on every iteration +// and switches to copy if we are done with carries. The copying is skipped as well +// if 'x' and 'z' happen to share the same underlying storage. +// The overhead of the checking and branching is visible when 'z' are small (~5%), +// so set a threshold of 32, and remain the small-sized part entirely untouched. TEXT ·subVW(SB),NOSPLIT,$0 MOVD z+0(FP), R3 MOVD z_len+8(FP), R0 MOVD x+24(FP), R1 MOVD y+48(FP), R2 + CMP $32, R0 + BGE large // large-sized 'z' and 'x' CBZ R0, len0 // the length of z is 0 MOVD.P 8(R1), R4 SUBS R2, R4 // z[0] = x[0] - y, set carry @@ -177,22 +240,32 @@ two: // do it twice STP.P (R8, R9), 16(R3) SUB $2, R0 loop: // do four times per round - CBZ R0, len1 // careful not to touch the carry flag - LDP.P 32(R1), (R4, R5) - LDP -16(R1), (R6, R7) - SBCS $0, R4, R8 - SBCS $0, R5, R9 - SBCS $0, R6, R10 - SBCS $0, R7, R11 - STP.P (R8, R9), 32(R3) - STP (R10, R11), -16(R3) - SUB $4, R0 + vwOneIter(SBCS, R0, len1) B loop len1: CSET LO, R2 // extract carry flag len0: MOVD R2, c+56(FP) +done: RET +large: + AND $0x3, R0, R10 + AND $~0x3, R0 + // unrolling for the first 1~4 elements to avoid saving the carry + // flag in each step, adjust $R0 if we unrolled 4 elements + vwPreIter(SUBS, SBCS, R10, sub4) + SUB $4, R0 +sub4: + BCS copy + vwOneIter(SBCS, R0, len1) + B sub4 +copy: + MOVD ZR, c+56(FP) + CMP R1, R3 + BEQ done +copy_4: // no carry flag, copy the rest + vwOneIterCopy(R0, done) + B copy_4 // func shlVU(z, x []Word, s uint) (c Word) // This implementation handles the shift operation from the high word to the low word, diff --git a/src/math/big/arith_test.go b/src/math/big/arith_test.go index 05136f1895..fc205934c5 100644 --- a/src/math/big/arith_test.go +++ b/src/math/big/arith_test.go @@ -179,6 +179,23 @@ func testFunVW(t *testing.T, msg string, f funVW, a argVW) { } } +func testFunVWext(t *testing.T, msg string, f funVW, f_g funVW, a argVW) { + // using the result of addVW_g/subVW_g as golden + z_g := make(nat, len(a.z)) + c_g := f_g(z_g, a.x, a.y) + c := f(a.z, a.x, a.y) + + for i, zi := range a.z { + if zi != z_g[i] { + t.Errorf("%s\n\tgot z[%d] = %#x; want %#x", msg, i, zi, z_g[i]) + break + } + } + if c != c_g { + t.Errorf("%s\n\tgot c = %#x; want %#x", msg, c, c_g) + } +} + func makeFunVW(f func(z, x []Word, s uint) (c Word)) funVW { return func(z, x []Word, s Word) (c Word) { return f(z, x, uint(s)) @@ -213,6 +230,49 @@ func TestFunVW(t *testing.T) { } } +// Construct a vector comprising the same word, usually '0' or 'maximum uint' +func makeWordVec(e Word, n int) []Word { + v := make([]Word, n) + for i := range v { + v[i] = e + } + return v +} + +// Extended testing to addVW and subVW using various kinds of input data. +// We utilize the results of addVW_g and subVW_g as golden reference to check +// correctness. +func TestFunVWExt(t *testing.T) { + // 32 is the current threshold that triggers an optimized version of + // calculation for large-sized vector, ensure we have sizes around it tested. + var vwSizes = []int{0, 1, 3, 4, 5, 8, 9, 23, 31, 32, 33, 34, 35, 36, 50, 120} + for _, n := range vwSizes { + // vector of random numbers, using the result of addVW_g/subVW_g as golden + x := rndV(n) + y := rndW() + z := make(nat, n) + arg := argVW{z, x, y, 0} + testFunVWext(t, "addVW, random inputs", addVW, addVW_g, arg) + testFunVWext(t, "subVW, random inputs", subVW, subVW_g, arg) + + // vector of random numbers, but make 'x' and 'z' share storage + arg = argVW{x, x, y, 0} + testFunVWext(t, "addVW, random inputs, sharing storage", addVW, addVW_g, arg) + testFunVWext(t, "subVW, random inputs, sharing storage", subVW, subVW_g, arg) + + // vector of maximum uint, to force carry flag set in each 'add' + y = ^Word(0) + x = makeWordVec(y, n) + arg = argVW{z, x, y, 0} + testFunVWext(t, "addVW, vector of max uint", addVW, addVW_g, arg) + + // vector of '0', to force carry flag set in each 'sub' + x = makeWordVec(0, n) + arg = argVW{z, x, 1, 0} + testFunVWext(t, "subVW, vector of zero", subVW, subVW_g, arg) + } +} + type argVU struct { d []Word // d is a Word slice, the input parameters x and z come from this array. l uint // l is the length of the input parameters x and z. @@ -241,20 +301,20 @@ var argshrVU = []argVU{ } func testShiftFunc(t *testing.T, f func(z, x []Word, s uint) Word, a argVU) { - // save a.d for error message, or it will be overwritten. + // work on copy of a.d to preserve the original data. b := make([]Word, len(a.d)) copy(b, a.d) - z := a.d[a.zp : a.zp+a.l] - x := a.d[a.xp : a.xp+a.l] + z := b[a.zp : a.zp+a.l] + x := b[a.xp : a.xp+a.l] c := f(z, x, a.s) for i, zi := range z { if zi != a.r[i] { - t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot z[%d] = %#x; want %#x", b, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, i, zi, a.r[i]) + t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot z[%d] = %#x; want %#x", a.d, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, i, zi, a.r[i]) break } } if c != a.c { - t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot c = %#x; want %#x", b, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, c, a.c) + t.Errorf("d := %v, %s(d[%d:%d], d[%d:%d], %d)\n\tgot c = %#x; want %#x", a.d, a.m, a.zp, a.zp+a.l, a.xp, a.xp+a.l, a.s, c, a.c) } } @@ -299,6 +359,24 @@ func BenchmarkAddVW(b *testing.B) { } } +// Benchmarking addVW using vector of maximum uint to force carry flag set +func BenchmarkAddVWext(b *testing.B) { + for _, n := range benchSizes { + if isRaceBuilder && n > 1e3 { + continue + } + y := ^Word(0) + x := makeWordVec(y, n) + z := make([]Word, n) + b.Run(fmt.Sprint(n), func(b *testing.B) { + b.SetBytes(int64(n * _S)) + for i := 0; i < b.N; i++ { + addVW(z, x, y) + } + }) + } +} + func BenchmarkSubVW(b *testing.B) { for _, n := range benchSizes { if isRaceBuilder && n > 1e3 { @@ -316,6 +394,24 @@ func BenchmarkSubVW(b *testing.B) { } } +// Benchmarking subVW using vector of zero to force carry flag set +func BenchmarkSubVWext(b *testing.B) { + for _, n := range benchSizes { + if isRaceBuilder && n > 1e3 { + continue + } + x := makeWordVec(0, n) + y := Word(1) + z := make([]Word, n) + b.Run(fmt.Sprint(n), func(b *testing.B) { + b.SetBytes(int64(n * _S)) + for i := 0; i < b.N; i++ { + subVW(z, x, y) + } + }) + } +} + type funVWW func(z, x []Word, y, r Word) (c Word) type argVWW struct { z, x nat diff --git a/src/math/big/example_test.go b/src/math/big/example_test.go index cfc77351d4..31ca784154 100644 --- a/src/math/big/example_test.go +++ b/src/math/big/example_test.go @@ -25,6 +25,13 @@ func ExampleInt_SetString() { // Output: 420 } +func ExampleFloat_SetString() { + f := new(big.Float) + f.SetString("3.14159") + fmt.Println(f) + // Output: 3.14159 +} + func ExampleRat_Scan() { // The Scan function is rarely used directly; // the fmt package recognizes it as an implementation of fmt.Scanner. diff --git a/src/math/big/float.go b/src/math/big/float.go index da964eef3e..42050e2c39 100644 --- a/src/math/big/float.go +++ b/src/math/big/float.go @@ -322,10 +322,11 @@ func (z *Float) SetMantExp(mant *Float, exp int) *Float { mant.validate() } z.Copy(mant) - if z.form != finite { - return z + + if z.form == finite { + // 0 < |mant| < +Inf + z.setExpAndRound(int64(z.exp)+int64(exp), 0) } - z.setExpAndRound(int64(z.exp)+int64(exp), 0) return z } |