aboutsummaryrefslogtreecommitdiff
path: root/src/math
diff options
context:
space:
mode:
authorLynn Boger <laboger@linux.vnet.ibm.com>2020-08-17 10:57:54 -0400
committerLynn Boger <laboger@linux.vnet.ibm.com>2020-08-18 20:25:26 +0000
commit216714e44f703470f102cf248f7e9097160093d4 (patch)
treefcdf6fde7d4dccb64fd85a182946a68fef0ab162 /src/math
parenta40171857595db60e95a04b64aad8ba262cf64a7 (diff)
downloadgo-216714e44f703470f102cf248f7e9097160093d4.tar.gz
go-216714e44f703470f102cf248f7e9097160093d4.zip
math/big: improve performance of mulAddVWW on ppc64x
This changes the assembly implementation on ppc64x to improve performance by reordering some instructions. It also eliminates an unnecessary move by changing an ADDZE to use the correct target register. Improvement on power9: MulAddVWW/1 6.89ns ± 0% 7.30ns ± 0% +5.95% (p=1.000 n=1+1) MulAddVWW/2 8.04ns ± 0% 8.06ns ± 0% +0.25% (p=1.000 n=1+1) MulAddVWW/3 9.39ns ± 0% 9.39ns ± 0% ~ (all equal) MulAddVWW/4 9.76ns ± 0% 9.48ns ± 0% -2.87% (p=1.000 n=1+1) MulAddVWW/5 10.5ns ± 0% 10.3ns ± 0% -1.90% (p=1.000 n=1+1) MulAddVWW/10 15.4ns ± 0% 14.9ns ± 0% -3.25% (p=1.000 n=1+1) MulAddVWW/100 149ns ± 0% 125ns ± 0% -16.11% (p=1.000 n=1+1) MulAddVWW/1000 1.42µs ± 0% 1.28µs ± 0% -9.74% (p=1.000 n=1+1) MulAddVWW/10000 14.2µs ± 0% 12.8µs ± 0% -9.73% (p=1.000 n=1+1) MulAddVWW/100000 144µs ± 0% 129µs ± 0% -10.10% (p=1.000 n=1+1) Change-Id: I0ae7002a69783ca19d7a4e3e42042ae75dc60069 Reviewed-on: https://go-review.googlesource.com/c/go/+/248721 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com> Reviewed-by: Paul Murphy <murp@ibm.com>
Diffstat (limited to 'src/math')
-rw-r--r--src/math/big/arith_ppc64x.s9
1 files changed, 4 insertions, 5 deletions
diff --git a/src/math/big/arith_ppc64x.s b/src/math/big/arith_ppc64x.s
index dbb168a376..409e10ab48 100644
--- a/src/math/big/arith_ppc64x.s
+++ b/src/math/big/arith_ppc64x.s
@@ -394,17 +394,16 @@ loop:
ADDZE R21
MULLD R9, R22, R26
MULHDU R9, R22, R22
- ADDC R21, R26
- ADDZE R22
MULLD R9, R23, R27
MULHDU R9, R23, R23
- ADDC R22, R27
- ADDZE R23
+ ADDC R21, R26
+ ADDZE R22
MOVD R24, 8(R10) // z[i]
MOVD R25, 16(R10) // z[i+1]
+ ADDC R22, R27
+ ADDZE R23,R4 // update carry
MOVD R26, 24(R10) // z[i+2]
MOVDU R27, 32(R10) // z[i+3]
- MOVD R23, R4 // R4 = c
ADD $-4, R11 // R11 = z_len - 4
BC 16, 0, loop // bdnz