diff options
author | Lynn Boger <laboger@linux.vnet.ibm.com> | 2020-03-30 15:23:19 -0400 |
---|---|---|
committer | Lynn Boger <laboger@linux.vnet.ibm.com> | 2020-04-06 12:09:39 +0000 |
commit | 815509ae31fc7eaf753def9deb9cafee968f92b3 (patch) | |
tree | bd27aa71683f030619f08c9441616cbdcf3d62fb /test/codegen | |
parent | 5f3354d1bf2e6a61e4b9e1e31ee04b99dfe7de35 (diff) | |
download | go-815509ae31fc7eaf753def9deb9cafee968f92b3.tar.gz go-815509ae31fc7eaf753def9deb9cafee968f92b3.zip |
cmd/compile: improve lowered moves and zeros for ppc64le
This change includes the following:
- Generate LXV/STXV sequences instead of LXVD2X/STXVD2X on power9.
These instructions do not require an index register, which
allows more loads and stores within a loop without initializing
multiple index registers. The LoweredQuadXXX ops generate LXV/STXV.
- Create LoweredMoveXXXShort and LoweredZeroXXXShort for short
moves that don't generate loops, and therefore don't clobber the
address registers or flags.
- Use registers other than R3 and R4 so that the lowered sequences do
not conflict with registers that have already been allocated, which
avoids unnecessary register moves.
- Eliminate the use of R14 as a scratch register and use R31
instead.
- Add PCALIGN when the LoweredMoveXXX or LoweredZeroXXX generates a
loop with more than 3 iterations.
This performance opportunity was noticed in github.com/golang/snappy
benchmarks. Results on power9:
WordsDecode1e1 54.1ns ± 0% 53.8ns ± 0% -0.51% (p=0.029 n=4+4)
WordsDecode1e2 287ns ± 0% 282ns ± 1% -1.83% (p=0.029 n=4+4)
WordsDecode1e3 3.98µs ± 0% 3.64µs ± 0% -8.52% (p=0.029 n=4+4)
WordsDecode1e4 66.9µs ± 0% 67.0µs ± 0% +0.20% (p=0.029 n=4+4)
WordsDecode1e5 723µs ± 0% 723µs ± 0% -0.01% (p=0.200 n=4+4)
WordsDecode1e6 7.21ms ± 0% 7.21ms ± 0% -0.02% (p=1.000 n=4+4)
WordsEncode1e1 29.9ns ± 0% 29.4ns ± 0% -1.51% (p=0.029 n=4+4)
WordsEncode1e2 2.12µs ± 0% 1.75µs ± 0% -17.70% (p=0.029 n=4+4)
WordsEncode1e3 11.7µs ± 0% 11.2µs ± 0% -4.61% (p=0.029 n=4+4)
WordsEncode1e4 119µs ± 0% 120µs ± 0% +0.36% (p=0.029 n=4+4)
WordsEncode1e5 1.21ms ± 0% 1.22ms ± 0% +0.41% (p=0.029 n=4+4)
WordsEncode1e6 12.0ms ± 0% 12.0ms ± 0% +0.57% (p=0.029 n=4+4)
RandomEncode 286µs ± 0% 203µs ± 0% -28.82% (p=0.029 n=4+4)
ExtendMatch 47.4µs ± 0% 47.0µs ± 0% -0.85% (p=0.029 n=4+4)
Change-Id: Iecad3a39ae55280286e42760a5c9d5c1168f5858
Reviewed-on: https://go-review.googlesource.com/c/go/+/226539
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Diffstat (limited to 'test/codegen')
-rw-r--r-- | test/codegen/copy.go | 34 |
1 file changed, 34 insertions, 0 deletions
diff --git a/test/codegen/copy.go b/test/codegen/copy.go
index 46c2bde9ab..db75cde1c6 100644
--- a/test/codegen/copy.go
+++ b/test/codegen/copy.go
@@ -34,6 +34,8 @@ func movesmall7() {
 func movesmall16() {
 	x := [...]byte{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}
 	// amd64:-".*memmove"
+	// ppc64:".*memmove"
+	// ppc64le:".*memmove"
 	copy(x[1:], x[:])
 }
 
@@ -41,10 +43,34 @@ var x [256]byte
 
 // Check that large disjoint copies are replaced with moves.
 
+func moveDisjointStack32() {
+	var s [32]byte
+	// ppc64:-".*memmove"
+	// ppc64le:-".*memmove"
+	// ppc64le/power8:"LXVD2X",-"ADD",-"BC"
+	// ppc64le/power9:"LXV",-"LXVD2X",-"ADD",-"BC"
+	copy(s[:], x[:32])
+	runtime.KeepAlive(&s)
+}
+
+func moveDisjointStack64() {
+	var s [96]byte
+	// ppc64:-".*memmove"
+	// ppc64le:-".*memmove"
+	// ppc64le/power8:"LXVD2X","ADD","BC"
+	// ppc64le/power9:"LXV",-"LXVD2X",-"ADD",-"BC"
+	copy(s[:], x[:96])
+	runtime.KeepAlive(&s)
+}
+
 func moveDisjointStack() {
 	var s [256]byte
 	// s390x:-".*memmove"
 	// amd64:-".*memmove"
+	// ppc64:-".*memmove"
+	// ppc64le:-".*memmove"
+	// ppc64le/power8:"LXVD2X"
+	// ppc64le/power9:"LXV",-"LXVD2X"
 	copy(s[:], x[:])
 	runtime.KeepAlive(&s)
 }
@@ -53,6 +79,10 @@ func moveDisjointArg(b *[256]byte) {
 	var s [256]byte
 	// s390x:-".*memmove"
 	// amd64:-".*memmove"
+	// ppc64:-".*memmove"
+	// ppc64le:-".*memmove"
+	// ppc64le/power8:"LXVD2X"
+	// ppc64le/power9:"LXV",-"LXVD2X"
 	copy(s[:], b[:])
 	runtime.KeepAlive(&s)
 }
@@ -60,6 +90,10 @@ func moveDisjointArg(b *[256]byte) {
 func moveDisjointNoOverlap(a *[256]byte) {
 	// s390x:-".*memmove"
 	// amd64:-".*memmove"
+	// ppc64:-".*memmove"
+	// ppc64le:-".*memmove"
+	// ppc64le/power8:"LXVD2X"
+	// ppc64le/power9:"LXV",-"LXVD2X"
 	copy(a[:], a[128:])
 }
 