diff options
author | Lynn Boger <laboger@linux.vnet.ibm.com> | 2020-08-31 09:43:40 -0400 |
---|---|---|
committer | Lynn Boger <laboger@linux.vnet.ibm.com> | 2020-09-17 12:37:40 +0000 |
commit | 967465da2975fe4322080703ce5a77ea90752829 (patch) | |
tree | 1f3a8d2b15aaa382b3551c3d5a1b68b6808f8592 /src/cmd/compile/internal/ppc64 | |
parent | 0dde60a5fefcb1447c97efa5c7bb4dbcf3575736 (diff) | |
download | go-967465da2975fe4322080703ce5a77ea90752829.tar.gz go-967465da2975fe4322080703ce5a77ea90752829.zip |
cmd/compile: use combined shifts to improve array addressing on ppc64x
This change adds rules to find pairs of instructions that can
be combined into a single shifts. These instruction sequences
are common in array addressing within loops. Improvements can
be seen in many crypto packages and the hash packages.
These are based on the extended mnemonics found in the ISA
sections C.8.1 and C.8.2.
Some rules in PPC64.rules were moved because the ordering prevented
some matching.
The following results were generated on power9.
hash/crc32:
CRC32/poly=Koopman/size=40/align=0 195ns ± 0% 163ns ± 0% -16.41%
CRC32/poly=Koopman/size=40/align=1 200ns ± 0% 163ns ± 0% -18.50%
CRC32/poly=Koopman/size=512/align=0 1.98µs ± 0% 1.67µs ± 0% -15.46%
CRC32/poly=Koopman/size=512/align=1 1.98µs ± 0% 1.69µs ± 0% -14.80%
CRC32/poly=Koopman/size=1kB/align=0 3.90µs ± 0% 3.31µs ± 0% -15.27%
CRC32/poly=Koopman/size=1kB/align=1 3.85µs ± 0% 3.31µs ± 0% -14.15%
CRC32/poly=Koopman/size=4kB/align=0 15.3µs ± 0% 13.1µs ± 0% -14.22%
CRC32/poly=Koopman/size=4kB/align=1 15.4µs ± 0% 13.1µs ± 0% -14.79%
CRC32/poly=Koopman/size=32kB/align=0 137µs ± 0% 105µs ± 0% -23.56%
CRC32/poly=Koopman/size=32kB/align=1 137µs ± 0% 105µs ± 0% -23.53%
crypto/rc4:
RC4_128 733ns ± 0% 650ns ± 0% -11.32% (p=1.000 n=1+1)
RC4_1K 5.80µs ± 0% 5.17µs ± 0% -10.89% (p=1.000 n=1+1)
RC4_8K 45.7µs ± 0% 40.8µs ± 0% -10.73% (p=1.000 n=1+1)
crypto/sha1:
Hash8Bytes 635ns ± 0% 613ns ± 0% -3.46% (p=1.000 n=1+1)
Hash320Bytes 2.30µs ± 0% 2.18µs ± 0% -5.38% (p=1.000 n=1+1)
Hash1K 5.88µs ± 0% 5.38µs ± 0% -8.62% (p=1.000 n=1+1)
Hash8K 42.0µs ± 0% 37.9µs ± 0% -9.75% (p=1.000 n=1+1)
There are other improvements found in golang.org/x/crypto which are all in the
range of 5-15%.
Change-Id: I193471fbcf674151ffe2edab212799d9b08dfb8c
Reviewed-on: https://go-review.googlesource.com/c/go/+/252097
Trust: Lynn Boger <laboger@linux.vnet.ibm.com>
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Carlos Eduardo Seo <cseo@linux.vnet.ibm.com>
Diffstat (limited to 'src/cmd/compile/internal/ppc64')
-rw-r--r-- | src/cmd/compile/internal/ppc64/ssa.go | 36 |
1 files changed, 36 insertions, 0 deletions
diff --git a/src/cmd/compile/internal/ppc64/ssa.go b/src/cmd/compile/internal/ppc64/ssa.go index f8d9ac2379..4a83a0bdd7 100644 --- a/src/cmd/compile/internal/ppc64/ssa.go +++ b/src/cmd/compile/internal/ppc64/ssa.go @@ -565,6 +565,42 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p = s.Prog(obj.ANOP) gc.Patch(pbover, p) + case ssa.OpPPC64CLRLSLWI: + r := v.Reg() + r1 := v.Args[0].Reg() + shifts := v.AuxInt + p := s.Prog(v.Op.Asm()) + // clrlslwi ra,rs,sh,mb will become rlwinm ra,rs,sh,mb-sh,31-n as described in ISA + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftsh(shifts)} + p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftmb(shifts)}) + p.Reg = r1 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + case ssa.OpPPC64CLRLSLDI: + r := v.Reg() + r1 := v.Args[0].Reg() + shifts := v.AuxInt + p := s.Prog(v.Op.Asm()) + // clrlsldi ra,rs,sh,mb will become rldic ra,rs,sh,mb-sh + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftsh(shifts)} + p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftmb(shifts)}) + p.Reg = r1 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + + // Mask has been set as sh + case ssa.OpPPC64RLDICL: + r := v.Reg() + r1 := v.Args[0].Reg() + shifts := v.AuxInt + p := s.Prog(v.Op.Asm()) + p.From = obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftsh(shifts)} + p.SetFrom3(obj.Addr{Type: obj.TYPE_CONST, Offset: ssa.GetPPC64Shiftmb(shifts)}) + p.Reg = r1 + p.To.Type = obj.TYPE_REG + p.To.Reg = r + case ssa.OpPPC64ADD, ssa.OpPPC64FADD, ssa.OpPPC64FADDS, ssa.OpPPC64SUB, ssa.OpPPC64FSUB, ssa.OpPPC64FSUBS, ssa.OpPPC64MULLD, ssa.OpPPC64MULLW, ssa.OpPPC64DIVDU, ssa.OpPPC64DIVWU, ssa.OpPPC64SRAD, ssa.OpPPC64SRAW, ssa.OpPPC64SRD, ssa.OpPPC64SRW, ssa.OpPPC64SLD, ssa.OpPPC64SLW, |