diff options
author | Michael Munday <mike.munday@ibm.com> | 2018-05-25 17:54:58 +0100 |
---|---|---|
committer | Michael Munday <mike.munday@ibm.com> | 2018-09-03 14:35:38 +0000 |
commit | 6f9b94ab6658bbebe4c89791dc3e5ebe53be3d82 (patch) | |
tree | 4850119296318dcb26d743333d5998d2c71a5b9b /src/cmd/compile/internal/s390x | |
parent | ff468a43be1740890a0f3b64a6ab920ea92c2c17 (diff) | |
download | go-6f9b94ab6658bbebe4c89791dc3e5ebe53be3d82.tar.gz go-6f9b94ab6658bbebe4c89791dc3e5ebe53be3d82.zip |
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions
as intrinsics on s390x using the 'population count' (popcnt)
instruction. This instruction was released as part of the
'population-count' facility, which uses the same facility bit (45) as
the 'distinct-operands' facility, which is a prerequisite for Go on
s390x. We can therefore use it without a feature check.
The s390x popcnt instruction treats a 64-bit register as a vector
of 8 bytes, counting the number of one bits in each byte individually.
It then writes the results to the corresponding bytes in the
output register. Therefore, to implement OnesCount{16,32,64}, we
need to sum the individual byte counts using some extra
instructions. To do this efficiently I've added some additional
pseudo operations to the s390x SSA backend.
Unlike on other architectures, the new instruction sequence is faster
for OnesCount8, so OnesCount8 is also implemented using the intrinsic.
name old time/op new time/op delta
OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20)
OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20)
OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17)
OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20)
OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20)
Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0
Reviewed-on: https://go-review.googlesource.com/114675
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Diffstat (limited to 'src/cmd/compile/internal/s390x')
-rw-r--r-- | src/cmd/compile/internal/s390x/ssa.go | 5 |
1 files changed, 4 insertions, 1 deletions
diff --git a/src/cmd/compile/internal/s390x/ssa.go b/src/cmd/compile/internal/s390x/ssa.go index fe206f74e8..90e61c34fd 100644 --- a/src/cmd/compile/internal/s390x/ssa.go +++ b/src/cmd/compile/internal/s390x/ssa.go @@ -513,7 +513,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.To.Type = obj.TYPE_MEM p.To.Name = obj.NAME_EXTERN p.To.Sym = v.Aux.(*obj.LSym) - case ssa.OpS390XFLOGR, ssa.OpS390XNEG, ssa.OpS390XNEGW, + case ssa.OpS390XFLOGR, ssa.OpS390XPOPCNT, + ssa.OpS390XNEG, ssa.OpS390XNEGW, ssa.OpS390XMOVWBR, ssa.OpS390XMOVDBR: p := s.Prog(v.Op.Asm()) p.From.Type = obj.TYPE_REG @@ -522,6 +523,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) { p.To.Reg = v.Reg() case ssa.OpS390XNOT, ssa.OpS390XNOTW: v.Fatalf("NOT/NOTW generated %s", v.LongString()) + case ssa.OpS390XSumBytes2, ssa.OpS390XSumBytes4, ssa.OpS390XSumBytes8: + v.Fatalf("SumBytes generated %s", v.LongString()) case ssa.OpS390XMOVDEQ, ssa.OpS390XMOVDNE, ssa.OpS390XMOVDLT, ssa.OpS390XMOVDLE, ssa.OpS390XMOVDGT, ssa.OpS390XMOVDGE, |