aboutsummaryrefslogtreecommitdiff
path: root/src/cmd/compile/internal/s390x
diff options
context:
space:
mode:
authorMichael Munday <mike.munday@ibm.com>2018-05-25 17:54:58 +0100
committerMichael Munday <mike.munday@ibm.com>2018-09-03 14:35:38 +0000
commit6f9b94ab6658bbebe4c89791dc3e5ebe53be3d82 (patch)
tree4850119296318dcb26d743333d5998d2c71a5b9b /src/cmd/compile/internal/s390x
parentff468a43be1740890a0f3b64a6ab920ea92c2c17 (diff)
downloadgo-6f9b94ab6658bbebe4c89791dc3e5ebe53be3d82.tar.gz
go-6f9b94ab6658bbebe4c89791dc3e5ebe53be3d82.zip
cmd/compile: implement OnesCount{8,16,32,64} intrinsics on s390x
This CL implements the math/bits.OnesCount{8,16,32,64} functions as intrinsics on s390x using the 'population count' (popcnt) instruction. This instruction was released as the 'population-count' facility which uses the same facility bit (45) as the 'distinct-operands' facility which is a pre-requisite for Go on s390x. We can therefore use it without a feature check. The s390x popcnt instruction treats a 64 bit register as a vector of 8 bytes, summing the number of ones in each byte individually. It then writes the results to the corresponding bytes in the output register. Therefore to implement OnesCount{16,32,64} we need to sum the individual byte counts using some extra instructions. To do this efficiently I've added some additional pseudo operations to the s390x SSA backend. Unlike other architectures the new instruction sequence is faster for OnesCount8, so that is implemented using the intrinsic. name old time/op new time/op delta OnesCount 3.21ns ± 1% 1.35ns ± 0% -58.00% (p=0.000 n=20+20) OnesCount8 0.91ns ± 1% 0.81ns ± 0% -11.43% (p=0.000 n=20+20) OnesCount16 1.51ns ± 3% 1.21ns ± 0% -19.71% (p=0.000 n=20+17) OnesCount32 1.91ns ± 0% 1.12ns ± 1% -41.60% (p=0.000 n=19+20) OnesCount64 3.18ns ± 4% 1.35ns ± 0% -57.52% (p=0.000 n=20+20) Change-Id: Id54f0bd28b6db9a887ad12c0d72fcc168ef9c4e0 Reviewed-on: https://go-review.googlesource.com/114675 Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Cherry Zhang <cherryyz@google.com>
Diffstat (limited to 'src/cmd/compile/internal/s390x')
-rw-r--r--src/cmd/compile/internal/s390x/ssa.go5
1 files changed, 4 insertions, 1 deletions
diff --git a/src/cmd/compile/internal/s390x/ssa.go b/src/cmd/compile/internal/s390x/ssa.go
index fe206f74e8..90e61c34fd 100644
--- a/src/cmd/compile/internal/s390x/ssa.go
+++ b/src/cmd/compile/internal/s390x/ssa.go
@@ -513,7 +513,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Type = obj.TYPE_MEM
p.To.Name = obj.NAME_EXTERN
p.To.Sym = v.Aux.(*obj.LSym)
- case ssa.OpS390XFLOGR, ssa.OpS390XNEG, ssa.OpS390XNEGW,
+ case ssa.OpS390XFLOGR, ssa.OpS390XPOPCNT,
+ ssa.OpS390XNEG, ssa.OpS390XNEGW,
ssa.OpS390XMOVWBR, ssa.OpS390XMOVDBR:
p := s.Prog(v.Op.Asm())
p.From.Type = obj.TYPE_REG
@@ -522,6 +523,8 @@ func ssaGenValue(s *gc.SSAGenState, v *ssa.Value) {
p.To.Reg = v.Reg()
case ssa.OpS390XNOT, ssa.OpS390XNOTW:
v.Fatalf("NOT/NOTW generated %s", v.LongString())
+ case ssa.OpS390XSumBytes2, ssa.OpS390XSumBytes4, ssa.OpS390XSumBytes8:
+ v.Fatalf("SumBytes generated %s", v.LongString())
case ssa.OpS390XMOVDEQ, ssa.OpS390XMOVDNE,
ssa.OpS390XMOVDLT, ssa.OpS390XMOVDLE,
ssa.OpS390XMOVDGT, ssa.OpS390XMOVDGE,