diff options
author | Michael Munday <mike.munday@ibm.com> | 2019-10-23 06:43:23 -0700 |
---|---|---|
committer | Brad Fitzpatrick <bradfitz@golang.org> | 2019-11-11 15:23:59 +0000 |
commit | b3885dbc93ceae1b12f7e80edd2696baf566edec (patch) | |
tree | c7f9c66dbe89b891b512347b0e56cf69d253a387 /src/runtime/internal | |
parent | 75c839af22a50cb027766ea54335e234dac32836 (diff) | |
download | go-b3885dbc93ceae1b12f7e80edd2696baf566edec.tar.gz go-b3885dbc93ceae1b12f7e80edd2696baf566edec.zip |
cmd/compile, runtime: intrinsify atomic And8 and Or8 on s390x
Intrinsify these functions to match other platforms. Update the
sequence of instructions used in the assembly implementations to
match the intrinsics.
Also, add a micro benchmark so we can more easily measure the
performance of these two functions:
name old time/op new time/op delta
And8-8 5.33ns ± 7% 2.55ns ± 8% -52.12% (p=0.000 n=20+20)
And8Parallel-8 7.39ns ± 5% 3.74ns ± 4% -49.34% (p=0.000 n=20+20)
Or8-8 4.84ns ±15% 2.64ns ±11% -45.50% (p=0.000 n=20+20)
Or8Parallel-8 7.27ns ± 3% 3.84ns ± 4% -47.10% (p=0.000 n=19+20)
By using a 'rotate then xor selected bits' instruction combined with
either a 'load and and' or a 'load and or' instruction we can
implement And8 and Or8 with far fewer instructions. Replacing
'compare and swap' with atomic instructions may also improve
performance when there is contention.
Change-Id: I28bb8032052b73ae8ccdf6e4c612d2877085fa01
Reviewed-on: https://go-review.googlesource.com/c/go/+/204277
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/runtime/internal')
-rw-r--r-- | src/runtime/internal/atomic/asm_s390x.s | 44 | ||||
-rw-r--r-- | src/runtime/internal/atomic/bench_test.go | 40 |
2 files changed, 57 insertions, 27 deletions
diff --git a/src/runtime/internal/atomic/asm_s390x.s b/src/runtime/internal/atomic/asm_s390x.s
index 78abd48afa..9a19bc0ece 100644
--- a/src/runtime/internal/atomic/asm_s390x.s
+++ b/src/runtime/internal/atomic/asm_s390x.s
@@ -176,37 +176,27 @@ TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
 TEXT ·Or8(SB), NOSPLIT, $0-9
 	MOVD	ptr+0(FP), R3
 	MOVBZ	val+8(FP), R4
-	// Calculate shift.
-	MOVD	R3, R5
-	AND	$3, R5
-	XOR	$3, R5 // big endian - flip direction
-	SLD	$3, R5 // MUL $8, R5
-	SLD	R5, R4
-	// Align ptr down to 4 bytes so we can use 32-bit load/store.
-	AND	$-4, R3
-	MOVWZ	0(R3), R6
-again:
-	OR	R4, R6, R7
-	CS	R6, R7, 0(R3) // if R6==(R3) then (R3)=R7 else R6=(R3)
-	BNE	again
+	// We don't have atomic operations that work on individual bytes so we
+	// need to align addr down to a word boundary and create a mask
+	// containing v to OR with the entire word atomically.
+	MOVD	$(3<<3), R5
+	RXSBG	$59, $60, $3, R3, R5 // R5 = 24 - ((addr % 4) * 8) = ((addr & 3) << 3) ^ (3 << 3)
+	ANDW	$~3, R3              // R3 = floor(addr, 4) = addr &^ 3
+	SLW	R5, R4               // R4 = uint32(v) << R5
+	LAO	R4, R6, 0(R3)        // R6 = *R3; *R3 |= R4; (atomic)
 	RET
 
 // func And8(addr *uint8, v uint8)
 TEXT ·And8(SB), NOSPLIT, $0-9
 	MOVD	ptr+0(FP), R3
 	MOVBZ	val+8(FP), R4
-	// Calculate shift.
-	MOVD	R3, R5
-	AND	$3, R5
-	XOR	$3, R5 // big endian - flip direction
-	SLD	$3, R5 // MUL $8, R5
-	OR	$-256, R4 // create 0xffffffffffffffxx
-	RLLG	R5, R4
-	// Align ptr down to 4 bytes so we can use 32-bit load/store.
-	AND	$-4, R3
-	MOVWZ	0(R3), R6
-again:
-	AND	R4, R6, R7
-	CS	R6, R7, 0(R3) // if R6==(R3) then (R3)=R7 else R6=(R3)
-	BNE	again
+	// We don't have atomic operations that work on individual bytes so we
+	// need to align addr down to a word boundary and create a mask
+	// containing v to AND with the entire word atomically.
+	ORW	$~0xff, R4           // R4 = uint32(v) | 0xffffff00
+	MOVD	$(3<<3), R5
+	RXSBG	$59, $60, $3, R3, R5 // R5 = 24 - ((addr % 4) * 8) = ((addr & 3) << 3) ^ (3 << 3)
+	ANDW	$~3, R3              // R3 = floor(addr, 4) = addr &^ 3
+	RLL	R5, R4, R4           // R4 = rotl(R4, R5)
+	LAN	R4, R6, 0(R3)        // R6 = *R3; *R3 &= R4; (atomic)
 	RET
diff --git a/src/runtime/internal/atomic/bench_test.go b/src/runtime/internal/atomic/bench_test.go
index 083a75cb07..de71b0f2c7 100644
--- a/src/runtime/internal/atomic/bench_test.go
+++ b/src/runtime/internal/atomic/bench_test.go
@@ -43,6 +43,46 @@ func BenchmarkAtomicStore(b *testing.B) {
 	}
 }
 
+func BenchmarkAnd8(b *testing.B) {
+	var x [512]uint8 // give byte its own cache line
+	sink = &x
+	for i := 0; i < b.N; i++ {
+		atomic.And8(&x[255], uint8(i))
+	}
+}
+
+func BenchmarkAnd8Parallel(b *testing.B) {
+	var x [512]uint8 // give byte its own cache line
+	sink = &x
+	b.RunParallel(func(pb *testing.PB) {
+		i := uint8(0)
+		for pb.Next() {
+			atomic.And8(&x[255], i)
+			i++
+		}
+	})
+}
+
+func BenchmarkOr8(b *testing.B) {
+	var x [512]uint8 // give byte its own cache line
+	sink = &x
+	for i := 0; i < b.N; i++ {
+		atomic.Or8(&x[255], uint8(i))
+	}
+}
+
+func BenchmarkOr8Parallel(b *testing.B) {
+	var x [512]uint8 // give byte its own cache line
+	sink = &x
+	b.RunParallel(func(pb *testing.PB) {
+		i := uint8(0)
+		for pb.Next() {
+			atomic.Or8(&x[255], i)
+			i++
+		}
+	})
+}
+
 func BenchmarkXadd(b *testing.B) {
 	var x uint32
 	ptr := &x