diff options
author | Michael Munday <mike.munday@ibm.com> | 2019-10-23 06:43:23 -0700 |
---|---|---|
committer | Brad Fitzpatrick <bradfitz@golang.org> | 2019-11-11 15:23:59 +0000 |
commit | b3885dbc93ceae1b12f7e80edd2696baf566edec (patch) | |
tree | c7f9c66dbe89b891b512347b0e56cf69d253a387 /src/runtime/internal | |
parent | 75c839af22a50cb027766ea54335e234dac32836 (diff) | |
download | go-b3885dbc93ceae1b12f7e80edd2696baf566edec.tar.gz go-b3885dbc93ceae1b12f7e80edd2696baf566edec.zip |
cmd/compile, runtime: intrinsify atomic And8 and Or8 on s390x
Intrinsify these functions to match other platforms. Update the
sequence of instructions used in the assembly implementations to
match the intrinsics.
Also, add a micro benchmark so we can more easily measure the
performance of these two functions:
name old time/op new time/op delta
And8-8 5.33ns ± 7% 2.55ns ± 8% -52.12% (p=0.000 n=20+20)
And8Parallel-8 7.39ns ± 5% 3.74ns ± 4% -49.34% (p=0.000 n=20+20)
Or8-8 4.84ns ±15% 2.64ns ±11% -45.50% (p=0.000 n=20+20)
Or8Parallel-8 7.27ns ± 3% 3.84ns ± 4% -47.10% (p=0.000 n=19+20)
By using a 'rotate then xor selected bits' instruction combined with
either a 'load and and' or a 'load and or' instruction we can
implement And8 and Or8 with far fewer instructions. Replacing
'compare and swap' with atomic instructions may also improve
performance when there is contention.
Change-Id: I28bb8032052b73ae8ccdf6e4c612d2877085fa01
Reviewed-on: https://go-review.googlesource.com/c/go/+/204277
Run-TryBot: Michael Munday <mike.munday@ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Diffstat (limited to 'src/runtime/internal')
-rw-r--r-- | src/runtime/internal/atomic/asm_s390x.s | 44 | ||||
-rw-r--r-- | src/runtime/internal/atomic/bench_test.go | 40 |
2 files changed, 57 insertions, 27 deletions
diff --git a/src/runtime/internal/atomic/asm_s390x.s b/src/runtime/internal/atomic/asm_s390x.s
index 78abd48afa..9a19bc0ece 100644
--- a/src/runtime/internal/atomic/asm_s390x.s
+++ b/src/runtime/internal/atomic/asm_s390x.s
@@ -176,37 +176,27 @@ TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
 TEXT ·Or8(SB), NOSPLIT, $0-9
 	MOVD	ptr+0(FP), R3
 	MOVBZ	val+8(FP), R4
-	// Calculate shift.
-	MOVD	R3, R5
-	AND	$3, R5
-	XOR	$3, R5 // big endian - flip direction
-	SLD	$3, R5 // MUL $8, R5
-	SLD	R5, R4
-	// Align ptr down to 4 bytes so we can use 32-bit load/store.
-	AND	$-4, R3
-	MOVWZ	0(R3), R6
-again:
-	OR	R4, R6, R7
-	CS	R6, R7, 0(R3) // if R6==(R3) then (R3)=R7 else R6=(R3)
-	BNE	again
+	// We don't have atomic operations that work on individual bytes so we
+	// need to align addr down to a word boundary and create a mask
+	// containing v to OR with the entire word atomically.
+	MOVD	$(3<<3), R5
+	RXSBG	$59, $60, $3, R3, R5 // R5 = 24 - ((addr % 4) * 8) = ((addr & 3) << 3) ^ (3 << 3)
+	ANDW	$~3, R3              // R3 = floor(addr, 4) = addr &^ 3
+	SLW	R5, R4               // R4 = uint32(v) << R5
+	LAO	R4, R6, 0(R3)        // R6 = *R3; *R3 |= R4; (atomic)
 	RET
 
 // func And8(addr *uint8, v uint8)
 TEXT ·And8(SB), NOSPLIT, $0-9
 	MOVD	ptr+0(FP), R3
 	MOVBZ	val+8(FP), R4
-	// Calculate shift.
-	MOVD	R3, R5
-	AND	$3, R5
-	XOR	$3, R5 // big endian - flip direction
-	SLD	$3, R5 // MUL $8, R5
-	OR	$-256, R4 // create 0xffffffffffffffxx
-	RLLG	R5, R4
-	// Align ptr down to 4 bytes so we can use 32-bit load/store.
-	AND	$-4, R3
-	MOVWZ	0(R3), R6
-again:
-	AND	R4, R6, R7
-	CS	R6, R7, 0(R3) // if R6==(R3) then (R3)=R7 else R6=(R3)
-	BNE	again
+	// We don't have atomic operations that work on individual bytes so we
+	// need to align addr down to a word boundary and create a mask
+	// containing v to AND with the entire word atomically.
+	ORW	$~0xff, R4           // R4 = uint32(v) | 0xffffff00
+	MOVD	$(3<<3), R5
+	RXSBG	$59, $60, $3, R3, R5 // R5 = 24 - ((addr % 4) * 8) = ((addr & 3) << 3) ^ (3 << 3)
+	ANDW	$~3, R3              // R3 = floor(addr, 4) = addr &^ 3
+	RLL	R5, R4, R4           // R4 = rotl(R4, R5)
+	LAN	R4, R6, 0(R3)        // R6 = *R3; *R3 &= R4; (atomic)
 	RET
diff --git a/src/runtime/internal/atomic/bench_test.go b/src/runtime/internal/atomic/bench_test.go
index 083a75cb07..de71b0f2c7 100644
--- a/src/runtime/internal/atomic/bench_test.go
+++ b/src/runtime/internal/atomic/bench_test.go
@@ -43,6 +43,46 @@ func BenchmarkAtomicStore(b *testing.B) {
 	}
 }
 
+func BenchmarkAnd8(b *testing.B) {
+	var x [512]uint8 // give byte its own cache line
+	sink = &x
+	for i := 0; i < b.N; i++ {
+		atomic.And8(&x[255], uint8(i))
+	}
+}
+
+func BenchmarkAnd8Parallel(b *testing.B) {
+	var x [512]uint8 // give byte its own cache line
+	sink = &x
+	b.RunParallel(func(pb *testing.PB) {
+		i := uint8(0)
+		for pb.Next() {
+			atomic.And8(&x[255], i)
+			i++
+		}
+	})
+}
+
+func BenchmarkOr8(b *testing.B) {
+	var x [512]uint8 // give byte its own cache line
+	sink = &x
+	for i := 0; i < b.N; i++ {
+		atomic.Or8(&x[255], uint8(i))
+	}
+}
+
+func BenchmarkOr8Parallel(b *testing.B) {
+	var x [512]uint8 // give byte its own cache line
+	sink = &x
+	b.RunParallel(func(pb *testing.PB) {
+		i := uint8(0)
+		for pb.Next() {
+			atomic.Or8(&x[255], i)
+			i++
+		}
+	})
+}
+
 func BenchmarkXadd(b *testing.B) {
 	var x uint32
 	ptr := &x