diff options
author | Keith Randall <khr@golang.org> | 2017-01-23 08:22:10 -0800 |
---|---|---|
committer | Keith Randall <khr@golang.org> | 2017-01-23 19:39:22 +0000 |
commit | a96e117a58dce1d55fd83a7b3391fa667dd66652 (patch) | |
tree | 0d600d2367a82eb542b4f53a18cd25b347eb06ec | |
parent | 4cce27a3fa0cc1f13afa6ffa358efa07144e00ec (diff) | |
download | go-a96e117a58dce1d55fd83a7b3391fa667dd66652.tar.gz go-a96e117a58dce1d55fd83a7b3391fa667dd66652.zip |
runtime: amd64, use 4-byte ops for memmove of 4 bytes
memmove used to use 2 2-byte load/store pairs to move 4 bytes.
When the result is loaded with a single 4-byte load, it caused
a store to load fowarding stall. To avoid the stall,
special case memmove to use 4 byte ops for the 4 byte copy case.
We already have a special case for 8-byte copies.
386 already specializes 4-byte copies.
I'll do 2-byte copies also, but not for 1.8.
benchmark old ns/op new ns/op delta
BenchmarkIssue18740-8 7567 4799 -36.58%
3-byte copies get a bit slower. Other copies are unchanged.
name old time/op new time/op delta
Memmove/3-8 4.76ns ± 5% 5.26ns ± 3% +10.50% (p=0.000 n=10+10)
Fixes #18740
Change-Id: Iec82cbac0ecfee80fa3c8fc83828f9a1819c3c74
Reviewed-on: https://go-review.googlesource.com/35567
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: David Chase <drchase@google.com>
-rw-r--r-- | src/runtime/memmove_amd64.s | 10 | ||||
-rw-r--r-- | src/runtime/memmove_test.go | 20 |
2 files changed, 28 insertions, 2 deletions
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s index 464f5fdc1b..c2286d3edd 100644 --- a/src/runtime/memmove_amd64.s +++ b/src/runtime/memmove_amd64.s @@ -146,10 +146,16 @@ move_1or2: move_0: RET move_3or4: + CMPQ BX, $4 + JB move_3 + MOVL (SI), AX + MOVL AX, (DI) + RET +move_3: MOVW (SI), AX - MOVW -2(SI)(BX*1), CX + MOVB 2(SI), CX MOVW AX, (DI) - MOVW CX, -2(DI)(BX*1) + MOVB CX, 2(DI) RET move_5through7: MOVL (SI), AX diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go index dbfa284c28..74b8753b5f 100644 --- a/src/runtime/memmove_test.go +++ b/src/runtime/memmove_test.go @@ -6,6 +6,7 @@ package runtime_test import ( "crypto/rand" + "encoding/binary" "fmt" "internal/race" . "runtime" @@ -447,3 +448,22 @@ func BenchmarkCopyFat1024(b *testing.B) { _ = y } } + +func BenchmarkIssue18740(b *testing.B) { + // This tests that memmove uses one 4-byte load/store to move 4 bytes. + // It used to do 2 2-byte load/stores, which leads to a pipeline stall + // when we try to read the result with one 4-byte load. + var buf [4]byte + for j := 0; j < b.N; j++ { + s := uint32(0) + for i := 0; i < 4096; i += 4 { + copy(buf[:], g[i:]) + s += binary.LittleEndian.Uint32(buf[:]) + } + sink = uint64(s) + } +} + +// TODO: 2 byte and 8 byte benchmarks also. + +var g [4096]byte |