diff options
author | Jonathan Swinney <jswinney@amazon.com> | 2020-10-30 18:46:23 +0000 |
---|---|---|
committer | Cherry Zhang <cherryyz@google.com> | 2020-11-02 15:23:43 +0000 |
commit | d5388e23b5c75bf8189b173051d24a0176a5a303 (patch) | |
tree | 8d4313273ad941969eb0e8d5b23b323395e507c1 /src/runtime/memmove_test.go | |
parent | 7be8358f70ff858f28b9aefe11986da25f1762bc (diff) | |
download | go-d5388e23b5c75bf8189b173051d24a0176a5a303.tar.gz go-d5388e23b5c75bf8189b173051d24a0176a5a303.zip |
runtime: improve memmove performance on arm64
Replace the memmove implementation for moves of 17 bytes or larger
with an implementation from ARM optimized software. The moves of 16
bytes or fewer are unchanged, but the registers used are updated to
match the rest of the implementation.
This implementation makes use of new optimizations:
- software pipelined loop for large (>128 byte) moves
- medium size moves (17..128 bytes) have a new implementation
- address realignment when src or dst is unaligned
- preference for aligned src (loads) or dst (stores) depending on CPU
To support preference for aligned loads or aligned stores, a new CPU
flag is added. This flag indicates that the detected micro
architecture performs better with aligned loads. Some tested CPUs did
not exhibit a significant difference and are left with the default
behavior of realigning based on the destination address (stores).
Neoverse N1 (Tested on Graviton 2)
name old time/op new time/op delta
Memmove/0-4 1.88ns ± 1% 1.87ns ± 1% -0.58% (p=0.020 n=10+10)
Memmove/1-4 4.40ns ± 0% 4.40ns ± 0% ~ (all equal)
Memmove/8-4 3.88ns ± 3% 3.80ns ± 0% -1.97% (p=0.001 n=10+9)
Memmove/16-4 3.90ns ± 3% 3.80ns ± 0% -2.49% (p=0.000 n=10+9)
Memmove/32-4 4.80ns ± 0% 4.40ns ± 0% -8.33% (p=0.000 n=9+8)
Memmove/64-4 5.86ns ± 0% 5.00ns ± 0% -14.76% (p=0.000 n=8+8)
Memmove/128-4 8.46ns ± 0% 8.06ns ± 0% -4.62% (p=0.000 n=10+10)
Memmove/256-4 12.4ns ± 0% 12.2ns ± 0% -1.61% (p=0.000 n=10+10)
Memmove/512-4 19.5ns ± 0% 19.1ns ± 0% -2.05% (p=0.000 n=10+10)
Memmove/1024-4 33.7ns ± 0% 33.5ns ± 0% -0.59% (p=0.000 n=10+10)
Memmove/2048-4 62.1ns ± 0% 59.0ns ± 0% -4.99% (p=0.000 n=10+10)
Memmove/4096-4 117ns ± 1% 110ns ± 0% -5.66% (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4 6.41ns ± 0% 5.62ns ± 0% -12.32% (p=0.000 n=10+7)
MemmoveUnalignedDst/128-4 9.40ns ± 0% 8.34ns ± 0% -11.24% (p=0.000 n=10+10)
MemmoveUnalignedDst/256-4 12.8ns ± 0% 12.8ns ± 0% ~ (all equal)
MemmoveUnalignedDst/512-4 20.4ns ± 0% 19.7ns ± 0% -3.43% (p=0.000 n=9+10)
MemmoveUnalignedDst/1024-4 34.1ns ± 0% 35.1ns ± 0% +2.93% (p=0.000 n=9+9)
MemmoveUnalignedDst/2048-4 61.5ns ± 0% 60.4ns ± 0% -1.77% (p=0.000 n=10+10)
MemmoveUnalignedDst/4096-4 122ns ± 0% 113ns ± 0% -7.38% (p=0.002 n=8+10)
MemmoveUnalignedSrc/64-4 7.25ns ± 1% 6.26ns ± 0% -13.64% (p=0.000 n=9+9)
MemmoveUnalignedSrc/128-4 10.5ns ± 0% 9.7ns ± 0% -7.52% (p=0.000 n=10+10)
MemmoveUnalignedSrc/256-4 17.1ns ± 0% 17.3ns ± 0% +1.17% (p=0.000 n=10+10)
MemmoveUnalignedSrc/512-4 27.0ns ± 0% 27.0ns ± 0% ~ (all equal)
MemmoveUnalignedSrc/1024-4 46.7ns ± 0% 35.7ns ± 0% -23.55% (p=0.000 n=10+9)
MemmoveUnalignedSrc/2048-4 85.2ns ± 0% 61.2ns ± 0% -28.17% (p=0.000 n=10+8)
MemmoveUnalignedSrc/4096-4 162ns ± 0% 113ns ± 0% -30.25% (p=0.000 n=10+10)
name old speed new speed delta
Memmove/4096-4 35.2GB/s ± 0% 37.1GB/s ± 0% +5.56% (p=0.000 n=10+9)
MemmoveUnalignedSrc/1024-4 21.9GB/s ± 0% 28.7GB/s ± 0% +30.90% (p=0.000 n=10+10)
MemmoveUnalignedSrc/2048-4 24.0GB/s ± 0% 33.5GB/s ± 0% +39.18% (p=0.000 n=10+9)
MemmoveUnalignedSrc/4096-4 25.3GB/s ± 0% 36.2GB/s ± 0% +43.50% (p=0.000 n=10+7)
Cortex-A72 (Graviton 1)
name old time/op new time/op delta
Memmove/0-4 3.06ns ± 3% 3.08ns ± 1% ~ (p=0.958 n=10+9)
Memmove/1-4 8.72ns ± 0% 7.85ns ± 0% -9.98% (p=0.002 n=8+10)
Memmove/8-4 8.29ns ± 0% 8.29ns ± 0% ~ (all equal)
Memmove/16-4 8.29ns ± 0% 8.29ns ± 0% ~ (all equal)
Memmove/32-4 8.19ns ± 2% 8.29ns ± 0% ~ (p=0.114 n=10+10)
Memmove/64-4 18.3ns ± 4% 10.0ns ± 0% -45.36% (p=0.000 n=10+10)
Memmove/128-4 14.8ns ± 0% 17.4ns ± 0% +17.77% (p=0.000 n=10+10)
Memmove/256-4 21.8ns ± 0% 23.1ns ± 0% +5.96% (p=0.000 n=10+10)
Memmove/512-4 35.8ns ± 0% 37.2ns ± 0% +3.91% (p=0.000 n=10+10)
Memmove/1024-4 63.7ns ± 0% 67.2ns ± 0% +5.49% (p=0.000 n=10+10)
Memmove/2048-4 126ns ± 0% 123ns ± 0% -2.38% (p=0.000 n=10+10)
Memmove/4096-4 238ns ± 1% 243ns ± 1% +1.93% (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4 19.3ns ± 1% 12.0ns ± 1% -37.49% (p=0.000 n=10+10)
MemmoveUnalignedDst/128-4 17.2ns ± 0% 17.4ns ± 0% +1.16% (p=0.000 n=10+10)
MemmoveUnalignedDst/256-4 28.2ns ± 8% 29.2ns ± 0% ~ (p=0.352 n=10+10)
MemmoveUnalignedDst/512-4 49.8ns ± 3% 48.9ns ± 0% ~ (p=1.000 n=10+10)
MemmoveUnalignedDst/1024-4 89.5ns ± 0% 80.5ns ± 1% -10.02% (p=0.000 n=10+10)
MemmoveUnalignedDst/2048-4 180ns ± 0% 127ns ± 0% -29.44% (p=0.000 n=9+10)
MemmoveUnalignedDst/4096-4 347ns ± 0% 244ns ± 0% -29.59% (p=0.000 n=10+9)
MemmoveUnalignedSrc/128-4 16.1ns ± 0% 21.8ns ± 0% +35.40% (p=0.000 n=10+10)
MemmoveUnalignedSrc/256-4 24.9ns ± 8% 26.6ns ± 0% +6.70% (p=0.015 n=10+10)
MemmoveUnalignedSrc/512-4 39.4ns ± 6% 40.6ns ± 0% ~ (p=0.352 n=10+10)
MemmoveUnalignedSrc/1024-4 72.5ns ± 0% 83.0ns ± 1% +14.44% (p=0.000 n=9+10)
MemmoveUnalignedSrc/2048-4 129ns ± 1% 128ns ± 1% ~ (p=0.179 n=10+10)
MemmoveUnalignedSrc/4096-4 241ns ± 0% 253ns ± 1% +4.99% (p=0.000 n=9+9)
Cortex-A53 (Raspberry Pi 3)
name old time/op new time/op delta
Memmove/0-4 11.0ns ± 0% 11.0ns ± 1% ~ (p=0.294 n=8+10)
Memmove/1-4 29.6ns ± 0% 28.0ns ± 1% -5.41% (p=0.000 n=9+10)
Memmove/8-4 23.5ns ± 0% 22.1ns ± 0% -6.11% (p=0.000 n=8+8)
Memmove/16-4 23.7ns ± 1% 22.1ns ± 0% -6.59% (p=0.000 n=10+8)
Memmove/32-4 27.9ns ± 0% 27.1ns ± 0% -3.13% (p=0.000 n=8+8)
Memmove/64-4 33.8ns ± 0% 31.5ns ± 1% -6.99% (p=0.000 n=8+10)
Memmove/128-4 45.6ns ± 0% 44.2ns ± 1% -3.23% (p=0.000 n=9+10)
Memmove/256-4 69.3ns ± 0% 69.3ns ± 0% ~ (p=0.072 n=8+8)
Memmove/512-4 127ns ± 0% 110ns ± 0% -13.39% (p=0.000 n=8+8)
Memmove/1024-4 222ns ± 0% 205ns ± 1% -7.66% (p=0.000 n=7+10)
Memmove/2048-4 411ns ± 0% 366ns ± 0% -10.98% (p=0.000 n=8+9)
Memmove/4096-4 795ns ± 1% 695ns ± 1% -12.63% (p=0.000 n=10+10)
MemmoveUnalignedDst/64-4 44.0ns ± 0% 40.5ns ± 0% -7.93% (p=0.000 n=8+8)
MemmoveUnalignedDst/128-4 59.6ns ± 0% 54.9ns ± 0% -7.85% (p=0.000 n=9+9)
MemmoveUnalignedDst/256-4 98.2ns ±11% 90.0ns ± 1% ~ (p=0.130 n=10+10)
MemmoveUnalignedDst/512-4 161ns ± 2% 145ns ± 1% -9.96% (p=0.000 n=10+10)
MemmoveUnalignedDst/1024-4 281ns ± 0% 265ns ± 0% -5.65% (p=0.000 n=9+8)
MemmoveUnalignedDst/2048-4 528ns ± 0% 482ns ± 0% -8.73% (p=0.000 n=8+9)
MemmoveUnalignedDst/4096-4 1.02µs ± 1% 0.92µs ± 0% -10.00% (p=0.000 n=10+8)
MemmoveUnalignedSrc/64-4 42.4ns ± 1% 40.5ns ± 0% -4.39% (p=0.000 n=10+8)
MemmoveUnalignedSrc/128-4 57.4ns ± 0% 57.0ns ± 1% -0.75% (p=0.048 n=9+10)
MemmoveUnalignedSrc/256-4 88.1ns ± 1% 89.6ns ± 0% +1.70% (p=0.000 n=9+8)
MemmoveUnalignedSrc/512-4 160ns ± 2% 144ns ± 0% -9.89% (p=0.000 n=10+8)
MemmoveUnalignedSrc/1024-4 286ns ± 0% 266ns ± 1% -6.69% (p=0.000 n=8+10)
MemmoveUnalignedSrc/2048-4 525ns ± 0% 483ns ± 1% -7.96% (p=0.000 n=9+10)
MemmoveUnalignedSrc/4096-4 1.01µs ± 0% 0.92µs ± 1% -9.40% (p=0.000 n=8+10)
Change-Id: Ia1144e9d4dfafdece6e167c5e576bf80f254c8ab
Reviewed-on: https://go-review.googlesource.com/c/go/+/243357
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Martin Möhrmann <moehrmann@google.com>
Reviewed-by: eric fang <eric.fang@arm.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Diffstat (limited to 'src/runtime/memmove_test.go')
-rw-r--r-- | src/runtime/memmove_test.go | 30 |
1 files changed, 30 insertions, 0 deletions
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go index b549433f71..7c9d2ada45 100644 --- a/src/runtime/memmove_test.go +++ b/src/runtime/memmove_test.go @@ -286,6 +286,9 @@ var bufSizes = []int{ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, } +var bufSizesOverlap = []int{ + 32, 64, 128, 256, 512, 1024, 2048, 4096, +} func BenchmarkMemmove(b *testing.B) { benchmarkSizes(b, bufSizes, func(b *testing.B, n int) { @@ -297,6 +300,15 @@ func BenchmarkMemmove(b *testing.B) { }) } +func BenchmarkMemmoveOverlap(b *testing.B) { + benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) { + x := make([]byte, n+16) + for i := 0; i < b.N; i++ { + copy(x[16:n+16], x[:n]) + } + }) +} + func BenchmarkMemmoveUnalignedDst(b *testing.B) { benchmarkSizes(b, bufSizes, func(b *testing.B, n int) { x := make([]byte, n+1) @@ -307,6 +319,15 @@ func BenchmarkMemmoveUnalignedDst(b *testing.B) { }) } +func BenchmarkMemmoveUnalignedDstOverlap(b *testing.B) { + benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) { + x := make([]byte, n+16) + for i := 0; i < b.N; i++ { + copy(x[16:n+16], x[1:n+1]) + } + }) +} + func BenchmarkMemmoveUnalignedSrc(b *testing.B) { benchmarkSizes(b, bufSizes, func(b *testing.B, n int) { x := make([]byte, n) @@ -317,6 +338,15 @@ func BenchmarkMemmoveUnalignedSrc(b *testing.B) { }) } +func BenchmarkMemmoveUnalignedSrcOverlap(b *testing.B) { + benchmarkSizes(b, bufSizesOverlap, func(b *testing.B, n int) { + x := make([]byte, n+1) + for i := 0; i < b.N; i++ { + copy(x[1:n+1], x[:n]) + } + }) +} + func TestMemclr(t *testing.T) { size := 512 if testing.Short() { |