From 03cd8a7b0e47f2141a1dbdad9fdce175942d5515 Mon Sep 17 00:00:00 2001 From: Mark Ryan Date: Fri, 2 Jun 2023 13:13:29 +0200 Subject: internal/bytealg: fix alignment code in equal_riscv64.s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The riscv64 implementation of equal has an optimization that is applied when both pointers share the same alignment but that alignment is not 8 bytes. In this case it tries to align both pointers to an 8 byte boundaries, by individually comparing the first few bytes of each buffer. Unfortunately, the existing code is incorrect. It adjusts the pointers by the wrong number of bytes resulting, in most cases, in pointers that are not 8 byte aligned. This commit fixes the issue by individually comparing the first (8 - (pointer & 7)) bytes of each buffer rather than the first (pointer & 7) bytes. This particular optimization is not covered by any of the existing benchmarks so a new benchmark, BenchmarkEqualBothUnaligned, is provided. The benchmark tests the case where both pointers have the same alignment but may not be 8 byte aligned. Results of the new benchmark along with some of the existing benchmarks generated on a SiFive HiFive Unmatched A00 with 16GB of RAM running Ubuntu 23.04 are presented below. Equal/0-4 3.356n ± 0% 3.357n ± 0% ~ (p=0.840 n=10) Equal/1-4 63.91n ± 7% 65.97n ± 5% +3.22% (p=0.029 n=10) Equal/6-4 72.94n ± 5% 76.09n ± 4% ~ (p=0.075 n=10) Equal/9-4 84.61n ± 7% 85.83n ± 3% ~ (p=0.315 n=10) Equal/15-4 103.7n ± 2% 102.9n ± 4% ~ (p=0.739 n=10) Equal/16-4 89.14n ± 3% 100.40n ± 4% +12.64% (p=0.000 n=10) Equal/20-4 107.8n ± 3% 106.8n ± 3% ~ (p=0.725 n=10) Equal/32-4 63.95n ± 8% 67.79n ± 7% ~ (p=0.089 n=10) Equal/4K-4 1.256µ ± 1% 1.254µ ± 0% ~ (p=0.925 n=10) Equal/4M-4 1.231m ± 0% 1.230m ± 0% -0.04% (p=0.011 n=10) Equal/64M-4 19.77m ± 0% 19.78m ± 0% ~ (p=0.052 n=10) EqualBothUnaligned/64_0-4 43.70n ± 4% 44.40n ± 5% ~ (p=0.529 n=10) EqualBothUnaligned/64_1-4 6957.5n ± 0% 105.9n ± 1% -98.48% (p=0.000 n=10) EqualBothUnaligned/64_4-4 100.1n ± 2% 101.5n ± 4% ~ (p=0.149 n=10) EqualBothUnaligned/64_7-4 6965.00n ± 0% 95.60n ± 4% -98.63% (p=0.000 n=10) EqualBothUnaligned/4096_0-4 1.233µ ± 1% 1.225µ ± 0% -0.65% (p=0.015 n=10) EqualBothUnaligned/4096_1-4 584.226µ ± 0% 1.277µ ± 0% -99.78% (p=0.000 n=10) EqualBothUnaligned/4096_4-4 1.270µ ± 1% 1.268µ ± 0% ~ (p=0.105 n=10) EqualBothUnaligned/4096_7-4 584.944µ ± 0% 1.266µ ± 1% -99.78% (p=0.000 n=10) EqualBothUnaligned/4194304_0-4 1.241m ± 0% 1.236m ± 0% -0.38% (p=0.035 n=10) EqualBothUnaligned/4194304_1-4 600.956m ± 0% 1.238m ± 0% -99.79% (p=0.000 n=10) EqualBothUnaligned/4194304_4-4 1.239m ± 0% 1.241m ± 0% +0.22% (p=0.007 n=10) EqualBothUnaligned/4194304_7-4 601.036m ± 0% 1.239m ± 0% -99.79% (p=0.000 n=10) EqualBothUnaligned/67108864_0-4 19.79m ± 0% 19.78m ± 0% ~ (p=0.393 n=10) EqualBothUnaligned/67108864_1-4 9616.61m ± 0% 19.82m ± 0% -99.79% (p=0.000 n=10) EqualBothUnaligned/67108864_4-4 19.82m ± 0% 19.82m ± 0% ~ (p=0.971 n=10) EqualBothUnaligned/67108864_7-4 9616.34m ± 0% 19.86m ± 0% -99.79% (p=0.000 n=10) geomean 38.38µ 7.194µ -81.26% Change-Id: I4caab6c3450bd7e2773426b08b70bbc37fbe4e5f Reviewed-on: https://go-review.googlesource.com/c/go/+/500855 TryBot-Result: Gopher Robot Reviewed-by: Keith Randall Reviewed-by: Keith Randall Run-TryBot: Keith Randall Reviewed-by: Cherry Mui --- src/bytes/bytes_test.go | 32 ++++++++++++++++++++++++++++++++ src/internal/bytealg/equal_riscv64.s | 2 ++ 2 files changed, 34 insertions(+) diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go index 05c0090b61..f0733edd3f 100644 --- a/src/bytes/bytes_test.go +++ b/src/bytes/bytes_test.go @@ -652,6 +652,38 @@ func bmEqual(equal func([]byte, []byte) bool) func(b *testing.B, n int) { } } +func BenchmarkEqualBothUnaligned(b *testing.B) { + sizes := []int{64, 4 << 10} + if !isRaceBuilder { + sizes = append(sizes, []int{4 << 20, 64 << 20}...) + } + maxSize := 2 * (sizes[len(sizes)-1] + 8) + if len(bmbuf) < maxSize { + bmbuf = make([]byte, maxSize) + } + + for _, n := range sizes { + for _, off := range []int{0, 1, 4, 7} { + buf1 := bmbuf[off : off+n] + buf2Start := (len(bmbuf) / 2) + off + buf2 := bmbuf[buf2Start : buf2Start+n] + buf1[n-1] = 'x' + buf2[n-1] = 'x' + b.Run(fmt.Sprint(n, off), func(b *testing.B) { + b.SetBytes(int64(n)) + for i := 0; i < b.N; i++ { + eq := Equal(buf1, buf2) + if !eq { + b.Fatal("bad equal") + } + } + }) + buf1[n-1] = '\x00' + buf2[n-1] = '\x00' + } + } +} + func BenchmarkIndex(b *testing.B) { benchBytes(b, indexSizes, func(b *testing.B, n int) { buf := bmbuf[0:n] diff --git a/src/internal/bytealg/equal_riscv64.s b/src/internal/bytealg/equal_riscv64.s index 3834083ec9..503aac5751 100644 --- a/src/internal/bytealg/equal_riscv64.s +++ b/src/internal/bytealg/equal_riscv64.s @@ -37,6 +37,8 @@ TEXT memequal<>(SB),NOSPLIT|NOFRAME,$0 BEQZ X9, loop32_check // Check one byte at a time until we reach 8 byte alignment. + SUB X9, X0, X9 + ADD $8, X9, X9 SUB X9, X12, X12 align: ADD $-1, X9 -- cgit v1.2.3-54-g00ecf