runtime: improve memmove performance on arm64

Replace the memmove implementation for moves of 17 bytes or larger with an implementation from ARM optimized software. The moves of 16 bytes or fewer are unchanged, but the registers used are updated to match the rest of the implementation. This implementation makes use of new optimizations: - software pipelined loop for large (>128 byte) moves - medium size moves (17..128 bytes) have a new implementation - address realignment when src or dst is unaligned - preference for aligned src (loads) or dst (stores) depending on CPU To support preference for aligned loads or aligned stores, a new CPU flag is added. This flag indicates that the detected micro architecture performs better with aligned loads. Some tested CPUs did not exhibit a significant difference and are left with the default behavior of realigning based on the destination address (stores). Neoverse N1 (Tested on Graviton 2) name old time/op new time/op delta Memmove/0-4 1.88ns ± 1% 1.87ns ± 1% -0.58% (p=0.020 n=10+10) Memmove/1-4 4.40ns ± 0% 4.40ns ± 0% ~ (all equal) Memmove/8-4 3.88ns ± 3% 3.80ns ± 0% -1.97% (p=0.001 n=10+9) Memmove/16-4 3.90ns ± 3% 3.80ns ± 0% -2.49% (p=0.000 n=10+9) Memmove/32-4 4.80ns ± 0% 4.40ns ± 0% -8.33% (p=0.000 n=9+8) Memmove/64-4 5.86ns ± 0% 5.00ns ± 0% -14.76% (p=0.000 n=8+8) Memmove/128-4 8.46ns ± 0% 8.06ns ± 0% -4.62% (p=0.000 n=10+10) Memmove/256-4 12.4ns ± 0% 12.2ns ± 0% -1.61% (p=0.000 n=10+10) Memmove/512-4 19.5ns ± 0% 19.1ns ± 0% -2.05% (p=0.000 n=10+10) Memmove/1024-4 33.7ns ± 0% 33.5ns ± 0% -0.59% (p=0.000 n=10+10) Memmove/2048-4 62.1ns ± 0% 59.0ns ± 0% -4.99% (p=0.000 n=10+10) Memmove/4096-4 117ns ± 1% 110ns ± 0% -5.66% (p=0.000 n=10+10) MemmoveUnalignedDst/64-4 6.41ns ± 0% 5.62ns ± 0% -12.32% (p=0.000 n=10+7) MemmoveUnalignedDst/128-4 9.40ns ± 0% 8.34ns ± 0% -11.24% (p=0.000 n=10+10) MemmoveUnalignedDst/256-4 12.8ns ± 0% 12.8ns ± 0% ~ (all equal) MemmoveUnalignedDst/512-4 20.4ns ± 0% 19.7ns ± 0% -3.43% (p=0.000 n=9+10) MemmoveUnalignedDst/1024-4 34.1ns ± 0% 35.1ns ± 0% +2.93% (p=0.000 n=9+9) MemmoveUnalignedDst/2048-4 61.5ns ± 0% 60.4ns ± 0% -1.77% (p=0.000 n=10+10) MemmoveUnalignedDst/4096-4 122ns ± 0% 113ns ± 0% -7.38% (p=0.002 n=8+10) MemmoveUnalignedSrc/64-4 7.25ns ± 1% 6.26ns ± 0% -13.64% (p=0.000 n=9+9) MemmoveUnalignedSrc/128-4 10.5ns ± 0% 9.7ns ± 0% -7.52% (p=0.000 n=10+10) MemmoveUnalignedSrc/256-4 17.1ns ± 0% 17.3ns ± 0% +1.17% (p=0.000 n=10+10) MemmoveUnalignedSrc/512-4 27.0ns ± 0% 27.0ns ± 0% ~ (all equal) MemmoveUnalignedSrc/1024-4 46.7ns ± 0% 35.7ns ± 0% -23.55% (p=0.000 n=10+9) MemmoveUnalignedSrc/2048-4 85.2ns ± 0% 61.2ns ± 0% -28.17% (p=0.000 n=10+8) MemmoveUnalignedSrc/4096-4 162ns ± 0% 113ns ± 0% -30.25% (p=0.000 n=10+10) name old speed new speed delta Memmove/4096-4 35.2GB/s ± 0% 37.1GB/s ± 0% +5.56% (p=0.000 n=10+9) MemmoveUnalignedSrc/1024-4 21.9GB/s ± 0% 28.7GB/s ± 0% +30.90% (p=0.000 n=10+10) MemmoveUnalignedSrc/2048-4 24.0GB/s ± 0% 33.5GB/s ± 0% +39.18% (p=0.000 n=10+9) MemmoveUnalignedSrc/4096-4 25.3GB/s ± 0% 36.2GB/s ± 0% +43.50% (p=0.000 n=10+7) Cortex-A72 (Graviton 1) name old time/op new time/op delta Memmove/0-4 3.06ns ± 3% 3.08ns ± 1% ~ (p=0.958 n=10+9) Memmove/1-4 8.72ns ± 0% 7.85ns ± 0% -9.98% (p=0.002 n=8+10) Memmove/8-4 8.29ns ± 0% 8.29ns ± 0% ~ (all equal) Memmove/16-4 8.29ns ± 0% 8.29ns ± 0% ~ (all equal) Memmove/32-4 8.19ns ± 2% 8.29ns ± 0% ~ (p=0.114 n=10+10) Memmove/64-4 18.3ns ± 4% 10.0ns ± 0% -45.36% (p=0.000 n=10+10) Memmove/128-4 14.8ns ± 0% 17.4ns ± 0% +17.77% (p=0.000 n=10+10) Memmove/256-4 21.8ns ± 0% 23.1ns ± 0% +5.96% (p=0.000 n=10+10) Memmove/512-4 35.8ns ± 0% 37.2ns ± 0% +3.91% (p=0.000 n=10+10) Memmove/1024-4 63.7ns ± 0% 67.2ns ± 0% +5.49% (p=0.000 n=10+10) Memmove/2048-4 126ns ± 0% 123ns ± 0% -2.38% (p=0.000 n=10+10) Memmove/4096-4 238ns ± 1% 243ns ± 1% +1.93% (p=0.000 n=10+10) MemmoveUnalignedDst/64-4 19.3ns ± 1% 12.0ns ± 1% -37.49% (p=0.000 n=10+10) MemmoveUnalignedDst/128-4 17.2ns ± 0% 17.4ns ± 0% +1.16% (p=0.000 n=10+10) MemmoveUnalignedDst/256-4 28.2ns ± 8% 29.2ns ± 0% ~ (p=0.352 n=10+10) MemmoveUnalignedDst/512-4 49.8ns ± 3% 48.9ns ± 0% ~ (p=1.000 n=10+10) MemmoveUnalignedDst/1024-4 89.5ns ± 0% 80.5ns ± 1% -10.02% (p=0.000 n=10+10) MemmoveUnalignedDst/2048-4 180ns ± 0% 127ns ± 0% -29.44% (p=0.000 n=9+10) MemmoveUnalignedDst/4096-4 347ns ± 0% 244ns ± 0% -29.59% (p=0.000 n=10+9) MemmoveUnalignedSrc/128-4 16.1ns ± 0% 21.8ns ± 0% +35.40% (p=0.000 n=10+10) MemmoveUnalignedSrc/256-4 24.9ns ± 8% 26.6ns ± 0% +6.70% (p=0.015 n=10+10) MemmoveUnalignedSrc/512-4 39.4ns ± 6% 40.6ns ± 0% ~ (p=0.352 n=10+10) MemmoveUnalignedSrc/1024-4 72.5ns ± 0% 83.0ns ± 1% +14.44% (p=0.000 n=9+10) MemmoveUnalignedSrc/2048-4 129ns ± 1% 128ns ± 1% ~ (p=0.179 n=10+10) MemmoveUnalignedSrc/4096-4 241ns ± 0% 253ns ± 1% +4.99% (p=0.000 n=9+9) Cortex-A53 (Raspberry Pi 3) name old time/op new time/op delta Memmove/0-4 11.0ns ± 0% 11.0ns ± 1% ~ (p=0.294 n=8+10) Memmove/1-4 29.6ns ± 0% 28.0ns ± 1% -5.41% (p=0.000 n=9+10) Memmove/8-4 23.5ns ± 0% 22.1ns ± 0% -6.11% (p=0.000 n=8+8) Memmove/16-4 23.7ns ± 1% 22.1ns ± 0% -6.59% (p=0.000 n=10+8) Memmove/32-4 27.9ns ± 0% 27.1ns ± 0% -3.13% (p=0.000 n=8+8) Memmove/64-4 33.8ns ± 0% 31.5ns ± 1% -6.99% (p=0.000 n=8+10) Memmove/128-4 45.6ns ± 0% 44.2ns ± 1% -3.23% (p=0.000 n=9+10) Memmove/256-4 69.3ns ± 0% 69.3ns ± 0% ~ (p=0.072 n=8+8) Memmove/512-4 127ns ± 0% 110ns ± 0% -13.39% (p=0.000 n=8+8) Memmove/1024-4 222ns ± 0% 205ns ± 1% -7.66% (p=0.000 n=7+10) Memmove/2048-4 411ns ± 0% 366ns ± 0% -10.98% (p=0.000 n=8+9) Memmove/4096-4 795ns ± 1% 695ns ± 1% -12.63% (p=0.000 n=10+10) MemmoveUnalignedDst/64-4 44.0ns ± 0% 40.5ns ± 0% -7.93% (p=0.000 n=8+8) MemmoveUnalignedDst/128-4 59.6ns ± 0% 54.9ns ± 0% -7.85% (p=0.000 n=9+9) MemmoveUnalignedDst/256-4 98.2ns ±11% 90.0ns ± 1% ~ (p=0.130 n=10+10) MemmoveUnalignedDst/512-4 161ns ± 2% 145ns ± 1% -9.96% (p=0.000 n=10+10) MemmoveUnalignedDst/1024-4 281ns ± 0% 265ns ± 0% -5.65% (p=0.000 n=9+8) MemmoveUnalignedDst/2048-4 528ns ± 0% 482ns ± 0% -8.73% (p=0.000 n=8+9) MemmoveUnalignedDst/4096-4 1.02µs ± 1% 0.92µs ± 0% -10.00% (p=0.000 n=10+8) MemmoveUnalignedSrc/64-4 42.4ns ± 1% 40.5ns ± 0% -4.39% (p=0.000 n=10+8) MemmoveUnalignedSrc/128-4 57.4ns ± 0% 57.0ns ± 1% -0.75% (p=0.048 n=9+10) MemmoveUnalignedSrc/256-4 88.1ns ± 1% 89.6ns ± 0% +1.70% (p=0.000 n=9+8) MemmoveUnalignedSrc/512-4 160ns ± 2% 144ns ± 0% -9.89% (p=0.000 n=10+8) MemmoveUnalignedSrc/1024-4 286ns ± 0% 266ns ± 1% -6.69% (p=0.000 n=8+10) MemmoveUnalignedSrc/2048-4 525ns ± 0% 483ns ± 1% -7.96% (p=0.000 n=9+10) MemmoveUnalignedSrc/4096-4 1.01µs ± 0% 0.92µs ± 1% -9.40% (p=0.000 n=8+10) Change-Id: Ia1144e9d4dfafdece6e167c5e576bf80f254c8ab Reviewed-on: https://go-review.googlesource.com/c/go/+/243357 TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Martin Möhrmann <moehrmann@google.com> Reviewed-by: eric fang <eric.fang@arm.com> Reviewed-by: Cherry Zhang <cherryyz@google.com>
author: Jonathan Swinney <jswinney@amazon.com> 2020-10-30 18:46:23 +0000
committer: Cherry Zhang <cherryyz@google.com> 2020-11-02 15:23:43 +0000
commit: d5388e23b5c75bf8189b173051d24a0176a5a303 (patch)
tree: 8d4313273ad941969eb0e8d5b23b323395e507c1 /src/internal
parent: 7be8358f70ff858f28b9aefe11986da25f1762bc (diff)
download: go-d5388e23b5c75bf8189b173051d24a0176a5a303.tar.gz
go-d5388e23b5c75bf8189b173051d24a0176a5a303.zip
3 files changed, 59 insertions, 26 deletions
diff --git a/src/internal/cpu/cpu.go b/src/internal/cpu/cpu.go
index 2829945af0..0ceedcd7d2 100644
--- a/src/internal/cpu/cpu.go
+++ b/src/internal/cpu/cpu.go
@@ -56,32 +56,34 @@ var ARM struct {
 // The booleans in ARM64 contain the correspondingly named cpu feature bit.
 // The struct is padded to avoid false sharing.
 var ARM64 struct {
-	_           CacheLinePad
-	HasFP       bool
-	HasASIMD    bool
-	HasEVTSTRM  bool
-	HasAES      bool
-	HasPMULL    bool
-	HasSHA1     bool
-	HasSHA2     bool
-	HasCRC32    bool
-	HasATOMICS  bool
-	HasFPHP     bool
-	HasASIMDHP  bool
-	HasCPUID    bool
-	HasASIMDRDM bool
-	HasJSCVT    bool
-	HasFCMA     bool
-	HasLRCPC    bool
-	HasDCPOP    bool
-	HasSHA3     bool
-	HasSM3      bool
-	HasSM4      bool
-	HasASIMDDP  bool
-	HasSHA512   bool
-	HasSVE      bool
-	HasASIMDFHM bool
-	_           CacheLinePad
+	_            CacheLinePad
+	HasFP        bool
+	HasASIMD     bool
+	HasEVTSTRM   bool
+	HasAES       bool
+	HasPMULL     bool
+	HasSHA1      bool
+	HasSHA2      bool
+	HasCRC32     bool
+	HasATOMICS   bool
+	HasFPHP      bool
+	HasASIMDHP   bool
+	HasCPUID     bool
+	HasASIMDRDM  bool
+	HasJSCVT     bool
+	HasFCMA      bool
+	HasLRCPC     bool
+	HasDCPOP     bool
+	HasSHA3      bool
+	HasSM3       bool
+	HasSM4       bool
+	HasASIMDDP   bool
+	HasSHA512    bool
+	HasSVE       bool
+	HasASIMDFHM  bool
+	IsNeoverseN1 bool
+	IsZeus       bool
+	_            CacheLinePad
 }
 
 var MIPS64X struct {
diff --git a/src/internal/cpu/cpu_arm64.go b/src/internal/cpu/cpu_arm64.go
index 533bea2470..8fde39f03e 100644
--- a/src/internal/cpu/cpu_arm64.go
+++ b/src/internal/cpu/cpu_arm64.go
@@ -18,6 +18,7 @@ const (
 	hwcap_SHA2    = 1 << 6
 	hwcap_CRC32   = 1 << 7
 	hwcap_ATOMICS = 1 << 8
+	hwcap_CPUID   = 1 << 11
 )
 
 func doinit() {
@@ -28,6 +29,8 @@ func doinit() {
 		{Name: "sha2", Feature: &ARM64.HasSHA2},
 		{Name: "crc32", Feature: &ARM64.HasCRC32},
 		{Name: "atomics", Feature: &ARM64.HasATOMICS},
+		{Name: "isNeoverseN1", Feature: &ARM64.IsNeoverseN1},
+		{Name: "isZeus", Feature: &ARM64.IsZeus},
 	}
 
 	switch GOOS {
@@ -40,12 +43,32 @@ func doinit() {
 		ARM64.HasSHA1 = isSet(HWCap, hwcap_SHA1)
 		ARM64.HasSHA2 = isSet(HWCap, hwcap_SHA2)
 		ARM64.HasCRC32 = isSet(HWCap, hwcap_CRC32)
+		ARM64.HasCPUID = isSet(HWCap, hwcap_CPUID)
 
 		// The Samsung S9+ kernel reports support for atomics, but not all cores
 		// actually support them, resulting in SIGILL. See issue #28431.
 		// TODO(elias.naur): Only disable the optimization on bad chipsets on android.
 		ARM64.HasATOMICS = isSet(HWCap, hwcap_ATOMICS) && GOOS != "android"
 
+		// Check to see if executing on a NeoverseN1 and in order to do that,
+		// check the AUXV for the CPUID bit. The getMIDR function executes an
+		// instruction which would normally be an illegal instruction, but it's
+		// trapped by the kernel, the value sanitized and then returned. Without
+		// the CPUID bit the kernel will not trap the instruction and the process
+		// will be terminated with SIGILL.
+		if ARM64.HasCPUID {
+			midr := getMIDR()
+			part_num := uint16((midr >> 4) & 0xfff)
+			implementor := byte((midr >> 24) & 0xff)
+
+			if implementor == 'A' && part_num == 0xd0c {
+				ARM64.IsNeoverseN1 = true
+			}
+			if implementor == 'A' && part_num == 0xd40 {
+				ARM64.IsZeus = true
+			}
+		}
+
 	case "freebsd":
 		// Retrieve info from system register ID_AA64ISAR0_EL1.
 		isar0 := getisar0()
@@ -93,3 +116,5 @@ func isSet(hwc uint, value uint) bool {
 }
 
 func getisar0() uint64
+
+func getMIDR() uint64
diff --git a/src/internal/cpu/cpu_arm64.s b/src/internal/cpu/cpu_arm64.s
index d85914973f..d6e7f44373 100644
--- a/src/internal/cpu/cpu_arm64.s
+++ b/src/internal/cpu/cpu_arm64.s
@@ -10,3 +10,9 @@ TEXT ·getisar0(SB),NOSPLIT,$0
 	MRS	ID_AA64ISAR0_EL1, R0
 	MOVD	R0, ret+0(FP)
 	RET
+
+// func getMIDR() uint64
+TEXT ·getMIDR(SB), NOSPLIT, $0-8
+	MRS	MIDR_EL1, R0
+	MOVD	R0, ret+0(FP)
+	RET
author	Jonathan Swinney <jswinney@amazon.com>	2020-10-30 18:46:23 +0000
committer	Cherry Zhang <cherryyz@google.com>	2020-11-02 15:23:43 +0000
commit	d5388e23b5c75bf8189b173051d24a0176a5a303 (patch)
tree	8d4313273ad941969eb0e8d5b23b323395e507c1 /src/internal
parent	7be8358f70ff858f28b9aefe11986da25f1762bc (diff)
download	go-d5388e23b5c75bf8189b173051d24a0176a5a303.tar.gz go-d5388e23b5c75bf8189b173051d24a0176a5a303.zip