aboutsummaryrefslogtreecommitdiff
path: root/src/internal
diff options
context:
space:
mode:
author Austin Clements <austin@google.com> 2021-04-14 19:15:42 -0400
committer Austin Clements <austin@google.com> 2021-04-15 04:10:33 +0000
commit 8f4c5068e07a03e16998b6d8d38a0482433fc7fe (patch)
tree 119a7b08b79aaa90c8335e72c11b34aff850484a /src/internal
parent 48b7432e3f5318a026842fc4f39fb690e13f79f8 (diff)
download go-8f4c5068e07a03e16998b6d8d38a0482433fc7fe.tar.gz
go-8f4c5068e07a03e16998b6d8d38a0482433fc7fe.zip
internal/bytealg: port more performance-critical functions to ABIInternal
CL 308931 ported several runtime assembly functions to ABIInternal so that compiler-generated ABIInternal calls don't go through ABI wrappers, but it missed the runtime assembly functions that are actually defined in internal/bytealg.

This eliminates the cost of wrappers for the BleveQuery and GopherLuaKNucleotide benchmarks, but there's still more to do for Tile38.

                                   0-base        1-wrappers
                                   sec/op        sec/op       vs base
BleveQuery                          6.507 ± 0%    6.477 ± 0%  -0.46% (p=0.004 n=20)
GopherLuaKNucleotide                30.39 ± 1%    30.34 ± 0%       ~ (p=0.301 n=20)
Tile38IntersectsCircle100kmRequest 1.038m ± 1%   1.080m ± 2%  +4.03% (p=0.000 n=20)

For #40724.

Change-Id: I0b722443f684fcb997b1d70802c5ed4b8d8f9829
Reviewed-on: https://go-review.googlesource.com/c/go/+/310184
Trust: Austin Clements <austin@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Diffstat (limited to 'src/internal')
-rw-r--r-- src/internal/bytealg/compare_amd64.s 38
-rw-r--r-- src/internal/bytealg/equal_amd64.s 70
2 files changed, 99 insertions, 9 deletions
diff --git a/src/internal/bytealg/compare_amd64.s b/src/internal/bytealg/compare_amd64.s
index 900b92a21e..8295acb03a 100644
--- a/src/internal/bytealg/compare_amd64.s
+++ b/src/internal/bytealg/compare_amd64.s
@@ -5,20 +5,41 @@
#include "go_asm.h"
#include "textflag.h"
// Compare entry stub: places a_base, a_len, b_base and b_len in SI, BX, DI
// and DX, then jumps to cmpbody<>, which produces the -1/0/+1 result.
-TEXT ·Compare(SB),NOSPLIT,$0-56
+TEXT ·Compare<ABIInternal>(SB),NOSPLIT,$0-56
+#ifdef GOEXPERIMENT_regabiargs
+ // AX = a_base (want in SI)
+ // BX = a_len (want in BX)
+ // CX = a_cap (unused)
+ // DI = b_base (want in DI)
+ // SI = b_len (want in DX)
+ // R8 = b_cap (unused)
+ MOVQ SI, DX // b_len: SI -> DX; done first because the next move overwrites SI
+ MOVQ AX, SI // a_base: AX -> SI; BX and DI already hold a_len and b_base
+#else
MOVQ a_base+0(FP), SI
MOVQ a_len+8(FP), BX
MOVQ b_base+24(FP), DI
MOVQ b_len+32(FP), DX
LEAQ ret+48(FP), R9 // R9 = address of the result slot; cmpbody<> stores the result through it
+#endif
JMP cmpbody<>(SB) // jump, not call: cmpbody<>'s RET returns directly to this function's caller
// runtime·cmpstring entry stub: places a_base, a_len, b_base and b_len in
// SI, BX, DI and DX, then jumps to cmpbody<>, which produces the result.
-TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
+TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
+#ifdef GOEXPERIMENT_regabiargs
+ // AX = a_base (want in SI)
+ // BX = a_len (want in BX)
+ // CX = b_base (want in DI)
+ // DI = b_len (want in DX)
+ MOVQ AX, SI // a_base: AX -> SI
+ MOVQ DI, DX // b_len: DI -> DX; must precede the next move, which overwrites DI
+ MOVQ CX, DI // b_base: CX -> DI
+#else
MOVQ a_base+0(FP), SI
MOVQ a_len+8(FP), BX
MOVQ b_base+16(FP), DI
MOVQ b_len+24(FP), DX
LEAQ ret+32(FP), R9 // R9 = address of the result slot; cmpbody<> stores the result through it
+#endif
JMP cmpbody<>(SB) // jump, not call: cmpbody<>'s RET returns directly to this function's caller
// cmpbody<> is the shared comparison body that ·Compare and runtime·cmpstring
// jump to. Only the parts of it touched by this diff are visible here.
// input:
@@ -26,7 +47,12 @@ TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
// DI = b
// BX = alen
// DX = blen
+#ifndef GOEXPERIMENT_regabiargs
// R9 = address of output word (stores -1/0/1 here)
+#else
+// output:
+// AX = output (-1/0/1)
+#endif
TEXT cmpbody<>(SB),NOSPLIT,$0-0
CMPQ SI, DI // do a and b start at the same address?
JEQ allsame // yes: skip the byte comparison and go to allsame
@@ -74,7 +100,9 @@ diff16:
CMPB CX, (DI)(BX*1) // compare CX with b's byte at index BX (CX and BX are set up in lines not shown in this hunk)
SETHI AX // low byte of AX = 1 if CX is higher (unsigned), else 0
LEAQ -1(AX*2), AX // convert 1/0 to +1/-1
+#ifndef GOEXPERIMENT_regabiargs
MOVQ AX, (R9) // stack ABI only: store the result through R9; with regabiargs it is simply left in AX
+#endif
RET
// 0 through 16 bytes left, alen>=8, blen>=8
@@ -100,7 +128,9 @@ diff8:
SHRQ CX, AX // move a's bit to bottom
ANDQ $1, AX // mask bit
LEAQ -1(AX*2), AX // 1/0 => +1/-1
+#ifndef GOEXPERIMENT_regabiargs
MOVQ AX, (R9) // stack ABI only: store the result through R9
+#endif
RET
// 0-7 bytes in common
@@ -139,7 +169,9 @@ di_finish:
SHRQ CX, SI // move a's bit to bottom
ANDQ $1, SI // mask bit
LEAQ -1(SI*2), AX // 1/0 => +1/-1
+#ifndef GOEXPERIMENT_regabiargs
MOVQ AX, (R9) // stack ABI only: store the result through R9
+#endif
RET
allsame:
@@ -149,7 +181,9 @@ allsame:
SETGT AX // 1 if alen > blen
SETEQ CX // 1 if alen == blen
LEAQ -1(CX)(AX*2), AX // AX = 2*AX + CX - 1, giving the 1,0,-1 result
+#ifndef GOEXPERIMENT_regabiargs
MOVQ AX, (R9) // stack ABI only: store the result through R9
+#endif
RET
// this works for >= 64 bytes of data.
diff --git a/src/internal/bytealg/equal_amd64.s b/src/internal/bytealg/equal_amd64.s
index c816409545..6f12d2a169 100644
--- a/src/internal/bytealg/equal_amd64.s
+++ b/src/internal/bytealg/equal_amd64.s
@@ -6,7 +6,21 @@
#include "textflag.h"
// memequal(a, b unsafe.Pointer, size uintptr) bool
// Entry stub. On the regabiargs path it returns 1 at once when a and b are the
// same pointer, and otherwise moves the arguments into SI/DI/BX and jumps to
// memeqbody<>. The stack-ABI path is only partly visible in this diff.
-TEXT runtime·memequal(SB),NOSPLIT,$0-25
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT,$0-25
+#ifdef GOEXPERIMENT_regabiargs
+ // AX = a (want in SI)
+ // BX = b (want in DI)
+ // CX = size (want in BX)
+ CMPQ AX, BX // same pointer?
+ JNE neq // no: the memory has to be compared
+ MOVQ $1, AX // return 1
+ RET
+neq:
+ MOVQ AX, SI // a: AX -> SI
+ MOVQ BX, DI // b: BX -> DI; must precede the next move, which overwrites BX
+ MOVQ CX, BX // size: CX -> BX
+ JMP memeqbody<>(SB) // jump, not call: memeqbody<>'s RET returns directly to this function's caller
+#else
MOVQ a+0(FP), SI
MOVQ b+8(FP), DI
CMPQ SI, DI // same pointer? (the instructions that follow are outside this hunk)
@@ -17,9 +31,24 @@ TEXT runtime·memequal(SB),NOSPLIT,$0-25
eq:
MOVB $1, ret+24(FP) // write true into the result byte on the stack
RET
+#endif
// memequal_varlen(a, b unsafe.Pointer) bool
// Entry stub. The size is not an argument; it is read from 8(DX). On the
// regabiargs path it returns 1 at once when a and b are the same pointer, and
// otherwise sets up SI/DI/BX and jumps to memeqbody<>. The stack-ABI path is
// only partly visible in this diff.
-TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0-17
+#ifdef GOEXPERIMENT_regabiargs
+ // AX = a (want in SI)
+ // BX = b (want in DI)
+ // 8(DX) = size (want in BX)
+ CMPQ AX, BX // same pointer?
+ JNE neq // no: the memory has to be compared
+ MOVQ $1, AX // return 1
+ RET
+neq:
+ MOVQ AX, SI // a: AX -> SI
+ MOVQ BX, DI // b: BX -> DI; must precede the next move, which overwrites BX
+ MOVQ 8(DX), BX // compiler stores size at offset 8 in the closure
+ JMP memeqbody<>(SB) // jump, not call: memeqbody<>'s RET returns directly to this function's caller
+#else
MOVQ a+0(FP), SI
MOVQ b+8(FP), DI
CMPQ SI, DI // same pointer? (the instructions that follow are outside this hunk)
@@ -30,11 +59,18 @@ TEXT runtime·memequal_varlen(SB),NOSPLIT,$0-17
eq:
MOVB $1, ret+16(FP) // write true into the result byte on the stack
RET
-
-// a in SI
-// b in DI
-// count in BX
-// address of result byte in AX
+#endif
+
// memeqbody<> is the shared body that runtime·memequal and
// runtime·memequal_varlen jump to. Only the parts of it touched by this diff
// are visible here.
+// Input:
+// a in SI
+// b in DI
+// count in BX
+#ifndef GOEXPERIMENT_regabiargs
+// address of result byte in AX
+#else
+// Output:
+// result in AX
+#endif
TEXT memeqbody<>(SB),NOSPLIT,$0-0
CMPQ BX, $8 // fewer than 8 bytes to compare?
JB small // yes (unsigned compare on the count): handled at small
@@ -68,7 +104,11 @@ hugeloop:
SUBQ $64, BX // count -= 64
CMPL DX, $0xffff
JEQ hugeloop // DX == 0xffff: keep looping (DX is computed in lines not shown; presumably an all-bytes-equal mask, to be confirmed)
+#ifdef GOEXPERIMENT_regabiargs
+ XORQ AX, AX // return 0
+#else
MOVB $0, (AX) // stack ABI: write 0 through the result-byte address held in AX
+#endif
RET
// 64 bytes at a time using ymm registers
@@ -89,7 +129,11 @@ hugeloop_avx2:
CMPL DX, $0xffffffff
JEQ hugeloop_avx2 // DX == 0xffffffff: keep looping (DX is computed in lines not shown)
VZEROUPPER // zero the upper halves of the ymm registers before returning
+#ifdef GOEXPERIMENT_regabiargs
+ XORQ AX, AX // return 0
+#else
MOVB $0, (AX) // stack ABI: write 0 through the result-byte address held in AX
+#endif
RET
bigloop_avx2:
@@ -106,7 +150,11 @@ bigloop:
SUBQ $8, BX // count -= 8
CMPQ CX, DX // compare CX with DX (both loaded in lines not shown in this hunk)
JEQ bigloop // equal: keep looping
+#ifdef GOEXPERIMENT_regabiargs
+ XORQ AX, AX // return 0
+#else
MOVB $0, (AX) // stack ABI: write 0 through the result-byte address held in AX
+#endif
RET
// remaining 0-8 bytes
@@ -114,7 +162,11 @@ leftover:
MOVQ -8(SI)(BX*1), CX // the 8 bytes ending at SI+BX
MOVQ -8(DI)(BX*1), DX // the 8 bytes ending at DI+BX
CMPQ CX, DX
+#ifdef GOEXPERIMENT_regabiargs
+ SETEQ AX // low byte of AX = 1 if the two words are equal, else 0
+#else
SETEQ (AX) // stack ABI: result byte = 1 if the two words are equal, else 0
+#endif
RET
small:
@@ -149,6 +201,10 @@ di_finish:
SUBQ SI, DI
SHLQ CX, DI
equal:
+#ifdef GOEXPERIMENT_regabiargs
+ SETEQ AX // low byte of AX = 1 if ZF is set on arrival at equal, else 0
+#else
SETEQ (AX) // stack ABI: result byte = 1 if ZF is set on arrival at equal, else 0
+#endif
RET