aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorArchana R <aravind5@in.ibm.com>2021-11-10 01:18:42 -0600
committerLynn Boger <laboger@linux.vnet.ibm.com>2022-04-22 12:12:38 +0000
commit78fb1d03d39e8357e4790a9f0788ef0a8e7d8ae1 (patch)
tree8402d3170fe39d5e0b0a2ae0e62c30eb58981739
parent1e5987635cc8bf99e8a20d240da80bd6f0f793f7 (diff)
downloadgo-78fb1d03d39e8357e4790a9f0788ef0a8e7d8ae1.tar.gz
go-78fb1d03d39e8357e4790a9f0788ef0a8e7d8ae1.zip
internal/bytealg: optimize cmpbody for ppc64le/ppc64
Vectorize the cmpbody loop for inputs of size greater than or equal to 32 bytes on both POWER8 (LE and BE) and POWER9 (LE and BE), and improve the performance of smaller compares. Performance improves for most sizes with this change on POWER8, POWER9 and POWER10. For the very small sizes (up to 8 bytes) the overhead of calling the function starts to impact performance. POWER9: name old time/op new time/op delta BytesCompare/1 4.60ns ± 0% 5.49ns ± 0% +19.27% BytesCompare/2 4.68ns ± 0% 5.46ns ± 0% +16.71% BytesCompare/4 6.58ns ± 0% 5.49ns ± 0% -16.58% BytesCompare/8 4.89ns ± 0% 5.46ns ± 0% +11.64% BytesCompare/16 5.21ns ± 0% 4.96ns ± 0% -4.70% BytesCompare/32 5.09ns ± 0% 4.98ns ± 0% -2.14% BytesCompare/64 6.40ns ± 0% 5.96ns ± 0% -6.84% BytesCompare/128 11.3ns ± 0% 8.1ns ± 0% -28.09% BytesCompare/256 15.1ns ± 0% 12.8ns ± 0% -15.16% BytesCompare/512 26.5ns ± 0% 23.3ns ± 5% -12.03% BytesCompare/1024 50.2ns ± 0% 41.6ns ± 2% -17.01% BytesCompare/2048 99.3ns ± 0% 86.5ns ± 0% -12.88% Change-Id: I24f93b2910591e6829ddd8509aa6eeaa6355c609 Reviewed-on: https://go-review.googlesource.com/c/go/+/362797 Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com> Run-TryBot: Archana Ravindar <aravind5@in.ibm.com> TryBot-Result: Gopher Robot <gobot@golang.org> Reviewed-by: Ian Lance Taylor <iant@google.com> Reviewed-by: Than McIntosh <thanm@google.com>
-rw-r--r--src/internal/bytealg/compare_ppc64x.s552
1 files changed, 388 insertions, 164 deletions
diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s
index fc6f170ca8..cbe0525af5 100644
--- a/src/internal/bytealg/compare_ppc64x.s
+++ b/src/internal/bytealg/compare_ppc64x.s
@@ -21,11 +21,12 @@ TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
CMP R5,R6,CR7
CMP R3,R4,CR6
BEQ CR7,equal
-#ifdef GOARCH_ppc64le
- BR cmpbodyLE<>(SB)
-#else
- BR cmpbodyBE<>(SB)
-#endif
+ MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+ CMP R16,$1
+ BNE power8
+ BR cmpbodyp9<>(SB)
+power8:
+ BR cmpbody<>(SB)
equal:
BEQ CR6,done
MOVD $1, R8
@@ -52,11 +53,12 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
CMP R5,R6,CR7
CMP R3,R4,CR6
BEQ CR7,equal
-#ifdef GOARCH_ppc64le
- BR cmpbodyLE<>(SB)
-#else
- BR cmpbodyBE<>(SB)
-#endif
+ MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+ CMP R16,$1
+ BNE power8
+ BR cmpbodyp9<>(SB)
+power8:
+ BR cmpbody<>(SB)
equal:
BEQ CR6,done
MOVD $1, R8
@@ -70,209 +72,431 @@ done:
MOVD $0, R3
RET
-// Do an efficient memcmp for ppc64le
+#ifdef GOARCH_ppc64le
+DATA byteswap<>+0(SB)/8, $0x0706050403020100 // first half of the 16 byte constant loaded into SWAP
+DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908 // second half; SWAP is the third VPERM operand in cmpbody's "different" path
+GLOBL byteswap<>+0(SB), RODATA, $16
+#define SWAP V21
+#endif
+
+// Do an efficient memcmp for ppc64le/ppc64/POWER8 (reached when the caller's HasPOWER9 check fails)
// R3 = a len
// R4 = b len
// R5 = a addr
// R6 = b addr
// On exit:
// R3 = return value
-TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R3,R8 // set up length
CMP R3,R4,CR2 // unequal?
- BC 12,8,setuplen // BLT CR2
+ BLT CR2,setuplen // len(A) < len(B): R8 already holds the shorter length
MOVD R4,R8 // use R4 for comparison len
setuplen:
- MOVD R8,CTR // set up loop counter
- CMP R8,$8 // only optimize >=8
- BLT simplecheck
- DCBT (R5) // cache hint
- DCBT (R6)
CMP R8,$32 // optimize >= 32
MOVD R8,R9
- BLT setup8a // 8 byte moves only
-setup32a:
- SRADCC $5,R8,R9 // number of 32 byte chunks
- MOVD R9,CTR
-
- // Special processing for 32 bytes or longer.
- // Loading this way is faster and correct as long as the
- // doublewords being compared are equal. Once they
- // are found unequal, reload them in proper byte order
- // to determine greater or less than.
-loop32a:
- MOVD 0(R5),R9 // doublewords to compare
- MOVD 0(R6),R10 // get 4 doublewords
- MOVD 8(R5),R14
- MOVD 8(R6),R15
- CMPU R9,R10 // bytes equal?
- MOVD $0,R16 // set up for cmpne
- BNE cmpne // further compare for LT or GT
- MOVD 16(R5),R9 // get next pair of doublewords
- MOVD 16(R6),R10
- CMPU R14,R15 // bytes match?
- MOVD $8,R16 // set up for cmpne
- BNE cmpne // further compare for LT or GT
- MOVD 24(R5),R14 // get next pair of doublewords
- MOVD 24(R6),R15
- CMPU R9,R10 // bytes match?
- MOVD $16,R16 // set up for cmpne
- BNE cmpne // further compare for LT or GT
- MOVD $-8,R16 // for cmpne, R5,R6 already inc by 32
- ADD $32,R5 // bump up to next 32
- ADD $32,R6
- CMPU R14,R15 // bytes match?
- BC 8,2,loop32a // br ctr and cr
- BNE cmpne
+ BLT setup8a // optimize < 32
+ MOVD $16,R10 // set offsets to load into vectors
+ CMP R8,$64
+ BLT cmp32 // process size 32-63
+
+ DCBT (R5) // optimize >= 64
+ DCBT (R6) // cache hint
+ MOVD $32,R11 // set offsets to load into vector
+ MOVD $48,R12 // set offsets to load into vector
+
+loop64a:// process size 64 and greater
+ LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
+ LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
+ VCMPEQUDCC V3,V4,V1 // compare A and B vectors; CR6 is tested by the branch below
+ BGE CR6,different // jump out if the vectors differ
+
+ LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
+ LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector
+ LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ LXVD2X (R5)(R12),V3 // load bytes of A at offset 48 into vector
+ LXVD2X (R6)(R12),V4 // load bytes of B at offset 48 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ ADD $-64,R9,R9 // reduce remaining size by 64
+ ADD $64,R5,R5 // increment to next 64 bytes of A
+ ADD $64,R6,R6 // increment to next 64 bytes of B
+ CMPU R9,$64
+ BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
+
+ CMPU R9,$32
+ BGE cmp32 // branch to cmp32 if there are 32-63 bytes remaining
+ CMPU R9,$0
+ BNE rem // branch to rem if the remainder is not 0
+
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+cmp32:
+ LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
+ LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
+ LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ ADD $-32,R9,R9 // reduce remaining size by 32
+ ADD $32,R5,R5 // increment to next 32 bytes of A
+ ADD $32,R6,R6 // increment to next 32 bytes of B
+ CMPU R9,$0
+ BNE rem // branch to rem if the remainder is not 0
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+rem:
+ MOVD R9,R8 // R8 = bytes still to compare
ANDCC $24,R8,R9 // Any 8 byte chunks?
BEQ leftover // and result is 0
+ BR setup8a // compare the remaining 8 byte chunks
+
+different:
+#ifdef GOARCH_ppc64le
+ MOVD $byteswap<>+00(SB), R16 // address of the byteswap constant
+ LXVD2X (R16)(R0),SWAP // Set up swap string
+
+ VPERM V3,V3,SWAP,V3 // byte-permute A with SWAP before the doubleword compares
+ VPERM V4,V4,SWAP,V4 // byte-permute B the same way
+#endif
+ MFVSRD VS35,R16 // move upper doublewords of A and B into GPR for comparison
+ MFVSRD VS36,R10
+
+ CMPU R16,R10
+ BEQ lower // upper doublewords match, check the lower ones
+ BGT greater
+ MOVD $-1,R3 // return value if A < B
+ RET
+lower:
+ VSLDOI $8,V3,V3,V3 // move lower doublewords of A and B into GPR for comparison
+ MFVSRD VS35,R16
+ VSLDOI $8,V4,V4,V4
+ MFVSRD VS36,R10
+
+ CMPU R16,R10
+ BGT greater
+ MOVD $-1,R3 // return value if A < B
+ RET
setup8a:
- SRADCC $3,R9,R9 // get the 8 byte count
+ SRADCC $3,R8,R9 // get the 8 byte count
BEQ leftover // shifted value is 0
+ CMPU R8,$8 // optimize 8byte move
+ BEQ size8
+ CMPU R8,$16
+ BEQ size16
MOVD R9,CTR // loop count for doublewords
loop8:
- MOVDBR (R5+R0),R9 // doublewords to compare
+#ifdef GOARCH_ppc64le
+ MOVDBR (R5+R0),R16 // doublewords to compare
MOVDBR (R6+R0),R10 // LE compare order
+#else
+ MOVD (R5+R0),R16 // doublewords to compare
+ MOVD (R6+R0),R10 // BE compare order
+#endif
ADD $8,R5
ADD $8,R6
- CMPU R9,R10 // match?
+ CMPU R16,R10 // match?
BC 8,2,loop8 // bt ctr <> 0 && cr
BGT greater
BLT less
leftover:
ANDCC $7,R8,R9 // check for leftover bytes
- MOVD R9,CTR // save the ctr
- BNE simple // leftover bytes
- BC 12,10,equal // test CR2 for length comparison
- BC 12,8,less
- BR greater
+ BEQ zeroremainder // no leftover bytes, result depends on the lengths
simplecheck:
- CMP R8,$0 // remaining compare length 0
- BNE simple // do simple compare
- BC 12,10,equal // test CR2 for length comparison
- BC 12,8,less // 1st len < 2nd len, result less
- BR greater // 1st len > 2nd len must be greater
-simple:
- MOVBZ 0(R5), R9 // get byte from 1st operand
- ADD $1,R5
- MOVBZ 0(R6), R10 // get byte from 2nd operand
- ADD $1,R6
- CMPU R9, R10
- BC 8,2,simple // bc ctr <> 0 && cr
- BGT greater // 1st > 2nd
- BLT less // 1st < 2nd
- BC 12,10,equal // test CR2 for length comparison
- BC 12,9,greater // 2nd len > 1st len
- BR less // must be less
-cmpne: // only here is not equal
- MOVDBR (R5+R16),R8 // reload in reverse order
- MOVDBR (R6+R16),R9
- CMPU R8,R9 // compare correct endianness
- BGT greater // here only if NE
-less:
- MOVD $-1, R3 // return value if A < B
+ MOVD R0,R14 // R14 = offset of the leftover bytes still to compare
+ CMP R9,$4 // process 4 bytes
+ BLT halfword
+#ifdef GOARCH_ppc64le
+ MOVWBR (R5)(R14),R10
+ MOVWBR (R6)(R14),R11
+#else
+ MOVWZ (R5)(R14),R10
+ MOVWZ (R6)(R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $-4,R9 // 4 fewer leftover bytes
+ ADD $4,R14 // advance the offset past them
+ PCALIGN $16
+
+halfword:
+ CMP R9,$2 // process 2 bytes
+ BLT byte
+#ifdef GOARCH_ppc64le
+ MOVHBR (R5)(R14),R10
+ MOVHBR (R6)(R14),R11
+#else
+ MOVHZ (R5)(R14),R10
+ MOVHZ (R6)(R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $-2,R9 // 2 fewer leftover bytes
+ ADD $2,R14 // advance the offset past them
+ PCALIGN $16
+byte:
+ CMP R9,$0 // process 1 byte
+ BEQ skip
+ MOVBZ (R5)(R14),R10
+ MOVBZ (R6)(R14),R11
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ PCALIGN $16
+skip:
+ BEQ CR2,equal // all compared bytes matched, decide by the lengths
+ BGT CR2,greater
+
+less: MOVD $-1,R3 // return value if A < B
RET
+size16:
+ LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
+ LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+zeroremainder:
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+size8:
+#ifdef GOARCH_ppc64le
+ MOVDBR (R5+R0),R16 // doublewords to compare
+ MOVDBR (R6+R0),R10 // LE compare order
+#else
+ MOVD (R5+R0),R16 // doublewords to compare
+ MOVD (R6+R0),R10 // BE compare order
+#endif
+ CMPU R16,R10 // match?
+ BGT greater
+ BLT less
+ BGT CR2,greater // bytes matched and len(A) > len(B)
+ BLT CR2,less // bytes matched and len(A) < len(B)
equal:
MOVD $0, R3 // return value if A == B
RET
greater:
- MOVD $1, R3 // return value if A > B
+ MOVD $1,R3 // return value if A > B
RET
-// Do an efficient memcmp for ppc64 (BE)
+// Do an efficient memcmp for ppc64le/ppc64/POWER9 (reached when the caller's HasPOWER9 check succeeds)
// R3 = a len
// R4 = b len
// R5 = a addr
// R6 = b addr
// On exit:
// R3 = return value
-TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
+TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R3,R8 // set up length
CMP R3,R4,CR2 // unequal?
- BC 12,8,setuplen // BLT CR2
+ BLT CR2,setuplen // len(A) < len(B): R8 already holds the shorter length
MOVD R4,R8 // use R4 for comparison len
setuplen:
- MOVD R8,CTR // set up loop counter
- CMP R8,$8 // only optimize >=8
- BLT simplecheck
- DCBT (R5) // cache hint
- DCBT (R6)
- CMP R8,$32 // optimize >= 32
+ CMP R8,$16 // optimize for size<16
MOVD R8,R9
- BLT setup8a // 8 byte moves only
-
-setup32a:
- SRADCC $5,R8,R9 // number of 32 byte chunks
- MOVD R9,CTR
-loop32a:
- MOVD 0(R5),R9 // doublewords to compare
- MOVD 0(R6),R10 // get 4 doublewords
- MOVD 8(R5),R14
- MOVD 8(R6),R15
- CMPU R9,R10 // bytes equal?
- BLT less // found to be less
- BGT greater // found to be greater
- MOVD 16(R5),R9 // get next pair of doublewords
- MOVD 16(R6),R10
- CMPU R14,R15 // bytes match?
- BLT less // found less
- BGT greater // found greater
- MOVD 24(R5),R14 // get next pair of doublewords
- MOVD 24(R6),R15
- CMPU R9,R10 // bytes match?
- BLT less // found to be less
- BGT greater // found to be greater
- ADD $32,R5 // bump up to next 32
- ADD $32,R6
- CMPU R14,R15 // bytes match?
- BC 8,2,loop32a // br ctr and cr
- BLT less // with BE, byte ordering is
- BGT greater // good for compare
- ANDCC $24,R8,R9 // Any 8 byte chunks?
- BEQ leftover // and result is 0
-setup8a:
- SRADCC $3,R9,R9 // get the 8 byte count
- BEQ leftover // shifted value is 0
- MOVD R9,CTR // loop count for doublewords
-loop8:
- MOVD (R5),R9
- MOVD (R6),R10
- ADD $8,R5
- ADD $8,R6
- CMPU R9,R10 // match?
- BC 8,2,loop8 // bt ctr <> 0 && cr
+ BLT simplecheck
+ MOVD $16,R10 // set offsets to load into vectors
+ CMP R8,$32 // optimize for size 16-31
+ BLT cmp16
+ CMP R8,$64
+ BLT cmp32 // optimize for size 32-63
+ DCBT (R5) // optimize for size>=64
+ DCBT (R6) // cache hint
+
+ MOVD $32,R11 // set offsets to load into vector
+ MOVD $48,R12 // set offsets to load into vector
+
+loop64a:// process size 64 and greater
+ LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
+ LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
+ VCMPNEBCC V3,V4,V1 // record comparison into V1
+ BNE CR6,different // jump out if the vectors differ
+
+ LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
+ LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
+ VCMPNEBCC V3,V4,V1
+ BNE CR6,different
+
+ LXVB16X (R11)(R5),V3 // load bytes of A at offset 32 into vector
+ LXVB16X (R11)(R6),V4 // load bytes of B at offset 32 into vector
+ VCMPNEBCC V3,V4,V1
+ BNE CR6,different
+
+ LXVB16X (R12)(R5),V3 // load bytes of A at offset 48 into vector
+ LXVB16X (R12)(R6),V4 // load bytes of B at offset 48 into vector
+ VCMPNEBCC V3,V4,V1
+ BNE CR6,different
+
+ ADD $-64,R9,R9 // reduce remaining size by 64
+ ADD $64,R5,R5 // increment to next 64 bytes of A
+ ADD $64,R6,R6 // increment to next 64 bytes of B
+ CMPU R9,$64
+ BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
+
+ CMPU R9,$32
+ BGE cmp32 // branch to cmp32 if there are 32-63 bytes remaining
+ CMPU R9,$16
+ BGE cmp16 // branch to cmp16 if there are 16-31 bytes left
+ CMPU R9,$0
+ BNE simplecheck // branch to simplecheck for remaining bytes
+
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+cmp32:
+ LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
+ LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
+
+ VCMPNEBCC V3,V4,V1 // record comparison into V1
+ BNE CR6,different // jump out if the vectors differ
+
+ LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
+ LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
+ VCMPNEBCC V3,V4,V1
+ BNE CR6,different
+
+ ADD $-32,R9,R9 // reduce remaining size by 32
+ ADD $32,R5,R5 // increment to next 32 bytes of A
+ ADD $32,R6,R6 // increment to next 32 bytes of B
+ CMPU R9,$16 // branch to cmp16 if there are 16-31 bytes left
+ BGE cmp16
+ CMPU R9,$0
+ BNE simplecheck // branch to simplecheck for remainder bytes
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+different:
+
+ MFVSRD VS35,R16 // move upper doublewords of A and B into GPR for comparison
+ MFVSRD VS36,R10
+
+ CMPU R16,R10
+ BEQ lower // upper doublewords match, check the lower ones
BGT greater
- BLT less
-leftover:
- ANDCC $7,R8,R9 // check for leftover bytes
- MOVD R9,CTR // save the ctr
- BNE simple // leftover bytes
- BC 12,10,equal // test CR2 for length comparison
- BC 12,8,less
- BR greater
+ MOVD $-1,R3 // return value if A < B
+ RET
+lower:
+ MFVSRLD VS35,R16 // next move lower doublewords of A and B into GPR for comparison
+ MFVSRLD VS36,R10
+
+ CMPU R16,R10
+ BGT greater
+ MOVD $-1,R3 // return value if A < B
+ RET
+
+greater:
+ MOVD $1,R3 // return value if A > B
+ RET
+cmp16:
+ ANDCC $16,R9,R31
+ BEQ tail
+
+ LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
+ LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ ADD $16,R5
+ ADD $16,R6
+tail:
+ ANDCC $15,R9 // Load the last 16 bytes (every branch to cmp16 has at least 16 bytes left)
+ BEQ end
+
+ ADD R9,R5
+ ADD R9,R6
+ MOVD $-16,R10
+
+ LXVB16X (R10)(R5),V3 // load the 16 bytes of A that end at R5 into vector
+ LXVB16X (R10)(R6),V4 // load the 16 bytes of B that end at R6 into vector
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+end:
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if BLT CR2 that is, len(A)<len(B)
+ BR greater // jump to greater otherwise
simplecheck:
- CMP R8,$0 // remaining compare length 0
- BNE simple // do simple compare
- BC 12,10,equal // test CR2 for length comparison
- BC 12,8,less // 1st len < 2nd len, result less
- BR greater // same len, must be equal
-simple:
- MOVBZ 0(R5),R9 // get byte from 1st operand
- ADD $1,R5
- MOVBZ 0(R6),R10 // get byte from 2nd operand
- ADD $1,R6
- CMPU R9,R10
- BC 8,2,simple // bc ctr <> 0 && cr
- BGT greater // 1st > 2nd
- BLT less // 1st < 2nd
- BC 12,10,equal // test CR2 for length comparison
- BC 12,9,greater // 2nd len > 1st len
+ MOVD $0,R14 // process 8 bytes
+ CMP R9,$8
+ BLT word
+#ifdef GOARCH_ppc64le
+ MOVDBR (R5+R14),R10
+ MOVDBR (R6+R14),R11
+#else
+ MOVD (R5+R14),R10
+ MOVD (R6+R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $8,R14 // advance the offset past the 8 bytes just compared
+ ADD $-8,R9 // 8 fewer bytes remaining
+ PCALIGN $16
+word:
+ CMP R9,$4 // process 4 bytes
+ BLT halfword
+#ifdef GOARCH_ppc64le
+ MOVWBR (R5+R14),R10
+ MOVWBR (R6+R14),R11
+#else
+ MOVWZ (R5+R14),R10
+ MOVWZ (R6+R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $4,R14
+ ADD $-4,R9
+ PCALIGN $16
+halfword:
+ CMP R9,$2 // process 2 bytes
+ BLT byte
+#ifdef GOARCH_ppc64le
+ MOVHBR (R5+R14),R10
+ MOVHBR (R6+R14),R11
+#else
+ MOVHZ (R5+R14),R10
+ MOVHZ (R6+R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $2,R14
+ ADD $-2,R9
+ PCALIGN $16
+byte:
+ CMP R9,$0 // process 1 byte
+ BEQ skip
+ MOVBZ (R5+R14),R10
+ MOVBZ (R6+R14),R11
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ PCALIGN $16
+skip:
+ BEQ CR2,equal // all compared bytes matched, decide by the lengths
+ BGT CR2,greater
less:
- MOVD $-1, R3 // return value if A < B
+ MOVD $-1,R3 // return value if A < B
RET
equal:
MOVD $0, R3 // return value if A == B
RET
-greater:
- MOVD $1, R3 // return value if A > B
- RET