aboutsummaryrefslogtreecommitdiff
path: root/src/internal/bytealg/compare_ppc64x.s
diff options
context:
space:
mode:
Diffstat (limited to 'src/internal/bytealg/compare_ppc64x.s')
-rw-r--r--src/internal/bytealg/compare_ppc64x.s552
1 files changed, 388 insertions, 164 deletions
diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s
index fc6f170ca8..cbe0525af5 100644
--- a/src/internal/bytealg/compare_ppc64x.s
+++ b/src/internal/bytealg/compare_ppc64x.s
@@ -21,11 +21,12 @@ TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
CMP R5,R6,CR7
CMP R3,R4,CR6
BEQ CR7,equal
-#ifdef GOARCH_ppc64le
- BR cmpbodyLE<>(SB)
-#else
- BR cmpbodyBE<>(SB)
-#endif
+ MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+ CMP R16,$1
+ BNE power8
+ BR cmpbodyp9<>(SB)
+power8:
+ BR cmpbody<>(SB)
equal:
BEQ CR6,done
MOVD $1, R8
@@ -52,11 +53,12 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
CMP R5,R6,CR7
CMP R3,R4,CR6
BEQ CR7,equal
-#ifdef GOARCH_ppc64le
- BR cmpbodyLE<>(SB)
-#else
- BR cmpbodyBE<>(SB)
-#endif
+ MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+ CMP R16,$1
+ BNE power8
+ BR cmpbodyp9<>(SB)
+power8:
+ BR cmpbody<>(SB)
equal:
BEQ CR6,done
MOVD $1, R8
@@ -70,209 +72,431 @@ done:
MOVD $0, R3
RET
-// Do an efficient memcmp for ppc64le
+#ifdef GOARCH_ppc64le
+DATA byteswap<>+0(SB)/8, $0x0706050403020100 // first 8 bytes of the 16-byte table
+DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908 // second 8 bytes of the 16-byte table
+GLOBL byteswap<>+0(SB), RODATA, $16 // read-only; loaded into SWAP and used as the VPERM control in cmpbody's "different" path
+#define SWAP V21
+#endif
+
+// Do an efficient memcmp for ppc64le/ppc64 on POWER8 (vector loads via LXVD2X)
// R3 = a len
// R4 = b len
// R5 = a addr
// R6 = b addr
// On exit:
// R3 = return value
-TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R3,R8 // set up length
CMP R3,R4,CR2 // unequal?
- BC 12,8,setuplen // BLT CR2
+ BLT CR2,setuplen // len(A) < len(B): keep len(A) as the compare length
MOVD R4,R8 // use R4 for comparison len
setuplen:
- MOVD R8,CTR // set up loop counter
- CMP R8,$8 // only optimize >=8
- BLT simplecheck
- DCBT (R5) // cache hint
- DCBT (R6)
CMP R8,$32 // optimize >= 32
MOVD R8,R9
- BLT setup8a // 8 byte moves only
-setup32a:
- SRADCC $5,R8,R9 // number of 32 byte chunks
- MOVD R9,CTR
-
- // Special processing for 32 bytes or longer.
- // Loading this way is faster and correct as long as the
- // doublewords being compared are equal. Once they
- // are found unequal, reload them in proper byte order
- // to determine greater or less than.
-loop32a:
- MOVD 0(R5),R9 // doublewords to compare
- MOVD 0(R6),R10 // get 4 doublewords
- MOVD 8(R5),R14
- MOVD 8(R6),R15
- CMPU R9,R10 // bytes equal?
- MOVD $0,R16 // set up for cmpne
- BNE cmpne // further compare for LT or GT
- MOVD 16(R5),R9 // get next pair of doublewords
- MOVD 16(R6),R10
- CMPU R14,R15 // bytes match?
- MOVD $8,R16 // set up for cmpne
- BNE cmpne // further compare for LT or GT
- MOVD 24(R5),R14 // get next pair of doublewords
- MOVD 24(R6),R15
- CMPU R9,R10 // bytes match?
- MOVD $16,R16 // set up for cmpne
- BNE cmpne // further compare for LT or GT
- MOVD $-8,R16 // for cmpne, R5,R6 already inc by 32
- ADD $32,R5 // bump up to next 32
- ADD $32,R6
- CMPU R14,R15 // bytes match?
- BC 8,2,loop32a // br ctr and cr
- BNE cmpne
+ BLT setup8a // fewer than 32 bytes: use the doubleword and leftover paths
+ MOVD $16,R10 // R10 = 16, index for the second 16-byte vector load
+ CMP R8,$64
+ BLT cmp32 // process size 32-63
+
+ DCBT (R5) // optimize >= 64
+ DCBT (R6) // cache hint
+ MOVD $32,R11 // R11 = 32, index for the third 16-byte vector load
+ MOVD $48,R12 // R12 = 48, index for the fourth 16-byte vector load
+
+loop64a:// process size 64 and greater
+ LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
+ LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
+ VCMPEQUDCC V3,V4,V1 // compare the two vectors; the branch below tests CR6
+ BGE CR6,different // jump out if they differ
+
+ LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
+ LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector
+ LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ LXVD2X (R5)(R12),V3 // load bytes of A at offset 48 into vector
+ LXVD2X (R6)(R12),V4 // load bytes of B at offset 48 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ ADD $-64,R9,R9 // reduce remaining size by 64
+ ADD $64,R5,R5 // increment to next 64 bytes of A
+ ADD $64,R6,R6 // increment to next 64 bytes of B
+ CMPU R9,$64
+ BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
+
+ CMPU R9,$32
+ BGE cmp32 // branch to cmp32 if there are 32-63 bytes remaining
+ CMPU R9,$0
+ BNE rem // branch to rem if the remainder is not 0
+
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+cmp32:
+ LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
+ LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector
+ LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector
+
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ ADD $-32,R9,R9 // reduce remaining size by 32
+ ADD $32,R5,R5 // increment to next 32 bytes of A
+ ADD $32,R6,R6 // increment to next 32 bytes of B
+ CMPU R9,$0
+ BNE rem // branch to rem if the remainder is not 0
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+rem:
+ MOVD R9,R8 // remaining byte count becomes the length used by setup8a/leftover
ANDCC $24,R8,R9 // Any 8 byte chunks?
BEQ leftover // and result is 0
+ BR setup8a
+
+different:
+#ifdef GOARCH_ppc64le
+ MOVD $byteswap<>+00(SB), R16
+ LXVD2X (R16)(R0),SWAP // Set up swap string
+
+ VPERM V3,V3,SWAP,V3 // permute the bytes of A using the SWAP control
+ VPERM V4,V4,SWAP,V4 // permute the bytes of B using the SWAP control
+#endif
+ MFVSRD VS35,R16 // move upper doublewords of A and B into GPRs for comparison
+ MFVSRD VS36,R10
+
+ CMPU R16,R10
+ BEQ lower // upper doublewords match, so the difference is in the lower ones
+ BGT greater
+ MOVD $-1,R3 // return value if A < B
+ RET
+lower:
+ VSLDOI $8,V3,V3,V3 // move lower doublewords of A and B into GPRs for comparison
+ MFVSRD VS35,R16
+ VSLDOI $8,V4,V4,V4
+ MFVSRD VS36,R10
+
+ CMPU R16,R10
+ BGT greater
+ MOVD $-1,R3 // return value if A < B
+ RET
setup8a:
- SRADCC $3,R9,R9 // get the 8 byte count
+ SRADCC $3,R8,R9 // get the 8 byte count
BEQ leftover // shifted value is 0
+ CMPU R8,$8 // exactly 8 bytes: single doubleword compare
+ BEQ size8
+ CMPU R8,$16 // exactly 16 bytes: single vector compare
+ BEQ size16
MOVD R9,CTR // loop count for doublewords
loop8:
- MOVDBR (R5+R0),R9 // doublewords to compare
+#ifdef GOARCH_ppc64le
+ MOVDBR (R5+R0),R16 // doublewords to compare
MOVDBR (R6+R0),R10 // LE compare order
+#else
+ MOVD (R5+R0),R16 // doublewords to compare
+ MOVD (R6+R0),R10 // BE compare order
+#endif
ADD $8,R5
ADD $8,R6
- CMPU R9,R10 // match?
+ CMPU R16,R10 // match?
BC 8,2,loop8 // bt ctr <> 0 && cr
BGT greater
BLT less
leftover:
ANDCC $7,R8,R9 // check for leftover bytes
- MOVD R9,CTR // save the ctr
- BNE simple // leftover bytes
- BC 12,10,equal // test CR2 for length comparison
- BC 12,8,less
- BR greater
+ BEQ zeroremainder // no leftover bytes: the lengths decide the result
simplecheck:
- CMP R8,$0 // remaining compare length 0
- BNE simple // do simple compare
- BC 12,10,equal // test CR2 for length comparison
- BC 12,8,less // 1st len < 2nd len, result less
- BR greater // 1st len > 2nd len must be greater
-simple:
- MOVBZ 0(R5), R9 // get byte from 1st operand
- ADD $1,R5
- MOVBZ 0(R6), R10 // get byte from 2nd operand
- ADD $1,R6
- CMPU R9, R10
- BC 8,2,simple // bc ctr <> 0 && cr
- BGT greater // 1st > 2nd
- BLT less // 1st < 2nd
- BC 12,10,equal // test CR2 for length comparison
- BC 12,9,greater // 2nd len > 1st len
- BR less // must be less
-cmpne: // only here is not equal
- MOVDBR (R5+R16),R8 // reload in reverse order
- MOVDBR (R6+R16),R9
- CMPU R8,R9 // compare correct endianness
- BGT greater // here only if NE
-less:
- MOVD $-1, R3 // return value if A < B
+ MOVD R0,R14 // R14 = index added to R5/R6 for the loads below
+ CMP R9,$4 // process 4 bytes
+ BLT halfword
+#ifdef GOARCH_ppc64le
+ MOVWBR (R5)(R14),R10
+ MOVWBR (R6)(R14),R11
+#else
+ MOVWZ (R5)(R14),R10
+ MOVWZ (R6)(R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $-4,R9
+ ADD $4,R14
+ PCALIGN $16
+
+halfword:
+ CMP R9,$2 // process 2 bytes
+ BLT byte
+#ifdef GOARCH_ppc64le
+ MOVHBR (R5)(R14),R10
+ MOVHBR (R6)(R14),R11
+#else
+ MOVHZ (R5)(R14),R10
+ MOVHZ (R6)(R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $-2,R9
+ ADD $2,R14
+ PCALIGN $16
+byte:
+ CMP R9,$0 // process 1 byte
+ BEQ skip
+ MOVBZ (R5)(R14),R10
+ MOVBZ (R6)(R14),R11
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ PCALIGN $16
+skip:
+ BEQ CR2,equal // all compared bytes matched: the lengths decide the result
+ BGT CR2,greater // len(A) > len(B); otherwise fall through to less
+
+less: MOVD $-1,R3 // return value if A < B
RET
+size16:
+ LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector
+ LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+zeroremainder:
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+size8:
+#ifdef GOARCH_ppc64le
+ MOVDBR (R5+R0),R16 // doublewords to compare
+ MOVDBR (R6+R0),R10 // LE compare order
+#else
+ MOVD (R5+R0),R16 // doublewords to compare
+ MOVD (R6+R0),R10 // BE compare order
+#endif
+ CMPU R16,R10 // match?
+ BGT greater
+ BLT less
+ BGT CR2,greater // 1st len > 2nd len
+ BLT CR2,less // 1st len < 2nd len
equal:
MOVD $0, R3 // return value if A == B
RET
greater:
- MOVD $1, R3 // return value if A > B
+ MOVD $1,R3 // return value if A > B
RET
-// Do an efficient memcmp for ppc64 (BE)
+// Do an efficient memcmp for ppc64le/ppc64 on POWER9 (vector loads via LXVB16X)
// R3 = a len
// R4 = b len
// R5 = a addr
// R6 = b addr
// On exit:
// R3 = return value
-TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
+TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
MOVD R3,R8 // set up length
CMP R3,R4,CR2 // unequal?
- BC 12,8,setuplen // BLT CR2
+ BLT CR2,setuplen // len(A) < len(B): keep len(A) as the compare length
MOVD R4,R8 // use R4 for comparison len
setuplen:
- MOVD R8,CTR // set up loop counter
- CMP R8,$8 // only optimize >=8
- BLT simplecheck
- DCBT (R5) // cache hint
- DCBT (R6)
- CMP R8,$32 // optimize >= 32
+ CMP R8,$16 // optimize for size<16
MOVD R8,R9
- BLT setup8a // 8 byte moves only
-
-setup32a:
- SRADCC $5,R8,R9 // number of 32 byte chunks
- MOVD R9,CTR
-loop32a:
- MOVD 0(R5),R9 // doublewords to compare
- MOVD 0(R6),R10 // get 4 doublewords
- MOVD 8(R5),R14
- MOVD 8(R6),R15
- CMPU R9,R10 // bytes equal?
- BLT less // found to be less
- BGT greater // found to be greater
- MOVD 16(R5),R9 // get next pair of doublewords
- MOVD 16(R6),R10
- CMPU R14,R15 // bytes match?
- BLT less // found less
- BGT greater // found greater
- MOVD 24(R5),R14 // get next pair of doublewords
- MOVD 24(R6),R15
- CMPU R9,R10 // bytes match?
- BLT less // found to be less
- BGT greater // found to be greater
- ADD $32,R5 // bump up to next 32
- ADD $32,R6
- CMPU R14,R15 // bytes match?
- BC 8,2,loop32a // br ctr and cr
- BLT less // with BE, byte ordering is
- BGT greater // good for compare
- ANDCC $24,R8,R9 // Any 8 byte chunks?
- BEQ leftover // and result is 0
-setup8a:
- SRADCC $3,R9,R9 // get the 8 byte count
- BEQ leftover // shifted value is 0
- MOVD R9,CTR // loop count for doublewords
-loop8:
- MOVD (R5),R9
- MOVD (R6),R10
- ADD $8,R5
- ADD $8,R6
- CMPU R9,R10 // match?
- BC 8,2,loop8 // bt ctr <> 0 && cr
+ BLT simplecheck
+ MOVD $16,R10 // R10 = 16, index for the second 16-byte vector load
+ CMP R8,$32 // optimize for size 16-31
+ BLT cmp16
+ CMP R8,$64
+ BLT cmp32 // optimize for size 32-63
+ DCBT (R5) // optimize for size>=64
+ DCBT (R6) // cache hint
+
+ MOVD $32,R11 // R11 = 32, index for the third 16-byte vector load
+ MOVD $48,R12 // R12 = 48, index for the fourth 16-byte vector load
+
+loop64a:// process size 64 and greater
+ LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
+ LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
+ VCMPNEBCC V3,V4,V1 // record comparison into V1
+ BNE CR6,different // jump out if they differ
+
+ LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
+ LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
+ VCMPNEBCC V3,V4,V1
+ BNE CR6,different
+
+ LXVB16X (R11)(R5),V3 // load bytes of A at offset 32 into vector
+ LXVB16X (R11)(R6),V4 // load bytes of B at offset 32 into vector
+ VCMPNEBCC V3,V4,V1
+ BNE CR6,different
+
+ LXVB16X (R12)(R5),V3 // load bytes of A at offset 48 into vector
+ LXVB16X (R12)(R6),V4 // load bytes of B at offset 48 into vector
+ VCMPNEBCC V3,V4,V1
+ BNE CR6,different
+
+ ADD $-64,R9,R9 // reduce remaining size by 64
+ ADD $64,R5,R5 // increment to next 64 bytes of A
+ ADD $64,R6,R6 // increment to next 64 bytes of B
+ CMPU R9,$64
+ BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining
+
+ CMPU R9,$32
+ BGE cmp32 // branch to cmp32 if there are 32-63 bytes remaining
+ CMPU R9,$16
+ BGE cmp16 // branch to cmp16 if there are 16-31 bytes left
+ CMPU R9,$0
+ BNE simplecheck // branch to simplecheck for the remaining bytes
+
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+cmp32:
+ LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
+ LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
+
+ VCMPNEBCC V3,V4,V1 // record comparison into V1
+ BNE CR6,different // jump out if they differ
+
+ LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector
+ LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector
+ VCMPNEBCC V3,V4,V1
+ BNE CR6,different
+
+ ADD $-32,R9,R9 // reduce remaining size by 32
+ ADD $32,R5,R5 // increment to next 32 bytes of A
+ ADD $32,R6,R6 // increment to next 32 bytes of B
+ CMPU R9,$16 // branch to cmp16 if there are 16-31 bytes left
+ BGE cmp16
+ CMPU R9,$0
+ BNE simplecheck // branch to simplecheck for the remaining bytes
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
+different:
+
+ MFVSRD VS35,R16 // move upper doublewords of A and B into GPRs for comparison
+ MFVSRD VS36,R10
+
+ CMPU R16,R10
+ BEQ lower // upper doublewords match, so the difference is in the lower ones
BGT greater
- BLT less
-leftover:
- ANDCC $7,R8,R9 // check for leftover bytes
- MOVD R9,CTR // save the ctr
- BNE simple // leftover bytes
- BC 12,10,equal // test CR2 for length comparison
- BC 12,8,less
- BR greater
+ MOVD $-1,R3 // return value if A < B
+ RET
+lower:
+ MFVSRLD VS35,R16 // next move lower doublewords of A and B into GPR for comparison
+ MFVSRLD VS36,R10
+
+ CMPU R16,R10
+ BGT greater
+ MOVD $-1,R3 // return value if A < B
+ RET
+
+greater:
+ MOVD $1,R3 // return value if A > B
+ RET
+cmp16:
+ ANDCC $16,R9,R31 // R31 = R9 & 16; only the condition result is used
+ BEQ tail
+
+ LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector
+ LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+
+ ADD $16,R5
+ ADD $16,R6
+tail:
+ ANDCC $15,R9 // R9 = remaining length mod 16; if nonzero, recheck the final 16 bytes (every branch into cmp16 has 16-31 bytes left)
+ BEQ end
+
+ ADD R9,R5
+ ADD R9,R6
+ MOVD $-16,R10
+
+ LXVB16X (R10)(R5),V3 // load the 16 bytes of A at offset -16 (overlaps bytes already compared)
+ LXVB16X (R10)(R6),V4 // load the 16 bytes of B at offset -16 (overlaps bytes already compared)
+ VCMPEQUDCC V3,V4,V1
+ BGE CR6,different
+end:
+ BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B)
+ BLT CR2,less // jump to less if len(A)<len(B)
+ BR greater // jump to greater otherwise
simplecheck:
- CMP R8,$0 // remaining compare length 0
- BNE simple // do simple compare
- BC 12,10,equal // test CR2 for length comparison
- BC 12,8,less // 1st len < 2nd len, result less
- BR greater // same len, must be equal
-simple:
- MOVBZ 0(R5),R9 // get byte from 1st operand
- ADD $1,R5
- MOVBZ 0(R6),R10 // get byte from 2nd operand
- ADD $1,R6
- CMPU R9,R10
- BC 8,2,simple // bc ctr <> 0 && cr
- BGT greater // 1st > 2nd
- BLT less // 1st < 2nd
- BC 12,10,equal // test CR2 for length comparison
- BC 12,9,greater // 2nd len > 1st len
+ MOVD $0,R14 // R14 = index added to R5/R6 for the loads below
+ CMP R9,$8 // process 8 bytes
+ BLT word
+#ifdef GOARCH_ppc64le
+ MOVDBR (R5+R14),R10
+ MOVDBR (R6+R14),R11
+#else
+ MOVD (R5+R14),R10
+ MOVD (R6+R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $8,R14
+ ADD $-8,R9
+ PCALIGN $16
+word:
+ CMP R9,$4 // process 4 bytes
+ BLT halfword
+#ifdef GOARCH_ppc64le
+ MOVWBR (R5+R14),R10
+ MOVWBR (R6+R14),R11
+#else
+ MOVWZ (R5+R14),R10
+ MOVWZ (R6+R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $4,R14
+ ADD $-4,R9
+ PCALIGN $16
+halfword:
+ CMP R9,$2 // process 2 bytes
+ BLT byte
+#ifdef GOARCH_ppc64le
+ MOVHBR (R5+R14),R10
+ MOVHBR (R6+R14),R11
+#else
+ MOVHZ (R5+R14),R10
+ MOVHZ (R6+R14),R11
+#endif
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ ADD $2,R14
+ ADD $-2,R9
+ PCALIGN $16
+byte:
+ CMP R9,$0 // process 1 byte
+ BEQ skip
+ MOVBZ (R5+R14),R10
+ MOVBZ (R6+R14),R11
+ CMPU R10,R11
+ BGT greater
+ BLT less
+ PCALIGN $16
+skip:
+ BEQ CR2,equal // all compared bytes matched: the lengths decide the result
+ BGT CR2,greater // len(A) > len(B); otherwise fall through to less
less:
- MOVD $-1, R3 // return value if A < B
+ MOVD $-1,R3 // return value if A < B
RET
equal:
MOVD $0, R3 // return value if A == B
RET
-greater:
- MOVD $1, R3 // return value if A > B
- RET