diff options
Diffstat (limited to 'src/internal/bytealg/compare_ppc64x.s')
-rw-r--r-- | src/internal/bytealg/compare_ppc64x.s | 552 |
1 files changed, 388 insertions, 164 deletions
diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s index fc6f170ca8..cbe0525af5 100644 --- a/src/internal/bytealg/compare_ppc64x.s +++ b/src/internal/bytealg/compare_ppc64x.s @@ -21,11 +21,12 @@ TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56 CMP R5,R6,CR7 CMP R3,R4,CR6 BEQ CR7,equal -#ifdef GOARCH_ppc64le - BR cmpbodyLE<>(SB) -#else - BR cmpbodyBE<>(SB) -#endif + MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16 + CMP R16,$1 + BNE power8 + BR cmpbodyp9<>(SB) +power8: + BR cmpbody<>(SB) equal: BEQ CR6,done MOVD $1, R8 @@ -52,11 +53,12 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40 CMP R5,R6,CR7 CMP R3,R4,CR6 BEQ CR7,equal -#ifdef GOARCH_ppc64le - BR cmpbodyLE<>(SB) -#else - BR cmpbodyBE<>(SB) -#endif + MOVBZ internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16 + CMP R16,$1 + BNE power8 + BR cmpbodyp9<>(SB) +power8: + BR cmpbody<>(SB) equal: BEQ CR6,done MOVD $1, R8 @@ -70,209 +72,431 @@ done: MOVD $0, R3 RET -// Do an efficient memcmp for ppc64le +#ifdef GOARCH_ppc64le +DATA byteswap<>+0(SB)/8, $0x0706050403020100 +DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908 +GLOBL byteswap<>+0(SB), RODATA, $16 +#define SWAP V21 +#endif + +// Do an efficient memcmp for ppc64le/ppc64/POWER8 // R3 = a len // R4 = b len // R5 = a addr // R6 = b addr // On exit: // R3 = return value -TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0 +TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0 MOVD R3,R8 // set up length CMP R3,R4,CR2 // unequal? - BC 12,8,setuplen // BLT CR2 + BLT CR2,setuplen // BLT CR2 MOVD R4,R8 // use R4 for comparison len setuplen: - MOVD R8,CTR // set up loop counter - CMP R8,$8 // only optimize >=8 - BLT simplecheck - DCBT (R5) // cache hint - DCBT (R6) CMP R8,$32 // optimize >= 32 MOVD R8,R9 - BLT setup8a // 8 byte moves only -setup32a: - SRADCC $5,R8,R9 // number of 32 byte chunks - MOVD R9,CTR - - // Special processing for 32 bytes or longer. 
- // Loading this way is faster and correct as long as the - // doublewords being compared are equal. Once they - // are found unequal, reload them in proper byte order - // to determine greater or less than. -loop32a: - MOVD 0(R5),R9 // doublewords to compare - MOVD 0(R6),R10 // get 4 doublewords - MOVD 8(R5),R14 - MOVD 8(R6),R15 - CMPU R9,R10 // bytes equal? - MOVD $0,R16 // set up for cmpne - BNE cmpne // further compare for LT or GT - MOVD 16(R5),R9 // get next pair of doublewords - MOVD 16(R6),R10 - CMPU R14,R15 // bytes match? - MOVD $8,R16 // set up for cmpne - BNE cmpne // further compare for LT or GT - MOVD 24(R5),R14 // get next pair of doublewords - MOVD 24(R6),R15 - CMPU R9,R10 // bytes match? - MOVD $16,R16 // set up for cmpne - BNE cmpne // further compare for LT or GT - MOVD $-8,R16 // for cmpne, R5,R6 already inc by 32 - ADD $32,R5 // bump up to next 32 - ADD $32,R6 - CMPU R14,R15 // bytes match? - BC 8,2,loop32a // br ctr and cr - BNE cmpne + BLT setup8a // optimize < 32 + MOVD $16,R10 // set offsets to load into vectors + CMP R8,$64 + BLT cmp32 // process size 32-63 + + DCBT (R5) // optimize >= 64 + DCBT (R6) // cache hint + MOVD $32,R11 // set offsets to load into vector + MOVD $48,R12 // set offsets to load into vector + +loop64a:// process size 64 and greater + LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector + LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector + VCMPEQUDCC V3,V4,V1 + BGE CR6,different // jump out if it's different + + LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector + LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector + + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + LXVD2X (R5)(R11),V3 // load bytes of A at offset 32 into vector + LXVD2X (R6)(R11),V4 // load bytes of B at offset 32 into vector + + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + LXVD2X (R5)(R12),V3 // load bytes of A at offset 48 into vector + LXVD2X (R6)(R12),V4 // load bytes of B at offset 48 into vector +
VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + ADD $-64,R9,R9 // reduce remaining size by 64 + ADD $64,R5,R5 // increment to next 64 bytes of A + ADD $64,R6,R6 // increment to next 64 bytes of B + CMPU R9,$64 + BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining + + CMPU R9,$32 + BGE cmp32 // loop to cmp32 if there are 32-63 bytes remaining + CMPU R9,$0 + BNE rem // loop to rem if the remainder is not 0 + + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if len(A)<len(B) + BR greater // jump to greater otherwise +cmp32: + LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector + LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector + + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + LXVD2X (R5)(R10),V3 // load bytes of A at offset 16 into vector + LXVD2X (R6)(R10),V4 // load bytes of B at offset 16 into vector + + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + ADD $-32,R9,R9 // reduce remaining size by 32 + ADD $32,R5,R5 // increment to next 32 bytes of A + ADD $32,R6,R6 // increment to next 32 bytes of B + CMPU R9,$0 + BNE rem // loop to rem if the remainder is not 0 + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if len(A)<len(B) + BR greater // jump to greater otherwise +rem: + MOVD R9,R8 ANDCC $24,R8,R9 // Any 8 byte chunks?
BEQ leftover // and result is 0 + BR setup8a + +different: +#ifdef GOARCH_ppc64le + MOVD $byteswap<>+00(SB), R16 + LXVD2X (R16)(R0),SWAP // Set up swap string + + VPERM V3,V3,SWAP,V3 + VPERM V4,V4,SWAP,V4 +#endif + MFVSRD VS35,R16 // move upper doublewords of A and B into GPR for comparison + MFVSRD VS36,R10 + + CMPU R16,R10 + BEQ lower + BGT greater + MOVD $-1,R3 // return value if A < B + RET +lower: + VSLDOI $8,V3,V3,V3 // move lower doublewords of A and B into GPR for comparison + MFVSRD VS35,R16 + VSLDOI $8,V4,V4,V4 + MFVSRD VS36,R10 + + CMPU R16,R10 + BGT greater + MOVD $-1,R3 // return value if A < B + RET setup8a: - SRADCC $3,R9,R9 // get the 8 byte count + SRADCC $3,R8,R9 // get the 8 byte count BEQ leftover // shifted value is 0 + CMPU R8,$8 // optimize 8byte move + BEQ size8 + CMPU R8,$16 + BEQ size16 MOVD R9,CTR // loop count for doublewords loop8: - MOVDBR (R5+R0),R9 // doublewords to compare +#ifdef GOARCH_ppc64le + MOVDBR (R5+R0),R16 // doublewords to compare MOVDBR (R6+R0),R10 // LE compare order +#else + MOVD (R5+R0),R16 // doublewords to compare + MOVD (R6+R0),R10 // BE compare order +#endif ADD $8,R5 ADD $8,R6 - CMPU R9,R10 // match? + CMPU R16,R10 // match?
BC 8,2,loop8 // bt ctr <> 0 && cr BGT greater BLT less leftover: ANDCC $7,R8,R9 // check for leftover bytes - MOVD R9,CTR // save the ctr - BNE simple // leftover bytes - BC 12,10,equal // test CR2 for length comparison - BC 12,8,less - BR greater + BEQ zeroremainder simplecheck: - CMP R8,$0 // remaining compare length 0 - BNE simple // do simple compare - BC 12,10,equal // test CR2 for length comparison - BC 12,8,less // 1st len < 2nd len, result less - BR greater // 1st len > 2nd len must be greater -simple: - MOVBZ 0(R5), R9 // get byte from 1st operand - ADD $1,R5 - MOVBZ 0(R6), R10 // get byte from 2nd operand - ADD $1,R6 - CMPU R9, R10 - BC 8,2,simple // bc ctr <> 0 && cr - BGT greater // 1st > 2nd - BLT less // 1st < 2nd - BC 12,10,equal // test CR2 for length comparison - BC 12,9,greater // 2nd len > 1st len - BR less // must be less -cmpne: // only here is not equal - MOVDBR (R5+R16),R8 // reload in reverse order - MOVDBR (R6+R16),R9 - CMPU R8,R9 // compare correct endianness - BGT greater // here only if NE -less: - MOVD $-1, R3 // return value if A < B + MOVD R0,R14 + CMP R9,$4 // process 4 bytes + BLT halfword +#ifdef GOARCH_ppc64le + MOVWBR (R5)(R14),R10 + MOVWBR (R6)(R14),R11 +#else + MOVWZ (R5)(R14),R10 + MOVWZ (R6)(R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $-4,R9 + ADD $4,R14 + PCALIGN $16 + +halfword: + CMP R9,$2 // process 2 bytes + BLT byte +#ifdef GOARCH_ppc64le + MOVHBR (R5)(R14),R10 + MOVHBR (R6)(R14),R11 +#else + MOVHZ (R5)(R14),R10 + MOVHZ (R6)(R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $-2,R9 + ADD $2,R14 + PCALIGN $16 +byte: + CMP R9,$0 // process 1 byte + BEQ skip + MOVBZ (R5)(R14),R10 + MOVBZ (R6)(R14),R11 + CMPU R10,R11 + BGT greater + BLT less + PCALIGN $16 +skip: + BEQ CR2,equal + BGT CR2,greater + +less: MOVD $-1,R3 // return value if A < B RET +size16: + LXVD2X (R5)(R0),V3 // load bytes of A at offset 0 into vector + LXVD2X (R6)(R0),V4 // load bytes of B at offset 0 into vector + VCMPEQUDCC 
V3,V4,V1 + BGE CR6,different +zeroremainder: + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if len(A)<len(B) + BR greater // jump to greater otherwise +size8: +#ifdef GOARCH_ppc64le + MOVDBR (R5+R0),R16 // doublewords to compare + MOVDBR (R6+R0),R10 // LE compare order +#else + MOVD (R5+R0),R16 // doublewords to compare + MOVD (R6+R0),R10 // BE compare order +#endif + CMPU R16,R10 // match? + BGT greater + BLT less + BGT CR2,greater // 1st len > 2nd len + BLT CR2,less // 1st len < 2nd len equal: MOVD $0, R3 // return value if A == B RET greater: - MOVD $1, R3 // return value if A > B + MOVD $1,R3 // return value if A > B RET -// Do an efficient memcmp for ppc64 (BE) +// Do an efficient memcmp for ppc64le/ppc64/POWER9 // R3 = a len // R4 = b len // R5 = a addr // R6 = b addr // On exit: // R3 = return value -TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0 +TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0 MOVD R3,R8 // set up length CMP R3,R4,CR2 // unequal? - BC 12,8,setuplen // BLT CR2 + BLT CR2,setuplen // BLT CR2 MOVD R4,R8 // use R4 for comparison len setuplen: - MOVD R8,CTR // set up loop counter - CMP R8,$8 // only optimize >=8 - BLT simplecheck - DCBT (R5) // cache hint - DCBT (R6) - CMP R8,$32 // optimize >= 32 + CMP R8,$16 // optimize for size<16 MOVD R8,R9 - BLT setup8a // 8 byte moves only - -setup32a: - SRADCC $5,R8,R9 // number of 32 byte chunks - MOVD R9,CTR -loop32a: - MOVD 0(R5),R9 // doublewords to compare - MOVD 0(R6),R10 // get 4 doublewords - MOVD 8(R5),R14 - MOVD 8(R6),R15 - CMPU R9,R10 // bytes equal? - BLT less // found to be less - BGT greater // found to be greater - MOVD 16(R5),R9 // get next pair of doublewords - MOVD 16(R6),R10 - CMPU R14,R15 // bytes match? - BLT less // found less - BGT greater // found greater - MOVD 24(R5),R14 // get next pair of doublewords - MOVD 24(R6),R15 - CMPU R9,R10 // bytes match?
- BLT less // found to be less - BGT greater // found to be greater - ADD $32,R5 // bump up to next 32 - ADD $32,R6 - CMPU R14,R15 // bytes match? - BC 8,2,loop32a // br ctr and cr - BLT less // with BE, byte ordering is - BGT greater // good for compare - ANDCC $24,R8,R9 // Any 8 byte chunks? - BEQ leftover // and result is 0 -setup8a: - SRADCC $3,R9,R9 // get the 8 byte count - BEQ leftover // shifted value is 0 - MOVD R9,CTR // loop count for doublewords -loop8: - MOVD (R5),R9 - MOVD (R6),R10 - ADD $8,R5 - ADD $8,R6 - CMPU R9,R10 // match? - BC 8,2,loop8 // bt ctr <> 0 && cr + BLT simplecheck + MOVD $16,R10 // set offsets to load into vectors + CMP R8,$32 // optimize for size 16-31 + BLT cmp16 + CMP R8,$64 + BLT cmp32 // optimize for size 32-63 + DCBT (R5) // optimize for size>=64 + DCBT (R6) // cache hint + + MOVD $32,R11 // set offsets to load into vector + MOVD $48,R12 // set offsets to load into vector + +loop64a:// process size 64 and greater + LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector + LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector + VCMPNEBCC V3,V4,V1 // record comparison into V1 + BNE CR6,different // jump out if its different + + LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector + LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector + VCMPNEBCC V3,V4,V1 + BNE CR6,different + + LXVB16X (R11)(R5),V3 // load bytes of A at offset 32 into vector + LXVB16X (R11)(R6),V4 // load bytes of B at offset 32 into vector + VCMPNEBCC V3,V4,V1 + BNE CR6,different + + LXVB16X (R12)(R5),V3 // load bytes of A at offset 48 into vector + LXVB16X (R12)(R6),V4 // load bytes of B at offset 48 into vector + VCMPNEBCC V3,V4,V1 + BNE CR6,different + + ADD $-64,R9,R9 // reduce remaining size by 64 + ADD $64,R5,R5 // increment to next 64 bytes of A + ADD $64,R6,R6 // increment to next 64 bytes of B + CMPU R9,$64 + BGE loop64a // loop back to loop64a only if there are >= 64 bytes remaining + + CMPU R9,$32 + BGE cmp32 // 
loop to cmp32 if there are 32-63 bytes remaining + CMPU R9,$16 + BGE cmp16 // loop to cmp16 if there are 16-31 bytes left + CMPU R9,$0 + BNE simplecheck // loop to simplecheck for remaining bytes + + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if len(A)<len(B) + BR greater // jump to greater otherwise +cmp32: + LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector + LXVB16X (R0)(R6),V4 // load bytes of B at offset 0 into vector + + VCMPNEBCC V3,V4,V1 // record comparison into V1 + BNE CR6,different // jump out if it's different + + LXVB16X (R10)(R5),V3 // load bytes of A at offset 16 into vector + LXVB16X (R10)(R6),V4 // load bytes of B at offset 16 into vector + VCMPNEBCC V3,V4,V1 + BNE CR6,different + + ADD $-32,R9,R9 // reduce remaining size by 32 + ADD $32,R5,R5 // increment to next 32 bytes of A + ADD $32,R6,R6 // increment to next 32 bytes of B + CMPU R9,$16 // loop to cmp16 if there are 16-31 bytes left + BGE cmp16 + CMPU R9,$0 + BNE simplecheck // loop to simplecheck for remainder bytes + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if len(A)<len(B) + BR greater // jump to greater otherwise +different: + + MFVSRD VS35,R16 // move upper doublewords of A and B into GPR for comparison + MFVSRD VS36,R10 + + CMPU R16,R10 + BEQ lower BGT greater - BLT less -leftover: - ANDCC $7,R8,R9 // check for leftover bytes - MOVD R9,CTR // save the ctr - BNE simple // leftover bytes - BC 12,10,equal // test CR2 for length comparison - BC 12,8,less - BR greater + MOVD $-1,R3 // return value if A < B + RET +lower: + MFVSRLD VS35,R16 // next move lower doublewords of A and B into GPR for comparison + MFVSRLD VS36,R10 + + CMPU R16,R10 + BGT greater + MOVD $-1,R3 // return value if A < B + RET + +greater: + MOVD $1,R3 // return value if A > B + RET +cmp16: + ANDCC $16,R9,R31 + BEQ tail + + LXVB16X (R0)(R5),V3 // load bytes of A at offset 0 into vector + LXVB16X
(R0)(R6),V4 // load bytes of B at offset 0 into vector + VCMPEQUDCC V3,V4,V1 + BGE CR6,different + + ADD $16,R5 + ADD $16,R6 +tail: + ANDCC $15,R9 // Load the last 16 bytes (we know there are at least 16 bytes) + BEQ end + + ADD R9,R5 + ADD R9,R6 + MOVD $-16,R10 + + LXVB16X (R10)(R5),V3 // load bytes of A at offset -16 into vector + LXVB16X (R10)(R6),V4 // load bytes of B at offset -16 into vector + VCMPEQUDCC V3,V4,V1 + BGE CR6,different +end: + BEQ CR2,equal // remainder is zero, jump to equal if len(A)==len(B) + BLT CR2,less // jump to less if len(A)<len(B) + BR greater // jump to greater otherwise simplecheck: - CMP R8,$0 // remaining compare length 0 - BNE simple // do simple compare - BC 12,10,equal // test CR2 for length comparison - BC 12,8,less // 1st len < 2nd len, result less - BR greater // same len, must be equal -simple: - MOVBZ 0(R5),R9 // get byte from 1st operand - ADD $1,R5 - MOVBZ 0(R6),R10 // get byte from 2nd operand - ADD $1,R6 - CMPU R9,R10 - BC 8,2,simple // bc ctr <> 0 && cr - BGT greater // 1st > 2nd - BLT less // 1st < 2nd - BC 12,10,equal // test CR2 for length comparison - BC 12,9,greater // 2nd len > 1st len + MOVD $0,R14 // process 8 bytes + CMP R9,$8 + BLT word +#ifdef GOARCH_ppc64le + MOVDBR (R5+R14),R10 + MOVDBR (R6+R14),R11 +#else + MOVD (R5+R14),R10 + MOVD (R6+R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $8,R14 + ADD $-8,R9 + PCALIGN $16 +word: + CMP R9,$4 // process 4 bytes + BLT halfword +#ifdef GOARCH_ppc64le + MOVWBR (R5+R14),R10 + MOVWBR (R6+R14),R11 +#else + MOVWZ (R5+R14),R10 + MOVWZ (R6+R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $4,R14 + ADD $-4,R9 + PCALIGN $16 +halfword: + CMP R9,$2 // process 2 bytes + BLT byte +#ifdef GOARCH_ppc64le + MOVHBR (R5+R14),R10 + MOVHBR (R6+R14),R11 +#else + MOVHZ (R5+R14),R10 + MOVHZ (R6+R14),R11 +#endif + CMPU R10,R11 + BGT greater + BLT less + ADD $2,R14 + ADD $-2,R9 + PCALIGN $16 +byte: + CMP R9,$0 // process 1 byte + BEQ skip + MOVBZ
(R5+R14),R10 + MOVBZ (R6+R14),R11 + CMPU R10,R11 + BGT greater + BLT less + PCALIGN $16 +skip: + BEQ CR2,equal + BGT CR2,greater less: - MOVD $-1, R3 // return value if A < B + MOVD $-1,R3 // return value if A < B RET equal: MOVD $0, R3 // return value if A == B RET -greater: - MOVD $1, R3 // return value if A > B - RET |