1 files changed, 388 insertions, 164 deletions
diff --git a/src/internal/bytealg/compare_ppc64x.s b/src/internal/bytealg/compare_ppc64x.s
index fc6f170ca8..cbe0525af5 100644
--- a/src/internal/bytealg/compare_ppc64x.s
+++ b/src/internal/bytealg/compare_ppc64x.s
@@ -21,11 +21,12 @@ TEXT ·Compare<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-56
 	CMP     R5,R6,CR7
 	CMP	R3,R4,CR6
 	BEQ	CR7,equal
-#ifdef	GOARCH_ppc64le
-	BR	cmpbodyLE<>(SB)
-#else
-	BR      cmpbodyBE<>(SB)
-#endif
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	CMP	R16,$1
+	BNE	power8
+	BR	cmpbodyp9<>(SB)
+power8:
+	BR	cmpbody<>(SB)
 equal:
 	BEQ	CR6,done
 	MOVD	$1, R8
@@ -52,11 +53,12 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-40
 	CMP     R5,R6,CR7
 	CMP	R3,R4,CR6
 	BEQ	CR7,equal
-#ifdef	GOARCH_ppc64le
-	BR	cmpbodyLE<>(SB)
-#else
-	BR      cmpbodyBE<>(SB)
-#endif
+	MOVBZ	internal∕cpu·PPC64+const_offsetPPC64HasPOWER9(SB), R16
+	CMP	R16,$1
+	BNE	power8
+	BR	cmpbodyp9<>(SB)
+power8:
+	BR	cmpbody<>(SB)
 equal:
 	BEQ	CR6,done
 	MOVD	$1, R8
@@ -70,209 +72,431 @@ done:
 	MOVD	$0, R3
 	RET
 
-// Do an efficient memcmp for ppc64le
+#ifdef GOARCH_ppc64le
+DATA byteswap<>+0(SB)/8, $0x0706050403020100	// first 8 bytes of the 16-byte VPERM control used by cmpbody's different: path
+DATA byteswap<>+8(SB)/8, $0x0f0e0d0c0b0a0908	// last 8 bytes of the VPERM control
+GLOBL byteswap<>+0(SB), RODATA, $16
+#define SWAP V21
+#endif
+
+// Do an efficient memcmp for ppc64le/ppc64/POWER8
 // R3 = a len
 // R4 = b len
 // R5 = a addr
 // R6 = b addr
 // On exit:
 // R3 = return value
-TEXT cmpbodyLE<>(SB),NOSPLIT|NOFRAME,$0-0
+TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0-0
 	MOVD	R3,R8		// set up length
 	CMP	R3,R4,CR2	// unequal?
-	BC	12,8,setuplen	// BLT CR2
+	BLT	CR2,setuplen	// BLT CR2
 	MOVD	R4,R8		// use R4 for comparison len
 setuplen:
-	MOVD	R8,CTR		// set up loop counter
-	CMP	R8,$8		// only optimize >=8
-	BLT	simplecheck
-	DCBT	(R5)		// cache hint
-	DCBT	(R6)
 	CMP	R8,$32		// optimize >= 32
 	MOVD	R8,R9
-	BLT	setup8a		// 8 byte moves only
-setup32a:
-	SRADCC	$5,R8,R9	// number of 32 byte chunks
-	MOVD	R9,CTR
-
-	// Special processing for 32 bytes or longer.
-	// Loading this way is faster and correct as long as the
-	// doublewords being compared are equal. Once they
-	// are found unequal, reload them in proper byte order
-	// to determine greater or less than.
-loop32a:
-	MOVD	0(R5),R9	// doublewords to compare
-	MOVD	0(R6),R10	// get 4 doublewords
-	MOVD	8(R5),R14
-	MOVD	8(R6),R15
-	CMPU	R9,R10		// bytes equal?
-	MOVD	$0,R16		// set up for cmpne
-	BNE	cmpne		// further compare for LT or GT
-	MOVD	16(R5),R9	// get next pair of doublewords
-	MOVD	16(R6),R10
-	CMPU	R14,R15		// bytes match?
-	MOVD	$8,R16		// set up for cmpne
-	BNE	cmpne		// further compare for LT or GT
-	MOVD	24(R5),R14	// get next pair of doublewords
-	MOVD    24(R6),R15
-	CMPU	R9,R10		// bytes match?
-	MOVD	$16,R16		// set up for cmpne
-	BNE	cmpne		// further compare for LT or GT
-	MOVD	$-8,R16		// for cmpne, R5,R6 already inc by 32
-	ADD	$32,R5		// bump up to next 32
-	ADD	$32,R6
-	CMPU    R14,R15		// bytes match?
-	BC	8,2,loop32a	// br ctr and cr
-	BNE	cmpne
+	BLT	setup8a		// optimize < 32
+	MOVD	$16,R10		// set offsets to load into vectors
+	CMP	R8,$64
+	BLT	cmp32		// process size 32-63
+
+	DCBT	(R5)		// optimize >= 64
+	DCBT	(R6)		// cache hint
+	MOVD	$32,R11		// set offsets to load into vector
+	MOVD	$48,R12		// set offsets to load into vector
+
+loop64a:// compare 64 bytes per iteration while at least 64 bytes remain (R9 = remaining)
+	LXVD2X	(R5)(R0),V3	// load 16 bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load 16 bytes of B at offset 0 into vector
+	VCMPEQUDCC	V3,V4,V1	// doubleword equality compare, result recorded in CR6
+	BGE	CR6,different	// leave the loop if the two vectors are not identical
+
+	LXVD2X	(R5)(R10),V3	// load 16 bytes of A at offset 16 (R10) into vector
+	LXVD2X	(R6)(R10),V4	// load 16 bytes of B at offset 16 (R10) into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R11),V3	// load 16 bytes of A at offset 32 (R11) into vector
+	LXVD2X	(R6)(R11),V4	// load 16 bytes of B at offset 32 (R11) into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R12),V3	// load 16 bytes of A at offset 48 (R12) into vector
+	LXVD2X	(R6)(R12),V4	// load 16 bytes of B at offset 48 (R12) into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$-64,R9,R9	// reduce remaining size by 64
+	ADD	$64,R5,R5	// advance to next 64 bytes of A
+	ADD	$64,R6,R6	// advance to next 64 bytes of B
+	CMPU	R9,$64
+	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining
+	
+	CMPU	R9,$32
+	BGE	cmp32		// branch to cmp32 if there are 32-63 bytes remaining
+	CMPU	R9,$0
+	BNE	rem		// branch to rem if 1-31 bytes remain
+
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+cmp32:
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	LXVD2X	(R5)(R10),V3	// load bytes of A at offset 16 into vector
+	LXVD2X	(R6)(R10),V4	// load bytes of B at offset 16 into vector
+
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+
+	ADD	$-32,R9,R9	// reduce remaining size by 32
+	ADD	$32,R5,R5	// increment to next 32 bytes of A
+	ADD	$32,R6,R6	// increment to next 32 bytes of B
+	CMPU	R9,$0
+	BNE	rem		// loop to rem if the remainder is not 0
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+rem:
+	MOVD	R9,R8
 	ANDCC	$24,R8,R9	// Any 8 byte chunks?
 	BEQ	leftover	// and result is 0
+	BR	setup8a
+
+different:
+#ifdef	GOARCH_ppc64le
+	MOVD	$byteswap<>+00(SB), R16	// address of the permute control constant
+	LXVD2X	(R16)(R0),SWAP	// load the permute control into SWAP
+
+	VPERM	V3,V3,SWAP,V3	// rearrange the bytes of A with the SWAP control (little-endian only)
+	VPERM	V4,V4,SWAP,V4	// same rearrangement for B
+#endif
+	MFVSRD	VS35,R16	// move upper doublewords of A (VS35 is V3) and B (VS36 is V4) into GPRs for comparison
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10		// unsigned compare of the upper doublewords
+	BEQ	lower		// upper halves match, so the difference is in the lower halves
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+lower:
+	VSLDOI	$8,V3,V3,V3	// rotate A by 8 bytes so its lower doubleword is the one MFVSRD reads
+	MFVSRD	VS35,R16	// lower doubleword of A into GPR
+	VSLDOI	$8,V4,V4,V4	// rotate B the same way
+	MFVSRD	VS36,R10	// lower doubleword of B into GPR
+
+	CMPU	R16,R10		// cannot be equal: the vectors differ and the upper halves matched
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
 setup8a:
-	SRADCC	$3,R9,R9	// get the 8 byte count
+	SRADCC	$3,R8,R9	// get the 8 byte count
 	BEQ	leftover	// shifted value is 0
+	CMPU	R8,$8		// optimize 8byte move
+	BEQ	size8
+	CMPU	R8,$16
+	BEQ	size16
 	MOVD	R9,CTR		// loop count for doublewords
 loop8:
-	MOVDBR	(R5+R0),R9	// doublewords to compare
+#ifdef  GOARCH_ppc64le
+	MOVDBR	(R5+R0),R16	// doublewords to compare
 	MOVDBR	(R6+R0),R10	// LE compare order
+#else
+	MOVD	(R5+R0),R16	// doublewords to compare
+	MOVD	(R6+R0),R10	// BE compare order
+#endif
 	ADD	$8,R5
 	ADD	$8,R6
-	CMPU	R9,R10		// match?
+	CMPU	R16,R10		// match?
 	BC	8,2,loop8	// bt ctr <> 0 && cr
 	BGT	greater
 	BLT	less
 leftover:
 	ANDCC	$7,R8,R9	// check for leftover bytes
-	MOVD	R9,CTR		// save the ctr
-	BNE	simple		// leftover bytes
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,8,less
-	BR	greater
+	BEQ	zeroremainder
 simplecheck:
-	CMP	R8,$0		// remaining compare length 0
-	BNE	simple		// do simple compare
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,8,less	// 1st len < 2nd len, result less
-	BR	greater		// 1st len > 2nd len must be greater
-simple:
-	MOVBZ	0(R5), R9	// get byte from 1st operand
-	ADD	$1,R5
-	MOVBZ	0(R6), R10	// get byte from 2nd operand
-	ADD	$1,R6
-	CMPU	R9, R10
-	BC	8,2,simple	// bc ctr <> 0 && cr
-	BGT	greater		// 1st > 2nd
-	BLT	less		// 1st < 2nd
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,9,greater	// 2nd len > 1st len
-	BR	less		// must be less
-cmpne:				// only here is not equal
-	MOVDBR	(R5+R16),R8	// reload in reverse order
-	MOVDBR	(R6+R16),R9
-	CMPU	R8,R9		// compare correct endianness
-	BGT	greater		// here only if NE
-less:
-	MOVD	$-1, R3		// return value if A < B
+	MOVD	R0,R14
+	CMP	R9,$4		// process 4 bytes
+	BLT	halfword
+#ifdef  GOARCH_ppc64le
+	MOVWBR	(R5)(R14),R10
+	MOVWBR	(R6)(R14),R11
+#else
+	MOVWZ	(R5)(R14),R10
+	MOVWZ	(R6)(R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$-4,R9
+	ADD	$4,R14
+	PCALIGN	$16
+
+halfword:
+	CMP	R9,$2		// process 2 bytes
+	BLT	byte
+#ifdef  GOARCH_ppc64le
+	MOVHBR	(R5)(R14),R10
+	MOVHBR	(R6)(R14),R11
+#else
+	MOVHZ	(R5)(R14),R10
+	MOVHZ	(R6)(R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$-2,R9
+	ADD	$2,R14
+	PCALIGN	$16
+byte:
+	CMP	R9,$0		// process 1 byte
+	BEQ	skip
+	MOVBZ	(R5)(R14),R10
+	MOVBZ	(R6)(R14),R11
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	PCALIGN	$16
+skip:
+	BEQ	CR2,equal
+	BGT	CR2,greater
+
+less:	MOVD	$-1,R3		// return value if A < B
 	RET
+size16:
+	LXVD2X	(R5)(R0),V3	// load bytes of A at offset 0 into vector
+	LXVD2X	(R6)(R0),V4	// load bytes of B at offset 0 into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+zeroremainder:
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+size8:
+#ifdef  GOARCH_ppc64le
+	MOVDBR	(R5+R0),R16	// byte-reversed load of the single doubleword of A
+	MOVDBR	(R6+R0),R10	// byte-reversed load of B, LE compare order
+#else
+	MOVD	(R5+R0),R16	// load the single doubleword of A
+	MOVD	(R6+R0),R10	// load the doubleword of B, BE compare order
+#endif
+	CMPU	R16,R10		// match?
+	BGT	greater
+	BLT	less
+	BGT	CR2,greater	// contents equal: 1st len > 2nd len (CR2 holds CMP R3,R4)
+	BLT	CR2,less	// 1st len < 2nd len; otherwise fall through to equal
 equal:
 	MOVD	$0, R3		// return value if A == B
 	RET
 greater:
-	MOVD	$1, R3		// return value if A > B
+	MOVD	$1,R3		// return value if A > B
 	RET
 
-// Do an efficient memcmp for ppc64 (BE)
+// Do an efficient memcmp for ppc64le/ppc64/POWER9
 // R3 = a len
 // R4 = b len
 // R5 = a addr
 // R6 = b addr
 // On exit:
 // R3 = return value
-TEXT cmpbodyBE<>(SB),NOSPLIT|NOFRAME,$0-0
+TEXT cmpbodyp9<>(SB),NOSPLIT|NOFRAME,$0-0
 	MOVD	R3,R8		// set up length
 	CMP	R3,R4,CR2	// unequal?
-	BC	12,8,setuplen	// BLT CR2
+	BLT	CR2,setuplen	// BLT CR2
 	MOVD	R4,R8		// use R4 for comparison len
 setuplen:
-	MOVD	R8,CTR		// set up loop counter
-	CMP	R8,$8		// only optimize >=8
-	BLT	simplecheck
-	DCBT	(R5)		// cache hint
-	DCBT	(R6)
-	CMP	R8,$32		// optimize >= 32
+	CMP	R8,$16		// optimize for size<16
 	MOVD	R8,R9
-	BLT	setup8a		// 8 byte moves only
-
-setup32a:
-	SRADCC	$5,R8,R9	// number of 32 byte chunks
-	MOVD	R9,CTR
-loop32a:
-	MOVD	0(R5),R9	// doublewords to compare
-	MOVD	0(R6),R10	// get 4 doublewords
-	MOVD	8(R5),R14
-	MOVD	8(R6),R15
-	CMPU	R9,R10		// bytes equal?
-	BLT	less		// found to be less
-	BGT	greater		// found to be greater
-	MOVD	16(R5),R9	// get next pair of doublewords
-	MOVD	16(R6),R10
-	CMPU	R14,R15		// bytes match?
-	BLT	less		// found less
-	BGT	greater		// found greater
-	MOVD	24(R5),R14	// get next pair of doublewords
-	MOVD	24(R6),R15
-	CMPU	R9,R10		// bytes match?
-	BLT	less		// found to be less
-	BGT	greater		// found to be greater
-	ADD	$32,R5		// bump up to next 32
-	ADD	$32,R6
-	CMPU	R14,R15		// bytes match?
-	BC	8,2,loop32a	// br ctr and cr
-	BLT	less		// with BE, byte ordering is
-	BGT	greater		// good for compare
-	ANDCC	$24,R8,R9	// Any 8 byte chunks?
-	BEQ	leftover	// and result is 0
-setup8a:
-	SRADCC	$3,R9,R9	// get the 8 byte count
-	BEQ	leftover	// shifted value is 0
-	MOVD	R9,CTR		// loop count for doublewords
-loop8:
-	MOVD	(R5),R9
-	MOVD	(R6),R10
-	ADD	$8,R5
-	ADD	$8,R6
-	CMPU	R9,R10		// match?
-	BC	8,2,loop8	// bt ctr <> 0 && cr
+	BLT	simplecheck
+	MOVD	$16,R10		// set offsets to load into vectors
+	CMP	R8,$32		// optimize for size 16-31
+	BLT	cmp16
+	CMP	R8,$64
+	BLT	cmp32		// optimize for size 32-63
+	DCBT	(R5)		// optimize for size>=64
+	DCBT	(R6)		// cache hint
+
+	MOVD	$32,R11		// set offsets to load into vector
+	MOVD	$48,R12		// set offsets to load into vector
+
+loop64a:// compare 64 bytes per iteration while at least 64 bytes remain (R9 = remaining)
+	LXVB16X	(R0)(R5),V3	// load 16 bytes of A at offset 0 into vector
+	LXVB16X	(R0)(R6),V4	// load 16 bytes of B at offset 0 into vector
+	VCMPNEBCC	V3,V4,V1	// byte not-equal compare into V1, result also recorded in CR6
+	BNE	CR6,different	// leave the loop if the two vectors are not identical
+
+	LXVB16X	(R10)(R5),V3	// load 16 bytes of A at offset 16 (R10) into vector
+	LXVB16X	(R10)(R6),V4	// load 16 bytes of B at offset 16 (R10) into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	LXVB16X	(R11)(R5),V3	// load 16 bytes of A at offset 32 (R11) into vector
+	LXVB16X	(R11)(R6),V4	// load 16 bytes of B at offset 32 (R11) into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	LXVB16X	(R12)(R5),V3	// load 16 bytes of A at offset 48 (R12) into vector
+	LXVB16X	(R12)(R6),V4	// load 16 bytes of B at offset 48 (R12) into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	ADD	$-64,R9,R9	// reduce remaining size by 64
+	ADD	$64,R5,R5	// advance to next 64 bytes of A
+	ADD	$64,R6,R6	// advance to next 64 bytes of B
+	CMPU	R9,$64
+	BGE	loop64a		// loop back to loop64a only if there are >= 64 bytes remaining
+
+	CMPU	R9,$32
+	BGE	cmp32		// branch to cmp32 if there are 32-63 bytes remaining
+	CMPU	R9,$16
+	BGE	cmp16		// branch to cmp16 if there are 16-31 bytes left
+	CMPU	R9,$0
+	BNE	simplecheck	// branch to simplecheck if 1-15 bytes remain
+
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+cmp32:
+	LXVB16X	(R0)(R5),V3	// load bytes of A at offset 0 into vector
+	LXVB16X	(R0)(R6),V4	// load bytes of B at offset 0 into vector
+
+	VCMPNEBCC	V3,V4,V1	// record comparison into V1
+	BNE	CR6,different	// jump out if its different
+
+	LXVB16X	(R10)(R5),V3	// load bytes of A at offset 16 into vector
+	LXVB16X	(R10)(R6),V4	// load bytes of B at offset 16 into vector
+	VCMPNEBCC	V3,V4,V1
+	BNE	CR6,different
+
+	ADD	$-32,R9,R9	// reduce remaining size by 32
+	ADD	$32,R5,R5	// increment to next 32 bytes of A
+	ADD	$32,R6,R6	// increment to next 32 bytes of B
+	CMPU	R9,$16		// loop to cmp16 if there are 16-31 bytes left
+	BGE	cmp16
+	CMPU	R9,$0
+	BNE	simplecheck	// loop to simplecheck for remainder bytes
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
+different:
+
+	MFVSRD	VS35,R16	// move upper doublewords of A (VS35 is V3) and B (VS36 is V4) into GPRs for comparison
+	MFVSRD	VS36,R10
+
+	CMPU	R16,R10		// unsigned compare of the upper doublewords
+	BEQ	lower		// upper halves match, so the difference is in the lower halves
 	BGT	greater
-	BLT	less
-leftover:
-	ANDCC	$7,R8,R9	// check for leftover bytes
-	MOVD	R9,CTR		// save the ctr
-	BNE	simple		// leftover bytes
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,8,less
-	BR	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+lower:
+	MFVSRLD	VS35,R16	// next move lower doublewords of A and B into GPR for comparison
+	MFVSRLD	VS36,R10
+
+	CMPU	R16,R10
+	BGT	greater
+	MOVD	$-1,R3		// return value if A < B
+	RET
+
+greater:
+	MOVD	$1,R3		// return value if A > B
+	RET
+cmp16:
+	ANDCC	$16,R9,R31	// test the 16s bit of the remaining length (result in R31 is only used for the flags)
+	BEQ	tail		// bit clear: no 16-byte chunk to compare here
+
+	LXVB16X	(R0)(R5),V3	// load 16 bytes of A at offset 0 into vector
+	LXVB16X	(R0)(R6),V4	// load 16 bytes of B at offset 0 into vector
+	VCMPEQUDCC	V3,V4,V1	// doubleword equality compare, result recorded in CR6
+	BGE	CR6,different	// branch if the two vectors are not identical
+
+	ADD	$16,R5		// step past the 16 bytes just compared
+	ADD	$16,R6
+tail:
+	ANDCC	$15,R9		// R9 = leftover byte count (0-15)
+	BEQ	end		// nothing left over
+
+	ADD	R9,R5		// point R5 just past the last byte of A to compare
+	ADD	R9,R6		// point R6 just past the last byte of B to compare
+	MOVD	$-16,R10	// NOTE(review): reading 16 bytes back from the end assumes at least 16 bytes precede it
+
+	LXVB16X	(R10)(R5),V3	// load the final 16 bytes of A (offset -16) into vector
+	LXVB16X	(R10)(R6),V4	// load the final 16 bytes of B (offset -16) into vector
+	VCMPEQUDCC	V3,V4,V1
+	BGE	CR6,different
+end:
+	BEQ	CR2,equal	// remainder is zero, jump to equal if len(A)==len(B)
+	BLT	CR2,less	// jump to less if len(A)<len(B)
+	BR	greater		// jump to greater otherwise
 simplecheck:
-	CMP	R8,$0		// remaining compare length 0
-	BNE	simple		// do simple compare
-	BC	12,10,equal	// test CR2 for length comparison
-	BC 	12,8,less	// 1st len < 2nd len, result less
-	BR	greater		// same len, must be equal
-simple:
-	MOVBZ	0(R5),R9	// get byte from 1st operand
-	ADD	$1,R5
-	MOVBZ	0(R6),R10	// get byte from 2nd operand
-	ADD	$1,R6
-	CMPU	R9,R10
-	BC	8,2,simple	// bc ctr <> 0 && cr
-	BGT	greater		// 1st > 2nd
-	BLT	less		// 1st < 2nd
-	BC	12,10,equal	// test CR2 for length comparison
-	BC	12,9,greater	// 2nd len > 1st len
+	MOVD	$0,R14		// process 8 bytes
+	CMP	R9,$8
+	BLT	word
+#ifdef  GOARCH_ppc64le
+	MOVDBR	(R5+R14),R10
+	MOVDBR	(R6+R14),R11
+#else
+	MOVD	(R5+R14),R10
+	MOVD	(R6+R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$8,R14
+	ADD	$-8,R9
+	PCALIGN	$16
+word:
+	CMP	R9,$4		// process 4 bytes
+	BLT	halfword
+#ifdef  GOARCH_ppc64le
+	MOVWBR	(R5+R14),R10
+	MOVWBR	(R6+R14),R11
+#else
+	MOVWZ	(R5+R14),R10
+	MOVWZ	(R6+R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$4,R14
+	ADD	$-4,R9
+	PCALIGN	$16
+halfword:
+	CMP	R9,$2		// process 2 bytes
+	BLT	byte
+#ifdef  GOARCH_ppc64le
+	MOVHBR	(R5+R14),R10
+	MOVHBR	(R6+R14),R11
+#else
+	MOVHZ	(R5+R14),R10
+	MOVHZ	(R6+R14),R11
+#endif
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	ADD	$2,R14
+	ADD	$-2,R9
+	PCALIGN	$16
+byte:
+	CMP	R9,$0		// process 1 byte
+	BEQ	skip
+	MOVBZ	(R5+R14),R10
+	MOVBZ	(R6+R14),R11
+	CMPU	R10,R11
+	BGT	greater
+	BLT	less
+	PCALIGN	$16
+skip:
+	BEQ	CR2,equal
+	BGT	CR2,greater
 less:
-	MOVD	$-1, R3		// return value if A < B
+	MOVD	$-1,R3		// return value if A < B
 	RET
 equal:
 	MOVD	$0, R3		// return value if A == B
 	RET
-greater:
-	MOVD	$1, R3		// return value if A > B
-	RET