author     Paul E. Murphy <murp@ibm.com>              2021-03-10 17:06:54 -0600
committer  Lynn Boger <laboger@linux.vnet.ibm.com>    2021-03-15 12:30:38 +0000
commit     4350e4961a6ea3d36a33271423735b37c96dd5bf (patch)
tree       6315d542cb50cdb176f4b73e098689fe9004d424 /src/crypto
parent     6ccb5c49cca52766f6d288d128db67be6392c579 (diff)
download   go-4350e4961a6ea3d36a33271423735b37c96dd5bf.tar.gz
           go-4350e4961a6ea3d36a33271423735b37c96dd5bf.zip
crypto/md5: improve ppc64x performance
This is mostly cleanup and simplification. It removes many unneeded
register moves, loads, and bit twiddlings that were holdovers from
porting this code from the amd64 version.

The updated code loads each block once per iteration instead of once
per round, and the logical operations now match the original MD5
specification. Extra sizes are also added to the benchmark to give
more data points on how the implementation scales with input size.

All in all, this is roughly a 20% improvement for ppc64le code running
on POWER9 (POWER8 is similar, but around 16%):

name                  old time/op    new time/op    delta
Hash8Bytes             297ns ± 0%     255ns ± 0%   -14.14%
Hash64                 527ns ± 0%     444ns ± 0%   -15.76%
Hash128                771ns ± 0%     645ns ± 0%   -16.35%
Hash256               1.26µs ± 0%    1.05µs ± 0%   -16.68%
Hash512               2.23µs ± 0%    1.85µs ± 0%   -16.82%
Hash1K                4.16µs ± 0%    3.46µs ± 0%   -16.83%
Hash8K                31.2µs ± 0%    26.0µs ± 0%   -16.74%
Hash1M                3.58ms ± 0%    2.98ms ± 0%   -16.74%
Hash8M                26.1ms ± 0%    21.7ms ± 0%   -16.81%
Hash8BytesUnaligned    297ns ± 0%     255ns ± 0%   -14.08%
Hash1KUnaligned       4.16µs ± 0%    3.46µs ± 0%   -16.79%
Hash8KUnaligned       31.2µs ± 0%    26.0µs ± 0%   -16.78%

name                  old speed      new speed      delta
Hash8Bytes          26.9MB/s ± 0%  31.4MB/s ± 0%   +16.45%
Hash64               122MB/s ± 0%   144MB/s ± 0%   +18.69%
Hash128              166MB/s ± 0%   199MB/s ± 0%   +19.54%
Hash256              203MB/s ± 0%   244MB/s ± 0%   +20.01%
Hash512              230MB/s ± 0%   276MB/s ± 0%   +20.18%
Hash1K               246MB/s ± 0%   296MB/s ± 0%   +20.26%
Hash8K               263MB/s ± 0%   315MB/s ± 0%   +20.11%
Hash1M               293MB/s ± 0%   352MB/s ± 0%   +20.10%
Hash8M               321MB/s ± 0%   386MB/s ± 0%   +20.21%
Hash8BytesUnaligned 26.9MB/s ± 0%  31.4MB/s ± 0%   +16.41%
Hash1KUnaligned      246MB/s ± 0%   296MB/s ± 0%   +20.19%
Hash8KUnaligned      263MB/s ± 0%   315MB/s ± 0%   +20.15%

Change-Id: I269bfa6878966bb4f6a64dc349100f5dc453ab7c
Reviewed-on: https://go-review.googlesource.com/c/go/+/300613
Run-TryBot: Paul Murphy <murp@ibm.com>
Reviewed-by: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Go Bot <gobot@golang.org>
Trust: Emmanuel Odeke <emmanuel@orijtech.com>
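For reference, the per-round mixing functions from RFC 1321 that the rewritten ROUND1-ROUND4 macros in the diff below now compute directly can be sketched in Go as follows. This is an illustrative sketch only; the package name md5ref and the function names F, G, H, I, step follow the RFC, not code in this change:

package md5ref // hypothetical package, for illustration only

import "math/bits"

// RFC 1321 round functions. The new macros compute these with plain
// boolean ops: F and G via AND/ANDN/OR, H via two XORs, I via ORN/XOR.
func F(b, c, d uint32) uint32 { return (b & c) | (^b & d) }
func G(b, c, d uint32) uint32 { return (b & d) | (c & ^d) }
func H(b, c, d uint32) uint32 { return b ^ c ^ d }
func I(b, c, d uint32) uint32 { return c ^ (b | ^d) }

// One MD5 step, matching the shape of each ROUND macro: add the message
// word and round constant, mix in f(b,c,d), rotate left, then add b.
func step(f func(b, c, d uint32) uint32, a, b, c, d, m, k uint32, s int) uint32 {
	return b + bits.RotateLeft32(a+m+k+f(b, c, d), s)
}

In the new macros the message word arrives preloaded in one of the M00-M15 registers rather than being reloaded with ENDIAN_MOVE inside every round, which is where most of the saved loads come from.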
Diffstat (limited to 'src/crypto')
-rw-r--r--   src/crypto/md5/md5_test.go        26
-rw-r--r--   src/crypto/md5/md5block_ppc64x.s  303
2 files changed, 182 insertions, 147 deletions
diff --git a/src/crypto/md5/md5_test.go b/src/crypto/md5/md5_test.go
index c0ac0971c4..acd456af21 100644
--- a/src/crypto/md5/md5_test.go
+++ b/src/crypto/md5/md5_test.go
@@ -212,7 +212,7 @@ func TestLargeHashes(t *testing.T) {
}
var bench = New()
-var buf = make([]byte, 8192+1)
+var buf = make([]byte, 1024*1024*8+1)
var sum = make([]byte, bench.Size())
func benchmarkSize(b *testing.B, size int, unaligned bool) {
@@ -235,6 +235,22 @@ func BenchmarkHash8Bytes(b *testing.B) {
benchmarkSize(b, 8, false)
}
+func BenchmarkHash64(b *testing.B) {
+ benchmarkSize(b, 64, false)
+}
+
+func BenchmarkHash128(b *testing.B) {
+ benchmarkSize(b, 128, false)
+}
+
+func BenchmarkHash256(b *testing.B) {
+ benchmarkSize(b, 256, false)
+}
+
+func BenchmarkHash512(b *testing.B) {
+ benchmarkSize(b, 512, false)
+}
+
func BenchmarkHash1K(b *testing.B) {
benchmarkSize(b, 1024, false)
}
@@ -243,6 +259,14 @@ func BenchmarkHash8K(b *testing.B) {
benchmarkSize(b, 8192, false)
}
+func BenchmarkHash1M(b *testing.B) {
+ benchmarkSize(b, 1024*1024, false)
+}
+
+func BenchmarkHash8M(b *testing.B) {
+ benchmarkSize(b, 8*1024*1024, false)
+}
+
func BenchmarkHash8BytesUnaligned(b *testing.B) {
benchmarkSize(b, 8, true)
}
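The new sizes feed the tables in the commit message: each size goes through benchmarkSize, which sets b.SetBytes, so `go test -bench` reports MB/s alongside ns/op. A minimal standalone sketch of that pattern (hypothetical file and names, not part of this change):

package md5bench_test // hypothetical, for illustration only

import (
	"crypto/md5"
	"testing"
)

// With SetBytes, the benchmark reports throughput (MB/s) in addition to
// time per operation, which is how both tables above are built.
func BenchmarkSum256Bytes(b *testing.B) {
	buf := make([]byte, 256)
	b.SetBytes(int64(len(buf)))
	for i := 0; i < b.N; i++ {
		md5.Sum(buf)
	}
}

Before/after comparisons in the old/new/delta form shown in the commit message are typically produced by running the benchmarks on both revisions and feeding the outputs to benchstat (golang.org/x/perf/cmd/benchstat).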
diff --git a/src/crypto/md5/md5block_ppc64x.s b/src/crypto/md5/md5block_ppc64x.s
index f309a1413d..e1f859e337 100644
--- a/src/crypto/md5/md5block_ppc64x.s
+++ b/src/crypto/md5/md5block_ppc64x.s
@@ -28,169 +28,179 @@
MOVWBR (idx)(ptr), dst
#endif
-TEXT ·block(SB),NOSPLIT,$0-32
- MOVD dig+0(FP), R10
- MOVD p+8(FP), R6
- MOVD p_len+16(FP), R5
- SLD $6, R5
- SRD $6, R5
- ADD R6, R5, R7
-
- MOVWZ 0(R10), R22
- MOVWZ 4(R10), R3
- MOVWZ 8(R10), R4
- MOVWZ 12(R10), R5
- CMP R6, R7
- BEQ end
-
-loop:
- MOVWZ R22, R14
- MOVWZ R3, R15
- MOVWZ R4, R16
- MOVWZ R5, R17
-
- ENDIAN_MOVE(0,R6,R8,R21)
- MOVWZ R5, R9
+#define M00 R18
+#define M01 R19
+#define M02 R20
+#define M03 R24
+#define M04 R25
+#define M05 R26
+#define M06 R27
+#define M07 R28
+#define M08 R29
+#define M09 R21
+#define M10 R11
+#define M11 R8
+#define M12 R7
+#define M13 R12
+#define M14 R23
+#define M15 R10
#define ROUND1(a, b, c, d, index, const, shift) \
- XOR c, R9; \
- ADD $const, a; \
- ADD R8, a; \
- AND b, R9; \
- XOR d, R9; \
- ENDIAN_MOVE(index*4,R6,R8,R21); \
+ ADD $const, index, R9; \
ADD R9, a; \
- RLWMI $shift, a, $0xffffffff, a; \
- MOVWZ c, R9; \
- ADD b, a; \
- MOVWZ a, a
-
- ROUND1(R22,R3,R4,R5, 1,0xd76aa478, 7);
- ROUND1(R5,R22,R3,R4, 2,0xe8c7b756,12);
- ROUND1(R4,R5,R22,R3, 3,0x242070db,17);
- ROUND1(R3,R4,R5,R22, 4,0xc1bdceee,22);
- ROUND1(R22,R3,R4,R5, 5,0xf57c0faf, 7);
- ROUND1(R5,R22,R3,R4, 6,0x4787c62a,12);
- ROUND1(R4,R5,R22,R3, 7,0xa8304613,17);
- ROUND1(R3,R4,R5,R22, 8,0xfd469501,22);
- ROUND1(R22,R3,R4,R5, 9,0x698098d8, 7);
- ROUND1(R5,R22,R3,R4,10,0x8b44f7af,12);
- ROUND1(R4,R5,R22,R3,11,0xffff5bb1,17);
- ROUND1(R3,R4,R5,R22,12,0x895cd7be,22);
- ROUND1(R22,R3,R4,R5,13,0x6b901122, 7);
- ROUND1(R5,R22,R3,R4,14,0xfd987193,12);
- ROUND1(R4,R5,R22,R3,15,0xa679438e,17);
- ROUND1(R3,R4,R5,R22, 0,0x49b40821,22);
-
- ENDIAN_MOVE(1*4,R6,R8,R21)
- MOVWZ R5, R9
- MOVWZ R5, R10
+ AND b, c, R9; \
+ ANDN b, d, R31; \
+ OR R9, R31, R9; \
+ ADD R9, a; \
+ ROTLW $shift, a; \
+ ADD b, a;
#define ROUND2(a, b, c, d, index, const, shift) \
- XOR $0xffffffff, R9; \ // NOTW R9
- ADD $const, a; \
- ADD R8, a; \
- AND b, R10; \
- AND c, R9; \
- ENDIAN_MOVE(index*4,R6,R8,R21); \
- OR R9, R10; \
- MOVWZ c, R9; \
- ADD R10, a; \
- MOVWZ c, R10; \
- RLWMI $shift, a, $0xffffffff, a; \
- ADD b, a; \
- MOVWZ a, a
-
- ROUND2(R22,R3,R4,R5, 6,0xf61e2562, 5);
- ROUND2(R5,R22,R3,R4,11,0xc040b340, 9);
- ROUND2(R4,R5,R22,R3, 0,0x265e5a51,14);
- ROUND2(R3,R4,R5,R22, 5,0xe9b6c7aa,20);
- ROUND2(R22,R3,R4,R5,10,0xd62f105d, 5);
- ROUND2(R5,R22,R3,R4,15, 0x2441453, 9);
- ROUND2(R4,R5,R22,R3, 4,0xd8a1e681,14);
- ROUND2(R3,R4,R5,R22, 9,0xe7d3fbc8,20);
- ROUND2(R22,R3,R4,R5,14,0x21e1cde6, 5);
- ROUND2(R5,R22,R3,R4, 3,0xc33707d6, 9);
- ROUND2(R4,R5,R22,R3, 8,0xf4d50d87,14);
- ROUND2(R3,R4,R5,R22,13,0x455a14ed,20);
- ROUND2(R22,R3,R4,R5, 2,0xa9e3e905, 5);
- ROUND2(R5,R22,R3,R4, 7,0xfcefa3f8, 9);
- ROUND2(R4,R5,R22,R3,12,0x676f02d9,14);
- ROUND2(R3,R4,R5,R22, 0,0x8d2a4c8a,20);
-
- ENDIAN_MOVE(5*4,R6,R8,R21)
- MOVWZ R4, R9
+ ADD $const, index, R9; \
+ ADD R9, a; \
+ AND b, d, R31; \
+ ANDN d, c, R9; \
+ OR R9, R31; \
+ ADD R31, a; \
+ ROTLW $shift, a; \
+ ADD b, a;
#define ROUND3(a, b, c, d, index, const, shift) \
- ADD $const, a; \
- ADD R8, a; \
- ENDIAN_MOVE(index*4,R6,R8,R21); \
- XOR d, R9; \
- XOR b, R9; \
+ ADD $const, index, R9; \
ADD R9, a; \
- RLWMI $shift, a, $0xffffffff, a; \
- MOVWZ b, R9; \
- ADD b, a; \
- MOVWZ a, a
-
- ROUND3(R22,R3,R4,R5, 8,0xfffa3942, 4);
- ROUND3(R5,R22,R3,R4,11,0x8771f681,11);
- ROUND3(R4,R5,R22,R3,14,0x6d9d6122,16);
- ROUND3(R3,R4,R5,R22, 1,0xfde5380c,23);
- ROUND3(R22,R3,R4,R5, 4,0xa4beea44, 4);
- ROUND3(R5,R22,R3,R4, 7,0x4bdecfa9,11);
- ROUND3(R4,R5,R22,R3,10,0xf6bb4b60,16);
- ROUND3(R3,R4,R5,R22,13,0xbebfbc70,23);
- ROUND3(R22,R3,R4,R5, 0,0x289b7ec6, 4);
- ROUND3(R5,R22,R3,R4, 3,0xeaa127fa,11);
- ROUND3(R4,R5,R22,R3, 6,0xd4ef3085,16);
- ROUND3(R3,R4,R5,R22, 9, 0x4881d05,23);
- ROUND3(R22,R3,R4,R5,12,0xd9d4d039, 4);
- ROUND3(R5,R22,R3,R4,15,0xe6db99e5,11);
- ROUND3(R4,R5,R22,R3, 2,0x1fa27cf8,16);
- ROUND3(R3,R4,R5,R22, 0,0xc4ac5665,23);
-
- ENDIAN_MOVE(0,R6,R8,R21)
- MOVWZ $0xffffffff, R9
- XOR R5, R9
+ XOR d, c, R31; \
+ XOR b, R31; \
+ ADD R31, a; \
+ ROTLW $shift, a; \
+ ADD b, a;
#define ROUND4(a, b, c, d, index, const, shift) \
- ADD $const, a; \
- ADD R8, a; \
- OR b, R9; \
- XOR c, R9; \
+ ADD $const, index, R9; \
ADD R9, a; \
- ENDIAN_MOVE(index*4,R6,R8,R21); \
- MOVWZ $0xffffffff, R9; \
- RLWMI $shift, a, $0xffffffff, a; \
- XOR c, R9; \
- ADD b, a; \
- MOVWZ a, a
-
- ROUND4(R22,R3,R4,R5, 7,0xf4292244, 6);
- ROUND4(R5,R22,R3,R4,14,0x432aff97,10);
- ROUND4(R4,R5,R22,R3, 5,0xab9423a7,15);
- ROUND4(R3,R4,R5,R22,12,0xfc93a039,21);
- ROUND4(R22,R3,R4,R5, 3,0x655b59c3, 6);
- ROUND4(R5,R22,R3,R4,10,0x8f0ccc92,10);
- ROUND4(R4,R5,R22,R3, 1,0xffeff47d,15);
- ROUND4(R3,R4,R5,R22, 8,0x85845dd1,21);
- ROUND4(R22,R3,R4,R5,15,0x6fa87e4f, 6);
- ROUND4(R5,R22,R3,R4, 6,0xfe2ce6e0,10);
- ROUND4(R4,R5,R22,R3,13,0xa3014314,15);
- ROUND4(R3,R4,R5,R22, 4,0x4e0811a1,21);
- ROUND4(R22,R3,R4,R5,11,0xf7537e82, 6);
- ROUND4(R5,R22,R3,R4, 2,0xbd3af235,10);
- ROUND4(R4,R5,R22,R3, 9,0x2ad7d2bb,15);
- ROUND4(R3,R4,R5,R22, 0,0xeb86d391,21);
+ ORN d, b, R31; \
+ XOR c, R31; \
+ ADD R31, a; \
+ ROTLW $shift, a; \
+ ADD b, a;
+
+
+TEXT ·block(SB),NOSPLIT,$0-32
+ MOVD dig+0(FP), R10
+ MOVD p+8(FP), R6
+ MOVD p_len+16(FP), R5
+
+ // We assume p_len >= 64
+ SRD $6, R5
+ MOVD R5, CTR
+
+ MOVWZ 0(R10), R22
+ MOVWZ 4(R10), R3
+ MOVWZ 8(R10), R4
+ MOVWZ 12(R10), R5
+
+loop:
+ MOVD R22, R14
+ MOVD R3, R15
+ MOVD R4, R16
+ MOVD R5, R17
+
+ ENDIAN_MOVE( 0,R6,M00,M15)
+ ENDIAN_MOVE( 4,R6,M01,M15)
+ ENDIAN_MOVE( 8,R6,M02,M15)
+ ENDIAN_MOVE(12,R6,M03,M15)
+
+ ROUND1(R22,R3,R4,R5,M00,0xd76aa478, 7);
+ ROUND1(R5,R22,R3,R4,M01,0xe8c7b756,12);
+ ROUND1(R4,R5,R22,R3,M02,0x242070db,17);
+ ROUND1(R3,R4,R5,R22,M03,0xc1bdceee,22);
+
+ ENDIAN_MOVE(16,R6,M04,M15)
+ ENDIAN_MOVE(20,R6,M05,M15)
+ ENDIAN_MOVE(24,R6,M06,M15)
+ ENDIAN_MOVE(28,R6,M07,M15)
+
+ ROUND1(R22,R3,R4,R5,M04,0xf57c0faf, 7);
+ ROUND1(R5,R22,R3,R4,M05,0x4787c62a,12);
+ ROUND1(R4,R5,R22,R3,M06,0xa8304613,17);
+ ROUND1(R3,R4,R5,R22,M07,0xfd469501,22);
+
+ ENDIAN_MOVE(32,R6,M08,M15)
+ ENDIAN_MOVE(36,R6,M09,M15)
+ ENDIAN_MOVE(40,R6,M10,M15)
+ ENDIAN_MOVE(44,R6,M11,M15)
+
+ ROUND1(R22,R3,R4,R5,M08,0x698098d8, 7);
+ ROUND1(R5,R22,R3,R4,M09,0x8b44f7af,12);
+ ROUND1(R4,R5,R22,R3,M10,0xffff5bb1,17);
+ ROUND1(R3,R4,R5,R22,M11,0x895cd7be,22);
+
+ ENDIAN_MOVE(48,R6,M12,M15)
+ ENDIAN_MOVE(52,R6,M13,M15)
+ ENDIAN_MOVE(56,R6,M14,M15)
+ ENDIAN_MOVE(60,R6,M15,M15)
+
+ ROUND1(R22,R3,R4,R5,M12,0x6b901122, 7);
+ ROUND1(R5,R22,R3,R4,M13,0xfd987193,12);
+ ROUND1(R4,R5,R22,R3,M14,0xa679438e,17);
+ ROUND1(R3,R4,R5,R22,M15,0x49b40821,22);
+
+ ROUND2(R22,R3,R4,R5,M01,0xf61e2562, 5);
+ ROUND2(R5,R22,R3,R4,M06,0xc040b340, 9);
+ ROUND2(R4,R5,R22,R3,M11,0x265e5a51,14);
+ ROUND2(R3,R4,R5,R22,M00,0xe9b6c7aa,20);
+ ROUND2(R22,R3,R4,R5,M05,0xd62f105d, 5);
+ ROUND2(R5,R22,R3,R4,M10, 0x2441453, 9);
+ ROUND2(R4,R5,R22,R3,M15,0xd8a1e681,14);
+ ROUND2(R3,R4,R5,R22,M04,0xe7d3fbc8,20);
+ ROUND2(R22,R3,R4,R5,M09,0x21e1cde6, 5);
+ ROUND2(R5,R22,R3,R4,M14,0xc33707d6, 9);
+ ROUND2(R4,R5,R22,R3,M03,0xf4d50d87,14);
+ ROUND2(R3,R4,R5,R22,M08,0x455a14ed,20);
+ ROUND2(R22,R3,R4,R5,M13,0xa9e3e905, 5);
+ ROUND2(R5,R22,R3,R4,M02,0xfcefa3f8, 9);
+ ROUND2(R4,R5,R22,R3,M07,0x676f02d9,14);
+ ROUND2(R3,R4,R5,R22,M12,0x8d2a4c8a,20);
+
+ ROUND3(R22,R3,R4,R5,M05,0xfffa3942, 4);
+ ROUND3(R5,R22,R3,R4,M08,0x8771f681,11);
+ ROUND3(R4,R5,R22,R3,M11,0x6d9d6122,16);
+ ROUND3(R3,R4,R5,R22,M14,0xfde5380c,23);
+ ROUND3(R22,R3,R4,R5,M01,0xa4beea44, 4);
+ ROUND3(R5,R22,R3,R4,M04,0x4bdecfa9,11);
+ ROUND3(R4,R5,R22,R3,M07,0xf6bb4b60,16);
+ ROUND3(R3,R4,R5,R22,M10,0xbebfbc70,23);
+ ROUND3(R22,R3,R4,R5,M13,0x289b7ec6, 4);
+ ROUND3(R5,R22,R3,R4,M00,0xeaa127fa,11);
+ ROUND3(R4,R5,R22,R3,M03,0xd4ef3085,16);
+ ROUND3(R3,R4,R5,R22,M06, 0x4881d05,23);
+ ROUND3(R22,R3,R4,R5,M09,0xd9d4d039, 4);
+ ROUND3(R5,R22,R3,R4,M12,0xe6db99e5,11);
+ ROUND3(R4,R5,R22,R3,M15,0x1fa27cf8,16);
+ ROUND3(R3,R4,R5,R22,M02,0xc4ac5665,23);
+
+ ROUND4(R22,R3,R4,R5,M00,0xf4292244, 6);
+ ROUND4(R5,R22,R3,R4,M07,0x432aff97,10);
+ ROUND4(R4,R5,R22,R3,M14,0xab9423a7,15);
+ ROUND4(R3,R4,R5,R22,M05,0xfc93a039,21);
+ ROUND4(R22,R3,R4,R5,M12,0x655b59c3, 6);
+ ROUND4(R5,R22,R3,R4,M03,0x8f0ccc92,10);
+ ROUND4(R4,R5,R22,R3,M10,0xffeff47d,15);
+ ROUND4(R3,R4,R5,R22,M01,0x85845dd1,21);
+ ROUND4(R22,R3,R4,R5,M08,0x6fa87e4f, 6);
+ ROUND4(R5,R22,R3,R4,M15,0xfe2ce6e0,10);
+ ROUND4(R4,R5,R22,R3,M06,0xa3014314,15);
+ ROUND4(R3,R4,R5,R22,M13,0x4e0811a1,21);
+ ROUND4(R22,R3,R4,R5,M04,0xf7537e82, 6);
+ ROUND4(R5,R22,R3,R4,M11,0xbd3af235,10);
+ ROUND4(R4,R5,R22,R3,M02,0x2ad7d2bb,15);
+ ROUND4(R3,R4,R5,R22,M09,0xeb86d391,21);
ADD R14, R22
ADD R15, R3
ADD R16, R4
ADD R17, R5
ADD $64, R6
- CMP R6, R7
- BLT loop
+ BC 16, 0, loop // bdnz
end:
MOVD dig+0(FP), R10
@@ -198,4 +208,5 @@ end:
MOVWZ R3, 4(R10)
MOVWZ R4, 8(R10)
MOVWZ R5, 12(R10)
+
RET
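
For context on the frame layout: TEXT ·block(SB),NOSPLIT,$0-32 declares no local frame and 32 bytes of arguments, matching the Go-side assembly stub declared elsewhere in this package, roughly:

//go:noescape
func block(dig *digest, p []byte)

dig+0(FP) is the *digest pointer (8 bytes), and p+8(FP)/p_len+16(FP) are the data pointer and length of the p slice header (24 bytes including the unused capacity word at 24(FP)), for 32 bytes of arguments in total.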