math/big: rewrite subVW to use fast path on s390x

This CL replaces the original subVW implementation with a implementation that uses a similar idea as CL 164968. When we know the borrow bit is zero, we can copy the rest of words as they will not be updated. Also, since we are copying vector of a words, a faster implementation of copy is written in this CL to copy a word or multiple words at a time. Benchmarks: name old time/op new time/op delta SubVW/1-18 4.43ns ± 0% 3.82ns ± 0% -13.85% (p=0.000 n=20+20) SubVW/2-18 5.39ns ± 0% 4.25ns ± 0% -21.23% (p=0.000 n=20+20) SubVW/3-18 6.29ns ± 0% 4.65ns ± 0% -26.07% (p=0.000 n=16+19) SubVW/4-18 6.08ns ± 2% 4.84ns ± 0% -20.43% (p=0.000 n=20+20) SubVW/5-18 7.06ns ± 1% 4.93ns ± 0% -30.18% (p=0.000 n=20+20) SubVW/10-18 10.3ns ± 2% 7.2ns ± 0% -30.35% (p=0.000 n=20+19) SubVW/100-18 48.0ns ± 4% 17.6ns ± 0% -63.32% (p=0.000 n=18+19) SubVW/1000-18 448ns ±10% 236ns ± 1% -47.24% (p=0.000 n=20+20) SubVW/10000-18 4.83µs ± 5% 2.96µs ± 0% -38.73% (p=0.000 n=20+19) SubVW/100000-18 46.6µs ± 3% 30.6µs ± 1% -34.30% (p=0.000 n=20+20) [Geo mean] 56.3ns 37.0ns -34.24% name old speed new speed delta SubVW/1-18 1.80GB/s ± 0% 2.10GB/s ± 0% +16.16% (p=0.000 n=20+20) SubVW/2-18 2.97GB/s ± 0% 3.77GB/s ± 0% +26.95% (p=0.000 n=20+20) SubVW/3-18 3.82GB/s ± 0% 5.16GB/s ± 0% +35.26% (p=0.000 n=20+19) SubVW/4-18 5.26GB/s ± 1% 6.61GB/s ± 0% +25.59% (p=0.000 n=20+20) SubVW/5-18 5.67GB/s ± 1% 8.11GB/s ± 0% +43.12% (p=0.000 n=20+20) SubVW/10-18 7.79GB/s ± 2% 11.17GB/s ± 0% +43.52% (p=0.000 n=20+19) SubVW/100-18 16.7GB/s ± 4% 45.5GB/s ± 0% +172.61% (p=0.000 n=18+20) SubVW/1000-18 17.9GB/s ± 9% 33.9GB/s ± 1% +89.25% (p=0.000 n=20+20) SubVW/10000-18 16.6GB/s ± 5% 27.0GB/s ± 0% +63.08% (p=0.000 n=20+19) SubVW/100000-18 17.2GB/s ± 2% 26.1GB/s ± 1% +52.18% (p=0.000 n=20+20) [Geo mean] 7.25GB/s 11.03GB/s +52.01% Change-Id: I32e99cbab3260054a96231d02b87049c833ab77e Reviewed-on: https://go-review.googlesource.com/c/go/+/227297 Reviewed-by: Michael Munday <mike.munday@ibm.com> Run-TryBot: Michael Munday <mike.munday@ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org>
author: Ruixin(Peter) Bao <ruixin.bao@ibm.com> 2020-03-24 11:51:22 -0400
committer: Michael Munday <mike.munday@ibm.com> 2020-04-24 14:50:59 +0000
commit: 3a37fd4010dc20403d642e2c628cb0656a4fb968 (patch)
tree: 76d46fe05ba837b781e8ec2b058b59598e482e3e /src/math
parent: 45f1ee3d5f94e775ea61015b157c5cb17a6966d9 (diff)
download: go-3a37fd4010dc20403d642e2c628cb0656a4fb968.tar.gz
go-3a37fd4010dc20403d642e2c628cb0656a4fb968.zip
3 files changed, 73 insertions, 210 deletions
diff --git a/src/math/big/arith_decl_s390x.go b/src/math/big/arith_decl_s390x.go
index 38fefcc7b0..5973d3cfc1 100644
--- a/src/math/big/arith_decl_s390x.go
+++ b/src/math/big/arith_decl_s390x.go
@@ -12,9 +12,6 @@ func addVV_novec(z, x, y []Word) (c Word)
 func subVV_check(z, x, y []Word) (c Word)
 func subVV_vec(z, x, y []Word) (c Word)
 func subVV_novec(z, x, y []Word) (c Word)
-func subVW_check(z, x []Word, y Word) (c Word)
-func subVW_vec(z, x []Word, y Word) (c Word)
-func subVW_novec(z, x []Word, y Word) (c Word)
 func hasVectorFacility() bool
 
 var hasVX = hasVectorFacility()
diff --git a/src/math/big/arith_s390x.s b/src/math/big/arith_s390x.s
index f0464df201..ef0192224f 100644
--- a/src/math/big/arith_s390x.s
+++ b/src/math/big/arith_s390x.s
@@ -631,220 +631,95 @@ returnC:
 	RET
 
 TEXT ·subVW(SB), NOSPLIT, $0
-	MOVD subwvectorfacility+0x00(SB), R1
-	BR   (R1)
-
-TEXT ·subVW_check(SB), NOSPLIT, $0
-	MOVB   ·hasVX(SB), R1
-	CMPBEQ R1, $1, vectorimpl               // vectorfacility = 1, vector supported
-	MOVD   $subwvectorfacility+0x00(SB), R1
-	MOVD   $·subVW_novec(SB), R2
-	MOVD   R2, 0(R1)
-
-	// MOVD	$·subVW_novec(SB), 0(R1)
-	BR ·subVW_novec(SB)
-
-vectorimpl:
-	MOVD $subwvectorfacility+0x00(SB), R1
-	MOVD $·subVW_vec(SB), R2
-	MOVD R2, 0(R1)
-
-	// MOVD	$·subVW_vec(SB), 0(R1)
-	BR ·subVW_vec(SB)
-
-GLOBL subwvectorfacility+0x00(SB), NOPTR, $8
-DATA subwvectorfacility+0x00(SB)/8, $·subVW_check(SB)
-
-// func subVW(z, x []Word, y Word) (c Word)
-TEXT ·subVW_vec(SB), NOSPLIT, $0
-	MOVD z_len+8(FP), R3
-	MOVD x+24(FP), R8
-	MOVD y+48(FP), R4    // c = y
-	MOVD z+0(FP), R2
-
-	MOVD $0, R0  // make sure it's zero
-	MOVD $0, R10 // i = 0
-	MOVD R8, R5
-	MOVD R2, R7
-
-	// s/JL/JMP/ below to disable the unrolled loop
-	SUB $4, R3  // n -= 4
-	BLT v11     // if n < 0 goto v11
-	SUB $12, R3
-	BLT A11
-
-	VZERO V0
-	MOVD  $1, R6     // prepare V0 to be final carry register
-	VLVGG $1, R6, V0 // borrow is initially "no borrow"
-	VZERO V9         // to ensure upper half is zero
-	VLVGG $1, R4, V9
-
-	// n >= 0
-	// regular loop body unrolled 16x
-
-UU1:
-	VLM  0(R5), V1, V4    // 64-bytes into V1..V4
-	ADD  $64, R5
-	VPDI $0x4, V1, V1, V1 // flip the doublewords to big-endian order
-	VPDI $0x4, V2, V2, V2 // flip the doublewords to big-endian order
-
-	VSBCBIQ V1, V9, V0, V25
-	VSBIQ   V1, V9, V0, V17
-	VZERO   V9
-	VSBCBIQ V2, V9, V25, V26
-	VSBIQ   V2, V9, V25, V18
-
-	VLM 0(R5), V5, V6 // 32-bytes into V5..V6
-	ADD $32, R5
-
-	VPDI $0x4, V3, V3, V3 // flip the doublewords to big-endian order
-	VPDI $0x4, V4, V4, V4 // flip the doublewords to big-endian order
-
-	VSBCBIQ V3, V9, V26, V27
-	VSBIQ   V3, V9, V26, V19
-	VSBCBIQ V4, V9, V27, V28
-	VSBIQ   V4, V9, V27, V20
-
-	VLM 0(R5), V7, V8 // 32-bytes into V7..V8
-	ADD $32, R5
-
-	VPDI $0x4, V5, V5, V5 // flip the doublewords to big-endian order
-	VPDI $0x4, V6, V6, V6 // flip the doublewords to big-endian order
-
-	VSBCBIQ V5, V9, V28, V29
-	VSBIQ   V5, V9, V28, V21
-	VSBCBIQ V6, V9, V29, V30
-	VSBIQ   V6, V9, V29, V22
-
-	VPDI $0x4, V7, V7, V7 // flip the doublewords to big-endian order
-	VPDI $0x4, V8, V8, V8 // flip the doublewords to big-endian order
-
-	VSBCBIQ V7, V9, V30, V31
-	VSBIQ   V7, V9, V30, V23
-	VSBCBIQ V8, V9, V31, V0  // V0 has carry-over
-	VSBIQ   V8, V9, V31, V24
-
-	VPDI  $0x4, V17, V17, V17 // flip the doublewords to big-endian order
-	VPDI  $0x4, V18, V18, V18 // flip the doublewords to big-endian order
-	VPDI  $0x4, V19, V19, V19 // flip the doublewords to big-endian order
-	VPDI  $0x4, V20, V20, V20 // flip the doublewords to big-endian order
-	VPDI  $0x4, V21, V21, V21 // flip the doublewords to big-endian order
-	VPDI  $0x4, V22, V22, V22 // flip the doublewords to big-endian order
-	VPDI  $0x4, V23, V23, V23 // flip the doublewords to big-endian order
-	VPDI  $0x4, V24, V24, V24 // flip the doublewords to big-endian order
-	VSTM  V17, V24, 0(R7)     // 128-bytes into z
-	ADD   $128, R7
-	ADD   $128, R10           // i += 16
-	SUB   $16, R3             // n -= 16
-	BGE   UU1                 // if n >= 0 goto U1
-	VLGVG $1, V0, R4          // put cf into R4 in case we branch to v10
-	SUB   $1, R4              // save cf
-	NEG   R4, R4
-
-A11:
-	ADD $12, R3 // n += 16
-
-	BLT v11 // if n < 0 goto v11
-
-	// n >= 0
-	// regular loop body unrolled 4x
-
-U4:  // n >= 0
-	// regular loop body unrolled 4x
-	MOVD 0(R8)(R10*1), R5
-	MOVD 8(R8)(R10*1), R6
-	MOVD 16(R8)(R10*1), R7
-	MOVD 24(R8)(R10*1), R1
-	SUBC R4, R5            // SLGR  -> SUBC
-	SUBE R0, R6            // SLBGR -> SUBE
-	SUBE R0, R7
-	SUBE R0, R1
-	SUBE R4, R4            // save CF
-	NEG  R4, R4
-	MOVD R5, 0(R2)(R10*1)
-	MOVD R6, 8(R2)(R10*1)
-	MOVD R7, 16(R2)(R10*1)
-	MOVD R1, 24(R2)(R10*1)
+	MOVD z_len+8(FP), R5
+	MOVD x+24(FP), R6
+	MOVD y+48(FP), R7    // The borrow bit passed in
+	MOVD z+0(FP), R8
+	MOVD $0, R0          // R0 is a temporary variable used during computation. Ensure it has zero in it.
 
-	ADD $32, R10 // i += 4 -> i +=32
-	SUB $4, R3   // n -= 4
-	BGE U4       // if n >= 0 goto U4
+	CMPBEQ R5, $0, returnC // len(z) == 0, have an early return
 
-v11:
-	ADD $4, R3 // n += 4
-	BLE E11    // if n <= 0 goto E4
+	// Subtract the first two words, and determine which path (copy path or loop path) to take based on the borrow flag
+	MOVD   0(R6), R9
+	SUBC   R7, R9
+	MOVD   R9, 0(R8)
+	CMPBEQ R5, $1, returnResult
+	MOVD   8(R6), R9
+	SUBE   R0, R9
+	MOVD   R9, 8(R8)
+	CMPBEQ R5, $2, returnResult
 
-L4:  // n > 0
+	// Update the counters
+	MOVD $16, R12    // i = 2
+	MOVD $-2(R5), R5 // n = n - 2
 
-	MOVD 0(R8)(R10*1), R5
-	SUBC R4, R5
-	SUBE R4, R4           // save CF
-	NEG  R4, R4
-	MOVD R5, 0(R2)(R10*1)
+loopOverEachWord:
+	BRC  $3, copySetup    // no borrow, copy the rest
+	MOVD 0(R6)(R12*1), R9
 
-	ADD $8, R10 // i++
-	SUB $1, R3  // n--
-	BGT L4      // if n > 0 goto L4
+	// Originally we used the borrow flag generated in the previous iteration
+	// (i.e: SUBE could be used here to do the subtraction). However, since we
+	// already know borrow is 1 (otherwise we will go to copy section), we can
+	// use SUBC here so the current iteration does not depend on the borrow flag
+	// generated in the previous iteration. This could be useful when branch prediction happens.
+	SUBC $1, R9
+	MOVD R9, 0(R8)(R12*1) // z[i] = x[i] - 1
 
-E11:
-	MOVD R4, c+56(FP) // return c
+	MOVD  $8(R12), R12         // i++
+	BRCTG R5, loopOverEachWord // n--
 
+// return the current borrow value
+returnResult:
+	SUBE R0, R0
+	NEG  R0, R0
+	MOVD R0, c+56(FP)
 	RET
 
-// DI = R3, CX = R4, SI = r10, r8 = r8, r10 = r2 , r11 = r5, r12 = r6, r13 = r7, r14 = r1 (R0 set to 0)
-// func subVW(z, x []Word, y Word) (c Word)
-// (same as addVW except for SUBC/SUBE instead of ADDC/ADDE and label names)
-TEXT ·subVW_novec(SB), NOSPLIT, $0
-	MOVD z_len+8(FP), R3
-	MOVD x+24(FP), R8
-	MOVD y+48(FP), R4    // c = y
-	MOVD z+0(FP), R2
-	MOVD $0, R0          // make sure it's 0
-	MOVD $0, R10         // i = 0
-
-	// s/JL/JMP/ below to disable the unrolled loop
-	SUB $4, R3 // n -= 4
-	BLT v4     // if n < 4 goto v4
+// Update position of x(R6) and z(R8) based on the current counter value and perform copying.
+// With the assumption that x and z will not overlap with each other or x and z will
+// point to same memory region, we can use a faster version of copy using only MVC here.
+// In the following implementation, we have three copy loops, each copying a word, 4 words, and
+// 32 words at a time. Via benchmarking, this implementation is faster than calling runtime·memmove.
+copySetup:
+	ADD R12, R6
+	ADD R12, R8
 
-U4:  // n >= 0
-	// regular loop body unrolled 4x
-	MOVD 0(R8)(R10*1), R5
-	MOVD 8(R8)(R10*1), R6
-	MOVD 16(R8)(R10*1), R7
-	MOVD 24(R8)(R10*1), R1
-	SUBC R4, R5            // SLGR  -> SUBC
-	SUBE R0, R6            // SLBGR -> SUBE
-	SUBE R0, R7
-	SUBE R0, R1
-	SUBE R4, R4            // save CF
-	NEG  R4, R4
-	MOVD R5, 0(R2)(R10*1)
-	MOVD R6, 8(R2)(R10*1)
-	MOVD R7, 16(R2)(R10*1)
-	MOVD R1, 24(R2)(R10*1)
+	CMPBGE R5, $4, mediumLoop
 
-	ADD $32, R10 // i += 4 -> i +=32
-	SUB $4, R3   // n -= 4
-	BGE U4       // if n >= 0 goto U4
+smallLoop:  // does a loop unrolling to copy word when n < 4
+	CMPBEQ R5, $0, returnZero
+	MVC    $8, 0(R6), 0(R8)
+	CMPBEQ R5, $1, returnZero
+	MVC    $8, 8(R6), 8(R8)
+	CMPBEQ R5, $2, returnZero
+	MVC    $8, 16(R6), 16(R8)
 
-v4:
-	ADD $4, R3 // n += 4
-	BLE E4     // if n <= 0 goto E4
+returnZero:
+	MOVD $0, c+56(FP) // return 0 as borrow
+	RET
 
-L4:  // n > 0
-	MOVD 0(R8)(R10*1), R5
-	SUBC R4, R5
-	SUBE R4, R4           // save CF
-	NEG  R4, R4
-	MOVD R5, 0(R2)(R10*1)
+mediumLoop:
+	CMPBLT R5, $4, smallLoop
+	CMPBLT R5, $32, mediumLoopBody
 
-	ADD $8, R10 // i++
-	SUB $1, R3  // n--
-	BGT L4      // if n > 0 goto L4
+largeLoop:  // Copying 256 bytes at a time
+	MVC    $256, 0(R6), 0(R8)
+	MOVD   $256(R6), R6
+	MOVD   $256(R8), R8
+	MOVD   $-32(R5), R5
+	CMPBGE R5, $32, largeLoop
+	BR     mediumLoop
 
-E4:
-	MOVD R4, c+56(FP) // return c
+mediumLoopBody:  // Copying 32 bytes at a time
+	MVC    $32, 0(R6), 0(R8)
+	MOVD   $32(R6), R6
+	MOVD   $32(R8), R8
+	MOVD   $-4(R5), R5
+	CMPBGE R5, $4, mediumLoopBody
+	BR     smallLoop
 
+returnC:
+	MOVD R7, c+56(FP)
 	RET
 
 // func shlVU(z, x []Word, s uint) (c Word)
diff --git a/src/math/big/arith_s390x_test.go b/src/math/big/arith_s390x_test.go
index e7f1b57ea5..ce6bca8885 100644
--- a/src/math/big/arith_s390x_test.go
+++ b/src/math/big/arith_s390x_test.go
@@ -30,12 +30,3 @@ func TestFunVVnovec(t *testing.T) {
 		}
 	}
 }
-
-func TestFunVWnovec(t *testing.T) {
-	if hasVX == true {
-		for _, a := range sumVW {
-			arg := argVW{a.x, a.z, a.y, a.c}
-			testFunVW(t, "subVW_novec", subVW_novec, arg)
-		}
-	}
-}
author	Ruixin(Peter) Bao <ruixin.bao@ibm.com>	2020-03-24 11:51:22 -0400
committer	Michael Munday <mike.munday@ibm.com>	2020-04-24 14:50:59 +0000
commit	3a37fd4010dc20403d642e2c628cb0656a4fb968 (patch)
tree	76d46fe05ba837b781e8ec2b058b59598e482e3e /src/math
parent	45f1ee3d5f94e775ea61015b157c5cb17a6966d9 (diff)
download	go-3a37fd4010dc20403d642e2c628cb0656a4fb968.tar.gz go-3a37fd4010dc20403d642e2c628cb0656a4fb968.zip