aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/memmove_ppc64x.s
diff options
context:
space:
mode:
authorLynn Boger <laboger@linux.vnet.ibm.com>2018-10-05 14:21:39 -0400
committerLynn Boger <laboger@linux.vnet.ibm.com>2018-11-06 14:54:59 +0000
commitaa9bcea3907a74f45303b3bdb603b9952cc72b7b (patch)
tree55dbad3c975d51993c099d374c29f88230d301e3 /src/runtime/memmove_ppc64x.s
parente1978a2d7a6deac29aa778a17a1cbea25586abc6 (diff)
downloadgo-aa9bcea3907a74f45303b3bdb603b9952cc72b7b.tar.gz
go-aa9bcea3907a74f45303b3bdb603b9952cc72b7b.zip
runtime: improve performance of memclr, memmove on ppc64x
This improves the asm implementations for memmove and memclr on ppc64x through use of vsx loads and stores when size is >= 32 bytes. For memclr, dcbz is used when the size is >= 512 and aligned to 128. Memclr/64 13.3ns ± 0% 10.7ns ± 0% -19.55% (p=0.000 n=8+7) Memclr/96 14.9ns ± 0% 11.4ns ± 0% -23.49% (p=0.000 n=8+8) Memclr/128 16.3ns ± 0% 12.3ns ± 0% -24.54% (p=0.000 n=8+8) Memclr/160 17.3ns ± 0% 13.0ns ± 0% -24.86% (p=0.000 n=8+8) Memclr/256 20.0ns ± 0% 15.3ns ± 0% -23.62% (p=0.000 n=8+8) Memclr/512 34.2ns ± 0% 10.2ns ± 0% -70.20% (p=0.000 n=8+8) Memclr/4096 178ns ± 0% 23ns ± 0% -87.13% (p=0.000 n=8+8) Memclr/65536 2.67µs ± 0% 0.30µs ± 0% -88.89% (p=0.000 n=7+8) Memclr/1M 43.2µs ± 0% 10.0µs ± 0% -76.85% (p=0.000 n=8+8) Memclr/4M 173µs ± 0% 40µs ± 0% -76.88% (p=0.000 n=8+8) Memclr/8M 349µs ± 0% 82µs ± 0% -76.58% (p=0.000 n=8+8) Memclr/16M 701µs ± 7% 672µs ± 0% -4.05% (p=0.040 n=8+7) Memclr/64M 2.70ms ± 0% 2.67ms ± 0% -0.96% (p=0.000 n=8+7) Memmove/32 6.59ns ± 0% 5.84ns ± 0% -11.34% (p=0.029 n=4+4) Memmove/64 7.91ns ± 0% 6.97ns ± 0% -11.92% (p=0.029 n=4+4) Memmove/128 10.5ns ± 0% 8.8ns ± 0% -16.24% (p=0.029 n=4+4) Memmove/256 21.0ns ± 0% 12.9ns ± 0% -38.57% (p=0.029 n=4+4) Memmove/512 28.4ns ± 0% 26.2ns ± 0% -7.75% (p=0.029 n=4+4) Memmove/1024 48.2ns ± 1% 39.4ns ± 0% -18.26% (p=0.029 n=4+4) Memmove/2048 85.4ns ± 0% 69.0ns ± 0% -19.20% (p=0.029 n=4+4) Memmove/4096 159ns ± 0% 128ns ± 0% -19.50% (p=0.029 n=4+4) Change-Id: I8c1adf88790845bf31444a15249456006eb5bf8b Reviewed-on: https://go-review.googlesource.com/c/141217 Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Michael Munday <mike.munday@ibm.com>
Diffstat (limited to 'src/runtime/memmove_ppc64x.s')
-rw-r--r--src/runtime/memmove_ppc64x.s51
1 files changed, 34 insertions, 17 deletions
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
index b79f76d388..60cbcc41ec 100644
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -16,7 +16,7 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
// copy so a more efficient move can be done
check:
ANDCC $7, R5, R7 // R7: bytes to copy
- SRAD $3, R5, R6 // R6: double words to copy
+ SRD $3, R5, R6 // R6: double words to copy
CMP R6, $0, CR1 // CR1[EQ] set if no double words to copy
// Determine overlap by subtracting dest - src and comparing against the
@@ -31,9 +31,9 @@ check:
// Copying forward if no overlap.
BC 12, 6, noforwardlarge // "BEQ CR1, noforwardlarge"
- MOVD R6,CTR // R6 = number of double words
- SRADCC $2,R6,R8 // 32 byte chunks?
+ SRDCC $2,R6,R8 // 32 byte chunks?
BNE forward32setup //
+ MOVD R6,CTR // R6 = number of double words
// Move double words
@@ -51,17 +51,14 @@ forward32setup:
DCBTST (R3) // prepare data cache
DCBT (R4)
MOVD R8, CTR // double work count
+ MOVD $16, R8
forward32:
- MOVD 0(R4), R8 // load 4 double words
- MOVD 8(R4), R9
- MOVD 16(R4), R14
- MOVD 24(R4), R15
- ADD $32,R4
- MOVD R8, 0(R3) // store those 4
- MOVD R9, 8(R3)
- MOVD R14,16(R3)
- MOVD R15,24(R3)
+ LXVD2X (R4+R0), VS32 // load 16 bytes
+ LXVD2X (R4+R8), VS33
+ ADD $32, R4
+ STXVD2X VS32, (R3+R0) // store 16 bytes
+ STXVD2X VS33, (R3+R8)
ADD $32,R3 // bump up for next set
BC 16, 0, forward32 // continue
RLDCLCC $61,R5,$3,R6 // remaining doublewords
@@ -71,7 +68,7 @@ forward32:
noforwardlarge:
CMP R7,$0 // any remaining bytes
- BC 4, 1, LR
+ BC 4, 1, LR // ble lr
forwardtail:
MOVD R7, CTR // move tail bytes
@@ -101,19 +98,39 @@ backwardtailloop:
SUB $1,R4
MOVBZ R8, -1(R3)
SUB $1,R3
- BC 16, 0, backwardtailloop
+ BC 16, 0, backwardtailloop // bdnz
nobackwardtail:
- CMP R6,$0
- BC 4, 5, LR
+ BC 4, 5, LR // ble CR1 lr
backwardlarge:
MOVD R6, CTR
+ SUB R3, R4, R9 // Use vsx if moving
+ CMP R9, $32 // at least 32 byte chunks
+ BLT backwardlargeloop // and distance >= 32
+ SRDCC $2,R6,R8 // 32 byte chunks
+ BNE backward32setup
backwardlargeloop:
MOVD -8(R4), R8
SUB $8,R4
MOVD R8, -8(R3)
SUB $8,R3
- BC 16, 0, backwardlargeloop //
+ BC 16, 0, backwardlargeloop // bdnz
RET
+
+backward32setup:
+ MOVD R8, CTR // CTR = number of 32-byte chunks (R6 >> 2)
+ MOVD $16, R8 // R8 = offset of the second 16-byte half; 32 bytes at a time
+ ANDCC $3, R6, R6 // R6 = doublewords left after the chunks; CR0[EQ] set if none
+backward32loop:
+ SUB $32, R4 // step both pointers back one chunk
+ SUB $32, R3
+ LXVD2X (R4+R0), VS32 // load 16 bytes
+ LXVD2X (R4+R8), VS33 // both loads are issued before either store
+ STXVD2X VS32, (R3+R0) // store 16 bytes
+ STXVD2X VS33, (R3+R8)
+ BC 16, 0, backward32loop // bdnz; does not alter CR0 set by the ANDCC above
+ BC 12, 2, LR // beq lr: nothing left once the chunks are done
+ MOVD R6, CTR // only the remaining doublewords, not the original total
+ BR backwardlargeloop