diff options
author | Lynn Boger <laboger@linux.vnet.ibm.com> | 2018-10-05 14:21:39 -0400 |
---|---|---|
committer | Lynn Boger <laboger@linux.vnet.ibm.com> | 2018-11-06 14:54:59 +0000 |
commit | aa9bcea3907a74f45303b3bdb603b9952cc72b7b (patch) | |
tree | 55dbad3c975d51993c099d374c29f88230d301e3 /src/runtime/memmove_ppc64x.s | |
parent | e1978a2d7a6deac29aa778a17a1cbea25586abc6 (diff) | |
download | go-aa9bcea3907a74f45303b3bdb603b9952cc72b7b.tar.gz go-aa9bcea3907a74f45303b3bdb603b9952cc72b7b.zip |
runtime: improve performance of memclr, memmove on ppc64x
This improves the asm implementations of memmove and memclr on
ppc64x by using VSX loads and stores when the size is >= 32 bytes.
For memclr, dcbz is used when the size is >= 512 bytes and aligned to 128 bytes.
name old time/op new time/op delta
Memclr/64 13.3ns ± 0% 10.7ns ± 0% -19.55% (p=0.000 n=8+7)
Memclr/96 14.9ns ± 0% 11.4ns ± 0% -23.49% (p=0.000 n=8+8)
Memclr/128 16.3ns ± 0% 12.3ns ± 0% -24.54% (p=0.000 n=8+8)
Memclr/160 17.3ns ± 0% 13.0ns ± 0% -24.86% (p=0.000 n=8+8)
Memclr/256 20.0ns ± 0% 15.3ns ± 0% -23.62% (p=0.000 n=8+8)
Memclr/512 34.2ns ± 0% 10.2ns ± 0% -70.20% (p=0.000 n=8+8)
Memclr/4096 178ns ± 0% 23ns ± 0% -87.13% (p=0.000 n=8+8)
Memclr/65536 2.67µs ± 0% 0.30µs ± 0% -88.89% (p=0.000 n=7+8)
Memclr/1M 43.2µs ± 0% 10.0µs ± 0% -76.85% (p=0.000 n=8+8)
Memclr/4M 173µs ± 0% 40µs ± 0% -76.88% (p=0.000 n=8+8)
Memclr/8M 349µs ± 0% 82µs ± 0% -76.58% (p=0.000 n=8+8)
Memclr/16M 701µs ± 7% 672µs ± 0% -4.05% (p=0.040 n=8+7)
Memclr/64M 2.70ms ± 0% 2.67ms ± 0% -0.96% (p=0.000 n=8+7)
Memmove/32 6.59ns ± 0% 5.84ns ± 0% -11.34% (p=0.029 n=4+4)
Memmove/64 7.91ns ± 0% 6.97ns ± 0% -11.92% (p=0.029 n=4+4)
Memmove/128 10.5ns ± 0% 8.8ns ± 0% -16.24% (p=0.029 n=4+4)
Memmove/256 21.0ns ± 0% 12.9ns ± 0% -38.57% (p=0.029 n=4+4)
Memmove/512 28.4ns ± 0% 26.2ns ± 0% -7.75% (p=0.029 n=4+4)
Memmove/1024 48.2ns ± 1% 39.4ns ± 0% -18.26% (p=0.029 n=4+4)
Memmove/2048 85.4ns ± 0% 69.0ns ± 0% -19.20% (p=0.029 n=4+4)
Memmove/4096 159ns ± 0% 128ns ± 0% -19.50% (p=0.029 n=4+4)
Change-Id: I8c1adf88790845bf31444a15249456006eb5bf8b
Reviewed-on: https://go-review.googlesource.com/c/141217
Run-TryBot: Lynn Boger <laboger@linux.vnet.ibm.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Michael Munday <mike.munday@ibm.com>
Diffstat (limited to 'src/runtime/memmove_ppc64x.s')
-rw-r--r-- | src/runtime/memmove_ppc64x.s | 51 |
1 files changed, 34 insertions, 17 deletions
```diff
diff --git a/src/runtime/memmove_ppc64x.s b/src/runtime/memmove_ppc64x.s
index b79f76d388..60cbcc41ec 100644
--- a/src/runtime/memmove_ppc64x.s
+++ b/src/runtime/memmove_ppc64x.s
@@ -16,7 +16,7 @@ TEXT runtime·memmove(SB), NOSPLIT|NOFRAME, $0-24
 	// copy so a more efficient move can be done
 check:
 	ANDCC	$7, R5, R7	// R7: bytes to copy
-	SRAD	$3, R5, R6	// R6: double words to copy
+	SRD	$3, R5, R6	// R6: double words to copy
 	CMP	R6, $0, CR1	// CR1[EQ] set if no double words to copy
 
 	// Determine overlap by subtracting dest - src and comparing against the
@@ -31,9 +31,9 @@ check:
 	// Copying forward if no overlap.
 
 	BC	12, 6, noforwardlarge	// "BEQ CR1, noforwardlarge"
-	MOVD	R6,CTR			// R6 = number of double words
-	SRADCC	$2,R6,R8		// 32 byte chunks?
+	SRDCC	$2,R6,R8		// 32 byte chunks?
 	BNE	forward32setup		//
+	MOVD	R6,CTR			// R6 = number of double words
 
 	// Move double words
 
@@ -51,17 +51,14 @@ forward32setup:
 	DCBTST	(R3)			// prepare data cache
 	DCBT	(R4)
 	MOVD	R8, CTR			// double work count
+	MOVD	$16, R8
 
 forward32:
-	MOVD	0(R4), R8		// load 4 double words
-	MOVD	8(R4), R9
-	MOVD	16(R4), R14
-	MOVD	24(R4), R15
-	ADD	$32,R4
-	MOVD	R8, 0(R3)		// store those 4
-	MOVD	R9, 8(R3)
-	MOVD	R14,16(R3)
-	MOVD	R15,24(R3)
+	LXVD2X	(R4+R0), VS32		// load 16 bytes
+	LXVD2X	(R4+R8), VS33
+	ADD	$32, R4
+	STXVD2X	VS32, (R3+R0)		// store 16 bytes
+	STXVD2X	VS33, (R3+R8)
 	ADD	$32,R3			// bump up for next set
 	BC	16, 0, forward32	// continue
 	RLDCLCC	$61,R5,$3,R6		// remaining doublewords
@@ -71,7 +68,7 @@ forward32:
 
 noforwardlarge:
 	CMP	R7,$0			// any remaining bytes
-	BC	4, 1, LR
+	BC	4, 1, LR		// ble lr
 
 forwardtail:
 	MOVD	R7, CTR			// move tail bytes
@@ -101,19 +98,39 @@ backwardtailloop:
 	SUB	$1,R4
 	MOVBZ	R8, -1(R3)
 	SUB	$1,R3
-	BC	16, 0, backwardtailloop
+	BC	16, 0, backwardtailloop	// bndz
 
 nobackwardtail:
-	CMP	R6,$0
-	BC	4, 5, LR
+	BC	4, 5, LR		// ble CR1 lr
 
 backwardlarge:
 	MOVD	R6, CTR
+	SUB	R3, R4, R9		// Use vsx if moving
+	CMP	R9, $32			// at least 32 byte chunks
+	BLT	backwardlargeloop	// and distance >= 32
+	SRDCC	$2,R6,R8		// 32 byte chunks
+	BNE	backward32setup
 
 backwardlargeloop:
 	MOVD	-8(R4), R8
 	SUB	$8,R4
 	MOVD	R8, -8(R3)
 	SUB	$8,R3
-	BC	16, 0, backwardlargeloop //
+	BC	16, 0, backwardlargeloop // bndz
 	RET
+
+backward32setup:
+	MOVD	R8, CTR			// set up loop ctr
+	MOVD	$16, R8			// 32 bytes at at time
+
+backward32loop:
+	SUB	$32, R4
+	SUB	$32, R3
+	LXVD2X	(R4+R0), VS32		// load 16 bytes
+	LXVD2X	(R4+R8), VS33
+	STXVD2X	VS32, (R3+R0)		// store 16 bytes
+	STXVD2X	VS33, (R3+R8)
+	BC	16, 0, backward32loop	// bndz
+	BC	4, 5, LR		// ble CR1 lr
+	MOVD	R6, CTR
+	BR	backwardlargeloop
```