diff options
author | Balaram Makam <bmakam.qdt@qualcommdatacenter.com> | 2017-12-05 17:51:10 -0500 |
---|---|---|
committer | Cherry Zhang <cherryyz@google.com> | 2018-03-14 18:20:40 +0000 |
commit | b46d398887e1b3f2ccf279f91007ecf78466a573 (patch) | |
tree | ededca927bf8d84ce98f2d647dc32c2b4fd6a2a8 /src/runtime/memclr_arm64.s | |
parent | e65d6a6abe1ea4731dcb86f02fd7a68e97ed97eb (diff) | |
download | go-b46d398887e1b3f2ccf279f91007ecf78466a573.tar.gz go-b46d398887e1b3f2ccf279f91007ecf78466a573.zip |
runtime: improve arm64 memclr implementation
Improve runtime memclr_arm64.s using ZVA feature to zero out memory when n
is at least 64 bytes.
Also add DCZID_EL0 system register to use in MRS instruction.
Benchmark results of runtime/Memclr on Amberwing:
name old time/op new time/op delta
Memclr/5 12.7ns ± 0% 12.7ns ± 0% ~ (all equal)
Memclr/16 12.7ns ± 0% 12.2ns ± 1% -4.13% (p=0.000 n=7+8)
Memclr/64 14.0ns ± 0% 14.6ns ± 1% +4.29% (p=0.000 n=7+8)
Memclr/256 23.7ns ± 0% 25.7ns ± 0% +8.44% (p=0.000 n=8+7)
Memclr/4096 204ns ± 0% 74ns ± 0% -63.71% (p=0.000 n=8+8)
Memclr/65536 2.89µs ± 0% 0.84µs ± 0% -70.91% (p=0.000 n=8+8)
Memclr/1M 45.9µs ± 0% 17.0µs ± 0% -62.88% (p=0.000 n=8+8)
Memclr/4M 184µs ± 0% 77µs ± 4% -57.94% (p=0.001 n=6+8)
Memclr/8M 367µs ± 0% 144µs ± 1% -60.72% (p=0.000 n=7+8)
Memclr/16M 734µs ± 0% 293µs ± 1% -60.09% (p=0.000 n=8+8)
Memclr/64M 2.94ms ± 0% 1.23ms ± 0% -58.06% (p=0.000 n=7+8)
GoMemclr/5 8.00ns ± 0% 8.79ns ± 0% +9.83% (p=0.000 n=8+8)
GoMemclr/16 8.00ns ± 0% 7.60ns ± 0% -5.00% (p=0.000 n=8+8)
GoMemclr/64 10.8ns ± 0% 10.4ns ± 0% -3.70% (p=0.000 n=8+8)
GoMemclr/256 20.4ns ± 0% 21.2ns ± 0% +3.92% (p=0.000 n=8+8)
name old speed new speed delta
Memclr/5 394MB/s ± 0% 393MB/s ± 0% -0.28% (p=0.006 n=8+8)
Memclr/16 1.26GB/s ± 0% 1.31GB/s ± 1% +4.07% (p=0.000 n=7+8)
Memclr/64 4.57GB/s ± 0% 4.39GB/s ± 2% -3.91% (p=0.000 n=7+8)
Memclr/256 10.8GB/s ± 0% 10.0GB/s ± 0% -7.95% (p=0.001 n=7+6)
Memclr/4096 20.1GB/s ± 0% 55.3GB/s ± 0% +175.46% (p=0.000 n=8+8)
Memclr/65536 22.6GB/s ± 0% 77.8GB/s ± 0% +243.63% (p=0.000 n=7+8)
Memclr/1M 22.8GB/s ± 0% 61.5GB/s ± 0% +169.38% (p=0.000 n=8+8)
Memclr/4M 22.8GB/s ± 0% 54.3GB/s ± 4% +137.85% (p=0.001 n=6+8)
Memclr/8M 22.8GB/s ± 0% 58.1GB/s ± 1% +154.56% (p=0.000 n=7+8)
Memclr/16M 22.8GB/s ± 0% 57.2GB/s ± 1% +150.54% (p=0.000 n=8+8)
Memclr/64M 22.8GB/s ± 0% 54.4GB/s ± 0% +138.42% (p=0.000 n=7+8)
GoMemclr/5 625MB/s ± 0% 569MB/s ± 0% -8.90% (p=0.000 n=7+8)
GoMemclr/16 2.00GB/s ± 0% 2.10GB/s ± 0% +5.26% (p=0.000 n=8+8)
GoMemclr/64 5.92GB/s ± 0% 6.15GB/s ± 0% +3.83% (p=0.000 n=7+8)
GoMemclr/256 12.5GB/s ± 0% 12.1GB/s ± 0% -3.77% (p=0.000 n=8+7)
Benchmark results of runtime/Memclr on Amberwing without ZVA:
name old time/op new time/op delta
Memclr/5 12.7ns ± 0% 12.8ns ± 0% +0.79% (p=0.008 n=5+5)
Memclr/16 12.7ns ± 0% 12.7ns ± 0% ~ (p=0.444 n=5+5)
Memclr/64 14.0ns ± 0% 14.4ns ± 0% +2.86% (p=0.008 n=5+5)
Memclr/256 23.7ns ± 1% 19.2ns ± 0% -19.06% (p=0.008 n=5+5)
Memclr/4096 203ns ± 0% 119ns ± 0% -41.38% (p=0.008 n=5+5)
Memclr/65536 2.89µs ± 0% 1.66µs ± 0% -42.76% (p=0.008 n=5+5)
Memclr/1M 45.9µs ± 0% 26.2µs ± 0% -42.82% (p=0.008 n=5+5)
Memclr/4M 184µs ± 0% 105µs ± 0% -42.81% (p=0.008 n=5+5)
Memclr/8M 367µs ± 0% 210µs ± 0% -42.76% (p=0.008 n=5+5)
Memclr/16M 734µs ± 0% 420µs ± 0% -42.74% (p=0.008 n=5+5)
Memclr/64M 2.94ms ± 0% 1.69ms ± 0% -42.46% (p=0.008 n=5+5)
GoMemclr/5 8.00ns ± 0% 8.40ns ± 0% +5.00% (p=0.008 n=5+5)
GoMemclr/16 8.00ns ± 0% 8.40ns ± 0% +5.00% (p=0.008 n=5+5)
GoMemclr/64 10.8ns ± 0% 9.6ns ± 0% -11.02% (p=0.008 n=5+5)
GoMemclr/256 20.4ns ± 0% 17.2ns ± 0% -15.69% (p=0.008 n=5+5)
name old speed new speed delta
Memclr/5 393MB/s ± 0% 391MB/s ± 0% -0.64% (p=0.008 n=5+5)
Memclr/16 1.26GB/s ± 0% 1.26GB/s ± 0% -0.55% (p=0.008 n=5+5)
Memclr/64 4.57GB/s ± 0% 4.44GB/s ± 0% -2.79% (p=0.008 n=5+5)
Memclr/256 10.8GB/s ± 0% 13.3GB/s ± 0% +23.07% (p=0.016 n=4+5)
Memclr/4096 20.1GB/s ± 0% 34.3GB/s ± 0% +70.91% (p=0.008 n=5+5)
Memclr/65536 22.7GB/s ± 0% 39.6GB/s ± 0% +74.65% (p=0.008 n=5+5)
Memclr/1M 22.8GB/s ± 0% 40.0GB/s ± 0% +74.88% (p=0.008 n=5+5)
Memclr/4M 22.8GB/s ± 0% 39.9GB/s ± 0% +74.84% (p=0.008 n=5+5)
Memclr/8M 22.9GB/s ± 0% 39.9GB/s ± 0% +74.71% (p=0.008 n=5+5)
Memclr/16M 22.9GB/s ± 0% 39.9GB/s ± 0% +74.64% (p=0.008 n=5+5)
Memclr/64M 22.8GB/s ± 0% 39.7GB/s ± 0% +73.79% (p=0.008 n=5+5)
GoMemclr/5 625MB/s ± 0% 595MB/s ± 0% -4.77% (p=0.000 n=4+5)
GoMemclr/16 2.00GB/s ± 0% 1.90GB/s ± 0% -4.77% (p=0.008 n=5+5)
GoMemclr/64 5.92GB/s ± 0% 6.66GB/s ± 0% +12.48% (p=0.016 n=4+5)
GoMemclr/256 12.5GB/s ± 0% 14.9GB/s ± 0% +18.95% (p=0.008 n=5+5)
Fixes #22948
Change-Id: Iaae4e22391e25b54d299821bb7f8a81ac3986b93
Reviewed-on: https://go-review.googlesource.com/82055
Run-TryBot: Brad Fitzpatrick <bradfitz@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
Diffstat (limited to 'src/runtime/memclr_arm64.s')
-rw-r--r-- | src/runtime/memclr_arm64.s | 199 |
1 files changed, 161 insertions, 38 deletions
diff --git a/src/runtime/memclr_arm64.s b/src/runtime/memclr_arm64.s index bf954e047f..159cac2486 100644 --- a/src/runtime/memclr_arm64.s +++ b/src/runtime/memclr_arm64.s @@ -8,52 +8,175 @@ TEXT runtime·memclrNoHeapPointers(SB),NOSPLIT,$0-16 MOVD ptr+0(FP), R0 MOVD n+8(FP), R1 - // If size is less than 16 bytes, use tail_zero to zero what remains + CMP $16, R1 - BLT tail_zero - // Get buffer offset into 16 byte aligned address for better performance - ANDS $15, R0, ZR - BNE unaligned_to_16 -aligned_to_16: - LSR $4, R1, R2 + // If n is equal to 16 bytes, use zero_exact_16 to zero + BEQ zero_exact_16 + + // If n is greater than 16 bytes, use zero_by_16 to zero + BHI zero_by_16 + + // n is less than 16 bytes + ADD R1, R0, R7 + TBZ $3, R1, less_than_8 + MOVD ZR, (R0) + MOVD ZR, -8(R7) + RET + +less_than_8: + TBZ $2, R1, less_than_4 + MOVW ZR, (R0) + MOVW ZR, -4(R7) + RET + +less_than_4: + CBZ R1, ending + MOVB ZR, (R0) + TBZ $1, R1, ending + MOVH ZR, -2(R7) + +ending: + RET + +zero_exact_16: + // n is exactly 16 bytes + STP (ZR, ZR), (R0) + RET + zero_by_16: - STP.P (ZR, ZR), 16(R0) - SUBS $1, R2, R2 - BNE zero_by_16 + // n greater than 16 bytes, check if the start address is aligned + NEG R0, R4 + ANDS $15, R4, R4 + // Try zeroing using zva if the start address is aligned with 16 + BEQ try_zva + + // Non-aligned store + STP (ZR, ZR), (R0) + // Make the destination aligned + SUB R4, R1, R1 + ADD R4, R0, R0 + B try_zva + +tail_maybe_long: + CMP $64, R1 + BHS no_zva +tail63: + ANDS $48, R1, R3 + BEQ last16 + CMPW $32, R3 + BEQ last48 + BLT last32 + STP.P (ZR, ZR), 16(R0) +last48: + STP.P (ZR, ZR), 16(R0) +last32: + STP.P (ZR, ZR), 16(R0) + // The last store length is at most 16, so it is safe to use + // stp to write last 16 bytes +last16: ANDS $15, R1, R1 - BEQ ending + CBZ R1, last_end + ADD R1, R0, R0 + STP (ZR, ZR), -16(R0) +last_end: + RET - // Zero buffer with size=R1 < 16 -tail_zero: - TBZ $3, R1, tail_zero_4 - MOVD.P ZR, 8(R0) +no_zva: + SUB $16, R0, R0 + SUB $64, R1, R1 -tail_zero_4: - TBZ $2, R1, tail_zero_2 - MOVW.P ZR, 4(R0) +loop_64: + STP (ZR, ZR), 16(R0) + STP (ZR, ZR), 32(R0) + STP (ZR, ZR), 48(R0) + STP.W (ZR, ZR), 64(R0) + SUBS $64, R1, R1 + BGE loop_64 + ANDS $63, R1, ZR + ADD $16, R0, R0 + BNE tail63 + RET -tail_zero_2: - TBZ $1, R1, tail_zero_1 - MOVH.P ZR, 2(R0) +try_zva: + // Try using the ZVA feature to zero entire cache lines + // It is not meaningful to use ZVA if the block size is less than 64, + // so make sure that n is greater than or equal to 64 + CMP $63, R1 + BLE tail63 -tail_zero_1: - TBZ $0, R1, ending - MOVB ZR, (R0) + CMP $128, R1 + // Ensure n is at least 128 bytes, so that there is enough to copy after + // alignment. + BLT no_zva + // Check if ZVA is allowed from user code, and if so get the block size + MOVW block_size<>(SB), R5 + TBNZ $31, R5, no_zva + CBNZ R5, zero_by_line + // DCZID_EL0 bit assignments + // [63:5] Reserved + // [4] DZP, if bit set DC ZVA instruction is prohibited, else permitted + // [3:0] log2 of the block size in words, eg. if it returns 0x4 then block size is 16 words + MRS DCZID_EL0, R3 + TBZ $4, R3, init + // ZVA not available + MOVW $~0, R5 + MOVW R5, block_size<>(SB) + B no_zva -ending: +init: + MOVW $4, R9 + ANDW $15, R3, R5 + LSLW R5, R9, R5 + MOVW R5, block_size<>(SB) + + ANDS $63, R5, R9 + // Block size is less than 64. + BNE no_zva + +zero_by_line: + CMP R5, R1 + // Not enough memory to reach alignment + BLO no_zva + SUB $1, R5, R6 + NEG R0, R4 + ANDS R6, R4, R4 + // Already aligned + BEQ aligned + + // check there is enough to copy after alignment + SUB R4, R1, R3 + + // Check that the remaining length to ZVA after alignment + // is greater than 64. + CMP $64, R3 + CCMP GE, R3, R5, $10 // condition code GE, NZCV=0b1010 + BLT no_zva + + // We now have at least 64 bytes to zero, update n + MOVD R3, R1 + +loop_zva_prolog: + STP (ZR, ZR), (R0) + STP (ZR, ZR), 16(R0) + STP (ZR, ZR), 32(R0) + SUBS $64, R4, R4 + STP (ZR, ZR), 48(R0) + ADD $64, R0, R0 + BGE loop_zva_prolog + + ADD R4, R0, R0 + +aligned: + SUB R5, R1, R1 + +loop_zva: + WORD $0xd50b7420 // DC ZVA, R0 + ADD R5, R0, R0 + SUBS R5, R1, R1 + BHS loop_zva + ANDS R6, R1, R1 + BNE tail_maybe_long RET -unaligned_to_16: - MOVD R0, R2 -head_loop: - MOVBU.P ZR, 1(R0) - ANDS $15, R0, ZR - BNE head_loop - // Adjust length for what remains - SUB R2, R0, R3 - SUB R3, R1 - // If size is less than 16 bytes, use tail_zero to zero what remains - CMP $16, R1 - BLT tail_zero - B aligned_to_16 +GLOBL block_size<>(SB), NOPTR, $8 |