aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/memclr_amd64.s
diff options
context:
space:
mode:
author Russ Cox <rsc@golang.org> 2016-01-22 22:25:15 -0500
committer Russ Cox <rsc@golang.org> 2016-01-24 13:55:18 +0000
commit 8d881b811d8212ffd1d43e296f2a1c1bf78198ab (patch)
tree f1ccb58e27f036d1ac432d34340577332b964797 /src/runtime/memclr_amd64.s
parent 7f620a57d01ec4230a69c4ee96d3809cfd6febab (diff)
download go-8d881b811d8212ffd1d43e296f2a1c1bf78198ab.tar.gz
go-8d881b811d8212ffd1d43e296f2a1c1bf78198ab.zip
cmd/asm: correct, complete newly added AVX instructions
Use the standard names, for discoverability.
Use the standard register arguments, for correctness.
Implement all possible arguments, for completeness.
Enable the corresponding tests now that everything is standard.
Update the uses in package runtime.

Fixes #14068.

Change-Id: I8e1af9a41e7d02d98c2a82af3d4cdb3e9204824f
Reviewed-on: https://go-review.googlesource.com/18852
Run-TryBot: Russ Cox <rsc@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Rob Pike <r@golang.org>
Diffstat (limited to 'src/runtime/memclr_amd64.s')
-rw-r--r-- src/runtime/memclr_amd64.s 36
1 files changed, 18 insertions, 18 deletions
diff --git a/src/runtime/memclr_amd64.s b/src/runtime/memclr_amd64.s
index 5e78037df6..c257d59b30 100644
--- a/src/runtime/memclr_amd64.s
+++ b/src/runtime/memclr_amd64.s
@@ -65,40 +65,40 @@ loop:
JMP tail
loop_preheader_avx2:
- VPXOR X0, X0, X0
+ VPXOR Y0, Y0, Y0
// For smaller sizes MOVNTDQ may be faster or slower depending on hardware.
// For larger sizes it is always faster, even on dual Xeons with 30M cache.
// TODO take into account actual LLC size. E. g. glibc uses LLC size/2.
CMPQ BX, $0x2000000
JAE loop_preheader_avx2_huge
loop_avx2:
- MOVHDU X0, 0(DI)
- MOVHDU X0, 32(DI)
- MOVHDU X0, 64(DI)
- MOVHDU X0, 96(DI)
+ VMOVDQU Y0, 0(DI)
+ VMOVDQU Y0, 32(DI)
+ VMOVDQU Y0, 64(DI)
+ VMOVDQU Y0, 96(DI)
SUBQ $128, BX
ADDQ $128, DI
CMPQ BX, $128
JAE loop_avx2
- MOVHDU X0, -32(DI)(BX*1)
- MOVHDU X0, -64(DI)(BX*1)
- MOVHDU X0, -96(DI)(BX*1)
- MOVHDU X0, -128(DI)(BX*1)
+ VMOVDQU Y0, -32(DI)(BX*1)
+ VMOVDQU Y0, -64(DI)(BX*1)
+ VMOVDQU Y0, -96(DI)(BX*1)
+ VMOVDQU Y0, -128(DI)(BX*1)
VZEROUPPER
RET
loop_preheader_avx2_huge:
// Align to 32 byte boundary
- MOVHDU X0, 0(DI)
+ VMOVDQU Y0, 0(DI)
MOVQ DI, SI
ADDQ $32, DI
ANDQ $~31, DI
SUBQ DI, SI
ADDQ SI, BX
loop_avx2_huge:
- MOVNTHD X0, 0(DI)
- MOVNTHD X0, 32(DI)
- MOVNTHD X0, 64(DI)
- MOVNTHD X0, 96(DI)
+ VMOVNTDQ Y0, 0(DI)
+ VMOVNTDQ Y0, 32(DI)
+ VMOVNTDQ Y0, 64(DI)
+ VMOVNTDQ Y0, 96(DI)
SUBQ $128, BX
ADDQ $128, DI
CMPQ BX, $128
@@ -108,10 +108,10 @@ loop_avx2_huge:
// should be used in conjunction with MOVNTDQ instructions..."
// [1] 64-ia-32-architectures-software-developer-manual-325462.pdf
SFENCE
- MOVHDU X0, -32(DI)(BX*1)
- MOVHDU X0, -64(DI)(BX*1)
- MOVHDU X0, -96(DI)(BX*1)
- MOVHDU X0, -128(DI)(BX*1)
+ VMOVDQU Y0, -32(DI)(BX*1)
+ VMOVDQU Y0, -64(DI)(BX*1)
+ VMOVDQU Y0, -96(DI)(BX*1)
+ VMOVDQU Y0, -128(DI)(BX*1)
VZEROUPPER
RET