author		Joe Tsai <joetsai@digital-static.net>	2016-08-31 20:44:42 +0000
committer	Joe Tsai <thebrokentoaster@gmail.com>	2016-08-31 21:07:35 +0000
commit		6fb4b15f98bba7ef3966c5edc6b8fe2cc99c6beb (patch)
tree		113c3762033fd7957b36cfc6b876a959ae3e925c /src/runtime/memmove_amd64.s
parent		cc0248aea53b252ec5c0e1c57e32edb102bc36fe (diff)
Revert "runtime: improve memmove for amd64"
This reverts commit 3607c5f4f18ad4d423e40996ebf7f46b2f79ce02.
This was causing failures on amd64 machines without AVX.
Fixes #16939
Change-Id: I70080fbb4e7ae791857334f2bffd847d08cb25fa
Reviewed-on: https://go-review.googlesource.com/28274
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
Reviewed-by: Keith Randall <khr@golang.org>
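For context on the failure mode: executing AVX instructions such as VMOVDQU/VMOVNTDQ on a CPU that does not report AVX support faults, so an AVX fast path has to sit behind a runtime feature check. The sketch below is a hypothetical, simplified illustration of that kind of gate in ordinary Go using golang.org/x/sys/cpu; the runtime itself dispatches differently (the reverted assembly consulted runtime·useRepMovs), and copyAVX/copyGeneric are placeholder names, not real runtime functions.

// avxgate_example.go — hypothetical sketch of gating an optimized copy path
// behind a CPU feature check; not the runtime's actual dispatch logic.
package main

import (
	"fmt"

	"golang.org/x/sys/cpu"
)

// copyAVX and copyGeneric are placeholders standing in for an AVX-optimized
// copy routine and a portable fallback.
func copyAVX(dst, src []byte) int     { return copy(dst, src) }
func copyGeneric(dst, src []byte) int { return copy(dst, src) }

func memmoveLike(dst, src []byte) int {
	// Only take the AVX path when the CPU actually reports AVX support;
	// otherwise executing AVX instructions would fault.
	if cpu.X86.HasAVX {
		return copyAVX(dst, src)
	}
	return copyGeneric(dst, src)
}

func main() {
	src := []byte("example payload")
	dst := make([]byte, len(src))
	n := memmoveLike(dst, src)
	fmt.Printf("copied %d bytes, AVX available: %v\n", n, cpu.X86.HasAVX)
}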
Diffstat (limited to 'src/runtime/memmove_amd64.s')
-rw-r--r--	src/runtime/memmove_amd64.s	243
1 file changed, 1 insertion(+), 242 deletions(-)
diff --git a/src/runtime/memmove_amd64.s b/src/runtime/memmove_amd64.s
index ffcc6613cf..5d23ce3e6c 100644
--- a/src/runtime/memmove_amd64.s
+++ b/src/runtime/memmove_amd64.s
@@ -64,9 +64,6 @@ tail:
 	JBE	move_129through256
 	// TODO: use branch table and BSR to make this just a single dispatch
 
-	TESTB	$1, runtime·useRepMovs(SB)
-	JZ	avxUnaligned
-
 /*
  * check and set for backwards
  */
@@ -111,6 +108,7 @@ back:
 	ADDQ	BX, CX
 	CMPQ	CX, DI
 	JLS	forward
+
 /*
  * whole thing backwards has
  * adjusted addresses
@@ -275,242 +273,3 @@ move_256through2048:
 	LEAQ	256(DI), DI
 	JGE	move_256through2048
 	JMP	tail
-
-avxUnaligned:
-	// There are two implementations of the move algorithm.
-	// The first one for non-overlapped memory regions. It uses forward copying.
-	// The second one for overlapped regions. It uses backward copying
-	MOVQ	DI, CX
-	SUBQ	SI, CX
-	// Now CX contains distance between SRC and DEST
-	CMPQ	CX, BX
-	// If the distance lesser than region length it means that regions are overlapped
-	JC	copy_backward
-
-	// Non-temporal copy would be better for big sizes.
-	CMPQ	BX, $0x100000
-	JAE	gobble_big_data_fwd
-
-	// Memory layout on the source side
-	// SI                                       CX
-	// |<---------BX before correction--------->|
-	// |       |<--BX corrected-->|             |
-	// |       |                  |<--- AX  --->|
-	// |<-R11->|                  |<-128 bytes->|
-	// +----------------------------------------+
-	// | Head  | Body             | Tail        |
-	// +-------+------------------+-------------+
-	// ^       ^                  ^
-	// |       |                  |
-	// Save head into Y4          Save tail into X5..X12
-	//         |
-	//         SI+R11, where R11 = ((DI & -32) + 32) - DI
-	// Algorithm:
-	// 1. Unaligned save of the tail's 128 bytes
-	// 2. Unaligned save of the head's 32 bytes
-	// 3. Destination-aligned copying of body (128 bytes per iteration)
-	// 4. Put head on the new place
-	// 5. Put the tail on the new place
-	// It can be important to satisfy processor's pipeline requirements for
-	// small sizes as the cost of unaligned memory region copying is
-	// comparable with the cost of main loop. So code is slightly messed there.
-	// There is more clean implementation of that algorithm for bigger sizes
-	// where the cost of unaligned part copying is negligible.
-	// You can see it after gobble_big_data_fwd label.
-	LEAQ	(SI)(BX*1), CX
-	MOVQ	DI, R10
-	// CX points to the end of buffer so we need go back slightly. We will use negative offsets there.
-	MOVOU	-0x80(CX), X5
-	MOVOU	-0x70(CX), X6
-	MOVQ	$0x80, AX
-	// Align destination address
-	ANDQ	$-32, DI
-	ADDQ	$32, DI
-	// Continue tail saving.
-	MOVOU	-0x60(CX), X7
-	MOVOU	-0x50(CX), X8
-	// Make R11 delta between aligned and unaligned destination addresses.
-	MOVQ	DI, R11
-	SUBQ	R10, R11
-	// Continue tail saving.
-	MOVOU	-0x40(CX), X9
-	MOVOU	-0x30(CX), X10
-	// Let's make bytes-to-copy value adjusted as we've prepared unaligned part for copying.
-	SUBQ	R11, BX
-	// Continue tail saving.
-	MOVOU	-0x20(CX), X11
-	MOVOU	-0x10(CX), X12
-	// The tail will be put on it's place after main body copying.
-	// It's time for the unaligned heading part.
-	VMOVDQU	(SI), Y4
-	// Adjust source address to point past head.
-	ADDQ	R11, SI
-	SUBQ	AX, BX
-	// Aligned memory copying there
-gobble_128_loop:
-	VMOVDQU	(SI), Y0
-	VMOVDQU	0x20(SI), Y1
-	VMOVDQU	0x40(SI), Y2
-	VMOVDQU	0x60(SI), Y3
-	ADDQ	AX, SI
-	VMOVDQA	Y0, (DI)
-	VMOVDQA	Y1, 0x20(DI)
-	VMOVDQA	Y2, 0x40(DI)
-	VMOVDQA	Y3, 0x60(DI)
-	ADDQ	AX, DI
-	SUBQ	AX, BX
-	JA	gobble_128_loop
-	// Now we can store unaligned parts.
-	ADDQ	AX, BX
-	ADDQ	DI, BX
-	VMOVDQU	Y4, (R10)
-	VZEROUPPER
-	MOVOU	X5, -0x80(BX)
-	MOVOU	X6, -0x70(BX)
-	MOVOU	X7, -0x60(BX)
-	MOVOU	X8, -0x50(BX)
-	MOVOU	X9, -0x40(BX)
-	MOVOU	X10, -0x30(BX)
-	MOVOU	X11, -0x20(BX)
-	MOVOU	X12, -0x10(BX)
-	RET
-
-gobble_big_data_fwd:
-	// There is forward copying for big regions.
-	// It uses non-temporal mov instructions.
-	// Details of this algorithm are commented previously for small sizes.
-	LEAQ	(SI)(BX*1), CX
-	MOVOU	-0x80(SI)(BX*1), X5
-	MOVOU	-0x70(CX), X6
-	MOVOU	-0x60(CX), X7
-	MOVOU	-0x50(CX), X8
-	MOVOU	-0x40(CX), X9
-	MOVOU	-0x30(CX), X10
-	MOVOU	-0x20(CX), X11
-	MOVOU	-0x10(CX), X12
-	VMOVDQU	(SI), Y4
-	MOVQ	DI, R8
-	ANDQ	$-32, DI
-	ADDQ	$32, DI
-	MOVQ	DI, R10
-	SUBQ	R8, R10
-	SUBQ	R10, BX
-	ADDQ	R10, SI
-	LEAQ	(DI)(BX*1), CX
-	SUBQ	$0x80, BX
-gobble_mem_fwd_loop:
-	PREFETCHNTA 0x1C0(SI)
-	PREFETCHNTA 0x280(SI)
-	// Prefetch values were choosen empirically.
-	// Approach for prefetch usage as in 7.6.6 of [1]
-	// [1] 64-ia-32-architectures-optimization-manual.pdf
-	// http://www.intel.ru/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-optimization-manual.pdf
-	VMOVDQU	(SI), Y0
-	VMOVDQU	0x20(SI), Y1
-	VMOVDQU	0x40(SI), Y2
-	VMOVDQU	0x60(SI), Y3
-	ADDQ	$0x80, SI
-	VMOVNTDQ Y0, (DI)
-	VMOVNTDQ Y1, 0x20(DI)
-	VMOVNTDQ Y2, 0x40(DI)
-	VMOVNTDQ Y3, 0x60(DI)
-	ADDQ	$0x80, DI
-	SUBQ	$0x80, BX
-	JA	gobble_mem_fwd_loop
-	// NT instructions don't follow the normal cache-coherency rules.
-	// We need SFENCE there to make copied data available timely.
-	SFENCE
-	VMOVDQU	Y4, (R8)
-	VZEROUPPER
-	MOVOU	X5, -0x80(CX)
-	MOVOU	X6, -0x70(CX)
-	MOVOU	X7, -0x60(CX)
-	MOVOU	X8, -0x50(CX)
-	MOVOU	X9, -0x40(CX)
-	MOVOU	X10, -0x30(CX)
-	MOVOU	X11, -0x20(CX)
-	MOVOU	X12, -0x10(CX)
-	RET
-
-copy_backward:
-	MOVQ	DI, AX
-	// Backward copying is about the same as the forward one.
-	// Firstly we load unaligned tail in the beginning of region.
-	MOVOU	(SI), X5
-	MOVOU	0x10(SI), X6
-	ADDQ	BX, DI
-	MOVOU	0x20(SI), X7
-	MOVOU	0x30(SI), X8
-	LEAQ	-0x20(DI), R10
-	MOVQ	DI, R11
-	MOVOU	0x40(SI), X9
-	MOVOU	0x50(SI), X10
-	ANDQ	$0x1F, R11
-	MOVOU	0x60(SI), X11
-	MOVOU	0x70(SI), X12
-	XORQ	R11, DI
-	// Let's point SI to the end of region
-	ADDQ	BX, SI
-	// and load unaligned head into X4.
-	VMOVDQU	-0x20(SI), Y4
-	SUBQ	R11, SI
-	SUBQ	R11, BX
-	// If there is enough data for non-temporal moves go to special loop
-	CMPQ	BX, $0x100000
-	JA	gobble_big_data_bwd
-	SUBQ	$0x80, BX
-gobble_mem_bwd_loop:
-	VMOVDQU	-0x20(SI), Y0
-	VMOVDQU	-0x40(SI), Y1
-	VMOVDQU	-0x60(SI), Y2
-	VMOVDQU	-0x80(SI), Y3
-	SUBQ	$0x80, SI
-	VMOVDQA	Y0, -0x20(DI)
-	VMOVDQA	Y1, -0x40(DI)
-	VMOVDQA	Y2, -0x60(DI)
-	VMOVDQA	Y3, -0x80(DI)
-	SUBQ	$0x80, DI
-	SUBQ	$0x80, BX
-	JA	gobble_mem_bwd_loop
-	// Let's store unaligned data
-	VMOVDQU	Y4, (R10)
-	VZEROUPPER
-	MOVOU	X5, (AX)
-	MOVOU	X6, 0x10(AX)
-	MOVOU	X7, 0x20(AX)
-	MOVOU	X8, 0x30(AX)
-	MOVOU	X9, 0x40(AX)
-	MOVOU	X10, 0x50(AX)
-	MOVOU	X11, 0x60(AX)
-	MOVOU	X12, 0x70(AX)
-	RET
-
-gobble_big_data_bwd:
-	SUBQ	$0x80, BX
-gobble_big_mem_bwd_loop:
-	PREFETCHNTA -0x1C0(SI)
-	PREFETCHNTA -0x280(SI)
-	VMOVDQU	-0x20(SI), Y0
-	VMOVDQU	-0x40(SI), Y1
-	VMOVDQU	-0x60(SI), Y2
-	VMOVDQU	-0x80(SI), Y3
-	SUBQ	$0x80, SI
-	VMOVNTDQ Y0, -0x20(DI)
-	VMOVNTDQ Y1, -0x40(DI)
-	VMOVNTDQ Y2, -0x60(DI)
-	VMOVNTDQ Y3, -0x80(DI)
-	SUBQ	$0x80, DI
-	SUBQ	$0x80, BX
-	JA	gobble_big_mem_bwd_loop
-	SFENCE
-	VMOVDQU	Y4, (R10)
-	VZEROUPPER
-	MOVOU	X5, (AX)
-	MOVOU	X6, 0x10(AX)
-	MOVOU	X7, 0x20(AX)
-	MOVOU	X8, 0x30(AX)
-	MOVOU	X9, 0x40(AX)
-	MOVOU	X10, 0x50(AX)
-	MOVOU	X11, 0x60(AX)
-	MOVOU	X12, 0x70(AX)
-	RET
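As an aside, the forward-copy strategy described in the deleted comments (steps 1–5 above: stash the unaligned 128-byte tail and 32-byte head first, stream the body with destination-aligned stores, then write head and tail last) can be sketched in plain Go. This is only a hypothetical illustration of the ordering, not the runtime's code; it ignores destination alignment, overlap handling, and the SIMD registers.

// forwardcopy_sketch.go — simplified, hypothetical illustration of the
// head/body/tail ordering used by the removed avxUnaligned forward path.
package main

import (
	"bytes"
	"fmt"
)

// forwardCopySketch mirrors the ordering from the deleted comments:
// save tail, save head, copy the body, then place head and tail.
func forwardCopySketch(dst, src []byte) {
	const headSize, tailSize = 32, 128
	n := len(src)
	if n < headSize+tailSize {
		// Small regions are handled by other paths in the real code.
		copy(dst, src)
		return
	}
	var head [headSize]byte
	var tail [tailSize]byte
	copy(tail[:], src[n-tailSize:]) // 1. unaligned save of the tail's 128 bytes
	copy(head[:], src[:headSize])   // 2. unaligned save of the head's 32 bytes
	// 3. body copy; the assembly streams 128 bytes per iteration with
	// destination-aligned stores here.
	copy(dst[headSize:n-tailSize], src[headSize:n-tailSize])
	copy(dst[:headSize], head[:])   // 4. put head on the new place
	copy(dst[n-tailSize:], tail[:]) // 5. put the tail on the new place
}

func main() {
	src := make([]byte, 512)
	for i := range src {
		src[i] = byte(i)
	}
	dst := make([]byte, len(src))
	forwardCopySketch(dst, src)
	fmt.Println("copies match:", bytes.Equal(dst, src))
}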