author     Russ Cox <rsc@golang.org>    2014-05-30 16:41:58 -0400
committer  Russ Cox <rsc@golang.org>    2014-05-30 16:41:58 -0400
commit     1afbceb5999f9c9743630ff8ea002d3ec58a08af (patch)
tree       0f4087bb84b1dbbdd771832647462640027dc2c3
parent     aad4609c086cc5069aecb66cd1b7c32700fda1d5 (diff)
cmd/6g: treat vardef-initialized fat variables as live at calls
This CL forces the optimizer to preserve some memory stores that
would be redundant except that a stack scan due to garbage collection
or stack copying might look at them during a function call. As such,
it forces additional memory writes and therefore slows down the
execution of some programs, especially garbage-heavy programs that
are already limited by memory bandwidth. The slowdown can be as much
as 7% for end-to-end benchmarks.

These numbers are from running go1.test -test.benchtime=5s three
times, taking the best (lowest) ns/op for each benchmark. I am
excluding benchmarks with time/op < 10us to focus on macro effects.
All benchmarks are on amd64.

Comparing tip (a27f34c771cb) against this CL on an Intel Core i5
MacBook Pro:

benchmark                          old ns/op     new ns/op      delta
BenchmarkBinaryTree17             3876500413    3856337341     -0.52%
BenchmarkFannkuch11               2965104777    2991182127     +0.88%
BenchmarkGobDecode                   8563026       8788340     +2.63%
BenchmarkGobEncode                   5050608       5267394     +4.29%
BenchmarkGzip                      431191816     434168065     +0.69%
BenchmarkGunzip                    107873523     110563792     +2.49%
BenchmarkHTTPClientServer              85036         86131     +1.29%
BenchmarkJSONEncode                 22143764      22501647     +1.62%
BenchmarkJSONDecode                 79646916      85658808     +7.55%
BenchmarkMandelbrot200               4720421       4700108     -0.43%
BenchmarkGoParse                     4651575       4712247     +1.30%
BenchmarkRegexpMatchMedium_1K          71986         73490     +2.09%
BenchmarkRegexpMatchHard_1K           111018        117495     +5.83%
BenchmarkRevcomp                   648798723     659352759     +1.63%
BenchmarkTemplate                  112673009     112819078     +0.13%

Comparing tip (a27f34c771cb) against this CL on an Intel Xeon E5520:

benchmark                          old ns/op     new ns/op      delta
BenchmarkBinaryTree17             5461110720    5393104469     -1.25%
BenchmarkFannkuch11               4314677151    4327177615     +0.29%
BenchmarkGobDecode                  11065853      11235272     +1.53%
BenchmarkGobEncode                   6500065       6959837     +7.07%
BenchmarkGzip                      647478596     671769097     +3.75%
BenchmarkGunzip                    139348579     141096376     +1.25%
BenchmarkHTTPClientServer              69376         73610     +6.10%
BenchmarkJSONEncode                 30172320      31796106     +5.38%
BenchmarkJSONDecode                113704905     114239137     +0.47%
BenchmarkMandelbrot200               6032730       6003077     -0.49%
BenchmarkGoParse                     6775251       6405995     -5.45%
BenchmarkRegexpMatchMedium_1K         111832        113895     +1.84%
BenchmarkRegexpMatchHard_1K           161112        168420     +4.54%
BenchmarkRevcomp                   876363406     892319935     +1.82%
BenchmarkTemplate                  146273096     148998339     +1.86%

Just to get a sense of where we are compared to the previous release,
here are the same benchmarks comparing Go 1.2 to this CL.

Comparing Go 1.2 against this CL on an Intel Core i5 MacBook Pro:

benchmark                          old ns/op     new ns/op      delta
BenchmarkBinaryTree17             4370077662    3856337341    -11.76%
BenchmarkFannkuch11               3347052657    2991182127    -10.63%
BenchmarkGobDecode                   8791384       8788340     -0.03%
BenchmarkGobEncode                   4968759       5267394     +6.01%
BenchmarkGzip                      437815669     434168065     -0.83%
BenchmarkGunzip                     94604099     110563792    +16.87%
BenchmarkHTTPClientServer              87798         86131     -1.90%
BenchmarkJSONEncode                 22818243      22501647     -1.39%
BenchmarkJSONDecode                 97182444      85658808    -11.86%
BenchmarkMandelbrot200               4733516       4700108     -0.71%
BenchmarkGoParse                     5054384       4712247     -6.77%
BenchmarkRegexpMatchMedium_1K          67612         73490     +8.69%
BenchmarkRegexpMatchHard_1K           107321        117495     +9.48%
BenchmarkRevcomp                   733270055     659352759    -10.08%
BenchmarkTemplate                  109304977     112819078     +3.21%

Comparing Go 1.2 against this CL on an Intel Xeon E5520:

benchmark                          old ns/op     new ns/op      delta
BenchmarkBinaryTree17             5986953594    5393104469     -9.92%
BenchmarkFannkuch11               4861139174    4327177615    -10.98%
BenchmarkGobDecode                  11830997      11235272     -5.04%
BenchmarkGobEncode                   6608722       6959837     +5.31%
BenchmarkGzip                      661875826     671769097     +1.49%
BenchmarkGunzip                    138630019     141096376     +1.78%
BenchmarkHTTPClientServer              71534         73610     +2.90%
BenchmarkJSONEncode                 30393609      31796106     +4.61%
BenchmarkJSONDecode                139645860     114239137    -18.19%
BenchmarkMandelbrot200               5988660       6003077     +0.24%
BenchmarkGoParse                     6974092       6405995     -8.15%
BenchmarkRegexpMatchMedium_1K         111331        113895     +2.30%
BenchmarkRegexpMatchHard_1K           165961        168420     +1.48%
BenchmarkRevcomp                   995049292     892319935    -10.32%
BenchmarkTemplate                  145623363     148998339     +2.32%

Fixes #8036.

LGTM=khr
R=golang-codereviews, josharian, khr
CC=golang-codereviews, iant, r
https://golang.org/cl/99660044
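[Editor's note: to make the failure mode concrete before the diffs, here is a
minimal Go sketch of my own, not part of the CL; the CL's real regression test
appears at the end as test/fixedbugs/issue8036.go. A multiword ("fat") result
is zeroed at function entry; the optimizer saw that zeroing as dead because
every field is overwritten later, but a garbage collection triggered by an
intervening call scans the stack first and must find initialized pointer
words.]

package main

import "runtime"

// fat is a multiword ("fat") value: three pointer words.
type fat struct {
	p, q, r *int
}

func g() *int {
	runtime.GC() // force a stack scan while f's result is live
	return nil
}

func f() (t fat) {
	// The compiler emits a VARDEF for t plus stores zeroing it.
	// Pre-fix, the optimizer could drop that zeroing as redundant,
	// since every field is overwritten below, but the collector
	// inside g() scans the stack before the first field is stored
	// and would read whatever stale words occupy t's slot as pointers.
	t.p = g()
	t.q = t.p
	t.r = t.q
	return
}

func main() {
	f()
}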
Diffstat:
-rw-r--r--  src/cmd/5g/reg.c             57
-rw-r--r--  src/cmd/6g/reg.c             57
-rw-r--r--  src/cmd/8g/reg.c             57
-rw-r--r--  test/fixedbugs/issue8036.go  45
4 files changed, 210 insertions(+), 6 deletions(-)
diff --git a/src/cmd/5g/reg.c b/src/cmd/5g/reg.c
index 6129698f3a..4762df5062 100644
--- a/src/cmd/5g/reg.c
+++ b/src/cmd/5g/reg.c
@@ -129,13 +129,15 @@ static char* regname[] = {
static Node* regnodes[NREGVAR];
+static void walkvardef(Node *n, Reg *r, int active);
+
void
regopt(Prog *firstp)
{
Reg *r, *r1;
Prog *p;
Graph *g;
- int i, z;
+ int i, z, active;
uint32 vreg;
Bits bit;
ProgInfo info;
@@ -250,6 +252,26 @@ regopt(Prog *firstp)
dumpit("pass2", &firstr->f, 1);
/*
+ * pass 2.5
+ * iterate, propagating fat vardef coverage forward.
+ * r->act records vars with a VARDEF since the last CALL.
+ * (r->act will be reused in pass 5 for something else,
+ * but we'll be done with it by then.)
+ */
+ active = 0;
+ for(r = firstr; r != R; r = (Reg*)r->f.link) {
+ r->f.active = 0;
+ r->act = zbits;
+ }
+ for(r = firstr; r != R; r = (Reg*)r->f.link) {
+ p = r->f.prog;
+ if(p->as == AVARDEF && isfat(p->to.node->type) && p->to.node->opt != nil) {
+ active++;
+ walkvardef(p->to.node, r, active);
+ }
+ }
+
+ /*
* pass 3
* iterate propagating usage
* back until flow graph is complete
@@ -524,6 +546,32 @@ brk:
}
}
+static void
+walkvardef(Node *n, Reg *r, int active)
+{
+ Reg *r1, *r2;
+ int bn;
+ Var *v;
+
+ for(r1=r; r1!=R; r1=(Reg*)r1->f.s1) {
+ if(r1->f.active == active)
+ break;
+ r1->f.active = active;
+ if(r1->f.prog->as == AVARKILL && r1->f.prog->to.node == n)
+ break;
+ for(v=n->opt; v!=nil; v=v->nextinnode) {
+ bn = v - var;
+ r1->act.b[bn/32] |= 1L << (bn%32);
+ }
+ if(r1->f.prog->as == ABL)
+ break;
+ }
+
+ for(r2=r; r2!=r1; r2=(Reg*)r2->f.s1)
+ if(r2->f.s2 != nil)
+ walkvardef(n, (Reg*)r2->f.s2, active);
+}
+
void
addsplits(void)
{
@@ -891,8 +939,13 @@ prop(Reg *r, Bits ref, Bits cal)
// Mark all input variables (ivar) as used, because that's what the
// liveness bitmaps say. The liveness bitmaps say that so that a
// panic will not show stale values in the parameter dump.
+ // Mark variables with a recent VARDEF (r1->act) as used,
+ // so that the optimizer flushes initializations to memory,
+ // so that if a garbage collection happens during this CALL,
+ // the collector will see initialized memory. Again this is to
+ // match what the liveness bitmaps say.
for(z=0; z<BITS; z++) {
- cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z];
+ cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z] | r1->act.b[z];
ref.b[z] = 0;
}
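[Editor's note: a summary of the new pass in my own words, not the CL's. Pass
2.5 walks forward from each VARDEF of a fat (multiword) variable, setting that
variable's bits in r->act on every instruction reachable without crossing a
call (ABL on ARM above; ACALL in 6g and 8g below) or a VARKILL of the same
variable. The active counter is a per-walk visited mark over the flow graph's
fall-through (f.s1) and branch (f.s2) successors, so nodes need no reset
between walks. prop then ORs the act bits into the call-liveness set cal,
which makes the registerizer treat the freshly defined words as read at the
call and keeps the initializing stores. Below is a minimal Go sketch of that
traversal, with simplified types of my own invention; the real code indexes
the Bits vector as word bn/32, mask 1<<(bn%32). The same C logic repeats
verbatim in 6g and 8g, differing only in the call opcode.]

package main

import "fmt"

// Reg is a simplified flow-graph node: s1 is the fall-through
// successor, s2 the branch target (mirroring f.s1/f.s2 above).
type Reg struct {
	s1, s2 *Reg
	op     string // "VARDEF", "VARKILL", "CALL", or anything else
	active int    // visited mark, compared against the walk's counter
	act    uint64 // stand-in for the Bits vector of variable bits
}

// walkvardef sets bit bn in act on every node reachable from r
// without passing a CALL or a VARKILL, using active as a visited
// mark exactly like the C code's r->f.active.
func walkvardef(bn uint, r *Reg, active int) {
	r1 := r
	for ; r1 != nil; r1 = r1.s1 {
		if r1.active == active {
			break // already covered by this walk
		}
		r1.active = active
		if r1.op == "VARKILL" {
			break // the variable's lifetime ends here
		}
		r1.act |= 1 << bn // word bn/32, mask 1<<(bn%32) in the real Bits
		if r1.op == "CALL" {
			break // coverage only has to reach the next call
		}
	}
	// Branch targets of every node on the s1 chain get the same walk.
	for r2 := r; r2 != r1; r2 = r2.s1 {
		if r2.s2 != nil {
			walkvardef(bn, r2.s2, active)
		}
	}
}

func main() {
	// VARDEF -> store -> CALL: every node up to and including
	// the call carries the variable's bit.
	call := &Reg{op: "CALL"}
	store := &Reg{s1: call, op: "MOV"}
	def := &Reg{s1: store, op: "VARDEF"}
	walkvardef(3, def, 1)
	fmt.Println(def.act, store.act, call.act) // 8 8 8
}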
diff --git a/src/cmd/6g/reg.c b/src/cmd/6g/reg.c
index 919a07d7bc..f3b1e55de8 100644
--- a/src/cmd/6g/reg.c
+++ b/src/cmd/6g/reg.c
@@ -114,6 +114,8 @@ static char* regname[] = {
static Node* regnodes[NREGVAR];
+static void walkvardef(Node *n, Reg *r, int active);
+
void
regopt(Prog *firstp)
{
@@ -121,7 +123,7 @@ regopt(Prog *firstp)
Prog *p;
Graph *g;
ProgInfo info;
- int i, z;
+ int i, z, active;
uint32 vreg;
Bits bit;
@@ -235,6 +237,26 @@ regopt(Prog *firstp)
dumpit("pass2", &firstr->f, 1);
/*
+ * pass 2.5
+ * iterate, propagating fat vardef coverage forward.
+ * r->act records vars with a VARDEF since the last CALL.
+ * (r->act will be reused in pass 5 for something else,
+ * but we'll be done with it by then.)
+ */
+ active = 0;
+ for(r = firstr; r != R; r = (Reg*)r->f.link) {
+ r->f.active = 0;
+ r->act = zbits;
+ }
+ for(r = firstr; r != R; r = (Reg*)r->f.link) {
+ p = r->f.prog;
+ if(p->as == AVARDEF && isfat(p->to.node->type) && p->to.node->opt != nil) {
+ active++;
+ walkvardef(p->to.node, r, active);
+ }
+ }
+
+ /*
* pass 3
* iterate propagating usage
* back until flow graph is complete
@@ -435,6 +457,32 @@ brk:
}
}
+static void
+walkvardef(Node *n, Reg *r, int active)
+{
+ Reg *r1, *r2;
+ int bn;
+ Var *v;
+
+ for(r1=r; r1!=R; r1=(Reg*)r1->f.s1) {
+ if(r1->f.active == active)
+ break;
+ r1->f.active = active;
+ if(r1->f.prog->as == AVARKILL && r1->f.prog->to.node == n)
+ break;
+ for(v=n->opt; v!=nil; v=v->nextinnode) {
+ bn = v - var;
+ r1->act.b[bn/32] |= 1L << (bn%32);
+ }
+ if(r1->f.prog->as == ACALL)
+ break;
+ }
+
+ for(r2=r; r2!=r1; r2=(Reg*)r2->f.s1)
+ if(r2->f.s2 != nil)
+ walkvardef(n, (Reg*)r2->f.s2, active);
+}
+
/*
* add mov b,rn
* just after r
@@ -745,8 +793,13 @@ prop(Reg *r, Bits ref, Bits cal)
// Mark all input variables (ivar) as used, because that's what the
// liveness bitmaps say. The liveness bitmaps say that so that a
// panic will not show stale values in the parameter dump.
+ // Mark variables with a recent VARDEF (r1->act) as used,
+ // so that the optimizer flushes initializations to memory,
+ // so that if a garbage collection happens during this CALL,
+ // the collector will see initialized memory. Again this is to
+ // match what the liveness bitmaps say.
for(z=0; z<BITS; z++) {
- cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z];
+ cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z] | r1->act.b[z];
ref.b[z] = 0;
}
diff --git a/src/cmd/8g/reg.c b/src/cmd/8g/reg.c
index ed019f9373..fd610f87a6 100644
--- a/src/cmd/8g/reg.c
+++ b/src/cmd/8g/reg.c
@@ -84,6 +84,8 @@ static char* regname[] = {
static Node* regnodes[NREGVAR];
+static void walkvardef(Node *n, Reg *r, int active);
+
void
regopt(Prog *firstp)
{
@@ -91,7 +93,7 @@ regopt(Prog *firstp)
Prog *p;
Graph *g;
ProgInfo info;
- int i, z;
+ int i, z, active;
uint32 vreg;
Bits bit;
@@ -207,6 +209,26 @@ regopt(Prog *firstp)
dumpit("pass2", &firstr->f, 1);
/*
+ * pass 2.5
+ * iterate, propagating fat vardef coverage forward.
+ * r->act records vars with a VARDEF since the last CALL.
+ * (r->act will be reused in pass 5 for something else,
+ * but we'll be done with it by then.)
+ */
+ active = 0;
+ for(r = firstr; r != R; r = (Reg*)r->f.link) {
+ r->f.active = 0;
+ r->act = zbits;
+ }
+ for(r = firstr; r != R; r = (Reg*)r->f.link) {
+ p = r->f.prog;
+ if(p->as == AVARDEF && isfat(p->to.node->type) && p->to.node->opt != nil) {
+ active++;
+ walkvardef(p->to.node, r, active);
+ }
+ }
+
+ /*
* pass 3
* iterate propagating usage
* back until flow graph is complete
@@ -404,6 +426,32 @@ brk:
}
}
+static void
+walkvardef(Node *n, Reg *r, int active)
+{
+ Reg *r1, *r2;
+ int bn;
+ Var *v;
+
+ for(r1=r; r1!=R; r1=(Reg*)r1->f.s1) {
+ if(r1->f.active == active)
+ break;
+ r1->f.active = active;
+ if(r1->f.prog->as == AVARKILL && r1->f.prog->to.node == n)
+ break;
+ for(v=n->opt; v!=nil; v=v->nextinnode) {
+ bn = v - var;
+ r1->act.b[bn/32] |= 1L << (bn%32);
+ }
+ if(r1->f.prog->as == ACALL)
+ break;
+ }
+
+ for(r2=r; r2!=r1; r2=(Reg*)r2->f.s1)
+ if(r2->f.s2 != nil)
+ walkvardef(n, (Reg*)r2->f.s2, active);
+}
+
/*
* add mov b,rn
* just after r
@@ -711,8 +759,13 @@ prop(Reg *r, Bits ref, Bits cal)
// Mark all input variables (ivar) as used, because that's what the
// liveness bitmaps say. The liveness bitmaps say that so that a
// panic will not show stale values in the parameter dump.
+ // Mark variables with a recent VARDEF (r1->act) as used,
+ // so that the optimizer flushes initializations to memory,
+ // so that if a garbage collection happens during this CALL,
+ // the collector will see initialized memory. Again this is to
+ // match what the liveness bitmaps say.
for(z=0; z<BITS; z++) {
- cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z];
+ cal.b[z] |= ref.b[z] | externs.b[z] | ivar.b[z] | r1->act.b[z];
ref.b[z] = 0;
}
diff --git a/test/fixedbugs/issue8036.go b/test/fixedbugs/issue8036.go
new file mode 100644
index 0000000000..f32fde84ab
--- /dev/null
+++ b/test/fixedbugs/issue8036.go
@@ -0,0 +1,45 @@
+// run
+
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Issue 8036: stores needed for the stack scan were being
+// eliminated as redundant by the optimizer.
+
+package main
+
+import "runtime"
+
+type T struct {
+ X *int
+ Y *int
+ Z *int
+}
+
+type TI [3]uintptr
+
+func G() (t TI) {
+ t[0] = 1
+ t[1] = 2
+ t[2] = 3
+ runtime.GC() // prevent inlining
+ return
+}
+
+func F() (t T) {
+ t.X = newint()
+ t.Y = t.X
+ t.Z = t.Y
+ runtime.GC() // prevent inlining
+ return
+}
+
+func newint() *int {
+ runtime.GC()
+ return nil
+}
+
+func main() {
+ G() // leave non-pointers where F's return values go
+ F()
+}
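[Editor's note: my reading of the test's choreography, spelling out the
comments above. main calls G first, and G's [3]uintptr result leaves the
non-pointer words 1, 2, 3 in main's outgoing-results area; F's pointer-typed
result then occupies those same words, and the runtime.GC inside newint scans
main's frame while the liveness bitmap already declares F's result live as
pointers. Before this CL the optimizer deleted F's entry-time zeroing of t as
redundant, so the collector read 1, 2, 3 as *int values. The overlap depends
on T and TI having the same size on every platform (three pointer-sized words
each); below is a hypothetical compile-time check of that assumption, not
part of the test.]

package main

import "unsafe"

type T struct {
	X *int
	Y *int
	Z *int
}

type TI [3]uintptr

// Together these two declarations assert that T and TI have equal
// size: uintptr is unsigned, so if the sizes differed, one of the two
// constant subtractions would overflow and the build would fail.
var _ [unsafe.Sizeof(T{}) - unsafe.Sizeof(TI{})]byte
var _ [unsafe.Sizeof(TI{}) - unsafe.Sizeof(T{})]byte

func main() {}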