runtime: sleep less when we can do work

Usleep(100) in runqgrab negatively affects latency and throughput of parallel application. We are sleeping instead of doing useful work. This is effect is particularly visible on windows where minimal sleep duration is 1-15ms. Reduce sleep from 100us to 3us and use osyield on windows. Sync chan send/recv takes ~50ns, so 3us gives us ~50x overshoot. benchmark old ns/op new ns/op delta BenchmarkChanSync-12 216 217 +0.46% BenchmarkChanSyncWork-12 27213 25816 -5.13% CPU consumption goes up from 106% to 108% in the first case, and from 107% to 125% in the second case. Test case from #14790 on windows: BenchmarkDefaultResolution-8 4583372 29720 -99.35% Benchmark1ms-8 992056 30701 -96.91% 99-th latency percentile for HTTP request serving is improved by up to 15% (see http://golang.org/cl/20835 for details). The following benchmarks are from the change that originally added this sleep (see https://golang.org/s/go15gomaxprocs): name old time/op new time/op delta Chain 22.6µs ± 2% 22.7µs ± 6% ~ (p=0.905 n=9+10) ChainBuf 22.4µs ± 3% 22.5µs ± 4% ~ (p=0.780 n=9+10) Chain-2 23.5µs ± 4% 24.9µs ± 1% +5.66% (p=0.000 n=10+9) ChainBuf-2 23.7µs ± 1% 24.4µs ± 1% +3.31% (p=0.000 n=9+10) Chain-4 24.2µs ± 2% 25.1µs ± 3% +3.70% (p=0.000 n=9+10) ChainBuf-4 24.4µs ± 5% 25.0µs ± 2% +2.37% (p=0.023 n=10+10) Powser 2.37s ± 1% 2.37s ± 1% ~ (p=0.423 n=8+9) Powser-2 2.48s ± 2% 2.57s ± 2% +3.74% (p=0.000 n=10+9) Powser-4 2.66s ± 1% 2.75s ± 1% +3.40% (p=0.000 n=10+10) Sieve 13.3s ± 2% 13.3s ± 2% ~ (p=1.000 n=10+9) Sieve-2 7.00s ± 2% 7.44s ±16% ~ (p=0.408 n=8+10) Sieve-4 4.13s ±21% 3.85s ±22% ~ (p=0.113 n=9+9) Fixes #14790 Change-Id: Ie7c6a1c4f9c8eb2f5d65ab127a3845386d6f8b5d Reviewed-on: https://go-review.googlesource.com/20835 Reviewed-by: Austin Clements <austin@google.com>
author: Dmitry Vyukov <dvyukov@google.com> 2016-03-18 11:00:03 +0100
committer: Dmitry Vyukov <dvyukov@google.com> 2016-04-05 15:32:06 +0000
commit: 3b246fa863dbd91588c5920969f9fd0ce8362129 (patch)
tree: dea10c7aea4de32bbd5c6f5700c545fe8715906d /src/runtime/chan_test.go
parent: 036d09d5bff8b9da789ba0c914e5966055d9d7fe (diff)
download: go-3b246fa863dbd91588c5920969f9fd0ce8362129.tar.gz
go-3b246fa863dbd91588c5920969f9fd0ce8362129.zip
1 files changed, 28 insertions, 1 deletions
diff --git a/src/runtime/chan_test.go b/src/runtime/chan_test.go
index 911821bea5..219f2b449b 100644
--- a/src/runtime/chan_test.go
+++ b/src/runtime/chan_test.go
@@ -777,7 +777,7 @@ func BenchmarkChanContended(b *testing.B) {
 	})
 }
 
-func BenchmarkChanSync(b *testing.B) {
+func benchmarkChanSync(b *testing.B, work int) {
 	const CallsPerSched = 1000
 	procs := 2
 	N := int32(b.N / CallsPerSched / procs * procs)
@@ -793,10 +793,14 @@ func BenchmarkChanSync(b *testing.B) {
 				for g := 0; g < CallsPerSched; g++ {
 					if i%2 == 0 {
 						<-myc
+						localWork(work)
 						myc <- 0
+						localWork(work)
 					} else {
 						myc <- 0
+						localWork(work)
 						<-myc
+						localWork(work)
 					}
 				}
 			}
@@ -808,6 +812,14 @@ func BenchmarkChanSync(b *testing.B) {
 	}
 }
 
+func BenchmarkChanSync(b *testing.B) {
+	benchmarkChanSync(b, 0)
+}
+
+func BenchmarkChanSyncWork(b *testing.B) {
+	benchmarkChanSync(b, 1000)
+}
+
 func benchmarkChanProdCons(b *testing.B, chanSize, localWork int) {
 	const CallsPerSched = 1000
 	procs := runtime.GOMAXPROCS(-1)
@@ -981,3 +993,18 @@ func BenchmarkChanPopular(b *testing.B) {
 	}
 	wg.Wait()
 }
+
+var (
+	alwaysFalse = false
+	workSink    = 0
+)
+
+func localWork(w int) {
+	foo := 0
+	for i := 0; i < w; i++ {
+		foo /= (foo + 1)
+	}
+	if alwaysFalse {
+		workSink += foo
+	}
+}
author	Dmitry Vyukov <dvyukov@google.com>	2016-03-18 11:00:03 +0100
committer	Dmitry Vyukov <dvyukov@google.com>	2016-04-05 15:32:06 +0000
commit	3b246fa863dbd91588c5920969f9fd0ce8362129 (patch)
tree	dea10c7aea4de32bbd5c6f5700c545fe8715906d /src/runtime/chan_test.go
parent	036d09d5bff8b9da789ba0c914e5966055d9d7fe (diff)
download	go-3b246fa863dbd91588c5920969f9fd0ce8362129.tar.gz go-3b246fa863dbd91588c5920969f9fd0ce8362129.zip