diff options
author | Jamie Liu <jamieliu@google.com> | 2017-11-15 12:47:22 -0800 |
---|---|---|
committer | Austin Clements <austin@google.com> | 2017-11-21 19:31:06 +0000 |
commit | 868c8b374d766c46ebf09f056ff7eff6e2186c75 (patch) | |
tree | f12d28d853c5e11d4b0f9084dd315fea2acae2df /src/runtime/proc_test.go | |
parent | 2951f909eae0afc4bea4067c98fef1a4f760006c (diff) | |
download | go-868c8b374d766c46ebf09f056ff7eff6e2186c75.tar.gz go-868c8b374d766c46ebf09f056ff7eff6e2186c75.zip |
runtime: only sleep before stealing work from a running P
The sleep in question does not make sense if the stolen-from P cannot
run the stolen G. The usleep(3) has been observed delaying execution of
woken G's by ~60us; skipping it reduces the wakeup-to-execution latency
to ~7us in these cases, improving CPU utilization.
Benchmarks added by this change:
name old time/op new time/op delta
WakeupParallelSpinning/0s-12 14.4µs ± 1% 14.3µs ± 1% ~ (p=0.227 n=19+20)
WakeupParallelSpinning/1µs-12 18.3µs ± 0% 18.3µs ± 1% ~ (p=0.950 n=20+19)
WakeupParallelSpinning/2µs-12 22.3µs ± 1% 22.3µs ± 1% ~ (p=0.670 n=20+18)
WakeupParallelSpinning/5µs-12 31.7µs ± 0% 31.7µs ± 0% ~ (p=0.460 n=20+17)
WakeupParallelSpinning/10µs-12 51.8µs ± 0% 51.8µs ± 0% ~ (p=0.883 n=20+20)
WakeupParallelSpinning/20µs-12 91.9µs ± 0% 91.9µs ± 0% ~ (p=0.245 n=20+20)
WakeupParallelSpinning/50µs-12 214µs ± 0% 214µs ± 0% ~ (p=0.509 n=19+20)
WakeupParallelSpinning/100µs-12 335µs ± 0% 335µs ± 0% -0.05% (p=0.006 n=17+15)
WakeupParallelSyscall/0s-12 228µs ± 2% 129µs ± 1% -43.32% (p=0.000 n=20+19)
WakeupParallelSyscall/1µs-12 232µs ± 1% 131µs ± 1% -43.60% (p=0.000 n=19+20)
WakeupParallelSyscall/2µs-12 236µs ± 1% 133µs ± 1% -43.44% (p=0.000 n=18+19)
WakeupParallelSyscall/5µs-12 248µs ± 2% 139µs ± 1% -43.68% (p=0.000 n=18+19)
WakeupParallelSyscall/10µs-12 263µs ± 3% 150µs ± 2% -42.97% (p=0.000 n=18+20)
WakeupParallelSyscall/20µs-12 281µs ± 2% 170µs ± 1% -39.43% (p=0.000 n=19+19)
WakeupParallelSyscall/50µs-12 345µs ± 4% 246µs ± 7% -28.85% (p=0.000 n=20+20)
WakeupParallelSyscall/100µs-12 460µs ± 5% 350µs ± 4% -23.85% (p=0.000 n=20+20)
Benchmarks associated with the change that originally added this sleep
(see https://golang.org/s/go15gomaxprocs):
name old time/op new time/op delta
Chain 19.4µs ± 2% 19.3µs ± 1% ~ (p=0.101 n=19+20)
ChainBuf 19.5µs ± 2% 19.4µs ± 2% ~ (p=0.840 n=19+19)
Chain-2 19.9µs ± 1% 19.9µs ± 2% ~ (p=0.734 n=19+19)
ChainBuf-2 20.0µs ± 2% 20.0µs ± 2% ~ (p=0.175 n=19+17)
Chain-4 20.3µs ± 1% 20.1µs ± 1% -0.62% (p=0.010 n=19+18)
ChainBuf-4 20.3µs ± 1% 20.2µs ± 1% -0.52% (p=0.023 n=19+19)
Powser 2.09s ± 1% 2.10s ± 3% ~ (p=0.908 n=19+19)
Powser-2 2.21s ± 1% 2.20s ± 1% -0.35% (p=0.010 n=19+18)
Powser-4 2.31s ± 2% 2.31s ± 2% ~ (p=0.578 n=18+19)
Sieve 13.6s ± 1% 13.6s ± 1% ~ (p=0.909 n=17+18)
Sieve-2 8.02s ±52% 7.28s ±15% ~ (p=0.336 n=20+16)
Sieve-4 4.00s ±35% 3.98s ±26% ~ (p=0.654 n=20+18)
Change-Id: I58edd8ce01075859d871e2348fc0833e9c01f70f
Reviewed-on: https://go-review.googlesource.com/78538
Reviewed-by: Austin Clements <austin@google.com>
Diffstat (limited to 'src/runtime/proc_test.go')
-rw-r--r-- | src/runtime/proc_test.go | 108 |
1 files changed, 108 insertions, 0 deletions
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go index c6ecc2a472..a0112f2fac 100644 --- a/src/runtime/proc_test.go +++ b/src/runtime/proc_test.go @@ -655,6 +655,114 @@ func BenchmarkClosureCall(b *testing.B) { _ = sum } +func benchmarkWakeupParallel(b *testing.B, spin func(time.Duration)) { + if runtime.GOMAXPROCS(0) == 1 { + b.Skip("skipping: GOMAXPROCS=1") + } + + wakeDelay := 5 * time.Microsecond + for _, delay := range []time.Duration{ + 0, + 1 * time.Microsecond, + 2 * time.Microsecond, + 5 * time.Microsecond, + 10 * time.Microsecond, + 20 * time.Microsecond, + 50 * time.Microsecond, + 100 * time.Microsecond, + } { + b.Run(delay.String(), func(b *testing.B) { + if b.N == 0 { + return + } + // Start two goroutines, which alternate between being + // sender and receiver in the following protocol: + // + // - The receiver spins for `delay` and then does a + // blocking receive on a channel. + // + // - The sender spins for `delay+wakeDelay` and then + // sends to the same channel. (The addition of + // `wakeDelay` improves the probability that the + // receiver will be blocking when the send occurs when + // the goroutines execute in parallel.) + // + // In each iteration of the benchmark, each goroutine + // acts once as sender and once as receiver, so each + // goroutine spins for delay twice. + // + // BenchmarkWakeupParallel is used to estimate how + // efficiently the scheduler parallelizes goroutines in + // the presence of blocking: + // + // - If both goroutines are executed on the same core, + // an increase in delay by N will increase the time per + // iteration by 4*N, because all 4 delays are + // serialized. + // + // - Otherwise, an increase in delay by N will increase + // the time per iteration by 2*N, and the time per + // iteration is 2 * (runtime overhead + chan + // send/receive pair + delay + wakeDelay). This allows + // the runtime overhead, including the time it takes + // for the unblocked goroutine to be scheduled, to be + // estimated. + ping, pong := make(chan struct{}), make(chan struct{}) + start := make(chan struct{}) + done := make(chan struct{}) + go func() { + <-start + for i := 0; i < b.N; i++ { + // sender + spin(delay + wakeDelay) + ping <- struct{}{} + // receiver + spin(delay) + <-pong + } + done <- struct{}{} + }() + go func() { + for i := 0; i < b.N; i++ { + // receiver + spin(delay) + <-ping + // sender + spin(delay + wakeDelay) + pong <- struct{}{} + } + done <- struct{}{} + }() + b.ResetTimer() + start <- struct{}{} + <-done + <-done + }) + } +} + +func BenchmarkWakeupParallelSpinning(b *testing.B) { + benchmarkWakeupParallel(b, func(d time.Duration) { + end := time.Now().Add(d) + for time.Now().Before(end) { + // do nothing + } + }) +} + +func BenchmarkWakeupParallelSyscall(b *testing.B) { + benchmarkWakeupParallel(b, func(d time.Duration) { + // Invoke a blocking syscall directly; calling time.Sleep() + // would deschedule the goroutine instead. + ts := syscall.NsecToTimespec(d.Nanoseconds()) + for { + if err := syscall.Nanosleep(&ts, &ts); err != syscall.EINTR { + return + } + } + }) +} + type Matrix [][]float64 func BenchmarkMatmult(b *testing.B) { |