runtime: reduce timer latency

Change the scheduler to treat expired timers with the same approach it uses to steal runnable G's. Previously the scheduler ignored timers on P's not marked for preemption. That had the downside that any G's waiting on those expired timers starved until the G running on their P completed or was preempted. That could take as long as 20ms if sysmon was in a 10ms wake up cycle. In addition, a spinning P that ignored an expired timer and found no other work would stop despite there being available work, missing the opportunity for greater parallelism. With this change the scheduler no longer ignores timers on non-preemptable P's or relies on sysmon as a backstop to start threads when timers expire. Instead it wakes an idle P, if needed, when creating a new timer because it cannot predict if the current P will have a scheduling opportunity before the new timer expires. The P it wakes will determine how long to sleep and block on the netpoller for the required time, potentially stealing the new timer when it wakes. This change also eliminates a race between a spinning P transitioning to idle concurrently with timer creation using the same pattern used for submission of new goroutines in the same window. Benchmark analysis: CL 232199, which was included in Go 1.15 improved timer latency over Go 1.14 by allowing P's to steal timers from P's not marked for preemption. The benchmarks added in this CL measure that improvement in the ParallelTimerLatency benchmark seen below. However, Go 1.15 still relies on sysmon to notice expired timers in some situations and sysmon can sleep for up to 10ms before waking to check timers. This CL fixes that shortcoming with modest regression on other benchmarks. name \ avg-late-ns go14.time.bench go15.time.bench fix.time.bench ParallelTimerLatency-8 17.3M ± 3% 7.9M ± 0% 0.2M ± 3% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=1-8 53.4k ±23% 50.7k ±31% 252.4k ± 9% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=2-8 204k ±14% 90k ±58% 188k ±12% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=3-8 1.17M ± 0% 0.11M ± 5% 0.11M ± 2% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=4-8 1.81M ±44% 0.10M ± 4% 0.10M ± 2% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=5-8 2.28M ±66% 0.09M ±13% 0.08M ±21% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=6-8 2.84M ±85% 0.07M ±15% 0.07M ±18% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=7-8 2.13M ±27% 0.06M ± 4% 0.06M ± 9% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=8-8 2.63M ± 6% 0.06M ±11% 0.06M ± 9% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=9-8 3.32M ±17% 0.06M ±16% 0.07M ±14% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=10-8 8.46M ±20% 4.37M ±21% 5.03M ±23% StaggeredTickerLatency/work-dur=2ms/tickers-per-P=1-8 1.02M ± 1% 0.20M ± 2% 0.20M ± 2% name \ max-late-ns go14.time.bench go15.time.bench fix.time.bench ParallelTimerLatency-8 18.3M ± 1% 8.2M ± 0% 0.5M ±12% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=1-8 141k ±19% 127k ±19% 1129k ± 3% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=2-8 2.78M ± 4% 1.23M ±15% 1.26M ± 5% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=3-8 6.05M ± 5% 0.67M ±56% 0.81M ±33% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=4-8 7.93M ±20% 0.71M ±46% 0.76M ±41% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=5-8 9.41M ±30% 0.92M ±23% 0.81M ±44% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=6-8 10.8M ±42% 0.8M ±41% 0.8M ±30% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=7-8 9.62M ±24% 0.77M ±38% 0.88M ±27% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=8-8 10.6M ±10% 0.8M ±32% 0.7M ±27% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=9-8 11.9M ±36% 0.6M ±46% 0.8M ±38% StaggeredTickerLatency/work-dur=300µs/tickers-per-P=10-8 36.8M ±21% 24.7M ±21% 27.5M ±16% StaggeredTickerLatency/work-dur=2ms/tickers-per-P=1-8 2.12M ± 2% 1.02M ±11% 1.03M ± 7% Other time benchmarks: name \ time/op go14.time.bench go15.time.bench fix.time.bench AfterFunc-8 137µs ± 4% 123µs ± 4% 131µs ± 2% After-8 212µs ± 3% 195µs ± 4% 204µs ± 7% Stop-8 165µs ± 6% 156µs ± 2% 151µs ±12% SimultaneousAfterFunc-8 260µs ± 3% 248µs ± 3% 284µs ± 2% StartStop-8 65.8µs ± 9% 64.4µs ± 7% 67.3µs ±15% Reset-8 13.6µs ± 2% 9.6µs ± 2% 9.1µs ± 4% Sleep-8 307µs ± 4% 306µs ± 3% 320µs ± 2% Ticker-8 53.0µs ± 5% 54.5µs ± 5% 57.0µs ±11% TickerReset-8 9.24µs ± 2% 9.51µs ± 3% TickerResetNaive-8 149µs ± 5% 145µs ± 5% Fixes #38860 Updates #25471 Updates #27707 Change-Id: If52680509b0f3b66dbd1d0c13fa574bd2d0bbd57 Reviewed-on: https://go-review.googlesource.com/c/go/+/232298 Run-TryBot: Alberto Donizetti <alb.donizetti@gmail.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Austin Clements <austin@google.com> Trust: Ian Lance Taylor <iant@golang.org>
author: Chris Hines <chris.cs.guy@gmail.com> 2020-05-01 17:04:36 -0400
committer: Alberto Donizetti <alb.donizetti@gmail.com> 2020-10-27 17:24:42 +0000
commit: 8fdc79e18a9704185bd6471b592db1e8004bd993 (patch)
tree: 05d4a1701a5b328cf7d3d2d94134327bb8d51ad8 /src/time
parent: b3f7f60129b822978115717912f4d477a46e8467 (diff)
download: go-8fdc79e18a9704185bd6471b592db1e8004bd993.tar.gz
go-8fdc79e18a9704185bd6471b592db1e8004bd993.zip
1 files changed, 187 insertions, 0 deletions
diff --git a/src/time/sleep_test.go b/src/time/sleep_test.go
index f5678020b9..ba0016bf49 100644
--- a/src/time/sleep_test.go
+++ b/src/time/sleep_test.go
@@ -501,3 +501,190 @@ func TestZeroTimerStopPanics(t *testing.T) {
 	var tr Timer
 	tr.Stop()
 }
+
+// Benchmark timer latency when the thread that creates the timer is busy with
+// other work and the timers must be serviced by other threads.
+// https://golang.org/issue/38860
+func BenchmarkParallelTimerLatency(b *testing.B) {
+	gmp := runtime.GOMAXPROCS(0)
+	if gmp < 2 || runtime.NumCPU() < gmp {
+		b.Skip("skipping with GOMAXPROCS < 2 or NumCPU < GOMAXPROCS")
+	}
+
+	// allocate memory now to avoid GC interference later.
+	timerCount := gmp - 1
+	stats := make([]struct {
+		sum   float64
+		max   Duration
+		count int64
+		_     [5]int64 // cache line padding
+	}, timerCount)
+
+	// Ensure the time to start new threads to service timers will not pollute
+	// the results.
+	warmupScheduler(gmp)
+
+	// Note that other than the AfterFunc calls this benchmark is measuring it
+	// avoids using any other timers. In particular, the main goroutine uses
+	// doWork to spin for some durations because up through Go 1.15 if all
+	// threads are idle sysmon could leave deep sleep when we wake.
+
+	// Ensure sysmon is in deep sleep.
+	doWork(30 * Millisecond)
+
+	b.ResetTimer()
+
+	const delay = Millisecond
+	var wg sync.WaitGroup
+	var count int32
+	for i := 0; i < b.N; i++ {
+		wg.Add(timerCount)
+		atomic.StoreInt32(&count, 0)
+		for j := 0; j < timerCount; j++ {
+			j := j
+			expectedWakeup := Now().Add(delay)
+			AfterFunc(delay, func() {
+				late := Since(expectedWakeup)
+				if late < 0 {
+					late = 0
+				}
+				stats[j].count++
+				stats[j].sum += float64(late.Nanoseconds())
+				if late > stats[j].max {
+					stats[j].max = late
+				}
+				atomic.AddInt32(&count, 1)
+				for atomic.LoadInt32(&count) < int32(timerCount) {
+					// spin until all timers fired
+				}
+				wg.Done()
+			})
+		}
+
+		for atomic.LoadInt32(&count) < int32(timerCount) {
+			// spin until all timers fired
+		}
+		wg.Wait()
+
+		// Spin for a bit to let the other scheduler threads go idle before the
+		// next round.
+		doWork(Millisecond)
+	}
+	var total float64
+	var samples float64
+	max := Duration(0)
+	for _, s := range stats {
+		if s.max > max {
+			max = s.max
+		}
+		total += s.sum
+		samples += float64(s.count)
+	}
+	b.ReportMetric(0, "ns/op")
+	b.ReportMetric(total/samples, "avg-late-ns")
+	b.ReportMetric(float64(max.Nanoseconds()), "max-late-ns")
+}
+
+// Benchmark timer latency with staggered wakeup times and varying CPU bound
+// workloads. https://golang.org/issue/38860
+func BenchmarkStaggeredTickerLatency(b *testing.B) {
+	gmp := runtime.GOMAXPROCS(0)
+	if gmp < 2 || runtime.NumCPU() < gmp {
+		b.Skip("skipping with GOMAXPROCS < 2 or NumCPU < GOMAXPROCS")
+	}
+
+	const delay = 3 * Millisecond
+
+	for _, dur := range []Duration{300 * Microsecond, 2 * Millisecond} {
+		b.Run(fmt.Sprintf("work-dur=%s", dur), func(b *testing.B) {
+			for tickersPerP := 1; tickersPerP < int(delay/dur)+1; tickersPerP++ {
+				tickerCount := gmp * tickersPerP
+				b.Run(fmt.Sprintf("tickers-per-P=%d", tickersPerP), func(b *testing.B) {
+					// allocate memory now to avoid GC interference later.
+					stats := make([]struct {
+						sum   float64
+						max   Duration
+						count int64
+						_     [5]int64 // cache line padding
+					}, tickerCount)
+
+					// Ensure the time to start new threads to service timers
+					// will not pollute the results.
+					warmupScheduler(gmp)
+
+					b.ResetTimer()
+
+					var wg sync.WaitGroup
+					wg.Add(tickerCount)
+					for j := 0; j < tickerCount; j++ {
+						j := j
+						doWork(delay / Duration(gmp))
+						expectedWakeup := Now().Add(delay)
+						ticker := NewTicker(delay)
+						go func(c int, ticker *Ticker, firstWake Time) {
+							defer ticker.Stop()
+
+							for ; c > 0; c-- {
+								<-ticker.C
+								late := Since(expectedWakeup)
+								if late < 0 {
+									late = 0
+								}
+								stats[j].count++
+								stats[j].sum += float64(late.Nanoseconds())
+								if late > stats[j].max {
+									stats[j].max = late
+								}
+								expectedWakeup = expectedWakeup.Add(delay)
+								doWork(dur)
+							}
+							wg.Done()
+						}(b.N, ticker, expectedWakeup)
+					}
+					wg.Wait()
+
+					var total float64
+					var samples float64
+					max := Duration(0)
+					for _, s := range stats {
+						if s.max > max {
+							max = s.max
+						}
+						total += s.sum
+						samples += float64(s.count)
+					}
+					b.ReportMetric(0, "ns/op")
+					b.ReportMetric(total/samples, "avg-late-ns")
+					b.ReportMetric(float64(max.Nanoseconds()), "max-late-ns")
+				})
+			}
+		})
+	}
+}
+
+// warmupScheduler ensures the scheduler has at least targetThreadCount threads
+// in its thread pool.
+func warmupScheduler(targetThreadCount int) {
+	var wg sync.WaitGroup
+	var count int32
+	for i := 0; i < targetThreadCount; i++ {
+		wg.Add(1)
+		go func() {
+			atomic.AddInt32(&count, 1)
+			for atomic.LoadInt32(&count) < int32(targetThreadCount) {
+				// spin until all threads started
+			}
+
+			// spin a bit more to ensure they are all running on separate CPUs.
+			doWork(Millisecond)
+			wg.Done()
+		}()
+	}
+	wg.Wait()
+}
+
+func doWork(dur Duration) {
+	start := Now()
+	for Since(start) < dur {
+	}
+}
author	Chris Hines <chris.cs.guy@gmail.com>	2020-05-01 17:04:36 -0400
committer	Alberto Donizetti <alb.donizetti@gmail.com>	2020-10-27 17:24:42 +0000
commit	8fdc79e18a9704185bd6471b592db1e8004bd993 (patch)
tree	05d4a1701a5b328cf7d3d2d94134327bb8d51ad8 /src/time
parent	b3f7f60129b822978115717912f4d477a46e8467 (diff)
download	go-8fdc79e18a9704185bd6471b592db1e8004bd993.tar.gz go-8fdc79e18a9704185bd6471b592db1e8004bd993.zip