-rw-r--r--  src/runtime/lockrank.go  |  22
-rw-r--r--  src/runtime/proc.go      | 168
-rw-r--r--  src/time/sleep_test.go   | 187
3 files changed, 294 insertions(+), 83 deletions(-)
diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go
index 0cbbfc4f45..3f9b087856 100644
--- a/src/runtime/lockrank.go
+++ b/src/runtime/lockrank.go
@@ -41,12 +41,12 @@ const (
lockRankCpuprof
lockRankSweep
+ lockRankPollDesc
lockRankSched
lockRankDeadlock
lockRankPanic
lockRankAllg
lockRankAllp
- lockRankPollDesc
lockRankTimers // Multiple timers locked simultaneously in destroy()
lockRankItab
@@ -120,12 +120,12 @@ var lockNames = []string{
lockRankCpuprof: "cpuprof",
lockRankSweep: "sweep",
+ lockRankPollDesc: "pollDesc",
lockRankSched: "sched",
lockRankDeadlock: "deadlock",
lockRankPanic: "panic",
lockRankAllg: "allg",
lockRankAllp: "allp",
- lockRankPollDesc: "pollDesc",
lockRankTimers: "timers",
lockRankItab: "itab",
@@ -182,14 +182,14 @@ func (rank lockRank) String() string {
return lockNames[rank]
}
-// lockPartialOrder is a partial order among the various lock types, listing the immediate
-// ordering that has actually been observed in the runtime. Each entry (which
-// corresponds to a particular lock rank) specifies the list of locks that can be
-// already be held immediately "above" it.
+// lockPartialOrder is a partial order among the various lock types, listing the
+// immediate ordering that has actually been observed in the runtime. Each entry
+// (which corresponds to a particular lock rank) specifies the list of locks
+// that can already be held immediately "above" it.
//
-// So, for example, the lockRankSched entry shows that all the locks preceding it in
-// rank can actually be held. The fin lock shows that only the sched, timers, or
-// hchan lock can be held immediately above it when it is acquired.
+// So, for example, the lockRankSched entry shows that all the locks preceding
+// it in rank can actually be held. The allp lock shows that only the sysmon or
+// sched lock can be held immediately above it when it is acquired.
var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankDummy: {},
lockRankSysmon: {},
@@ -199,12 +199,12 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankAssistQueue: {},
lockRankCpuprof: {},
lockRankSweep: {},
- lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep},
+ lockRankPollDesc: {},
+ lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc},
lockRankDeadlock: {lockRankDeadlock},
lockRankPanic: {lockRankDeadlock},
lockRankAllg: {lockRankSysmon, lockRankSched, lockRankPanic},
lockRankAllp: {lockRankSysmon, lockRankSched},
- lockRankPollDesc: {},
lockRankTimers: {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllp, lockRankPollDesc, lockRankTimers},
lockRankItab: {},
lockRankReflectOffs: {lockRankItab},
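This reordering puts pollDesc ahead of sched in the rank order, presumably because the wakep call added to wakeNetPoller below means sched.lock can now be acquired while a pollDesc lock is held (timers are added and modified from the netpoll deadline paths). As a minimal sketch of how a partial-order table of this shape is read, assuming invented ranks and a checkAcquire helper that are not runtime APIs:

package main

import "fmt"

type lockRank int

// A toy subset of ranks; after this change pollDesc orders before sched.
const (
	rankPollDesc lockRank = iota
	rankSched
	rankAllp
)

// partialOrder[r] lists the ranks that may already be held when a lock of
// rank r is acquired, mirroring the shape of lockPartialOrder.
var partialOrder = [][]lockRank{
	rankPollDesc: {},
	rankSched:    {rankPollDesc},
	rankAllp:     {rankSched},
}

// checkAcquire reports whether taking a lock of rank next while holding a
// lock of rank held is allowed by the partial order.
func checkAcquire(held, next lockRank) bool {
	for _, r := range partialOrder[next] {
		if r == held {
			return true
		}
	}
	return false
}

func main() {
	fmt.Println(checkAcquire(rankPollDesc, rankSched)) // true: sched may be taken with pollDesc held
	fmt.Println(checkAcquire(rankSched, rankPollDesc)) // false: the reverse order would be a violation
}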
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index ebecc92745..6feecef985 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -2264,11 +2264,16 @@ func handoffp(_p_ *p) {
startm(_p_, false)
return
}
- if when := nobarrierWakeTime(_p_); when != 0 {
- wakeNetPoller(when)
- }
+
+ // The scheduler lock cannot be held when calling wakeNetPoller below
+ // because wakeNetPoller may call wakep which may call startm.
+ when := nobarrierWakeTime(_p_)
pidleput(_p_)
unlock(&sched.lock)
+
+ if when != 0 {
+ wakeNetPoller(when)
+ }
}
// Tries to add one more P to execute G's.
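A minimal sketch of the ordering pattern handoffp adopts above: read the wakeup time while sched.lock is held, drop the lock, and only then call the function that may re-acquire it. The mutex and helpers below are stand-ins for sched.lock, nobarrierWakeTime, and wakeNetPoller, not runtime code.

package main

import (
	"fmt"
	"sync"
)

var mu sync.Mutex // stands in for sched.lock

// wake stands in for wakeNetPoller: it may need mu again (wakeNetPoller may
// call wakep, which may call startm, which takes sched.lock), so it must not
// run while mu is held.
func wake(when int64) {
	mu.Lock()
	defer mu.Unlock()
	fmt.Println("waking poller for", when)
}

func handoff(when int64) {
	mu.Lock()
	// ... put the P on the idle list while holding the lock ...
	mu.Unlock()

	// Safe only after the lock is released.
	if when != 0 {
		wake(when)
	}
}

func main() { handoff(42) }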
@@ -2477,40 +2482,33 @@ top:
_g_.m.spinning = true
atomic.Xadd(&sched.nmspinning, 1)
}
- for i := 0; i < 4; i++ {
+ const stealTries = 4
+ for i := 0; i < stealTries; i++ {
+ stealTimersOrRunNextG := i == stealTries-1
+
for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() {
if sched.gcwaiting != 0 {
goto top
}
- stealRunNextG := i > 2 // first look for ready queues with more than 1 g
p2 := allp[enum.position()]
if _p_ == p2 {
continue
}
- // Don't bother to attempt to steal if p2 is idle.
- if !idlepMask.read(enum.position()) {
- if gp := runqsteal(_p_, p2, stealRunNextG); gp != nil {
- return gp, false
- }
- }
-
- // Consider stealing timers from p2.
- // This call to checkTimers is the only place where
- // we hold a lock on a different P's timers.
- // Lock contention can be a problem here, so
- // initially avoid grabbing the lock if p2 is running
- // and is not marked for preemption. If p2 is running
- // and not being preempted we assume it will handle its
- // own timers.
+ // Steal timers from p2. This call to checkTimers is the only place
+ // where we might hold a lock on a different P's timers. We do this
+ // once on the last pass before checking runnext because stealing
+ // from the other P's runnext should be the last resort, so if there
+ // are timers to steal do that first.
//
- // If we're still looking for work after checking all
- // the P's, then go ahead and steal from an active P.
+ // We only check timers on one of the stealing iterations because
+ // the time stored in now doesn't change in this loop and checking
+ // the timers for each P more than once with the same value of now
+ // is probably a waste of time.
//
- // TODO(prattmic): Maintain a global look-aside similar
- // to idlepMask to avoid looking at p2 if it can't
- // possibly have timers.
- if i > 2 || (i > 1 && shouldStealTimers(p2)) {
+ // TODO(prattmic): Maintain a global look-aside similar to idlepMask
+ // to avoid looking at p2 if it can't possibly have timers.
+ if stealTimersOrRunNextG {
tnow, w, ran := checkTimers(p2, now)
now = tnow
if w != 0 && (pollUntil == 0 || w < pollUntil) {
@@ -2531,6 +2529,13 @@ top:
ranTimer = true
}
}
+
+ // Don't bother to attempt to steal if p2 is idle.
+ if !idlepMask.read(enum.position()) {
+ if gp := runqsteal(_p_, p2, stealTimersOrRunNextG); gp != nil {
+ return gp, false
+ }
+ }
}
}
if ranTimer {
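The loop above now makes a fixed number of passes and defers both timer stealing and runnext stealing to the final pass. A schematic, self-contained sketch of that shape; proc, steal, and their fields are invented for illustration and are not runtime types:

package main

import "fmt"

// proc is a stand-in for a P with a run queue, a runnext slot, and some timers.
type proc struct {
	runq    []int
	runnext int
	timers  int
}

const stealTries = 4

func steal(self int, procs []*proc) (work int, ok bool) {
	for i := 0; i < stealTries; i++ {
		// Only the final pass also steals timers and the victim's runnext:
		// runnext is a last resort, and each P's timers need checking at most
		// once for a given value of "now".
		finalPass := i == stealTries-1
		for idx, p2 := range procs {
			if idx == self {
				continue
			}
			if finalPass && p2.timers > 0 {
				p2.timers-- // "run" a stolen timer
			}
			if len(p2.runq) > 0 {
				work, p2.runq = p2.runq[0], p2.runq[1:]
				return work, true
			}
			if finalPass && p2.runnext != 0 {
				work, p2.runnext = p2.runnext, 0
				return work, true
			}
		}
	}
	return 0, false
}

func main() {
	procs := []*proc{{}, {runnext: 7, timers: 1}}
	fmt.Println(steal(0, procs)) // 7 true: found on the final pass
}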
@@ -2606,7 +2611,7 @@ stop:
// drop nmspinning first and then check all per-P queues again (with
// #StoreLoad memory barrier in between). If we do it the other way around,
// another thread can submit a goroutine after we've checked all run queues
- // but before we drop nmspinning; as the result nobody will unpark a thread
+ // but before we drop nmspinning; as a result nobody will unpark a thread
// to run the goroutine.
// If we discover new work below, we need to restore m.spinning as a signal
// for resetspinning to unpark a new worker thread (because there can be more
@@ -2640,6 +2645,35 @@ stop:
}
}
+ // Similar to above, check for timer creation or expiry concurrently with
+ // transitioning from spinning to non-spinning. Note that we cannot use
+ // checkTimers here because it calls adjusttimers which may need to allocate
+ // memory, and that isn't allowed when we don't have an active P.
+ for _, _p_ := range allpSnapshot {
+ // This is similar to nobarrierWakeTime, but minimizes calls to
+ // nanotime.
+ if atomic.Load(&_p_.adjustTimers) > 0 {
+ if now == 0 {
+ now = nanotime()
+ }
+ pollUntil = now
+ } else {
+ w := int64(atomic.Load64(&_p_.timer0When))
+ if w != 0 && (pollUntil == 0 || w < pollUntil) {
+ pollUntil = w
+ }
+ }
+ }
+ if pollUntil != 0 {
+ if now == 0 {
+ now = nanotime()
+ }
+ delta = pollUntil - now
+ if delta < 0 {
+ delta = 0
+ }
+ }
+
// Check for idle-priority GC work again.
if gcBlackenEnabled != 0 && gcMarkWorkAvailable(nil) {
lock(&sched.lock)
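The new block above finds the earliest timer wakeup across the allp snapshot while reading the clock at most once. A small stand-alone sketch of that computation, with plain slices standing in for the per-P adjustTimers and timer0When fields and a counting stub in place of nanotime:

package main

import "fmt"

var clockReads int

// nanotime is a stub that counts how often the clock is consulted.
func nanotime() int64 {
	clockReads++
	return 1_000_000
}

// earliestWakeup mirrors the loop above: a P with pending timer adjustments
// wants to be woken at "now"; otherwise its earliest timer (timer0When) is a
// candidate for pollUntil. The clock is read lazily, at most once.
func earliestWakeup(adjust []bool, timer0When []int64) (pollUntil, now int64) {
	for i := range adjust {
		if adjust[i] {
			if now == 0 {
				now = nanotime()
			}
			pollUntil = now
		} else if w := timer0When[i]; w != 0 && (pollUntil == 0 || w < pollUntil) {
			pollUntil = w
		}
	}
	return pollUntil, now
}

func main() {
	pollUntil, _ := earliestWakeup([]bool{false, false}, []int64{3_000_000, 2_000_000})
	fmt.Println(pollUntil, "clock reads:", clockReads) // 2000000 clock reads: 0
}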
@@ -2735,9 +2769,9 @@ func pollWork() bool {
return false
}
-// wakeNetPoller wakes up the thread sleeping in the network poller,
-// if there is one, and if it isn't going to wake up anyhow before
-// the when argument.
+// wakeNetPoller wakes up the thread sleeping in the network poller if it isn't
+// going to wake up before the when argument; or it wakes an idle P to service
+// timers and the network poller if there isn't one already.
func wakeNetPoller(when int64) {
if atomic.Load64(&sched.lastpoll) == 0 {
// In findrunnable we ensure that when polling the pollUntil
@@ -2748,6 +2782,10 @@ func wakeNetPoller(when int64) {
if pollerPollUntil == 0 || pollerPollUntil > when {
netpollBreak()
}
+ } else {
+ // There are no threads in the network poller; try to get
+ // one there so it can handle new timers.
+ wakep()
}
}
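A compact sketch of the wakeNetPoller decision after this change, with the scheduler state passed in as plain values and netpollBreak/wakep stubbed so the two branches are easy to see; this is an illustration, not the runtime's code:

package main

import "fmt"

func netpollBreak() { fmt.Println("interrupt the sleeping poller") }
func wakep()        { fmt.Println("wake an idle P to poll and run timers") }

// lastpoll == 0 means some thread is currently blocked in the network poller.
func wakeNetPollerSketch(when, lastpoll, pollerPollUntil int64) {
	if lastpoll == 0 {
		// Interrupt the poller only if it would otherwise sleep past the new
		// timer's deadline.
		if pollerPollUntil == 0 || pollerPollUntil > when {
			netpollBreak()
		}
	} else {
		// Nobody is polling; try to start an idle P so the new timer is not
		// left waiting for a busy P to notice it.
		wakep()
	}
}

func main() {
	wakeNetPollerSketch(100, 0, 200) // poller would sleep too long: break it
	wakeNetPollerSketch(100, 1, 0)   // nobody polling: wake an idle P
}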
@@ -3034,25 +3072,6 @@ func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) {
return rnow, pollUntil, ran
}
-// shouldStealTimers reports whether we should try stealing the timers from p2.
-// We don't steal timers from a running P that is not marked for preemption,
-// on the assumption that it will run its own timers. This reduces
-// contention on the timers lock.
-func shouldStealTimers(p2 *p) bool {
- if p2.status != _Prunning {
- return true
- }
- mp := p2.m.ptr()
- if mp == nil || mp.locks > 0 {
- return false
- }
- gp := mp.curg
- if gp == nil || gp.atomicstatus != _Grunning || !gp.preempt {
- return false
- }
- return true
-}
-
func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
unlock((*mutex)(lock))
return true
@@ -4603,7 +4622,7 @@ func procresize(nprocs int32) *p {
}
sched.procresizetime = now
- maskWords := (nprocs+31) / 32
+ maskWords := (nprocs + 31) / 32
// Grow allp if necessary.
if nprocs > int32(len(allp)) {
@@ -4927,11 +4946,28 @@ func sysmon() {
}
usleep(delay)
mDoFixup()
+
+ // sysmon should not enter deep sleep if schedtrace is enabled so that
+ // it can print that information at the right time.
+ //
+ // It should also not enter deep sleep if there are any active P's so
+ // that it can retake P's from syscalls, preempt long running G's, and
+ // poll the network if all P's are busy for long stretches.
+ //
+ // It should wake up from deep sleep if any P's become active, either due
+ // to exiting a syscall or waking up due to a timer expiring, so that it
+ // can resume performing those duties. If it wakes from a syscall it
+ // resets idle and delay as a bet that since it had retaken a P from a
+ // syscall before, it may need to do it again shortly after the
+ // application starts work again. It does not reset idle when waking
+ // from a timer to avoid adding system load to applications that spend
+ // most of their time sleeping.
now := nanotime()
- next, _ := timeSleepUntil()
if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) {
lock(&sched.lock)
if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) {
+ syscallWake := false
+ next, _ := timeSleepUntil()
if next > now {
atomic.Store(&sched.sysmonwait, 1)
unlock(&sched.lock)
@@ -4945,33 +4981,27 @@ func sysmon() {
if shouldRelax {
osRelax(true)
}
- notetsleep(&sched.sysmonnote, sleep)
+ syscallWake = notetsleep(&sched.sysmonnote, sleep)
mDoFixup()
if shouldRelax {
osRelax(false)
}
- now = nanotime()
- next, _ = timeSleepUntil()
lock(&sched.lock)
atomic.Store(&sched.sysmonwait, 0)
noteclear(&sched.sysmonnote)
}
- idle = 0
- delay = 20
+ if syscallWake {
+ idle = 0
+ delay = 20
+ }
}
unlock(&sched.lock)
}
+
lock(&sched.sysmonlock)
- {
- // If we spent a long time blocked on sysmonlock
- // then we want to update now and next since it's
- // likely stale.
- now1 := nanotime()
- if now1-now > 50*1000 /* 50µs */ {
- next, _ = timeSleepUntil()
- }
- now = now1
- }
+ // Update now in case we blocked on sysmonnote or spent a long time
+ // blocked on schedlock or sysmonlock above.
+ now = nanotime()
// trigger libc interceptors if needed
if *cgo_yield != nil {
@@ -4996,12 +5026,6 @@ func sysmon() {
}
}
mDoFixup()
- if next < now {
- // There are timers that should have already run,
- // perhaps because there is an unpreemptible P.
- // Try to start an M to run them.
- startm(nil, false)
- }
if atomic.Load(&scavenge.sysmonWake) != 0 {
// Kick the scavenger awake if someone requested it.
wakeScavenger()
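Sketch of the sysmon backoff the comments above describe: the poll delay grows toward 10ms while everything stays idle, and only a wakeup attributed to a syscall resets it. nextDelay is an invented model of that policy (the 20us start, doubling after 50 idle cycles, and 10ms cap match sysmon's existing loop), not the runtime's code:

package main

import "fmt"

// nextDelay returns the updated idle counter and sleep delay (in microseconds)
// for one sysmon cycle. syscallWake models notetsleep reporting that the sleep
// was cut short by a wakeup rather than timing out.
func nextDelay(idle, delay int, syscallWake bool) (int, int) {
	if idle == 0 {
		delay = 20 // start at 20us
	} else if idle > 50 {
		delay *= 2 // back off once idle for a while
	}
	if delay > 10*1000 {
		delay = 10 * 1000 // cap at 10ms
	}
	if syscallWake {
		// A P was just retaken from a syscall; bet that it will happen again
		// soon and go back to fast polling.
		return 0, 20
	}
	return idle + 1, delay
}

func main() {
	idle, delay := 0, 20
	for i := 0; i < 60; i++ {
		idle, delay = nextDelay(idle, delay, false)
	}
	fmt.Println(idle, delay)                  // backoff has grown to the cap
	fmt.Println(nextDelay(idle, delay, true)) // a syscall wake resets it
}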
diff --git a/src/time/sleep_test.go b/src/time/sleep_test.go
index f5678020b9..ba0016bf49 100644
--- a/src/time/sleep_test.go
+++ b/src/time/sleep_test.go
@@ -501,3 +501,190 @@ func TestZeroTimerStopPanics(t *testing.T) {
var tr Timer
tr.Stop()
}
+
+// Benchmark timer latency when the thread that creates the timer is busy with
+// other work and the timers must be serviced by other threads.
+// https://golang.org/issue/38860
+func BenchmarkParallelTimerLatency(b *testing.B) {
+ gmp := runtime.GOMAXPROCS(0)
+ if gmp < 2 || runtime.NumCPU() < gmp {
+ b.Skip("skipping with GOMAXPROCS < 2 or NumCPU < GOMAXPROCS")
+ }
+
+ // allocate memory now to avoid GC interference later.
+ timerCount := gmp - 1
+ stats := make([]struct {
+ sum float64
+ max Duration
+ count int64
+ _ [5]int64 // cache line padding
+ }, timerCount)
+
+ // Ensure the time to start new threads to service timers will not pollute
+ // the results.
+ warmupScheduler(gmp)
+
+ // Note that, other than the AfterFunc calls this benchmark is measuring, it
+ // avoids using any other timers. In particular, the main goroutine uses
+ // doWork to spin for some durations because, up through Go 1.15, if all
+ // threads are idle, sysmon could enter deep sleep and have to wake when we do.
+
+ // Ensure sysmon is in deep sleep.
+ doWork(30 * Millisecond)
+
+ b.ResetTimer()
+
+ const delay = Millisecond
+ var wg sync.WaitGroup
+ var count int32
+ for i := 0; i < b.N; i++ {
+ wg.Add(timerCount)
+ atomic.StoreInt32(&count, 0)
+ for j := 0; j < timerCount; j++ {
+ j := j
+ expectedWakeup := Now().Add(delay)
+ AfterFunc(delay, func() {
+ late := Since(expectedWakeup)
+ if late < 0 {
+ late = 0
+ }
+ stats[j].count++
+ stats[j].sum += float64(late.Nanoseconds())
+ if late > stats[j].max {
+ stats[j].max = late
+ }
+ atomic.AddInt32(&count, 1)
+ for atomic.LoadInt32(&count) < int32(timerCount) {
+ // spin until all timers fired
+ }
+ wg.Done()
+ })
+ }
+
+ for atomic.LoadInt32(&count) < int32(timerCount) {
+ // spin until all timers fired
+ }
+ wg.Wait()
+
+ // Spin for a bit to let the other scheduler threads go idle before the
+ // next round.
+ doWork(Millisecond)
+ }
+ var total float64
+ var samples float64
+ max := Duration(0)
+ for _, s := range stats {
+ if s.max > max {
+ max = s.max
+ }
+ total += s.sum
+ samples += float64(s.count)
+ }
+ b.ReportMetric(0, "ns/op")
+ b.ReportMetric(total/samples, "avg-late-ns")
+ b.ReportMetric(float64(max.Nanoseconds()), "max-late-ns")
+}
+
+// Benchmark timer latency with staggered wakeup times and varying CPU-bound
+// workloads. https://golang.org/issue/38860
+func BenchmarkStaggeredTickerLatency(b *testing.B) {
+ gmp := runtime.GOMAXPROCS(0)
+ if gmp < 2 || runtime.NumCPU() < gmp {
+ b.Skip("skipping with GOMAXPROCS < 2 or NumCPU < GOMAXPROCS")
+ }
+
+ const delay = 3 * Millisecond
+
+ for _, dur := range []Duration{300 * Microsecond, 2 * Millisecond} {
+ b.Run(fmt.Sprintf("work-dur=%s", dur), func(b *testing.B) {
+ for tickersPerP := 1; tickersPerP < int(delay/dur)+1; tickersPerP++ {
+ tickerCount := gmp * tickersPerP
+ b.Run(fmt.Sprintf("tickers-per-P=%d", tickersPerP), func(b *testing.B) {
+ // allocate memory now to avoid GC interference later.
+ stats := make([]struct {
+ sum float64
+ max Duration
+ count int64
+ _ [5]int64 // cache line padding
+ }, tickerCount)
+
+ // Ensure the time to start new threads to service timers
+ // will not pollute the results.
+ warmupScheduler(gmp)
+
+ b.ResetTimer()
+
+ var wg sync.WaitGroup
+ wg.Add(tickerCount)
+ for j := 0; j < tickerCount; j++ {
+ j := j
+ doWork(delay / Duration(gmp))
+ expectedWakeup := Now().Add(delay)
+ ticker := NewTicker(delay)
+ go func(c int, ticker *Ticker, firstWake Time) {
+ defer ticker.Stop()
+
+ for ; c > 0; c-- {
+ <-ticker.C
+ late := Since(expectedWakeup)
+ if late < 0 {
+ late = 0
+ }
+ stats[j].count++
+ stats[j].sum += float64(late.Nanoseconds())
+ if late > stats[j].max {
+ stats[j].max = late
+ }
+ expectedWakeup = expectedWakeup.Add(delay)
+ doWork(dur)
+ }
+ wg.Done()
+ }(b.N, ticker, expectedWakeup)
+ }
+ wg.Wait()
+
+ var total float64
+ var samples float64
+ max := Duration(0)
+ for _, s := range stats {
+ if s.max > max {
+ max = s.max
+ }
+ total += s.sum
+ samples += float64(s.count)
+ }
+ b.ReportMetric(0, "ns/op")
+ b.ReportMetric(total/samples, "avg-late-ns")
+ b.ReportMetric(float64(max.Nanoseconds()), "max-late-ns")
+ })
+ }
+ })
+ }
+}
+
+// warmupScheduler ensures the scheduler has at least targetThreadCount threads
+// in its thread pool.
+func warmupScheduler(targetThreadCount int) {
+ var wg sync.WaitGroup
+ var count int32
+ for i := 0; i < targetThreadCount; i++ {
+ wg.Add(1)
+ go func() {
+ atomic.AddInt32(&count, 1)
+ for atomic.LoadInt32(&count) < int32(targetThreadCount) {
+ // spin until all threads started
+ }
+
+ // spin a bit more to ensure they are all running on separate CPUs.
+ doWork(Millisecond)
+ wg.Done()
+ }()
+ }
+ wg.Wait()
+}
+
+func doWork(dur Duration) {
+ start := Now()
+ for Since(start) < dur {
+ }
+}