author     Michael Anthony Knyszek <mknyszek@google.com>  2022-02-10 00:49:44 +0000
committer  Michael Knyszek <mknyszek@google.com>          2022-04-26 22:15:21 +0000
commit     d29f5247b8cbf5f2cb7b0e325a5eb1c7c5c1a91f (patch)
tree       b17cc67d7d830262267d264799f5ed4e95da8ed8
parent     d8cf2243e0ed1c498ed405432c10f9596815a582 (diff)
runtime: refactor the scavenger and make it testable
This change refactors the scavenger into a type whose methods represent
the actual function and scheduling of the scavenger. It also stubs out
access to global state in order to make it testable.

This change thus also adds a test for the scavenger. In writing this
test, I discovered the lack of a behavior I expected: if
pageAlloc.scavenge returns less than the number of bytes requested,
that means the heap is exhausted. This has been true this whole time,
but was not documented or explicitly relied upon. This change rectifies
that.

In theory this means the scavenger could spin in run() indefinitely (as
happened in the test) if shouldStop never told it to stop. In practice,
shouldStop fires long before the heap is exhausted, but for future
changes it may be important. At the very least it's good to be
intentional about these things.

While we're here, I also moved the call to stopTimer out of wake and
into sleep. There's no reason to add more operations to a context that's
already precarious (running without a P on sysmon).

Change-Id: Ib31b86379fd9df84f25ae282734437afc540da5c
Reviewed-on: https://go-review.googlesource.com/c/go/+/384734
Reviewed-by: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
-rw-r--r--  src/runtime/export_test.go       104
-rw-r--r--  src/runtime/mgcscavenge.go       584
-rw-r--r--  src/runtime/mgcscavenge_test.go  112
-rw-r--r--  src/runtime/mgcsweep.go            7
-rw-r--r--  src/runtime/proc.go                4
5 files changed, 580 insertions(+), 231 deletions(-)
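
The invariant the commit message leans on (a short return from scavenge means
the heap is exhausted, so run() parks rather than spinning) can be distilled
into a short standalone sketch. This is illustrative only, not runtime code:
scavengeStub, quantum, and the byte counts are made-up stand-ins for
pageAlloc.scavenge and scavengeQuantum.

	package main

	import "fmt"

	// quantum plays the role of scavengeQuantum: how much we ask for per call.
	const quantum = 64 << 10

	// scavengeStub models pageAlloc.scavenge: it returns fewer bytes than
	// requested only when there is nothing left to scavenge.
	func scavengeStub(avail *uintptr, n uintptr) uintptr {
		if n > *avail {
			n = *avail
		}
		*avail -= n
		return n
	}

	func main() {
		avail := uintptr(200 << 10) // pretend 200 KiB is scavengable
		var released uintptr
		for {
			r := scavengeStub(&avail, quantum)
			released += r
			// The newly documented invariant: a short return means the
			// heap is exhausted, so stop instead of spinning.
			if r < quantum {
				break
			}
		}
		fmt.Println("released", released, "bytes") // released 204800 bytes
	}
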
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index 0cf2fb4ea7..3916eaf0e3 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -1372,3 +1372,107 @@ func NewPIController(kp, ti, tt, min, max float64) *PIController {
func (c *PIController) Next(input, setpoint, period float64) (float64, bool) {
return c.piController.next(input, setpoint, period)
}
+
+const ScavengePercent = scavengePercent
+
+type Scavenger struct {
+ Sleep func(int64) int64
+ Scavenge func(uintptr) (uintptr, int64)
+ ShouldStop func() bool
+ GoMaxProcs func() int32
+
+ released atomic.Uintptr
+ scavenger scavengerState
+ stop chan<- struct{}
+ done <-chan struct{}
+}
+
+func (s *Scavenger) Start() {
+ if s.Sleep == nil || s.Scavenge == nil || s.ShouldStop == nil || s.GoMaxProcs == nil {
+ panic("must populate all stubs")
+ }
+
+ // Install hooks.
+ s.scavenger.sleepStub = s.Sleep
+ s.scavenger.scavenge = s.Scavenge
+ s.scavenger.shouldStop = s.ShouldStop
+ s.scavenger.gomaxprocs = s.GoMaxProcs
+
+ // Start up scavenger goroutine, and wait for it to be ready.
+ stop := make(chan struct{})
+ s.stop = stop
+ done := make(chan struct{})
+ s.done = done
+ go func() {
+ // This should match bgscavenge, loosely.
+ s.scavenger.init()
+ s.scavenger.park()
+ for {
+ select {
+ case <-stop:
+ close(done)
+ return
+ default:
+ }
+ released, workTime := s.scavenger.run()
+ if released == 0 {
+ s.scavenger.park()
+ continue
+ }
+ s.released.Add(released)
+ s.scavenger.sleep(workTime)
+ }
+ }()
+ if !s.BlockUntilParked(1e9 /* 1 second */) {
+ panic("timed out waiting for scavenger to get ready")
+ }
+}
+
+// BlockUntilParked blocks until the scavenger parks, or until
+// timeout is exceeded. Returns true if the scavenger parked.
+//
+// Note that in testing, parked means something slightly different.
+// In anger, the scavenger parks to sleep, too, but in testing,
+// it only parks when it actually has no work to do.
+func (s *Scavenger) BlockUntilParked(timeout int64) bool {
+ // Just spin, waiting for it to park.
+ //
+ // The actual parking process is racy with respect to
+ // wakeups, which is fine, but for testing we need something
+ // a bit more robust.
+ start := nanotime()
+ for nanotime()-start < timeout {
+ lock(&s.scavenger.lock)
+ parked := s.scavenger.parked
+ unlock(&s.scavenger.lock)
+ if parked {
+ return true
+ }
+ Gosched()
+ }
+ return false
+}
+
+// Released returns how many bytes the scavenger released.
+func (s *Scavenger) Released() uintptr {
+ return s.released.Load()
+}
+
+// Wake wakes up a parked scavenger to keep running.
+func (s *Scavenger) Wake() {
+ s.scavenger.wake()
+}
+
+// Stop cleans up the scavenger's resources. The scavenger
+// must be parked for this to work.
+func (s *Scavenger) Stop() {
+ lock(&s.scavenger.lock)
+ parked := s.scavenger.parked
+ unlock(&s.scavenger.lock)
+ if !parked {
+ panic("tried to clean up scavenger that is not parked")
+ }
+ close(s.stop)
+ s.Wake()
+ <-s.done
+}
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index 5f50378adf..1abdbf3a0d 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -163,53 +163,186 @@ func gcPaceScavenger(heapGoal, lastHeapGoal uint64) {
atomic.Store64(&mheap_.scavengeGoal, retainedGoal)
}
+const (
+ // It doesn't really matter what value we start at, but we can't be zero, because
+ // that'll cause divide-by-zero issues. Pick something conservative which we'll
+ // also use as a fallback.
+ startingScavSleepRatio = 0.001
+
+ // Spend at least 1 ms scavenging, otherwise the corresponding
+ // sleep time to maintain our desired utilization is too low to
+ // be reliable.
+ minScavWorkTime = 1e6
+)
+
// Sleep/wait state of the background scavenger.
-var scavenge struct {
- lock mutex
- g *g
- parked bool
- timer *timer
- sysmonWake uint32 // Set atomically.
- printControllerReset bool // Whether the scavenger is in cooldown.
+var scavenger scavengerState
+
+type scavengerState struct {
+ // lock protects all fields below.
+ lock mutex
+
+ // g is the goroutine the scavenger is bound to.
+ g *g
+
+ // parked is whether or not the scavenger is parked.
+ parked bool
+
+ // timer is the timer used for the scavenger to sleep.
+ timer *timer
+
+ // sysmonWake signals to sysmon that it should wake the scavenger.
+ sysmonWake atomic.Uint32
+
+ // targetCPUFraction is the target CPU overhead for the scavenger.
+ targetCPUFraction float64
+
+ // sleepRatio is the ratio of time spent doing scavenging work to
+ // time spent sleeping. This is used to decide how long the scavenger
+ // should sleep for in between batches of work. It is set by
+ // sleepController in order to maintain a CPU overhead of
+ // targetCPUFraction.
+ //
+ // Lower means more sleep, higher means more aggressive scavenging.
+ sleepRatio float64
+
+ // sleepController controls sleepRatio.
+ //
+ // See sleepRatio for more details.
+ sleepController piController
+
+ // controllerCooldown is the time left in nanoseconds during which we avoid
+ // using the controller and we hold sleepRatio at a conservative
+ // value. Used if the controller's assumptions fail to hold.
+ controllerCooldown int64
+
+ // printControllerReset instructs printScavTrace to signal that
+ // the controller was reset.
+ printControllerReset bool
+
+ // sleepStub is a stub used for testing to avoid actually having
+ // the scavenger sleep.
+ //
+ // Unlike the other stubs, this is not populated if left nil.
+ // Instead, it is called when non-nil because any valid implementation
+ // of this function basically requires closing over this scavenger
+ // state, and allocating a closure is not allowed in the runtime as
+ // a matter of policy.
+ sleepStub func(n int64) int64
+
+ // scavenge is a function that scavenges n bytes of memory.
+ // Returns how many bytes of memory it actually scavenged, as
+ // well as the time it took in nanoseconds. Usually mheap.pages.scavenge
+ // with nanotime called around it, but stubbed out for testing.
+ // Like mheap.pages.scavenge, if it scavenges less than n bytes of
+ // memory, the caller may assume the heap is exhausted of scavengable
+ // memory for now.
+ //
+ // If this is nil, it is populated with the real thing in init.
+ scavenge func(n uintptr) (uintptr, int64)
+
+ // shouldStop is a callback called in the work loop and provides a
+ // point that can force the scavenger to stop early, for example because
+ // the scavenge policy dictates too much has been scavenged already.
+ //
+ // If this is nil, it is populated with the real thing in init.
+ shouldStop func() bool
+
+ // gomaxprocs returns the current value of gomaxprocs. Stub for testing.
+ //
+ // If this is nil, it is populated with the real thing in init.
+ gomaxprocs func() int32
}
-// readyForScavenger signals sysmon to wake the scavenger because
-// there may be new work to do.
+// init initializes a scavenger state and wires it to the current G.
//
-// There may be a significant delay between when this function runs
-// and when the scavenger is kicked awake, but it may be safely invoked
-// in contexts where wakeScavenger is unsafe to call directly.
-func readyForScavenger() {
- atomic.Store(&scavenge.sysmonWake, 1)
+// Must be called from a regular goroutine that can allocate.
+func (s *scavengerState) init() {
+ if s.g != nil {
+ throw("scavenger state is already wired")
+ }
+ lockInit(&s.lock, lockRankScavenge)
+ s.g = getg()
+
+ s.timer = new(timer)
+ s.timer.arg = s
+ s.timer.f = func(s any, _ uintptr) {
+ s.(*scavengerState).wake()
+ }
+
+ // input: fraction of CPU time actually used.
+ // setpoint: ideal CPU fraction.
+ // output: ratio of time worked to time slept (determines sleep time).
+ //
+ // The output of this controller is somewhat indirect to what we actually
+ // want to achieve: how much time to sleep for. The reason for this definition
+ // is to ensure that the controller's outputs have a direct relationship with
+ // its inputs (as opposed to an inverse relationship), making it somewhat
+ // easier to reason about for tuning purposes.
+ s.sleepController = piController{
+ // Tuned loosely via Ziegler-Nichols process.
+ kp: 0.3375,
+ ti: 3.2e6,
+ tt: 1e9, // 1 second reset time.
+
+ // These ranges seem wide, but we want to give the controller plenty of
+ // room to hunt for the optimal value.
+ min: 0.001, // 1:1000
+ max: 1000.0, // 1000:1
+ }
+ s.sleepRatio = startingScavSleepRatio
+
+ // Install real functions if stubs aren't present.
+ if s.scavenge == nil {
+ s.scavenge = func(n uintptr) (uintptr, int64) {
+ start := nanotime()
+ r := mheap_.pages.scavenge(n)
+ end := nanotime()
+ if start >= end {
+ return r, 0
+ }
+ return r, end - start
+ }
+ }
+ if s.shouldStop == nil {
+ s.shouldStop = func() bool {
+ // If background scavenging is disabled or if there's no work to do, just stop.
+ return heapRetained() <= atomic.Load64(&mheap_.scavengeGoal)
+ }
+ }
+ if s.gomaxprocs == nil {
+ s.gomaxprocs = func() int32 {
+ return gomaxprocs
+ }
+ }
+}
+
+// park parks the scavenger goroutine.
+func (s *scavengerState) park() {
+ lock(&s.lock)
+ if getg() != s.g {
+ throw("tried to park scavenger from another goroutine")
+ }
+ s.parked = true
+ goparkunlock(&s.lock, waitReasonGCScavengeWait, traceEvGoBlock, 2)
}
-// wakeScavenger immediately unparks the scavenger if necessary.
-//
-// May run without a P, but it may allocate, so it must not be called
-// on any allocation path.
-//
-// mheap_.lock, scavenge.lock, and sched.lock must not be held.
-func wakeScavenger() {
- lock(&scavenge.lock)
- if scavenge.parked {
- // Notify sysmon that it shouldn't bother waking up the scavenger.
- atomic.Store(&scavenge.sysmonWake, 0)
-
- // Try to stop the timer but we don't really care if we succeed.
- // It's possible that either a timer was never started, or that
- // we're racing with it.
- // In the case that we're racing with there's the low chance that
- // we experience a spurious wake-up of the scavenger, but that's
- // totally safe.
- stopTimer(scavenge.timer)
-
- // Unpark the goroutine and tell it that there may have been a pacing
- // change. Note that we skip the scheduler's runnext slot because we
- // want to avoid having the scavenger interfere with the fair
- // scheduling of user goroutines. In effect, this schedules the
- // scavenger at a "lower priority" but that's OK because it'll
- // catch up on the work it missed when it does get scheduled.
- scavenge.parked = false
+// ready signals to sysmon that the scavenger should be awoken.
+func (s *scavengerState) ready() {
+ s.sysmonWake.Store(1)
+}
+
+// wake immediately unparks the scavenger if necessary.
+//
+// Safe to run without a P.
+func (s *scavengerState) wake() {
+ lock(&s.lock)
+ if s.parked {
+ // Unset sysmonWake, since the scavenger is now being awoken.
+ s.sysmonWake.Store(0)
+
+ // s.parked is unset to prevent a double wake-up.
+ s.parked = false
// Ready the goroutine by injecting it. We use injectglist instead
// of ready or goready in order to allow us to run this function
@@ -218,217 +351,217 @@ func wakeScavenger() {
// the scavenger from interfering with user goroutine scheduling
// too much.
var list gList
- list.push(scavenge.g)
+ list.push(s.g)
injectglist(&list)
}
- unlock(&scavenge.lock)
+ unlock(&s.lock)
}
-// scavengeSleep attempts to put the scavenger to sleep for ns.
+// sleep puts the scavenger to sleep based on the amount of time that it worked
+// in nanoseconds.
//
// Note that this function should only be called by the scavenger.
//
// The scavenger may be woken up earlier by a pacing change, and it may not go
// to sleep at all if there's a pending pacing change.
-//
-// Returns the amount of time actually slept.
-func scavengeSleep(ns int64) int64 {
- lock(&scavenge.lock)
-
- // Set the timer.
- //
- // This must happen here instead of inside gopark
- // because we can't close over any variables without
- // failing escape analysis.
- start := nanotime()
- resetTimer(scavenge.timer, start+ns)
-
- // Mark ourself as asleep and go to sleep.
- scavenge.parked = true
- goparkunlock(&scavenge.lock, waitReasonSleep, traceEvGoSleep, 2)
-
- // Return how long we actually slept for.
- return nanotime() - start
-}
-
-// Background scavenger.
-//
-// The background scavenger maintains the RSS of the application below
-// the line described by the proportional scavenging statistics in
-// the mheap struct.
-func bgscavenge(c chan int) {
- scavenge.g = getg()
+func (s *scavengerState) sleep(worked float64) {
+ lock(&s.lock)
+ if getg() != s.g {
+ throw("tried to sleep scavenger from another goroutine")
+ }
- lockInit(&scavenge.lock, lockRankScavenge)
- lock(&scavenge.lock)
- scavenge.parked = true
+ if worked < minScavWorkTime {
+ // This means there wasn't enough work to actually fill up minScavWorkTime.
+ // That's fine; we shouldn't try to do anything with this information
+ // because it's going to result in a short enough sleep request that things
+ // will get messy. Just assume we did at least this much work.
+ // All this means is that we'll sleep longer than we otherwise would have.
+ worked = minScavWorkTime
+ }
- scavenge.timer = new(timer)
- scavenge.timer.f = func(_ any, _ uintptr) {
- wakeScavenger()
+ // Multiply the critical time by 1 + the ratio of the costs of using
+ // scavenged memory vs. scavenging memory. This forces us to pay down
+ // the cost of reusing this memory eagerly by sleeping for a longer period
+ // of time and scavenging less frequently. More concretely, we avoid situations
+ // where we end up scavenging so often that we hurt allocation performance
+ // because of the additional overheads of using scavenged memory.
+ worked *= 1 + scavengeCostRatio
+
+ // sleepTime is the amount of time we're going to sleep, based on the amount
+ // of time we worked, and the sleepRatio.
+ sleepTime := int64(worked / s.sleepRatio)
+
+ var slept int64
+ if s.sleepStub == nil {
+ // Set the timer.
+ //
+ // This must happen here instead of inside gopark
+ // because we can't close over any variables without
+ // failing escape analysis.
+ start := nanotime()
+ resetTimer(s.timer, start+sleepTime)
+
+ // Mark ourselves as asleep and go to sleep.
+ s.parked = true
+ goparkunlock(&s.lock, waitReasonSleep, traceEvGoSleep, 2)
+
+ // How long we actually slept for.
+ slept = nanotime() - start
+
+ lock(&s.lock)
+ // Stop the timer here because s.wake is unable to do it for us.
+ // We don't really care if we succeed in stopping the timer. One
+ // reason we might fail is that we've already woken up, but the timer
+ // might be in the process of firing on some other P; essentially we're
+ // racing with it. That's totally OK. Double wake-ups are perfectly safe.
+ stopTimer(s.timer)
+ unlock(&s.lock)
+ } else {
+ unlock(&s.lock)
+ slept = s.sleepStub(sleepTime)
}
- c <- 1
- goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
+ // Stop here if we're cooling down from the controller.
+ if s.controllerCooldown > 0 {
+ // worked and slept aren't exact measures of time, but it's OK to be a bit
+ // sloppy here. We're just hoping we're avoiding some transient bad behavior.
+ t := slept + int64(worked)
+ if t > s.controllerCooldown {
+ s.controllerCooldown = 0
+ } else {
+ s.controllerCooldown -= t
+ }
+ return
+ }
// idealFraction is the ideal % of overall application CPU time that we
// spend scavenging.
idealFraction := float64(scavengePercent) / 100.0
- // Input: fraction of CPU time used.
- // Setpoint: idealFraction.
- // Output: ratio of critical time to sleep time (determines sleep time).
+ // Calculate the CPU time spent.
//
- // The output of this controller is somewhat indirect to what we actually
- // want to achieve: how much time to sleep for. The reason for this definition
- // is to ensure that the controller's outputs have a direct relationship with
- // its inputs (as opposed to an inverse relationship), making it somewhat
- // easier to reason about for tuning purposes.
- critSleepController := piController{
- // Tuned loosely via Ziegler-Nichols process.
- kp: 0.3375,
- ti: 3.2e6,
- tt: 1e9, // 1 second reset time.
-
- // These ranges seem wide, but we want to give the controller plenty of
- // room to hunt for the optimal value.
- min: 0.001, // 1:1000
- max: 1000.0, // 1000:1
+ // This may be slightly inaccurate with respect to GOMAXPROCS, but we're
+ // recomputing this often enough relative to GOMAXPROCS changes in general
+ // (it only changes when the world is stopped, and not during a GC) that
+ // that small inaccuracy is in the noise.
+ cpuFraction := worked / ((float64(slept) + worked) * float64(s.gomaxprocs()))
+
+ // Update sleepRatio, adjusting until we reach our ideal fraction.
+ var ok bool
+ s.sleepRatio, ok = s.sleepController.next(cpuFraction, idealFraction, float64(slept)+worked)
+ if !ok {
+ // The core assumption of the controller, that we can get a proportional
+ // response, broke down. This may be transient, so temporarily switch to
+ // sleeping a fixed, conservative amount.
+ s.sleepRatio = startingScavSleepRatio
+ s.controllerCooldown = 5e9 // 5 seconds.
+
+ // Signal the scav trace printer to output this.
+ s.controllerFailed()
}
- // It doesn't really matter what value we start at, but we can't be zero, because
- // that'll cause divide-by-zero issues. Pick something conservative which we'll
- // also use as a fallback.
- const startingCritSleepRatio = 0.001
- critSleepRatio := startingCritSleepRatio
- // Duration left in nanoseconds during which we avoid using the controller and
- // we hold critSleepRatio at a conservative value. Used if the controller's
- // assumptions fail to hold.
- controllerCooldown := int64(0)
- for {
- released := uintptr(0)
- crit := float64(0)
-
- // Spend at least 1 ms scavenging, otherwise the corresponding
- // sleep time to maintain our desired utilization is too low to
- // be reliable.
- const minCritTime = 1e6
- for crit < minCritTime {
- // If background scavenging is disabled or if there's no work to do just park.
- retained, goal := heapRetained(), atomic.Load64(&mheap_.scavengeGoal)
- if retained <= goal {
- break
- }
-
- // scavengeQuantum is the amount of memory we try to scavenge
- // in one go. A smaller value means the scavenger is more responsive
- // to the scheduler in case of e.g. preemption. A larger value means
- // that the overheads of scavenging are better amortized, so better
- // scavenging throughput.
- //
- // The current value is chosen assuming a cost of ~10µs/physical page
- // (this is somewhat pessimistic), which implies a worst-case latency of
- // about 160µs for 4 KiB physical pages. The current value is biased
- // toward latency over throughput.
- const scavengeQuantum = 64 << 10
+}
- // Accumulate the amount of time spent scavenging.
- start := nanotime()
- r := mheap_.pages.scavenge(scavengeQuantum)
- atomic.Xadduintptr(&mheap_.pages.scav.released, r)
- end := nanotime()
+// controllerFailed indicates that the scavenger's scheduling
+// controller failed.
+func (s *scavengerState) controllerFailed() {
+ lock(&s.lock)
+ s.printControllerReset = true
+ unlock(&s.lock)
+}
- // On some platforms we may see end >= start if the time it takes to scavenge
- // memory is less than the minimum granularity of its clock (e.g. Windows) or
- // due to clock bugs.
- //
- // In this case, just assume scavenging takes 10 µs per regular physical page
- // (determined empirically), and conservatively ignore the impact of huge pages
- // on timing.
- const approxCritNSPerPhysicalPage = 10e3
- if end <= start {
- crit += approxCritNSPerPhysicalPage * float64(r/physPageSize)
- } else {
- crit += float64(end - start)
- }
- released += r
+// run is the body of the main scavenging loop.
+//
+// Returns the number of bytes released and the estimated time spent
+// releasing those bytes.
+//
+// Must be run on the scavenger goroutine.
+func (s *scavengerState) run() (released uintptr, worked float64) {
+ lock(&s.lock)
+ if getg() != s.g {
+ throw("tried to run scavenger from another goroutine")
+ }
+ unlock(&s.lock)
- // When using fake time just do one loop.
- if faketime != 0 {
- break
- }
+ for worked < minScavWorkTime {
+ // If something from outside tells us to stop early, stop.
+ if s.shouldStop() {
+ break
}
- if released == 0 {
- lock(&scavenge.lock)
- scavenge.parked = true
- goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
- continue
+ // scavengeQuantum is the amount of memory we try to scavenge
+ // in one go. A smaller value means the scavenger is more responsive
+ // to the scheduler in case of e.g. preemption. A larger value means
+ // that the overheads of scavenging are better amortized, so better
+ // scavenging throughput.
+ //
+ // The current value is chosen assuming a cost of ~10µs/physical page
+ // (this is somewhat pessimistic), which implies a worst-case latency of
+ // about 160µs for 4 KiB physical pages. The current value is biased
+ // toward latency over throughput.
+ const scavengeQuantum = 64 << 10
+
+ // Accumulate the amount of time spent scavenging.
+ r, duration := s.scavenge(scavengeQuantum)
+
+ // On some platforms we may see a duration of 0 if the time it takes to
+ // scavenge memory is less than the minimum granularity of its clock
+ // (e.g. Windows) or due to clock bugs.
+ //
+ // In this case, just assume scavenging takes 10 µs per regular physical page
+ // (determined empirically), and conservatively ignore the impact of huge pages
+ // on timing.
+ const approxWorkedNSPerPhysicalPage = 10e3
+ if duration == 0 {
+ worked += approxWorkedNSPerPhysicalPage * float64(r/physPageSize)
+ } else {
+ // TODO(mknyszek): If duration is small compared to worked, it could be
+ // rounded down to zero. Probably not a problem in practice because the
+ // values are all within a few orders of magnitude of each other but maybe
+ // worth worrying about.
+ worked += float64(duration)
}
+ released += r
- if released < physPageSize {
- // If this happens, it means that we may have attempted to release part
- // of a physical page, but the likely effect of that is that it released
- // the whole physical page, some of which may have still been in-use.
- // This could lead to memory corruption. Throw.
- throw("released less than one physical page of memory")
+ // scavenge does not return until it either finds the requisite amount of
+ // memory to scavenge, or exhausts the heap. If we haven't found enough
+ // to scavenge, then the heap must be exhausted.
+ if r < scavengeQuantum {
+ break
}
-
- if crit < minCritTime {
- // This means there wasn't enough work to actually fill up minCritTime.
- // That's fine; we shouldn't try to do anything with this information
- // because it's going result in a short enough sleep request that things
- // will get messy. Just assume we did at least this much work.
- // All this means is that we'll sleep longer than we otherwise would have.
- crit = minCritTime
+ // When using fake time just do one loop.
+ if faketime != 0 {
+ break
}
+ }
+ if released > 0 && released < physPageSize {
+ // If this happens, it means that we may have attempted to release part
+ // of a physical page, but the likely effect of that is that it released
+ // the whole physical page, some of which may have still been in-use.
+ // This could lead to memory corruption. Throw.
+ throw("released less than one physical page of memory")
+ }
+ return
+}
- // Multiply the critical time by 1 + the ratio of the costs of using
- // scavenged memory vs. scavenging memory. This forces us to pay down
- // the cost of reusing this memory eagerly by sleeping for a longer period
- // of time and scavenging less frequently. More concretely, we avoid situations
- // where we end up scavenging so often that we hurt allocation performance
- // because of the additional overheads of using scavenged memory.
- crit *= 1 + scavengeCostRatio
-
- // Go to sleep based on how much time we spent doing work.
- slept := scavengeSleep(int64(crit / critSleepRatio))
-
- // Stop here if we're cooling down from the controller.
- if controllerCooldown > 0 {
- // crit and slept aren't exact measures of time, but it's OK to be a bit
- // sloppy here. We're just hoping we're avoiding some transient bad behavior.
- t := slept + int64(crit)
- if t > controllerCooldown {
- controllerCooldown = 0
- } else {
- controllerCooldown -= t
- }
- continue
- }
+// Background scavenger.
+//
+// The background scavenger maintains the RSS of the application below
+// the line described by the proportional scavenging statistics in
+// the mheap struct.
+func bgscavenge(c chan int) {
+ scavenger.init()
- // Calculate the CPU time spent.
- //
- // This may be slightly inaccurate with respect to GOMAXPROCS, but we're
- // recomputing this often enough relative to GOMAXPROCS changes in general
- // (it only changes when the world is stopped, and not during a GC) that
- // that small inaccuracy is in the noise.
- cpuFraction := float64(crit) / ((float64(slept) + crit) * float64(gomaxprocs))
-
- // Update the critSleepRatio, adjusting until we reach our ideal fraction.
- var ok bool
- critSleepRatio, ok = critSleepController.next(cpuFraction, idealFraction, float64(slept)+crit)
- if !ok {
- // The core assumption of the controller, that we can get a proportional
- // response, broke down. This may be transient, so temporarily switch to
- // sleeping a fixed, conservative amount.
- critSleepRatio = startingCritSleepRatio
- controllerCooldown = 5e9 // 5 seconds.
-
- // Signal the scav trace printer to output this.
- lock(&scavenge.lock)
- scavenge.printControllerReset = true
- unlock(&scavenge.lock)
+ c <- 1
+ scavenger.park()
+
+ for {
+ released, workTime := scavenger.run()
+ if released == 0 {
+ scavenger.park()
+ continue
}
+ atomic.Xadduintptr(&mheap_.pages.scav.released, released)
+ scavenger.sleep(workTime)
}
}
@@ -438,6 +571,9 @@ func bgscavenge(c chan int) {
// back to the top of the heap.
//
// Returns the amount of memory scavenged in bytes.
+//
+// scavenge always tries to scavenge nbytes worth of memory, and will
+// only fail to do so if the heap is exhausted for now.
func (p *pageAlloc) scavenge(nbytes uintptr) uintptr {
var (
addrs addrRange
@@ -468,9 +604,9 @@ func (p *pageAlloc) scavenge(nbytes uintptr) uintptr {
// was called, and forced indicates whether the scavenge was forced by the
// application.
//
-// scavenge.lock must be held.
+// scavenger.lock must be held.
func printScavTrace(gen uint32, released uintptr, forced bool) {
- assertLockHeld(&scavenge.lock)
+ assertLockHeld(&scavenger.lock)
printlock()
print("scav ", gen, " ",
@@ -480,9 +616,9 @@ func printScavTrace(gen uint32, released uintptr, forced bool) {
)
if forced {
print(" (forced)")
- } else if scavenge.printControllerReset {
+ } else if scavenger.printControllerReset {
print(" [controller reset]")
- scavenge.printControllerReset = false
+ scavenger.printControllerReset = false
}
println()
printunlock()
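
The pacing rewritten above reduces to two formulas: the scavenger sleeps for
worked * (1 + scavengeCostRatio) / sleepRatio nanoseconds, and then feeds
cpuFraction = worked / ((slept + worked) * gomaxprocs) back into the PI
controller against a target of scavengePercent/100. The standalone sketch
below works through one round with made-up numbers; it assumes
scavengeCostRatio is 0 (as it is on most platforms) and a 1% target
(scavengePercent = 1), and is not the runtime's code.

	package main

	import "fmt"

	func main() {
		const gomaxprocs = 1.0
		const sleepRatio = 0.001 // startingScavSleepRatio

		worked := 2e6 // 2 ms of scavenging work, in nanoseconds

		// Sleep long enough to keep the work:sleep ratio at sleepRatio.
		sleepTime := worked / sleepRatio
		fmt.Println("sleep for", sleepTime/1e9, "seconds") // 2 seconds

		// Assume the sleep was not interrupted, then compute the CPU
		// fraction the controller sees.
		slept := sleepTime
		cpuFraction := worked / ((slept + worked) * gomaxprocs)
		fmt.Printf("cpu fraction %.5f vs. target 0.01\n", cpuFraction)
	}

With the conservative starting ratio the measured fraction comes out near
0.001, well under the target, so the controller raises sleepRatio (shortening
sleeps) until the fraction settles around the target.
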
diff --git a/src/runtime/mgcscavenge_test.go b/src/runtime/mgcscavenge_test.go
index 0659293c60..8d92295961 100644
--- a/src/runtime/mgcscavenge_test.go
+++ b/src/runtime/mgcscavenge_test.go
@@ -9,7 +9,9 @@ import (
"internal/goos"
"math/rand"
. "runtime"
+ "runtime/internal/atomic"
"testing"
+ "time"
)
// makePallocData produces an initialized PallocData by setting
@@ -449,3 +451,113 @@ func TestPageAllocScavenge(t *testing.T) {
})
}
}
+
+func TestScavenger(t *testing.T) {
+ // workedTime is a standard conversion of bytes of scavenge
+ // work to time elapsed.
+ workedTime := func(bytes uintptr) int64 {
+ return int64((bytes+4095)/4096) * int64(10*time.Microsecond)
+ }
+
+ // Set up a bunch of state that we're going to track and verify
+ // throughout the test.
+ totalWork := uint64(64<<20 - 3*PhysPageSize)
+ var totalSlept, totalWorked atomic.Int64
+ var availableWork atomic.Uint64
+ var stopAt atomic.Uint64 // How much available work to stop at.
+
+ // Set up the scavenger.
+ var s Scavenger
+ s.Sleep = func(ns int64) int64 {
+ totalSlept.Add(ns)
+ return ns
+ }
+ s.Scavenge = func(bytes uintptr) (uintptr, int64) {
+ avail := availableWork.Load()
+ if uint64(bytes) > avail {
+ bytes = uintptr(avail)
+ }
+ t := workedTime(bytes)
+ if bytes != 0 {
+ availableWork.Add(-int64(bytes))
+ totalWorked.Add(t)
+ }
+ return bytes, t
+ }
+ s.ShouldStop = func() bool {
+ if availableWork.Load() <= stopAt.Load() {
+ return true
+ }
+ return false
+ }
+ s.GoMaxProcs = func() int32 {
+ return 1
+ }
+
+ // Define a helper for verifying that various properties hold.
+ verifyScavengerState := func(t *testing.T, expWork uint64) {
+ t.Helper()
+
+ // Check to make sure it did the amount of work we expected.
+ if workDone := uint64(s.Released()); workDone != expWork {
+ t.Errorf("want %d bytes of work done, got %d", expWork, workDone)
+ }
+ // Check to make sure the scavenger is meeting its CPU target.
+ idealFraction := float64(ScavengePercent) / 100.0
+ cpuFraction := float64(totalWorked.Load()) / float64(totalWorked.Load()+totalSlept.Load())
+ if cpuFraction < idealFraction-0.005 || cpuFraction > idealFraction+0.005 {
+ t.Errorf("want %f CPU fraction, got %f", idealFraction, cpuFraction)
+ }
+ }
+
+ // Start the scavenger.
+ s.Start()
+
+ // Set up some work and let the scavenger run to completion.
+ availableWork.Store(totalWork)
+ s.Wake()
+ if !s.BlockUntilParked(2e9 /* 2 seconds */) {
+ t.Fatal("timed out waiting for scavenger to run to completion")
+ }
+ // Run a check.
+ verifyScavengerState(t, totalWork)
+
+ // Now let's do it again and see what happens when we have no work to do.
+ // It should've gone right back to sleep.
+ s.Wake()
+ if !s.BlockUntilParked(2e9 /* 2 seconds */) {
+ t.Fatal("timed out waiting for scavenger to run to completion")
+ }
+ // Run another check.
+ verifyScavengerState(t, totalWork)
+
+ // One more time, this time doing the same amount of work as the first time.
+ // Let's see if we can get the scavenger to continue.
+ availableWork.Store(totalWork)
+ s.Wake()
+ if !s.BlockUntilParked(2e9 /* 2 seconds */) {
+ t.Fatal("timed out waiting for scavenger to run to completion")
+ }
+ // Run another check.
+ verifyScavengerState(t, 2*totalWork)
+
+ // This time, let's stop after a certain amount of work.
+ //
+ // Pick a stopping point such that when subtracted from totalWork
+ // we get a multiple of a relatively large power of 2. verifyScavengerState
+ // always makes an exact check, but the scavenger might go a little over,
+ // which is OK. If this breaks often or gets annoying to maintain, modify
+ // verifyScavengerState.
+ availableWork.Store(totalWork)
+ stoppingPoint := uint64(1<<20 - 3*PhysPageSize)
+ stopAt.Store(stoppingPoint)
+ s.Wake()
+ if !s.BlockUntilParked(2e9 /* 2 seconds */) {
+ t.Fatal("timed out waiting for scavenger to run to completion")
+ }
+ // Run another check.
+ verifyScavengerState(t, 2*totalWork+(totalWork-stoppingPoint))
+
+ // Clean up.
+ s.Stop()
+}
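
As a sanity check on the tolerance used in verifyScavengerState above: with
GoMaxProcs returning 1 and assuming ScavengePercent is 1, the target CPU
fraction is 0.01, so the test requires totalWorked/(totalWorked+totalSlept)
to fall within [0.005, 0.015]. Since the stubbed Sleep reports sleeping
exactly the requested duration, the scavenger must request sleeps totalling
roughly 99x its reported work time (anywhere from about 66x to 199x passes),
which is what the PI controller converges to.
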
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index d0b81fd3df..a5e04d6ce6 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -263,7 +263,7 @@ func finishsweep_m() {
// Sweeping is done, so if the scavenger isn't already awake,
// wake it up. There's definitely work for it to do at this
// point.
- wakeScavenger()
+ scavenger.wake()
nextMarkBitArenaEpoch()
}
@@ -403,10 +403,7 @@ func sweepone() uintptr {
mheap_.pages.scavengeStartGen()
unlock(&mheap_.lock)
})
- // Since we might sweep in an allocation path, it's not possible
- // for us to wake the scavenger directly via wakeScavenger, since
- // it could allocate. Ask sysmon to do it for us instead.
- readyForScavenger()
+ scavenger.ready()
}
gp.m.locks--
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 485bd65a9e..b72194c76a 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -5182,9 +5182,9 @@ func sysmon() {
startm(nil, false)
}
}
- if atomic.Load(&scavenge.sysmonWake) != 0 {
+ if scavenger.sysmonWake.Load() != 0 {
// Kick the scavenger awake if someone requested it.
- wakeScavenger()
+ scavenger.wake()
}
// retake P's blocked in syscalls
// and preempt long running G's
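
The mgcsweep.go and proc.go hunks above keep the same two-step wake-up:
sweeping (which may be on an allocation path) only sets sysmonWake via
scavenger.ready(), and sysmon later performs the actual wake. The sketch
below models that handshake with made-up names and a channel standing in for
goready/injectglist; it is an illustration of the pattern, not the runtime's
implementation.

	package main

	import (
		"fmt"
		"sync/atomic"
		"time"
	)

	// worker stands in for scavengerState: ready() only sets a flag because
	// the caller must not wake the worker directly; a monitor loop (like
	// sysmon) polls the flag and performs the wake.
	type worker struct {
		monitorWake atomic.Uint32
		wakeCh      chan struct{}
	}

	func (w *worker) ready() { w.monitorWake.Store(1) }

	func (w *worker) wake() {
		w.monitorWake.Store(0) // wake is happening; the monitor can stand down
		select {
		case w.wakeCh <- struct{}{}:
		default: // already awake; dropping the signal is fine
		}
	}

	func main() {
		w := &worker{wakeCh: make(chan struct{}, 1)}

		go func() { // monitor loop, like sysmon
			for {
				if w.monitorWake.Load() != 0 {
					w.wake()
				}
				time.Sleep(10 * time.Millisecond)
			}
		}()

		w.ready() // request a deferred wake-up
		<-w.wakeCh
		fmt.Println("worker woken by monitor")
	}
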