src/runtime/mgclimit.go


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484

// Copyright 2022 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package runtime

import "internal/runtime/atomic"

// gcCPULimiter is a mechanism to limit GC CPU utilization in situations
// where it might become excessive and inhibit application progress (e.g.
// a death spiral).
//
// The core of the limiter is a leaky bucket mechanism that fills with GC
// CPU time and drains with mutator time. Because the bucket fills and
// drains with time directly (i.e. without any weighting), this effectively
// sets a very conservative limit of 50%. This limit could be enforced directly,
// however, but the purpose of the bucket is to accommodate spikes in GC CPU
// utilization without hurting throughput.
//
// Note that the bucket in the leaky bucket mechanism can never go negative,
// so the GC never gets credit for a lot of CPU time spent without the GC
// running. This is intentional, as an application that stays idle for, say,
// an entire day, could build up enough credit to fail to prevent a death
// spiral the following day. The bucket's capacity is the GC's only leeway.
//
// The capacity thus also sets the window the limiter considers. For example,
// if the capacity of the bucket is 1 cpu-second, then the limiter will not
// kick in until at least 1 full cpu-second in the last 2 cpu-second window
// is spent on GC CPU time.
var gcCPULimiter gcCPULimiterState

type gcCPULimiterState struct {
	lock atomic.Uint32

	enabled atomic.Bool
	bucket  struct {
		// Invariants:
		// - fill >= 0
		// - capacity >= 0
		// - fill <= capacity
		fill, capacity uint64
	}
	// overflow is the cumulative amount of GC CPU time that we tried to fill the
	// bucket with but exceeded its capacity.
	overflow uint64

	// gcEnabled is an internal copy of gcBlackenEnabled that determines
	// whether the limiter tracks total assist time.
	//
	// gcBlackenEnabled isn't used directly so as to keep this structure
	// unit-testable.
	gcEnabled bool

	// transitioning is true when the GC is in a STW and transitioning between
	// the mark and sweep phases.
	transitioning bool

	// assistTimePool is the accumulated assist time since the last update.
	assistTimePool atomic.Int64

	// idleMarkTimePool is the accumulated idle mark time since the last update.
	idleMarkTimePool atomic.Int64

	// idleTimePool is the accumulated time Ps spent on the idle list since the last update.
	idleTimePool atomic.Int64

	// lastUpdate is the nanotime timestamp of the last time update was called.
	//
	// Updated under lock, but may be read concurrently.
	lastUpdate atomic.Int64

	// lastEnabledCycle is the GC cycle that last had the limiter enabled.
	lastEnabledCycle atomic.Uint32

	// nprocs is an internal copy of gomaxprocs, used to determine total available
	// CPU time.
	//
	// gomaxprocs isn't used directly so as to keep this structure unit-testable.
	nprocs int32

	// test indicates whether this instance of the struct was made for testing purposes.
	test bool
}

// limiting returns true if the CPU limiter is currently enabled, meaning the Go GC
// should take action to limit CPU utilization.
//
// It is safe to call concurrently with other operations.
func (l *gcCPULimiterState) limiting() bool {
	return l.enabled.Load()
}

// startGCTransition notifies the limiter of a GC transition.
//
// This call takes ownership of the limiter and disables all other means of
// updating the limiter. Release ownership by calling finishGCTransition.
//
// It is safe to call concurrently with other operations.
func (l *gcCPULimiterState) startGCTransition(enableGC bool, now int64) {
	if !l.tryLock() {
		// This must happen during a STW, so we can't fail to acquire the lock.
		// If we did, something went wrong. Throw.
		throw("failed to acquire lock to start a GC transition")
	}
	if l.gcEnabled == enableGC {
		throw("transitioning GC to the same state as before?")
	}
	// Flush whatever was left between the last update and now.
	l.updateLocked(now)
	l.gcEnabled = enableGC
	l.transitioning = true
	// N.B. finishGCTransition releases the lock.
	//
	// We don't release here to increase the chance that if there's a failure
	// to finish the transition, that we throw on failing to acquire the lock.
}

// finishGCTransition notifies the limiter that the GC transition is complete
// and releases ownership of it. It also accumulates STW time in the bucket.
// now must be the timestamp from the end of the STW pause.
func (l *gcCPULimiterState) finishGCTransition(now int64) {
	if !l.transitioning {
		throw("finishGCTransition called without starting one?")
	}
	// Count the full nprocs set of CPU time because the world is stopped
	// between startGCTransition and finishGCTransition. Even though the GC
	// isn't running on all CPUs, it is preventing user code from doing so,
	// so it might as well be.
	if lastUpdate := l.lastUpdate.Load(); now >= lastUpdate {
		l.accumulate(0, (now-lastUpdate)*int64(l.nprocs))
	}
	l.lastUpdate.Store(now)
	l.transitioning = false
	l.unlock()
}

// gcCPULimiterUpdatePeriod dictates the maximum amount of wall-clock time
// we can go before updating the limiter.
const gcCPULimiterUpdatePeriod = 10e6 // 10ms

// needUpdate returns true if the limiter's maximum update period has been
// exceeded, and so would benefit from an update.
func (l *gcCPULimiterState) needUpdate(now int64) bool {
	return now-l.lastUpdate.Load() > gcCPULimiterUpdatePeriod
}

// addAssistTime notifies the limiter of additional assist time. It will be
// included in the next update.
func (l *gcCPULimiterState) addAssistTime(t int64) {
	l.assistTimePool.Add(t)
}

// addIdleTime notifies the limiter of additional time a P spent on the idle list. It will be
// subtracted from the total CPU time in the next update.
func (l *gcCPULimiterState) addIdleTime(t int64) {
	l.idleTimePool.Add(t)
}

// update updates the bucket given runtime-specific information. now is the
// current monotonic time in nanoseconds.
//
// This is safe to call concurrently with other operations, except *GCTransition.
func (l *gcCPULimiterState) update(now int64) {
	if !l.tryLock() {
		// We failed to acquire the lock, which means something else is currently
		// updating. Just drop our update, the next one to update will include
		// our total assist time.
		return
	}
	if l.transitioning {
		throw("update during transition")
	}
	l.updateLocked(now)
	l.unlock()
}

// updateLocked is the implementation of update. l.lock must be held.
func (l *gcCPULimiterState) updateLocked(now int64) {
	lastUpdate := l.lastUpdate.Load()
	if now < lastUpdate {
		// Defensively avoid overflow. This isn't even the latest update anyway.
		return
	}
	windowTotalTime := (now - lastUpdate) * int64(l.nprocs)
	l.lastUpdate.Store(now)

	// Drain the pool of assist time.
	assistTime := l.assistTimePool.Load()
	if assistTime != 0 {
		l.assistTimePool.Add(-assistTime)
	}

	// Drain the pool of idle time.
	idleTime := l.idleTimePool.Load()
	if idleTime != 0 {
		l.idleTimePool.Add(-idleTime)
	}

	if !l.test {
		// Consume time from in-flight events. Make sure we're not preemptible so allp can't change.
		//
		// The reason we do this instead of just waiting for those events to finish and push updates
		// is to ensure that all the time we're accounting for happened sometime between lastUpdate
		// and now. This dramatically simplifies reasoning about the limiter because we're not at
		// risk of extra time being accounted for in this window than actually happened in this window,
		// leading to all sorts of weird transient behavior.
		mp := acquirem()
		for _, pp := range allp {
			typ, duration := pp.limiterEvent.consume(now)
			switch typ {
			case limiterEventIdleMarkWork:
				fallthrough
			case limiterEventIdle:
				idleTime += duration
				sched.idleTime.Add(duration)
			case limiterEventMarkAssist:
				fallthrough
			case limiterEventScavengeAssist:
				assistTime += duration
			case limiterEventNone:
				break
			default:
				throw("invalid limiter event type found")
			}
		}
		releasem(mp)
	}

	// Compute total GC time.
	windowGCTime := assistTime
	if l.gcEnabled {
		windowGCTime += int64(float64(windowTotalTime) * gcBackgroundUtilization)
	}

	// Subtract out all idle time from the total time. Do this after computing
	// GC time, because the background utilization is dependent on the *real*
	// total time, not the total time after idle time is subtracted.
	//
	// Idle time is counted as any time that a P is on the P idle list plus idle mark
	// time. Idle mark workers soak up time that the application spends idle.
	//
	// On a heavily undersubscribed system, any additional idle time can skew GC CPU
	// utilization, because the GC might be executing continuously and thrashing,
	// yet the CPU utilization with respect to GOMAXPROCS will be quite low, so
	// the limiter fails to turn on. By subtracting idle time, we're removing time that
	// we know the application was idle giving a more accurate picture of whether
	// the GC is thrashing.
	//
	// Note that this can cause the limiter to turn on even if it's not needed. For
	// instance, on a system with 32 Ps but only 1 running goroutine, each GC will have
	// 8 dedicated GC workers. Assuming the GC cycle is half mark phase and half sweep
	// phase, then the GC CPU utilization over that cycle, with idle time removed, will
	// be 8/(8+2) = 80%. Even though the limiter turns on, though, assist should be
	// unnecessary, as the GC has way more CPU time to outpace the 1 goroutine that's
	// running.
	windowTotalTime -= idleTime

	l.accumulate(windowTotalTime-windowGCTime, windowGCTime)
}

// accumulate adds time to the bucket and signals whether the limiter is enabled.
//
// This is an internal function that deals just with the bucket. Prefer update.
// l.lock must be held.
func (l *gcCPULimiterState) accumulate(mutatorTime, gcTime int64) {
	headroom := l.bucket.capacity - l.bucket.fill
	enabled := headroom == 0

	// Let's be careful about three things here:
	// 1. The addition and subtraction, for the invariants.
	// 2. Overflow.
	// 3. Excessive mutation of l.enabled, which is accessed
	//    by all assists, potentially more than once.
	change := gcTime - mutatorTime

	// Handle limiting case.
	if change > 0 && headroom <= uint64(change) {
		l.overflow += uint64(change) - headroom
		l.bucket.fill = l.bucket.capacity
		if !enabled {
			l.enabled.Store(true)
			l.lastEnabledCycle.Store(memstats.numgc + 1)
		}
		return
	}

	// Handle non-limiting cases.
	if change < 0 && l.bucket.fill <= uint64(-change) {
		// Bucket emptied.
		l.bucket.fill = 0
	} else {
		// All other cases.
		l.bucket.fill -= uint64(-change)
	}
	if change != 0 && enabled {
		l.enabled.Store(false)
	}
}

// tryLock attempts to lock l. Returns true on success.
func (l *gcCPULimiterState) tryLock() bool {
	return l.lock.CompareAndSwap(0, 1)
}

// unlock releases the lock on l. Must be called if tryLock returns true.
func (l *gcCPULimiterState) unlock() {
	old := l.lock.Swap(0)
	if old != 1 {
		throw("double unlock")
	}
}

// capacityPerProc is the limiter's bucket capacity for each P in GOMAXPROCS.
const capacityPerProc = 1e9 // 1 second in nanoseconds

// resetCapacity updates the capacity based on GOMAXPROCS. Must not be called
// while the GC is enabled.
//
// It is safe to call concurrently with other operations.
func (l *gcCPULimiterState) resetCapacity(now int64, nprocs int32) {
	if !l.tryLock() {
		// This must happen during a STW, so we can't fail to acquire the lock.
		// If we did, something went wrong. Throw.
		throw("failed to acquire lock to reset capacity")
	}
	// Flush the rest of the time for this period.
	l.updateLocked(now)
	l.nprocs = nprocs

	l.bucket.capacity = uint64(nprocs) * capacityPerProc
	if l.bucket.fill > l.bucket.capacity {
		l.bucket.fill = l.bucket.capacity
		l.enabled.Store(true)
		l.lastEnabledCycle.Store(memstats.numgc + 1)
	} else if l.bucket.fill < l.bucket.capacity {
		l.enabled.Store(false)
	}
	l.unlock()
}

// limiterEventType indicates the type of an event occurring on some P.
//
// These events represent the full set of events that the GC CPU limiter tracks
// to execute its function.
//
// This type may use no more than limiterEventBits bits of information.
type limiterEventType uint8

const (
	limiterEventNone           limiterEventType = iota // None of the following events.
	limiterEventIdleMarkWork                           // Refers to an idle mark worker (see gcMarkWorkerMode).
	limiterEventMarkAssist                             // Refers to mark assist (see gcAssistAlloc).
	limiterEventScavengeAssist                         // Refers to a scavenge assist (see allocSpan).
	limiterEventIdle                                   // Refers to time a P spent on the idle list.

	limiterEventBits = 3
)

// limiterEventTypeMask is a mask for the bits in p.limiterEventStart that represent
// the event type. The rest of the bits of that field represent a timestamp.
const (
	limiterEventTypeMask  = uint64((1<<limiterEventBits)-1) << (64 - limiterEventBits)
	limiterEventStampNone = limiterEventStamp(0)
)

// limiterEventStamp is a nanotime timestamp packed with a limiterEventType.
type limiterEventStamp uint64

// makeLimiterEventStamp creates a new stamp from the event type and the current timestamp.
func makeLimiterEventStamp(typ limiterEventType, now int64) limiterEventStamp {
	return limiterEventStamp(uint64(typ)<<(64-limiterEventBits) | (uint64(now) &^ limiterEventTypeMask))
}

// duration computes the difference between now and the start time stored in the stamp.
//
// Returns 0 if the difference is negative, which may happen if now is stale or if the
// before and after timestamps cross a 2^(64-limiterEventBits) boundary.
func (s limiterEventStamp) duration(now int64) int64 {
	// The top limiterEventBits bits of the timestamp are derived from the current time
	// when computing a duration.
	start := int64((uint64(now) & limiterEventTypeMask) | (uint64(s) &^ limiterEventTypeMask))
	if now < start {
		return 0
	}
	return now - start
}

// type extracts the event type from the stamp.
func (s limiterEventStamp) typ() limiterEventType {
	return limiterEventType(s >> (64 - limiterEventBits))
}

// limiterEvent represents tracking state for an event tracked by the GC CPU limiter.
type limiterEvent struct {
	stamp atomic.Uint64 // Stores a limiterEventStamp.
}

// start begins tracking a new limiter event of the current type. If an event
// is already in flight, then a new event cannot begin because the current time is
// already being attributed to that event. In this case, this function returns false.
// Otherwise, it returns true.
//
// The caller must be non-preemptible until at least stop is called or this function
// returns false. Because this is trying to measure "on-CPU" time of some event, getting
// scheduled away during it can mean that whatever we're measuring isn't a reflection
// of "on-CPU" time. The OS could deschedule us at any time, but we want to maintain as
// close of an approximation as we can.
func (e *limiterEvent) start(typ limiterEventType, now int64) bool {
	if limiterEventStamp(e.stamp.Load()).typ() != limiterEventNone {
		return false
	}
	e.stamp.Store(uint64(makeLimiterEventStamp(typ, now)))
	return true
}

// consume acquires the partial event CPU time from any in-flight event.
// It achieves this by storing the current time as the new event time.
//
// Returns the type of the in-flight event, as well as how long it's currently been
// executing for. Returns limiterEventNone if no event is active.
func (e *limiterEvent) consume(now int64) (typ limiterEventType, duration int64) {
	// Read the limiter event timestamp and update it to now.
	for {
		old := limiterEventStamp(e.stamp.Load())
		typ = old.typ()
		if typ == limiterEventNone {
			// There's no in-flight event, so just push that up.
			return
		}
		duration = old.duration(now)
		if duration == 0 {
			// We might have a stale now value, or this crossed the
			// 2^(64-limiterEventBits) boundary in the clock readings.
			// Just ignore it.
			return limiterEventNone, 0
		}
		new := makeLimiterEventStamp(typ, now)
		if e.stamp.CompareAndSwap(uint64(old), uint64(new)) {
			break
		}
	}
	return
}

// stop stops the active limiter event. Throws if the
//
// The caller must be non-preemptible across the event. See start as to why.
func (e *limiterEvent) stop(typ limiterEventType, now int64) {
	var stamp limiterEventStamp
	for {
		stamp = limiterEventStamp(e.stamp.Load())
		if stamp.typ() != typ {
			print("runtime: want=", typ, " got=", stamp.typ(), "\n")
			throw("limiterEvent.stop: found wrong event in p's limiter event slot")
		}
		if e.stamp.CompareAndSwap(uint64(stamp), uint64(limiterEventStampNone)) {
			break
		}
	}
	duration := stamp.duration(now)
	if duration == 0 {
		// It's possible that we're missing time because we crossed a
		// 2^(64-limiterEventBits) boundary between the start and end.
		// In this case, we're dropping that information. This is OK because
		// at worst it'll cause a transient hiccup that will quickly resolve
		// itself as all new timestamps begin on the other side of the boundary.
		// Such a hiccup should be incredibly rare.
		return
	}
	// Account for the event.
	switch typ {
	case limiterEventIdleMarkWork:
		gcCPULimiter.addIdleTime(duration)
	case limiterEventIdle:
		gcCPULimiter.addIdleTime(duration)
		sched.idleTime.Add(duration)
	case limiterEventMarkAssist:
		fallthrough
	case limiterEventScavengeAssist:
		gcCPULimiter.addAssistTime(duration)
	default:
		throw("limiterEvent.stop: invalid limiter event type found")
	}
}