path: root/src/runtime/trace2.go
// Copyright 2023 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

//go:build goexperiment.exectracer2

// Go execution tracer.
// The tracer captures a wide range of execution events like goroutine
// creation/blocking/unblocking, syscall enter/exit/block, GC-related events,
// changes of heap size, processor start/stop, etc., and writes them to a buffer
// in a compact form. A nanosecond-precision timestamp and a stack trace are
// captured for most events.
//
// Tracer invariants (so that the synchronization makes sense):
// - An m that has a trace buffer must be on either the allm or sched.freem lists.
// - Any trace buffer mutation must either be happening in traceAdvance or between
//   a traceAcquire and a subsequent traceRelease.
// - traceAdvance cannot return until the previous generation's buffers are all flushed.
//
// See https://go.dev/issue/60773 for a link to the full design.

package runtime

import (
	"runtime/internal/atomic"
	"unsafe"
)

// Trace state.

// trace is global tracing context.
var trace struct {
	// trace.lock must only be acquired on the system stack where
	// stack splits cannot happen while it is held.
	lock mutex

	// Trace buffer management.
	//
	// First we check the empty list for any free buffers. If there are none,
	// buffers are allocated directly from the OS. Once they're filled up and/or
	// flushed, they end up on the full queue for trace.gen%2.
	//
	// The trace reader takes buffers off the full list one-by-one and
	// places them into reading until they're finished being read from.
	// Then they're placed onto the empty list.
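	//
	// Schematically, a buffer's lifecycle is (an illustrative summary of the
	// above, not additional state):
	//
	//	empty -> (held by an M, being written) -> full[gen%2] -> reading -> empty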
	//
	// Protected by trace.lock.
	reading       *traceBuf // buffer currently handed off to user
	empty         *traceBuf // stack of empty buffers
	full          [2]traceBufQueue
	workAvailable atomic.Bool

	// State for the trace reader goroutine.
	//
	// Protected by trace.lock.
	readerGen     atomic.Uintptr // the generation the reader is currently reading for
	flushedGen    atomic.Uintptr // the last completed generation
	headerWritten bool           // whether ReadTrace has emitted the trace header

	// doneSema is used to synchronize the reader and traceAdvance. Specifically,
	// it notifies traceAdvance that the reader is done with a generation.
	// Both semaphores are 0 by default (so, acquires block). traceAdvance
	// attempts to acquire for gen%2 after flushing the last buffers for gen.
	// Meanwhile the reader releases the sema for gen%2 when it has finished
	// processing gen.
	doneSema [2]uint32

	// Trace data tables for deduplicating data going into the trace.
	// There are 2 of each: one for gen%2, one for 1-gen%2.
	stackTab  [2]traceStackTable  // maps stack traces to unique ids
	stringTab [2]traceStringTable // maps strings to unique ids

	// cpuLogRead accepts CPU profile samples from the signal handler where
	// they're generated. There are two profBufs here: one for gen%2, one for
	// 1-gen%2. These profBufs use a three-word header to hold the IDs of the P, G,
	// and M (respectively) that were active at the time of the sample. Because
	// profBuf uses a record with all zeros in its header to indicate overflow,
	// we make sure to make the P field always non-zero: The ID of a real P will
	// start at bit 1, and bit 0 will be set. Samples that arrive while no P is
	// running (such as near syscalls) will set the first header field to 0b10.
	// This careful handling of the first header field allows us to store the ID of
	// the active G directly in the second field, even though that will be 0
	// when sampling g0.
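	//
	// For example, an illustrative sketch of that encoding (hdr, pid, and
	// goid are stand-in names for this comment, not identifiers in this file):
	//
	//	hdr[0] = uint64(pid)<<1 | 0b1 // sample taken while P pid was running
	//	hdr[0] = 0b10                 // sample taken while no P was running
	//	hdr[1] = goid                 // may be 0 when the sample lands on g0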
	//
	// Initialization and teardown of these fields is protected by traceAdvanceSema.
	cpuLogRead  [2]*profBuf
	signalLock  atomic.Uint32              // protects use of the following member, only usable in signal handlers
	cpuLogWrite [2]atomic.Pointer[profBuf] // copy of cpuLogRead for use in signal handlers, set without signalLock
	cpuSleep    *wakeableSleep
	cpuLogDone  <-chan struct{}
	cpuBuf      [2]*traceBuf

	reader atomic.Pointer[g] // goroutine that called ReadTrace, or nil

	// Fast mappings from enumerations to string IDs that are prepopulated
	// in the trace.
	markWorkerLabels [2][len(gcMarkWorkerModeStrings)]traceArg
	goStopReasons    [2][len(traceGoStopReasonStrings)]traceArg
	goBlockReasons   [2][len(traceBlockReasonStrings)]traceArg

	// Trace generation counter.
	gen            atomic.Uintptr
	lastNonZeroGen uintptr // last non-zero value of gen

	// shutdown is set when we are waiting for the trace reader to finish after setting gen to 0.
	//
	// Writes protected by trace.lock.
	shutdown atomic.Bool

	// Number of goroutines in syscall exiting slow path.
	exitingSyscall atomic.Int32

	// seqGC is the sequence counter for GC begin/end.
	//
	// Mutated only during stop-the-world.
	seqGC uint64
}

// Trace public API.

var (
	traceAdvanceSema  uint32 = 1
	traceShutdownSema uint32 = 1
)

// StartTrace enables tracing for the current process.
// While tracing, the data will be buffered and available via [ReadTrace].
// StartTrace returns an error if tracing is already enabled.
// Most clients should use the [runtime/trace] package or the [testing] package's
// -test.trace flag instead of calling StartTrace directly.
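//
// A minimal usage sketch (illustrative only; readLoop and doWork are
// hypothetical, and the read loop itself is shown in the ReadTrace
// documentation):
//
//	if err := runtime.StartTrace(); err != nil {
//		panic(err)
//	}
//	go readLoop() // drains runtime.ReadTrace until it returns nil
//	doWork()      // the workload to be traced
//	runtime.StopTrace()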
func StartTrace() error {
	if traceEnabled() || traceShuttingDown() {
		return errorString("tracing is already enabled")
	}
	// Block until cleanup of the last trace is done.
	semacquire(&traceShutdownSema)
	semrelease(&traceShutdownSema)

	// Hold traceAdvanceSema across trace start, since we'll want it on
	// the other side of tracing being enabled globally.
	semacquire(&traceAdvanceSema)

	// Initialize CPU profile -> trace ingestion.
	traceInitReadCPU()

	// Compute the first generation for this StartTrace.
	//
	// Note: we start from the last non-zero generation rather than 1 so we
	// can avoid resetting all the arrays indexed by gen%2 or gen%3. There's
	// more than one of each per m, p, and goroutine.
	firstGen := traceNextGen(trace.lastNonZeroGen)

	// Reset GC sequencer.
	trace.seqGC = 1

	// Reset trace reader state.
	trace.headerWritten = false
	trace.readerGen.Store(firstGen)
	trace.flushedGen.Store(0)

	// Register some basic strings in the string tables.
	traceRegisterLabelsAndReasons(firstGen)

	// Stop the world.
	//
	// The purpose of stopping the world is to make sure that no goroutine is in a
	// context where it could emit an event by bringing all goroutines to a safe point
	// with no opportunity to transition.
	//
	// The exception to this rule are goroutines that are concurrently exiting a syscall.
	// Those will all be forced into the syscall-exit slow path, and we'll just make sure
	// that we don't observe any goroutines in that critical section before starting
	// the world again.
	//
	// A good follow-up question to this is why stopping the world is necessary at all
	// given that we have traceAcquire and traceRelease. Unfortunately, those only help
	// us when tracing is already active (for performance, so when tracing is off the
	// tracing seqlock is left untouched). The main issue here is subtle: we're going to
	// want to obtain a correct starting status for each goroutine, but there are windows
	// of time in which we could read and emit an incorrect status. Specifically:
	//
	//	trace := traceAcquire()
	//  // <----> problem window
	//	casgstatus(gp, _Gwaiting, _Grunnable)
	//	if trace.ok() {
	//		trace.GoUnpark(gp, 2)
	//		traceRelease(trace)
	//	}
	//
	// More precisely, if we readgstatus for a gp while another goroutine is in the problem
	// window and that goroutine didn't observe that tracing had begun, then we might write
	// a GoStatus(GoWaiting) event for that goroutine, but it won't trace an event marking
	// the transition from GoWaiting to GoRunnable. The trace will then be broken, because
	// future events will be emitted assuming the tracer sees GoRunnable.
	//
	// In short, what we really need here is to make sure that the next time *any goroutine*
	// hits a traceAcquire, it sees that the trace is enabled.
	//
	// Note also that stopping the world is necessary to make sure sweep-related events are
	// coherent. Since the world is stopped and sweeps are non-preemptible, we can never start
	// the world and see an unpaired sweep 'end' event. Other parts of the tracer rely on this.
	stw := stopTheWorld(stwStartTrace)

	// Prevent sysmon from running any code that could generate events.
	lock(&sched.sysmonlock)

	// Reset mSyscallID on all Ps while we have them stationary and the trace is disabled.
	for _, pp := range allp {
		pp.trace.mSyscallID = -1
	}

	// Start tracing.
	//
	// After this executes, other Ms may start creating trace buffers and emitting
	// data into them.
	trace.gen.Store(firstGen)

	// Wait for exitingSyscall to drain.
	//
	// It may not monotonically decrease to zero, but in the limit it will always become
	// zero because the world is stopped and there are no available Ps for syscall-exited
	// goroutines to run on.
	//
	// Because we set gen before checking this, and because exitingSyscall is always incremented
	// *after* traceAcquire (which checks gen), we can be certain that once exitingSyscall
	// reaches zero, any goroutine that goes on to exit a syscall *must* observe the new gen.
	//
	// The critical section on each goroutine here is going to be quite short, so the likelihood
	// that we observe a zero value is high.
	for trace.exitingSyscall.Load() != 0 {
		osyield()
	}

	// Record some initial pieces of information.
	//
	// N.B. This will also emit a status event for this goroutine.
	tl := traceAcquire()
	tl.Gomaxprocs(gomaxprocs)  // Get this as early in the trace as possible. See comment in traceAdvance.
	tl.STWStart(stwStartTrace) // We didn't trace this above, so trace it now.

	// Record the fact that a GC is active, if applicable.
	if gcphase == _GCmark || gcphase == _GCmarktermination {
		tl.GCActive()
	}

	// Record the heap goal so we have it at the very beginning of the trace.
	tl.HeapGoal()

	// Make sure a ProcStatus is emitted for every P, while we're here.
	for _, pp := range allp {
		tl.writer().writeProcStatusForP(pp, pp == tl.mp.p.ptr()).end()
	}
	traceRelease(tl)

	unlock(&sched.sysmonlock)
	startTheWorld(stw)

	traceStartReadCPU()
	traceAdvancer.start()

	semrelease(&traceAdvanceSema)
	return nil
}

// StopTrace stops tracing, if it was previously enabled.
// StopTrace only returns after all the reads for the trace have completed.
func StopTrace() {
	traceAdvance(true)
}

// traceAdvance moves tracing to the next generation, and cleans up the current generation,
// ensuring that it's flushed out before returning. If stopTrace is true, it disables tracing
// altogether instead of advancing to the next generation.
//
// traceAdvanceSema must not be held.
func traceAdvance(stopTrace bool) {
	semacquire(&traceAdvanceSema)

	// Get the gen that we're advancing from. In this function we don't really care much
	// about the generation we're advancing _into_ since we'll do all the cleanup in this
	// generation for the next advancement.
	gen := trace.gen.Load()
	if gen == 0 {
		// We may end up here if traceAdvance is called concurrently with StopTrace.
		semrelease(&traceAdvanceSema)
		return
	}

	// Write an EvFrequency event for this generation.
	//
	// N.B. This may block for quite a while to get a good frequency estimate, so make sure we do
	// this here and not e.g. on the trace reader.
	traceFrequency(gen)

	// Collect all the untraced Gs.
	type untracedG struct {
		gp           *g
		goid         uint64
		mid          int64
		status       uint32
		waitreason   waitReason
		inMarkAssist bool
	}
	var untracedGs []untracedG
	forEachGRace(func(gp *g) {
		// Make absolutely sure all Gs are ready for the next
		// generation. We need to do this even for dead Gs because
		// they may come alive with a new identity, and their status-traced
		// bookkeeping might otherwise end up being stale.
		// We may miss totally new goroutines, but they'll always
		// have clean bookkeeping.
		gp.trace.readyNextGen(gen)
		// If the status was traced, nothing else to do.
		if gp.trace.statusWasTraced(gen) {
			return
		}
		// Scribble down information about this goroutine.
		ug := untracedG{gp: gp, mid: -1}
		systemstack(func() {
			me := getg().m.curg
			// We don't have to handle this G status transition because we
			// already eliminated ourselves from consideration above.
			casGToWaiting(me, _Grunning, waitReasonTraceGoroutineStatus)
			// We need to suspend and take ownership of the G to safely read its
			// goid. Note that we can't actually emit the event at this point
			// because we might stop the G in a window where it's unsafe to write
			// events based on the G's status. We need the global trace buffer flush
			// coming up to make sure we're not racing with the G.
			//
			// It should be very unlikely that we try to preempt a running G here.
			// The only situation in which we might is when we're racing with a G
			// that's running for the first time in this generation. Therefore,
			// this should be relatively fast.
			s := suspendG(gp)
			if !s.dead {
				ug.goid = s.g.goid
				if s.g.m != nil {
					ug.mid = int64(s.g.m.procid)
				}
				ug.status = readgstatus(s.g) &^ _Gscan
				ug.waitreason = s.g.waitreason
				ug.inMarkAssist = s.g.inMarkAssist
			}
			resumeG(s)
			casgstatus(me, _Gwaiting, _Grunning)
		})
		if ug.goid != 0 {
			untracedGs = append(untracedGs, ug)
		}
	})

	if !stopTrace {
		// Re-register runtime goroutine labels and stop/block reasons.
		traceRegisterLabelsAndReasons(traceNextGen(gen))
	}

	// Now that we've done some of the heavy stuff, prevent the world from stopping.
	// This is necessary to ensure the consistency of the STW events. If we're feeling
	// adventurous we could lift this restriction and add a STWActive event, but the
	// cost of maintaining this consistency is low. We're not going to hold this semaphore
	// for very long and most STW periods are very short.
	// Once we hold worldsema, prevent preemption as well so we're not interrupted partway
	// through this. We want to get this done as soon as possible.
	semacquire(&worldsema)
	mp := acquirem()

	// Advance the generation or stop the trace.
	trace.lastNonZeroGen = gen
	if stopTrace {
		systemstack(func() {
			// Ordering is important here. Set shutdown first, then disable tracing,
			// so that conditions like (traceEnabled() || traceShuttingDown()) have
			// no opportunity to be false. Hold the trace lock so this update appears
			// atomic to the trace reader.
			lock(&trace.lock)
			trace.shutdown.Store(true)
			trace.gen.Store(0)
			unlock(&trace.lock)
		})
	} else {
		trace.gen.Store(traceNextGen(gen))
	}

	// Emit a ProcsChange event so we have one on record for each generation.
	// Let's emit it as soon as possible so that downstream tools can rely on the value
	// being there fairly soon in a generation.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because the procs count could change.
	if !stopTrace {
		tl := traceAcquire()
		tl.Gomaxprocs(gomaxprocs)
		traceRelease(tl)
	}

	// Emit a GCActive event in the new generation if necessary.
	//
	// It's important that we do this before allowing stop-the-worlds again,
	// because that could emit global GC-related events.
	if !stopTrace && (gcphase == _GCmark || gcphase == _GCmarktermination) {
		tl := traceAcquire()
		tl.GCActive()
		traceRelease(tl)
	}

	// Preemption is OK again after this. It's also fine if the world stops again from here on.
	// We're just cleaning up the last generation after this point.
	//
	// We also don't care if the GC starts again after this for the same reasons.
	releasem(mp)
	semrelease(&worldsema)

	// Snapshot allm and freem.
	//
	// Snapshotting after the generation counter update is sufficient.
	// Because an m must be on either allm or sched.freem if it has an active trace
	// buffer, new threads added to allm after this point must necessarily observe
	// the new generation number (sched.lock acts as a barrier).
	//
	// Threads that exit before this point and are on neither list explicitly
	// flush their own buffers in traceThreadDestroy.
	//
	// Snapshotting freem is necessary because Ms can continue to emit events
	// while they're still on that list. Removal from sched.freem is serialized with
	// this snapshot, so either we'll capture an m on sched.freem and race with
	// the removal to flush its buffers (resolved by traceThreadDestroy acquiring
	// the thread's seqlock, which one of us must win, so at least its old gen buffer
	// will be flushed in time for the new generation) or it will have flushed its
	// buffers before we snapshotted it to begin with.
	lock(&sched.lock)
	mToFlush := allm
	for mp := mToFlush; mp != nil; mp = mp.alllink {
		mp.trace.link = mp.alllink
	}
	for mp := sched.freem; mp != nil; mp = mp.freelink {
		mp.trace.link = mToFlush
		mToFlush = mp
	}
	unlock(&sched.lock)

	// Iterate over our snapshot, flushing every buffer until we're done.
	//
	// Because trace writers read the generation while the seqlock is
	// held, we can be certain that when there are no writers there are
	// also no stale generation values left. Therefore, it's safe to flush
	// any buffers that remain in that generation's slot.
	const debugDeadlock = false
	systemstack(func() {
		// Track iterations for some rudimentary deadlock detection.
		i := 0
		detectedDeadlock := false

		for mToFlush != nil {
			prev := &mToFlush
			for mp := *prev; mp != nil; {
				if mp.trace.seqlock.Load()%2 != 0 {
					// The M is writing. Come back to it later.
					prev = &mp.trace.link
					mp = mp.trace.link
					continue
				}
				// Flush the trace buffer.
				//
				// trace.lock needed for traceBufFlush, but also to synchronize
				// with traceThreadDestroy, which flushes both buffers unconditionally.
				lock(&trace.lock)
				bufp := &mp.trace.buf[gen%2]
				if *bufp != nil {
					traceBufFlush(*bufp, gen)
					*bufp = nil
				}
				unlock(&trace.lock)

				// Remove the m from the flush list.
				*prev = mp.trace.link
				mp.trace.link = nil
				mp = *prev
			}
			// Yield only if we're going to be going around the loop again.
			if mToFlush != nil {
				osyield()
			}

			if debugDeadlock {
				// Try to detect a deadlock. We probably shouldn't loop here
				// this many times.
				if i > 100000 && !detectedDeadlock {
					detectedDeadlock = true
					println("runtime: failing to flush")
					for mp := mToFlush; mp != nil; mp = mp.trace.link {
						print("runtime: m=", mp.id, "\n")
					}
				}
				i++
			}
		}
	})

	// At this point, the old generation is fully flushed minus stack and string
	// tables, CPU samples, and goroutines that haven't run at all during the last
	// generation.

	// Check to see if any Gs still haven't had events written out for them.
	statusWriter := unsafeTraceWriter(gen, nil)
	for _, ug := range untracedGs {
		if ug.gp.trace.statusWasTraced(gen) {
			// It was traced, we don't need to do anything.
			continue
		}
		// It still wasn't traced. Because we ensured all Ms stopped writing trace
		// events to the last generation, that must mean the G never had its status
		// traced in gen between when we recorded it and now. If that's true, the goid
		// and status we recorded then are exactly what we want right now.
		status := goStatusToTraceGoStatus(ug.status, ug.waitreason)
		statusWriter = statusWriter.writeGoStatus(ug.goid, ug.mid, status, ug.inMarkAssist)
	}
	statusWriter.flush().end()

	// Read everything out of the last gen's CPU profile buffer.
	traceReadCPU(gen)

	systemstack(func() {
		// Flush CPU samples, stacks, and strings for the last generation. This is safe,
		// because we're now certain no M is writing to the last generation.
		//
		// Ordering is important here. traceCPUFlush may generate new stacks and dumping
		// stacks may generate new strings.
		traceCPUFlush(gen)
		trace.stackTab[gen%2].dump(gen)
		trace.stringTab[gen%2].reset(gen)

		// That's it. This generation is done producing buffers.
		lock(&trace.lock)
		trace.flushedGen.Store(gen)
		unlock(&trace.lock)
	})

	if stopTrace {
		semacquire(&traceShutdownSema)

		// Finish off CPU profile reading.
		traceStopReadCPU()
	} else {
		// Go over each P and emit a status event for it if necessary.
		//
		// We do this at the beginning of the new generation instead of the
		// end like we do for goroutines because forEachP doesn't give us a
		// hook to skip Ps that have already been traced. Since we have to
		// preempt all Ps anyway, might as well stay consistent with StartTrace
		// which does this during the STW.
		semacquire(&worldsema)
		forEachP(waitReasonTraceProcStatus, func(pp *p) {
			tl := traceAcquire()
			if !pp.trace.statusWasTraced(tl.gen) {
				tl.writer().writeProcStatusForP(pp, false).end()
			}
			traceRelease(tl)
		})
		// Perform status reset on dead Ps because they just appear as idle.
		//
		// Holding worldsema prevents allp from changing.
		//
		// TODO(mknyszek): Consider explicitly emitting ProcCreate and ProcDestroy
		// events to indicate whether a P exists, rather than just making its
		// existence implicit.
		for _, pp := range allp[len(allp):cap(allp)] {
			pp.trace.readyNextGen(traceNextGen(gen))
		}
		semrelease(&worldsema)
	}

	// Block until the trace reader has finished processing the last generation.
	semacquire(&trace.doneSema[gen%2])
	if raceenabled {
		raceacquire(unsafe.Pointer(&trace.doneSema[gen%2]))
	}

	// Double-check that things look as we expect after advancing and perform some
	// final cleanup if the trace has fully stopped.
	systemstack(func() {
		lock(&trace.lock)
		if !trace.full[gen%2].empty() {
			throw("trace: non-empty full trace buffer for done generation")
		}
		if stopTrace {
			if !trace.full[1-(gen%2)].empty() {
				throw("trace: non-empty full trace buffer for next generation")
			}
			if trace.reading != nil || trace.reader.Load() != nil {
				throw("trace: reading after shutdown")
			}
			// Free all the empty buffers.
			for trace.empty != nil {
				buf := trace.empty
				trace.empty = buf.link
				sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf), &memstats.other_sys)
			}
			// Clear trace.shutdown and other flags.
			trace.headerWritten = false
			trace.shutdown.Store(false)
		}
		unlock(&trace.lock)
	})

	if stopTrace {
		// Clear the sweep state on every P for the next time tracing is enabled.
		//
		// It may be stale in the next trace because we may have ended tracing in
		// the middle of a sweep on a P.
		//
		// It's fine not to call forEachP here because tracing is disabled and we
		// know at this point that nothing is calling into the tracer, but we do
		// need to look at dead Ps too just because GOMAXPROCS could have been called
		// at any point since we stopped tracing, and we have to ensure there's no
		// bad state on dead Ps too. Prevent a STW and a concurrent GOMAXPROCS that
		// might mutate allp by making ourselves briefly non-preemptible.
		mp := acquirem()
		for _, pp := range allp[:cap(allp)] {
			pp.trace.inSweep = false
			pp.trace.maySweep = false
			pp.trace.swept = 0
			pp.trace.reclaimed = 0
		}
		releasem(mp)
	}

	// Release the advance semaphore. If stopTrace is true we're still holding onto
	// traceShutdownSema.
	//
	// Do a direct handoff. Don't let one caller of traceAdvance starve
	// other calls to traceAdvance.
	semrelease1(&traceAdvanceSema, true, 0)

	if stopTrace {
		// Stop the traceAdvancer. We can't be holding traceAdvanceSema here because
		// we'll deadlock (we're blocked on the advancer goroutine exiting, but it
		// may be currently trying to acquire traceAdvanceSema).
		traceAdvancer.stop()
		semrelease(&traceShutdownSema)
	}
}

func traceNextGen(gen uintptr) uintptr {
	if gen == ^uintptr(0) {
		// gen is used both mod 2 and mod 3, and we want both patterns to continue when we wrap around.
		// ^uint32(0) and ^uint64(0) are both odd and multiples of 3. Therefore the next generation
		// we want is even and one more than a multiple of 3. The smallest such number is 4.
		return 4
	}
	return gen + 1
}

// traceRegisterLabelsAndReasons re-registers mark worker labels and
// goroutine stop/block reasons in the string table for the provided
// generation. Note: the provided generation must not have started yet.
func traceRegisterLabelsAndReasons(gen uintptr) {
	for i, label := range gcMarkWorkerModeStrings[:] {
		trace.markWorkerLabels[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, label))
	}
	for i, str := range traceBlockReasonStrings[:] {
		trace.goBlockReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
	for i, str := range traceGoStopReasonStrings[:] {
		trace.goStopReasons[gen%2][i] = traceArg(trace.stringTab[gen%2].put(gen, str))
	}
}

// ReadTrace returns the next chunk of binary tracing data, blocking until data
// is available. If tracing is turned off and all the data accumulated while it
// was on has been returned, ReadTrace returns nil. The caller must copy the
// returned data before calling ReadTrace again.
// ReadTrace must be called from one goroutine at a time.
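//
// A sketch of the expected read loop (w is an assumed io.Writer; writing the
// data out satisfies the copy requirement above):
//
//	for {
//		data := runtime.ReadTrace()
//		if data == nil {
//			break // tracing stopped and all buffered data has been returned
//		}
//		if _, err := w.Write(data); err != nil {
//			break
//		}
//	}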
func ReadTrace() []byte {
top:
	var buf []byte
	var park bool
	systemstack(func() {
		buf, park = readTrace0()
	})
	if park {
		gopark(func(gp *g, _ unsafe.Pointer) bool {
			if !trace.reader.CompareAndSwapNoWB(nil, gp) {
				// We're racing with another reader.
				// Wake up and handle this case.
				return false
			}

			if g2 := traceReader(); gp == g2 {
				// New data arrived between unlocking
				// and the CAS and we won the wake-up
				// race, so wake up directly.
				return false
			} else if g2 != nil {
				printlock()
				println("runtime: got trace reader", g2, g2.goid)
				throw("unexpected trace reader")
			}

			return true
		}, nil, waitReasonTraceReaderBlocked, traceBlockSystemGoroutine, 2)
		goto top
	}

	return buf
}

// readTrace0 is ReadTrace's continuation on g0. This must run on the
// system stack because it acquires trace.lock.
//
//go:systemstack
func readTrace0() (buf []byte, park bool) {
	if raceenabled {
		// g0 doesn't have a race context. Borrow the user G's.
		if getg().racectx != 0 {
			throw("expected racectx == 0")
		}
		getg().racectx = getg().m.curg.racectx
		// (This defer should get open-coded, which is safe on
		// the system stack.)
		defer func() { getg().racectx = 0 }()
	}

	// This function must not allocate while holding trace.lock:
	// allocation can call heap allocate, which will try to emit a trace
	// event while holding heap lock.
	lock(&trace.lock)

	if trace.reader.Load() != nil {
		// More than one goroutine is reading the trace. This is bad, but we'd
		// rather not crash the program over tracing, since tracing can be
		// enabled at runtime on production servers.
		unlock(&trace.lock)
		println("runtime: ReadTrace called from multiple goroutines simultaneously")
		return nil, false
	}
	// Recycle the old buffer.
	if buf := trace.reading; buf != nil {
		buf.link = trace.empty
		trace.empty = buf
		trace.reading = nil
	}
	// Write trace header.
	if !trace.headerWritten {
		trace.headerWritten = true
		unlock(&trace.lock)
		return []byte("go 1.22 trace\x00\x00\x00"), false
	}

	// Read the next buffer.

	if trace.readerGen.Load() == 0 {
		trace.readerGen.Store(1)
	}
	var gen uintptr
	for {
		assertLockHeld(&trace.lock)
		gen = trace.readerGen.Load()

		// Check to see if we need to block for more data in this generation
		// or if we need to move our generation forward.
		if !trace.full[gen%2].empty() {
			break
		}
		// Most of the time readerGen is one generation ahead of flushedGen, as the
		// current generation is being read from. Then, once the last buffer for
		// readerGen is flushed, flushedGen will rise to meet it. At this point, the tracer
		// is waiting on the reader to finish flushing the last generation so that it
		// can continue to advance.
		if trace.flushedGen.Load() == gen {
			if trace.shutdown.Load() {
				unlock(&trace.lock)

				// Wake up anyone waiting for us to be done with this generation.
				//
				// Do this after reading trace.shutdown, because the thread we're
				// waking up is going to clear trace.shutdown.
				if raceenabled {
					// Model synchronization on trace.doneSema, which the race
					// detector does not see. This is required to avoid false
					// race reports on the writer passed to trace.Start.
					racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
				}
				semrelease(&trace.doneSema[gen%2])

				// We're shutting down, and the last generation is fully
				// read. We're done.
				return nil, false
			}
			// The previous gen has had all of its buffers flushed, and
			// there's nothing else for us to read. Advance the generation
			// we're reading from and try again.
			trace.readerGen.Store(trace.gen.Load())
			unlock(&trace.lock)

			// Wake up anyone waiting for us to be done with this generation.
			//
			// Do this after reading gen to make sure we can't have the trace
			// advance until we've read it.
			if raceenabled {
				// See comment above in the shutdown case.
				racerelease(unsafe.Pointer(&trace.doneSema[gen%2]))
			}
			semrelease(&trace.doneSema[gen%2])

			// Reacquire the lock and go back to the top of the loop.
			lock(&trace.lock)
			continue
		}
		// Wait for new data.
		//
		// We don't simply use a note because the scheduler
		// executes this goroutine directly when it wakes up
		// (also a note would consume an M).
		//
		// Before we drop the lock, clear the workAvailable flag. Work can
		// only be queued with trace.lock held, so this is at least true until
		// we drop the lock.
		trace.workAvailable.Store(false)
		unlock(&trace.lock)
		return nil, true
	}
	// Pull a buffer.
	tbuf := trace.full[gen%2].pop()
	trace.reading = tbuf
	unlock(&trace.lock)
	return tbuf.arr[:tbuf.pos], false
}

// traceReader returns the trace reader that should be woken up, if any.
// Callers should first check (traceEnabled() || traceShuttingDown()).
//
// This must run on the system stack because it acquires trace.lock.
//
//go:systemstack
func traceReader() *g {
	gp := traceReaderAvailable()
	if gp == nil || !trace.reader.CompareAndSwapNoWB(gp, nil) {
		return nil
	}
	return gp
}

// traceReaderAvailable returns the trace reader if it is not currently
// scheduled and should be. Callers should first check that
// (traceEnabled() || traceShuttingDown()) is true.
func traceReaderAvailable() *g {
	// There are three conditions under which we definitely want to schedule
	// the reader:
	// - The reader is lagging behind in finishing off the last generation.
	//   In this case, trace buffers could even be empty, but the trace
	//   advancer will be waiting on the reader, so we have to make sure
	//   to schedule the reader ASAP.
	// - The reader has pending work to process for its reader generation
	//   (assuming readerGen is not lagging behind). Note that we also want
	//   to be careful *not* to schedule the reader if there's no work to do.
	// - The trace is shutting down. The trace stopper blocks on the reader
	//   to finish, much like trace advancement.
	//
	// We also want to be careful not to schedule the reader if there's no
	// reason to.
	if trace.flushedGen.Load() == trace.readerGen.Load() || trace.workAvailable.Load() || trace.shutdown.Load() {
		return trace.reader.Load()
	}
	return nil
}

// Trace advancer goroutine.
var traceAdvancer traceAdvancerState

type traceAdvancerState struct {
	timer *wakeableSleep
	done  chan struct{}
}

// start starts a new traceAdvancer.
func (s *traceAdvancerState) start() {
	// Start a goroutine to periodically advance the trace generation.
	s.done = make(chan struct{})
	s.timer = newWakeableSleep()
	go func() {
		for traceEnabled() {
			// Set a timer to wake us up.
			s.timer.sleep(int64(debug.traceadvanceperiod))

			// Try to advance the trace.
			traceAdvance(false)
		}
		s.done <- struct{}{}
	}()
}

// stop stops a traceAdvancer and blocks until it exits.
func (s *traceAdvancerState) stop() {
	s.timer.wake()
	<-s.done
	close(s.done)
	s.timer.close()
}

// defaultTraceAdvancePeriod is the default approximate period between
// new generations.
const defaultTraceAdvancePeriod = 1e9 // 1 second.

// wakeableSleep manages a wakeable goroutine sleep.
//
// Users of this type must obtain one via newWakeableSleep and must call
// close to free up resources. Once close is called, the wakeableSleep
// must not be reused; a new one must be created via newWakeableSleep.
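//
// A usage sketch mirroring how the trace advancer uses it (running and
// doPeriodicWork are hypothetical):
//
//	s := newWakeableSleep()
//	go func() {
//		for running() {
//			s.sleep(1e9) // sleep ~1s (in nanoseconds) or until s.wake()
//			doPeriodicWork()
//		}
//	}()
//	s.wake()  // nudge the sleeper to do its work now
//	s.close() // only once the sleeper has exited and nothing will call wake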
type wakeableSleep struct {
	timer *timer

	// lock protects access to wakeup, but not send/recv on it.
	lock   mutex
	wakeup chan struct{}
}

// newWakeableSleep initializes a new wakeableSleep and returns it.
func newWakeableSleep() *wakeableSleep {
	s := new(wakeableSleep)
	lockInit(&s.lock, lockRankWakeableSleep)
	s.wakeup = make(chan struct{}, 1)
	s.timer = new(timer)
	s.timer.arg = s
	s.timer.f = func(s any, _ uintptr) {
		s.(*wakeableSleep).wake()
	}
	return s
}

// sleep sleeps for the provided duration in nanoseconds or until
// another goroutine calls wake.
//
// Must not be called by more than one goroutine at a time and
// must not be called concurrently with close.
func (s *wakeableSleep) sleep(ns int64) {
	resetTimer(s.timer, nanotime()+ns)
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
	<-wakeup
	stopTimer(s.timer)
}

// wake awakens any goroutine sleeping on the timer.
//
// Safe for concurrent use with all other methods.
func (s *wakeableSleep) wake() {
	// Grab the wakeup channel, which may be nil if we're
	// racing with close.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	if s.wakeup != nil {
		// Non-blocking send.
		//
		// Others may also write to this channel and we don't
		// want to block on the receiver waking up. This also
		// effectively batches together wakeup notifications.
		select {
		case s.wakeup <- struct{}{}:
		default:
		}
	}
	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
}

// close wakes any goroutine sleeping on the timer and prevents
// further sleeping on it.
//
// Once close is called, the wakeableSleep must no longer be used.
//
// It must only be called once no goroutine is sleeping on the
// timer *and* nothing else will call wake concurrently.
func (s *wakeableSleep) close() {
	// Set wakeup to nil so that a late timer ends up being a no-op.
	lock(&s.lock)
	if raceenabled {
		raceacquire(unsafe.Pointer(&s.lock))
	}
	wakeup := s.wakeup
	s.wakeup = nil

	// Close the channel.
	close(wakeup)

	if raceenabled {
		racerelease(unsafe.Pointer(&s.lock))
	}
	unlock(&s.lock)
}