From 95afc744a7abd78dcaf06423583821ead5cc7c1e Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Mon, 27 Nov 2023 22:27:32 +0000 Subject: [release-branch.go1.20] runtime: put ReadMemStats debug assertions behind a double-check mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ReadMemStats has a few assertions it makes about the consistency of the stats it's about to produce. Specifically, how those stats line up with runtime-internal stats. These checks are generally useful, but crashing just because some stats are wrong is a heavy price to pay. For a long time this wasn't a problem, but very recently it became a real problem. It turns out that there's real benign skew that can happen wherein sysmon (which doesn't synchronize with a STW) generates a trace event when tracing is enabled, and may mutate some stats while ReadMemStats is running its checks. Fix this by synchronizing with both sysmon and the tracer. This is a bit heavy-handed, but better that than false positives. Also, put the checks behind a debug mode. We want to reduce the risk of backporting this change, and again, it's not great to crash just because user-facing stats are off. Still, enable this debug mode during the runtime tests so we don't lose quite as much coverage from disabling these checks by default. For #64401. Fixes #64409. Change-Id: I9adb3e5c7161d207648d07373a11da8a5f0fda9a Reviewed-on: https://go-review.googlesource.com/c/go/+/545277 LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Pratt Reviewed-by: Felix Geisendörfer (cherry picked from commit b2efd1de97402ec4b8fb4e9e0ec29c8e49e8e200) Reviewed-on: https://go-review.googlesource.com/c/go/+/545556 Auto-Submit: Matthew Dempsky TryBot-Bypass: Matthew Dempsky TryBot-Result: Gopher Robot Run-TryBot: Matthew Dempsky --- src/runtime/export_test.go | 2 + src/runtime/gc_test.go | 5 ++ src/runtime/mstats.go | 114 +++++++++++++++++++++++++-------------------- 3 files changed, 71 insertions(+), 50 deletions(-) diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index e7476e606b..2290f7c728 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -336,6 +336,8 @@ func ReadMetricsSlow(memStats *MemStats, samplesp unsafe.Pointer, len, cap int) startTheWorld() } +var DoubleCheckReadMemStats = &doubleCheckReadMemStats + // ReadMemStatsSlow returns both the runtime-computed MemStats and // MemStats accumulated by scanning the heap. func ReadMemStatsSlow() (base, slow MemStats) { diff --git a/src/runtime/gc_test.go b/src/runtime/gc_test.go index 0b2c972d3f..1a1655eae6 100644 --- a/src/runtime/gc_test.go +++ b/src/runtime/gc_test.go @@ -573,6 +573,11 @@ func TestPageAccounting(t *testing.T) { } } +func init() { + // Enable ReadMemStats' double-check mode. + *runtime.DoubleCheckReadMemStats = true +} + func TestReadMemStats(t *testing.T) { base, slow := runtime.ReadMemStatsSlow() if base != slow { diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go index 3a5273f361..8424946e77 100644 --- a/src/runtime/mstats.go +++ b/src/runtime/mstats.go @@ -356,6 +356,11 @@ func ReadMemStats(m *MemStats) { startTheWorld() } +// doubleCheckReadMemStats controls a double-check mode for ReadMemStats that +// ensures consistency between the values that ReadMemStats is using and the +// runtime-internal stats. +var doubleCheckReadMemStats = false + // readmemstats_m populates stats for internal runtime values. // // The world must be stopped. @@ -430,56 +435,65 @@ func readmemstats_m(stats *MemStats) { heapGoal := gcController.heapGoal() - // The world is stopped, so the consistent stats (after aggregation) - // should be identical to some combination of memstats. In particular: - // - // * memstats.heapInUse == inHeap - // * memstats.heapReleased == released - // * memstats.heapInUse + memstats.heapFree == committed - inStacks - inWorkBufs - inPtrScalarBits - // * memstats.totalAlloc == totalAlloc - // * memstats.totalFree == totalFree - // - // Check if that's actually true. - // - // TODO(mknyszek): Maybe don't throw here. It would be bad if a - // bug in otherwise benign accounting caused the whole application - // to crash. - if gcController.heapInUse.load() != uint64(consStats.inHeap) { - print("runtime: heapInUse=", gcController.heapInUse.load(), "\n") - print("runtime: consistent value=", consStats.inHeap, "\n") - throw("heapInUse and consistent stats are not equal") - } - if gcController.heapReleased.load() != uint64(consStats.released) { - print("runtime: heapReleased=", gcController.heapReleased.load(), "\n") - print("runtime: consistent value=", consStats.released, "\n") - throw("heapReleased and consistent stats are not equal") - } - heapRetained := gcController.heapInUse.load() + gcController.heapFree.load() - consRetained := uint64(consStats.committed - consStats.inStacks - consStats.inWorkBufs - consStats.inPtrScalarBits) - if heapRetained != consRetained { - print("runtime: global value=", heapRetained, "\n") - print("runtime: consistent value=", consRetained, "\n") - throw("measures of the retained heap are not equal") - } - if gcController.totalAlloc.Load() != totalAlloc { - print("runtime: totalAlloc=", gcController.totalAlloc.Load(), "\n") - print("runtime: consistent value=", totalAlloc, "\n") - throw("totalAlloc and consistent stats are not equal") - } - if gcController.totalFree.Load() != totalFree { - print("runtime: totalFree=", gcController.totalFree.Load(), "\n") - print("runtime: consistent value=", totalFree, "\n") - throw("totalFree and consistent stats are not equal") - } - // Also check that mappedReady lines up with totalMapped - released. - // This isn't really the same type of "make sure consistent stats line up" situation, - // but this is an opportune time to check. - if gcController.mappedReady.Load() != totalMapped-uint64(consStats.released) { - print("runtime: mappedReady=", gcController.mappedReady.Load(), "\n") - print("runtime: totalMapped=", totalMapped, "\n") - print("runtime: released=", uint64(consStats.released), "\n") - print("runtime: totalMapped-released=", totalMapped-uint64(consStats.released), "\n") - throw("mappedReady and other memstats are not equal") + if doubleCheckReadMemStats { + // Only check this if we're debugging. It would be bad to crash an application + // just because the debugging stats are wrong. We mostly rely on tests to catch + // these issues, and we enable the double check mode for tests. + // + // The world is stopped, so the consistent stats (after aggregation) + // should be identical to some combination of memstats. In particular: + // + // * memstats.heapInUse == inHeap + // * memstats.heapReleased == released + // * memstats.heapInUse + memstats.heapFree == committed - inStacks - inWorkBufs - inPtrScalarBits + // * memstats.totalAlloc == totalAlloc + // * memstats.totalFree == totalFree + // + // Check if that's actually true. + // + // Prevent sysmon and the tracer from skewing the stats since they can + // act without synchronizing with a STW. See #64401. + lock(&sched.sysmonlock) + lock(&trace.lock) + if gcController.heapInUse.load() != uint64(consStats.inHeap) { + print("runtime: heapInUse=", gcController.heapInUse.load(), "\n") + print("runtime: consistent value=", consStats.inHeap, "\n") + throw("heapInUse and consistent stats are not equal") + } + if gcController.heapReleased.load() != uint64(consStats.released) { + print("runtime: heapReleased=", gcController.heapReleased.load(), "\n") + print("runtime: consistent value=", consStats.released, "\n") + throw("heapReleased and consistent stats are not equal") + } + heapRetained := gcController.heapInUse.load() + gcController.heapFree.load() + consRetained := uint64(consStats.committed - consStats.inStacks - consStats.inWorkBufs - consStats.inPtrScalarBits) + if heapRetained != consRetained { + print("runtime: global value=", heapRetained, "\n") + print("runtime: consistent value=", consRetained, "\n") + throw("measures of the retained heap are not equal") + } + if gcController.totalAlloc.Load() != totalAlloc { + print("runtime: totalAlloc=", gcController.totalAlloc.Load(), "\n") + print("runtime: consistent value=", totalAlloc, "\n") + throw("totalAlloc and consistent stats are not equal") + } + if gcController.totalFree.Load() != totalFree { + print("runtime: totalFree=", gcController.totalFree.Load(), "\n") + print("runtime: consistent value=", totalFree, "\n") + throw("totalFree and consistent stats are not equal") + } + // Also check that mappedReady lines up with totalMapped - released. + // This isn't really the same type of "make sure consistent stats line up" situation, + // but this is an opportune time to check. + if gcController.mappedReady.Load() != totalMapped-uint64(consStats.released) { + print("runtime: mappedReady=", gcController.mappedReady.Load(), "\n") + print("runtime: totalMapped=", totalMapped, "\n") + print("runtime: released=", uint64(consStats.released), "\n") + print("runtime: totalMapped-released=", totalMapped-uint64(consStats.released), "\n") + throw("mappedReady and other memstats are not equal") + } + unlock(&trace.lock) + unlock(&sched.sysmonlock) } // We've calculated all the values we need. Now, populate stats. -- cgit v1.2.3-54-g00ecf