author     Lucien Coffe <lucien.coffe@botify.com>  2023-04-21 13:44:35 +0200
committer  Gopher Robot <gobot@golang.org>         2023-06-22 15:28:32 +0000
commit     3db4f8146c0a7e41a234042eb2057fbfddc17e5f (patch)
tree       4be55448baedcfd7a3211ea008c2b7a6292e83ca
parent     6b45fb7b73636dc56b54652d7090de9db3dfff30 (diff)
download   go-3db4f8146c0a7e41a234042eb2057fbfddc17e5f.tar.gz
           go-3db4f8146c0a7e41a234042eb2057fbfddc17e5f.zip
[release-branch.go1.20] runtime: resolve checkdead panic by refining `startm` lock handling in caller context

This change addresses a `checkdead` panic caused by a race condition between `sysmon->startm` and `checkdead` callers, due to prematurely releasing the scheduler lock. The solution is to allow a `startm` caller to acquire the scheduler lock and call `startm` in that context. A new `lockheld` bool argument is added to `startm`, which manages all lock and unlock calls within the function. The `startIdle` function variable in `injectglist` is updated to call `startm` with the lock held, ensuring proper lock handling in this specific case. This refined lock handling resolves the observed race condition.

For #59600.
Fixes #60760.

Change-Id: I11663a15536c10c773fc2fde291d959099aa71be
Reviewed-on: https://go-review.googlesource.com/c/go/+/487316
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Run-TryBot: Michael Pratt <mpratt@google.com>
(cherry picked from commit ff059add10d71fe13239cf893c0cca113de1fc21)
Reviewed-on: https://go-review.googlesource.com/c/go/+/504395
Reviewed-by: Lucien Coffe <lucien.coffe@botify.com>
Auto-Submit: Dmitri Shuralyov <dmitshur@google.com>
Run-TryBot: Dmitri Shuralyov <dmitshur@golang.org>
-rw-r--r--  src/runtime/proc.go  45
1 file changed, 31 insertions(+), 14 deletions(-)
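At its core, the change turns `startm` into a function whose locking is conditioned on the caller. A minimal standalone sketch of that pattern, using illustrative names (`scheduler`, `startWorker`) rather than the runtime's own code:

	package main

	import (
		"fmt"
		"sync"
	)

	// scheduler stands in for the runtime's sched structure.
	type scheduler struct {
		lock sync.Mutex
		idle []int // stand-in for the idle P list
	}

	var sched scheduler

	// startWorker mirrors the shape of startm(pp, spinning, lockheld): if
	// lockheld is true, the caller already holds sched.lock, so the function
	// must not lock it again; either way, the caller's view of the lock is
	// the same before and after the call.
	func startWorker(id int, lockheld bool) {
		if !lockheld {
			sched.lock.Lock()
		}
		// ... work that needs the lock, e.g. reserving an ID ...
		sched.idle = append(sched.idle, id)
		if !lockheld {
			sched.lock.Unlock()
		}
	}

	func main() {
		// Caller that does not hold the lock.
		startWorker(1, false)

		// Caller that already holds the lock (the injectglist-style case).
		sched.lock.Lock()
		startWorker(2, true)
		sched.lock.Unlock()

		fmt.Println("idle:", sched.idle)
	}

Because the caller keeps holding the lock across the call, there is no window in which `checkdead` can observe a runnable G with no accounted-for M, which is the race the commit message describes.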
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 7cd6898a93..bf2a33ea92 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -2351,10 +2351,15 @@ func mspinning() {
// Callers passing a non-nil P must call from a non-preemptible context. See
// comment on acquirem below.
//
+// Argument lockheld indicates whether the caller already acquired the
+// scheduler lock. Callers holding the lock when making the call must pass
+// true. The lock might be temporarily dropped, but will be reacquired before
+// returning.
+//
// Must not have write barriers because this may be called without a P.
//
//go:nowritebarrierrec
-func startm(pp *p, spinning bool) {
+func startm(pp *p, spinning, lockheld bool) {
// Disable preemption.
//
// Every owned P must have an owner that will eventually stop it in the
@@ -2372,7 +2377,9 @@ func startm(pp *p, spinning bool) {
// startm. Callers passing a nil P may be preemptible, so we must
// disable preemption before acquiring a P from pidleget below.
mp := acquirem()
- lock(&sched.lock)
+ if !lockheld {
+ lock(&sched.lock)
+ }
if pp == nil {
if spinning {
// TODO(prattmic): All remaining calls to this function
@@ -2382,7 +2389,9 @@ func startm(pp *p, spinning bool) {
}
pp, _ = pidleget(0)
if pp == nil {
- unlock(&sched.lock)
+ if !lockheld {
+ unlock(&sched.lock)
+ }
releasem(mp)
return
}
@@ -2396,6 +2405,8 @@ func startm(pp *p, spinning bool) {
// could find no idle P while checkdead finds a runnable G but
// no running M's because this new M hasn't started yet, thus
// throwing in an apparent deadlock.
+ // This apparent deadlock is possible when startm is called
+ // from sysmon, which doesn't count as a running M.
//
// Avoid this situation by pre-allocating the ID for the new M,
// thus marking it as 'running' before we drop sched.lock. This
@@ -2410,12 +2421,18 @@ func startm(pp *p, spinning bool) {
fn = mspinning
}
newm(fn, pp, id)
+
+ if lockheld {
+ lock(&sched.lock)
+ }
// Ownership transfer of pp committed by start in newm.
// Preemption is now safe.
releasem(mp)
return
}
- unlock(&sched.lock)
+ if !lockheld {
+ unlock(&sched.lock)
+ }
if nmp.spinning {
throw("startm: m is spinning")
}
@@ -2444,24 +2461,24 @@ func handoffp(pp *p) {
// if it has local work, start it straight away
if !runqempty(pp) || sched.runqsize != 0 {
- startm(pp, false)
+ startm(pp, false, false)
return
}
// if there's trace work to do, start it straight away
if (trace.enabled || trace.shutdown) && traceReaderAvailable() != nil {
- startm(pp, false)
+ startm(pp, false, false)
return
}
// if it has GC work, start it straight away
if gcBlackenEnabled != 0 && gcMarkWorkAvailable(pp) {
- startm(pp, false)
+ startm(pp, false, false)
return
}
// no local work, check that there are no spinning/idle M's,
// otherwise our help is not required
if sched.nmspinning.Load()+sched.npidle.Load() == 0 && sched.nmspinning.CompareAndSwap(0, 1) { // TODO: fast atomic
sched.needspinning.Store(0)
- startm(pp, true)
+ startm(pp, true, false)
return
}
lock(&sched.lock)
@@ -2483,14 +2500,14 @@ func handoffp(pp *p) {
}
if sched.runqsize != 0 {
unlock(&sched.lock)
- startm(pp, false)
+ startm(pp, false, false)
return
}
// If this is the last running P and nobody is polling network,
// need to wakeup another M to poll network.
if sched.npidle.Load() == gomaxprocs-1 && sched.lastpoll.Load() != 0 {
unlock(&sched.lock)
- startm(pp, false)
+ startm(pp, false, false)
return
}
@@ -2539,7 +2556,7 @@ func wakep() {
// see at least one running M (ours).
unlock(&sched.lock)
- startm(pp, true)
+ startm(pp, true, false)
releasem(mp)
}
@@ -3292,8 +3309,8 @@ func injectglist(glist *gList) {
break
}
+ startm(pp, false, true)
unlock(&sched.lock)
- startm(pp, false)
releasem(mp)
}
}
@@ -5391,7 +5408,7 @@ func sysmon() {
// See issue 42515 and
// https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=50094.
if next := timeSleepUntil(); next < now {
- startm(nil, false)
+ startm(nil, false, false)
}
}
if scavenger.sysmonWake.Load() != 0 {
@@ -5663,7 +5680,7 @@ func schedEnableUser(enable bool) {
globrunqputbatch(&sched.disable.runnable, n)
unlock(&sched.lock)
for ; n != 0 && sched.npidle.Load() != 0; n-- {
- startm(nil, false)
+ startm(nil, false, false)
}
} else {
unlock(&sched.lock)
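One subtlety visible in the `newm` path of the patch: the scheduler lock is dropped before `newm` runs, and, when the caller passed `lockheld=true`, it is reacquired afterwards so the caller still holds it when `startm` returns (this is what the new doc comment means by "temporarily dropped, but will be reacquired before returning"). A hedged sketch of that drop-and-reacquire shape, again with illustrative names rather than the runtime's:

	package main

	import "sync"

	var schedLock sync.Mutex

	// spawn stands in for the newm-style step that must not run with the
	// lock held (e.g. creating an OS thread).
	func spawn() {}

	// startWithHandoff mirrors the lockheld handling around newm: the lock
	// is released for the blocking step and, if the caller held it on entry,
	// reacquired so the caller still holds it after the call returns.
	func startWithHandoff(lockheld bool) {
		if !lockheld {
			schedLock.Lock()
		}
		// ... bookkeeping done under the lock (e.g. reserving an ID) ...
		schedLock.Unlock() // always released before the blocking step
		spawn()
		if lockheld {
			schedLock.Lock() // restore the caller's lock before returning
		}
	}

	func main() {
		schedLock.Lock()
		startWithHandoff(true) // caller's lock is intact afterwards
		schedLock.Unlock()

		startWithHandoff(false)
	}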