From ce04f86bd3aa8aacc7012228fa406c80c3a85d04 Mon Sep 17 00:00:00 2001 From: "Andrew G. Morgan" Date: Fri, 26 Mar 2021 19:27:22 -0700 Subject: [release-branch.go1.16] syscall: syscall.AllThreadsSyscall signal handling fixes The runtime support for syscall.AllThreadsSyscall() functions had some corner case deadlock issues when signal handling was in use. This was observed in at least 3 build test failures on ppc64 and amd64 architecture CGO_ENABLED=0 builds over the last few months. The fixes involve more controlled handling of signals while the AllThreads mechanism is being executed. Further details are discussed in bug #44193. The all-threads syscall support is new in go1.16, so earlier releases are not affected by this bug. Fixes #45307 Change-Id: I01ba8508a6e1bb2d872751f50da86dd07911a41d Reviewed-on: https://go-review.googlesource.com/c/go/+/305149 Reviewed-by: Michael Pratt Trust: Michael Pratt Trust: Ian Lance Taylor Run-TryBot: Michael Pratt TryBot-Result: Go Bot (cherry picked from commit 7e97e4e8ccdba9677f31ab9380802cd7613f62c5) Reviewed-on: https://go-review.googlesource.com/c/go/+/316869 Run-TryBot: Ian Lance Taylor --- src/os/signal/signal_test.go | 42 +++++++++++++++++++++++++++++++++++ src/runtime/proc.go | 52 ++++++++++++++++++++++++++++++++++++++++---- src/runtime/runtime2.go | 7 ++++-- src/runtime/sigqueue.go | 2 +- 4 files changed, 96 insertions(+), 7 deletions(-) diff --git a/src/os/signal/signal_test.go b/src/os/signal/signal_test.go index bbc68af9fb..9e5ee8bd88 100644 --- a/src/os/signal/signal_test.go +++ b/src/os/signal/signal_test.go @@ -15,6 +15,7 @@ import ( "os" "os/exec" "runtime" + "runtime/trace" "strconv" "sync" "syscall" @@ -853,3 +854,44 @@ func TestNotifyContextStringer(t *testing.T) { t.Errorf("c.String() = %q, want %q", got, want) } } + +// #44193 test signal handling while stopping and starting the world. +func TestSignalTrace(t *testing.T) { + done := make(chan struct{}) + quit := make(chan struct{}) + c := make(chan os.Signal, 1) + Notify(c, syscall.SIGHUP) + + // Source and sink for signals busy loop unsynchronized with + // trace starts and stops. We are ultimately validating that + // signals and runtime.(stop|start)TheWorldGC are compatible. + go func() { + defer close(done) + defer Stop(c) + pid := syscall.Getpid() + for { + select { + case <-quit: + return + default: + syscall.Kill(pid, syscall.SIGHUP) + } + waitSig(t, c, syscall.SIGHUP) + } + }() + + for i := 0; i < 100; i++ { + buf := new(bytes.Buffer) + if err := trace.Start(buf); err != nil { + t.Fatalf("[%d] failed to start tracing: %v", i, err) + } + time.After(1 * time.Microsecond) + trace.Stop() + size := buf.Len() + if size == 0 { + t.Fatalf("[%d] trace is empty", i) + } + } + close(quit) + <-done +} diff --git a/src/runtime/proc.go b/src/runtime/proc.go index 73a789c189..cf9770587a 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -1338,6 +1338,9 @@ func mPark() { g := getg() for { notesleep(&g.m.park) + // Note, because of signal handling by this parked m, + // a preemptive mDoFixup() may actually occur via + // mDoFixupAndOSYield(). (See golang.org/issue/44193) noteclear(&g.m.park) if !mDoFixup() { return @@ -1571,6 +1574,22 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) { for atomic.Load(&sched.sysmonStarting) != 0 { osyield() } + + // We don't want this thread to handle signals for the + // duration of this critical section. The underlying issue + // being that this locked coordinating m is the one monitoring + // for fn() execution by all the other m's of the runtime, + // while no regular go code execution is permitted (the world + // is stopped). If this present m were to get distracted to + // run signal handling code, and find itself waiting for a + // second thread to execute go code before being able to + // return from that signal handling, a deadlock will result. + // (See golang.org/issue/44193.) + lockOSThread() + var sigmask sigset + sigsave(&sigmask) + sigblock(false) + stopTheWorldGC("doAllThreadsSyscall") if atomic.Load(&newmHandoff.haveTemplateThread) != 0 { // Ensure that there are no in-flight thread @@ -1622,6 +1641,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) { // the possibility of racing with mp. lock(&mp.mFixup.lock) mp.mFixup.fn = fn + atomic.Store(&mp.mFixup.used, 1) if mp.doesPark { // For non-service threads this will // cause the wakeup to be short lived @@ -1638,9 +1658,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) { if mp.procid == tid { continue } - lock(&mp.mFixup.lock) - done = done && (mp.mFixup.fn == nil) - unlock(&mp.mFixup.lock) + done = atomic.Load(&mp.mFixup.used) == 0 } if done { break @@ -1667,6 +1685,8 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) { unlock(&mFixupRace.lock) } startTheWorldGC() + msigrestore(sigmask) + unlockOSThread() } // runSafePointFn runs the safe point function, if any, for this P. @@ -2157,9 +2177,21 @@ var mFixupRace struct { // mDoFixup runs any outstanding fixup function for the running m. // Returns true if a fixup was outstanding and actually executed. // +// Note: to avoid deadlocks, and the need for the fixup function +// itself to be async safe, signals are blocked for the working m +// while it holds the mFixup lock. (See golang.org/issue/44193) +// //go:nosplit func mDoFixup() bool { _g_ := getg() + if used := atomic.Load(&_g_.m.mFixup.used); used == 0 { + return false + } + + // slow path - if fixup fn is used, block signals and lock. + var sigmask sigset + sigsave(&sigmask) + sigblock(false) lock(&_g_.m.mFixup.lock) fn := _g_.m.mFixup.fn if fn != nil { @@ -2176,7 +2208,6 @@ func mDoFixup() bool { // is more obviously safe. throw("GC must be disabled to protect validity of fn value") } - *(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0 if _g_.racectx != 0 || !raceenabled { fn(false) } else { @@ -2191,11 +2222,24 @@ func mDoFixup() bool { _g_.racectx = 0 unlock(&mFixupRace.lock) } + *(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0 + atomic.Store(&_g_.m.mFixup.used, 0) } unlock(&_g_.m.mFixup.lock) + msigrestore(sigmask) return fn != nil } +// mDoFixupAndOSYield is called when an m is unable to send a signal +// because the allThreadsSyscall mechanism is in progress. That is, an +// mPark() has been interrupted with this signal handler so we need to +// ensure the fixup is executed from this context. +//go:nosplit +func mDoFixupAndOSYield() { + mDoFixup() + osyield() +} + // templateThread is a thread in a known-good state that exists solely // to start new threads in known-good states when the calling thread // may not be in a good state. diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 109f0da131..9a032d8658 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -537,10 +537,13 @@ type m struct { syscalltick uint32 freelink *m // on sched.freem - // mFixup is used to synchronize OS related m state (credentials etc) - // use mutex to access. + // mFixup is used to synchronize OS related m state + // (credentials etc) use mutex to access. To avoid deadlocks + // an atomic.Load() of used being zero in mDoFixupFn() + // guarantees fn is nil. mFixup struct { lock mutex + used uint32 fn func(bool) bool } diff --git a/src/runtime/sigqueue.go b/src/runtime/sigqueue.go index 28b9e26d0f..6bed64e51a 100644 --- a/src/runtime/sigqueue.go +++ b/src/runtime/sigqueue.go @@ -119,7 +119,7 @@ Send: } case sigFixup: // nothing to do - we need to wait for sigIdle. - osyield() + mDoFixupAndOSYield() } } -- cgit v1.2.3-54-g00ecf