aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrew G. Morgan <agm@google.com>2021-03-26 19:27:22 -0700
committerIan Lance Taylor <iant@golang.org>2021-05-04 20:41:53 +0000
commitce04f86bd3aa8aacc7012228fa406c80c3a85d04 (patch)
tree78213692595fee4c19b4b661c8be86d216995406
parent7e709791c22409826cb267fced81f210f6aaaf09 (diff)
downloadgo-ce04f86bd3aa8aacc7012228fa406c80c3a85d04.tar.gz
go-ce04f86bd3aa8aacc7012228fa406c80c3a85d04.zip
[release-branch.go1.16] syscall: syscall.AllThreadsSyscall signal handling fixes
The runtime support for syscall.AllThreadsSyscall() functions had some corner case deadlock issues when signal handling was in use. This was observed in at least 3 build test failures on ppc64 and amd64 architecture CGO_ENABLED=0 builds over the last few months. The fixes involve more controlled handling of signals while the AllThreads mechanism is being executed. Further details are discussed in bug #44193. The all-threads syscall support is new in go1.16, so earlier releases are not affected by this bug. Fixes #45307 Change-Id: I01ba8508a6e1bb2d872751f50da86dd07911a41d Reviewed-on: https://go-review.googlesource.com/c/go/+/305149 Reviewed-by: Michael Pratt <mpratt@google.com> Trust: Michael Pratt <mpratt@google.com> Trust: Ian Lance Taylor <iant@golang.org> Run-TryBot: Michael Pratt <mpratt@google.com> TryBot-Result: Go Bot <gobot@golang.org> (cherry picked from commit 7e97e4e8ccdba9677f31ab9380802cd7613f62c5) Reviewed-on: https://go-review.googlesource.com/c/go/+/316869 Run-TryBot: Ian Lance Taylor <iant@golang.org>
-rw-r--r--src/os/signal/signal_test.go42
-rw-r--r--src/runtime/proc.go52
-rw-r--r--src/runtime/runtime2.go7
-rw-r--r--src/runtime/sigqueue.go2
4 files changed, 96 insertions, 7 deletions
diff --git a/src/os/signal/signal_test.go b/src/os/signal/signal_test.go
index bbc68af9fb..9e5ee8bd88 100644
--- a/src/os/signal/signal_test.go
+++ b/src/os/signal/signal_test.go
@@ -15,6 +15,7 @@ import (
"os"
"os/exec"
"runtime"
+ "runtime/trace"
"strconv"
"sync"
"syscall"
@@ -853,3 +854,44 @@ func TestNotifyContextStringer(t *testing.T) {
t.Errorf("c.String() = %q, want %q", got, want)
}
}
+
+// #44193 test signal handling while stopping and starting the world.
+func TestSignalTrace(t *testing.T) {
+ done := make(chan struct{})
+ quit := make(chan struct{})
+ c := make(chan os.Signal, 1)
+ Notify(c, syscall.SIGHUP)
+
+ // Source and sink for signals busy loop unsynchronized with
+ // trace starts and stops. We are ultimately validating that
+ // signals and runtime.(stop|start)TheWorldGC are compatible.
+ go func() {
+ defer close(done)
+ defer Stop(c)
+ pid := syscall.Getpid()
+ for {
+ select {
+ case <-quit:
+ return
+ default:
+ syscall.Kill(pid, syscall.SIGHUP)
+ }
+ waitSig(t, c, syscall.SIGHUP)
+ }
+ }()
+
+ for i := 0; i < 100; i++ {
+ buf := new(bytes.Buffer)
+ if err := trace.Start(buf); err != nil {
+ t.Fatalf("[%d] failed to start tracing: %v", i, err)
+ }
+ time.After(1 * time.Microsecond)
+ trace.Stop()
+ size := buf.Len()
+ if size == 0 {
+ t.Fatalf("[%d] trace is empty", i)
+ }
+ }
+ close(quit)
+ <-done
+}
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 73a789c189..cf9770587a 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -1338,6 +1338,9 @@ func mPark() {
g := getg()
for {
notesleep(&g.m.park)
+ // Note, because of signal handling by this parked m,
+ // a preemptive mDoFixup() may actually occur via
+ // mDoFixupAndOSYield(). (See golang.org/issue/44193)
noteclear(&g.m.park)
if !mDoFixup() {
return
@@ -1571,6 +1574,22 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
for atomic.Load(&sched.sysmonStarting) != 0 {
osyield()
}
+
+ // We don't want this thread to handle signals for the
+ // duration of this critical section. The underlying issue
+ // being that this locked coordinating m is the one monitoring
+ // for fn() execution by all the other m's of the runtime,
+ // while no regular go code execution is permitted (the world
+ // is stopped). If this present m were to get distracted to
+ // run signal handling code, and find itself waiting for a
+ // second thread to execute go code before being able to
+ // return from that signal handling, a deadlock will result.
+ // (See golang.org/issue/44193.)
+ lockOSThread()
+ var sigmask sigset
+ sigsave(&sigmask)
+ sigblock(false)
+
stopTheWorldGC("doAllThreadsSyscall")
if atomic.Load(&newmHandoff.haveTemplateThread) != 0 {
// Ensure that there are no in-flight thread
@@ -1622,6 +1641,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
// the possibility of racing with mp.
lock(&mp.mFixup.lock)
mp.mFixup.fn = fn
+ atomic.Store(&mp.mFixup.used, 1)
if mp.doesPark {
// For non-service threads this will
// cause the wakeup to be short lived
@@ -1638,9 +1658,7 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
if mp.procid == tid {
continue
}
- lock(&mp.mFixup.lock)
- done = done && (mp.mFixup.fn == nil)
- unlock(&mp.mFixup.lock)
+ done = atomic.Load(&mp.mFixup.used) == 0
}
if done {
break
@@ -1667,6 +1685,8 @@ func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
unlock(&mFixupRace.lock)
}
startTheWorldGC()
+ msigrestore(sigmask)
+ unlockOSThread()
}
// runSafePointFn runs the safe point function, if any, for this P.
@@ -2157,9 +2177,21 @@ var mFixupRace struct {
// mDoFixup runs any outstanding fixup function for the running m.
// Returns true if a fixup was outstanding and actually executed.
//
+// Note: to avoid deadlocks, and the need for the fixup function
+// itself to be async safe, signals are blocked for the working m
+// while it holds the mFixup lock. (See golang.org/issue/44193)
+//
//go:nosplit
func mDoFixup() bool {
_g_ := getg()
+ if used := atomic.Load(&_g_.m.mFixup.used); used == 0 {
+ return false
+ }
+
+ // slow path - if fixup fn is used, block signals and lock.
+ var sigmask sigset
+ sigsave(&sigmask)
+ sigblock(false)
lock(&_g_.m.mFixup.lock)
fn := _g_.m.mFixup.fn
if fn != nil {
@@ -2176,7 +2208,6 @@ func mDoFixup() bool {
// is more obviously safe.
throw("GC must be disabled to protect validity of fn value")
}
- *(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
if _g_.racectx != 0 || !raceenabled {
fn(false)
} else {
@@ -2191,11 +2222,24 @@ func mDoFixup() bool {
_g_.racectx = 0
unlock(&mFixupRace.lock)
}
+ *(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
+ atomic.Store(&_g_.m.mFixup.used, 0)
}
unlock(&_g_.m.mFixup.lock)
+ msigrestore(sigmask)
return fn != nil
}
+// mDoFixupAndOSYield is called when an m is unable to send a signal
+// because the allThreadsSyscall mechanism is in progress. That is, an
+// mPark() has been interrupted with this signal handler so we need to
+// ensure the fixup is executed from this context.
+//go:nosplit
+func mDoFixupAndOSYield() {
+ mDoFixup()
+ osyield()
+}
+
// templateThread is a thread in a known-good state that exists solely
// to start new threads in known-good states when the calling thread
// may not be in a good state.
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 109f0da131..9a032d8658 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -537,10 +537,13 @@ type m struct {
syscalltick uint32
freelink *m // on sched.freem
- // mFixup is used to synchronize OS related m state (credentials etc)
- // use mutex to access.
+ // mFixup is used to synchronize OS related m state
+ // (credentials etc) use mutex to access. To avoid deadlocks
+ // an atomic.Load() of used being zero in mDoFixupFn()
+ // guarantees fn is nil.
mFixup struct {
lock mutex
+ used uint32
fn func(bool) bool
}
diff --git a/src/runtime/sigqueue.go b/src/runtime/sigqueue.go
index 28b9e26d0f..6bed64e51a 100644
--- a/src/runtime/sigqueue.go
+++ b/src/runtime/sigqueue.go
@@ -119,7 +119,7 @@ Send:
}
case sigFixup:
// nothing to do - we need to wait for sigIdle.
- osyield()
+ mDoFixupAndOSYield()
}
}