aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMichael Pratt <mpratt@google.com>2020-05-07 18:13:21 -0400
committerAndrew Bonventre <andybons@golang.org>2020-05-27 17:55:05 +0000
commit1a1178122f31bfe67534ec637883636755adbfbc (patch)
tree9193423f7a73442965649c90964ba5a95d5e9705
parent237b6067c17ac3ef1e02632b77deefb5e9837cbb (diff)
downloadgo-1a1178122f31bfe67534ec637883636755adbfbc.tar.gz
go-1a1178122f31bfe67534ec637883636755adbfbc.zip
[release-branch.go1.13] runtime: disable preemption in startTemplateThread
When a locked M wants to start a new M, it hands off to the template thread to actually call clone and start the thread. The template thread is lazily created the first time a thread is locked (or if cgo is in use). stoplockedm will release the P (_Pidle), then call handoffp to give the P to another M. In the case of a pending STW, one of two things can happen: 1. handoffp starts an M, which does acquirep followed by schedule, which will finally enter _Pgcstop. 2. handoffp immediately enters _Pgcstop. This only occurs if the P has no local work, GC work, and no spinning M is required. If handoffp starts an M, and must create a new M to do so, then newm will simply queue the M on newmHandoff for the template thread to do the clone. When a stop-the-world is required, stopTheWorldWithSema will start the stop and then wait for all Ps to enter _Pgcstop. If the template thread is not fully created because startTemplateThread gets stopped, then another stoplockedm may queue an M that will never get created, and the handoff P will never leave _Pidle. Thus stopTheWorldWithSema will wait forever. A sequence to trigger this hang when STW occurs can be visualized with two threads: T1 T2 ------------------------------- ----------------------------- LockOSThread LockOSThread haveTemplateThread == 0 startTemplateThread haveTemplateThread = 1 newm haveTemplateThread == 1 preempt -> schedule g.m.lockedExt++ gcstopm -> _Pgcstop g.m.lockedg = ... park g.lockedm = ... return ... (any code) preempt -> schedule stoplockedm releasep -> _Pidle handoffp startm (first 3 handoffp cases) newm g.m.lockedExt != 0 Add to newmHandoff, return park Note that the P in T2 is stuck sitting in _Pidle. Since the template thread isn't running, the new M will not be started complete the transition to _Pgcstop. To resolve this, we disable preemption around the assignment of haveTemplateThread and the creation of the template thread in order to guarantee that if handTemplateThread is set then the template thread will eventually exist, in the presence of stops. For #38931 Fixes #38932 Change-Id: I50535fbbe2f328f47b18e24d9030136719274191 Reviewed-on: https://go-review.googlesource.com/c/go/+/232978 Run-TryBot: Michael Pratt <mpratt@google.com> Reviewed-by: Austin Clements <austin@google.com> TryBot-Result: Gobot Gobot <gobot@golang.org> (cherry picked from commit 11b3730a02c93fd5745bfd977156541a9033759b) Reviewed-on: https://go-review.googlesource.com/c/go/+/234888 Reviewed-by: Cherry Zhang <cherryyz@google.com>
-rw-r--r--src/runtime/crash_test.go14
-rw-r--r--src/runtime/proc.go6
-rw-r--r--src/runtime/proc_test.go24
-rw-r--r--src/runtime/testdata/testprog/lockosthread.go49
4 files changed, 91 insertions, 2 deletions
diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go
index c54bb57da2..ac0344e9a3 100644
--- a/src/runtime/crash_test.go
+++ b/src/runtime/crash_test.go
@@ -55,6 +55,16 @@ func runTestProg(t *testing.T, binary, name string, env ...string) string {
t.Fatal(err)
}
+ return runBuiltTestProg(t, exe, name, env...)
+}
+
+func runBuiltTestProg(t *testing.T, exe, name string, env ...string) string {
+ if *flagQuick {
+ t.Skip("-quick")
+ }
+
+ testenv.MustHaveGoBuild(t)
+
cmd := testenv.CleanCmdEnv(exec.Command(exe, name))
cmd.Env = append(cmd.Env, env...)
if testing.Short() {
@@ -64,7 +74,7 @@ func runTestProg(t *testing.T, binary, name string, env ...string) string {
cmd.Stdout = &b
cmd.Stderr = &b
if err := cmd.Start(); err != nil {
- t.Fatalf("starting %s %s: %v", binary, name, err)
+ t.Fatalf("starting %s %s: %v", exe, name, err)
}
// If the process doesn't complete within 1 minute,
@@ -92,7 +102,7 @@ func runTestProg(t *testing.T, binary, name string, env ...string) string {
}()
if err := cmd.Wait(); err != nil {
- t.Logf("%s %s exit status: %v", binary, name, err)
+ t.Logf("%s %s exit status: %v", exe, name, err)
}
close(done)
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 93d329d15e..3723f84a05 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -1862,10 +1862,16 @@ func startTemplateThread() {
if GOARCH == "wasm" { // no threads on wasm yet
return
}
+
+ // Disable preemption to guarantee that the template thread will be
+ // created before a park once haveTemplateThread is set.
+ mp := acquirem()
if !atomic.Cas(&newmHandoff.haveTemplateThread, 0, 1) {
+ releasem(mp)
return
}
newm(templateThread, nil)
+ releasem(mp)
}
// templateThread is a thread in a known-good state that exists solely
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go
index 6e6272e80a..108be6a881 100644
--- a/src/runtime/proc_test.go
+++ b/src/runtime/proc_test.go
@@ -6,6 +6,7 @@ package runtime_test
import (
"fmt"
+ "internal/testenv"
"math"
"net"
"runtime"
@@ -912,6 +913,29 @@ func TestLockOSThreadAvoidsStatePropagation(t *testing.T) {
}
}
+func TestLockOSThreadTemplateThreadRace(t *testing.T) {
+ testenv.MustHaveGoRun(t)
+
+ exe, err := buildTestProg(t, "testprog")
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ iterations := 100
+ if testing.Short() {
+ // Reduce run time to ~100ms, with much lower probability of
+ // catching issues.
+ iterations = 5
+ }
+ for i := 0; i < iterations; i++ {
+ want := "OK\n"
+ output := runBuiltTestProg(t, exe, "LockOSThreadTemplateThreadRace")
+ if output != want {
+ t.Fatalf("run %d: want %q, got %q", i, want, output)
+ }
+ }
+}
+
// fakeSyscall emulates a system call.
//go:nosplit
func fakeSyscall(duration time.Duration) {
diff --git a/src/runtime/testdata/testprog/lockosthread.go b/src/runtime/testdata/testprog/lockosthread.go
index fd3123e647..098cc4dd72 100644
--- a/src/runtime/testdata/testprog/lockosthread.go
+++ b/src/runtime/testdata/testprog/lockosthread.go
@@ -7,6 +7,7 @@ package main
import (
"os"
"runtime"
+ "sync"
"time"
)
@@ -30,6 +31,7 @@ func init() {
runtime.LockOSThread()
})
register("LockOSThreadAvoidsStatePropagation", LockOSThreadAvoidsStatePropagation)
+ register("LockOSThreadTemplateThreadRace", LockOSThreadTemplateThreadRace)
}
func LockOSThreadMain() {
@@ -195,3 +197,50 @@ func LockOSThreadAvoidsStatePropagation() {
runtime.UnlockOSThread()
println("OK")
}
+
+func LockOSThreadTemplateThreadRace() {
+ // This test attempts to reproduce the race described in
+ // golang.org/issue/38931. To do so, we must have a stop-the-world
+ // (achieved via ReadMemStats) racing with two LockOSThread calls.
+ //
+ // While this test attempts to line up the timing, it is only expected
+ // to fail (and thus hang) around 2% of the time if the race is
+ // present.
+
+ // Ensure enough Ps to actually run everything in parallel. Though on
+ // <4 core machines, we are still at the whim of the kernel scheduler.
+ runtime.GOMAXPROCS(4)
+
+ go func() {
+ // Stop the world; race with LockOSThread below.
+ var m runtime.MemStats
+ for {
+ runtime.ReadMemStats(&m)
+ }
+ }()
+
+ // Try to synchronize both LockOSThreads.
+ start := time.Now().Add(10*time.Millisecond)
+
+ var wg sync.WaitGroup
+ wg.Add(2)
+
+ for i := 0; i < 2; i++ {
+ go func() {
+ for time.Now().Before(start) {
+ }
+
+ // Add work to the local runq to trigger early startm
+ // in handoffp.
+ go func(){}()
+
+ runtime.LockOSThread()
+ runtime.Gosched() // add a preemption point.
+ wg.Done()
+ }()
+ }
+
+ wg.Wait()
+ // If both LockOSThreads completed then we did not hit the race.
+ println("OK")
+}