aboutsummaryrefslogtreecommitdiff
path: root/misc
diff options
context:
space:
mode:
authordoujiang24 <doujiang24@gmail.com>2023-03-31 15:45:39 -0400
committerCherry Mui <cherryyz@google.com>2023-04-03 18:34:11 +0000
commitccad8a9f9c7de7f51478954a5ee667cd7b61fbc1 (patch)
treeee9eecf9e15c608b4c04450af499d6e061498e5d /misc
parent33d8cdeedc4e69cd2d4eae10f024af826a73ce47 (diff)
downloadgo-ccad8a9f9c7de7f51478954a5ee667cd7b61fbc1.tar.gz
go-ccad8a9f9c7de7f51478954a5ee667cd7b61fbc1.zip
runtime/cgo: store M for C-created thread in pthread key
This reapplies CL 392854, with the followup fixes in CL 479255, CL 479915, and CL 481057 incorporated. CL 392854, by doujiang24 <doujiang24@gmail.com>, speeds up C-to-Go calls by binding the M to the C thread. See below for its description. CL 479255 is a followup fix for a small bug in ARM assembly code. CL 479915 is another followup fix to address C-to-Go calls made after the C code has used some stack, but that CL is also buggy. CL 481057, by Michael Knyszek, is a followup fix for a memory leak bug in CL 479915. [Original CL 392854 description] In a C thread, it's necessary to acquire an extra M by using needm when invoking a Go function from C. But needm and dropm are costly because of the signal-related syscalls. So, we change to not call dropm when returning to C, which means binding the extra M to the C thread until it exits, to avoid needm and dropm on each C-to-Go call. Instead, we only call dropm when the C thread exits, so the extra M won't leak. When invoking a Go function from C: Allocate a pthread variable using pthread_key_create, only once per shared object, and register a thread-exit-time destructor. Then store the g0 of the current m into the thread-specific value of the pthread key, only once per C thread, so that the destructor will put the extra M back onto the extra M list when the C thread exits. When returning to C: Skip dropm in cgocallback when the pthread variable has been created, so that the extra M will be reused the next time a Go function is invoked from C. This is purely a performance optimization. The old version, in which needm & dropm happen on each cgo call, is still correct too, and we have to keep the old version on systems with cgo but without pthreads, like Windows. This optimization is significant; the specific speedup depends on the OS and CPU, but in general it can be considered 10x faster for a simple Go function call from a C thread. For the newly added BenchmarkCGoInCThread, some benchmark results:
1. It's 28x faster, from 3395 ns/op to 121 ns/op, on darwin with an Intel(R) Core(TM) i7-9750H CPU @ 2.60GHz. 2. It's 6.5x faster, from 1495 ns/op to 230 ns/op, on Linux with an Intel(R) Xeon(R) CPU E5-2630 0 @ 2.30GHz. [CL 479915 description] Currently, when C calls into Go the first time, we grab an M using needm, which sets m.g0's stack bounds using the SP. We don't know how big the stack is, so we simply assume 32K. Previously, when the Go function returned to C, we dropped the M, and the next time C called into Go, we put a new stack bound on the g0 based on the current SP. After CL 392854, we don't drop the M, and the next time C calls into Go, we reuse the same g0 without recomputing the stack bounds. If the C code uses quite a bit of stack space before calling into Go, the SP may be well below the 32K stack bound we assumed, so the runtime thinks the g0 stack overflows. This CL makes needm get a more accurate stack bound from pthread. (On some platforms this may still be a guess, as we don't know exactly where we are in the C stack, but it is probably better than simply assuming 32K.) Fixes #51676. Fixes #59294. Change-Id: I9bf1400106d5c08ce621d2ed1df3a2d9e3f55494 Reviewed-on: https://go-review.googlesource.com/c/go/+/481061 Reviewed-by: Michael Knyszek <mknyszek@google.com> Run-TryBot: Cherry Mui <cherryyz@google.com> Reviewed-by: DeJiang Zhu (doujiang) <doujiang24@gmail.com> TryBot-Result: Gopher Robot <gobot@golang.org>
Diffstat (limited to 'misc')
-rw-r--r--misc/cgo/test/cgo_test.go7
-rw-r--r--misc/cgo/test/cthread_unix.c24
-rw-r--r--misc/cgo/test/cthread_windows.c22
-rw-r--r--misc/cgo/test/testx.go14
-rw-r--r--misc/cgo/testcarchive/carchive_test.go54
-rw-r--r--misc/cgo/testcarchive/testdata/libgo9/a.go14
-rw-r--r--misc/cgo/testcarchive/testdata/main9.c24
7 files changed, 156 insertions, 3 deletions
diff --git a/misc/cgo/test/cgo_test.go b/misc/cgo/test/cgo_test.go
index 5b298954f5..0c3980c12d 100644
--- a/misc/cgo/test/cgo_test.go
+++ b/misc/cgo/test/cgo_test.go
@@ -104,6 +104,7 @@ func TestThreadLock(t *testing.T) { testThreadLockFunc(t) }
func TestUnsignedInt(t *testing.T) { testUnsignedInt(t) }
func TestZeroArgCallback(t *testing.T) { testZeroArgCallback(t) }
-func BenchmarkCgoCall(b *testing.B) { benchCgoCall(b) }
-func BenchmarkGoString(b *testing.B) { benchGoString(b) }
-func BenchmarkCGoCallback(b *testing.B) { benchCallback(b) }
+func BenchmarkCgoCall(b *testing.B) { benchCgoCall(b) }
+func BenchmarkGoString(b *testing.B) { benchGoString(b) }
+func BenchmarkCGoCallback(b *testing.B) { benchCallback(b) }
+func BenchmarkCGoInCThread(b *testing.B) { benchCGoInCthread(b) }
diff --git a/misc/cgo/test/cthread_unix.c b/misc/cgo/test/cthread_unix.c
index 247d636d06..13623254a9 100644
--- a/misc/cgo/test/cthread_unix.c
+++ b/misc/cgo/test/cthread_unix.c
@@ -32,3 +32,27 @@ doAdd(int max, int nthread)
for(i=0; i<nthread; i++)
pthread_join(thread_id[i], 0);
}
+
+static void*
+goDummyCallbackThread(void* p)
+{
+ int i, max;
+
+ max = *(int*)p;
+ for(i=0; i<max; i++)
+ goDummy();
+ return NULL;
+}
+
+int
+callGoInCThread(int max)
+{
+ pthread_t thread;
+
+ if (pthread_create(&thread, NULL, goDummyCallbackThread, (void*)(&max)) != 0)
+ return -1;
+ if (pthread_join(thread, NULL) != 0)
+ return -1;
+
+ return max;
+}
diff --git a/misc/cgo/test/cthread_windows.c b/misc/cgo/test/cthread_windows.c
index 3a62ddd373..4e52209dee 100644
--- a/misc/cgo/test/cthread_windows.c
+++ b/misc/cgo/test/cthread_windows.c
@@ -35,3 +35,25 @@ doAdd(int max, int nthread)
CloseHandle((HANDLE)thread_id[i]);
}
}
+
+__stdcall
+static unsigned int
+goDummyCallbackThread(void* p)
+{
+ int i, max;
+
+ max = *(int*)p;
+ for(i=0; i<max; i++)
+ goDummy();
+ return 0;
+}
+
+int
+callGoInCThread(int max)
+{
+ uintptr_t thread_id;
+ thread_id = _beginthreadex(0, 0, goDummyCallbackThread, &max, 0, 0);
+ WaitForSingleObject((HANDLE)thread_id, INFINITE);
+ CloseHandle((HANDLE)thread_id);
+ return max;
+}
diff --git a/misc/cgo/test/testx.go b/misc/cgo/test/testx.go
index 6a8e97ddf3..0e2a51a522 100644
--- a/misc/cgo/test/testx.go
+++ b/misc/cgo/test/testx.go
@@ -24,6 +24,7 @@ import (
/*
// threads
extern void doAdd(int, int);
+extern int callGoInCThread(int);
// issue 1328
void IntoC(void);
@@ -146,6 +147,10 @@ func Add(x int) {
*p = 2
}
+//export goDummy
+func goDummy() {
+}
+
func testCthread(t *testing.T) {
if (runtime.GOOS == "darwin" || runtime.GOOS == "ios") && runtime.GOARCH == "arm64" {
t.Skip("the iOS exec wrapper is unable to properly handle the panic from Add")
@@ -159,6 +164,15 @@ func testCthread(t *testing.T) {
}
}
+// Benchmark measuring overhead from C to Go in a C thread.
+// Create a new C thread and invoke Go function repeatedly in the new C thread.
+func benchCGoInCthread(b *testing.B) {
+ n := C.callGoInCThread(C.int(b.N))
+ if int(n) != b.N {
+ b.Fatal("unmatch loop times")
+ }
+}
+
// issue 1328
//export BackIntoGo
diff --git a/misc/cgo/testcarchive/carchive_test.go b/misc/cgo/testcarchive/carchive_test.go
index 8a39c24a6d..5996268018 100644
--- a/misc/cgo/testcarchive/carchive_test.go
+++ b/misc/cgo/testcarchive/carchive_test.go
@@ -1247,3 +1247,57 @@ func TestPreemption(t *testing.T) {
t.Error(err)
}
}
+
+// Issue 59294. Test calling Go function from C after using some
+// stack space.
+func TestDeepStack(t *testing.T) {
+ t.Parallel()
+
+ if !testWork {
+ defer func() {
+ os.Remove("testp9" + exeSuffix)
+ os.Remove("libgo9.a")
+ os.Remove("libgo9.h")
+ }()
+ }
+
+ cmd := exec.Command("go", "build", "-buildmode=c-archive", "-o", "libgo9.a", "./libgo9")
+ out, err := cmd.CombinedOutput()
+ t.Logf("%v\n%s", cmd.Args, out)
+ if err != nil {
+ t.Fatal(err)
+ }
+ checkLineComments(t, "libgo9.h")
+ checkArchive(t, "libgo9.a")
+
+ // build with -O0 so the C compiler won't optimize out the large stack frame
+ ccArgs := append(cc, "-O0", "-o", "testp9"+exeSuffix, "main9.c", "libgo9.a")
+ out, err = exec.Command(ccArgs[0], ccArgs[1:]...).CombinedOutput()
+ t.Logf("%v\n%s", ccArgs, out)
+ if err != nil {
+ t.Fatal(err)
+ }
+
+ argv := cmdToRun("./testp9")
+ cmd = exec.Command(argv[0], argv[1:]...)
+ sb := new(strings.Builder)
+ cmd.Stdout = sb
+ cmd.Stderr = sb
+ if err := cmd.Start(); err != nil {
+ t.Fatal(err)
+ }
+
+ timer := time.AfterFunc(time.Minute,
+ func() {
+ t.Error("test program timed out")
+ cmd.Process.Kill()
+ },
+ )
+ defer timer.Stop()
+
+ err = cmd.Wait()
+ t.Logf("%v\n%s", cmd.Args, sb)
+ if err != nil {
+ t.Error(err)
+ }
+}
diff --git a/misc/cgo/testcarchive/testdata/libgo9/a.go b/misc/cgo/testcarchive/testdata/libgo9/a.go
new file mode 100644
index 0000000000..acb08d90ec
--- /dev/null
+++ b/misc/cgo/testcarchive/testdata/libgo9/a.go
@@ -0,0 +1,14 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package main
+
+import "runtime"
+
+import "C"
+
+func main() {}
+
+//export GoF
+func GoF() { runtime.GC() }
diff --git a/misc/cgo/testcarchive/testdata/main9.c b/misc/cgo/testcarchive/testdata/main9.c
new file mode 100644
index 0000000000..95ad4dea49
--- /dev/null
+++ b/misc/cgo/testcarchive/testdata/main9.c
@@ -0,0 +1,24 @@
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "libgo9.h"
+
+void use(int *x) { (*x)++; }
+
+void callGoFWithDeepStack() {
+ int x[10000];
+
+ use(&x[0]);
+ use(&x[9999]);
+
+ GoF();
+
+ use(&x[0]);
+ use(&x[9999]);
+}
+
+int main() {
+ GoF(); // call GoF without using much stack
+ callGoFWithDeepStack(); // call GoF with a deep stack
+}