Diffstat (limited to 'src/runtime')
-rw-r--r--  src/runtime/alg.go | 21
-rw-r--r--  src/runtime/asm_386.s | 60
-rw-r--r--  src/runtime/asm_amd64.s | 65
-rw-r--r--  src/runtime/asm_arm.s | 58
-rw-r--r--  src/runtime/asm_arm64.s | 104
-rw-r--r--  src/runtime/asm_mips64x.s | 55
-rw-r--r--  src/runtime/asm_mipsx.s | 54
-rw-r--r--  src/runtime/asm_ppc64x.s | 57
-rw-r--r--  src/runtime/asm_riscv64.s | 18
-rw-r--r--  src/runtime/asm_s390x.s | 56
-rw-r--r--  src/runtime/asm_wasm.s | 7
-rw-r--r--  src/runtime/auxv_none.go | 1
-rw-r--r--  src/runtime/cgo/asm_386.s | 12
-rw-r--r--  src/runtime/cgo/asm_amd64.s | 20
-rw-r--r--  src/runtime/cgo/asm_arm.s | 61
-rw-r--r--  src/runtime/cgo/asm_arm64.s | 19
-rw-r--r--  src/runtime/cgo/asm_mips64x.s | 19
-rw-r--r--  src/runtime/cgo/asm_mipsx.s | 19
-rw-r--r--  src/runtime/cgo/asm_ppc64x.s | 20
-rw-r--r--  src/runtime/cgo/asm_s390x.s | 14
-rw-r--r--  src/runtime/cgo/callbacks.go | 29
-rw-r--r--  src/runtime/cgo/cgo.go | 1
-rw-r--r--  src/runtime/cgo/gcc_darwin_arm64.c | 23
-rw-r--r--  src/runtime/cgo/gcc_libinit_windows.c | 1
-rw-r--r--  src/runtime/cgo/gcc_signal2_ios_arm64.c (renamed from src/runtime/cgo/gcc_signal2_darwin_arm64.c) | 0
-rw-r--r--  src/runtime/cgo/gcc_signal_ios_arm64.c (renamed from src/runtime/cgo/gcc_signal_darwin_arm64.c) | 0
-rw-r--r--  src/runtime/cgo/gcc_signal_ios_nolldb.c (renamed from src/runtime/cgo/gcc_signal_darwin_lldb.c) | 2
-rw-r--r--  src/runtime/cgo/linux.go | 74
-rw-r--r--  src/runtime/cgo/linux_syscall.c | 85
-rw-r--r--  src/runtime/cgo/signal_ios_arm64.go (renamed from src/runtime/cgo/signal_darwin_arm64.go) | 0
-rw-r--r--  src/runtime/cgo/signal_ios_arm64.s (renamed from src/runtime/cgo/signal_darwin_arm64.s) | 0
-rw-r--r--  src/runtime/cgocall.go | 166
-rw-r--r--  src/runtime/crash_cgo_test.go | 15
-rw-r--r--  src/runtime/crash_test.go | 25
-rw-r--r--  src/runtime/crash_unix_test.go | 2
-rw-r--r--  src/runtime/debug/panic_test.go | 4
-rw-r--r--  src/runtime/duff_riscv64.s | 907
-rw-r--r--  src/runtime/export_test.go | 186
-rw-r--r--  src/runtime/extern.go | 13
-rw-r--r--  src/runtime/funcdata.h | 6
-rw-r--r--  src/runtime/gc_test.go | 17
-rw-r--r--  src/runtime/heapdump.go | 79
-rw-r--r--  src/runtime/histogram.go | 148
-rw-r--r--  src/runtime/histogram_test.go | 58
-rw-r--r--  src/runtime/iface.go | 12
-rw-r--r--  src/runtime/internal/atomic/asm_386.s | 110
-rw-r--r--  src/runtime/internal/atomic/asm_amd64.s | 22
-rw-r--r--  src/runtime/internal/atomic/asm_arm.s | 165
-rw-r--r--  src/runtime/internal/atomic/asm_mips64x.s | 32
-rw-r--r--  src/runtime/internal/atomic/asm_mipsx.s | 29
-rw-r--r--  src/runtime/internal/atomic/asm_ppc64x.s | 43
-rw-r--r--  src/runtime/internal/atomic/asm_s390x.s | 22
-rw-r--r--  src/runtime/internal/atomic/atomic_386.go | 15
-rw-r--r--  src/runtime/internal/atomic/atomic_amd64.go | 24
-rw-r--r--  src/runtime/internal/atomic/atomic_arm.go | 26
-rw-r--r--  src/runtime/internal/atomic/atomic_arm64.go | 18
-rw-r--r--  src/runtime/internal/atomic/atomic_arm64.s | 33
-rw-r--r--  src/runtime/internal/atomic/atomic_mips64x.go | 18
-rw-r--r--  src/runtime/internal/atomic/atomic_mips64x.s | 8
-rw-r--r--  src/runtime/internal/atomic/atomic_mipsx.go | 14
-rw-r--r--  src/runtime/internal/atomic/atomic_ppc64x.go | 18
-rw-r--r--  src/runtime/internal/atomic/atomic_ppc64x.s | 20
-rw-r--r--  src/runtime/internal/atomic/atomic_riscv64.go | 18
-rw-r--r--  src/runtime/internal/atomic/atomic_riscv64.s | 26
-rw-r--r--  src/runtime/internal/atomic/atomic_s390x.go | 30
-rw-r--r--  src/runtime/internal/atomic/atomic_test.go | 128
-rw-r--r--  src/runtime/internal/atomic/atomic_wasm.go | 36
-rw-r--r--  src/runtime/internal/atomic/bench_test.go | 40
-rw-r--r--  src/runtime/internal/atomic/unaligned.go | 9
-rw-r--r--  src/runtime/lockrank.go | 24
-rw-r--r--  src/runtime/malloc.go | 154
-rw-r--r--  src/runtime/malloc_test.go | 57
-rw-r--r--  src/runtime/mbitmap.go | 4
-rw-r--r--  src/runtime/mcache.go | 156
-rw-r--r--  src/runtime/mcentral.go | 41
-rw-r--r--  src/runtime/mcheckmark.go | 2
-rw-r--r--  src/runtime/mem_aix.go | 12
-rw-r--r--  src/runtime/mem_bsd.go | 12
-rw-r--r--  src/runtime/mem_darwin.go | 12
-rw-r--r--  src/runtime/mem_js.go | 10
-rw-r--r--  src/runtime/mem_linux.go | 12
-rw-r--r--  src/runtime/mem_plan9.go | 12
-rw-r--r--  src/runtime/mem_windows.go | 12
-rw-r--r--  src/runtime/memmove_test.go | 39
-rw-r--r--  src/runtime/metrics.go | 485
-rw-r--r--  src/runtime/metrics/description.go | 176
-rw-r--r--  src/runtime/metrics/description_test.go | 125
-rw-r--r--  src/runtime/metrics/doc.go | 130
-rw-r--r--  src/runtime/metrics/histogram.go | 30
-rw-r--r--  src/runtime/metrics/sample.go | 37
-rw-r--r--  src/runtime/metrics/value.go | 69
-rw-r--r--  src/runtime/metrics_test.go | 224
-rw-r--r--  src/runtime/mfinal.go | 6
-rw-r--r--  src/runtime/mfixalloc.go | 4
-rw-r--r--  src/runtime/mgc.go | 217
-rw-r--r--  src/runtime/mgcmark.go | 17
-rw-r--r--  src/runtime/mgcscavenge.go | 117
-rw-r--r--  src/runtime/mgcscavenge_test.go | 53
-rw-r--r--  src/runtime/mgcsweep.go | 10
-rw-r--r--  src/runtime/mgcwork.go | 78
-rw-r--r--  src/runtime/mheap.go | 191
-rw-r--r--  src/runtime/mkduff.go | 28
-rw-r--r--  src/runtime/mkpreempt.go | 14
-rw-r--r--  src/runtime/mpagealloc.go | 202
-rw-r--r--  src/runtime/mpagealloc_32bit.go | 18
-rw-r--r--  src/runtime/mpagealloc_64bit.go | 34
-rw-r--r--  src/runtime/mpagealloc_test.go | 98
-rw-r--r--  src/runtime/mpagecache.go | 42
-rw-r--r--  src/runtime/mranges.go | 52
-rw-r--r--  src/runtime/mranges_test.go | 275
-rw-r--r--  src/runtime/mspanset.go | 4
-rw-r--r--  src/runtime/mstats.go | 531
-rw-r--r--  src/runtime/mwbbuf.go | 32
-rw-r--r--  src/runtime/netpoll.go | 47
-rw-r--r--  src/runtime/os_darwin.go | 11
-rw-r--r--  src/runtime/os_freebsd_arm64.go | 143
-rw-r--r--  src/runtime/os_freebsd_noauxv.go | 2
-rw-r--r--  src/runtime/os_js.go | 2
-rw-r--r--  src/runtime/os_linux_arm64.go | 14
-rw-r--r--  src/runtime/os_linux_s390x.go | 9
-rw-r--r--  src/runtime/os_netbsd.go | 1
-rw-r--r--  src/runtime/os_netbsd_386.go | 3
-rw-r--r--  src/runtime/os_netbsd_amd64.go | 3
-rw-r--r--  src/runtime/os_netbsd_arm.go | 3
-rw-r--r--  src/runtime/os_netbsd_arm64.go | 12
-rw-r--r--  src/runtime/os_openbsd_arm64.go | 11
-rw-r--r--  src/runtime/os_plan9.go | 2
-rw-r--r--  src/runtime/os_windows.go | 75
-rw-r--r--  src/runtime/panic.go | 9
-rw-r--r--  src/runtime/pprof/mprof_test.go | 2
-rw-r--r--  src/runtime/preempt_386.s | 39
-rw-r--r--  src/runtime/preempt_arm64.s | 3
-rw-r--r--  src/runtime/print.go | 3
-rw-r--r--  src/runtime/proc.go | 619
-rw-r--r--  src/runtime/race/README | 14
-rw-r--r--  src/runtime/race/output_test.go | 2
-rw-r--r--  src/runtime/race/race_darwin_amd64.syso | bin 449292 -> 451280 bytes
-rw-r--r--  src/runtime/race/race_freebsd_amd64.syso | bin 579744 -> 583264 bytes
-rw-r--r--  src/runtime/race/race_linux_amd64.syso | bin 521752 -> 525176 bytes
-rw-r--r--  src/runtime/race/race_linux_arm64.syso | bin 500584 -> 505224 bytes
-rw-r--r--  src/runtime/race/race_linux_ppc64le.syso | bin 623824 -> 624648 bytes
-rw-r--r--  src/runtime/race/race_netbsd_amd64.syso | bin 602664 -> 609424 bytes
-rw-r--r--  src/runtime/race/race_windows_amd64.syso | bin 458427 -> 461185 bytes
-rw-r--r--  src/runtime/rt0_darwin_arm64.s | 20
-rw-r--r--  src/runtime/rt0_ios_amd64.s | 14
-rw-r--r--  src/runtime/rt0_ios_arm64.s | 14
-rw-r--r--  src/runtime/runtime1.go | 13
-rw-r--r--  src/runtime/runtime2.go | 39
-rw-r--r--  src/runtime/signal_unix.go | 36
-rw-r--r--  src/runtime/sigtab_linux_generic.go | 2
-rw-r--r--  src/runtime/sigtab_linux_mipsx.go | 2
-rw-r--r--  src/runtime/stack.go | 14
-rw-r--r--  src/runtime/stubs.go | 29
-rw-r--r--  src/runtime/stubs32.go | 14
-rw-r--r--  src/runtime/symtab.go | 18
-rw-r--r--  src/runtime/sys_darwin_arm64.s | 42
-rw-r--r--  src/runtime/sys_freebsd_arm64.s | 21
-rw-r--r--  src/runtime/sys_windows_386.s | 103
-rw-r--r--  src/runtime/sys_windows_amd64.s | 85
-rw-r--r--  src/runtime/sys_windows_arm.s | 8
-rw-r--r--  src/runtime/syscall_windows.go | 207
-rw-r--r--  src/runtime/syscall_windows_test.go | 248
-rw-r--r--  src/runtime/testdata/testprogcgo/eintr.go | 3
-rw-r--r--  src/runtime/testdata/testprogcgo/exec.go | 3
-rw-r--r--  src/runtime/testdata/testprogcgo/needmdeadlock.go | 95
-rw-r--r--  src/runtime/time.go | 99
-rw-r--r--  src/runtime/time_test.go | 4
-rw-r--r--  src/runtime/trace.go | 5
-rw-r--r--  src/runtime/traceback.go | 2
-rw-r--r--  src/runtime/type.go | 26
-rw-r--r--  src/runtime/vlrt.go | 5
171 files changed, 7037 insertions, 2518 deletions
diff --git a/src/runtime/alg.go b/src/runtime/alg.go
index 0af48ab25c..2ec3fc3658 100644
--- a/src/runtime/alg.go
+++ b/src/runtime/alg.go
@@ -15,25 +15,6 @@ const (
c1 = uintptr((8-sys.PtrSize)/4*3267000013 + (sys.PtrSize-4)/4*23344194077549503)
)
-// type algorithms - known to compiler
-const (
- alg_NOEQ = iota
- alg_MEM0
- alg_MEM8
- alg_MEM16
- alg_MEM32
- alg_MEM64
- alg_MEM128
- alg_STRING
- alg_INTER
- alg_NILINTER
- alg_FLOAT32
- alg_FLOAT64
- alg_CPLX64
- alg_CPLX128
- alg_max
-)
-
func memhash0(p unsafe.Pointer, h uintptr) uintptr {
return h
}
@@ -185,7 +166,7 @@ func typehash(t *_type, p unsafe.Pointer, h uintptr) uintptr {
return strhash(p, h)
case kindInterface:
i := (*interfacetype)(unsafe.Pointer(t))
- if len(i.mhdr) == 0 {
+ if i.isEmpty() {
return nilinterhash(p, h)
}
return interhash(p, h)
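Note: the i.isEmpty() call introduced above relies on an accessor on interfacetype added elsewhere in this change (see the type.go entry in the diffstat). A minimal sketch of what that accessor must reduce to, given the check it replaces (an assumption, not the literal definition):

	// isEmpty reports whether the interface type describes the empty
	// interface, i.e. it declares no methods.
	func (it *interfacetype) isEmpty() bool {
		return len(it.mhdr) == 0
	}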
diff --git a/src/runtime/asm_386.s b/src/runtime/asm_386.s
index 11863fba39..fa3b1be339 100644
--- a/src/runtime/asm_386.s
+++ b/src/runtime/asm_386.s
@@ -702,25 +702,9 @@ nosave:
MOVL AX, ret+8(FP)
RET
-// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
-// Turn the fn into a Go func (by taking its address) and call
-// cgocallback_gofunc.
-TEXT runtime·cgocallback(SB),NOSPLIT,$16-16
- LEAL fn+0(FP), AX
- MOVL AX, 0(SP)
- MOVL frame+4(FP), AX
- MOVL AX, 4(SP)
- MOVL framesize+8(FP), AX
- MOVL AX, 8(SP)
- MOVL ctxt+12(FP), AX
- MOVL AX, 12(SP)
- MOVL $runtime·cgocallback_gofunc(SB), AX
- CALL AX
- RET
-
-// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
+// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr)
// See cgocall.go for more details.
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-16
+TEXT ·cgocallback(SB),NOSPLIT,$12-12 // Frame size must match commented places below
NO_LOCAL_POINTERS
// If g is nil, Go did not create the current thread.
@@ -738,13 +722,12 @@ TEXT ·cgocallback_gofunc(SB),NOSPLIT,$12-16
CMPL BP, $0
JEQ needm
MOVL g_m(BP), BP
- MOVL BP, DX // saved copy of oldm
+ MOVL BP, savedm-4(SP) // saved copy of oldm
JMP havem
needm:
- MOVL $0, 0(SP)
MOVL $runtime·needm(SB), AX
CALL AX
- MOVL 0(SP), DX
+ MOVL $0, savedm-4(SP) // dropm on return
get_tls(CX)
MOVL g(CX), BP
MOVL g_m(BP), BP
@@ -780,34 +763,32 @@ havem:
// save that information (m->curg->sched) so we can restore it.
// We can restore m->curg->sched.sp easily, because calling
// runtime.cgocallbackg leaves SP unchanged upon return.
- // To save m->curg->sched.pc, we push it onto the stack.
- // This has the added benefit that it looks to the traceback
- // routine like cgocallbackg is going to return to that
- // PC (because the frame we allocate below has the same
- // size as cgocallback_gofunc's frame declared above)
- // so that the traceback will seamlessly trace back into
- // the earlier calls.
- //
- // In the new goroutine, 4(SP) holds the saved oldm (DX) register.
- // 8(SP) is unused.
+ // To save m->curg->sched.pc, we push it onto the curg stack and
+ // open a frame the same size as cgocallback's g0 frame.
+ // Once we switch to the curg stack, the pushed PC will appear
+ // to be the return PC of cgocallback, so that the traceback
+ // will seamlessly trace back into the earlier calls.
MOVL m_curg(BP), SI
MOVL SI, g(CX)
MOVL (g_sched+gobuf_sp)(SI), DI // prepare stack as DI
MOVL (g_sched+gobuf_pc)(SI), BP
- MOVL BP, -4(DI)
- MOVL ctxt+12(FP), CX
- LEAL -(4+12)(DI), SP
- MOVL DX, 4(SP)
- MOVL CX, 0(SP)
+ MOVL BP, -4(DI) // "push" return PC on the g stack
+ // Gather our arguments into registers.
+ MOVL fn+0(FP), AX
+ MOVL frame+4(FP), BX
+ MOVL ctxt+8(FP), CX
+ LEAL -(4+12)(DI), SP // Must match declared frame size
+ MOVL AX, 0(SP)
+ MOVL BX, 4(SP)
+ MOVL CX, 8(SP)
CALL runtime·cgocallbackg(SB)
- MOVL 4(SP), DX
// Restore g->sched (== m->curg->sched) from saved values.
get_tls(CX)
MOVL g(CX), SI
- MOVL 12(SP), BP
+ MOVL 12(SP), BP // Must match declared frame size
MOVL BP, (g_sched+gobuf_pc)(SI)
- LEAL (12+4)(SP), DI
+ LEAL (12+4)(SP), DI // Must match declared frame size
MOVL DI, (g_sched+gobuf_sp)(SI)
// Switch back to m->g0's stack and restore m->g0->sched.sp.
@@ -823,6 +804,7 @@ havem:
// If the m on entry was nil, we called needm above to borrow an m
// for the duration of the call. Since the call is over, return it with dropm.
+ MOVL savedm-4(SP), DX
CMPL DX, $0
JNE 3(PC)
MOVL $runtime·dropm(SB), AX
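The rewritten entry point above gives cgocallback its new Go-visible signature, cgocallback(fn, frame unsafe.Pointer, ctxt uintptr): the framesize argument is gone and the separate cgocallback_gofunc trampoline no longer exists. User code reaches this path through an ordinary exported Go function: the C wrapper that cmd/cgo generates calls crosscall2, which (per the runtime/cgo assembly changes later in this diff) now calls runtime·cgocallback directly. A minimal, hypothetical two-file cgo program that exercises the path:

	// main.go
	package main

	// extern void callFromC(void);
	import "C"

	import "fmt"

	//export GoCallback
	func GoCallback(x C.int) {
		fmt.Println("called back from C with", x)
	}

	func main() {
		C.callFromC()
	}

	// cfuncs.go -- the C definition lives in a separate file because a
	// preamble in a file that uses //export may only contain declarations.
	package main

	/*
	extern void GoCallback(int);
	void callFromC(void) { GoCallback(42); }
	*/
	import "C"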
diff --git a/src/runtime/asm_amd64.s b/src/runtime/asm_amd64.s
index fa25c55b96..19a3bb2d7d 100644
--- a/src/runtime/asm_amd64.s
+++ b/src/runtime/asm_amd64.s
@@ -470,6 +470,7 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
TEXT ·reflectcall(SB), NOSPLIT, $0-32
MOVLQZX argsize+24(FP), CX
+ DISPATCH(runtime·call16, 16)
DISPATCH(runtime·call32, 32)
DISPATCH(runtime·call64, 64)
DISPATCH(runtime·call128, 128)
@@ -537,6 +538,7 @@ TEXT callRet<>(SB), NOSPLIT, $32-0
CALL runtime·reflectcallmove(SB)
RET
+CALLFN(·call16, 16)
CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
@@ -689,25 +691,9 @@ nosave:
MOVL AX, ret+16(FP)
RET
-// func cgocallback(fn, frame unsafe.Pointer, framesize, ctxt uintptr)
-// Turn the fn into a Go func (by taking its address) and call
-// cgocallback_gofunc.
-TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
- LEAQ fn+0(FP), AX
- MOVQ AX, 0(SP)
- MOVQ frame+8(FP), AX
- MOVQ AX, 8(SP)
- MOVQ framesize+16(FP), AX
- MOVQ AX, 16(SP)
- MOVQ ctxt+24(FP), AX
- MOVQ AX, 24(SP)
- MOVQ $runtime·cgocallback_gofunc(SB), AX
- CALL AX
- RET
-
-// func cgocallback_gofunc(fn, frame, framesize, ctxt uintptr)
+// func cgocallback(fn, frame unsafe.Pointer, ctxt uintptr)
// See cgocall.go for more details.
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
+TEXT ·cgocallback(SB),NOSPLIT,$24-24
NO_LOCAL_POINTERS
// If g is nil, Go did not create the current thread.
@@ -725,13 +711,12 @@ TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
CMPQ BX, $0
JEQ needm
MOVQ g_m(BX), BX
- MOVQ BX, R8 // holds oldm until end of function
+ MOVQ BX, savedm-8(SP) // saved copy of oldm
JMP havem
needm:
- MOVQ $0, 0(SP)
- MOVQ $runtime·needm(SB), AX
+ MOVQ $runtime·needm(SB), AX
CALL AX
- MOVQ 0(SP), R8
+ MOVQ $0, savedm-8(SP) // dropm on return
get_tls(CX)
MOVQ g(CX), BX
MOVQ g_m(BX), BX
@@ -767,37 +752,36 @@ havem:
// save that information (m->curg->sched) so we can restore it.
// We can restore m->curg->sched.sp easily, because calling
// runtime.cgocallbackg leaves SP unchanged upon return.
- // To save m->curg->sched.pc, we push it onto the stack.
- // This has the added benefit that it looks to the traceback
- // routine like cgocallbackg is going to return to that
- // PC (because the frame we allocate below has the same
- // size as cgocallback_gofunc's frame declared above)
- // so that the traceback will seamlessly trace back into
- // the earlier calls.
- //
- // In the new goroutine, 8(SP) holds the saved R8.
+ // To save m->curg->sched.pc, we push it onto the curg stack and
+ // open a frame the same size as cgocallback's g0 frame.
+ // Once we switch to the curg stack, the pushed PC will appear
+ // to be the return PC of cgocallback, so that the traceback
+ // will seamlessly trace back into the earlier calls.
MOVQ m_curg(BX), SI
MOVQ SI, g(CX)
MOVQ (g_sched+gobuf_sp)(SI), DI // prepare stack as DI
MOVQ (g_sched+gobuf_pc)(SI), BX
- MOVQ BX, -8(DI)
+ MOVQ BX, -8(DI) // "push" return PC on the g stack
+ // Gather our arguments into registers.
+ MOVQ fn+0(FP), BX
+ MOVQ frame+8(FP), CX
+ MOVQ ctxt+16(FP), DX
// Compute the size of the frame, including return PC and, if
// GOEXPERIMENT=framepointer, the saved base pointer
- MOVQ ctxt+24(FP), BX
- LEAQ fv+0(FP), AX
- SUBQ SP, AX
- SUBQ AX, DI
+ LEAQ fn+0(FP), AX
+ SUBQ SP, AX // AX is our actual frame size
+ SUBQ AX, DI // Allocate the same frame size on the g stack
MOVQ DI, SP
- MOVQ R8, 8(SP)
MOVQ BX, 0(SP)
+ MOVQ CX, 8(SP)
+ MOVQ DX, 16(SP)
CALL runtime·cgocallbackg(SB)
- MOVQ 8(SP), R8
// Compute the size of the frame again. FP and SP have
// completely different values here than they did above,
// but only their difference matters.
- LEAQ fv+0(FP), AX
+ LEAQ fn+0(FP), AX
SUBQ SP, AX
// Restore g->sched (== m->curg->sched) from saved values.
@@ -822,7 +806,8 @@ havem:
// If the m on entry was nil, we called needm above to borrow an m
// for the duration of the call. Since the call is over, return it with dropm.
- CMPQ R8, $0
+ MOVQ savedm-8(SP), BX
+ CMPQ BX, $0
JNE 3(PC)
MOVQ $runtime·dropm(SB), AX
CALL AX
diff --git a/src/runtime/asm_arm.s b/src/runtime/asm_arm.s
index 51a50c604c..c54b4eb006 100644
--- a/src/runtime/asm_arm.s
+++ b/src/runtime/asm_arm.s
@@ -643,25 +643,9 @@ nosave:
MOVW R0, ret+8(FP)
RET
-// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
-// Turn the fn into a Go func (by taking its address) and call
-// cgocallback_gofunc.
-TEXT runtime·cgocallback(SB),NOSPLIT,$16-16
- MOVW $fn+0(FP), R0
- MOVW R0, 4(R13)
- MOVW frame+4(FP), R0
- MOVW R0, 8(R13)
- MOVW framesize+8(FP), R0
- MOVW R0, 12(R13)
- MOVW ctxt+12(FP), R0
- MOVW R0, 16(R13)
- MOVW $runtime·cgocallback_gofunc(SB), R0
- BL (R0)
- RET
-
-// cgocallback_gofunc(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
+// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr)
// See cgocall.go for more details.
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-16
+TEXT ·cgocallback(SB),NOSPLIT,$12-12
NO_LOCAL_POINTERS
// Load m and g from thread-local storage.
@@ -686,7 +670,7 @@ needm:
MOVW $runtime·needm(SB), R0
BL (R0)
- // Set m->sched.sp = SP, so that if a panic happens
+ // Set m->g0->sched.sp = SP, so that if a panic happens
// during the function we are about to execute, it will
// have a valid SP to run on the g0 stack.
// The next few lines (after the havem label)
@@ -706,10 +690,10 @@ havem:
// Save current m->g0->sched.sp on stack and then set it to SP.
// Save current sp in m->g0->sched.sp in preparation for
// switch back to m->curg stack.
- // NOTE: unwindm knows that the saved g->sched.sp is at 4(R13) aka savedsp-8(SP).
+ // NOTE: unwindm knows that the saved g->sched.sp is at 4(R13) aka savedsp-12(SP).
MOVW m_g0(R8), R3
MOVW (g_sched+gobuf_sp)(R3), R4
- MOVW R4, savedsp-8(SP)
+ MOVW R4, savedsp-12(SP) // must match frame size
MOVW R13, (g_sched+gobuf_sp)(R3)
// Switch to m->curg stack and call runtime.cgocallbackg.
@@ -718,30 +702,30 @@ havem:
// save that information (m->curg->sched) so we can restore it.
// We can restore m->curg->sched.sp easily, because calling
// runtime.cgocallbackg leaves SP unchanged upon return.
- // To save m->curg->sched.pc, we push it onto the stack.
- // This has the added benefit that it looks to the traceback
- // routine like cgocallbackg is going to return to that
- // PC (because the frame we allocate below has the same
- // size as cgocallback_gofunc's frame declared above)
- // so that the traceback will seamlessly trace back into
- // the earlier calls.
- //
- // In the new goroutine, -4(SP) is unused (where SP refers to
- // m->curg's SP while we're setting it up, before we've adjusted it).
+ // To save m->curg->sched.pc, we push it onto the curg stack and
+ // open a frame the same size as cgocallback's g0 frame.
+ // Once we switch to the curg stack, the pushed PC will appear
+ // to be the return PC of cgocallback, so that the traceback
+ // will seamlessly trace back into the earlier calls.
MOVW m_curg(R8), R0
BL setg<>(SB)
MOVW (g_sched+gobuf_sp)(g), R4 // prepare stack as R4
MOVW (g_sched+gobuf_pc)(g), R5
- MOVW R5, -12(R4)
- MOVW ctxt+12(FP), R0
- MOVW R0, -8(R4)
- MOVW $-12(R4), R13
+ MOVW R5, -(12+4)(R4) // "saved LR"; must match frame size
+ // Gather our arguments into registers.
+ MOVW fn+0(FP), R1
+ MOVW frame+4(FP), R2
+ MOVW ctxt+8(FP), R3
+ MOVW $-(12+4)(R4), R13 // switch stack; must match frame size
+ MOVW R1, 4(R13)
+ MOVW R2, 8(R13)
+ MOVW R3, 12(R13)
BL runtime·cgocallbackg(SB)
// Restore g->sched (== m->curg->sched) from saved values.
MOVW 0(R13), R5
MOVW R5, (g_sched+gobuf_pc)(g)
- MOVW $12(R13), R4
+ MOVW $(12+4)(R13), R4 // must match frame size
MOVW R4, (g_sched+gobuf_sp)(g)
// Switch back to m->g0's stack and restore m->g0->sched.sp.
@@ -751,7 +735,7 @@ havem:
MOVW m_g0(R8), R0
BL setg<>(SB)
MOVW (g_sched+gobuf_sp)(g), R13
- MOVW savedsp-8(SP), R4
+ MOVW savedsp-12(SP), R4 // must match frame size
MOVW R4, (g_sched+gobuf_sp)(g)
// If the m on entry was nil, we called needm above to borrow an m
diff --git a/src/runtime/asm_arm64.s b/src/runtime/asm_arm64.s
index 6b3d1e779e..a45e342478 100644
--- a/src/runtime/asm_arm64.s
+++ b/src/runtime/asm_arm64.s
@@ -331,6 +331,7 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32
MOVWU argsize+24(FP), R16
+ DISPATCH(runtime·call16, 16)
DISPATCH(runtime·call32, 32)
DISPATCH(runtime·call64, 64)
DISPATCH(runtime·call128, 128)
@@ -414,34 +415,33 @@ TEXT callRet<>(SB), NOSPLIT, $40-0
BL runtime·reflectcallmove(SB)
RET
-// These have 8 added to make the overall frame size a multiple of 16,
-// as required by the ABI. (There is another +8 for the saved LR.)
-CALLFN(·call32, 40 )
-CALLFN(·call64, 72 )
-CALLFN(·call128, 136 )
-CALLFN(·call256, 264 )
-CALLFN(·call512, 520 )
-CALLFN(·call1024, 1032 )
-CALLFN(·call2048, 2056 )
-CALLFN(·call4096, 4104 )
-CALLFN(·call8192, 8200 )
-CALLFN(·call16384, 16392 )
-CALLFN(·call32768, 32776 )
-CALLFN(·call65536, 65544 )
-CALLFN(·call131072, 131080 )
-CALLFN(·call262144, 262152 )
-CALLFN(·call524288, 524296 )
-CALLFN(·call1048576, 1048584 )
-CALLFN(·call2097152, 2097160 )
-CALLFN(·call4194304, 4194312 )
-CALLFN(·call8388608, 8388616 )
-CALLFN(·call16777216, 16777224 )
-CALLFN(·call33554432, 33554440 )
-CALLFN(·call67108864, 67108872 )
-CALLFN(·call134217728, 134217736 )
-CALLFN(·call268435456, 268435464 )
-CALLFN(·call536870912, 536870920 )
-CALLFN(·call1073741824, 1073741832 )
+CALLFN(·call16, 16)
+CALLFN(·call32, 32)
+CALLFN(·call64, 64)
+CALLFN(·call128, 128)
+CALLFN(·call256, 256)
+CALLFN(·call512, 512)
+CALLFN(·call1024, 1024)
+CALLFN(·call2048, 2048)
+CALLFN(·call4096, 4096)
+CALLFN(·call8192, 8192)
+CALLFN(·call16384, 16384)
+CALLFN(·call32768, 32768)
+CALLFN(·call65536, 65536)
+CALLFN(·call131072, 131072)
+CALLFN(·call262144, 262144)
+CALLFN(·call524288, 524288)
+CALLFN(·call1048576, 1048576)
+CALLFN(·call2097152, 2097152)
+CALLFN(·call4194304, 4194304)
+CALLFN(·call8388608, 8388608)
+CALLFN(·call16777216, 16777216)
+CALLFN(·call33554432, 33554432)
+CALLFN(·call67108864, 67108864)
+CALLFN(·call134217728, 134217728)
+CALLFN(·call268435456, 268435456)
+CALLFN(·call536870912, 536870912)
+CALLFN(·call1073741824, 1073741824)
// func memhash32(p unsafe.Pointer, h uintptr) uintptr
TEXT runtime·memhash32(SB),NOSPLIT|NOFRAME,$0-24
@@ -958,25 +958,9 @@ nosave:
MOVD R0, ret+16(FP)
RET
-// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
-// Turn the fn into a Go func (by taking its address) and call
-// cgocallback_gofunc.
-TEXT runtime·cgocallback(SB),NOSPLIT,$40-32
- MOVD $fn+0(FP), R0
- MOVD R0, 8(RSP)
- MOVD frame+8(FP), R0
- MOVD R0, 16(RSP)
- MOVD framesize+16(FP), R0
- MOVD R0, 24(RSP)
- MOVD ctxt+24(FP), R0
- MOVD R0, 32(RSP)
- MOVD $runtime·cgocallback_gofunc(SB), R0
- BL (R0)
- RET
-
-// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
+// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr)
// See cgocall.go for more details.
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$24-32
+TEXT ·cgocallback(SB),NOSPLIT,$24-24
NO_LOCAL_POINTERS
// Load g from thread-local storage.
@@ -1001,7 +985,7 @@ needm:
MOVD $runtime·needm(SB), R0
BL (R0)
- // Set m->sched.sp = SP, so that if a panic happens
+ // Set m->g0->sched.sp = SP, so that if a panic happens
// during the function we are about to execute, it will
// have a valid SP to run on the g0 stack.
// The next few lines (after the havem label)
@@ -1037,16 +1021,11 @@ havem:
// save that information (m->curg->sched) so we can restore it.
// We can restore m->curg->sched.sp easily, because calling
// runtime.cgocallbackg leaves SP unchanged upon return.
- // To save m->curg->sched.pc, we push it onto the stack.
- // This has the added benefit that it looks to the traceback
- // routine like cgocallbackg is going to return to that
- // PC (because the frame we allocate below has the same
- // size as cgocallback_gofunc's frame declared above)
- // so that the traceback will seamlessly trace back into
- // the earlier calls.
- //
- // In the new goroutine, -8(SP) is unused (where SP refers to
- // m->curg's SP while we're setting it up, before we've adjusted it).
+ // To save m->curg->sched.pc, we push it onto the curg stack and
+ // open a frame the same size as cgocallback's g0 frame.
+ // Once we switch to the curg stack, the pushed PC will appear
+ // to be the return PC of cgocallback, so that the traceback
+ // will seamlessly trace back into the earlier calls.
MOVD m_curg(R8), g
BL runtime·save_g(SB)
MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4
@@ -1054,10 +1033,15 @@ havem:
MOVD R5, -48(R4)
MOVD (g_sched+gobuf_bp)(g), R5
MOVD R5, -56(R4)
- MOVD ctxt+24(FP), R0
- MOVD R0, -40(R4)
+ // Gather our arguments into registers.
+ MOVD fn+0(FP), R1
+ MOVD frame+8(FP), R2
+ MOVD ctxt+16(FP), R3
MOVD $-48(R4), R0 // maintain 16-byte SP alignment
- MOVD R0, RSP
+ MOVD R0, RSP // switch stack
+ MOVD R1, 8(RSP)
+ MOVD R2, 16(RSP)
+ MOVD R3, 24(RSP)
BL runtime·cgocallbackg(SB)
// Restore g->sched (== m->curg->sched) from saved values.
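The DISPATCH and CALLFN additions above (mirrored for the other architectures in this diff) introduce a runtime·call16 stub, so reflect calls whose combined argument-and-result frame fits in 16 bytes no longer round up to call32. A small, hypothetical illustration of a call that lands in the new bucket:

	package main

	import (
		"fmt"
		"reflect"
	)

	// add's reflectcall frame (two int32 arguments plus an int32 result)
	// is well under 16 bytes, so it dispatches through the new call16.
	func add(a, b int32) int32 { return a + b }

	func main() {
		fn := reflect.ValueOf(add)
		args := []reflect.Value{reflect.ValueOf(int32(1)), reflect.ValueOf(int32(2))}
		fmt.Println(fn.Call(args)[0].Int()) // 3
	}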
diff --git a/src/runtime/asm_mips64x.s b/src/runtime/asm_mips64x.s
index 7330f40e85..19781f7885 100644
--- a/src/runtime/asm_mips64x.s
+++ b/src/runtime/asm_mips64x.s
@@ -294,6 +294,7 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32
MOVWU argsize+24(FP), R1
+ DISPATCH(runtime·call16, 16)
DISPATCH(runtime·call32, 32)
DISPATCH(runtime·call64, 64)
DISPATCH(runtime·call128, 128)
@@ -470,25 +471,9 @@ g0:
MOVW R2, ret+16(FP)
RET
-// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
-// Turn the fn into a Go func (by taking its address) and call
-// cgocallback_gofunc.
-TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
- MOVV $fn+0(FP), R1
- MOVV R1, 8(R29)
- MOVV frame+8(FP), R1
- MOVV R1, 16(R29)
- MOVV framesize+16(FP), R1
- MOVV R1, 24(R29)
- MOVV ctxt+24(FP), R1
- MOVV R1, 32(R29)
- MOVV $runtime·cgocallback_gofunc(SB), R1
- JAL (R1)
- RET
-
-// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
+// func cgocallback(fn, frame unsafe.Pointer, ctxt uintptr)
// See cgocall.go for more details.
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
+TEXT ·cgocallback(SB),NOSPLIT,$24-24
NO_LOCAL_POINTERS
// Load m and g from thread-local storage.
@@ -536,7 +521,7 @@ havem:
// NOTE: unwindm knows that the saved g->sched.sp is at 8(R29) aka savedsp-16(SP).
MOVV m_g0(R3), R1
MOVV (g_sched+gobuf_sp)(R1), R2
- MOVV R2, savedsp-16(SP)
+ MOVV R2, savedsp-24(SP) // must match frame size
MOVV R29, (g_sched+gobuf_sp)(R1)
// Switch to m->curg stack and call runtime.cgocallbackg.
@@ -545,30 +530,30 @@ havem:
// save that information (m->curg->sched) so we can restore it.
// We can restore m->curg->sched.sp easily, because calling
// runtime.cgocallbackg leaves SP unchanged upon return.
- // To save m->curg->sched.pc, we push it onto the stack.
- // This has the added benefit that it looks to the traceback
- // routine like cgocallbackg is going to return to that
- // PC (because the frame we allocate below has the same
- // size as cgocallback_gofunc's frame declared above)
- // so that the traceback will seamlessly trace back into
- // the earlier calls.
- //
- // In the new goroutine, -8(SP) is unused (where SP refers to
- // m->curg's SP while we're setting it up, before we've adjusted it).
+ // To save m->curg->sched.pc, we push it onto the curg stack and
+ // open a frame the same size as cgocallback's g0 frame.
+ // Once we switch to the curg stack, the pushed PC will appear
+ // to be the return PC of cgocallback, so that the traceback
+ // will seamlessly trace back into the earlier calls.
MOVV m_curg(R3), g
JAL runtime·save_g(SB)
MOVV (g_sched+gobuf_sp)(g), R2 // prepare stack as R2
MOVV (g_sched+gobuf_pc)(g), R4
- MOVV R4, -24(R2)
- MOVV ctxt+24(FP), R1
- MOVV R1, -16(R2)
- MOVV $-24(R2), R29
+ MOVV R4, -(24+8)(R2) // "saved LR"; must match frame size
+ // Gather our arguments into registers.
+ MOVV fn+0(FP), R5
+ MOVV frame+8(FP), R6
+ MOVV ctxt+16(FP), R7
+ MOVV $-(24+8)(R2), R29 // switch stack; must match frame size
+ MOVV R5, 8(R29)
+ MOVV R6, 16(R29)
+ MOVV R7, 24(R29)
JAL runtime·cgocallbackg(SB)
// Restore g->sched (== m->curg->sched) from saved values.
MOVV 0(R29), R4
MOVV R4, (g_sched+gobuf_pc)(g)
- MOVV $24(R29), R2
+ MOVV $(24+8)(R29), R2 // must match frame size
MOVV R2, (g_sched+gobuf_sp)(g)
// Switch back to m->g0's stack and restore m->g0->sched.sp.
@@ -578,7 +563,7 @@ havem:
MOVV m_g0(R3), g
JAL runtime·save_g(SB)
MOVV (g_sched+gobuf_sp)(g), R29
- MOVV savedsp-16(SP), R2
+ MOVV savedsp-24(SP), R2 // must match frame size
MOVV R2, (g_sched+gobuf_sp)(g)
// If the m on entry was nil, we called needm above to borrow an m
diff --git a/src/runtime/asm_mipsx.s b/src/runtime/asm_mipsx.s
index aca0510b69..ee87d81436 100644
--- a/src/runtime/asm_mipsx.s
+++ b/src/runtime/asm_mipsx.s
@@ -472,25 +472,9 @@ g0:
MOVW R2, ret+8(FP)
RET
-// cgocallback(void (*fn)(void*), void *frame, uintptr framesize)
-// Turn the fn into a Go func (by taking its address) and call
-// cgocallback_gofunc.
-TEXT runtime·cgocallback(SB),NOSPLIT,$16-16
- MOVW $fn+0(FP), R1
- MOVW R1, 4(R29)
- MOVW frame+4(FP), R1
- MOVW R1, 8(R29)
- MOVW framesize+8(FP), R1
- MOVW R1, 12(R29)
- MOVW ctxt+12(FP), R1
- MOVW R1, 16(R29)
- MOVW $runtime·cgocallback_gofunc(SB), R1
- JAL (R1)
- RET
-
-// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
+// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr)
// See cgocall.go for more details.
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$8-16
+TEXT ·cgocallback(SB),NOSPLIT,$12-12
NO_LOCAL_POINTERS
// Load m and g from thread-local storage.
@@ -538,7 +522,7 @@ havem:
// NOTE: unwindm knows that the saved g->sched.sp is at 4(R29) aka savedsp-8(SP).
MOVW m_g0(R3), R1
MOVW (g_sched+gobuf_sp)(R1), R2
- MOVW R2, savedsp-8(SP)
+ MOVW R2, savedsp-12(SP) // must match frame size
MOVW R29, (g_sched+gobuf_sp)(R1)
// Switch to m->curg stack and call runtime.cgocallbackg.
@@ -547,30 +531,30 @@ havem:
// save that information (m->curg->sched) so we can restore it.
// We can restore m->curg->sched.sp easily, because calling
// runtime.cgocallbackg leaves SP unchanged upon return.
- // To save m->curg->sched.pc, we push it onto the stack.
- // This has the added benefit that it looks to the traceback
- // routine like cgocallbackg is going to return to that
- // PC (because the frame we allocate below has the same
- // size as cgocallback_gofunc's frame declared above)
- // so that the traceback will seamlessly trace back into
- // the earlier calls.
- //
- // In the new goroutine, -4(SP) is unused (where SP refers to
- // m->curg's SP while we're setting it up, before we've adjusted it).
+ // To save m->curg->sched.pc, we push it onto the curg stack and
+ // open a frame the same size as cgocallback's g0 frame.
+ // Once we switch to the curg stack, the pushed PC will appear
+ // to be the return PC of cgocallback, so that the traceback
+ // will seamlessly trace back into the earlier calls.
MOVW m_curg(R3), g
JAL runtime·save_g(SB)
MOVW (g_sched+gobuf_sp)(g), R2 // prepare stack as R2
MOVW (g_sched+gobuf_pc)(g), R4
- MOVW R4, -12(R2)
- MOVW ctxt+12(FP), R1
- MOVW R1, -8(R2)
- MOVW $-12(R2), R29
+ MOVW R4, -(12+4)(R2) // "saved LR"; must match frame size
+ // Gather our arguments into registers.
+ MOVW fn+0(FP), R5
+ MOVW frame+4(FP), R6
+ MOVW ctxt+8(FP), R7
+ MOVW $-(12+4)(R2), R29 // switch stack; must match frame size
+ MOVW R5, 4(R29)
+ MOVW R6, 8(R29)
+ MOVW R7, 12(R29)
JAL runtime·cgocallbackg(SB)
// Restore g->sched (== m->curg->sched) from saved values.
MOVW 0(R29), R4
MOVW R4, (g_sched+gobuf_pc)(g)
- MOVW $12(R29), R2
+ MOVW $(12+4)(R29), R2 // must match frame size
MOVW R2, (g_sched+gobuf_sp)(g)
// Switch back to m->g0's stack and restore m->g0->sched.sp.
@@ -580,7 +564,7 @@ havem:
MOVW m_g0(R3), g
JAL runtime·save_g(SB)
MOVW (g_sched+gobuf_sp)(g), R29
- MOVW savedsp-8(SP), R2
+ MOVW savedsp-12(SP), R2 // must match frame size
MOVW R2, (g_sched+gobuf_sp)(g)
// If the m on entry was nil, we called needm above to borrow an m
diff --git a/src/runtime/asm_ppc64x.s b/src/runtime/asm_ppc64x.s
index 23387a2165..dc34c0e4c8 100644
--- a/src/runtime/asm_ppc64x.s
+++ b/src/runtime/asm_ppc64x.s
@@ -372,6 +372,7 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32
MOVWZ argsize+24(FP), R3
+ DISPATCH(runtime·call16, 16)
DISPATCH(runtime·call32, 32)
DISPATCH(runtime·call64, 64)
DISPATCH(runtime·call128, 128)
@@ -478,6 +479,7 @@ TEXT callRet<>(SB), NOSPLIT, $32-0
BL runtime·reflectcallmove(SB)
RET
+CALLFN(·call16, 16)
CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
@@ -649,26 +651,9 @@ g0:
MOVW R3, ret+16(FP)
RET
-// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
-// Turn the fn into a Go func (by taking its address) and call
-// cgocallback_gofunc.
-TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
- MOVD $fn+0(FP), R3
- MOVD R3, FIXED_FRAME+0(R1)
- MOVD frame+8(FP), R3
- MOVD R3, FIXED_FRAME+8(R1)
- MOVD framesize+16(FP), R3
- MOVD R3, FIXED_FRAME+16(R1)
- MOVD ctxt+24(FP), R3
- MOVD R3, FIXED_FRAME+24(R1)
- MOVD $runtime·cgocallback_gofunc(SB), R12
- MOVD R12, CTR
- BL (CTR)
- RET
-
-// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
+// func cgocallback(fn, frame unsafe.Pointer, ctxt uintptr)
// See cgocall.go for more details.
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
+TEXT ·cgocallback(SB),NOSPLIT,$24-24
NO_LOCAL_POINTERS
// Load m and g from thread-local storage.
@@ -719,7 +704,7 @@ havem:
// NOTE: unwindm knows that the saved g->sched.sp is at 8(R1) aka savedsp-16(SP).
MOVD m_g0(R8), R3
MOVD (g_sched+gobuf_sp)(R3), R4
- MOVD R4, savedsp-16(SP)
+ MOVD R4, savedsp-24(SP) // must match frame size
MOVD R1, (g_sched+gobuf_sp)(R3)
// Switch to m->curg stack and call runtime.cgocallbackg.
@@ -728,30 +713,30 @@ havem:
// save that information (m->curg->sched) so we can restore it.
// We can restore m->curg->sched.sp easily, because calling
// runtime.cgocallbackg leaves SP unchanged upon return.
- // To save m->curg->sched.pc, we push it onto the stack.
- // This has the added benefit that it looks to the traceback
- // routine like cgocallbackg is going to return to that
- // PC (because the frame we allocate below has the same
- // size as cgocallback_gofunc's frame declared above)
- // so that the traceback will seamlessly trace back into
- // the earlier calls.
- //
- // In the new goroutine, -8(SP) is unused (where SP refers to
- // m->curg's SP while we're setting it up, before we've adjusted it).
+ // To save m->curg->sched.pc, we push it onto the curg stack and
+ // open a frame the same size as cgocallback's g0 frame.
+ // Once we switch to the curg stack, the pushed PC will appear
+ // to be the return PC of cgocallback, so that the traceback
+ // will seamlessly trace back into the earlier calls.
MOVD m_curg(R8), g
BL runtime·save_g(SB)
MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4
MOVD (g_sched+gobuf_pc)(g), R5
- MOVD R5, -(FIXED_FRAME+16)(R4)
- MOVD ctxt+24(FP), R3
- MOVD R3, -16(R4)
- MOVD $-(FIXED_FRAME+16)(R4), R1
+ MOVD R5, -(24+FIXED_FRAME)(R4) // "saved LR"; must match frame size
+ // Gather our arguments into registers.
+ MOVD fn+0(FP), R5
+ MOVD frame+8(FP), R6
+ MOVD ctxt+16(FP), R7
+ MOVD $-(24+FIXED_FRAME)(R4), R1 // switch stack; must match frame size
+ MOVD R5, FIXED_FRAME+0(R1)
+ MOVD R6, FIXED_FRAME+8(R1)
+ MOVD R7, FIXED_FRAME+16(R1)
BL runtime·cgocallbackg(SB)
// Restore g->sched (== m->curg->sched) from saved values.
MOVD 0(R1), R5
MOVD R5, (g_sched+gobuf_pc)(g)
- MOVD $(FIXED_FRAME+16)(R1), R4
+ MOVD $(24+FIXED_FRAME)(R1), R4 // must match frame size
MOVD R4, (g_sched+gobuf_sp)(g)
// Switch back to m->g0's stack and restore m->g0->sched.sp.
@@ -761,7 +746,7 @@ havem:
MOVD m_g0(R8), g
BL runtime·save_g(SB)
MOVD (g_sched+gobuf_sp)(g), R1
- MOVD savedsp-16(SP), R4
+ MOVD savedsp-24(SP), R4 // must match frame size
MOVD R4, (g_sched+gobuf_sp)(g)
// If the m on entry was nil, we called needm above to borrow an m
diff --git a/src/runtime/asm_riscv64.s b/src/runtime/asm_riscv64.s
index 8f6c8773eb..fd01fd6f07 100644
--- a/src/runtime/asm_riscv64.s
+++ b/src/runtime/asm_riscv64.s
@@ -342,6 +342,7 @@ TEXT reflect·call(SB), NOSPLIT, $0-0
// func reflectcall(argtype *_type, fn, arg unsafe.Pointer, argsize uint32, retoffset uint32)
TEXT ·reflectcall(SB), NOSPLIT|NOFRAME, $0-32
MOVWU argsize+24(FP), T0
+ DISPATCH(runtime·call16, 16)
DISPATCH(runtime·call32, 32)
DISPATCH(runtime·call64, 64)
DISPATCH(runtime·call128, 128)
@@ -452,8 +453,9 @@ TEXT runtime·goexit(SB),NOSPLIT|NOFRAME|TOPFRAME,$0-0
// traceback from goexit1 must hit code range of goexit
MOV ZERO, ZERO // NOP
-// func cgocallback_gofunc(fv uintptr, frame uintptr, framesize, ctxt uintptr)
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$24-32
+// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr)
+// See cgocall.go for more details.
+TEXT ·cgocallback(SB),NOSPLIT,$0-24
// TODO(jsing): Add support for cgo - issue #36641.
WORD $0 // crash
@@ -518,12 +520,12 @@ flush:
MOV T1, 16(X2) // Also second argument to wbBufFlush
// TODO: Optimise
- // R3 is g.
- // R4 already saved (T0)
- // R5 already saved (T1)
- // R9 already saved (A0)
- // R10 already saved (A1)
- // R30 is tmp register.
+ // X5 already saved (T0)
+ // X6 already saved (T1)
+ // X10 already saved (A0)
+ // X11 already saved (A1)
+ // X27 is g.
+ // X31 is tmp register.
MOV X0, 24(X2)
MOV X1, 32(X2)
MOV X2, 40(X2)
diff --git a/src/runtime/asm_s390x.s b/src/runtime/asm_s390x.s
index cb39451faa..7baef37324 100644
--- a/src/runtime/asm_s390x.s
+++ b/src/runtime/asm_s390x.s
@@ -383,6 +383,7 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT|NOFRAME,$0-0
TEXT ·reflectcall(SB), NOSPLIT, $-8-32
MOVWZ argsize+24(FP), R3
+ DISPATCH(runtime·call16, 16)
DISPATCH(runtime·call32, 32)
DISPATCH(runtime·call64, 64)
DISPATCH(runtime·call128, 128)
@@ -461,6 +462,7 @@ TEXT callRet<>(SB), NOSPLIT, $32-0
BL runtime·reflectcallmove(SB)
RET
+CALLFN(·call16, 16)
CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
@@ -573,25 +575,9 @@ g0:
MOVW R2, ret+16(FP)
RET
-// cgocallback(void (*fn)(void*), void *frame, uintptr framesize, uintptr ctxt)
-// Turn the fn into a Go func (by taking its address) and call
-// cgocallback_gofunc.
-TEXT runtime·cgocallback(SB),NOSPLIT,$32-32
- MOVD $fn+0(FP), R3
- MOVD R3, 8(R15)
- MOVD frame+8(FP), R3
- MOVD R3, 16(R15)
- MOVD framesize+16(FP), R3
- MOVD R3, 24(R15)
- MOVD ctxt+24(FP), R3
- MOVD R3, 32(R15)
- MOVD $runtime·cgocallback_gofunc(SB), R3
- BL (R3)
- RET
-
-// cgocallback_gofunc(FuncVal*, void *frame, uintptr framesize, uintptr ctxt)
+// cgocallback(fn, frame unsafe.Pointer, ctxt uintptr)
// See cgocall.go for more details.
-TEXT ·cgocallback_gofunc(SB),NOSPLIT,$16-32
+TEXT ·cgocallback(SB),NOSPLIT,$24-24
NO_LOCAL_POINTERS
// Load m and g from thread-local storage.
@@ -639,7 +625,7 @@ havem:
// NOTE: unwindm knows that the saved g->sched.sp is at 8(R1) aka savedsp-16(SP).
MOVD m_g0(R8), R3
MOVD (g_sched+gobuf_sp)(R3), R4
- MOVD R4, savedsp-16(SP)
+ MOVD R4, savedsp-24(SP) // must match frame size
MOVD R15, (g_sched+gobuf_sp)(R3)
// Switch to m->curg stack and call runtime.cgocallbackg.
@@ -648,30 +634,30 @@ havem:
// save that information (m->curg->sched) so we can restore it.
// We can restore m->curg->sched.sp easily, because calling
// runtime.cgocallbackg leaves SP unchanged upon return.
- // To save m->curg->sched.pc, we push it onto the stack.
- // This has the added benefit that it looks to the traceback
- // routine like cgocallbackg is going to return to that
- // PC (because the frame we allocate below has the same
- // size as cgocallback_gofunc's frame declared above)
- // so that the traceback will seamlessly trace back into
- // the earlier calls.
- //
- // In the new goroutine, -8(SP) is unused (where SP refers to
- // m->curg's SP while we're setting it up, before we've adjusted it).
+ // To save m->curg->sched.pc, we push it onto the curg stack and
+ // open a frame the same size as cgocallback's g0 frame.
+ // Once we switch to the curg stack, the pushed PC will appear
+ // to be the return PC of cgocallback, so that the traceback
+ // will seamlessly trace back into the earlier calls.
MOVD m_curg(R8), g
BL runtime·save_g(SB)
MOVD (g_sched+gobuf_sp)(g), R4 // prepare stack as R4
MOVD (g_sched+gobuf_pc)(g), R5
- MOVD R5, -24(R4)
- MOVD ctxt+24(FP), R5
- MOVD R5, -16(R4)
- MOVD $-24(R4), R15
+ MOVD R5, -(24+8)(R4) // "saved LR"; must match frame size
+ // Gather our arguments into registers.
+ MOVD fn+0(FP), R1
+ MOVD frame+8(FP), R2
+ MOVD ctxt+16(FP), R3
+ MOVD $-(24+8)(R4), R15 // switch stack; must match frame size
+ MOVD R1, 8(R15)
+ MOVD R2, 16(R15)
+ MOVD R3, 24(R15)
BL runtime·cgocallbackg(SB)
// Restore g->sched (== m->curg->sched) from saved values.
MOVD 0(R15), R5
MOVD R5, (g_sched+gobuf_pc)(g)
- MOVD $24(R15), R4
+ MOVD $(24+8)(R15), R4 // must match frame size
MOVD R4, (g_sched+gobuf_sp)(g)
// Switch back to m->g0's stack and restore m->g0->sched.sp.
@@ -681,7 +667,7 @@ havem:
MOVD m_g0(R8), g
BL runtime·save_g(SB)
MOVD (g_sched+gobuf_sp)(g), R15
- MOVD savedsp-16(SP), R4
+ MOVD savedsp-24(SP), R4 // must match frame size
MOVD R4, (g_sched+gobuf_sp)(g)
// If the m on entry was nil, we called needm above to borrow an m
diff --git a/src/runtime/asm_wasm.s b/src/runtime/asm_wasm.s
index 7d88beb537..67e81adf0b 100644
--- a/src/runtime/asm_wasm.s
+++ b/src/runtime/asm_wasm.s
@@ -288,9 +288,6 @@ TEXT runtime·morestack_noctxt(SB),NOSPLIT,$0
TEXT ·asmcgocall(SB), NOSPLIT, $0-0
UNDEF
-TEXT ·cgocallback_gofunc(SB), NOSPLIT, $16-32
- UNDEF
-
#define DISPATCH(NAME, MAXSIZE) \
Get R0; \
I64Const $MAXSIZE; \
@@ -308,6 +305,7 @@ TEXT ·reflectcall(SB), NOSPLIT, $0-32
MOVW argsize+24(FP), R0
+ DISPATCH(runtime·call16, 16)
DISPATCH(runtime·call32, 32)
DISPATCH(runtime·call64, 64)
DISPATCH(runtime·call128, 128)
@@ -398,6 +396,7 @@ TEXT callRet<>(SB), NOSPLIT, $32-0
CALL runtime·reflectcallmove(SB)
RET
+CALLFN(·call16, 16)
CALLFN(·call32, 32)
CALLFN(·call64, 64)
CALLFN(·call128, 128)
@@ -430,7 +429,7 @@ TEXT runtime·goexit(SB), NOSPLIT, $0-0
CALL runtime·goexit1(SB) // does not return
UNDEF
-TEXT runtime·cgocallback(SB), NOSPLIT, $32-32
+TEXT runtime·cgocallback(SB), NOSPLIT, $0-24
UNDEF
// gcWriteBarrier performs a heap pointer write and informs the GC.
diff --git a/src/runtime/auxv_none.go b/src/runtime/auxv_none.go
index 3a560a1793..3ca617b21e 100644
--- a/src/runtime/auxv_none.go
+++ b/src/runtime/auxv_none.go
@@ -7,7 +7,6 @@
// +build !dragonfly
// +build !freebsd
// +build !netbsd
-// +build !openbsd !arm64
// +build !solaris
package runtime
diff --git a/src/runtime/cgo/asm_386.s b/src/runtime/cgo/asm_386.s
index 7293c20bf8..2e7e9512e2 100644
--- a/src/runtime/cgo/asm_386.s
+++ b/src/runtime/cgo/asm_386.s
@@ -5,8 +5,9 @@
#include "textflag.h"
// Called by C code generated by cmd/cgo.
-// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr)
-// Saves C callee-saved registers and calls fn with three arguments.
+// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr)
+// Saves C callee-saved registers and calls cgocallback with three arguments.
+// fn is the PC of a func(a unsafe.Pointer) function.
TEXT crosscall2(SB),NOSPLIT,$28-16
MOVL BP, 24(SP)
MOVL BX, 20(SP)
@@ -15,12 +16,11 @@ TEXT crosscall2(SB),NOSPLIT,$28-16
MOVL ctxt+12(FP), AX
MOVL AX, 8(SP)
- MOVL n+8(FP), AX
- MOVL AX, 4(SP)
MOVL a+4(FP), AX
- MOVL AX, 0(SP)
+ MOVL AX, 4(SP)
MOVL fn+0(FP), AX
- CALL AX
+ MOVL AX, 0(SP)
+ CALL runtime·cgocallback(SB)
MOVL 12(SP), DI
MOVL 16(SP), SI
diff --git a/src/runtime/cgo/asm_amd64.s b/src/runtime/cgo/asm_amd64.s
index 06c538b9bc..5dc8e2d235 100644
--- a/src/runtime/cgo/asm_amd64.s
+++ b/src/runtime/cgo/asm_amd64.s
@@ -5,8 +5,10 @@
#include "textflag.h"
// Called by C code generated by cmd/cgo.
-// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr)
-// Saves C callee-saved registers and calls fn with three arguments.
+// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr)
+// Saves C callee-saved registers and calls cgocallback with three arguments.
+// fn is the PC of a func(a unsafe.Pointer) function.
+// This signature is known to SWIG, so we can't change it.
#ifndef GOOS_windows
TEXT crosscall2(SB),NOSPLIT,$0x50-0 /* keeps stack pointer 32-byte aligned */
#else
@@ -33,11 +35,12 @@ TEXT crosscall2(SB),NOSPLIT,$0x110-0 /* also need to save xmm6 - xmm15 */
MOVUPS X14, 0xe0(SP)
MOVUPS X15, 0xf0(SP)
- MOVQ DX, 0x0(SP) /* arg */
- MOVQ R8, 0x8(SP) /* argsize (includes padding) */
+ MOVQ CX, 0x0(SP) /* fn */
+ MOVQ DX, 0x8(SP) /* arg */
+ // Skip n in R8.
MOVQ R9, 0x10(SP) /* ctxt */
- CALL CX /* fn */
+ CALL runtime·cgocallback(SB)
MOVQ 0x48(SP), DI
MOVQ 0x50(SP), SI
@@ -52,11 +55,12 @@ TEXT crosscall2(SB),NOSPLIT,$0x110-0 /* also need to save xmm6 - xmm15 */
MOVUPS 0xe0(SP), X14
MOVUPS 0xf0(SP), X15
#else
- MOVQ SI, 0x0(SP) /* arg */
- MOVQ DX, 0x8(SP) /* argsize (includes padding) */
+ MOVQ DI, 0x0(SP) /* fn */
+ MOVQ SI, 0x8(SP) /* arg */
+ // Skip n in DX.
MOVQ CX, 0x10(SP) /* ctxt */
- CALL DI /* fn */
+ CALL runtime·cgocallback(SB)
#endif
MOVQ 0x18(SP), BX
diff --git a/src/runtime/cgo/asm_arm.s b/src/runtime/cgo/asm_arm.s
index 60132c14a8..ea55e173c1 100644
--- a/src/runtime/cgo/asm_arm.s
+++ b/src/runtime/cgo/asm_arm.s
@@ -5,51 +5,52 @@
#include "textflag.h"
// Called by C code generated by cmd/cgo.
-// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr)
-// Saves C callee-saved registers and calls fn with three arguments.
+// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr)
+// Saves C callee-saved registers and calls cgocallback with three arguments.
+// fn is the PC of a func(a unsafe.Pointer) function.
TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
- /*
- * We still need to save all callee save register as before, and then
- * push 3 args for fn (R1, R2, R3).
- * Also note that at procedure entry in gc world, 4(R13) will be the
- * first arg, so we must push another dummy reg (R0) for 0(R13).
- * Additionally, runtime·load_g will clobber R0, so we need to save R0
- * nevertheless.
- */
SUB $(8*9), R13 // Reserve space for the floating point registers.
- MOVM.WP [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, g, R11, R12, R14], (R13)
+ // The C arguments arrive in R0, R1, R2, and R3. We want to
+ // pass R0, R1, and R3 to Go, so we push those on the stack.
+ // Also, save C callee-save registers R4-R12.
+ MOVM.WP [R0, R1, R3, R4, R5, R6, R7, R8, R9, g, R11, R12], (R13)
+ // Finally, save the link register R14. This also puts the
+ // arguments we pushed for cgocallback where they need to be,
+ // starting at 4(R13).
+ MOVW.W R14, -4(R13)
// Skip floating point registers on GOARM < 6.
MOVB runtime·goarm(SB), R11
CMP $6, R11
BLT skipfpsave
- MOVD F8, (14*4+8*1)(R13)
- MOVD F9, (14*4+8*2)(R13)
- MOVD F10, (14*4+8*3)(R13)
- MOVD F11, (14*4+8*4)(R13)
- MOVD F12, (14*4+8*5)(R13)
- MOVD F13, (14*4+8*6)(R13)
- MOVD F14, (14*4+8*7)(R13)
- MOVD F15, (14*4+8*8)(R13)
+ MOVD F8, (13*4+8*1)(R13)
+ MOVD F9, (13*4+8*2)(R13)
+ MOVD F10, (13*4+8*3)(R13)
+ MOVD F11, (13*4+8*4)(R13)
+ MOVD F12, (13*4+8*5)(R13)
+ MOVD F13, (13*4+8*6)(R13)
+ MOVD F14, (13*4+8*7)(R13)
+ MOVD F15, (13*4+8*8)(R13)
skipfpsave:
BL runtime·load_g(SB)
- MOVW R15, R14 // R15 is PC.
- MOVW 0(R13), R15
+ // We set up the arguments to cgocallback when saving registers above.
+ BL runtime·cgocallback(SB)
MOVB runtime·goarm(SB), R11
CMP $6, R11
BLT skipfprest
- MOVD (14*4+8*1)(R13), F8
- MOVD (14*4+8*2)(R13), F9
- MOVD (14*4+8*3)(R13), F10
- MOVD (14*4+8*4)(R13), F11
- MOVD (14*4+8*5)(R13), F12
- MOVD (14*4+8*6)(R13), F13
- MOVD (14*4+8*7)(R13), F14
- MOVD (14*4+8*8)(R13), F15
+ MOVD (13*4+8*1)(R13), F8
+ MOVD (13*4+8*2)(R13), F9
+ MOVD (13*4+8*3)(R13), F10
+ MOVD (13*4+8*4)(R13), F11
+ MOVD (13*4+8*5)(R13), F12
+ MOVD (13*4+8*6)(R13), F13
+ MOVD (13*4+8*7)(R13), F14
+ MOVD (13*4+8*8)(R13), F15
skipfprest:
- MOVM.IAW (R13), [R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, g, R11, R12, R14]
+ MOVW.P 4(R13), R14
+ MOVM.IAW (R13), [R0, R1, R3, R4, R5, R6, R7, R8, R9, g, R11, R12]
ADD $(8*9), R13
MOVW R14, R15
diff --git a/src/runtime/cgo/asm_arm64.s b/src/runtime/cgo/asm_arm64.s
index ce56f9b1c7..1cb25cf89e 100644
--- a/src/runtime/cgo/asm_arm64.s
+++ b/src/runtime/cgo/asm_arm64.s
@@ -5,19 +5,20 @@
#include "textflag.h"
// Called by C code generated by cmd/cgo.
-// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr)
-// Saves C callee-saved registers and calls fn with three arguments.
+// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr)
+// Saves C callee-saved registers and calls cgocallback with three arguments.
+// fn is the PC of a func(a unsafe.Pointer) function.
TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
/*
* We still need to save all callee save register as before, and then
- * push 3 args for fn (R1, R2, R3).
+ * push 3 args for fn (R0, R1, R3), skipping R2.
* Also note that at procedure entry in gc world, 8(RSP) will be the
* first arg.
* TODO(minux): use LDP/STP here if it matters.
*/
SUB $(8*24), RSP
- MOVD R1, (8*1)(RSP)
- MOVD R2, (8*2)(RSP)
+ MOVD R0, (8*1)(RSP)
+ MOVD R1, (8*2)(RSP)
MOVD R3, (8*3)(RSP)
MOVD R19, (8*4)(RSP)
MOVD R20, (8*5)(RSP)
@@ -40,15 +41,11 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
FMOVD F14, (8*22)(RSP)
FMOVD F15, (8*23)(RSP)
- MOVD R0, R19
-
// Initialize Go ABI environment
BL runtime·load_g(SB)
- BL (R19)
- MOVD (8*1)(RSP), R1
- MOVD (8*2)(RSP), R2
- MOVD (8*3)(RSP), R3
+ BL runtime·cgocallback(SB)
+
MOVD (8*4)(RSP), R19
MOVD (8*5)(RSP), R20
MOVD (8*6)(RSP), R21
diff --git a/src/runtime/cgo/asm_mips64x.s b/src/runtime/cgo/asm_mips64x.s
index 1235852dbe..e51cdf3d12 100644
--- a/src/runtime/cgo/asm_mips64x.s
+++ b/src/runtime/cgo/asm_mips64x.s
@@ -6,14 +6,14 @@
#include "textflag.h"
-/*
- * void crosscall2(void (*fn)(void*, int32, uintptr), void*, int32, uintptr)
- * Save registers and call fn with two arguments.
- */
+// Called by C code generated by cmd/cgo.
+// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr)
+// Saves C callee-saved registers and calls cgocallback with three arguments.
+// fn is the PC of a func(a unsafe.Pointer) function.
TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
/*
* We still need to save all callee save register as before, and then
- * push 3 args for fn (R5, R6, R7).
+ * push 3 args for fn (R4, R5, R7), skipping R6.
* Also note that at procedure entry in gc world, 8(R29) will be the
* first arg.
*/
@@ -22,9 +22,9 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
#else
ADDV $(-8*15), R29
#endif
- MOVV R5, (8*1)(R29) // void*
- MOVW R6, (8*2)(R29) // int32
- MOVV R7, (8*3)(R29) // uintptr
+ MOVV R4, (8*1)(R29) // fn unsafe.Pointer
+ MOVV R5, (8*2)(R29) // a unsafe.Pointer
+ MOVV R7, (8*3)(R29) // ctxt uintptr
MOVV R16, (8*4)(R29)
MOVV R17, (8*5)(R29)
MOVV R18, (8*6)(R29)
@@ -52,7 +52,8 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
SRLV $32, R31, RSB
SLLV $32, RSB
JAL runtime·load_g(SB)
- JAL (R4)
+
+ JAL runtime·cgocallback(SB)
MOVV (8*4)(R29), R16
MOVV (8*5)(R29), R17
diff --git a/src/runtime/cgo/asm_mipsx.s b/src/runtime/cgo/asm_mipsx.s
index e3090da223..1127c8beb4 100644
--- a/src/runtime/cgo/asm_mipsx.s
+++ b/src/runtime/cgo/asm_mipsx.s
@@ -6,14 +6,14 @@
#include "textflag.h"
-/*
- * void crosscall2(void (*fn)(void*, int32, uintptr), void*, int32, uintptr)
- * Save registers and call fn with two arguments.
- */
+// Called by C code generated by cmd/cgo.
+// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr)
+// Saves C callee-saved registers and calls cgocallback with three arguments.
+// fn is the PC of a func(a unsafe.Pointer) function.
TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
/*
* We still need to save all callee save register as before, and then
- * push 3 args for fn (R5, R6, R7).
+ * push 3 args for fn (R4, R5, R7), skipping R6.
* Also note that at procedure entry in gc world, 4(R29) will be the
* first arg.
*/
@@ -25,9 +25,9 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
#else
SUBU $(4*14-16), R29 // For soft-float, no FPR.
#endif
- MOVW R5, (4*1)(R29)
- MOVW R6, (4*2)(R29)
- MOVW R7, (4*3)(R29)
+ MOVW R4, (4*1)(R29) // fn unsafe.Pointer
+ MOVW R5, (4*2)(R29) // a unsafe.Pointer
+ MOVW R7, (4*3)(R29) // ctxt uintptr
MOVW R16, (4*4)(R29)
MOVW R17, (4*5)(R29)
MOVW R18, (4*6)(R29)
@@ -47,7 +47,8 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
MOVD F30, (4*14+8*5)(R29)
#endif
JAL runtime·load_g(SB)
- JAL (R4)
+
+ JAL runtime·cgocallback(SB)
MOVW (4*4)(R29), R16
MOVW (4*5)(R29), R17
diff --git a/src/runtime/cgo/asm_ppc64x.s b/src/runtime/cgo/asm_ppc64x.s
index 3876f9389c..f4efc1e67d 100644
--- a/src/runtime/cgo/asm_ppc64x.s
+++ b/src/runtime/cgo/asm_ppc64x.s
@@ -8,8 +8,9 @@
#include "asm_ppc64x.h"
// Called by C code generated by cmd/cgo.
-// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr)
-// Saves C callee-saved registers and calls fn with three arguments.
+// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr)
+// Saves C callee-saved registers and calls cgocallback with three arguments.
+// fn is the PC of a func(a unsafe.Pointer) function.
TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
// Start with standard C stack frame layout and linkage
MOVD LR, R0
@@ -26,19 +27,18 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
BL runtime·reginit(SB)
BL runtime·load_g(SB)
- MOVD R3, R12
#ifdef GOARCH_ppc64
// ppc64 use elf ABI v1. we must get the real entry address from
// first slot of the function descriptor before call.
// Same for AIX.
- MOVD 8(R12), R2
- MOVD (R12), R12
+ MOVD 8(R3), R2
+ MOVD (R3), R3
#endif
- MOVD R12, CTR
- MOVD R4, FIXED_FRAME+0(R1)
- MOVW R5, FIXED_FRAME+8(R1)
- MOVD R6, FIXED_FRAME+16(R1)
- BL (CTR)
+ MOVD R3, FIXED_FRAME+0(R1) // fn unsafe.Pointer
+ MOVD R4, FIXED_FRAME+8(R1) // a unsafe.Pointer
+ // Skip R5 = n uint32
+ MOVD R6, FIXED_FRAME+16(R1) // ctxt uintptr
+ BL runtime·cgocallback(SB)
ADD $(288+3*8+FIXED_FRAME), R1
diff --git a/src/runtime/cgo/asm_s390x.s b/src/runtime/cgo/asm_s390x.s
index 7eab8f652a..8bf16e75e2 100644
--- a/src/runtime/cgo/asm_s390x.s
+++ b/src/runtime/cgo/asm_s390x.s
@@ -5,8 +5,9 @@
#include "textflag.h"
// Called by C code generated by cmd/cgo.
-// func crosscall2(fn func(a unsafe.Pointer, n int32, ctxt uintptr), a unsafe.Pointer, n int32, ctxt uintptr)
-// Saves C callee-saved registers and calls fn with three arguments.
+// func crosscall2(fn, a unsafe.Pointer, n int32, ctxt uintptr)
+// Saves C callee-saved registers and calls cgocallback with three arguments.
+// fn is the PC of a func(a unsafe.Pointer) function.
TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
// Start with standard C stack frame layout and linkage.
@@ -29,10 +30,11 @@ TEXT crosscall2(SB),NOSPLIT|NOFRAME,$0
// Initialize Go ABI environment.
BL runtime·load_g(SB)
- MOVD R3, 8(R15) // arg1
- MOVW R4, 16(R15) // arg2
- MOVD R5, 24(R15) // arg3
- BL (R2) // fn(arg1, arg2, arg3)
+ MOVD R2, 8(R15) // fn unsafe.Pointer
+ MOVD R3, 16(R15) // a unsafe.Pointer
+ // Skip R4 = n uint32
+ MOVD R5, 24(R15) // ctxt uintptr
+ BL runtime·cgocallback(SB)
FMOVD 32(R15), F8
FMOVD 40(R15), F9
diff --git a/src/runtime/cgo/callbacks.go b/src/runtime/cgo/callbacks.go
index 14a218ec92..cd8b795387 100644
--- a/src/runtime/cgo/callbacks.go
+++ b/src/runtime/cgo/callbacks.go
@@ -9,20 +9,18 @@ import "unsafe"
// These utility functions are available to be called from code
// compiled with gcc via crosscall2.
-// cgocallback is defined in runtime
-//go:linkname _runtime_cgocallback runtime.cgocallback
-func _runtime_cgocallback(unsafe.Pointer, unsafe.Pointer, uintptr, uintptr)
-
// The declaration of crosscall2 is:
-// void crosscall2(void (*fn)(void *, int), void *, int);
+// void crosscall2(void (*fn)(void *), void *, int);
//
// We need to export the symbol crosscall2 in order to support
// callbacks from shared libraries. This applies regardless of
// linking mode.
//
-// Compatibility note: crosscall2 actually takes four arguments, but
-// it works to call it with three arguments when calling _cgo_panic.
-// That is supported for backward compatibility.
+// Compatibility note: SWIG uses crosscall2 in exactly one situation:
+// to call _cgo_panic using the pattern shown below. We need to keep
+// that pattern working. In particular, crosscall2 actually takes four
+// arguments, but it works to call it with three arguments when
+// calling _cgo_panic.
//go:cgo_export_static crosscall2
//go:cgo_export_dynamic crosscall2
@@ -34,21 +32,18 @@ func _runtime_cgocallback(unsafe.Pointer, unsafe.Pointer, uintptr, uintptr)
// crosscall2(_cgo_panic, &a, sizeof a);
// /* The function call will not return. */
+// TODO: We should export a regular C function to panic, change SWIG
+// to use that instead of the above pattern, and then we can drop
+// backwards-compatibility from crosscall2 and stop exporting it.
+
//go:linkname _runtime_cgo_panic_internal runtime._cgo_panic_internal
func _runtime_cgo_panic_internal(p *byte)
//go:linkname _cgo_panic _cgo_panic
//go:cgo_export_static _cgo_panic
//go:cgo_export_dynamic _cgo_panic
-//go:nosplit
-//go:norace
-func _cgo_panic(a unsafe.Pointer, n int32) {
- f := _runtime_cgo_panic_internal
- type funcval struct {
- pc unsafe.Pointer
- }
- fv := *(**funcval)(unsafe.Pointer(&f))
- _runtime_cgocallback(fv.pc, a, uintptr(n), 0)
+func _cgo_panic(a *struct{ cstr *byte }) {
+ _runtime_cgo_panic_internal(a.cstr)
}
//go:cgo_import_static x_cgo_init
diff --git a/src/runtime/cgo/cgo.go b/src/runtime/cgo/cgo.go
index c02b837978..4d2caf6c4f 100644
--- a/src/runtime/cgo/cgo.go
+++ b/src/runtime/cgo/cgo.go
@@ -21,6 +21,7 @@ package cgo
#cgo openbsd LDFLAGS: -lpthread
#cgo aix LDFLAGS: -Wl,-berok
#cgo solaris LDFLAGS: -lxnet
+#cgo illumos LDFLAGS: -lsocket
// Issue 35247.
#cgo darwin CFLAGS: -Wno-nullability-completeness
diff --git a/src/runtime/cgo/gcc_darwin_arm64.c b/src/runtime/cgo/gcc_darwin_arm64.c
index fd7d4084c9..dbe848b4ee 100644
--- a/src/runtime/cgo/gcc_darwin_arm64.c
+++ b/src/runtime/cgo/gcc_darwin_arm64.c
@@ -10,12 +10,16 @@
#include <unistd.h>
#include <stdlib.h>
-#include <CoreFoundation/CFBundle.h>
-#include <CoreFoundation/CFString.h>
-
#include "libcgo.h"
#include "libcgo_unix.h"
+#include <TargetConditionals.h>
+
+#if TARGET_OS_IPHONE
+#include <CoreFoundation/CFBundle.h>
+#include <CoreFoundation/CFString.h>
+#endif
+
#define magic (0xc476c475c47957UL)
// inittls allocates a thread-local storage slot for g.
@@ -87,14 +91,18 @@ threadentry(void *v)
ts = *(ThreadStart*)v;
free(v);
+#if TARGET_OS_IPHONE
darwin_arm_init_thread_exception_port();
+#endif
crosscall1(ts.fn, setg_gcc, (void*)ts.g);
return nil;
}
+#if TARGET_OS_IPHONE
+
// init_working_dir sets the current working directory to the app root.
-// By default darwin/arm64 processes start in "/".
+// By default ios/arm64 processes start in "/".
static void
init_working_dir()
{
@@ -131,7 +139,7 @@ init_working_dir()
fprintf(stderr, "runtime/cgo: chdir(%s) failed\n", dir);
}
- // The test harness in go_darwin_arm_exec passes the relative working directory
+ // The test harness in go_ios_exec passes the relative working directory
// in the GoExecWrapperWorkingDirectory property of the app bundle.
CFStringRef wd_ref = CFBundleGetValueForInfoDictionaryKey(bundle, CFSTR("GoExecWrapperWorkingDirectory"));
if (wd_ref != NULL) {
@@ -145,6 +153,8 @@ init_working_dir()
}
}
+#endif // TARGET_OS_IPHONE
+
void
x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase)
{
@@ -161,8 +171,9 @@ x_cgo_init(G *g, void (*setg)(void*), void **tlsg, void **tlsbase)
// yes, tlsbase from mrs might not be correctly aligned.
inittls(tlsg, (void**)((uintptr)tlsbase & ~7));
+#if TARGET_OS_IPHONE
darwin_arm_init_mach_exception_handler();
darwin_arm_init_thread_exception_port();
-
init_working_dir();
+#endif
}
diff --git a/src/runtime/cgo/gcc_libinit_windows.c b/src/runtime/cgo/gcc_libinit_windows.c
index 9fd7d36bfb..2732248bdc 100644
--- a/src/runtime/cgo/gcc_libinit_windows.c
+++ b/src/runtime/cgo/gcc_libinit_windows.c
@@ -3,6 +3,7 @@
// license that can be found in the LICENSE file.
// +build cgo
+
#define WIN64_LEAN_AND_MEAN
#include <windows.h>
#include <process.h>
diff --git a/src/runtime/cgo/gcc_signal2_darwin_arm64.c b/src/runtime/cgo/gcc_signal2_ios_arm64.c
index 5b8a18ffd6..5b8a18ffd6 100644
--- a/src/runtime/cgo/gcc_signal2_darwin_arm64.c
+++ b/src/runtime/cgo/gcc_signal2_ios_arm64.c
diff --git a/src/runtime/cgo/gcc_signal_darwin_arm64.c b/src/runtime/cgo/gcc_signal_ios_arm64.c
index 6519edd4cc..6519edd4cc 100644
--- a/src/runtime/cgo/gcc_signal_darwin_arm64.c
+++ b/src/runtime/cgo/gcc_signal_ios_arm64.c
diff --git a/src/runtime/cgo/gcc_signal_darwin_lldb.c b/src/runtime/cgo/gcc_signal_ios_nolldb.c
index 0ccdae324e..cfa4025414 100644
--- a/src/runtime/cgo/gcc_signal_darwin_lldb.c
+++ b/src/runtime/cgo/gcc_signal_ios_nolldb.c
@@ -3,7 +3,7 @@
// license that can be found in the LICENSE file.
// +build !lldb
-// +build darwin
+// +build ios
// +build arm64
#include <stdint.h>
diff --git a/src/runtime/cgo/linux.go b/src/runtime/cgo/linux.go
new file mode 100644
index 0000000000..76c0192c20
--- /dev/null
+++ b/src/runtime/cgo/linux.go
@@ -0,0 +1,74 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Linux system call wrappers that provide POSIX semantics through the
+// corresponding cgo->libc (nptl) wrappers for various system calls.
+
+// +build linux
+
+package cgo
+
+import "unsafe"
+
+// Each of the following entries is needed to ensure that the
+// syscall.syscall_linux code can conditionally call these
+// function pointers:
+//
+// 1. find the C-defined function start
+// 2. force the local byte alias to be mapped to that location
+// 3. map the Go pointer to the function to the syscall package
+
+//go:cgo_import_static _cgo_libc_setegid
+//go:linkname _cgo_libc_setegid _cgo_libc_setegid
+//go:linkname cgo_libc_setegid syscall.cgo_libc_setegid
+var _cgo_libc_setegid byte
+var cgo_libc_setegid = unsafe.Pointer(&_cgo_libc_setegid)
+
+//go:cgo_import_static _cgo_libc_seteuid
+//go:linkname _cgo_libc_seteuid _cgo_libc_seteuid
+//go:linkname cgo_libc_seteuid syscall.cgo_libc_seteuid
+var _cgo_libc_seteuid byte
+var cgo_libc_seteuid = unsafe.Pointer(&_cgo_libc_seteuid)
+
+//go:cgo_import_static _cgo_libc_setregid
+//go:linkname _cgo_libc_setregid _cgo_libc_setregid
+//go:linkname cgo_libc_setregid syscall.cgo_libc_setregid
+var _cgo_libc_setregid byte
+var cgo_libc_setregid = unsafe.Pointer(&_cgo_libc_setregid)
+
+//go:cgo_import_static _cgo_libc_setresgid
+//go:linkname _cgo_libc_setresgid _cgo_libc_setresgid
+//go:linkname cgo_libc_setresgid syscall.cgo_libc_setresgid
+var _cgo_libc_setresgid byte
+var cgo_libc_setresgid = unsafe.Pointer(&_cgo_libc_setresgid)
+
+//go:cgo_import_static _cgo_libc_setresuid
+//go:linkname _cgo_libc_setresuid _cgo_libc_setresuid
+//go:linkname cgo_libc_setresuid syscall.cgo_libc_setresuid
+var _cgo_libc_setresuid byte
+var cgo_libc_setresuid = unsafe.Pointer(&_cgo_libc_setresuid)
+
+//go:cgo_import_static _cgo_libc_setreuid
+//go:linkname _cgo_libc_setreuid _cgo_libc_setreuid
+//go:linkname cgo_libc_setreuid syscall.cgo_libc_setreuid
+var _cgo_libc_setreuid byte
+var cgo_libc_setreuid = unsafe.Pointer(&_cgo_libc_setreuid)
+
+//go:cgo_import_static _cgo_libc_setgroups
+//go:linkname _cgo_libc_setgroups _cgo_libc_setgroups
+//go:linkname cgo_libc_setgroups syscall.cgo_libc_setgroups
+var _cgo_libc_setgroups byte
+var cgo_libc_setgroups = unsafe.Pointer(&_cgo_libc_setgroups)
+
+//go:cgo_import_static _cgo_libc_setgid
+//go:linkname _cgo_libc_setgid _cgo_libc_setgid
+//go:linkname cgo_libc_setgid syscall.cgo_libc_setgid
+var _cgo_libc_setgid byte
+var cgo_libc_setgid = unsafe.Pointer(&_cgo_libc_setgid)
+
+//go:cgo_import_static _cgo_libc_setuid
+//go:linkname _cgo_libc_setuid _cgo_libc_setuid
+//go:linkname cgo_libc_setuid syscall.cgo_libc_setuid
+var _cgo_libc_setuid byte
+var cgo_libc_setuid = unsafe.Pointer(&_cgo_libc_setuid)
diff --git a/src/runtime/cgo/linux_syscall.c b/src/runtime/cgo/linux_syscall.c
new file mode 100644
index 0000000000..c8e91918a1
--- /dev/null
+++ b/src/runtime/cgo/linux_syscall.c
@@ -0,0 +1,85 @@
+// Copyright 2019 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build linux
+
+#ifndef _GNU_SOURCE // setres[ug]id() API.
+#define _GNU_SOURCE
+#endif
+
+#include <grp.h>
+#include <sys/types.h>
+#include <sys/unistd.h>
+#include <errno.h>
+#include "libcgo.h"
+
+/*
+ * Assumed POSIX compliant libc system call wrappers. For linux, the
+ * glibc/nptl/setxid mechanism ensures that POSIX semantics are
+ * honored for all pthreads (by default), and this in turn with cgo
+ * ensures that all Go threads launched with cgo are kept in sync for
+ * these function calls.
+ */
+
+// argset_t matches runtime/cgocall.go:argset.
+typedef struct {
+ uintptr_t* args;
+ uintptr_t retval;
+} argset_t;
+
+// libc backed posix-compliant syscalls.
+
+#define SET_RETVAL(fn) \
+ uintptr_t ret = (uintptr_t) fn ; \
+ if (ret == -1) { \
+ x->retval = (uintptr_t) errno; \
+ } else \
+ x->retval = ret
+
+void
+_cgo_libc_setegid(argset_t* x) {
+ SET_RETVAL(setegid((gid_t) x->args[0]));
+}
+
+void
+_cgo_libc_seteuid(argset_t* x) {
+ SET_RETVAL(seteuid((uid_t) x->args[0]));
+}
+
+void
+_cgo_libc_setgid(argset_t* x) {
+ SET_RETVAL(setgid((gid_t) x->args[0]));
+}
+
+void
+_cgo_libc_setgroups(argset_t* x) {
+ SET_RETVAL(setgroups((size_t) x->args[0], (const gid_t *) x->args[1]));
+}
+
+void
+_cgo_libc_setregid(argset_t* x) {
+ SET_RETVAL(setregid((gid_t) x->args[0], (gid_t) x->args[1]));
+}
+
+void
+_cgo_libc_setresgid(argset_t* x) {
+ SET_RETVAL(setresgid((gid_t) x->args[0], (gid_t) x->args[1],
+ (gid_t) x->args[2]));
+}
+
+void
+_cgo_libc_setresuid(argset_t* x) {
+ SET_RETVAL(setresuid((uid_t) x->args[0], (uid_t) x->args[1],
+ (uid_t) x->args[2]));
+}
+
+void
+_cgo_libc_setreuid(argset_t* x) {
+ SET_RETVAL(setreuid((uid_t) x->args[0], (uid_t) x->args[1]));
+}
+
+void
+_cgo_libc_setuid(argset_t* x) {
+ SET_RETVAL(setuid((uid_t) x->args[0]));
+}
diff --git a/src/runtime/cgo/signal_darwin_arm64.go b/src/runtime/cgo/signal_ios_arm64.go
index 3425c448c4..3425c448c4 100644
--- a/src/runtime/cgo/signal_darwin_arm64.go
+++ b/src/runtime/cgo/signal_ios_arm64.go
diff --git a/src/runtime/cgo/signal_darwin_arm64.s b/src/runtime/cgo/signal_ios_arm64.s
index 1ae00d13f3..1ae00d13f3 100644
--- a/src/runtime/cgo/signal_darwin_arm64.s
+++ b/src/runtime/cgo/signal_ios_arm64.s
diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go
index 427ed0ffb9..9bca279318 100644
--- a/src/runtime/cgocall.go
+++ b/src/runtime/cgocall.go
@@ -35,47 +35,52 @@
// cgo writes a gcc-compiled function named GoF (not p.GoF, since gcc doesn't
// know about packages). The gcc-compiled C function f calls GoF.
//
-// GoF calls crosscall2(_cgoexp_GoF, frame, framesize). Crosscall2
-// (in cgo/gcc_$GOARCH.S, a gcc-compiled assembly file) is a two-argument
-// adapter from the gcc function call ABI to the 6c function call ABI.
-// It is called from gcc to call 6c functions. In this case it calls
-// _cgoexp_GoF(frame, framesize), still running on m->g0's stack
-// and outside the $GOMAXPROCS limit. Thus, this code cannot yet
-// call arbitrary Go code directly and must be careful not to allocate
-// memory or use up m->g0's stack.
+// GoF initializes "frame", a structure containing all of its
+// arguments and slots for p.GoF's results. It calls
+// crosscall2(_cgoexp_GoF, frame, framesize, ctxt) using the gcc ABI.
//
-// _cgoexp_GoF calls runtime.cgocallback(p.GoF, frame, framesize, ctxt).
-// (The reason for having _cgoexp_GoF instead of writing a crosscall3
-// to make this call directly is that _cgoexp_GoF, because it is compiled
-// with 6c instead of gcc, can refer to dotted names like
-// runtime.cgocallback and p.GoF.)
+// crosscall2 (in cgo/asm_$GOARCH.s) is a four-argument adapter from
+// the gcc function call ABI to the gc function call ABI. At this
+// point we're in the Go runtime, but we're still running on m.g0's
+// stack and outside the $GOMAXPROCS limit. crosscall2 calls
+// runtime.cgocallback(_cgoexp_GoF, frame, ctxt) using the gc ABI.
+// (crosscall2's framesize argument is no longer used, but there's one
+// case where SWIG calls crosscall2 directly and expects to pass this
+// argument. See _cgo_panic.)
//
-// runtime.cgocallback (in asm_$GOARCH.s) switches from m->g0's
-// stack to the original g (m->curg)'s stack, on which it calls
-// runtime.cgocallbackg(p.GoF, frame, framesize).
-// As part of the stack switch, runtime.cgocallback saves the current
-// SP as m->g0->sched.sp, so that any use of m->g0's stack during the
-// execution of the callback will be done below the existing stack frames.
-// Before overwriting m->g0->sched.sp, it pushes the old value on the
-// m->g0 stack, so that it can be restored later.
+// runtime.cgocallback (in asm_$GOARCH.s) switches from m.g0's stack
+// to the original g (m.curg)'s stack, on which it calls
+// runtime.cgocallbackg(_cgoexp_GoF, frame, ctxt). As part of the
+// stack switch, runtime.cgocallback saves the current SP as
+// m.g0.sched.sp, so that any use of m.g0's stack during the execution
+// of the callback will be done below the existing stack frames.
+// Before overwriting m.g0.sched.sp, it pushes the old value on the
+// m.g0 stack, so that it can be restored later.
//
// runtime.cgocallbackg (below) is now running on a real goroutine
-// stack (not an m->g0 stack). First it calls runtime.exitsyscall, which will
+// stack (not an m.g0 stack). First it calls runtime.exitsyscall, which will
// block until the $GOMAXPROCS limit allows running this goroutine.
// Once exitsyscall has returned, it is safe to do things like call the memory
-// allocator or invoke the Go callback function p.GoF. runtime.cgocallbackg
-// first defers a function to unwind m->g0.sched.sp, so that if p.GoF
-// panics, m->g0.sched.sp will be restored to its old value: the m->g0 stack
-// and the m->curg stack will be unwound in lock step.
-// Then it calls p.GoF. Finally it pops but does not execute the deferred
-// function, calls runtime.entersyscall, and returns to runtime.cgocallback.
+// allocator or invoke the Go callback function. runtime.cgocallbackg
+// first defers a function to unwind m.g0.sched.sp, so that if p.GoF
+// panics, m.g0.sched.sp will be restored to its old value: the m.g0 stack
+// and the m.curg stack will be unwound in lock step.
+// Then it calls _cgoexp_GoF(frame).
+//
+// _cgoexp_GoF, which was generated by cmd/cgo, unpacks the arguments
+// from frame, calls p.GoF, writes the results back to frame, and
+// returns. Now we start unwinding this whole process.
+//
+// runtime.cgocallbackg pops but does not execute the deferred
+// function to unwind m.g0.sched.sp, calls runtime.entersyscall, and
+// returns to runtime.cgocallback.
//
// After it regains control, runtime.cgocallback switches back to
-// m->g0's stack (the pointer is still in m->g0.sched.sp), restores the old
-// m->g0.sched.sp value from the stack, and returns to _cgoexp_GoF.
+// m.g0's stack (the pointer is still in m.g0.sched.sp), restores the old
+// m.g0.sched.sp value from the stack, and returns to crosscall2.
//
-// _cgoexp_GoF immediately returns to crosscall2, which restores the
-// callee-save registers for gcc and returns to GoF, which returns to f.
+// crosscall2 restores the callee-save registers for gcc and returns
+// to GoF, which unpacks any result values and returns to f.
package runtime
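To make the sequence above concrete, here is a minimal, hypothetical cgo export (the names p and GoF and the behavior are illustrative only; the _cgoexp_GoF wrapper and the gcc-compiled GoF stub are generated by cmd/cgo as described in the comment above):

package p

// Only declarations may appear in this preamble once //export is used.
import "C"

//export GoF
func GoF(x C.int) C.int {
	// By the time this body runs, crosscall2 -> runtime.cgocallback ->
	// runtime.cgocallbackg -> _cgoexp_GoF have switched to the original
	// goroutine's stack and unpacked the argument frame.
	return x + 1
}

A gcc-compiled caller in a separate C file simply calls GoF(41); the generated stub packs the argument into a frame and enters the Go runtime through crosscall2 exactly as documented above.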
@@ -89,6 +94,22 @@ import (
// Length must match arg.Max in x_cgo_callers in runtime/cgo/gcc_traceback.c.
type cgoCallers [32]uintptr
+// argset matches runtime/cgo/linux_syscall.c:argset_t
+type argset struct {
+ args unsafe.Pointer
+ retval uintptr
+}
+
+// syscall_cgocaller is a wrapper for the syscall package to call cgocall for libc (cgo) calls.
+//go:linkname syscall_cgocaller syscall.cgocaller
+//go:nosplit
+//go:uintptrescapes
+func syscall_cgocaller(fn unsafe.Pointer, args ...uintptr) uintptr {
+ as := argset{args: unsafe.Pointer(&args[0])}
+ cgocall(fn, unsafe.Pointer(&as))
+ return as.retval
+}
+
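As a sketch of how the syscall side might use this hook together with the function pointers exported by runtime/cgo/linux.go (illustrative only, not the actual syscall package source; everything other than cgocaller and cgo_libc_setuid is invented):

package sketch

import (
	"syscall"
	"unsafe"
)

// Wired up via go:linkname in the real packages; cgo_libc_setuid is nil
// when the program is built without cgo.
var cgo_libc_setuid unsafe.Pointer
var cgocaller func(fn unsafe.Pointer, args ...uintptr) uintptr

func setuid(uid int) error {
	if cgo_libc_setuid == nil {
		// No libc wrapper available; a raw-syscall fallback would go here.
		return syscall.ENOTSUP
	}
	// cgocaller packs the arguments into an argset and calls the C
	// wrapper _cgo_libc_setuid via cgocall; a nonzero result is the
	// errno reported by libc.
	if errno := cgocaller(cgo_libc_setuid, uintptr(uid)); errno != 0 {
		return syscall.Errno(errno)
	}
	return nil
}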
// Call from Go to C.
//
// This must be nosplit because it's used for syscalls on some
@@ -176,7 +197,7 @@ func cgocall(fn, arg unsafe.Pointer) int32 {
// Call from C back to Go.
//go:nosplit
-func cgocallbackg(ctxt uintptr) {
+func cgocallbackg(fn, frame unsafe.Pointer, ctxt uintptr) {
gp := getg()
if gp != gp.m.curg {
println("runtime: bad g in cgocallback")
@@ -204,7 +225,7 @@ func cgocallbackg(ctxt uintptr) {
osPreemptExtExit(gp.m)
- cgocallbackg1(ctxt)
+ cgocallbackg1(fn, frame, ctxt)
// At this point unlockOSThread has been called.
// The following code must not change to a different m.
@@ -219,7 +240,7 @@ func cgocallbackg(ctxt uintptr) {
gp.m.syscall = syscall
}
-func cgocallbackg1(ctxt uintptr) {
+func cgocallbackg1(fn, frame unsafe.Pointer, ctxt uintptr) {
gp := getg()
if gp.m.needextram || atomic.Load(&extraMWaiters) > 0 {
gp.m.needextram = false
@@ -263,79 +284,16 @@ func cgocallbackg1(ctxt uintptr) {
raceacquire(unsafe.Pointer(&racecgosync))
}
- type args struct {
- fn *funcval
- arg unsafe.Pointer
- argsize uintptr
- }
- var cb *args
-
- // Location of callback arguments depends on stack frame layout
- // and size of stack frame of cgocallback_gofunc.
- sp := gp.m.g0.sched.sp
- switch GOARCH {
- default:
- throw("cgocallbackg is unimplemented on arch")
- case "arm":
- // On arm, stack frame is two words and there's a saved LR between
- // SP and the stack frame and between the stack frame and the arguments.
- cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize))
- case "arm64":
- // On arm64, stack frame is four words and there's a saved LR between
- // SP and the stack frame and between the stack frame and the arguments.
- // Additional two words (16-byte alignment) are for saving FP.
- cb = (*args)(unsafe.Pointer(sp + 7*sys.PtrSize))
- case "amd64":
- // On amd64, stack frame is two words, plus caller PC and BP.
- cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize))
- case "386":
- // On 386, stack frame is three words, plus caller PC.
- cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize))
- case "ppc64", "ppc64le", "s390x":
- // On ppc64 and s390x, the callback arguments are in the arguments area of
- // cgocallback's stack frame. The stack looks like this:
- // +--------------------+------------------------------+
- // | | ... |
- // | cgoexp_$fn +------------------------------+
- // | | fixed frame area |
- // +--------------------+------------------------------+
- // | | arguments area |
- // | cgocallback +------------------------------+ <- sp + 2*minFrameSize + 2*ptrSize
- // | | fixed frame area |
- // +--------------------+------------------------------+ <- sp + minFrameSize + 2*ptrSize
- // | | local variables (2 pointers) |
- // | cgocallback_gofunc +------------------------------+ <- sp + minFrameSize
- // | | fixed frame area |
- // +--------------------+------------------------------+ <- sp
- cb = (*args)(unsafe.Pointer(sp + 2*sys.MinFrameSize + 2*sys.PtrSize))
- case "mips64", "mips64le":
- // On mips64x, stack frame is two words and there's a saved LR between
- // SP and the stack frame and between the stack frame and the arguments.
- cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize))
- case "mips", "mipsle":
- // On mipsx, stack frame is two words and there's a saved LR between
- // SP and the stack frame and between the stack frame and the arguments.
- cb = (*args)(unsafe.Pointer(sp + 4*sys.PtrSize))
- }
-
- // Invoke callback.
- // NOTE(rsc): passing nil for argtype means that the copying of the
- // results back into cb.arg happens without any corresponding write barriers.
- // For cgo, cb.arg points into a C stack frame and therefore doesn't
- // hold any pointers that the GC can find anyway - the write barrier
- // would be a no-op.
- reflectcall(nil, unsafe.Pointer(cb.fn), cb.arg, uint32(cb.argsize), 0)
+ // Invoke callback. This function is generated by cmd/cgo and
+ // will unpack the argument frame and call the Go function.
+ var cb func(frame unsafe.Pointer)
+ cbFV := funcval{uintptr(fn)}
+ *(*unsafe.Pointer)(unsafe.Pointer(&cb)) = noescape(unsafe.Pointer(&cbFV))
+ cb(frame)
if raceenabled {
racereleasemerge(unsafe.Pointer(&racecgosync))
}
- if msanenabled {
- // Tell msan that we wrote to the entire argument block.
- // This tells msan that we set the results.
- // Since we have already called the function it doesn't
- // matter that we are writing to the non-result parameters.
- msanwrite(cb.arg, cb.argsize)
- }
// Do not unwind m->g0->sched.sp.
// Our caller, cgocallback, will do that.
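The funcval conversion in cgocallbackg1 above deserves a closer look: a Go func value is a pointer to an object whose first word is the entry PC, so a raw PC can be made callable by wrapping it in such an object. A standalone sketch of the same trick (illustrative only; outside the runtime this rests on unsafe, unsupported assumptions about func value layout):

package sketch

import "unsafe"

// funcval mirrors the runtime's layout: the first word of the object a
// func value points to is the code pointer (closure words would follow).
type funcval struct {
	fn uintptr
}

// makeCallable points a typed func variable at a funcval header, just as
// cgocallbackg1 does for the _cgoexp_GoF PC handed in by crosscall2.
func makeCallable(pc uintptr) func(frame unsafe.Pointer) {
	var cb func(frame unsafe.Pointer)
	fv := &funcval{fn: pc}
	*(*unsafe.Pointer)(unsafe.Pointer(&cb)) = unsafe.Pointer(fv)
	return cb
}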
diff --git a/src/runtime/crash_cgo_test.go b/src/runtime/crash_cgo_test.go
index 4872189f16..0680d07a32 100644
--- a/src/runtime/crash_cgo_test.go
+++ b/src/runtime/crash_cgo_test.go
@@ -154,7 +154,7 @@ func TestCgoExecSignalMask(t *testing.T) {
case "windows", "plan9":
t.Skipf("skipping signal mask test on %s", runtime.GOOS)
}
- got := runTestProg(t, "testprogcgo", "CgoExecSignalMask")
+ got := runTestProg(t, "testprogcgo", "CgoExecSignalMask", "GOTRACEBACK=system")
want := "OK\n"
if got != want {
t.Errorf("expected %q, got %v", want, got)
@@ -600,3 +600,16 @@ func TestEINTR(t *testing.T) {
t.Fatalf("want %s, got %s\n", want, output)
}
}
+
+// Issue #42207.
+func TestNeedmDeadlock(t *testing.T) {
+ switch runtime.GOOS {
+ case "plan9", "windows":
+ t.Skipf("no signals on %s", runtime.GOOS)
+ }
+ output := runTestProg(t, "testprogcgo", "NeedmDeadlock")
+ want := "OK\n"
+ if output != want {
+ t.Fatalf("want %s, got %s\n", want, output)
+ }
+}
diff --git a/src/runtime/crash_test.go b/src/runtime/crash_test.go
index 34f30c9a37..5e22b7593e 100644
--- a/src/runtime/crash_test.go
+++ b/src/runtime/crash_test.go
@@ -181,6 +181,9 @@ func TestCrashHandler(t *testing.T) {
}
func testDeadlock(t *testing.T, name string) {
+	// External linking brings in cgo, causing deadlock detection not to work.
+ testenv.MustInternalLink(t)
+
output := runTestProg(t, "testprog", name)
want := "fatal error: all goroutines are asleep - deadlock!\n"
if !strings.HasPrefix(output, want) {
@@ -205,6 +208,9 @@ func TestLockedDeadlock2(t *testing.T) {
}
func TestGoexitDeadlock(t *testing.T) {
+	// External linking brings in cgo, causing deadlock detection not to work.
+ testenv.MustInternalLink(t)
+
output := runTestProg(t, "testprog", "GoexitDeadlock")
want := "no goroutines (main called runtime.Goexit) - deadlock!"
if !strings.Contains(output, want) {
@@ -290,6 +296,9 @@ func TestRecursivePanic4(t *testing.T) {
}
func TestGoexitCrash(t *testing.T) {
+	// External linking brings in cgo, causing deadlock detection not to work.
+ testenv.MustInternalLink(t)
+
output := runTestProg(t, "testprog", "GoexitExit")
want := "no goroutines (main called runtime.Goexit) - deadlock!"
if !strings.Contains(output, want) {
@@ -348,6 +357,9 @@ func TestBreakpoint(t *testing.T) {
}
func TestGoexitInPanic(t *testing.T) {
+	// External linking brings in cgo, causing deadlock detection not to work.
+ testenv.MustInternalLink(t)
+
// see issue 8774: this code used to trigger an infinite recursion
output := runTestProg(t, "testprog", "GoexitInPanic")
want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!"
@@ -412,6 +424,9 @@ func TestPanicAfterGoexit(t *testing.T) {
}
func TestRecoveredPanicAfterGoexit(t *testing.T) {
+	// External linking brings in cgo, causing deadlock detection not to work.
+ testenv.MustInternalLink(t)
+
output := runTestProg(t, "testprog", "RecoveredPanicAfterGoexit")
want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!"
if !strings.HasPrefix(output, want) {
@@ -420,6 +435,9 @@ func TestRecoveredPanicAfterGoexit(t *testing.T) {
}
func TestRecoverBeforePanicAfterGoexit(t *testing.T) {
+	// External linking brings in cgo, causing deadlock detection not to work.
+ testenv.MustInternalLink(t)
+
t.Parallel()
output := runTestProg(t, "testprog", "RecoverBeforePanicAfterGoexit")
want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!"
@@ -429,6 +447,9 @@ func TestRecoverBeforePanicAfterGoexit(t *testing.T) {
}
func TestRecoverBeforePanicAfterGoexit2(t *testing.T) {
+	// External linking brings in cgo, causing deadlock detection not to work.
+ testenv.MustInternalLink(t)
+
t.Parallel()
output := runTestProg(t, "testprog", "RecoverBeforePanicAfterGoexit2")
want := "fatal error: no goroutines (main called runtime.Goexit) - deadlock!"
@@ -667,7 +688,9 @@ func TestBadTraceback(t *testing.T) {
}
func TestTimePprof(t *testing.T) {
- fn := runTestProg(t, "testprog", "TimeProf")
+ // Pass GOTRACEBACK for issue #41120 to try to get more
+ // information on timeout.
+ fn := runTestProg(t, "testprog", "TimeProf", "GOTRACEBACK=crash")
fn = strings.TrimSpace(fn)
defer os.Remove(fn)
diff --git a/src/runtime/crash_unix_test.go b/src/runtime/crash_unix_test.go
index 8ef52aba48..fc87f37408 100644
--- a/src/runtime/crash_unix_test.go
+++ b/src/runtime/crash_unix_test.go
@@ -241,7 +241,7 @@ func TestPanicSystemstack(t *testing.T) {
}
// Get traceback.
- tb, err := ioutil.ReadAll(pr)
+ tb, err := io.ReadAll(pr)
if err != nil {
t.Fatal("reading traceback from pipe: ", err)
}
diff --git a/src/runtime/debug/panic_test.go b/src/runtime/debug/panic_test.go
index 93be216985..b67a3de4f9 100644
--- a/src/runtime/debug/panic_test.go
+++ b/src/runtime/debug/panic_test.go
@@ -20,8 +20,8 @@ func TestPanicOnFault(t *testing.T) {
if runtime.GOARCH == "s390x" {
t.Skip("s390x fault addresses are missing the low order bits")
}
- if (runtime.GOOS == "darwin" || runtime.GOOS == "ios") && runtime.GOARCH == "arm64" {
- t.Skip("darwin/arm64 doesn't provide fault addresses")
+ if runtime.GOOS == "ios" {
+ t.Skip("iOS doesn't provide fault addresses")
}
m, err := syscall.Mmap(-1, 0, 0x1000, syscall.PROT_READ /* Note: no PROT_WRITE */, syscall.MAP_SHARED|syscall.MAP_ANON)
if err != nil {
diff --git a/src/runtime/duff_riscv64.s b/src/runtime/duff_riscv64.s
new file mode 100644
index 0000000000..f7bd3f326e
--- /dev/null
+++ b/src/runtime/duff_riscv64.s
@@ -0,0 +1,907 @@
+// Code generated by mkduff.go; DO NOT EDIT.
+// Run go generate from src/runtime to update.
+// See mkduff.go for comments.
+
+#include "textflag.h"
+
+TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ MOV ZERO, (X10)
+ ADD $8, X10
+ RET
+
+TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ MOV (X10), X31
+ ADD $8, X10
+ MOV X31, (X11)
+ ADD $8, X11
+
+ RET
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go
index f2fa11dc98..4ca0420d2a 100644
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -298,6 +298,32 @@ func (p *ProfBuf) Close() {
(*profBuf)(p).close()
}
+func ReadMetricsSlow(memStats *MemStats, samplesp unsafe.Pointer, len, cap int) {
+ stopTheWorld("ReadMetricsSlow")
+
+ // Initialize the metrics beforehand because this could
+ // allocate and skew the stats.
+ semacquire(&metricsSema)
+ initMetrics()
+ semrelease(&metricsSema)
+
+ systemstack(func() {
+ // Read memstats first. It's going to flush
+ // the mcaches which readMetrics does not do, so
+ // going the other way around may result in
+ // inconsistent statistics.
+ readmemstats_m(memStats)
+ })
+
+ // Read metrics off the system stack.
+ //
+ // The only part of readMetrics that could allocate
+ // and skew the stats is initMetrics.
+ readMetrics(samplesp, len, cap)
+
+ startTheWorld()
+}
+
// ReadMemStatsSlow returns both the runtime-computed MemStats and
// MemStats accumulated by scanning the heap.
func ReadMemStatsSlow() (base, slow MemStats) {
@@ -337,20 +363,22 @@ func ReadMemStatsSlow() (base, slow MemStats) {
}
}
- // Add in frees. readmemstats_m flushed the cached stats, so
- // these are up-to-date.
+ // Add in frees by just reading the stats for those directly.
+ var m heapStatsDelta
+ memstats.heapStats.unsafeRead(&m)
+
+ // Collect per-sizeclass free stats.
var smallFree uint64
- slow.Frees = mheap_.nlargefree
- for i := range mheap_.nsmallfree {
- slow.Frees += mheap_.nsmallfree[i]
- bySize[i].Frees = mheap_.nsmallfree[i]
- bySize[i].Mallocs += mheap_.nsmallfree[i]
- smallFree += mheap_.nsmallfree[i] * uint64(class_to_size[i])
- }
- slow.Frees += memstats.tinyallocs
+ for i := 0; i < _NumSizeClasses; i++ {
+ slow.Frees += uint64(m.smallFreeCount[i])
+ bySize[i].Frees += uint64(m.smallFreeCount[i])
+ bySize[i].Mallocs += uint64(m.smallFreeCount[i])
+ smallFree += uint64(m.smallFreeCount[i]) * uint64(class_to_size[i])
+ }
+ slow.Frees += memstats.tinyallocs + uint64(m.largeFreeCount)
slow.Mallocs += slow.Frees
- slow.TotalAlloc = slow.Alloc + mheap_.largefree + smallFree
+ slow.TotalAlloc = slow.Alloc + uint64(m.largeFree) + smallFree
for i := range slow.BySize {
slow.BySize[i].Mallocs = bySize[i].Mallocs
@@ -749,10 +777,7 @@ func (p *PageAlloc) Scavenge(nbytes uintptr, mayUnlock bool) (r uintptr) {
func (p *PageAlloc) InUse() []AddrRange {
ranges := make([]AddrRange, 0, len(p.inUse.ranges))
for _, r := range p.inUse.ranges {
- ranges = append(ranges, AddrRange{
- Base: r.base.addr(),
- Limit: r.limit.addr(),
- })
+ ranges = append(ranges, AddrRange{r})
}
return ranges
}
@@ -763,10 +788,111 @@ func (p *PageAlloc) PallocData(i ChunkIdx) *PallocData {
return (*PallocData)((*pageAlloc)(p).tryChunkOf(ci))
}
-// AddrRange represents a range over addresses.
-// Specifically, it represents the range [Base, Limit).
+// AddrRange is a wrapper around addrRange for testing.
type AddrRange struct {
- Base, Limit uintptr
+ addrRange
+}
+
+// MakeAddrRange creates a new address range.
+func MakeAddrRange(base, limit uintptr) AddrRange {
+ return AddrRange{makeAddrRange(base, limit)}
+}
+
+// Base returns the virtual base address of the address range.
+func (a AddrRange) Base() uintptr {
+ return a.addrRange.base.addr()
+}
+
+// Limit returns the virtual address of the limit of the address range.
+func (a AddrRange) Limit() uintptr {
+ return a.addrRange.limit.addr()
+}
+
+// Equals returns true if the two address ranges are exactly equal.
+func (a AddrRange) Equals(b AddrRange) bool {
+ return a == b
+}
+
+// Size returns the size in bytes of the address range.
+func (a AddrRange) Size() uintptr {
+ return a.addrRange.size()
+}
+
+// AddrRanges is a wrapper around addrRanges for testing.
+type AddrRanges struct {
+ addrRanges
+ mutable bool
+}
+
+// NewAddrRanges creates a new empty addrRanges.
+//
+// Note that this initializes addrRanges just like in the
+// runtime, so its memory is persistentalloc'd. Call this
+// function sparingly since the memory it allocates is
+// leaked.
+//
+// This AddrRanges is mutable, so we can test methods like
+// Add.
+func NewAddrRanges() AddrRanges {
+ r := addrRanges{}
+ r.init(new(sysMemStat))
+ return AddrRanges{r, true}
+}
+
+// MakeAddrRanges creates a new addrRanges populated with
+// the ranges in a.
+//
+// The returned AddrRanges is immutable, so methods like
+// Add will fail.
+func MakeAddrRanges(a ...AddrRange) AddrRanges {
+ // Methods that manipulate the backing store of addrRanges.ranges should
+ // not be used on the result from this function (e.g. add) since they may
+ // trigger reallocation. That would normally be fine, except the new
+ // backing store won't come from the heap, but from persistentalloc, so
+ // we'll leak some memory implicitly.
+ ranges := make([]addrRange, 0, len(a))
+ total := uintptr(0)
+ for _, r := range a {
+ ranges = append(ranges, r.addrRange)
+ total += r.Size()
+ }
+ return AddrRanges{addrRanges{
+ ranges: ranges,
+ totalBytes: total,
+ sysStat: new(sysMemStat),
+ }, false}
+}
+
+// Ranges returns a copy of the ranges described by the
+// addrRanges.
+func (a *AddrRanges) Ranges() []AddrRange {
+ result := make([]AddrRange, 0, len(a.addrRanges.ranges))
+ for _, r := range a.addrRanges.ranges {
+ result = append(result, AddrRange{r})
+ }
+ return result
+}
+
+// FindSucc returns the successor to base. See addrRanges.findSucc
+// for more details.
+func (a *AddrRanges) FindSucc(base uintptr) int {
+ return a.findSucc(base)
+}
+
+// Add adds a new AddrRange to the AddrRanges.
+//
+// The AddrRange must be mutable (i.e. created by NewAddrRanges),
+// otherwise this method will throw.
+func (a *AddrRanges) Add(r AddrRange) {
+ if !a.mutable {
+ throw("attempt to mutate immutable AddrRanges")
+ }
+ a.add(r.addrRange)
+}
+
+// TotalBytes returns the totalBytes field of the addrRanges.
+func (a *AddrRanges) TotalBytes() uintptr {
+ return a.addrRanges.totalBytes
}
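A brief usage sketch of these test wrappers, as it might appear in the runtime's own test package (addresses and sizes are invented for illustration; only the documented accessors are exercised):

package runtime_test

import . "runtime"

func addrRangesSketch() {
	a := MakeAddrRanges(
		MakeAddrRange(0x1000, 0x2000),
		MakeAddrRange(0x3000, 0x4000),
	)
	total := a.TotalBytes() // 0x2000: two 0x1000-byte ranges
	rs := a.Ranges()        // copies of the two AddrRange values, in order
	_, _ = total, rs
}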
// BitRange represents a range over a bitmap.
@@ -1015,3 +1141,27 @@ func MSpanCountAlloc(ms *MSpan, bits []byte) int {
s.gcmarkBits = nil
return result
}
+
+const (
+ TimeHistSubBucketBits = timeHistSubBucketBits
+ TimeHistNumSubBuckets = timeHistNumSubBuckets
+ TimeHistNumSuperBuckets = timeHistNumSuperBuckets
+)
+
+type TimeHistogram timeHistogram
+
+// Count returns the count for the given bucket, subBucket indices.
+// Returns true if the bucket was valid, otherwise returns the count
+// for the overflow bucket and false.
+func (th *TimeHistogram) Count(bucket, subBucket uint) (uint64, bool) {
+ t := (*timeHistogram)(th)
+ i := bucket*TimeHistNumSubBuckets + subBucket
+ if i >= uint(len(t.counts)) {
+ return t.overflow, false
+ }
+ return t.counts[i], true
+}
+
+func (th *TimeHistogram) Record(duration int64) {
+ (*timeHistogram)(th).record(duration)
+}
diff --git a/src/runtime/extern.go b/src/runtime/extern.go
index 7316503ed2..b75507b8f8 100644
--- a/src/runtime/extern.go
+++ b/src/runtime/extern.go
@@ -78,6 +78,19 @@ It is a comma-separated list of name=val pairs setting these named variables:
If the line ends with "(forced)", this GC was forced by a
runtime.GC() call.
+ inittrace: setting inittrace=1 causes the runtime to emit a single line to standard
+ error for each package with init work, summarizing the execution time and memory
+ allocation. No information is printed for inits executed as part of plugin loading
+ and for packages without both user defined and compiler generated init work.
+ The format of this line is subject to change. Currently, it is:
+ init # @#ms, # ms clock, # bytes, # allocs
+ where the fields are as follows:
+ init # the package name
+ @# ms time in milliseconds when the init started since program start
+ # clock wall-clock time for package initialization work
+ # bytes memory allocated on the heap
+ # allocs number of heap allocations
+
madvdontneed: setting madvdontneed=1 will use MADV_DONTNEED
instead of MADV_FREE on Linux when returning memory to the
kernel. This is less efficient, but causes RSS numbers to drop
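An illustrative instance of the inittrace line format documented above (the package name and numbers are invented, following the stated field layout):

	init example.com/m/widgets @1.5 ms, 0.3 ms clock, 45056 bytes, 182 allocs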
diff --git a/src/runtime/funcdata.h b/src/runtime/funcdata.h
index 0fb50ddfba..cd76c06992 100644
--- a/src/runtime/funcdata.h
+++ b/src/runtime/funcdata.h
@@ -32,9 +32,9 @@
// defines the pointer map for the function's arguments.
// GO_ARGS should be the first instruction in a function that uses it.
// It can be omitted if there are no arguments at all.
-// GO_ARGS is inserted implicitly by the linker for any function
-// that also has a Go prototype and therefore is usually not necessary
-// to write explicitly.
+// GO_ARGS is inserted implicitly by the linker for any function whose
+// name starts with a middle-dot and that also has a Go prototype; it
+// is therefore usually not necessary to write explicitly.
#define GO_ARGS FUNCDATA $FUNCDATA_ArgsPointerMaps, go_args_stackmap(SB)
// GO_RESULTS_INITIALIZED indicates that the assembly function
diff --git a/src/runtime/gc_test.go b/src/runtime/gc_test.go
index 9edebdada6..7870f31ae9 100644
--- a/src/runtime/gc_test.go
+++ b/src/runtime/gc_test.go
@@ -518,7 +518,7 @@ func BenchmarkReadMemStats(b *testing.B) {
hugeSink = nil
}
-func BenchmarkReadMemStatsLatency(b *testing.B) {
+func applyGCLoad(b *testing.B) func() {
// We’ll apply load to the runtime with maxProcs-1 goroutines
// and use one more to actually benchmark. It doesn't make sense
// to try to run this test with only 1 P (that's what
@@ -563,6 +563,14 @@ func BenchmarkReadMemStatsLatency(b *testing.B) {
runtime.KeepAlive(hold)
}()
}
+ return func() {
+ close(done)
+ wg.Wait()
+ }
+}
+
+func BenchmarkReadMemStatsLatency(b *testing.B) {
+ stop := applyGCLoad(b)
// Spend this much time measuring latencies.
latencies := make([]time.Duration, 0, 1024)
@@ -579,12 +587,11 @@ func BenchmarkReadMemStatsLatency(b *testing.B) {
runtime.ReadMemStats(&ms)
latencies = append(latencies, time.Now().Sub(start))
}
- close(done)
- // Make sure to stop the timer before we wait! The goroutines above
- // are very heavy-weight and not easy to stop, so we could end up
+ // Make sure to stop the timer before we wait! The load created above
+ // is very heavy-weight and not easy to stop, so we could end up
// confusing the benchmarking framework for small b.N.
b.StopTimer()
- wg.Wait()
+ stop()
// Disable the default */op metrics.
// ns/op doesn't mean anything because it's an average, but we
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index 4c35309211..33e224d587 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -20,8 +20,19 @@ import (
func runtime_debug_WriteHeapDump(fd uintptr) {
stopTheWorld("write heap dump")
+ // Keep m on this G's stack instead of the system stack.
+ // Both readmemstats_m and writeheapdump_m have pretty large
+ // peak stack depths and we risk blowing the system stack.
+ // This is safe because the world is stopped, so we don't
+ // need to worry about anyone shrinking and therefore moving
+ // our stack.
+ var m MemStats
systemstack(func() {
- writeheapdump_m(fd)
+ // Call readmemstats_m here instead of deeper in
+ // writeheapdump_m because we might blow the system stack
+ // otherwise.
+ readmemstats_m(&m)
+ writeheapdump_m(fd, &m)
})
startTheWorld()
@@ -539,36 +550,40 @@ func dumpms() {
}
}
-func dumpmemstats() {
+//go:systemstack
+func dumpmemstats(m *MemStats) {
+ // These ints should be identical to the exported
+ // MemStats structure and should be ordered the same
+ // way too.
dumpint(tagMemStats)
- dumpint(memstats.alloc)
- dumpint(memstats.total_alloc)
- dumpint(memstats.sys)
- dumpint(memstats.nlookup)
- dumpint(memstats.nmalloc)
- dumpint(memstats.nfree)
- dumpint(memstats.heap_alloc)
- dumpint(memstats.heap_sys)
- dumpint(memstats.heap_idle)
- dumpint(memstats.heap_inuse)
- dumpint(memstats.heap_released)
- dumpint(memstats.heap_objects)
- dumpint(memstats.stacks_inuse)
- dumpint(memstats.stacks_sys)
- dumpint(memstats.mspan_inuse)
- dumpint(memstats.mspan_sys)
- dumpint(memstats.mcache_inuse)
- dumpint(memstats.mcache_sys)
- dumpint(memstats.buckhash_sys)
- dumpint(memstats.gc_sys)
- dumpint(memstats.other_sys)
- dumpint(memstats.next_gc)
- dumpint(memstats.last_gc_unix)
- dumpint(memstats.pause_total_ns)
+ dumpint(m.Alloc)
+ dumpint(m.TotalAlloc)
+ dumpint(m.Sys)
+ dumpint(m.Lookups)
+ dumpint(m.Mallocs)
+ dumpint(m.Frees)
+ dumpint(m.HeapAlloc)
+ dumpint(m.HeapSys)
+ dumpint(m.HeapIdle)
+ dumpint(m.HeapInuse)
+ dumpint(m.HeapReleased)
+ dumpint(m.HeapObjects)
+ dumpint(m.StackInuse)
+ dumpint(m.StackSys)
+ dumpint(m.MSpanInuse)
+ dumpint(m.MSpanSys)
+ dumpint(m.MCacheInuse)
+ dumpint(m.MCacheSys)
+ dumpint(m.BuckHashSys)
+ dumpint(m.GCSys)
+ dumpint(m.OtherSys)
+ dumpint(m.NextGC)
+ dumpint(m.LastGC)
+ dumpint(m.PauseTotalNs)
for i := 0; i < 256; i++ {
- dumpint(memstats.pause_ns[i])
+ dumpint(m.PauseNs[i])
}
- dumpint(uint64(memstats.numgc))
+ dumpint(uint64(m.NumGC))
}
func dumpmemprof_callback(b *bucket, nstk uintptr, pstk *uintptr, size, allocs, frees uintptr) {
@@ -639,7 +654,7 @@ func dumpmemprof() {
var dumphdr = []byte("go1.7 heap dump\n")
-func mdump() {
+func mdump(m *MemStats) {
// make sure we're done sweeping
for _, s := range mheap_.allspans {
if s.state.get() == mSpanInUse {
@@ -654,13 +669,13 @@ func mdump() {
dumpgs()
dumpms()
dumproots()
- dumpmemstats()
+ dumpmemstats(m)
dumpmemprof()
dumpint(tagEOF)
flush()
}
-func writeheapdump_m(fd uintptr) {
+func writeheapdump_m(fd uintptr, m *MemStats) {
_g_ := getg()
casgstatus(_g_.m.curg, _Grunning, _Gwaiting)
_g_.waitreason = waitReasonDumpingHeap
@@ -674,7 +689,7 @@ func writeheapdump_m(fd uintptr) {
dumpfd = fd
// Call dump routine.
- mdump()
+ mdump(m)
// Reset dump file.
dumpfd = 0
diff --git a/src/runtime/histogram.go b/src/runtime/histogram.go
new file mode 100644
index 0000000000..4020969eb9
--- /dev/null
+++ b/src/runtime/histogram.go
@@ -0,0 +1,148 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+ "runtime/internal/atomic"
+ "runtime/internal/sys"
+)
+
+const (
+ // For the time histogram type, we use an HDR histogram.
+ // Values are placed in super-buckets based solely on the most
+ // significant set bit. Thus, super-buckets are power-of-2 sized.
+ // Values are then placed into sub-buckets based on the value of
+ // the next timeHistSubBucketBits most significant bits. Thus,
+ // sub-buckets are linear within a super-bucket.
+ //
+ // Therefore, the number of sub-buckets (timeHistNumSubBuckets)
+ // defines the error. This error may be computed as
+ // 1/timeHistNumSubBuckets*100%. For example, for 16 sub-buckets
+ // per super-bucket the error is approximately 6%.
+ //
+ // The number of super-buckets (timeHistNumSuperBuckets), on the
+ // other hand, defines the range. To reserve room for sub-buckets,
+ // bit timeHistSubBucketBits is the first bit considered for
+	// super-buckets, so super-bucket indices are adjusted accordingly.
+ //
+ // As an example, consider 45 super-buckets with 16 sub-buckets.
+ //
+ // 00110
+ // ^----
+ // │ ^
+ // │ └---- Lowest 4 bits -> sub-bucket 6
+ // └------- Bit 4 unset -> super-bucket 0
+ //
+ // 10110
+ // ^----
+ // │ ^
+ // │ └---- Next 4 bits -> sub-bucket 6
+ // └------- Bit 4 set -> super-bucket 1
+ // 100010
+ // ^----^
+ // │ ^ └-- Lower bits ignored
+ // │ └---- Next 4 bits -> sub-bucket 1
+ // └------- Bit 5 set -> super-bucket 2
+ //
+	// Following this pattern, super-bucket 44 will have bit 47 set. We don't
+ // have any buckets for higher values, so the highest sub-bucket will
+ // contain values of 2^48-1 nanoseconds or approx. 3 days. This range is
+ // more than enough to handle durations produced by the runtime.
+ timeHistSubBucketBits = 4
+ timeHistNumSubBuckets = 1 << timeHistSubBucketBits
+ timeHistNumSuperBuckets = 45
+ timeHistTotalBuckets = timeHistNumSuperBuckets*timeHistNumSubBuckets + 1
+)
+
+// timeHistogram represents a distribution of durations in
+// nanoseconds.
+//
+// The accuracy and range of the histogram is defined by the
+// timeHistSubBucketBits and timeHistNumSuperBuckets constants.
+//
+// It is an HDR histogram with exponentially-distributed
+// buckets and linearly distributed sub-buckets.
+//
+// Counts in the histogram are updated atomically, so it is safe
+// for concurrent use. It is also safe to read all the values
+// atomically.
+type timeHistogram struct {
+ counts [timeHistNumSuperBuckets * timeHistNumSubBuckets]uint64
+ overflow uint64
+}
+
+// record adds the given duration to the distribution.
+//
+// Although the duration is an int64 to facilitate ease-of-use
+// with e.g. nanotime, the duration must be non-negative.
+func (h *timeHistogram) record(duration int64) {
+ if duration < 0 {
+ throw("timeHistogram encountered negative duration")
+ }
+ // The index of the exponential bucket is just the index
+ // of the highest set bit adjusted for how many bits we
+	// use for the subbucket. Note that it's timeHistSubBucketBits-1
+ // because we use the 0th bucket to hold values < timeHistNumSubBuckets.
+ var superBucket, subBucket uint
+ if duration >= timeHistNumSubBuckets {
+ // At this point, we know the duration value will always be
+	// at least timeHistSubBucketBits bits long.
+ superBucket = uint(sys.Len64(uint64(duration))) - timeHistSubBucketBits
+ if superBucket*timeHistNumSubBuckets >= uint(len(h.counts)) {
+ // The bucket index we got is larger than what we support, so
+ // add into the special overflow bucket.
+ atomic.Xadd64(&h.overflow, 1)
+ return
+ }
+	// The linear subbucket index is just the timeHistSubBucketBits
+ // bits after the top bit. To extract that value, shift down
+ // the duration such that we leave the top bit and the next bits
+ // intact, then extract the index.
+ subBucket = uint((duration >> (superBucket - 1)) % timeHistNumSubBuckets)
+ } else {
+ subBucket = uint(duration)
+ }
+ atomic.Xadd64(&h.counts[superBucket*timeHistNumSubBuckets+subBucket], 1)
+}
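A worked example of the bucketing in record (arithmetic added for illustration): for duration = 100ns, sys.Len64(100) = 7 (binary 1100100), so superBucket = 7 - 4 = 3; subBucket = (100 >> (3-1)) % 16 = 25 % 16 = 9, and the sample is counted in counts[3*16+9] = counts[57]. This agrees with the constant block above: bit 6 is the highest set bit of 100, which maps to super-bucket 3.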
+
+// timeHistogramMetricsBuckets generates a slice of boundaries for
+// the timeHistogram. These boundaries are represented in seconds,
+// not nanoseconds like the timeHistogram represents durations.
+func timeHistogramMetricsBuckets() []float64 {
+ b := make([]float64, timeHistTotalBuckets-1)
+ for i := 0; i < timeHistNumSuperBuckets; i++ {
+ superBucketMin := uint64(0)
+ // The (inclusive) minimum for the first bucket is 0.
+ if i > 0 {
+ // The minimum for the second bucket will be
+ // 1 << timeHistSubBucketBits, indicating that all
+ // sub-buckets are represented by the next timeHistSubBucketBits
+ // bits.
+ // Thereafter, we shift up by 1 each time, so we can represent
+ // this pattern as (i-1)+timeHistSubBucketBits.
+ superBucketMin = uint64(1) << uint(i-1+timeHistSubBucketBits)
+ }
+ // subBucketShift is the amount that we need to shift the sub-bucket
+ // index to combine it with superBucketMin.
+ subBucketShift := uint(0)
+ if i > 1 {
+ // The first two buckets are exact with respect to integers,
+ // so we'll never have to shift the sub-bucket index. Thereafter,
+ // we shift up by 1 with each subsequent bucket.
+ subBucketShift = uint(i - 2)
+ }
+ for j := 0; j < timeHistNumSubBuckets; j++ {
+ // j is the sub-bucket index. By shifting the index into position to
+ // combine with the bucket minimum, we obtain the minimum value for that
+ // sub-bucket.
+ subBucketMin := superBucketMin + (uint64(j) << subBucketShift)
+
+ // Convert subBucketMin, which is in nanoseconds, to a float64 seconds value.
+ // These values will all be exactly representable by a float64.
+ b[i*timeHistNumSubBuckets+j] = float64(subBucketMin) / 1e9
+ }
+ }
+ return b
+}
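
To make the bucket arithmetic in record concrete, the following is a small, self-contained sketch that reproduces the same index computation outside the runtime. The constants mirror the ones above, math/bits.Len64 stands in for the runtime-internal sys.Len64, and bucketOf is a hypothetical helper rather than a real runtime function.

    package main

    import (
        "fmt"
        "math/bits"
    )

    const (
        subBucketBits   = 4                  // mirrors timeHistSubBucketBits
        numSubBuckets   = 1 << subBucketBits // mirrors timeHistNumSubBuckets
        numSuperBuckets = 45                 // mirrors timeHistNumSuperBuckets
    )

    // bucketOf reproduces the index math from timeHistogram.record.
    func bucketOf(duration int64) (super, sub uint, overflow bool) {
        if duration < numSubBuckets {
            return 0, uint(duration), false
        }
        super = uint(bits.Len64(uint64(duration))) - subBucketBits
        if super >= numSuperBuckets {
            return 0, 0, true
        }
        sub = uint((duration >> (super - 1)) % numSubBuckets)
        return super, sub, false
    }

    func main() {
        // 250µs = 250000ns = 0b111101000010010000 (18 bits), so the
        // super-bucket is 18-4 = 14 and the sub-bucket is the next four
        // bits after the top bit, 0b1110 = 14.
        fmt.Println(bucketOf(250000)) // 14 14 false
    }

The overflow case in the sketch corresponds to the separate h.overflow counter above.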
diff --git a/src/runtime/histogram_test.go b/src/runtime/histogram_test.go
new file mode 100644
index 0000000000..5f5b28f784
--- /dev/null
+++ b/src/runtime/histogram_test.go
@@ -0,0 +1,58 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ . "runtime"
+ "testing"
+)
+
+var dummyTimeHistogram TimeHistogram
+
+func TestTimeHistogram(t *testing.T) {
+ // We need to use a global dummy because this
+ // could get stack-allocated with a non-8-byte alignment.
+ // The result of this bad alignment is a segfault on
+ // 32-bit platforms when calling Record.
+ h := &dummyTimeHistogram
+
+ // Record exactly one sample in each bucket.
+ for i := 0; i < TimeHistNumSuperBuckets; i++ {
+ var base int64
+ if i > 0 {
+ base = int64(1) << (i + TimeHistSubBucketBits - 1)
+ }
+ for j := 0; j < TimeHistNumSubBuckets; j++ {
+ v := int64(j)
+ if i > 0 {
+ v <<= i - 1
+ }
+ h.Record(base + v)
+ }
+ }
+ // Hit the overflow bucket.
+ h.Record(int64(^uint64(0) >> 1))
+
+ // Check to make sure there's exactly one count in each
+ // bucket.
+ for i := uint(0); i < TimeHistNumSuperBuckets; i++ {
+ for j := uint(0); j < TimeHistNumSubBuckets; j++ {
+ c, ok := h.Count(i, j)
+ if !ok {
+ t.Errorf("hit overflow bucket unexpectedly: (%d, %d)", i, j)
+ } else if c != 1 {
+ t.Errorf("bucket (%d, %d) has count that is not 1: %d", i, j, c)
+ }
+ }
+ }
+ c, ok := h.Count(TimeHistNumSuperBuckets, 0)
+ if ok {
+ t.Errorf("expected to hit overflow bucket: (%d, %d)", TimeHistNumSuperBuckets, 0)
+ }
+ if c != 1 {
+ t.Errorf("overflow bucket has count that is not 1: %d", c)
+ }
+ dummyTimeHistogram = TimeHistogram{}
+}
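
The test above dot-imports runtime and refers to exported names such as TimeHistogram, Record, and Count that do not appear in this diff; they are presumably thin test-only shims, conventionally kept in export_test.go. A plausible sketch of those shims follows, with the exact names and signatures assumed rather than taken from this diff.

    package runtime

    // Assumed test-only shims; the real definitions are not shown in this diff.
    const (
        TimeHistSubBucketBits   = timeHistSubBucketBits
        TimeHistNumSubBuckets   = timeHistNumSubBuckets
        TimeHistNumSuperBuckets = timeHistNumSuperBuckets
    )

    type TimeHistogram timeHistogram

    // Record forwards to timeHistogram.record.
    func (th *TimeHistogram) Record(duration int64) {
        (*timeHistogram)(th).record(duration)
    }

    // Count returns the count in bucket (bucket, subBucket), or the overflow
    // count with ok == false when the indices fall outside the table, which
    // is how the test above distinguishes the overflow bucket.
    func (th *TimeHistogram) Count(bucket, subBucket uint) (count uint64, ok bool) {
        t := (*timeHistogram)(th)
        i := bucket*TimeHistNumSubBuckets + subBucket
        if i >= uint(len(t.counts)) {
            return t.overflow, false
        }
        return t.counts[i], true
    }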
diff --git a/src/runtime/iface.go b/src/runtime/iface.go
index 0504b89363..f8b7d429a3 100644
--- a/src/runtime/iface.go
+++ b/src/runtime/iface.go
@@ -31,16 +31,17 @@ func itabHashFunc(inter *interfacetype, typ *_type) uintptr {
}
func getitab(inter *interfacetype, typ *_type, canfail bool) *itab {
- if len(inter.mhdr) == 0 {
+ if inter.isEmpty() {
throw("internal error - misuse of itab")
}
+ imethods := inter.methods()
// easy case
if typ.tflag&tflagUncommon == 0 {
if canfail {
return nil
}
- name := inter.typ.nameOff(inter.mhdr[0].name)
+ name := inter.typ.nameOff(imethods[0].name)
panic(&TypeAssertionError{nil, typ, &inter.typ, name.name()})
}
@@ -63,7 +64,7 @@ func getitab(inter *interfacetype, typ *_type, canfail bool) *itab {
}
// Entry doesn't exist yet. Make a new entry & add it.
- m = (*itab)(persistentalloc(unsafe.Sizeof(itab{})+uintptr(len(inter.mhdr)-1)*sys.PtrSize, 0, &memstats.other_sys))
+ m = (*itab)(persistentalloc(unsafe.Sizeof(itab{})+uintptr(len(imethods)-1)*sys.PtrSize, 0, &memstats.other_sys))
m.inter = inter
m._type = typ
// The hash is used in type switches. However, compiler statically generates itab's
@@ -197,7 +198,8 @@ func (m *itab) init() string {
// and interface names are unique,
// so can iterate over both in lock step;
// the loop is O(ni+nt) not O(ni*nt).
- ni := len(inter.mhdr)
+ imethods := inter.methods()
+ ni := len(imethods)
nt := int(x.mcount)
xmhdr := (*[1 << 16]method)(add(unsafe.Pointer(x), uintptr(x.moff)))[:nt:nt]
j := 0
@@ -205,7 +207,7 @@ func (m *itab) init() string {
var fun0 unsafe.Pointer
imethods:
for k := 0; k < ni; k++ {
- i := &inter.mhdr[k]
+ i := &imethods[k]
itype := inter.typ.typeOff(i.ityp)
name := inter.typ.nameOff(i.name)
iname := name.name()
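
The hunk above replaces direct reads of inter.mhdr with isEmpty and methods accessors whose definitions are not part of this diff. Assuming mhdr remains the backing field (an assumption; the real accessors live in type.go), they would look roughly like this:

    // Rough sketch of the assumed accessors; not taken from this diff.
    func (it *interfacetype) isEmpty() bool {
        return len(it.mhdr) == 0
    }

    func (it *interfacetype) methods() []imethod {
        return it.mhdr
    }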
diff --git a/src/runtime/internal/atomic/asm_386.s b/src/runtime/internal/atomic/asm_386.s
index 9b9dc14a60..d82faef1f0 100644
--- a/src/runtime/internal/atomic/asm_386.s
+++ b/src/runtime/internal/atomic/asm_386.s
@@ -3,6 +3,7 @@
// license that can be found in the LICENSE file.
#include "textflag.h"
+#include "funcdata.h"
// bool Cas(int32 *val, int32 old, int32 new)
// Atomically:
@@ -11,7 +12,7 @@
// return 1;
// }else
// return 0;
-TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-13
+TEXT ·Cas(SB), NOSPLIT, $0-13
MOVL ptr+0(FP), BX
MOVL old+4(FP), AX
MOVL new+8(FP), CX
@@ -20,32 +21,31 @@ TEXT runtime∕internal∕atomic·Cas(SB), NOSPLIT, $0-13
SETEQ ret+12(FP)
RET
-TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-13
- JMP runtime∕internal∕atomic·Cas(SB)
+TEXT ·Casuintptr(SB), NOSPLIT, $0-13
+ JMP ·Cas(SB)
-TEXT runtime∕internal∕atomic·CasRel(SB), NOSPLIT, $0-13
- JMP runtime∕internal∕atomic·Cas(SB)
+TEXT ·CasRel(SB), NOSPLIT, $0-13
+ JMP ·Cas(SB)
-TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT, $0-8
- JMP runtime∕internal∕atomic·Load(SB)
+TEXT ·Loaduintptr(SB), NOSPLIT, $0-8
+ JMP ·Load(SB)
-TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT, $0-8
- JMP runtime∕internal∕atomic·Load(SB)
+TEXT ·Loaduint(SB), NOSPLIT, $0-8
+ JMP ·Load(SB)
-TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-8
- JMP runtime∕internal∕atomic·Store(SB)
-
-TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-12
- JMP runtime∕internal∕atomic·Xadd(SB)
+TEXT ·Storeuintptr(SB), NOSPLIT, $0-8
+ JMP ·Store(SB)
-TEXT runtime∕internal∕atomic·Loadint64(SB), NOSPLIT, $0-12
- JMP runtime∕internal∕atomic·Load64(SB)
+TEXT ·Xadduintptr(SB), NOSPLIT, $0-12
+ JMP ·Xadd(SB)
-TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-20
- JMP runtime∕internal∕atomic·Xadd64(SB)
+TEXT ·Loadint64(SB), NOSPLIT, $0-12
+ JMP ·Load64(SB)
+TEXT ·Xaddint64(SB), NOSPLIT, $0-20
+ JMP ·Xadd64(SB)
-// bool runtime∕internal∕atomic·Cas64(uint64 *val, uint64 old, uint64 new)
+// bool ·Cas64(uint64 *val, uint64 old, uint64 new)
// Atomically:
// if(*val == *old){
// *val = new;
@@ -53,11 +53,12 @@ TEXT runtime∕internal∕atomic·Xaddint64(SB), NOSPLIT, $0-20
// } else {
// return 0;
// }
-TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-21
+TEXT ·Cas64(SB), NOSPLIT, $0-21
+ NO_LOCAL_POINTERS
MOVL ptr+0(FP), BP
TESTL $7, BP
JZ 2(PC)
- MOVL 0, BP // crash with nil ptr deref
+ CALL ·panicUnaligned(SB)
MOVL old_lo+4(FP), AX
MOVL old_hi+8(FP), DX
MOVL new_lo+12(FP), BX
@@ -74,7 +75,7 @@ TEXT runtime∕internal∕atomic·Cas64(SB), NOSPLIT, $0-21
// return 1;
// }else
// return 0;
-TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-13
+TEXT ·Casp1(SB), NOSPLIT, $0-13
MOVL ptr+0(FP), BX
MOVL old+4(FP), AX
MOVL new+8(FP), CX
@@ -87,7 +88,7 @@ TEXT runtime∕internal∕atomic·Casp1(SB), NOSPLIT, $0-13
// Atomically:
// *val += delta;
// return *val;
-TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-12
+TEXT ·Xadd(SB), NOSPLIT, $0-12
MOVL ptr+0(FP), BX
MOVL delta+4(FP), AX
MOVL AX, CX
@@ -97,12 +98,13 @@ TEXT runtime∕internal∕atomic·Xadd(SB), NOSPLIT, $0-12
MOVL AX, ret+8(FP)
RET
-TEXT runtime∕internal∕atomic·Xadd64(SB), NOSPLIT, $0-20
+TEXT ·Xadd64(SB), NOSPLIT, $0-20
+ NO_LOCAL_POINTERS
// no XADDQ so use CMPXCHG8B loop
MOVL ptr+0(FP), BP
TESTL $7, BP
JZ 2(PC)
- MOVL 0, AX // crash when unaligned
+ CALL ·panicUnaligned(SB)
// DI:SI = delta
MOVL delta_lo+4(FP), SI
MOVL delta_hi+8(FP), DI
@@ -133,22 +135,23 @@ addloop:
MOVL CX, ret_hi+16(FP)
RET
-TEXT runtime∕internal∕atomic·Xchg(SB), NOSPLIT, $0-12
+TEXT ·Xchg(SB), NOSPLIT, $0-12
MOVL ptr+0(FP), BX
MOVL new+4(FP), AX
XCHGL AX, 0(BX)
MOVL AX, ret+8(FP)
RET
-TEXT runtime∕internal∕atomic·Xchguintptr(SB), NOSPLIT, $0-12
- JMP runtime∕internal∕atomic·Xchg(SB)
+TEXT ·Xchguintptr(SB), NOSPLIT, $0-12
+ JMP ·Xchg(SB)
-TEXT runtime∕internal∕atomic·Xchg64(SB),NOSPLIT,$0-20
+TEXT ·Xchg64(SB),NOSPLIT,$0-20
+ NO_LOCAL_POINTERS
// no XCHGQ so use CMPXCHG8B loop
MOVL ptr+0(FP), BP
TESTL $7, BP
JZ 2(PC)
- MOVL 0, AX // crash when unaligned
+ CALL ·panicUnaligned(SB)
// CX:BX = new
MOVL new_lo+4(FP), BX
MOVL new_hi+8(FP), CX
@@ -171,38 +174,43 @@ swaploop:
MOVL DX, ret_hi+16(FP)
RET
-TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-8
+TEXT ·StorepNoWB(SB), NOSPLIT, $0-8
MOVL ptr+0(FP), BX
MOVL val+4(FP), AX
XCHGL AX, 0(BX)
RET
-TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-8
+TEXT ·Store(SB), NOSPLIT, $0-8
MOVL ptr+0(FP), BX
MOVL val+4(FP), AX
XCHGL AX, 0(BX)
RET
-TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-8
+TEXT ·StoreRel(SB), NOSPLIT, $0-8
+ JMP ·Store(SB)
+
+TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-8
JMP runtime∕internal∕atomic·Store(SB)
// uint64 atomicload64(uint64 volatile* addr);
-TEXT runtime∕internal∕atomic·Load64(SB), NOSPLIT, $0-12
+TEXT ·Load64(SB), NOSPLIT, $0-12
+ NO_LOCAL_POINTERS
MOVL ptr+0(FP), AX
TESTL $7, AX
JZ 2(PC)
- MOVL 0, AX // crash with nil ptr deref
+ CALL ·panicUnaligned(SB)
MOVQ (AX), M0
MOVQ M0, ret+4(FP)
EMMS
RET
-// void runtime∕internal∕atomic·Store64(uint64 volatile* addr, uint64 v);
-TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-12
+// void ·Store64(uint64 volatile* addr, uint64 v);
+TEXT ·Store64(SB), NOSPLIT, $0-12
+ NO_LOCAL_POINTERS
MOVL ptr+0(FP), AX
TESTL $7, AX
JZ 2(PC)
- MOVL 0, AX // crash with nil ptr deref
+ CALL ·panicUnaligned(SB)
// MOVQ and EMMS were introduced on the Pentium MMX.
MOVQ val+4(FP), M0
MOVQ M0, (AX)
@@ -214,24 +222,40 @@ TEXT runtime∕internal∕atomic·Store64(SB), NOSPLIT, $0-12
XADDL AX, (SP)
RET
-// void runtime∕internal∕atomic·Or8(byte volatile*, byte);
-TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-5
+// void ·Or8(byte volatile*, byte);
+TEXT ·Or8(SB), NOSPLIT, $0-5
MOVL ptr+0(FP), AX
MOVB val+4(FP), BX
LOCK
ORB BX, (AX)
RET
-// void runtime∕internal∕atomic·And8(byte volatile*, byte);
-TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-5
+// void ·And8(byte volatile*, byte);
+TEXT ·And8(SB), NOSPLIT, $0-5
MOVL ptr+0(FP), AX
MOVB val+4(FP), BX
LOCK
ANDB BX, (AX)
RET
-TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-5
+TEXT ·Store8(SB), NOSPLIT, $0-5
MOVL ptr+0(FP), BX
MOVB val+4(FP), AX
XCHGB AX, 0(BX)
RET
+
+// func Or(addr *uint32, v uint32)
+TEXT ·Or(SB), NOSPLIT, $0-8
+ MOVL ptr+0(FP), AX
+ MOVL val+4(FP), BX
+ LOCK
+ ORL BX, (AX)
+ RET
+
+// func And(addr *uint32, v uint32)
+TEXT ·And(SB), NOSPLIT, $0-8
+ MOVL ptr+0(FP), AX
+ MOVL val+4(FP), BX
+ LOCK
+ ANDL BX, (AX)
+ RET
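
The new Or and And entry points operate on full 32-bit words, the natural granularity for flag words. Outside the runtime the same effect can be obtained with a compare-and-swap loop, which is exactly the fallback the arm port uses later in this diff; a minimal sketch, with the flag value invented purely for illustration:

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    // or32 and and32 emulate runtime∕internal∕atomic.Or and .And with CAS
    // loops, mirroring the non-native fallback added for arm in this diff.
    func or32(addr *uint32, v uint32) {
        for {
            old := atomic.LoadUint32(addr)
            if atomic.CompareAndSwapUint32(addr, old, old|v) {
                return
            }
        }
    }

    func and32(addr *uint32, v uint32) {
        for {
            old := atomic.LoadUint32(addr)
            if atomic.CompareAndSwapUint32(addr, old, old&v) {
                return
            }
        }
    }

    const flagRunning = 1 << 3 // hypothetical flag bit

    func main() {
        var flags uint32
        or32(&flags, flagRunning)           // atomically set the bit
        and32(&flags, ^uint32(flagRunning)) // atomically clear it again
        fmt.Printf("%#x\n", flags)          // 0x0
    }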
diff --git a/src/runtime/internal/atomic/asm_amd64.s b/src/runtime/internal/atomic/asm_amd64.s
index 90c56424c9..2cf7c55870 100644
--- a/src/runtime/internal/atomic/asm_amd64.s
+++ b/src/runtime/internal/atomic/asm_amd64.s
@@ -136,6 +136,12 @@ TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12
JMP runtime∕internal∕atomic·Store(SB)
+TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16
+ JMP runtime∕internal∕atomic·Store64(SB)
+
+TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16
+ JMP runtime∕internal∕atomic·Store64(SB)
+
TEXT runtime∕internal∕atomic·Store8(SB), NOSPLIT, $0-9
MOVQ ptr+0(FP), BX
MOVB val+8(FP), AX
@@ -163,3 +169,19 @@ TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
LOCK
ANDB BX, (AX)
RET
+
+// func Or(addr *uint32, v uint32)
+TEXT runtime∕internal∕atomic·Or(SB), NOSPLIT, $0-12
+ MOVQ ptr+0(FP), AX
+ MOVL val+8(FP), BX
+ LOCK
+ ORL BX, (AX)
+ RET
+
+// func And(addr *uint32, v uint32)
+TEXT runtime∕internal∕atomic·And(SB), NOSPLIT, $0-12
+ MOVQ ptr+0(FP), AX
+ MOVL val+8(FP), BX
+ LOCK
+ ANDL BX, (AX)
+ RET
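
StoreRel64 and LoadAcq64 (and their uintptr variants) exist so a 64-bit value can be published with release semantics and read with acquire semantics without a full barrier. A hedged, runtime-internal-style sketch of the intended pairing follows; the variable names are invented and the fragment is illustrative only, since runtime∕internal∕atomic cannot be imported from user code.

    // Fragment in runtime-internal style; "atomic" is runtime∕internal∕atomic.
    var (
        result   uint64 // written before publication
        resultTS uint64 // publication timestamp, 0 means "not ready"
    )

    // publish makes result visible to readers: the plain store to result
    // happens before the release store to resultTS.
    func publish(v, now uint64) {
        result = v
        atomic.StoreRel64(&resultTS, now)
    }

    // consume pairs a LoadAcq64 with the StoreRel64 above; once it observes
    // a nonzero timestamp it also observes the earlier write to result.
    func consume() (uint64, bool) {
        if atomic.LoadAcq64(&resultTS) == 0 {
            return 0, false
        }
        return result, true
    }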
diff --git a/src/runtime/internal/atomic/asm_arm.s b/src/runtime/internal/atomic/asm_arm.s
index d4ef11560e..274925ed60 100644
--- a/src/runtime/internal/atomic/asm_arm.s
+++ b/src/runtime/internal/atomic/asm_arm.s
@@ -3,6 +3,7 @@
// license that can be found in the LICENSE file.
#include "textflag.h"
+#include "funcdata.h"
// bool armcas(int32 *val, int32 old, int32 new)
// Atomically:
@@ -12,13 +13,13 @@
// }else
// return 0;
//
-// To implement runtime∕internal∕atomic·cas in sys_$GOOS_arm.s
+// To implement ·cas in sys_$GOOS_arm.s
// using the native instructions, use:
//
-// TEXT runtime∕internal∕atomic·cas(SB),NOSPLIT,$0
-// B runtime∕internal∕atomic·armcas(SB)
+// TEXT ·cas(SB),NOSPLIT,$0
+// B ·armcas(SB)
//
-TEXT runtime∕internal∕atomic·armcas(SB),NOSPLIT,$0-13
+TEXT ·armcas(SB),NOSPLIT,$0-13
MOVW ptr+0(FP), R1
MOVW old+4(FP), R2
MOVW new+8(FP), R3
@@ -50,44 +51,50 @@ casfail:
// stubs
-TEXT runtime∕internal∕atomic·Loadp(SB),NOSPLIT|NOFRAME,$0-8
- B runtime∕internal∕atomic·Load(SB)
+TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-8
+ B ·Load(SB)
-TEXT runtime∕internal∕atomic·LoadAcq(SB),NOSPLIT|NOFRAME,$0-8
- B runtime∕internal∕atomic·Load(SB)
+TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-8
+ B ·Load(SB)
-TEXT runtime∕internal∕atomic·Casuintptr(SB),NOSPLIT,$0-13
- B runtime∕internal∕atomic·Cas(SB)
+TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-8
+ B ·Load(SB)
-TEXT runtime∕internal∕atomic·Casp1(SB),NOSPLIT,$0-13
- B runtime∕internal∕atomic·Cas(SB)
+TEXT ·Casuintptr(SB),NOSPLIT,$0-13
+ B ·Cas(SB)
-TEXT runtime∕internal∕atomic·CasRel(SB),NOSPLIT,$0-13
- B runtime∕internal∕atomic·Cas(SB)
+TEXT ·Casp1(SB),NOSPLIT,$0-13
+ B ·Cas(SB)
-TEXT runtime∕internal∕atomic·Loaduintptr(SB),NOSPLIT,$0-8
- B runtime∕internal∕atomic·Load(SB)
+TEXT ·CasRel(SB),NOSPLIT,$0-13
+ B ·Cas(SB)
-TEXT runtime∕internal∕atomic·Loaduint(SB),NOSPLIT,$0-8
- B runtime∕internal∕atomic·Load(SB)
+TEXT ·Loaduintptr(SB),NOSPLIT,$0-8
+ B ·Load(SB)
-TEXT runtime∕internal∕atomic·Storeuintptr(SB),NOSPLIT,$0-8
- B runtime∕internal∕atomic·Store(SB)
+TEXT ·Loaduint(SB),NOSPLIT,$0-8
+ B ·Load(SB)
-TEXT runtime∕internal∕atomic·StorepNoWB(SB),NOSPLIT,$0-8
- B runtime∕internal∕atomic·Store(SB)
+TEXT ·Storeuintptr(SB),NOSPLIT,$0-8
+ B ·Store(SB)
-TEXT runtime∕internal∕atomic·StoreRel(SB),NOSPLIT,$0-8
- B runtime∕internal∕atomic·Store(SB)
+TEXT ·StorepNoWB(SB),NOSPLIT,$0-8
+ B ·Store(SB)
-TEXT runtime∕internal∕atomic·Xadduintptr(SB),NOSPLIT,$0-12
- B runtime∕internal∕atomic·Xadd(SB)
+TEXT ·StoreRel(SB),NOSPLIT,$0-8
+ B ·Store(SB)
-TEXT runtime∕internal∕atomic·Loadint64(SB),NOSPLIT,$0-12
- B runtime∕internal∕atomic·Load64(SB)
+TEXT ·StoreReluintptr(SB),NOSPLIT,$0-8
+ B ·Store(SB)
-TEXT runtime∕internal∕atomic·Xaddint64(SB),NOSPLIT,$0-20
- B runtime∕internal∕atomic·Xadd64(SB)
+TEXT ·Xadduintptr(SB),NOSPLIT,$0-12
+ B ·Xadd(SB)
+
+TEXT ·Loadint64(SB),NOSPLIT,$0-12
+ B ·Load64(SB)
+
+TEXT ·Xaddint64(SB),NOSPLIT,$0-20
+ B ·Xadd64(SB)
// 64-bit atomics
// The native ARM implementations use LDREXD/STREXD, which are
@@ -95,12 +102,8 @@ TEXT runtime∕internal∕atomic·Xaddint64(SB),NOSPLIT,$0-20
// On older ARM, we use Go implementations which simulate 64-bit
// atomics with locks.
-TEXT armCas64<>(SB),NOSPLIT,$0-21
- MOVW addr+0(FP), R1
- // make unaligned atomic access panic
- AND.S $7, R1, R2
- BEQ 2(PC)
- MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2.
+TEXT armCas64<>(SB),NOSPLIT,$0-21
+ // addr is already in R1
MOVW old_lo+4(FP), R2
MOVW old_hi+8(FP), R3
MOVW new_lo+12(FP), R4
@@ -128,12 +131,8 @@ cas64fail:
MOVBU R0, swapped+20(FP)
RET
-TEXT armXadd64<>(SB),NOSPLIT,$0-20
- MOVW addr+0(FP), R1
- // make unaligned atomic access panic
- AND.S $7, R1, R2
- BEQ 2(PC)
- MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2.
+TEXT armXadd64<>(SB),NOSPLIT,$0-20
+ // addr is already in R1
MOVW delta_lo+4(FP), R2
MOVW delta_hi+8(FP), R3
@@ -154,12 +153,8 @@ add64loop:
MOVW R5, new_hi+16(FP)
RET
-TEXT armXchg64<>(SB),NOSPLIT,$0-20
- MOVW addr+0(FP), R1
- // make unaligned atomic access panic
- AND.S $7, R1, R2
- BEQ 2(PC)
- MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2.
+TEXT armXchg64<>(SB),NOSPLIT,$0-20
+ // addr is already in R1
MOVW new_lo+4(FP), R2
MOVW new_hi+8(FP), R3
@@ -178,12 +173,8 @@ swap64loop:
MOVW R5, old_hi+16(FP)
RET
-TEXT armLoad64<>(SB),NOSPLIT,$0-12
- MOVW addr+0(FP), R1
- // make unaligned atomic access panic
- AND.S $7, R1, R2
- BEQ 2(PC)
- MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2.
+TEXT armLoad64<>(SB),NOSPLIT,$0-12
+ // addr is already in R1
LDREXD (R1), R2 // loads R2 and R3
DMB MB_ISH
@@ -192,12 +183,8 @@ TEXT armLoad64<>(SB),NOSPLIT,$0-12
MOVW R3, val_hi+8(FP)
RET
-TEXT armStore64<>(SB),NOSPLIT,$0-12
- MOVW addr+0(FP), R1
- // make unaligned atomic access panic
- AND.S $7, R1, R2
- BEQ 2(PC)
- MOVW R2, (R2) // crash. AND.S above left only low 3 bits in R2.
+TEXT armStore64<>(SB),NOSPLIT,$0-12
+ // addr is already in R1
MOVW val_lo+4(FP), R2
MOVW val_hi+8(FP), R3
@@ -213,35 +200,83 @@ store64loop:
DMB MB_ISH
RET
-TEXT ·Cas64(SB),NOSPLIT,$0-21
+// The following functions all panic if their address argument isn't
+// 8-byte aligned. Since we're calling back into Go code to do this,
+// we have to cooperate with stack unwinding. In the normal case, the
+// functions tail-call into the appropriate implementation, which
+// means they must not open a frame. Hence, when they go down the
+// panic path, at that point they push the LR to create a real frame
+// (they don't need to pop it because panic won't return).
+
+TEXT ·Cas64(SB),NOSPLIT,$-4-21
+ NO_LOCAL_POINTERS
+ MOVW addr+0(FP), R1
+ // make unaligned atomic access panic
+ AND.S $7, R1, R2
+ BEQ 3(PC)
+ MOVW.W R14, -4(R13) // prepare a real frame
+ BL ·panicUnaligned(SB)
+
MOVB runtime·goarm(SB), R11
CMP $7, R11
BLT 2(PC)
JMP armCas64<>(SB)
JMP ·goCas64(SB)
-TEXT ·Xadd64(SB),NOSPLIT,$0-20
+TEXT ·Xadd64(SB),NOSPLIT,$-4-20
+ NO_LOCAL_POINTERS
+ MOVW addr+0(FP), R1
+ // make unaligned atomic access panic
+ AND.S $7, R1, R2
+ BEQ 3(PC)
+ MOVW.W R14, -4(R13) // prepare a real frame
+ BL ·panicUnaligned(SB)
+
MOVB runtime·goarm(SB), R11
CMP $7, R11
BLT 2(PC)
JMP armXadd64<>(SB)
JMP ·goXadd64(SB)
-TEXT ·Xchg64(SB),NOSPLIT,$0-20
+TEXT ·Xchg64(SB),NOSPLIT,$-4-20
+ NO_LOCAL_POINTERS
+ MOVW addr+0(FP), R1
+ // make unaligned atomic access panic
+ AND.S $7, R1, R2
+ BEQ 3(PC)
+ MOVW.W R14, -4(R13) // prepare a real frame
+ BL ·panicUnaligned(SB)
+
MOVB runtime·goarm(SB), R11
CMP $7, R11
BLT 2(PC)
JMP armXchg64<>(SB)
JMP ·goXchg64(SB)
-TEXT ·Load64(SB),NOSPLIT,$0-12
+TEXT ·Load64(SB),NOSPLIT,$-4-12
+ NO_LOCAL_POINTERS
+ MOVW addr+0(FP), R1
+ // make unaligned atomic access panic
+ AND.S $7, R1, R2
+ BEQ 3(PC)
+ MOVW.W R14, -4(R13) // prepare a real frame
+ BL ·panicUnaligned(SB)
+
MOVB runtime·goarm(SB), R11
CMP $7, R11
BLT 2(PC)
JMP armLoad64<>(SB)
JMP ·goLoad64(SB)
-TEXT ·Store64(SB),NOSPLIT,$0-12
+TEXT ·Store64(SB),NOSPLIT,$-4-12
+ NO_LOCAL_POINTERS
+ MOVW addr+0(FP), R1
+ // make unaligned atomic access panic
+ AND.S $7, R1, R2
+ BEQ 3(PC)
+ MOVW.W R14, -4(R13) // prepare a real frame
+ BL ·panicUnaligned(SB)
+
MOVB runtime·goarm(SB), R11
CMP $7, R11
BLT 2(PC)
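
From a caller's point of view, the change in this file (and the matching ones for 386 and 32-bit mips) turns a misaligned 64-bit atomic from a fabricated nil-pointer crash into a regular Go panic with the message defined in unaligned.go further down. A small illustration using sync/atomic, which forwards to these runtime primitives; the deliberately odd pointer is contrived for the example, and on 64-bit platforms the call typically succeeds instead of panicking.

    package main

    import (
        "fmt"
        "sync/atomic"
        "unsafe"
    )

    func main() {
        defer func() {
            if err := recover(); err != nil {
                // On 386, arm, and 32-bit mips this now prints
                // "unaligned 64-bit atomic operation".
                fmt.Println("recovered:", err)
            }
        }()
        buf := make([]byte, 16)
        p := (*uint64)(unsafe.Pointer(&buf[1])) // deliberately not 8-byte aligned
        atomic.AddUint64(p, 1)
        fmt.Println("no panic on this platform")
    }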
diff --git a/src/runtime/internal/atomic/asm_mips64x.s b/src/runtime/internal/atomic/asm_mips64x.s
index 3290fb726a..a515683ebb 100644
--- a/src/runtime/internal/atomic/asm_mips64x.s
+++ b/src/runtime/internal/atomic/asm_mips64x.s
@@ -158,6 +158,12 @@ TEXT ·StorepNoWB(SB), NOSPLIT, $0-16
TEXT ·StoreRel(SB), NOSPLIT, $0-12
JMP ·Store(SB)
+TEXT ·StoreRel64(SB), NOSPLIT, $0-16
+ JMP ·Store64(SB)
+
+TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16
+ JMP ·Store64(SB)
+
TEXT ·Store(SB), NOSPLIT, $0-12
MOVV ptr+0(FP), R1
MOVW val+8(FP), R2
@@ -237,3 +243,29 @@ TEXT ·And8(SB), NOSPLIT, $0-9
BEQ R4, -4(PC)
SYNC
RET
+
+// func Or(addr *uint32, v uint32)
+TEXT ·Or(SB), NOSPLIT, $0-12
+ MOVV ptr+0(FP), R1
+ MOVW val+8(FP), R2
+
+ SYNC
+ LL (R1), R3
+ OR R2, R3
+ SC R3, (R1)
+ BEQ R3, -4(PC)
+ SYNC
+ RET
+
+// func And(addr *uint32, v uint32)
+TEXT ·And(SB), NOSPLIT, $0-12
+ MOVV ptr+0(FP), R1
+ MOVW val+8(FP), R2
+
+ SYNC
+ LL (R1), R3
+ AND R2, R3
+ SC R3, (R1)
+ BEQ R3, -4(PC)
+ SYNC
+ RET
diff --git a/src/runtime/internal/atomic/asm_mipsx.s b/src/runtime/internal/atomic/asm_mipsx.s
index 62811a6599..2b2cfabe08 100644
--- a/src/runtime/internal/atomic/asm_mipsx.s
+++ b/src/runtime/internal/atomic/asm_mipsx.s
@@ -122,6 +122,9 @@ TEXT ·StorepNoWB(SB),NOSPLIT,$0-8
TEXT ·StoreRel(SB),NOSPLIT,$0-8
JMP ·Store(SB)
+TEXT ·StoreReluintptr(SB),NOSPLIT,$0-8
+ JMP ·Store(SB)
+
// void Or8(byte volatile*, byte);
TEXT ·Or8(SB),NOSPLIT,$0-5
MOVW ptr+0(FP), R1
@@ -169,3 +172,29 @@ try_and8:
BEQ R4, try_and8
SYNC
RET
+
+// func Or(addr *uint32, v uint32)
+TEXT ·Or(SB), NOSPLIT, $0-8
+ MOVW ptr+0(FP), R1
+ MOVW val+4(FP), R2
+
+ SYNC
+ LL (R1), R3
+ OR R2, R3
+ SC R3, (R1)
+ BEQ R3, -4(PC)
+ SYNC
+ RET
+
+// func And(addr *uint32, v uint32)
+TEXT ·And(SB), NOSPLIT, $0-8
+ MOVW ptr+0(FP), R1
+ MOVW val+4(FP), R2
+
+ SYNC
+ LL (R1), R3
+ AND R2, R3
+ SC R3, (R1)
+ BEQ R3, -4(PC)
+ SYNC
+ RET
diff --git a/src/runtime/internal/atomic/asm_ppc64x.s b/src/runtime/internal/atomic/asm_ppc64x.s
index 06dc931bf4..bb009ab34d 100644
--- a/src/runtime/internal/atomic/asm_ppc64x.s
+++ b/src/runtime/internal/atomic/asm_ppc64x.s
@@ -83,12 +83,18 @@ TEXT runtime∕internal∕atomic·Casuintptr(SB), NOSPLIT, $0-25
TEXT runtime∕internal∕atomic·Loaduintptr(SB), NOSPLIT|NOFRAME, $0-16
BR runtime∕internal∕atomic·Load64(SB)
+TEXT runtime∕internal∕atomic·LoadAcquintptr(SB), NOSPLIT|NOFRAME, $0-16
+ BR runtime∕internal∕atomic·LoadAcq64(SB)
+
TEXT runtime∕internal∕atomic·Loaduint(SB), NOSPLIT|NOFRAME, $0-16
BR runtime∕internal∕atomic·Load64(SB)
TEXT runtime∕internal∕atomic·Storeuintptr(SB), NOSPLIT, $0-16
BR runtime∕internal∕atomic·Store64(SB)
+TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16
+ BR runtime∕internal∕atomic·StoreRel64(SB)
+
TEXT runtime∕internal∕atomic·Xadduintptr(SB), NOSPLIT, $0-24
BR runtime∕internal∕atomic·Xadd64(SB)
@@ -191,6 +197,13 @@ TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12
MOVW R4, 0(R3)
RET
+TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16
+ MOVD ptr+0(FP), R3
+ MOVD val+8(FP), R4
+ LWSYNC
+ MOVD R4, 0(R3)
+ RET
+
// void runtime∕internal∕atomic·Or8(byte volatile*, byte);
TEXT runtime∕internal∕atomic·Or8(SB), NOSPLIT, $0-9
MOVD ptr+0(FP), R3
@@ -209,8 +222,32 @@ TEXT runtime∕internal∕atomic·And8(SB), NOSPLIT, $0-9
MOVBZ val+8(FP), R4
LWSYNC
again:
- LBAR (R3),R6
- AND R4,R6
- STBCCC R6,(R3)
+ LBAR (R3), R6
+ AND R4, R6
+ STBCCC R6, (R3)
+ BNE again
+ RET
+
+// func Or(addr *uint32, v uint32)
+TEXT runtime∕internal∕atomic·Or(SB), NOSPLIT, $0-12
+ MOVD ptr+0(FP), R3
+ MOVW val+8(FP), R4
+ LWSYNC
+again:
+ LWAR (R3), R6
+ OR R4, R6
+ STWCCC R6, (R3)
+ BNE again
+ RET
+
+// func And(addr *uint32, v uint32)
+TEXT runtime∕internal∕atomic·And(SB), NOSPLIT, $0-12
+ MOVD ptr+0(FP), R3
+ MOVW val+8(FP), R4
+ LWSYNC
+again:
+ LWAR (R3), R6
+ AND R4, R6
+ STWCCC R6, (R3)
BNE again
RET
diff --git a/src/runtime/internal/atomic/asm_s390x.s b/src/runtime/internal/atomic/asm_s390x.s
index 9a19bc0ece..daf1f3cc9f 100644
--- a/src/runtime/internal/atomic/asm_s390x.s
+++ b/src/runtime/internal/atomic/asm_s390x.s
@@ -174,8 +174,8 @@ TEXT ·Xchguintptr(SB), NOSPLIT, $0-24
// func Or8(addr *uint8, v uint8)
TEXT ·Or8(SB), NOSPLIT, $0-9
- MOVD ptr+0(FP), R3
- MOVBZ val+8(FP), R4
+ MOVD ptr+0(FP), R3
+ MOVBZ val+8(FP), R4
// We don't have atomic operations that work on individual bytes so we
// need to align addr down to a word boundary and create a mask
// containing v to OR with the entire word atomically.
@@ -188,8 +188,8 @@ TEXT ·Or8(SB), NOSPLIT, $0-9
// func And8(addr *uint8, v uint8)
TEXT ·And8(SB), NOSPLIT, $0-9
- MOVD ptr+0(FP), R3
- MOVBZ val+8(FP), R4
+ MOVD ptr+0(FP), R3
+ MOVBZ val+8(FP), R4
// We don't have atomic operations that work on individual bytes so we
// need to align addr down to a word boundary and create a mask
// containing v to AND with the entire word atomically.
@@ -200,3 +200,17 @@ TEXT ·And8(SB), NOSPLIT, $0-9
RLL R5, R4, R4 // R4 = rotl(R4, R5)
LAN R4, R6, 0(R3) // R6 = *R3; *R3 &= R4; (atomic)
RET
+
+// func Or(addr *uint32, v uint32)
+TEXT ·Or(SB), NOSPLIT, $0-12
+ MOVD ptr+0(FP), R3
+ MOVW val+8(FP), R4
+ LAO R4, R6, 0(R3) // R6 = *R3; *R3 |= R4; (atomic)
+ RET
+
+// func And(addr *uint32, v uint32)
+TEXT ·And(SB), NOSPLIT, $0-12
+ MOVD ptr+0(FP), R3
+ MOVW val+8(FP), R4
+ LAN R4, R6, 0(R3) // R6 = *R3; *R3 &= R4; (atomic)
+ RET
diff --git a/src/runtime/internal/atomic/atomic_386.go b/src/runtime/internal/atomic/atomic_386.go
index 8d002ebfe3..1bfcb1143d 100644
--- a/src/runtime/internal/atomic/atomic_386.go
+++ b/src/runtime/internal/atomic/atomic_386.go
@@ -30,6 +30,12 @@ func LoadAcq(ptr *uint32) uint32 {
return *ptr
}
+//go:nosplit
+//go:noinline
+func LoadAcquintptr(ptr *uintptr) uintptr {
+ return *ptr
+}
+
//go:noescape
func Xadd64(ptr *uint64, delta int64) uint64
@@ -63,6 +69,12 @@ func And8(ptr *uint8, val uint8)
//go:noescape
func Or8(ptr *uint8, val uint8)
+//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
// NOTE: Do not add atomicxor8 (XOR is not idempotent).
//go:noescape
@@ -83,5 +95,8 @@ func Store64(ptr *uint64, val uint64)
//go:noescape
func StoreRel(ptr *uint32, val uint32)
+//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
+
// NO go:noescape annotation; see atomic_pointer.go.
func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/internal/atomic/atomic_amd64.go b/src/runtime/internal/atomic/atomic_amd64.go
index 14b8101720..e36eb83a11 100644
--- a/src/runtime/internal/atomic/atomic_amd64.go
+++ b/src/runtime/internal/atomic/atomic_amd64.go
@@ -35,6 +35,18 @@ func LoadAcq(ptr *uint32) uint32 {
return *ptr
}
+//go:nosplit
+//go:noinline
+func LoadAcq64(ptr *uint64) uint64 {
+ return *ptr
+}
+
+//go:nosplit
+//go:noinline
+func LoadAcquintptr(ptr *uintptr) uintptr {
+ return *ptr
+}
+
//go:noescape
func Xadd(ptr *uint32, delta int32) uint32
@@ -65,6 +77,12 @@ func And8(ptr *uint8, val uint8)
//go:noescape
func Or8(ptr *uint8, val uint8)
+//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
// NOTE: Do not add atomicxor8 (XOR is not idempotent).
//go:noescape
@@ -85,6 +103,12 @@ func Store64(ptr *uint64, val uint64)
//go:noescape
func StoreRel(ptr *uint32, val uint32)
+//go:noescape
+func StoreRel64(ptr *uint64, val uint64)
+
+//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
+
// StorepNoWB performs *ptr = val atomically and without a write
// barrier.
//
diff --git a/src/runtime/internal/atomic/atomic_arm.go b/src/runtime/internal/atomic/atomic_arm.go
index 95713afcc1..546b3d6120 100644
--- a/src/runtime/internal/atomic/atomic_arm.go
+++ b/src/runtime/internal/atomic/atomic_arm.go
@@ -81,6 +81,9 @@ func Store(addr *uint32, v uint32)
//go:noescape
func StoreRel(addr *uint32, v uint32)
+//go:noescape
+func StoreReluintptr(addr *uintptr, v uintptr)
+
//go:nosplit
func goCas64(addr *uint64, old, new uint64) bool {
if uintptr(unsafe.Pointer(addr))&7 != 0 {
@@ -180,6 +183,26 @@ func And8(addr *uint8, v uint8) {
}
//go:nosplit
+func Or(addr *uint32, v uint32) {
+ for {
+ old := *addr
+ if Cas(addr, old, old|v) {
+ return
+ }
+ }
+}
+
+//go:nosplit
+func And(addr *uint32, v uint32) {
+ for {
+ old := *addr
+ if Cas(addr, old, old&v) {
+ return
+ }
+ }
+}
+
+//go:nosplit
func armcas(ptr *uint32, old, new uint32) bool
//go:noescape
@@ -195,6 +218,9 @@ func Load8(addr *uint8) uint8
func LoadAcq(addr *uint32) uint32
//go:noescape
+func LoadAcquintptr(ptr *uintptr) uintptr
+
+//go:noescape
func Cas64(addr *uint64, old, new uint64) bool
//go:noescape
diff --git a/src/runtime/internal/atomic/atomic_arm64.go b/src/runtime/internal/atomic/atomic_arm64.go
index 26ca94d54c..d49bee8936 100644
--- a/src/runtime/internal/atomic/atomic_arm64.go
+++ b/src/runtime/internal/atomic/atomic_arm64.go
@@ -42,12 +42,24 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer
func LoadAcq(addr *uint32) uint32
//go:noescape
+func LoadAcq64(ptr *uint64) uint64
+
+//go:noescape
+func LoadAcquintptr(ptr *uintptr) uintptr
+
+//go:noescape
func Or8(ptr *uint8, val uint8)
//go:noescape
func And8(ptr *uint8, val uint8)
//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
+//go:noescape
func Cas64(ptr *uint64, old, new uint64) bool
//go:noescape
@@ -67,3 +79,9 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
//go:noescape
func StoreRel(ptr *uint32, val uint32)
+
+//go:noescape
+func StoreRel64(ptr *uint64, val uint64)
+
+//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
diff --git a/src/runtime/internal/atomic/atomic_arm64.s b/src/runtime/internal/atomic/atomic_arm64.s
index a2eb7568d2..0cf3c40223 100644
--- a/src/runtime/internal/atomic/atomic_arm64.s
+++ b/src/runtime/internal/atomic/atomic_arm64.s
@@ -36,12 +36,26 @@ TEXT ·Loadp(SB),NOSPLIT,$0-16
TEXT ·LoadAcq(SB),NOSPLIT,$0-12
B ·Load(SB)
+// uint64 runtime∕internal∕atomic·LoadAcq64(uint64 volatile* addr)
+TEXT ·LoadAcq64(SB),NOSPLIT,$0-16
+ B ·Load64(SB)
+
+// uintptr runtime∕internal∕atomic·LoadAcquintptr(uintptr volatile* addr)
+TEXT ·LoadAcquintptr(SB),NOSPLIT,$0-16
+ B ·Load64(SB)
+
TEXT runtime∕internal∕atomic·StorepNoWB(SB), NOSPLIT, $0-16
B runtime∕internal∕atomic·Store64(SB)
TEXT runtime∕internal∕atomic·StoreRel(SB), NOSPLIT, $0-12
B runtime∕internal∕atomic·Store(SB)
+TEXT runtime∕internal∕atomic·StoreRel64(SB), NOSPLIT, $0-16
+ B runtime∕internal∕atomic·Store64(SB)
+
+TEXT runtime∕internal∕atomic·StoreReluintptr(SB), NOSPLIT, $0-16
+ B runtime∕internal∕atomic·Store64(SB)
+
TEXT runtime∕internal∕atomic·Store(SB), NOSPLIT, $0-12
MOVD ptr+0(FP), R0
MOVW val+8(FP), R1
@@ -150,3 +164,22 @@ TEXT ·Or8(SB), NOSPLIT, $0-9
CBNZ R3, -3(PC)
RET
+// func And(addr *uint32, v uint32)
+TEXT ·And(SB), NOSPLIT, $0-12
+ MOVD ptr+0(FP), R0
+ MOVW val+8(FP), R1
+ LDAXRW (R0), R2
+ AND R1, R2
+ STLXRW R2, (R0), R3
+ CBNZ R3, -3(PC)
+ RET
+
+// func Or(addr *uint32, v uint32)
+TEXT ·Or(SB), NOSPLIT, $0-12
+ MOVD ptr+0(FP), R0
+ MOVW val+8(FP), R1
+ LDAXRW (R0), R2
+ ORR R1, R2
+ STLXRW R2, (R0), R3
+ CBNZ R3, -3(PC)
+ RET
diff --git a/src/runtime/internal/atomic/atomic_mips64x.go b/src/runtime/internal/atomic/atomic_mips64x.go
index 1d9977850b..b0109d72b0 100644
--- a/src/runtime/internal/atomic/atomic_mips64x.go
+++ b/src/runtime/internal/atomic/atomic_mips64x.go
@@ -42,6 +42,12 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer
func LoadAcq(ptr *uint32) uint32
//go:noescape
+func LoadAcq64(ptr *uint64) uint64
+
+//go:noescape
+func LoadAcquintptr(ptr *uintptr) uintptr
+
+//go:noescape
func And8(ptr *uint8, val uint8)
//go:noescape
@@ -50,6 +56,12 @@ func Or8(ptr *uint8, val uint8)
// NOTE: Do not add atomicxor8 (XOR is not idempotent).
//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
+//go:noescape
func Cas64(ptr *uint64, old, new uint64) bool
//go:noescape
@@ -69,3 +81,9 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
//go:noescape
func StoreRel(ptr *uint32, val uint32)
+
+//go:noescape
+func StoreRel64(ptr *uint64, val uint64)
+
+//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
diff --git a/src/runtime/internal/atomic/atomic_mips64x.s b/src/runtime/internal/atomic/atomic_mips64x.s
index 1ed90937c9..125c0c221c 100644
--- a/src/runtime/internal/atomic/atomic_mips64x.s
+++ b/src/runtime/internal/atomic/atomic_mips64x.s
@@ -47,3 +47,11 @@ TEXT ·Loadp(SB),NOSPLIT|NOFRAME,$0-16
// uint32 runtime∕internal∕atomic·LoadAcq(uint32 volatile* ptr)
TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12
JMP atomic·Load(SB)
+
+// uint64 runtime∕internal∕atomic·LoadAcq64(uint64 volatile* ptr)
+TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$0-16
+ JMP atomic·Load64(SB)
+
+// uintptr runtime∕internal∕atomic·LoadAcquintptr(uintptr volatile* ptr)
+TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-16
+ JMP atomic·Load64(SB)
diff --git a/src/runtime/internal/atomic/atomic_mipsx.go b/src/runtime/internal/atomic/atomic_mipsx.go
index 0e2d77ade1..1336b50121 100644
--- a/src/runtime/internal/atomic/atomic_mipsx.go
+++ b/src/runtime/internal/atomic/atomic_mipsx.go
@@ -34,7 +34,7 @@ func spinUnlock(state *uint32)
func lockAndCheck(addr *uint64) {
// ensure 8-byte alignment
if uintptr(unsafe.Pointer(addr))&7 != 0 {
- addr = nil
+ panicUnaligned()
}
// force dereference before taking lock
_ = *addr
@@ -133,12 +133,21 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer
func LoadAcq(ptr *uint32) uint32
//go:noescape
+func LoadAcquintptr(ptr *uintptr) uintptr
+
+//go:noescape
func And8(ptr *uint8, val uint8)
//go:noescape
func Or8(ptr *uint8, val uint8)
//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
+//go:noescape
func Store(ptr *uint32, val uint32)
//go:noescape
@@ -151,4 +160,7 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
func StoreRel(ptr *uint32, val uint32)
//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
+
+//go:noescape
func CasRel(addr *uint32, old, new uint32) bool
diff --git a/src/runtime/internal/atomic/atomic_ppc64x.go b/src/runtime/internal/atomic/atomic_ppc64x.go
index a48ecf5ee8..e4b109f0ec 100644
--- a/src/runtime/internal/atomic/atomic_ppc64x.go
+++ b/src/runtime/internal/atomic/atomic_ppc64x.go
@@ -42,6 +42,12 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer
func LoadAcq(ptr *uint32) uint32
//go:noescape
+func LoadAcq64(ptr *uint64) uint64
+
+//go:noescape
+func LoadAcquintptr(ptr *uintptr) uintptr
+
+//go:noescape
func And8(ptr *uint8, val uint8)
//go:noescape
@@ -50,6 +56,12 @@ func Or8(ptr *uint8, val uint8)
// NOTE: Do not add atomicxor8 (XOR is not idempotent).
//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
+//go:noescape
func Cas64(ptr *uint64, old, new uint64) bool
//go:noescape
@@ -67,5 +79,11 @@ func Store64(ptr *uint64, val uint64)
//go:noescape
func StoreRel(ptr *uint32, val uint32)
+//go:noescape
+func StoreRel64(ptr *uint64, val uint64)
+
+//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
+
// NO go:noescape annotation; see atomic_pointer.go.
func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
diff --git a/src/runtime/internal/atomic/atomic_ppc64x.s b/src/runtime/internal/atomic/atomic_ppc64x.s
index c2f696fb34..b79cdbca34 100644
--- a/src/runtime/internal/atomic/atomic_ppc64x.s
+++ b/src/runtime/internal/atomic/atomic_ppc64x.s
@@ -6,6 +6,15 @@
#include "textflag.h"
+
+// For more details about how various memory models are
+// enforced on POWER, see the following paper, which describes
+// how C/C++-like models are implemented there and gives context
+// for why the strange-looking code sequences below work.
+//
+// http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html
+
// uint32 runtime∕internal∕atomic·Load(uint32 volatile* ptr)
TEXT ·Load(SB),NOSPLIT|NOFRAME,$-8-12
MOVD ptr+0(FP), R3
@@ -56,5 +65,16 @@ TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$-8-12
MOVWZ 0(R3), R3
CMPW R3, R3, CR7
BC 4, 30, 1(PC) // bne- cr7, 0x4
+ ISYNC
MOVW R3, ret+8(FP)
RET
+
+// uint64 runtime∕internal∕atomic·LoadAcq64(uint64 volatile* ptr)
+TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$-8-16
+ MOVD ptr+0(FP), R3
+ MOVD 0(R3), R3
+ CMP R3, R3, CR7
+ BC 4, 30, 1(PC) // bne- cr7, 0x4
+ ISYNC
+ MOVD R3, ret+8(FP)
+ RET
diff --git a/src/runtime/internal/atomic/atomic_riscv64.go b/src/runtime/internal/atomic/atomic_riscv64.go
index d52512369e..8f24d61625 100644
--- a/src/runtime/internal/atomic/atomic_riscv64.go
+++ b/src/runtime/internal/atomic/atomic_riscv64.go
@@ -40,12 +40,24 @@ func Loadp(ptr unsafe.Pointer) unsafe.Pointer
func LoadAcq(ptr *uint32) uint32
//go:noescape
+func LoadAcq64(ptr *uint64) uint64
+
+//go:noescape
+func LoadAcquintptr(ptr *uintptr) uintptr
+
+//go:noescape
func Or8(ptr *uint8, val uint8)
//go:noescape
func And8(ptr *uint8, val uint8)
//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
+//go:noescape
func Cas64(ptr *uint64, old, new uint64) bool
//go:noescape
@@ -65,3 +77,9 @@ func StorepNoWB(ptr unsafe.Pointer, val unsafe.Pointer)
//go:noescape
func StoreRel(ptr *uint32, val uint32)
+
+//go:noescape
+func StoreRel64(ptr *uint64, val uint64)
+
+//go:noescape
+func StoreReluintptr(ptr *uintptr, val uintptr)
diff --git a/src/runtime/internal/atomic/atomic_riscv64.s b/src/runtime/internal/atomic/atomic_riscv64.s
index d005325ca3..74c896cea6 100644
--- a/src/runtime/internal/atomic/atomic_riscv64.s
+++ b/src/runtime/internal/atomic/atomic_riscv64.s
@@ -150,6 +150,12 @@ TEXT ·Xaddint64(SB),NOSPLIT,$0-24
TEXT ·LoadAcq(SB),NOSPLIT|NOFRAME,$0-12
JMP ·Load(SB)
+TEXT ·LoadAcq64(SB),NOSPLIT|NOFRAME,$0-16
+ JMP ·Load64(SB)
+
+TEXT ·LoadAcquintptr(SB),NOSPLIT|NOFRAME,$0-16
+ JMP ·Load64(SB)
+
// func Loadp(ptr unsafe.Pointer) unsafe.Pointer
TEXT ·Loadp(SB),NOSPLIT,$0-16
JMP ·Load64(SB)
@@ -161,6 +167,12 @@ TEXT ·StorepNoWB(SB), NOSPLIT, $0-16
TEXT ·StoreRel(SB), NOSPLIT, $0-12
JMP ·Store(SB)
+TEXT ·StoreRel64(SB), NOSPLIT, $0-16
+ JMP ·Store64(SB)
+
+TEXT ·StoreReluintptr(SB), NOSPLIT, $0-16
+ JMP ·Store64(SB)
+
// func Xchg(ptr *uint32, new uint32) uint32
TEXT ·Xchg(SB), NOSPLIT, $0-20
MOV ptr+0(FP), A0
@@ -230,3 +242,17 @@ TEXT ·Or8(SB), NOSPLIT, $0-9
SLL A2, A1
AMOORW A1, (A0), ZERO
RET
+
+// func And(ptr *uint32, val uint32)
+TEXT ·And(SB), NOSPLIT, $0-12
+ MOV ptr+0(FP), A0
+ MOVW val+8(FP), A1
+ AMOANDW A1, (A0), ZERO
+ RET
+
+// func Or(ptr *uint32, val uint32)
+TEXT ·Or(SB), NOSPLIT, $0-12
+ MOV ptr+0(FP), A0
+ MOVW val+8(FP), A1
+ AMOORW A1, (A0), ZERO
+ RET
diff --git a/src/runtime/internal/atomic/atomic_s390x.go b/src/runtime/internal/atomic/atomic_s390x.go
index 4d73b39baf..a058d60102 100644
--- a/src/runtime/internal/atomic/atomic_s390x.go
+++ b/src/runtime/internal/atomic/atomic_s390x.go
@@ -41,6 +41,18 @@ func LoadAcq(ptr *uint32) uint32 {
return *ptr
}
+//go:nosplit
+//go:noinline
+func LoadAcq64(ptr *uint64) uint64 {
+ return *ptr
+}
+
+//go:nosplit
+//go:noinline
+func LoadAcquintptr(ptr *uintptr) uintptr {
+ return *ptr
+}
+
//go:noescape
func Store(ptr *uint32, val uint32)
@@ -59,6 +71,18 @@ func StoreRel(ptr *uint32, val uint32) {
*ptr = val
}
+//go:nosplit
+//go:noinline
+func StoreRel64(ptr *uint64, val uint64) {
+ *ptr = val
+}
+
+//go:nosplit
+//go:noinline
+func StoreReluintptr(ptr *uintptr, val uintptr) {
+ *ptr = val
+}
+
//go:noescape
func And8(ptr *uint8, val uint8)
@@ -68,6 +92,12 @@ func Or8(ptr *uint8, val uint8)
// NOTE: Do not add atomicxor8 (XOR is not idempotent).
//go:noescape
+func And(ptr *uint32, val uint32)
+
+//go:noescape
+func Or(ptr *uint32, val uint32)
+
+//go:noescape
func Xadd(ptr *uint32, delta int32) uint32
//go:noescape
diff --git a/src/runtime/internal/atomic/atomic_test.go b/src/runtime/internal/atomic/atomic_test.go
index b0a8fa0610..c9c2eba248 100644
--- a/src/runtime/internal/atomic/atomic_test.go
+++ b/src/runtime/internal/atomic/atomic_test.go
@@ -73,8 +73,15 @@ func TestXadduintptrOnUint64(t *testing.T) {
func shouldPanic(t *testing.T, name string, f func()) {
defer func() {
- if recover() == nil {
+ // Check that all GC maps are sane.
+ runtime.GC()
+
+ err := recover()
+ want := "unaligned 64-bit atomic operation"
+ if err == nil {
t.Errorf("%s did not panic", name)
+ } else if s, _ := err.(string); s != want {
+ t.Errorf("%s: wanted panic %q, got %q", name, want, err)
}
}()
f()
@@ -143,6 +150,45 @@ func TestAnd8(t *testing.T) {
}
}
+func TestAnd(t *testing.T) {
+ // Basic sanity check.
+ x := uint32(0xffffffff)
+ for i := uint32(0); i < 32; i++ {
+ atomic.And(&x, ^(1 << i))
+ if r := uint32(0xffffffff) << (i + 1); x != r {
+ t.Fatalf("clearing bit %#x: want %#x, got %#x", uint32(1<<i), r, x)
+ }
+ }
+
+ // Set every bit in array to 1.
+ a := make([]uint32, 1<<12)
+ for i := range a {
+ a[i] = 0xffffffff
+ }
+
+ // Clear array bit-by-bit in different goroutines.
+ done := make(chan bool)
+ for i := 0; i < 32; i++ {
+ m := ^uint32(1 << i)
+ go func() {
+ for i := range a {
+ atomic.And(&a[i], m)
+ }
+ done <- true
+ }()
+ }
+ for i := 0; i < 32; i++ {
+ <-done
+ }
+
+ // Check that the array has been totally cleared.
+ for i, v := range a {
+ if v != 0 {
+ t.Fatalf("a[%v] not cleared: want %#x, got %#x", i, uint32(0), v)
+ }
+ }
+}
+
func TestOr8(t *testing.T) {
// Basic sanity check.
x := uint8(0)
@@ -179,7 +225,43 @@ func TestOr8(t *testing.T) {
}
}
-func TestBitwiseContended(t *testing.T) {
+func TestOr(t *testing.T) {
+ // Basic sanity check.
+ x := uint32(0)
+ for i := uint32(0); i < 32; i++ {
+ atomic.Or(&x, 1<<i)
+ if r := (uint32(1) << (i + 1)) - 1; x != r {
+ t.Fatalf("setting bit %#x: want %#x, got %#x", uint32(1)<<i, r, x)
+ }
+ }
+
+ // Start with every bit in array set to 0.
+ a := make([]uint32, 1<<12)
+
+ // Set every bit in array bit-by-bit in different goroutines.
+ done := make(chan bool)
+ for i := 0; i < 32; i++ {
+ m := uint32(1 << i)
+ go func() {
+ for i := range a {
+ atomic.Or(&a[i], m)
+ }
+ done <- true
+ }()
+ }
+ for i := 0; i < 32; i++ {
+ <-done
+ }
+
+ // Check that the array has been totally set.
+ for i, v := range a {
+ if v != 0xffffffff {
+ t.Fatalf("a[%v] not fully set: want %#x, got %#x", i, uint32(0xffffffff), v)
+ }
+ }
+}
+
+func TestBitwiseContended8(t *testing.T) {
// Start with every bit in array set to 0.
a := make([]uint8, 16)
@@ -221,6 +303,48 @@ func TestBitwiseContended(t *testing.T) {
}
}
+func TestBitwiseContended(t *testing.T) {
+ // Start with every bit in array set to 0.
+ a := make([]uint32, 16)
+
+ // Iterations to try.
+ N := 1 << 16
+ if testing.Short() {
+ N = 1 << 10
+ }
+
+ // Set and then clear every bit in the array bit-by-bit in different goroutines.
+ done := make(chan bool)
+ for i := 0; i < 32; i++ {
+ m := uint32(1 << i)
+ go func() {
+ for n := 0; n < N; n++ {
+ for i := range a {
+ atomic.Or(&a[i], m)
+ if atomic.Load(&a[i])&m != m {
+ t.Errorf("a[%v] bit %#x not set", i, m)
+ }
+ atomic.And(&a[i], ^m)
+ if atomic.Load(&a[i])&m != 0 {
+ t.Errorf("a[%v] bit %#x not clear", i, m)
+ }
+ }
+ }
+ done <- true
+ }()
+ }
+ for i := 0; i < 32; i++ {
+ <-done
+ }
+
+ // Check that the array has been totally cleared.
+ for i, v := range a {
+ if v != 0 {
+ t.Fatalf("a[%v] not cleared: want %#x, got %#x", i, uint32(0), v)
+ }
+ }
+}
+
func TestStorepNoWB(t *testing.T) {
var p [2]*int
for i := range p {
diff --git a/src/runtime/internal/atomic/atomic_wasm.go b/src/runtime/internal/atomic/atomic_wasm.go
index 2c0c3a8174..b05d98ed51 100644
--- a/src/runtime/internal/atomic/atomic_wasm.go
+++ b/src/runtime/internal/atomic/atomic_wasm.go
@@ -47,6 +47,18 @@ func LoadAcq(ptr *uint32) uint32 {
//go:nosplit
//go:noinline
+func LoadAcq64(ptr *uint64) uint64 {
+ return *ptr
+}
+
+//go:nosplit
+//go:noinline
+func LoadAcquintptr(ptr *uintptr) uintptr {
+ return *ptr
+}
+
+//go:nosplit
+//go:noinline
func Load8(ptr *uint8) uint8 {
return *ptr
}
@@ -121,6 +133,18 @@ func Or8(ptr *uint8, val uint8) {
//go:nosplit
//go:noinline
+func And(ptr *uint32, val uint32) {
+ *ptr = *ptr & val
+}
+
+//go:nosplit
+//go:noinline
+func Or(ptr *uint32, val uint32) {
+ *ptr = *ptr | val
+}
+
+//go:nosplit
+//go:noinline
func Cas64(ptr *uint64, old, new uint64) bool {
if *ptr == old {
*ptr = new
@@ -143,6 +167,18 @@ func StoreRel(ptr *uint32, val uint32) {
//go:nosplit
//go:noinline
+func StoreRel64(ptr *uint64, val uint64) {
+ *ptr = val
+}
+
+//go:nosplit
+//go:noinline
+func StoreReluintptr(ptr *uintptr, val uintptr) {
+ *ptr = val
+}
+
+//go:nosplit
+//go:noinline
func Store8(ptr *uint8, val uint8) {
*ptr = val
}
diff --git a/src/runtime/internal/atomic/bench_test.go b/src/runtime/internal/atomic/bench_test.go
index de71b0f2c7..434aa6d434 100644
--- a/src/runtime/internal/atomic/bench_test.go
+++ b/src/runtime/internal/atomic/bench_test.go
@@ -51,6 +51,14 @@ func BenchmarkAnd8(b *testing.B) {
}
}
+func BenchmarkAnd(b *testing.B) {
+ var x [128]uint32 // give x its own cache line
+ sink = &x
+ for i := 0; i < b.N; i++ {
+ atomic.And(&x[63], uint32(i))
+ }
+}
+
func BenchmarkAnd8Parallel(b *testing.B) {
var x [512]uint8 // give byte its own cache line
sink = &x
@@ -63,6 +71,18 @@ func BenchmarkAnd8Parallel(b *testing.B) {
})
}
+func BenchmarkAndParallel(b *testing.B) {
+ var x [128]uint32 // give x its own cache line
+ sink = &x
+ b.RunParallel(func(pb *testing.PB) {
+ i := uint32(0)
+ for pb.Next() {
+ atomic.And(&x[63], i)
+ i++
+ }
+ })
+}
+
func BenchmarkOr8(b *testing.B) {
var x [512]uint8 // give byte its own cache line
sink = &x
@@ -71,6 +91,14 @@ func BenchmarkOr8(b *testing.B) {
}
}
+func BenchmarkOr(b *testing.B) {
+ var x [128]uint32 // give x its own cache line
+ sink = &x
+ for i := 0; i < b.N; i++ {
+ atomic.Or(&x[63], uint32(i))
+ }
+}
+
func BenchmarkOr8Parallel(b *testing.B) {
var x [512]uint8 // give byte its own cache line
sink = &x
@@ -83,6 +111,18 @@ func BenchmarkOr8Parallel(b *testing.B) {
})
}
+func BenchmarkOrParallel(b *testing.B) {
+ var x [128]uint32 // give x its own cache line
+ sink = &x
+ b.RunParallel(func(pb *testing.PB) {
+ i := uint32(0)
+ for pb.Next() {
+ atomic.Or(&x[63], i)
+ i++
+ }
+ })
+}
+
func BenchmarkXadd(b *testing.B) {
var x uint32
ptr := &x
diff --git a/src/runtime/internal/atomic/unaligned.go b/src/runtime/internal/atomic/unaligned.go
new file mode 100644
index 0000000000..a859de4144
--- /dev/null
+++ b/src/runtime/internal/atomic/unaligned.go
@@ -0,0 +1,9 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package atomic
+
+func panicUnaligned() {
+ panic("unaligned 64-bit atomic operation")
+}
diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go
index 042f10b1d3..3f9b087856 100644
--- a/src/runtime/lockrank.go
+++ b/src/runtime/lockrank.go
@@ -41,12 +41,12 @@ const (
lockRankCpuprof
lockRankSweep
+ lockRankPollDesc
lockRankSched
lockRankDeadlock
lockRankPanic
lockRankAllg
lockRankAllp
- lockRankPollDesc
lockRankTimers // Multiple timers locked simultaneously in destroy()
lockRankItab
@@ -120,12 +120,12 @@ var lockNames = []string{
lockRankCpuprof: "cpuprof",
lockRankSweep: "sweep",
+ lockRankPollDesc: "pollDesc",
lockRankSched: "sched",
lockRankDeadlock: "deadlock",
lockRankPanic: "panic",
lockRankAllg: "allg",
lockRankAllp: "allp",
- lockRankPollDesc: "pollDesc",
lockRankTimers: "timers",
lockRankItab: "itab",
@@ -182,14 +182,14 @@ func (rank lockRank) String() string {
return lockNames[rank]
}
-// lockPartialOrder is a partial order among the various lock types, listing the immediate
-// ordering that has actually been observed in the runtime. Each entry (which
-// corresponds to a particular lock rank) specifies the list of locks that can be
-// already be held immediately "above" it.
+// lockPartialOrder is a partial order among the various lock types, listing the
+// immediate ordering that has actually been observed in the runtime. Each entry
+// (which corresponds to a particular lock rank) specifies the list of locks
+// that can already be held immediately "above" it.
//
-// So, for example, the lockRankSched entry shows that all the locks preceding it in
-// rank can actually be held. The fin lock shows that only the sched, timers, or
-// hchan lock can be held immediately above it when it is acquired.
+// So, for example, the lockRankSched entry shows that all the locks preceding
+// it in rank can actually be held. The allp lock shows that only the sysmon or
+// sched lock can be held immediately above it when it is acquired.
var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankDummy: {},
lockRankSysmon: {},
@@ -199,12 +199,12 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankAssistQueue: {},
lockRankCpuprof: {},
lockRankSweep: {},
- lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep},
+ lockRankPollDesc: {},
+ lockRankSched: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc},
lockRankDeadlock: {lockRankDeadlock},
lockRankPanic: {lockRankDeadlock},
lockRankAllg: {lockRankSysmon, lockRankSched, lockRankPanic},
lockRankAllp: {lockRankSysmon, lockRankSched},
- lockRankPollDesc: {},
lockRankTimers: {lockRankSysmon, lockRankScavenge, lockRankSched, lockRankAllp, lockRankPollDesc, lockRankTimers},
lockRankItab: {},
lockRankReflectOffs: {lockRankItab},
@@ -231,7 +231,7 @@ var lockPartialOrder [][]lockRank = [][]lockRank{
lockRankDefer: {},
lockRankSudog: {lockRankNotifyList, lockRankHchan},
lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankSched, lockRankAllg, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog},
- lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans, lockRankSpanSetSpine},
+ lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankFin, lockRankPollDesc, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans, lockRankSpanSetSpine},
lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankNotifyList, lockRankTraceBuf, lockRankTraceStrings, lockRankHchan},
lockRankGlobalAlloc: {lockRankProf, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial},
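
lockPartialOrder is only consulted when lock ranking is enabled; conceptually, acquiring a lock of some rank while another lock is held is legal only if the held lock's rank is listed for the new rank and the total order is respected. A simplified, illustrative version of that check, ignoring the leaf-rank handling and error reporting in lockrank_on.go:

    // allowedToAcquire is a simplified sketch, not the real checker.
    func allowedToAcquire(heldRank, newRank lockRank) bool {
        if newRank < heldRank {
            // Would violate the total order outright.
            return false
        }
        // Even when the total order holds, the pair must be declared in
        // the observed partial order for the new rank.
        for _, r := range lockPartialOrder[newRank] {
            if r == heldRank {
                return true
            }
        }
        return false
    }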
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 4fa14996c2..d0b8c668c3 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -198,7 +198,7 @@ const (
// mips32 only has access to the low 2GB of virtual memory, so
// we further limit it to 31 bits.
//
- // On darwin/arm64, although 64-bit pointers are presumably
+ // On ios/arm64, although 64-bit pointers are presumably
// available, pointers are truncated to 33 bits. Furthermore,
// only the top 4 GiB of the address space are actually available
// to the application, but we allow the whole 33 bits anyway for
@@ -207,7 +207,7 @@ const (
// arenaBaseOffset to offset into the top 4 GiB.
//
// WebAssembly currently has a limit of 4GB linear memory.
- heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-(sys.GoosDarwin+sys.GoosIos)*sys.GoarchArm64))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 33*(sys.GoosDarwin+sys.GoosIos)*sys.GoarchArm64
+ heapAddrBits = (_64bit*(1-sys.GoarchWasm)*(1-sys.GoosIos*sys.GoarchArm64))*48 + (1-_64bit+sys.GoarchWasm)*(32-(sys.GoarchMips+sys.GoarchMipsle)) + 33*sys.GoosIos*sys.GoarchArm64
// maxAlloc is the maximum size of an allocation. On 64-bit,
// it's theoretically possible to allocate 1<<heapAddrBits bytes. On
@@ -514,14 +514,14 @@ func mallocinit() {
// However, on arm64, we ignore all this advice above and slam the
// allocation at 0x40 << 32 because when using 4k pages with 3-level
// translation buffers, the user address space is limited to 39 bits
- // On darwin/arm64, the address space is even smaller.
+ // On ios/arm64, the address space is even smaller.
//
// On AIX, mmap starts at 0x0A00000000000000 for 64-bit
// processes.
for i := 0x7f; i >= 0; i-- {
var p uintptr
switch {
- case GOARCH == "arm64" && (GOOS == "darwin" || GOOS == "ios"):
+ case GOARCH == "arm64" && GOOS == "ios":
p = uintptr(i)<<40 | uintptrMask&(0x0013<<28)
case GOARCH == "arm64":
p = uintptr(i)<<40 | uintptrMask&(0x0040<<32)
@@ -743,9 +743,9 @@ mapped:
throw("arena already initialized")
}
var r *heapArena
- r = (*heapArena)(h.heapArenaAlloc.alloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gc_sys))
+ r = (*heapArena)(h.heapArenaAlloc.alloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gcMiscSys))
if r == nil {
- r = (*heapArena)(persistentalloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gc_sys))
+ r = (*heapArena)(persistentalloc(unsafe.Sizeof(*r), sys.PtrSize, &memstats.gcMiscSys))
if r == nil {
throw("out of memory allocating heap arena metadata")
}
@@ -757,7 +757,7 @@ mapped:
if size == 0 {
size = physPageSize
}
- newArray := (*notInHeap)(persistentalloc(size, sys.PtrSize, &memstats.gc_sys))
+ newArray := (*notInHeap)(persistentalloc(size, sys.PtrSize, &memstats.gcMiscSys))
if newArray == nil {
throw("out of memory allocating allArenas")
}
@@ -909,27 +909,34 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
return unsafe.Pointer(&zerobase)
}
- if debug.sbrk != 0 {
- align := uintptr(16)
- if typ != nil {
- // TODO(austin): This should be just
- // align = uintptr(typ.align)
- // but that's only 4 on 32-bit platforms,
- // even if there's a uint64 field in typ (see #599).
- // This causes 64-bit atomic accesses to panic.
- // Hence, we use stricter alignment that matches
- // the normal allocator better.
- if size&7 == 0 {
- align = 8
- } else if size&3 == 0 {
- align = 4
- } else if size&1 == 0 {
- align = 2
- } else {
- align = 1
+ if debug.malloc {
+ if debug.sbrk != 0 {
+ align := uintptr(16)
+ if typ != nil {
+ // TODO(austin): This should be just
+ // align = uintptr(typ.align)
+ // but that's only 4 on 32-bit platforms,
+ // even if there's a uint64 field in typ (see #599).
+ // This causes 64-bit atomic accesses to panic.
+ // Hence, we use stricter alignment that matches
+ // the normal allocator better.
+ if size&7 == 0 {
+ align = 8
+ } else if size&3 == 0 {
+ align = 4
+ } else if size&1 == 0 {
+ align = 2
+ } else {
+ align = 1
+ }
}
+ return persistentalloc(size, align, &memstats.other_sys)
+ }
+
+ if inittrace.active && inittrace.id == getg().goid {
+ // Init functions are executed sequentially in a single goroutine.
+ inittrace.allocs += 1
}
- return persistentalloc(size, align, &memstats.other_sys)
}
// assistG is the G to charge for this allocation, or nil if
@@ -965,19 +972,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
shouldhelpgc := false
dataSize := size
- var c *mcache
- if mp.p != 0 {
- c = mp.p.ptr().mcache
- } else {
- // We will be called without a P while bootstrapping,
- // in which case we use mcache0, which is set in mallocinit.
- // mcache0 is cleared when bootstrapping is complete,
- // by procresize.
- c = mcache0
- if c == nil {
- throw("malloc called with no P")
- }
- }
+ c := getMCache()
var span *mspan
var x unsafe.Pointer
noscan := typ == nil || typ.ptrdata == 0
@@ -1016,6 +1011,14 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
// Align tiny pointer for required (conservative) alignment.
if size&7 == 0 {
off = alignUp(off, 8)
+ } else if sys.PtrSize == 4 && size == 12 {
+ // Conservatively align 12-byte objects to 8 bytes on 32-bit
+ // systems so that objects whose first field is a 64-bit
+ // value are aligned to 8 bytes and do not cause a fault on
+ // atomic access. See issue 37262.
+ // TODO(mknyszek): Remove this workaround if/when issue 36606
+ // is resolved.
+ off = alignUp(off, 8)
} else if size&3 == 0 {
off = alignUp(off, 4)
} else if size&1 == 0 {
@@ -1025,7 +1028,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
// The object fits into existing tiny block.
x = unsafe.Pointer(c.tiny + off)
c.tinyoffset = off + size
- c.local_tinyallocs++
+ c.tinyAllocs++
mp.mallocing = 0
releasem(mp)
return x
@@ -1067,9 +1070,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
}
} else {
shouldhelpgc = true
- systemstack(func() {
- span = largeAlloc(size, needzero, noscan)
- })
+ span = c.allocLarge(size, needzero, noscan)
span.freeindex = 1
span.allocCount = 1
x = unsafe.Pointer(span.base())
@@ -1098,7 +1099,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
} else {
scanSize = typ.ptrdata
}
- c.local_scan += scanSize
+ c.scanAlloc += scanSize
}
// Ensure that the stores above that initialize x to
@@ -1128,13 +1129,20 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
mp.mallocing = 0
releasem(mp)
- if debug.allocfreetrace != 0 {
- tracealloc(x, size, typ)
+ if debug.malloc {
+ if debug.allocfreetrace != 0 {
+ tracealloc(x, size, typ)
+ }
+
+ if inittrace.active && inittrace.id == getg().goid {
+ // Init functions are executed sequentially in a single goroutine.
+ inittrace.bytes += uint64(size)
+ }
}
if rate := MemProfileRate; rate > 0 {
- if rate != 1 && size < c.next_sample {
- c.next_sample -= size
+ if rate != 1 && size < c.nextSample {
+ c.nextSample -= size
} else {
mp := acquirem()
profilealloc(mp, x, size)
@@ -1157,35 +1165,6 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
return x
}
-func largeAlloc(size uintptr, needzero bool, noscan bool) *mspan {
- // print("largeAlloc size=", size, "\n")
-
- if size+_PageSize < size {
- throw("out of memory")
- }
- npages := size >> _PageShift
- if size&_PageMask != 0 {
- npages++
- }
-
- // Deduct credit for this span allocation and sweep if
- // necessary. mHeap_Alloc will also sweep npages, so this only
- // pays the debt down to npage pages.
- deductSweepCredit(npages*_PageSize, npages)
-
- spc := makeSpanClass(0, noscan)
- s := mheap_.alloc(npages, spc, needzero)
- if s == nil {
- throw("out of memory")
- }
- // Put the large span in the mcentral swept list so that it's
- // visible to the background sweeper.
- mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
- s.limit = s.base() + size
- heapBitsForAddr(s.base()).initSpan(s)
- return s
-}
-
// implementation of new builtin
// compiler (both frontend and SSA backend) knows the signature
// of this function
@@ -1221,16 +1200,7 @@ func reflect_unsafe_NewArray(typ *_type, n int) unsafe.Pointer {
}
func profilealloc(mp *m, x unsafe.Pointer, size uintptr) {
- var c *mcache
- if mp.p != 0 {
- c = mp.p.ptr().mcache
- } else {
- c = mcache0
- if c == nil {
- throw("profilealloc called with no P")
- }
- }
- c.next_sample = nextSample()
+ getMCache().nextSample = nextSample()
mProf_Malloc(x, size)
}
@@ -1322,7 +1292,7 @@ var persistentChunks *notInHeap
// The returned memory will be zeroed.
//
// Consider marking persistentalloc'd types go:notinheap.
-func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
+func persistentalloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer {
var p *notInHeap
systemstack(func() {
p = persistentalloc1(size, align, sysStat)
@@ -1333,7 +1303,7 @@ func persistentalloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
// Must run on system stack because stack growth can (re)invoke it.
// See issue 9174.
//go:systemstack
-func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap {
+func persistentalloc1(size, align uintptr, sysStat *sysMemStat) *notInHeap {
const (
maxBlock = 64 << 10 // VM reservation granularity is 64K on windows
)
@@ -1392,8 +1362,8 @@ func persistentalloc1(size, align uintptr, sysStat *uint64) *notInHeap {
}
if sysStat != &memstats.other_sys {
- mSysStatInc(sysStat, size)
- mSysStatDec(&memstats.other_sys, size)
+ sysStat.add(int64(size))
+ memstats.other_sys.add(-int64(size))
}
return p
}
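
The persistentalloc changes above swap mSysStatInc/mSysStatDec for a method on the new sysMemStat type, whose definition lives in mstats.go and is not shown in this diff. A rough sketch of the intended shape, assuming an atomically updated counter with an overflow check (names and details hypothetical where not visible here):

    // sysMemStat sketch: a byte count updated atomically; add takes a
    // signed delta so callers can both credit and debit the stat.
    type sysMemStat uint64

    func (s *sysMemStat) load() uint64 {
        return atomic.Load64((*uint64)(s)) // runtime/internal/atomic
    }

    func (s *sysMemStat) add(n int64) {
        val := atomic.Xadd64((*uint64)(s), n)
        if (n > 0 && int64(val) < n) || (n < 0 && int64(val)+n < n) {
            print("runtime: val=", val, " n=", n, "\n")
            throw("sysMemStat underflow or overflow")
        }
    }
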
@@ -1434,7 +1404,7 @@ func (l *linearAlloc) init(base, size uintptr) {
l.end = base + size
}
-func (l *linearAlloc) alloc(size, align uintptr, sysStat *uint64) unsafe.Pointer {
+func (l *linearAlloc) alloc(size, align uintptr, sysStat *sysMemStat) unsafe.Pointer {
p := alignUp(l.next, align)
if p+size > l.end {
return nil
diff --git a/src/runtime/malloc_test.go b/src/runtime/malloc_test.go
index 5c97f548fd..4ba94d0494 100644
--- a/src/runtime/malloc_test.go
+++ b/src/runtime/malloc_test.go
@@ -12,8 +12,10 @@ import (
"os"
"os/exec"
"reflect"
+ "runtime"
. "runtime"
"strings"
+ "sync/atomic"
"testing"
"time"
"unsafe"
@@ -168,6 +170,61 @@ func TestTinyAlloc(t *testing.T) {
}
}
+var (
+ tinyByteSink *byte
+ tinyUint32Sink *uint32
+ tinyObj12Sink *obj12
+)
+
+type obj12 struct {
+ a uint64
+ b uint32
+}
+
+func TestTinyAllocIssue37262(t *testing.T) {
+ // Try to cause an alignment access fault
+ // by atomically accessing the first 64-bit
+ // value of a tiny-allocated object.
+ // See issue 37262 for details.
+
+ // GC twice, once to reach a stable heap state
+ // and again to make sure we finish the sweep phase.
+ runtime.GC()
+ runtime.GC()
+
+ // Make 1-byte allocations until we get a fresh tiny slot.
+ aligned := false
+ for i := 0; i < 16; i++ {
+ tinyByteSink = new(byte)
+ if uintptr(unsafe.Pointer(tinyByteSink))&0xf == 0xf {
+ aligned = true
+ break
+ }
+ }
+ if !aligned {
+ t.Fatal("unable to get a fresh tiny slot")
+ }
+
+ // Create a 4-byte object so that the current
+ // tiny slot is partially filled.
+ tinyUint32Sink = new(uint32)
+
+ // Create a 12-byte object, which fits into the
+ // tiny slot. If it actually gets place there,
+ // then the field "a" will be improperly aligned
+ // for atomic access on 32-bit architectures.
+ // This won't be true if issue 36606 gets resolved.
+ tinyObj12Sink = new(obj12)
+
+ // Try to atomically access tinyObj12Sink.a.
+ atomic.StoreUint64(&tinyObj12Sink.a, 10)
+
+ // Clear the sinks.
+ tinyByteSink = nil
+ tinyUint32Sink = nil
+ tinyObj12Sink = nil
+}
+
func TestPageCacheLeak(t *testing.T) {
defer GOMAXPROCS(GOMAXPROCS(1))
leaked := PageCachePagesLeaked()
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 51c3625c3d..fbfaae0f93 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -1868,12 +1868,12 @@ func materializeGCProg(ptrdata uintptr, prog *byte) *mspan {
bitmapBytes := divRoundUp(ptrdata, 8*sys.PtrSize)
// Compute the number of pages needed for bitmapBytes.
pages := divRoundUp(bitmapBytes, pageSize)
- s := mheap_.allocManual(pages, &memstats.gc_sys)
+ s := mheap_.allocManual(pages, spanAllocPtrScalarBits)
runGCProg(addb(prog, 4), nil, (*byte)(unsafe.Pointer(s.startAddr)), 1)
return s
}
func dematerializeGCProg(s *mspan) {
- mheap_.freeManual(s, &memstats.gc_sys)
+ mheap_.freeManual(s, spanAllocPtrScalarBits)
}
func dumpGCProg(p *byte) {
diff --git a/src/runtime/mcache.go b/src/runtime/mcache.go
index 7a7d33ccae..c9342a41c9 100644
--- a/src/runtime/mcache.go
+++ b/src/runtime/mcache.go
@@ -10,6 +10,7 @@ import (
)
// Per-thread (in Go, per-P) cache for small objects.
+// This includes a small object cache and local allocation stats.
// No locking needed because it is per-thread (per-P).
//
// mcaches are allocated from non-GC'd memory, so any heap pointers
@@ -19,8 +20,8 @@ import (
type mcache struct {
// The following members are accessed on every malloc,
// so they are grouped here for better caching.
- next_sample uintptr // trigger heap sample after allocating this many bytes
- local_scan uintptr // bytes of scannable heap allocated
+ nextSample uintptr // trigger heap sample after allocating this many bytes
+ scanAlloc uintptr // bytes of scannable heap allocated
// Allocator cache for tiny objects w/o pointers.
// See "Tiny allocator" comment in malloc.go.
@@ -31,9 +32,12 @@ type mcache struct {
// tiny is a heap pointer. Since mcache is in non-GC'd memory,
// we handle it by clearing it in releaseAll during mark
// termination.
- tiny uintptr
- tinyoffset uintptr
- local_tinyallocs uintptr // number of tiny allocs not counted in other stats
+ //
+ // tinyAllocs is the number of tiny allocations performed
+ // by the P that owns this mcache.
+ tiny uintptr
+ tinyoffset uintptr
+ tinyAllocs uintptr
// The rest is not accessed on every malloc.
@@ -41,16 +45,15 @@ type mcache struct {
stackcache [_NumStackOrders]stackfreelist
- // Local allocator stats, flushed during GC.
- local_largefree uintptr // bytes freed for large objects (>maxsmallsize)
- local_nlargefree uintptr // number of frees for large objects (>maxsmallsize)
- local_nsmallfree [_NumSizeClasses]uintptr // number of frees for small objects (<=maxsmallsize)
-
// flushGen indicates the sweepgen during which this mcache
// was last flushed. If flushGen != mheap_.sweepgen, the spans
// in this mcache are stale and need to be flushed so they
// can be swept. This is done in acquirep.
flushGen uint32
+
+ // statsSeq is a counter indicating whether this P is currently
+ // writing any stats. Its value is even when not, odd when it is.
+ statsSeq uint32
}
// A gclink is a node in a linked list of blocks, like mlink,
@@ -93,10 +96,16 @@ func allocmcache() *mcache {
for i := range c.alloc {
c.alloc[i] = &emptymspan
}
- c.next_sample = nextSample()
+ c.nextSample = nextSample()
return c
}
+// freemcache releases resources associated with this
+// mcache and puts the object onto a free list.
+//
+// In some cases there is no way to simply release
+// resources, such as statistics, so donate them to
+// a different mcache (the recipient).
func freemcache(c *mcache) {
systemstack(func() {
c.releaseAll()
@@ -108,12 +117,34 @@ func freemcache(c *mcache) {
// gcworkbuffree(c.gcworkbuf)
lock(&mheap_.lock)
- purgecachedstats(c)
mheap_.cachealloc.free(unsafe.Pointer(c))
unlock(&mheap_.lock)
})
}
+// getMCache is a convenience function which tries to obtain an mcache.
+//
+// Must be running with a P when called (so the caller must be in a
+// non-preemptible state) or must be called during bootstrapping.
+func getMCache() *mcache {
+ // Grab the mcache, since that's where stats live.
+ pp := getg().m.p.ptr()
+ var c *mcache
+ if pp == nil {
+ // We will be called without a P while bootstrapping,
+ // in which case we use mcache0, which is set in mallocinit.
+ // mcache0 is cleared when bootstrapping is complete,
+ // by procresize.
+ c = mcache0
+ if c == nil {
+ throw("getMCache called with no P or outside bootstrapping")
+ }
+ } else {
+ c = pp.mcache
+ }
+ return c
+}
+
// refill acquires a new span of span class spc for c. This span will
// have at least one free object. The current span in c must be full.
//
@@ -148,13 +179,107 @@ func (c *mcache) refill(spc spanClass) {
// sweeping in the next sweep phase.
s.sweepgen = mheap_.sweepgen + 3
+ // Assume all objects from this span will be allocated in the
+ // mcache. If it gets uncached, we'll adjust this.
+ stats := memstats.heapStats.acquire(c)
+ atomic.Xadduintptr(&stats.smallAllocCount[spc.sizeclass()], uintptr(s.nelems)-uintptr(s.allocCount))
+ memstats.heapStats.release(c)
+
+ // Update heap_live with the same assumption.
+ usedBytes := uintptr(s.allocCount) * s.elemsize
+ atomic.Xadd64(&memstats.heap_live, int64(s.npages*pageSize)-int64(usedBytes))
+
+ // Flush tinyAllocs.
+ if spc == tinySpanClass {
+ atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs))
+ c.tinyAllocs = 0
+ }
+
+ // While we're here, flush scanAlloc, since we have to call
+ // revise anyway.
+ atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc))
+ c.scanAlloc = 0
+
+ if trace.enabled {
+ // heap_live changed.
+ traceHeapAlloc()
+ }
+ if gcBlackenEnabled != 0 {
+ // heap_live and heap_scan changed.
+ gcController.revise()
+ }
+
c.alloc[spc] = s
}
+// allocLarge allocates a span for a large object.
+func (c *mcache) allocLarge(size uintptr, needzero bool, noscan bool) *mspan {
+ if size+_PageSize < size {
+ throw("out of memory")
+ }
+ npages := size >> _PageShift
+ if size&_PageMask != 0 {
+ npages++
+ }
+
+ // Deduct credit for this span allocation and sweep if
+ // necessary. mHeap_Alloc will also sweep npages, so this only
+ // pays the debt down to npage pages.
+ deductSweepCredit(npages*_PageSize, npages)
+
+ spc := makeSpanClass(0, noscan)
+ s := mheap_.alloc(npages, spc, needzero)
+ if s == nil {
+ throw("out of memory")
+ }
+ stats := memstats.heapStats.acquire(c)
+ atomic.Xadduintptr(&stats.largeAlloc, npages*pageSize)
+ atomic.Xadduintptr(&stats.largeAllocCount, 1)
+ memstats.heapStats.release(c)
+
+ // Update heap_live and revise pacing if needed.
+ atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
+ if trace.enabled {
+ // Trace that a heap alloc occurred because heap_live changed.
+ traceHeapAlloc()
+ }
+ if gcBlackenEnabled != 0 {
+ gcController.revise()
+ }
+
+ // Put the large span in the mcentral swept list so that it's
+ // visible to the background sweeper.
+ mheap_.central[spc].mcentral.fullSwept(mheap_.sweepgen).push(s)
+ s.limit = s.base() + size
+ heapBitsForAddr(s.base()).initSpan(s)
+ return s
+}
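
The npages computation at the top of allocLarge is a shift-and-mask form of rounding size up to whole pages. A small illustration with a hypothetical helper name:

    // pagesNeeded mirrors the computation in allocLarge: divide by the
    // page size, rounding up. With an 8 KiB page, pagesNeeded(8192) == 1
    // and pagesNeeded(8193) == 2.
    func pagesNeeded(size uintptr) uintptr {
        npages := size >> _PageShift
        if size&_PageMask != 0 {
            npages++
        }
        return npages
    }
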
+
func (c *mcache) releaseAll() {
+ // Take this opportunity to flush scanAlloc.
+ atomic.Xadd64(&memstats.heap_scan, int64(c.scanAlloc))
+ c.scanAlloc = 0
+
+ sg := mheap_.sweepgen
for i := range c.alloc {
s := c.alloc[i]
if s != &emptymspan {
+ // Adjust smallAllocCount in case the span wasn't fully allocated.
+ n := uintptr(s.nelems) - uintptr(s.allocCount)
+ stats := memstats.heapStats.acquire(c)
+ atomic.Xadduintptr(&stats.smallAllocCount[spanClass(i).sizeclass()], -n)
+ memstats.heapStats.release(c)
+ if s.sweepgen != sg+1 {
+ // refill conservatively counted unallocated slots in heap_live.
+ // Undo this.
+ //
+ // If this span was cached before sweep, then
+ // heap_live was totally recomputed since
+ // caching this span, so we don't do this for
+ // stale spans.
+ atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
+ }
+ // Release the span to the mcentral.
mheap_.central[i].mcentral.uncacheSpan(s)
c.alloc[i] = &emptymspan
}
@@ -162,6 +287,13 @@ func (c *mcache) releaseAll() {
// Clear tinyalloc pool.
c.tiny = 0
c.tinyoffset = 0
+ atomic.Xadd64(&memstats.tinyallocs, int64(c.tinyAllocs))
+ c.tinyAllocs = 0
+
+ // Updated heap_scan and possibly heap_live.
+ if gcBlackenEnabled != 0 {
+ gcController.revise()
+ }
}
// prepareForSweep flushes c if the system has entered a new sweep phase
diff --git a/src/runtime/mcentral.go b/src/runtime/mcentral.go
index ed49e01677..97fe92c2ab 100644
--- a/src/runtime/mcentral.go
+++ b/src/runtime/mcentral.go
@@ -44,11 +44,6 @@ type mcentral struct {
// encounter swept spans, and these should be ignored.
partial [2]spanSet // list of spans with a free object
full [2]spanSet // list of spans with no free objects
-
- // nmalloc is the cumulative count of objects allocated from
- // this mcentral, assuming all spans in mcaches are
- // fully-allocated. Written atomically, read under STW.
- nmalloc uint64
}
// Initialize a single central free list.
@@ -178,19 +173,6 @@ havespan:
if n == 0 || s.freeindex == s.nelems || uintptr(s.allocCount) == s.nelems {
throw("span has no free objects")
}
- // Assume all objects from this span will be allocated in the
- // mcache. If it gets uncached, we'll adjust this.
- atomic.Xadd64(&c.nmalloc, int64(n))
- usedBytes := uintptr(s.allocCount) * s.elemsize
- atomic.Xadd64(&memstats.heap_live, int64(spanBytes)-int64(usedBytes))
- if trace.enabled {
- // heap_live changed.
- traceHeapAlloc()
- }
- if gcBlackenEnabled != 0 {
- // heap_live changed.
- gcController.revise()
- }
freeByteBase := s.freeindex &^ (64 - 1)
whichByte := freeByteBase / 8
// Init alloc bits cache.
@@ -228,27 +210,6 @@ func (c *mcentral) uncacheSpan(s *mspan) {
// Indicate that s is no longer cached.
atomic.Store(&s.sweepgen, sg)
}
- n := int(s.nelems) - int(s.allocCount)
-
- // Fix up statistics.
- if n > 0 {
- // cacheSpan updated alloc assuming all objects on s
- // were going to be allocated. Adjust for any that
- // weren't. We must do this before potentially
- // sweeping the span.
- atomic.Xadd64(&c.nmalloc, -int64(n))
-
- if !stale {
- // (*mcentral).cacheSpan conservatively counted
- // unallocated slots in heap_live. Undo this.
- //
- // If this span was cached before sweep, then
- // heap_live was totally recomputed since
- // caching this span, so we don't do this for
- // stale spans.
- atomic.Xadd64(&memstats.heap_live, -int64(n)*int64(s.elemsize))
- }
- }
// Put the span in the appropriate place.
if stale {
@@ -256,7 +217,7 @@ func (c *mcentral) uncacheSpan(s *mspan) {
// the right list.
s.sweep(false)
} else {
- if n > 0 {
+ if int(s.nelems)-int(s.allocCount) > 0 {
// Put it back on the partial swept list.
c.partialSwept(sg).push(s)
} else {
diff --git a/src/runtime/mcheckmark.go b/src/runtime/mcheckmark.go
index 1fd8e4e78f..c0b028d715 100644
--- a/src/runtime/mcheckmark.go
+++ b/src/runtime/mcheckmark.go
@@ -41,7 +41,7 @@ func startCheckmarks() {
if bitmap == nil {
// Allocate bitmap on first use.
- bitmap = (*checkmarksMap)(persistentalloc(unsafe.Sizeof(*bitmap), 0, &memstats.gc_sys))
+ bitmap = (*checkmarksMap)(persistentalloc(unsafe.Sizeof(*bitmap), 0, &memstats.gcMiscSys))
if bitmap == nil {
throw("out of memory allocating checkmarks bitmap")
}
diff --git a/src/runtime/mem_aix.go b/src/runtime/mem_aix.go
index 7e145b072a..957aa4dcc2 100644
--- a/src/runtime/mem_aix.go
+++ b/src/runtime/mem_aix.go
@@ -11,7 +11,7 @@ import (
// Don't split the stack as this method may be invoked without a valid G, which
// prevents us from allocating more stack.
//go:nosplit
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
if err != 0 {
if err == _EACCES {
@@ -24,7 +24,7 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
}
return nil
}
- mSysStatInc(sysStat, n)
+ sysStat.add(int64(n))
return p
}
@@ -41,8 +41,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) {
// Don't split the stack as this function may be invoked without a valid G,
// which prevents us from allocating more stack.
//go:nosplit
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(-int64(n))
munmap(v, n)
}
@@ -59,8 +59,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer {
return p
}
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatInc(sysStat, n)
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(int64(n))
// AIX does not allow mapping a range that is already mapped.
// So, call mprotect to change permissions.
diff --git a/src/runtime/mem_bsd.go b/src/runtime/mem_bsd.go
index 4d860e7bd3..bc672019fb 100644
--- a/src/runtime/mem_bsd.go
+++ b/src/runtime/mem_bsd.go
@@ -13,12 +13,12 @@ import (
// Don't split the stack as this function may be invoked without a valid G,
// which prevents us from allocating more stack.
//go:nosplit
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
v, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
if err != 0 {
return nil
}
- mSysStatInc(sysStat, n)
+ sysStat.add(int64(n))
return v
}
@@ -35,8 +35,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) {
// Don't split the stack as this function may be invoked without a valid G,
// which prevents us from allocating more stack.
//go:nosplit
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(-int64(n))
munmap(v, n)
}
@@ -65,8 +65,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer {
const _sunosEAGAIN = 11
const _ENOMEM = 12
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatInc(sysStat, n)
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(int64(n))
p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0)
if err == _ENOMEM || ((GOOS == "solaris" || GOOS == "illumos") && err == _sunosEAGAIN) {
diff --git a/src/runtime/mem_darwin.go b/src/runtime/mem_darwin.go
index 3b5d565b0f..7fccd2bb8e 100644
--- a/src/runtime/mem_darwin.go
+++ b/src/runtime/mem_darwin.go
@@ -11,12 +11,12 @@ import (
// Don't split the stack as this function may be invoked without a valid G,
// which prevents us from allocating more stack.
//go:nosplit
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
v, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
if err != 0 {
return nil
}
- mSysStatInc(sysStat, n)
+ sysStat.add(int64(n))
return v
}
@@ -39,8 +39,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) {
// Don't split the stack as this function may be invoked without a valid G,
// which prevents us from allocating more stack.
//go:nosplit
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(-int64(n))
munmap(v, n)
}
@@ -58,8 +58,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer {
const _ENOMEM = 12
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatInc(sysStat, n)
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(int64(n))
p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0)
if err == _ENOMEM {
diff --git a/src/runtime/mem_js.go b/src/runtime/mem_js.go
index 092b3d4fa2..957ed36ffa 100644
--- a/src/runtime/mem_js.go
+++ b/src/runtime/mem_js.go
@@ -13,7 +13,7 @@ import (
// Don't split the stack as this function may be invoked without a valid G,
// which prevents us from allocating more stack.
//go:nosplit
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
p := sysReserve(nil, n)
sysMap(p, n, sysStat)
return p
@@ -31,8 +31,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) {
// Don't split the stack as this function may be invoked without a valid G,
// which prevents us from allocating more stack.
//go:nosplit
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(-int64(n))
}
func sysFault(v unsafe.Pointer, n uintptr) {
@@ -80,6 +80,6 @@ func growMemory(pages int32) int32
// This allows the front-end to replace the old DataView object with a new one.
func resetMemoryDataView()
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatInc(sysStat, n)
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(int64(n))
}
diff --git a/src/runtime/mem_linux.go b/src/runtime/mem_linux.go
index 59b0bca970..3436851091 100644
--- a/src/runtime/mem_linux.go
+++ b/src/runtime/mem_linux.go
@@ -17,7 +17,7 @@ const (
// Don't split the stack as this method may be invoked without a valid G, which
// prevents us from allocating more stack.
//go:nosplit
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
p, err := mmap(nil, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_PRIVATE, -1, 0)
if err != 0 {
if err == _EACCES {
@@ -30,7 +30,7 @@ func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
}
return nil
}
- mSysStatInc(sysStat, n)
+ sysStat.add(int64(n))
return p
}
@@ -144,8 +144,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) {
// Don't split the stack as this function may be invoked without a valid G,
// which prevents us from allocating more stack.
//go:nosplit
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(-int64(n))
munmap(v, n)
}
@@ -161,8 +161,8 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer {
return p
}
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatInc(sysStat, n)
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(int64(n))
p, err := mmap(v, n, _PROT_READ|_PROT_WRITE, _MAP_ANON|_MAP_FIXED|_MAP_PRIVATE, -1, 0)
if err == _ENOMEM {
diff --git a/src/runtime/mem_plan9.go b/src/runtime/mem_plan9.go
index 4fea851cdd..53d8e6dffa 100644
--- a/src/runtime/mem_plan9.go
+++ b/src/runtime/mem_plan9.go
@@ -140,19 +140,19 @@ func sbrk(n uintptr) unsafe.Pointer {
return unsafe.Pointer(bl)
}
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
lock(&memlock)
p := memAlloc(n)
memCheck()
unlock(&memlock)
if p != nil {
- mSysStatInc(sysStat, n)
+ sysStat.add(int64(n))
}
return p
}
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(-int64(n))
lock(&memlock)
if uintptr(v)+n == bloc {
// Address range being freed is at the end of memory,
@@ -176,10 +176,10 @@ func sysUsed(v unsafe.Pointer, n uintptr) {
func sysHugePage(v unsafe.Pointer, n uintptr) {
}
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
// sysReserve has already allocated all heap memory,
// but has not adjusted stats.
- mSysStatInc(sysStat, n)
+ sysStat.add(int64(n))
}
func sysFault(v unsafe.Pointer, n uintptr) {
diff --git a/src/runtime/mem_windows.go b/src/runtime/mem_windows.go
index 165062ec27..3a805b9767 100644
--- a/src/runtime/mem_windows.go
+++ b/src/runtime/mem_windows.go
@@ -24,8 +24,8 @@ const (
// Don't split the stack as this function may be invoked without a valid G,
// which prevents us from allocating more stack.
//go:nosplit
-func sysAlloc(n uintptr, sysStat *uint64) unsafe.Pointer {
- mSysStatInc(sysStat, n)
+func sysAlloc(n uintptr, sysStat *sysMemStat) unsafe.Pointer {
+ sysStat.add(int64(n))
return unsafe.Pointer(stdcall4(_VirtualAlloc, 0, n, _MEM_COMMIT|_MEM_RESERVE, _PAGE_READWRITE))
}
@@ -97,8 +97,8 @@ func sysHugePage(v unsafe.Pointer, n uintptr) {
// Don't split the stack as this function may be invoked without a valid G,
// which prevents us from allocating more stack.
//go:nosplit
-func sysFree(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatDec(sysStat, n)
+func sysFree(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(-int64(n))
r := stdcall3(_VirtualFree, uintptr(v), 0, _MEM_RELEASE)
if r == 0 {
print("runtime: VirtualFree of ", n, " bytes failed with errno=", getlasterror(), "\n")
@@ -124,6 +124,6 @@ func sysReserve(v unsafe.Pointer, n uintptr) unsafe.Pointer {
return unsafe.Pointer(stdcall4(_VirtualAlloc, 0, n, _MEM_RESERVE, _PAGE_READWRITE))
}
-func sysMap(v unsafe.Pointer, n uintptr, sysStat *uint64) {
- mSysStatInc(sysStat, n)
+func sysMap(v unsafe.Pointer, n uintptr, sysStat *sysMemStat) {
+ sysStat.add(int64(n))
}
diff --git a/src/runtime/memmove_test.go b/src/runtime/memmove_test.go
index 396c1304c5..b549433f71 100644
--- a/src/runtime/memmove_test.go
+++ b/src/runtime/memmove_test.go
@@ -538,21 +538,30 @@ func BenchmarkCopyFat1024(b *testing.B) {
}
}
+// BenchmarkIssue18740 ensures that memmove uses 4 and 8 byte load/store to move 4 and 8 bytes.
+// It used to do two 2-byte load/stores, which led to a pipeline stall
+// when we try to read the result with one 4-byte load.
func BenchmarkIssue18740(b *testing.B) {
- // This tests that memmove uses one 4-byte load/store to move 4 bytes.
- // It used to do 2 2-byte load/stores, which leads to a pipeline stall
- // when we try to read the result with one 4-byte load.
- var buf [4]byte
- for j := 0; j < b.N; j++ {
- s := uint32(0)
- for i := 0; i < 4096; i += 4 {
- copy(buf[:], g[i:])
- s += binary.LittleEndian.Uint32(buf[:])
- }
- sink = uint64(s)
+ benchmarks := []struct {
+ name string
+ nbyte int
+ f func([]byte) uint64
+ }{
+ {"2byte", 2, func(buf []byte) uint64 { return uint64(binary.LittleEndian.Uint16(buf)) }},
+ {"4byte", 4, func(buf []byte) uint64 { return uint64(binary.LittleEndian.Uint32(buf)) }},
+ {"8byte", 8, func(buf []byte) uint64 { return binary.LittleEndian.Uint64(buf) }},
+ }
+
+ var g [4096]byte
+ for _, bm := range benchmarks {
+ buf := make([]byte, bm.nbyte)
+ b.Run(bm.name, func(b *testing.B) {
+ for j := 0; j < b.N; j++ {
+ for i := 0; i < 4096; i += bm.nbyte {
+ copy(buf[:], g[i:])
+ sink += bm.f(buf[:])
+ }
+ }
+ })
}
}
-
-// TODO: 2 byte and 8 byte benchmarks also.
-
-var g [4096]byte
diff --git a/src/runtime/metrics.go b/src/runtime/metrics.go
new file mode 100644
index 0000000000..d3c0341aee
--- /dev/null
+++ b/src/runtime/metrics.go
@@ -0,0 +1,485 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+// Metrics implementation exported to runtime/metrics.
+
+import (
+ "runtime/internal/atomic"
+ "unsafe"
+)
+
+var (
+ // metrics is a map of runtime/metrics keys to
+ // data used by the runtime to sample each metric's
+ // value.
+ metricsSema uint32 = 1
+ metricsInit bool
+ metrics map[string]metricData
+
+ sizeClassBuckets []float64
+ timeHistBuckets []float64
+)
+
+type metricData struct {
+ // deps is the set of runtime statistics that this metric
+ // depends on. Before compute is called, the statAggregate
+ // which will be passed must ensure() these dependencies.
+ deps statDepSet
+
+ // compute is a function that populates a metricValue
+ // given a populated statAggregate structure.
+ compute func(in *statAggregate, out *metricValue)
+}
+
+// initMetrics initializes the metrics map if it hasn't been yet.
+//
+// metricsSema must be held.
+func initMetrics() {
+ if metricsInit {
+ return
+ }
+ sizeClassBuckets = make([]float64, _NumSizeClasses)
+ for i := range sizeClassBuckets {
+ sizeClassBuckets[i] = float64(class_to_size[i])
+ }
+ timeHistBuckets = timeHistogramMetricsBuckets()
+ metrics = map[string]metricData{
+ "/gc/cycles/automatic:gc-cycles": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.gcCyclesDone - in.sysStats.gcCyclesForced
+ },
+ },
+ "/gc/cycles/forced:gc-cycles": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.gcCyclesForced
+ },
+ },
+ "/gc/cycles/total:gc-cycles": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.gcCyclesDone
+ },
+ },
+ "/gc/heap/allocs-by-size:objects": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ hist := out.float64HistOrInit(sizeClassBuckets)
+ hist.counts[len(hist.counts)-1] = uint64(in.heapStats.largeAllocCount)
+ for i := range hist.buckets {
+ hist.counts[i] = uint64(in.heapStats.smallAllocCount[i])
+ }
+ },
+ },
+ "/gc/heap/frees-by-size:objects": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ hist := out.float64HistOrInit(sizeClassBuckets)
+ hist.counts[len(hist.counts)-1] = uint64(in.heapStats.largeFreeCount)
+ for i := range hist.buckets {
+ hist.counts[i] = uint64(in.heapStats.smallFreeCount[i])
+ }
+ },
+ },
+ "/gc/heap/goal:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.heapGoal
+ },
+ },
+ "/gc/heap/objects:objects": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.heapStats.numObjects
+ },
+ },
+ "/gc/pauses:seconds": {
+ compute: func(_ *statAggregate, out *metricValue) {
+ hist := out.float64HistOrInit(timeHistBuckets)
+ hist.counts[len(hist.counts)-1] = atomic.Load64(&memstats.gcPauseDist.overflow)
+ for i := range hist.buckets {
+ hist.counts[i] = atomic.Load64(&memstats.gcPauseDist.counts[i])
+ }
+ },
+ },
+ "/memory/classes/heap/free:bytes": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(in.heapStats.committed - in.heapStats.inHeap -
+ in.heapStats.inStacks - in.heapStats.inWorkBufs -
+ in.heapStats.inPtrScalarBits)
+ },
+ },
+ "/memory/classes/heap/objects:bytes": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.heapStats.inObjects
+ },
+ },
+ "/memory/classes/heap/released:bytes": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(in.heapStats.released)
+ },
+ },
+ "/memory/classes/heap/stacks:bytes": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(in.heapStats.inStacks)
+ },
+ },
+ "/memory/classes/heap/unused:bytes": {
+ deps: makeStatDepSet(heapStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(in.heapStats.inHeap) - in.heapStats.inObjects
+ },
+ },
+ "/memory/classes/metadata/mcache/free:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.mCacheSys - in.sysStats.mCacheInUse
+ },
+ },
+ "/memory/classes/metadata/mcache/inuse:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.mCacheInUse
+ },
+ },
+ "/memory/classes/metadata/mspan/free:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.mSpanSys - in.sysStats.mSpanInUse
+ },
+ },
+ "/memory/classes/metadata/mspan/inuse:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.mSpanInUse
+ },
+ },
+ "/memory/classes/metadata/other:bytes": {
+ deps: makeStatDepSet(heapStatsDep, sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(in.heapStats.inWorkBufs+in.heapStats.inPtrScalarBits) + in.sysStats.gcMiscSys
+ },
+ },
+ "/memory/classes/os-stacks:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.stacksSys
+ },
+ },
+ "/memory/classes/other:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.otherSys
+ },
+ },
+ "/memory/classes/profiling/buckets:bytes": {
+ deps: makeStatDepSet(sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = in.sysStats.buckHashSys
+ },
+ },
+ "/memory/classes/total:bytes": {
+ deps: makeStatDepSet(heapStatsDep, sysStatsDep),
+ compute: func(in *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(in.heapStats.committed+in.heapStats.released) +
+ in.sysStats.stacksSys + in.sysStats.mSpanSys +
+ in.sysStats.mCacheSys + in.sysStats.buckHashSys +
+ in.sysStats.gcMiscSys + in.sysStats.otherSys
+ },
+ },
+ "/sched/goroutines:goroutines": {
+ compute: func(_ *statAggregate, out *metricValue) {
+ out.kind = metricKindUint64
+ out.scalar = uint64(gcount())
+ },
+ },
+ }
+ metricsInit = true
+}
+
+// statDep is a dependency on a group of statistics
+// that a metric might have.
+type statDep uint
+
+const (
+ heapStatsDep statDep = iota // corresponds to heapStatsAggregate
+ sysStatsDep // corresponds to sysStatsAggregate
+ numStatsDeps
+)
+
+// statDepSet represents a set of statDeps.
+//
+// Under the hood, it's a bitmap.
+type statDepSet [1]uint64
+
+// makeStatDepSet creates a new statDepSet from a list of statDeps.
+func makeStatDepSet(deps ...statDep) statDepSet {
+ var s statDepSet
+ for _, d := range deps {
+ s[d/64] |= 1 << (d % 64)
+ }
+ return s
+}
+
+// difference returns the set difference of s from b as a new set.
+func (s statDepSet) difference(b statDepSet) statDepSet {
+ var c statDepSet
+ for i := range s {
+ c[i] = s[i] &^ b[i]
+ }
+ return c
+}
+
+// union returns the union of the two sets as a new set.
+func (s statDepSet) union(b statDepSet) statDepSet {
+ var c statDepSet
+ for i := range s {
+ c[i] = s[i] | b[i]
+ }
+ return c
+}
+
+// empty returns true if there are no dependencies in the set.
+func (s *statDepSet) empty() bool {
+ for _, c := range s {
+ if c != 0 {
+ return false
+ }
+ }
+ return true
+}
+
+// has returns true if the set contains a given statDep.
+func (s *statDepSet) has(d statDep) bool {
+ return s[d/64]&(1<<(d%64)) != 0
+}
+
+// heapStatsAggregate represents memory stats obtained from the
+// runtime. This set of stats is grouped together because they
+// depend on each other in some way to make sense of the runtime's
+// current heap memory use. They're also sharded across Ps, so it
+// makes sense to grab them all at once.
+type heapStatsAggregate struct {
+ heapStatsDelta
+
+ // Derived from values in heapStatsDelta.
+
+ // inObjects is the bytes of memory occupied by objects.
+ inObjects uint64
+
+ // numObjects is the number of live objects in the heap.
+ numObjects uint64
+}
+
+// compute populates the heapStatsAggregate with values from the runtime.
+func (a *heapStatsAggregate) compute() {
+ memstats.heapStats.read(&a.heapStatsDelta)
+
+ // Calculate derived stats.
+ a.inObjects = uint64(a.largeAlloc - a.largeFree)
+ a.numObjects = uint64(a.largeAllocCount - a.largeFreeCount)
+ for i := range a.smallAllocCount {
+ n := uint64(a.smallAllocCount[i] - a.smallFreeCount[i])
+ a.inObjects += n * uint64(class_to_size[i])
+ a.numObjects += n
+ }
+}
+
+// sysStatsAggregate represents system memory stats obtained
+// from the runtime. This set of stats is grouped together because
+// they're all relatively cheap to acquire and generally independent
+// of one another and other runtime memory stats. The fact that they
+// may be acquired at different times, especially with respect to
+// heapStatsAggregate, means there could be some skew, but because of
+// these stats are independent, there's no real consistency issue here.
+type sysStatsAggregate struct {
+ stacksSys uint64
+ mSpanSys uint64
+ mSpanInUse uint64
+ mCacheSys uint64
+ mCacheInUse uint64
+ buckHashSys uint64
+ gcMiscSys uint64
+ otherSys uint64
+ heapGoal uint64
+ gcCyclesDone uint64
+ gcCyclesForced uint64
+}
+
+// compute populates the sysStatsAggregate with values from the runtime.
+func (a *sysStatsAggregate) compute() {
+ a.stacksSys = memstats.stacks_sys.load()
+ a.buckHashSys = memstats.buckhash_sys.load()
+ a.gcMiscSys = memstats.gcMiscSys.load()
+ a.otherSys = memstats.other_sys.load()
+ a.heapGoal = atomic.Load64(&memstats.next_gc)
+ a.gcCyclesDone = uint64(memstats.numgc)
+ a.gcCyclesForced = uint64(memstats.numforcedgc)
+
+ systemstack(func() {
+ lock(&mheap_.lock)
+ a.mSpanSys = memstats.mspan_sys.load()
+ a.mSpanInUse = uint64(mheap_.spanalloc.inuse)
+ a.mCacheSys = memstats.mcache_sys.load()
+ a.mCacheInUse = uint64(mheap_.cachealloc.inuse)
+ unlock(&mheap_.lock)
+ })
+}
+
+// statAggregate is the main driver of the metrics implementation.
+//
+// It contains multiple aggregates of runtime statistics, as well
+// as a set of these aggregates that it has populated. The aggregates
+// are populated lazily by its ensure method.
+type statAggregate struct {
+ ensured statDepSet
+ heapStats heapStatsAggregate
+ sysStats sysStatsAggregate
+}
+
+// ensure populates statistics aggregates determined by deps if they
+// haven't yet been populated.
+func (a *statAggregate) ensure(deps *statDepSet) {
+ missing := deps.difference(a.ensured)
+ if missing.empty() {
+ return
+ }
+ for i := statDep(0); i < numStatsDeps; i++ {
+ if !missing.has(i) {
+ continue
+ }
+ switch i {
+ case heapStatsDep:
+ a.heapStats.compute()
+ case sysStatsDep:
+ a.sysStats.compute()
+ }
+ }
+ a.ensured = a.ensured.union(missing)
+}
+
+// metricKind is a runtime copy of runtime/metrics.ValueKind and
+// must be kept structurally identical to that type.
+type metricKind int
+
+const (
+ // These values must be kept identical to their corresponding Kind* values
+ // in the runtime/metrics package.
+ metricKindBad metricKind = iota
+ metricKindUint64
+ metricKindFloat64
+ metricKindFloat64Histogram
+)
+
+// metricSample is a runtime copy of runtime/metrics.Sample and
+// must be kept structurally identical to that type.
+type metricSample struct {
+ name string
+ value metricValue
+}
+
+// metricValue is a runtime copy of runtime/metrics.Value and
+// must be kept structurally identical to that type.
+type metricValue struct {
+ kind metricKind
+ scalar uint64 // contains scalar values for scalar Kinds.
+ pointer unsafe.Pointer // contains non-scalar values.
+}
+
+// float64HistOrInit tries to pull out an existing float64Histogram
+// from the value, but if none exists, then it allocates one with
+// the given buckets.
+func (v *metricValue) float64HistOrInit(buckets []float64) *metricFloat64Histogram {
+ var hist *metricFloat64Histogram
+ if v.kind == metricKindFloat64Histogram && v.pointer != nil {
+ hist = (*metricFloat64Histogram)(v.pointer)
+ } else {
+ v.kind = metricKindFloat64Histogram
+ hist = new(metricFloat64Histogram)
+ v.pointer = unsafe.Pointer(hist)
+ }
+ hist.buckets = buckets
+ if len(hist.counts) != len(hist.buckets)+1 {
+ hist.counts = make([]uint64, len(buckets)+1)
+ }
+ return hist
+}
+
+// metricFloat64Histogram is a runtime copy of runtime/metrics.Float64Histogram
+// and must be kept structurally identical to that type.
+type metricFloat64Histogram struct {
+ counts []uint64
+ buckets []float64
+}
+
+// agg is used by readMetrics, and is protected by metricsSema.
+//
+// Managed as a global variable because its pointer will be
+// an argument to a dynamically-defined function, and we'd
+// like to avoid it escaping to the heap.
+var agg statAggregate
+
+// readMetrics is the implementation of runtime/metrics.Read.
+//
+//go:linkname readMetrics runtime/metrics.runtime_readMetrics
+func readMetrics(samplesp unsafe.Pointer, len int, cap int) {
+ // Construct a slice from the args.
+ sl := slice{samplesp, len, cap}
+ samples := *(*[]metricSample)(unsafe.Pointer(&sl))
+
+ // Acquire the metricsSema but with handoff. This operation
+ // is expensive enough that queueing up goroutines and handing
+ // off between them will be noticeably better-behaved.
+ semacquire1(&metricsSema, true, 0, 0)
+
+ // Ensure the map is initialized.
+ initMetrics()
+
+ // Clear agg defensively.
+ agg = statAggregate{}
+
+ // Sample.
+ for i := range samples {
+ sample := &samples[i]
+ data, ok := metrics[sample.name]
+ if !ok {
+ sample.value.kind = metricKindBad
+ continue
+ }
+ // Ensure we have all the stats we need.
+ // agg is populated lazily.
+ agg.ensure(&data.deps)
+
+ // Compute the value based on the stats we have.
+ data.compute(&agg, &sample.value)
+ }
+
+ semrelease(&metricsSema)
+}
diff --git a/src/runtime/metrics/description.go b/src/runtime/metrics/description.go
new file mode 100644
index 0000000000..bc2e0882db
--- /dev/null
+++ b/src/runtime/metrics/description.go
@@ -0,0 +1,176 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package metrics
+
+// Description describes a runtime metric.
+type Description struct {
+ // Name is the full name of the metric which includes the unit.
+ //
+ // The format of the metric may be described by the following regular expression.
+ //
+ // ^(?P<name>/[^:]+):(?P<unit>[^:*/]+(?:[*/][^:*/]+)*)$
+ //
+ // The format splits the name into two components, separated by a colon: a path which always
+ // starts with a /, and a machine-parseable unit. The name may contain any valid Unicode
+ // codepoint in between / characters, but by convention will try to stick to lowercase
+ // characters and hyphens. An example of such a path might be "/memory/heap/free".
+ //
+ // The unit is by convention a series of lowercase English unit names (singular or plural)
+ // without prefixes delimited by '*' or '/'. The unit names may contain any valid Unicode
+ // codepoint that is not a delimiter.
+ // Examples of units might be "seconds", "bytes", "bytes/second", "cpu-seconds",
+ // "byte*cpu-seconds", and "bytes/second/second".
+ //
+ // A complete name might look like "/memory/heap/free:bytes".
+ Name string
+
+ // Description is an English language sentence describing the metric.
+ Description string
+
+ // Kind is the kind of value for this metric.
+ //
+ // The purpose of this field is to allow users to filter out metrics whose values are
+ // types which their application may not understand.
+ Kind ValueKind
+
+ // Cumulative is whether or not the metric is cumulative. If a cumulative metric is just
+ // a single number, then it increases monotonically. If the metric is a distribution,
+ // then each bucket count increases monotonically.
+ //
+ // This flag thus indicates whether or not it's useful to compute a rate from this value.
+ Cumulative bool
+
+ // StopTheWorld is whether or not the metric requires a stop-the-world
+ // event in order to collect it.
+ StopTheWorld bool
+}
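
The Kind, Cumulative, and StopTheWorld fields let callers filter the description list before sampling. For example, a monitoring loop that only wants cheap, counter-like metrics might select names roughly as follows (client-side sketch, not part of this package):

    package clientexample // hypothetical consumer package

    import "runtime/metrics"

    // cheapCounters returns the names of cumulative uint64 metrics that
    // do not require a stop-the-world event to collect.
    func cheapCounters() []string {
        var names []string
        for _, d := range metrics.All() {
            if d.Kind == metrics.KindUint64 && d.Cumulative && !d.StopTheWorld {
                names = append(names, d.Name)
            }
        }
        return names
    }
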
+
+// The English language descriptions below must be kept in sync with the
+// descriptions of each metric in doc.go.
+var allDesc = []Description{
+ {
+ Name: "/gc/cycles/automatic:gc-cycles",
+ Description: "Count of completed GC cycles generated by the Go runtime.",
+ Kind: KindUint64,
+ Cumulative: true,
+ },
+ {
+ Name: "/gc/cycles/forced:gc-cycles",
+ Description: "Count of completed forced GC cycles.",
+ Kind: KindUint64,
+ Cumulative: true,
+ },
+ {
+ Name: "/gc/cycles/total:gc-cycles",
+ Description: "Count of all completed GC cycles.",
+ Kind: KindUint64,
+ Cumulative: true,
+ },
+ {
+ Name: "/gc/heap/allocs-by-size:objects",
+ Description: "Distribution of all objects allocated by approximate size.",
+ Kind: KindFloat64Histogram,
+ },
+ {
+ Name: "/gc/heap/frees-by-size:objects",
+ Description: "Distribution of all objects freed by approximate size.",
+ Kind: KindFloat64Histogram,
+ },
+ {
+ Name: "/gc/heap/goal:bytes",
+ Description: "Heap size target for the end of the GC cycle.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/gc/heap/objects:objects",
+ Description: "Number of objects, live or unswept, occupying heap memory.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/gc/pauses:seconds",
+ Description: "Distribution individual GC-related stop-the-world pause latencies.",
+ Kind: KindFloat64Histogram,
+ },
+ {
+ Name: "/memory/classes/heap/free:bytes",
+ Description: "Memory that is available for allocation, and may be returned to the underlying system.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/heap/objects:bytes",
+ Description: "Memory occupied by live objects and dead objects that have not yet been collected.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/heap/released:bytes",
+ Description: "Memory that has been returned to the underlying system.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/heap/stacks:bytes",
+ Description: "Memory allocated from the heap that is occupied by stacks.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/heap/unused:bytes",
+ Description: "Memory that is unavailable for allocation, but cannot be returned to the underlying system.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/metadata/mcache/free:bytes",
+ Description: "Memory that is reserved for runtime mcache structures, but not in-use.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/metadata/mcache/inuse:bytes",
+ Description: "Memory that is occupied by runtime mcache structures that are currently being used.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/metadata/mspan/free:bytes",
+ Description: "Memory that is reserved for runtime mspan structures, but not in-use.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/metadata/mspan/inuse:bytes",
+ Description: "Memory that is occupied by runtime mspan structures that are currently being used.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/metadata/other:bytes",
+ Description: "Memory that is reserved for or used to hold runtime metadata.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/os-stacks:bytes",
+ Description: "Stack memory allocated by the underlying operating system.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/other:bytes",
+ Description: "Memory used by execution trace buffers, structures for debugging the runtime, finalizer and profiler specials, and more.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/profiling/buckets:bytes",
+ Description: "Memory that is used by the stack trace hash map used for profiling.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/memory/classes/total:bytes",
+ Description: "All memory mapped by the Go runtime into the current process as read-write. Note that this does not include memory mapped by code called via cgo or via the syscall package. Sum of all metrics in /memory/classes.",
+ Kind: KindUint64,
+ },
+ {
+ Name: "/sched/goroutines:goroutines",
+ Description: "Count of live goroutines.",
+ Kind: KindUint64,
+ },
+}
+
+// All returns a slice containing metric descriptions for all supported metrics.
+func All() []Description {
+ return allDesc
+}
diff --git a/src/runtime/metrics/description_test.go b/src/runtime/metrics/description_test.go
new file mode 100644
index 0000000000..e966a281a1
--- /dev/null
+++ b/src/runtime/metrics/description_test.go
@@ -0,0 +1,125 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package metrics_test
+
+import (
+ "bufio"
+ "os"
+ "path/filepath"
+ "regexp"
+ "runtime"
+ "runtime/metrics"
+ "strings"
+ "testing"
+)
+
+func TestDescriptionNameFormat(t *testing.T) {
+ r := regexp.MustCompile("^(?P<name>/[^:]+):(?P<unit>[^:*/]+(?:[*/][^:*/]+)*)$")
+ descriptions := metrics.All()
+ for _, desc := range descriptions {
+ if !r.MatchString(desc.Name) {
+ t.Errorf("metrics %q does not match regexp %s", desc.Name, r)
+ }
+ }
+}
+
+func extractMetricDocs(t *testing.T) map[string]string {
+ if runtime.GOOS == "android" {
+ t.Skip("no access to Go source on android")
+ }
+
+ // Get doc.go.
+ _, filename, _, _ := runtime.Caller(0)
+ filename = filepath.Join(filepath.Dir(filename), "doc.go")
+
+ f, err := os.Open(filename)
+ if err != nil {
+ t.Fatal(err)
+ }
+ const (
+ stateSearch = iota // look for list of metrics
+ stateNextMetric // look for next metric
+ stateNextDescription // build description
+ )
+ state := stateSearch
+ s := bufio.NewScanner(f)
+ result := make(map[string]string)
+ var metric string
+ var prevMetric string
+ var desc strings.Builder
+ for s.Scan() {
+ line := strings.TrimSpace(s.Text())
+ switch state {
+ case stateSearch:
+ if line == "Supported metrics" {
+ state = stateNextMetric
+ }
+ case stateNextMetric:
+ // Ignore empty lines until we find a non-empty
+ // one. This will be our metric name.
+ if len(line) != 0 {
+ prevMetric = metric
+ metric = line
+ if prevMetric > metric {
+ t.Errorf("metrics %s and %s are out of lexicographical order", prevMetric, metric)
+ }
+ state = stateNextDescription
+ }
+ case stateNextDescription:
+ if len(line) == 0 || line == `*/` {
+ // An empty line means we're done.
+ // Write down the description and look
+ // for a new metric.
+ result[metric] = desc.String()
+ desc.Reset()
+ state = stateNextMetric
+ } else {
+ // As long as we're seeing data, assume that's
+ // part of the description and append it.
+ if desc.Len() != 0 {
+ // Turn previous newlines into spaces.
+ desc.WriteString(" ")
+ }
+ desc.WriteString(line)
+ }
+ }
+ if line == `*/` {
+ break
+ }
+ }
+ if state == stateSearch {
+ t.Fatalf("failed to find supported metrics docs in %s", filename)
+ }
+ return result
+}
+
+func TestDescriptionDocs(t *testing.T) {
+ docs := extractMetricDocs(t)
+ descriptions := metrics.All()
+ for _, d := range descriptions {
+ want := d.Description
+ got, ok := docs[d.Name]
+ if !ok {
+ t.Errorf("no docs found for metric %s", d.Name)
+ continue
+ }
+ if got != want {
+ t.Errorf("mismatched description and docs for metric %s", d.Name)
+ t.Errorf("want: %q, got %q", want, got)
+ continue
+ }
+ }
+ if len(docs) > len(descriptions) {
+ docsLoop:
+ for name := range docs {
+ for _, d := range descriptions {
+ if name == d.Name {
+ continue docsLoop
+ }
+ }
+ t.Errorf("stale documentation for non-existent metric: %s", name)
+ }
+ }
+}
diff --git a/src/runtime/metrics/doc.go b/src/runtime/metrics/doc.go
new file mode 100644
index 0000000000..e340f3d0dd
--- /dev/null
+++ b/src/runtime/metrics/doc.go
@@ -0,0 +1,130 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+/*
+Package metrics provides a stable interface to access implementation-defined
+metrics exported by the Go runtime. This package is similar to existing functions
+like runtime.ReadMemStats and debug.ReadGCStats, but significantly more general.
+
+The set of metrics defined by this package may evolve as the runtime itself
+evolves, and also enables variation across Go implementations, whose relevant
+metric sets may not intersect.
+
+Interface
+
+Metrics are designated by a string key, rather than, for example, a field name in
+a struct. The full list of supported metrics is always available in the slice of
+Descriptions returned by All. Each Description also includes useful information
+about the metric, such as how to display it (e.g. gauge vs. counter) and how difficult
+or disruptive it is to obtain it (e.g. do you need to stop the world?).
+
+Thus, users of this API are encouraged to sample supported metrics defined by the
+slice returned by All to remain compatible across Go versions. Of course, situations
+arise where reading specific metrics is critical. For these cases, users are
+encouraged to use build tags, and although metrics may be deprecated and removed,
+users should consider this to be an exceptional and rare event, coinciding with a
+very large change in a particular Go implementation.
+
+Each metric key also has a "kind" that describes the format of the metric's value.
+In the interest of not breaking users of this package, the "kind" for a given metric
+is guaranteed not to change. If it must change, then a new metric will be introduced
+with a new key and a new "kind."
+
+Metric key format
+
+As mentioned earlier, metric keys are strings. Their format is simple and well-defined,
+designed to be both human and machine readable. It is split into two components,
+separated by a colon: a rooted path and a unit. The choice to include the unit in
+the key is motivated by compatibility: if a metric's unit changes, its semantics likely
+did also, and a new key should be introduced.
+
+For more details on the precise definition of the metric key's path and unit formats, see
+the documentation of the Name field of the Description struct.
+
+Supported metrics
+
+ /gc/cycles/automatic:gc-cycles
+ Count of completed GC cycles generated by the Go runtime.
+
+ /gc/cycles/forced:gc-cycles
+ Count of completed forced GC cycles.
+
+ /gc/cycles/total:gc-cycles
+ Count of all completed GC cycles.
+
+ /gc/heap/allocs-by-size:objects
+ Distribution of all objects allocated by approximate size.
+
+ /gc/heap/frees-by-size:objects
+ Distribution of all objects freed by approximate size.
+
+ /gc/heap/goal:bytes
+ Heap size target for the end of the GC cycle.
+
+ /gc/heap/objects:objects
+ Number of objects, live or unswept, occupying heap memory.
+
+ /gc/pauses:seconds
+ Distribution of individual GC-related stop-the-world pause latencies.
+
+ /memory/classes/heap/free:bytes
+ Memory that is available for allocation, and may be returned
+ to the underlying system.
+
+ /memory/classes/heap/objects:bytes
+ Memory occupied by live objects and dead objects that have
+ not yet been collected.
+
+ /memory/classes/heap/released:bytes
+ Memory that has been returned to the underlying system.
+
+ /memory/classes/heap/stacks:bytes
+ Memory allocated from the heap that is occupied by stacks.
+
+ /memory/classes/heap/unused:bytes
+ Memory that is unavailable for allocation, but cannot be
+ returned to the underlying system.
+
+ /memory/classes/metadata/mcache/free:bytes
+ Memory that is reserved for runtime mcache structures, but
+ not in-use.
+
+ /memory/classes/metadata/mcache/inuse:bytes
+ Memory that is occupied by runtime mcache structures that
+ are currently being used.
+
+ /memory/classes/metadata/mspan/free:bytes
+ Memory that is reserved for runtime mspan structures, but
+ not in-use.
+
+ /memory/classes/metadata/mspan/inuse:bytes
+ Memory that is occupied by runtime mspan structures that are
+ currently being used.
+
+ /memory/classes/metadata/other:bytes
+ Memory that is reserved for or used to hold runtime
+ metadata.
+
+ /memory/classes/os-stacks:bytes
+ Stack memory allocated by the underlying operating system.
+
+ /memory/classes/other:bytes
+ Memory used by execution trace buffers, structures for
+ debugging the runtime, finalizer and profiler specials, and
+ more.
+
+ /memory/classes/profiling/buckets:bytes
+ Memory that is used by the stack trace hash map used for
+ profiling.
+
+ /memory/classes/total:bytes
+ All memory mapped by the Go runtime into the current process
+ as read-write. Note that this does not include memory mapped
+ by code called via cgo or via the syscall package.
+ Sum of all metrics in /memory/classes.
+
+ /sched/goroutines:goroutines
+ Count of live goroutines.
+*/
+package metrics
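
The key format described in the package documentation above is simple enough that a
consumer can split any metric key into its rooted path and unit by cutting at the final
colon. The following is a minimal sketch of such a helper; splitMetricKey is a
hypothetical name and ordinary user code, not part of the package or of this change:

	package main

	import (
		"fmt"
		"strings"
	)

	// splitMetricKey separates a metric key such as "/gc/heap/goal:bytes"
	// into its rooted path and its unit, using the last colon as the separator.
	func splitMetricKey(key string) (path, unit string) {
		i := strings.LastIndexByte(key, ':')
		if i < 0 {
			return key, "" // no unit; not expected for well-formed keys
		}
		return key[:i], key[i+1:]
	}

	func main() {
		path, unit := splitMetricKey("/memory/classes/heap/free:bytes")
		fmt.Println(path, unit) // /memory/classes/heap/free bytes
	}
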
diff --git a/src/runtime/metrics/histogram.go b/src/runtime/metrics/histogram.go
new file mode 100644
index 0000000000..e1364e1e26
--- /dev/null
+++ b/src/runtime/metrics/histogram.go
@@ -0,0 +1,30 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package metrics
+
+// Float64Histogram represents a distribution of float64 values.
+type Float64Histogram struct {
+ // Counts contains the weights for each histogram bucket. The length of
+ // Counts is equal to the length of Buckets (in the metric description)
+ // plus one to account for the implicit minimum bucket.
+ //
+ // Given N bucket boundaries (that is, len(Buckets) == N), the following is
+ // the mathematical relationship between Counts and Buckets.
+ // count[0] is the weight of the range (-inf, bucket[0])
+ // count[n] is the weight of the range [bucket[n-1], bucket[n]), for 0 < n < N
+ // count[N] is the weight of the range [bucket[N-1], +inf)
+ Counts []uint64
+
+ // Buckets contains the boundaries between histogram buckets, in increasing order.
+ //
+ // Because this slice contains boundaries, there are len(Buckets)+1 counts:
+ // a count for all values less than the first boundary, a count covering each
+ // [Buckets[i], Buckets[i+1]) interval, and a count for all values greater than or
+ // equal to the last boundary.
+ //
+ // For a given metric name, the value of Buckets is guaranteed not to change
+ // between calls until program exit.
+ Buckets []float64
+}
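
To make the Counts/Buckets layout above concrete, the sketch below builds a
Float64Histogram literal following the invariant documented in this change
(len(Counts) == len(Buckets)+1) and reads off the weight of one half-open interval.
The numbers are invented purely for illustration:

	package main

	import (
		"fmt"
		"runtime/metrics"
	)

	func main() {
		// Three boundaries define four ranges:
		// (-inf, 0.001), [0.001, 0.01), [0.01, 0.1), [0.1, +inf).
		h := metrics.Float64Histogram{
			Counts:  []uint64{1, 5, 3, 1},
			Buckets: []float64{0.001, 0.01, 0.1},
		}
		var total uint64
		for _, c := range h.Counts {
			total += c
		}
		fmt.Println("total weight:", total)                  // 10
		fmt.Println("weight in [0.001, 0.01):", h.Counts[1]) // 5
	}
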
diff --git a/src/runtime/metrics/sample.go b/src/runtime/metrics/sample.go
new file mode 100644
index 0000000000..b4b0979aa6
--- /dev/null
+++ b/src/runtime/metrics/sample.go
@@ -0,0 +1,37 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package metrics
+
+import (
+ _ "runtime" // depends on the runtime via a linkname'd function
+ "unsafe"
+)
+
+// Sample captures a single metric sample.
+type Sample struct {
+ // Name is the name of the metric sampled.
+ //
+ // It must correspond to a name in one of the metric descriptions
+ // returned by All.
+ Name string
+
+ // Value is the value of the metric sample.
+ Value Value
+}
+
+// Implemented in the runtime.
+func runtime_readMetrics(unsafe.Pointer, int, int)
+
+// Read populates each Value field in the given slice of metric samples.
+//
+// Desired metrics should be present in the slice with the appropriate name.
+// The user of this API is encouraged to re-use the same slice between calls.
+//
+// Metric values with names not appearing in the slice returned by All
+// will have their Value populated as KindBad to indicate that the name is
+// unknown.
+func Read(m []Sample) {
+ runtime_readMetrics(unsafe.Pointer(&m[0]), len(m), cap(m))
+}
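
As a usage note, the minimal sketch below shows the read path end to end for a caller
outside the runtime: populate Sample.Name for the metrics of interest and let Read fill
in each Value. The two names come from the supported list in the package documentation
above; assuming a runtime that supports them, both are uint64-valued, so calling Uint64
here is safe:

	package main

	import (
		"fmt"
		"runtime/metrics"
	)

	func main() {
		samples := []metrics.Sample{
			{Name: "/sched/goroutines:goroutines"},
			{Name: "/memory/classes/total:bytes"},
		}
		metrics.Read(samples)
		for _, s := range samples {
			fmt.Printf("%s = %d\n", s.Name, s.Value.Uint64())
		}
	}
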
diff --git a/src/runtime/metrics/value.go b/src/runtime/metrics/value.go
new file mode 100644
index 0000000000..0b056b4ea8
--- /dev/null
+++ b/src/runtime/metrics/value.go
@@ -0,0 +1,69 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package metrics
+
+import (
+ "math"
+ "unsafe"
+)
+
+// ValueKind is a tag for a metric Value which indicates its type.
+type ValueKind int
+
+const (
+ // KindBad indicates that the Value has no type and should not be used.
+ KindBad ValueKind = iota
+
+ // KindUint64 indicates that the type of the Value is a uint64.
+ KindUint64
+
+ // KindFloat64 indicates that the type of the Value is a float64.
+ KindFloat64
+
+ // KindFloat64Histogram indicates that the type of the Value is a *Float64Histogram.
+ KindFloat64Histogram
+)
+
+// Value represents a metric value returned by the runtime.
+type Value struct {
+ kind ValueKind
+ scalar uint64 // contains scalar values for scalar Kinds.
+ pointer unsafe.Pointer // contains non-scalar values.
+}
+
+// Kind returns a tag representing the kind of value this is.
+func (v Value) Kind() ValueKind {
+ return v.kind
+}
+
+// Uint64 returns the internal uint64 value for the metric.
+//
+// If v.Kind() != KindUint64, this method panics.
+func (v Value) Uint64() uint64 {
+ if v.kind != KindUint64 {
+ panic("called Uint64 on non-uint64 metric value")
+ }
+ return v.scalar
+}
+
+// Float64 returns the internal float64 value for the metric.
+//
+// If v.Kind() != KindFloat64, this method panics.
+func (v Value) Float64() float64 {
+ if v.kind != KindFloat64 {
+ panic("called Float64 on non-float64 metric value")
+ }
+ return math.Float64frombits(v.scalar)
+}
+
+// Float64Histogram returns the internal *Float64Histogram value for the metric.
+//
+// If v.Kind() != KindFloat64Histogram, this method panics.
+func (v Value) Float64Histogram() *Float64Histogram {
+ if v.kind != KindFloat64Histogram {
+ panic("called Float64Histogram on non-Float64Histogram metric value")
+ }
+ return (*Float64Histogram)(v.pointer)
+}
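
Since the kind for a given metric is guaranteed not to change, a consumer typically
switches on Value.Kind once per sample. The sketch below is ordinary user code, not part
of this change: it reads every metric the running binary supports and prints each one
according to its kind:

	package main

	import (
		"fmt"
		"runtime/metrics"
	)

	func main() {
		descs := metrics.All()
		samples := make([]metrics.Sample, len(descs))
		for i := range descs {
			samples[i].Name = descs[i].Name
		}
		metrics.Read(samples)
		for _, s := range samples {
			switch s.Value.Kind() {
			case metrics.KindUint64:
				fmt.Printf("%s: %d\n", s.Name, s.Value.Uint64())
			case metrics.KindFloat64:
				fmt.Printf("%s: %f\n", s.Name, s.Value.Float64())
			case metrics.KindFloat64Histogram:
				h := s.Value.Float64Histogram()
				fmt.Printf("%s: histogram with %d boundaries\n", s.Name, len(h.Buckets))
			default:
				fmt.Printf("%s: unsupported kind\n", s.Name)
			}
		}
	}
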
diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go
new file mode 100644
index 0000000000..167edd57fd
--- /dev/null
+++ b/src/runtime/metrics_test.go
@@ -0,0 +1,224 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ "runtime"
+ "runtime/metrics"
+ "sort"
+ "strings"
+ "testing"
+ "time"
+ "unsafe"
+)
+
+func prepareAllMetricsSamples() (map[string]metrics.Description, []metrics.Sample) {
+ all := metrics.All()
+ samples := make([]metrics.Sample, len(all))
+ descs := make(map[string]metrics.Description)
+ for i := range all {
+ samples[i].Name = all[i].Name
+ descs[all[i].Name] = all[i]
+ }
+ return descs, samples
+}
+
+func TestReadMetrics(t *testing.T) {
+ // Tests whether readMetrics produces values aligning
+ // with ReadMemStats while the world is stopped.
+ var mstats runtime.MemStats
+ _, samples := prepareAllMetricsSamples()
+ runtime.ReadMetricsSlow(&mstats, unsafe.Pointer(&samples[0]), len(samples), cap(samples))
+
+ checkUint64 := func(t *testing.T, m string, got, want uint64) {
+ t.Helper()
+ if got != want {
+ t.Errorf("metric %q: got %d, want %d", m, got, want)
+ }
+ }
+
+ // Check to make sure the values we read line up with other values we read.
+ for i := range samples {
+ switch name := samples[i].Name; name {
+ case "/memory/classes/heap/free:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapIdle-mstats.HeapReleased)
+ case "/memory/classes/heap/released:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapReleased)
+ case "/memory/classes/heap/objects:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapAlloc)
+ case "/memory/classes/heap/unused:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapInuse-mstats.HeapAlloc)
+ case "/memory/classes/heap/stacks:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.StackInuse)
+ case "/memory/classes/metadata/mcache/free:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.MCacheSys-mstats.MCacheInuse)
+ case "/memory/classes/metadata/mcache/inuse:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.MCacheInuse)
+ case "/memory/classes/metadata/mspan/free:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.MSpanSys-mstats.MSpanInuse)
+ case "/memory/classes/metadata/mspan/inuse:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.MSpanInuse)
+ case "/memory/classes/metadata/other:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.GCSys)
+ case "/memory/classes/os-stacks:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.StackSys-mstats.StackInuse)
+ case "/memory/classes/other:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.OtherSys)
+ case "/memory/classes/profiling/buckets:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.BuckHashSys)
+ case "/memory/classes/total:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.Sys)
+ case "/gc/heap/objects:objects":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.HeapObjects)
+ case "/gc/heap/goal:bytes":
+ checkUint64(t, name, samples[i].Value.Uint64(), mstats.NextGC)
+ case "/gc/cycles/automatic:gc-cycles":
+ checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC-mstats.NumForcedGC))
+ case "/gc/cycles/forced:gc-cycles":
+ checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumForcedGC))
+ case "/gc/cycles/total:gc-cycles":
+ checkUint64(t, name, samples[i].Value.Uint64(), uint64(mstats.NumGC))
+ }
+ }
+}
+
+func TestReadMetricsConsistency(t *testing.T) {
+ // Tests whether readMetrics produces consistent, sensible values.
+ // The values are read concurrently with the runtime doing other
+ // things (e.g. allocating), so what we read can't reasonably be compared
+ // to runtime values.
+
+ // Run a few GC cycles to get some of the stats to be non-zero.
+ runtime.GC()
+ runtime.GC()
+ runtime.GC()
+
+ // Read all the supported metrics through the metrics package.
+ descs, samples := prepareAllMetricsSamples()
+ metrics.Read(samples)
+
+ // Check to make sure the values we read make sense.
+ var totalVirtual struct {
+ got, want uint64
+ }
+ var objects struct {
+ alloc, free *metrics.Float64Histogram
+ total uint64
+ }
+ var gc struct {
+ numGC uint64
+ pauses uint64
+ }
+ for i := range samples {
+ kind := samples[i].Value.Kind()
+ if want := descs[samples[i].Name].Kind; kind != want {
+ t.Errorf("supported metric %q has unexpected kind: got %d, want %d", samples[i].Name, kind, want)
+ continue
+ }
+ if samples[i].Name != "/memory/classes/total:bytes" && strings.HasPrefix(samples[i].Name, "/memory/classes") {
+ v := samples[i].Value.Uint64()
+ totalVirtual.want += v
+
+ // None of these stats should ever get this big.
+ // If they do, there's probably overflow involved,
+ // usually due to bad accounting.
+ if int64(v) < 0 {
+ t.Errorf("%q has high/negative value: %d", samples[i].Name, v)
+ }
+ }
+ switch samples[i].Name {
+ case "/memory/classes/total:bytes":
+ totalVirtual.got = samples[i].Value.Uint64()
+ case "/gc/heap/objects:objects":
+ objects.total = samples[i].Value.Uint64()
+ case "/gc/heap/allocs-by-size:objects":
+ objects.alloc = samples[i].Value.Float64Histogram()
+ case "/gc/heap/frees-by-size:objects":
+ objects.free = samples[i].Value.Float64Histogram()
+ case "/gc/cycles/total:gc-cycles":
+ gc.numGC = samples[i].Value.Uint64()
+ case "/gc/pauses:seconds":
+ h := samples[i].Value.Float64Histogram()
+ gc.pauses = 0
+ for i := range h.Counts {
+ gc.pauses += h.Counts[i]
+ }
+ case "/sched/goroutines:goroutines":
+ if samples[i].Value.Uint64() < 1 {
+ t.Error("number of goroutines is less than one")
+ }
+ }
+ }
+ if totalVirtual.got != totalVirtual.want {
+ t.Errorf(`"/memory/classes/total:bytes" does not match sum of /memory/classes/**: got %d, want %d`, totalVirtual.got, totalVirtual.want)
+ }
+ if len(objects.alloc.Buckets) != len(objects.free.Buckets) {
+ t.Error("allocs-by-size and frees-by-size buckets don't match in length")
+ } else if len(objects.alloc.Counts) != len(objects.free.Counts) {
+ t.Error("allocs-by-size and frees-by-size counts don't match in length")
+ } else {
+ for i := range objects.alloc.Buckets {
+ ba := objects.alloc.Buckets[i]
+ bf := objects.free.Buckets[i]
+ if ba != bf {
+ t.Errorf("bucket %d is different for alloc and free hists: %f != %f", i, ba, bf)
+ }
+ }
+ if !t.Failed() {
+ got, want := uint64(0), objects.total
+ for i := range objects.alloc.Counts {
+ if objects.alloc.Counts[i] < objects.free.Counts[i] {
+ t.Errorf("found more frees than allocs in object dist bucket %d", i)
+ continue
+ }
+ got += objects.alloc.Counts[i] - objects.free.Counts[i]
+ }
+ if got != want {
+ t.Errorf("object distribution counts don't match count of live objects: got %d, want %d", got, want)
+ }
+ }
+ }
+ // The current GC has at least 2 pauses per GC.
+ // Check to see if that value makes sense.
+ if gc.pauses < gc.numGC*2 {
+ t.Errorf("fewer pauses than expected: got %d, want at least %d", gc.pauses, gc.numGC*2)
+ }
+}
+
+func BenchmarkReadMetricsLatency(b *testing.B) {
+ stop := applyGCLoad(b)
+
+ // Preallocate space for the measured latencies.
+ latencies := make([]time.Duration, 0, 1024)
+ _, samples := prepareAllMetricsSamples()
+
+ // Hit metrics.Read continuously and measure.
+ b.ResetTimer()
+ for i := 0; i < b.N; i++ {
+ start := time.Now()
+ metrics.Read(samples)
+ latencies = append(latencies, time.Since(start))
+ }
+ // Make sure to stop the timer before we wait! The load created above
+ // is very heavy-weight and not easy to stop, so we could end up
+ // confusing the benchmarking framework for small b.N.
+ b.StopTimer()
+ stop()
+
+ // Disable the default */op metrics.
+ // ns/op doesn't mean anything here because it's an average, and what
+ // we care about are the latency percentiles reported below.
+ b.ReportMetric(0, "ns/op")
+ b.ReportMetric(0, "B/op")
+ b.ReportMetric(0, "allocs/op")
+
+ // Sort latencies then report percentiles.
+ sort.Slice(latencies, func(i, j int) bool {
+ return latencies[i] < latencies[j]
+ })
+ b.ReportMetric(float64(latencies[len(latencies)*50/100]), "p50-ns")
+ b.ReportMetric(float64(latencies[len(latencies)*90/100]), "p90-ns")
+ b.ReportMetric(float64(latencies[len(latencies)*99/100]), "p99-ns")
+}
diff --git a/src/runtime/mfinal.go b/src/runtime/mfinal.go
index cd6196dcab..6ec5133be0 100644
--- a/src/runtime/mfinal.go
+++ b/src/runtime/mfinal.go
@@ -88,7 +88,7 @@ func queuefinalizer(p unsafe.Pointer, fn *funcval, nret uintptr, fint *_type, ot
lock(&finlock)
if finq == nil || finq.cnt == uint32(len(finq.fin)) {
if finc == nil {
- finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gc_sys))
+ finc = (*finblock)(persistentalloc(_FinBlockSize, 0, &memstats.gcMiscSys))
finc.alllink = allfin
allfin = finc
if finptrmask[0] == 0 {
@@ -210,7 +210,7 @@ func runfinq() {
// set up with empty interface
(*eface)(frame)._type = &f.ot.typ
(*eface)(frame).data = f.arg
- if len(ityp.mhdr) != 0 {
+ if !ityp.isEmpty() {
// convert to interface with methods
// this conversion is guaranteed to succeed - we checked in SetFinalizer
*(*iface)(frame) = assertE2I(ityp, *(*eface)(frame))
@@ -394,7 +394,7 @@ func SetFinalizer(obj interface{}, finalizer interface{}) {
}
case fint.kind&kindMask == kindInterface:
ityp := (*interfacetype)(unsafe.Pointer(fint))
- if len(ityp.mhdr) == 0 {
+ if ityp.isEmpty() {
// ok - satisfies empty interface
goto okarg
}
diff --git a/src/runtime/mfixalloc.go b/src/runtime/mfixalloc.go
index f9dd6ca474..293c16b38b 100644
--- a/src/runtime/mfixalloc.go
+++ b/src/runtime/mfixalloc.go
@@ -32,7 +32,7 @@ type fixalloc struct {
chunk uintptr // use uintptr instead of unsafe.Pointer to avoid write barriers
nchunk uint32
inuse uintptr // in-use bytes now
- stat *uint64
+ stat *sysMemStat
zero bool // zero allocations
}
@@ -49,7 +49,7 @@ type mlink struct {
// Initialize f to allocate objects of the given size,
// using the allocator to obtain chunks of memory.
-func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *uint64) {
+func (f *fixalloc) init(size uintptr, first func(arg, p unsafe.Pointer), arg unsafe.Pointer, stat *sysMemStat) {
f.size = size
f.first = first
f.arg = arg
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index bd87144355..b0ab0ae6bb 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -388,10 +388,24 @@ type gcControllerState struct {
// bytes that should be performed by mutator assists. This is
// computed at the beginning of each cycle and updated every
// time heap_scan is updated.
- assistWorkPerByte float64
+ //
+ // Stored as a uint64, but it's actually a float64. Use
+ // float64frombits to get the value.
+ //
+ // Read and written atomically.
+ assistWorkPerByte uint64
// assistBytesPerWork is 1/assistWorkPerByte.
- assistBytesPerWork float64
+ //
+ // Stored as a uint64, but it's actually a float64. Use
+ // float64frombits to get the value.
+ //
+ // Read and written atomically.
+ //
+ // Note that because this is read and written independently
+ // from assistWorkPerByte, users may notice a skew between
+ // the two values; such a state should be safe.
+ assistBytesPerWork uint64
// fractionalUtilizationGoal is the fraction of wall clock
// time that should be spent in the fractional mark worker on
@@ -409,7 +423,8 @@ type gcControllerState struct {
}
// startCycle resets the GC controller's state and computes estimates
-// for a new GC cycle. The caller must hold worldsema.
+// for a new GC cycle. The caller must hold worldsema and the world
+// must be stopped.
func (c *gcControllerState) startCycle() {
c.scanWork = 0
c.bgScanCredit = 0
@@ -469,7 +484,8 @@ func (c *gcControllerState) startCycle() {
c.revise()
if debug.gcpacertrace > 0 {
- print("pacer: assist ratio=", c.assistWorkPerByte,
+ assistRatio := float64frombits(atomic.Load64(&c.assistWorkPerByte))
+ print("pacer: assist ratio=", assistRatio,
" (scan ", memstats.heap_scan>>20, " MB in ",
work.initialHeapLive>>20, "->",
memstats.next_gc>>20, " MB)",
@@ -479,9 +495,22 @@ func (c *gcControllerState) startCycle() {
}
// revise updates the assist ratio during the GC cycle to account for
-// improved estimates. This should be called either under STW or
-// whenever memstats.heap_scan, memstats.heap_live, or
-// memstats.next_gc is updated (with mheap_.lock held).
+// improved estimates. This should be called whenever memstats.heap_scan,
+// memstats.heap_live, or memstats.next_gc is updated. It is safe to
+// call concurrently, but it may race with other calls to revise.
+//
+// The result of this race is that the two assist ratio values may not line
+// up or may be stale. In practice this is OK because the assist ratio
+// moves slowly throughout a GC cycle, and the assist ratio is a best-effort
+// heuristic anyway. Furthermore, no part of the heuristic depends on
+// the two assist ratio values being exact reciprocals of one another, since
+// the two values are used to convert values from different sources.
+//
+// The worst case result of this raciness is that we may miss a larger shift
+// in the ratio (say, if we decide to pace more aggressively against the
+// hard heap goal) but even this "hard goal" is best-effort (see #40460).
+// The dedicated GC should ensure we don't exceed the hard goal by too much
+// in the rare case we do exceed it.
//
// It should only be called when gcBlackenEnabled != 0 (because this
// is when assists are enabled and the necessary statistics are
@@ -494,10 +523,12 @@ func (c *gcControllerState) revise() {
gcpercent = 100000
}
live := atomic.Load64(&memstats.heap_live)
+ scan := atomic.Load64(&memstats.heap_scan)
+ work := atomic.Loadint64(&c.scanWork)
// Assume we're under the soft goal. Pace GC to complete at
// next_gc assuming the heap is in steady-state.
- heapGoal := int64(memstats.next_gc)
+ heapGoal := int64(atomic.Load64(&memstats.next_gc))
// Compute the expected scan work remaining.
//
@@ -508,17 +539,17 @@ func (c *gcControllerState) revise() {
//
// (This is a float calculation to avoid overflowing on
// 100*heap_scan.)
- scanWorkExpected := int64(float64(memstats.heap_scan) * 100 / float64(100+gcpercent))
+ scanWorkExpected := int64(float64(scan) * 100 / float64(100+gcpercent))
- if live > memstats.next_gc || c.scanWork > scanWorkExpected {
+ if int64(live) > heapGoal || work > scanWorkExpected {
// We're past the soft goal, or we've already done more scan
// work than we expected. Pace GC so that in the worst case it
// will complete by the hard goal.
const maxOvershoot = 1.1
- heapGoal = int64(float64(memstats.next_gc) * maxOvershoot)
+ heapGoal = int64(float64(heapGoal) * maxOvershoot)
// Compute the upper bound on the scan work remaining.
- scanWorkExpected = int64(memstats.heap_scan)
+ scanWorkExpected = int64(scan)
}
// Compute the remaining scan work estimate.
@@ -528,7 +559,7 @@ func (c *gcControllerState) revise() {
// (scanWork), so allocation will change this difference
// slowly in the soft regime and not at all in the hard
// regime.
- scanWorkRemaining := scanWorkExpected - c.scanWork
+ scanWorkRemaining := scanWorkExpected - work
if scanWorkRemaining < 1000 {
// We set a somewhat arbitrary lower bound on
// remaining scan work since if we aim a little high,
@@ -552,8 +583,15 @@ func (c *gcControllerState) revise() {
// Compute the mutator assist ratio so by the time the mutator
// allocates the remaining heap bytes up to next_gc, it will
// have done (or stolen) the remaining amount of scan work.
- c.assistWorkPerByte = float64(scanWorkRemaining) / float64(heapRemaining)
- c.assistBytesPerWork = float64(heapRemaining) / float64(scanWorkRemaining)
+ // Note that the assist ratio values are updated atomically
+ // but not together. This means there may be some degree of
+ // skew between the two values. This is generally OK as the
+ // values shift relatively slowly over the course of a GC
+ // cycle.
+ assistWorkPerByte := float64(scanWorkRemaining) / float64(heapRemaining)
+ assistBytesPerWork := float64(heapRemaining) / float64(scanWorkRemaining)
+ atomic.Store64(&c.assistWorkPerByte, float64bits(assistWorkPerByte))
+ atomic.Store64(&c.assistBytesPerWork, float64bits(assistBytesPerWork))
}
// endCycle computes the trigger ratio for the next cycle.
@@ -844,7 +882,7 @@ func gcSetTriggerRatio(triggerRatio float64) {
// Commit to the trigger and goal.
memstats.gc_trigger = trigger
- memstats.next_gc = goal
+ atomic.Store64(&memstats.next_gc, goal)
if trace.enabled {
traceNextGC()
}
@@ -901,7 +939,7 @@ func gcSetTriggerRatio(triggerRatio float64) {
//
// mheap_.lock must be held or the world must be stopped.
func gcEffectiveGrowthRatio() float64 {
- egogc := float64(memstats.next_gc-memstats.heap_marked) / float64(memstats.heap_marked)
+ egogc := float64(atomic.Load64(&memstats.next_gc)-memstats.heap_marked) / float64(memstats.heap_marked)
if egogc < 0 {
// Shouldn't happen, but just in case.
egogc = 0
@@ -983,7 +1021,6 @@ var work struct {
nproc uint32
tstart int64
nwait uint32
- ndone uint32
// Number of roots of various root types. Set by gcMarkRootPrepare.
nFlushCacheRoots int
@@ -1381,6 +1418,7 @@ func gcStart(trigger gcTrigger) {
now = startTheWorldWithSema(trace.enabled)
work.pauseNS += now - work.pauseStart
work.tMark = now
+ memstats.gcPauseDist.record(now - work.pauseStart)
})
// Release the world sema before Gosched() in STW mode
@@ -1407,19 +1445,6 @@ func gcStart(trigger gcTrigger) {
// This is protected by markDoneSema.
var gcMarkDoneFlushed uint32
-// debugCachedWork enables extra checks for debugging premature mark
-// termination.
-//
-// For debugging issue #27993.
-const debugCachedWork = false
-
-// gcWorkPauseGen is for debugging the mark completion algorithm.
-// gcWork put operations spin while gcWork.pauseGen == gcWorkPauseGen.
-// Only used if debugCachedWork is true.
-//
-// For debugging issue #27993.
-var gcWorkPauseGen uint32 = 1
-
// gcMarkDone transitions the GC from mark to mark termination if all
// reachable objects have been marked (that is, there are no grey
// objects and can be no more in the future). Otherwise, it flushes
@@ -1475,15 +1500,7 @@ top:
// Flush the write barrier buffer, since this may add
// work to the gcWork.
wbBufFlush1(_p_)
- // For debugging, shrink the write barrier
- // buffer so it flushes immediately.
- // wbBuf.reset will keep it at this size as
- // long as throwOnGCWork is set.
- if debugCachedWork {
- b := &_p_.wbBuf
- b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers]))
- b.debugGen = gcWorkPauseGen
- }
+
// Flush the gcWork, since this may create global work
// and set the flushedWork flag.
//
@@ -1494,29 +1511,12 @@ top:
if _p_.gcw.flushedWork {
atomic.Xadd(&gcMarkDoneFlushed, 1)
_p_.gcw.flushedWork = false
- } else if debugCachedWork {
- // For debugging, freeze the gcWork
- // until we know whether we've reached
- // completion or not. If we think
- // we've reached completion, but
- // there's a paused gcWork, then
- // that's a bug.
- _p_.gcw.pauseGen = gcWorkPauseGen
- // Capture the G's stack.
- for i := range _p_.gcw.pauseStack {
- _p_.gcw.pauseStack[i] = 0
- }
- callers(1, _p_.gcw.pauseStack[:])
}
})
casgstatus(gp, _Gwaiting, _Grunning)
})
if gcMarkDoneFlushed != 0 {
- if debugCachedWork {
- // Release paused gcWorks.
- atomic.Xadd(&gcWorkPauseGen, 1)
- }
// More grey objects were discovered since the
// previous termination check, so there may be more
// work to do. Keep going. It's possible the
@@ -1526,13 +1526,6 @@ top:
goto top
}
- if debugCachedWork {
- throwOnGCWork = true
- // Release paused gcWorks. If there are any, they
- // should now observe throwOnGCWork and panic.
- atomic.Xadd(&gcWorkPauseGen, 1)
- }
-
// There was no global work, no local work, and no Ps
// communicated work since we took markDoneSema. Therefore
// there are no grey objects and no more objects can be
@@ -1549,59 +1542,34 @@ top:
// below. The important thing is that the wb remains active until
// all marking is complete. This includes writes made by the GC.
- if debugCachedWork {
- // For debugging, double check that no work was added after we
- // went around above and disable write barrier buffering.
+ // There is sometimes work left over when we enter mark termination due
+ // to write barriers performed after the completion barrier above.
+ // Detect this and resume concurrent mark. This is obviously
+ // unfortunate.
+ //
+ // See issue #27993 for details.
+ //
+ // Switch to the system stack to call wbBufFlush1, though in this case
+ // it doesn't matter because we're non-preemptible anyway.
+ restart := false
+ systemstack(func() {
for _, p := range allp {
- gcw := &p.gcw
- if !gcw.empty() {
- printlock()
- print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork)
- if gcw.wbuf1 == nil {
- print(" wbuf1=<nil>")
- } else {
- print(" wbuf1.n=", gcw.wbuf1.nobj)
- }
- if gcw.wbuf2 == nil {
- print(" wbuf2=<nil>")
- } else {
- print(" wbuf2.n=", gcw.wbuf2.nobj)
- }
- print("\n")
- if gcw.pauseGen == gcw.putGen {
- println("runtime: checkPut already failed at this generation")
- }
- throw("throwOnGCWork")
+ wbBufFlush1(p)
+ if !p.gcw.empty() {
+ restart = true
+ break
}
}
- } else {
- // For unknown reasons (see issue #27993), there is
- // sometimes work left over when we enter mark
- // termination. Detect this and resume concurrent
- // mark. This is obviously unfortunate.
- //
- // Switch to the system stack to call wbBufFlush1,
- // though in this case it doesn't matter because we're
- // non-preemptible anyway.
- restart := false
+ })
+ if restart {
+ getg().m.preemptoff = ""
systemstack(func() {
- for _, p := range allp {
- wbBufFlush1(p)
- if !p.gcw.empty() {
- restart = true
- break
- }
- }
+ now := startTheWorldWithSema(true)
+ work.pauseNS += now - work.pauseStart
+ memstats.gcPauseDist.record(now - work.pauseStart)
})
- if restart {
- getg().m.preemptoff = ""
- systemstack(func() {
- now := startTheWorldWithSema(true)
- work.pauseNS += now - work.pauseStart
- })
- semrelease(&worldsema)
- goto top
- }
+ semrelease(&worldsema)
+ goto top
}
// Disable assists and background workers. We must do
@@ -1630,10 +1598,10 @@ top:
gcMarkTermination(nextTriggerRatio)
}
+// World must be stopped and mark assists and background workers must be
+// disabled.
func gcMarkTermination(nextTriggerRatio float64) {
- // World is stopped.
- // Start marktermination which includes enabling the write barrier.
- atomic.Store(&gcBlackenEnabled, 0)
+ // Start marktermination (write barrier remains enabled for now).
setGCPhase(_GCmarktermination)
work.heap1 = memstats.heap_live
@@ -1711,6 +1679,7 @@ func gcMarkTermination(nextTriggerRatio float64) {
unixNow := sec*1e9 + int64(nsec)
work.pauseNS += now - work.pauseStart
work.tEnd = now
+ memstats.gcPauseDist.record(now - work.pauseStart)
atomic.Store64(&memstats.last_gc_unix, uint64(unixNow)) // must be Unix time to make sense to user
atomic.Store64(&memstats.last_gc_nanotime, uint64(now)) // monotonic time for us
memstats.pause_ns[memstats.numgc%uint32(len(memstats.pause_ns))] = uint64(work.pauseNS)
@@ -2085,7 +2054,7 @@ func gcMark(start_time int64) {
// ensured all reachable objects were marked, all of
// these must be pointers to black objects. Hence we
// can just discard the write barrier buffer.
- if debug.gccheckmark > 0 || throwOnGCWork {
+ if debug.gccheckmark > 0 {
// For debugging, flush the buffer and make
// sure it really was all marked.
wbBufFlush1(p)
@@ -2117,13 +2086,21 @@ func gcMark(start_time int64) {
gcw.dispose()
}
- throwOnGCWork = false
-
- cachestats()
-
// Update the marked heap stat.
memstats.heap_marked = work.bytesMarked
+ // Flush scanAlloc from each mcache since we're about to modify
+ // heap_scan directly. If we were to flush this later, then scanAlloc
+ // might have incorrect information.
+ for _, p := range allp {
+ c := p.mcache
+ if c == nil {
+ continue
+ }
+ memstats.heap_scan += uint64(c.scanAlloc)
+ c.scanAlloc = 0
+ }
+
// Update other GC heap size stats. This must happen after
// cachestats (which flushes local statistics to these) and
// flushallmcaches (which modifies heap_live).
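
The mgc.go changes above store assistWorkPerByte and assistBytesPerWork as raw IEEE 754
bit patterns in uint64 fields so that revise and its readers can load and store them
atomically via float64bits/float64frombits. Outside the runtime the same pattern can be
written with math and sync/atomic; the sketch below is illustrative only, and the
atomicFloat64 type is an invented name, not something the runtime defines:

	package main

	import (
		"fmt"
		"math"
		"sync/atomic"
	)

	// atomicFloat64 holds a float64 as its uint64 bit pattern so that
	// loads and stores can use the machine's atomic word operations.
	type atomicFloat64 struct {
		bits uint64
	}

	func (f *atomicFloat64) Store(v float64) {
		atomic.StoreUint64(&f.bits, math.Float64bits(v))
	}

	func (f *atomicFloat64) Load() float64 {
		return math.Float64frombits(atomic.LoadUint64(&f.bits))
	}

	func main() {
		var ratio atomicFloat64
		ratio.Store(0.25)
		fmt.Println(ratio.Load()) // 0.25
	}
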
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 79df59d6d6..c71c0e58d3 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -400,11 +400,13 @@ retry:
// balance positive. When the required amount of work is low,
// we over-assist to build up credit for future allocations
// and amortize the cost of assisting.
+ assistWorkPerByte := float64frombits(atomic.Load64(&gcController.assistWorkPerByte))
+ assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork))
debtBytes := -gp.gcAssistBytes
- scanWork := int64(gcController.assistWorkPerByte * float64(debtBytes))
+ scanWork := int64(assistWorkPerByte * float64(debtBytes))
if scanWork < gcOverAssistWork {
scanWork = gcOverAssistWork
- debtBytes = int64(gcController.assistBytesPerWork * float64(scanWork))
+ debtBytes = int64(assistBytesPerWork * float64(scanWork))
}
// Steal as much credit as we can from the background GC's
@@ -418,7 +420,7 @@ retry:
if bgScanCredit > 0 {
if bgScanCredit < scanWork {
stolen = bgScanCredit
- gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(stolen))
+ gp.gcAssistBytes += 1 + int64(assistBytesPerWork*float64(stolen))
} else {
stolen = scanWork
gp.gcAssistBytes += debtBytes
@@ -543,7 +545,8 @@ func gcAssistAlloc1(gp *g, scanWork int64) {
// this scan work counts for. The "1+" is a poor man's
// round-up, to ensure this adds credit even if
// assistBytesPerWork is very low.
- gp.gcAssistBytes += 1 + int64(gcController.assistBytesPerWork*float64(workDone))
+ assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork))
+ gp.gcAssistBytes += 1 + int64(assistBytesPerWork*float64(workDone))
// If this is the last worker and we ran out of work,
// signal a completion point.
@@ -637,7 +640,8 @@ func gcFlushBgCredit(scanWork int64) {
return
}
- scanBytes := int64(float64(scanWork) * gcController.assistBytesPerWork)
+ assistBytesPerWork := float64frombits(atomic.Load64(&gcController.assistBytesPerWork))
+ scanBytes := int64(float64(scanWork) * assistBytesPerWork)
lock(&work.assistQueue.lock)
for !work.assistQueue.q.empty() && scanBytes > 0 {
@@ -670,7 +674,8 @@ func gcFlushBgCredit(scanWork int64) {
if scanBytes > 0 {
// Convert from scan bytes back to work.
- scanWork = int64(float64(scanBytes) * gcController.assistWorkPerByte)
+ assistWorkPerByte := float64frombits(atomic.Load64(&gcController.assistWorkPerByte))
+ scanWork = int64(float64(scanBytes) * assistWorkPerByte)
atomic.Xaddint64(&gcController.bgScanCredit, scanWork)
}
unlock(&work.assistQueue.lock)
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index 9d6f551768..5843ada981 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -100,7 +100,7 @@ const (
// heapRetained returns an estimate of the current heap RSS.
func heapRetained() uint64 {
- return atomic.Load64(&memstats.heap_sys) - atomic.Load64(&memstats.heap_released)
+ return memstats.heap_sys.load() - atomic.Load64(&memstats.heap_released)
}
// gcPaceScavenger updates the scavenger's pacing, particularly
@@ -123,7 +123,7 @@ func gcPaceScavenger() {
return
}
// Compute our scavenging goal.
- goalRatio := float64(memstats.next_gc) / float64(memstats.last_next_gc)
+ goalRatio := float64(atomic.Load64(&memstats.next_gc)) / float64(memstats.last_next_gc)
retainedGoal := uint64(float64(memstats.last_heap_inuse) * goalRatio)
// Add retainExtraPercent overhead to retainedGoal. This calculation
// looks strange but the purpose is to arrive at an integer division
@@ -390,13 +390,13 @@ func bgscavenge(c chan int) {
//
// Returns the amount of memory scavenged in bytes.
//
-// s.mheapLock must be held, but may be temporarily released if
+// p.mheapLock must be held, but may be temporarily released if
// mayUnlock == true.
//
-// Must run on the system stack because s.mheapLock must be held.
+// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
-func (s *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr {
+func (p *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr {
var (
addrs addrRange
gen uint32
@@ -404,17 +404,17 @@ func (s *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr {
released := uintptr(0)
for released < nbytes {
if addrs.size() == 0 {
- if addrs, gen = s.scavengeReserve(); addrs.size() == 0 {
+ if addrs, gen = p.scavengeReserve(); addrs.size() == 0 {
break
}
}
- r, a := s.scavengeOne(addrs, nbytes-released, mayUnlock)
+ r, a := p.scavengeOne(addrs, nbytes-released, mayUnlock)
released += r
addrs = a
}
// Only unreserve the space which hasn't been scavenged or searched
// to ensure we always make progress.
- s.scavengeUnreserve(addrs, gen)
+ p.scavengeUnreserve(addrs, gen)
return released
}
@@ -440,46 +440,46 @@ func printScavTrace(gen uint32, released uintptr, forced bool) {
// scavengeStartGen starts a new scavenge generation, resetting
// the scavenger's search space to the full in-use address space.
//
-// s.mheapLock must be held.
+// p.mheapLock must be held.
//
-// Must run on the system stack because s.mheapLock must be held.
+// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
-func (s *pageAlloc) scavengeStartGen() {
+func (p *pageAlloc) scavengeStartGen() {
if debug.scavtrace > 0 {
- printScavTrace(s.scav.gen, s.scav.released, false)
+ printScavTrace(p.scav.gen, p.scav.released, false)
}
- s.inUse.cloneInto(&s.scav.inUse)
+ p.inUse.cloneInto(&p.scav.inUse)
// Pick the new starting address for the scavenger cycle.
var startAddr offAddr
- if s.scav.scavLWM.lessThan(s.scav.freeHWM) {
+ if p.scav.scavLWM.lessThan(p.scav.freeHWM) {
// The "free" high watermark exceeds the "scavenged" low watermark,
// so there are free scavengable pages in parts of the address space
// that the scavenger already searched, the high watermark being the
// highest one. Pick that as our new starting point to ensure we
// see those pages.
- startAddr = s.scav.freeHWM
+ startAddr = p.scav.freeHWM
} else {
// The "free" high watermark does not exceed the "scavenged" low
// watermark. This means the allocator didn't free any memory in
// the range we scavenged last cycle, so we might as well continue
// scavenging from where we were.
- startAddr = s.scav.scavLWM
+ startAddr = p.scav.scavLWM
}
- s.scav.inUse.removeGreaterEqual(startAddr.addr())
+ p.scav.inUse.removeGreaterEqual(startAddr.addr())
- // reservationBytes may be zero if s.inUse.totalBytes is small, or if
+ // reservationBytes may be zero if p.inUse.totalBytes is small, or if
// scavengeReservationShards is large. This case is fine as the scavenger
// will simply be turned off, but it does mean that scavengeReservationShards,
// in concert with pallocChunkBytes, dictates the minimum heap size at which
// the scavenger triggers. In practice this minimum is generally less than an
// arena in size, so virtually every heap has the scavenger on.
- s.scav.reservationBytes = alignUp(s.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards
- s.scav.gen++
- s.scav.released = 0
- s.scav.freeHWM = minOffAddr
- s.scav.scavLWM = maxOffAddr
+ p.scav.reservationBytes = alignUp(p.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards
+ p.scav.gen++
+ p.scav.released = 0
+ p.scav.freeHWM = minOffAddr
+ p.scav.scavLWM = maxOffAddr
}
// scavengeReserve reserves a contiguous range of the address space
@@ -489,19 +489,19 @@ func (s *pageAlloc) scavengeStartGen() {
//
// Returns the reserved range and the scavenge generation number for it.
//
-// s.mheapLock must be held.
+// p.mheapLock must be held.
//
-// Must run on the system stack because s.mheapLock must be held.
+// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
-func (s *pageAlloc) scavengeReserve() (addrRange, uint32) {
+func (p *pageAlloc) scavengeReserve() (addrRange, uint32) {
// Start by reserving the minimum.
- r := s.scav.inUse.removeLast(s.scav.reservationBytes)
+ r := p.scav.inUse.removeLast(p.scav.reservationBytes)
// Return early if the size is zero; we don't want to use
// the bogus address below.
if r.size() == 0 {
- return r, s.scav.gen
+ return r, p.scav.gen
}
// The scavenger requires that base be aligned to a
@@ -511,27 +511,27 @@ func (s *pageAlloc) scavengeReserve() (addrRange, uint32) {
newBase := alignDown(r.base.addr(), pallocChunkBytes)
// Remove from inUse however much extra we just pulled out.
- s.scav.inUse.removeGreaterEqual(newBase)
+ p.scav.inUse.removeGreaterEqual(newBase)
r.base = offAddr{newBase}
- return r, s.scav.gen
+ return r, p.scav.gen
}
// scavengeUnreserve returns an unscavenged portion of a range that was
// previously reserved with scavengeReserve.
//
-// s.mheapLock must be held.
+// p.mheapLock must be held.
//
-// Must run on the system stack because s.mheapLock must be held.
+// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
-func (s *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
- if r.size() == 0 || gen != s.scav.gen {
+func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
+ if r.size() == 0 || gen != p.scav.gen {
return
}
if r.base.addr()%pallocChunkBytes != 0 {
throw("unreserving unaligned region")
}
- s.scav.inUse.add(r)
+ p.scav.inUse.add(r)
}
// scavengeOne walks over address range work until it finds
@@ -545,13 +545,13 @@ func (s *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) {
//
// work's base address must be aligned to pallocChunkBytes.
//
-// s.mheapLock must be held, but may be temporarily released if
+// p.mheapLock must be held, but may be temporarily released if
// mayUnlock == true.
//
-// Must run on the system stack because s.mheapLock must be held.
+// Must run on the system stack because p.mheapLock must be held.
//
//go:systemstack
-func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) {
+func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) {
// Defensively check if we've received an empty address range.
// If so, just return.
if work.size() == 0 {
@@ -586,12 +586,12 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
// Helpers for locking and unlocking only if mayUnlock == true.
lockHeap := func() {
if mayUnlock {
- lock(s.mheapLock)
+ lock(p.mheapLock)
}
}
unlockHeap := func() {
if mayUnlock {
- unlock(s.mheapLock)
+ unlock(p.mheapLock)
}
}
@@ -602,14 +602,14 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
// by subtracting 1.
maxAddr := work.limit.addr() - 1
maxChunk := chunkIndex(maxAddr)
- if s.summary[len(s.summary)-1][maxChunk].max() >= uint(minPages) {
+ if p.summary[len(p.summary)-1][maxChunk].max() >= uint(minPages) {
// We only bother looking for a candidate if there at least
// minPages free pages at all.
- base, npages := s.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages)
+ base, npages := p.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages)
// If we found something, scavenge it and return!
if npages != 0 {
- work.limit = offAddr{s.scavengeRangeLocked(maxChunk, base, npages)}
+ work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)}
return uintptr(npages) * pageSize, work
}
}
@@ -631,7 +631,7 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
// that's fine. We're being optimistic anyway.
// Check quickly if there are enough free pages at all.
- if s.summary[len(s.summary)-1][i].max() < uint(minPages) {
+ if p.summary[len(p.summary)-1][i].max() < uint(minPages) {
continue
}
@@ -641,7 +641,7 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
// avoid races with heap growth. It may or may not be possible to also
// see a nil pointer in this case if we do race with heap growth, but
// just defensively ignore the nils. This operation is optimistic anyway.
- l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&s.chunks[i.l1()])))
+ l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&p.chunks[i.l1()])))
if l2 != nil && l2[i.l2()].hasScavengeCandidate(minPages) {
return i, true
}
@@ -670,10 +670,10 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
}
// Find, verify, and scavenge if we can.
- chunk := s.chunkOf(candidateChunkIdx)
+ chunk := p.chunkOf(candidateChunkIdx)
base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages)
if npages > 0 {
- work.limit = offAddr{s.scavengeRangeLocked(candidateChunkIdx, base, npages)}
+ work.limit = offAddr{p.scavengeRangeLocked(candidateChunkIdx, base, npages)}
return uintptr(npages) * pageSize, work
}
@@ -690,28 +690,37 @@ func (s *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui
//
// Returns the base address of the scavenged region.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr {
- s.chunkOf(ci).scavenged.setRange(base, npages)
+// p.mheapLock must be held.
+func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr {
+ p.chunkOf(ci).scavenged.setRange(base, npages)
// Compute the full address for the start of the range.
addr := chunkBase(ci) + uintptr(base)*pageSize
// Update the scavenge low watermark.
- if oAddr := (offAddr{addr}); oAddr.lessThan(s.scav.scavLWM) {
- s.scav.scavLWM = oAddr
+ if oAddr := (offAddr{addr}); oAddr.lessThan(p.scav.scavLWM) {
+ p.scav.scavLWM = oAddr
}
// Only perform the actual scavenging if we're not in a test.
// It's dangerous to do so otherwise.
- if s.test {
+ if p.test {
return addr
}
sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize)
// Update global accounting only when not in test, otherwise
// the runtime's accounting will be wrong.
- mSysStatInc(&memstats.heap_released, uintptr(npages)*pageSize)
+ nbytes := int64(npages) * pageSize
+ atomic.Xadd64(&memstats.heap_released, nbytes)
+
+ // Update consistent accounting too.
+ c := getMCache()
+ stats := memstats.heapStats.acquire(c)
+ atomic.Xaddint64(&stats.committed, -nbytes)
+ atomic.Xaddint64(&stats.released, nbytes)
+ memstats.heapStats.release(c)
+
return addr
}
diff --git a/src/runtime/mgcscavenge_test.go b/src/runtime/mgcscavenge_test.go
index 7f619b1e7d..250343077f 100644
--- a/src/runtime/mgcscavenge_test.go
+++ b/src/runtime/mgcscavenge_test.go
@@ -235,26 +235,43 @@ func TestPallocDataFindScavengeCandidate(t *testing.T) {
if PhysHugePageSize > uintptr(PageSize) {
// Check hugepage preserving behavior.
bits := uint(PhysHugePageSize / uintptr(PageSize))
- tests["PreserveHugePageBottom"] = test{
- alloc: []BitRange{{bits + 2, PallocChunkPages - (bits + 2)}},
- min: 1,
- max: 3, // Make it so that max would have us try to break the huge page.
- want: BitRange{0, bits + 2},
- }
- if 3*bits < PallocChunkPages {
- // We need at least 3 huge pages in a chunk for this test to make sense.
- tests["PreserveHugePageMiddle"] = test{
- alloc: []BitRange{{0, bits - 10}, {2*bits + 10, PallocChunkPages - (2*bits + 10)}},
+ if bits < PallocChunkPages {
+ tests["PreserveHugePageBottom"] = test{
+ alloc: []BitRange{{bits + 2, PallocChunkPages - (bits + 2)}},
min: 1,
- max: 12, // Make it so that max would have us try to break the huge page.
- want: BitRange{bits, bits + 10},
+ max: 3, // Make it so that max would have us try to break the huge page.
+ want: BitRange{0, bits + 2},
+ }
+ if 3*bits < PallocChunkPages {
+ // We need at least 3 huge pages in a chunk for this test to make sense.
+ tests["PreserveHugePageMiddle"] = test{
+ alloc: []BitRange{{0, bits - 10}, {2*bits + 10, PallocChunkPages - (2*bits + 10)}},
+ min: 1,
+ max: 12, // Make it so that max would have us try to break the huge page.
+ want: BitRange{bits, bits + 10},
+ }
+ }
+ tests["PreserveHugePageTop"] = test{
+ alloc: []BitRange{{0, PallocChunkPages - bits}},
+ min: 1,
+ max: 1, // Even one page would break a huge page in this case.
+ want: BitRange{PallocChunkPages - bits, bits},
+ }
+ } else if bits == PallocChunkPages {
+ tests["PreserveHugePageAll"] = test{
+ min: 1,
+ max: 1, // Even one page would break a huge page in this case.
+ want: BitRange{0, PallocChunkPages},
+ }
+ } else {
+ // The huge page size is greater than pallocChunkPages, so it should
+ // be effectively disabled. There's no way we can possibly scavenge
+ // a huge page out of this bitmap chunk.
+ tests["PreserveHugePageNone"] = test{
+ min: 1,
+ max: 1,
+ want: BitRange{PallocChunkPages - 1, 1},
}
- }
- tests["PreserveHugePageTop"] = test{
- alloc: []BitRange{{0, PallocChunkPages - bits}},
- min: 1,
- max: 1, // Even one page would break a huge page in this case.
- want: BitRange{PallocChunkPages - bits, bits},
}
}
for name, v := range tests {
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index 6b8c56ce35..9b77ce635c 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -503,7 +503,9 @@ func (s *mspan) sweep(preserve bool) bool {
// wasn't totally filled, but then swept, still has all of its
// free slots zeroed.
s.needzero = 1
- c.local_nsmallfree[spc.sizeclass()] += uintptr(nfreed)
+ stats := memstats.heapStats.acquire(c)
+ atomic.Xadduintptr(&stats.smallFreeCount[spc.sizeclass()], uintptr(nfreed))
+ memstats.heapStats.release(c)
}
if !preserve {
// The caller may not have removed this span from whatever
@@ -548,8 +550,10 @@ func (s *mspan) sweep(preserve bool) bool {
} else {
mheap_.freeSpan(s)
}
- c.local_nlargefree++
- c.local_largefree += size
+ stats := memstats.heapStats.acquire(c)
+ atomic.Xadduintptr(&stats.largeFreeCount, 1)
+ atomic.Xadduintptr(&stats.largeFree, size)
+ memstats.heapStats.release(c)
return true
}
diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go
index 46101657d5..b3a068661e 100644
--- a/src/runtime/mgcwork.go
+++ b/src/runtime/mgcwork.go
@@ -22,13 +22,6 @@ const (
workbufAlloc = 32 << 10
)
-// throwOnGCWork causes any operations that add pointers to a gcWork
-// buffer to throw.
-//
-// TODO(austin): This is a temporary debugging measure for issue
-// #27993. To be removed before release.
-var throwOnGCWork bool
-
func init() {
if workbufAlloc%pageSize != 0 || workbufAlloc%_WorkbufSize != 0 {
throw("bad workbufAlloc")
@@ -93,17 +86,6 @@ type gcWork struct {
// termination check. Specifically, this indicates that this
// gcWork may have communicated work to another gcWork.
flushedWork bool
-
- // pauseGen causes put operations to spin while pauseGen ==
- // gcWorkPauseGen if debugCachedWork is true.
- pauseGen uint32
-
- // putGen is the pauseGen of the last putGen.
- putGen uint32
-
- // pauseStack is the stack at which this P was paused if
- // debugCachedWork is true.
- pauseStack [16]uintptr
}
// Most of the methods of gcWork are go:nowritebarrierrec because the
@@ -122,60 +104,10 @@ func (w *gcWork) init() {
w.wbuf2 = wbuf2
}
-func (w *gcWork) checkPut(ptr uintptr, ptrs []uintptr) {
- if debugCachedWork {
- alreadyFailed := w.putGen == w.pauseGen
- w.putGen = w.pauseGen
- if !canPreemptM(getg().m) {
- // If we were to spin, the runtime may
- // deadlock. Since we can't be preempted, the
- // spin could prevent gcMarkDone from
- // finishing the ragged barrier, which is what
- // releases us from the spin.
- return
- }
- for atomic.Load(&gcWorkPauseGen) == w.pauseGen {
- }
- if throwOnGCWork {
- printlock()
- if alreadyFailed {
- println("runtime: checkPut already failed at this generation")
- }
- println("runtime: late gcWork put")
- if ptr != 0 {
- gcDumpObject("ptr", ptr, ^uintptr(0))
- }
- for _, ptr := range ptrs {
- gcDumpObject("ptrs", ptr, ^uintptr(0))
- }
- println("runtime: paused at")
- for _, pc := range w.pauseStack {
- if pc == 0 {
- break
- }
- f := findfunc(pc)
- if f.valid() {
- // Obviously this doesn't
- // relate to ancestor
- // tracebacks, but this
- // function prints what we
- // want.
- printAncestorTracebackFuncInfo(f, pc)
- } else {
- println("\tunknown PC ", hex(pc), "\n")
- }
- }
- throw("throwOnGCWork")
- }
- }
-}
-
// put enqueues a pointer for the garbage collector to trace.
// obj must point to the beginning of a heap object or an oblet.
//go:nowritebarrierrec
func (w *gcWork) put(obj uintptr) {
- w.checkPut(obj, nil)
-
flushed := false
wbuf := w.wbuf1
// Record that this may acquire the wbufSpans or heap lock to
@@ -214,8 +146,6 @@ func (w *gcWork) put(obj uintptr) {
// otherwise it returns false and the caller needs to call put.
//go:nowritebarrierrec
func (w *gcWork) putFast(obj uintptr) bool {
- w.checkPut(obj, nil)
-
wbuf := w.wbuf1
if wbuf == nil {
return false
@@ -237,8 +167,6 @@ func (w *gcWork) putBatch(obj []uintptr) {
return
}
- w.checkPut(0, obj)
-
flushed := false
wbuf := w.wbuf1
if wbuf == nil {
@@ -360,12 +288,10 @@ func (w *gcWork) balance() {
return
}
if wbuf := w.wbuf2; wbuf.nobj != 0 {
- w.checkPut(0, wbuf.obj[:wbuf.nobj])
putfull(wbuf)
w.flushedWork = true
w.wbuf2 = getempty()
} else if wbuf := w.wbuf1; wbuf.nobj > 4 {
- w.checkPut(0, wbuf.obj[:wbuf.nobj])
w.wbuf1 = handoff(wbuf)
w.flushedWork = true // handoff did putfull
} else {
@@ -445,7 +371,7 @@ func getempty() *workbuf {
}
if s == nil {
systemstack(func() {
- s = mheap_.allocManual(workbufAlloc/pageSize, &memstats.gc_sys)
+ s = mheap_.allocManual(workbufAlloc/pageSize, spanAllocWorkBuf)
})
if s == nil {
throw("out of memory")
@@ -547,7 +473,7 @@ func freeSomeWbufs(preemptible bool) bool {
break
}
work.wbufSpans.free.remove(span)
- mheap_.freeManual(span, &memstats.gc_sys)
+ mheap_.freeManual(span, spanAllocWorkBuf)
}
})
more := !work.wbufSpans.free.isEmpty()
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 1a57bcd66e..14a73c0491 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -128,13 +128,6 @@ type mheap struct {
// This is accessed atomically.
reclaimCredit uintptr
- // Malloc stats.
- largealloc uint64 // bytes allocated for large objects
- nlargealloc uint64 // number of large object allocations
- largefree uint64 // bytes freed for large objects (>maxsmallsize)
- nlargefree uint64 // number of frees for large objects (>maxsmallsize)
- nsmallfree [_NumSizeClasses]uint64 // number of frees for small objects (<=maxsmallsize)
-
// arenas is the heap arena map. It points to the metadata for
// the heap for every arena frame of the entire usable virtual
// address space.
@@ -720,7 +713,7 @@ func (h *mheap) init() {
h.central[i].mcentral.init(spanClass(i))
}
- h.pages.init(&h.lock, &memstats.gc_sys)
+ h.pages.init(&h.lock, &memstats.gcMiscSys)
}
// reclaim sweeps and reclaims at least npage pages into the heap.
@@ -868,6 +861,22 @@ func (h *mheap) reclaimChunk(arenas []arenaIdx, pageIdx, n uintptr) uintptr {
return nFreed
}
+// spanAllocType represents the type of allocation to make, or
+// the type of allocation to be freed.
+type spanAllocType uint8
+
+const (
+ spanAllocHeap spanAllocType = iota // heap span
+ spanAllocStack // stack span
+ spanAllocPtrScalarBits // unrolled GC prog bitmap span
+ spanAllocWorkBuf // work buf span
+)
+
+// manual returns true if the span allocation is manually managed.
+func (s spanAllocType) manual() bool {
+ return s != spanAllocHeap
+}
+
// alloc allocates a new span of npage pages from the GC'd heap.
//
// spanclass indicates the span's size class and scannability.
@@ -884,7 +893,7 @@ func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) *mspan
if h.sweepdone == 0 {
h.reclaim(npages)
}
- s = h.allocSpan(npages, false, spanclass, &memstats.heap_inuse)
+ s = h.allocSpan(npages, spanAllocHeap, spanclass)
})
if s != nil {
@@ -909,9 +918,15 @@ func (h *mheap) alloc(npages uintptr, spanclass spanClass, needzero bool) *mspan
// allocManual must be called on the system stack because it may
// acquire the heap lock via allocSpan. See mheap for details.
//
+// If new code is written to call allocManual, do NOT use an
+// existing spanAllocType value and instead declare a new one.
+//
//go:systemstack
-func (h *mheap) allocManual(npages uintptr, stat *uint64) *mspan {
- return h.allocSpan(npages, true, 0, stat)
+func (h *mheap) allocManual(npages uintptr, typ spanAllocType) *mspan {
+ if !typ.manual() {
+ throw("manual span allocation called with non-manually-managed type")
+ }
+ return h.allocSpan(npages, typ, 0)
}
// setSpans modifies the span map so [spanOf(base), spanOf(base+npage*pageSize))
@@ -1073,7 +1088,7 @@ func (h *mheap) freeMSpanLocked(s *mspan) {
// allocSpan allocates an mspan which owns npages worth of memory.
//
-// If manual == false, allocSpan allocates a heap span of class spanclass
+// If typ.manual() == false, allocSpan allocates a heap span of class spanclass
// and updates heap accounting. If manual == true, allocSpan allocates a
// manually-managed span (spanclass is ignored), and the caller is
// responsible for any accounting related to its use of the span. Either
@@ -1088,7 +1103,7 @@ func (h *mheap) freeMSpanLocked(s *mspan) {
// the heap lock and because it must block GC transitions.
//
//go:systemstack
-func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysStat *uint64) (s *mspan) {
+func (h *mheap) allocSpan(npages uintptr, typ spanAllocType, spanclass spanClass) (s *mspan) {
// Function-global state.
gp := getg()
base, scav := uintptr(0), uintptr(0)
@@ -1109,23 +1124,11 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS
base, scav = c.alloc(npages)
if base != 0 {
s = h.tryAllocMSpan()
-
- if s != nil && gcBlackenEnabled == 0 && (manual || spanclass.sizeclass() != 0) {
+ if s != nil {
goto HaveSpan
}
- // We're either running duing GC, failed to acquire a mspan,
- // or the allocation is for a large object. This means we
- // have to lock the heap and do a bunch of extra work,
- // so go down the HaveBaseLocked path.
- //
- // We must do this during GC to avoid skew with heap_scan
- // since we flush mcache stats whenever we lock.
- //
- // TODO(mknyszek): It would be nice to not have to
- // lock the heap if it's a large allocation, but
- // it's fine for now. The critical section here is
- // short and large object allocations are relatively
- // infrequent.
+ // We have a base but no mspan, so we need
+ // to lock the heap.
}
}
@@ -1152,39 +1155,6 @@ func (h *mheap) allocSpan(npages uintptr, manual bool, spanclass spanClass, sysS
// one now that we have the heap lock.
s = h.allocMSpanLocked()
}
- if !manual {
- // This is a heap span, so we should do some additional accounting
- // which may only be done with the heap locked.
-
- // Transfer stats from mcache to global.
- var c *mcache
- if gp.m.p != 0 {
- c = gp.m.p.ptr().mcache
- } else {
- // This case occurs while bootstrapping.
- // See the similar code in mallocgc.
- c = mcache0
- if c == nil {
- throw("mheap.allocSpan called with no P")
- }
- }
- memstats.heap_scan += uint64(c.local_scan)
- c.local_scan = 0
- memstats.tinyallocs += uint64(c.local_tinyallocs)
- c.local_tinyallocs = 0
-
- // Do some additional accounting if it's a large allocation.
- if spanclass.sizeclass() == 0 {
- mheap_.largealloc += uint64(npages * pageSize)
- mheap_.nlargealloc++
- atomic.Xadd64(&memstats.heap_live, int64(npages*pageSize))
- }
-
- // Either heap_live or heap_scan could have been updated.
- if gcBlackenEnabled != 0 {
- gcController.revise()
- }
- }
unlock(&h.lock)
HaveSpan:
@@ -1195,12 +1165,10 @@ HaveSpan:
s.needzero = 1
}
nbytes := npages * pageSize
- if manual {
+ if typ.manual() {
s.manualFreeList = 0
s.nelems = 0
s.limit = s.base() + s.npages*pageSize
- // Manually managed memory doesn't count toward heap_sys.
- mSysStatDec(&memstats.heap_sys, s.npages*pageSize)
s.state.set(mSpanManual)
} else {
// We must set span properties before the span is published anywhere
@@ -1254,11 +1222,32 @@ HaveSpan:
// sysUsed all the pages that are actually available
// in the span since some of them might be scavenged.
sysUsed(unsafe.Pointer(base), nbytes)
- mSysStatDec(&memstats.heap_released, scav)
+ atomic.Xadd64(&memstats.heap_released, -int64(scav))
}
// Update stats.
- mSysStatInc(sysStat, nbytes)
- mSysStatDec(&memstats.heap_idle, nbytes)
+ if typ == spanAllocHeap {
+ atomic.Xadd64(&memstats.heap_inuse, int64(nbytes))
+ }
+ if typ.manual() {
+ // Manually managed memory doesn't count toward heap_sys.
+ memstats.heap_sys.add(-int64(nbytes))
+ }
+ // Update consistent stats.
+ c := getMCache()
+ stats := memstats.heapStats.acquire(c)
+ atomic.Xaddint64(&stats.committed, int64(scav))
+ atomic.Xaddint64(&stats.released, -int64(scav))
+ switch typ {
+ case spanAllocHeap:
+ atomic.Xaddint64(&stats.inHeap, int64(nbytes))
+ case spanAllocStack:
+ atomic.Xaddint64(&stats.inStacks, int64(nbytes))
+ case spanAllocPtrScalarBits:
+ atomic.Xaddint64(&stats.inPtrScalarBits, int64(nbytes))
+ case spanAllocWorkBuf:
+ atomic.Xaddint64(&stats.inWorkBufs, int64(nbytes))
+ }
+ memstats.heapStats.release(c)
// Publish the span in various locations.
@@ -1269,7 +1258,7 @@ HaveSpan:
// before that happens) or pageInUse is updated.
h.setSpans(s.base(), npages, s)
- if !manual {
+ if !typ.manual() {
// Mark in-use span in arena page bitmap.
//
// This publishes the span to the page sweeper, so
@@ -1280,11 +1269,6 @@ HaveSpan:
// Update related page sweeper stats.
atomic.Xadd64(&h.pagesInUse, int64(npages))
-
- if trace.enabled {
- // Trace that a heap alloc occurred.
- traceHeapAlloc()
- }
}
// Make sure the newly allocated span will be observed
@@ -1340,8 +1324,11 @@ func (h *mheap) grow(npage uintptr) bool {
// The allocation is always aligned to the heap arena
// size which is always > physPageSize, so it's safe to
// just add directly to heap_released.
- mSysStatInc(&memstats.heap_released, asize)
- mSysStatInc(&memstats.heap_idle, asize)
+ atomic.Xadd64(&memstats.heap_released, int64(asize))
+ c := getMCache()
+ stats := memstats.heapStats.acquire(c)
+ atomic.Xaddint64(&stats.released, int64(asize))
+ memstats.heapStats.release(c)
// Recalculate nBase.
// We know this won't overflow, because sysAlloc returned
@@ -1373,29 +1360,20 @@ func (h *mheap) grow(npage uintptr) bool {
// Free the span back into the heap.
func (h *mheap) freeSpan(s *mspan) {
systemstack(func() {
- c := getg().m.p.ptr().mcache
lock(&h.lock)
- memstats.heap_scan += uint64(c.local_scan)
- c.local_scan = 0
- memstats.tinyallocs += uint64(c.local_tinyallocs)
- c.local_tinyallocs = 0
if msanenabled {
// Tell msan that this entire span is no longer in use.
base := unsafe.Pointer(s.base())
bytes := s.npages << _PageShift
msanfree(base, bytes)
}
- if gcBlackenEnabled != 0 {
- // heap_scan changed.
- gcController.revise()
- }
- h.freeSpanLocked(s, true, true)
+ h.freeSpanLocked(s, spanAllocHeap)
unlock(&h.lock)
})
}
// freeManual frees a manually-managed span returned by allocManual.
-// stat must be the same as the stat passed to the allocManual that
+// typ must be the same as the spanAllocType passed to the allocManual that
// allocated s.
//
// This must only be called when gcphase == _GCoff. See mSpanState for
@@ -1405,16 +1383,14 @@ func (h *mheap) freeSpan(s *mspan) {
// the heap lock. See mheap for details.
//
//go:systemstack
-func (h *mheap) freeManual(s *mspan, stat *uint64) {
+func (h *mheap) freeManual(s *mspan, typ spanAllocType) {
s.needzero = 1
lock(&h.lock)
- mSysStatDec(stat, s.npages*pageSize)
- mSysStatInc(&memstats.heap_sys, s.npages*pageSize)
- h.freeSpanLocked(s, false, true)
+ h.freeSpanLocked(s, typ)
unlock(&h.lock)
}
-func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool) {
+func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) {
switch s.state.get() {
case mSpanManual:
if s.allocCount != 0 {
@@ -1434,12 +1410,31 @@ func (h *mheap) freeSpanLocked(s *mspan, acctinuse, acctidle bool) {
throw("mheap.freeSpanLocked - invalid span state")
}
- if acctinuse {
- mSysStatDec(&memstats.heap_inuse, s.npages*pageSize)
- }
- if acctidle {
- mSysStatInc(&memstats.heap_idle, s.npages*pageSize)
- }
+ // Update stats.
+ //
+ // Mirrors the code in allocSpan.
+ nbytes := s.npages * pageSize
+ if typ == spanAllocHeap {
+ atomic.Xadd64(&memstats.heap_inuse, -int64(nbytes))
+ }
+ if typ.manual() {
+ // Manually managed memory doesn't count toward heap_sys, so add it back.
+ memstats.heap_sys.add(int64(nbytes))
+ }
+ // Update consistent stats.
+ c := getMCache()
+ stats := memstats.heapStats.acquire(c)
+ switch typ {
+ case spanAllocHeap:
+ atomic.Xaddint64(&stats.inHeap, -int64(nbytes))
+ case spanAllocStack:
+ atomic.Xaddint64(&stats.inStacks, -int64(nbytes))
+ case spanAllocPtrScalarBits:
+ atomic.Xaddint64(&stats.inPtrScalarBits, -int64(nbytes))
+ case spanAllocWorkBuf:
+ atomic.Xaddint64(&stats.inWorkBufs, -int64(nbytes))
+ }
+ memstats.heapStats.release(c)
// Mark the space as free.
h.pages.free(s.base(), s.npages)
@@ -1982,7 +1977,7 @@ func newArenaMayUnlock() *gcBitsArena {
var result *gcBitsArena
if gcBitsArenas.free == nil {
unlock(&gcBitsArenas.lock)
- result = (*gcBitsArena)(sysAlloc(gcBitsChunkBytes, &memstats.gc_sys))
+ result = (*gcBitsArena)(sysAlloc(gcBitsChunkBytes, &memstats.gcMiscSys))
if result == nil {
throw("runtime: cannot allocate memory")
}
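
Both allocSpan and freeSpanLocked now bracket their per-type byte deltas with memstats.heapStats.acquire(c) and memstats.heapStats.release(c) instead of the old mSysStatInc/Dec calls. The shard machinery itself is not in this diff; the toy below only illustrates the bracketing discipline (concurrent writers update their own shard, a reader aggregates a consistent total) and is not the runtime's implementation, which is lighter-weight for writers:

    import (
        "sync"
        "sync/atomic"
    )

    // shardedStat is a deliberately simplified stand-in for consistent
    // statistics: writers bracket atomic per-shard deltas with
    // acquire/release, and a reader sums a consistent snapshot.
    type shardedStat struct {
        mu     sync.RWMutex
        shards [64]int64
    }

    func (s *shardedStat) add(shard int, delta int64) {
        s.mu.RLock() // "acquire": many writers may proceed concurrently
        atomic.AddInt64(&s.shards[shard&(len(s.shards)-1)], delta) // mask keeps the index in range
        s.mu.RUnlock() // "release"
    }

    func (s *shardedStat) read() int64 {
        s.mu.Lock() // exclude writers just long enough to sum the shards
        defer s.mu.Unlock()
        var total int64
        for i := range s.shards {
            total += s.shards[i]
        }
        return total
    }
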
diff --git a/src/runtime/mkduff.go b/src/runtime/mkduff.go
index 8859ed68cc..6ddf0256e9 100644
--- a/src/runtime/mkduff.go
+++ b/src/runtime/mkduff.go
@@ -38,6 +38,7 @@ func main() {
gen("arm64", notags, zeroARM64, copyARM64)
gen("ppc64x", tagsPPC64x, zeroPPC64x, copyPPC64x)
gen("mips64x", tagsMIPS64x, zeroMIPS64x, copyMIPS64x)
+ gen("riscv64", notags, zeroRISCV64, copyRISCV64)
}
func gen(arch string, tags, zero, copy func(io.Writer)) {
@@ -227,3 +228,30 @@ func copyMIPS64x(w io.Writer) {
}
fmt.Fprintln(w, "\tRET")
}
+
+func zeroRISCV64(w io.Writer) {
+ // ZERO: always zero
+ // X10: ptr to memory to be zeroed
+ // X10 is updated as a side effect.
+ fmt.Fprintln(w, "TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0")
+ for i := 0; i < 128; i++ {
+ fmt.Fprintln(w, "\tMOV\tZERO, (X10)")
+ fmt.Fprintln(w, "\tADD\t$8, X10")
+ }
+ fmt.Fprintln(w, "\tRET")
+}
+
+func copyRISCV64(w io.Writer) {
+ // X10: ptr to source memory
+ // X11: ptr to destination memory
+ // X10 and X11 are updated as a side effect
+ fmt.Fprintln(w, "TEXT runtime·duffcopy(SB), NOSPLIT|NOFRAME, $0-0")
+ for i := 0; i < 128; i++ {
+ fmt.Fprintln(w, "\tMOV\t(X10), X31")
+ fmt.Fprintln(w, "\tADD\t$8, X10")
+ fmt.Fprintln(w, "\tMOV\tX31, (X11)")
+ fmt.Fprintln(w, "\tADD\t$8, X11")
+ fmt.Fprintln(w)
+ }
+ fmt.Fprintln(w, "\tRET")
+}
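
zeroRISCV64 and copyRISCV64 emit straight-line, unrolled bodies: 128 store/advance (or load/store/advance) groups that the compiler can jump into partway through so only the needed tail executes. A standalone sketch that reproduces the duffzero emission and prints the shape of the output (illustration only, not wired into mkduff's gen plumbing):

    package main

    import (
        "fmt"
        "strings"
    )

    func main() {
        var b strings.Builder
        b.WriteString("TEXT runtime·duffzero(SB), NOSPLIT|NOFRAME, $0-0\n")
        for i := 0; i < 128; i++ {
            b.WriteString("\tMOV\tZERO, (X10)\n") // store one zero word
            b.WriteString("\tADD\t$8, X10\n")     // advance the destination pointer
        }
        b.WriteString("\tRET\n")

        lines := strings.Split(strings.TrimRight(b.String(), "\n"), "\n")
        for _, l := range lines[:5] { // the TEXT header plus the first two pairs
            fmt.Println(l)
        }
        fmt.Println("\t... 126 more MOV/ADD pairs ...")
        fmt.Println(lines[len(lines)-1]) // RET
    }
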
diff --git a/src/runtime/mkpreempt.go b/src/runtime/mkpreempt.go
index c2e14cdcd6..76237bc31b 100644
--- a/src/runtime/mkpreempt.go
+++ b/src/runtime/mkpreempt.go
@@ -189,7 +189,6 @@ func (l *layout) restore() {
func gen386() {
p("PUSHFL")
-
// Save general purpose registers.
var l = layout{sp: "SP"}
for _, reg := range regNames386 {
@@ -199,12 +198,6 @@ func gen386() {
l.add("MOVL", reg, 4)
}
- // Save the 387 state.
- l.addSpecial(
- "FSAVE %d(SP)\nFLDCW runtime·controlWord64(SB)",
- "FRSTOR %d(SP)",
- 108)
-
// Save SSE state only if supported.
lSSE := layout{stack: l.stack, sp: "SP"}
for i := 0; i < 8; i++ {
@@ -355,12 +348,9 @@ func genARM64() {
p("MOVD R29, -8(RSP)") // save frame pointer (only used on Linux)
p("SUB $8, RSP, R29") // set up new frame pointer
p("#endif")
- // On darwin, save the LR again after decrementing SP. We run the
- // signal handler on the G stack (as it doesn't support SA_ONSTACK),
+ // On iOS, save the LR again after decrementing SP. We run the
+ // signal handler on the G stack (as it doesn't support sigaltstack),
// so any writes below SP may be clobbered.
- p("#ifdef GOOS_darwin")
- p("MOVD R30, (RSP)")
- p("#endif")
p("#ifdef GOOS_ios")
p("MOVD R30, (RSP)")
p("#endif")
diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go
index c90a6378bd..2af1c97e0b 100644
--- a/src/runtime/mpagealloc.go
+++ b/src/runtime/mpagealloc.go
@@ -293,13 +293,13 @@ type pageAlloc struct {
// sysStat is the runtime memstat to update when new system
// memory is committed by the pageAlloc for allocation metadata.
- sysStat *uint64
+ sysStat *sysMemStat
// Whether or not this struct is being used in tests.
test bool
}
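
The sysStat field changes from *uint64 to the new sysMemStat wrapper, whose definition is not part of this hunk. Judging from the .add(int64) calls in the mheap.go changes above, it could be a small atomic wrapper along these lines (hypothetical sketch; atomic.Xadd64, atomic.Load64, and throw are runtime-internal helpers):

    import "runtime/internal/atomic"

    // sysMemStat sketch: an atomically updated count of bytes obtained
    // from (or returned to) the OS. Only the add method is implied by
    // this diff; the rest is an assumption.
    type sysMemStat uint64

    // load atomically reads the current value.
    func (s *sysMemStat) load() uint64 {
        return atomic.Load64((*uint64)(s))
    }

    // add atomically applies a (possibly negative) delta and checks for underflow.
    func (s *sysMemStat) add(n int64) {
        val := atomic.Xadd64((*uint64)(s), n)
        if int64(val) < 0 {
            throw("sysMemStat underflow")
        }
    }
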
-func (s *pageAlloc) init(mheapLock *mutex, sysStat *uint64) {
+func (p *pageAlloc) init(mheapLock *mutex, sysStat *sysMemStat) {
if levelLogPages[0] > logMaxPackedValue {
// We can't represent 1<<levelLogPages[0] pages, the maximum number
// of pages we need to represent at the root level, in a summary, which
@@ -308,29 +308,29 @@ func (s *pageAlloc) init(mheapLock *mutex, sysStat *uint64) {
print("runtime: summary max pages = ", maxPackedValue, "\n")
throw("root level max pages doesn't fit in summary")
}
- s.sysStat = sysStat
+ p.sysStat = sysStat
- // Initialize s.inUse.
- s.inUse.init(sysStat)
+ // Initialize p.inUse.
+ p.inUse.init(sysStat)
// System-dependent initialization.
- s.sysInit()
+ p.sysInit()
// Start with the searchAddr in a state indicating there's no free memory.
- s.searchAddr = maxSearchAddr
+ p.searchAddr = maxSearchAddr
// Set the mheapLock.
- s.mheapLock = mheapLock
+ p.mheapLock = mheapLock
// Initialize scavenge tracking state.
- s.scav.scavLWM = maxSearchAddr
+ p.scav.scavLWM = maxSearchAddr
}
// tryChunkOf returns the bitmap data for the given chunk.
//
// Returns nil if the chunk data has not been mapped.
-func (s *pageAlloc) tryChunkOf(ci chunkIdx) *pallocData {
- l2 := s.chunks[ci.l1()]
+func (p *pageAlloc) tryChunkOf(ci chunkIdx) *pallocData {
+ l2 := p.chunks[ci.l1()]
if l2 == nil {
return nil
}
@@ -340,15 +340,15 @@ func (s *pageAlloc) tryChunkOf(ci chunkIdx) *pallocData {
// chunkOf returns the chunk at the given chunk index.
//
// The chunk index must be valid or this method may throw.
-func (s *pageAlloc) chunkOf(ci chunkIdx) *pallocData {
- return &s.chunks[ci.l1()][ci.l2()]
+func (p *pageAlloc) chunkOf(ci chunkIdx) *pallocData {
+ return &p.chunks[ci.l1()][ci.l2()]
}
// grow sets up the metadata for the address range [base, base+size).
-// It may allocate metadata, in which case *s.sysStat will be updated.
+// It may allocate metadata, in which case *p.sysStat will be updated.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) grow(base, size uintptr) {
+// p.mheapLock must be held.
+func (p *pageAlloc) grow(base, size uintptr) {
// Round up to chunks, since we can't deal with increments smaller
// than chunks. Also, sysGrow expects aligned values.
limit := alignUp(base+size, pallocChunkBytes)
@@ -356,29 +356,29 @@ func (s *pageAlloc) grow(base, size uintptr) {
// Grow the summary levels in a system-dependent manner.
// We just update a bunch of additional metadata here.
- s.sysGrow(base, limit)
+ p.sysGrow(base, limit)
- // Update s.start and s.end.
+ // Update p.start and p.end.
// If no growth happened yet, start == 0. This is generally
// safe since the zero page is unmapped.
- firstGrowth := s.start == 0
+ firstGrowth := p.start == 0
start, end := chunkIndex(base), chunkIndex(limit)
- if firstGrowth || start < s.start {
- s.start = start
+ if firstGrowth || start < p.start {
+ p.start = start
}
- if end > s.end {
- s.end = end
+ if end > p.end {
+ p.end = end
}
// Note that [base, limit) will never overlap with any existing
// range inUse because grow only ever adds never-used memory
// regions to the page allocator.
- s.inUse.add(makeAddrRange(base, limit))
+ p.inUse.add(makeAddrRange(base, limit))
// A grow operation is a lot like a free operation, so if our
- // chunk ends up below s.searchAddr, update s.searchAddr to the
+ // chunk ends up below p.searchAddr, update p.searchAddr to the
// new address, just like in free.
- if b := (offAddr{base}); b.lessThan(s.searchAddr) {
- s.searchAddr = b
+ if b := (offAddr{base}); b.lessThan(p.searchAddr) {
+ p.searchAddr = b
}
// Add entries into chunks, which is sparse, if needed. Then,
@@ -387,21 +387,21 @@ func (s *pageAlloc) grow(base, size uintptr) {
// Newly-grown memory is always considered scavenged.
// Set all the bits in the scavenged bitmaps high.
for c := chunkIndex(base); c < chunkIndex(limit); c++ {
- if s.chunks[c.l1()] == nil {
+ if p.chunks[c.l1()] == nil {
// Create the necessary l2 entry.
//
// Store it atomically to avoid races with readers which
// don't acquire the heap lock.
- r := sysAlloc(unsafe.Sizeof(*s.chunks[0]), s.sysStat)
- atomic.StorepNoWB(unsafe.Pointer(&s.chunks[c.l1()]), r)
+ r := sysAlloc(unsafe.Sizeof(*p.chunks[0]), p.sysStat)
+ atomic.StorepNoWB(unsafe.Pointer(&p.chunks[c.l1()]), r)
}
- s.chunkOf(c).scavenged.setRange(0, pallocChunkPages)
+ p.chunkOf(c).scavenged.setRange(0, pallocChunkPages)
}
// Update summaries accordingly. The grow acts like a free, so
// we need to ensure this newly-free memory is visible in the
// summaries.
- s.update(base, size/pageSize, true, false)
+ p.update(base, size/pageSize, true, false)
}
// update updates heap metadata. It must be called each time the bitmap
@@ -411,8 +411,8 @@ func (s *pageAlloc) grow(base, size uintptr) {
// a contiguous allocation or free between addr and addr+npages. alloc indicates
// whether the operation performed was an allocation or a free.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
+// p.mheapLock must be held.
+func (p *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
// base, limit, start, and end are inclusive.
limit := base + npages*pageSize - 1
sc, ec := chunkIndex(base), chunkIndex(limit)
@@ -421,23 +421,23 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
if sc == ec {
// Fast path: the allocation doesn't span more than one chunk,
// so update this one and if the summary didn't change, return.
- x := s.summary[len(s.summary)-1][sc]
- y := s.chunkOf(sc).summarize()
+ x := p.summary[len(p.summary)-1][sc]
+ y := p.chunkOf(sc).summarize()
if x == y {
return
}
- s.summary[len(s.summary)-1][sc] = y
+ p.summary[len(p.summary)-1][sc] = y
} else if contig {
// Slow contiguous path: the allocation spans more than one chunk
// and at least one summary is guaranteed to change.
- summary := s.summary[len(s.summary)-1]
+ summary := p.summary[len(p.summary)-1]
// Update the summary for chunk sc.
- summary[sc] = s.chunkOf(sc).summarize()
+ summary[sc] = p.chunkOf(sc).summarize()
// Update the summaries for chunks in between, which are
// either totally allocated or freed.
- whole := s.summary[len(s.summary)-1][sc+1 : ec]
+ whole := p.summary[len(p.summary)-1][sc+1 : ec]
if alloc {
// Should optimize into a memclr.
for i := range whole {
@@ -450,22 +450,22 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
}
// Update the summary for chunk ec.
- summary[ec] = s.chunkOf(ec).summarize()
+ summary[ec] = p.chunkOf(ec).summarize()
} else {
// Slow general path: the allocation spans more than one chunk
// and at least one summary is guaranteed to change.
//
// We can't assume a contiguous allocation happened, so walk over
// every chunk in the range and manually recompute the summary.
- summary := s.summary[len(s.summary)-1]
+ summary := p.summary[len(p.summary)-1]
for c := sc; c <= ec; c++ {
- summary[c] = s.chunkOf(c).summarize()
+ summary[c] = p.chunkOf(c).summarize()
}
}
// Walk up the radix tree and update the summaries appropriately.
changed := true
- for l := len(s.summary) - 2; l >= 0 && changed; l-- {
+ for l := len(p.summary) - 2; l >= 0 && changed; l-- {
// Update summaries at level l from summaries at level l+1.
changed = false
@@ -479,12 +479,12 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
// Iterate over each block, updating the corresponding summary in the less-granular level.
for i := lo; i < hi; i++ {
- children := s.summary[l+1][i<<logEntriesPerBlock : (i+1)<<logEntriesPerBlock]
+ children := p.summary[l+1][i<<logEntriesPerBlock : (i+1)<<logEntriesPerBlock]
sum := mergeSummaries(children, logMaxPages)
- old := s.summary[l][i]
+ old := p.summary[l][i]
if old != sum {
changed = true
- s.summary[l][i] = sum
+ p.summary[l][i] = sum
}
}
}
@@ -497,8 +497,8 @@ func (s *pageAlloc) update(base, npages uintptr, contig, alloc bool) {
// Returns the amount of scavenged memory in bytes present in the
// allocated range.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) allocRange(base, npages uintptr) uintptr {
+// p.mheapLock must be held.
+func (p *pageAlloc) allocRange(base, npages uintptr) uintptr {
limit := base + npages*pageSize - 1
sc, ec := chunkIndex(base), chunkIndex(limit)
si, ei := chunkPageIndex(base), chunkPageIndex(limit)
@@ -506,24 +506,24 @@ func (s *pageAlloc) allocRange(base, npages uintptr) uintptr {
scav := uint(0)
if sc == ec {
// The range doesn't cross any chunk boundaries.
- chunk := s.chunkOf(sc)
+ chunk := p.chunkOf(sc)
scav += chunk.scavenged.popcntRange(si, ei+1-si)
chunk.allocRange(si, ei+1-si)
} else {
// The range crosses at least one chunk boundary.
- chunk := s.chunkOf(sc)
+ chunk := p.chunkOf(sc)
scav += chunk.scavenged.popcntRange(si, pallocChunkPages-si)
chunk.allocRange(si, pallocChunkPages-si)
for c := sc + 1; c < ec; c++ {
- chunk := s.chunkOf(c)
+ chunk := p.chunkOf(c)
scav += chunk.scavenged.popcntRange(0, pallocChunkPages)
chunk.allocAll()
}
- chunk = s.chunkOf(ec)
+ chunk = p.chunkOf(ec)
scav += chunk.scavenged.popcntRange(0, ei+1)
chunk.allocRange(0, ei+1)
}
- s.update(base, npages, true, true)
+ p.update(base, npages, true, true)
return uintptr(scav) * pageSize
}
@@ -532,13 +532,13 @@ func (s *pageAlloc) allocRange(base, npages uintptr) uintptr {
// returned. If addr is higher than any mapped region, then
// it returns maxOffAddr.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) findMappedAddr(addr offAddr) offAddr {
+// p.mheapLock must be held.
+func (p *pageAlloc) findMappedAddr(addr offAddr) offAddr {
// If we're not in a test, validate first by checking mheap_.arenas.
// This is a fast path which is only safe to use outside of testing.
ai := arenaIndex(addr.addr())
- if s.test || mheap_.arenas[ai.l1()] == nil || mheap_.arenas[ai.l1()][ai.l2()] == nil {
- vAddr, ok := s.inUse.findAddrGreaterEqual(addr.addr())
+ if p.test || mheap_.arenas[ai.l1()] == nil || mheap_.arenas[ai.l1()][ai.l2()] == nil {
+ vAddr, ok := p.inUse.findAddrGreaterEqual(addr.addr())
if ok {
return offAddr{vAddr}
} else {
@@ -554,20 +554,20 @@ func (s *pageAlloc) findMappedAddr(addr offAddr) offAddr {
// find searches for the first (address-ordered) contiguous free region of
// npages in size and returns a base address for that region.
//
-// It uses s.searchAddr to prune its search and assumes that no palloc chunks
-// below chunkIndex(s.searchAddr) contain any free memory at all.
+// It uses p.searchAddr to prune its search and assumes that no palloc chunks
+// below chunkIndex(p.searchAddr) contain any free memory at all.
//
-// find also computes and returns a candidate s.searchAddr, which may or
-// may not prune more of the address space than s.searchAddr already does.
-// This candidate is always a valid s.searchAddr.
+// find also computes and returns a candidate p.searchAddr, which may or
+// may not prune more of the address space than p.searchAddr already does.
+// This candidate is always a valid p.searchAddr.
//
// find represents the slow path and the full radix tree search.
//
// Returns a base address of 0 on failure, in which case the candidate
// searchAddr returned is invalid and must be ignored.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) find(npages uintptr) (uintptr, offAddr) {
+// p.mheapLock must be held.
+func (p *pageAlloc) find(npages uintptr) (uintptr, offAddr) {
// Search algorithm.
//
// This algorithm walks each level l of the radix tree from the root level
@@ -647,7 +647,7 @@ func (s *pageAlloc) find(npages uintptr) (uintptr, offAddr) {
lastSumIdx := -1
nextLevel:
- for l := 0; l < len(s.summary); l++ {
+ for l := 0; l < len(p.summary); l++ {
// For the root level, entriesPerBlock is the whole level.
entriesPerBlock := 1 << levelBits[l]
logMaxPages := levelLogPages[l]
@@ -657,14 +657,14 @@ nextLevel:
i <<= levelBits[l]
// Slice out the block of entries we care about.
- entries := s.summary[l][i : i+entriesPerBlock]
+ entries := p.summary[l][i : i+entriesPerBlock]
// Determine j0, the first index we should start iterating from.
// The searchAddr may help us eliminate iterations if we followed the
// searchAddr on the previous level or we're on the root level, in which
// case the searchAddr should be the same as i after levelShift.
j0 := 0
- if searchIdx := offAddrToLevelIndex(l, s.searchAddr); searchIdx&^(entriesPerBlock-1) == i {
+ if searchIdx := offAddrToLevelIndex(l, p.searchAddr); searchIdx&^(entriesPerBlock-1) == i {
j0 = searchIdx & (entriesPerBlock - 1)
}
@@ -729,7 +729,7 @@ nextLevel:
// We found a sufficiently large run of free pages straddling
// some boundary, so compute the address and return it.
addr := levelIndexToOffAddr(l, i).add(uintptr(base) * pageSize).addr()
- return addr, s.findMappedAddr(firstFree.base)
+ return addr, p.findMappedAddr(firstFree.base)
}
if l == 0 {
// We're at level zero, so that means we've exhausted our search.
@@ -741,7 +741,7 @@ nextLevel:
// lied to us. In either case, dump some useful state and throw.
print("runtime: summary[", l-1, "][", lastSumIdx, "] = ", lastSum.start(), ", ", lastSum.max(), ", ", lastSum.end(), "\n")
print("runtime: level = ", l, ", npages = ", npages, ", j0 = ", j0, "\n")
- print("runtime: s.searchAddr = ", hex(s.searchAddr.addr()), ", i = ", i, "\n")
+ print("runtime: p.searchAddr = ", hex(p.searchAddr.addr()), ", i = ", i, "\n")
print("runtime: levelShift[level] = ", levelShift[l], ", levelBits[level] = ", levelBits[l], "\n")
for j := 0; j < len(entries); j++ {
sum := entries[j]
@@ -758,12 +758,12 @@ nextLevel:
// After iterating over all levels, i must contain a chunk index which
// is what the final level represents.
ci := chunkIdx(i)
- j, searchIdx := s.chunkOf(ci).find(npages, 0)
+ j, searchIdx := p.chunkOf(ci).find(npages, 0)
if j == ^uint(0) {
// We couldn't find any space in this chunk despite the summaries telling
// us it should be there. There's likely a bug, so dump some state and throw.
- sum := s.summary[len(s.summary)-1][i]
- print("runtime: summary[", len(s.summary)-1, "][", i, "] = (", sum.start(), ", ", sum.max(), ", ", sum.end(), ")\n")
+ sum := p.summary[len(p.summary)-1][i]
+ print("runtime: summary[", len(p.summary)-1, "][", i, "] = (", sum.start(), ", ", sum.max(), ", ", sum.end(), ")\n")
print("runtime: npages = ", npages, "\n")
throw("bad summary data")
}
@@ -775,7 +775,7 @@ nextLevel:
// found an even narrower free window.
searchAddr := chunkBase(ci) + uintptr(searchIdx)*pageSize
foundFree(offAddr{searchAddr}, chunkBase(ci+1)-searchAddr)
- return addr, s.findMappedAddr(firstFree.base)
+ return addr, p.findMappedAddr(firstFree.base)
}
// alloc allocates npages worth of memory from the page heap, returning the base
@@ -785,25 +785,25 @@ nextLevel:
// Returns a 0 base address on failure, in which case other returned values
// should be ignored.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) {
+// p.mheapLock must be held.
+func (p *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) {
// If the searchAddr refers to a region which has a higher address than
// any known chunk, then we know we're out of memory.
- if chunkIndex(s.searchAddr.addr()) >= s.end {
+ if chunkIndex(p.searchAddr.addr()) >= p.end {
return 0, 0
}
// If npages has a chance of fitting in the chunk where the searchAddr is,
// search it directly.
searchAddr := minOffAddr
- if pallocChunkPages-chunkPageIndex(s.searchAddr.addr()) >= uint(npages) {
+ if pallocChunkPages-chunkPageIndex(p.searchAddr.addr()) >= uint(npages) {
// npages is guaranteed to be no greater than pallocChunkPages here.
- i := chunkIndex(s.searchAddr.addr())
- if max := s.summary[len(s.summary)-1][i].max(); max >= uint(npages) {
- j, searchIdx := s.chunkOf(i).find(npages, chunkPageIndex(s.searchAddr.addr()))
+ i := chunkIndex(p.searchAddr.addr())
+ if max := p.summary[len(p.summary)-1][i].max(); max >= uint(npages) {
+ j, searchIdx := p.chunkOf(i).find(npages, chunkPageIndex(p.searchAddr.addr()))
if j == ^uint(0) {
print("runtime: max = ", max, ", npages = ", npages, "\n")
- print("runtime: searchIdx = ", chunkPageIndex(s.searchAddr.addr()), ", s.searchAddr = ", hex(s.searchAddr.addr()), "\n")
+ print("runtime: searchIdx = ", chunkPageIndex(p.searchAddr.addr()), ", p.searchAddr = ", hex(p.searchAddr.addr()), "\n")
throw("bad summary data")
}
addr = chunkBase(i) + uintptr(j)*pageSize
@@ -813,7 +813,7 @@ func (s *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) {
}
// We failed to use a searchAddr for one reason or another, so try
// the slow path.
- addr, searchAddr = s.find(npages)
+ addr, searchAddr = p.find(npages)
if addr == 0 {
if npages == 1 {
// We failed to find a single free page, the smallest unit
@@ -821,41 +821,41 @@ func (s *pageAlloc) alloc(npages uintptr) (addr uintptr, scav uintptr) {
// exhausted. Otherwise, the heap still might have free
// space in it, just not enough contiguous space to
// accommodate npages.
- s.searchAddr = maxSearchAddr
+ p.searchAddr = maxSearchAddr
}
return 0, 0
}
Found:
// Go ahead and actually mark the bits now that we have an address.
- scav = s.allocRange(addr, npages)
+ scav = p.allocRange(addr, npages)
// If we found a higher searchAddr, we know that all the
// heap memory before that searchAddr in an offset address space is
- // allocated, so bump s.searchAddr up to the new one.
- if s.searchAddr.lessThan(searchAddr) {
- s.searchAddr = searchAddr
+ // allocated, so bump p.searchAddr up to the new one.
+ if p.searchAddr.lessThan(searchAddr) {
+ p.searchAddr = searchAddr
}
return addr, scav
}
// free returns npages worth of memory starting at base back to the page heap.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) free(base, npages uintptr) {
- // If we're freeing pages below the s.searchAddr, update searchAddr.
- if b := (offAddr{base}); b.lessThan(s.searchAddr) {
- s.searchAddr = b
+// p.mheapLock must be held.
+func (p *pageAlloc) free(base, npages uintptr) {
+ // If we're freeing pages below the p.searchAddr, update searchAddr.
+ if b := (offAddr{base}); b.lessThan(p.searchAddr) {
+ p.searchAddr = b
}
// Update the free high watermark for the scavenger.
limit := base + npages*pageSize - 1
- if offLimit := (offAddr{limit}); s.scav.freeHWM.lessThan(offLimit) {
- s.scav.freeHWM = offLimit
+ if offLimit := (offAddr{limit}); p.scav.freeHWM.lessThan(offLimit) {
+ p.scav.freeHWM = offLimit
}
if npages == 1 {
// Fast path: we're clearing a single bit, and we know exactly
// where it is, so mark it directly.
i := chunkIndex(base)
- s.chunkOf(i).free1(chunkPageIndex(base))
+ p.chunkOf(i).free1(chunkPageIndex(base))
} else {
// Slow path: we're clearing more bits so we may need to iterate.
sc, ec := chunkIndex(base), chunkIndex(limit)
@@ -863,17 +863,17 @@ func (s *pageAlloc) free(base, npages uintptr) {
if sc == ec {
// The range doesn't cross any chunk boundaries.
- s.chunkOf(sc).free(si, ei+1-si)
+ p.chunkOf(sc).free(si, ei+1-si)
} else {
// The range crosses at least one chunk boundary.
- s.chunkOf(sc).free(si, pallocChunkPages-si)
+ p.chunkOf(sc).free(si, pallocChunkPages-si)
for c := sc + 1; c < ec; c++ {
- s.chunkOf(c).freeAll()
+ p.chunkOf(c).freeAll()
}
- s.chunkOf(ec).free(0, ei+1)
+ p.chunkOf(ec).free(0, ei+1)
}
}
- s.update(base, npages, true, false)
+ p.update(base, npages, true, false)
}
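
The slow path of free splits the inclusive range [base, limit] into a partial head chunk, any number of fully covered middle chunks, and a partial tail chunk. The same splitting pattern in isolation, with stand-in constants and a callback in place of the chunk bitmaps (the runtime's chunkIndex/chunkPageIndex helpers, not reproduced here, do the equivalent index math):

    const (
        pageSize      = 8192 // stand-in values for illustration
        pagesPerChunk = 512
        chunkBytes    = pageSize * pagesPerChunk
    )

    // freeRange frees npages starting at base, invoking freePages once
    // per touched chunk with the in-chunk page index and page count.
    func freeRange(base, npages uintptr, freePages func(chunk, start, n uintptr)) {
        limit := base + npages*pageSize - 1 // inclusive, as in the code above
        sc, ec := base/chunkBytes, limit/chunkBytes
        si, ei := base/pageSize%pagesPerChunk, limit/pageSize%pagesPerChunk
        if sc == ec {
            freePages(sc, si, ei+1-si) // entirely within one chunk
            return
        }
        freePages(sc, si, pagesPerChunk-si) // head: from si to the chunk's end
        for c := sc + 1; c < ec; c++ {
            freePages(c, 0, pagesPerChunk) // whole middle chunks
        }
        freePages(ec, 0, ei+1) // tail: from the chunk's start through ei
    }

A caller would pass a closure that clears the corresponding bits, much as chunkOf(c).free and freeAll do above.
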
const (
diff --git a/src/runtime/mpagealloc_32bit.go b/src/runtime/mpagealloc_32bit.go
index 6658a900ac..331dadade9 100644
--- a/src/runtime/mpagealloc_32bit.go
+++ b/src/runtime/mpagealloc_32bit.go
@@ -2,14 +2,14 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build 386 arm mips mipsle wasm darwin,arm64
+// +build 386 arm mips mipsle wasm ios,arm64
// wasm is treated as a 32-bit architecture for the purposes of the page
// allocator, even though it has 64-bit pointers. This is because any wasm
// pointer always has its top 32 bits as zero, so the effective heap address
// space is only 2^32 bytes in size (see heapAddrBits).
-// darwin/arm64 is treated as a 32-bit architecture for the purposes of the
+// ios/arm64 is treated as a 32-bit architecture for the purposes of the
// page allocator, even though it has 64-bit pointers and a 33-bit address
// space (see heapAddrBits). The 33 bit address space cannot be rounded up
// to 64 bits because there are too many summary levels to fit in just 33
@@ -60,7 +60,7 @@ var levelLogPages = [summaryLevels]uint{
}
// See mpagealloc_64bit.go for details.
-func (s *pageAlloc) sysInit() {
+func (p *pageAlloc) sysInit() {
// Calculate how much memory all our entries will take up.
//
// This should be around 12 KiB or less.
@@ -76,7 +76,7 @@ func (s *pageAlloc) sysInit() {
throw("failed to reserve page summary memory")
}
// There isn't much. Just map it and mark it as used immediately.
- sysMap(reservation, totalSize, s.sysStat)
+ sysMap(reservation, totalSize, p.sysStat)
sysUsed(reservation, totalSize)
// Iterate over the reservation and cut it up into slices.
@@ -88,29 +88,29 @@ func (s *pageAlloc) sysInit() {
// Put this reservation into a slice.
sl := notInHeapSlice{(*notInHeap)(reservation), 0, entries}
- s.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl))
+ p.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl))
reservation = add(reservation, uintptr(entries)*pallocSumBytes)
}
}
// See mpagealloc_64bit.go for details.
-func (s *pageAlloc) sysGrow(base, limit uintptr) {
+func (p *pageAlloc) sysGrow(base, limit uintptr) {
if base%pallocChunkBytes != 0 || limit%pallocChunkBytes != 0 {
print("runtime: base = ", hex(base), ", limit = ", hex(limit), "\n")
throw("sysGrow bounds not aligned to pallocChunkBytes")
}
// Walk up the tree and update the summary slices.
- for l := len(s.summary) - 1; l >= 0; l-- {
+ for l := len(p.summary) - 1; l >= 0; l-- {
// Figure out what part of the summary array this new address space needs.
// Note that we need to align the ranges to the block width (1<<levelBits[l])
// at this level because the full block is needed to compute the summary for
// the next level.
lo, hi := addrsToSummaryRange(l, base, limit)
_, hi = blockAlignSummaryRange(l, lo, hi)
- if hi > len(s.summary[l]) {
- s.summary[l] = s.summary[l][:hi]
+ if hi > len(p.summary[l]) {
+ p.summary[l] = p.summary[l][:hi]
}
}
}
diff --git a/src/runtime/mpagealloc_64bit.go b/src/runtime/mpagealloc_64bit.go
index 831626e4b2..ffacb46c18 100644
--- a/src/runtime/mpagealloc_64bit.go
+++ b/src/runtime/mpagealloc_64bit.go
@@ -2,9 +2,9 @@
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64 !darwin,arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x
+// +build amd64 !ios,arm64 mips64 mips64le ppc64 ppc64le riscv64 s390x
-// See mpagealloc_32bit.go for why darwin/arm64 is excluded here.
+// See mpagealloc_32bit.go for why ios/arm64 is excluded here.
package runtime
@@ -67,7 +67,7 @@ var levelLogPages = [summaryLevels]uint{
// sysInit performs architecture-dependent initialization of fields
// in pageAlloc. pageAlloc should be uninitialized except for sysStat
// if any runtime statistic should be updated.
-func (s *pageAlloc) sysInit() {
+func (p *pageAlloc) sysInit() {
// Reserve memory for each level. This will get mapped in
// as R/W by setArenas.
for l, shift := range levelShift {
@@ -82,21 +82,21 @@ func (s *pageAlloc) sysInit() {
// Put this reservation into a slice.
sl := notInHeapSlice{(*notInHeap)(r), 0, entries}
- s.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl))
+ p.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl))
}
}
// sysGrow performs architecture-dependent operations on heap
// growth for the page allocator, such as mapping in new memory
// for summaries. It also updates the length of the slices in
-// s.summary.
+// p.summary.
//
// base is the base of the newly-added heap memory and limit is
// the first address past the end of the newly-added heap memory.
// Both must be aligned to pallocChunkBytes.
//
-// The caller must update s.start and s.end after calling sysGrow.
-func (s *pageAlloc) sysGrow(base, limit uintptr) {
+// The caller must update p.start and p.end after calling sysGrow.
+func (p *pageAlloc) sysGrow(base, limit uintptr) {
if base%pallocChunkBytes != 0 || limit%pallocChunkBytes != 0 {
print("runtime: base = ", hex(base), ", limit = ", hex(limit), "\n")
throw("sysGrow bounds not aligned to pallocChunkBytes")
@@ -111,12 +111,12 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) {
}
// summaryRangeToSumAddrRange converts a range of indices in any
- // level of s.summary into page-aligned addresses which cover that
+ // level of p.summary into page-aligned addresses which cover that
// range of indices.
summaryRangeToSumAddrRange := func(level, sumIdxBase, sumIdxLimit int) addrRange {
baseOffset := alignDown(uintptr(sumIdxBase)*pallocSumBytes, physPageSize)
limitOffset := alignUp(uintptr(sumIdxLimit)*pallocSumBytes, physPageSize)
- base := unsafe.Pointer(&s.summary[level][0])
+ base := unsafe.Pointer(&p.summary[level][0])
return addrRange{
offAddr{uintptr(add(base, baseOffset))},
offAddr{uintptr(add(base, limitOffset))},
@@ -140,10 +140,10 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) {
//
// This will be used to look at what memory in the summary array is already
// mapped before and after this new range.
- inUseIndex := s.inUse.findSucc(base)
+ inUseIndex := p.inUse.findSucc(base)
// Walk up the radix tree and map summaries in as needed.
- for l := range s.summary {
+ for l := range p.summary {
// Figure out what part of the summary array this new address space needs.
needIdxBase, needIdxLimit := addrRangeToSummaryRange(l, makeAddrRange(base, limit))
@@ -151,8 +151,8 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) {
// we get tight bounds checks on at least the top bound.
//
// We must do this regardless of whether we map new memory.
- if needIdxLimit > len(s.summary[l]) {
- s.summary[l] = s.summary[l][:needIdxLimit]
+ if needIdxLimit > len(p.summary[l]) {
+ p.summary[l] = p.summary[l][:needIdxLimit]
}
// Compute the needed address range in the summary array for level l.
@@ -163,10 +163,10 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) {
// for mapping. prune's invariants are guaranteed by the fact that this
// function will never be asked to remap the same memory twice.
if inUseIndex > 0 {
- need = need.subtract(addrRangeToSumAddrRange(l, s.inUse.ranges[inUseIndex-1]))
+ need = need.subtract(addrRangeToSumAddrRange(l, p.inUse.ranges[inUseIndex-1]))
}
- if inUseIndex < len(s.inUse.ranges) {
- need = need.subtract(addrRangeToSumAddrRange(l, s.inUse.ranges[inUseIndex]))
+ if inUseIndex < len(p.inUse.ranges) {
+ need = need.subtract(addrRangeToSumAddrRange(l, p.inUse.ranges[inUseIndex]))
}
// It's possible that after our pruning above, there's nothing new to map.
if need.size() == 0 {
@@ -174,7 +174,7 @@ func (s *pageAlloc) sysGrow(base, limit uintptr) {
}
// Map and commit need.
- sysMap(unsafe.Pointer(need.base.addr()), need.size(), s.sysStat)
+ sysMap(unsafe.Pointer(need.base.addr()), need.size(), p.sysStat)
sysUsed(unsafe.Pointer(need.base.addr()), need.size())
}
}
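
summaryRangeToSumAddrRange in sysGrow rounds a summary index range out to physical page boundaries so that whole pages can be handed to sysMap and sysUsed. The rounding written out in isolation (stand-in constants; the real physPageSize is discovered at startup, and pallocSumBytes is assumed to be the size of one packed summary):

    // alignDown/alignUp assume a is a power of two.
    func alignDown(x, a uintptr) uintptr { return x &^ (a - 1) }
    func alignUp(x, a uintptr) uintptr   { return (x + a - 1) &^ (a - 1) }

    // sumAddrRange converts summary entries [sumIdxBase, sumIdxLimit) of a
    // level whose backing array starts at levelBase into a page-aligned
    // byte range that could be mapped in one sysMap/sysUsed call.
    func sumAddrRange(levelBase, sumIdxBase, sumIdxLimit uintptr) (lo, hi uintptr) {
        const (
            physPageSize   = 4096 // stand-in for the value queried at startup
            pallocSumBytes = 8    // assumed size of one packed summary
        )
        lo = levelBase + alignDown(sumIdxBase*pallocSumBytes, physPageSize)
        hi = levelBase + alignUp(sumIdxLimit*pallocSumBytes, physPageSize)
        return
    }
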
diff --git a/src/runtime/mpagealloc_test.go b/src/runtime/mpagealloc_test.go
index 65ba71d459..5d979fa95b 100644
--- a/src/runtime/mpagealloc_test.go
+++ b/src/runtime/mpagealloc_test.go
@@ -54,7 +54,7 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx,
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)),
},
},
"Contiguous2": {
@@ -63,7 +63,7 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx + 1,
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)),
},
},
"Contiguous5": {
@@ -75,7 +75,7 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx + 4,
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+5, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+5, 0)),
},
},
"Discontiguous": {
@@ -85,9 +85,9 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx + 4,
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)},
- {PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)},
- {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)),
},
},
"Mixed": {
@@ -98,8 +98,8 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx + 4,
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+3, 0)},
- {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+3, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)),
},
},
"WildlyDiscontiguous": {
@@ -110,9 +110,9 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx + 0x21,
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)},
- {PageBase(BaseChunkIdx+0x10, 0), PageBase(BaseChunkIdx+0x11, 0)},
- {PageBase(BaseChunkIdx+0x21, 0), PageBase(BaseChunkIdx+0x22, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+2, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+0x10, 0), PageBase(BaseChunkIdx+0x11, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+0x21, 0), PageBase(BaseChunkIdx+0x22, 0)),
},
},
"ManyDiscontiguous": {
@@ -129,39 +129,39 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx + 64,
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)},
- {PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)},
- {PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)},
- {PageBase(BaseChunkIdx+6, 0), PageBase(BaseChunkIdx+7, 0)},
- {PageBase(BaseChunkIdx+8, 0), PageBase(BaseChunkIdx+9, 0)},
- {PageBase(BaseChunkIdx+10, 0), PageBase(BaseChunkIdx+11, 0)},
- {PageBase(BaseChunkIdx+12, 0), PageBase(BaseChunkIdx+13, 0)},
- {PageBase(BaseChunkIdx+14, 0), PageBase(BaseChunkIdx+15, 0)},
- {PageBase(BaseChunkIdx+16, 0), PageBase(BaseChunkIdx+17, 0)},
- {PageBase(BaseChunkIdx+18, 0), PageBase(BaseChunkIdx+19, 0)},
- {PageBase(BaseChunkIdx+20, 0), PageBase(BaseChunkIdx+21, 0)},
- {PageBase(BaseChunkIdx+22, 0), PageBase(BaseChunkIdx+23, 0)},
- {PageBase(BaseChunkIdx+24, 0), PageBase(BaseChunkIdx+25, 0)},
- {PageBase(BaseChunkIdx+26, 0), PageBase(BaseChunkIdx+27, 0)},
- {PageBase(BaseChunkIdx+28, 0), PageBase(BaseChunkIdx+29, 0)},
- {PageBase(BaseChunkIdx+30, 0), PageBase(BaseChunkIdx+31, 0)},
- {PageBase(BaseChunkIdx+32, 0), PageBase(BaseChunkIdx+33, 0)},
- {PageBase(BaseChunkIdx+34, 0), PageBase(BaseChunkIdx+35, 0)},
- {PageBase(BaseChunkIdx+36, 0), PageBase(BaseChunkIdx+37, 0)},
- {PageBase(BaseChunkIdx+38, 0), PageBase(BaseChunkIdx+39, 0)},
- {PageBase(BaseChunkIdx+40, 0), PageBase(BaseChunkIdx+41, 0)},
- {PageBase(BaseChunkIdx+42, 0), PageBase(BaseChunkIdx+43, 0)},
- {PageBase(BaseChunkIdx+44, 0), PageBase(BaseChunkIdx+45, 0)},
- {PageBase(BaseChunkIdx+46, 0), PageBase(BaseChunkIdx+47, 0)},
- {PageBase(BaseChunkIdx+48, 0), PageBase(BaseChunkIdx+49, 0)},
- {PageBase(BaseChunkIdx+50, 0), PageBase(BaseChunkIdx+51, 0)},
- {PageBase(BaseChunkIdx+52, 0), PageBase(BaseChunkIdx+53, 0)},
- {PageBase(BaseChunkIdx+54, 0), PageBase(BaseChunkIdx+55, 0)},
- {PageBase(BaseChunkIdx+56, 0), PageBase(BaseChunkIdx+57, 0)},
- {PageBase(BaseChunkIdx+58, 0), PageBase(BaseChunkIdx+59, 0)},
- {PageBase(BaseChunkIdx+60, 0), PageBase(BaseChunkIdx+61, 0)},
- {PageBase(BaseChunkIdx+62, 0), PageBase(BaseChunkIdx+63, 0)},
- {PageBase(BaseChunkIdx+64, 0), PageBase(BaseChunkIdx+65, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+2, 0), PageBase(BaseChunkIdx+3, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+5, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+6, 0), PageBase(BaseChunkIdx+7, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+8, 0), PageBase(BaseChunkIdx+9, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+10, 0), PageBase(BaseChunkIdx+11, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+12, 0), PageBase(BaseChunkIdx+13, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+14, 0), PageBase(BaseChunkIdx+15, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+16, 0), PageBase(BaseChunkIdx+17, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+18, 0), PageBase(BaseChunkIdx+19, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+20, 0), PageBase(BaseChunkIdx+21, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+22, 0), PageBase(BaseChunkIdx+23, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+24, 0), PageBase(BaseChunkIdx+25, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+26, 0), PageBase(BaseChunkIdx+27, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+28, 0), PageBase(BaseChunkIdx+29, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+30, 0), PageBase(BaseChunkIdx+31, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+32, 0), PageBase(BaseChunkIdx+33, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+34, 0), PageBase(BaseChunkIdx+35, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+36, 0), PageBase(BaseChunkIdx+37, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+38, 0), PageBase(BaseChunkIdx+39, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+40, 0), PageBase(BaseChunkIdx+41, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+42, 0), PageBase(BaseChunkIdx+43, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+44, 0), PageBase(BaseChunkIdx+45, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+46, 0), PageBase(BaseChunkIdx+47, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+48, 0), PageBase(BaseChunkIdx+49, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+50, 0), PageBase(BaseChunkIdx+51, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+52, 0), PageBase(BaseChunkIdx+53, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+54, 0), PageBase(BaseChunkIdx+55, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+56, 0), PageBase(BaseChunkIdx+57, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+58, 0), PageBase(BaseChunkIdx+59, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+60, 0), PageBase(BaseChunkIdx+61, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+62, 0), PageBase(BaseChunkIdx+63, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+64, 0), PageBase(BaseChunkIdx+65, 0)),
},
},
}
@@ -172,8 +172,8 @@ func TestPageAllocGrow(t *testing.T) {
BaseChunkIdx + 0x100000, // constant translates to O(TiB)
},
inUse: []AddrRange{
- {PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)},
- {PageBase(BaseChunkIdx+0x100000, 0), PageBase(BaseChunkIdx+0x100001, 0)},
+ MakeAddrRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)),
+ MakeAddrRange(PageBase(BaseChunkIdx+0x100000, 0), PageBase(BaseChunkIdx+0x100001, 0)),
},
}
}
@@ -197,7 +197,7 @@ func TestPageAllocGrow(t *testing.T) {
t.Fail()
} else {
for i := range want {
- if want[i] != got[i] {
+ if !want[i].Equals(got[i]) {
t.Fail()
break
}
@@ -207,11 +207,11 @@ func TestPageAllocGrow(t *testing.T) {
t.Logf("found inUse mismatch")
t.Logf("got:")
for i, r := range got {
- t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base, r.Limit)
+ t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base(), r.Limit())
}
t.Logf("want:")
for i, r := range want {
- t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base, r.Limit)
+ t.Logf("\t#%d [0x%x, 0x%x)", i, r.Base(), r.Limit())
}
}
})
diff --git a/src/runtime/mpagecache.go b/src/runtime/mpagecache.go
index 683a997136..5f76501a1c 100644
--- a/src/runtime/mpagecache.go
+++ b/src/runtime/mpagecache.go
@@ -71,8 +71,8 @@ func (c *pageCache) allocN(npages uintptr) (uintptr, uintptr) {
// into s. Then, it clears the cache, such that empty returns
// true.
//
-// s.mheapLock must be held or the world must be stopped.
-func (c *pageCache) flush(s *pageAlloc) {
+// p.mheapLock must be held or the world must be stopped.
+func (c *pageCache) flush(p *pageAlloc) {
if c.empty() {
return
}
@@ -83,18 +83,18 @@ func (c *pageCache) flush(s *pageAlloc) {
// slower, safer thing by iterating over each bit individually.
for i := uint(0); i < 64; i++ {
if c.cache&(1<<i) != 0 {
- s.chunkOf(ci).free1(pi + i)
+ p.chunkOf(ci).free1(pi + i)
}
if c.scav&(1<<i) != 0 {
- s.chunkOf(ci).scavenged.setRange(pi+i, 1)
+ p.chunkOf(ci).scavenged.setRange(pi+i, 1)
}
}
// Since this is a lot like a free, we need to make sure
// we update the searchAddr just like free does.
- if b := (offAddr{c.base}); b.lessThan(s.searchAddr) {
- s.searchAddr = b
+ if b := (offAddr{c.base}); b.lessThan(p.searchAddr) {
+ p.searchAddr = b
}
- s.update(c.base, pageCachePages, false, false)
+ p.update(c.base, pageCachePages, false, false)
*c = pageCache{}
}
@@ -102,19 +102,19 @@ func (c *pageCache) flush(s *pageAlloc) {
// may not be contiguous, and returns a pageCache structure which owns the
// chunk.
//
-// s.mheapLock must be held.
-func (s *pageAlloc) allocToCache() pageCache {
+// p.mheapLock must be held.
+func (p *pageAlloc) allocToCache() pageCache {
// If the searchAddr refers to a region which has a higher address than
// any known chunk, then we know we're out of memory.
- if chunkIndex(s.searchAddr.addr()) >= s.end {
+ if chunkIndex(p.searchAddr.addr()) >= p.end {
return pageCache{}
}
c := pageCache{}
- ci := chunkIndex(s.searchAddr.addr()) // chunk index
- if s.summary[len(s.summary)-1][ci] != 0 {
+ ci := chunkIndex(p.searchAddr.addr()) // chunk index
+ if p.summary[len(p.summary)-1][ci] != 0 {
// Fast path: there's free pages at or near the searchAddr address.
- chunk := s.chunkOf(ci)
- j, _ := chunk.find(1, chunkPageIndex(s.searchAddr.addr()))
+ chunk := p.chunkOf(ci)
+ j, _ := chunk.find(1, chunkPageIndex(p.searchAddr.addr()))
if j == ^uint(0) {
throw("bad summary data")
}
@@ -126,15 +126,15 @@ func (s *pageAlloc) allocToCache() pageCache {
} else {
// Slow path: the searchAddr address had nothing there, so go find
// the first free page the slow way.
- addr, _ := s.find(1)
+ addr, _ := p.find(1)
if addr == 0 {
// We failed to find adequate free space, so mark the searchAddr as OoM
// and return an empty pageCache.
- s.searchAddr = maxSearchAddr
+ p.searchAddr = maxSearchAddr
return pageCache{}
}
ci := chunkIndex(addr)
- chunk := s.chunkOf(ci)
+ chunk := p.chunkOf(ci)
c = pageCache{
base: alignDown(addr, 64*pageSize),
cache: ^chunk.pages64(chunkPageIndex(addr)),
@@ -143,19 +143,19 @@ func (s *pageAlloc) allocToCache() pageCache {
}
// Set the bits as allocated and clear the scavenged bits.
- s.allocRange(c.base, pageCachePages)
+ p.allocRange(c.base, pageCachePages)
// Update as an allocation, but note that it's not contiguous.
- s.update(c.base, pageCachePages, false, true)
+ p.update(c.base, pageCachePages, false, true)
// Set the search address to the last page represented by the cache.
// Since all of the pages in this block are going to the cache, and we
// searched for the first free page, we can confidently start at the
// next page.
//
- // However, s.searchAddr is not allowed to point into unmapped heap memory
+ // However, p.searchAddr is not allowed to point into unmapped heap memory
// unless it is maxSearchAddr, so make it the last page as opposed to
// the page after.
- s.searchAddr = offAddr{c.base + pageSize*(pageCachePages-1)}
+ p.searchAddr = offAddr{c.base + pageSize*(pageCachePages-1)}
return c
}
diff --git a/src/runtime/mranges.go b/src/runtime/mranges.go
index 2c0eb2c2dd..84a2c06dbb 100644
--- a/src/runtime/mranges.go
+++ b/src/runtime/mranges.go
@@ -160,10 +160,10 @@ type addrRanges struct {
totalBytes uintptr
// sysStat is the stat to track allocations by this type
- sysStat *uint64
+ sysStat *sysMemStat
}
-func (a *addrRanges) init(sysStat *uint64) {
+func (a *addrRanges) init(sysStat *sysMemStat) {
ranges := (*notInHeapSlice)(unsafe.Pointer(&a.ranges))
ranges.len = 0
ranges.cap = 16
@@ -172,20 +172,46 @@ func (a *addrRanges) init(sysStat *uint64) {
a.totalBytes = 0
}
-// findSucc returns the first index in a such that base is
+// findSucc returns the first index in a such that addr is
// less than the base of the addrRange at that index.
func (a *addrRanges) findSucc(addr uintptr) int {
- // TODO(mknyszek): Consider a binary search for large arrays.
- // While iterating over these ranges is potentially expensive,
- // the expected number of ranges is small, ideally just 1,
- // since Go heaps are usually mostly contiguous.
base := offAddr{addr}
- for i := range a.ranges {
+
+ // Narrow down the search space via a binary search
+ // for large addrRanges until we have at most iterMax
+ // candidates left.
+ const iterMax = 8
+ bot, top := 0, len(a.ranges)
+ for top-bot > iterMax {
+ i := ((top - bot) / 2) + bot
+ if a.ranges[i].contains(base.addr()) {
+ // a.ranges[i] contains base, so
+ // its successor is the next index.
+ return i + 1
+ }
+ if base.lessThan(a.ranges[i].base) {
+ // In this case i might actually be
+ // the successor, but we can't be sure
+ // until we check the ones before it.
+ top = i
+ } else {
+ // In this case we know base is
+ // greater than or equal to a.ranges[i].limit-1,
+ // so i is definitely not the successor.
+ // We already checked i, so pick the next
+ // one.
+ bot = i + 1
+ }
+ }
+ // There are top-bot candidates left, so
+ // iterate over them and find the first that
+ // base is strictly less than.
+ for i := bot; i < top; i++ {
if base.lessThan(a.ranges[i].base) {
return i
}
}
- return len(a.ranges)
+ return top
}
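
findSucc now narrows the window with a binary search until at most iterMax candidates remain, then falls back to the original linear scan, which is cheap for the short, mostly contiguous range lists a Go heap produces. The same hybrid strategy over a plain sorted slice (stand-alone illustration using ordinary uintptr comparisons rather than the runtime's offset address space):

    type span struct{ base, limit uintptr } // half-open [base, limit)

    // findSucc returns the index of the first range whose base is
    // strictly greater than addr.
    func findSucc(ranges []span, addr uintptr) int {
        const iterMax = 8
        bot, top := 0, len(ranges)
        for top-bot > iterMax {
            i := bot + (top-bot)/2
            switch {
            case addr >= ranges[i].base && addr < ranges[i].limit:
                return i + 1 // ranges[i] contains addr; its successor is the next index
            case addr < ranges[i].base:
                top = i // i might still be the successor
            default:
                bot = i + 1 // addr is at or past ranges[i].limit, so i is not the successor
            }
        }
        for i := bot; i < top; i++ {
            if addr < ranges[i].base {
                return i
            }
        }
        return top
    }

The new TestAddrRangesFindSucc cases below, including the Large* ones that are big enough to force the binary-search phase, exercise exactly these paths.
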
// findAddrGreaterEqual returns the smallest address represented by a
@@ -218,7 +244,7 @@ func (a *addrRanges) contains(addr uintptr) bool {
// add inserts a new address range to a.
//
-// r must not overlap with any address range in a.
+// r must not overlap with any address range in a and r.size() must be > 0.
func (a *addrRanges) add(r addrRange) {
// The copies in this function are potentially expensive, but this data
// structure is meant to represent the Go heap. At worst, copying this
@@ -229,6 +255,12 @@ func (a *addrRanges) add(r addrRange) {
// of 16) and Go heaps are usually mostly contiguous, so the chance that
// an addrRanges even grows to that size is extremely low.
+ // An empty range has no effect on the set of addresses represented
+ // by a, but passing a zero-sized range is almost always a bug.
+ if r.size() == 0 {
+ print("runtime: range = {", hex(r.base.addr()), ", ", hex(r.limit.addr()), "}\n")
+ throw("attempted to add zero-sized address range")
+ }
// Because we assume r is not currently represented in a,
// findSucc gives us our insertion index.
i := a.findSucc(r.base.addr())
diff --git a/src/runtime/mranges_test.go b/src/runtime/mranges_test.go
new file mode 100644
index 0000000000..ed439c56c2
--- /dev/null
+++ b/src/runtime/mranges_test.go
@@ -0,0 +1,275 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime_test
+
+import (
+ . "runtime"
+ "testing"
+)
+
+func validateAddrRanges(t *testing.T, a *AddrRanges, want ...AddrRange) {
+ ranges := a.Ranges()
+ if len(ranges) != len(want) {
+ t.Errorf("want %v, got %v", want, ranges)
+ t.Fatal("different lengths")
+ }
+ gotTotalBytes := uintptr(0)
+ wantTotalBytes := uintptr(0)
+ for i := range ranges {
+ gotTotalBytes += ranges[i].Size()
+ wantTotalBytes += want[i].Size()
+ if ranges[i].Base() >= ranges[i].Limit() {
+ t.Error("empty range found")
+ }
+ // Ensure this is equivalent to what we want.
+ if !ranges[i].Equals(want[i]) {
+ t.Errorf("range %d: got [0x%x, 0x%x), want [0x%x, 0x%x)", i,
+ ranges[i].Base(), ranges[i].Limit(),
+ want[i].Base(), want[i].Limit(),
+ )
+ }
+ if i != 0 {
+ // Ensure the ranges are sorted.
+ if ranges[i-1].Base() >= ranges[i].Base() {
+ t.Errorf("ranges %d and %d are out of sorted order", i-1, i)
+ }
+ // Check for a failure to coalesce.
+ if ranges[i-1].Limit() == ranges[i].Base() {
+ t.Errorf("ranges %d and %d should have coalesced", i-1, i)
+ }
+ // Check if any ranges overlap. Because the ranges are sorted
+ // by base, it's sufficient to just check neighbors.
+ if ranges[i-1].Limit() > ranges[i].Base() {
+ t.Errorf("ranges %d and %d overlap", i-1, i)
+ }
+ }
+ }
+ if wantTotalBytes != gotTotalBytes {
+ t.Errorf("expected %d total bytes, got %d", wantTotalBytes, gotTotalBytes)
+ }
+ if b := a.TotalBytes(); b != gotTotalBytes {
+ t.Errorf("inconsistent total bytes: want %d, got %d", gotTotalBytes, b)
+ }
+ if t.Failed() {
+ t.Errorf("addrRanges: %v", ranges)
+ t.Fatal("detected bad addrRanges")
+ }
+}
+
+func TestAddrRangesAdd(t *testing.T) {
+ a := NewAddrRanges()
+
+ // First range.
+ a.Add(MakeAddrRange(512, 1024))
+ validateAddrRanges(t, &a,
+ MakeAddrRange(512, 1024),
+ )
+
+ // Coalesce up.
+ a.Add(MakeAddrRange(1024, 2048))
+ validateAddrRanges(t, &a,
+ MakeAddrRange(512, 2048),
+ )
+
+ // Add new independent range.
+ a.Add(MakeAddrRange(4096, 8192))
+ validateAddrRanges(t, &a,
+ MakeAddrRange(512, 2048),
+ MakeAddrRange(4096, 8192),
+ )
+
+ // Coalesce down.
+ a.Add(MakeAddrRange(3776, 4096))
+ validateAddrRanges(t, &a,
+ MakeAddrRange(512, 2048),
+ MakeAddrRange(3776, 8192),
+ )
+
+ // Coalesce up and down.
+ a.Add(MakeAddrRange(2048, 3776))
+ validateAddrRanges(t, &a,
+ MakeAddrRange(512, 8192),
+ )
+
+ // Push a bunch of independent ranges to the end to try and force growth.
+ expectedRanges := []AddrRange{MakeAddrRange(512, 8192)}
+ for i := uintptr(0); i < 64; i++ {
+ dRange := MakeAddrRange(8192+(i+1)*2048, 8192+(i+1)*2048+10)
+ a.Add(dRange)
+ expectedRanges = append(expectedRanges, dRange)
+ validateAddrRanges(t, &a, expectedRanges...)
+ }
+
+ // Push a bunch of independent ranges to the beginning to try and force growth.
+ var bottomRanges []AddrRange
+ for i := uintptr(0); i < 63; i++ {
+ dRange := MakeAddrRange(8+i*8, 8+i*8+4)
+ a.Add(dRange)
+ bottomRanges = append(bottomRanges, dRange)
+ validateAddrRanges(t, &a, append(bottomRanges, expectedRanges...)...)
+ }
+}
+
+func TestAddrRangesFindSucc(t *testing.T) {
+ var large []AddrRange
+ for i := 0; i < 100; i++ {
+ large = append(large, MakeAddrRange(5+uintptr(i)*5, 5+uintptr(i)*5+3))
+ }
+
+ type testt struct {
+ name string
+ base uintptr
+ expect int
+ ranges []AddrRange
+ }
+ tests := []testt{
+ {
+ name: "Empty",
+ base: 12,
+ expect: 0,
+ ranges: []AddrRange{},
+ },
+ {
+ name: "OneBefore",
+ base: 12,
+ expect: 0,
+ ranges: []AddrRange{
+ MakeAddrRange(14, 16),
+ },
+ },
+ {
+ name: "OneWithin",
+ base: 14,
+ expect: 1,
+ ranges: []AddrRange{
+ MakeAddrRange(14, 16),
+ },
+ },
+ {
+ name: "OneAfterLimit",
+ base: 16,
+ expect: 1,
+ ranges: []AddrRange{
+ MakeAddrRange(14, 16),
+ },
+ },
+ {
+ name: "OneAfter",
+ base: 17,
+ expect: 1,
+ ranges: []AddrRange{
+ MakeAddrRange(14, 16),
+ },
+ },
+ {
+ name: "ThreeBefore",
+ base: 3,
+ expect: 0,
+ ranges: []AddrRange{
+ MakeAddrRange(6, 10),
+ MakeAddrRange(12, 16),
+ MakeAddrRange(19, 22),
+ },
+ },
+ {
+ name: "ThreeAfter",
+ base: 24,
+ expect: 3,
+ ranges: []AddrRange{
+ MakeAddrRange(6, 10),
+ MakeAddrRange(12, 16),
+ MakeAddrRange(19, 22),
+ },
+ },
+ {
+ name: "ThreeBetween",
+ base: 11,
+ expect: 1,
+ ranges: []AddrRange{
+ MakeAddrRange(6, 10),
+ MakeAddrRange(12, 16),
+ MakeAddrRange(19, 22),
+ },
+ },
+ {
+ name: "ThreeWithin",
+ base: 9,
+ expect: 1,
+ ranges: []AddrRange{
+ MakeAddrRange(6, 10),
+ MakeAddrRange(12, 16),
+ MakeAddrRange(19, 22),
+ },
+ },
+ {
+ name: "Zero",
+ base: 0,
+ expect: 1,
+ ranges: []AddrRange{
+ MakeAddrRange(0, 10),
+ },
+ },
+ {
+ name: "Max",
+ base: ^uintptr(0),
+ expect: 1,
+ ranges: []AddrRange{
+ MakeAddrRange(^uintptr(0)-5, ^uintptr(0)),
+ },
+ },
+ {
+ name: "LargeBefore",
+ base: 2,
+ expect: 0,
+ ranges: large,
+ },
+ {
+ name: "LargeAfter",
+ base: 5 + uintptr(len(large))*5 + 30,
+ expect: len(large),
+ ranges: large,
+ },
+ {
+ name: "LargeBetweenLow",
+ base: 14,
+ expect: 2,
+ ranges: large,
+ },
+ {
+ name: "LargeBetweenHigh",
+ base: 249,
+ expect: 49,
+ ranges: large,
+ },
+ {
+ name: "LargeWithinLow",
+ base: 25,
+ expect: 5,
+ ranges: large,
+ },
+ {
+ name: "LargeWithinHigh",
+ base: 396,
+ expect: 79,
+ ranges: large,
+ },
+ {
+ name: "LargeWithinMiddle",
+ base: 250,
+ expect: 50,
+ ranges: large,
+ },
+ }
+
+ for _, test := range tests {
+ t.Run(test.name, func(t *testing.T) {
+ a := MakeAddrRanges(test.ranges...)
+ i := a.FindSucc(test.base)
+ if i != test.expect {
+ t.Fatalf("expected %d, got %d", test.expect, i)
+ }
+ })
+ }
+}
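
Note: the expectations in these cases pin down FindSucc's contract: it returns the index of the first range whose base address is strictly greater than the query address, or the number of ranges if no such range exists. A minimal sketch of that behavior (not the runtime's implementation; addrRange below is a stand-in for the real type):

type addrRange struct{ base, limit uintptr }

// findSucc returns the first index whose base lies strictly above addr.
func findSucc(ranges []addrRange, addr uintptr) int {
	lo, hi := 0, len(ranges)
	for lo < hi {
		mid := (lo + hi) / 2 // binary search keeps this O(log n)
		if ranges[mid].base > addr {
			hi = mid
		} else {
			lo = mid + 1
		}
	}
	return lo
}

For example, with ranges based at 6, 12, and 19, findSucc(11) is 1 and findSucc(24) is 3, matching the ThreeBetween and ThreeAfter cases above.
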
diff --git a/src/runtime/mspanset.go b/src/runtime/mspanset.go
index 490eed4549..10d2596c38 100644
--- a/src/runtime/mspanset.go
+++ b/src/runtime/mspanset.go
@@ -102,7 +102,7 @@ retry:
if newCap == 0 {
newCap = spanSetInitSpineCap
}
- newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gc_sys)
+ newSpine := persistentalloc(newCap*sys.PtrSize, cpu.CacheLineSize, &memstats.gcMiscSys)
if b.spineCap != 0 {
// Blocks are allocated off-heap, so
// no write barriers.
@@ -283,7 +283,7 @@ func (p *spanSetBlockAlloc) alloc() *spanSetBlock {
if s := (*spanSetBlock)(p.stack.pop()); s != nil {
return s
}
- return (*spanSetBlock)(persistentalloc(unsafe.Sizeof(spanSetBlock{}), cpu.CacheLineSize, &memstats.gc_sys))
+ return (*spanSetBlock)(persistentalloc(unsafe.Sizeof(spanSetBlock{}), cpu.CacheLineSize, &memstats.gcMiscSys))
}
// free returns a spanSetBlock back to the pool.
diff --git a/src/runtime/mstats.go b/src/runtime/mstats.go
index b95b332134..e0a417d213 100644
--- a/src/runtime/mstats.go
+++ b/src/runtime/mstats.go
@@ -8,13 +8,10 @@ package runtime
import (
"runtime/internal/atomic"
- "runtime/internal/sys"
"unsafe"
)
// Statistics.
-// If you edit this structure, also edit type MemStats below.
-// Their layouts must match exactly.
//
// For detailed descriptions see the documentation for MemStats.
// Fields that differ from MemStats are further documented here.
@@ -35,31 +32,43 @@ type mstats struct {
//
// Like MemStats, heap_sys and heap_inuse do not count memory
// in manually-managed spans.
- heap_alloc uint64 // bytes allocated and not yet freed (same as alloc above)
- heap_sys uint64 // virtual address space obtained from system for GC'd heap
- heap_idle uint64 // bytes in idle spans
- heap_inuse uint64 // bytes in mSpanInUse spans
- heap_released uint64 // bytes released to the os
+ heap_sys sysMemStat // virtual address space obtained from system for GC'd heap
+ heap_inuse uint64 // bytes in mSpanInUse spans
+ heap_released uint64 // bytes released to the os
// heap_objects is not used by the runtime directly and instead
// computed on the fly by updatememstats.
heap_objects uint64 // total number of allocated objects
+ // Statistics about stacks.
+ stacks_inuse uint64 // bytes in manually-managed stack spans; computed by updatememstats
+ stacks_sys sysMemStat // only counts newosproc0 stack in mstats; differs from MemStats.StackSys
+
// Statistics about allocation of low-level fixed-size structures.
// Protected by FixAlloc locks.
- stacks_inuse uint64 // bytes in manually-managed stack spans; updated atomically or during STW
- stacks_sys uint64 // only counts newosproc0 stack in mstats; differs from MemStats.StackSys
mspan_inuse uint64 // mspan structures
- mspan_sys uint64
+ mspan_sys sysMemStat
mcache_inuse uint64 // mcache structures
- mcache_sys uint64
- buckhash_sys uint64 // profiling bucket hash table
- gc_sys uint64 // updated atomically or during STW
- other_sys uint64 // updated atomically or during STW
+ mcache_sys sysMemStat
+ buckhash_sys sysMemStat // profiling bucket hash table
+
+ // Statistics about GC overhead.
+ gcWorkBufInUse uint64 // computed by updatememstats
+ gcProgPtrScalarBitsInUse uint64 // computed by updatememstats
+ gcMiscSys sysMemStat // updated atomically or during STW
+
+ // Miscellaneous statistics.
+ other_sys sysMemStat // updated atomically or during STW
+
+ // Statistics about the garbage collector.
+
+ // next_gc is the goal heap_live for when the next GC ends.
+ // Set to ^uint64(0) if disabled.
+ //
+ // Read and written atomically, unless the world is stopped.
+ next_gc uint64
- // Statistics about garbage collector.
// Protected by mheap or stopping the world during GC.
- next_gc uint64 // goal heap_live for when next GC ends; ^0 if disabled
last_gc_unix uint64 // last gc (in unix time)
pause_total_ns uint64
pause_ns [256]uint64 // circular buffer of recent gc pause lengths
@@ -82,8 +91,6 @@ type mstats struct {
// to 64 bits for atomic operations on 32 bit platforms.
_ [1 - _NumSizeClasses%2]uint32
- // Statistics below here are not exported to MemStats directly.
-
last_gc_nanotime uint64 // last gc (monotonic time)
tinyallocs uint64 // number of tiny allocations that didn't cause actual allocation; not exported to go directly
last_next_gc uint64 // next_gc for the previous GC
@@ -110,11 +117,10 @@ type mstats struct {
// heap_live is the number of bytes considered live by the GC.
// That is: retained by the most recent GC plus allocated
- // since then. heap_live <= heap_alloc, since heap_alloc
- // includes unmarked objects that have not yet been swept (and
- // hence goes up as we allocate and down as we sweep) while
- // heap_live excludes these objects (and hence only goes up
- // between GCs).
+ // since then. heap_live <= alloc, since alloc includes unmarked
+ // objects that have not yet been swept (and hence goes up as we
+ // allocate and down as we sweep) while heap_live excludes these
+ // objects (and hence only goes up between GCs).
//
// This is updated atomically without locking. To reduce
// contention, this is updated only when obtaining a span from
@@ -139,6 +145,8 @@ type mstats struct {
// no-scan objects and no-scan tails of objects.
//
// Whenever this is updated, call gcController.revise().
+ //
+ // Read and written atomically or with the world stopped.
heap_scan uint64
// heap_marked is the number of bytes marked by the previous
@@ -146,6 +154,17 @@ type mstats struct {
// unlike heap_live, heap_marked does not change until the
// next mark termination.
heap_marked uint64
+
+ // heapStats is a set of memory statistics that must be updated and read consistently with one another.
+ heapStats consistentHeapStats
+
+ _ uint32 // ensure gcPauseDist is aligned
+
+ // gcPauseDist represents the distribution of all GC-related
+ // application pauses in the runtime.
+ //
+ // Each individual pause is counted separately, unlike pause_ns.
+ gcPauseDist timeHistogram
}
var memstats mstats
@@ -423,24 +442,25 @@ type MemStats struct {
}
}
-// Size of the trailing by_size array differs between mstats and MemStats,
-// and all data after by_size is local to runtime, not exported.
-// NumSizeClasses was changed, but we cannot change MemStats because of backward compatibility.
-// sizeof_C_MStats is the size of the prefix of mstats that
-// corresponds to MemStats. It should match Sizeof(MemStats{}).
-var sizeof_C_MStats = unsafe.Offsetof(memstats.by_size) + 61*unsafe.Sizeof(memstats.by_size[0])
-
func init() {
- var memStats MemStats
- if sizeof_C_MStats != unsafe.Sizeof(memStats) {
- println(sizeof_C_MStats, unsafe.Sizeof(memStats))
- throw("MStats vs MemStatsType size mismatch")
- }
-
- if unsafe.Offsetof(memstats.heap_live)%8 != 0 {
- println(unsafe.Offsetof(memstats.heap_live))
+ if offset := unsafe.Offsetof(memstats.heap_live); offset%8 != 0 {
+ println(offset)
throw("memstats.heap_live not aligned to 8 bytes")
}
+ if offset := unsafe.Offsetof(memstats.heapStats); offset%8 != 0 {
+ println(offset)
+ throw("memstats.heapStats not aligned to 8 bytes")
+ }
+ if offset := unsafe.Offsetof(memstats.gcPauseDist); offset%8 != 0 {
+ println(offset)
+ throw("memstats.gcPauseDist not aligned to 8 bytes")
+ }
+ // Ensure the size of heapStatsDelta causes adjacent fields/slots (e.g.
+ // [3]heapStatsDelta) to be 8-byte aligned.
+ if size := unsafe.Sizeof(heapStatsDelta{}); size%8 != 0 {
+ println(size)
+ throw("heapStatsDelta not a multiple of 8 bytes in size")
+ }
}
// ReadMemStats populates m with memory allocator statistics.
@@ -462,14 +482,74 @@ func ReadMemStats(m *MemStats) {
func readmemstats_m(stats *MemStats) {
updatememstats()
- // The size of the trailing by_size array differs between
- // mstats and MemStats. NumSizeClasses was changed, but we
- // cannot change MemStats because of backward compatibility.
- memmove(unsafe.Pointer(stats), unsafe.Pointer(&memstats), sizeof_C_MStats)
-
+ stats.Alloc = memstats.alloc
+ stats.TotalAlloc = memstats.total_alloc
+ stats.Sys = memstats.sys
+ stats.Mallocs = memstats.nmalloc
+ stats.Frees = memstats.nfree
+ stats.HeapAlloc = memstats.alloc
+ stats.HeapSys = memstats.heap_sys.load()
+ // By definition, HeapIdle is memory that was mapped
+ // for the heap but is not currently used to hold heap
+ // objects. It also specifically is memory that can be
+ // used for other purposes, like stacks, but this memory
+ // is subtracted out of HeapSys before it makes that
+ // transition. Put another way:
+ //
+ // heap_sys = bytes allocated from the OS for the heap - bytes ultimately used for non-heap purposes
+ // heap_idle = bytes allocated from the OS for the heap - bytes ultimately used for any purpose
+ //
+ // or
+ //
+ // heap_sys = sys - stacks_inuse - gcWorkBufInUse - gcProgPtrScalarBitsInUse
+ // heap_idle = sys - stacks_inuse - gcWorkBufInUse - gcProgPtrScalarBitsInUse - heap_inuse
+ //
+ // => heap_idle = heap_sys - heap_inuse
+ stats.HeapIdle = memstats.heap_sys.load() - memstats.heap_inuse
+ stats.HeapInuse = memstats.heap_inuse
+ stats.HeapReleased = memstats.heap_released
+ stats.HeapObjects = memstats.heap_objects
+ stats.StackInuse = memstats.stacks_inuse
// memstats.stacks_sys is only memory mapped directly for OS stacks.
// Add in heap-allocated stack memory for user consumption.
- stats.StackSys += stats.StackInuse
+ stats.StackSys = memstats.stacks_inuse + memstats.stacks_sys.load()
+ stats.MSpanInuse = memstats.mspan_inuse
+ stats.MSpanSys = memstats.mspan_sys.load()
+ stats.MCacheInuse = memstats.mcache_inuse
+ stats.MCacheSys = memstats.mcache_sys.load()
+ stats.BuckHashSys = memstats.buckhash_sys.load()
+ // MemStats defines GCSys as an aggregate of all memory related
+ // to the memory management system, but we track this memory
+ // at a more granular level in the runtime.
+ stats.GCSys = memstats.gcMiscSys.load() + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse
+ stats.OtherSys = memstats.other_sys.load()
+ stats.NextGC = memstats.next_gc
+ stats.LastGC = memstats.last_gc_unix
+ stats.PauseTotalNs = memstats.pause_total_ns
+ stats.PauseNs = memstats.pause_ns
+ stats.PauseEnd = memstats.pause_end
+ stats.NumGC = memstats.numgc
+ stats.NumForcedGC = memstats.numforcedgc
+ stats.GCCPUFraction = memstats.gc_cpu_fraction
+ stats.EnableGC = true
+
+ // Handle BySize. Copy N values, where N is
+ // the minimum of the lengths of the two arrays.
+ // Unfortunately copy() won't work here because
+ // the arrays have different structs.
+ //
+ // TODO(mknyszek): Consider renaming the fields
+ // of by_size's elements to align so we can use
+ // the copy built-in.
+ bySizeLen := len(stats.BySize)
+ if l := len(memstats.by_size); l < bySizeLen {
+ bySizeLen = l
+ }
+ for i := 0; i < bySizeLen; i++ {
+ stats.BySize[i].Size = memstats.by_size[i].size
+ stats.BySize[i].Mallocs = memstats.by_size[i].nmalloc
+ stats.BySize[i].Frees = memstats.by_size[i].nfree
+ }
}
//go:linkname readGCStats runtime/debug.readGCStats
@@ -515,6 +595,10 @@ func readGCStats_m(pauses *[]uint64) {
*pauses = p[:n+n+3]
}
+// Updates the memstats structure.
+//
+// The world must be stopped.
+//
//go:nowritebarrier
func updatememstats() {
// Flush mcaches to mcentral before doing anything else.
@@ -525,11 +609,9 @@ func updatememstats() {
memstats.mcache_inuse = uint64(mheap_.cachealloc.inuse)
memstats.mspan_inuse = uint64(mheap_.spanalloc.inuse)
- memstats.sys = memstats.heap_sys + memstats.stacks_sys + memstats.mspan_sys +
- memstats.mcache_sys + memstats.buckhash_sys + memstats.gc_sys + memstats.other_sys
-
- // We also count stacks_inuse as sys memory.
- memstats.sys += memstats.stacks_inuse
+ memstats.sys = memstats.heap_sys.load() + memstats.stacks_sys.load() + memstats.mspan_sys.load() +
+ memstats.mcache_sys.load() + memstats.buckhash_sys.load() + memstats.gcMiscSys.load() +
+ memstats.other_sys.load()
// Calculate memory allocator stats.
// During program execution we only count number of frees and amount of freed memory.
@@ -546,62 +628,75 @@ func updatememstats() {
memstats.by_size[i].nmalloc = 0
memstats.by_size[i].nfree = 0
}
+ // Collect consistent stats, which are the source-of-truth in some cases.
+ var consStats heapStatsDelta
+ memstats.heapStats.unsafeRead(&consStats)
+
+ // Collect large allocation stats.
+ totalAlloc := uint64(consStats.largeAlloc)
+ memstats.nmalloc += uint64(consStats.largeAllocCount)
+ totalFree := uint64(consStats.largeFree)
+ memstats.nfree += uint64(consStats.largeFreeCount)
- // Aggregate local stats.
- cachestats()
-
- // Collect allocation stats. This is safe and consistent
- // because the world is stopped.
- var smallFree, totalAlloc, totalFree uint64
- // Collect per-spanclass stats.
- for spc := range mheap_.central {
- // The mcaches are now empty, so mcentral stats are
- // up-to-date.
- c := &mheap_.central[spc].mcentral
- memstats.nmalloc += c.nmalloc
- i := spanClass(spc).sizeclass()
- memstats.by_size[i].nmalloc += c.nmalloc
- totalAlloc += c.nmalloc * uint64(class_to_size[i])
- }
// Collect per-sizeclass stats.
for i := 0; i < _NumSizeClasses; i++ {
- if i == 0 {
- memstats.nmalloc += mheap_.nlargealloc
- totalAlloc += mheap_.largealloc
- totalFree += mheap_.largefree
- memstats.nfree += mheap_.nlargefree
- continue
- }
-
- // The mcache stats have been flushed to mheap_.
- memstats.nfree += mheap_.nsmallfree[i]
- memstats.by_size[i].nfree = mheap_.nsmallfree[i]
- smallFree += mheap_.nsmallfree[i] * uint64(class_to_size[i])
+ // Malloc stats.
+ a := uint64(consStats.smallAllocCount[i])
+ totalAlloc += a * uint64(class_to_size[i])
+ memstats.nmalloc += a
+ memstats.by_size[i].nmalloc = a
+
+ // Free stats.
+ f := uint64(consStats.smallFreeCount[i])
+ totalFree += f * uint64(class_to_size[i])
+ memstats.nfree += f
+ memstats.by_size[i].nfree = f
}
- totalFree += smallFree
+ // Account for tiny allocations.
memstats.nfree += memstats.tinyallocs
memstats.nmalloc += memstats.tinyallocs
// Calculate derived stats.
memstats.total_alloc = totalAlloc
memstats.alloc = totalAlloc - totalFree
- memstats.heap_alloc = memstats.alloc
memstats.heap_objects = memstats.nmalloc - memstats.nfree
-}
-// cachestats flushes all mcache stats.
-//
-// The world must be stopped.
-//
-//go:nowritebarrier
-func cachestats() {
- for _, p := range allp {
- c := p.mcache
- if c == nil {
- continue
- }
- purgecachedstats(c)
+ memstats.stacks_inuse = uint64(consStats.inStacks)
+ memstats.gcWorkBufInUse = uint64(consStats.inWorkBufs)
+ memstats.gcProgPtrScalarBitsInUse = uint64(consStats.inPtrScalarBits)
+
+ // We also count stacks_inuse, gcWorkBufInUse, and gcProgPtrScalarBitsInUse as sys memory.
+ memstats.sys += memstats.stacks_inuse + memstats.gcWorkBufInUse + memstats.gcProgPtrScalarBitsInUse
+
+ // The world is stopped, so the consistent stats (after aggregation)
+ // should be identical to some combination of memstats. In particular:
+ //
+ // * heap_inuse == inHeap
+ // * heap_released == released
+ // * heap_sys - heap_released == committed - inStacks - inWorkBufs - inPtrScalarBits
+ //
+ // Check if that's actually true.
+ //
+ // TODO(mknyszek): Maybe don't throw here. It would be bad if a
+ // bug in otherwise benign accounting caused the whole application
+ // to crash.
+ if memstats.heap_inuse != uint64(consStats.inHeap) {
+ print("runtime: heap_inuse=", memstats.heap_inuse, "\n")
+ print("runtime: consistent value=", consStats.inHeap, "\n")
+ throw("heap_inuse and consistent stats are not equal")
+ }
+ if memstats.heap_released != uint64(consStats.released) {
+ print("runtime: heap_released=", memstats.heap_released, "\n")
+ print("runtime: consistent value=", consStats.released, "\n")
+ throw("heap_released and consistent stats are not equal")
+ }
+ globalRetained := memstats.heap_sys.load() - memstats.heap_released
+ consRetained := uint64(consStats.committed - consStats.inStacks - consStats.inWorkBufs - consStats.inPtrScalarBits)
+ if globalRetained != consRetained {
+ print("runtime: global value=", globalRetained, "\n")
+ print("runtime: consistent value=", consRetained, "\n")
+ throw("measures of the retained heap are not equal")
}
}
@@ -631,64 +726,222 @@ func flushallmcaches() {
}
}
+// sysMemStat represents a global system statistic that is managed atomically.
+//
+// This type must structurally be a uint64 so that mstats aligns with MemStats.
+type sysMemStat uint64
+
+// load atomically reads the value of the stat.
+//
+// Must be nosplit as it is called in runtime initialization, e.g. newosproc0.
//go:nosplit
-func purgecachedstats(c *mcache) {
- // Protected by either heap or GC lock.
- h := &mheap_
- memstats.heap_scan += uint64(c.local_scan)
- c.local_scan = 0
- memstats.tinyallocs += uint64(c.local_tinyallocs)
- c.local_tinyallocs = 0
- h.largefree += uint64(c.local_largefree)
- c.local_largefree = 0
- h.nlargefree += uint64(c.local_nlargefree)
- c.local_nlargefree = 0
- for i := 0; i < len(c.local_nsmallfree); i++ {
- h.nsmallfree[i] += uint64(c.local_nsmallfree[i])
- c.local_nsmallfree[i] = 0
- }
+func (s *sysMemStat) load() uint64 {
+ return atomic.Load64((*uint64)(s))
}
-// Atomically increases a given *system* memory stat. We are counting on this
-// stat never overflowing a uintptr, so this function must only be used for
-// system memory stats.
+// add atomically adds n to the sysMemStat.
//
-// The current implementation for little endian architectures is based on
-// xadduintptr(), which is less than ideal: xadd64() should really be used.
-// Using xadduintptr() is a stop-gap solution until arm supports xadd64() that
-// doesn't use locks. (Locks are a problem as they require a valid G, which
-// restricts their useability.)
-//
-// A side-effect of using xadduintptr() is that we need to check for
-// overflow errors.
+// Must be nosplit as it is called in runtime initialization, e.g. newosproc0.
//go:nosplit
-func mSysStatInc(sysStat *uint64, n uintptr) {
- if sysStat == nil {
+func (s *sysMemStat) add(n int64) {
+ if s == nil {
return
}
- if sys.BigEndian {
- atomic.Xadd64(sysStat, int64(n))
- return
+ val := atomic.Xadd64((*uint64)(s), n)
+ if (n > 0 && int64(val) < n) || (n < 0 && int64(val)+n < n) {
+ print("runtime: val=", val, " n=", n, "\n")
+ throw("sysMemStat overflow")
}
- if val := atomic.Xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), n); val < n {
- print("runtime: stat overflow: val ", val, ", n ", n, "\n")
- exit(2)
+}
+
+// heapStatsDelta contains deltas of various runtime memory statistics
+// that need to be updated together in order for them to be kept
+// consistent with one another.
+type heapStatsDelta struct {
+ // Memory stats.
+ committed int64 // byte delta of memory committed
+ released int64 // byte delta of released memory generated
+ inHeap int64 // byte delta of memory placed in the heap
+ inStacks int64 // byte delta of memory reserved for stacks
+ inWorkBufs int64 // byte delta of memory reserved for work bufs
+ inPtrScalarBits int64 // byte delta of memory reserved for unrolled GC prog bits
+
+ // Allocator stats.
+ largeAlloc uintptr // bytes allocated for large objects
+ largeAllocCount uintptr // number of large object allocations
+ smallAllocCount [_NumSizeClasses]uintptr // number of allocs for small objects
+ largeFree uintptr // bytes freed for large objects (>maxSmallSize)
+ largeFreeCount uintptr // number of frees for large objects (>maxSmallSize)
+ smallFreeCount [_NumSizeClasses]uintptr // number of frees for small objects (<=maxSmallSize)
+
+ // Add a uint32 to ensure this struct is a multiple of 8 bytes in size.
+ // Only necessary on 32-bit platforms.
+ // _ [(sys.PtrSize / 4) % 2]uint32
+}
+
+// merge adds in the deltas from b into a.
+func (a *heapStatsDelta) merge(b *heapStatsDelta) {
+ a.committed += b.committed
+ a.released += b.released
+ a.inHeap += b.inHeap
+ a.inStacks += b.inStacks
+ a.inWorkBufs += b.inWorkBufs
+ a.inPtrScalarBits += b.inPtrScalarBits
+
+ a.largeAlloc += b.largeAlloc
+ a.largeAllocCount += b.largeAllocCount
+ for i := range b.smallAllocCount {
+ a.smallAllocCount[i] += b.smallAllocCount[i]
+ }
+ a.largeFree += b.largeFree
+ a.largeFreeCount += b.largeFreeCount
+ for i := range b.smallFreeCount {
+ a.smallFreeCount[i] += b.smallFreeCount[i]
}
}
-// Atomically decreases a given *system* memory stat. Same comments as
-// mSysStatInc apply.
-//go:nosplit
-func mSysStatDec(sysStat *uint64, n uintptr) {
- if sysStat == nil {
- return
+// consistentHeapStats represents a set of various memory statistics
+// whose updates must be viewed completely to get a consistent
+// state of the world.
+//
+// To write updates to memory stats use the acquire and release
+// methods. To obtain a consistent global snapshot of these statistics,
+// use read.
+type consistentHeapStats struct {
+ // stats is a ring buffer of heapStatsDelta values.
+ // Writers always atomically update the delta at index gen.
+ //
+ // Readers operate by rotating gen (0 -> 1 -> 2 -> 0 -> ...)
+ // and synchronizing with writers by observing each mcache's
+ // statsSeq field. If the reader observes a P (to which the
+ // mcache is bound) not writing, it can be sure that it will
+ // pick up the new gen value the next time it writes.
+ // The reader then takes responsibility by clearing space
+ // in the ring buffer for the next reader to rotate gen to
+ // that space (i.e. it merges in values from index (gen-2) mod 3
+ // to index (gen-1) mod 3, then clears the former).
+ //
+ // Note that this means only one reader can be reading at a time.
+ // There is no way for readers to synchronize.
+ //
+ // This process is why we need a ring buffer of size 3 instead
+ // of 2: one is for the writers, one contains the most recent
+ // data, and the last one is clear so writers can begin writing
+ // to it the moment gen is updated.
+ stats [3]heapStatsDelta
+
+ // gen represents the current index into which writers
+ // are writing, and can take on the value of 0, 1, or 2.
+ // This value is updated atomically.
+ gen uint32
+}
+
+// acquire returns a heapStatsDelta to be updated. In effect,
+// it acquires the shard for writing. release must be called
+// as soon as the relevant deltas are updated. c must be
+// a valid mcache not being used by any other thread.
+//
+// The returned heapStatsDelta must be updated atomically.
+//
+// Note, however, that this is unsafe to call concurrently
+// with other writers and there must be only one writer
+// at a time.
+func (m *consistentHeapStats) acquire(c *mcache) *heapStatsDelta {
+ seq := atomic.Xadd(&c.statsSeq, 1)
+ if seq%2 == 0 {
+ // Should have been incremented to odd.
+ print("runtime: seq=", seq, "\n")
+ throw("bad sequence number")
}
- if sys.BigEndian {
- atomic.Xadd64(sysStat, -int64(n))
- return
+ gen := atomic.Load(&m.gen) % 3
+ return &m.stats[gen]
+}
+
+// release indicates that the writer is done modifying
+// the delta. The value returned by the corresponding
+// acquire must no longer be accessed or modified after
+// release is called.
+//
+// The mcache passed here must be the same as the one
+// passed to acquire.
+func (m *consistentHeapStats) release(c *mcache) {
+ seq := atomic.Xadd(&c.statsSeq, 1)
+ if seq%2 != 0 {
+ // Should have been incremented to even.
+ print("runtime: seq=", seq, "\n")
+ throw("bad sequence number")
+ }
+}
+
+// unsafeRead aggregates the delta for this shard into out.
+//
+// Unsafe because it does so without any synchronization. The
+// only safe time to call this is if the world is stopped or
+// we're freezing the world or going down anyway (and we just
+// want _some_ estimate).
+func (m *consistentHeapStats) unsafeRead(out *heapStatsDelta) {
+ for i := range m.stats {
+ out.merge(&m.stats[i])
+ }
+}
+
+// unsafeClear clears the shard.
+//
+// Unsafe because the world must be stopped and values should
+// be donated elsewhere before clearing.
+func (m *consistentHeapStats) unsafeClear() {
+ for i := range m.stats {
+ m.stats[i] = heapStatsDelta{}
+ }
+}
+
+// read takes a globally consistent snapshot of m
+// and puts the aggregated value in out. Even though out is a
+// heapStatsDelta, the resulting values should be complete and
+// valid statistic values.
+//
+// Not safe to call concurrently. The world must be stopped
+// or metricsSema must be held.
+func (m *consistentHeapStats) read(out *heapStatsDelta) {
+ // Getting preempted after this point is not safe because
+ // we read allp. We need to make sure a STW can't happen
+ // so it doesn't change out from under us.
+ mp := acquirem()
+
+ // Rotate gen, effectively taking a snapshot of the state of
+ // these statistics at the point of the exchange by moving
+ // writers to the next set of deltas.
+ //
+ // This exchange is safe to do because we won't race
+ // with anyone else trying to update this value.
+ currGen := atomic.Load(&m.gen)
+ atomic.Xchg(&m.gen, (currGen+1)%3)
+ prevGen := currGen - 1
+ if currGen == 0 {
+ prevGen = 2
}
- if val := atomic.Xadduintptr((*uintptr)(unsafe.Pointer(sysStat)), uintptr(-int64(n))); val+n < n {
- print("runtime: stat underflow: val ", val, ", n ", n, "\n")
- exit(2)
+ for _, p := range allp {
+ c := p.mcache
+ if c == nil {
+ continue
+ }
+ // Spin until there are no more writers.
+ for atomic.Load(&c.statsSeq)%2 != 0 {
+ }
}
+
+ // At this point we've observed that each sequence
+ // number is even, so any future writers will observe
+ // the new gen value. That means it's safe to read from
+ // the other deltas in the stats buffer.
+
+ // Perform our responsibilities and free up
+ // stats[prevGen] for the next time we want to take
+ // a snapshot.
+ m.stats[currGen].merge(&m.stats[prevGen])
+ m.stats[prevGen] = heapStatsDelta{}
+
+ // Finally, copy out the complete delta.
+ *out = m.stats[currGen]
+ releasem(mp)
}
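
Note: the consistentHeapStats machinery added above is essentially a seqlock with a three-slot generation buffer: writers bracket their updates with an odd/even sequence counter, and the single reader rotates gen, waits for every writer's counter to become even, folds the retired slot into the current one, and clears it. A compact sketch of the same protocol using sync/atomic (a simplified illustration with assumed names, not the runtime's code; real writers must also update the delta fields atomically):

type delta struct{ committed, released int64 }

type statsRing struct {
	slots [3]delta
	gen   uint32 // slot index writers currently target
}

// Writer side, mirroring acquire/release: an odd seq means "mid-update".
func (r *statsRing) update(seq *uint32, f func(*delta)) {
	atomic.AddUint32(seq, 1) // now odd: writing
	f(&r.slots[atomic.LoadUint32(&r.gen)%3])
	atomic.AddUint32(seq, 1) // now even: done
}

// Reader side, mirroring read: rotate gen, wait out in-flight writers,
// fold the retired slot, and clear it for a future rotation.
func (r *statsRing) snapshot(writerSeqs []*uint32, out *delta) {
	cur := atomic.LoadUint32(&r.gen)
	atomic.StoreUint32(&r.gen, (cur+1)%3)
	prev := (cur + 2) % 3
	for _, seq := range writerSeqs {
		for atomic.LoadUint32(seq)%2 != 0 {
			// spin: a writer is between its two increments
		}
	}
	r.slots[cur].committed += r.slots[prev].committed
	r.slots[cur].released += r.slots[prev].released
	r.slots[prev] = delta{}
	*out = r.slots[cur]
}

Three slots rather than two are needed for exactly the reason the comment gives: one is being written, one holds the freshly rotated snapshot, and one is already clear so the next rotation can land immediately.
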
diff --git a/src/runtime/mwbbuf.go b/src/runtime/mwbbuf.go
index 632769c114..6efc00007d 100644
--- a/src/runtime/mwbbuf.go
+++ b/src/runtime/mwbbuf.go
@@ -57,12 +57,6 @@ type wbBuf struct {
// on. This must be a multiple of wbBufEntryPointers because
// the write barrier only checks for overflow once per entry.
buf [wbBufEntryPointers * wbBufEntries]uintptr
-
- // debugGen causes the write barrier buffer to flush after
- // every write barrier if equal to gcWorkPauseGen. This is for
- // debugging #27993. This is only set if debugCachedWork is
- // set.
- debugGen uint32
}
const (
@@ -86,7 +80,7 @@ const (
func (b *wbBuf) reset() {
start := uintptr(unsafe.Pointer(&b.buf[0]))
b.next = start
- if writeBarrier.cgo || (debugCachedWork && (throwOnGCWork || b.debugGen == atomic.Load(&gcWorkPauseGen))) {
+ if writeBarrier.cgo {
// Effectively disable the buffer by forcing a flush
// on every barrier.
b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers]))
@@ -204,32 +198,10 @@ func wbBufFlush(dst *uintptr, src uintptr) {
// Switch to the system stack so we don't have to worry about
// the untyped stack slots or safe points.
systemstack(func() {
- if debugCachedWork {
- // For debugging, include the old value of the
- // slot and some other data in the traceback.
- wbBuf := &getg().m.p.ptr().wbBuf
- var old uintptr
- if dst != nil {
- // dst may be nil in direct calls to wbBufFlush.
- old = *dst
- }
- wbBufFlush1Debug(old, wbBuf.buf[0], wbBuf.buf[1], &wbBuf.buf[0], wbBuf.next)
- } else {
- wbBufFlush1(getg().m.p.ptr())
- }
+ wbBufFlush1(getg().m.p.ptr())
})
}
-// wbBufFlush1Debug is a temporary function for debugging issue
-// #27993. It exists solely to add some context to the traceback.
-//
-//go:nowritebarrierrec
-//go:systemstack
-//go:noinline
-func wbBufFlush1Debug(old, buf1, buf2 uintptr, start *uintptr, next uintptr) {
- wbBufFlush1(getg().m.p.ptr())
-}
-
// wbBufFlush1 flushes p's write barrier buffer to the GC work queue.
//
// This must not have write barriers because it is part of the write
diff --git a/src/runtime/netpoll.go b/src/runtime/netpoll.go
index 34ea82a7fa..77eb3aa4c6 100644
--- a/src/runtime/netpoll.go
+++ b/src/runtime/netpoll.go
@@ -79,16 +79,17 @@ type pollDesc struct {
lock mutex // protects the following fields
fd uintptr
closing bool
- everr bool // marks event scanning error happened
- user uint32 // user settable cookie
- rseq uintptr // protects from stale read timers
- rg uintptr // pdReady, pdWait, G waiting for read or nil
- rt timer // read deadline timer (set if rt.f != nil)
- rd int64 // read deadline
- wseq uintptr // protects from stale write timers
- wg uintptr // pdReady, pdWait, G waiting for write or nil
- wt timer // write deadline timer
- wd int64 // write deadline
+ everr bool // marks event scanning error happened
+ user uint32 // user settable cookie
+ rseq uintptr // protects from stale read timers
+ rg uintptr // pdReady, pdWait, G waiting for read or nil
+ rt timer // read deadline timer (set if rt.f != nil)
+ rd int64 // read deadline
+ wseq uintptr // protects from stale write timers
+ wg uintptr // pdReady, pdWait, G waiting for write or nil
+ wt timer // write deadline timer
+ wd int64 // write deadline
+ self *pollDesc // storage for indirect interface. See (*pollDesc).makeArg.
}
type pollCache struct {
@@ -157,6 +158,7 @@ func poll_runtime_pollOpen(fd uintptr) (*pollDesc, int) {
pd.wseq++
pd.wg = 0
pd.wd = 0
+ pd.self = pd
unlock(&pd.lock)
var errno int32
@@ -271,14 +273,14 @@ func poll_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) {
// Copy current seq into the timer arg.
// Timer func will check the seq against current descriptor seq,
// if they differ the descriptor was reused or timers were reset.
- pd.rt.arg = pd
+ pd.rt.arg = pd.makeArg()
pd.rt.seq = pd.rseq
resettimer(&pd.rt, pd.rd)
}
} else if pd.rd != rd0 || combo != combo0 {
pd.rseq++ // invalidate current timers
if pd.rd > 0 {
- modtimer(&pd.rt, pd.rd, 0, rtf, pd, pd.rseq)
+ modtimer(&pd.rt, pd.rd, 0, rtf, pd.makeArg(), pd.rseq)
} else {
deltimer(&pd.rt)
pd.rt.f = nil
@@ -287,14 +289,14 @@ func poll_runtime_pollSetDeadline(pd *pollDesc, d int64, mode int) {
if pd.wt.f == nil {
if pd.wd > 0 && !combo {
pd.wt.f = netpollWriteDeadline
- pd.wt.arg = pd
+ pd.wt.arg = pd.makeArg()
pd.wt.seq = pd.wseq
resettimer(&pd.wt, pd.wd)
}
} else if pd.wd != wd0 || combo != combo0 {
pd.wseq++ // invalidate current timers
if pd.wd > 0 && !combo {
- modtimer(&pd.wt, pd.wd, 0, netpollWriteDeadline, pd, pd.wseq)
+ modtimer(&pd.wt, pd.wd, 0, netpollWriteDeadline, pd.makeArg(), pd.wseq)
} else {
deltimer(&pd.wt)
pd.wt.f = nil
@@ -547,3 +549,20 @@ func (c *pollCache) alloc() *pollDesc {
unlock(&c.lock)
return pd
}
+
+// makeArg converts pd to an interface{}.
+// makeArg does not do any allocation. Normally, such
+// a conversion requires an allocation because pointers to
+// go:notinheap types (which pollDesc is) must be stored
+// in interfaces indirectly. See issue 42076.
+func (pd *pollDesc) makeArg() (i interface{}) {
+ x := (*eface)(unsafe.Pointer(&i))
+ x._type = pdType
+ x.data = unsafe.Pointer(&pd.self)
+ return
+}
+
+var (
+ pdEface interface{} = (*pollDesc)(nil)
+ pdType *_type = efaceOf(&pdEface)._type
+)
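
Note: the new self field plus makeArg trade a word per pollDesc for an allocation-free timer argument: because pollDesc is go:notinheap, a *pollDesc stored in an interface{} must be held indirectly, and without preallocated storage each deadline update would allocate. A rough user-space illustration of the stable-storage idea only (it does not reproduce the eface surgery above, which writes the interface's type and data words directly):

// desc stands in for pollDesc. self gives the interface something permanent
// to point at, so building the argument never allocates per call.
type desc struct {
	self *desc
}

func (d *desc) makeArg() interface{} {
	d.self = d
	return &d.self // a pointer value is stored directly in the interface
}

The runtime version goes further and stamps the interface's type word with *pollDesc (cached in pdType), so the timer code still sees the dynamic type it expects.
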
diff --git a/src/runtime/os_darwin.go b/src/runtime/os_darwin.go
index 01c40b4813..3f5bb7cf96 100644
--- a/src/runtime/os_darwin.go
+++ b/src/runtime/os_darwin.go
@@ -198,7 +198,6 @@ func newosproc(mp *m) {
exit(1)
}
mp.g0.stack.hi = stacksize // for mstart
- //mSysStatInc(&memstats.stacks_sys, stacksize) //TODO: do this?
// Tell the pthread library we won't join with this thread.
if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 {
@@ -247,7 +246,7 @@ func newosproc0(stacksize uintptr, fn uintptr) {
exit(1)
}
g0.stack.hi = stacksize // for mstart
- mSysStatInc(&memstats.stacks_sys, stacksize)
+ memstats.stacks_sys.add(int64(stacksize))
// Tell the pthread library we won't join with this thread.
if pthread_attr_setdetachstate(&attr, _PTHREAD_CREATE_DETACHED) != 0 {
@@ -289,9 +288,9 @@ func mpreinit(mp *m) {
// Called to initialize a new m (including the bootstrap m).
// Called on the new thread, cannot allocate memory.
func minit() {
- // The alternate signal stack is buggy on arm64.
+ // iOS does not support an alternate signal stack.
// The signal handler handles it directly.
- if GOARCH != "arm64" {
+ if !(GOOS == "ios" && GOARCH == "arm64") {
minitSignalStack()
}
minitSignalMask()
@@ -301,9 +300,9 @@ func minit() {
// Called from dropm to undo the effect of an minit.
//go:nosplit
func unminit() {
- // The alternate signal stack is buggy on arm64.
+ // iOS does not support an alternate signal stack.
// See minit.
- if GOARCH != "arm64" {
+ if !(GOOS == "ios" && GOARCH == "arm64") {
unminitSignals()
}
}
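
Note: newosproc0 is one of the call sites converted from mSysStatInc to the new sysMemStat.add method. A standalone re-creation of the pattern, with sync/atomic standing in for runtime/internal/atomic (an illustrative sketch, not the runtime's code):

type sysMemStat uint64

func (s *sysMemStat) load() uint64 {
	return atomic.LoadUint64((*uint64)(s))
}

func (s *sysMemStat) add(n int64) {
	// uint64(n) relies on two's-complement wraparound to subtract when n < 0.
	val := atomic.AddUint64((*uint64)(s), uint64(n))
	if (n > 0 && int64(val) < n) || (n < 0 && int64(val)+n < n) {
		panic("sysMemStat overflow") // the runtime throws instead
	}
}

Compared with the old mSysStatInc/mSysStatDec pair, taking a signed delta removes the big-endian special case and the separate underflow path.
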
diff --git a/src/runtime/os_freebsd_arm64.go b/src/runtime/os_freebsd_arm64.go
index 51ebf9d478..b5b25f0dc5 100644
--- a/src/runtime/os_freebsd_arm64.go
+++ b/src/runtime/os_freebsd_arm64.go
@@ -4,149 +4,6 @@
package runtime
-import "internal/cpu"
-
-const (
- hwcap_FP = 1 << 0
- hwcap_ASIMD = 1 << 1
- hwcap_EVTSTRM = 1 << 2
- hwcap_AES = 1 << 3
- hwcap_PMULL = 1 << 4
- hwcap_SHA1 = 1 << 5
- hwcap_SHA2 = 1 << 6
- hwcap_CRC32 = 1 << 7
- hwcap_ATOMICS = 1 << 8
- hwcap_FPHP = 1 << 9
- hwcap_ASIMDHP = 1 << 10
- hwcap_CPUID = 1 << 11
- hwcap_ASIMDRDM = 1 << 12
- hwcap_JSCVT = 1 << 13
- hwcap_FCMA = 1 << 14
- hwcap_LRCPC = 1 << 15
- hwcap_DCPOP = 1 << 16
- hwcap_SHA3 = 1 << 17
- hwcap_SM3 = 1 << 18
- hwcap_SM4 = 1 << 19
- hwcap_ASIMDDP = 1 << 20
- hwcap_SHA512 = 1 << 21
- hwcap_SVE = 1 << 22
- hwcap_ASIMDFHM = 1 << 23
-)
-
-func getisar0() uint64
-func getisar1() uint64
-func getpfr0() uint64
-
-// no hwcap support on FreeBSD aarch64, we need to retrieve the info from
-// ID_AA64ISAR0_EL1, ID_AA64ISAR1_EL1 and ID_AA64PFR0_EL1
-func archauxv(tag, val uintptr) {
- var isar0, isar1, pfr0 uint64
-
- isar0 = getisar0()
- isar1 = getisar1()
- pfr0 = getpfr0()
-
- // ID_AA64ISAR0_EL1
- switch extractBits(isar0, 4, 7) {
- case 1:
- cpu.HWCap |= hwcap_AES
- case 2:
- cpu.HWCap |= hwcap_PMULL | hwcap_AES
- }
-
- switch extractBits(isar0, 8, 11) {
- case 1:
- cpu.HWCap |= hwcap_SHA1
- }
-
- switch extractBits(isar0, 12, 15) {
- case 1:
- cpu.HWCap |= hwcap_SHA2
- case 2:
- cpu.HWCap |= hwcap_SHA2 | hwcap_SHA512
- }
-
- switch extractBits(isar0, 16, 19) {
- case 1:
- cpu.HWCap |= hwcap_CRC32
- }
-
- switch extractBits(isar0, 20, 23) {
- case 2:
- cpu.HWCap |= hwcap_ATOMICS
- }
-
- switch extractBits(isar0, 28, 31) {
- case 1:
- cpu.HWCap |= hwcap_ASIMDRDM
- }
-
- switch extractBits(isar0, 32, 35) {
- case 1:
- cpu.HWCap |= hwcap_SHA3
- }
-
- switch extractBits(isar0, 36, 39) {
- case 1:
- cpu.HWCap |= hwcap_SM3
- }
-
- switch extractBits(isar0, 40, 43) {
- case 1:
- cpu.HWCap |= hwcap_SM4
- }
-
- switch extractBits(isar0, 44, 47) {
- case 1:
- cpu.HWCap |= hwcap_ASIMDDP
- }
-
- // ID_AA64ISAR1_EL1
- switch extractBits(isar1, 0, 3) {
- case 1:
- cpu.HWCap |= hwcap_DCPOP
- }
-
- switch extractBits(isar1, 12, 15) {
- case 1:
- cpu.HWCap |= hwcap_JSCVT
- }
-
- switch extractBits(isar1, 16, 19) {
- case 1:
- cpu.HWCap |= hwcap_FCMA
- }
-
- switch extractBits(isar1, 20, 23) {
- case 1:
- cpu.HWCap |= hwcap_LRCPC
- }
-
- // ID_AA64PFR0_EL1
- switch extractBits(pfr0, 16, 19) {
- case 0:
- cpu.HWCap |= hwcap_FP
- case 1:
- cpu.HWCap |= hwcap_FP | hwcap_FPHP
- }
-
- switch extractBits(pfr0, 20, 23) {
- case 0:
- cpu.HWCap |= hwcap_ASIMD
- case 1:
- cpu.HWCap |= hwcap_ASIMD | hwcap_ASIMDHP
- }
-
- switch extractBits(pfr0, 32, 35) {
- case 1:
- cpu.HWCap |= hwcap_SVE
- }
-}
-
-func extractBits(data uint64, start, end uint) uint {
- return (uint)(data>>start) & ((1 << (end - start + 1)) - 1)
-}
-
//go:nosplit
func cputicks() int64 {
// Currently cputicks() is used in blocking profiler and to seed fastrand().
diff --git a/src/runtime/os_freebsd_noauxv.go b/src/runtime/os_freebsd_noauxv.go
index c6a49927c8..01efb9b7c9 100644
--- a/src/runtime/os_freebsd_noauxv.go
+++ b/src/runtime/os_freebsd_noauxv.go
@@ -3,7 +3,7 @@
// license that can be found in the LICENSE file.
// +build freebsd
-// +build !arm,!arm64
+// +build !arm
package runtime
diff --git a/src/runtime/os_js.go b/src/runtime/os_js.go
index ff0ee3aa6b..94983b358d 100644
--- a/src/runtime/os_js.go
+++ b/src/runtime/os_js.go
@@ -59,7 +59,7 @@ func mpreinit(mp *m) {
}
//go:nosplit
-func msigsave(mp *m) {
+func sigsave(p *sigset) {
}
//go:nosplit
diff --git a/src/runtime/os_linux_arm64.go b/src/runtime/os_linux_arm64.go
index 19968dc164..c5fd742048 100644
--- a/src/runtime/os_linux_arm64.go
+++ b/src/runtime/os_linux_arm64.go
@@ -11,19 +11,7 @@ import "internal/cpu"
func archauxv(tag, val uintptr) {
switch tag {
case _AT_HWCAP:
- // arm64 doesn't have a 'cpuid' instruction equivalent and relies on
- // HWCAP/HWCAP2 bits for hardware capabilities.
- hwcap := uint(val)
- if GOOS == "android" {
- // The Samsung S9+ kernel reports support for atomics, but not all cores
- // actually support them, resulting in SIGILL. See issue #28431.
- // TODO(elias.naur): Only disable the optimization on bad chipsets.
- const hwcap_ATOMICS = 1 << 8
- hwcap &= ^uint(hwcap_ATOMICS)
- }
- cpu.HWCap = hwcap
- case _AT_HWCAP2:
- cpu.HWCap2 = uint(val)
+ cpu.HWCap = uint(val)
}
}
diff --git a/src/runtime/os_linux_s390x.go b/src/runtime/os_linux_s390x.go
index ee18fd1dc2..b9651f186c 100644
--- a/src/runtime/os_linux_s390x.go
+++ b/src/runtime/os_linux_s390x.go
@@ -6,15 +6,10 @@ package runtime
import "internal/cpu"
-const (
- // bit masks taken from bits/hwcap.h
- _HWCAP_S390_VX = 2048 // vector facility
-)
-
func archauxv(tag, val uintptr) {
switch tag {
- case _AT_HWCAP: // CPU capability bit flags
- cpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
+ case _AT_HWCAP:
+ cpu.HWCap = uint(val)
}
}
diff --git a/src/runtime/os_netbsd.go b/src/runtime/os_netbsd.go
index c4c3d8e2fe..f7f90cedc1 100644
--- a/src/runtime/os_netbsd.go
+++ b/src/runtime/os_netbsd.go
@@ -359,7 +359,6 @@ func sysargs(argc int32, argv **byte) {
// now argv+n is auxv
auxv := (*[1 << 28]uintptr)(add(unsafe.Pointer(argv), uintptr(n)*sys.PtrSize))
sysauxv(auxv[:])
- archauxv(auxv[:])
}
const (
diff --git a/src/runtime/os_netbsd_386.go b/src/runtime/os_netbsd_386.go
index c203af9cef..037f7e36dc 100644
--- a/src/runtime/os_netbsd_386.go
+++ b/src/runtime/os_netbsd_386.go
@@ -14,6 +14,3 @@ func lwp_mcontext_init(mc *mcontextt, stk unsafe.Pointer, mp *m, gp *g, fn uintp
mc.__gregs[_REG_EDX] = uint32(uintptr(unsafe.Pointer(gp)))
mc.__gregs[_REG_ESI] = uint32(fn)
}
-
-func archauxv(auxv []uintptr) {
-}
diff --git a/src/runtime/os_netbsd_amd64.go b/src/runtime/os_netbsd_amd64.go
index ea9d125492..5118b0c4ff 100644
--- a/src/runtime/os_netbsd_amd64.go
+++ b/src/runtime/os_netbsd_amd64.go
@@ -14,6 +14,3 @@ func lwp_mcontext_init(mc *mcontextt, stk unsafe.Pointer, mp *m, gp *g, fn uintp
mc.__gregs[_REG_R9] = uint64(uintptr(unsafe.Pointer(gp)))
mc.__gregs[_REG_R12] = uint64(fn)
}
-
-func archauxv(auxv []uintptr) {
-}
diff --git a/src/runtime/os_netbsd_arm.go b/src/runtime/os_netbsd_arm.go
index 646da9dc0b..b5ec23e45b 100644
--- a/src/runtime/os_netbsd_arm.go
+++ b/src/runtime/os_netbsd_arm.go
@@ -32,6 +32,3 @@ func cputicks() int64 {
// runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
return nanotime()
}
-
-func archauxv(auxv []uintptr) {
-}
diff --git a/src/runtime/os_netbsd_arm64.go b/src/runtime/os_netbsd_arm64.go
index ae2638c778..8d21b0a430 100644
--- a/src/runtime/os_netbsd_arm64.go
+++ b/src/runtime/os_netbsd_arm64.go
@@ -4,10 +4,7 @@
package runtime
-import (
- "internal/cpu"
- "unsafe"
-)
+import "unsafe"
func lwp_mcontext_init(mc *mcontextt, stk unsafe.Pointer, mp *m, gp *g, fn uintptr) {
// Machine dependent mcontext initialisation for LWP.
@@ -24,10 +21,3 @@ func cputicks() int64 {
// runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
return nanotime()
}
-
-func archauxv(auxv []uintptr) {
- // NetBSD does not supply AT_HWCAP, however we still need to initialise cpu.HWCaps.
- // For now specify the bare minimum until we add some form of capabilities
- // detection. See issue https://golang.org/issue/30824#issuecomment-494901591
- cpu.HWCap = 1<<1 | 1<<0 // ASIMD, FP
-}
diff --git a/src/runtime/os_openbsd_arm64.go b/src/runtime/os_openbsd_arm64.go
index d559a2a3e5..d71de7d196 100644
--- a/src/runtime/os_openbsd_arm64.go
+++ b/src/runtime/os_openbsd_arm64.go
@@ -4,20 +4,9 @@
package runtime
-import (
- "internal/cpu"
-)
-
//go:nosplit
func cputicks() int64 {
// Currently cputicks() is used in blocking profiler and to seed runtime·fastrand().
// runtime·nanotime() is a poor approximation of CPU ticks that is enough for the profiler.
return nanotime()
}
-
-func sysargs(argc int32, argv **byte) {
- // OpenBSD does not have auxv, however we still need to initialise cpu.HWCaps.
- // For now specify the bare minimum until we add some form of capabilities
- // detection. See issue #31746.
- cpu.HWCap = 1<<1 | 1<<0 // ASIMD, FP
-}
diff --git a/src/runtime/os_plan9.go b/src/runtime/os_plan9.go
index f3037a7508..62aecea060 100644
--- a/src/runtime/os_plan9.go
+++ b/src/runtime/os_plan9.go
@@ -184,7 +184,7 @@ func mpreinit(mp *m) {
mp.errstr = (*byte)(mallocgc(_ERRMAX, nil, true))
}
-func msigsave(mp *m) {
+func sigsave(p *sigset) {
}
func msigrestore(sigmask sigset) {
diff --git a/src/runtime/os_windows.go b/src/runtime/os_windows.go
index a62e941229..ffb087f9db 100644
--- a/src/runtime/os_windows.go
+++ b/src/runtime/os_windows.go
@@ -21,6 +21,7 @@ const (
//go:cgo_import_dynamic runtime._CreateIoCompletionPort CreateIoCompletionPort%4 "kernel32.dll"
//go:cgo_import_dynamic runtime._CreateThread CreateThread%6 "kernel32.dll"
//go:cgo_import_dynamic runtime._CreateWaitableTimerA CreateWaitableTimerA%3 "kernel32.dll"
+//go:cgo_import_dynamic runtime._CreateWaitableTimerExW CreateWaitableTimerExW%4 "kernel32.dll"
//go:cgo_import_dynamic runtime._DuplicateHandle DuplicateHandle%7 "kernel32.dll"
//go:cgo_import_dynamic runtime._ExitProcess ExitProcess%1 "kernel32.dll"
//go:cgo_import_dynamic runtime._FreeEnvironmentStringsW FreeEnvironmentStringsW%1 "kernel32.dll"
@@ -68,6 +69,7 @@ var (
_CreateIoCompletionPort,
_CreateThread,
_CreateWaitableTimerA,
+ _CreateWaitableTimerExW,
_DuplicateHandle,
_ExitProcess,
_FreeEnvironmentStringsW,
@@ -151,6 +153,8 @@ type mOS struct {
waitsema uintptr // semaphore for parking on locks
resumesema uintptr // semaphore to indicate suspend/resume
+ highResTimer uintptr // high resolution timer handle used in usleep
+
// preemptExtLock synchronizes preemptM with entry/exit from
// external C code.
//
@@ -402,11 +406,21 @@ const osRelaxMinNS = 60 * 1e6
// osRelax is called by the scheduler when transitioning to and from
// all Ps being idle.
//
-// On Windows, it adjusts the system-wide timer resolution. Go needs a
+// Some versions of Windows have a high resolution timer. For those
+// versions osRelax is a no-op.
+// For Windows versions without a high resolution timer, osRelax
+// adjusts the system-wide timer resolution. Go needs a
// high resolution timer while running and there's little extra cost
// if we're already using the CPU, but if all Ps are idle there's no
// need to consume extra power to drive the high-res timer.
func osRelax(relax bool) uint32 {
+ if haveHighResTimer {
+ // If the high resolution timer is available, the runtime uses the timer
+ // to sleep for short durations. This means there's no need to adjust
+ // the global clock frequency.
+ return 0
+ }
+
if relax {
return uint32(stdcall1(_timeEndPeriod, 1))
} else {
@@ -414,6 +428,42 @@ func osRelax(relax bool) uint32 {
}
}
+// haveHighResTimer indicates that the CreateWaitableTimerEx
+// CREATE_WAITABLE_TIMER_HIGH_RESOLUTION flag is available.
+var haveHighResTimer = false
+
+// createHighResTimer calls CreateWaitableTimerEx with the
+// CREATE_WAITABLE_TIMER_HIGH_RESOLUTION flag to create a high
+// resolution timer. createHighResTimer returns the new timer
+// handle, or 0 if CreateWaitableTimerEx failed.
+func createHighResTimer() uintptr {
+ const (
+ // As per @jstarks, see
+ // https://github.com/golang/go/issues/8687#issuecomment-656259353
+ _CREATE_WAITABLE_TIMER_HIGH_RESOLUTION = 0x00000002
+
+ _SYNCHRONIZE = 0x00100000
+ _TIMER_QUERY_STATE = 0x0001
+ _TIMER_MODIFY_STATE = 0x0002
+ )
+ return stdcall4(_CreateWaitableTimerExW, 0, 0,
+ _CREATE_WAITABLE_TIMER_HIGH_RESOLUTION,
+ _SYNCHRONIZE|_TIMER_QUERY_STATE|_TIMER_MODIFY_STATE)
+}
+
+func initHighResTimer() {
+ if GOARCH == "arm" {
+ // TODO: Not yet implemented.
+ return
+ }
+ h := createHighResTimer()
+ if h != 0 {
+ haveHighResTimer = true
+ usleep2Addr = unsafe.Pointer(funcPC(usleep2HighRes))
+ stdcall1(_CloseHandle, h)
+ }
+}
+
func osinit() {
asmstdcallAddr = unsafe.Pointer(funcPC(asmstdcall))
usleep2Addr = unsafe.Pointer(funcPC(usleep2))
@@ -429,6 +479,7 @@ func osinit() {
stdcall2(_SetConsoleCtrlHandler, funcPC(ctrlhandler), 1)
+ initHighResTimer()
timeBeginPeriodRetValue = osRelax(false)
ncpu = getproccount()
@@ -822,7 +873,7 @@ func mpreinit(mp *m) {
}
//go:nosplit
-func msigsave(mp *m) {
+func sigsave(p *sigset) {
}
//go:nosplit
@@ -844,9 +895,20 @@ func minit() {
var thandle uintptr
stdcall7(_DuplicateHandle, currentProcess, currentThread, currentProcess, uintptr(unsafe.Pointer(&thandle)), 0, 0, _DUPLICATE_SAME_ACCESS)
+ // Configure usleep timer, if possible.
+ var timer uintptr
+ if haveHighResTimer {
+ timer = createHighResTimer()
+ if timer == 0 {
+ print("runtime: CreateWaitableTimerEx failed; errno=", getlasterror(), "\n")
+ throw("CreateWaitableTimerEx when creating timer failed")
+ }
+ }
+
mp := getg().m
lock(&mp.threadLock)
mp.thread = thandle
+ mp.highResTimer = timer
unlock(&mp.threadLock)
// Query the true stack base from the OS. Currently we're
@@ -884,6 +946,10 @@ func unminit() {
lock(&mp.threadLock)
stdcall1(_CloseHandle, mp.thread)
mp.thread = 0
+ if mp.highResTimer != 0 {
+ stdcall1(_CloseHandle, mp.highResTimer)
+ mp.highResTimer = 0
+ }
unlock(&mp.threadLock)
}
@@ -976,9 +1042,12 @@ func stdcall7(fn stdFunction, a0, a1, a2, a3, a4, a5, a6 uintptr) uintptr {
return stdcall(fn)
}
-// in sys_windows_386.s and sys_windows_amd64.s
+// In sys_windows_386.s and sys_windows_amd64.s.
func onosstack(fn unsafe.Pointer, arg uint32)
+
+// These functions must not be called directly. They should only be called via onosstack.
func usleep2(usec uint32)
+func usleep2HighRes(usec uint32)
func switchtothread()
var usleep2Addr unsafe.Pointer
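
Note: the capability probe above can be reproduced outside the runtime. The following hypothetical, Windows-only sketch calls CreateWaitableTimerExW through package syscall and mirrors the constants from the patch (an illustration, not how the runtime itself binds the symbol):

package main

import (
	"fmt"
	"syscall"
)

const (
	_CREATE_WAITABLE_TIMER_HIGH_RESOLUTION = 0x00000002
	_SYNCHRONIZE                           = 0x00100000
	_TIMER_QUERY_STATE                     = 0x0001
	_TIMER_MODIFY_STATE                    = 0x0002
)

func main() {
	proc := syscall.NewLazyDLL("kernel32.dll").NewProc("CreateWaitableTimerExW")
	h, _, err := proc.Call(0, 0,
		_CREATE_WAITABLE_TIMER_HIGH_RESOLUTION,
		_SYNCHRONIZE|_TIMER_QUERY_STATE|_TIMER_MODIFY_STATE)
	if h == 0 {
		fmt.Println("no high resolution waitable timers:", err)
		return
	}
	syscall.CloseHandle(syscall.Handle(h))
	fmt.Println("high resolution waitable timers available")
}

Where the call fails, the runtime keeps the old usleep2 path and continues to rely on osRelax and timeBeginPeriod.
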
diff --git a/src/runtime/panic.go b/src/runtime/panic.go
index 6050a34d29..aed17d6fc6 100644
--- a/src/runtime/panic.go
+++ b/src/runtime/panic.go
@@ -421,15 +421,6 @@ func newdefer(siz int32) *_defer {
total := roundupsize(totaldefersize(uintptr(siz)))
d = (*_defer)(mallocgc(total, deferType, true))
})
- if debugCachedWork {
- // Duplicate the tail below so if there's a
- // crash in checkPut we can tell if d was just
- // allocated or came from the pool.
- d.siz = siz
- d.link = gp._defer
- gp._defer = d
- return d
- }
}
d.siz = siz
d.heap = true
diff --git a/src/runtime/pprof/mprof_test.go b/src/runtime/pprof/mprof_test.go
index f253f07def..c11a45fd69 100644
--- a/src/runtime/pprof/mprof_test.go
+++ b/src/runtime/pprof/mprof_test.go
@@ -70,7 +70,7 @@ func TestMemoryProfiler(t *testing.T) {
runtime.MemProfileRate = oldRate
}()
- // Allocate a meg to ensure that mcache.next_sample is updated to 1.
+ // Allocate a meg to ensure that mcache.nextSample is updated to 1.
for i := 0; i < 1024; i++ {
memSink = make([]byte, 1024)
}
diff --git a/src/runtime/preempt_386.s b/src/runtime/preempt_386.s
index a00ac8f385..c3a5fa1f36 100644
--- a/src/runtime/preempt_386.s
+++ b/src/runtime/preempt_386.s
@@ -5,7 +5,7 @@
TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
PUSHFL
- ADJSP $264
+ ADJSP $156
NOP SP
MOVL AX, 0(SP)
MOVL CX, 4(SP)
@@ -14,32 +14,29 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVL BP, 16(SP)
MOVL SI, 20(SP)
MOVL DI, 24(SP)
- FSAVE 28(SP)
- FLDCW runtime·controlWord64(SB)
CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1
JNE nosse
- MOVUPS X0, 136(SP)
- MOVUPS X1, 152(SP)
- MOVUPS X2, 168(SP)
- MOVUPS X3, 184(SP)
- MOVUPS X4, 200(SP)
- MOVUPS X5, 216(SP)
- MOVUPS X6, 232(SP)
- MOVUPS X7, 248(SP)
+ MOVUPS X0, 28(SP)
+ MOVUPS X1, 44(SP)
+ MOVUPS X2, 60(SP)
+ MOVUPS X3, 76(SP)
+ MOVUPS X4, 92(SP)
+ MOVUPS X5, 108(SP)
+ MOVUPS X6, 124(SP)
+ MOVUPS X7, 140(SP)
nosse:
CALL ·asyncPreempt2(SB)
CMPB internal∕cpu·X86+const_offsetX86HasSSE2(SB), $1
JNE nosse2
- MOVUPS 248(SP), X7
- MOVUPS 232(SP), X6
- MOVUPS 216(SP), X5
- MOVUPS 200(SP), X4
- MOVUPS 184(SP), X3
- MOVUPS 168(SP), X2
- MOVUPS 152(SP), X1
- MOVUPS 136(SP), X0
+ MOVUPS 140(SP), X7
+ MOVUPS 124(SP), X6
+ MOVUPS 108(SP), X5
+ MOVUPS 92(SP), X4
+ MOVUPS 76(SP), X3
+ MOVUPS 60(SP), X2
+ MOVUPS 44(SP), X1
+ MOVUPS 28(SP), X0
nosse2:
- FRSTOR 28(SP)
MOVL 24(SP), DI
MOVL 20(SP), SI
MOVL 16(SP), BP
@@ -47,6 +44,6 @@ nosse2:
MOVL 8(SP), DX
MOVL 4(SP), CX
MOVL 0(SP), AX
- ADJSP $-264
+ ADJSP $-156
POPFL
RET
diff --git a/src/runtime/preempt_arm64.s b/src/runtime/preempt_arm64.s
index d0e77659c3..36ee13282c 100644
--- a/src/runtime/preempt_arm64.s
+++ b/src/runtime/preempt_arm64.s
@@ -10,9 +10,6 @@ TEXT ·asyncPreempt(SB),NOSPLIT|NOFRAME,$0-0
MOVD R29, -8(RSP)
SUB $8, RSP, R29
#endif
- #ifdef GOOS_darwin
- MOVD R30, (RSP)
- #endif
#ifdef GOOS_ios
MOVD R30, (RSP)
#endif
diff --git a/src/runtime/print.go b/src/runtime/print.go
index e605eb34cb..64055a34cc 100644
--- a/src/runtime/print.go
+++ b/src/runtime/print.go
@@ -237,6 +237,9 @@ func printhex(v uint64) {
func printpointer(p unsafe.Pointer) {
printhex(uint64(uintptr(p)))
}
+func printuintptr(p uintptr) {
+ printhex(uint64(p))
+}
func printstring(s string) {
gwrite(bytes(s))
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index a1e2ed0680..b335e1184d 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -137,6 +137,10 @@ func main() {
mainStarted = true
if GOARCH != "wasm" { // no threads on wasm yet, so no sysmon
+ // For syscall_runtime_doAllThreadsSyscall, we record that
+ // sysmon is not yet ready for the world to be
+ // stopped.
+ atomic.Store(&sched.sysmonStarting, 1)
systemstack(func() {
newm(sysmon, nil, -1)
})
@@ -153,12 +157,22 @@ func main() {
if g.m != &m0 {
throw("runtime.main not on m0")
}
+ m0.doesPark = true
- doInit(&runtime_inittask) // must be before defer
- if nanotime() == 0 {
+ // Record when the world started.
+ // Must be before doInit for tracing init.
+ runtimeInitTime = nanotime()
+ if runtimeInitTime == 0 {
throw("nanotime returning zero")
}
+ if debug.inittrace != 0 {
+ inittrace.id = getg().goid
+ inittrace.active = true
+ }
+
+ doInit(&runtime_inittask) // Must be before defer.
+
// Defer unlock so that runtime.Goexit during init does the unlock too.
needUnlock := true
defer func() {
@@ -167,9 +181,6 @@ func main() {
}
}()
- // Record when the world started.
- runtimeInitTime = nanotime()
-
gcenable()
main_init_done = make(chan bool)
@@ -196,6 +207,10 @@ func main() {
doInit(&main_inittask)
+ // Disable init tracing after main init is done to avoid the overhead
+ // of collecting statistics in malloc and newproc.
+ inittrace.active = false
+
close(main_init_done)
needUnlock = false
@@ -583,7 +598,7 @@ func schedinit() {
typelinksinit() // uses maps, activeModules
itabsinit() // uses activeModules
- msigsave(_g_.m)
+ sigsave(&_g_.m.sigmask)
initSigmask = _g_.m.sigmask
goargs()
@@ -1216,6 +1231,21 @@ func mstartm0() {
initsig(false)
}
+// mPark causes a thread to park itself - temporarily waking for
+// fixups but otherwise waiting to be fully woken. This is the
+// only way that m's should park themselves.
+//go:nosplit
+func mPark() {
+ g := getg()
+ for {
+ notesleep(&g.m.park)
+ noteclear(&g.m.park)
+ if !mDoFixup() {
+ return
+ }
+ }
+}
+
// mexit tears down and exits the current thread.
//
// Don't call this directly to exit the thread, since it must run at
@@ -1247,7 +1277,7 @@ func mexit(osStack bool) {
sched.nmfreed++
checkdead()
unlock(&sched.lock)
- notesleep(&m.park)
+ mPark()
throw("locked m0 woke up")
}
@@ -1301,6 +1331,14 @@ found:
checkdead()
unlock(&sched.lock)
+ if GOOS == "darwin" {
+ // Make sure pendingPreemptSignals is correct when an M exits.
+ // For #41702.
+ if atomic.Load(&m.signalPending) != 0 {
+ atomic.Xadd(&pendingPreemptSignals, -1)
+ }
+ }
+
if osStack {
// Return from mstart and let the system thread
// library free the g0 stack and terminate the thread.
@@ -1406,6 +1444,127 @@ func forEachP(fn func(*p)) {
releasem(mp)
}
+// syscall_runtime_doAllThreadsSyscall serializes Go execution and
+// executes a specified fn() call on all m's.
+//
+// The boolean argument to fn() indicates whether the function's
+// return value will be consulted or not. That is, fn(true) should
+// return true if fn() succeeds, and fn(true) should return false if
+// it failed. When fn(false) is called, its return status will be
+// ignored.
+//
+// syscall_runtime_doAllThreadsSyscall first invokes fn(true) on a
+// single, coordinating, m, and only if it returns true does it go on
+// to invoke fn(false) on all of the other m's known to the process.
+//
+//go:linkname syscall_runtime_doAllThreadsSyscall syscall.runtime_doAllThreadsSyscall
+func syscall_runtime_doAllThreadsSyscall(fn func(bool) bool) {
+ if iscgo {
+ panic("doAllThreadsSyscall not supported with cgo enabled")
+ }
+ if fn == nil {
+ return
+ }
+ for atomic.Load(&sched.sysmonStarting) != 0 {
+ osyield()
+ }
+ stopTheWorldGC("doAllThreadsSyscall")
+ if atomic.Load(&newmHandoff.haveTemplateThread) != 0 {
+ // Ensure that there are no in-flight thread
+ // creations: don't want to race with allm.
+ lock(&newmHandoff.lock)
+ for !newmHandoff.waiting {
+ unlock(&newmHandoff.lock)
+ osyield()
+ lock(&newmHandoff.lock)
+ }
+ unlock(&newmHandoff.lock)
+ }
+ if netpollinited() {
+ netpollBreak()
+ }
+ _g_ := getg()
+ if raceenabled {
+ // For m's running without racectx, we loan out the
+ // racectx of this call.
+ lock(&mFixupRace.lock)
+ mFixupRace.ctx = _g_.racectx
+ unlock(&mFixupRace.lock)
+ }
+ if ok := fn(true); ok {
+ tid := _g_.m.procid
+ for mp := allm; mp != nil; mp = mp.alllink {
+ if mp.procid == tid {
+ // This m has already completed fn()
+ // call.
+ continue
+ }
+ // Be wary of mp's without procid values if
+ // they are known not to park. If they are
+ // marked as parking with a zero procid, then
+ // they will be racing with this code to be
+ // allocated a procid and we will annotate
+ // them with the need to execute the fn when
+ // they acquire a procid to run it.
+ if mp.procid == 0 && !mp.doesPark {
+ // Reaching here, we are either
+ // running Windows or cgo-linked
+ // code, neither of which is
+ // currently supported by this
+ // API.
+ }
+ // stopTheWorldGC() doesn't guarantee stopping
+ // all the threads, so we lock here to avoid
+ // the possibility of racing with mp.
+ lock(&mp.mFixup.lock)
+ mp.mFixup.fn = fn
+ if mp.doesPark {
+ // For non-service threads this will
+ // cause the wakeup to be short lived
+ // (once the mutex is unlocked). The
+ // next real wakeup will occur after
+ // startTheWorldGC() is called.
+ notewakeup(&mp.park)
+ }
+ unlock(&mp.mFixup.lock)
+ }
+ for {
+ done := true
+ for mp := allm; done && mp != nil; mp = mp.alllink {
+ if mp.procid == tid {
+ continue
+ }
+ lock(&mp.mFixup.lock)
+ done = done && (mp.mFixup.fn == nil)
+ unlock(&mp.mFixup.lock)
+ }
+ if done {
+ break
+ }
+ // If needed, force sysmon and/or newmHandoff to wake up.
+ lock(&sched.lock)
+ if atomic.Load(&sched.sysmonwait) != 0 {
+ atomic.Store(&sched.sysmonwait, 0)
+ notewakeup(&sched.sysmonnote)
+ }
+ unlock(&sched.lock)
+ lock(&newmHandoff.lock)
+ if newmHandoff.waiting {
+ newmHandoff.waiting = false
+ notewakeup(&newmHandoff.wake)
+ }
+ unlock(&newmHandoff.lock)
+ osyield()
+ }
+ }
+ if raceenabled {
+ lock(&mFixupRace.lock)
+ mFixupRace.ctx = 0
+ unlock(&mFixupRace.lock)
+ }
+ startTheWorldGC()
+}
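
Note: to make the fn(true)/fn(false) contract above concrete, here is a hypothetical callback of the shape this hook expects (applyPerThreadSetting is an assumed stand-in for a raw per-thread syscall such as setuid; it is not part of the patch):

// perThreadFixup follows the documented contract: the runtime calls it first
// as fn(true) on the coordinating M, and only if that returns true does it
// call fn(false) on every other M, ignoring those results.
func perThreadFixup(coordinating bool) bool {
	errno := applyPerThreadSetting() // assumed helper; 0 means success
	if coordinating {
		// Consulted: a failure here stops the fan-out to the other Ms.
		return errno == 0
	}
	// Ignored for non-coordinating Ms.
	return true
}
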
+
// runSafePointFn runs the safe point function, if any, for this P.
// This should be called like
//
@@ -1536,7 +1695,7 @@ func allocm(_p_ *p, fn func(), id int64) *m {
// When the callback is done with the m, it calls dropm to
// put the m back on the list.
//go:nosplit
-func needm(x byte) {
+func needm() {
if (iscgo || GOOS == "windows") && !cgoHasExtraM {
// Can happen if C/C++ code calls Go from a global ctor.
// Can also happen on Windows if a global ctor uses a
@@ -1548,6 +1707,18 @@ func needm(x byte) {
exit(1)
}
+ // Save and block signals before getting an M.
+ // The signal handler may call needm itself,
+ // and we must avoid a deadlock. Also, once g is installed,
+ // any incoming signals will try to execute,
+ // but we won't have the sigaltstack settings and other data
+ // set up appropriately until the end of minit, which will
+ // unblock the signals. This is the same dance as when
+ // starting a new m to run Go code via newosproc.
+ var sigmask sigset
+ sigsave(&sigmask)
+ sigblock()
+
// Lock extra list, take head, unlock popped list.
// nilokay=false is safe here because of the invariant above,
// that the extra list always contains or will soon contain
@@ -1565,14 +1736,8 @@ func needm(x byte) {
extraMCount--
unlockextra(mp.schedlink.ptr())
- // Save and block signals before installing g.
- // Once g is installed, any incoming signals will try to execute,
- // but we won't have the sigaltstack settings and other data
- // set up appropriately until the end of minit, which will
- // unblock the signals. This is the same dance as when
- // starting a new m to run Go code via newosproc.
- msigsave(mp)
- sigblock()
+ // Store the original signal mask for use by minit.
+ mp.sigmask = sigmask
// Install g (= m->g0) and set the stack bounds
// to match the current stack. We don't actually know
@@ -1581,8 +1746,8 @@ func needm(x byte) {
// which is more than enough for us.
setg(mp.g0)
_g_ := getg()
- _g_.stack.hi = uintptr(noescape(unsafe.Pointer(&x))) + 1024
- _g_.stack.lo = uintptr(noescape(unsafe.Pointer(&x))) - 32*1024
+ _g_.stack.hi = getcallersp() + 1024
+ _g_.stack.lo = getcallersp() - 32*1024
_g_.stackguard0 = _g_.stack.lo + _StackGuard
// Initialize this thread to use the m.
@@ -1798,6 +1963,7 @@ var newmHandoff struct {
//go:nowritebarrierrec
func newm(fn func(), _p_ *p, id int64) {
mp := allocm(_p_, fn, id)
+ mp.doesPark = (_p_ != nil)
mp.nextp.set(_p_)
mp.sigmask = initSigmask
if gp := getg(); gp != nil && gp.m != nil && (gp.m.lockedExt != 0 || gp.m.incgo) && GOOS != "plan9" {
@@ -1870,6 +2036,57 @@ func startTemplateThread() {
releasem(mp)
}
+// mFixupRace is used to temporarily borrow the race context from the
+// coordinating m during a syscall_runtime_doAllThreadsSyscall and
+// loan it out to each of the m's of the runtime so they can execute a
+// mFixup.fn in that context.
+var mFixupRace struct {
+ lock mutex
+ ctx uintptr
+}
+
+// mDoFixup runs any outstanding fixup function for the running m.
+// Returns true if a fixup was outstanding and actually executed.
+//
+//go:nosplit
+func mDoFixup() bool {
+ _g_ := getg()
+ lock(&_g_.m.mFixup.lock)
+ fn := _g_.m.mFixup.fn
+ if fn != nil {
+ if gcphase != _GCoff {
+ // We can't have a write barrier in this
+ // context since we may not have a P, but we
+ // clear fn to signal that we've executed the
+ // fixup. As long as fn is kept alive
+ // elsewhere, technically we should have no
+ // issues with the GC, but fn is likely
+ // generated in a different package altogether
+ // that may change independently. Just assert
+ // the GC is off so this lack of write barrier
+ // is more obviously safe.
+ throw("GC must be disabled to protect validity of fn value")
+ }
+ *(*uintptr)(unsafe.Pointer(&_g_.m.mFixup.fn)) = 0
+ if _g_.racectx != 0 || !raceenabled {
+ fn(false)
+ } else {
+ // temporarily acquire the context of the
+ // originator of the
+ // syscall_runtime_doAllThreadsSyscall and
+ // block others from using it for the duration
+ // of the fixup call.
+ lock(&mFixupRace.lock)
+ _g_.racectx = mFixupRace.ctx
+ fn(false)
+ _g_.racectx = 0
+ unlock(&mFixupRace.lock)
+ }
+ }
+ unlock(&_g_.m.mFixup.lock)
+ return fn != nil
+}
+
// templateThread is a thread in a known-good state that exists solely
// to start new threads in known-good states when the calling thread
// may not be in a good state.
@@ -1906,6 +2123,7 @@ func templateThread() {
noteclear(&newmHandoff.wake)
unlock(&newmHandoff.lock)
notesleep(&newmHandoff.wake)
+ mDoFixup()
}
}
@@ -1927,8 +2145,7 @@ func stopm() {
lock(&sched.lock)
mput(_g_.m)
unlock(&sched.lock)
- notesleep(&_g_.m.park)
- noteclear(&_g_.m.park)
+ mPark()
acquirep(_g_.m.nextp.ptr())
_g_.m.nextp = 0
}
@@ -2053,11 +2270,16 @@ func handoffp(_p_ *p) {
startm(_p_, false)
return
}
- if when := nobarrierWakeTime(_p_); when != 0 {
- wakeNetPoller(when)
- }
+
+ // The scheduler lock cannot be held when calling wakeNetPoller below
+ // because wakeNetPoller may call wakep which may call startm.
+ when := nobarrierWakeTime(_p_)
pidleput(_p_)
unlock(&sched.lock)
+
+ if when != 0 {
+ wakeNetPoller(when)
+ }
}
// Tries to add one more P to execute G's.
@@ -2088,12 +2310,11 @@ func stoplockedm() {
}
incidlelocked(1)
// Wait until another thread schedules lockedg again.
- notesleep(&_g_.m.park)
- noteclear(&_g_.m.park)
+ mPark()
status := readgstatus(_g_.m.lockedg.ptr())
if status&^_Gscan != _Grunnable {
- print("runtime:stoplockedm: g is not Grunnable or Gscanrunnable\n")
- dumpgstatus(_g_)
+ print("runtime:stoplockedm: lockedg (atomicstatus=", status, ") is not Grunnable or Gscanrunnable\n")
+ dumpgstatus(_g_.m.lockedg.ptr())
throw("stoplockedm: not runnable")
}
acquirep(_g_.m.nextp.ptr())
@@ -2267,31 +2488,33 @@ top:
_g_.m.spinning = true
atomic.Xadd(&sched.nmspinning, 1)
}
- for i := 0; i < 4; i++ {
+ const stealTries = 4
+ for i := 0; i < stealTries; i++ {
+ stealTimersOrRunNextG := i == stealTries-1
+
for enum := stealOrder.start(fastrand()); !enum.done(); enum.next() {
if sched.gcwaiting != 0 {
goto top
}
- stealRunNextG := i > 2 // first look for ready queues with more than 1 g
p2 := allp[enum.position()]
if _p_ == p2 {
continue
}
- if gp := runqsteal(_p_, p2, stealRunNextG); gp != nil {
- return gp, false
- }
- // Consider stealing timers from p2.
- // This call to checkTimers is the only place where
- // we hold a lock on a different P's timers.
- // Lock contention can be a problem here, so
- // initially avoid grabbing the lock if p2 is running
- // and is not marked for preemption. If p2 is running
- // and not being preempted we assume it will handle its
- // own timers.
- // If we're still looking for work after checking all
- // the P's, then go ahead and steal from an active P.
- if i > 2 || (i > 1 && shouldStealTimers(p2)) {
+ // Steal timers from p2. This call to checkTimers is the only place
+ // where we might hold a lock on a different P's timers. We do this
+ // once on the last pass before checking runnext because stealing
+ // from the other P's runnext should be the last resort, so if there
+ // are timers to steal do that first.
+ //
+ // We only check timers on one of the stealing iterations because
+ // the time stored in now doesn't change in this loop and checking
+ // the timers for each P more than once with the same value of now
+ // is probably a waste of time.
+ //
+ // TODO(prattmic): Maintain a global look-aside similar to idlepMask
+ // to avoid looking at p2 if it can't possibly have timers.
+ if stealTimersOrRunNextG {
tnow, w, ran := checkTimers(p2, now)
now = tnow
if w != 0 && (pollUntil == 0 || w < pollUntil) {
@@ -2312,6 +2535,13 @@ top:
ranTimer = true
}
}
+
+ // Don't bother to attempt to steal if p2 is idle.
+ if !idlepMask.read(enum.position()) {
+ if gp := runqsteal(_p_, p2, stealTimersOrRunNextG); gp != nil {
+ return gp, false
+ }
+ }
}
}
if ranTimer {
@@ -2361,6 +2591,9 @@ stop:
// safe-points. We don't need to snapshot the contents because
// everything up to cap(allp) is immutable.
allpSnapshot := allp
+ // Also snapshot idlepMask. Value changes are OK, but we can't allow
+ // len to change out from under us.
+ idlepMaskSnapshot := idlepMask
// return P and block
lock(&sched.lock)
@@ -2384,7 +2617,7 @@ stop:
// drop nmspinning first and then check all per-P queues again (with
// #StoreLoad memory barrier in between). If we do it the other way around,
// another thread can submit a goroutine after we've checked all run queues
- // but before we drop nmspinning; as the result nobody will unpark a thread
+ // but before we drop nmspinning; as a result nobody will unpark a thread
// to run the goroutine.
// If we discover new work below, we need to restore m.spinning as a signal
// for resetspinning to unpark a new worker thread (because there can be more
@@ -2401,8 +2634,8 @@ stop:
}
// check all runqueues once again
- for _, _p_ := range allpSnapshot {
- if !runqempty(_p_) {
+ for id, _p_ := range allpSnapshot {
+ if !idlepMaskSnapshot.read(uint32(id)) && !runqempty(_p_) {
lock(&sched.lock)
_p_ = pidleget()
unlock(&sched.lock)
@@ -2418,6 +2651,35 @@ stop:
}
}
+ // Similar to above, check for timer creation or expiry concurrently with
+ // transitioning from spinning to non-spinning. Note that we cannot use
+ // checkTimers here because it calls adjusttimers which may need to allocate
+ // memory, and that isn't allowed when we don't have an active P.
+ for _, _p_ := range allpSnapshot {
+ // This is similar to nobarrierWakeTime, but minimizes calls to
+ // nanotime.
+ if atomic.Load(&_p_.adjustTimers) > 0 {
+ if now == 0 {
+ now = nanotime()
+ }
+ pollUntil = now
+ } else {
+ w := int64(atomic.Load64(&_p_.timer0When))
+ if w != 0 && (pollUntil == 0 || w < pollUntil) {
+ pollUntil = w
+ }
+ }
+ }
+ if pollUntil != 0 {
+ if now == 0 {
+ now = nanotime()
+ }
+ delta = pollUntil - now
+ if delta < 0 {
+ delta = 0
+ }
+ }
+
// Check for idle-priority GC work again.
if gcBlackenEnabled != 0 && gcMarkWorkAvailable(nil) {
lock(&sched.lock)
@@ -2513,9 +2775,9 @@ func pollWork() bool {
return false
}
-// wakeNetPoller wakes up the thread sleeping in the network poller,
-// if there is one, and if it isn't going to wake up anyhow before
-// the when argument.
+// wakeNetPoller wakes up the thread sleeping in the network poller if it isn't
+// going to wake up before the when argument; or it wakes an idle P to service
+// timers and the network poller if there isn't one already.
func wakeNetPoller(when int64) {
if atomic.Load64(&sched.lastpoll) == 0 {
// In findrunnable we ensure that when polling the pollUntil
@@ -2526,6 +2788,10 @@ func wakeNetPoller(when int64) {
if pollerPollUntil == 0 || pollerPollUntil > when {
netpollBreak()
}
+ } else {
+ // There are no threads in the network poller, try to get
+ // one there so it can handle new timers.
+ wakep()
}
}
@@ -2757,40 +3023,40 @@ func dropg() {
// We pass now in and out to avoid extra calls of nanotime.
//go:yeswritebarrierrec
func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) {
- // If there are no timers to adjust, and the first timer on
- // the heap is not yet ready to run, then there is nothing to do.
- if atomic.Load(&pp.adjustTimers) == 0 {
- next := int64(atomic.Load64(&pp.timer0When))
- if next == 0 {
- return now, 0, false
- }
- if now == 0 {
- now = nanotime()
- }
- if now < next {
- // Next timer is not ready to run.
- // But keep going if we would clear deleted timers.
- // This corresponds to the condition below where
- // we decide whether to call clearDeletedTimers.
- if pp != getg().m.p.ptr() || int(atomic.Load(&pp.deletedTimers)) <= int(atomic.Load(&pp.numTimers)/4) {
- return now, next, false
- }
+ // If it's not yet time for the first timer, or the first adjusted
+ // timer, then there is nothing to do.
+ next := int64(atomic.Load64(&pp.timer0When))
+ nextAdj := int64(atomic.Load64(&pp.timerModifiedEarliest))
+ if next == 0 || (nextAdj != 0 && nextAdj < next) {
+ next = nextAdj
+ }
+
+ if next == 0 {
+ // No timers to run or adjust.
+ return now, 0, false
+ }
+
+ if now == 0 {
+ now = nanotime()
+ }
+ if now < next {
+ // Next timer is not ready to run, but keep going
+ // if we would clear deleted timers.
+ // This corresponds to the condition below where
+ // we decide whether to call clearDeletedTimers.
+ if pp != getg().m.p.ptr() || int(atomic.Load(&pp.deletedTimers)) <= int(atomic.Load(&pp.numTimers)/4) {
+ return now, next, false
}
}
lock(&pp.timersLock)
- adjusttimers(pp)
-
- rnow = now
if len(pp.timers) > 0 {
- if rnow == 0 {
- rnow = nanotime()
- }
+ adjusttimers(pp, now)
for len(pp.timers) > 0 {
// Note that runtimer may temporarily unlock
// pp.timersLock.
- if tw := runtimer(pp, rnow); tw != 0 {
+ if tw := runtimer(pp, now); tw != 0 {
if tw > 0 {
pollUntil = tw
}
@@ -2809,26 +3075,7 @@ func checkTimers(pp *p, now int64) (rnow, pollUntil int64, ran bool) {
unlock(&pp.timersLock)
- return rnow, pollUntil, ran
-}
-
-// shouldStealTimers reports whether we should try stealing the timers from p2.
-// We don't steal timers from a running P that is not marked for preemption,
-// on the assumption that it will run its own timers. This reduces
-// contention on the timers lock.
-func shouldStealTimers(p2 *p) bool {
- if p2.status != _Prunning {
- return true
- }
- mp := p2.m.ptr()
- if mp == nil || mp.locks > 0 {
- return false
- }
- gp := mp.curg
- if gp == nil || gp.atomicstatus != _Grunning || !gp.preempt {
- return false
- }
- return true
+ return now, pollUntil, ran
}
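The rewritten preamble of checkTimers folds two deadlines together, where zero means "no timer": the first timer on the heap (timer0When) and the earliest modified-earlier timer (timerModifiedEarliest). A small sketch of just that merge; earliestWhen is an illustrative helper name, not a runtime function:

    package main

    import "fmt"

    // earliestWhen mirrors the merge at the top of checkTimers: 0 means
    // "no timer", otherwise take the smaller of the two deadlines.
    func earliestWhen(timer0When, modifiedEarliest uint64) uint64 {
        next := timer0When
        if next == 0 || (modifiedEarliest != 0 && modifiedEarliest < next) {
            next = modifiedEarliest
        }
        return next
    }

    func main() {
        fmt.Println(earliestWhen(0, 0))    // 0: nothing to do
        fmt.Println(earliestWhen(100, 0))  // 100
        fmt.Println(earliestWhen(100, 40)) // 40: a modified timer moved earlier
        fmt.Println(earliestWhen(0, 40))   // 40
    }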
func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
@@ -2986,7 +3233,8 @@ func goexit0(gp *g) {
// Flush assist credit to the global pool. This gives
// better information to pacing if the application is
// rapidly creating and exiting goroutines.
- scanCredit := int64(gcController.assistWorkPerByte * float64(gp.gcAssistBytes))
+ assistWorkPerByte := float64frombits(atomic.Load64(&gcController.assistWorkPerByte))
+ scanCredit := int64(assistWorkPerByte * float64(gp.gcAssistBytes))
atomic.Xaddint64(&gcController.bgScanCredit, scanCredit)
gp.gcAssistBytes = 0
}
@@ -3434,7 +3682,7 @@ func beforefork() {
// a signal handler before exec if a signal is sent to the process
// group. See issue #18600.
gp.m.locks++
- msigsave(gp.m)
+ sigsave(&gp.m.sigmask)
sigblock()
// This function is called before fork in syscall package.
@@ -3500,11 +3748,24 @@ func syscall_runtime_AfterForkInChild() {
inForkedChild = false
}
+// pendingPreemptSignals is the number of preemption signals
+// that have been sent but not received. This is only used on Darwin.
+// For #41702.
+var pendingPreemptSignals uint32
+
// Called from syscall package before Exec.
//go:linkname syscall_runtime_BeforeExec syscall.runtime_BeforeExec
func syscall_runtime_BeforeExec() {
// Prevent thread creation during exec.
execLock.lock()
+
+ // On Darwin, wait for all pending preemption signals to
+ // be received. See issue #41702.
+ if GOOS == "darwin" {
+ for int32(atomic.Load(&pendingPreemptSignals)) > 0 {
+ osyield()
+ }
+ }
}
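The Darwin-only guard above is a counter-drain handshake: each preemption signal sent increments pendingPreemptSignals, the receiving side decrements it, and BeforeExec spins with osyield until the count reaches zero. A user-level sketch of the same shape (pending, send, receive, and waitDrained are illustrative names):

    package main

    import (
        "fmt"
        "runtime"
        "sync/atomic"
    )

    var pending int32 // models pendingPreemptSignals

    func send()    { atomic.AddInt32(&pending, 1) }  // before delivering the signal
    func receive() { atomic.AddInt32(&pending, -1) } // in the handler's ack path

    func waitDrained() { // what BeforeExec does on darwin
        for atomic.LoadInt32(&pending) > 0 {
            runtime.Gosched() // the runtime uses osyield; Gosched is the closest user-level analogue
        }
    }

    func main() {
        send()
        go receive()
        waitDrained()
        fmt.Println("no preemption signals in flight")
    }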
// Called from syscall package after Exec.
@@ -4008,7 +4269,7 @@ func sigprof(pc, sp, lr uintptr, gp *g, mp *m) {
// First, it may be that the g switch has no PC update, because the SP
// either corresponds to a user g throughout (as in asmcgocall)
// or because it has been arranged to look like a user g frame
- // (as in cgocallback_gofunc). In this case, since the entire
+ // (as in cgocallback). In this case, since the entire
// transition is a g+SP update, a partial transition updating just one of
// those will be detected by the stack bounds check.
//
@@ -4367,6 +4628,8 @@ func procresize(nprocs int32) *p {
}
sched.procresizetime = now
+ maskWords := (nprocs + 31) / 32
+
// Grow allp if necessary.
if nprocs > int32(len(allp)) {
// Synchronize with retake, which could be running
@@ -4381,6 +4644,15 @@ func procresize(nprocs int32) *p {
copy(nallp, allp[:cap(allp)])
allp = nallp
}
+
+ if maskWords <= int32(cap(idlepMask)) {
+ idlepMask = idlepMask[:maskWords]
+ } else {
+ nidlepMask := make([]uint32, maskWords)
+ // No need to copy beyond len, old Ps are irrelevant.
+ copy(nidlepMask, idlepMask)
+ idlepMask = nidlepMask
+ }
unlock(&allpLock)
}
@@ -4439,6 +4711,7 @@ func procresize(nprocs int32) *p {
if int32(len(allp)) != nprocs {
lock(&allpLock)
allp = allp[:nprocs]
+ idlepMask = idlepMask[:maskWords]
unlock(&allpLock)
}
@@ -4660,9 +4933,14 @@ func sysmon() {
checkdead()
unlock(&sched.lock)
+ // For syscall_runtime_doAllThreadsSyscall, sysmon is
+ // sufficiently up and running to participate in fixups.
+ atomic.Store(&sched.sysmonStarting, 0)
+
lasttrace := int64(0)
idle := 0 // how many cycles in succession we have not woken somebody up
delay := uint32(0)
+
for {
if idle == 0 { // start with 20us sleep...
delay = 20
@@ -4673,11 +4951,29 @@ func sysmon() {
delay = 10 * 1000
}
usleep(delay)
+ mDoFixup()
+
+ // sysmon should not enter deep sleep if schedtrace is enabled so that
+ // it can print that information at the right time.
+ //
+ // It should also not enter deep sleep if there are any active P's so
+ // that it can retake P's from syscalls, preempt long running G's, and
+ // poll the network if all P's are busy for long stretches.
+ //
+ // It should wake up from deep sleep if any P's become active either due
+ // to exiting a syscall or waking up due to a timer expiring so that it
+ // can resume performing those duties. If it wakes from a syscall it
+ // resets idle and delay as a bet that since it had retaken a P from a
+ // syscall before, it may need to do it again shortly after the
+ // application starts work again. It does not reset idle when waking
+ // from a timer to avoid adding system load to applications that spend
+ // most of their time sleeping.
now := nanotime()
- next, _ := timeSleepUntil()
if debug.schedtrace <= 0 && (sched.gcwaiting != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs)) {
lock(&sched.lock)
if atomic.Load(&sched.gcwaiting) != 0 || atomic.Load(&sched.npidle) == uint32(gomaxprocs) {
+ syscallWake := false
+ next, _ := timeSleepUntil()
if next > now {
atomic.Store(&sched.sysmonwait, 1)
unlock(&sched.lock)
@@ -4691,32 +4987,27 @@ func sysmon() {
if shouldRelax {
osRelax(true)
}
- notetsleep(&sched.sysmonnote, sleep)
+ syscallWake = notetsleep(&sched.sysmonnote, sleep)
+ mDoFixup()
if shouldRelax {
osRelax(false)
}
- now = nanotime()
- next, _ = timeSleepUntil()
lock(&sched.lock)
atomic.Store(&sched.sysmonwait, 0)
noteclear(&sched.sysmonnote)
}
- idle = 0
- delay = 20
+ if syscallWake {
+ idle = 0
+ delay = 20
+ }
}
unlock(&sched.lock)
}
+
lock(&sched.sysmonlock)
- {
- // If we spent a long time blocked on sysmonlock
- // then we want to update now and next since it's
- // likely stale.
- now1 := nanotime()
- if now1-now > 50*1000 /* 50µs */ {
- next, _ = timeSleepUntil()
- }
- now = now1
- }
+ // Update now in case we blocked on sysmonnote or spent a long time
+ // blocked on schedlock or sysmonlock above.
+ now = nanotime()
// trigger libc interceptors if needed
if *cgo_yield != nil {
@@ -4740,12 +5031,7 @@ func sysmon() {
incidlelocked(1)
}
}
- if next < now {
- // There are timers that should have already run,
- // perhaps because there is an unpreemptible P.
- // Try to start an M to run them.
- startm(nil, false)
- }
+ mDoFixup()
if atomic.Load(&scavenge.sysmonWake) != 0 {
// Kick the scavenger awake if someone requested it.
wakeScavenger()
@@ -5122,8 +5408,46 @@ func globrunqget(_p_ *p, max int32) *g {
return gp
}
-// Put p to on _Pidle list.
+// pIdleMask is a bitmap of Ps in the _Pidle list, one bit per P.
+type pIdleMask []uint32
+
+// read returns true if P id is in the _Pidle list, and thus cannot have work.
+func (p pIdleMask) read(id uint32) bool {
+ word := id / 32
+ mask := uint32(1) << (id % 32)
+ return (atomic.Load(&p[word]) & mask) != 0
+}
+
+// set sets P id as idle in mask.
+//
+// Must be called only for a P owned by the caller. In order to maintain
+// consistency, a P going idle must set the idle mask simultaneously with updates
+// to the idle P list under the sched.lock, otherwise a racing pidleget may
+// clear the mask before pidleput sets the mask, corrupting the bitmap.
+//
+// N.B., procresize takes ownership of all Ps in stopTheWorldWithSema.
+func (p pIdleMask) set(id int32) {
+ word := id / 32
+ mask := uint32(1) << (id % 32)
+ atomic.Or(&p[word], mask)
+}
+
+// clear sets P id as non-idle in mask.
+//
+// See comment on set.
+func (p pIdleMask) clear(id int32) {
+ word := id / 32
+ mask := uint32(1) << (id % 32)
+ atomic.And(&p[word], ^mask)
+}
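The three methods above all rely on the same word/bit arithmetic: P id maps to bit id%32 of word id/32. A standalone sketch of that indexing; it emulates the runtime's internal atomic Or/And with CAS loops because sync/atomic (as of this Go version) has no Or/And on uint32:

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    // idleMask mirrors pIdleMask's layout: bit (id % 32) of word (id / 32)
    // records whether P id is idle.
    type idleMask []uint32

    func (m idleMask) set(id int32) {
        w, bit := id/32, uint32(1)<<(id%32)
        for {
            old := atomic.LoadUint32(&m[w])
            if atomic.CompareAndSwapUint32(&m[w], old, old|bit) {
                return
            }
        }
    }

    func (m idleMask) clear(id int32) {
        w, bit := id/32, uint32(1)<<(id%32)
        for {
            old := atomic.LoadUint32(&m[w])
            if atomic.CompareAndSwapUint32(&m[w], old, old&^bit) {
                return
            }
        }
    }

    func (m idleMask) read(id uint32) bool {
        return atomic.LoadUint32(&m[id/32])&(uint32(1)<<(id%32)) != 0
    }

    func main() {
        m := make(idleMask, (64+31)/32) // room for 64 Ps
        m.set(37)
        fmt.Println(m.read(37), m.read(5)) // true false
        m.clear(37)
        fmt.Println(m.read(37)) // false
    }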
+
+// pidleput puts p on the _Pidle list.
+//
+// This releases ownership of p. Once sched.lock is released it is no longer
+// safe to use p.
+//
// sched.lock must be held.
+//
// May run during STW, so write barriers are not allowed.
//go:nowritebarrierrec
func pidleput(_p_ *p) {
@@ -5132,13 +5456,16 @@ func pidleput(_p_ *p) {
if !runqempty(_p_) {
throw("pidleput: P has non-empty run queue")
}
+ idlepMask.set(_p_.id)
_p_.link = sched.pidle
sched.pidle.set(_p_)
atomic.Xadd(&sched.npidle, 1) // TODO: fast atomic
}
-// Try get a p from _Pidle list.
+// pidleget tries to get a p from the _Pidle list, acquiring ownership.
+//
// sched.lock must be held.
+//
// May run during STW, so write barriers are not allowed.
//go:nowritebarrierrec
func pidleget() *p {
@@ -5146,6 +5473,7 @@ func pidleget() *p {
_p_ := sched.pidle.ptr()
if _p_ != nil {
+ idlepMask.clear(_p_.id)
sched.pidle = _p_.link
atomic.Xadd(&sched.npidle, -1) // TODO: fast atomic
}
@@ -5665,6 +5993,17 @@ type initTask struct {
// followed by nfns pcs, one per init function to run
}
+// inittrace stores statistics for init functions which are
+// updated by malloc and newproc when active is true.
+var inittrace tracestat
+
+type tracestat struct {
+ active bool // init tracing activation status
+ id int64 // init goroutine id
+ allocs uint64 // heap allocations
+ bytes uint64 // heap allocated bytes
+}
+
func doInit(t *initTask) {
switch t.state {
case 2: // fully initialized
@@ -5673,16 +6012,52 @@ func doInit(t *initTask) {
throw("recursive call during initialization - linker skew")
default: // not initialized yet
t.state = 1 // initialization in progress
+
for i := uintptr(0); i < t.ndeps; i++ {
p := add(unsafe.Pointer(t), (3+i)*sys.PtrSize)
t2 := *(**initTask)(p)
doInit(t2)
}
+
+ if t.nfns == 0 {
+ t.state = 2 // initialization done
+ return
+ }
+
+ var (
+ start int64
+ before tracestat
+ )
+
+ if inittrace.active {
+ start = nanotime()
+ // Load stats non-atomically since inittrace is updated only by this init goroutine.
+ before = inittrace
+ }
+
+ firstFunc := add(unsafe.Pointer(t), (3+t.ndeps)*sys.PtrSize)
for i := uintptr(0); i < t.nfns; i++ {
- p := add(unsafe.Pointer(t), (3+t.ndeps+i)*sys.PtrSize)
+ p := add(firstFunc, i*sys.PtrSize)
f := *(*func())(unsafe.Pointer(&p))
f()
}
+
+ if inittrace.active {
+ end := nanotime()
+ // Load stats non-atomically since inittrace is updated only by this init goroutine.
+ after := inittrace
+
+ pkg := funcpkgpath(findfunc(funcPC(firstFunc)))
+
+ var sbuf [24]byte
+ print("init ", pkg, " @")
+ print(string(fmtNSAsMS(sbuf[:], uint64(start-runtimeInitTime))), " ms, ")
+ print(string(fmtNSAsMS(sbuf[:], uint64(end-start))), " ms clock, ")
+ print(string(itoa(sbuf[:], after.bytes-before.bytes)), " bytes, ")
+ print(string(itoa(sbuf[:], after.allocs-before.allocs)), " allocs")
+ print("\n")
+ }
+
t.state = 2 // initialization done
}
}
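Judging from the prints above, each traced package should emit a line of the form "init <pkg> @<start> ms, <clock> ms clock, <bytes> bytes, <allocs> allocs" once the inittrace GODEBUG knob (wired up in the runtime1.go hunk below) is set. A tiny program to try it; the image/png import is only there to pull in a package with a non-trivial init:

    // Run with:  GODEBUG=inittrace=1 go run main.go
    package main

    import (
        "fmt"

        _ "image/png" // registers the PNG decoder in its init, so it shows up in the trace
    )

    var buf []byte

    func init() {
        buf = make([]byte, 1<<20) // an allocation that should be visible in the bytes/allocs columns
    }

    func main() {
        fmt.Println(len(buf))
    }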
diff --git a/src/runtime/race/README b/src/runtime/race/README
index 34485f0fb2..b36d82ccfd 100644
--- a/src/runtime/race/README
+++ b/src/runtime/race/README
@@ -4,10 +4,10 @@ the LLVM project (https://github.com/llvm/llvm-project/tree/master/compiler-rt).
To update the .syso files use golang.org/x/build/cmd/racebuild.
-race_darwin_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408.
-race_freebsd_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408.
-race_linux_amd64.syso built with LLVM 6c75db8b4bc59eace18143ce086419d37da24746 and Go 7388956b76ce15a11346cebefcf6193db044caaf.
-race_linux_ppc64le.syso built with LLVM 6c75db8b4bc59eace18143ce086419d37da24746 and Go 7388956b76ce15a11346cebefcf6193db044caaf.
-race_netbsd_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408.
-race_windows_amd64.syso built with LLVM 3496d6e4bea9cb99cb382939b7e79a50a3b863a5 and Go 553e003414d3aa90cc39830ee22f08453d9f3408.
-race_linux_arm64.syso built with LLVM 6c75db8b4bc59eace18143ce086419d37da24746 and Go 7388956b76ce15a11346cebefcf6193db044caaf.
+race_darwin_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b.
+race_freebsd_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b.
+race_linux_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b.
+race_linux_ppc64le.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b.
+race_netbsd_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b.
+race_windows_amd64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b.
+race_linux_arm64.syso built with LLVM 89f7ccea6f6488c443655880229c54db1f180153 and Go f62d3202bf9dbb3a00ad2a2c63ff4fa4188c5d3b.
diff --git a/src/runtime/race/output_test.go b/src/runtime/race/output_test.go
index d3e7762175..b4b8936c7c 100644
--- a/src/runtime/race/output_test.go
+++ b/src/runtime/race/output_test.go
@@ -309,7 +309,7 @@ Read at 0x[0-9,a-f]+ by main goroutine:
Previous write at 0x[0-9,a-f]+ by goroutine [0-9]:
main\.goCallback\(\)
.*/main\.go:27 \+0x[0-9,a-f]+
- main._cgoexpwrap_[0-9a-z]+_goCallback\(\)
+ _cgoexp_[0-9a-z]+_goCallback\(\)
.*_cgo_gotypes\.go:[0-9]+ \+0x[0-9,a-f]+
Goroutine [0-9] \(running\) created at:
diff --git a/src/runtime/race/race_darwin_amd64.syso b/src/runtime/race/race_darwin_amd64.syso
index d03a593f5a..3f95ecc8ee 100644
--- a/src/runtime/race/race_darwin_amd64.syso
+++ b/src/runtime/race/race_darwin_amd64.syso
Binary files differ
diff --git a/src/runtime/race/race_freebsd_amd64.syso b/src/runtime/race/race_freebsd_amd64.syso
index 573591c56f..2a5b46f4ce 100644
--- a/src/runtime/race/race_freebsd_amd64.syso
+++ b/src/runtime/race/race_freebsd_amd64.syso
Binary files differ
diff --git a/src/runtime/race/race_linux_amd64.syso b/src/runtime/race/race_linux_amd64.syso
index d31f85df56..e00398c964 100644
--- a/src/runtime/race/race_linux_amd64.syso
+++ b/src/runtime/race/race_linux_amd64.syso
Binary files differ
diff --git a/src/runtime/race/race_linux_arm64.syso b/src/runtime/race/race_linux_arm64.syso
index 7c74171b0f..9dae738700 100644
--- a/src/runtime/race/race_linux_arm64.syso
+++ b/src/runtime/race/race_linux_arm64.syso
Binary files differ
diff --git a/src/runtime/race/race_linux_ppc64le.syso b/src/runtime/race/race_linux_ppc64le.syso
index a3c72bec55..b562656d56 100644
--- a/src/runtime/race/race_linux_ppc64le.syso
+++ b/src/runtime/race/race_linux_ppc64le.syso
Binary files differ
diff --git a/src/runtime/race/race_netbsd_amd64.syso b/src/runtime/race/race_netbsd_amd64.syso
index 54e276bcff..11af16f046 100644
--- a/src/runtime/race/race_netbsd_amd64.syso
+++ b/src/runtime/race/race_netbsd_amd64.syso
Binary files differ
diff --git a/src/runtime/race/race_windows_amd64.syso b/src/runtime/race/race_windows_amd64.syso
index abaf42649f..9fbf9b4391 100644
--- a/src/runtime/race/race_windows_amd64.syso
+++ b/src/runtime/race/race_windows_amd64.syso
Binary files differ
diff --git a/src/runtime/rt0_darwin_arm64.s b/src/runtime/rt0_darwin_arm64.s
index e3972f4924..0040361215 100644
--- a/src/runtime/rt0_darwin_arm64.s
+++ b/src/runtime/rt0_darwin_arm64.s
@@ -4,11 +4,14 @@
#include "textflag.h"
-// No need for _rt0_arm64_darwin as darwin/arm64 only
-// supports external linking.
TEXT _rt0_arm64_darwin(SB),NOSPLIT|NOFRAME,$0
- MOVD $42, R0
- BL libc_exit(SB)
+ MOVD $runtime·rt0_go(SB), R2
+ BL (R2)
+exit:
+ MOVD $0, R0
+ MOVD $1, R16 // sys_exit
+ SVC $0x80
+ B exit
// When linking with -buildmode=c-archive or -buildmode=c-shared,
// this symbol is called from a global initialization function.
@@ -86,11 +89,6 @@ GLOBL _rt0_arm64_darwin_lib_argc<>(SB),NOPTR, $8
DATA _rt0_arm64_darwin_lib_argv<>(SB)/8, $0
GLOBL _rt0_arm64_darwin_lib_argv<>(SB),NOPTR, $8
+// external linking entry point.
TEXT main(SB),NOSPLIT|NOFRAME,$0
- MOVD $runtime·rt0_go(SB), R2
- BL (R2)
-exit:
- MOVD $0, R0
- MOVD $1, R16 // sys_exit
- SVC $0x80
- B exit
+ JMP _rt0_arm64_darwin(SB)
diff --git a/src/runtime/rt0_ios_amd64.s b/src/runtime/rt0_ios_amd64.s
new file mode 100644
index 0000000000..c6990324f4
--- /dev/null
+++ b/src/runtime/rt0_ios_amd64.s
@@ -0,0 +1,14 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// internal linking executable entry point.
+// ios/amd64 only supports external linking.
+TEXT _rt0_amd64_ios(SB),NOSPLIT|NOFRAME,$0
+ UNDEF
+
+// library entry point.
+TEXT _rt0_amd64_ios_lib(SB),NOSPLIT|NOFRAME,$0
+ JMP _rt0_amd64_darwin_lib(SB)
diff --git a/src/runtime/rt0_ios_arm64.s b/src/runtime/rt0_ios_arm64.s
new file mode 100644
index 0000000000..dcc83656e2
--- /dev/null
+++ b/src/runtime/rt0_ios_arm64.s
@@ -0,0 +1,14 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "textflag.h"
+
+// internal linking executable entry point.
+// ios/arm64 only supports external linking.
+TEXT _rt0_arm64_ios(SB),NOSPLIT|NOFRAME,$0
+ UNDEF
+
+// library entry point.
+TEXT _rt0_arm64_ios_lib(SB),NOSPLIT|NOFRAME,$0
+ JMP _rt0_arm64_darwin_lib(SB)
diff --git a/src/runtime/runtime1.go b/src/runtime/runtime1.go
index 7c893aa25c..0f182ac58e 100644
--- a/src/runtime/runtime1.go
+++ b/src/runtime/runtime1.go
@@ -300,7 +300,6 @@ type dbgVar struct {
// existing int var for that value, which may
// already have an initial value.
var debug struct {
- allocfreetrace int32
cgocheck int32
clobberfree int32
efence int32
@@ -311,13 +310,20 @@ var debug struct {
gctrace int32
invalidptr int32
madvdontneed int32 // for Linux; issue 28466
- sbrk int32
scavenge int32
scavtrace int32
scheddetail int32
schedtrace int32
tracebackancestors int32
asyncpreemptoff int32
+
+ // debug.malloc is used as a combined debug check
+ // in the malloc function and should be set
+ // if any of the below debug options is != 0.
+ malloc bool
+ allocfreetrace int32
+ inittrace int32
+ sbrk int32
}
var dbgvars = []dbgVar{
@@ -339,6 +345,7 @@ var dbgvars = []dbgVar{
{"schedtrace", &debug.schedtrace},
{"tracebackancestors", &debug.tracebackancestors},
{"asyncpreemptoff", &debug.asyncpreemptoff},
+ {"inittrace", &debug.inittrace},
}
func parsedebugvars() {
@@ -378,6 +385,8 @@ func parsedebugvars() {
}
}
+ debug.malloc = (debug.allocfreetrace | debug.inittrace | debug.sbrk) != 0
+
setTraceback(gogetenv("GOTRACEBACK"))
traceback_env = traceback_cache
}
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go
index 519872b8e2..a2e4411c7d 100644
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -387,14 +387,6 @@ type libcall struct {
err uintptr // error number
}
-// describes how to handle callback
-type wincallbackcontext struct {
- gobody unsafe.Pointer // go function to call
- argsize uintptr // callback arguments size (in bytes)
- restorestack uintptr // adjust stack on return by (in bytes) (386 only)
- cleanstack bool
-}
-
// Stack describes a Go execution stack.
// The bounds of the stack are exactly [lo, hi),
// with no implicit data structures on either side.
@@ -528,6 +520,7 @@ type m struct {
ncgo int32 // number of cgo calls currently in progress
cgoCallersUse uint32 // if non-zero, cgoCallers in use temporarily
cgoCallers *cgoCallers // cgo traceback if crashing in cgo call
+ doesPark bool // non-P running threads: sysmon and newmHandoff never use .park
park note
alllink *m // on allm
schedlink muintptr
@@ -544,6 +537,13 @@ type m struct {
syscalltick uint32
freelink *m // on sched.freem
+ // mFixup is used to synchronize OS related m state (credentials etc)
+ // use mutex to access.
+ mFixup struct {
+ lock mutex
+ fn func(bool) bool
+ }
+
// these are here because they are too large to be on the stack
// of low-level NOSPLIT functions.
libcall libcall
@@ -646,6 +646,13 @@ type p struct {
// This is 0 if the timer heap is empty.
timer0When uint64
+ // The earliest known nextwhen field of a timer with
+ // timerModifiedEarlier status. Because the timer may have been
+ // modified again, there need not be any timer with this value.
+ // This is updated using atomic functions.
+ // This is 0 if the value is unknown.
+ timerModifiedEarliest uint64
+
// Per-P GC state
gcAssistTime int64 // Nanoseconds in assistAlloc
gcFractionalMarkTime int64 // Nanoseconds in fractional mark worker (atomic)
@@ -768,6 +775,10 @@ type schedt struct {
sysmonwait uint32
sysmonnote note
+ // While true, sysmon is not ready for mFixup calls.
+ // Accessed atomically.
+ sysmonStarting uint32
+
// safepointFn should be called on each P at the next GC
// safepoint if p.runSafePointFn is set.
safePointFn func(*p)
@@ -1035,14 +1046,22 @@ func (w waitReason) String() string {
var (
allglen uintptr
allm *m
- allp []*p // len(allp) == gomaxprocs; may change at safe points, otherwise immutable
- allpLock mutex // Protects P-less reads of allp and all writes
gomaxprocs int32
ncpu int32
forcegc forcegcstate
sched schedt
newprocs int32
+ // allpLock protects P-less reads and size changes of allp and
+ // idlepMask, and all writes to allp.
+ allpLock mutex
+ // len(allp) == gomaxprocs; may change at safe points, otherwise
+ // immutable.
+ allp []*p
+ // Bitmask of Ps in _Pidle list, one bit per P. Reads and writes must
+ // be atomic. Length may change at safe points.
+ idlepMask pIdleMask
+
// Information about what cpu features are available.
// Packages outside the runtime should not use these
// as they are not an external api.
diff --git a/src/runtime/signal_unix.go b/src/runtime/signal_unix.go
index a3d6f34c88..bf4a319b37 100644
--- a/src/runtime/signal_unix.go
+++ b/src/runtime/signal_unix.go
@@ -335,6 +335,10 @@ func doSigPreempt(gp *g, ctxt *sigctxt) {
// Acknowledge the preemption.
atomic.Xadd(&gp.m.preemptGen, 1)
atomic.Store(&gp.m.signalPending, 0)
+
+ if GOOS == "darwin" {
+ atomic.Xadd(&pendingPreemptSignals, -1)
+ }
}
const preemptMSupported = true
@@ -356,7 +360,18 @@ func preemptM(mp *m) {
// required).
return
}
+
+ // On Darwin, don't try to preempt threads during exec.
+ // Issue #41702.
+ if GOOS == "darwin" {
+ execLock.rlock()
+ }
+
if atomic.Cas(&mp.signalPending, 0, 1) {
+ if GOOS == "darwin" {
+ atomic.Xadd(&pendingPreemptSignals, 1)
+ }
+
// If multiple threads are preempting the same M, they may send many
// signals to the same M such that it hardly makes progress, causing a
// live-lock problem. Apparently this could happen on darwin. See
@@ -364,6 +379,10 @@ func preemptM(mp *m) {
// Only send a signal if there isn't already one pending.
signalM(mp, sigPreempt)
}
+
+ if GOOS == "darwin" {
+ execLock.runlock()
+ }
}
// sigFetchG fetches the value of G safely when running in a signal handler.
@@ -424,6 +443,9 @@ func sigtrampgo(sig uint32, info *siginfo, ctx unsafe.Pointer) {
// no non-Go signal handler for sigPreempt.
// The default behavior for sigPreempt is to ignore
// the signal, so badsignal will be a no-op anyway.
+ if GOOS == "darwin" {
+ atomic.Xadd(&pendingPreemptSignals, -1)
+ }
return
}
c.fixsigcode(sig)
@@ -482,14 +504,14 @@ func adjustSignalStack(sig uint32, mp *m, gsigStack *gsignalStack) bool {
sigaltstack(nil, &st)
if st.ss_flags&_SS_DISABLE != 0 {
setg(nil)
- needm(0)
+ needm()
noSignalStack(sig)
dropm()
}
stsp := uintptr(unsafe.Pointer(st.ss_sp))
if sp < stsp || sp >= stsp+st.ss_size {
setg(nil)
- needm(0)
+ needm()
sigNotOnStack(sig)
dropm()
}
@@ -929,7 +951,7 @@ func badsignal(sig uintptr, c *sigctxt) {
exit(2)
*(*uintptr)(unsafe.Pointer(uintptr(123))) = 2
}
- needm(0)
+ needm()
if !sigsend(uint32(sig)) {
// A foreign thread received the signal sig, and the
// Go code does not want to handle it.
@@ -1009,15 +1031,15 @@ func sigfwdgo(sig uint32, info *siginfo, ctx unsafe.Pointer) bool {
return true
}
-// msigsave saves the current thread's signal mask into mp.sigmask.
+// sigsave saves the current thread's signal mask into *p.
// This is used to preserve the non-Go signal mask when a non-Go
// thread calls a Go function.
// This is nosplit and nowritebarrierrec because it is called by needm
// which may be called on a non-Go thread with no g available.
//go:nosplit
//go:nowritebarrierrec
-func msigsave(mp *m) {
- sigprocmask(_SIG_SETMASK, nil, &mp.sigmask)
+func sigsave(p *sigset) {
+ sigprocmask(_SIG_SETMASK, nil, p)
}
// msigrestore sets the current thread's signal mask to sigmask.
@@ -1089,7 +1111,7 @@ func minitSignalStack() {
// thread's signal mask. When this is called all signals have been
// blocked for the thread. This starts with m.sigmask, which was set
// either from initSigmask for a newly created thread or by calling
-// msigsave if this is a non-Go thread calling a Go function. It
+// sigsave if this is a non-Go thread calling a Go function. It
// removes all essential signals from the mask, thus causing those
// signals to not be blocked. Then it sets the thread's signal mask.
// After this is called the thread can receive signals.
diff --git a/src/runtime/sigtab_linux_generic.go b/src/runtime/sigtab_linux_generic.go
index b26040b803..38d686544f 100644
--- a/src/runtime/sigtab_linux_generic.go
+++ b/src/runtime/sigtab_linux_generic.go
@@ -45,7 +45,7 @@ var sigtable = [...]sigTabT{
/* 31 */ {_SigThrow, "SIGSYS: bad system call"},
/* 32 */ {_SigSetStack + _SigUnblock, "signal 32"}, /* SIGCANCEL; see issue 6997 */
/* 33 */ {_SigSetStack + _SigUnblock, "signal 33"}, /* SIGSETXID; see issues 3871, 9400, 12498 */
- /* 34 */ {_SigNotify, "signal 34"},
+ /* 34 */ {_SigSetStack + _SigUnblock, "signal 34"}, /* musl SIGSYNCCALL; see issue 39343 */
/* 35 */ {_SigNotify, "signal 35"},
/* 36 */ {_SigNotify, "signal 36"},
/* 37 */ {_SigNotify, "signal 37"},
diff --git a/src/runtime/sigtab_linux_mipsx.go b/src/runtime/sigtab_linux_mipsx.go
index 81dd2314c5..51ef470ce7 100644
--- a/src/runtime/sigtab_linux_mipsx.go
+++ b/src/runtime/sigtab_linux_mipsx.go
@@ -42,7 +42,7 @@ var sigtable = [...]sigTabT{
/* 31 */ {_SigNotify, "SIGXFSZ: file size limit exceeded"},
/* 32 */ {_SigSetStack + _SigUnblock, "signal 32"}, /* SIGCANCEL; see issue 6997 */
/* 33 */ {_SigSetStack + _SigUnblock, "signal 33"}, /* SIGSETXID; see issues 3871, 9400, 12498 */
- /* 34 */ {_SigNotify, "signal 34"},
+ /* 34 */ {_SigSetStack + _SigUnblock, "signal 34"}, /* musl SIGSYNCCALL; see issue 39343 */
/* 35 */ {_SigNotify, "signal 35"},
/* 36 */ {_SigNotify, "signal 36"},
/* 37 */ {_SigNotify, "signal 37"},
diff --git a/src/runtime/stack.go b/src/runtime/stack.go
index 3802cd049e..7b9dce5393 100644
--- a/src/runtime/stack.go
+++ b/src/runtime/stack.go
@@ -66,7 +66,7 @@ const (
// to each stack below the usual guard area for OS-specific
// purposes like signal handling. Used on Windows, Plan 9,
// and iOS because they do not use a separate stack.
- _StackSystem = sys.GoosWindows*512*sys.PtrSize + sys.GoosPlan9*512 + (sys.GoosDarwin+sys.GoosIos)*sys.GoarchArm64*1024
+ _StackSystem = sys.GoosWindows*512*sys.PtrSize + sys.GoosPlan9*512 + sys.GoosIos*sys.GoarchArm64*1024
// The minimum size of stack used by Go code
_StackMin = 2048
@@ -187,7 +187,7 @@ func stackpoolalloc(order uint8) gclinkptr {
lockWithRankMayAcquire(&mheap_.lock, lockRankMheap)
if s == nil {
// no free stacks. Allocate another span worth.
- s = mheap_.allocManual(_StackCacheSize>>_PageShift, &memstats.stacks_inuse)
+ s = mheap_.allocManual(_StackCacheSize>>_PageShift, spanAllocStack)
if s == nil {
throw("out of memory")
}
@@ -251,7 +251,7 @@ func stackpoolfree(x gclinkptr, order uint8) {
stackpool[order].item.span.remove(s)
s.manualFreeList = 0
osStackFree(s)
- mheap_.freeManual(s, &memstats.stacks_inuse)
+ mheap_.freeManual(s, spanAllocStack)
}
}
@@ -396,7 +396,7 @@ func stackalloc(n uint32) stack {
if s == nil {
// Allocate a new stack from the heap.
- s = mheap_.allocManual(npage, &memstats.stacks_inuse)
+ s = mheap_.allocManual(npage, spanAllocStack)
if s == nil {
throw("out of memory")
}
@@ -480,7 +480,7 @@ func stackfree(stk stack) {
// Free the stack immediately if we're
// sweeping.
osStackFree(s)
- mheap_.freeManual(s, &memstats.stacks_inuse)
+ mheap_.freeManual(s, spanAllocStack)
} else {
// If the GC is running, we can't return a
// stack span to the heap because it could be
@@ -1193,7 +1193,7 @@ func freeStackSpans() {
list.remove(s)
s.manualFreeList = 0
osStackFree(s)
- mheap_.freeManual(s, &memstats.stacks_inuse)
+ mheap_.freeManual(s, spanAllocStack)
}
s = next
}
@@ -1207,7 +1207,7 @@ func freeStackSpans() {
next := s.next
stackLarge.free[i].remove(s)
osStackFree(s)
- mheap_.freeManual(s, &memstats.stacks_inuse)
+ mheap_.freeManual(s, spanAllocStack)
s = next
}
}
diff --git a/src/runtime/stubs.go b/src/runtime/stubs.go
index b891a12fdd..d77cb4d460 100644
--- a/src/runtime/stubs.go
+++ b/src/runtime/stubs.go
@@ -130,6 +130,9 @@ func fastrandn(n uint32) uint32 {
//go:linkname sync_fastrand sync.fastrand
func sync_fastrand() uint32 { return fastrand() }
+//go:linkname net_fastrand net.fastrand
+func net_fastrand() uint32 { return fastrand() }
+
// in internal/bytealg/equal_*.s
//go:noescape
func memequal(a, b unsafe.Pointer, size uintptr) bool
@@ -145,7 +148,13 @@ func noescape(p unsafe.Pointer) unsafe.Pointer {
return unsafe.Pointer(x ^ 0)
}
-func cgocallback(fn, frame unsafe.Pointer, framesize, ctxt uintptr)
+// Not all cgocallback frames are actually cgocallback,
+// so not all have these arguments. Mark them uintptr so that the GC
+// does not misinterpret memory when the arguments are not present.
+// cgocallback is not called from Go, only from crosscall2.
+// This in turn calls cgocallbackg, which is where we'll find
+// pointer-declared arguments.
+func cgocallback(fn, frame, ctxt uintptr)
func gogo(buf *gobuf)
func gosave(buf *gobuf)
@@ -160,10 +169,11 @@ func breakpoint()
// back into arg+retoffset before returning. If copying result bytes back,
// the caller should pass the argument frame type as argtype, so that
// call can execute appropriate write barriers during the copy.
-// Package reflect passes a frame type. In package runtime, there is only
-// one call that copies results back, in cgocallbackg1, and it does NOT pass a
-// frame type, meaning there are no write barriers invoked. See that call
-// site for justification.
+//
+// Package reflect always passes a frame type. In package runtime,
+// Windows callbacks are the only use of this that copies results
+// back, and those cannot have pointers in their results, so runtime
+// passes nil for the frame type.
//
// Package reflect accesses this symbol through a linkname.
func reflectcall(argtype *_type, fn, arg unsafe.Pointer, argsize uint32, retoffset uint32)
@@ -184,14 +194,6 @@ type neverCallThisFunction struct{}
// prematurely and if there is leftover state it may panic.
func goexit(neverCallThisFunction)
-// Not all cgocallback_gofunc frames are actually cgocallback_gofunc,
-// so not all have these arguments. Mark them uintptr so that the GC
-// does not misinterpret memory when the arguments are not present.
-// cgocallback_gofunc is not called from go, only from cgocallback,
-// so the arguments will be found via cgocallback's pointer-declared arguments.
-// See the assembly implementations for more details.
-func cgocallback_gofunc(fv, frame, framesize, ctxt uintptr)
-
// publicationBarrier performs a store/store barrier (a "publication"
// or "export" barrier). Some form of synchronization is required
// between initializing an object and making that object accessible to
@@ -271,6 +273,7 @@ func return0()
// in asm_*.s
// not called directly; definitions here supply type information for traceback.
+func call16(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
func call32(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
func call64(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
func call128(typ, fn, arg unsafe.Pointer, n, retoffset uint32)
diff --git a/src/runtime/stubs32.go b/src/runtime/stubs32.go
deleted file mode 100644
index a7f52f6b9e..0000000000
--- a/src/runtime/stubs32.go
+++ /dev/null
@@ -1,14 +0,0 @@
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build 386 arm mips mipsle
-
-package runtime
-
-import "unsafe"
-
-// Declarations for runtime services implemented in C or assembly that
-// are only present on 32 bit systems.
-
-func call16(fn, arg unsafe.Pointer, n, retoffset uint32)
diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go
index a14f5c13d9..932fba3de0 100644
--- a/src/runtime/symtab.go
+++ b/src/runtime/symtab.go
@@ -326,7 +326,7 @@ const (
funcID_gcBgMarkWorker
funcID_systemstack_switch
funcID_systemstack
- funcID_cgocallback_gofunc
+ funcID_cgocallback
funcID_gogo
funcID_externalthreadhandler
funcID_debugCallV1
@@ -844,6 +844,22 @@ func funcname(f funcInfo) string {
return gostringnocopy(cfuncname(f))
}
+func funcpkgpath(f funcInfo) string {
+ name := funcname(f)
+ i := len(name) - 1
+ for ; i > 0; i-- {
+ if name[i] == '/' {
+ break
+ }
+ }
+ for ; i < len(name); i++ {
+ if name[i] == '.' {
+ break
+ }
+ }
+ return name[:i]
+}
+
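funcpkgpath trims a runtime function name down to its package path: everything up to the first '.' that follows the last '/'. A standalone sketch of the same slicing on ordinary strings (pkgPath is an illustrative helper, not the runtime function):

    package main

    import (
        "fmt"
        "strings"
    )

    // pkgPath keeps everything up to the first '.' after the last '/'.
    func pkgPath(name string) string {
        slash := strings.LastIndexByte(name, '/')
        if slash < 0 {
            slash = 0
        }
        dot := strings.IndexByte(name[slash:], '.')
        if dot < 0 {
            return name
        }
        return name[:slash+dot]
    }

    func main() {
        fmt.Println(pkgPath("example.com/foo/bar.init.0")) // example.com/foo/bar
        fmt.Println(pkgPath("runtime.main"))               // runtime
    }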
func cfuncnameFromNameoff(f funcInfo, nameoff int32) *byte {
if !f.valid() {
return nil
diff --git a/src/runtime/sys_darwin_arm64.s b/src/runtime/sys_darwin_arm64.s
index 585d4f2c64..f8d6f28dc7 100644
--- a/src/runtime/sys_darwin_arm64.s
+++ b/src/runtime/sys_darwin_arm64.s
@@ -202,6 +202,7 @@ TEXT runtime·sigtramp(SB),NOSPLIT,$192
BEQ 2(PC)
BL runtime·load_g(SB)
+#ifdef GOOS_ios
MOVD RSP, R6
CMP $0, g
BEQ nog
@@ -226,16 +227,21 @@ nog:
// Switch to gsignal stack.
MOVD R6, RSP
- // Call sigtrampgo.
+ // Save arguments.
MOVW R0, (8*1)(RSP)
MOVD R1, (8*2)(RSP)
MOVD R2, (8*3)(RSP)
+#endif
+
+ // Call sigtrampgo.
MOVD $runtime·sigtrampgo(SB), R11
BL (R11)
+#ifdef GOOS_ios
// Switch to old stack.
MOVD (8*4)(RSP), R5
MOVD R5, RSP
+#endif
// Restore callee-save registers.
MOVD (8*4)(RSP), R19
@@ -329,12 +335,20 @@ TEXT runtime·fcntl_trampoline(SB),NOSPLIT,$0
ADD $16, RSP
RET
-// sigaltstack on iOS is not supported and will always
-// run the signal handler on the main stack, so our sigtramp has
-// to do the stack switch ourselves.
TEXT runtime·sigaltstack_trampoline(SB),NOSPLIT,$0
+#ifdef GOOS_ios
+ // sigaltstack on iOS is not supported and will always
+ // run the signal handler on the main stack, so our sigtramp has
+ // to do the stack switch ourselves.
MOVW $43, R0
BL libc_exit(SB)
+#else
+ MOVD 8(R0), R1 // arg 2 old
+ MOVD 0(R0), R0 // arg 1 new
+ CALL libc_sigaltstack(SB)
+ CBZ R0, 2(PC)
+ BL notok<>(SB)
+#endif
RET
// Thread related functions
@@ -693,3 +707,23 @@ TEXT runtime·syscall6X(SB),NOSPLIT,$0
MOVD R0, 72(R2) // save err
ok:
RET
+
+// syscallNoErr is like syscall6 but does not check for errors, and
+// only returns one value, for use with standard C ABI library functions.
+TEXT runtime·syscallNoErr(SB),NOSPLIT,$0
+ SUB $16, RSP // push structure pointer
+ MOVD R0, (RSP)
+
+ MOVD 0(R0), R12 // fn
+ MOVD 16(R0), R1 // a2
+ MOVD 24(R0), R2 // a3
+ MOVD 32(R0), R3 // a4
+ MOVD 40(R0), R4 // a5
+ MOVD 48(R0), R5 // a6
+ MOVD 8(R0), R0 // a1
+ BL (R12)
+
+ MOVD (RSP), R2 // pop structure pointer
+ ADD $16, RSP
+ MOVD R0, 56(R2) // save r1
+ RET
diff --git a/src/runtime/sys_freebsd_arm64.s b/src/runtime/sys_freebsd_arm64.s
index 2330f2ffe2..8a4f9b7fa1 100644
--- a/src/runtime/sys_freebsd_arm64.s
+++ b/src/runtime/sys_freebsd_arm64.s
@@ -515,24 +515,3 @@ TEXT runtime·getCntxct(SB),NOSPLIT,$0
MOVW R0, ret+8(FP)
RET
-
-// func getisar0() uint64
-TEXT runtime·getisar0(SB),NOSPLIT,$0
- // get Instruction Set Attributes 0 into R0
- MRS ID_AA64ISAR0_EL1, R0
- MOVD R0, ret+0(FP)
- RET
-
-// func getisar1() uint64
-TEXT runtime·getisar1(SB),NOSPLIT,$0
- // get Instruction Set Attributes 1 into R0
- MRS ID_AA64ISAR1_EL1, R0
- MOVD R0, ret+0(FP)
- RET
-
-// func getpfr0() uint64
-TEXT runtime·getpfr0(SB),NOSPLIT,$0
- // get Processor Feature Register 0 into R0
- MRS ID_AA64PFR0_EL1, R0
- MOVD R0, ret+0(FP)
- RET
diff --git a/src/runtime/sys_windows_386.s b/src/runtime/sys_windows_386.s
index 9e1f40925d..2e5e82879c 100644
--- a/src/runtime/sys_windows_386.s
+++ b/src/runtime/sys_windows_386.s
@@ -239,7 +239,7 @@ GLOBL runtime·cbctxts(SB), NOPTR, $4
TEXT runtime·callbackasm1(SB),NOSPLIT,$0
MOVL 0(SP), AX // will use to find our callback context
- // remove return address from stack, we are not returning there
+ // remove return address from stack, we are not returning to callbackasm, but to its caller.
ADDL $4, SP
// address to callback parameters into CX
@@ -251,50 +251,35 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0
PUSHL BP
PUSHL BX
- // determine index into runtime·cbctxts table
+ // Go ABI requires DF flag to be cleared.
+ CLD
+
+ // determine index into runtime·cbs table
SUBL $runtime·callbackasm(SB), AX
MOVL $0, DX
MOVL $5, BX // divide by 5 because each call instruction in runtime·callbacks is 5 bytes long
DIVL BX
-
- // find correspondent runtime·cbctxts table entry
- MOVL runtime·cbctxts(SB), BX
- MOVL -4(BX)(AX*4), BX
-
- // extract callback context
- MOVL wincallbackcontext_gobody(BX), AX
- MOVL wincallbackcontext_argsize(BX), DX
-
- // preserve whatever's at the memory location that
- // the callback will use to store the return value
- PUSHL 0(CX)(DX*1)
-
- // extend argsize by size of return value
- ADDL $4, DX
-
- // remember how to restore stack on return
- MOVL wincallbackcontext_restorestack(BX), BX
- PUSHL BX
-
- // call target Go function
- PUSHL DX // argsize (including return value)
- PUSHL CX // callback parameters
- PUSHL AX // address of target Go function
- CLD
- CALL runtime·cgocallback_gofunc(SB)
- POPL AX
- POPL CX
- POPL DX
-
- // how to restore stack on return
- POPL BX
-
- // return value into AX (as per Windows spec)
- // and restore previously preserved value
- MOVL -4(CX)(DX*1), AX
- POPL -4(CX)(DX*1)
-
- MOVL BX, CX // cannot use BX anymore
+ SUBL $1, AX // subtract 1 because return PC is to the next slot
+
+ // Create a struct callbackArgs on our stack.
+ SUBL $(12+callbackArgs__size), SP
+ MOVL AX, (12+callbackArgs_index)(SP) // callback index
+ MOVL CX, (12+callbackArgs_args)(SP) // address of args vector
+ MOVL $0, (12+callbackArgs_result)(SP) // result
+ LEAL 12(SP), AX // AX = &callbackArgs{...}
+
+ // Call cgocallback, which will call callbackWrap(frame).
+ MOVL $0, 8(SP) // context
+ MOVL AX, 4(SP) // frame (address of callbackArgs)
+ LEAL ·callbackWrap(SB), AX
+ MOVL AX, 0(SP) // PC of function to call
+ CALL runtime·cgocallback(SB)
+
+ // Get callback result.
+ MOVL (12+callbackArgs_result)(SP), AX
+ // Get popRet.
+ MOVL (12+callbackArgs_retPop)(SP), CX // Can't use a callee-save register
+ ADDL $(12+callbackArgs__size), SP
// restore registers as required for windows callback
POPL BX
@@ -428,6 +413,42 @@ TEXT runtime·usleep2(SB),NOSPLIT,$20
MOVL BP, SP
RET
+// Runs on OS stack. duration (in 100ns units) is in BX.
+TEXT runtime·usleep2HighRes(SB),NOSPLIT,$36
+ // Want negative 100ns units.
+ NEGL BX
+ MOVL $-1, hi-4(SP)
+ MOVL BX, lo-8(SP)
+
+ get_tls(CX)
+ MOVL g(CX), CX
+ MOVL g_m(CX), CX
+ MOVL (m_mOS+mOS_highResTimer)(CX), CX
+ MOVL CX, saved_timer-12(SP)
+
+ MOVL $0, fResume-16(SP)
+ MOVL $0, lpArgToCompletionRoutine-20(SP)
+ MOVL $0, pfnCompletionRoutine-24(SP)
+ MOVL $0, lPeriod-28(SP)
+ LEAL lo-8(SP), BX
+ MOVL BX, lpDueTime-32(SP)
+ MOVL CX, hTimer-36(SP)
+ MOVL SP, BP
+ MOVL runtime·_SetWaitableTimer(SB), AX
+ CALL AX
+ MOVL BP, SP
+
+ MOVL $0, ptime-28(SP)
+ MOVL $0, alertable-32(SP)
+ MOVL saved_timer-12(SP), CX
+ MOVL CX, handle-36(SP)
+ MOVL SP, BP
+ MOVL runtime·_NtWaitForSingleObject(SB), AX
+ CALL AX
+ MOVL BP, SP
+
+ RET
+
// Runs on OS stack.
TEXT runtime·switchtothread(SB),NOSPLIT,$0
MOVL SP, BP
diff --git a/src/runtime/sys_windows_amd64.s b/src/runtime/sys_windows_amd64.s
index 6c8eecd4e7..e9ec99a51d 100644
--- a/src/runtime/sys_windows_amd64.s
+++ b/src/runtime/sys_windows_amd64.s
@@ -291,31 +291,20 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0
MOVQ DX, (16+8)(SP)
MOVQ R8, (16+16)(SP)
MOVQ R9, (16+24)(SP)
+ // R8 = address of args vector
+ LEAQ (16+0)(SP), R8
- // remove return address from stack, we are not returning there
+ // remove return address from stack, we are not returning to callbackasm, but to its caller.
MOVQ 0(SP), AX
ADDQ $8, SP
- // determine index into runtime·cbctxts table
+ // determine index into runtime·cbs table
MOVQ $runtime·callbackasm(SB), DX
SUBQ DX, AX
MOVQ $0, DX
MOVQ $5, CX // divide by 5 because each call instruction in runtime·callbacks is 5 bytes long
DIVL CX
-
- // find correspondent runtime·cbctxts table entry
- MOVQ runtime·cbctxts(SB), CX
- MOVQ -8(CX)(AX*8), AX
-
- // extract callback context
- MOVQ wincallbackcontext_argsize(AX), DX
- MOVQ wincallbackcontext_gobody(AX), AX
-
- // preserve whatever's at the memory location that
- // the callback will use to store the return value
- LEAQ 8(SP), CX // args vector, skip return address
- PUSHQ 0(CX)(DX*1) // store 8 bytes from just after the args array
- ADDQ $8, DX // extend argsize by size of return value
+ SUBQ $1, AX // subtract 1 because return PC is to the next slot
// DI SI BP BX R12 R13 R14 R15 registers and DF flag are preserved
// as required by windows callback convention.
@@ -330,18 +319,25 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0
MOVQ R14, 8(SP)
MOVQ R15, 0(SP)
- // prepare call stack. use SUBQ to hide from stack frame checks
- // cgocallback(Go func, void *frame, uintptr framesize)
- SUBQ $24, SP
- MOVQ DX, 16(SP) // argsize (including return value)
- MOVQ CX, 8(SP) // callback parameters
- MOVQ AX, 0(SP) // address of target Go function
+ // Go ABI requires DF flag to be cleared.
CLD
- CALL runtime·cgocallback_gofunc(SB)
- MOVQ 0(SP), AX
- MOVQ 8(SP), CX
- MOVQ 16(SP), DX
- ADDQ $24, SP
+
+ // Create a struct callbackArgs on our stack to be passed as
+ // the "frame" to cgocallback and on to callbackWrap.
+ SUBQ $(24+callbackArgs__size), SP
+ MOVQ AX, (24+callbackArgs_index)(SP) // callback index
+ MOVQ R8, (24+callbackArgs_args)(SP) // address of args vector
+ MOVQ $0, (24+callbackArgs_result)(SP) // result
+ LEAQ 24(SP), AX
+ // Call cgocallback, which will call callbackWrap(frame).
+ MOVQ $0, 16(SP) // context
+ MOVQ AX, 8(SP) // frame (address of callbackArgs)
+ LEAQ ·callbackWrap(SB), BX
+ MOVQ BX, 0(SP) // PC of function value to call (callbackWrap)
+ CALL ·cgocallback(SB)
+ // Get callback result.
+ MOVQ (24+callbackArgs_result)(SP), AX
+ ADDQ $(24+callbackArgs__size), SP
// restore registers as required for windows callback
MOVQ 0(SP), R15
@@ -355,8 +351,7 @@ TEXT runtime·callbackasm1(SB),NOSPLIT,$0
ADDQ $64, SP
POPFQ
- MOVQ -8(CX)(DX*1), AX // return value
- POPQ -8(CX)(DX*1) // restore bytes just after the args
+ // The return value was placed in AX above.
RET
// uint32 tstart_stdcall(M *newm);
@@ -457,6 +452,38 @@ TEXT runtime·usleep2(SB),NOSPLIT|NOFRAME,$48
MOVQ 40(SP), SP
RET
+// Runs on OS stack. duration (in 100ns units) is in BX.
+TEXT runtime·usleep2HighRes(SB),NOSPLIT|NOFRAME,$72
+ MOVQ SP, AX
+ ANDQ $~15, SP // alignment as per Windows requirement
+ MOVQ AX, 64(SP)
+
+ get_tls(CX)
+ MOVQ g(CX), CX
+ MOVQ g_m(CX), CX
+ MOVQ (m_mOS+mOS_highResTimer)(CX), CX // hTimer
+ MOVQ CX, 48(SP) // save hTimer for later
+ // Want negative 100ns units.
+ NEGQ BX
+ LEAQ 56(SP), DX // lpDueTime
+ MOVQ BX, (DX)
+ MOVQ $0, R8 // lPeriod
+ MOVQ $0, R9 // pfnCompletionRoutine
+ MOVQ $0, AX
+ MOVQ AX, 32(SP) // lpArgToCompletionRoutine
+ MOVQ AX, 40(SP) // fResume
+ MOVQ runtime·_SetWaitableTimer(SB), AX
+ CALL AX
+
+ MOVQ 48(SP), CX // handle
+ MOVQ $0, DX // alertable
+ MOVQ $0, R8 // ptime
+ MOVQ runtime·_NtWaitForSingleObject(SB), AX
+ CALL AX
+
+ MOVQ 64(SP), SP
+ RET
+
// Runs on OS stack.
TEXT runtime·switchtothread(SB),NOSPLIT|NOFRAME,$0
MOVQ SP, AX
diff --git a/src/runtime/sys_windows_arm.s b/src/runtime/sys_windows_arm.s
index 256b5ff7f0..3fc6d27cb0 100644
--- a/src/runtime/sys_windows_arm.s
+++ b/src/runtime/sys_windows_arm.s
@@ -314,6 +314,9 @@ TEXT runtime·externalthreadhandler(SB),NOSPLIT|NOFRAME,$0
GLOBL runtime·cbctxts(SB), NOPTR, $4
TEXT runtime·callbackasm1(SB),NOSPLIT|NOFRAME,$0
+ // TODO(austin): This needs to be converted to match changes
+ // in cgocallback, but I have no way to test. See CL 258938,
+ // and callbackasm1 on amd64 and 386.
MOVM.DB.W [R4-R11, R14], (R13) // push {r4-r11, lr}
SUB $36, R13 // space for locals
@@ -468,6 +471,11 @@ TEXT runtime·usleep2(SB),NOSPLIT|NOFRAME,$0
MOVW R4, R13 // Restore SP
MOVM.IA.W (R13), [R4, R15] // pop {R4, pc}
+// Runs on OS stack. Duration (in 100ns units) is in R0.
+// TODO: needs to be implemented properly.
+TEXT runtime·usleep2HighRes(SB),NOSPLIT|NOFRAME,$0
+ B runtime·abort(SB)
+
// Runs on OS stack.
TEXT runtime·switchtothread(SB),NOSPLIT|NOFRAME,$0
MOVM.DB.W [R4, R14], (R13) // push {R4, lr}
diff --git a/src/runtime/syscall_windows.go b/src/runtime/syscall_windows.go
index 0e2fcfb02d..21f2452b5a 100644
--- a/src/runtime/syscall_windows.go
+++ b/src/runtime/syscall_windows.go
@@ -5,27 +5,52 @@
package runtime
import (
+ "runtime/internal/sys"
"unsafe"
)
-type callbacks struct {
- lock mutex
- ctxt [cb_max]*wincallbackcontext
- n int
+// cbs stores all registered Go callbacks.
+var cbs struct {
+ lock mutex
+ ctxt [cb_max]winCallback
+ index map[winCallbackKey]int
+ n int
}
-func (c *wincallbackcontext) isCleanstack() bool {
- return c.cleanstack
+// winCallback records information about a registered Go callback.
+type winCallback struct {
+ fn *funcval // Go function
+ retPop uintptr // For 386 cdecl, how many bytes to pop on return
+
+ // abiMap specifies how to translate from a C frame to a Go
+ // frame. This does not specify how to translate back because
+ // the result is always a uintptr. If the C ABI is fastcall,
+ // this assumes the four fastcall registers were first spilled
+ // to the shadow space.
+ abiMap []abiPart
+ // retOffset is the offset of the uintptr-sized result in the Go
+ // frame.
+ retOffset uintptr
}
-func (c *wincallbackcontext) setCleanstack(cleanstack bool) {
- c.cleanstack = cleanstack
+// abiPart encodes a step in translating between calling ABIs.
+type abiPart struct {
+ src, dst uintptr
+ len uintptr
}
-var (
- cbs callbacks
- cbctxts **wincallbackcontext = &cbs.ctxt[0] // to simplify access to cbs.ctxt in sys_windows_*.s
-)
+func (a *abiPart) tryMerge(b abiPart) bool {
+ if a.src+a.len == b.src && a.dst+a.len == b.dst {
+ a.len += b.len
+ return true
+ }
+ return false
+}
+
+type winCallbackKey struct {
+ fn *funcval
+ cdecl bool
+}
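
Aside: the abiMap built by compileCallback below is just a list of (src, dst, len) copy steps from the C argument block to the Go argument frame, and tryMerge coalesces adjacent steps so callbackWrap can issue fewer memmove calls. A minimal standalone sketch of that coalescing (not part of this CL; abiPart is re-declared so the snippet compiles on its own):

    package main

    import "fmt"

    // abiPart mirrors the runtime type above.
    type abiPart struct{ src, dst, len uintptr }

    func (a *abiPart) tryMerge(b abiPart) bool {
        if a.src+a.len == b.src && a.dst+a.len == b.dst {
            a.len += b.len
            return true
        }
        return false
    }

    func main() {
        // Copy steps for func(a, b uintptr, c uint8) on a 64-bit target:
        // the two words and the trailing byte are contiguous on both sides,
        // so the whole frame collapses into one 17-byte copy.
        parts := []abiPart{{0, 0, 8}, {8, 8, 8}, {16, 16, 1}}
        var abiMap []abiPart
        for _, p := range parts {
            if len(abiMap) == 0 || !abiMap[len(abiMap)-1].tryMerge(p) {
                abiMap = append(abiMap, p)
            }
        }
        fmt.Println(abiMap) // [{0 0 17}]
        // A following word-sized argument would not merge: its C offset is
        // padded to 24, but 0+17 != 24, so it starts a new part.
    }
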
func callbackasm()
@@ -53,57 +78,157 @@ func callbackasmAddr(i int) uintptr {
return funcPC(callbackasm) + uintptr(i*entrySize)
}
+const callbackMaxFrame = 64 * sys.PtrSize
+
+// compileCallback converts a Go function fn into a C function pointer
+// that can be passed to Windows APIs.
+//
+// On 386, if cdecl is true, the returned C function will use the
+// cdecl calling convention; otherwise, it will use stdcall. On amd64,
+// it always uses fastcall. On arm, it always uses the ARM convention.
+//
//go:linkname compileCallback syscall.compileCallback
-func compileCallback(fn eface, cleanstack bool) (code uintptr) {
+func compileCallback(fn eface, cdecl bool) (code uintptr) {
+ if GOARCH != "386" {
+ // cdecl is only meaningful on 386.
+ cdecl = false
+ }
+
if fn._type == nil || (fn._type.kind&kindMask) != kindFunc {
panic("compileCallback: expected function with one uintptr-sized result")
}
ft := (*functype)(unsafe.Pointer(fn._type))
+
+ // Check arguments and construct ABI translation.
+ var abiMap []abiPart
+ var src, dst uintptr
+ for _, t := range ft.in() {
+ if t.size > sys.PtrSize {
+ // We don't support this right now. In
+ // stdcall/cdecl, 64-bit ints and doubles are
+ // passed as two words (little endian); and
+ // structs are pushed on the stack. In
+ // fastcall, arguments larger than the word
+ // size are passed by reference.
+ panic("compileCallback: argument size is larger than uintptr")
+ }
+ if k := t.kind & kindMask; GOARCH == "amd64" && (k == kindFloat32 || k == kindFloat64) {
+ // In fastcall, floating-point arguments in
+ // the first four positions are passed in
+ // floating-point registers, which we don't
+ // currently spill.
+ panic("compileCallback: float arguments not supported")
+ }
+
+ // The Go ABI aligns arguments.
+ dst = alignUp(dst, uintptr(t.align))
+ // In the C ABI, we're already on a word boundary.
+ // Also, sub-word-sized fastcall register arguments
+ // are stored to the least-significant bytes of the
+ // argument word and all supported Windows
+ // architectures are little endian, so src is already
+ // pointing to the right place for smaller arguments.
+
+ // Copy just the size of the argument. Note that this
+ // could be a small by-value struct, but C and Go
+ // struct layouts are compatible, so we can copy these
+ // directly, too.
+ part := abiPart{src, dst, t.size}
+ // Add this step to the adapter.
+ if len(abiMap) == 0 || !abiMap[len(abiMap)-1].tryMerge(part) {
+ abiMap = append(abiMap, part)
+ }
+
+ // cdecl, stdcall, and fastcall pad arguments to word size.
+ src += sys.PtrSize
+ // The Go ABI packs arguments.
+ dst += t.size
+ }
+ // The Go ABI aligns the result to the word size. src is
+ // already aligned.
+ dst = alignUp(dst, sys.PtrSize)
+ retOffset := dst
+
if len(ft.out()) != 1 {
panic("compileCallback: expected function with one uintptr-sized result")
}
- uintptrSize := unsafe.Sizeof(uintptr(0))
- if ft.out()[0].size != uintptrSize {
+ if ft.out()[0].size != sys.PtrSize {
panic("compileCallback: expected function with one uintptr-sized result")
}
- argsize := uintptr(0)
- for _, t := range ft.in() {
- if t.size > uintptrSize {
- panic("compileCallback: argument size is larger than uintptr")
- }
- argsize += uintptrSize
+ if k := ft.out()[0].kind & kindMask; k == kindFloat32 || k == kindFloat64 {
+ // In cdecl and stdcall, float results are returned in
+ // ST(0). In fastcall, they're returned in XMM0.
+ // Either way, it's not AX.
+ panic("compileCallback: float results not supported")
}
+ // Make room for the uintptr-sized result.
+ dst += sys.PtrSize
- lock(&cbs.lock) // We don't unlock this in a defer because this is used from the system stack.
+ if dst > callbackMaxFrame {
+ panic("compileCallback: function argument frame too large")
+ }
- n := cbs.n
- for i := 0; i < n; i++ {
- if cbs.ctxt[i].gobody == fn.data && cbs.ctxt[i].isCleanstack() == cleanstack {
- r := callbackasmAddr(i)
- unlock(&cbs.lock)
- return r
- }
+ // For cdecl, the callee is responsible for popping its
+ // arguments from the C stack.
+ var retPop uintptr
+ if cdecl {
+ retPop = src
}
- if n >= cb_max {
+
+ key := winCallbackKey{(*funcval)(fn.data), cdecl}
+
+ lock(&cbs.lock) // We don't unlock this in a defer because this is used from the system stack.
+
+ // Check if this callback is already registered.
+ if n, ok := cbs.index[key]; ok {
unlock(&cbs.lock)
- throw("too many callback functions")
+ return callbackasmAddr(n)
}
- c := new(wincallbackcontext)
- c.gobody = fn.data
- c.argsize = argsize
- c.setCleanstack(cleanstack)
- if cleanstack && argsize != 0 {
- c.restorestack = argsize
- } else {
- c.restorestack = 0
+ // Register the callback.
+ if cbs.index == nil {
+ cbs.index = make(map[winCallbackKey]int)
+ }
+ n := cbs.n
+ if n >= len(cbs.ctxt) {
+ unlock(&cbs.lock)
+ throw("too many callback functions")
}
+ c := winCallback{key.fn, retPop, abiMap, retOffset}
cbs.ctxt[n] = c
+ cbs.index[key] = n
cbs.n++
- r := callbackasmAddr(n)
unlock(&cbs.lock)
- return r
+ return callbackasmAddr(n)
+}
+
+type callbackArgs struct {
+ index uintptr
+ args unsafe.Pointer // Arguments in stdcall/cdecl convention, with registers spilled
+ // Below are out-args from callbackWrap
+ result uintptr
+ retPop uintptr // For 386 cdecl, how many bytes to pop on return
+}
+
+// callbackWrap is called by callbackasm to invoke a registered C callback.
+func callbackWrap(a *callbackArgs) {
+ c := cbs.ctxt[a.index]
+ a.retPop = c.retPop
+
+ // Convert from stdcall to Go ABI.
+ var frame [callbackMaxFrame]byte
+ goArgs := unsafe.Pointer(&frame)
+ for _, part := range c.abiMap {
+ memmove(add(goArgs, part.dst), add(a.args, part.src), part.len)
+ }
+
+ // Even though this is copying back results, we can pass a nil
+ // type because those results must not require write barriers.
+ reflectcall(nil, unsafe.Pointer(c.fn), noescape(goArgs), uint32(c.retOffset)+sys.PtrSize, uint32(c.retOffset))
+
+ // Extract the result.
+ a.result = *(*uintptr)(unsafe.Pointer(&frame[c.retOffset]))
}
const _LOAD_LIBRARY_SEARCH_SYSTEM32 = 0x00000800
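
Aside: the public entry points that feed compileCallback are syscall.NewCallback and syscall.NewCallbackCDecl; callbackWrap above is what eventually invokes the registered Go function. A hypothetical, Windows-only example of the round trip through a real stdcall API (user32's EnumWindows), with error handling elided:

    package main

    import (
        "fmt"
        "syscall"
    )

    func main() {
        // NewCallback returns a C-callable stdcall function pointer whose
        // implementation lands in callbackWrap via callbackasm/cgocallback.
        count := 0
        cb := syscall.NewCallback(func(hwnd, lparam uintptr) uintptr {
            count++
            return 1 // non-zero: keep enumerating
        })

        user32 := syscall.NewLazyDLL("user32.dll")
        user32.NewProc("EnumWindows").Call(cb, 0)
        fmt.Println("top-level windows:", count)
    }
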
diff --git a/src/runtime/syscall_windows_test.go b/src/runtime/syscall_windows_test.go
index 2e74546e38..7705d2a017 100644
--- a/src/runtime/syscall_windows_test.go
+++ b/src/runtime/syscall_windows_test.go
@@ -9,11 +9,13 @@ import (
"fmt"
"internal/syscall/windows/sysdll"
"internal/testenv"
+ "io"
"io/ioutil"
"math"
"os"
"os/exec"
"path/filepath"
+ "reflect"
"runtime"
"strconv"
"strings"
@@ -285,99 +287,108 @@ func TestCallbackInAnotherThread(t *testing.T) {
}
}
-type cbDLLFunc int // int determines number of callback parameters
-
-func (f cbDLLFunc) stdcallName() string {
- return fmt.Sprintf("stdcall%d", f)
+type cbFunc struct {
+ goFunc interface{}
}
-func (f cbDLLFunc) cdeclName() string {
- return fmt.Sprintf("cdecl%d", f)
+func (f cbFunc) cName(cdecl bool) string {
+ name := "stdcall"
+ if cdecl {
+ name = "cdecl"
+ }
+ t := reflect.TypeOf(f.goFunc)
+ for i := 0; i < t.NumIn(); i++ {
+ name += "_" + t.In(i).Name()
+ }
+ return name
}
-func (f cbDLLFunc) buildOne(stdcall bool) string {
- var funcname, attr string
- if stdcall {
- funcname = f.stdcallName()
- attr = "__stdcall"
- } else {
- funcname = f.cdeclName()
+func (f cbFunc) cSrc(w io.Writer, cdecl bool) {
+ // Construct a C function that takes a callback with
+ // f.goFunc's signature, and calls it with integers 1..N.
+ funcname := f.cName(cdecl)
+ attr := "__stdcall"
+ if cdecl {
attr = "__cdecl"
}
typename := "t" + funcname
- p := make([]string, f)
- for i := range p {
- p[i] = "uintptr_t"
- }
- params := strings.Join(p, ",")
- for i := range p {
- p[i] = fmt.Sprintf("%d", i+1)
- }
- args := strings.Join(p, ",")
- return fmt.Sprintf(`
-typedef void %s (*%s)(%s);
-void %s(%s f, uintptr_t n) {
- uintptr_t i;
- for(i=0;i<n;i++){
- f(%s);
+ t := reflect.TypeOf(f.goFunc)
+ cTypes := make([]string, t.NumIn())
+ cArgs := make([]string, t.NumIn())
+ for i := range cTypes {
+		// We included stdint.h, so this works for all fixed-size
+		// integer types, as well as uint8Pair_t.
+ cTypes[i] = t.In(i).Name() + "_t"
+ if t.In(i).Name() == "uint8Pair" {
+ cArgs[i] = fmt.Sprintf("(uint8Pair_t){%d,1}", i)
+ } else {
+ cArgs[i] = fmt.Sprintf("%d", i+1)
+ }
}
+ fmt.Fprintf(w, `
+typedef uintptr_t %s (*%s)(%s);
+uintptr_t %s(%s f) {
+ return f(%s);
}
- `, attr, typename, params, funcname, typename, args)
+ `, attr, typename, strings.Join(cTypes, ","), funcname, typename, strings.Join(cArgs, ","))
}
-func (f cbDLLFunc) build() string {
- return "#include <stdint.h>\n\n" + f.buildOne(false) + f.buildOne(true)
+func (f cbFunc) testOne(t *testing.T, dll *syscall.DLL, cdecl bool, cb uintptr) {
+ r1, _, _ := dll.MustFindProc(f.cName(cdecl)).Call(cb)
+
+ want := 0
+ for i := 0; i < reflect.TypeOf(f.goFunc).NumIn(); i++ {
+ want += i + 1
+ }
+ if int(r1) != want {
+ t.Errorf("wanted result %d; got %d", want, r1)
+ }
}
-var cbFuncs = [...]interface{}{
- 2: func(i1, i2 uintptr) uintptr {
- if i1+i2 != 3 {
- panic("bad input")
- }
- return 0
- },
- 3: func(i1, i2, i3 uintptr) uintptr {
- if i1+i2+i3 != 6 {
- panic("bad input")
- }
- return 0
- },
- 4: func(i1, i2, i3, i4 uintptr) uintptr {
- if i1+i2+i3+i4 != 10 {
- panic("bad input")
- }
- return 0
- },
- 5: func(i1, i2, i3, i4, i5 uintptr) uintptr {
- if i1+i2+i3+i4+i5 != 15 {
- panic("bad input")
- }
- return 0
- },
- 6: func(i1, i2, i3, i4, i5, i6 uintptr) uintptr {
- if i1+i2+i3+i4+i5+i6 != 21 {
- panic("bad input")
- }
- return 0
- },
- 7: func(i1, i2, i3, i4, i5, i6, i7 uintptr) uintptr {
- if i1+i2+i3+i4+i5+i6+i7 != 28 {
- panic("bad input")
- }
- return 0
- },
- 8: func(i1, i2, i3, i4, i5, i6, i7, i8 uintptr) uintptr {
- if i1+i2+i3+i4+i5+i6+i7+i8 != 36 {
- panic("bad input")
- }
- return 0
- },
- 9: func(i1, i2, i3, i4, i5, i6, i7, i8, i9 uintptr) uintptr {
- if i1+i2+i3+i4+i5+i6+i7+i8+i9 != 45 {
- panic("bad input")
- }
- return 0
- },
+type uint8Pair struct{ x, y uint8 }
+
+var cbFuncs = []cbFunc{
+ {func(i1, i2 uintptr) uintptr {
+ return i1 + i2
+ }},
+ {func(i1, i2, i3 uintptr) uintptr {
+ return i1 + i2 + i3
+ }},
+ {func(i1, i2, i3, i4 uintptr) uintptr {
+ return i1 + i2 + i3 + i4
+ }},
+ {func(i1, i2, i3, i4, i5 uintptr) uintptr {
+ return i1 + i2 + i3 + i4 + i5
+ }},
+ {func(i1, i2, i3, i4, i5, i6 uintptr) uintptr {
+ return i1 + i2 + i3 + i4 + i5 + i6
+ }},
+ {func(i1, i2, i3, i4, i5, i6, i7 uintptr) uintptr {
+ return i1 + i2 + i3 + i4 + i5 + i6 + i7
+ }},
+ {func(i1, i2, i3, i4, i5, i6, i7, i8 uintptr) uintptr {
+ return i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8
+ }},
+ {func(i1, i2, i3, i4, i5, i6, i7, i8, i9 uintptr) uintptr {
+ return i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9
+ }},
+
+ // Non-uintptr parameters.
+ {func(i1, i2, i3, i4, i5, i6, i7, i8, i9 uint8) uintptr {
+ return uintptr(i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9)
+ }},
+ {func(i1, i2, i3, i4, i5, i6, i7, i8, i9 uint16) uintptr {
+ return uintptr(i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9)
+ }},
+ {func(i1, i2, i3, i4, i5, i6, i7, i8, i9 int8) uintptr {
+ return uintptr(i1 + i2 + i3 + i4 + i5 + i6 + i7 + i8 + i9)
+ }},
+ {func(i1 int8, i2 int16, i3 int32, i4, i5 uintptr) uintptr {
+ return uintptr(i1) + uintptr(i2) + uintptr(i3) + i4 + i5
+ }},
+ {func(i1, i2, i3, i4, i5 uint8Pair) uintptr {
+ return uintptr(i1.x + i1.y + i2.x + i2.y + i3.x + i3.y + i4.x + i4.y + i5.x + i5.y)
+ }},
}
type cbDLL struct {
@@ -385,21 +396,26 @@ type cbDLL struct {
buildArgs func(out, src string) []string
}
-func (d *cbDLL) buildSrc(t *testing.T, path string) {
+func (d *cbDLL) makeSrc(t *testing.T, path string) {
f, err := os.Create(path)
if err != nil {
t.Fatalf("failed to create source file: %v", err)
}
defer f.Close()
- for i := 2; i < 10; i++ {
- fmt.Fprint(f, cbDLLFunc(i).build())
+ fmt.Fprint(f, `
+#include <stdint.h>
+typedef struct { uint8_t x, y; } uint8Pair_t;
+`)
+ for _, cbf := range cbFuncs {
+ cbf.cSrc(f, false)
+ cbf.cSrc(f, true)
}
}
func (d *cbDLL) build(t *testing.T, dir string) string {
srcname := d.name + ".c"
- d.buildSrc(t, filepath.Join(dir, srcname))
+ d.makeSrc(t, filepath.Join(dir, srcname))
outname := d.name + ".dll"
args := d.buildArgs(outname, srcname)
cmd := exec.Command(args[0], args[1:]...)
@@ -426,51 +442,6 @@ var cbDLLs = []cbDLL{
},
}
-type cbTest struct {
- n int // number of callback parameters
- param uintptr // dll function parameter
-}
-
-func (test *cbTest) run(t *testing.T, dllpath string) {
- dll := syscall.MustLoadDLL(dllpath)
- defer dll.Release()
- cb := cbFuncs[test.n]
- stdcall := syscall.NewCallback(cb)
- f := cbDLLFunc(test.n)
- test.runOne(t, dll, f.stdcallName(), stdcall)
- cdecl := syscall.NewCallbackCDecl(cb)
- test.runOne(t, dll, f.cdeclName(), cdecl)
-}
-
-func (test *cbTest) runOne(t *testing.T, dll *syscall.DLL, proc string, cb uintptr) {
- defer func() {
- if r := recover(); r != nil {
- t.Errorf("dll call %v(..., %d) failed: %v", proc, test.param, r)
- }
- }()
- dll.MustFindProc(proc).Call(cb, test.param)
-}
-
-var cbTests = []cbTest{
- {2, 1},
- {2, 10000},
- {3, 3},
- {4, 5},
- {4, 6},
- {5, 2},
- {6, 7},
- {6, 8},
- {7, 6},
- {8, 1},
- {9, 8},
- {9, 10000},
- {3, 4},
- {5, 3},
- {7, 7},
- {8, 2},
- {9, 9},
-}
-
func TestStdcallAndCDeclCallbacks(t *testing.T) {
if _, err := exec.LookPath("gcc"); err != nil {
t.Skip("skipping test: gcc is missing")
@@ -482,10 +453,21 @@ func TestStdcallAndCDeclCallbacks(t *testing.T) {
defer os.RemoveAll(tmp)
for _, dll := range cbDLLs {
- dllPath := dll.build(t, tmp)
- for _, test := range cbTests {
- test.run(t, dllPath)
- }
+ t.Run(dll.name, func(t *testing.T) {
+ dllPath := dll.build(t, tmp)
+ dll := syscall.MustLoadDLL(dllPath)
+ defer dll.Release()
+ for _, cbf := range cbFuncs {
+ t.Run(cbf.cName(false), func(t *testing.T) {
+ stdcall := syscall.NewCallback(cbf.goFunc)
+ cbf.testOne(t, dll, false, stdcall)
+ })
+ t.Run(cbf.cName(true), func(t *testing.T) {
+ cdecl := syscall.NewCallbackCDecl(cbf.goFunc)
+ cbf.testOne(t, dll, true, cdecl)
+ })
+ }
+ })
}
}
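
Aside: the uint8/uint16/int8 and mixed-width cases added above exercise the new abiMap translation, which copies each sub-word argument into a properly aligned slot of the Go frame instead of assuming every parameter is word-sized. A hypothetical user-level sketch of what that enables (illustrative only):

    package main

    import (
        "fmt"
        "syscall"
    )

    func main() {
        // Each parameter occupies one word in the C frame; compileCallback's
        // abiMap copies only the low bytes of the narrow ones into the Go frame.
        cb := syscall.NewCallback(func(a uint8, b uint16, c uintptr) uintptr {
            return uintptr(a) + uintptr(b) + c
        })
        // cb would normally be handed to a C API expecting a stdcall callback.
        fmt.Printf("callback registered at %#x\n", cb)
    }
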
diff --git a/src/runtime/testdata/testprogcgo/eintr.go b/src/runtime/testdata/testprogcgo/eintr.go
index 791ff1bedc..1722a75eb9 100644
--- a/src/runtime/testdata/testprogcgo/eintr.go
+++ b/src/runtime/testdata/testprogcgo/eintr.go
@@ -32,7 +32,6 @@ import (
"errors"
"fmt"
"io"
- "io/ioutil"
"log"
"net"
"os"
@@ -242,5 +241,5 @@ func testExec(wg *sync.WaitGroup) {
// Block blocks until stdin is closed.
func Block() {
- io.Copy(ioutil.Discard, os.Stdin)
+ io.Copy(io.Discard, os.Stdin)
}
diff --git a/src/runtime/testdata/testprogcgo/exec.go b/src/runtime/testdata/testprogcgo/exec.go
index 94da5dc526..15723c7369 100644
--- a/src/runtime/testdata/testprogcgo/exec.go
+++ b/src/runtime/testdata/testprogcgo/exec.go
@@ -31,6 +31,7 @@ import "C"
import (
"fmt"
+ "io/fs"
"os"
"os/exec"
"os/signal"
@@ -98,7 +99,7 @@ func CgoExecSignalMask() {
// isEAGAIN reports whether err is an EAGAIN error from a process execution.
func isEAGAIN(err error) bool {
- if p, ok := err.(*os.PathError); ok {
+ if p, ok := err.(*fs.PathError); ok {
err = p.Err
}
return err == syscall.EAGAIN
diff --git a/src/runtime/testdata/testprogcgo/needmdeadlock.go b/src/runtime/testdata/testprogcgo/needmdeadlock.go
new file mode 100644
index 0000000000..5a9c359006
--- /dev/null
+++ b/src/runtime/testdata/testprogcgo/needmdeadlock.go
@@ -0,0 +1,95 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !plan9,!windows
+
+package main
+
+// This is for issue #42207.
+// During a call to needm we could get a SIGCHLD signal
+// which would itself call needm, causing a deadlock.
+
+/*
+#include <signal.h>
+#include <pthread.h>
+#include <sched.h>
+#include <unistd.h>
+
+extern void GoNeedM();
+
+#define SIGNALERS 10
+
+static void* needmSignalThread(void* p) {
+ pthread_t* pt = (pthread_t*)(p);
+ int i;
+
+ for (i = 0; i < 100; i++) {
+ if (pthread_kill(*pt, SIGCHLD) < 0) {
+ return NULL;
+ }
+ usleep(1);
+ }
+ return NULL;
+}
+
+// We don't need many calls, as the deadlock is only likely
+// to occur the first couple of times that needm is called.
+// After that there will likely be an extra M available.
+#define CALLS 10
+
+static void* needmCallbackThread(void* p) {
+ int i;
+
+ for (i = 0; i < SIGNALERS; i++) {
+ sched_yield(); // Help the signal threads get started.
+ }
+ for (i = 0; i < CALLS; i++) {
+ GoNeedM();
+ }
+ return NULL;
+}
+
+static void runNeedmSignalThread() {
+ int i;
+ pthread_t caller;
+ pthread_t s[SIGNALERS];
+
+ pthread_create(&caller, NULL, needmCallbackThread, NULL);
+ for (i = 0; i < SIGNALERS; i++) {
+ pthread_create(&s[i], NULL, needmSignalThread, &caller);
+ }
+ for (i = 0; i < SIGNALERS; i++) {
+ pthread_join(s[i], NULL);
+ }
+ pthread_join(caller, NULL);
+}
+*/
+import "C"
+
+import (
+ "fmt"
+ "os"
+ "time"
+)
+
+func init() {
+ register("NeedmDeadlock", NeedmDeadlock)
+}
+
+//export GoNeedM
+func GoNeedM() {
+}
+
+func NeedmDeadlock() {
+ // The failure symptom is that the program hangs because of a
+ // deadlock in needm, so set an alarm.
+ go func() {
+ time.Sleep(5 * time.Second)
+ fmt.Println("Hung for 5 seconds")
+ os.Exit(1)
+ }()
+
+ C.runNeedmSignalThread()
+ fmt.Println("OK")
+}
diff --git a/src/runtime/time.go b/src/runtime/time.go
index f895bf8443..99290f66d0 100644
--- a/src/runtime/time.go
+++ b/src/runtime/time.go
@@ -491,6 +491,8 @@ loop:
newStatus = timerModifiedEarlier
}
+ tpp := t.pp.ptr()
+
// Update the adjustTimers field. Subtract one if we
// are removing a timerModifiedEarlier, add one if we
// are adding a timerModifiedEarlier.
@@ -500,9 +502,10 @@ loop:
}
if newStatus == timerModifiedEarlier {
adjust++
+ updateTimerModifiedEarliest(tpp, when)
}
if adjust != 0 {
- atomic.Xadd(&t.pp.ptr().adjustTimers, adjust)
+ atomic.Xadd(&tpp.adjustTimers, adjust)
}
// Set the new status of the timer.
@@ -637,16 +640,36 @@ func moveTimers(pp *p, timers []*timer) {
// the correct place in the heap. While looking for those timers,
// it also moves timers that have been modified to run later,
// and removes deleted timers. The caller must have locked the timers for pp.
-func adjusttimers(pp *p) {
- if len(pp.timers) == 0 {
- return
- }
+func adjusttimers(pp *p, now int64) {
if atomic.Load(&pp.adjustTimers) == 0 {
if verifyTimers {
verifyTimerHeap(pp)
}
+ // There are no timers to adjust, so it is safe to clear
+ // timerModifiedEarliest. Do so in case it is stale.
+ // Everything will work if we don't do this,
+ // but clearing here may save future calls to adjusttimers.
+ atomic.Store64(&pp.timerModifiedEarliest, 0)
return
}
+
+ // If we haven't yet reached the time of the first timerModifiedEarlier
+ // timer, don't do anything. This speeds up programs that adjust
+ // a lot of timers back and forth if the timers rarely expire.
+ // We'll postpone looking through all the adjusted timers until
+ // one would actually expire.
+ if first := atomic.Load64(&pp.timerModifiedEarliest); first != 0 {
+ if int64(first) > now {
+ if verifyTimers {
+ verifyTimerHeap(pp)
+ }
+ return
+ }
+
+ // We are going to clear all timerModifiedEarlier timers.
+ atomic.Store64(&pp.timerModifiedEarliest, 0)
+ }
+
var moved []*timer
loop:
for i := 0; i < len(pp.timers); i++ {
@@ -868,6 +891,10 @@ func runOneTimer(pp *p, t *timer, now int64) {
//
// The caller must have locked the timers for pp.
func clearDeletedTimers(pp *p) {
+ // We are going to clear all timerModifiedEarlier timers.
+ // Do this now in case new ones show up while we are looping.
+ atomic.Store64(&pp.timerModifiedEarliest, 0)
+
cdel := int32(0)
cearlier := int32(0)
to := 0
@@ -977,6 +1004,21 @@ func updateTimer0When(pp *p) {
}
}
+// updateTimerModifiedEarliest updates pp.timerModifiedEarliest to nextwhen
+// if nextwhen is earlier than the currently recorded value.
+// The timers for pp will not be locked.
+func updateTimerModifiedEarliest(pp *p, nextwhen int64) {
+ for {
+ old := atomic.Load64(&pp.timerModifiedEarliest)
+ if old != 0 && int64(old) < nextwhen {
+ return
+ }
+ if atomic.Cas64(&pp.timerModifiedEarliest, old, uint64(nextwhen)) {
+ return
+ }
+ }
+}
+
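
Aside: updateTimerModifiedEarliest above is a lock-free "keep the minimum, with 0 meaning unset" loop. The same pattern written against sync/atomic as a standalone sketch (updateEarliest is a hypothetical name):

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    // updateEarliest records when if it is earlier than the current value,
    // treating 0 as "unset".
    func updateEarliest(earliest *uint64, when int64) {
        for {
            old := atomic.LoadUint64(earliest)
            if old != 0 && int64(old) < when {
                return // an earlier time is already recorded
            }
            if atomic.CompareAndSwapUint64(earliest, old, uint64(when)) {
                return
            }
        }
    }

    func main() {
        var earliest uint64
        updateEarliest(&earliest, 500)
        updateEarliest(&earliest, 300)
        updateEarliest(&earliest, 900) // ignored: 300 is earlier
        fmt.Println(earliest)          // 300
    }
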
// timeSleepUntil returns the time when the next timer should fire,
// and the P that holds the timer heap that that timer is on.
// This is only called by sysmon and checkdead.
@@ -993,48 +1035,17 @@ func timeSleepUntil() (int64, *p) {
continue
}
- c := atomic.Load(&pp.adjustTimers)
- if c == 0 {
- w := int64(atomic.Load64(&pp.timer0When))
- if w != 0 && w < next {
- next = w
- pret = pp
- }
- continue
+ w := int64(atomic.Load64(&pp.timer0When))
+ if w != 0 && w < next {
+ next = w
+ pret = pp
}
- lock(&pp.timersLock)
- for _, t := range pp.timers {
- switch s := atomic.Load(&t.status); s {
- case timerWaiting:
- if t.when < next {
- next = t.when
- }
- case timerModifiedEarlier, timerModifiedLater:
- if t.nextwhen < next {
- next = t.nextwhen
- }
- if s == timerModifiedEarlier {
- c--
- }
- }
- // The timers are sorted, so we only have to check
- // the first timer for each P, unless there are
- // some timerModifiedEarlier timers. The number
- // of timerModifiedEarlier timers is in the adjustTimers
- // field, used to initialize c, above.
- //
- // We don't worry about cases like timerModifying.
- // New timers can show up at any time,
- // so this function is necessarily imprecise.
- // Do a signed check here since we aren't
- // synchronizing the read of pp.adjustTimers
- // with the check of a timer status.
- if int32(c) <= 0 {
- break
- }
+ w = int64(atomic.Load64(&pp.timerModifiedEarliest))
+ if w != 0 && w < next {
+ next = w
+ pret = pp
}
- unlock(&pp.timersLock)
}
unlock(&allpLock)
diff --git a/src/runtime/time_test.go b/src/runtime/time_test.go
index a8dab7db8e..afd9af2af4 100644
--- a/src/runtime/time_test.go
+++ b/src/runtime/time_test.go
@@ -20,6 +20,10 @@ func TestFakeTime(t *testing.T) {
t.Skip("faketime not supported on windows")
}
+ // Faketime is advanced in checkdead. External linking brings in cgo,
+	// which prevents checkdead from working.
+ testenv.MustInternalLink(t)
+
t.Parallel()
exe, err := buildTestProg(t, "testfaketime", "-tags=faketime")
diff --git a/src/runtime/trace.go b/src/runtime/trace.go
index 169b650eb4..d3ecd148be 100644
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -13,6 +13,7 @@
package runtime
import (
+ "runtime/internal/atomic"
"runtime/internal/sys"
"unsafe"
)
@@ -1146,11 +1147,11 @@ func traceHeapAlloc() {
}
func traceNextGC() {
- if memstats.next_gc == ^uint64(0) {
+ if nextGC := atomic.Load64(&memstats.next_gc); nextGC == ^uint64(0) {
// Heap-based triggering is disabled.
traceEvent(traceEvNextGC, -1, 0)
} else {
- traceEvent(traceEvNextGC, -1, memstats.next_gc)
+ traceEvent(traceEvNextGC, -1, nextGC)
}
}
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go
index 94f4a44976..f3df152535 100644
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -450,7 +450,7 @@ func gentraceback(pc0, sp0, lr0 uintptr, gp *g, skip int, pcbuf *uintptr, max in
}
n++
- if f.funcID == funcID_cgocallback_gofunc && len(cgoCtxt) > 0 {
+ if f.funcID == funcID_cgocallback && len(cgoCtxt) > 0 {
ctxt := cgoCtxt[len(cgoCtxt)-1]
cgoCtxt = cgoCtxt[:len(cgoCtxt)-1]
diff --git a/src/runtime/type.go b/src/runtime/type.go
index 81455f3532..36492619e1 100644
--- a/src/runtime/type.go
+++ b/src/runtime/type.go
@@ -366,7 +366,19 @@ type imethod struct {
type interfacetype struct {
typ _type
pkgpath name
- mhdr []imethod
+ // expMethods contains all interface methods.
+ //
+	// - len(expMethods) is the number of exported methods.
+	// - cap(expMethods) is the total number of methods, exported and non-exported.
+ expMethods []imethod
+}
+
+func (it *interfacetype) methods() []imethod {
+ return it.expMethods[:cap(it.expMethods)]
+}
+
+func (it *interfacetype) isEmpty() bool {
+ return cap(it.expMethods) == 0
}
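
Aside: the expMethods field packs two counts into one slice header: len is the number of exported methods and cap is the total. That only works if exported methods form a prefix of the method array; the sketch below assumes that layout and is not part of this CL.

    package main

    import "fmt"

    type imethod struct{ name string }

    func main() {
        // Exported methods come first; "close" stands in for an unexported one.
        all := []imethod{{"Read"}, {"Write"}, {"close"}}
        expMethods := all[:2:3] // len = exported methods, cap = all methods

        fmt.Println(len(expMethods))              // 2
        fmt.Println(cap(expMethods))              // 3
        fmt.Println(expMethods[:cap(expMethods)]) // reslicing recovers all methods
    }
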
type maptype struct {
@@ -664,13 +676,15 @@ func typesEqual(t, v *_type, seen map[_typePair]struct{}) bool {
if it.pkgpath.name() != iv.pkgpath.name() {
return false
}
- if len(it.mhdr) != len(iv.mhdr) {
+ itmethods := it.methods()
+ ivmethods := iv.methods()
+ if len(itmethods) != len(ivmethods) {
return false
}
- for i := range it.mhdr {
- tm := &it.mhdr[i]
- vm := &iv.mhdr[i]
- // Note the mhdr array can be relocated from
+ for i := range itmethods {
+ tm := &itmethods[i]
+ vm := &ivmethods[i]
+ // Note the expMethods array can be relocated from
// another module. See #17724.
tname := resolveNameOff(unsafe.Pointer(tm), tm.name)
vname := resolveNameOff(unsafe.Pointer(vm), vm.name)
diff --git a/src/runtime/vlrt.go b/src/runtime/vlrt.go
index 38e0b32801..996c0611fd 100644
--- a/src/runtime/vlrt.go
+++ b/src/runtime/vlrt.go
@@ -263,7 +263,7 @@ func slowdodiv(n, d uint64) (q, r uint64) {
return q, n
}
-// Floating point control word values for GOARCH=386 GO386=387.
+// Floating point control word values.
// Bits 0-5 are bits to disable floating-point exceptions.
// Bits 8-9 are the precision control:
// 0 = single precision a.k.a. float32
@@ -273,6 +273,5 @@ func slowdodiv(n, d uint64) (q, r uint64) {
// 3 = round toward zero
var (
controlWord64 uint16 = 0x3f + 2<<8 + 0<<10
- controlWord32 = 0x3f + 0<<8 + 0<<10
- controlWord64trunc = 0x3f + 2<<8 + 3<<10
+ controlWord64trunc uint16 = 0x3f + 2<<8 + 3<<10
)
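
Aside: both control words keep all six exception-mask bits set and select double precision; they differ only in the rounding-control bits (10-11). A small sketch that decodes the two values above (describe is a hypothetical helper):

    package main

    import "fmt"

    func describe(name string, cw uint16) {
        fmt.Printf("%s: exception mask %#02x, precision %d, rounding %d\n",
            name,
            cw&0x3f,    // bits 0-5: exception mask
            cw>>8&0x3,  // bits 8-9: precision control (2 = double precision)
            cw>>10&0x3, // bits 10-11: rounding control (0 = nearest, 3 = toward zero)
        )
    }

    func main() {
        describe("controlWord64", 0x3f+2<<8+0<<10)
        describe("controlWord64trunc", 0x3f+2<<8+3<<10)
    }
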