diff options
author | Dmitriy Vyukov <dvyukov@google.com> | 2011-06-28 15:09:53 -0400 |
---|---|---|
committer | Russ Cox <rsc@golang.org> | 2011-06-28 15:09:53 -0400 |
commit | 997c00f9919794d878aee4a87187dfaaebef6cd9 (patch) | |
tree | d4496824ad9b28b614a9d264aa6ae4acbd5c5682 | |
parent | 39acba55eeedb50d927ce2ed8cf2662eeaa59447 (diff) | |
download | go-997c00f9919794d878aee4a87187dfaaebef6cd9.tar.gz go-997c00f9919794d878aee4a87187dfaaebef6cd9.zip |
runtime: replace Semacquire/Semrelease implementation
1. The implementation uses distributed hash table of waitlists instead of a centralized one.
It significantly improves scalability for uncontended semaphores.
2. The implementation provides wait-free fast-path for signalers.
3. The implementation uses less locks (1 lock/unlock instead of 5 for Semacquire).
4. runtime·ready() call is moved out of critical section.
5. Semacquire() does not call semwake().
Benchmark results on HP Z600 (2 x Xeon E5620, 8 HT cores, 2.40GHz)
are as follows:
benchmark old ns/op new ns/op delta
runtime_test.BenchmarkSemaUncontended 58.20 36.30 -37.63%
runtime_test.BenchmarkSemaUncontended-2 199.00 18.30 -90.80%
runtime_test.BenchmarkSemaUncontended-4 327.00 9.20 -97.19%
runtime_test.BenchmarkSemaUncontended-8 491.00 5.32 -98.92%
runtime_test.BenchmarkSemaUncontended-16 946.00 4.18 -99.56%
runtime_test.BenchmarkSemaSyntNonblock 59.00 36.80 -37.63%
runtime_test.BenchmarkSemaSyntNonblock-2 167.00 138.00 -17.37%
runtime_test.BenchmarkSemaSyntNonblock-4 333.00 129.00 -61.26%
runtime_test.BenchmarkSemaSyntNonblock-8 464.00 130.00 -71.98%
runtime_test.BenchmarkSemaSyntNonblock-16 1015.00 136.00 -86.60%
runtime_test.BenchmarkSemaSyntBlock 58.80 36.70 -37.59%
runtime_test.BenchmarkSemaSyntBlock-2 294.00 149.00 -49.32%
runtime_test.BenchmarkSemaSyntBlock-4 333.00 177.00 -46.85%
runtime_test.BenchmarkSemaSyntBlock-8 471.00 221.00 -53.08%
runtime_test.BenchmarkSemaSyntBlock-16 990.00 227.00 -77.07%
runtime_test.BenchmarkSemaWorkNonblock 829.00 832.00 +0.36%
runtime_test.BenchmarkSemaWorkNonblock-2 425.00 419.00 -1.41%
runtime_test.BenchmarkSemaWorkNonblock-4 308.00 220.00 -28.57%
runtime_test.BenchmarkSemaWorkNonblock-8 394.00 147.00 -62.69%
runtime_test.BenchmarkSemaWorkNonblock-16 1510.00 149.00 -90.13%
runtime_test.BenchmarkSemaWorkBlock 828.00 813.00 -1.81%
runtime_test.BenchmarkSemaWorkBlock-2 428.00 436.00 +1.87%
runtime_test.BenchmarkSemaWorkBlock-4 232.00 219.00 -5.60%
runtime_test.BenchmarkSemaWorkBlock-8 392.00 251.00 -35.97%
runtime_test.BenchmarkSemaWorkBlock-16 1524.00 298.00 -80.45%
sync_test.BenchmarkMutexUncontended 24.10 24.00 -0.41%
sync_test.BenchmarkMutexUncontended-2 12.00 12.00 +0.00%
sync_test.BenchmarkMutexUncontended-4 6.25 6.17 -1.28%
sync_test.BenchmarkMutexUncontended-8 3.43 3.34 -2.62%
sync_test.BenchmarkMutexUncontended-16 2.34 2.32 -0.85%
sync_test.BenchmarkMutex 24.70 24.70 +0.00%
sync_test.BenchmarkMutex-2 208.00 99.50 -52.16%
sync_test.BenchmarkMutex-4 2744.00 256.00 -90.67%
sync_test.BenchmarkMutex-8 5137.00 556.00 -89.18%
sync_test.BenchmarkMutex-16 5368.00 1284.00 -76.08%
sync_test.BenchmarkMutexSlack 24.70 25.00 +1.21%
sync_test.BenchmarkMutexSlack-2 1094.00 186.00 -83.00%
sync_test.BenchmarkMutexSlack-4 3430.00 402.00 -88.28%
sync_test.BenchmarkMutexSlack-8 5051.00 1066.00 -78.90%
sync_test.BenchmarkMutexSlack-16 6806.00 1363.00 -79.97%
sync_test.BenchmarkMutexWork 793.00 792.00 -0.13%
sync_test.BenchmarkMutexWork-2 398.00 398.00 +0.00%
sync_test.BenchmarkMutexWork-4 1441.00 308.00 -78.63%
sync_test.BenchmarkMutexWork-8 8532.00 847.00 -90.07%
sync_test.BenchmarkMutexWork-16 8225.00 2760.00 -66.44%
sync_test.BenchmarkMutexWorkSlack 793.00 793.00 +0.00%
sync_test.BenchmarkMutexWorkSlack-2 418.00 414.00 -0.96%
sync_test.BenchmarkMutexWorkSlack-4 4481.00 480.00 -89.29%
sync_test.BenchmarkMutexWorkSlack-8 6317.00 1598.00 -74.70%
sync_test.BenchmarkMutexWorkSlack-16 9111.00 3038.00 -66.66%
R=rsc
CC=golang-dev
https://golang.org/cl/4631059
-rw-r--r-- | src/pkg/runtime/386/atomic.c | 12 | ||||
-rw-r--r-- | src/pkg/runtime/Makefile | 1 | ||||
-rw-r--r-- | src/pkg/runtime/amd64/atomic.c | 12 | ||||
-rw-r--r-- | src/pkg/runtime/arm/atomic.c | 12 | ||||
-rw-r--r-- | src/pkg/runtime/runtime.h | 3 | ||||
-rw-r--r-- | src/pkg/runtime/sema.goc | 187 |
6 files changed, 131 insertions, 96 deletions
diff --git a/src/pkg/runtime/386/atomic.c b/src/pkg/runtime/386/atomic.c new file mode 100644 index 0000000000..c031cc4f69 --- /dev/null +++ b/src/pkg/runtime/386/atomic.c @@ -0,0 +1,12 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" + +#pragma textflag 7 +uint32 +runtime·atomicload(uint32 volatile* addr) +{ + return *addr; +} diff --git a/src/pkg/runtime/Makefile b/src/pkg/runtime/Makefile index 79f847e64a..03f960cb86 100644 --- a/src/pkg/runtime/Makefile +++ b/src/pkg/runtime/Makefile @@ -47,6 +47,7 @@ OFILES_arm=\ OFILES=\ asm.$O\ + atomic.$O\ cgocall.$O\ chan.$O\ closure.$O\ diff --git a/src/pkg/runtime/amd64/atomic.c b/src/pkg/runtime/amd64/atomic.c new file mode 100644 index 0000000000..c031cc4f69 --- /dev/null +++ b/src/pkg/runtime/amd64/atomic.c @@ -0,0 +1,12 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" + +#pragma textflag 7 +uint32 +runtime·atomicload(uint32 volatile* addr) +{ + return *addr; +} diff --git a/src/pkg/runtime/arm/atomic.c b/src/pkg/runtime/arm/atomic.c new file mode 100644 index 0000000000..9fd47bae7b --- /dev/null +++ b/src/pkg/runtime/arm/atomic.c @@ -0,0 +1,12 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "runtime.h" + +#pragma textflag 7 +uint32 +runtime·atomicload(uint32 volatile* addr) +{ + return runtime·xadd(addr, 0); +} diff --git a/src/pkg/runtime/runtime.h b/src/pkg/runtime/runtime.h index f3ccff1bcd..7bc0962ba9 100644 --- a/src/pkg/runtime/runtime.h +++ b/src/pkg/runtime/runtime.h @@ -416,7 +416,10 @@ int32 runtime·write(int32, void*, int32); int32 runtime·mincore(void*, uintptr, byte*); bool runtime·cas(uint32*, uint32, uint32); bool runtime·casp(void**, void*, void*); +// Don't confuse with XADD x86 instruction, +// this one is actually 'addx', that is, add-and-fetch. uint32 runtime·xadd(uint32 volatile*, int32); +uint32 runtime·atomicload(uint32 volatile*); void runtime·jmpdefer(byte*, void*); void runtime·exit1(int32); void runtime·ready(G*); diff --git a/src/pkg/runtime/sema.goc b/src/pkg/runtime/sema.goc index 1c77e87a53..ae84351edf 100644 --- a/src/pkg/runtime/sema.goc +++ b/src/pkg/runtime/sema.goc @@ -23,104 +23,68 @@ package runtime typedef struct Sema Sema; struct Sema { - uint32 *addr; + uint32 volatile *addr; G *g; Sema *prev; Sema *next; }; -// TODO: For now, a linked list; maybe a hash table of linked lists later. -static Sema *semfirst, *semlast; -static Lock semlock; +typedef struct SemaRoot SemaRoot; +struct SemaRoot +{ + Lock; + Sema *head; + Sema *tail; + // Number of waiters. Read w/o the lock. + uint32 volatile nwait; +}; + +// Prime to not correlate with any user patterns. +#define SEMTABLESZ 251 + +static union +{ + SemaRoot; + // Modern processors tend to have 64-byte cache lines, + // potentially with 128-byte effective cache line size for reading. + // While there are hypothetical architectures + // with 16-4096 byte cache lines, 128 looks like a good compromise. + uint8 pad[128]; +} semtable[SEMTABLESZ]; + +static SemaRoot* +semroot(uint32 *addr) +{ + return &semtable[((uintptr)addr >> 3) % SEMTABLESZ]; +} static void -semqueue(uint32 *addr, Sema *s) +semqueue(SemaRoot *root, uint32 volatile *addr, Sema *s) { + s->g = g; s->addr = addr; - s->g = nil; - - runtime·lock(&semlock); - s->prev = semlast; s->next = nil; - if(semlast) - semlast->next = s; + s->prev = root->tail; + if(root->tail) + root->tail->next = s; else - semfirst = s; - semlast = s; - runtime·unlock(&semlock); + root->head = s; + root->tail = s; } static void -semdequeue(Sema *s) +semdequeue(SemaRoot *root, Sema *s) { - runtime·lock(&semlock); if(s->next) s->next->prev = s->prev; else - semlast = s->prev; + root->tail = s->prev; if(s->prev) s->prev->next = s->next; else - semfirst = s->next; + root->head = s->next; s->prev = nil; s->next = nil; - runtime·unlock(&semlock); -} - -static void -semwakeup(uint32 *addr) -{ - Sema *s; - - runtime·lock(&semlock); - for(s=semfirst; s; s=s->next) { - if(s->addr == addr && s->g) { - runtime·ready(s->g); - s->g = nil; - break; - } - } - runtime·unlock(&semlock); -} - -// Step 1 of sleep: make ourselves available for wakeup. -// TODO(rsc): Maybe we can write a version without -// locks by using cas on s->g. Maybe not: I need to -// think more about whether it would be correct. -static void -semsleep1(Sema *s) -{ - runtime·lock(&semlock); - s->g = g; - runtime·unlock(&semlock); -} - -// Decided not to go through with it: undo step 1. -static void -semsleepundo1(Sema *s) -{ - runtime·lock(&semlock); - if(s->g != nil) { - s->g = nil; // back ourselves out - } else { - // If s->g == nil already, semwakeup - // already readied us. Since we never stopped - // running, readying us just set g->readyonstop. - // Clear it. - if(g->readyonstop == 0) - *(int32*)0x555 = 555; - g->readyonstop = 0; - } - runtime·unlock(&semlock); -} - -// Step 2: wait for the wakeup. -static void -semsleep2(Sema *s) -{ - USED(s); - g->status = Gwaiting; - runtime·gosched(); } static int32 @@ -128,52 +92,83 @@ cansemacquire(uint32 *addr) { uint32 v; - while((v = *addr) > 0) + while((v = runtime·atomicload(addr)) > 0) if(runtime·cas(addr, v, v-1)) return 1; return 0; } -// For now has no return value. -// Might return an ok (not interrupted) bool in the future? void -runtime·semacquire(uint32 *addr) +runtime·semacquire(uint32 volatile *addr) { Sema s; + SemaRoot *root; // Easy case. if(cansemacquire(addr)) return; // Harder case: - // queue - // try semacquire one more time, sleep if failed - // dequeue - // wake up one more guy to avoid races (TODO(rsc): maybe unnecessary?) - semqueue(addr, &s); + // increment waiter count + // try cansemacquire one more time, return if succeeded + // enqueue itself as a waiter + // sleep + // (waiter descriptor is dequeued by signaler) + root = semroot(addr); for(;;) { - semsleep1(&s); + runtime·lock(root); + // Add ourselves to nwait to disable "easy case" in semrelease. + runtime·xadd(&root->nwait, 1); + // Check cansemacquire to avoid missed wakeup. if(cansemacquire(addr)) { - semsleepundo1(&s); - break; + runtime·xadd(&root->nwait, -1); + runtime·unlock(root); + return; } - semsleep2(&s); + // Any semrelease after the cansemacquire knows we're waiting + // (we set nwait above), so go to sleep. + semqueue(root, addr, &s); + g->status = Gwaiting; + runtime·unlock(root); + runtime·gosched(); + if(cansemacquire(addr)) + return; } - semdequeue(&s); - semwakeup(addr); } void -runtime·semrelease(uint32 *addr) +runtime·semrelease(uint32 volatile *addr) { - uint32 v; + Sema *s; + SemaRoot *root; - for(;;) { - v = *addr; - if(runtime·cas(addr, v, v+1)) + root = semroot(addr); + runtime·xadd(addr, 1); + + // Easy case: no waiters? + // This check must happen after the xadd, to avoid a missed wakeup + // (see loop in semacquire). + if(runtime·atomicload(&root->nwait) == 0) + return; + + // Harder case: search for a waiter and wake it. + runtime·lock(root); + if(runtime·atomicload(&root->nwait) == 0) { + // The count is already consumed by another goroutine, + // so no need to wake up another goroutine. + runtime·unlock(root); + return; + } + for(s = root->head; s; s = s->next) { + if(s->addr == addr) { + runtime·xadd(&root->nwait, -1); + semdequeue(root, s); break; + } } - semwakeup(addr); + runtime·unlock(root); + if(s) + runtime·ready(s->g); } func Semacquire(addr *uint32) { |