aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/mksizeclasses.go
diff options
context:
space:
mode:
authorMatthew Dempsky <mdempsky@google.com>2021-03-11 15:45:52 -0800
committerMatthew Dempsky <mdempsky@google.com>2021-03-12 18:02:59 +0000
commit4662029264e79f144eef4323631b3356624e884f (patch)
treeb59453243f0f51d85d840a3677bd33a1625a2e18 /src/runtime/mksizeclasses.go
parent735647d92e839f9ac3a91864a2c34263338a35e6 (diff)
downloadgo-4662029264e79f144eef4323631b3356624e884f.tar.gz
go-4662029264e79f144eef4323631b3356624e884f.zip
runtime: simplify divmagic for span calculations
It's both simpler and faster to just unconditionally do two 32-bit multiplies rather than a bunch of branching to try to avoid them. This is safe thanks to the tight bounds derived in [1] and verified during mksizeclasses.go. Benchstat results below for compilebench benchmarks on my P920. See also [2] for micro benchmarks comparing the new functions against the originals (as well as several basic attempts at optimizing them). name old time/op new time/op delta Template 295ms ± 3% 290ms ± 1% -1.95% (p=0.000 n=20+20) Unicode 113ms ± 3% 110ms ± 2% -2.32% (p=0.000 n=21+17) GoTypes 1.78s ± 1% 1.76s ± 1% -1.23% (p=0.000 n=21+20) Compiler 119ms ± 2% 117ms ± 4% -1.53% (p=0.007 n=20+20) SSA 14.3s ± 1% 13.8s ± 1% -3.12% (p=0.000 n=17+20) Flate 173ms ± 2% 170ms ± 1% -1.64% (p=0.000 n=20+19) GoParser 278ms ± 2% 273ms ± 2% -1.92% (p=0.000 n=20+19) Reflect 686ms ± 3% 671ms ± 3% -2.18% (p=0.000 n=19+20) Tar 255ms ± 2% 248ms ± 2% -2.90% (p=0.000 n=20+20) XML 335ms ± 3% 327ms ± 2% -2.34% (p=0.000 n=20+20) LinkCompiler 799ms ± 1% 799ms ± 1% ~ (p=0.925 n=20+20) ExternalLinkCompiler 1.90s ± 1% 1.90s ± 0% ~ (p=0.327 n=20+20) LinkWithoutDebugCompiler 385ms ± 1% 386ms ± 1% ~ (p=0.251 n=18+20) [Geo mean] 512ms 504ms -1.61% name old user-time/op new user-time/op delta Template 286ms ± 4% 282ms ± 4% -1.42% (p=0.025 n=21+20) Unicode 104ms ± 9% 102ms ±14% ~ (p=0.294 n=21+20) GoTypes 1.75s ± 3% 1.72s ± 2% -1.36% (p=0.000 n=21+20) Compiler 109ms ±11% 108ms ± 8% ~ (p=0.187 n=21+19) SSA 14.0s ± 1% 13.5s ± 2% -3.25% (p=0.000 n=16+20) Flate 166ms ± 4% 164ms ± 4% -1.34% (p=0.032 n=19+19) GoParser 268ms ± 4% 263ms ± 4% -1.71% (p=0.011 n=18+20) Reflect 666ms ± 3% 654ms ± 4% -1.77% (p=0.002 n=18+20) Tar 245ms ± 5% 236ms ± 6% -3.34% (p=0.000 n=20+20) XML 320ms ± 4% 314ms ± 3% -2.01% (p=0.001 n=19+18) LinkCompiler 744ms ± 4% 747ms ± 3% ~ (p=0.627 n=20+19) ExternalLinkCompiler 1.71s ± 3% 1.72s ± 2% ~ (p=0.149 n=20+20) LinkWithoutDebugCompiler 345ms ± 6% 342ms ± 8% ~ (p=0.355 n=20+20) [Geo mean] 484ms 477ms -1.50% [1] Daniel Lemire, Owen Kaser, Nathan Kurz. 2019. "Faster Remainder by Direct Computation: Applications to Compilers and Software Libraries." https://arxiv.org/abs/1902.01961 [2] https://github.com/mdempsky/benchdivmagic Change-Id: Ie4d214e7a908b0d979c878f2d404bd56bdf374f6 Reviewed-on: https://go-review.googlesource.com/c/go/+/300994 Run-TryBot: Matthew Dempsky <mdempsky@google.com> Trust: Matthew Dempsky <mdempsky@google.com> Trust: Michael Knyszek <mknyszek@google.com> TryBot-Result: Go Bot <gobot@golang.org> Reviewed-by: Michael Knyszek <mknyszek@google.com>
Diffstat (limited to 'src/runtime/mksizeclasses.go')
-rw-r--r--src/runtime/mksizeclasses.go118
1 files changed, 55 insertions, 63 deletions
diff --git a/src/runtime/mksizeclasses.go b/src/runtime/mksizeclasses.go
index 8b9bbe01e6..ddbf1bf7fe 100644
--- a/src/runtime/mksizeclasses.go
+++ b/src/runtime/mksizeclasses.go
@@ -37,6 +37,7 @@ import (
"go/format"
"io"
"log"
+ "math"
"math/bits"
"os"
)
@@ -88,11 +89,6 @@ const (
type class struct {
size int // max size
npages int // number of pages
-
- mul int
- shift uint
- shift2 uint
- mask int
}
func powerOfTwo(x int) bool {
@@ -169,9 +165,9 @@ func makeClasses() []class {
return classes
}
-// computeDivMagic computes some magic constants to implement
-// the division required to compute object number from span offset.
-// n / c.size is implemented as n >> c.shift * c.mul >> c.shift2
+// computeDivMagic checks that the division required to compute object
+// index from span offset can be computed using 32-bit multiplication.
+// n / c.size is implemented as (n * (^uint32(0)/uint32(c.size) + 1)) >> 32
// for all 0 <= n <= c.npages * pageSize
func computeDivMagic(c *class) {
// divisor
@@ -183,62 +179,60 @@ func computeDivMagic(c *class) {
// maximum input value for which the formula needs to work.
max := c.npages * pageSize
+ // As reported in [1], if n and d are unsigned N-bit integers, we
+ // can compute n / d as ⌊n * c / 2^F⌋, where c is ⌈2^F / d⌉ and F is
+ // computed with:
+ //
+ // Algorithm 2: Algorithm to select the number of fractional bits
+ // and the scaled approximate reciprocal in the case of unsigned
+ // integers.
+ //
+ // if d is a power of two then
+ // Let F ← log₂(d) and c = 1.
+ // else
+ // Let F ← N + L where L is the smallest integer
+ // such that d ≤ (2^(N+L) mod d) + 2^L.
+ // end if
+ //
+ // [1] "Faster Remainder by Direct Computation: Applications to
+ // Compilers and Software Libraries" Daniel Lemire, Owen Kaser,
+ // Nathan Kurz arXiv:1902.01961
+ //
+ // To minimize the risk of introducing errors, we implement the
+ // algorithm exactly as stated, rather than trying to adapt it to
+ // fit typical Go idioms.
+ N := bits.Len(uint(max))
+ var F int
if powerOfTwo(d) {
- // If the size is a power of two, heapBitsForObject can divide even faster by masking.
- // Compute this mask.
- if max >= 1<<16 {
- panic("max too big for power of two size")
+ F = int(math.Log2(float64(d)))
+ if d != 1<<F {
+ panic("imprecise log2")
}
- c.mask = 1<<16 - d
- }
-
- // Compute pre-shift by factoring power of 2 out of d.
- for d%2 == 0 {
- c.shift++
- d >>= 1
- max >>= 1
- }
-
- // Find the smallest k that works.
- // A small k allows us to fit the math required into 32 bits
- // so we can use 32-bit multiplies and shifts on 32-bit platforms.
-nextk:
- for k := uint(0); ; k++ {
- mul := (int(1)<<k + d - 1) / d // ⌈2^k / d⌉
-
- // Test to see if mul works.
- for n := 0; n <= max; n++ {
- if n*mul>>k != n/d {
- continue nextk
+ } else {
+ for L := 0; ; L++ {
+ if d <= ((1<<(N+L))%d)+(1<<L) {
+ F = N + L
+ break
}
}
- if mul >= 1<<16 {
- panic("mul too big")
- }
- if uint64(mul)*uint64(max) >= 1<<32 {
- panic("mul*max too big")
- }
- c.mul = mul
- c.shift2 = k
- break
}
- // double-check.
+ // Also, noted in the paper, F is the smallest number of fractional
+ // bits required. We use 32 bits, because it works for all size
+ // classes and is fast on all CPU architectures that we support.
+ if F > 32 {
+ fmt.Printf("d=%d max=%d N=%d F=%d\n", c.size, max, N, F)
+ panic("size class requires more than 32 bits of precision")
+ }
+
+ // Brute force double-check with the exact computation that will be
+ // done by the runtime.
+ m := ^uint32(0)/uint32(c.size) + 1
for n := 0; n <= max; n++ {
- if n*c.mul>>c.shift2 != n/d {
- fmt.Printf("d=%d max=%d mul=%d shift2=%d n=%d\n", d, max, c.mul, c.shift2, n)
- panic("bad multiply magic")
- }
- // Also check the exact computations that will be done by the runtime,
- // for both 32 and 64 bit operations.
- if uint32(n)*uint32(c.mul)>>uint8(c.shift2) != uint32(n/d) {
- fmt.Printf("d=%d max=%d mul=%d shift2=%d n=%d\n", d, max, c.mul, c.shift2, n)
+ if uint32((uint64(n)*uint64(m))>>32) != uint32(n/c.size) {
+ fmt.Printf("d=%d max=%d m=%d n=%d\n", d, max, m, n)
panic("bad 32-bit multiply magic")
}
- if uint64(n)*uint64(c.mul)>>uint8(c.shift2) != uint64(n/d) {
- fmt.Printf("d=%d max=%d mul=%d shift2=%d n=%d\n", d, max, c.mul, c.shift2, n)
- panic("bad 64-bit multiply magic")
- }
}
}
@@ -302,15 +296,13 @@ func printClasses(w io.Writer, classes []class) {
}
fmt.Fprintln(w, "}")
- fmt.Fprintln(w, "type divMagic struct {")
- fmt.Fprintln(w, " shift uint8")
- fmt.Fprintln(w, " shift2 uint8")
- fmt.Fprintln(w, " mul uint16")
- fmt.Fprintln(w, " baseMask uint16")
- fmt.Fprintln(w, "}")
- fmt.Fprint(w, "var class_to_divmagic = [_NumSizeClasses]divMagic {")
+ fmt.Fprint(w, "var class_to_divmagic = [_NumSizeClasses]uint32 {")
for _, c := range classes {
- fmt.Fprintf(w, "{%d,%d,%d,%d},", c.shift, c.shift2, c.mul, c.mask)
+ if c.size == 0 {
+ fmt.Fprintf(w, "0,")
+ continue
+ }
+ fmt.Fprintf(w, "^uint32(0)/%d+1,", c.size)
}
fmt.Fprintln(w, "}")