diff options
author | Matthew Dempsky <mdempsky@google.com> | 2021-03-11 15:45:52 -0800 |
---|---|---|
committer | Matthew Dempsky <mdempsky@google.com> | 2021-03-12 18:02:59 +0000 |
commit | 4662029264e79f144eef4323631b3356624e884f (patch) | |
tree | b59453243f0f51d85d840a3677bd33a1625a2e18 /src/runtime/mbitmap.go | |
parent | 735647d92e839f9ac3a91864a2c34263338a35e6 (diff) | |
download | go-4662029264e79f144eef4323631b3356624e884f.tar.gz go-4662029264e79f144eef4323631b3356624e884f.zip |
runtime: simplify divmagic for span calculations
It's both simpler and faster to just unconditionally do two 32-bit
multiplies rather than a bunch of branching to try to avoid them.
This is safe thanks to the tight bounds derived in [1] and verified
during mksizeclasses.go.
Benchstat results below for compilebench benchmarks on my P920. See
also [2] for micro benchmarks comparing the new functions against the
originals (as well as several basic attempts at optimizing them).
name old time/op new time/op delta
Template 295ms ± 3% 290ms ± 1% -1.95% (p=0.000 n=20+20)
Unicode 113ms ± 3% 110ms ± 2% -2.32% (p=0.000 n=21+17)
GoTypes 1.78s ± 1% 1.76s ± 1% -1.23% (p=0.000 n=21+20)
Compiler 119ms ± 2% 117ms ± 4% -1.53% (p=0.007 n=20+20)
SSA 14.3s ± 1% 13.8s ± 1% -3.12% (p=0.000 n=17+20)
Flate 173ms ± 2% 170ms ± 1% -1.64% (p=0.000 n=20+19)
GoParser 278ms ± 2% 273ms ± 2% -1.92% (p=0.000 n=20+19)
Reflect 686ms ± 3% 671ms ± 3% -2.18% (p=0.000 n=19+20)
Tar 255ms ± 2% 248ms ± 2% -2.90% (p=0.000 n=20+20)
XML 335ms ± 3% 327ms ± 2% -2.34% (p=0.000 n=20+20)
LinkCompiler 799ms ± 1% 799ms ± 1% ~ (p=0.925 n=20+20)
ExternalLinkCompiler 1.90s ± 1% 1.90s ± 0% ~ (p=0.327 n=20+20)
LinkWithoutDebugCompiler 385ms ± 1% 386ms ± 1% ~ (p=0.251 n=18+20)
[Geo mean] 512ms 504ms -1.61%
name old user-time/op new user-time/op delta
Template 286ms ± 4% 282ms ± 4% -1.42% (p=0.025 n=21+20)
Unicode 104ms ± 9% 102ms ±14% ~ (p=0.294 n=21+20)
GoTypes 1.75s ± 3% 1.72s ± 2% -1.36% (p=0.000 n=21+20)
Compiler 109ms ±11% 108ms ± 8% ~ (p=0.187 n=21+19)
SSA 14.0s ± 1% 13.5s ± 2% -3.25% (p=0.000 n=16+20)
Flate 166ms ± 4% 164ms ± 4% -1.34% (p=0.032 n=19+19)
GoParser 268ms ± 4% 263ms ± 4% -1.71% (p=0.011 n=18+20)
Reflect 666ms ± 3% 654ms ± 4% -1.77% (p=0.002 n=18+20)
Tar 245ms ± 5% 236ms ± 6% -3.34% (p=0.000 n=20+20)
XML 320ms ± 4% 314ms ± 3% -2.01% (p=0.001 n=19+18)
LinkCompiler 744ms ± 4% 747ms ± 3% ~ (p=0.627 n=20+19)
ExternalLinkCompiler 1.71s ± 3% 1.72s ± 2% ~ (p=0.149 n=20+20)
LinkWithoutDebugCompiler 345ms ± 6% 342ms ± 8% ~ (p=0.355 n=20+20)
[Geo mean] 484ms 477ms -1.50%
[1] Daniel Lemire, Owen Kaser, Nathan Kurz. 2019. "Faster Remainder by
Direct Computation: Applications to Compilers and Software Libraries."
https://arxiv.org/abs/1902.01961
[2] https://github.com/mdempsky/benchdivmagic
Change-Id: Ie4d214e7a908b0d979c878f2d404bd56bdf374f6
Reviewed-on: https://go-review.googlesource.com/c/go/+/300994
Run-TryBot: Matthew Dempsky <mdempsky@google.com>
Trust: Matthew Dempsky <mdempsky@google.com>
Trust: Michael Knyszek <mknyszek@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Diffstat (limited to 'src/runtime/mbitmap.go')
-rw-r--r-- | src/runtime/mbitmap.go | 48 |
1 file changed, 21 insertions, 27 deletions
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go index fbfaae0f93..2d12c563b8 100644 --- a/src/runtime/mbitmap.go +++ b/src/runtime/mbitmap.go @@ -226,16 +226,25 @@ func (s *mspan) isFree(index uintptr) bool { return *bytep&mask == 0 } -func (s *mspan) objIndex(p uintptr) uintptr { - byteOffset := p - s.base() - if byteOffset == 0 { - return 0 - } - if s.baseMask != 0 { - // s.baseMask is non-0, elemsize is a power of two, so shift by s.divShift - return byteOffset >> s.divShift +// divideByElemSize returns n/s.elemsize. +// n must be within [0, s.npages*_PageSize), +// or may be exactly s.npages*_PageSize +// if s.elemsize is from sizeclasses.go. +func (s *mspan) divideByElemSize(n uintptr) uintptr { + const doubleCheck = false + + // See explanation in mksizeclasses.go's computeDivMagic. + q := uintptr((uint64(n) * uint64(s.divMul)) >> 32) + + if doubleCheck && q != n/s.elemsize { + println(n, "/", s.elemsize, "should be", n/s.elemsize, "but got", q) + throw("bad magic division") } - return uintptr(((uint64(byteOffset) >> s.divShift) * uint64(s.divMul)) >> s.divShift2) + return q +} + +func (s *mspan) objIndex(p uintptr) uintptr { + return s.divideByElemSize(p - s.base()) } func markBitsForAddr(p uintptr) markBits { @@ -388,24 +397,9 @@ func findObject(p, refBase, refOff uintptr) (base uintptr, s *mspan, objIndex ui } return } - // If this span holds object of a power of 2 size, just mask off the bits to - // the interior of the object. Otherwise use the size to get the base. - if s.baseMask != 0 { - // optimize for power of 2 sized objects. - base = s.base() - base = base + (p-base)&uintptr(s.baseMask) - objIndex = (base - s.base()) >> s.divShift - // base = p & s.baseMask is faster for small spans, - // but doesn't work for large spans. - // Overall, it's faster to use the more general computation above. 
- } else { - base = s.base() - if p-base >= s.elemsize { - // n := (p - base) / s.elemsize, using division by multiplication - objIndex = uintptr(p-base) >> s.divShift * uintptr(s.divMul) >> s.divShift2 - base += objIndex * s.elemsize - } - } + + objIndex = s.objIndex(p) + base = s.base() + objIndex*s.elemsize return } |