diff options
author | Keith Randall <khr@golang.org> | 2021-05-07 14:14:39 -0700 |
---|---|---|
committer | Ben Shi <powerman1st@163.com> | 2021-05-08 03:27:59 +0000 |
commit | b211fe005860db3ceff5fd56af9951d6d1f44325 (patch) | |
tree | 2d4db9f01381ed1cab1ac47f620d08865e6a78ef /src/cmd/compile/internal/ssa/gen | |
parent | f24eac47710b0170fd45611ab1867e87701e0a95 (diff) | |
download | go-b211fe005860db3ceff5fd56af9951d6d1f44325.tar.gz go-b211fe005860db3ceff5fd56af9951d6d1f44325.zip |
cmd/compile: remove bit operations that modify memory directly
These operations (BT{S,R,C}{Q,L}modify) are quite a bit slower than
other ways of doing the same thing.
Without the BTxmodify operations, there are two fallback ways the compiler
performs these operations: AND/OR/XOR operations directly on memory, or
load-BTx-write sequences. The compiler kinda chooses one arbitrarily
depending on rewrite rule application order. Currently, it uses
load-BTx-write for the Const benchmarks and AND/OR/XOR directly to memory
for the non-Const benchmarks. TBD, someone might investigate which of
the two fallback strategies is really better. For now, they are both
better than BTx ops.
name old time/op new time/op delta
BitSet-8 1.09µs ± 2% 0.64µs ± 5% -41.60% (p=0.000 n=9+10)
BitClear-8 1.15µs ± 3% 0.68µs ± 6% -41.00% (p=0.000 n=10+10)
BitToggle-8 1.18µs ± 4% 0.73µs ± 2% -38.36% (p=0.000 n=10+8)
BitSetConst-8 37.0ns ± 7% 25.8ns ± 2% -30.24% (p=0.000 n=10+10)
BitClearConst-8 30.7ns ± 2% 25.0ns ±12% -18.46% (p=0.000 n=10+10)
BitToggleConst-8 36.9ns ± 1% 23.8ns ± 3% -35.46% (p=0.000 n=9+10)
Fixes #45790
Update #45242
Change-Id: Ie33a72dc139f261af82db15d446cd0855afb4e59
Reviewed-on: https://go-review.googlesource.com/c/go/+/318149
Trust: Keith Randall <khr@golang.org>
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Ben Shi <powerman1st@163.com>
Diffstat (limited to 'src/cmd/compile/internal/ssa/gen')
-rw-r--r-- | src/cmd/compile/internal/ssa/gen/AMD64.rules | 56 | ||||
-rw-r--r-- | src/cmd/compile/internal/ssa/gen/AMD64Ops.go | 19 |
2 files changed, 20 insertions, 55 deletions
diff --git a/src/cmd/compile/internal/ssa/gen/AMD64.rules b/src/cmd/compile/internal/ssa/gen/AMD64.rules index 7a88a488c0..ec91ea1513 100644 --- a/src/cmd/compile/internal/ssa/gen/AMD64.rules +++ b/src/cmd/compile/internal/ssa/gen/AMD64.rules @@ -624,14 +624,6 @@ // Recognize bit setting (a |= 1<<b) and toggling (a ^= 1<<b) (OR(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y) x) => (BTS(Q|L) x y) (XOR(Q|L) (SHL(Q|L) (MOV(Q|L)const [1]) y) x) => (BTC(Q|L) x y) -(ORLmodify [off] {sym} ptr s:(SHLL (MOVLconst [1]) <t> x) mem) => - (BTSLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem) -(ORQmodify [off] {sym} ptr s:(SHLQ (MOVQconst [1]) <t> x) mem) => - (BTSQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem) -(XORLmodify [off] {sym} ptr s:(SHLL (MOVLconst [1]) <t> x) mem) => - (BTCLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem) -(XORQmodify [off] {sym} ptr s:(SHLQ (MOVQconst [1]) <t> x) mem) => - (BTCQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem) // Convert ORconst into BTS, if the code gets smaller, with boundary being // (ORL $40,AX is 3 bytes, ORL $80,AX is 6 bytes). @@ -654,10 +646,6 @@ => (BTRQconst [int8(log64(^c))] x) (ANDL (MOVLconst [c]) x) && isUint32PowerOfTwo(int64(^c)) && uint64(^c) >= 128 => (BTRLconst [int8(log32(^c))] x) -(ANDLmodify [off] {sym} ptr (NOTL s:(SHLL (MOVLconst [1]) <t> x)) mem) => - (BTRLmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem) -(ANDQmodify [off] {sym} ptr (NOTQ s:(SHLQ (MOVQconst [1]) <t> x)) mem) => - (BTRQmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem) // Special-case bit patterns on first/last bit. // generic.rules changes ANDs of high-part/low-part masks into a couple of shifts, @@ -1126,14 +1114,14 @@ ((ADD|SUB|MUL|DIV)SSload [off1+off2] {sym} val base mem) ((ADD|SUB|MUL|DIV)SDload [off1] {sym} val (ADDQconst [off2] base) mem) && is32Bit(int64(off1)+int64(off2)) => ((ADD|SUB|MUL|DIV)SDload [off1+off2] {sym} val base mem) -((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) => - ((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem) -((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) => - ((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem) -((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Qmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) => - ((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Qmodify [off1+off2] {sym} base val mem) -((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Lmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) => - ((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Lmodify [off1+off2] {sym} base val mem) +((ADD|AND|OR|XOR)Qconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) => + ((ADD|AND|OR|XOR)Qconstmodify [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem) +((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym} (ADDQconst [off2] base) mem) && ValAndOff(valoff1).canAdd32(off2) => + ((ADD|AND|OR|XOR)Lconstmodify [ValAndOff(valoff1).addOffset32(off2)] {sym} base mem) +((ADD|SUB|AND|OR|XOR)Qmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) => + ((ADD|SUB|AND|OR|XOR)Qmodify [off1+off2] {sym} base val mem) +((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym} (ADDQconst [off2] base) val mem) && is32Bit(int64(off1)+int64(off2)) => + ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {sym} base val mem) // Fold constants into stores. (MOVQstore [off] {sym} ptr (MOVQconst [c]) mem) && validVal(c) => @@ -1181,18 +1169,18 @@ ((ADD|SUB|MUL|DIV)SDload [off1] {sym1} val (LEAQ [off2] {sym2} base) mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => ((ADD|SUB|MUL|DIV)SDload [off1+off2] {mergeSym(sym1,sym2)} val base mem) -((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem) +((ADD|AND|OR|XOR)Qconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem) && ValAndOff(valoff1).canAdd32(off2) && canMergeSym(sym1, sym2) => - ((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem) -((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem) + ((ADD|AND|OR|XOR)Qconstmodify [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem) +((ADD|AND|OR|XOR)Lconstmodify [valoff1] {sym1} (LEAQ [off2] {sym2} base) mem) && ValAndOff(valoff1).canAdd32(off2) && canMergeSym(sym1, sym2) => - ((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem) -((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Qmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) + ((ADD|AND|OR|XOR)Lconstmodify [ValAndOff(valoff1).addOffset32(off2)] {mergeSym(sym1,sym2)} base mem) +((ADD|SUB|AND|OR|XOR)Qmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => - ((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Qmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem) -((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Lmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) + ((ADD|SUB|AND|OR|XOR)Qmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem) +((ADD|SUB|AND|OR|XOR)Lmodify [off1] {sym1} (LEAQ [off2] {sym2} base) val mem) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => - ((ADD|SUB|AND|OR|XOR|BTC|BTR|BTS)Lmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem) + ((ADD|SUB|AND|OR|XOR)Lmodify [off1+off2] {mergeSym(sym1,sym2)} base val mem) // fold LEAQs together (LEAQ [off1] {sym1} (LEAQ [off2] {sym2} x)) && is32Bit(int64(off1)+int64(off2)) && canMergeSym(sym1, sym2) => @@ -2078,13 +2066,9 @@ (MOVLstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Lload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Lmodify [off] {sym} ptr x mem) (MOVLstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)L l:(MOVLload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) => ((ADD|SUB|AND|OR|XOR)Lmodify [off] {sym} ptr x mem) -(MOVLstore {sym} [off] ptr y:((BTC|BTR|BTS)L l:(MOVLload [off] {sym} ptr mem) <t> x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) => - ((BTC|BTR|BTS)Lmodify [off] {sym} ptr (ANDLconst <t> [31] x) mem) (MOVQstore {sym} [off] ptr y:((ADD|AND|OR|XOR)Qload x [off] {sym} ptr mem) mem) && y.Uses==1 && clobber(y) => ((ADD|AND|OR|XOR)Qmodify [off] {sym} ptr x mem) (MOVQstore {sym} [off] ptr y:((ADD|SUB|AND|OR|XOR)Q l:(MOVQload [off] {sym} ptr mem) x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) => ((ADD|SUB|AND|OR|XOR)Qmodify [off] {sym} ptr x mem) -(MOVQstore {sym} [off] ptr y:((BTC|BTR|BTS)Q l:(MOVQload [off] {sym} ptr mem) <t> x) mem) && y.Uses==1 && l.Uses==1 && clobber(y, l) => - ((BTC|BTR|BTS)Qmodify [off] {sym} ptr (ANDQconst <t> [63] x) mem) // Merge ADDQconst and LEAQ into atomic loads. (MOV(Q|L|B)atomicload [off1] {sym} (ADDQconst [off2] ptr) mem) && is32Bit(int64(off1)+int64(off2)) => @@ -2138,12 +2122,12 @@ (MOVWQZX (MOVBQZX x)) => (MOVBQZX x) (MOVBQZX (MOVBQZX x)) => (MOVBQZX x) -(MOVQstore [off] {sym} ptr a:((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconst [c] l:(MOVQload [off] {sym} ptr2 mem)) mem) +(MOVQstore [off] {sym} ptr a:((ADD|AND|OR|XOR)Qconst [c] l:(MOVQload [off] {sym} ptr2 mem)) mem) && isSamePtr(ptr, ptr2) && a.Uses == 1 && l.Uses == 1 && clobber(l, a) => - ((ADD|AND|OR|XOR|BTC|BTR|BTS)Qconstmodify {sym} [makeValAndOff(int32(c),off)] ptr mem) -(MOVLstore [off] {sym} ptr a:((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconst [c] l:(MOVLload [off] {sym} ptr2 mem)) mem) + ((ADD|AND|OR|XOR)Qconstmodify {sym} [makeValAndOff(int32(c),off)] ptr mem) +(MOVLstore [off] {sym} ptr a:((ADD|AND|OR|XOR)Lconst [c] l:(MOVLload [off] {sym} ptr2 mem)) mem) && isSamePtr(ptr, ptr2) && a.Uses == 1 && l.Uses == 1 && clobber(l, a) => - ((ADD|AND|OR|XOR|BTC|BTR|BTS)Lconstmodify {sym} [makeValAndOff(int32(c),off)] ptr mem) + ((ADD|AND|OR|XOR)Lconstmodify {sym} [makeValAndOff(int32(c),off)] ptr mem) // float <-> int register moves, with no conversion. // These come up when compiling math.{Float{32,64}bits,Float{32,64}frombits}. diff --git a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go index af53cc4f9d..67b3293903 100644 --- a/src/cmd/compile/internal/ssa/gen/AMD64Ops.go +++ b/src/cmd/compile/internal/ssa/gen/AMD64Ops.go @@ -362,25 +362,6 @@ func init() { {name: "BTSLconst", argLength: 1, reg: gp11, asm: "BTSL", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // set bit auxint in arg0, 0 <= auxint < 32 {name: "BTSQconst", argLength: 1, reg: gp11, asm: "BTSQ", resultInArg0: true, clobberFlags: true, aux: "Int8"}, // set bit auxint in arg0, 0 <= auxint < 64 - // direct bit operation on memory operand - // - // Note that these operations do not mask the bit offset (arg1), and will write beyond their expected - // bounds if that argument is larger than 64/32 (for BT*Q and BT*L, respectively). If the compiler - // cannot prove that arg1 is in range, it must be explicitly masked (see e.g. the patterns that produce - // BT*modify from (MOVstore (BT* (MOVLload ptr mem) x) mem)). - {name: "BTCQmodify", argLength: 3, reg: gpstore, asm: "BTCQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // complement bit arg1 in 64-bit arg0+auxint+aux, arg2=mem - {name: "BTCLmodify", argLength: 3, reg: gpstore, asm: "BTCL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // complement bit arg1 in 32-bit arg0+auxint+aux, arg2=mem - {name: "BTSQmodify", argLength: 3, reg: gpstore, asm: "BTSQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // set bit arg1 in 64-bit arg0+auxint+aux, arg2=mem - {name: "BTSLmodify", argLength: 3, reg: gpstore, asm: "BTSL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // set bit arg1 in 32-bit arg0+auxint+aux, arg2=mem - {name: "BTRQmodify", argLength: 3, reg: gpstore, asm: "BTRQ", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // reset bit arg1 in 64-bit arg0+auxint+aux, arg2=mem - {name: "BTRLmodify", argLength: 3, reg: gpstore, asm: "BTRL", aux: "SymOff", typ: "Mem", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // reset bit arg1 in 32-bit arg0+auxint+aux, arg2=mem - {name: "BTCQconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTCQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // complement bit ValAndOff(AuxInt).Val() in 64-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem - {name: "BTCLconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTCL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // complement bit ValAndOff(AuxInt).Val() in 32-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem - {name: "BTSQconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTSQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // set bit ValAndOff(AuxInt).Val() in 64-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem - {name: "BTSLconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTSL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // set bit ValAndOff(AuxInt).Val() in 32-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem - {name: "BTRQconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTRQ", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // reset bit ValAndOff(AuxInt).Val() in 64-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem - {name: "BTRLconstmodify", argLength: 2, reg: gpstoreconst, asm: "BTRL", aux: "SymValAndOff", clobberFlags: true, faultOnNilArg0: true, symEffect: "Read,Write"}, // reset bit ValAndOff(AuxInt).Val() in 32-bit arg0+ValAndOff(AuxInt).Off()+aux, arg1=mem - {name: "TESTQ", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTQ", typ: "Flags"}, // (arg0 & arg1) compare to 0 {name: "TESTL", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTL", typ: "Flags"}, // (arg0 & arg1) compare to 0 {name: "TESTW", argLength: 2, reg: gp2flags, commutative: true, asm: "TESTW", typ: "Flags"}, // (arg0 & arg1) compare to 0 |