aboutsummaryrefslogtreecommitdiff
path: root/vendor/github.com/syndtr/goleveldb/leveldb/db_compaction.go
diff options
context:
space:
mode:
Diffstat (limited to 'vendor/github.com/syndtr/goleveldb/leveldb/db_compaction.go')
-rw-r--r--vendor/github.com/syndtr/goleveldb/leveldb/db_compaction.go826
1 files changed, 826 insertions, 0 deletions
diff --git a/vendor/github.com/syndtr/goleveldb/leveldb/db_compaction.go b/vendor/github.com/syndtr/goleveldb/leveldb/db_compaction.go
new file mode 100644
index 0000000..b6563e8
--- /dev/null
+++ b/vendor/github.com/syndtr/goleveldb/leveldb/db_compaction.go
@@ -0,0 +1,826 @@
+// Copyright (c) 2012, Suryandaru Triandana <syndtr@gmail.com>
+// All rights reserved.
+//
+// Use of this source code is governed by a BSD-style license that can be
+// found in the LICENSE file.
+
+package leveldb
+
+import (
+ "sync"
+ "time"
+
+ "github.com/syndtr/goleveldb/leveldb/errors"
+ "github.com/syndtr/goleveldb/leveldb/opt"
+ "github.com/syndtr/goleveldb/leveldb/storage"
+)
+
+var (
+ errCompactionTransactExiting = errors.New("leveldb: compaction transact exiting")
+)
+
+type cStat struct {
+ duration time.Duration
+ read int64
+ write int64
+}
+
+func (p *cStat) add(n *cStatStaging) {
+ p.duration += n.duration
+ p.read += n.read
+ p.write += n.write
+}
+
+func (p *cStat) get() (duration time.Duration, read, write int64) {
+ return p.duration, p.read, p.write
+}
+
+type cStatStaging struct {
+ start time.Time
+ duration time.Duration
+ on bool
+ read int64
+ write int64
+}
+
+func (p *cStatStaging) startTimer() {
+ if !p.on {
+ p.start = time.Now()
+ p.on = true
+ }
+}
+
+func (p *cStatStaging) stopTimer() {
+ if p.on {
+ p.duration += time.Since(p.start)
+ p.on = false
+ }
+}
+
+type cStats struct {
+ lk sync.Mutex
+ stats []cStat
+}
+
+func (p *cStats) addStat(level int, n *cStatStaging) {
+ p.lk.Lock()
+ if level >= len(p.stats) {
+ newStats := make([]cStat, level+1)
+ copy(newStats, p.stats)
+ p.stats = newStats
+ }
+ p.stats[level].add(n)
+ p.lk.Unlock()
+}
+
+func (p *cStats) getStat(level int) (duration time.Duration, read, write int64) {
+ p.lk.Lock()
+ defer p.lk.Unlock()
+ if level < len(p.stats) {
+ return p.stats[level].get()
+ }
+ return
+}
+
+func (db *DB) compactionError() {
+ var err error
+noerr:
+ // No error.
+ for {
+ select {
+ case err = <-db.compErrSetC:
+ switch {
+ case err == nil:
+ case err == ErrReadOnly, errors.IsCorrupted(err):
+ goto hasperr
+ default:
+ goto haserr
+ }
+ case <-db.closeC:
+ return
+ }
+ }
+haserr:
+ // Transient error.
+ for {
+ select {
+ case db.compErrC <- err:
+ case err = <-db.compErrSetC:
+ switch {
+ case err == nil:
+ goto noerr
+ case err == ErrReadOnly, errors.IsCorrupted(err):
+ goto hasperr
+ default:
+ }
+ case <-db.closeC:
+ return
+ }
+ }
+hasperr:
+ // Persistent error.
+ for {
+ select {
+ case db.compErrC <- err:
+ case db.compPerErrC <- err:
+ case db.writeLockC <- struct{}{}:
+ // Hold write lock, so that write won't pass-through.
+ db.compWriteLocking = true
+ case <-db.closeC:
+ if db.compWriteLocking {
+ // We should release the lock or Close will hang.
+ <-db.writeLockC
+ }
+ return
+ }
+ }
+}
+
+type compactionTransactCounter int
+
+func (cnt *compactionTransactCounter) incr() {
+ *cnt++
+}
+
+type compactionTransactInterface interface {
+ run(cnt *compactionTransactCounter) error
+ revert() error
+}
+
+func (db *DB) compactionTransact(name string, t compactionTransactInterface) {
+ defer func() {
+ if x := recover(); x != nil {
+ if x == errCompactionTransactExiting {
+ if err := t.revert(); err != nil {
+ db.logf("%s revert error %q", name, err)
+ }
+ }
+ panic(x)
+ }
+ }()
+
+ const (
+ backoffMin = 1 * time.Second
+ backoffMax = 8 * time.Second
+ backoffMul = 2 * time.Second
+ )
+ var (
+ backoff = backoffMin
+ backoffT = time.NewTimer(backoff)
+ lastCnt = compactionTransactCounter(0)
+
+ disableBackoff = db.s.o.GetDisableCompactionBackoff()
+ )
+ for n := 0; ; n++ {
+ // Check whether the DB is closed.
+ if db.isClosed() {
+ db.logf("%s exiting", name)
+ db.compactionExitTransact()
+ } else if n > 0 {
+ db.logf("%s retrying N·%d", name, n)
+ }
+
+ // Execute.
+ cnt := compactionTransactCounter(0)
+ err := t.run(&cnt)
+ if err != nil {
+ db.logf("%s error I·%d %q", name, cnt, err)
+ }
+
+ // Set compaction error status.
+ select {
+ case db.compErrSetC <- err:
+ case perr := <-db.compPerErrC:
+ if err != nil {
+ db.logf("%s exiting (persistent error %q)", name, perr)
+ db.compactionExitTransact()
+ }
+ case <-db.closeC:
+ db.logf("%s exiting", name)
+ db.compactionExitTransact()
+ }
+ if err == nil {
+ return
+ }
+ if errors.IsCorrupted(err) {
+ db.logf("%s exiting (corruption detected)", name)
+ db.compactionExitTransact()
+ }
+
+ if !disableBackoff {
+ // Reset backoff duration if counter is advancing.
+ if cnt > lastCnt {
+ backoff = backoffMin
+ lastCnt = cnt
+ }
+
+ // Backoff.
+ backoffT.Reset(backoff)
+ if backoff < backoffMax {
+ backoff *= backoffMul
+ if backoff > backoffMax {
+ backoff = backoffMax
+ }
+ }
+ select {
+ case <-backoffT.C:
+ case <-db.closeC:
+ db.logf("%s exiting", name)
+ db.compactionExitTransact()
+ }
+ }
+ }
+}
+
+type compactionTransactFunc struct {
+ runFunc func(cnt *compactionTransactCounter) error
+ revertFunc func() error
+}
+
+func (t *compactionTransactFunc) run(cnt *compactionTransactCounter) error {
+ return t.runFunc(cnt)
+}
+
+func (t *compactionTransactFunc) revert() error {
+ if t.revertFunc != nil {
+ return t.revertFunc()
+ }
+ return nil
+}
+
+func (db *DB) compactionTransactFunc(name string, run func(cnt *compactionTransactCounter) error, revert func() error) {
+ db.compactionTransact(name, &compactionTransactFunc{run, revert})
+}
+
+func (db *DB) compactionExitTransact() {
+ panic(errCompactionTransactExiting)
+}
+
+func (db *DB) compactionCommit(name string, rec *sessionRecord) {
+ db.compCommitLk.Lock()
+ defer db.compCommitLk.Unlock() // Defer is necessary.
+ db.compactionTransactFunc(name+"@commit", func(cnt *compactionTransactCounter) error {
+ return db.s.commit(rec)
+ }, nil)
+}
+
+func (db *DB) memCompaction() {
+ mdb := db.getFrozenMem()
+ if mdb == nil {
+ return
+ }
+ defer mdb.decref()
+
+ db.logf("memdb@flush N·%d S·%s", mdb.Len(), shortenb(mdb.Size()))
+
+ // Don't compact empty memdb.
+ if mdb.Len() == 0 {
+ db.logf("memdb@flush skipping")
+ // drop frozen memdb
+ db.dropFrozenMem()
+ return
+ }
+
+ // Pause table compaction.
+ resumeC := make(chan struct{})
+ select {
+ case db.tcompPauseC <- (chan<- struct{})(resumeC):
+ case <-db.compPerErrC:
+ close(resumeC)
+ resumeC = nil
+ case <-db.closeC:
+ db.compactionExitTransact()
+ }
+
+ var (
+ rec = &sessionRecord{}
+ stats = &cStatStaging{}
+ flushLevel int
+ )
+
+ // Generate tables.
+ db.compactionTransactFunc("memdb@flush", func(cnt *compactionTransactCounter) (err error) {
+ stats.startTimer()
+ flushLevel, err = db.s.flushMemdb(rec, mdb.DB, db.memdbMaxLevel)
+ stats.stopTimer()
+ return
+ }, func() error {
+ for _, r := range rec.addedTables {
+ db.logf("memdb@flush revert @%d", r.num)
+ if err := db.s.stor.Remove(storage.FileDesc{Type: storage.TypeTable, Num: r.num}); err != nil {
+ return err
+ }
+ }
+ return nil
+ })
+
+ rec.setJournalNum(db.journalFd.Num)
+ rec.setSeqNum(db.frozenSeq)
+
+ // Commit.
+ stats.startTimer()
+ db.compactionCommit("memdb", rec)
+ stats.stopTimer()
+
+ db.logf("memdb@flush committed F·%d T·%v", len(rec.addedTables), stats.duration)
+
+ for _, r := range rec.addedTables {
+ stats.write += r.size
+ }
+ db.compStats.addStat(flushLevel, stats)
+
+ // Drop frozen memdb.
+ db.dropFrozenMem()
+
+ // Resume table compaction.
+ if resumeC != nil {
+ select {
+ case <-resumeC:
+ close(resumeC)
+ case <-db.closeC:
+ db.compactionExitTransact()
+ }
+ }
+
+ // Trigger table compaction.
+ db.compTrigger(db.tcompCmdC)
+}
+
+type tableCompactionBuilder struct {
+ db *DB
+ s *session
+ c *compaction
+ rec *sessionRecord
+ stat0, stat1 *cStatStaging
+
+ snapHasLastUkey bool
+ snapLastUkey []byte
+ snapLastSeq uint64
+ snapIter int
+ snapKerrCnt int
+ snapDropCnt int
+
+ kerrCnt int
+ dropCnt int
+
+ minSeq uint64
+ strict bool
+ tableSize int
+
+ tw *tWriter
+}
+
+func (b *tableCompactionBuilder) appendKV(key, value []byte) error {
+ // Create new table if not already.
+ if b.tw == nil {
+ // Check for pause event.
+ if b.db != nil {
+ select {
+ case ch := <-b.db.tcompPauseC:
+ b.db.pauseCompaction(ch)
+ case <-b.db.closeC:
+ b.db.compactionExitTransact()
+ default:
+ }
+ }
+
+ // Create new table.
+ var err error
+ b.tw, err = b.s.tops.create()
+ if err != nil {
+ return err
+ }
+ }
+
+ // Write key/value into table.
+ return b.tw.append(key, value)
+}
+
+func (b *tableCompactionBuilder) needFlush() bool {
+ return b.tw.tw.BytesLen() >= b.tableSize
+}
+
+func (b *tableCompactionBuilder) flush() error {
+ t, err := b.tw.finish()
+ if err != nil {
+ return err
+ }
+ b.rec.addTableFile(b.c.sourceLevel+1, t)
+ b.stat1.write += t.size
+ b.s.logf("table@build created L%d@%d N·%d S·%s %q:%q", b.c.sourceLevel+1, t.fd.Num, b.tw.tw.EntriesLen(), shortenb(int(t.size)), t.imin, t.imax)
+ b.tw = nil
+ return nil
+}
+
+func (b *tableCompactionBuilder) cleanup() {
+ if b.tw != nil {
+ b.tw.drop()
+ b.tw = nil
+ }
+}
+
+func (b *tableCompactionBuilder) run(cnt *compactionTransactCounter) error {
+ snapResumed := b.snapIter > 0
+ hasLastUkey := b.snapHasLastUkey // The key might has zero length, so this is necessary.
+ lastUkey := append([]byte{}, b.snapLastUkey...)
+ lastSeq := b.snapLastSeq
+ b.kerrCnt = b.snapKerrCnt
+ b.dropCnt = b.snapDropCnt
+ // Restore compaction state.
+ b.c.restore()
+
+ defer b.cleanup()
+
+ b.stat1.startTimer()
+ defer b.stat1.stopTimer()
+
+ iter := b.c.newIterator()
+ defer iter.Release()
+ for i := 0; iter.Next(); i++ {
+ // Incr transact counter.
+ cnt.incr()
+
+ // Skip until last state.
+ if i < b.snapIter {
+ continue
+ }
+
+ resumed := false
+ if snapResumed {
+ resumed = true
+ snapResumed = false
+ }
+
+ ikey := iter.Key()
+ ukey, seq, kt, kerr := parseInternalKey(ikey)
+
+ if kerr == nil {
+ shouldStop := !resumed && b.c.shouldStopBefore(ikey)
+
+ if !hasLastUkey || b.s.icmp.uCompare(lastUkey, ukey) != 0 {
+ // First occurrence of this user key.
+
+ // Only rotate tables if ukey doesn't hop across.
+ if b.tw != nil && (shouldStop || b.needFlush()) {
+ if err := b.flush(); err != nil {
+ return err
+ }
+
+ // Creates snapshot of the state.
+ b.c.save()
+ b.snapHasLastUkey = hasLastUkey
+ b.snapLastUkey = append(b.snapLastUkey[:0], lastUkey...)
+ b.snapLastSeq = lastSeq
+ b.snapIter = i
+ b.snapKerrCnt = b.kerrCnt
+ b.snapDropCnt = b.dropCnt
+ }
+
+ hasLastUkey = true
+ lastUkey = append(lastUkey[:0], ukey...)
+ lastSeq = keyMaxSeq
+ }
+
+ switch {
+ case lastSeq <= b.minSeq:
+ // Dropped because newer entry for same user key exist
+ fallthrough // (A)
+ case kt == keyTypeDel && seq <= b.minSeq && b.c.baseLevelForKey(lastUkey):
+ // For this user key:
+ // (1) there is no data in higher levels
+ // (2) data in lower levels will have larger seq numbers
+ // (3) data in layers that are being compacted here and have
+ // smaller seq numbers will be dropped in the next
+ // few iterations of this loop (by rule (A) above).
+ // Therefore this deletion marker is obsolete and can be dropped.
+ lastSeq = seq
+ b.dropCnt++
+ continue
+ default:
+ lastSeq = seq
+ }
+ } else {
+ if b.strict {
+ return kerr
+ }
+
+ // Don't drop corrupted keys.
+ hasLastUkey = false
+ lastUkey = lastUkey[:0]
+ lastSeq = keyMaxSeq
+ b.kerrCnt++
+ }
+
+ if err := b.appendKV(ikey, iter.Value()); err != nil {
+ return err
+ }
+ }
+
+ if err := iter.Error(); err != nil {
+ return err
+ }
+
+ // Finish last table.
+ if b.tw != nil && !b.tw.empty() {
+ return b.flush()
+ }
+ return nil
+}
+
+func (b *tableCompactionBuilder) revert() error {
+ for _, at := range b.rec.addedTables {
+ b.s.logf("table@build revert @%d", at.num)
+ if err := b.s.stor.Remove(storage.FileDesc{Type: storage.TypeTable, Num: at.num}); err != nil {
+ return err
+ }
+ }
+ return nil
+}
+
+func (db *DB) tableCompaction(c *compaction, noTrivial bool) {
+ defer c.release()
+
+ rec := &sessionRecord{}
+ rec.addCompPtr(c.sourceLevel, c.imax)
+
+ if !noTrivial && c.trivial() {
+ t := c.levels[0][0]
+ db.logf("table@move L%d@%d -> L%d", c.sourceLevel, t.fd.Num, c.sourceLevel+1)
+ rec.delTable(c.sourceLevel, t.fd.Num)
+ rec.addTableFile(c.sourceLevel+1, t)
+ db.compactionCommit("table-move", rec)
+ return
+ }
+
+ var stats [2]cStatStaging
+ for i, tables := range c.levels {
+ for _, t := range tables {
+ stats[i].read += t.size
+ // Insert deleted tables into record
+ rec.delTable(c.sourceLevel+i, t.fd.Num)
+ }
+ }
+ sourceSize := int(stats[0].read + stats[1].read)
+ minSeq := db.minSeq()
+ db.logf("table@compaction L%d·%d -> L%d·%d S·%s Q·%d", c.sourceLevel, len(c.levels[0]), c.sourceLevel+1, len(c.levels[1]), shortenb(sourceSize), minSeq)
+
+ b := &tableCompactionBuilder{
+ db: db,
+ s: db.s,
+ c: c,
+ rec: rec,
+ stat1: &stats[1],
+ minSeq: minSeq,
+ strict: db.s.o.GetStrict(opt.StrictCompaction),
+ tableSize: db.s.o.GetCompactionTableSize(c.sourceLevel + 1),
+ }
+ db.compactionTransact("table@build", b)
+
+ // Commit.
+ stats[1].startTimer()
+ db.compactionCommit("table", rec)
+ stats[1].stopTimer()
+
+ resultSize := int(stats[1].write)
+ db.logf("table@compaction committed F%s S%s Ke·%d D·%d T·%v", sint(len(rec.addedTables)-len(rec.deletedTables)), sshortenb(resultSize-sourceSize), b.kerrCnt, b.dropCnt, stats[1].duration)
+
+ // Save compaction stats
+ for i := range stats {
+ db.compStats.addStat(c.sourceLevel+1, &stats[i])
+ }
+}
+
+func (db *DB) tableRangeCompaction(level int, umin, umax []byte) error {
+ db.logf("table@compaction range L%d %q:%q", level, umin, umax)
+ if level >= 0 {
+ if c := db.s.getCompactionRange(level, umin, umax, true); c != nil {
+ db.tableCompaction(c, true)
+ }
+ } else {
+ // Retry until nothing to compact.
+ for {
+ compacted := false
+
+ // Scan for maximum level with overlapped tables.
+ v := db.s.version()
+ m := 1
+ for i := m; i < len(v.levels); i++ {
+ tables := v.levels[i]
+ if tables.overlaps(db.s.icmp, umin, umax, false) {
+ m = i
+ }
+ }
+ v.release()
+
+ for level := 0; level < m; level++ {
+ if c := db.s.getCompactionRange(level, umin, umax, false); c != nil {
+ db.tableCompaction(c, true)
+ compacted = true
+ }
+ }
+
+ if !compacted {
+ break
+ }
+ }
+ }
+
+ return nil
+}
+
+func (db *DB) tableAutoCompaction() {
+ if c := db.s.pickCompaction(); c != nil {
+ db.tableCompaction(c, false)
+ }
+}
+
+func (db *DB) tableNeedCompaction() bool {
+ v := db.s.version()
+ defer v.release()
+ return v.needCompaction()
+}
+
+func (db *DB) pauseCompaction(ch chan<- struct{}) {
+ select {
+ case ch <- struct{}{}:
+ case <-db.closeC:
+ db.compactionExitTransact()
+ }
+}
+
+type cCmd interface {
+ ack(err error)
+}
+
+type cAuto struct {
+ ackC chan<- error
+}
+
+func (r cAuto) ack(err error) {
+ if r.ackC != nil {
+ defer func() {
+ recover()
+ }()
+ r.ackC <- err
+ }
+}
+
+type cRange struct {
+ level int
+ min, max []byte
+ ackC chan<- error
+}
+
+func (r cRange) ack(err error) {
+ if r.ackC != nil {
+ defer func() {
+ recover()
+ }()
+ r.ackC <- err
+ }
+}
+
+// This will trigger auto compaction but will not wait for it.
+func (db *DB) compTrigger(compC chan<- cCmd) {
+ select {
+ case compC <- cAuto{}:
+ default:
+ }
+}
+
+// This will trigger auto compaction and/or wait for all compaction to be done.
+func (db *DB) compTriggerWait(compC chan<- cCmd) (err error) {
+ ch := make(chan error)
+ defer close(ch)
+ // Send cmd.
+ select {
+ case compC <- cAuto{ch}:
+ case err = <-db.compErrC:
+ return
+ case <-db.closeC:
+ return ErrClosed
+ }
+ // Wait cmd.
+ select {
+ case err = <-ch:
+ case err = <-db.compErrC:
+ case <-db.closeC:
+ return ErrClosed
+ }
+ return err
+}
+
+// Send range compaction request.
+func (db *DB) compTriggerRange(compC chan<- cCmd, level int, min, max []byte) (err error) {
+ ch := make(chan error)
+ defer close(ch)
+ // Send cmd.
+ select {
+ case compC <- cRange{level, min, max, ch}:
+ case err := <-db.compErrC:
+ return err
+ case <-db.closeC:
+ return ErrClosed
+ }
+ // Wait cmd.
+ select {
+ case err = <-ch:
+ case err = <-db.compErrC:
+ case <-db.closeC:
+ return ErrClosed
+ }
+ return err
+}
+
+func (db *DB) mCompaction() {
+ var x cCmd
+
+ defer func() {
+ if x := recover(); x != nil {
+ if x != errCompactionTransactExiting {
+ panic(x)
+ }
+ }
+ if x != nil {
+ x.ack(ErrClosed)
+ }
+ db.closeW.Done()
+ }()
+
+ for {
+ select {
+ case x = <-db.mcompCmdC:
+ switch x.(type) {
+ case cAuto:
+ db.memCompaction()
+ x.ack(nil)
+ x = nil
+ default:
+ panic("leveldb: unknown command")
+ }
+ case <-db.closeC:
+ return
+ }
+ }
+}
+
+func (db *DB) tCompaction() {
+ var x cCmd
+ var ackQ []cCmd
+
+ defer func() {
+ if x := recover(); x != nil {
+ if x != errCompactionTransactExiting {
+ panic(x)
+ }
+ }
+ for i := range ackQ {
+ ackQ[i].ack(ErrClosed)
+ ackQ[i] = nil
+ }
+ if x != nil {
+ x.ack(ErrClosed)
+ }
+ db.closeW.Done()
+ }()
+
+ for {
+ if db.tableNeedCompaction() {
+ select {
+ case x = <-db.tcompCmdC:
+ case ch := <-db.tcompPauseC:
+ db.pauseCompaction(ch)
+ continue
+ case <-db.closeC:
+ return
+ default:
+ }
+ } else {
+ for i := range ackQ {
+ ackQ[i].ack(nil)
+ ackQ[i] = nil
+ }
+ ackQ = ackQ[:0]
+ select {
+ case x = <-db.tcompCmdC:
+ case ch := <-db.tcompPauseC:
+ db.pauseCompaction(ch)
+ continue
+ case <-db.closeC:
+ return
+ }
+ }
+ if x != nil {
+ switch cmd := x.(type) {
+ case cAuto:
+ ackQ = append(ackQ, x)
+ case cRange:
+ x.ack(db.tableRangeCompaction(cmd.level, cmd.min, cmd.max))
+ default:
+ panic("leveldb: unknown command")
+ }
+ x = nil
+ }
+ db.tableAutoCompaction()
+ }
+}