aboutsummaryrefslogtreecommitdiff
path: root/queue.go
blob: da4a7b808c3d43bbfe7deaa14727742251e2b6da (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
package crawl

import (
	"bytes"
	"encoding/binary"
	"errors"
	"math/rand"
	"sync/atomic"
	"time"

	"github.com/syndtr/goleveldb/leveldb"
)

// queue is a work queue stored in db. Items are written under the
// pending key prefix by Add, moved under the active key prefix by
// acquire (called from Scan), and removed by Release or re-queued by
// Retry. numActive counts the items acquired but not yet released or
// retried; acquire, Release and Retry update it through sync/atomic.
type queue struct {
	db        *gobDB
	numActive int32
}

var (
	// Key prefixes. Pending entries are stored under queuePrefix (see
	// Add); entries being processed are stored under activePrefix
	// followed by their original pending key (see activeQueueKey).
	queuePrefix  = []byte("queue")
	activePrefix = []byte("queue_active")

	// queueKeySep separates the components of a queue key.
	// queueKeySepP1 is the byte immediately after the separator; it is
	// not referenced in this part of the file.
	// NOTE(review): presumably meant as an exclusive upper bound for
	// prefix range scans - confirm or remove.
	queueKeySep   = []byte{'/'}
	queueKeySepP1 = []byte{'/' + 1}
)

// queuePair is a single queue entry: a URL to process and its depth.
type queuePair struct {
	// key is the database key of the pending entry. It is filled in
	// from the iterator by Scan and Recover, not by Add.
	// NOTE(review): being unexported, it is presumably not part of the
	// serialized value - confirm against the gobDB encoder.
	key []byte

	// Payload stored as the entry's value.
	URL   string
	Depth int
}

// Scan iterates over the pending queue within the key range returned
// by queueScanRange (entries scheduled up to the current time), marks
// each entry as active via acquire and sends it on 'ch'. Entries whose
// value cannot be decoded are skipped.
//
// It returns a non-nil error ("EOF") only when no pending entries were
// found and no entries are currently active, meaning that all work is
// done. In every other case it returns nil.
func (q *queue) Scan(ch chan<- queuePair) error {
	n := 0
	startKey, endKey := queueScanRange()
	iter := q.db.NewRangeIterator(startKey, endKey)
	defer iter.Release()

	for iter.Next() {
		var p queuePair
		if err := iter.Value(&p); err != nil {
			continue
		}
		// Copy the key before handing the item to a consumer: LevelDB
		// iterators may reuse the key buffer on the next call to Next,
		// and the receiver keeps p.key around until Release or Retry.
		p.key = append([]byte(nil), iter.Key()...)
		q.acquire(p)
		ch <- p
		n++
	}

	// numActive is modified with atomic operations by acquire, Release
	// and Retry, possibly from other goroutines, so it must be read
	// atomically as well.
	if n == 0 && atomic.LoadInt32(&q.numActive) == 0 {
		return errors.New("EOF")
	}
	return nil
}

// Add appends to the batch 'wb' a new pending entry for urlStr at the
// given depth. The key is made of the pending prefix, the big-endian
// nanosecond timestamp of 'when' and a random 64-bit suffix, joined by
// the key separator.
func (q *queue) Add(wb *leveldb.Batch, urlStr string, depth int, when time.Time) {
	stamp := encodeUint64(uint64(when.UnixNano()))
	suffix := encodeUint64(uint64(rand.Int63()))
	parts := [][]byte{queuePrefix, stamp, suffix}

	item := &queuePair{URL: urlStr, Depth: depth}
	q.db.PutObjBatch(wb, bytes.Join(parts, queueKeySep), item)
}

// acquire marks qp as active: in a single batch it stores qp under its
// active key and deletes the pending key, writes the batch to the
// database, then increments the active counter.
// NOTE(review): the result of the database write is not inspected here.
func (q *queue) acquire(qp queuePair) {
	batch := new(leveldb.Batch)
	activeKey := activeQueueKey(qp.key)
	q.db.PutObjBatch(batch, activeKey, qp)
	batch.Delete(qp.key)
	q.db.Write(batch, nil)

	atomic.AddInt32(&q.numActive, 1)
}

// Release signals that processing of qp is complete. The deletion of
// its active entry is appended to 'wb' (the caller is expected to write
// the batch) and the active counter is decremented.
func (q *queue) Release(wb *leveldb.Batch, qp queuePair) {
	activeKey := activeQueueKey(qp.key)
	wb.Delete(activeKey)
	atomic.AddInt32(&q.numActive, -1)
}

// Retry reschedules qp for processing after 'delay'. Both the deletion
// of its active entry and a fresh pending entry, timestamped
// now+delay, are appended to 'wb'; the active counter is decremented.
func (q *queue) Retry(wb *leveldb.Batch, qp queuePair, delay time.Duration) {
	activeKey := activeQueueKey(qp.key)
	wb.Delete(activeKey)

	retryAt := time.Now().Add(delay)
	q.Add(wb, qp.URL, qp.Depth, retryAt)

	atomic.AddInt32(&q.numActive, -1)
}

// Recover puts every active entry back into the pending queue under
// its original pending key, so that tasks that were in progress when
// the previous run stopped get processed again. It is meant to be
// called at startup. Entries that fail to decode are left untouched.
func (q *queue) Recover() {
	batch := new(leveldb.Batch)

	activeKeyPrefix := bytes.Join([][]byte{activePrefix, []byte{}}, queueKeySep)
	it := q.db.NewPrefixIterator(activeKeyPrefix)
	defer it.Release()

	// Number of leading bytes to strip from an active key in order to
	// get back the pending key: the active prefix plus one separator.
	strip := len(activePrefix) + 1

	for it.Next() {
		var item queuePair
		if it.Value(&item) != nil {
			continue
		}
		activeKey := it.Key()
		item.key = activeKey[strip:]
		q.db.PutObjBatch(batch, item.key, &item)
		batch.Delete(activeKey)
	}

	q.db.Write(batch, nil)
}

// encodeUint64 returns the 8-byte big-endian encoding of n. With this
// encoding, byte-wise comparison of the results matches the numeric
// order of the inputs.
func encodeUint64(n uint64) []byte {
	out := make([]byte, 8)
	binary.BigEndian.PutUint64(out, n)
	return out
}

// activeQueueKey returns the key under which the entry with pending
// key 'key' is stored while active: the active prefix, the separator,
// then 'key' itself.
func activeQueueKey(key []byte) []byte {
	parts := [][]byte{activePrefix, key}
	return bytes.Join(parts, queueKeySep)
}

func queueScanRange() ([]byte, []byte) {
	tlim := uint64(time.Now().UnixNano() + 1)
	startKey := bytes.Join([][]byte{queuePrefix, []byte{}}, queueKeySep)
	endKey := bytes.Join([][]byte{queuePrefix, encodeUint64(tlim)}, queueKeySep)
	return startKey, endKey
}