aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorJordan <me@jordan.im>2021-12-03 07:19:28 -0700
committerJordan <me@jordan.im>2021-12-03 07:19:28 -0700
commitb0835fc63877c63e88e75840c5d52d3167ad5e10 (patch)
tree9db697711cab98fa9c49f87d40d6f7212baae19c
downloadkeep-b0835fc63877c63e88e75840c5d52d3167ad5e10.tar.gz
keep-b0835fc63877c63e88e75840c5d52d3167ad5e10.zip
initial commit
-rw-r--r--.gitignore3
-rw-r--r--Makefile29
-rw-r--r--README.md21
-rw-r--r--archive.go73
-rw-r--r--archive_test.go24
-rw-r--r--db.go78
-rw-r--r--db_test.go48
-rw-r--r--go.mod9
-rw-r--r--go.sum15
-rw-r--r--keep.go187
-rw-r--r--keep.json4
11 files changed, 491 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..1abb790
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,3 @@
+*.swp
+*.swo
+keep
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..ddff8eb
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,29 @@
+# Build, test and install targets for keep.
+.POSIX:
+.SUFFIXES:
+
+GO = go
+RM = rm
+GOFLAGS =
+PREFIX = /usr/local
+BINDIR = $(PREFIX)/bin
+CONFIGDIR = $(HOME)/.keep
+
+goflags = $(GOFLAGS)
+
+# Files the binary is built from. Listing them as prerequisites lets make
+# rebuild keep when any of them changes instead of treating an existing
+# binary as always up to date.
+SRC = archive.go db.go keep.go go.mod go.sum
+
+all: keep
+
+# NOTE(review): -X only has an effect when package main declares a string
+# variable named buildPrefix; archive.go, db.go and keep.go do not declare
+# one, so this flag looks like a no-op at the moment -- confirm.
+keep: $(SRC)
+	$(GO) build $(goflags) -ldflags "-X main.buildPrefix=$(PREFIX)"
+
+clean:
+	$(RM) -f keep
+
+test:
+	$(GO) test -v ./...
+
+# keep.json is copied only when no configuration is installed yet, so an
+# existing token is never overwritten. An explicit existence test is used
+# because `cp -n` is not portable across cp implementations.
+install: all
+	mkdir -p $(DESTDIR)$(BINDIR)
+	mkdir -p $(DESTDIR)$(CONFIGDIR)
+	cp -f keep $(DESTDIR)$(BINDIR)
+	test -e $(DESTDIR)$(CONFIGDIR)/keep.json || cp keep.json $(DESTDIR)$(CONFIGDIR)
+
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..f5046de
--- /dev/null
+++ b/README.md
@@ -0,0 +1,21 @@
+# Keep
+
+Keep is a minimal Discord bot which saves to the Wayback Machine any URLs
+parsed from messages visible to the configured account.
+
+A local cache of saved URLs is kept to prevent duplicate availability API
+requests.
+
+## Installation
+
+Keep can be compiled with `make` or `go build`, and installed system-wide by
+running `make install` with root-level permissions. Tests can be run with `make
+test`.
+
+## Usage
+
+```
+Usage of ./keep:
+ -config string
+ path to configuration file (default "/home/jordan/.keep/keep.json")
+```
diff --git a/archive.go b/archive.go
new file mode 100644
index 0000000..c88cff7
--- /dev/null
+++ b/archive.go
@@ -0,0 +1,73 @@
+package main
+
+import (
+ "encoding/json"
+ "log"
+ "net/http"
+ "strconv"
+ "time"
+)
+
+var (
+	// API_AVAILABILITY is the Wayback Machine availability endpoint; the URL
+	// being looked up is appended to it. HTTPS is used so looked-up URLs are
+	// not sent in clear text, matching API_SAVE.
+	API_AVAILABILITY string = "https://archive.org/wayback/available?url="
+	// API_SAVE is the Wayback Machine save endpoint; the URL to archive is
+	// appended to it.
+	API_SAVE string = "https://web.archive.org/save/"
+
+	// TIMEOUT is a plain count of seconds (not a ready-made duration); it is
+	// scaled by time.Second when the shared client is built.
+	TIMEOUT time.Duration = 25
+	// client is the single HTTP client shared by all Wayback requests.
+	client *http.Client = &http.Client{Timeout: TIMEOUT * time.Second}
+
+	// blacklist lists the hosts rejected by isBlacklisted.
+	blacklist = []string{"cdn.discordapp.com", "discord.com", "tenor.com",
+		"c.tenor.com", "archive.org", "web.archive.org", "youtu.be",
+		"youtube.com", "www.youtube.com"}
+)
+
+// Response types for the Wayback Machine availability API. Only the fields
+// read by isArchived are declared.
+type (
+	// Wayback is the top-level availability response.
+	Wayback struct {
+		Snapshots Snapshot `json:"archived_snapshots,omitempty"`
+	}
+
+	// Snapshot is the "archived_snapshots" object.
+	Snapshot struct {
+		Recent Closest `json:"closest"`
+	}
+
+	// Closest is the "closest" snapshot entry; Status arrives as a string
+	// and is converted to an integer by the caller.
+	Closest struct {
+		Available bool   `json:"available"`
+		Status    string `json:"status"`
+	}
+)
+
+// isBlacklisted reports whether host is exactly equal to one of the entries
+// in blacklist.
+func isBlacklisted(host string) bool {
+	for i := 0; i < len(blacklist); i++ {
+		if blacklist[i] == host {
+			return true
+		}
+	}
+	return false
+}
+
+// isArchived asks the Wayback Machine availability API whether a snapshot of
+// url exists. It returns the snapshot's "available" flag together with the
+// HTTP status recorded for it. (false, 0) is returned, after logging the
+// error, when the request cannot be built or sent or when the response cannot
+// be decoded.
+func isArchived(url string) (bool, int) {
+
+	req, err := http.NewRequest("GET", API_AVAILABILITY, nil)
+	if err != nil {
+		log.Println(err)
+		return false, 0
+	}
+
+	// Set the target as an escaped query value. Plain concatenation would
+	// let '&' or '#' inside the target be read as part of the API request
+	// itself, truncating the URL being looked up.
+	query := req.URL.Query()
+	query.Set("url", url)
+	req.URL.RawQuery = query.Encode()
+
+	resp, err := client.Do(req)
+	if err != nil {
+		log.Println(err)
+		return false, 0
+	}
+	// Release the connection once the body has been decoded
+	defer resp.Body.Close()
+
+	av := &Wayback{}
+	if err := json.NewDecoder(resp.Body).Decode(av); err != nil {
+		log.Println(err)
+		return false, 0
+	}
+
+	// Status is delivered as a string; a missing or non-numeric value
+	// deliberately maps to 0
+	status, _ := strconv.Atoi(av.Snapshots.Recent.Status)
+	return av.Snapshots.Recent.Available, status
+}
+
+// archive requests a Wayback Machine capture of url and returns the HTTP
+// status code of the save request, or 0 when the request could not be built
+// or sent.
+func archive(url string) int {
+
+	req, err := http.NewRequest("GET", API_SAVE+url, nil)
+	if err != nil {
+		// A nil request must never reach client.Do
+		return 0
+	}
+	resp, err := client.Do(req)
+	if err != nil {
+		return 0
+	}
+	// Only the status code is of interest; close the body so the
+	// connection is not leaked
+	defer resp.Body.Close()
+	return resp.StatusCode
+}
diff --git a/archive_test.go b/archive_test.go
new file mode 100644
index 0000000..7c4549e
--- /dev/null
+++ b/archive_test.go
@@ -0,0 +1,24 @@
+package main
+
+import (
+ "net/http"
+ "testing"
+)
+
+func TestIsArchived(t *testing.T) {
+
+ url := "http://example.com/"
+ archived, status := isArchived(url)
+ if archived != true || status != 200 {
+ t.Errorf("Received %t, %d: want %t, %d", archived, status, true, 200)
+ }
+}
+
+// TestArchive200 submits http://example.com/ through archive and expects an
+// HTTP 200 status. archive performs a real HTTP request, so network access is
+// required.
+func TestArchive200(t *testing.T) {
+
+	url := "http://example.com/"
+	status := archive(url)
+	if status != http.StatusOK {
+		t.Errorf("Received %d; want %d", status, http.StatusOK)
+	}
+}
diff --git a/db.go b/db.go
new file mode 100644
index 0000000..ddac17e
--- /dev/null
+++ b/db.go
@@ -0,0 +1,78 @@
+package main
+
+import (
+ "database/sql"
+ "errors"
+ "log"
+ "os"
+
+ _ "github.com/mattn/go-sqlite3"
+)
+
+// initDB opens the SQLite database at path and returns its handle. When no
+// file exists at path it is created first and the schema is set up through
+// initTables. Any failure to create or open the database terminates the
+// program.
+func initDB(path string) *sql.DB {
+
+	// Only a definite "does not exist" triggers creation; any other Stat
+	// outcome falls through to a plain open
+	_, statErr := os.Stat(path)
+	fresh := errors.Is(statErr, os.ErrNotExist)
+	if fresh {
+		log.Printf("Creating %s...\n", path)
+		file, err := os.Create(path)
+		if err != nil {
+			log.Fatal(err)
+		}
+		file.Close()
+	}
+
+	db, err := sql.Open("sqlite3", path)
+	if err != nil {
+		log.Fatal(err)
+	}
+	if fresh {
+		initTables(db)
+	}
+	return db
+}
+
+// initTables creates the urls table and the unique index on its url column.
+// Any failure terminates the program.
+func initTables(db *sql.DB) {
+
+	q := `CREATE TABLE IF NOT EXISTS urls (
+		id integer NOT NULL PRIMARY KEY AUTOINCREMENT,
+		url VARCHAR(500) NOT NULL,
+		author_id VARCHAR(18),
+		guild_id VARCHAR(18),
+		channel_id VARCHAR(18),
+		status_code INTEGER
+	);
+	CREATE UNIQUE INDEX IF NOT EXISTS idx_urls_url ON urls(url);`
+
+	// Exec is used instead of Prepare: a prepared statement covers only the
+	// first SQL statement of the string, which would leave the unique index
+	// uncreated and make INSERT OR IGNORE unable to skip duplicate URLs.
+	if _, err := db.Exec(q); err != nil {
+		log.Fatal(err)
+	}
+}
+
+func addArchived(db *sql.DB, m *Message, status_code int) {
+
+ q := `INSERT OR IGNORE INTO urls(url, author_id, guild_id, channel_id, status_code) VALUES (?, ?, ?, ?, ?)`
+ s, err := db.Prepare(q)
+ if err != nil {
+ log.Fatal(err)
+ }
+ defer s.Close()
+ _, err = s.Exec(m.URL, m.Author, m.Guild, m.Channel, status_code)
+ if err != nil {
+ log.Fatal(err)
+ }
+}
+
+// isCached looks url up in the urls table. It returns true and the stored
+// status code when a row exists, and false with a zero status code when none
+// does. Any other database error terminates the program.
+func isCached(db *sql.DB, url string) (bool, int) {
+	var code int
+
+	row := db.QueryRow("SELECT status_code FROM urls WHERE url = ?", url)
+	if err := row.Scan(&code); err != nil {
+		if err == sql.ErrNoRows {
+			return false, code
+		}
+		log.Fatal(err)
+	}
+	return true, code
+}
diff --git a/db_test.go b/db_test.go
new file mode 100644
index 0000000..b429aaf
--- /dev/null
+++ b/db_test.go
@@ -0,0 +1,48 @@
+package main
+
+import (
+ "database/sql"
+ "io/ioutil"
+ "net/http"
+ "os"
+ "testing"
+)
+
+// db is the database handle assigned by TestInitDB and used by the tests
+// that follow it.
+var db *sql.DB
+
+// db_path is the location of the temporary database file, set by TestInitDB
+// and removed by TestDBCleanup.
+var db_path string
+
+// TestInitDB creates a database at a fresh temporary path via initDB and
+// stores the handle and path in db and db_path for the tests that follow.
+func TestInitDB(t *testing.T) {
+
+	tmpDB, err := ioutil.TempFile("", "tmp-*.db")
+	if err != nil {
+		t.Fatal(err)
+	}
+	db_path = tmpDB.Name()
+
+	// Only the unique name is needed: close the handle and delete the file
+	// so initDB takes its "file does not exist" path
+	tmpDB.Close()
+	if err := os.Remove(db_path); err != nil {
+		t.Fatal(err)
+	}
+
+	db = initDB(db_path)
+	if db == nil {
+		t.Fatal("initDB returned a nil database handle")
+	}
+}
+
+func TestAddArchived(t *testing.T) {
+
+ m := Message{
+ URL: "http://example.com/",
+ Author: "000000000000000000",
+ Guild: "000000000000000000",
+ Channel: "000000000000000000",
+ }
+ addArchived(db, &m, 200)
+}
+
+func TestIsCached(t *testing.T) {
+
+ url := "http://example.com/"
+ cached, status_code := isCached(db, url)
+ if status_code != http.StatusOK || cached != true {
+ t.Errorf("Received %t, %d; wanted %t, %d", cached, status_code, true,
+ http.StatusOK)
+ }
+}
+
+// TestDBCleanup closes the shared database handle, if one was opened, and
+// removes the temporary database file.
+func TestDBCleanup(t *testing.T) {
+
+	// Close before deleting so the file is not removed while still open
+	if db != nil {
+		if err := db.Close(); err != nil {
+			t.Error(err)
+		}
+	}
+
+	// A file that is already gone is not treated as a failure
+	if err := os.Remove(db_path); err != nil && !os.IsNotExist(err) {
+		t.Error(err)
+	}
+}
diff --git a/go.mod b/go.mod
new file mode 100644
index 0000000..e6473ed
--- /dev/null
+++ b/go.mod
@@ -0,0 +1,9 @@
+module keep
+
+go 1.16
+
+require (
+	github.com/bwmarrin/discordgo v0.23.2
+	github.com/mattn/go-sqlite3 v1.14.9
+	golang.org/x/net v0.0.0-20211201190559-0a0e4e1bb54c
+)
diff --git a/go.sum b/go.sum
new file mode 100644
index 0000000..537376f
--- /dev/null
+++ b/go.sum
@@ -0,0 +1,15 @@
+github.com/bwmarrin/discordgo v0.23.2 h1:BzrtTktixGHIu9Tt7dEE6diysEF9HWnXeHuoJEt2fH4=
+github.com/bwmarrin/discordgo v0.23.2/go.mod h1:c1WtWUGN6nREDmzIpyTp/iD3VYt4Fpx+bVyfBG7JE+M=
+github.com/gorilla/websocket v1.4.0 h1:WDFjx/TMzVgy9VdMMQi2K2Emtwi2QcUQsztZ/zLaH/Q=
+github.com/gorilla/websocket v1.4.0/go.mod h1:E7qHFY5m1UJ88s3WnNqhKjPHQ0heANvMoAMk2YaljkQ=
+github.com/mattn/go-sqlite3 v1.14.9 h1:10HX2Td0ocZpYEjhilsuo6WWtUqttj2Kb0KtD86/KYA=
+github.com/mattn/go-sqlite3 v1.14.9/go.mod h1:NyWgC/yNuGj7Q9rpYnZvas74GogHl5/Z4A/KQRfk6bU=
+golang.org/x/crypto v0.0.0-20181030102418-4d3f4d9ffa16 h1:y6ce7gCWtnH+m3dCjzQ1PCuwl28DDIc3VNnvY29DlIA=
+golang.org/x/crypto v0.0.0-20181030102418-4d3f4d9ffa16/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4=
+golang.org/x/net v0.0.0-20211201190559-0a0e4e1bb54c h1:WtYZ93XtWSO5KlOMgPZu7hXY9WhMZpprvlm5VwvAl8c=
+golang.org/x/net v0.0.0-20211201190559-0a0e4e1bb54c/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y=
+golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
+golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
+golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
+golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
diff --git a/keep.go b/keep.go
new file mode 100644
index 0000000..3ef7248
--- /dev/null
+++ b/keep.go
@@ -0,0 +1,187 @@
+package main
+
+import (
+ "database/sql"
+ "encoding/json"
+ "flag"
+ "fmt"
+ "io/ioutil"
+ "log"
+ "net/http"
+ "net/url"
+ "os"
+ "os/signal"
+ "os/user"
+ "path"
+ "strings"
+ "syscall"
+ "time"
+
+ "github.com/bwmarrin/discordgo"
+ "golang.org/x/net/publicsuffix"
+)
+
+type (
+	// Config mirrors the JSON configuration file: the token handed to
+	// discordgo and a flag that turns on logging of every message.
+	Config struct {
+		Token   string `json:"token"`
+		Verbose bool   `json:"verbose"`
+	}
+
+	// Message carries one URL found in a Discord message together with the
+	// IDs of the author, guild and channel it was seen in.
+	Message struct {
+		URL     string
+		Author  string
+		Guild   string
+		Channel string
+	}
+)
+
+// messageChan hands URLs found by messageCreate to the archiver goroutine;
+// it is created in main.
+var messageChan chan *Message
+
+// config holds the settings decoded from the configuration file in main.
+var config Config
+
+// main loads the configuration, opens the URL cache database, starts the
+// archiver goroutine and then listens for Discord message events until an
+// interrupt or termination signal arrives.
+func main() {
+
+	// ~/.keep holds the default configuration file and the cache database
+	usr, err := user.Current()
+	if err != nil {
+		log.Fatal(err)
+	}
+	keepDir := path.Join(usr.HomeDir, ".keep")
+
+	// Default config location: ~/.keep/keep.json
+	var configPath string
+	flag.StringVar(&configPath, "config", path.Join(keepDir, "keep.json"),
+		"path to configuration file")
+	flag.Parse()
+	conf, err := ioutil.ReadFile(configPath)
+	if err != nil {
+		log.Fatal(err)
+	}
+	if err := json.Unmarshal(conf, &config); err != nil {
+		log.Fatal(err)
+	}
+
+	// Create and initialize URL cache database
+	db := initDB(path.Join(keepDir, "keep.db"))
+
+	// Channel for passing URLs to the archive goroutine for archival
+	messageChan = make(chan *Message, 25)
+	go archiver(db)
+
+	// Create a new Discord session using the configured token
+	dg, err := discordgo.New(config.Token)
+	if err != nil {
+		// Report on stderr and exit non-zero so supervisors see the failure
+		fmt.Fprintln(os.Stderr, "error creating Discord session,", err)
+		os.Exit(1)
+	}
+
+	// Make our client look like Firefox.
+	// NOTE(review): the original rationale was self-bot authentication with
+	// user credentials; only a token is passed above -- confirm intent.
+	dg.UserAgent = "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:94.0) Gecko/20100101 Firefox/94.0 "
+
+	// Register the messageCreate func as a callback for MessageCreate events
+	dg.AddHandler(messageCreate)
+
+	// We only care about receiving message events
+	dg.Identify.Intents = discordgo.IntentsGuildMessages
+
+	// Open a websocket connection to Discord and begin listening
+	if err := dg.Open(); err != nil {
+		fmt.Fprintln(os.Stderr, "error opening connection,", err)
+		os.Exit(1)
+	}
+
+	// Wait here until CTRL-C or a termination signal is received. SIGKILL
+	// cannot be trapped, so it is not registered.
+	sc := make(chan os.Signal, 1)
+	signal.Notify(sc, os.Interrupt, syscall.SIGTERM)
+	<-sc
+
+	// Cleanly close down the Discord session
+	dg.Close()
+}
+
+// archiver is meant to run in its own goroutine. It takes messages off
+// messageChan one at a time and makes sure each URL is recorded in the cache
+// database, submitting it to the Wayback Machine only when neither the cache
+// nor the availability API already reports a copy with status OK.
+func archiver(db *sql.DB) {
+	for {
+		// Blocks until the next message arrives
+		msg := <-messageChan
+
+		// Already cached with status OK: nothing to do
+		if hit, code := isCached(db, msg.URL); hit && code == http.StatusOK {
+			log.Printf("%d %s", code, msg.URL)
+			continue
+		}
+
+		// The Internet Archive already has a copy: just record it
+		if found, code := isArchived(msg.URL); found && code == http.StatusOK {
+			addArchived(db, msg, code)
+			log.Printf("%d %s", code, msg.URL)
+			continue
+		}
+
+		// Present in neither cache nor IA: request a capture
+		code := archive(msg.URL)
+		addArchived(db, msg, code)
+		log.Printf("%d %s", code, msg.URL)
+
+		// Pause for 5 seconds after every save request
+		time.Sleep(5 * time.Second)
+	}
+}
+
+// messageCreate be called (due to AddHandler above) every time a new message is
+// created on any channel that the authenticated bot has access to
+func messageCreate(s *discordgo.Session, m *discordgo.MessageCreate) {
+
+ // https://github.com/bwmarrin/discordgo/issues/961
+ if m.Content == "" {
+ chanMsgs, err := s.ChannelMessages(m.ChannelID, 1, "", "", m.ID)
+ if err != nil {
+ log.Printf("Unable to get messages: %s", err)
+ return
+ }
+ m.Content = chanMsgs[0].Content
+ m.Attachments = chanMsgs[0].Attachments
+ }
+
+ // Log all messages if verbose set to true
+ if config.Verbose {
+ log.Println(m.Content)
+ }
+
+ // Split message by spaces into individual fields
+ for _, w := range strings.Fields(m.Content) {
+
+ // Assess whether message part looks like a valid URL
+ u, err := url.Parse(w)
+ if err != nil || !u.IsAbs() || strings.IndexByte(u.Host, '.') <= 0 {
+ continue
+ }
+
+ // Ensure domain TLD is ICANN-managed
+ if _, icann := publicsuffix.PublicSuffix(u.Host); !icann {
+ continue
+ }
+
+ // Ensure host is not present in blacklisted set
+ if isBlacklisted(u.Host) {
+ continue
+ }
+
+ // Send message attributes/URL over the channel
+ message := Message{
+ URL: w,
+ Author: m.Author.ID,
+ Guild: m.GuildID,
+ Channel: m.ChannelID,
+ }
+ messageChan <- &message
+ }
+}
diff --git a/keep.json b/keep.json
new file mode 100644
index 0000000..4882042
--- /dev/null
+++ b/keep.json
@@ -0,0 +1,4 @@
+{
+ "token":"YXiHglqrSrEXRSIX83PhbPxskICaEOFTiUo757i57o1ffk67Zgb2qORhLq1",
+ "verbose":false
+}