From ca276b66837ac04bf92de257e5e65d2992f1a547 Mon Sep 17 00:00:00 2001 From: Jordan Date: Thu, 24 Nov 2022 00:55:37 -0700 Subject: archive, keep: send authenticated POST requests to /save/ IA endpoint --- archive.go | 36 ++++++++++++++++++++++++++++++------ archive_test.go | 9 --------- keep.go | 44 +++++++++++++++++++++++--------------------- keep.json | 3 ++- 4 files changed, 55 insertions(+), 37 deletions(-) diff --git a/archive.go b/archive.go index 124c42e..9397797 100644 --- a/archive.go +++ b/archive.go @@ -2,22 +2,30 @@ package main import ( "encoding/json" + "fmt" "log" "net" "net/http" + "net/url" "regexp" "strconv" + "strings" "time" ) var ( API_AVAILABILITY string = "https://archive.org/wayback/available?url=" - API_SAVE string = "https://web.archive.org/save/" + API_SAVE string = "https://web.archive.org/save" TIMEOUT time.Duration = 10 client *http.Client = &http.Client{Timeout: TIMEOUT * time.Second} ) +type Save struct { + URL string `json:"url"` + Job string `json:"job_id"` +} + type Wayback struct { Snapshots Snapshot `json:"archived_snapshots,omitempty"` } @@ -60,15 +68,31 @@ func isArchived(url string) (bool, int) { return av.Snapshots.Recent.Available, status } -func archive(url string) int { +func archive(accessKey string, secretKey string, URL string) (int, string) { + + params := url.Values{} + params.Set("url", URL) + req, _ := http.NewRequest(http.MethodPost, API_SAVE, + strings.NewReader(params.Encode()), + ) + req.Header.Set("Accept", "application/json") + auth := fmt.Sprintf("LOW %s:%s", accessKey, secretKey) + req.Header.Set("Authorization", auth) + req.Header.Set("Content-Type", "application/x-www-form-urlencoded") - req, err := http.NewRequest("GET", API_SAVE+url, nil) resp, err := client.Do(req) if err != nil { if e, _ := err.(net.Error); !e.Timeout() { - log.Println("archive:", err) + log.Println("SAVE", err) } - return 0 + return 0, "" + } + defer resp.Body.Close() + save := &Save{} + decoder := json.NewDecoder(resp.Body) + if err := decoder.Decode(save); err != nil { + log.Println("SAVE", err) + return 0, "" } - return resp.StatusCode + return resp.StatusCode, save.Job } diff --git a/archive_test.go b/archive_test.go index 7e7d4ff..842b3b3 100644 --- a/archive_test.go +++ b/archive_test.go @@ -42,12 +42,3 @@ func TestIsNotIgnored(t *testing.T) { t.Errorf("Received %t; want %t", ignored, false) } } - -func TestArchive200(t *testing.T) { - - url := "http://example.com/" - status := archive(url) - if status != http.StatusOK { - t.Errorf("Recieved %d; want %d", status, http.StatusOK) - } -} diff --git a/keep.go b/keep.go index 5751bad..4432217 100644 --- a/keep.go +++ b/keep.go @@ -17,17 +17,19 @@ import ( "syscall" "time" + "github.com/PuerkitoBio/purell" "github.com/bwmarrin/discordgo" "golang.org/x/net/publicsuffix" - "github.com/PuerkitoBio/purell" ) type Config struct { - Token string `json:"token"` - Verbose bool `json:"verbose"` - Ignore []string `json:"ignore"` - Host string `json:"host"` - Port string `json:"port"` + AccessKey string `json:"access-key"` + SecretKey string `json:"secret-key"` + Token string `json:"token"` + Verbose bool `json:"verbose"` + Ignore []string `json:"ignore"` + Host string `json:"host"` + Port string `json:"port"` } type Message struct { @@ -75,7 +77,7 @@ func main() { // Channel for passing URLs to the archive goroutine for archival messageChan = make(chan *Message, 25) - go archiver(db) + go archiver(config.AccessKey, config.SecretKey, db) // Start HTTP server http.HandleFunc("/", db.IndexHandler) @@ -118,7 +120,7 @@ func main() { // archiver is intended to be run in its own goroutine, receiving URLs from main // over a shared channel for processing -func archiver(db *SqliteDB) { +func archiver(accessKey string, secretKey string, db *SqliteDB) { // Each iteration removes and processes one url from the channel for { @@ -127,24 +129,24 @@ func archiver(db *SqliteDB) { message := <-messageChan // Skip if we've already seen URL (cached) - cached, status_code := db.IsCached(message.URL) + cached, statusCode := db.IsCached(message.URL) if cached { - log.Println("SEEN", status_code, message.URL) + log.Println("SEEN", statusCode, message.URL) continue } // Skip if the Internet Archive already has a copy available - archived, status_code := isArchived(message.URL) - if archived && status_code == http.StatusOK { - db.AddArchived(message, status_code) - log.Println("SKIP", status_code, message.URL) + archived, statusCode := isArchived(message.URL) + if archived && statusCode == http.StatusOK { + db.AddArchived(message, statusCode) + log.Println("SKIP", statusCode, message.URL) continue } // Archive, URL is not present in cache or IA - status_code = archive(message.URL) - db.AddArchived(message, status_code) - log.Println("SAVE", status_code, message.URL) + statusCode, jobID := archive(accessKey, secretKey, message.URL) + db.AddArchived(message, statusCode) + log.Println("SAVE", statusCode, message.URL, jobID) // Limit requests to Wayback API to 15-second intervals time.Sleep(15 * time.Second) @@ -190,10 +192,10 @@ func messageCreate(s *discordgo.Session, m *discordgo.MessageCreate) { // Normalize URL (RFC 3986) uStr := purell.NormalizeURL(u, purell.FlagsSafe| - purell.FlagRemoveDotSegments| - purell.FlagRemoveDuplicateSlashes| - purell.FlagRemoveFragment| - purell.FlagSortQuery) + purell.FlagRemoveDotSegments| + purell.FlagRemoveDuplicateSlashes| + purell.FlagRemoveFragment| + purell.FlagSortQuery) // Ensure host is not present in ignoreList set if isIgnored(config.Ignore, uStr) { diff --git a/keep.json b/keep.json index 3f16f14..a37761c 100644 --- a/keep.json +++ b/keep.json @@ -2,6 +2,8 @@ "host":"127.0.0.1", "port":"9099", "token":"YXiHglqrSrEXRSIX83PhbPxskICaEOFTiUo757i57o1ffk67Zgb2qORhLq1", + "access-key": "auHAtnCpfxKVBhli", + "secret-key": "GDJshp7K8VRjcuuK", "verbose":false, "ignore": [ "^https?://([^/]*\\.)?discordapp\\.[^/]+/", @@ -38,7 +40,6 @@ "^https?://([^/]*\\.)?duckduckgo\\.com/", "^https?://([^/]*\\.)?twimg\\.com/", "^https?://([^/]*\\.)?strawpoll\\.com/", - "^https?://([^/]*\\.)?strawpoll\\.com/", "^https?://([^/]*\\.)?4chan\\.org/", "^https?://([^/]*\\.)?4plebs\\.org/", "^https?://([^/]*\\.)?google\\.com/search", -- cgit v1.2.3-54-g00ecf