about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Jordan <me@jordan.im> 2022-11-24 00:55:37 -0700
committer: Jordan <me@jordan.im> 2022-11-24 00:55:37 -0700
commit: ca276b66837ac04bf92de257e5e65d2992f1a547 (patch)
tree: fb11dfbe6bd885c8503ba5eedca02dbaf0a13313
parent: 12c3c1fcc2e72a74934a59fcecba51ef47a6667a (diff)
download: keep-ca276b66837ac04bf92de257e5e65d2992f1a547.tar.gz
          keep-ca276b66837ac04bf92de257e5e65d2992f1a547.zip
archive, keep: send authenticated POST requests to /save/ IA endpoint
-rw-r--r--  archive.go       36
-rw-r--r--  archive_test.go   9
-rw-r--r--  keep.go          44
-rw-r--r--  keep.json         3
4 files changed, 55 insertions, 37 deletions
diff --git a/archive.go b/archive.go
index 124c42e..9397797 100644
--- a/archive.go
+++ b/archive.go
@@ -2,22 +2,30 @@ package main
import (
"encoding/json"
+ "fmt"
"log"
"net"
"net/http"
+ "net/url"
"regexp"
"strconv"
+ "strings"
"time"
)
var (
API_AVAILABILITY string = "https://archive.org/wayback/available?url="
- API_SAVE string = "https://web.archive.org/save/"
+ API_SAVE string = "https://web.archive.org/save"
TIMEOUT time.Duration = 10
client *http.Client = &http.Client{Timeout: TIMEOUT * time.Second}
)
+type Save struct {
+ URL string `json:"url"`
+ Job string `json:"job_id"`
+}
+
type Wayback struct {
Snapshots Snapshot `json:"archived_snapshots,omitempty"`
}
@@ -60,15 +68,31 @@ func isArchived(url string) (bool, int) {
return av.Snapshots.Recent.Available, status
}
-func archive(url string) int {
+func archive(accessKey string, secretKey string, URL string) (int, string) {
+
+ params := url.Values{}
+ params.Set("url", URL)
+ req, _ := http.NewRequest(http.MethodPost, API_SAVE,
+ strings.NewReader(params.Encode()),
+ )
+ req.Header.Set("Accept", "application/json")
+ auth := fmt.Sprintf("LOW %s:%s", accessKey, secretKey)
+ req.Header.Set("Authorization", auth)
+ req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
- req, err := http.NewRequest("GET", API_SAVE+url, nil)
resp, err := client.Do(req)
if err != nil {
if e, _ := err.(net.Error); !e.Timeout() {
- log.Println("archive:", err)
+ log.Println("SAVE", err)
}
- return 0
+ return 0, ""
+ }
+ defer resp.Body.Close()
+ save := &Save{}
+ decoder := json.NewDecoder(resp.Body)
+ if err := decoder.Decode(save); err != nil {
+ log.Println("SAVE", err)
+ return 0, ""
}
- return resp.StatusCode
+ return resp.StatusCode, save.Job
}
diff --git a/archive_test.go b/archive_test.go
index 7e7d4ff..842b3b3 100644
--- a/archive_test.go
+++ b/archive_test.go
@@ -42,12 +42,3 @@ func TestIsNotIgnored(t *testing.T) {
t.Errorf("Received %t; want %t", ignored, false)
}
}
-
-func TestArchive200(t *testing.T) {
-
- url := "http://example.com/"
- status := archive(url)
- if status != http.StatusOK {
- t.Errorf("Recieved %d; want %d", status, http.StatusOK)
- }
-}
diff --git a/keep.go b/keep.go
index 5751bad..4432217 100644
--- a/keep.go
+++ b/keep.go
@@ -17,17 +17,19 @@ import (
"syscall"
"time"
+ "github.com/PuerkitoBio/purell"
"github.com/bwmarrin/discordgo"
"golang.org/x/net/publicsuffix"
- "github.com/PuerkitoBio/purell"
)
type Config struct {
- Token string `json:"token"`
- Verbose bool `json:"verbose"`
- Ignore []string `json:"ignore"`
- Host string `json:"host"`
- Port string `json:"port"`
+ AccessKey string `json:"access-key"`
+ SecretKey string `json:"secret-key"`
+ Token string `json:"token"`
+ Verbose bool `json:"verbose"`
+ Ignore []string `json:"ignore"`
+ Host string `json:"host"`
+ Port string `json:"port"`
}
type Message struct {
@@ -75,7 +77,7 @@ func main() {
// Channel for passing URLs to the archive goroutine for archival
messageChan = make(chan *Message, 25)
- go archiver(db)
+ go archiver(config.AccessKey, config.SecretKey, db)
// Start HTTP server
http.HandleFunc("/", db.IndexHandler)
@@ -118,7 +120,7 @@ func main() {
// archiver is intended to be run in its own goroutine, receiving URLs from main
// over a shared channel for processing
-func archiver(db *SqliteDB) {
+func archiver(accessKey string, secretKey string, db *SqliteDB) {
// Each iteration removes and processes one url from the channel
for {
@@ -127,24 +129,24 @@ func archiver(db *SqliteDB) {
message := <-messageChan
// Skip if we've already seen URL (cached)
- cached, status_code := db.IsCached(message.URL)
+ cached, statusCode := db.IsCached(message.URL)
if cached {
- log.Println("SEEN", status_code, message.URL)
+ log.Println("SEEN", statusCode, message.URL)
continue
}
// Skip if the Internet Archive already has a copy available
- archived, status_code := isArchived(message.URL)
- if archived && status_code == http.StatusOK {
- db.AddArchived(message, status_code)
- log.Println("SKIP", status_code, message.URL)
+ archived, statusCode := isArchived(message.URL)
+ if archived && statusCode == http.StatusOK {
+ db.AddArchived(message, statusCode)
+ log.Println("SKIP", statusCode, message.URL)
continue
}
// Archive, URL is not present in cache or IA
- status_code = archive(message.URL)
- db.AddArchived(message, status_code)
- log.Println("SAVE", status_code, message.URL)
+ statusCode, jobID := archive(accessKey, secretKey, message.URL)
+ db.AddArchived(message, statusCode)
+ log.Println("SAVE", statusCode, message.URL, jobID)
// Limit requests to Wayback API to 15-second intervals
time.Sleep(15 * time.Second)
@@ -190,10 +192,10 @@ func messageCreate(s *discordgo.Session, m *discordgo.MessageCreate) {
// Normalize URL (RFC 3986)
uStr := purell.NormalizeURL(u,
purell.FlagsSafe|
- purell.FlagRemoveDotSegments|
- purell.FlagRemoveDuplicateSlashes|
- purell.FlagRemoveFragment|
- purell.FlagSortQuery)
+ purell.FlagRemoveDotSegments|
+ purell.FlagRemoveDuplicateSlashes|
+ purell.FlagRemoveFragment|
+ purell.FlagSortQuery)
// Ensure host is not present in ignoreList set
if isIgnored(config.Ignore, uStr) {
diff --git a/keep.json b/keep.json
index 3f16f14..a37761c 100644
--- a/keep.json
+++ b/keep.json
@@ -2,6 +2,8 @@
"host":"127.0.0.1",
"port":"9099",
"token":"YXiHglqrSrEXRSIX83PhbPxskICaEOFTiUo757i57o1ffk67Zgb2qORhLq1",
+ "access-key": "auHAtnCpfxKVBhli",
+ "secret-key": "GDJshp7K8VRjcuuK",
"verbose":false,
"ignore": [
"^https?://([^/]*\\.)?discordapp\\.[^/]+/",
@@ -38,7 +40,6 @@
"^https?://([^/]*\\.)?duckduckgo\\.com/",
"^https?://([^/]*\\.)?twimg\\.com/",
"^https?://([^/]*\\.)?strawpoll\\.com/",
- "^https?://([^/]*\\.)?strawpoll\\.com/",
"^https?://([^/]*\\.)?4chan\\.org/",
"^https?://([^/]*\\.)?4plebs\\.org/",
"^https?://([^/]*\\.)?google\\.com/search",