pouch.sh


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59

#!/usr/bin/env bash
#
# Saves each URL listed in a Pocket-exported HTML file as a PDF: URLs that
# report an application/pdf content type are downloaded with curl, everything
# else is printed to PDF by headless google-chrome.

# Name this script was invoked as, with any leading directory stripped;
# interpolated into the usage message.
PROGRAM="${0##*/}"
# User-Agent string handed to curl via -A for both the header probe and the
# PDF download.
AGENT="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0"

# Print the help text (synopsis plus required external tools) on stderr.
# Globals: PROGRAM (read) - script name shown in the synopsis line.
cmd_usage() {
    local -a help_lines=(
        "Usage: $PROGRAM HTML_FILE_PATH"
        ''
        '  HTML_FILE is a pocket-exported document containing the set of URLs to save'
        ''
        'Dependencies:'
        ''
        '  google-chrome 59+ (headless mode support)'
        '  curl'
    )

    # One array element per output line; everything goes to stderr.
    printf '%s\n' "${help_lines[@]}" >&2
}

# Print the href target found on each line of an HTML file, one per line.
#
# Arguments: $1 - path of the HTML file to scan
# Outputs:   for every input line containing href="...", the text between the
#            quotes; when a line holds several href attributes only the last
#            one is printed (the leading .* is greedy)
get_urls() {
    # "$1" is quoted so a path with spaces or glob characters reaches sed as a
    # single operand. The script is passed with -e so that "--" sits in option
    # position and a path starting with "-" is not parsed as an option.
    sed -n -e 's/.*href="\([^"]*\).*/\1/p' -- "$1"
}

# Turn a URL into a filesystem-friendly name made only of alphanumerics and "-".
#
# Arguments: $1 - the URL
# Outputs:   the derived name on stdout, without a trailing newline
url_to_filename() {
    local rest=$1
    local scheme_re='^[[:alpha:]][[:alnum:]+.-]*://'

    # Drop only a leading "scheme://" (or a protocol-relative "//"). Stripping
    # through the LAST "//" would reduce URLs that embed another URL (archive or
    # redirect links) to the embedded one, making distinct URLs share a name.
    if [[ $rest =~ $scheme_re ]]; then
        rest=${rest#*://}
    else
        rest=${rest#//}
    fi

    printf '%s\n' "$rest" |
        sed 's/\/$//' |        # remove trailing slash
        tr '/.' '-' |          # replace / and . characters with -
        tr -cd '[:alnum:]-'    # keep alphanumerics and "-" only; tr sets take no
                               # enclosing [ ], which would be kept as literals
}

# Save a single URL as "<derived-name>.pdf".
#
# Globals:   AGENT (read) - User-Agent string passed to curl
# Arguments: $1 - URL to save (optional; when no argument is given, one line
#                 is read from stdin instead)
# Outputs:   a "[+] URL..." progress line on stdout; the PDF file itself
# Returns:   0 when the PDF already exists, 1 when no URL was supplied,
#            otherwise the exit status of the curl / google-chrome command
save() {
    local url content_type pdfname

    # Prefer the argument: reading stdin unconditionally would swallow the next
    # line of a caller that is itself iterating over stdin.
    if (( $# > 0 )); then
        url=$1
    else
        # A last line without a newline makes read fail yet still fills url,
        # so the emptiness check below decides, not read's exit status.
        read -r url || true
    fi

    if [[ -z $url ]]; then
        echo "save: no URL given" >&2
        return 1
    fi

    pdfname="$(url_to_filename "$url").pdf"

    # A PDF with this name already exists; nothing to do.
    if [[ -f $pdfname ]]; then
        return 0
    fi

    # Quoted so "[+]" is never treated as a glob and the URL is not word-split.
    printf '%s\n' "[+] ${url}..."

    # Probe the headers only (-I) to learn the content type. The network
    # commands get </dev/null so they cannot consume input meant for a caller.
    content_type=$(curl -s -I -A "$AGENT" "$url" </dev/null |
        grep --ignore-case '^content-type:' |
        cut -d' ' -f2)

    if [[ $content_type == *"application/pdf"* ]]; then
        # Already a PDF: download it as-is.
        curl -A "$AGENT" "$url" -o "$pdfname" </dev/null
    else
        # Anything else: let headless Chrome render and print it.
        google-chrome --headless --disable-gpu --print-to-pdf="$pdfname" "$url" </dev/null
    fi
}

# Entry point: print help on request, process the export file when exactly one
# argument is given, and treat any other invocation as a usage error.
if [[ $# -eq 1 && ( $1 == --help || $1 == -h || $1 == help ) ]]; then
    cmd_usage
elif [[ $# -eq 1 ]]; then
    # The path is quoted so spaces survive; read -r keeps backslashes literal.
    get_urls "$1" | while read -r line; do
        # An empty href yields an empty line; there is nothing to save.
        [[ -n $line ]] || continue
        # Hand save its URL as an argument AND on a private stdin: whichever
        # way it obtains the URL, it can never pull the next URL from the pipe
        # this loop is reading.
        save "$line" <<<"$line"
    done
else
    cmd_usage
    exit 1
fi

exit 0