From a5efd75f7954a64d2b8e1f3d4ecf6434774f6af3 Mon Sep 17 00:00:00 2001 From: Jordan Date: Thu, 24 Dec 2020 10:34:56 -0700 Subject: use curl for PDF downloads --- README | 5 +++-- pouch.sh | 26 +++++++++++++++++++------- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/README b/README index 39dac0f..059684b 100644 --- a/README +++ b/README @@ -1,7 +1,8 @@ Usage: pouch.sh HTML_FILE_PATH - HTML_FILE is a pocket-exported document containing the set of URLs to save + HTML_FILE is a pocket-exported document containing the set of URLs to save Dependencies: - google-chrome 59+ (headless mode support) + google-chrome 59+ (headless mode support) + curl diff --git a/pouch.sh b/pouch.sh index 500a3a5..341e846 100755 --- a/pouch.sh +++ b/pouch.sh @@ -1,17 +1,19 @@ #!/usr/bin/env bash PROGRAM="${0##*/}" +AGENT="Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:84.0) Gecko/20100101 Firefox/84.0" cmd_usage() { cat >&2 <<-_EOF - Usage: $PROGRAM HTML_FILE_PATH +Usage: $PROGRAM HTML_FILE_PATH - HTML_FILE is a pocket-exported document containing the set of URLs to save + HTML_FILE is a pocket-exported document containing the set of URLs to save - Dependencies: +Dependencies: - google-chrome 59+ (headless mode support) - _EOF + google-chrome 59+ (headless mode support) + curl +_EOF } get_urls() { @@ -27,12 +29,22 @@ url_to_filename() { save() { read url + + if [[ -f "$(url_to_filename $url).pdf" ]]; then + return 0 + fi + + content_type=$(curl -s -I -A "$AGENT" "$url" | grep --ignore-case '^content-type:' | cut -d' ' -f2) pdfname=$(url_to_filename $url).pdf - if [[ ! -f ./$pdfname ]]; then + if [[ $content_type == *"application/pdf"* ]]; then echo [+] $url... - google-chrome --headless --disable-gpu --print-to-pdf=$pdfname $url + curl -A "$AGENT" "$url" -o "$pdfname" + else + echo [+] $url... + google-chrome --headless --disable-gpu --print-to-pdf="$pdfname" "$url" fi + } if [[ $# -eq 1 && ( $1 == --help || $1 == -h || $1 == help ) ]]; then -- cgit v1.2.3-54-g00ecf