From 93685ba170c63c6509314a72d83e3d2096ed5ab7 Mon Sep 17 00:00:00 2001 From: Jordan Date: Tue, 9 Mar 2021 11:03:01 -0700 Subject: support arxiv, parsing improvements --- crane.go | 410 ++++++++++++---------------------------------- http.go | 230 ++++++++++++++++++++++++++ templates/admin-edit.html | 14 +- templates/list.html | 14 +- util.go | 120 ++++++++++---- 5 files changed, 450 insertions(+), 338 deletions(-) create mode 100644 http.go diff --git a/crane.go b/crane.go index 0d7996a..32e9a5f 100644 --- a/crane.go +++ b/crane.go @@ -7,7 +7,6 @@ import ( "errors" "flag" "fmt" - "html/template" "io/ioutil" "log" "mime" @@ -53,6 +52,7 @@ type Meta struct { FirstPage string `xml:"doi_record>crossref>journal>journal_article>pages>first_page"` LastPage string `xml:"doi_record>crossref>journal>journal_article>pages>last_page"` DOI string `xml:"doi_record>crossref>journal>journal_article>doi_data>doi"` + ArxivID string `xml:"doi_record>crossref>journal>journal_article>arxiv_data>arxiv_id"` Resource string `xml:"doi_record>crossref>journal>journal_article>doi_data>resource"` } @@ -76,8 +76,8 @@ type Resp struct { } // getPaperFileNameFromMeta returns the built filename (absent an extension) -// from doi.org metadata, consisting of the lowercase last name of the first -// author followed by the year of publication (e.g. doe2020) +// from metadata, consisting of the lowercase last name of the first author +// followed by the year of publication (e.g. doe2020) func getPaperFileNameFromMeta(p *Meta) string { var mainAuthor string for _, contributor := range p.Contributors { @@ -111,23 +111,26 @@ func getPaperFileNameFromResp(resp *http.Response) string { filename = strings.TrimSuffix(filepath.Base(u.Path), "/") } filename = strings.TrimSuffix(filename, ".pdf") + filename = strings.Replace(filename, "..", "", -1) + filename = strings.Replace(filename, "/", "", -1) return filename } // getUniqueName ensures a paper name is unique, appending "-$ext" until // a unique name is found and returned func (papers *Papers) getUniqueName(category string, name string) string { + newName := name ext := 2 for { - key := filepath.Join(category, name+".pdf") + key := filepath.Join(category, newName+".pdf") if _, exists := papers.List[category][key]; exists != true { break } else { - name = fmt.Sprint(name, "-", ext) + newName = fmt.Sprint(name, "-", ext) ext++ } } - return name + return newName } // findPapersWalk is a WalkFunc passed to filepath.Walk() to process papers @@ -205,69 +208,32 @@ func (papers *Papers) PopulatePapers() error { return nil } -// NewPaperFromDirectLink contains routines used to retrieve papers from remote -// endpoints provided a direct link's http.Response -func (papers *Papers) NewPaperFromDirectLink(resp *http.Response, - category string) (*Paper, error) { - tmpPDF, err := ioutil.TempFile("", "tmp-*.pdf") - if err != nil { - return &Paper{}, err - } - err = saveRespBody(resp, tmpPDF.Name()) - if err != nil { - return &Paper{}, err - } - if err := tmpPDF.Close(); err != nil { - return &Paper{}, err - } - defer os.Remove(tmpPDF.Name()) - - var paper Paper - paper.PaperName = papers.getUniqueName(category, - getPaperFileNameFromResp(resp)) - - if err != nil { - return &Paper{}, err - } - paper.PaperPath = filepath.Join(papers.Path, - filepath.Join(category, paper.PaperName+".pdf")) - - if err := renameFile(tmpPDF.Name(), paper.PaperPath); err != nil { - return nil, err - } - papers.List[category][filepath.Join(category, - paper.PaperName+".pdf")] = &paper - return &paper, nil -} - // NewPaperFromDOI contains routines used to retrieve papers from remote // endpoints provided a DOI func (papers *Papers) NewPaperFromDOI(doi []byte, category string) (*Paper, error) { - tmpXML, err := getMetaFromDOI(client, doi) + var paper Paper + + meta, err := getMetaFromDOI(client, doi) if err != nil { return nil, err } - defer os.Remove(tmpXML) - // open temporary XML file for parsing - f, err := os.Open(tmpXML) + // create a temporary file to store XML stream + tmpXML, err := ioutil.TempFile("", "tmp-*.meta.xml") if err != nil { return nil, err } - r := bufio.NewReader(f) - d := xml.NewDecoder(r) + defer os.Remove(tmpXML.Name()) - // populate p struct with values derived from doi.org metadata - var paper Paper - if err := d.Decode(&paper.Meta); err != nil { - return nil, err - } - if err := f.Close(); err != nil { + e := xml.NewEncoder(tmpXML) + err = e.Encode(meta) + if err != nil { return nil, err } + tmpXML.Close() - name := getPaperFileNameFromMeta(&paper.Meta) // doe2020 + name := getPaperFileNameFromMeta(meta) // doe2020 if name == "" { // last-resort condition if metadata lacking author or publication year name = strings.Replace(string(doi), "..", "", -1) @@ -280,7 +246,7 @@ func (papers *Papers) NewPaperFromDOI(doi []byte, category string) (*Paper, // if not matching, check if DOIs match (genuine duplicate) if name != uniqueName { key := filepath.Join(category, name+".pdf") - if paper.Meta.DOI == papers.List[category][key].Meta.DOI { + if meta.DOI == papers.List[category][key].Meta.DOI { return nil, fmt.Errorf("paper %q with DOI %q already downloaded", name, string(doi)) } @@ -306,7 +272,47 @@ func (papers *Papers) NewPaperFromDOI(doi []byte, category string) (*Paper, if err := renameFile(tmpPDF, paper.PaperPath); err != nil { return nil, err } - if err := renameFile(tmpXML, paper.MetaPath); err != nil { + if err := renameFile(tmpXML.Name(), paper.MetaPath); err != nil { + return nil, err + } + paper.Meta = *meta + papers.List[category][filepath.Join(category, + paper.PaperName+".pdf")] = &paper + return &paper, nil +} + +// NewPaperFromDirectLink contains routines used to retrieve papers from remote +// endpoints provided a direct link's http.Response and/or optional metadata +func (papers *Papers) NewPaperFromDirectLink(resp *http.Response, meta *Meta, + category string) (*Paper, error) { + tmpPDF, err := ioutil.TempFile("", "tmp-*.pdf") + if err != nil { + return &Paper{}, err + } + err = saveRespBody(resp, tmpPDF.Name()) + if err != nil { + return &Paper{}, err + } + if err := tmpPDF.Close(); err != nil { + return &Paper{}, err + } + defer os.Remove(tmpPDF.Name()) + + var paper Paper + paper.PaperName = papers.getUniqueName(category, + getPaperFileNameFromMeta(meta)) + if paper.PaperName == "" { + paper.PaperName = papers.getUniqueName(category, + getPaperFileNameFromResp(resp)) + } + + if err != nil { + return &Paper{}, err + } + paper.PaperPath = filepath.Join(papers.Path, + filepath.Join(category, paper.PaperName+".pdf")) + + if err := renameFile(tmpPDF.Name(), paper.PaperPath); err != nil { return nil, err } papers.List[category][filepath.Join(category, @@ -454,276 +460,74 @@ func (papers *Papers) RenameCategory(oldCategory string, // a DOI and initiate paper download func (papers *Papers) ProcessAddPaperInput(category string, input string) (*Paper, error) { - var doi []byte - - // URL processing routine; download paper directly or check page for a DOI - if u, _ := url.Parse(input); u.Scheme != "" && u.Host != "" { + if strings.HasPrefix(input, "http") { resp, err := makeRequest(client, input) if err != nil { return &Paper{}, err } if resp.Header.Get("Content-Type") == "application/pdf" { - paper, err := papers.NewPaperFromDirectLink(resp, category) + paper, err := papers.NewPaperFromDirectLink(resp, &Meta{}, category) if err != nil { return &Paper{}, err } return paper, nil } - doi = getDOIFromPage(resp) - // last resort, pass url to sci-hub and see if they know the DOI - if doi == nil { - resp, err = makeRequest(client, scihubURL+input) - if err != nil { - return &Paper{}, err - } - doi = getDOIFromPage(resp) - } - if doi == nil { - return &Paper{}, fmt.Errorf("%q: DOI not found on page", input) + meta, err := getMetaFromCitation(resp) + if err != nil { + return nil, err } - } else { - // input was not a URL, hopefully it has or contains a DOI - doi = getDOIFromBytes([]byte(input)) - if doi == nil { - return &Paper{}, fmt.Errorf("%q is not a valid DOI or URL\n", input) + if meta.Resource != "" { + resp, err := makeRequest(client, meta.Resource) + if err == nil && strings.HasPrefix(resp.Header.Get("Content-Type"), "application/pdf") { + paper, err := papers.NewPaperFromDirectLink(resp, meta, category) + if err != nil { + return nil, err + } else { + tmpXML, err := ioutil.TempFile("", "tmp-*.meta.xml") + if err != nil { + return nil, err + } + defer os.Remove(tmpXML.Name()) + + e := xml.NewEncoder(tmpXML) + err = e.Encode(meta) + if err != nil { + return nil, err + } + tmpXML.Close() + + paper.MetaPath = filepath.Join(filepath.Join(papers.Path, + category), paper.PaperName+".meta.xml") + if err := renameFile(tmpXML.Name(), paper.MetaPath); err != nil { + return nil, err + } + + paper.Meta = *meta + return paper, nil + } + } } - } - paper, err := papers.NewPaperFromDOI(doi, category) - if err != nil { - if u, _ := url.Parse(input); u.Scheme != "" && u.Host != "" { - // try to force sci-hub to cache paper if dl failed and input was - // URL, retry - makeRequest(client, scihubURL+input) - paper, err := papers.NewPaperFromDOI(doi, category) + if meta.DOI != "" { + paper, err := papers.NewPaperFromDOI([]byte(meta.DOI), category) if err != nil { - return &Paper{}, err + return nil, err } return paper, nil } else { - return &Paper{}, err - } - } - return paper, nil -} - -// IndexHandler renders the index of papers stored in papers.Path -func (papers *Papers) IndexHandler(w http.ResponseWriter, r *http.Request) { - // catch-all for paths unhandled by direct http.HandleFunc calls - if r.URL.Path != "/" { - http.Error(w, http.StatusText(http.StatusNotFound), http.StatusNotFound) - return - } - t, _ := template.ParseFiles(filepath.Join(templateDir, "layout.html"), - filepath.Join(templateDir, "index.html"), - filepath.Join(templateDir, "list.html"), - ) - res := Resp{ - Papers: papers.List, - } - t.Execute(w, &res) -} - -// AdminHandler renders the index of papers stored in papers.Path with -// additional forms to modify the collection (add, delete, rename...) -func (papers *Papers) AdminHandler(w http.ResponseWriter, r *http.Request) { - t, _ := template.ParseFiles(filepath.Join(templateDir, "admin.html"), - filepath.Join(templateDir, "layout.html"), - filepath.Join(templateDir, "list.html"), - ) - res := Resp{ - Papers: papers.List, - } - if user != "" && pass != "" { - username, password, ok := r.BasicAuth() - if ok && user == username && pass == password { - t.Execute(w, &res) - } else { - w.Header().Add("WWW-Authenticate", - `Basic realm="Please authenticate"`) - http.Error(w, http.StatusText(http.StatusUnauthorized), - http.StatusUnauthorized) + return &Paper{}, fmt.Errorf("%q: DOI could not be discovered", input) } } else { - t.Execute(w, &res) - } -} - -// EditHandler renders the index of papers stored in papers.Path, prefixing -// a checkbox to each unique paper and category for modification -func (papers *Papers) EditHandler(w http.ResponseWriter, r *http.Request) { - t, _ := template.ParseFiles(filepath.Join(templateDir, "admin-edit.html"), - filepath.Join(templateDir, "layout.html"), - filepath.Join(templateDir, "list.html"), - ) - res := Resp{ - Papers: papers.List, - } - if user != "" && pass != "" { - username, password, ok := r.BasicAuth() - if !ok || user != username || pass != password { - w.Header().Add("WWW-Authenticate", - `Basic realm="Please authenticate"`) - http.Error(w, http.StatusText(http.StatusUnauthorized), - http.StatusUnauthorized) - return - } - } - if err := r.ParseForm(); err != nil { - res.Status = err.Error() - t.Execute(w, &res) - return - } - - if action := r.FormValue("action"); action == "delete" { - for _, paper := range r.Form["paper"] { - if res.Status != "" { - break - } - if err := papers.DeletePaper(paper); err != nil { - res.Status = err.Error() - } - } - for _, category := range r.Form["category"] { - if res.Status != "" { - break - } - if err := papers.DeleteCategory(category); err != nil { - res.Status = err.Error() - } - } - if res.Status == "" { - res.Status = "delete successful" - } - } else if strings.HasPrefix(action, "move") { - destCategory := strings.SplitN(action, "move-", 2)[1] - for _, paper := range r.Form["paper"] { - if res.Status != "" { - break - } - if err := papers.MovePaper(paper, destCategory); err != nil { - res.Status = err.Error() - } - } - if res.Status == "" { - res.Status = "move successful" - } - } else { - rc := r.FormValue("rename-category") - rt := r.FormValue("rename-to") - if rc != "" && rt != "" { - // ensure filesystem safety of category names - rc = strings.Trim(strings.Replace(rc, "..", "", -1), "/.") - rt = strings.Trim(strings.Replace(rt, "..", "", -1), "/.") - - if err := papers.RenameCategory(rc, rt); err != nil { - res.Status = err.Error() - } - if res.Status == "" { - res.Status = "rename successful" - } - } - } - t.Execute(w, &res) -} - -// AddHandler provides support for new paper processing and category addition -func (papers *Papers) AddHandler(w http.ResponseWriter, r *http.Request) { - t, _ := template.ParseFiles(filepath.Join(templateDir, "admin.html"), - filepath.Join(templateDir, "layout.html"), - filepath.Join(templateDir, "list.html"), - ) - if user != "" && pass != "" { - username, password, ok := r.BasicAuth() - if !ok || user != username || pass != password { - w.Header().Add("WWW-Authenticate", - `Basic realm="Please authenticate"`) - http.Error(w, http.StatusText(http.StatusUnauthorized), - http.StatusUnauthorized) - return + doi := getDOIFromBytes([]byte(input)) + if doi == nil { + return &Paper{}, fmt.Errorf("%q is not a valid DOI or URL\n", input) } - } - p := r.FormValue("dl-paper") - c := r.FormValue("dl-category") - nc := r.FormValue("new-category") - - // sanitize input; we use the category to build the path used to save - // papers - nc = strings.Trim(strings.Replace(nc, "..", "", -1), "/.") - res := Resp{Papers: papers.List} - - // paper download, both required fields populated - if len(strings.TrimSpace(p)) > 0 && len(strings.TrimSpace(c)) > 0 { - if paper, err := papers.ProcessAddPaperInput(c, p); err != nil { - res.Status = err.Error() + if paper, err := papers.NewPaperFromDOI(doi, category); err != nil { + return nil, fmt.Errorf("%q: %v", input, err) } else { - if paper.Meta.Title != "" { - res.Status = fmt.Sprintf("%q downloaded successfully", - paper.Meta.Title) - } else { - res.Status = fmt.Sprintf("%q downloaded successfully", - paper.PaperName) - } - res.LastPaperDL = strings.TrimPrefix(paper.PaperPath, - papers.Path+"/") - } - res.LastUsedCategory = c - } else if len(strings.TrimSpace(nc)) > 0 { - // accounts for nested category addition; e.g. "foo/bar/baz" where - // "foo/bar" and/or "foo" do not already exist - n := nc - for n != "." { - _, exists := papers.List[n] - if exists == true { - res.Status = fmt.Sprintf("category %q already exists", n) - } else if err := os.MkdirAll(filepath.Join(papers.Path, n), - os.ModePerm); err != nil { - res.Status = fmt.Sprintf(err.Error()) - } else { - papers.List[n] = make(map[string]*Paper) - } - if res.Status != "" { - break - } - res.LastUsedCategory = n - n = filepath.Dir(n) - } - if res.Status == "" { - res.Status = fmt.Sprintf("category %q added successfully", nc) + return paper, nil } } - t.Execute(w, &res) -} - -// DownloadHandler serves saved papers up for download -func (papers *Papers) DownloadHandler(w http.ResponseWriter, r *http.Request) { - paper := strings.TrimPrefix(r.URL.Path, "/download/") - category := filepath.Dir(paper) - - // return 404 if the provided paper category or paper key do not exist in - // the papers set - if _, exists := papers.List[category]; exists == false { - http.Error(w, http.StatusText(http.StatusNotFound), - http.StatusNotFound) - return - } - if _, exists := papers.List[category][paper]; exists == false { - http.Error(w, http.StatusText(http.StatusNotFound), - http.StatusNotFound) - return - } - - // ensure the paper (PaperPath) actually exists on the filesystem - i, err := os.Stat(papers.List[category][paper].PaperPath) - if os.IsNotExist(err) { - http.Error(w, http.StatusText(http.StatusNotFound), - http.StatusNotFound) - } else if i.IsDir() { - http.Error(w, http.StatusText(http.StatusForbidden), - http.StatusForbidden) - } else { - http.ServeFile(w, r, papers.List[category][paper].PaperPath) - } } func main() { diff --git a/http.go b/http.go new file mode 100644 index 0000000..fcfd38f --- /dev/null +++ b/http.go @@ -0,0 +1,230 @@ +package main + +import ( + "fmt" + "net/http" + "html/template" + "os" + "path/filepath" + "strings" +) + +// IndexHandler renders the index of papers stored in papers.Path +func (papers *Papers) IndexHandler(w http.ResponseWriter, r *http.Request) { + // catch-all for paths unhandled by direct http.HandleFunc calls + if r.URL.Path != "/" { + http.Error(w, http.StatusText(http.StatusNotFound), http.StatusNotFound) + return + } + t, _ := template.ParseFiles(filepath.Join(templateDir, "layout.html"), + filepath.Join(templateDir, "index.html"), + filepath.Join(templateDir, "list.html"), + ) + res := Resp{ + Papers: papers.List, + } + t.Execute(w, &res) +} + +// AdminHandler renders the index of papers stored in papers.Path with +// additional forms to modify the collection (add, delete, rename...) +func (papers *Papers) AdminHandler(w http.ResponseWriter, r *http.Request) { + t, _ := template.ParseFiles(filepath.Join(templateDir, "admin.html"), + filepath.Join(templateDir, "layout.html"), + filepath.Join(templateDir, "list.html"), + ) + res := Resp{ + Papers: papers.List, + } + if user != "" && pass != "" { + username, password, ok := r.BasicAuth() + if ok && user == username && pass == password { + t.Execute(w, &res) + } else { + w.Header().Add("WWW-Authenticate", + `Basic realm="Please authenticate"`) + http.Error(w, http.StatusText(http.StatusUnauthorized), + http.StatusUnauthorized) + } + } else { + t.Execute(w, &res) + } +} + +// EditHandler renders the index of papers stored in papers.Path, prefixing +// a checkbox to each unique paper and category for modification +func (papers *Papers) EditHandler(w http.ResponseWriter, r *http.Request) { + t, _ := template.ParseFiles(filepath.Join(templateDir, "admin-edit.html"), + filepath.Join(templateDir, "layout.html"), + filepath.Join(templateDir, "list.html"), + ) + res := Resp{ + Papers: papers.List, + } + if user != "" && pass != "" { + username, password, ok := r.BasicAuth() + if !ok || user != username || pass != password { + w.Header().Add("WWW-Authenticate", + `Basic realm="Please authenticate"`) + http.Error(w, http.StatusText(http.StatusUnauthorized), + http.StatusUnauthorized) + return + } + } + if err := r.ParseForm(); err != nil { + res.Status = err.Error() + t.Execute(w, &res) + return + } + + if action := r.FormValue("action"); action == "delete" { + for _, paper := range r.Form["paper"] { + if res.Status != "" { + break + } + if err := papers.DeletePaper(paper); err != nil { + res.Status = err.Error() + } + } + for _, category := range r.Form["category"] { + if res.Status != "" { + break + } + if err := papers.DeleteCategory(category); err != nil { + res.Status = err.Error() + } + } + if res.Status == "" { + res.Status = "delete successful" + } + } else if strings.HasPrefix(action, "move") { + destCategory := strings.SplitN(action, "move-", 2)[1] + for _, paper := range r.Form["paper"] { + if res.Status != "" { + break + } + if err := papers.MovePaper(paper, destCategory); err != nil { + res.Status = err.Error() + } + } + if res.Status == "" { + res.Status = "move successful" + } + } else { + rc := r.FormValue("rename-category") + rt := r.FormValue("rename-to") + if rc != "" && rt != "" { + // ensure filesystem safety of category names + rc = strings.Trim(strings.Replace(rc, "..", "", -1), "/.") + rt = strings.Trim(strings.Replace(rt, "..", "", -1), "/.") + + if err := papers.RenameCategory(rc, rt); err != nil { + res.Status = err.Error() + } + if res.Status == "" { + res.Status = "rename successful" + } + } + } + t.Execute(w, &res) +} + +// AddHandler provides support for new paper processing and category addition +func (papers *Papers) AddHandler(w http.ResponseWriter, r *http.Request) { + t, _ := template.ParseFiles(filepath.Join(templateDir, "admin.html"), + filepath.Join(templateDir, "layout.html"), + filepath.Join(templateDir, "list.html"), + ) + if user != "" && pass != "" { + username, password, ok := r.BasicAuth() + if !ok || user != username || pass != password { + w.Header().Add("WWW-Authenticate", + `Basic realm="Please authenticate"`) + http.Error(w, http.StatusText(http.StatusUnauthorized), + http.StatusUnauthorized) + return + } + } + p := r.FormValue("dl-paper") + c := r.FormValue("dl-category") + nc := r.FormValue("new-category") + + // sanitize input; we use the category to build the path used to save + // papers + nc = strings.Trim(strings.Replace(nc, "..", "", -1), "/.") + res := Resp{Papers: papers.List} + + // paper download, both required fields populated + if len(strings.TrimSpace(p)) > 0 && len(strings.TrimSpace(c)) > 0 { + if paper, err := papers.ProcessAddPaperInput(c, p); err != nil { + res.Status = err.Error() + } else { + if paper.Meta.Title != "" { + res.Status = fmt.Sprintf("%q downloaded successfully", + paper.Meta.Title) + } else { + res.Status = fmt.Sprintf("%q downloaded successfully", + paper.PaperName) + } + res.LastPaperDL = strings.TrimPrefix(paper.PaperPath, + papers.Path+"/") + } + res.LastUsedCategory = c + } else if len(strings.TrimSpace(nc)) > 0 { + // accounts for nested category addition; e.g. "foo/bar/baz" where + // "foo/bar" and/or "foo" do not already exist + n := nc + for n != "." { + _, exists := papers.List[n] + if exists == true { + res.Status = fmt.Sprintf("category %q already exists", n) + } else if err := os.MkdirAll(filepath.Join(papers.Path, n), + os.ModePerm); err != nil { + res.Status = fmt.Sprintf(err.Error()) + } else { + papers.List[n] = make(map[string]*Paper) + } + if res.Status != "" { + break + } + res.LastUsedCategory = n + n = filepath.Dir(n) + } + if res.Status == "" { + res.Status = fmt.Sprintf("category %q added successfully", nc) + } + } + t.Execute(w, &res) +} + +// DownloadHandler serves saved papers up for download +func (papers *Papers) DownloadHandler(w http.ResponseWriter, r *http.Request) { + paper := strings.TrimPrefix(r.URL.Path, "/download/") + category := filepath.Dir(paper) + + // return 404 if the provided paper category or paper key do not exist in + // the papers set + if _, exists := papers.List[category]; exists == false { + http.Error(w, http.StatusText(http.StatusNotFound), + http.StatusNotFound) + return + } + if _, exists := papers.List[category][paper]; exists == false { + http.Error(w, http.StatusText(http.StatusNotFound), + http.StatusNotFound) + return + } + + // ensure the paper (PaperPath) actually exists on the filesystem + i, err := os.Stat(papers.List[category][paper].PaperPath) + if os.IsNotExist(err) { + http.Error(w, http.StatusText(http.StatusNotFound), + http.StatusNotFound) + } else if i.IsDir() { + http.Error(w, http.StatusText(http.StatusForbidden), + http.StatusForbidden) + } else { + http.ServeFile(w, r, papers.List[category][paper].PaperPath) + } +} + diff --git a/templates/admin-edit.html b/templates/admin-edit.html index 4192af0..57a9aaa 100644 --- a/templates/admin-edit.html +++ b/templates/admin-edit.html @@ -58,10 +58,20 @@
{{ end }} - {{ if $paper.Meta.Title }} + {{ $contCount := len $paper.Meta.Contributors }}{{ if ge $contCount 1 }} {{ range $index, $contributor := $paper.Meta.Contributors }}{{if $index}}, {{end}}{{ $contributor.FirstName }} {{ $contributor.LastName }}{{end}}
- {{ $paper.Meta.PubYear }} - {{ $paper.Meta.DOI }} - {{ $paper.Meta.Journal }} {{ end }} + + {{ $hasVal := false }} + {{ if $paper.Meta.PubYear }}{{ $hasVal = true }}{{ $paper.Meta.PubYear }}{{ end }} + + {{ if $paper.Meta.DOI }}{{ if $hasVal }}- {{end}}{{ $paper.Meta.DOI }} + + {{ else if $paper.Meta.ArxivID }}{{ if $hasVal }}- {{ end }}{{ $paper.Meta.ArxivID }} + {{ else if $paper.Meta.Resource }}{{ if $hasVal }}- {{ end }}{{ $paper.Meta.Resource }}{{ end }} + + {{ if $paper.Meta.Journal }}{{ if $hasVal }}- {{ end }}{{ $paper.Meta.Journal }}{{ end }} + {{ end }} {{ end }} diff --git a/templates/list.html b/templates/list.html index 8f6382d..9e139b9 100644 --- a/templates/list.html +++ b/templates/list.html @@ -14,10 +14,20 @@ {{ $paper.PaperName }}
{{ end }} - {{ if $paper.Meta.Title }} + {{ $contCount := len $paper.Meta.Contributors }}{{ if ge $contCount 1 }} {{ range $index, $contributor := $paper.Meta.Contributors }}{{if $index}}, {{end}}{{ $contributor.FirstName }} {{ $contributor.LastName }}{{end}}
- {{ $paper.Meta.PubYear }} - {{ $paper.Meta.DOI }} - {{ $paper.Meta.Journal }} {{ end }} + + {{ $hasVal := false }} + {{ if $paper.Meta.PubYear }}{{ $hasVal = true }}{{ $paper.Meta.PubYear }}{{ end }} + + {{ if $paper.Meta.DOI }}{{ if $hasVal }}- {{end}}{{ $paper.Meta.DOI }} + + {{ else if $paper.Meta.ArxivID }}{{ if $hasVal }}- {{ end }}{{ $paper.Meta.ArxivID }} + {{ else if $paper.Meta.Resource }}{{ if $hasVal }}- {{ end }}{{ $paper.Meta.Resource }}{{ end }} + + {{ if $paper.Meta.Journal }}{{ if $hasVal }}- {{ end }}{{ $paper.Meta.Journal }}{{ end }} + {{ end }} {{ end }} diff --git a/util.go b/util.go index a1b585c..cc2af29 100644 --- a/util.go +++ b/util.go @@ -2,6 +2,7 @@ package main import ( "bufio" + "encoding/xml" "fmt" "io" "io/ioutil" @@ -9,6 +10,11 @@ import ( "net/http" "os" "regexp" + "strconv" + "strings" + "time" + + "golang.org/x/net/html" ) var privateIPBlocks []*net.IPNet @@ -71,21 +77,80 @@ func makeRequest(client *http.Client, u string) (*http.Response, error) { return resp, nil } -// getDOIFromPage returns the parsed DOI from the body of the *http.Response -// provided -func getDOIFromPage(resp *http.Response) []byte { - defer resp.Body.Close() - scanner := bufio.NewScanner(resp.Body) - for scanner.Scan() { - doi := getDOIFromBytes(scanner.Bytes()) - if doi != nil { - return doi +// getMetaFromCitation parses an *http.Response for tags to populate a +// paper's Meta attributes and returns the paper +func getMetaFromCitation(resp *http.Response) (*Meta, error) { + doc, err := html.Parse(resp.Body) + if err != nil { + return nil, err + } + + var meta Meta + var f func(*html.Node) + f = func(n *html.Node) { + if n.Type == html.ElementNode && n.Data == "meta" { + var name string + var cont string + for _, a := range n.Attr { + if a.Key == "name" || a.Key == "property" { + name = a.Val + } else if a.Key == "content" { + cont = a.Val + } + } + switch name { + case "citation_title": + meta.Title = cont + case "citation_author": + var c Contributor + // Doe, Jain + if strings.Contains(cont, ",") { + v := strings.Split(cont, ", ") + c.FirstName = strings.Join(v[1:], " ") + c.LastName = v[0] + // Jain Doe + } else { + v := strings.Split(cont, " ") + c.FirstName = strings.Join(v[:len(v)-1], " ") + c.LastName = strings.Join(v[len(v)-1:], " ") + } + c.Role = "author" + if len(meta.Contributors) > 0 { + c.Sequence = "additional" + } else { + c.Sequence = "first" + } + meta.Contributors = append(meta.Contributors, c) + case "citation_date", "citation_publication_date": + var formats = []string{"2006-01-02", "2006/01/02", "2006"} + for _, format := range formats { + t, err := time.Parse(format, cont) + if err == nil { + meta.PubMonth = t.Month().String() + meta.PubYear = strconv.Itoa(t.Year()) + break + } + } + case "citation_journal_title", "og:site_name", "DC.Publisher": + meta.Journal = cont + case "citation_firstpage": + meta.FirstPage = cont + case "citation_lastpage": + meta.LastPage = cont + case "citation_doi": + meta.DOI = cont + case "citation_arxiv_id": + meta.ArxivID = cont + case "citation_pdf_url": + meta.Resource = cont + } } - if err := scanner.Err(); err != nil { - return nil + for c := n.FirstChild; c != nil; c = c.NextSibling { + f(c) } } - return nil + f(doc) + return &meta, nil } // renameFile is an alternative to os.Rename which supports moving files @@ -148,41 +213,34 @@ func copyFile(src, dst string) (err error) { } // getMetaFromDOI saves doi.org API data to TempFile and returns its path -func getMetaFromDOI(client *http.Client, doi []byte) (string, error) { +func getMetaFromDOI(client *http.Client, doi []byte) (*Meta, error) { u := "https://doi.org/" + string(doi) req, err := http.NewRequest("GET", u, nil) req.Header.Add("Accept", "application/vnd.crossref.unixref+xml;q=1,application/rdf+xml;q=0.5") resp, err := client.Do(req) if err != nil { - return "", err + return nil, err } if resp.StatusCode != http.StatusOK { - return "", fmt.Errorf("%q: status code not OK, DOI invalid?", u) + return nil, fmt.Errorf("%q: failed to get metadata", u) } if resp.Header.Get("Content-Type") != "application/vnd.crossref.unixref+xml" { - return "", fmt.Errorf("%q: content-type not application/vnd.crossref.unixref+xml", u) + return nil, fmt.Errorf("%q: content-type not application/vnd.crossref.unixref+xml", u) } if err != nil { - return "", err - } - - // create a temporary file to store XML stream - tmpXML, err := ioutil.TempFile("", "tmp-*.meta.xml") - if err != nil { - return "", err + return nil, err } + r := bufio.NewReader(resp.Body) + d := xml.NewDecoder(r) - // incrementally save XML data to the temporary file; saves memory using - // the filesystem instead of passing around buffers - if err := saveRespBody(resp, tmpXML.Name()); err != nil { - return "", err - } - if err := tmpXML.Close(); err != nil { - return "", err + // populate p struct with values derived from doi.org metadata + var meta Meta + if err := d.Decode(&meta); err != nil { + return nil, err } - return tmpXML.Name(), nil + return &meta, nil } // getPaper saves makes an outbound request to a remote resource and saves the -- cgit v1.2.3-54-g00ecf