summaryrefslogtreecommitdiff
path: root/searx/engines
diff options
context:
space:
mode:
authorjazzzooo <38244149+jazzzooo@users.noreply.github.com>2023-09-13 16:21:10 +0000
committerMarkus Heiser <markus.heiser@darmarIT.de>2023-09-14 17:39:23 +0200
commit74600c028d5e4a0745dd3bc48556a000d741f13e (patch)
tree721682d5f1e4f7b9489cbd03b2e180e7687c21cc /searx/engines
parented6a5a01bb4d4ec3b209608a16cca44811a56c1d (diff)
downloadsearxng-74600c028d5e4a0745dd3bc48556a000d741f13e.tar.gz
searxng-74600c028d5e4a0745dd3bc48556a000d741f13e.zip
[fix] engine - Crossref
Crossref was broken on result types journal-issue and component .. The old code had lots of assumptions, and broke during parsing. Now the assumptions are more explicit and have been checked against the API.
Diffstat (limited to 'searx/engines')
-rw-r--r--searx/engines/crossref.py82
1 files changed, 43 insertions, 39 deletions
diff --git a/searx/engines/crossref.py b/searx/engines/crossref.py
index e12a0da5b..c2ed7763f 100644
--- a/searx/engines/crossref.py
+++ b/searx/engines/crossref.py
@@ -1,60 +1,64 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
-"""Semantic Scholar (Science)
-"""
-# pylint: disable=use-dict-literal
+"""CrossRef"""
from urllib.parse import urlencode
-from searx.utils import html_to_text
+from datetime import datetime
about = {
- "website": 'https://www.crossref.org/',
- "wikidata_id": 'Q5188229',
- "official_api_documentation": 'https://github.com/CrossRef/rest-api-doc',
+ "website": "https://www.crossref.org/",
+ "wikidata_id": "Q5188229",
+ "official_api_documentation": "https://api.crossref.org",
"use_official_api": False,
"require_api_key": False,
- "results": 'JSON',
+ "results": "JSON",
}
-categories = ['science', 'scientific publications']
+categories = ["science", "scientific publications"]
paging = True
-search_url = 'https://api.crossref.org/works'
+search_url = "https://api.crossref.org/works"
def request(query, params):
- params['url'] = search_url + '?' + urlencode(dict(query=query, offset=20 * (params['pageno'] - 1)))
+ params["url"] = search_url + "?" + urlencode({"query": query, "offset": 20 * (params["pageno"] - 1)})
return params
def response(resp):
- res = resp.json()
results = []
- for record in res['message']['items']:
- record_type = record['type']
- if record_type == 'book-chapter':
- title = record['container-title'][0]
- if record['title'][0].lower().strip() != title.lower().strip():
- title = html_to_text(title) + ' (' + html_to_text(record['title'][0]) + ')'
- journal = None
+ for record in resp.json()["message"]["items"]:
+
+ if record["type"] == "component":
+ # These seem to be files published along with papers. Not something you'd search for
+ continue
+ result = {
+ "template": "paper.html",
+ "content": record.get("abstract", ""),
+ "doi": record.get("DOI"),
+ "pages": record.get("page"),
+ "publisher": record.get("publisher"),
+ "tags": record.get("subject"),
+ "type": record.get("type"),
+ "url": record.get("URL"),
+ "volume": record.get("volume"),
+ }
+ if record["type"] == "book-chapter":
+ result["title"] = record["container-title"][0]
+ if record["title"][0].lower().strip() != result["title"].lower().strip():
+ result["title"] += f" ({record['title'][0]})"
else:
- title = html_to_text(record['title'][0])
- journal = record.get('container-title', [None])[0]
- url = record.get('resource', {}).get('primary', {}).get('URL') or record['URL']
- authors = [author.get('given', '') + ' ' + author.get('family', '') for author in record.get('author', [])]
- isbn = record.get('isbn') or [i['value'] for i in record.get('isbn-type', [])]
- results.append(
- {
- 'template': 'paper.html',
- 'url': url,
- 'title': title,
- 'journal': journal,
- 'volume': record.get('volume'),
- 'type': record['type'],
- 'content': html_to_text(record.get('abstract', '')),
- 'publisher': record.get('publisher'),
- 'authors': authors,
- 'doi': record['DOI'],
- 'isbn': isbn,
- }
- )
+ result["title"] = record["title"][0] if "title" in record else record.get("container-title", [None])[0]
+ result["journal"] = record.get("container-title", [None])[0] if "title" in record else None
+
+ if "resource" in record and "primary" in record["resource"] and "URL" in record["resource"]["primary"]:
+ result["url"] = record["resource"]["primary"]["URL"]
+ if "published" in record and "date-parts" in record["published"]:
+ result["publishedDate"] = datetime(*(record["published"]["date-parts"][0] + [1, 1][:3]))
+ result["authors"] = [a.get("given", "") + " " + a.get("family", "") for a in record.get("author", [])]
+ result["isbn"] = record.get("isbn") or [i["value"] for i in record.get("isbn-type", [])]
+ # All the links are not PDFs, even if the URL ends with ".pdf"
+ # result["pdf_url"] = record.get("link", [{"URL": None}])[0]["URL"]
+
+ results.append(result)
+
return results