summaryrefslogtreecommitdiff
path: root/searx
diff options
context:
space:
mode:
authorAlexandre Flament <alex@al-f.net>2022-09-24 14:26:07 +0200
committerMarkus Heiser <markus.heiser@darmarit.de>2022-09-24 15:02:39 +0200
commit16443d4f4a4a3b94c8646db48ac3f1ae6f0623c4 (patch)
treea960840931c02d6997b9a086a7ec8c3022fb53e0 /searx
parentc76830d8a878a69924bfda54825c4bd09b6287db (diff)
downloadsearxng-16443d4f4a4a3b94c8646db48ac3f1ae6f0623c4.tar.gz
searxng-16443d4f4a4a3b94c8646db48ac3f1ae6f0623c4.zip
[mod] core.ac.uk: try multiple ways to get url
If the url is not found, try using: * the DOI * the downloadUrl * the ARK id
Diffstat (limited to 'searx')
-rw-r--r--searx/engines/core.py37
1 file changed, 29 insertions, 8 deletions
diff --git a/searx/engines/core.py b/searx/engines/core.py
index c95fa1d28..a997343f2 100644
--- a/searx/engines/core.py
+++ b/searx/engines/core.py
@@ -41,7 +41,6 @@ def request(query, params):
)
params['url'] = base_url + search_path
- logger.debug("query_url --> %s", params['url'])
return params
@@ -51,17 +50,39 @@ def response(resp):
for result in json_data['data']:
source = result['_source']
- if not source['urls']:
+ url = None
+ if source.get('urls'):
+ url = source['urls'][0].replace('http://', 'https://', 1)
+
+ if url is None and source.get('doi'):
+ # use the DOI reference
+ url = 'https://doi.org/' + source['doi']
+
+ if url is None and source.get('downloadUrl'):
+ # use the downloadUrl
+ url = source['downloadUrl']
+
+ if url is None and source.get('identifiers'):
+ # try to find an ark id, see
+ # https://www.wikidata.org/wiki/Property:P8091
+ # and https://en.wikipedia.org/wiki/Archival_Resource_Key
+ arkids = [
+ identifier[5:] # 5 is the length of "ark:/"
+ for identifier in source.get('identifiers')
+ if isinstance(identifier, str) and identifier.startswith('ark:/')
+ ]
+ if len(arkids) > 0:
+ url = 'https://n2t.net/' + arkids[0]
+
+ if url is None:
continue
time = source['publishedDate'] or source['depositedDate']
if time:
publishedDate = datetime.fromtimestamp(time / 1000)
- journals = []
- if source['journals']:
- for j in source['journals']:
- journals.append(j['title'])
+ # sometimes the 'title' is None / filter None values
+ journals = [j['title'] for j in (source.get('journals') or []) if j['title']]
publisher = source['publisher']
if publisher:
@@ -71,8 +92,8 @@ def response(resp):
{
'template': 'paper.html',
'title': source['title'],
- 'url': source['urls'][0].replace('http://', 'https://', 1),
- 'content': source['description'],
+ 'url': url,
+ 'content': source['description'] or '',
# 'comments': '',
'tags': source['topics'],
'publishedDate': publishedDate,