diff options
author | Alexandre Flament <alex@al-f.net> | 2022-09-24 14:26:07 +0200 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarit.de> | 2022-09-24 15:02:39 +0200 |
commit | 16443d4f4a4a3b94c8646db48ac3f1ae6f0623c4 (patch) | |
tree | a960840931c02d6997b9a086a7ec8c3022fb53e0 /searx | |
parent | c76830d8a878a69924bfda54825c4bd09b6287db (diff) | |
download | searxng-16443d4f4a4a3b94c8646db48ac3f1ae6f0623c4.tar.gz searxng-16443d4f4a4a3b94c8646db48ac3f1ae6f0623c4.zip |
[mod] core.ac.uk: try multiple ways to get url
If the url is not found, using:
* the DOI
* the downloadUrl
* the ARK id
Diffstat (limited to 'searx')
-rw-r--r-- | searx/engines/core.py | 37 |
1 files changed, 29 insertions, 8 deletions
diff --git a/searx/engines/core.py b/searx/engines/core.py index c95fa1d28..a997343f2 100644 --- a/searx/engines/core.py +++ b/searx/engines/core.py @@ -41,7 +41,6 @@ def request(query, params): ) params['url'] = base_url + search_path - logger.debug("query_url --> %s", params['url']) return params @@ -51,17 +50,39 @@ def response(resp): for result in json_data['data']: source = result['_source'] - if not source['urls']: + url = None + if source.get('urls'): + url = source['urls'][0].replace('http://', 'https://', 1) + + if url is None and source.get('doi'): + # use the DOI reference + url = 'https://doi.org/' + source['doi'] + + if url is None and source.get('downloadUrl'): + # use the downloadUrl + url = source['downloadUrl'] + + if url is None and source.get('identifiers'): + # try to find an ark id, see + # https://www.wikidata.org/wiki/Property:P8091 + # and https://en.wikipedia.org/wiki/Archival_Resource_Key + arkids = [ + identifier[5:] # 5 is the length of "ark:/" + for identifier in source.get('identifiers') + if isinstance(identifier, str) and identifier.startswith('ark:/') + ] + if len(arkids) > 0: + url = 'https://n2t.net/' + arkids[0] + + if url is None: continue time = source['publishedDate'] or source['depositedDate'] if time: publishedDate = datetime.fromtimestamp(time / 1000) - journals = [] - if source['journals']: - for j in source['journals']: - journals.append(j['title']) + # sometimes the 'title' is None / filter None values + journals = [j['title'] for j in (source.get('journals') or []) if j['title']] publisher = source['publisher'] if publisher: @@ -71,8 +92,8 @@ def response(resp): { 'template': 'paper.html', 'title': source['title'], - 'url': source['urls'][0].replace('http://', 'https://', 1), - 'content': source['description'], + 'url': url, + 'content': source['description'] or '', # 'comments': '', 'tags': source['topics'], 'publishedDate': publishedDate, |