diff options
author | Alexandre FLAMENT <alexandre.flament@hesge.ch> | 2022-08-26 16:10:12 +0000 |
---|---|---|
committer | Alexandre Flament <alex@al-f.net> | 2022-09-23 20:45:58 +0200 |
commit | e36f85b8365e5d6a9263dd78242a10a305a9000c (patch) | |
tree | fce8f3e33d26847b004c20c378fb3fa35ca2c8eb /searx/engines | |
parent | 593026ad9cd024fd7b3182d48f274aa41b374c74 (diff) | |
download | searxng-e36f85b8365e5d6a9263dd78242a10a305a9000c.tar.gz searxng-e36f85b8365e5d6a9263dd78242a10a305a9000c.zip |
Science category: update the engines
* use the paper.html template
* fetch more data from the engines
* add crossref.py
Diffstat (limited to 'searx/engines')
-rw-r--r-- | searx/engines/arxiv.py | 92 | ||||
-rw-r--r-- | searx/engines/crossref.py | 59 | ||||
-rw-r--r-- | searx/engines/google_scholar.py | 85 | ||||
-rw-r--r-- | searx/engines/pubmed.py | 99 | ||||
-rw-r--r-- | searx/engines/semantic_scholar.py | 57 | ||||
-rw-r--r-- | searx/engines/springer.py | 38 |
6 files changed, 315 insertions, 115 deletions
diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py index a1a58172d..a4811ebd5 100644 --- a/searx/engines/arxiv.py +++ b/searx/engines/arxiv.py @@ -3,9 +3,10 @@ ArXiV (Scientific preprints) """ -from lxml import html +from lxml import etree +from lxml.etree import XPath from datetime import datetime -from searx.utils import eval_xpath_list, eval_xpath_getindex +from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex # about about = { @@ -17,7 +18,7 @@ about = { "results": 'XML-RSS', } -categories = ['science'] +categories = ['science', 'scientific publications'] paging = True base_url = ( @@ -27,6 +28,23 @@ base_url = ( # engine dependent config number_of_results = 10 +# xpaths +arxiv_namespaces = { + "atom": "http://www.w3.org/2005/Atom", + "arxiv": "http://arxiv.org/schemas/atom", +} +xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces) +xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces) +xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces) +xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces) +xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces) +xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces) +xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces) +xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces) +xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces) +xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces) +xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces) + def request(query, params): # basic search @@ -41,30 +59,50 @@ def request(query, params): def response(resp): results = [] - - dom = html.fromstring(resp.content) - - for entry in eval_xpath_list(dom, '//entry'): - title = eval_xpath_getindex(entry, './/title', 0).text - - url = eval_xpath_getindex(entry, './/id', 0).text - - content_string = '{doi_content}{abstract_content}' - - abstract = eval_xpath_getindex(entry, './/summary', 0).text - - # If a doi is available, add it to the snipppet - doi_element = eval_xpath_getindex(entry, './/link[@title="doi"]', 0, default=None) - doi_content = doi_element.text if doi_element is not None else '' - content = content_string.format(doi_content=doi_content, abstract_content=abstract) - - if len(content) > 300: - content = content[0:300] + "..." - # TODO: center snippet on query term - - publishedDate = datetime.strptime(eval_xpath_getindex(entry, './/published', 0).text, '%Y-%m-%dT%H:%M:%SZ') - - res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content} + dom = etree.fromstring(resp.content) + for entry in eval_xpath_list(dom, xpath_entry): + title = eval_xpath_getindex(entry, xpath_title, 0).text + + url = eval_xpath_getindex(entry, xpath_id, 0).text + abstract = eval_xpath_getindex(entry, xpath_summary, 0).text + + authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)] + + # doi + doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None) + doi = None if doi_element is None else doi_element.text + + # pdf + pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None) + pdf_url = None if pdf_element is None else pdf_element.attrib.get('href') + + # journal + journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None) + journal = None if journal_element is None else journal_element.text + + # tags + tag_elements = eval_xpath(entry, xpath_category) + tags = [str(tag) for tag in tag_elements] + + # comments + comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None) + comments = None if comments_elements is None else comments_elements.text + + publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ') + + res_dict = { + 'template': 'paper.html', + 'url': url, + 'title': title, + 'publishedDate': publishedDate, + 'content': abstract, + 'doi': doi, + 'authors': authors, + 'journal': journal, + 'tags': tags, + 'comments': comments, + 'pdf_url': pdf_url, + } results.append(res_dict) diff --git a/searx/engines/crossref.py b/searx/engines/crossref.py new file mode 100644 index 000000000..d61318146 --- /dev/null +++ b/searx/engines/crossref.py @@ -0,0 +1,59 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Semantic Scholar (Science) +""" + +from urllib.parse import urlencode +from searx.utils import html_to_text + +about = { + "website": 'https://www.crossref.org/', + "wikidata_id": 'Q5188229', + "official_api_documentation": 'https://github.com/CrossRef/rest-api-doc', + "use_official_api": False, + "require_api_key": False, + "results": 'JSON', +} + +categories = ['science', 'scientific publications'] +paging = True +search_url = 'https://api.crossref.org/works' + + +def request(query, params): + params['url'] = search_url + '?' + urlencode(dict(query=query, offset=20 * (params['pageno'] - 1))) + return params + + +def response(resp): + res = resp.json() + results = [] + for record in res['message']['items']: + record_type = record['type'] + if record_type == 'book-chapter': + title = record['container-title'][0] + if record['title'][0].lower().strip() != title.lower().strip(): + title = title + ' (' + record['title'][0] + ')' + journal = None + else: + title = record['title'][0] + journal = record.get('container-title', [None])[0] + url = record.get('resource', {}).get('primary', {}).get('URL') or record['URL'] + authors = [author.get('given', '') + ' ' + author.get('family', '') for author in record.get('author', [])] + isbn = record.get('isbn') or [i['value'] for i in record.get('isbn-type', [])] + results.append( + { + 'template': 'paper.html', + 'url': url, + 'title': title, + 'journal': journal, + 'volume': record.get('volume'), + 'type': record['type'], + 'content': html_to_text(record.get('abstract', '')), + 'publisher': record.get('publisher'), + 'authors': authors, + 'doi': record['DOI'], + 'isbn': isbn, + } + ) + return results diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py index 41c62886b..c07cd4cea 100644 --- a/searx/engines/google_scholar.py +++ b/searx/engines/google_scholar.py @@ -13,10 +13,12 @@ Definitions`_. from urllib.parse import urlencode from datetime import datetime +from typing import Optional from lxml import html from searx.utils import ( eval_xpath, + eval_xpath_getindex, eval_xpath_list, extract_text, ) @@ -46,7 +48,7 @@ about = { } # engine dependent config -categories = ['science'] +categories = ['science', 'scientific publications'] paging = True language_support = True use_locale_domain = True @@ -99,7 +101,43 @@ def request(query, params): return params -def response(resp): +def parse_gs_a(text: Optional[str]): + """Parse the text written in green. + + Possible formats: + * "{authors} - {journal}, {year} - {publisher}" + * "{authors} - {year} - {publisher}" + * "{authors} - {publisher}" + """ + if text is None or text == "": + return None, None, None, None + + s_text = text.split(' - ') + authors = s_text[0].split(', ') + publisher = s_text[-1] + if len(s_text) != 3: + return authors, None, publisher, None + + # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}" + # get journal and year + journal_year = s_text[1].split(', ') + # journal is optional and may contains some coma + if len(journal_year) > 1: + journal = ', '.join(journal_year[0:-1]) + if journal == '…': + journal = None + else: + journal = None + # year + year = journal_year[-1] + try: + publishedDate = datetime.strptime(year.strip(), '%Y') + except ValueError: + publishedDate = None + return authors, journal, publisher, publishedDate + + +def response(resp): # pylint: disable=too-many-locals """Get response from google's search request""" results = [] @@ -112,30 +150,53 @@ def response(resp): dom = html.fromstring(resp.text) # parse results - for result in eval_xpath_list(dom, '//div[@class="gs_ri"]'): + for result in eval_xpath_list(dom, '//div[@data-cid]'): - title = extract_text(eval_xpath(result, './h3[1]//a')) + title = extract_text(eval_xpath(result, './/h3[1]//a')) if not title: # this is a [ZITATION] block continue - url = eval_xpath(result, './h3[1]//a/@href')[0] - content = extract_text(eval_xpath(result, './div[@class="gs_rs"]')) or '' - - pub_info = extract_text(eval_xpath(result, './div[@class="gs_a"]')) - if pub_info: - content += "[%s]" % pub_info - pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]')) if pub_type: - title = title + " " + pub_type + pub_type = pub_type[1:-1].lower() + + url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0) + content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]')) + authors, journal, publisher, publishedDate = parse_gs_a( + extract_text(eval_xpath(result, './/div[@class="gs_a"]')) + ) + if publisher in url: + publisher = None + + # cited by + comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]')) + + # link to the html or pdf document + html_url = None + pdf_url = None + doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None) + doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]')) + if doc_type == "[PDF]": + pdf_url = doc_url + else: + html_url = doc_url results.append( { + 'template': 'paper.html', + 'type': pub_type, 'url': url, 'title': title, + 'authors': authors, + 'publisher': publisher, + 'journal': journal, + 'publishedDate': publishedDate, 'content': content, + 'comments': comments, + 'html_url': html_url, + 'pdf_url': pdf_url, } ) diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py index 27444ae24..02e282d5f 100644 --- a/searx/engines/pubmed.py +++ b/searx/engines/pubmed.py @@ -3,11 +3,15 @@ PubMed (Scholar publications) """ -from flask_babel import gettext from lxml import etree from datetime import datetime from urllib.parse import urlencode from searx.network import get +from searx.utils import ( + eval_xpath_getindex, + eval_xpath_list, + extract_text, +) # about about = { @@ -22,7 +26,7 @@ about = { "results": 'XML', } -categories = ['science'] +categories = ['science', 'scientific publications'] base_url = ( 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}' @@ -63,46 +67,61 @@ def response(resp): retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args) - search_results_xml = get(retrieve_url_encoded).content - search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation') - - for entry in search_results: - title = entry.xpath('.//Article/ArticleTitle')[0].text + search_results_response = get(retrieve_url_encoded).content + search_results = etree.XML(search_results_response) + for entry in eval_xpath_list(search_results, '//PubmedArticle'): + medline = eval_xpath_getindex(entry, './MedlineCitation', 0) - pmid = entry.xpath('.//PMID')[0].text + title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text + pmid = eval_xpath_getindex(medline, './/PMID', 0).text url = pubmed_url + pmid - - try: - content = entry.xpath('.//Abstract/AbstractText')[0].text - except: - content = gettext('No abstract is available for this publication.') - - # If a doi is available, add it to the snipppet - try: - doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text - content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content) - except: - pass - - if len(content) > 300: - content = content[0:300] + "..." - # TODO: center snippet on query term - - res_dict = {'url': url, 'title': title, 'content': content} - - try: - publishedDate = datetime.strptime( - entry.xpath('.//DateCreated/Year')[0].text - + '-' - + entry.xpath('.//DateCreated/Month')[0].text - + '-' - + entry.xpath('.//DateCreated/Day')[0].text, - '%Y-%m-%d', - ) - res_dict['publishedDate'] = publishedDate - except: - pass + content = extract_text( + eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True + ) + doi = extract_text( + eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True + ) + journal = extract_text( + eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True + ) + issn = extract_text( + eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True + ) + authors = [] + for author in eval_xpath_list(medline, './Article/AuthorList/Author'): + f = eval_xpath_getindex(author, './ForeName', 0, default=None) + l = eval_xpath_getindex(author, './LastName', 0, default=None) + f = '' if f is None else f.text + l = '' if l is None else l.text + authors.append((f + ' ' + l).strip()) + + res_dict = { + 'template': 'paper.html', + 'url': url, + 'title': title, + 'content': content, + 'journal': journal, + 'issn': [issn], + 'authors': authors, + 'doi': doi, + } + + accepted_date = eval_xpath_getindex( + entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None + ) + if accepted_date is not None: + year = eval_xpath_getindex(accepted_date, './Year', 0) + month = eval_xpath_getindex(accepted_date, './Month', 0) + day = eval_xpath_getindex(accepted_date, './Day', 0) + try: + publishedDate = datetime.strptime( + year.text + '-' + month.text + '-' + day.text, + '%Y-%m-%d', + ) + res_dict['publishedDate'] = publishedDate + except Exception as e: + print(e) results.append(res_dict) - return results + return results diff --git a/searx/engines/semantic_scholar.py b/searx/engines/semantic_scholar.py index bda731047..b2701c333 100644 --- a/searx/engines/semantic_scholar.py +++ b/searx/engines/semantic_scholar.py @@ -6,6 +6,8 @@ from json import dumps, loads from datetime import datetime +from flask_babel import gettext + about = { "website": 'https://www.semanticscholar.org/', "wikidata_id": 'Q22908627', @@ -15,6 +17,7 @@ about = { "results": 'JSON', } +categories = ['science', 'scientific publications'] paging = True search_url = 'https://www.semanticscholar.org/api/1/search' paper_url = 'https://www.semanticscholar.org/paper' @@ -47,9 +50,6 @@ def response(resp): results = [] for result in res['results']: - item = {} - metadata = [] - url = result.get('primaryPaperLink', {}).get('url') if not url and result.get('links'): url = result.get('links')[0] @@ -60,22 +60,47 @@ def response(resp): if not url: url = paper_url + '/%s' % result['id'] - item['url'] = url + # publishedDate + if 'pubDate' in result: + publishedDate = datetime.strptime(result['pubDate'], "%Y-%m-%d") + else: + publishedDate = None - item['title'] = result['title']['text'] - item['content'] = result['paperAbstract']['text'] + # authors + authors = [author[0]['name'] for author in result.get('authors', [])] - metadata = result.get('fieldsOfStudy') or [] - venue = result.get('venue', {}).get('text') - if venue: - metadata.append(venue) - if metadata: - item['metadata'] = ', '.join(metadata) + # pick for the first alternate link, but not from the crawler + pdf_url = None + for doc in result.get('alternatePaperLinks', []): + if doc['linkType'] != 'crawler': + pdf_url = doc['url'] + break - pubDate = result.get('pubDate') - if pubDate: - item['publishedDate'] = datetime.strptime(pubDate, "%Y-%m-%d") + # comments + comments = None + if 'citationStats' in result: + comments = gettext( + '{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}' + ).format( + numCitations=result['citationStats']['numCitations'], + firstCitationVelocityYear=result['citationStats']['firstCitationVelocityYear'], + lastCitationVelocityYear=result['citationStats']['lastCitationVelocityYear'], + ) - results.append(item) + results.append( + { + 'template': 'paper.html', + 'url': url, + 'title': result['title']['text'], + 'content': result['paperAbstract']['text'], + 'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'), + 'doi': result.get('doiInfo', {}).get('doi'), + 'tags': result.get('fieldsOfStudy'), + 'authors': authors, + 'pdf_url': pdf_url, + 'publishedDate': publishedDate, + 'comments': comments, + } + ) return results diff --git a/searx/engines/springer.py b/searx/engines/springer.py index 512d71e5e..2711fa807 100644 --- a/searx/engines/springer.py +++ b/searx/engines/springer.py @@ -19,7 +19,7 @@ about = { "results": 'JSON', } -categories = ['science'] +categories = ['science', 'scientific publications'] paging = True nb_per_page = 10 api_key = 'unset' @@ -41,32 +41,30 @@ def response(resp): json_data = loads(resp.text) for record in json_data['records']: - content = record['abstract'][0:500] - if len(record['abstract']) > len(content): - content += "..." + content = record['abstract'] published = datetime.strptime(record['publicationDate'], '%Y-%m-%d') - - metadata = [ - record[x] - for x in [ - 'publicationName', - 'identifier', - 'contentType', - ] - if record.get(x) is not None - ] - - metadata = ' / '.join(metadata) - if record.get('startingPage') and record.get('endingPage') is not None: - metadata += " (%(startingPage)s-%(endingPage)s)" % record - + authors = [" ".join(author['creator'].split(', ')[::-1]) for author in record['creators']] + tags = record.get('genre') + if isinstance(tags, str): + tags = [tags] results.append( { + 'template': 'paper.html', 'title': record['title'], 'url': record['url'][0]['value'].replace('http://', 'https://', 1), + 'type': record.get('contentType'), 'content': content, 'publishedDate': published, - 'metadata': metadata, + 'authors': authors, + 'doi': record.get('doi'), + 'journal': record.get('publicationName'), + 'start_page': record.get('start_page'), + 'end_page': record.get('end_page'), + 'tags': tags, + 'issn': [record.get('issn')], + 'isbn': [record.get('isbn')], + 'volume': record.get('volume') or None, + 'number': record.get('number') or None, } ) return results |