summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--searx/engines/arxiv.py92
-rw-r--r--searx/engines/crossref.py59
-rw-r--r--searx/engines/google_scholar.py85
-rw-r--r--searx/engines/pubmed.py99
-rw-r--r--searx/engines/semantic_scholar.py57
-rw-r--r--searx/engines/springer.py38
-rw-r--r--searx/searxng.msg1
-rw-r--r--searx/settings.yml26
8 files changed, 320 insertions, 137 deletions
diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py
index a1a58172d..a4811ebd5 100644
--- a/searx/engines/arxiv.py
+++ b/searx/engines/arxiv.py
@@ -3,9 +3,10 @@
ArXiV (Scientific preprints)
"""
-from lxml import html
+from lxml import etree
+from lxml.etree import XPath
from datetime import datetime
-from searx.utils import eval_xpath_list, eval_xpath_getindex
+from searx.utils import eval_xpath, eval_xpath_list, eval_xpath_getindex
# about
about = {
@@ -17,7 +18,7 @@ about = {
"results": 'XML-RSS',
}
-categories = ['science']
+categories = ['science', 'scientific publications']
paging = True
base_url = (
@@ -27,6 +28,23 @@ base_url = (
# engine dependent config
number_of_results = 10
+# xpaths
+arxiv_namespaces = {
+ "atom": "http://www.w3.org/2005/Atom",
+ "arxiv": "http://arxiv.org/schemas/atom",
+}
+xpath_entry = XPath('//atom:entry', namespaces=arxiv_namespaces)
+xpath_title = XPath('.//atom:title', namespaces=arxiv_namespaces)
+xpath_id = XPath('.//atom:id', namespaces=arxiv_namespaces)
+xpath_summary = XPath('.//atom:summary', namespaces=arxiv_namespaces)
+xpath_author_name = XPath('.//atom:author/atom:name', namespaces=arxiv_namespaces)
+xpath_doi = XPath('.//arxiv:doi', namespaces=arxiv_namespaces)
+xpath_pdf = XPath('.//atom:link[@title="pdf"]', namespaces=arxiv_namespaces)
+xpath_published = XPath('.//atom:published', namespaces=arxiv_namespaces)
+xpath_journal = XPath('.//arxiv:journal_ref', namespaces=arxiv_namespaces)
+xpath_category = XPath('.//atom:category/@term', namespaces=arxiv_namespaces)
+xpath_comment = XPath('./arxiv:comment', namespaces=arxiv_namespaces)
+
def request(query, params):
# basic search
@@ -41,30 +59,50 @@ def request(query, params):
def response(resp):
results = []
-
- dom = html.fromstring(resp.content)
-
- for entry in eval_xpath_list(dom, '//entry'):
- title = eval_xpath_getindex(entry, './/title', 0).text
-
- url = eval_xpath_getindex(entry, './/id', 0).text
-
- content_string = '{doi_content}{abstract_content}'
-
- abstract = eval_xpath_getindex(entry, './/summary', 0).text
-
- # If a doi is available, add it to the snipppet
- doi_element = eval_xpath_getindex(entry, './/link[@title="doi"]', 0, default=None)
- doi_content = doi_element.text if doi_element is not None else ''
- content = content_string.format(doi_content=doi_content, abstract_content=abstract)
-
- if len(content) > 300:
- content = content[0:300] + "..."
- # TODO: center snippet on query term
-
- publishedDate = datetime.strptime(eval_xpath_getindex(entry, './/published', 0).text, '%Y-%m-%dT%H:%M:%SZ')
-
- res_dict = {'url': url, 'title': title, 'publishedDate': publishedDate, 'content': content}
+ dom = etree.fromstring(resp.content)
+ for entry in eval_xpath_list(dom, xpath_entry):
+ title = eval_xpath_getindex(entry, xpath_title, 0).text
+
+ url = eval_xpath_getindex(entry, xpath_id, 0).text
+ abstract = eval_xpath_getindex(entry, xpath_summary, 0).text
+
+ authors = [author.text for author in eval_xpath_list(entry, xpath_author_name)]
+
+ # doi
+ doi_element = eval_xpath_getindex(entry, xpath_doi, 0, default=None)
+ doi = None if doi_element is None else doi_element.text
+
+ # pdf
+ pdf_element = eval_xpath_getindex(entry, xpath_pdf, 0, default=None)
+ pdf_url = None if pdf_element is None else pdf_element.attrib.get('href')
+
+ # journal
+ journal_element = eval_xpath_getindex(entry, xpath_journal, 0, default=None)
+ journal = None if journal_element is None else journal_element.text
+
+ # tags
+ tag_elements = eval_xpath(entry, xpath_category)
+ tags = [str(tag) for tag in tag_elements]
+
+ # comments
+ comments_elements = eval_xpath_getindex(entry, xpath_comment, 0, default=None)
+ comments = None if comments_elements is None else comments_elements.text
+
+ publishedDate = datetime.strptime(eval_xpath_getindex(entry, xpath_published, 0).text, '%Y-%m-%dT%H:%M:%SZ')
+
+ res_dict = {
+ 'template': 'paper.html',
+ 'url': url,
+ 'title': title,
+ 'publishedDate': publishedDate,
+ 'content': abstract,
+ 'doi': doi,
+ 'authors': authors,
+ 'journal': journal,
+ 'tags': tags,
+ 'comments': comments,
+ 'pdf_url': pdf_url,
+ }
results.append(res_dict)
diff --git a/searx/engines/crossref.py b/searx/engines/crossref.py
new file mode 100644
index 000000000..d61318146
--- /dev/null
+++ b/searx/engines/crossref.py
@@ -0,0 +1,59 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+"""Crossref (Science)
+"""
+
+from urllib.parse import urlencode
+from searx.utils import html_to_text
+
+about = {
+ "website": 'https://www.crossref.org/',
+ "wikidata_id": 'Q5188229',
+ "official_api_documentation": 'https://github.com/CrossRef/rest-api-doc',
+ "use_official_api": False,
+ "require_api_key": False,
+ "results": 'JSON',
+}
+
+categories = ['science', 'scientific publications']
+paging = True
+search_url = 'https://api.crossref.org/works'
+
+
+def request(query, params):
+ params['url'] = search_url + '?' + urlencode(dict(query=query, offset=20 * (params['pageno'] - 1)))
+ return params
+
+
+def response(resp):
+ res = resp.json()
+ results = []
+ for record in res['message']['items']:
+ record_type = record['type']
+ if record_type == 'book-chapter':
+ title = record['container-title'][0]
+ if record['title'][0].lower().strip() != title.lower().strip():
+ title = title + ' (' + record['title'][0] + ')'
+ journal = None
+ else:
+ title = record['title'][0]
+ journal = record.get('container-title', [None])[0]
+ url = record.get('resource', {}).get('primary', {}).get('URL') or record['URL']
+ authors = [author.get('given', '') + ' ' + author.get('family', '') for author in record.get('author', [])]
+ isbn = record.get('isbn') or [i['value'] for i in record.get('isbn-type', [])]
+ results.append(
+ {
+ 'template': 'paper.html',
+ 'url': url,
+ 'title': title,
+ 'journal': journal,
+ 'volume': record.get('volume'),
+ 'type': record['type'],
+ 'content': html_to_text(record.get('abstract', '')),
+ 'publisher': record.get('publisher'),
+ 'authors': authors,
+ 'doi': record['DOI'],
+ 'isbn': isbn,
+ }
+ )
+ return results
diff --git a/searx/engines/google_scholar.py b/searx/engines/google_scholar.py
index 41c62886b..c07cd4cea 100644
--- a/searx/engines/google_scholar.py
+++ b/searx/engines/google_scholar.py
@@ -13,10 +13,12 @@ Definitions`_.
from urllib.parse import urlencode
from datetime import datetime
+from typing import Optional
from lxml import html
from searx.utils import (
eval_xpath,
+ eval_xpath_getindex,
eval_xpath_list,
extract_text,
)
@@ -46,7 +48,7 @@ about = {
}
# engine dependent config
-categories = ['science']
+categories = ['science', 'scientific publications']
paging = True
language_support = True
use_locale_domain = True
@@ -99,7 +101,43 @@ def request(query, params):
return params
-def response(resp):
+def parse_gs_a(text: Optional[str]):
+ """Parse the text written in green.
+
+ Possible formats:
+ * "{authors} - {journal}, {year} - {publisher}"
+ * "{authors} - {year} - {publisher}"
+ * "{authors} - {publisher}"
+ """
+ if text is None or text == "":
+ return None, None, None, None
+
+ s_text = text.split(' - ')
+ authors = s_text[0].split(', ')
+ publisher = s_text[-1]
+ if len(s_text) != 3:
+ return authors, None, publisher, None
+
+ # the format is "{authors} - {journal}, {year} - {publisher}" or "{authors} - {year} - {publisher}"
+ # get journal and year
+ journal_year = s_text[1].split(', ')
+ # journal is optional and may contain some commas
+ if len(journal_year) > 1:
+ journal = ', '.join(journal_year[0:-1])
+ if journal == '…':
+ journal = None
+ else:
+ journal = None
+ # year
+ year = journal_year[-1]
+ try:
+ publishedDate = datetime.strptime(year.strip(), '%Y')
+ except ValueError:
+ publishedDate = None
+ return authors, journal, publisher, publishedDate
+
+
+def response(resp): # pylint: disable=too-many-locals
"""Get response from google's search request"""
results = []
@@ -112,30 +150,53 @@ def response(resp):
dom = html.fromstring(resp.text)
# parse results
- for result in eval_xpath_list(dom, '//div[@class="gs_ri"]'):
+ for result in eval_xpath_list(dom, '//div[@data-cid]'):
- title = extract_text(eval_xpath(result, './h3[1]//a'))
+ title = extract_text(eval_xpath(result, './/h3[1]//a'))
if not title:
# this is a [ZITATION] block
continue
- url = eval_xpath(result, './h3[1]//a/@href')[0]
- content = extract_text(eval_xpath(result, './div[@class="gs_rs"]')) or ''
-
- pub_info = extract_text(eval_xpath(result, './div[@class="gs_a"]'))
- if pub_info:
- content += "[%s]" % pub_info
-
pub_type = extract_text(eval_xpath(result, './/span[@class="gs_ct1"]'))
if pub_type:
- title = title + " " + pub_type
+ pub_type = pub_type[1:-1].lower()
+
+ url = eval_xpath_getindex(result, './/h3[1]//a/@href', 0)
+ content = extract_text(eval_xpath(result, './/div[@class="gs_rs"]'))
+ authors, journal, publisher, publishedDate = parse_gs_a(
+ extract_text(eval_xpath(result, './/div[@class="gs_a"]'))
+ )
+ if publisher in url:
+ publisher = None
+
+ # cited by
+ comments = extract_text(eval_xpath(result, './/div[@class="gs_fl"]/a[starts-with(@href,"/scholar?cites=")]'))
+
+ # link to the html or pdf document
+ html_url = None
+ pdf_url = None
+ doc_url = eval_xpath_getindex(result, './/div[@class="gs_or_ggsm"]/a/@href', 0, default=None)
+ doc_type = extract_text(eval_xpath(result, './/span[@class="gs_ctg2"]'))
+ if doc_type == "[PDF]":
+ pdf_url = doc_url
+ else:
+ html_url = doc_url
results.append(
{
+ 'template': 'paper.html',
+ 'type': pub_type,
'url': url,
'title': title,
+ 'authors': authors,
+ 'publisher': publisher,
+ 'journal': journal,
+ 'publishedDate': publishedDate,
'content': content,
+ 'comments': comments,
+ 'html_url': html_url,
+ 'pdf_url': pdf_url,
}
)
diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py
index 27444ae24..02e282d5f 100644
--- a/searx/engines/pubmed.py
+++ b/searx/engines/pubmed.py
@@ -3,11 +3,15 @@
PubMed (Scholar publications)
"""
-from flask_babel import gettext
from lxml import etree
from datetime import datetime
from urllib.parse import urlencode
from searx.network import get
+from searx.utils import (
+ eval_xpath_getindex,
+ eval_xpath_list,
+ extract_text,
+)
# about
about = {
@@ -22,7 +26,7 @@ about = {
"results": 'XML',
}
-categories = ['science']
+categories = ['science', 'scientific publications']
base_url = (
'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi' + '?db=pubmed&{query}&retstart={offset}&retmax={hits}'
@@ -63,46 +67,61 @@ def response(resp):
retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)
- search_results_xml = get(retrieve_url_encoded).content
- search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation')
-
- for entry in search_results:
- title = entry.xpath('.//Article/ArticleTitle')[0].text
+ search_results_response = get(retrieve_url_encoded).content
+ search_results = etree.XML(search_results_response)
+ for entry in eval_xpath_list(search_results, '//PubmedArticle'):
+ medline = eval_xpath_getindex(entry, './MedlineCitation', 0)
- pmid = entry.xpath('.//PMID')[0].text
+ title = eval_xpath_getindex(medline, './/Article/ArticleTitle', 0).text
+ pmid = eval_xpath_getindex(medline, './/PMID', 0).text
url = pubmed_url + pmid
-
- try:
- content = entry.xpath('.//Abstract/AbstractText')[0].text
- except:
- content = gettext('No abstract is available for this publication.')
-
- # If a doi is available, add it to the snipppet
- try:
- doi = entry.xpath('.//ELocationID[@EIdType="doi"]')[0].text
- content = 'DOI: {doi} Abstract: {content}'.format(doi=doi, content=content)
- except:
- pass
-
- if len(content) > 300:
- content = content[0:300] + "..."
- # TODO: center snippet on query term
-
- res_dict = {'url': url, 'title': title, 'content': content}
-
- try:
- publishedDate = datetime.strptime(
- entry.xpath('.//DateCreated/Year')[0].text
- + '-'
- + entry.xpath('.//DateCreated/Month')[0].text
- + '-'
- + entry.xpath('.//DateCreated/Day')[0].text,
- '%Y-%m-%d',
- )
- res_dict['publishedDate'] = publishedDate
- except:
- pass
+ content = extract_text(
+ eval_xpath_getindex(medline, './/Abstract/AbstractText//text()', 0, default=None), allow_none=True
+ )
+ doi = extract_text(
+ eval_xpath_getindex(medline, './/ELocationID[@EIdType="doi"]/text()', 0, default=None), allow_none=True
+ )
+ journal = extract_text(
+ eval_xpath_getindex(medline, './Article/Journal/Title/text()', 0, default=None), allow_none=True
+ )
+ issn = extract_text(
+ eval_xpath_getindex(medline, './Article/Journal/ISSN/text()', 0, default=None), allow_none=True
+ )
+ authors = []
+ for author in eval_xpath_list(medline, './Article/AuthorList/Author'):
+ f = eval_xpath_getindex(author, './ForeName', 0, default=None)
+ l = eval_xpath_getindex(author, './LastName', 0, default=None)
+ f = '' if f is None else f.text
+ l = '' if l is None else l.text
+ authors.append((f + ' ' + l).strip())
+
+ res_dict = {
+ 'template': 'paper.html',
+ 'url': url,
+ 'title': title,
+ 'content': content,
+ 'journal': journal,
+ 'issn': [issn],
+ 'authors': authors,
+ 'doi': doi,
+ }
+
+ accepted_date = eval_xpath_getindex(
+ entry, './PubmedData/History//PubMedPubDate[@PubStatus="accepted"]', 0, default=None
+ )
+ if accepted_date is not None:
+ year = eval_xpath_getindex(accepted_date, './Year', 0)
+ month = eval_xpath_getindex(accepted_date, './Month', 0)
+ day = eval_xpath_getindex(accepted_date, './Day', 0)
+ try:
+ publishedDate = datetime.strptime(
+ year.text + '-' + month.text + '-' + day.text,
+ '%Y-%m-%d',
+ )
+ res_dict['publishedDate'] = publishedDate
+ except Exception as e:
+ print(e)
results.append(res_dict)
- return results
+ return results
diff --git a/searx/engines/semantic_scholar.py b/searx/engines/semantic_scholar.py
index bda731047..b2701c333 100644
--- a/searx/engines/semantic_scholar.py
+++ b/searx/engines/semantic_scholar.py
@@ -6,6 +6,8 @@
from json import dumps, loads
from datetime import datetime
+from flask_babel import gettext
+
about = {
"website": 'https://www.semanticscholar.org/',
"wikidata_id": 'Q22908627',
@@ -15,6 +17,7 @@ about = {
"results": 'JSON',
}
+categories = ['science', 'scientific publications']
paging = True
search_url = 'https://www.semanticscholar.org/api/1/search'
paper_url = 'https://www.semanticscholar.org/paper'
@@ -47,9 +50,6 @@ def response(resp):
results = []
for result in res['results']:
- item = {}
- metadata = []
-
url = result.get('primaryPaperLink', {}).get('url')
if not url and result.get('links'):
url = result.get('links')[0]
@@ -60,22 +60,47 @@ def response(resp):
if not url:
url = paper_url + '/%s' % result['id']
- item['url'] = url
+ # publishedDate
+ if 'pubDate' in result:
+ publishedDate = datetime.strptime(result['pubDate'], "%Y-%m-%d")
+ else:
+ publishedDate = None
- item['title'] = result['title']['text']
- item['content'] = result['paperAbstract']['text']
+ # authors
+ authors = [author[0]['name'] for author in result.get('authors', [])]
- metadata = result.get('fieldsOfStudy') or []
- venue = result.get('venue', {}).get('text')
- if venue:
- metadata.append(venue)
- if metadata:
- item['metadata'] = ', '.join(metadata)
+ # pick the first alternate link, but not one from the crawler
+ pdf_url = None
+ for doc in result.get('alternatePaperLinks', []):
+ if doc['linkType'] != 'crawler':
+ pdf_url = doc['url']
+ break
- pubDate = result.get('pubDate')
- if pubDate:
- item['publishedDate'] = datetime.strptime(pubDate, "%Y-%m-%d")
+ # comments
+ comments = None
+ if 'citationStats' in result:
+ comments = gettext(
+ '{numCitations} citations from the year {firstCitationVelocityYear} to {lastCitationVelocityYear}'
+ ).format(
+ numCitations=result['citationStats']['numCitations'],
+ firstCitationVelocityYear=result['citationStats']['firstCitationVelocityYear'],
+ lastCitationVelocityYear=result['citationStats']['lastCitationVelocityYear'],
+ )
- results.append(item)
+ results.append(
+ {
+ 'template': 'paper.html',
+ 'url': url,
+ 'title': result['title']['text'],
+ 'content': result['paperAbstract']['text'],
+ 'journal': result.get('venue', {}).get('text') or result.get('journal', {}).get('name'),
+ 'doi': result.get('doiInfo', {}).get('doi'),
+ 'tags': result.get('fieldsOfStudy'),
+ 'authors': authors,
+ 'pdf_url': pdf_url,
+ 'publishedDate': publishedDate,
+ 'comments': comments,
+ }
+ )
return results
diff --git a/searx/engines/springer.py b/searx/engines/springer.py
index 512d71e5e..2711fa807 100644
--- a/searx/engines/springer.py
+++ b/searx/engines/springer.py
@@ -19,7 +19,7 @@ about = {
"results": 'JSON',
}
-categories = ['science']
+categories = ['science', 'scientific publications']
paging = True
nb_per_page = 10
api_key = 'unset'
@@ -41,32 +41,30 @@ def response(resp):
json_data = loads(resp.text)
for record in json_data['records']:
- content = record['abstract'][0:500]
- if len(record['abstract']) > len(content):
- content += "..."
+ content = record['abstract']
published = datetime.strptime(record['publicationDate'], '%Y-%m-%d')
-
- metadata = [
- record[x]
- for x in [
- 'publicationName',
- 'identifier',
- 'contentType',
- ]
- if record.get(x) is not None
- ]
-
- metadata = ' / '.join(metadata)
- if record.get('startingPage') and record.get('endingPage') is not None:
- metadata += " (%(startingPage)s-%(endingPage)s)" % record
-
+ authors = [" ".join(author['creator'].split(', ')[::-1]) for author in record['creators']]
+ tags = record.get('genre')
+ if isinstance(tags, str):
+ tags = [tags]
results.append(
{
+ 'template': 'paper.html',
'title': record['title'],
'url': record['url'][0]['value'].replace('http://', 'https://', 1),
+ 'type': record.get('contentType'),
'content': content,
'publishedDate': published,
- 'metadata': metadata,
+ 'authors': authors,
+ 'doi': record.get('doi'),
+ 'journal': record.get('publicationName'),
+ 'start_page': record.get('start_page'),
+ 'end_page': record.get('end_page'),
+ 'tags': tags,
+ 'issn': [record.get('issn')],
+ 'isbn': [record.get('isbn')],
+ 'volume': record.get('volume') or None,
+ 'number': record.get('number') or None,
}
)
return results
diff --git a/searx/searxng.msg b/searx/searxng.msg
index 3b876f96d..c37240f83 100644
--- a/searx/searxng.msg
+++ b/searx/searxng.msg
@@ -43,6 +43,7 @@ CATEGORY_GROUPS = {
'REPOS': 'repos',
'SOFTWARE_WIKIS': 'software wikis',
'WEB': 'web',
+ 'SCIENTIFIC PUBLICATIONS': 'scientific publications',
}
STYLE_NAMES = {
diff --git a/searx/settings.yml b/searx/settings.yml
index 3f07bb2dd..ba38e694a 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -319,7 +319,6 @@ engines:
- name: arxiv
engine: arxiv
shortcut: arx
- categories: science
timeout: 4.0
# tmp suspended: dh key too small
@@ -411,23 +410,9 @@ engines:
# api_key: 'unset'
- name: crossref
- engine: json_engine
- paging: true
- search_url: https://search.crossref.org/dois?q={query}&page={pageno}
- url_query: doi
- title_query: title
- title_html_to_text: true
- content_query: fullCitation
- content_html_to_text: true
- categories: science
+ engine: crossref
shortcut: cr
- about:
- website: https://www.crossref.org/
- wikidata_id: Q5188229
- official_api_documentation: https://github.com/CrossRef/rest-api-doc
- use_official_api: false
- require_api_key: false
- results: JSON
+ timeout: 10
- name: yep
engine: json_engine
@@ -1068,7 +1053,7 @@ engines:
title_query: metadata/oaf:entity/oaf:result/title/$
content_query: metadata/oaf:entity/oaf:result/description/$
content_html_to_text: true
- categories: science
+ categories: "science"
shortcut: oad
timeout: 5.0
about:
@@ -1198,7 +1183,6 @@ engines:
- name: pubmed
engine: pubmed
shortcut: pub
- categories: science
timeout: 3.0
- name: pypi
@@ -1346,7 +1330,6 @@ engines:
engine: semantic_scholar
disabled: true
shortcut: se
- categories: science
# Spotify needs API credentials
# - name: spotify
@@ -1372,8 +1355,7 @@ engines:
# # working API key, for test & debug: "a69685087d07eca9f13db62f65b8f601"
# api_key: 'unset'
# shortcut: springer
- # categories: science
- # timeout: 6.0
+ # timeout: 15.0
- name: startpage
engine: startpage