diff options
-rw-r--r-- | Dockerfile | 3 | ||||
-rw-r--r-- | requirements-dev.txt | 2 | ||||
-rw-r--r-- | requirements.txt | 2 | ||||
-rw-r--r-- | searx/engines/semantic_scholar.py | 42 | ||||
-rw-r--r-- | searx/engines/seznam.py | 37 | ||||
-rw-r--r-- | searx/engines/wikipedia.py | 2 | ||||
-rw-r--r-- | searx/settings.yml | 11 |
7 files changed, 68 insertions, 31 deletions
diff --git a/Dockerfile b/Dockerfile index f251d06ea..3894aa968 100644 --- a/Dockerfile +++ b/Dockerfile @@ -41,8 +41,6 @@ RUN apk upgrade --no-cache \ openssl-dev \ tar \ git \ - protoc \ - protobuf-dev \ && apk add --no-cache \ ca-certificates \ su-exec \ @@ -55,7 +53,6 @@ RUN apk upgrade --no-cache \ uwsgi \ uwsgi-python3 \ brotli \ - protobuf \ && pip3 install --upgrade pip \ && pip3 install --no-cache -r requirements.txt \ && apk del build-dependencies \ diff --git a/requirements-dev.txt b/requirements-dev.txt index ef948c587..2ed51f067 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,4 +14,4 @@ sphinx-jinja==1.1.1 sphinx-tabs==2.1.0 sphinxcontrib-programoutput==0.16 sphinx-autobuild==2020.9.1 -linuxdoc==20210110 +linuxdoc==20210324 diff --git a/requirements.txt b/requirements.txt index 39eff78ad..bfbcecc51 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,7 @@ flask-babel==2.0.0 flask==1.1.2 idna==2.10 jinja2==2.11.3 -lxml==4.6.2 +lxml==4.6.3 pygments==2.8.0 python-dateutil==2.8.1 pyyaml==5.4.1 diff --git a/searx/engines/semantic_scholar.py b/searx/engines/semantic_scholar.py new file mode 100644 index 000000000..297d0cf71 --- /dev/null +++ b/searx/engines/semantic_scholar.py @@ -0,0 +1,42 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +""" + Semantic Scholar (Science) +""" + +from json import dumps, loads + + +search_url = 'https://www.semanticscholar.org/api/1/search' + + +def request(query, params): + params['url'] = search_url + params['method'] = 'POST' + params['headers']['content-type'] = 'application/json' + params['data'] = dumps({ + "queryString": query, + "page": params['pageno'], + "pageSize": 10, + "sort": "relevance", + "useFallbackRankerService": False, + "useFallbackSearchCluster": False, + "getQuerySuggestions": False, + "authors": [], + "coAuthors": [], + "venues": [], + "performTitleMatch": True, + }) + return params + + +def response(resp): + res = loads(resp.text) + results = [] + for result in res['results']: + results.append({ + 'url': result['primaryPaperLink']['url'], + 'title': result['title']['text'], + 'content': result['paperAbstractTruncated'] + }) + + return results diff --git a/searx/engines/seznam.py b/searx/engines/seznam.py index 1df92a845..faceb0550 100644 --- a/searx/engines/seznam.py +++ b/searx/engines/seznam.py @@ -7,7 +7,12 @@ from urllib.parse import urlencode, urlparse from lxml import html from searx.poolrequests import get from searx.exceptions import SearxEngineAccessDeniedException -from searx.utils import extract_text, eval_xpath_list, eval_xpath_getindex +from searx.utils import ( + extract_text, + eval_xpath_list, + eval_xpath_getindex, + eval_xpath, +) # about about = { @@ -26,7 +31,10 @@ def request(query, params): response_index = get(base_url, headers=params['headers'], raise_for_httperror=True) dom = html.fromstring(response_index.text) - url_params = {'q': query} + url_params = { + 'q': query, + 'oq': query, + } for e in eval_xpath_list(dom, '//input[@type="hidden"]'): name = e.get('name') value = e.get('value') @@ -45,20 +53,15 @@ def response(resp): results = [] dom = html.fromstring(resp.content.decode()) - for result_element in eval_xpath_list(dom, '//div[@id="searchpage-root"]//div[@data-dot="results"]/div'): - dot_data = eval_xpath_getindex(result_element, './div/div[@data-dot-data]/@data-dot-data', 0, default=None) - if dot_data is None: - title_element = eval_xpath_getindex(result_element, './/h3/a', 0) - results.append({ - 'url': title_element.get('href'), - 'title': extract_text(title_element), - 'content': extract_text(eval_xpath_getindex(title_element, '../../div[2]', 0)), - }) - elif dot_data == '{"reporter_name":"hint/related/relates"}': - suggestions_element = eval_xpath_getindex(result_element, - './div/div[@data-dot="main-box"]', 0, default=None) - if suggestions_element is not None: - for suggestion in eval_xpath_list(suggestions_element, './/ul/li'): - results.append({'suggestion': extract_text(suggestion)}) + for result_element in eval_xpath_list(dom, '//div[@data-dot="results"]/div'): + result_data = eval_xpath_getindex(result_element, './/div[contains(@class, "Result")]', 0, default=None) + if result_data is None: + continue + title_element = eval_xpath_getindex(result_element, './/h3/a', 0) + results.append({ + 'url': title_element.get('href'), + 'title': extract_text(title_element), + 'content': extract_text(eval_xpath(result_data, './/p[@class="Result-description"]')), + }) return results diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index da867c81e..3ad8748fb 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -76,7 +76,7 @@ def response(resp): if api_result.get('type') != 'standard': return [] - title = api_result['displaytitle'] + title = api_result['title'] wikipedia_link = api_result['content_urls']['desktop']['page'] results.append({'url': wikipedia_link, 'title': title}) diff --git a/searx/settings.yml b/searx/settings.yml index 1958210d7..84aca86f1 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -918,13 +918,8 @@ engines: # disabled : True - name : semantic scholar - engine : xpath - paging : True - search_url : https://www.semanticscholar.org/search?q={query}&sort=relevance&page={pageno}&ae=false - results_xpath : //article - url_xpath : .//div[@class="search-result-title"]/a/@href - title_xpath : .//div[@class="search-result-title"]/a - content_xpath : .//div[@class="search-result-abstract"] + engine : semantic_scholar + disabled : True shortcut : se categories : science about: @@ -933,7 +928,7 @@ engines: official_api_documentation: https://api.semanticscholar.org/ use_official_api: false require_api_key: false - results: HTML + results: JSON # Spotify needs API credentials # - name : spotify |