From 7adb9090e5dbc25b0d120772beca01dc4eb0791e Mon Sep 17 00:00:00 2001 From: Paolo Basso <12545838+paolobasso99@users.noreply.github.com> Date: Sun, 25 Jun 2023 17:24:28 +0200 Subject: [mod] engine: Anna's Archive - add language support --- searx/engines/annas-archive.py | 63 -------------------------------------- searx/engines/annas_archive.py | 68 ++++++++++++++++++++++++++++++++++++++++++ searx/settings.yml | 4 +-- 3 files changed, 70 insertions(+), 65 deletions(-) delete mode 100644 searx/engines/annas-archive.py create mode 100644 searx/engines/annas_archive.py diff --git a/searx/engines/annas-archive.py b/searx/engines/annas-archive.py deleted file mode 100644 index 56d1ca77a..000000000 --- a/searx/engines/annas-archive.py +++ /dev/null @@ -1,63 +0,0 @@ -# SPDX-License-Identifier: AGPL-3.0-or-later -# lint: pylint -"""Anna's Archive - -""" -from typing import List, Dict, Any, Optional -from urllib.parse import quote -from lxml import html - -from searx.utils import extract_text, eval_xpath - -# about -about: Dict[str, Any] = { - "website": "https://annas-archive.org/", - "wikidata_id": "Q115288326", - "official_api_documentation": None, - "use_official_api": False, - "require_api_key": False, - "results": "HTML", -} - -# engine dependent config -categories: List[str] = ["files"] -paging: bool = False - -# search-url -base_url: str = "https://annas-archive.org" - -# xpath queries -xpath_results: str = '//main//a[starts-with(@href,"/md5")]' -xpath_url: str = ".//@href" -xpath_title: str = ".//h3/text()[1]" -xpath_authors: str = './/div[contains(@class, "italic")]' -xpath_publisher: str = './/div[contains(@class, "text-sm")]' -xpath_file_info: str = './/div[contains(@class, "text-xs")]' - - -def request(query, params: Dict[str, Any]) -> Dict[str, Any]: - search_url: str = base_url + "/search?q={search_query}" - params["url"] = search_url.format(search_query=quote(query)) - return params - - -def response(resp) -> List[Dict[str, Optional[str]]]: - results: 
List[Dict[str, Optional[str]]] = [] - dom = html.fromstring(resp.text) - - for item in dom.xpath(xpath_results): - result: Dict[str, Optional[str]] = {} - - result["url"] = base_url + item.xpath(xpath_url)[0] - - result["title"] = extract_text(eval_xpath(item, xpath_title)) - - result["content"] = "{publisher}. {authors}. {file_info}".format( - authors=extract_text(eval_xpath(item, xpath_authors)), - publisher=extract_text(eval_xpath(item, xpath_publisher)), - file_info=extract_text(eval_xpath(item, xpath_file_info)), - ) - - results.append(result) - - return results diff --git a/searx/engines/annas_archive.py b/searx/engines/annas_archive.py new file mode 100644 index 000000000..1d5aa41ee --- /dev/null +++ b/searx/engines/annas_archive.py @@ -0,0 +1,68 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# lint: pylint +"""Anna's Archive + +""" +from typing import List, Dict, Any, Optional +from urllib.parse import quote +from lxml import html + +from searx.utils import extract_text, eval_xpath + +# about +about: Dict[str, Any] = { + "website": "https://annas-archive.org/", + "wikidata_id": "Q115288326", + "official_api_documentation": None, + "use_official_api": False, + "require_api_key": False, + "results": "HTML", +} + +# engine dependent config +categories: List[str] = ["files"] +paging: bool = False + +# search-url +base_url: str = "https://annas-archive.org" + +# xpath queries +xpath_results: str = '//main//a[starts-with(@href,"/md5")]' +xpath_url: str = ".//@href" +xpath_title: str = ".//h3/text()[1]" +xpath_authors: str = './/div[contains(@class, "italic")]' +xpath_publisher: str = './/div[contains(@class, "text-sm")]' +xpath_file_info: str = './/div[contains(@class, "text-xs")]' + + +def request(query, params: Dict[str, Any]) -> Dict[str, Any]: + search_url: str = base_url + "/search?q={search_query}&lang={lang}" + lang: str = "" + if params["language"] != "all": + lang = params["language"] + + params["url"] = search_url.format(search_query=quote(query), 
lang=lang) + + return params + + +def response(resp) -> List[Dict[str, Optional[str]]]: + results: List[Dict[str, Optional[str]]] = [] + dom = html.fromstring(resp.text) + + for item in dom.xpath(xpath_results): + result: Dict[str, Optional[str]] = {} + + result["url"] = base_url + item.xpath(xpath_url)[0] + + result["title"] = extract_text(eval_xpath(item, xpath_title)) + + result["content"] = "{publisher}. {authors}. {file_info}".format( + authors=extract_text(eval_xpath(item, xpath_authors)), + publisher=extract_text(eval_xpath(item, xpath_publisher)), + file_info=extract_text(eval_xpath(item, xpath_file_info)), + ) + + results.append(result) + + return results diff --git a/searx/settings.yml b/searx/settings.yml index 561ec41a9..8877fba54 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -298,8 +298,8 @@ engines: disabled: true - name: anna's archive - engine: annas-archive - paging: False + engine: annas_archive + paging: false categories: files disabled: true shortcut: aa -- cgit v1.2.3-54-g00ecf