summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMarkus Heiser <markus.heiser@darmarit.de>2023-06-28 09:16:49 +0200
committerMarkus Heiser <markus.heiser@darmarit.de>2023-06-29 09:32:57 +0200
commit87e7926ae96bc394427859c3688037c0d1710230 (patch)
treee9deb45c93d10670c451809948cc7efc915a59c3
parente2df6b77a3985f1c4b4dc0372332209076abf308 (diff)
downloadsearxng-87e7926ae96bc394427859c3688037c0d1710230.tar.gz
searxng-87e7926ae96bc394427859c3688037c0d1710230.zip
[fix] engine: Anna's Archive - grep results from '.js-scroll-hidden' elements
The rendering of the WEB page is very strange; except for the first position, all other positions of Anna's result page are enclosed in SGML comments. These comments are *uncommented* by some JS code, see the query of the class '.js-scroll-hidden' in Anna's HTML template [1]. [1] https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
-rw-r--r--searx/engines/annas_archive.py40
1 files changed, 21 insertions, 19 deletions
diff --git a/searx/engines/annas_archive.py b/searx/engines/annas_archive.py
index cebc8d45c..db9bd1719 100644
--- a/searx/engines/annas_archive.py
+++ b/searx/engines/annas_archive.py
@@ -97,14 +97,6 @@ aa_ext: str = ''
"""
-# xpath queries
-xpath_results: str = '//main//a[starts-with(@href,"/md5")]'
-xpath_url: str = ".//@href"
-xpath_title: str = ".//h3/text()[1]"
-xpath_authors: str = './/div[contains(@class, "italic")]'
-xpath_publisher: str = './/div[contains(@class, "text-sm")]'
-xpath_file_info: str = './/div[contains(@class, "text-xs")]'
-
def init(engine_settings=None): # pylint: disable=unused-argument
"""Check of engine's settings."""
@@ -131,22 +123,32 @@ def response(resp) -> List[Dict[str, Optional[str]]]:
results: List[Dict[str, Optional[str]]] = []
dom = html.fromstring(resp.text)
- for item in dom.xpath(xpath_results):
- result: Dict[str, Optional[str]] = {}
+ for item in eval_xpath_list(dom, '//main//div[contains(@class, "h-[125]")]/a'):
+ results.append(_get_result(item))
- result["url"] = base_url + item.xpath(xpath_url)[0]
+ # The rendering of the WEB page is very strange; except the first position
+ # all other positions of Anna's result page are enclosed in SGML comments.
+ # These comments are *uncommented* by some JS code, see query of class
+ # '.js-scroll-hidden' in Anna's HTML template:
+ # https://annas-software.org/AnnaArchivist/annas-archive/-/blob/main/allthethings/templates/macros/md5_list.html
- result["title"] = extract_text(eval_xpath(item, xpath_title))
+ for item in eval_xpath_list(dom, '//main//div[contains(@class, "js-scroll-hidden")]'):
+ item = html.fromstring(item.xpath('./comment()')[0].text)
+ results.append(_get_result(item))
- result["content"] = "{publisher}. {authors}. {file_info}".format(
- authors=extract_text(eval_xpath(item, xpath_authors)),
- publisher=extract_text(eval_xpath(item, xpath_publisher)),
- file_info=extract_text(eval_xpath(item, xpath_file_info)),
- )
+ return results
- results.append(result)
- return results
+def _get_result(item):
+ return {
+ 'template': 'paper.html',
+ 'url': base_url + item.xpath('./@href')[0],
+ 'title': extract_text(eval_xpath(item, './/h3/text()[1]')),
+ 'publisher': extract_text(eval_xpath(item, './/div[contains(@class, "text-sm")]')),
+ 'authors': [extract_text(eval_xpath(item, './/div[contains(@class, "italic")]'))],
+ 'content': extract_text(eval_xpath(item, './/div[contains(@class, "text-xs")]')),
+ 'img_src': item.xpath('.//img/@src')[0],
+ }
def fetch_traits(engine_traits: EngineTraits):