summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Bnyro <bnyro@tutanota.com>, 2023-09-11 08:22:32 +0200
committer: Markus Heiser <markus.heiser@darmarIT.de>, 2023-09-11 19:42:31 +0200
commit: f182abd6f8f1eac20d19c3e4b4c9800115f2a705 (patch)
tree: 42b534c408f98fc742603b66b145bee23ca79a72
parent: e73a6f5d14f4f790cbbf318d271895017ee48b94 (diff)
download: searxng-f182abd6f8f1eac20d19c3e4b4c9800115f2a705.tar.gz
download: searxng-f182abd6f8f1eac20d19c3e4b4c9800115f2a705.zip
[mod] library of congress: fix engine
-rw-r--r-- docs/dev/engines/online/loc.rst 13
-rw-r--r-- searx/engines/loc.py 91
2 files changed, 73 insertions, 31 deletions
diff --git a/docs/dev/engines/online/loc.rst b/docs/dev/engines/online/loc.rst
new file mode 100644
index 000000000..2ed76cd81
--- /dev/null
+++ b/docs/dev/engines/online/loc.rst
@@ -0,0 +1,13 @@
+.. _loc engine:
+
+===================
+Library of Congress
+===================
+
+.. contents:: Contents
+ :depth: 2
+ :local:
+ :backlinks: entry
+
+.. automodule:: searx.engines.loc
+ :members:
diff --git a/searx/engines/loc.py b/searx/engines/loc.py
index 0b2f3a689..5f58eb3dc 100644
--- a/searx/engines/loc.py
+++ b/searx/engines/loc.py
@@ -1,67 +1,96 @@
# SPDX-License-Identifier: AGPL-3.0-or-later
-"""
+"""Library of Congress: query Photo, Print and Drawing from API endpoint_
+``photos``.
+
+.. _endpoint: https://www.loc.gov/apis/json-and-yaml/requests/endpoints/
+
+.. note::
- Library of Congress : images from Prints and Photographs Online Catalog
+ Besides the ``photos`` endpoint_ there are more endpoints available; we are
+ looking forward to contributions implementing more endpoints.
"""
-from json import loads
from urllib.parse import urlencode
-
+from searx.network import raise_for_httperror
about = {
"website": 'https://www.loc.gov/pictures/',
"wikidata_id": 'Q131454',
- "official_api_documentation": 'https://www.loc.gov/pictures/api',
+ "official_api_documentation": 'https://www.loc.gov/api',
"use_official_api": True,
"require_api_key": False,
"results": 'JSON',
}
categories = ['images']
-
paging = True
-base_url = 'https://loc.gov/pictures/search/?'
-search_string = "&sp={page}&{query}&fo=json"
-
-IMG_SRC_FIXES = {
- 'https://tile.loc.gov/storage-services/': 'https://tile.loc.gov/storage-services/',
- 'https://loc.gov/pictures/static/images/': 'https://tile.loc.gov/storage-services/',
- 'https://www.loc.gov/pictures/cdn/': 'https://tile.loc.gov/storage-services/',
-}
+endpoint = 'photos'
+base_url = 'https://loc.gov'
+search_string = "/{endpoint}/?sp={page}&{query}&fo=json"
def request(query, params):
- search_path = search_string.format(query=urlencode({'q': query}), page=params['pageno'])
-
+ search_path = search_string.format(
+ endpoint=endpoint,
+ query=urlencode({'q': query}),
+ page=params['pageno'],
+ )
params['url'] = base_url + search_path
-
+ params['raise_for_httperror'] = False
return params
def response(resp):
+
results = []
+ json_data = resp.json()
- json_data = loads(resp.text)
+ json_results = json_data.get('results')
+ if not json_results:
+ # when a search term has no results, loc sends a JSON document in an HTTP 404
+ # response and the HTTP status code is set in the 'status' element.
+ if json_data.get('status') == 404:
+ return results
+
+ raise_for_httperror(resp)
+
+ for result in json_results:
+
+ url = result["item"].get("link")
+ if not url:
+ continue
+
+ img_src = result['item'].get('service_medium')
+ if not img_src or img_src == 'https://memory.loc.gov/pp/grp.gif':
+ continue
+
+ title = result['title']
+ if title.startswith('['):
+ title = title.strip('[]')
+
+ content_items = [
+ result['item'].get('created_published_date'),
+ result['item'].get('summary', [None])[0],
+ result['item'].get('notes', [None])[0],
+ result['item'].get('part_of', [None])[0],
+ ]
+
+ author = None
+ if result['item'].get('creators'):
+ author = result['item']['creators'][0]['title']
- for result in json_data['results']:
- img_src = result['image']['full']
- for url_prefix, url_replace in IMG_SRC_FIXES.items():
- if img_src.startswith(url_prefix):
- img_src = img_src.replace(url_prefix, url_replace)
- break
- else:
- img_src = result['image']['thumb']
results.append(
{
- 'url': result['links']['item'],
- 'title': result['title'],
- 'img_src': img_src,
- 'thumbnail_src': result['image']['thumb'],
- 'author': result['creator'],
'template': 'images.html',
+ 'url': url,
+ 'title': title,
+ 'content': ' / '.join([i for i in content_items if i]),
+ 'img_src': img_src,
+ 'thumbnail_src': result['item'].get('thumb_gallery'),
+ 'author': author,
}
)