summary | refs | log | tree | commit | diff
path: root/searx
diff options
context:
space:
mode:
author: Markus Heiser <markus.heiser@darmarit.de> 2022-07-05 22:02:29 +0200
committer: Markus Heiser <markus.heiser@darmarit.de> 2022-07-23 16:00:58 +0200
commit: 1540891561d9b24c960c239981f35eaf380879c4 (patch)
tree: 72d61559f813dd1ec77e06f747c9f70f2255f0c8 /searx
parent: 50d714d82952417f3db837a176b751d3845c0cc8 (diff)
download: searxng-1540891561d9b24c960c239981f35eaf380879c4.tar.gz
download: searxng-1540891561d9b24c960c239981f35eaf380879c4.zip
[fix] engine tineye: handle 422 response of not supported img format
Closes: https://github.com/searxng/searxng/issues/1449
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx')
-rw-r--r-- searx/engines/tineye.py 184
1 files changed, 153 insertions, 31 deletions
diff --git a/searx/engines/tineye.py b/searx/engines/tineye.py
index fe5b60393..6c5ff134c 100644
--- a/searx/engines/tineye.py
+++ b/searx/engines/tineye.py
@@ -17,6 +17,7 @@ billion images `[tineye.com] <https://tineye.com/how>`_.
from urllib.parse import urlencode
from datetime import datetime
+from flask_babel import gettext
about = {
"website": 'https://tineye.com',
@@ -28,20 +29,41 @@ about = {
}
engine_type = 'online_url_search'
+""":py:obj:`searx.search.processors.online_url_search`"""
+
categories = ['general']
paging = True
safesearch = False
base_url = 'https://tineye.com'
search_string = '/result_json/?page={page}&{query}'
+FORMAT_NOT_SUPPORTED = gettext(
+ "Could not read that image url. This may be due to an unsupported file"
+ " format. TinEye only supports images that are JPEG, PNG, GIF, BMP, TIFF or WebP."
+)
+"""TinEye error message"""
+
+NO_SIGNATURE_ERROR = gettext(
+ "The image is too simple to find matches. TinEye requires a basic level of"
+ " visual detail to successfully identify matches."
+)
+"""TinEye error message"""
+
+DOWNLOAD_ERROR = gettext("The image could not be downloaded.")
+"""TinEye error message"""
+
def request(query, params):
+ """Build TinEye HTTP request using ``search_urls`` of a :py:obj:`engine_type`."""
+
+ params['raise_for_httperror'] = False
if params['search_urls']['data:image']:
query = params['search_urls']['data:image']
elif params['search_urls']['http']:
query = params['search_urls']['http']
+ logger.debug("query URL: %s", query)
query = urlencode({'url': query})
# see https://github.com/TinEye/pytineye/blob/main/pytineye/api.py
@@ -59,45 +81,145 @@ def request(query, params):
return params
+def parse_tineye_match(match_json):
+ """Takes parsed JSON from the API server and turns it into a :py:obj:`dict`
+ object.
+
+ Attributes `(class Match) <https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__
+
+ - `image_url`, link to the result image.
+ - `domain`, domain this result was found on.
+ - `score`, a number (0 to 100) that indicates how closely the images match.
+ - `width`, image width in pixels.
+ - `height`, image height in pixels.
+ - `size`, image area in pixels.
+ - `format`, image format.
+ - `filesize`, image size in bytes.
+ - `overlay`, overlay URL.
+ - `tags`, whether this match belongs to a collection or stock domain.
+
+ - `backlinks`, a list of Backlink objects pointing to the original websites
+ and image URLs. List items are instances of :py:obj:`dict`, (`Backlink
+ <https://github.com/TinEye/pytineye/blob/main/pytineye/api.py>`__):
+
+ - `url`, the image URL to the image.
+ - `backlink`, the original website URL.
+ - `crawl_date`, the date the image was crawled.
+
+ """
+
+ # HINT: there exists an alternative backlink dict in the domains list / e.g.::
+ #
+ # match_json['domains'][0]['backlinks']
+
+ backlinks = []
+ if "backlinks" in match_json:
+
+ for backlink_json in match_json["backlinks"]:
+ if not isinstance(backlink_json, dict):
+ continue
+
+ crawl_date = backlink_json.get("crawl_date")
+ if crawl_date:
+ crawl_date = datetime.fromisoformat(crawl_date[:-3])
+ else:
+ crawl_date = datetime.min
+
+ backlinks.append(
+ {
+ 'url': backlink_json.get("url"),
+ 'backlink': backlink_json.get("backlink"),
+ 'crawl_date': crawl_date,
+ 'image_name': backlink_json.get("image_name"),
+ }
+ )
+
+ return {
+ 'image_url': match_json.get("image_url"),
+ 'domain': match_json.get("domain"),
+ 'score': match_json.get("score"),
+ 'width': match_json.get("width"),
+ 'height': match_json.get("height"),
+ 'size': match_json.get("size"),
+ 'image_format': match_json.get("format"),
+ 'filesize': match_json.get("filesize"),
+ 'overlay': match_json.get("overlay"),
+ 'tags': match_json.get("tags"),
+ 'backlinks': backlinks,
+ }
+
+
def response(resp):
+ """Parse HTTP response from TinEye."""
results = []
- # Define wanted results
- json_data = resp.json()
- number_of_results = json_data['num_matches']
-
- for i in json_data['matches']:
- image_format = i['format']
- width = i['width']
- height = i['height']
- thumbnail_src = i['image_url']
- backlink = i['domains'][0]['backlinks'][0]
- url = backlink['backlink']
- source = backlink['url']
- title = backlink['image_name']
- img_src = backlink['url']
-
- # Get and convert published date
- api_date = backlink['crawl_date'][:-3]
- publishedDate = datetime.fromisoformat(api_date)
-
- # Append results
+ try:
+ json_data = resp.json()
+ except Exception as exc: # pylint: disable=broad-except
+ msg = "can't parse JSON response // %s" % exc
+ logger.error(msg)
+ json_data = {'error': msg}
+
+ # handle error codes from Tineye
+
+ if resp.is_error:
+ if resp.status_code in (400, 422):
+
+ message = 'HTTP status: %s' % resp.status_code
+ error = json_data.get('error')
+ s_key = json_data.get('suggestions', {}).get('key', '')
+
+ if error and s_key:
+ message = "%s (%s)" % (error, s_key)
+ elif error:
+ message = error
+
+ if s_key == "Invalid image URL":
+ # test https://docs.searxng.org/_static/searxng-wordmark.svg
+ message = FORMAT_NOT_SUPPORTED
+ elif s_key == 'NO_SIGNATURE_ERROR':
+ # test https://pngimg.com/uploads/dot/dot_PNG4.png
+ message = NO_SIGNATURE_ERROR
+ elif s_key == 'Download Error':
+ # test https://notexists
+ message = DOWNLOAD_ERROR
+
+ # see https://github.com/searxng/searxng/pull/1456#issuecomment-1193105023
+ # results.append({'answer': message})
+ logger.error(message)
+
+ return results
+
+ resp.raise_for_status()
+
+ # append results from matches
+
+ for match_json in json_data['matches']:
+
+ tineye_match = parse_tineye_match(match_json)
+ if not tineye_match['backlinks']:
+ continue
+
+ backlink = tineye_match['backlinks'][0]
results.append(
{
'template': 'images.html',
- 'url': url,
- 'thumbnail_src': thumbnail_src,
- 'source': source,
- 'title': title,
- 'img_src': img_src,
- 'format': image_format,
- 'widht': width,
- 'height': height,
- 'publishedDate': publishedDate,
+ 'url': backlink['backlink'],
+ 'thumbnail_src': tineye_match['image_url'],
+ 'source': backlink['url'],
+ 'title': backlink['image_name'],
+ 'img_src': backlink['url'],
+ 'format': tineye_match['image_format'],
+ 'widht': tineye_match['width'],
+ 'height': tineye_match['height'],
+ 'publishedDate': backlink['crawl_date'],
}
)
- # Append number of results
- results.append({'number_of_results': number_of_results})
+ # append number of results
+
+ number_of_results = json_data.get('num_matches')
+ if number_of_results:
+ results.append({'number_of_results': number_of_results})
return results