summary | refs | log | tree | commit | diff
path: root/searx
diff options
context:
space:
mode:
author: Alexandre Flament <alex@al-f.net> 2019-08-02 13:37:13 +0200
committer: GitHub <noreply@github.com> 2019-08-02 13:37:13 +0200
commit: 2179079a9173b33b81e1084fc1e8e181c19ef8e9 (patch)
tree: 081289e4baa95e731c5a849b22d663b0a7788f56 /searx
parent: 4dc792e1e2d3771b6d6620f5d564ea091597c4dc (diff)
download: searxng-2179079a9173b33b81e1084fc1e8e181c19ef8e9.tar.gz
searxng-2179079a9173b33b81e1084fc1e8e181c19ef8e9.zip
[fix] fix flickr_noapi decoding (#1655)
Non-ASCII characters were decoded incorrectly. Add a helper function, searx.utils.ecma_unescape (a Python implementation of the JavaScript unescape function).
Diffstat (limited to 'searx')
-rw-r--r-- searx/engines/flickr_noapi.py 12
-rw-r--r-- searx/utils.py 19
2 files changed, 25 insertions, 6 deletions
diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py
index eeee413ec..198ac2cff 100644
--- a/searx/engines/flickr_noapi.py
+++ b/searx/engines/flickr_noapi.py
@@ -16,7 +16,8 @@ from json import loads
from time import time
import re
from searx.engines import logger
-from searx.url_utils import urlencode, unquote
+from searx.url_utils import urlencode
+from searx.utils import ecma_unescape, html_to_text
logger = logger.getChild('flickr-noapi')
@@ -75,11 +76,10 @@ def response(resp):
for index in legend:
photo = model_export['main'][index[0]][int(index[1])][index[2]][index[3]][int(index[4])]
- author = unquote(photo.get('realname', ''))
- source = unquote(photo.get('username', '')) + ' @ Flickr'
- title = unquote(photo.get('title', ''))
- content = unquote(photo.get('description', ''))
-
+ author = ecma_unescape(photo.get('realname', ''))
+ source = ecma_unescape(photo.get('username', '')) + ' @ Flickr'
+ title = ecma_unescape(photo.get('title', ''))
+ content = html_to_text(ecma_unescape(photo.get('description', '')))
img_src = None
# From the biggest to the lowest format
for image_size in image_sizes:
diff --git a/searx/utils.py b/searx/utils.py
index b7e914557..d88bc9897 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -1,3 +1,4 @@
+# -*- coding: utf-8 -*-
import csv
import hashlib
import hmac
@@ -44,6 +45,9 @@ logger = logger.getChild('utils')
blocked_tags = ('script',
'style')
+ecma_unescape4_re = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
+ecma_unescape2_re = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
+
useragents = json.loads(open(os.path.dirname(os.path.realpath(__file__))
+ "/data/useragents.json", 'r', encoding='utf-8').read())
@@ -415,3 +419,18 @@ def to_string(obj):
return obj.__str__()
if hasattr(obj, '__repr__'):
return obj.__repr__()
+
+
+def ecma_unescape(s):
+ """
+ python implementation of the unescape javascript function
+
+ https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string
+ https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape
+ """
+ # s = unicode(s)
+ # "%u5409" becomes "吉"
+ s = ecma_unescape4_re.sub(lambda e: unichr(int(e.group(1), 16)), s)
+ # "%20" becomes " ", "%F3" becomes "ó"
+ s = ecma_unescape2_re.sub(lambda e: unichr(int(e.group(1), 16)), s)
+ return s