diff options
author | Alexandre Flament <alex@al-f.net> | 2019-08-02 13:37:13 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2019-08-02 13:37:13 +0200 |
commit | 2179079a9173b33b81e1084fc1e8e181c19ef8e9 (patch) | |
tree | 081289e4baa95e731c5a849b22d663b0a7788f56 /searx/utils.py | |
parent | 4dc792e1e2d3771b6d6620f5d564ea091597c4dc (diff) | |
download | searxng-2179079a9173b33b81e1084fc1e8e181c19ef8e9.tar.gz searxng-2179079a9173b33b81e1084fc1e8e181c19ef8e9.zip |
[fix] fix flickr_noapi decoding (#1655)
Characters that were not ASCII were incorrectly decoded.
Add a helper function: searx.utils.ecma_unescape (a Python implementation of the JavaScript unescape function).
Diffstat (limited to 'searx/utils.py')
-rw-r--r-- | searx/utils.py | 19 |
1 files changed, 19 insertions, 0 deletions
diff --git a/searx/utils.py b/searx/utils.py index b7e914557..d88bc9897 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -1,3 +1,4 @@ +# -*- coding: utf-8 -*- import csv import hashlib import hmac @@ -44,6 +45,9 @@ logger = logger.getChild('utils') blocked_tags = ('script', 'style') +ecma_unescape4_re = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE) +ecma_unescape2_re = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE) + useragents = json.loads(open(os.path.dirname(os.path.realpath(__file__)) + "/data/useragents.json", 'r', encoding='utf-8').read()) @@ -415,3 +419,18 @@ def to_string(obj): return obj.__str__() if hasattr(obj, '__repr__'): return obj.__repr__() + + +def ecma_unescape(s): + """ + python implementation of the unescape javascript function + + https://www.ecma-international.org/ecma-262/6.0/#sec-unescape-string + https://developer.mozilla.org/fr/docs/Web/JavaScript/Reference/Objets_globaux/unescape + """ + # s = unicode(s) + # "%u5409" becomes "吉" + s = ecma_unescape4_re.sub(lambda e: unichr(int(e.group(1), 16)), s) + # "%20" becomes " ", "%F3" becomes "ó" + s = ecma_unescape2_re.sub(lambda e: unichr(int(e.group(1), 16)), s) + return s |