diff options
author | Dalf <alex@al-f.net> | 2020-08-06 17:42:46 +0200 |
---|---|---|
committer | Alexandre Flament <alex@al-f.net> | 2020-09-10 10:39:04 +0200 |
commit | 1022228d950c2a809ed613df1a515d9a6cafda7c (patch) | |
tree | d792dddea1a5b278b018ed4e024cd13340d5c1b1 /searx/utils.py | |
parent | 272158944bf13503e2597018fc60a00baddec660 (diff) | |
download | searxng-1022228d950c2a809ed613df1a515d9a6cafda7c.tar.gz searxng-1022228d950c2a809ed613df1a515d9a6cafda7c.zip |
Drop Python 2 (1/n): remove unicode string and url_utils
Diffstat (limited to 'searx/utils.py')
-rw-r--r-- | searx/utils.py | 77 |
1 file changed, 25 insertions, 52 deletions
diff --git a/searx/utils.py b/searx/utils.py index 5ea9dc89c..f87ea177a 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -1,21 +1,22 @@ # -*- coding: utf-8 -*- +import os +import sys import csv import hashlib import hmac -import os import re +import json -from babel.core import get_global -from babel.dates import format_date from codecs import getincrementalencoder from imp import load_source from numbers import Number from os.path import splitext, join -from io import open +from io import open, StringIO from random import choice +from html.parser import HTMLParser from lxml.etree import XPath -import sys -import json +from babel.core import get_global +from babel.dates import format_date from searx import settings from searx.version import VERSION_STRING @@ -23,23 +24,6 @@ from searx.languages import language_codes from searx import settings from searx import logger -try: - from cStringIO import StringIO -except: - from io import StringIO - -try: - from HTMLParser import HTMLParser -except: - from html.parser import HTMLParser - -if sys.version_info[0] == 3: - unichr = chr - unicode = str - IS_PY2 = False - basestring = str -else: - IS_PY2 = True logger = logger.getChild('utils') @@ -75,19 +59,19 @@ def highlight_content(content, query): if content.find('<') != -1: return content - query = query.decode('utf-8') + query = query.decode() if content.lower().find(query.lower()) > -1: - query_regex = u'({0})'.format(re.escape(query)) + query_regex = '({0})'.format(re.escape(query)) content = re.sub(query_regex, '<span class="highlight">\\1</span>', content, flags=re.I | re.U) else: regex_parts = [] for chunk in query.split(): if len(chunk) == 1: - regex_parts.append(u'\\W+{0}\\W+'.format(re.escape(chunk))) + regex_parts.append('\\W+{0}\\W+'.format(re.escape(chunk))) else: - regex_parts.append(u'{0}'.format(re.escape(chunk))) - query_regex = u'({0})'.format('|'.join(regex_parts)) + regex_parts.append('{0}'.format(re.escape(chunk))) + query_regex = 
'({0})'.format('|'.join(regex_parts)) content = re.sub(query_regex, '<span class="highlight">\\1</span>', content, flags=re.I | re.U) @@ -124,21 +108,21 @@ class HTMLTextExtractor(HTMLParser): def handle_charref(self, number): if not self.is_valid_tag(): return - if number[0] in (u'x', u'X'): + if number[0] in ('x', 'X'): codepoint = int(number[1:], 16) else: codepoint = int(number) - self.result.append(unichr(codepoint)) + self.result.append(chr(codepoint)) def handle_entityref(self, name): if not self.is_valid_tag(): return # codepoint = htmlentitydefs.name2codepoint[name] - # self.result.append(unichr(codepoint)) + # self.result.append(chr(codepoint)) self.result.append(name) def get_text(self): - return u''.join(self.result).strip() + return ''.join(self.result).strip() def html_to_text(html): @@ -163,22 +147,14 @@ class UnicodeWriter: self.encoder = getincrementalencoder(encoding)() def writerow(self, row): - if IS_PY2: - row = [s.encode("utf-8") if hasattr(s, 'encode') else s for s in row] self.writer.writerow(row) # Fetch UTF-8 output from the queue ... data = self.queue.getvalue() - if IS_PY2: - data = data.decode("utf-8") - else: - data = data.strip('\x00') + data = data.strip('\x00') # ... 
and reencode it into the target encoding data = self.encoder.encode(data) # write to the target stream - if IS_PY2: - self.stream.write(data) - else: - self.stream.write(data.decode("utf-8")) + self.stream.write(data.decode()) # empty queue self.queue.truncate(0) @@ -253,7 +229,7 @@ def dict_subset(d, properties): def prettify_url(url, max_length=74): if len(url) > max_length: chunk_len = int(max_length / 2 + 1) - return u'{0}[...]{1}'.format(url[:chunk_len], url[-chunk_len:]) + return '{0}[...]{1}'.format(url[:chunk_len], url[-chunk_len:]) else: return url @@ -310,7 +286,7 @@ def int_or_zero(num): def is_valid_lang(lang): is_abbr = (len(lang) == 2) - lang = lang.lower().decode('utf-8') + lang = lang.lower().decode() if is_abbr: for l in language_codes: if l[0][:2] == lang: @@ -407,17 +383,14 @@ def new_hmac(secret_key, url): secret_key_bytes = secret_key else: raise err - if sys.version_info[0] == 2: - return hmac.new(bytes(secret_key), url, hashlib.sha256).hexdigest() - else: - return hmac.new(secret_key_bytes, url, hashlib.sha256).hexdigest() + return hmac.new(secret_key_bytes, url, hashlib.sha256).hexdigest() def to_string(obj): - if isinstance(obj, basestring): + if isinstance(obj, str): return obj if isinstance(obj, Number): - return unicode(obj) + return str(obj) if hasattr(obj, '__str__'): return obj.__str__() if hasattr(obj, '__repr__'): @@ -433,9 +406,9 @@ def ecma_unescape(s): """ # s = unicode(s) # "%u5409" becomes "吉" - s = ecma_unescape4_re.sub(lambda e: unichr(int(e.group(1), 16)), s) + s = ecma_unescape4_re.sub(lambda e: chr(int(e.group(1), 16)), s) # "%20" becomes " ", "%F3" becomes "ó" - s = ecma_unescape2_re.sub(lambda e: unichr(int(e.group(1), 16)), s) + s = ecma_unescape2_re.sub(lambda e: chr(int(e.group(1), 16)), s) return s |