searx/utils.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259

# import htmlentitydefs
import locale
import dateutil.parser
import cStringIO
import csv
import os
import re

from codecs import getincrementalencoder
from HTMLParser import HTMLParser
from random import choice

from searx.version import VERSION_STRING
from searx import settings
from searx import logger


logger = logger.getChild('utils')

ua_versions = ('33.0',
               '34.0',
               '35.0',
               '36.0',
               '37.0')

ua_os = ('Windows NT 6.3; WOW64',
         'X11; Linux x86_64',
         'X11; Linux x86')
ua = "Mozilla/5.0 ({os}; rv:{version}) Gecko/20100101 Firefox/{version}"

blocked_tags = ('script',
                'style')


def gen_useragent():
    # TODO
    return ua.format(os=choice(ua_os), version=choice(ua_versions))


def searx_useragent():
    return 'searx/{searx_version} {suffix}'.format(
           searx_version=VERSION_STRING,
           suffix=settings['server'].get('useragent_suffix', ''))


def highlight_content(content, query):

    if not content:
        return None
    # ignoring html contents
    # TODO better html content detection
    if content.find('<') != -1:
        return content

    query = query.decode('utf-8')
    if content.lower().find(query.lower()) > -1:
        query_regex = u'({0})'.format(re.escape(query))
        content = re.sub(query_regex, '<span class="highlight">\\1</span>',
                         content, flags=re.I | re.U)
    else:
        regex_parts = []
        for chunk in query.split():
            if len(chunk) == 1:
                regex_parts.append(u'\W+{0}\W+'.format(re.escape(chunk)))
            else:
                regex_parts.append(u'{0}'.format(re.escape(chunk)))
        query_regex = u'({0})'.format('|'.join(regex_parts))
        content = re.sub(query_regex, '<span class="highlight">\\1</span>',
                         content, flags=re.I | re.U)

    return content


class HTMLTextExtractor(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.result = []
        self.tags = []

    def handle_starttag(self, tag, attrs):
        self.tags.append(tag)

    def handle_endtag(self, tag):
        if not self.tags:
            return

        if tag != self.tags[-1]:
            raise Exception("invalid html")

        self.tags.pop()

    def is_valid_tag(self):
        return not self.tags or self.tags[-1] not in blocked_tags

    def handle_data(self, d):
        if not self.is_valid_tag():
            return
        self.result.append(d)

    def handle_charref(self, number):
        if not self.is_valid_tag():
            return
        if number[0] in (u'x', u'X'):
            codepoint = int(number[1:], 16)
        else:
            codepoint = int(number)
        self.result.append(unichr(codepoint))

    def handle_entityref(self, name):
        if not self.is_valid_tag():
            return
        # codepoint = htmlentitydefs.name2codepoint[name]
        # self.result.append(unichr(codepoint))
        self.result.append(name)

    def get_text(self):
        return u''.join(self.result).strip()


def html_to_text(html):
    html = html.replace('\n', ' ')
    html = ' '.join(html.split())
    s = HTMLTextExtractor()
    s.feed(html)
    return s.get_text()


class UnicodeWriter:
    """
    A CSV writer which will write rows to CSV file "f",
    which is encoded in the given encoding.
    """

    def __init__(self, f, dialect=csv.excel, encoding="utf-8", **kwds):
        # Redirect output to a queue
        self.queue = cStringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoder = getincrementalencoder(encoding)()

    def writerow(self, row):
        unicode_row = []
        for col in row:
            if type(col) == str or type(col) == unicode:
                unicode_row.append(col.encode('utf-8').strip())
            else:
                unicode_row.append(col)
        self.writer.writerow(unicode_row)
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = self.encoder.encode(data)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)


def get_themes(root):
    """Returns available themes list."""

    static_path = os.path.join(root, 'static')
    templates_path = os.path.join(root, 'templates')

    themes = os.listdir(os.path.join(static_path, 'themes'))
    return static_path, templates_path, themes


def get_static_files(base_path):
    base_path = os.path.join(base_path, 'static')
    static_files = set()
    base_path_length = len(base_path) + 1
    for directory, _, files in os.walk(base_path):
        for filename in files:
            f = os.path.join(directory[base_path_length:], filename)
            static_files.add(f)
    return static_files


def get_result_templates(base_path):
    base_path = os.path.join(base_path, 'templates')
    result_templates = set()
    base_path_length = len(base_path) + 1
    for directory, _, files in os.walk(base_path):
        if directory.endswith('result_templates'):
            for filename in files:
                f = os.path.join(directory[base_path_length:], filename)
                result_templates.add(f)
    return result_templates


def format_date_by_locale(date_string, locale_string):
    # strftime works only on dates after 1900
    parsed_date = dateutil.parser.parse(date_string)
    if parsed_date.year <= 1900:
        return parsed_date.isoformat().split('T')[0]

    orig_locale = locale.getlocale()[0]
    try:
        locale.setlocale(locale.LC_ALL, locale_string)
    except:
        logger.warning('cannot set locale: {0}'.format(locale_string))
    formatted_date = parsed_date.strftime(locale.nl_langinfo(locale.D_FMT))
    try:
        locale.setlocale(locale.LC_ALL, orig_locale)
    except:
        logger.warning('cannot set original locale: {0}'.format(orig_locale))
    return formatted_date


def dict_subset(d, properties):
    result = {}
    for k in properties:
        if k in d:
            result[k] = d[k]
    return result


def prettify_url(url):
    if len(url) > 74:
        return u'{0}[...]{1}'.format(url[:35], url[-35:])
    else:
        return url


# get element in list or default value
def list_get(a_list, index, default=None):
    if len(a_list) > index:
        return a_list[index]
    else:
        return default


def get_blocked_engines(engines, cookies):
    if 'blocked_engines' not in cookies:
        return [(engine_name, category) for engine_name in engines
                for category in engines[engine_name].categories if engines[engine_name].disabled]

    blocked_engine_strings = cookies.get('blocked_engines', '').split(',')
    blocked_engines = []

    if not blocked_engine_strings:
        return blocked_engines

    for engine_string in blocked_engine_strings:
        if engine_string.find('__') > -1:
            engine, category = engine_string.split('__', 1)
            if engine in engines and category in engines[engine].categories:
                blocked_engines.append((engine, category))
        elif engine_string in engines:
            for category in engines[engine_string].categories:
                blocked_engines.append((engine_string, category))

    return blocked_engines