summaryrefslogtreecommitdiff
path: root/searx/results.py
blob: d3f2ecd0d7dab1544c67da6613e80392b85395be (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
import re
import sys
from collections import defaultdict
from operator import itemgetter
from threading import RLock
from searx.engines import engines
from searx.url_utils import urlparse, unquote

if sys.version_info[0] == 3:
    basestring = str

CONTENT_LEN_IGNORED_CHARS_REGEX = re.compile(r'[,;:!?\./\\\\ ()-_]', re.M | re.U)
WHITESPACE_REGEX = re.compile('( |\t|\n)+', re.M | re.U)


# return the meaningful length of the content for a result
def result_content_len(content):
    if isinstance(content, basestring):
        return len(CONTENT_LEN_IGNORED_CHARS_REGEX.sub('', content))
    else:
        return 0


def compare_urls(url_a, url_b):
    # ignore www. in comparison
    if url_a.netloc.startswith('www.'):
        host_a = url_a.netloc.replace('www.', '', 1)
    else:
        host_a = url_a.netloc
    if url_b.netloc.startswith('www.'):
        host_b = url_b.netloc.replace('www.', '', 1)
    else:
        host_b = url_b.netloc

    if host_a != host_b or url_a.query != url_b.query or url_a.fragment != url_b.fragment:
        return False

    # remove / from the end of the url if required
    path_a = url_a.path[:-1]\
        if url_a.path.endswith('/')\
        else url_a.path
    path_b = url_b.path[:-1]\
        if url_b.path.endswith('/')\
        else url_b.path

    return unquote(path_a) == unquote(path_b)


def merge_two_infoboxes(infobox1, infobox2):
    # get engines weights
    if hasattr(engines[infobox1['engine']], 'weight'):
        weight1 = engines[infobox1['engine']].weight
    else:
        weight1 = 1
    if hasattr(engines[infobox2['engine']], 'weight'):
        weight2 = engines[infobox2['engine']].weight
    else:
        weight2 = 1

    if weight2 > weight1:
        infobox1['engine'] = infobox2['engine']

    if 'urls' in infobox2:
        urls1 = infobox1.get('urls', None)
        if urls1 is None:
            urls1 = []

        for url2 in infobox2.get('urls', []):
            unique_url = True
            for url1 in infobox1.get('urls', []):
                if compare_urls(urlparse(url1.get('url', '')), urlparse(url2.get('url', ''))):
                    unique_url = False
                    break
            if unique_url:
                urls1.append(url2)

        infobox1['urls'] = urls1

    if 'img_src' in infobox2:
        img1 = infobox1.get('img_src', None)
        img2 = infobox2.get('img_src')
        if img1 is None:
            infobox1['img_src'] = img2
        elif weight2 > weight1:
            infobox1['img_src'] = img2

    if 'attributes' in infobox2:
        attributes1 = infobox1.get('attributes', None)
        if attributes1 is None:
            attributes1 = []
            infobox1['attributes'] = attributes1

        attributeSet = set()
        for attribute in infobox1.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributeSet.add(attribute.get('label', None))

        for attribute in infobox2.get('attributes', []):
            if attribute.get('label', None) not in attributeSet:
                attributes1.append(attribute)

    if 'content' in infobox2:
        content1 = infobox1.get('content', None)
        content2 = infobox2.get('content', '')
        if content1 is not None:
            if result_content_len(content2) > result_content_len(content1):
                infobox1['content'] = content2
        else:
            infobox1['content'] = content2


def result_score(result):
    weight = 1.0

    for result_engine in result['engines']:
        if hasattr(engines[result_engine], 'weight'):
            weight *= float(engines[result_engine].weight)

    occurences = len(result['positions'])

    return sum((occurences * weight) / position for position in result['positions'])


class ResultContainer(object):
    """docstring for ResultContainer"""

    def __init__(self):
        super(ResultContainer, self).__init__()
        self.results = defaultdict(list)
        self._merged_results = []
        self.infoboxes = []
        self.suggestions = set()
        self.answers = set()
        self.corrections = set()
        self._number_of_results = []
        self._ordered = False
        self.paging = False
        self.unresponsive_engines = set()
        self.timings = []

    def extend(self, engine_name, results):
        for result in list(results):
            result['engine'] = engine_name
            if 'suggestion' in result:
                self.suggestions.add(result['suggestion'])
                results.remove(result)
            elif 'answer' in result:
                self.answers.add(result['answer'])
                results.remove(result)
            elif 'correction' in result:
                self.corrections.add(result['correction'])
                results.remove(result)
            elif 'infobox' in result:
                self._merge_infobox(result)
                results.remove(result)
            elif 'number_of_results' in result:
                self._number_of_results.append(result['number_of_results'])
                results.remove(result)

        if engine_name in engines:
            with RLock():
                engines[engine_name].stats['search_count'] += 1
                engines[engine_name].stats['result_count'] += len(results)

        if not results:
            return

        self.results[engine_name].extend(results)

        if not self.paging and engine_name in engines and engines[engine_name].paging:
            self.paging = True

        for i, result in enumerate(results):
            if 'url' in result and not isinstance(result['url'], basestring):
                continue
            try:
                result['url'] = result['url'].decode('utf-8')
            except:
                pass
            if 'title' in result and not isinstance(result['title'], basestring):
                continue
            if 'content' in result and not isinstance(result['content'], basestring):
                continue
            position = i + 1
            self._merge_result(result, position)

    def _merge_infobox(self, infobox):
        add_infobox = True
        infobox_id = infobox.get('id', None)
        if infobox_id is not None:
            for existingIndex in self.infoboxes:
                if compare_urls(urlparse(existingIndex.get('id', '')), urlparse(infobox_id)):
                    merge_two_infoboxes(existingIndex, infobox)
                    add_infobox = False

        if add_infobox:
            self.infoboxes.append(infobox)

    def _merge_result(self, result, position):
        result['parsed_url'] = urlparse(result['url'])

        # if the result has no scheme, use http as default
        if not result['parsed_url'].scheme:
            result['parsed_url'] = result['parsed_url']._replace(scheme="http")
            result['url'] = result['parsed_url'].geturl()

        result['engines'] = set([result['engine']])

        # strip multiple spaces and cariage returns from content
        if result.get('content'):
            result['content'] = WHITESPACE_REGEX.sub(' ', result['content'])

        # check for duplicates
        duplicated = False
        for merged_result in self._merged_results:
            if compare_urls(result['parsed_url'], merged_result['parsed_url'])\
               and result.get('template') == merged_result.get('template'):
                duplicated = merged_result
                break

        # merge duplicates together
        if duplicated:
            # using content with more text
            if result_content_len(result.get('content', '')) >\
                    result_content_len(duplicated.get('content', '')):
                duplicated['content'] = result['content']

            # merge all result's parameters not found in duplicate
            for key in result.keys():
                if not duplicated.get(key):
                    duplicated[key] = result.get(key)

            # add the new position
            duplicated['positions'].append(position)

            # add engine to list of result-engines
            duplicated['engines'].add(result['engine'])

            # using https if possible
            if duplicated['parsed_url'].scheme != 'https' and result['parsed_url'].scheme == 'https':
                duplicated['url'] = result['parsed_url'].geturl()
                duplicated['parsed_url'] = result['parsed_url']

        # if there is no duplicate found, append result
        else:
            result['positions'] = [position]
            with RLock():
                self._merged_results.append(result)

    def order_results(self):
        for result in self._merged_results:
            score = result_score(result)
            result['score'] = score
            with RLock():
                for result_engine in result['engines']:
                    engines[result_engine].stats['score_count'] += score

        results = sorted(self._merged_results, key=itemgetter('score'), reverse=True)

        # pass 2 : group results by category and template
        gresults = []
        categoryPositions = {}

        for i, res in enumerate(results):
            # FIXME : handle more than one category per engine
            res['category'] = engines[res['engine']].categories[0]

            # FIXME : handle more than one category per engine
            category = engines[res['engine']].categories[0]\
                + ':' + res.get('template', '')\
                + ':' + ('img_src' if 'img_src' in res or 'thumbnail' in res else '')

            current = None if category not in categoryPositions\
                else categoryPositions[category]

            # group with previous results using the same category
            # if the group can accept more result and is not too far
            # from the current position
            if current is not None and (current['count'] > 0)\
                    and (len(gresults) - current['index'] < 20):
                # group with the previous results using
                # the same category with this one
                index = current['index']
                gresults.insert(index, res)

                # update every index after the current one
                # (including the current one)
                for k in categoryPositions:
                    v = categoryPositions[k]['index']
                    if v >= index:
                        categoryPositions[k]['index'] = v + 1

                # update this category
                current['count'] -= 1

            else:
                # same category
                gresults.append(res)

                # update categoryIndex
                categoryPositions[category] = {'index': len(gresults), 'count': 8}

        # update _merged_results
        self._ordered = True
        self._merged_results = gresults

    def get_ordered_results(self):
        if not self._ordered:
            self.order_results()
        return self._merged_results

    def results_length(self):
        return len(self._merged_results)

    def results_number(self):
        resultnum_sum = sum(self._number_of_results)
        if not resultnum_sum or not self._number_of_results:
            return 0
        return resultnum_sum / len(self._number_of_results)

    def add_unresponsive_engine(self, engine_error):
        self.unresponsive_engines.add(engine_error)

    def add_timing(self, engine_name, engine_time, page_load_time):
        self.timings.append({
            'engine': engines[engine_name].shortcut,
            'total': engine_time,
            'load': page_load_time
        })

    def get_timings(self):
        return self.timings