1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
|
"""
Yahoo (Web)
@website https://search.yahoo.com/web
@provide-api yes (https://developer.yahoo.com/boss/search/),
$0.80/1000 queries
@using-api no (because pricing)
@results HTML (using search portal)
@stable no (HTML can change)
@parse url, title, content, suggestion
"""
from urllib import urlencode
from urlparse import unquote
from lxml import html
from searx.engines.xpath import extract_text, extract_url
# engine dependent config
categories = ['general']
paging = True
language_support = True
time_range_support = True
# search-url
base_url = 'https://search.yahoo.com/'
search_url = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}'
search_url_with_time = 'search?{query}&b={offset}&fl=1&vl=lang_{lang}&age={age}&btf={btf}&fr2=time'
supported_languages_url = 'https://search.yahoo.com/web/advanced'
# specific xpath variables
results_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' Sr ')]"
url_xpath = './/h3/a/@href'
title_xpath = './/h3/a'
content_xpath = './/div[@class="compText aAbs"]'
suggestion_xpath = "//div[contains(concat(' ', normalize-space(@class), ' '), ' AlsoTry ')]//a"
time_range_dict = {'day': ['1d', 'd'],
'week': ['1w', 'w'],
'month': ['1m', 'm']}
# remove yahoo-specific tracking-url
def parse_url(url_string):
endings = ['/RS', '/RK']
endpositions = []
start = url_string.find('http', url_string.find('/RU=') + 1)
for ending in endings:
endpos = url_string.rfind(ending)
if endpos > -1:
endpositions.append(endpos)
if start == 0 or len(endpositions) == 0:
return url_string
else:
end = min(endpositions)
return unquote(url_string[start:end])
def _get_url(query, offset, language, time_range):
if time_range in time_range_dict:
return base_url + search_url_with_time.format(offset=offset,
query=urlencode({'p': query}),
lang=language,
age=time_range_dict[time_range][0],
btf=time_range_dict[time_range][1])
return base_url + search_url.format(offset=offset,
query=urlencode({'p': query}),
lang=language)
def _get_language(params):
if params['language'] == 'all':
return 'en'
elif params['language'][:2] == 'zh':
if params['language'] == 'zh' or params['language'] == 'zh-CH':
return 'szh'
else:
return 'tzh'
else:
return params['language'].split('-')[0]
# do search-request
def request(query, params):
if params['time_range'] and params['time_range'] not in time_range_dict:
return params
offset = (params['pageno'] - 1) * 10 + 1
language = _get_language(params)
params['url'] = _get_url(query, offset, language, params['time_range'])
# TODO required?
params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\
.format(lang=language)
return params
# get response from search-request
def response(resp):
results = []
dom = html.fromstring(resp.text)
try:
results_num = int(dom.xpath('//div[@class="compPagination"]/span[last()]/text()')[0]
.split()[0].replace(',', ''))
results.append({'number_of_results': results_num})
except:
pass
# parse results
for result in dom.xpath(results_xpath):
try:
url = parse_url(extract_url(result.xpath(url_xpath), search_url))
title = extract_text(result.xpath(title_xpath)[0])
except:
continue
content = extract_text(result.xpath(content_xpath)[0])
# append result
results.append({'url': url,
'title': title,
'content': content})
# if no suggestion found, return results
suggestions = dom.xpath(suggestion_xpath)
if not suggestions:
return results
# parse suggestion
for suggestion in suggestions:
# append suggestion
results.append({'suggestion': extract_text(suggestion)})
# return results
return results
# get supported languages from their site
def _fetch_supported_languages(resp):
supported_languages = []
dom = html.fromstring(resp.text)
options = dom.xpath('//div[@id="yschlang"]/span/label/input')
for option in options:
code = option.xpath('./@value')[0][5:].replace('_', '-')
supported_languages.append(code)
return supported_languages
|