1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
|
"""
Wikipedia (Web)
@website https://{language}.wikipedia.org
@provide-api yes
@using-api yes
@results JSON
@stable yes
@parse url, infobox
"""
from json import loads
from urllib import urlencode, quote
from lxml.html import fromstring
# search-url
base_url = 'https://{language}.wikipedia.org/'
search_postfix = 'w/api.php?'\
'action=query'\
'&format=json'\
'&{query}'\
'&prop=extracts|pageimages'\
'&exintro'\
'&explaintext'\
'&pithumbsize=300'\
'&redirects'
supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
# set language in base_url
def url_lang(lang):
lang = lang.split('-')[0]
if lang == 'all' or lang not in supported_languages:
language = 'en'
else:
language = lang
return base_url.format(language=language)
# do search-request
def request(query, params):
if query.islower():
query += '|' + query.title()
params['url'] = url_lang(params['language']) \
+ search_postfix.format(query=urlencode({'titles': query}))
return params
# get first meaningful paragraph
# this should filter out disambiguation pages and notes above first paragraph
# "magic numbers" were obtained by fine tuning
def extract_first_paragraph(content, title, image):
first_paragraph = None
failed_attempts = 0
for paragraph in content.split('\n'):
starts_with_title = paragraph.lower().find(title.lower(), 0, len(title) + 35)
length = len(paragraph)
if length >= 200 or (starts_with_title >= 0 and (image or length >= 150)):
first_paragraph = paragraph
break
failed_attempts += 1
if failed_attempts > 3:
return None
return first_paragraph
# get response from search-request
def response(resp):
results = []
search_result = loads(resp.content)
# wikipedia article's unique id
# first valid id is assumed to be the requested article
for article_id in search_result['query']['pages']:
page = search_result['query']['pages'][article_id]
if int(article_id) > 0:
break
if int(article_id) < 0:
return []
title = page.get('title')
image = page.get('thumbnail')
if image:
image = image.get('source')
extract = page.get('extract')
summary = extract_first_paragraph(extract, title, image)
if not summary:
return []
# link to wikipedia article
wikipedia_link = url_lang(resp.search_params['language']) \
+ 'wiki/' + quote(title.replace(' ', '_').encode('utf8'))
results.append({'url': wikipedia_link, 'title': title})
results.append({'infobox': title,
'id': wikipedia_link,
'content': summary,
'img_src': image,
'urls': [{'title': 'Wikipedia', 'url': wikipedia_link}]})
return results
# get supported languages from their site
def _fetch_supported_languages(resp):
supported_languages = {}
dom = fromstring(resp.text)
tables = dom.xpath('//table[contains(@class,"sortable")]')
for table in tables:
# exclude header row
trs = table.xpath('.//tr')[1:]
for tr in trs:
td = tr.xpath('./td')
code = td[3].xpath('./a')[0].text
name = td[2].xpath('./a')[0].text
english_name = td[1].xpath('./a')[0].text
articles = int(td[4].xpath('./a/b')[0].text.replace(',', ''))
# exclude languages with too few articles
if articles >= 100:
supported_languages[code] = {"name": name, "english_name": english_name, "articles": articles}
return supported_languages
|