1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
|
# -*- coding: utf-8 -*-
import json
import re
import unicodedata
import string
from urllib import urlencode
from requests import get
# Language codes for which labels and aliases are requested from Wikidata.
# Japanese is 'ja' in Wikidata; 'jp' is not one of its language codes.
languages = {'de', 'en', 'es', 'fr', 'hu', 'it', 'nl', 'ja'}

# wbgetentities request: ``{query}`` receives the url-encoded ``ids=...`` pair.
# The language codes are sorted so the generated URL is identical on every run
# (iteration order of a set is not guaranteed).
url_template = 'https://www.wikidata.org/w/api.php?action=wbgetentities&format=json&{query}&props=labels%7Cdatatype%7Cclaims%7Caliases&languages=' + '|'.join(sorted(languages))

# Base URL of the WDQ API; the query expression is appended to it as-is.
url_wmflabs_template = 'http://wdq.wmflabs.org/api?q='

# Wikidata full-text search: ``{query}`` receives the url-encoded search
# parameters (srsearch, srlimit, sroffset).
url_wikidata_search_template = 'http://www.wikidata.org/w/api.php?action=query&list=search&format=json&srnamespace=0&srprop=sectiontitle&{query}'

# WDQ expressions whose result items are fetched by the script.
wmflabs_queries = [
    'CLAIM[31:8142]',  # all currencies
]

# Result database that is dumped to currencies.json.
db = {
    # ISO 4217 code -> {language: label}
    'iso4217': {},
    # normalized currency name -> list of ISO 4217 codes
    'names': {},
}
def remove_accents(data):
    """Return *data* NFKD-normalized and lower-cased.

    NFKD decomposition splits an accented character into its base character
    followed by combining marks.  The combining marks are kept in the result;
    only the decomposition and the case folding are applied.

    :param data: unicode text to normalize.
    :returns: the decomposed, lower-case text.
    """
    decomposed = unicodedata.normalize('NFKD', data)
    return decomposed.lower()
def normalize_name(name):
    """Return the canonical form of a currency name used as a lookup key.

    The name is lower-cased, passed through ``remove_accents``, hyphens are
    turned into spaces and every run of spaces is collapsed to one space.

    :param name: currency name (unicode text).
    :returns: the normalized name.
    """
    folded = remove_accents(name.lower())
    without_hyphens = folded.replace('-', ' ')
    return re.sub(' +', ' ', without_hyphens)
def add_currency_name(name, iso4217):
    """Register *name* as a name of the currency with code *iso4217*.

    The name is normalized with ``normalize_name`` and the code is added to
    the list stored under that name in ``db['names']``.  A code is stored at
    most once per name, and codes already registered for the name are always
    kept (re-adding a known code used to reset the list to that single code).

    :param name: currency name, alias or unit (unicode text).
    :param iso4217: ISO 4217 code; anything that is not a string is reported
        on stdout and ignored.
    :returns: None.
    """
    if not isinstance(iso4217, basestring):
        # One pre-formatted argument, so the line is printed the same way
        # whether ``print`` is a statement or a function.
        print("problem %s %s" % (name, iso4217))
        return

    name = normalize_name(name)
    if name == '':
        print("name empty %s" % (iso4217,))
        return

    # setdefault creates the list on first use and otherwise returns the
    # existing one, so previously stored codes are never discarded.
    codes = db['names'].setdefault(name, [])
    if iso4217 not in codes:
        codes.append(iso4217)
def add_currency_label(label, iso4217, language):
    """Store *label* as the label of currency *iso4217* for *language*.

    The value is written to ``db['iso4217'][iso4217][language]``; the
    per-currency dictionary is created on first use and an existing label for
    the same language is overwritten.

    :param label: label text.
    :param iso4217: ISO 4217 code used as the key.
    :param language: language code of the label.
    :returns: None.
    """
    labels_by_code = db['iso4217']
    if iso4217 not in labels_by_code:
        labels_by_code[iso4217] = {}
    labels_by_code[iso4217][language] = label
def get_property_value(data, name):
    """Return the value of the first claim of property *name*, or None.

    :param data: entity dictionary; claims are read from ``data['claims']``.
    :param name: property identifier used as the key inside the claims.
    :returns: ``mainsnak.datavalue.value`` of the first claim, or None when
        the property has no claim, the value is missing, or it is the empty
        string.
    """
    claims = data.get('claims', {})
    statements = claims.get(name, {})
    if not len(statements):
        return None

    # Only the first statement of the property is considered.
    main_snak = statements[0].get('mainsnak', {})
    value = main_snak.get('datavalue', {}).get('value', '')
    return None if value == '' else value
def parse_currency(data):
    """Register the names and labels of one Wikidata entity as a currency.

    The entity is indexed only when it has a P498 claim, whose value is used
    as the ISO 4217 code.  For such an entity:

    * the value of the P558 claim (stored as ``unit``), if any, is registered
      as a name;
    * the label in each language of ``languages`` is registered both as a name
      and as the label of the currency for that language;
    * every alias, in every language present in the entity, is registered as
      a name.

    Aliases without a ``'value'`` are skipped, as labels already were; passing
    None on would fail during name normalization.

    :param data: entity dictionary with optional ``claims``, ``labels`` and
        ``aliases`` keys.
    :returns: None.
    """
    iso4217 = get_property_value(data, 'P498')
    if iso4217 is None:
        # Without a code there is nothing the names could be mapped to.
        return

    unit = get_property_value(data, 'P558')
    if unit is not None:
        add_currency_name(unit, iso4217)

    labels = data.get('labels', {})
    for language in languages:
        name = labels.get(language, {}).get('value', None)
        if name is not None:
            add_currency_name(name, iso4217)
            add_currency_label(name, iso4217, language)

    for language_aliases in data.get('aliases', {}).values():
        for entry in language_aliases:
            alias = entry.get('value', None)
            if alias is not None:
                add_currency_name(alias, iso4217)
def fetch_data(wikidata_ids):
    """Download the given Wikidata entities and parse each one as a currency.

    The ids are joined with ``|`` into a single ``ids`` parameter of the
    ``url_template`` request; every entity of the JSON answer is handed to
    ``parse_currency``.

    :param wikidata_ids: sequence of entity id strings.
    :returns: None.
    """
    query = urlencode({'ids': '|'.join(wikidata_ids)})
    response = get(url_template.format(query=query))
    payload = json.loads(response.content)
    for entity in payload.get('entities', {}).values():
        parse_currency(entity)
def add_q(i):
    """Return the entity id string for *i*: its ``str()`` form prefixed with 'Q'."""
    return "Q{0!s}".format(i)
def fetch_data_batch(wikidata_ids, batch_size=50, fetch=None):
    """Fetch the given entities in consecutive batches.

    Every id is sent exactly once, in its original order.  (The previous
    slicing sent 49 ids per request while discarding 50, so each 50th id was
    silently skipped.)

    :param wikidata_ids: iterable of entity id strings.
    :param batch_size: maximum number of ids per request; the default is the
        batch size this script has always used.
    :param fetch: callable invoked with each batch (a list of ids); defaults
        to ``fetch_data``.
    :raises ValueError: if *batch_size* is smaller than 1.
    :returns: None.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")
    if fetch is None:
        fetch = fetch_data

    # Materialize once so that any iterable (not only a list) can be sliced.
    ids = list(wikidata_ids)
    for start in range(0, len(ids), batch_size):
        fetch(ids[start:start + batch_size])
def wdq_query(query):
    """Run a WDQ query and fetch every item it returns.

    The query expression is appended to ``url_wmflabs_template``.  The numeric
    items of the answer are converted to entity ids with ``add_q`` and passed
    to ``fetch_data_batch``.  An error status other than 'OK' is reported on
    stdout; the returned items are fetched regardless.

    :param query: WDQ query expression.
    :returns: None.
    """
    response = get(url_wmflabs_template + query)
    payload = json.loads(response.content)
    qlist = [add_q(item) for item in payload.get('items', {})]

    error = payload.get('status', {}).get('error', None)
    if error not in (None, 'OK'):
        print("error for query '" + query + "' :" + error)

    fetch_data_batch(qlist)
def wd_query(query, offset=0):
    """Search Wikidata for *query* and fetch every entity found.

    One page of at most 50 results, starting at *offset*, is requested through
    ``url_wikidata_search_template``; the titles of the hits are passed to
    ``fetch_data_batch``.

    :param query: search text.
    :param offset: index of the first search result to return.
    :returns: None.
    """
    params = urlencode({'srsearch': query, 'srlimit': 50, 'sroffset': offset})
    response = get(url_wikidata_search_template.format(query=params))
    hits = json.loads(response.content).get('query', {}).get('search', {})
    fetch_data_batch([hit.get('title', '') for hit in hits])
## fetch ##
# Run every configured WDQ query; each one fills ``db`` with what it finds.
for q in wmflabs_queries:
    wdq_query(q)

# static
# Hand-written names registered after the fetch, in addition to whatever
# Wikidata provided for them.
add_currency_name(u"euro", 'EUR')
add_currency_name(u"euros", 'EUR')
add_currency_name(u"dollar", 'USD')
add_currency_name(u"dollars", 'USD')
add_currency_name(u"peso", 'MXN')
add_currency_name(u"pesos", 'MXN')

# write
# The context manager closes the file even when serialization raises.
with open("currencies.json", "wb") as f:
    json.dump(db, f, indent=4, encoding="utf-8")
|