diff options
author | Alexandre Flament <alex@al-f.net> | 2020-10-26 19:26:44 +0100 |
---|---|---|
committer | Alexandre Flament <alex@al-f.net> | 2020-10-28 08:09:25 +0100 |
commit | 95bd6033fad53b584ae5be54f2229a6edfb5b6a2 (patch) | |
tree | 18acb415e3394a91e01ccbae1d757504792729b9 | |
parent | ca593728af10751fb0a313e2219e9091434d1035 (diff) | |
download | searxng-95bd6033fad53b584ae5be54f2229a6edfb5b6a2.tar.gz searxng-95bd6033fad53b584ae5be54f2229a6edfb5b6a2.zip |
[mod] wikidata engine: use one SPARQL request instead of 2 HTTP requests.
-rw-r--r-- | searx/engines/wikidata.py | 1089 |
1 files changed, 637 insertions, 452 deletions
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index c557f4e59..01e873de9 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -3,501 +3,686 @@ Wikidata @website https://wikidata.org - @provide-api yes (https://wikidata.org/w/api.php) + @provide-api yes (https://query.wikidata.org/) - @using-api partially (most things require scraping) - @results JSON, HTML - @stable no (html can change) + @using-api yes + @results JSON + @stable yes @parse url, infobox """ -from searx import logger -from searx.poolrequests import get -from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url -from searx.utils import extract_text, match_language, eval_xpath from urllib.parse import urlencode from json import loads -from lxml.html import fromstring -from lxml import etree + +from dateutil.parser import isoparse +from babel.dates import format_datetime, format_date, format_time, get_datetime_format + +from searx import logger +from searx.data import WIKIDATA_UNITS +from searx.poolrequests import post, get +from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url +from searx.utils import match_language, searx_useragent, get_string_replaces_function +from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom logger = logger.getChild('wikidata') -result_count = 1 - -# urls -wikidata_host = 'https://www.wikidata.org' -url_search = wikidata_host \ - + '/w/index.php?{query}&ns0=1' - -wikidata_api = wikidata_host + '/w/api.php' -url_detail = wikidata_api\ - + '?action=parse&format=json&{query}'\ - + '&redirects=1&prop=text%7Cdisplaytitle%7Cparsewarnings'\ - + '&disableeditsection=1&preview=1§ionpreview=1&disabletoc=1&utf8=1&formatversion=2' - -url_map = 'https://www.openstreetmap.org/'\ - + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M' -url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400' - -# xpaths -div_ids_xpath = '//div[@id]' -wikidata_ids_xpath = '//ul[@class="mw-search-results"]/li//a/@href' -title_xpath = '//*[contains(@class,"wikibase-title-label")]' -description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]' -label_xpath = './/div[contains(@class,"wikibase-statementgroupview-property-label")]/a' -url_xpath = './/a[contains(@class,"external free") or contains(@class, "wb-external-id")]' -wikilink_xpath = './/ul[contains(@class,"wikibase-sitelinklistview-listview")]'\ - + '/li[contains(@data-wb-siteid,"{wikiid}")]//a/@href' -property_row_xpath = './/div[contains(@class,"wikibase-statementview")]' -preferred_rank_xpath = './/span[contains(@class,"wikibase-rankselector-preferred")]' -value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\ - + '/*/div[contains(@class,"wikibase-snakview-value")]' -language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator")]' -calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]' -media_xpath = value_xpath + '//div[contains(@class,"commons-media-caption")]//a' - - -def get_id_cache(result): - id_cache = {} - for e in eval_xpath(result, div_ids_xpath): - id = e.get('id') - if id.startswith('P'): - id_cache[id] = e - return id_cache +# SPARQL +SPARQL_ENDPOINT_URL = 'https://query.wikidata.org/sparql' +SPARQL_EXPLAIN_URL = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql?explain' +WIKIDATA_PROPERTIES = { + 'P434': 'MusicBrainz', + 'P435': 'MusicBrainz', + 'P436': 'MusicBrainz', + 'P966': 'MusicBrainz', + 'P345': 'IMDb', + 'P2397': 'YouTube', + 'P1651': 'YouTube', + 'P2002': 'Twitter', + 'P2013': 'Facebook', + 'P2003': 'Instagram', +} + +# SERVICE wikibase:mwapi : https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI +# SERVICE wikibase:label: https://en.wikibooks.org/wiki/SPARQL/SERVICE_-_Label#Manual_Label_SERVICE +# https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates +# https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format#Data_model +# optmization: +# * https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization +# * https://github.com/blazegraph/database/wiki/QueryHints +QUERY_TEMPLATE = """ +SELECT ?item ?itemLabel ?itemDescription ?lat ?long %SELECT% +WHERE +{ + SERVICE wikibase:mwapi { + bd:serviceParam wikibase:endpoint "www.wikidata.org"; + wikibase:api "EntitySearch"; + wikibase:limit 1; + mwapi:search "%QUERY%"; + mwapi:language "%LANGUAGE%". + ?item wikibase:apiOutputItem mwapi:item. + } + + %WHERE% + + SERVICE wikibase:label { + bd:serviceParam wikibase:language "%LANGUAGE%,en". + ?item rdfs:label ?itemLabel . + ?item schema:description ?itemDescription . + %WIKIBASE_LABELS% + } + +} +GROUP BY ?item ?itemLabel ?itemDescription ?lat ?long %GROUP_BY% +""" -def request(query, params): - params['url'] = url_search.format( - query=urlencode({'search': query})) - return params +# Get the calendar names and the property names +QUERY_PROPERTY_NAMES = """ +SELECT ?item ?name +WHERE { + { + SELECT ?item + WHERE { ?item wdt:P279* wd:Q12132 } + } UNION { + VALUES ?item { %ATTRIBUTES% } + } + OPTIONAL { ?item rdfs:label ?name. } +} +""" -def response(resp): - results = [] - htmlparser = etree.HTMLParser() - html = fromstring(resp.content.decode(), parser=htmlparser) - search_results = eval_xpath(html, wikidata_ids_xpath) +# https://www.w3.org/TR/sparql11-query/#rSTRING_LITERAL1 +# https://lists.w3.org/Archives/Public/public-rdf-dawg/2011OctDec/0175.html +sparql_string_escape = get_string_replaces_function({'\t': '\\\t', + '\n': '\\\n', + '\r': '\\\r', + '\b': '\\\b', + '\f': '\\\f', + '\"': '\\\"', + '\'': '\\\'', + '\\': '\\\\'}) + +replace_http_by_https = get_string_replaces_function({'http:': 'https:'}) + + +def get_headers(): + # user agent: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits + return { + 'Accept': 'application/sparql-results+json', + 'User-Agent': searx_useragent() + } + + +def get_label_for_entity(entity_id, language): + name = WIKIDATA_PROPERTIES.get(entity_id) + if name is None: + name = WIKIDATA_PROPERTIES.get((entity_id, language)) + if name is None: + name = WIKIDATA_PROPERTIES.get((entity_id, language.split('-')[0])) + if name is None: + name = WIKIDATA_PROPERTIES.get((entity_id, 'en')) + if name is None: + name = entity_id + return name + + +def send_wikidata_query(query, method='GET'): + if method == 'GET': + # query will be cached by wikidata + http_response = get(SPARQL_ENDPOINT_URL + '?' + urlencode({'query': query}), headers=get_headers()) + else: + # query won't be cached by wikidata + http_response = post(SPARQL_ENDPOINT_URL, data={'query': query}, headers=get_headers()) + if http_response.status_code != 200: + logger.debug('SPARQL endpoint error %s', http_response.content.decode()) + logger.debug('request time %s', str(http_response.elapsed)) + http_response.raise_for_status() + return loads(http_response.content.decode()) + - if resp.search_params['language'].split('-')[0] == 'all': +def request(query, params): + language = params['language'].split('-')[0] + if language == 'all': language = 'en' else: - language = match_language(resp.search_params['language'], supported_languages, language_aliases).split('-')[0] + language = match_language(params['language'], supported_languages, language_aliases).split('-')[0] + + query, attributes = get_query(query, language) - # TODO: make requests asynchronous to avoid timeout when result_count > 1 - for search_result in search_results[:result_count]: - wikidata_id = search_result.split('/')[-1] - url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language})) - htmlresponse = get(url) - jsonresponse = loads(htmlresponse.content.decode()) - results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'], htmlparser) + params['method'] = 'POST' + params['url'] = SPARQL_ENDPOINT_URL + params['data'] = {'query': query} + params['headers'] = get_headers() + + params['language'] = language + params['attributes'] = attributes + return params + + +def response(resp): + results = [] + if resp.status_code != 200: + logger.debug('SPARQL endpoint error %s', resp.content.decode()) + resp.raise_for_status() + jsonresponse = loads(resp.content.decode()) + + language = resp.search_params['language'].lower() + attributes = resp.search_params['attributes'] + + seen_entities = set() + + for result in jsonresponse.get('results', {}).get('bindings', []): + attribute_result = {key: value['value'] for key, value in result.items()} + entity_url = attribute_result['item'] + if entity_url not in seen_entities: + seen_entities.add(entity_url) + results += get_results(attribute_result, attributes, language) + else: + logger.debug('The SPARQL request returns duplicate entities: %s', str(attribute_result)) return results -def getDetail(jsonresponse, wikidata_id, language, locale, htmlparser): +def get_results(attribute_result, attributes, language): results = [] - urls = [] - attributes = [] + infobox_title = attribute_result.get('itemLabel') + infobox_id = attribute_result['item'] + infobox_id_lang = None + infobox_urls = [] + infobox_attributes = [] + infobox_content = attribute_result.get('itemDescription') + img_src = None + img_src_priority = 100 + + for attribute in attributes: + value = attribute.get_str(attribute_result, language) + if value is not None and value != '': + attribute_type = type(attribute) + + if attribute_type in (WDURLAttribute, WDArticle): + # get_select() method : there is group_concat(distinct ...;separator=", ") + # split the value here + for url in value.split(', '): + infobox_urls.append({'title': attribute.get_label(language), 'url': url, **attribute.kwargs}) + # "normal" results (not infobox) include official website and Wikipedia links. + if attribute.kwargs.get('official') or attribute_type == WDArticle: + results.append({'title': infobox_title, 'url': url}) + # update the infobox_id with the wikipedia URL + # first the local wikipedia URL, and as fallback the english wikipedia URL + if attribute_type == WDArticle\ + and ((attribute.language == 'en' and infobox_id_lang is None) + or attribute.language != 'en'): + infobox_id_lang = attribute.language + infobox_id = url + elif attribute_type == WDImageAttribute: + # this attribute is an image. + # replace the current image only the priority is lower + # (the infobox contain only one image). + if attribute.priority < img_src_priority: + img_src = value + img_src_priority = attribute.priority + elif attribute_type == WDGeoAttribute: + # geocoordinate link + # use the area to get the OSM zoom + # Note: ignre the unit (must be km² otherwise the calculation is wrong) + # Should use normalized value p:P2046/psn:P2046/wikibase:quantityAmount + area = attribute_result.get('P2046') + osm_zoom = area_to_osm_zoom(area) if area else 19 + url = attribute.get_str(attribute_result, language, osm_zoom=osm_zoom) + if url: + infobox_urls.append({'title': attribute.get_label(language), + 'url': url, + 'entity': attribute.name}) + else: + infobox_attributes.append({'label': attribute.get_label(language), + 'value': value, + 'entity': attribute.name}) + + if infobox_id: + infobox_id = replace_http_by_https(infobox_id) - title = jsonresponse.get('parse', {}).get('displaytitle', {}) - result = jsonresponse.get('parse', {}).get('text', {}) - - if not title or not result: - return results - - title = fromstring(title, parser=htmlparser) - for elem in eval_xpath(title, language_fallback_xpath): - elem.getparent().remove(elem) - title = extract_text(eval_xpath(title, title_xpath)) - - result = fromstring(result, parser=htmlparser) - for elem in eval_xpath(result, language_fallback_xpath): - elem.getparent().remove(elem) - - description = extract_text(eval_xpath(result, description_xpath)) - - id_cache = get_id_cache(result) - - # URLS - - # official website - add_url(urls, result, id_cache, 'P856', results=results) - - # wikipedia - wikipedia_link_count = 0 - wikipedia_link = get_wikilink(result, language + 'wiki') - if wikipedia_link: - wikipedia_link_count += 1 - urls.append({'title': 'Wikipedia (' + language + ')', - 'url': wikipedia_link}) - - if language != 'en': - wikipedia_en_link = get_wikilink(result, 'enwiki') - if wikipedia_en_link: - wikipedia_link_count += 1 - urls.append({'title': 'Wikipedia (en)', - 'url': wikipedia_en_link}) - - # TODO: get_wiki_firstlanguage - # if wikipedia_link_count == 0: - - # more wikis - add_url(urls, result, id_cache, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage') - add_url(urls, result, id_cache, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote') - add_url(urls, result, id_cache, default_label='Wikimedia Commons', link_type='commonswiki') - - add_url(urls, result, id_cache, 'P625', 'OpenStreetMap', link_type='geo') - - # musicbrainz - add_url(urls, result, id_cache, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/') - add_url(urls, result, id_cache, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/') - add_url(urls, result, id_cache, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/') - add_url(urls, result, id_cache, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/') - - # IMDb - add_url(urls, result, id_cache, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb') - # source code repository - add_url(urls, result, id_cache, 'P1324') - # blog - add_url(urls, result, id_cache, 'P1581') - # social media links - add_url(urls, result, id_cache, 'P2397', 'YouTube', 'https://www.youtube.com/channel/') - add_url(urls, result, id_cache, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=') - add_url(urls, result, id_cache, 'P2002', 'Twitter', 'https://twitter.com/') - add_url(urls, result, id_cache, 'P2013', 'Facebook', 'https://facebook.com/') - add_url(urls, result, id_cache, 'P2003', 'Instagram', 'https://instagram.com/') - - urls.append({'title': 'Wikidata', - 'url': 'https://www.wikidata.org/wiki/' - + wikidata_id + '?uselang=' + language}) - - # INFOBOX ATTRIBUTES (ROWS) - - # DATES - # inception date - add_attribute(attributes, id_cache, 'P571', date=True) - # dissolution date - add_attribute(attributes, id_cache, 'P576', date=True) - # start date - add_attribute(attributes, id_cache, 'P580', date=True) - # end date - add_attribute(attributes, id_cache, 'P582', date=True) - # date of birth - add_attribute(attributes, id_cache, 'P569', date=True) - # date of death - add_attribute(attributes, id_cache, 'P570', date=True) - # date of spacecraft launch - add_attribute(attributes, id_cache, 'P619', date=True) - # date of spacecraft landing - add_attribute(attributes, id_cache, 'P620', date=True) - - # nationality - add_attribute(attributes, id_cache, 'P27') - # country of origin - add_attribute(attributes, id_cache, 'P495') - # country - add_attribute(attributes, id_cache, 'P17') - # headquarters - add_attribute(attributes, id_cache, 'Q180') - - # PLACES - # capital - add_attribute(attributes, id_cache, 'P36', trim=True) - # head of state - add_attribute(attributes, id_cache, 'P35', trim=True) - # head of government - add_attribute(attributes, id_cache, 'P6', trim=True) - # type of government - add_attribute(attributes, id_cache, 'P122') - # official language - add_attribute(attributes, id_cache, 'P37') - # population - add_attribute(attributes, id_cache, 'P1082', trim=True) - # area - add_attribute(attributes, id_cache, 'P2046') - # currency - add_attribute(attributes, id_cache, 'P38', trim=True) - # heigth (building) - add_attribute(attributes, id_cache, 'P2048') - - # MEDIA - # platform (videogames) - add_attribute(attributes, id_cache, 'P400') - # author - add_attribute(attributes, id_cache, 'P50') - # creator - add_attribute(attributes, id_cache, 'P170') - # director - add_attribute(attributes, id_cache, 'P57') - # performer - add_attribute(attributes, id_cache, 'P175') - # developer - add_attribute(attributes, id_cache, 'P178') - # producer - add_attribute(attributes, id_cache, 'P162') - # manufacturer - add_attribute(attributes, id_cache, 'P176') - # screenwriter - add_attribute(attributes, id_cache, 'P58') - # production company - add_attribute(attributes, id_cache, 'P272') - # record label - add_attribute(attributes, id_cache, 'P264') - # publisher - add_attribute(attributes, id_cache, 'P123') - # original network - add_attribute(attributes, id_cache, 'P449') - # distributor - add_attribute(attributes, id_cache, 'P750') - # composer - add_attribute(attributes, id_cache, 'P86') - # publication date - add_attribute(attributes, id_cache, 'P577', date=True) - # genre - add_attribute(attributes, id_cache, 'P136') - # original language - add_attribute(attributes, id_cache, 'P364') - # isbn - add_attribute(attributes, id_cache, 'Q33057') - # software license - add_attribute(attributes, id_cache, 'P275') - # programming language - add_attribute(attributes, id_cache, 'P277') - # version - add_attribute(attributes, id_cache, 'P348', trim=True) - # narrative location - add_attribute(attributes, id_cache, 'P840') - - # LANGUAGES - # number of speakers - add_attribute(attributes, id_cache, 'P1098') - # writing system - add_attribute(attributes, id_cache, 'P282') - # regulatory body - add_attribute(attributes, id_cache, 'P1018') - # language code - add_attribute(attributes, id_cache, 'P218') - - # OTHER - # ceo - add_attribute(attributes, id_cache, 'P169', trim=True) - # founder - add_attribute(attributes, id_cache, 'P112') - # legal form (company/organization) - add_attribute(attributes, id_cache, 'P1454') - # operator - add_attribute(attributes, id_cache, 'P137') - # crew members (tripulation) - add_attribute(attributes, id_cache, 'P1029') - # taxon - add_attribute(attributes, id_cache, 'P225') - # chemical formula - add_attribute(attributes, id_cache, 'P274') - # winner (sports/contests) - add_attribute(attributes, id_cache, 'P1346') - # number of deaths - add_attribute(attributes, id_cache, 'P1120') - # currency code - add_attribute(attributes, id_cache, 'P498') - - image = add_image(id_cache) - - if len(attributes) == 0 and len(urls) == 2 and len(description) == 0: + # add the wikidata URL at the end + infobox_urls.append({'title': 'Wikidata', 'url': attribute_result['item']}) + + if img_src is None and len(infobox_attributes) == 0 and len(infobox_urls) == 1 and\ + len(infobox_content) == 0: results.append({ - 'url': urls[0]['url'], - 'title': title, - 'content': description - }) + 'url': infobox_urls[0]['url'], + 'title': infobox_title, + 'content': infobox_content + }) else: results.append({ - 'infobox': title, - 'id': wikipedia_link, - 'content': description, - 'img_src': image, - 'attributes': attributes, - 'urls': urls - }) - + 'infobox': infobox_title, + 'id': infobox_id, + 'content': infobox_content, + 'img_src': img_src, + 'urls': infobox_urls, + 'attributes': infobox_attributes + }) return results -# only returns first match -def add_image(id_cache): - # P15: route map, P242: locator map, P154: logo, P18: image, P242: map, P41: flag, P2716: collage, P2910: icon - property_ids = ['P15', 'P242', 'P154', 'P18', 'P242', 'P41', 'P2716', 'P2910'] +def get_query(query, language): + attributes = get_attributes(language) + select = [a.get_select() for a in attributes] + where = list(filter(lambda s: len(s) > 0, [a.get_where() for a in attributes])) + wikibase_label = list(filter(lambda s: len(s) > 0, [a.get_wikibase_label() for a in attributes])) + group_by = list(filter(lambda s: len(s) > 0, [a.get_group_by() for a in attributes])) + query = QUERY_TEMPLATE\ + .replace('%QUERY%', sparql_string_escape(query))\ + .replace('%SELECT%', ' '.join(select))\ + .replace('%WHERE%', '\n '.join(where))\ + .replace('%WIKIBASE_LABELS%', '\n '.join(wikibase_label))\ + .replace('%GROUP_BY%', ' '.join(group_by))\ + .replace('%LANGUAGE%', language) + return query, attributes - for property_id in property_ids: - image = id_cache.get(property_id, None) - if image is not None: - image_name = eval_xpath(image, media_xpath) - image_src = url_image.replace('{filename}', extract_text(image_name[0])) - return image_src +def get_attributes(language): + attributes = [] -# setting trim will only returned high ranked rows OR the first row -def add_attribute(attributes, id_cache, property_id, default_label=None, date=False, trim=False): - attribute = id_cache.get(property_id, None) - if attribute is not None: + def add_value(name): + attributes.append(WDAttribute(name)) + + def add_amount(name): + attributes.append(WDAmountAttribute(name)) + + def add_label(name): + attributes.append(WDLabelAttribute(name)) + + def add_url(name, url_id=None, **kwargs): + attributes.append(WDURLAttribute(name, url_id, kwargs)) + + def add_image(name, url_id=None, priority=1): + attributes.append(WDImageAttribute(name, url_id, priority)) + + def add_date(name): + attributes.append(WDDateAttribute(name)) + + # Dates + for p in ['P571', # inception date + 'P576', # dissolution date + 'P580', # start date + 'P582', # end date + 'P569', # date of birth + 'P570', # date of death + 'P619', # date of spacecraft launch + 'P620']: # date of spacecraft landing + add_date(p) + + for p in ['P27', # country of citizenship + 'P495', # country of origin + 'P17', # country + 'P159']: # headquarters location + add_label(p) + + # Places + for p in ['P36', # capital + 'P35', # head of state + 'P6', # head of government + 'P122', # basic form of government + 'P37']: # official language + add_label(p) + + add_value('P1082') # population + add_amount('P2046') # area + add_amount('P281') # postal code + add_label('P38') # currency + add_amount('P2048') # heigth (building) + + # Media + for p in ['P400', # platform (videogames, computing) + 'P50', # author + 'P170', # creator + 'P57', # director + 'P175', # performer + 'P178', # developer + 'P162', # producer + 'P176', # manufacturer + 'P58', # screenwriter + 'P272', # production company + 'P264', # record label + 'P123', # publisher + 'P449', # original network + 'P750', # distributed by + 'P86']: # composer + add_label(p) + + add_date('P577') # publication date + add_label('P136') # genre (music, film, artistic...) + add_label('P364') # original language + add_value('P212') # ISBN-13 + add_value('P957') # ISBN-10 + add_label('P275') # copyright license + add_label('P277') # programming language + add_value('P348') # version + add_label('P840') # narrative location + + # Languages + add_value('P1098') # number of speakers + add_label('P282') # writing system + add_label('P1018') # language regulatory body + add_value('P218') # language code (ISO 639-1) + + # Other + add_label('P169') # ceo + add_label('P112') # founded by + add_label('P1454') # legal form (company, organization) + add_label('P137') # operator (service, facility, ...) + add_label('P1029') # crew members (tripulation) + add_label('P225') # taxon name + add_value('P274') # chemical formula + add_label('P1346') # winner (sports, contests, ...) + add_value('P1120') # number of deaths + add_value('P498') # currency code (ISO 4217) + + # URL + add_url('P856', official=True) # official website + attributes.append(WDArticle(language)) # wikipedia (user language) + if not language.startswith('en'): + attributes.append(WDArticle('en')) # wikipedia (english) + + add_url('P1324') # source code repository + add_url('P1581') # blog + add_url('P434', url_id='musicbrainz_artist') + add_url('P435', url_id='musicbrainz_work') + add_url('P436', url_id='musicbrainz_release_group') + add_url('P966', url_id='musicbrainz_label') + add_url('P345', url_id='imdb_id') + add_url('P2397', url_id='youtube_channel') + add_url('P1651', url_id='youtube_video') + add_url('P2002', url_id='twitter_profile') + add_url('P2013', url_id='facebook_profile') + add_url('P2003', url_id='instagram_profile') + + # Map + attributes.append(WDGeoAttribute('P625')) + + # Image + add_image('P15', priority=1, url_id='wikimedia_image') # route map + add_image('P242', priority=2, url_id='wikimedia_image') # locator map + add_image('P154', priority=3, url_id='wikimedia_image') # logo + add_image('P18', priority=4, url_id='wikimedia_image') # image + add_image('P41', priority=5, url_id='wikimedia_image') # flag + add_image('P2716', priority=6, url_id='wikimedia_image') # collage + add_image('P2910', priority=7, url_id='wikimedia_image') # icon + + return attributes + + +class WDAttribute: + + __slots__ = 'name', + + def __init__(self, name): + self.name = name + + def get_select(self): + return '(group_concat(distinct ?{name};separator=", ") as ?{name}s)'.replace('{name}', self.name) + + def get_label(self, language): + return get_label_for_entity(self.name, language) + + def get_where(self): + return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace('{name}', self.name) + + def get_wikibase_label(self): + return "" + + def get_group_by(self): + return "" + + def get_str(self, result, language): + return result.get(self.name + 's') - if default_label: - label = default_label - else: - label = extract_text(eval_xpath(attribute, label_xpath)) - label = label[0].upper() + label[1:] - - if date: - trim = True - # remove calendar name - calendar_name = eval_xpath(attribute, calendar_name_xpath) - for calendar in calendar_name: - calendar.getparent().remove(calendar) - - concat_values = "" - values = [] - first_value = None - for row in eval_xpath(attribute, property_row_xpath): - if not first_value or not trim or eval_xpath(row, preferred_rank_xpath): - value = eval_xpath(row, value_xpath) - if not value: - continue - value = extract_text(value) - - # save first value in case no ranked row is found - if trim and not first_value: - first_value = value - else: - # to avoid duplicate values - if value not in values: - concat_values += value + ", " - values.append(value) - - if trim and not values: - attributes.append({'label': label, - 'value': first_value}) - else: - attributes.append({'label': label, - 'value': concat_values[:-2]}) + def __repr__(self): + return '<' + str(type(self).__name__) + ':' + self.name + '>' -# requires property_id unless it's a wiki link (defined in link_type) -def add_url(urls, result, id_cache, property_id=None, default_label=None, url_prefix=None, results=None, - link_type=None, only_first=True): - links = [] +class WDAmountAttribute(WDAttribute): - # wiki links don't have property in wikidata page - if link_type and 'wiki' in link_type: - links.append(get_wikilink(result, link_type)) - else: - dom_element = id_cache.get(property_id, None) - if dom_element is not None: - if not default_label: - label = extract_text(eval_xpath(dom_element, label_xpath)) - label = label[0].upper() + label[1:] + def get_select(self): + return '?{name} ?{name}Unit'.replace('{name}', self.name) - if link_type == 'geo': - links.append(get_geolink(dom_element)) + def get_where(self): + return """ OPTIONAL { ?item p:{name} ?{name}Node . + ?{name}Node rdf:type wikibase:BestRank ; ps:{name} ?{name} . + OPTIONAL { ?{name}Node psv:{name}/wikibase:quantityUnit ?{name}Unit. } }""".replace('{name}', self.name) - elif link_type == 'imdb': - links.append(get_imdblink(dom_element, url_prefix)) + def get_group_by(self): + return self.get_select() - else: - url_results = eval_xpath(dom_element, url_xpath) - for link in url_results: - if link is not None: - if url_prefix: - link = url_prefix + extract_text(link) - else: - link = extract_text(link) - links.append(link) - - # append urls - for url in links: - if url is not None: - u = {'title': default_label or label, 'url': url} - if property_id == 'P856': - u['official'] = True - u['domain'] = url.split('/')[2] - urls.append(u) - if results is not None: - results.append(u) - if only_first: - break - - -def get_imdblink(result, url_prefix): - imdb_id = eval_xpath(result, value_xpath) - if imdb_id: - imdb_id = extract_text(imdb_id) - id_prefix = imdb_id[:2] - if id_prefix == 'tt': - url = url_prefix + 'title/' + imdb_id - elif id_prefix == 'nm': - url = url_prefix + 'name/' + imdb_id - elif id_prefix == 'ch': - url = url_prefix + 'character/' + imdb_id - elif id_prefix == 'co': - url = url_prefix + 'company/' + imdb_id - elif id_prefix == 'ev': - url = url_prefix + 'event/' + imdb_id - else: - url = None - return url + def get_str(self, result, language): + value = result.get(self.name) + unit = result.get(self.name + "Unit") + if unit is not None: + unit = unit.replace('http://www.wikidata.org/entity/', '') + return value + " " + get_label_for_entity(unit, language) + return value -def get_geolink(result): - coordinates = eval_xpath(result, value_xpath) - if not coordinates: - return None - coordinates = extract_text(coordinates[0]) - latitude, longitude = coordinates.split(',') - - # convert to decimal - lat = int(latitude[:latitude.find('°')]) - if latitude.find('\'') >= 0: - lat += int(latitude[latitude.find('°') + 1:latitude.find('\'')] or 0) / 60.0 - if latitude.find('"') >= 0: - lat += float(latitude[latitude.find('\'') + 1:latitude.find('"')] or 0) / 3600.0 - if latitude.find('S') >= 0: - lat *= -1 - lon = int(longitude[:longitude.find('°')]) - if longitude.find('\'') >= 0: - lon += int(longitude[longitude.find('°') + 1:longitude.find('\'')] or 0) / 60.0 - if longitude.find('"') >= 0: - lon += float(longitude[longitude.find('\'') + 1:longitude.find('"')] or 0) / 3600.0 - if longitude.find('W') >= 0: - lon *= -1 - - # TODO: get precision - precision = 0.0002 - # there is no zoom information, deduce from precision (error prone) - # samples : - # 13 --> 5 - # 1 --> 6 - # 0.016666666666667 --> 9 - # 0.00027777777777778 --> 19 - # wolframalpha : - # quadratic fit { {13, 5}, {1, 6}, {0.0166666, 9}, {0.0002777777,19}} - # 14.1186-8.8322 x+0.625447 x^2 - if precision < 0.0003: - zoom = 19 - else: - zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447) +class WDArticle(WDAttribute): + + __slots__ = 'language', 'kwargs' + + def __init__(self, language, kwargs=None): + super().__init__('wikipedia') + self.language = language + self.kwargs = kwargs or {} + + def get_label(self, language): + # language parameter is ignored + return "Wikipedia ({language})".replace('{language}', self.language) + + def get_select(self): + return "?article{language} ?articleName{language}".replace('{language}', self.language) + + def get_where(self): + return """OPTIONAL { ?article{language} schema:about ?item ; + schema:inLanguage "{language}" ; + schema:isPartOf <https://{language}.wikipedia.org/> ; + schema:name ?articleName{language} . }""".replace('{language}', self.language) + + def get_group_by(self): + return self.get_select() + + def get_str(self, result, language): + key = 'article{language}'.replace('{language}', self.language) + return result.get(key) + + +class WDLabelAttribute(WDAttribute): + + def get_select(self): + return '(group_concat(distinct ?{name}Label;separator=", ") as ?{name}Labels)'.replace('{name}', self.name) - url = url_map\ - .replace('{latitude}', str(lat))\ - .replace('{longitude}', str(lon))\ - .replace('{zoom}', str(zoom)) + def get_where(self): + return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace('{name}', self.name) - return url + def get_wikibase_label(self): + return "?{name} rdfs:label ?{name}Label .".replace('{name}', self.name) + def get_str(self, result, language): + return result.get(self.name + 'Labels') -def get_wikilink(result, wikiid): - url = eval_xpath(result, wikilink_xpath.replace('{wikiid}', wikiid)) - if not url: + +class WDURLAttribute(WDAttribute): + + HTTP_WIKIMEDIA_IMAGE = 'http://commons.wikimedia.org/wiki/Special:FilePath/' + + __slots__ = 'url_id', 'kwargs' + + def __init__(self, name, url_id=None, kwargs=None): + super().__init__(name) + self.url_id = url_id + self.kwargs = kwargs + + def get_str(self, result, language): + value = result.get(self.name + 's') + if self.url_id and value is not None and value != '': + value = value.split(',')[0] + url_id = self.url_id + if value.startswith(WDURLAttribute.HTTP_WIKIMEDIA_IMAGE): + value = value[len(WDURLAttribute.HTTP_WIKIMEDIA_IMAGE):] + url_id = 'wikimedia_image' + return get_external_url(url_id, value) + return value + + +class WDGeoAttribute(WDAttribute): + + def get_label(self, language): + return "OpenStreetMap" + + def get_select(self): + return "?{name}Lat ?{name}Long".replace('{name}', self.name) + + def get_where(self): + return """OPTIONAL { ?item p:{name}/psv:{name} [ + wikibase:geoLatitude ?{name}Lat ; + wikibase:geoLongitude ?{name}Long ] }""".replace('{name}', self.name) + + def get_group_by(self): + return self.get_select() + + def get_str(self, result, language, osm_zoom=19): + latitude = result.get(self.name + 'Lat') + longitude = result.get(self.name + 'Long') + if latitude and longitude: + return get_earth_coordinates_url(latitude, longitude, osm_zoom) return None - url = url[0] - if url.startswith('http://'): - url = url.replace('http://', 'https://') - elif url.startswith('//'): - url = 'https:' + url - return url + + +class WDImageAttribute(WDURLAttribute): + + __slots__ = 'priority', + + def __init__(self, name, url_id=None, priority=100): + super().__init__(name, url_id) + self.priority = priority + + +class WDDateAttribute(WDAttribute): + + def get_select(self): + return '?{name} ?{name}timePrecision ?{name}timeZone ?{name}timeCalendar'.replace('{name}', self.name) + + def get_where(self): + # To remove duplicate, add + # FILTER NOT EXISTS { ?item p:{name}/psv:{name}/wikibase:timeValue ?{name}bis FILTER (?{name}bis < ?{name}) } + # this filter is too slow, so the response function ignore duplicate results + # (see the seen_entities variable) + return """OPTIONAL { ?item p:{name}/psv:{name} [ + wikibase:timeValue ?{name} ; + wikibase:timePrecision ?{name}timePrecision ; + wikibase:timeTimezone ?{name}timeZone ; + wikibase:timeCalendarModel ?{name}timeCalendar ] . } + hint:Prior hint:rangeSafe true;""".replace('{name}', self.name) + + def get_group_by(self): + return self.get_select() + + def format_8(self, value, locale): + # precision: less than a year + return value + + def format_9(self, value, locale): + year = int(value) + # precision: year + if year < 1584: + if year < 0: + return str(year - 1) + return str(year) + timestamp = isoparse(value) + return format_date(timestamp, format='yyyy', locale=locale) + + def format_10(self, value, locale): + # precision: month + timestamp = isoparse(value) + return format_date(timestamp, format='MMMM y', locale=locale) + + def format_11(self, value, locale): + # precision: day + timestamp = isoparse(value) + return format_date(timestamp, format='full', locale=locale) + + def format_13(self, value, locale): + timestamp = isoparse(value) + # precision: minute + return get_datetime_format(format, locale=locale) \ + .replace("'", "") \ + .replace('{0}', format_time(timestamp, 'full', tzinfo=None, + locale=locale)) \ + .replace('{1}', format_date(timestamp, 'short', locale=locale)) + + def format_14(self, value, locale): + # precision: second. + return format_datetime(isoparse(value), format='full', locale=locale) + + DATE_FORMAT = { + '0': ('format_8', 1000000000), + '1': ('format_8', 100000000), + '2': ('format_8', 10000000), + '3': ('format_8', 1000000), + '4': ('format_8', 100000), + '5': ('format_8', 10000), + '6': ('format_8', 1000), + '7': ('format_8', 100), + '8': ('format_8', 10), + '9': ('format_9', 1), # year + '10': ('format_10', 1), # month + '11': ('format_11', 0), # day + '12': ('format_13', 0), # hour (not supported by babel, display minute) + '13': ('format_13', 0), # minute + '14': ('format_14', 0) # second + } + + def get_str(self, result, language): + value = result.get(self.name) + if value == '' or value is None: + return None + precision = result.get(self.name + 'timePrecision') + date_format = WDDateAttribute.DATE_FORMAT.get(precision) + if date_format is not None: + format_method = getattr(self, date_format[0]) + precision = date_format[1] + try: + if precision >= 1: + t = value.split('-') + if value.startswith('-'): + value = '-' + t[1] + else: + value = t[0] + return format_method(value, language) + except Exception: + return value + return value + + +def debug_explain_wikidata_query(query, method='GET'): + if method == 'GET': + http_response = get(SPARQL_EXPLAIN_URL + '&' + urlencode({'query': query}), headers=get_headers()) + else: + http_response = post(SPARQL_EXPLAIN_URL, data={'query': query}, headers=get_headers()) + http_response.raise_for_status() + return http_response.content + + +def init(engine_settings=None): + # WIKIDATA_PROPERTIES : add unit symbols + WIKIDATA_PROPERTIES.update(WIKIDATA_UNITS) + + # WIKIDATA_PROPERTIES : add property labels + wikidata_property_names = [] + for attribute in get_attributes('en'): + if type(attribute) in (WDAttribute, WDAmountAttribute, WDURLAttribute, WDDateAttribute, WDLabelAttribute): + if attribute.name not in WIKIDATA_PROPERTIES: + wikidata_property_names.append("wd:" + attribute.name) + query = QUERY_PROPERTY_NAMES.replace('%ATTRIBUTES%', " ".join(wikidata_property_names)) + jsonresponse = send_wikidata_query(query) + for result in jsonresponse.get('results', {}).get('bindings', {}): + name = result['name']['value'] + lang = result['name']['xml:lang'] + entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '') + WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize() |