diff options
author | Noémi Ványi <kvch@users.noreply.github.com> | 2020-10-28 22:36:29 +0100 |
---|---|---|
committer | GitHub <noreply@github.com> | 2020-10-28 22:36:29 +0100 |
commit | 10ddd421f22c993a8cd3f4a02798dc3335c59709 (patch) | |
tree | 18acb415e3394a91e01ccbae1d757504792729b9 | |
parent | d3d50eff665f03c16adcb26a774b25b4fd5ade08 (diff) | |
parent | 95bd6033fad53b584ae5be54f2229a6edfb5b6a2 (diff) | |
download | searxng-10ddd421f22c993a8cd3f4a02798dc3335c59709.tar.gz searxng-10ddd421f22c993a8cd3f4a02798dc3335c59709.zip |
Merge pull request #2224 from dalf/update-infobox-engines
[enh] update infobox engines
-rw-r--r-- | searx/data/__init__.py | 5 | ||||
-rw-r--r-- | searx/data/external_urls.json | 156 | ||||
-rw-r--r-- | searx/data/wikidata_units.json | 1006 | ||||
-rw-r--r-- | searx/engines/duckduckgo_definitions.py | 227 | ||||
-rw-r--r-- | searx/engines/wikidata.py | 1089 | ||||
-rw-r--r-- | searx/external_urls.py | 77 | ||||
-rw-r--r-- | searx/results.py | 34 | ||||
-rw-r--r-- | searx/templates/oscar/infobox.html | 6 | ||||
-rw-r--r-- | searx/templates/simple/infobox.html | 1 | ||||
-rw-r--r-- | searx/utils.py | 10 | ||||
-rw-r--r-- | utils/fetch_wikidata_units.py | 47 |
11 files changed, 2120 insertions, 538 deletions
diff --git a/searx/data/__init__.py b/searx/data/__init__.py index 1116e5d47..55a254b13 100644 --- a/searx/data/__init__.py +++ b/searx/data/__init__.py @@ -2,7 +2,8 @@ import json from pathlib import Path -__init__ = ['ENGINES_LANGUGAGES', 'CURRENCIES', 'USER_AGENTS', 'bangs_loader', 'ahmia_blacklist_loader'] +__init__ = ['ENGINES_LANGUGAGES', 'CURRENCIES', 'USER_AGENTS', 'EXTERNAL_URLS', 'WIKIDATA_UNITS', + 'bangs_loader', 'ahmia_blacklist_loader'] data_dir = Path(__file__).parent @@ -24,3 +25,5 @@ def ahmia_blacklist_loader(): ENGINES_LANGUAGES = load('engines_languages.json') CURRENCIES = load('currencies.json') USER_AGENTS = load('useragents.json') +EXTERNAL_URLS = load('external_urls.json') +WIKIDATA_UNITS = load('wikidata_units.json') diff --git a/searx/data/external_urls.json b/searx/data/external_urls.json new file mode 100644 index 000000000..75b153aba --- /dev/null +++ b/searx/data/external_urls.json @@ -0,0 +1,156 @@ +{ + "facebook_profile": { + "category_name": "Facebook", + "url_name": "Facebook profile", + "urls": { + "default": "https://facebook.com/$1" + } + }, + "youtube_channel": { + "category_name": "YouTube", + "url_name": "YouTube channel", + "urls": { + "default": "https://www.youtube.com/channel/$1" + } + }, + "youtube_video": { + "category_name": "YouTube", + "url_name": "YouTube video", + "urls": { + "default": "https://www.youtube.com/watch?v=$1" + } + }, + "twitter_profile": { + "category_name": "Twitter", + "url_name": "Twitter profile", + "urls": { + "default": "https://twitter.com/$1" + } + }, + "instagram_profile": { + "category_name": "Instagram", + "url_name": "Instagram profile", + "urls": { + "default": "https://www.instagram.com/$1" + } + }, + "imdb_title": { + "category_name": "IMDB", + "url_name": "IMDB title", + "urls": { + "default": "https://www.imdb.com/title/$1" + } + }, + "imdb_name": { + "category_name": "IMDB", + "url_name": "IMDB name", + "urls": { + "default": "https://www.imdb.com/name/$1" + } + }, + "imdb_character": { + "category_name": "IMDB", + "url_name": "IMDB character", + "urls": { + "default": "https://www.imdb.com/character/$1" + } + }, + "imdb_company": { + "category_name": "IMDB", + "url_name": "IMDB company", + "urls": { + "default": "https://www.imdb.com/company/$1" + } + }, + "imdb_event": { + "category_name": "IMDB", + "url_name": "IMDB event", + "urls": { + "default": "https://www.imdb.com/event/$1" + } + }, + "rotten_tomatoes": { + "category_name": "Rotten tomatoes", + "url_name": "Rotten tomatoes title", + "urls": { + "default": "https://www.rottentomatoes.com/$1" + } + }, + "spotify_artist_id": { + "category_name": "Spotify", + "url_name": "Spotify artist", + "urls": { + "default": "https://open.spotify.com/artist/$1" + } + }, + "itunes_artist_id": { + "category_name": "iTunes", + "url_name": "iTunes artist", + "urls": { + "default": "https://music.apple.com/us/artist/$1" + } + }, + "soundcloud_id": { + "category_name": "Soundcloud", + "url_name": "Soundcloud artist", + "urls": { + "default": "https://soundcloud.com/$1" + } + }, + "netflix_id": { + "category_name": "Netflix", + "url_name": "Netflix movie", + "urls": { + "default": "https://www.netflix.com/watch/$1" + } + }, + "github_profile": { + "category_name": "Github", + "url_name": "Github profile", + "urls": { + "default": "https://wwww.github.com/$1" + } + }, + "musicbrainz_artist": { + "category_name": "Musicbrainz", + "url_name": "Musicbrainz artist", + "urls": { + "default": "http://musicbrainz.org/artist/$1" + } + }, + "musicbrainz_work": { + "category_name": "Musicbrainz", + "url_name": "Musicbrainz work", + "urls": { + "default": "http://musicbrainz.org/work/$1" + } + }, + "musicbrainz_release_group": { + "category_name": "Musicbrainz", + "url_name": "Musicbrainz release group", + "urls": { + "default": "http://musicbrainz.org/release-group/$1" + } + }, + "musicbrainz_label": { + "category_name": "Musicbrainz", + "url_name": "Musicbrainz label", + "urls": { + "default": "http://musicbrainz.org/label/$1" + } + }, + "wikimedia_image": { + "category_name": "Wikipedia", + "url_name": "Wikipedia image", + "urls": { + "default": "https://commons.wikimedia.org/wiki/Special:FilePath/$1?width=500&height=400" + } + }, + "map": { + "category_name": "Map", + "url_name": "geo map", + "urls": { + "default": "https://www.openstreetmap.org/?lat=${latitude}&lon=${longitude}&zoom=${zoom}&layers=M" + } + } +}
\ No newline at end of file diff --git a/searx/data/wikidata_units.json b/searx/data/wikidata_units.json new file mode 100644 index 000000000..966e5e812 --- /dev/null +++ b/searx/data/wikidata_units.json @@ -0,0 +1,1006 @@ +{ + "Q199": "1", + "Q100149279": "°We", + "Q100995": "lb", + "Q1022113": "cm³", + "Q102573": "Bq", + "Q103246": "Sv", + "Q103510": "bar", + "Q10380431": "TJ", + "Q1040401": "das", + "Q1040427": "hs", + "Q1042866": "Zibit", + "Q1050958": "inHg", + "Q1051665": "m/s²", + "Q1052397": "rad", + "Q1054140": "Mm", + "Q10543042": "Ym", + "Q1057069": "hg", + "Q1063756": "rad/s", + "Q1063786": "in²", + "Q1065153": "mrad", + "Q1066138": "Ps", + "Q1067722": "Fg", + "Q1069725": "p.", + "Q1084321": "Tb/s", + "Q1086691": "fg", + "Q1091257": "tex", + "Q1092296": "a", + "Q1104069": "CAD$", + "Q11061003": "μm²", + "Q11061005": "nm²", + "Q1131660": "st", + "Q1137675": "cr", + "Q1140444": "Zbit", + "Q1140577": "Ybit", + "Q1152074": "Pbit", + "Q1152323": "Tbit", + "Q1165799": "mil", + "Q11776930": "Mg", + "Q11830636": "psf", + "Q11929860": "kpc", + "Q1194225": "lbf", + "Q1194580": "Mibit", + "Q1195111": "Ebit", + "Q1196837": "ω_P", + "Q1197459": "Ms", + "Q11982285": "Em³", + "Q11982288": "Zm³", + "Q11982289": "Tm³", + "Q12011178": "Zs", + "Q1204894": "Gibit", + "Q12257695": "Eb/s", + "Q12257696": "EB/s", + "Q12261466": "kB/s", + "Q12265780": "Pb/s", + "Q12265783": "PB/s", + "Q12269121": "Yb/s", + "Q12269122": "YB/s", + "Q12269308": "Zb/s", + "Q12269309": "ZB/s", + "Q1247300": "cm H₂O", + "Q12714022": "sh cwt", + "Q12789864": "GeV", + "Q12874593": "W h", + "Q128822": "kn", + "Q13035094": "J/mol", + "Q130964": "cal", + "Q131255": "F", + "Q13147228": "g/cm³", + "Q1322380": "Ts", + "Q1323615": "oz t", + "Q132643": "kr", + "Q13400897": "g", + "Q13479685": "mm wg", + "Q1351253": "Eibit", + "Q1351334": "Pibit", + "Q13542672": "Ry", + "Q13548586": "THz", + "Q13582667": "kgf/cm²", + "Q1361854": "dwt", + "Q1363007": "slug", + "Q1374438": "ks", + "Q13753469": "MB/s", + "Q1377051": "Gs", + "Q1394540": "bm", + "Q1396128": "F", + "Q1413142": "Gb", + "Q14158377": "A_P", + "Q14623803": "MDa", + "Q14623804": "kDa", + "Q1472674": "Sv", + "Q14754979": "Zg", + "Q14786969": "MJ", + "Q14913554": "Ys", + "Q14914907": "th", + "Q14916719": "Gpc", + "Q14923662": "Pm³", + "Q1511773": "LSd", + "Q15120301": "l atm", + "Q1542309": "xu", + "Q1545979": "ft³", + "Q1550511": "yd²", + "Q15551713": "Sh", + "Q1569733": "St", + "Q15784325": "apc", + "Q160680": "Br", + "Q160857": "hp", + "Q1628990": "hph", + "Q163343": "T", + "Q163354": "H", + "Q1640501": "hyl", + "Q1645498": "μg", + "Q16859309": "lb·ft", + "Q169893": "S", + "Q170804": "Wb", + "Q17093295": "m/h", + "Q17255465": "v_P", + "Q173117": "R$", + "Q1741429": "kpm", + "Q174467": "Lm", + "Q174728": "cm", + "Q174789": "mm", + "Q175821": "μm", + "Q1768929": "p", + "Q1770733": "Tg", + "Q1772386": "dg", + "Q177493": "Gs", + "Q177612": "sr", + "Q1777507": "fs", + "Q177974": "atm", + "Q178506": "bbl", + "Q178674": "nm", + "Q1793863": "sn", + "Q179836": "lx", + "Q180154": "km/h", + "Q180892": "M☉", + "Q1815100": "cl", + "Q182098": "kWh", + "Q1823150": "μW", + "Q182429": "m/s", + "Q1826195": "dl", + "Q18413919": "cm/s", + "Q184172": "FF", + "Q185078": "a", + "Q185153": "erg", + "Q185648": "Torr", + "Q185759": "span", + "Q1872619": "zs", + "Q189097": "₧", + "Q190095": "Gy", + "Q190951": "S$", + "Q191118": "t", + "Q1913097": "fg", + "Q1916026": "μV", + "Q192027": "Bd", + "Q192274": "pm", + "Q193098": "KD", + "Q1935515": "mA s", + "Q19392152": "TL", + "Q193933": "dpt", + "Q194339": "B$", + "Q1970718": "mam", + "Q1972579": "pdl", + "Q199462": "LE", + "Q199471": "Afs", + "Q200323": "dm", + "Q200337": "Kz", + "Q201880": "LL", + "Q201933": "dyn", + "Q2029156": "quad", + "Q2029519": "hl", + "Q203567": "₦", + "Q2042279": "m H₂O", + "Q204737": "៛", + "Q2051195": "GWh", + "Q2055118": "ppb", + "Q2064166": "fc", + "Q206600": "ރ", + "Q20706220": "cmm", + "Q20706221": "dmm", + "Q2080811": "vol%", + "Q208526": "NT$", + "Q208528": "gon", + "Q208634": "kat", + "Q208788": "fm", + "Q209351": "b", + "Q209426": "′", + "Q21006887": "ppm", + "Q2100949": "P", + "Q21014455": "m/min", + "Q210472": "B/.", + "Q21061369": "g/kg", + "Q21062777": "MPa", + "Q21064807": "kPa", + "Q21064845": "mol/L", + "Q21075844": "ml/l", + "Q21077820": "mg/m³", + "Q21091747": "mg/kg", + "Q211256": "mph", + "Q211580": "BTU (th)", + "Q212120": "A h", + "Q2140397": "in³", + "Q214377": "ell", + "Q2143992": "kHz", + "Q21489891": "nm³", + "Q21489892": "Gm³", + "Q21489893": "Mm³", + "Q21489894": "μm³", + "Q21500224": "mas", + "Q2151240": "mag", + "Q215571": "N m", + "Q21604951": "g/m³", + "Q2165290": "yd³", + "Q216880": "kp", + "Q217208": "a", + "Q2175964": "dm³", + "Q218593": "in", + "Q2199357": "dec", + "Q22137107": "mas/y", + "Q2215478": "ppt", + "Q2221356": "mW h", + "Q22350885": "da", + "Q2243141": "Gb/s", + "Q2254856": "ca", + "Q22673229": "ft/min", + "Q2269250": "kb/s", + "Q2282891": "μl", + "Q2282906": "ng", + "Q229354": "Ci", + "Q232291": "mi²", + "Q2332346": "ml", + "Q23823681": "TW", + "Q23925410": "gal (UK)", + "Q23925413": "gal (US)", + "Q23931040": "dam²", + "Q23931103": "nmi²", + "Q2414435": "$b.", + "Q242988": "Lib$", + "Q2438073": "ag", + "Q2448803": "mV", + "Q2451296": "μF", + "Q246868": "lot", + "Q2474258": "mSv", + "Q2483628": "as", + "Q2489298": "cm²", + "Q249439": "q_P", + "Q2518569": "nSv", + "Q253276": "mi", + "Q25472681": "GB/s", + "Q25472693": "TB/s", + "Q25499149": "oct", + "Q25511288": "mb", + "Q2553708": "MV", + "Q2554092": "kV", + "Q259502": "AU$", + "Q260126": "rem", + "Q2612219": "Pg", + "Q261247": "ct", + "Q2619500": "foe", + "Q2636421": "nH", + "Q2637946": "dal", + "Q2642547": "ha", + "Q2652700": "Osm", + "Q2655272": "Eg", + "Q2659078": "TW h", + "Q2670039": "₶", + "Q26708069": "kcal", + "Q267391": "K", + "Q2679083": "μH", + "Q2682463": "nF", + "Q2691798": "cg", + "Q271206": "sud£", + "Q2737347": "mm²", + "Q2739114": "μSv", + "Q275112": "Bz$", + "Q2756030": "pF", + "Q2757753": "PW h", + "Q2762458": "ys", + "Q27864215": "μW h", + "Q2793566": "GV", + "Q27949241": "R", + "Q2799294": "Gg", + "Q281096": "cd/m²", + "Q28719934": "keV", + "Q28924752": "g/mol", + "Q28924753": "kg/mol", + "Q2924137": "mH", + "Q296936": "toe", + "Q29924639": "kVA", + "Q30001811": "aBq", + "Q30001813": "aC", + "Q30001814": "aHz", + "Q30001815": "aJ", + "Q30001816": "akat", + "Q30001818": "aL", + "Q30001819": "alm", + "Q30001820": "alx", + "Q30001822": "aN", + "Q30001823": "aΩ", + "Q30001825": "aPa", + "Q30001826": "arad", + "Q30001827": "aS", + "Q30001828": "aSv", + "Q30001829": "asr", + "Q30001830": "aT", + "Q30001831": "aV", + "Q30001832": "aW", + "Q30001833": "aWb", + "Q3013059": "kyr", + "Q3194304": "kbit", + "Q3207456": "mW", + "Q321017": "R", + "Q3221356": "ym", + "Q3239557": "pg", + "Q3241121": "mg", + "Q324923": "Hart", + "Q3249364": "cs", + "Q3251645": "ds", + "Q3267417": "Tm", + "Q3270676": "zm", + "Q32750621": "liq pt (US)", + "Q32750759": "fl oz (US)", + "Q32750816": "bu (US)", + "Q32751272": "dry pt (US)", + "Q32751296": "bbl (US)", + "Q3276763": "GHz", + "Q3277907": "Em", + "Q3277915": "Zm", + "Q3277919": "Pm", + "Q3312063": "fL", + "Q3320608": "kW", + "Q3331719": "dm²", + "Q3332689": "ToR", + "Q3332814": "Mbit", + "Q3396758": "daa", + "Q3414243": "rps", + "Q3421309": "R_J", + "Q3495543": "mbar", + "Q355198": "px", + "Q3674704": "km/s", + "Q3675550": "mm³", + "Q3712659": "$", + "Q376660": "nat", + "Q37732658": "°R", + "Q3773454": "Mpc", + "Q3815076": "Kibit", + "Q3833309": "£", + "Q3858002": "mA h", + "Q3867152": "ft/s²", + "Q389062": "Tibit", + "Q3902688": "pl", + "Q3902709": "ps", + "Q39360235": "US lea", + "Q39360471": "nl", + "Q39362962": "µin", + "Q39363132": "UK lg", + "Q39363209": "UK nl", + "Q39380159": "US nmi", + "Q39462789": "µin²", + "Q39467934": "kgf/m²", + "Q39469927": "N/m²", + "Q39617688": "cwt long", + "Q39617818": "t lb", + "Q39628023": "y", + "Q39699418": "cm/s²", + "Q39708248": "S", + "Q39709980": "bd", + "Q39710113": "bhp EDR", + "Q3972226": "kL", + "Q4041686": "iwg", + "Q4068266": "Ʒ", + "Q4176683": "aC", + "Q420266": "oz. fl.", + "Q42319606": "people/m²", + "Q4243638": "km³", + "Q4456994": "mF", + "Q469356": "tn. sh.", + "Q476572": "Ha", + "Q482798": "yd", + "Q483261": "Da", + "Q483725": "A.M.", + "Q484092": "lm", + "Q4861171": "H", + "Q494083": "fur", + "Q4989854": "kJ", + "Q500515": "Gal", + "Q5042194": "£", + "Q50808017": "kg m²", + "Q5139563": "hPa", + "Q514845": "pz", + "Q5195628": "hm³", + "Q5198770": "dam³", + "Q524410": "byr", + "Q53393488": "PHz", + "Q53393490": "EHz", + "Q53393494": "ZHz", + "Q53393498": "YHz", + "Q53393659": "ML", + "Q53393664": "GL", + "Q53393674": "ZL", + "Q53393678": "YL", + "Q53393771": "yL", + "Q53393868": "GJ", + "Q53393886": "PJ", + "Q53393890": "EJ", + "Q53448786": "yHz", + "Q53448790": "zHz", + "Q53448794": "fHz", + "Q53448797": "pHz", + "Q53448801": "nHz", + "Q53448806": "μHz", + "Q53448808": "mHz", + "Q53448813": "cHz", + "Q53448817": "dHz", + "Q53448820": "daHz", + "Q53448826": "hHz", + "Q53448828": "yJ", + "Q53448832": "zJ", + "Q53448842": "pJ", + "Q53448844": "nJ", + "Q53448847": "μJ", + "Q53448851": "mJ", + "Q53448856": "cJ", + "Q53448860": "dJ", + "Q53448864": "daJ", + "Q53448875": "hJ", + "Q53448879": "yPa", + "Q53448883": "zPa", + "Q53448886": "fPa", + "Q53448892": "pPa", + "Q53448897": "nPa", + "Q53448900": "μPa", + "Q53448906": "mPa", + "Q53448909": "cPa", + "Q53448914": "dPa", + "Q53448918": "daPa", + "Q53448922": "GPa", + "Q53448927": "TPa", + "Q53448931": "PPa", + "Q53448936": "EPa", + "Q53448939": "ZPa", + "Q53448943": "YPa", + "Q53448949": "yV", + "Q53448952": "zV", + "Q53448957": "fV", + "Q53448960": "pV", + "Q53448965": "nV", + "Q53448969": "cV", + "Q53448973": "dV", + "Q53448977": "daV", + "Q53448981": "hV", + "Q53448985": "TV", + "Q53448990": "PV", + "Q53448994": "EV", + "Q53448996": "ZV", + "Q53449001": "YV", + "Q53449006": "yW", + "Q53449008": "zW", + "Q53449013": "fW", + "Q53449018": "pW", + "Q53449021": "nW", + "Q53449025": "cW", + "Q53449029": "dW", + "Q53449033": "daW", + "Q53449036": "hW", + "Q53449040": "PW", + "Q53449045": "EW", + "Q53449049": "ZW", + "Q53449054": "YW", + "Q53561461": "wf", + "Q53561822": "wf", + "Q53651160": "zm³", + "Q53651201": "Ym³", + "Q53651356": "ym³", + "Q53651512": "pm³", + "Q53651713": "fm³", + "Q536785": "ρ_P", + "Q53951982": "Mt", + "Q53952048": "kt", + "Q54006645": "ZWb", + "Q54081925": "ZSv", + "Q54082468": "ZS", + "Q54083144": "ZΩ", + "Q54083318": "ZN", + "Q54083566": "Zlm", + "Q54083579": "Zlx", + "Q54083712": "ZBq", + "Q54083746": "ZC", + "Q54083766": "ZF", + "Q54083779": "ZGy", + "Q54083795": "ZH", + "Q54083813": "Zkat", + "Q5409016": "MVA", + "Q5465723": "ft-pdl", + "Q549389": "bit/s", + "Q550341": "V A", + "Q552299": "ch", + "Q55442349": "U/L", + "Q55726194": "mg/L", + "Q56156859": "mmol", + "Q56156949": "μmol", + "Q56157046": "nmol", + "Q56157048": "pmol", + "Q56160603": "fmol", + "Q56302633": "UM", + "Q56317116": "mgal", + "Q56317622": "Q_P", + "Q56318907": "kbar", + "Q56349362": "Bs.S", + "Q56402798": "kN", + "Q5711261": "am³", + "Q581432": "‴", + "Q5879479": "GW", + "Q6003257": "am", + "Q6009164": "MW h", + "Q6014364": "in/s", + "Q603071": "E°", + "Q605704": "doz", + "Q60742631": "AU/yr", + "Q608697": "Mx", + "Q610135": "G", + "Q613726": "Yg", + "Q6170164": "yg", + "Q6171168": "zg", + "Q61756607": "yd", + "Q61793198": "rd", + "Q61794766": "ch (US survey)", + "Q61994988": "Wth", + "Q61995006": "KWth", + "Q626299": "psi", + "Q630369": "var", + "Q636200": "U", + "Q640907": "sb", + "Q6414556": "kip", + "Q648908": "bya", + "Q64996135": "gal (US)/min", + "Q65028392": "mm/yr", + "Q651336": "M_J", + "Q6517513": "dag", + "Q667419": "UK t", + "Q681996": "M⊕", + "Q685662": "p_P", + "Q6859652": "mm Hg", + "Q686163": "$", + "Q68725821": "°Rø", + "Q68726230": "°De", + "Q68726625": "°N", + "Q69362731": "°C", + "Q69363953": "K", + "Q693944": "gr", + "Q6982035": "MW", + "Q69878540": "fl oz (UK)", + "Q70378044": "dmol", + "Q70378549": "dK", + "Q70393458": "kmol", + "Q70395375": "Tmol", + "Q70395643": "Mmol", + "Q70395830": "kK", + "Q70396179": "mK", + "Q70397275": "μK", + "Q70397725": "cmol", + "Q70397932": "cK", + "Q70398457": "nK", + "Q70398619": "MK", + "Q70398813": "Gmol", + "Q70398991": "GK", + "Q70440025": "daK", + "Q70440438": "hK", + "Q70440620": "damol", + "Q70440823": "hmol", + "Q70443020": "EK", + "Q70443154": "yK", + "Q70443282": "zK", + "Q70443367": "fK", + "Q70443453": "TK", + "Q70443757": "pK", + "Q70443901": "YK", + "Q70444029": "PK", + "Q70444141": "Emol", + "Q70444284": "ymol", + "Q70444386": "zmol", + "Q70444514": "Ymol", + "Q70444609": "Pmol", + "Q712226": "km²", + "Q72081071": "MeV", + "Q723733": "ms", + "Q730251": "ft·lbf", + "Q732707": "MHz", + "Q73408": "K", + "Q7350781": "Mb/s", + "Q743895": "bpm", + "Q748716": "ft/s", + "Q750178": "‱", + "Q752197": "kJ/mol", + "Q7672057": "TU", + "Q777017": "dBm", + "Q78754556": "rot", + "Q78756901": "rev", + "Q78757683": "windings", + "Q79726": "kB", + "Q79735": "MB", + "Q79738": "GB", + "Q79741": "TB", + "Q79744": "PB", + "Q79745": "EB", + "Q79747": "ZB", + "Q7974920": "W s", + "Q79752": "YB", + "Q79756": "KiB", + "Q79758": "MiB", + "Q79765": "GiB", + "Q79769": "TiB", + "Q79774": "PiB", + "Q79777": "EiB", + "Q79779": "ZiB", + "Q79781": "YiB", + "Q80237579": "J/nm", + "Q809678": "Ba", + "Q81062869": "W/nm", + "Q81073100": "W/(sr nm)", + "Q81292": "acre", + "Q81454": "Å", + "Q8229770": "B/s", + "Q828224": "km", + "Q829073": "\"", + "Q83216": "cd", + "Q83327": "eV", + "Q834105": "g/L", + "Q835916": "IU", + "Q838801": "ns", + "Q842015": "μs", + "Q842981": "thm (US)", + "Q844211": "kg/m³", + "Q844338": "hm", + "Q844976": "Oe", + "Q845958": "¥", + "Q848856": "dam", + "Q851872": "o", + "Q854546": "Gm", + "Q855161": "Yibit", + "Q856240": "ft³/min", + "Q857027": "ft²", + "Q85854198": "MN", + "Q864818": "abA", + "Q87262709": "kΩ", + "Q87416053": "MΩ", + "Q88296091": "tsp", + "Q9026416": "MWth", + "Q9048643": "nl", + "Q905912": "L", + "Q906223": "Es", + "Q909066": "at", + "Q911730": "nx", + "Q914151": "P_P", + "Q915169": "F_P", + "Q93318": "nmi", + "Q940052": "q", + "Q94076025": "dalm", + "Q94076717": "dakat", + "Q942092": "BWI$", + "Q94414053": "Prad", + "Q94414499": "PC", + "Q94415026": "Grad", + "Q94415255": "GC", + "Q94415438": "Yrad", + "Q94415526": "YC", + "Q94415782": "Mrad", + "Q94416260": "GN", + "Q94416535": "cN", + "Q94416879": "YN", + "Q94417138": "PN", + "Q94417481": "μGy", + "Q94417583": "μS", + "Q94417598": "μT", + "Q94417933": "μlm", + "Q94418102": "μN", + "Q94418220": "μsr", + "Q94418481": "μBq", + "Q94479580": "GΩ", + "Q94480021": "PΩ", + "Q94480081": "YΩ", + "Q94480128": "cΩ", + "Q94480131": "TΩ", + "Q94480136": "pΩ", + "Q94480254": "nΩ", + "Q94480476": "dΩ", + "Q94480633": "EΩ", + "Q94480967": "daΩ", + "Q94481176": "hΩ", + "Q94481339": "fΩ", + "Q94481646": "yΩ", + "Q94487174": "zΩ", + "Q94487366": "mΩ", + "Q94487561": "μΩ", + "Q94487750": "kGy", + "Q94488007": "klx", + "Q94488361": "MF", + "Q94488759": "GBq", + "Q94489041": "PBq", + "Q94489223": "YBq", + "Q94489429": "MBq", + "Q94489465": "kBq", + "Q94489476": "TBq", + "Q94489494": "kWb", + "Q94489520": "kS", + "Q94490951": "klm", + "Q94491129": "kkat", + "Q94634634": "cC", + "Q94634655": "MC", + "Q94634666": "kC", + "Q94634677": "TC", + "Q94634684": "μC", + "Q94634699": "mC", + "Q94693759": "csr", + "Q94693773": "msr", + "Q94693786": "mWb", + "Q94693805": "μWb", + "Q94693819": "GS", + "Q94693849": "cS", + "Q94693918": "MS", + "Q94694019": "TS", + "Q94694096": "pS", + "Q94694154": "nS", + "Q94694206": "mS", + "Q94731530": "mlm", + "Q94731808": "mkat", + "Q94731887": "μkat", + "Q94732218": "nkat", + "Q94732627": "pkat", + "Q94733432": "fkat", + "Q94733760": "cGy", + "Q94734107": "dGy", + "Q94734232": "mGy", + "Q94734359": "daGy", + "Q94734468": "aGy", + "Q94734527": "pGy", + "Q94734593": "nGy", + "Q94734689": "kT", + "Q94734788": "mT", + "Q94939947": "Gkat", + "Q94940018": "Pkat", + "Q94940081": "ykat", + "Q94940160": "dkat", + "Q94940232": "Ekat", + "Q94940295": "Ykat", + "Q94940582": "Tkat", + "Q94940892": "hkat", + "Q94941461": "zkat", + "Q94942602": "MGy", + "Q94942863": "GGy", + "Q94986863": "YWb", + "Q94986889": "PWb", + "Q94986906": "cWb", + "Q94986920": "GWb", + "Q94986942": "MWb", + "Q94986962": "TWb", + "Q95178536": "Mlm", + "Q95178777": "Tlm", + "Q95178881": "clm", + "Q95179024": "plm", + "Q95179137": "nlm", + "Q95179382": "hlm", + "Q95179467": "flm", + "Q95179608": "zlm", + "Q95179695": "Mkat", + "Q95179788": "ckat", + "Q95179882": "PGy", + "Q95377836": "PF", + "Q95377853": "YF", + "Q95378017": "kF", + "Q95378296": "TF", + "Q95379145": "cF", + "Q95379382": "GF", + "Q95379491": "daC", + "Q95379580": "hC", + "Q95379588": "dC", + "Q95379596": "EC", + "Q95445986": "nC", + "Q95446327": "pC", + "Q95446670": "fC", + "Q95447079": "zC", + "Q95447237": "yC", + "Q95447253": "fF", + "Q95447263": "zF", + "Q95447276": "aF", + "Q95447555": "dF", + "Q95447863": "EF", + "Q95448262": "yF", + "Q95448479": "hF", + "Q95448689": "daF", + "Q95448950": "kSv", + "Q95559229": "GSv", + "Q95559368": "YSv", + "Q95559441": "MSv", + "Q95559576": "TSv", + "Q95559603": "PSv", + "Q95609154": "nWb", + "Q95609210": "fWb", + "Q95609261": "zWb", + "Q95609291": "dWb", + "Q95609317": "EWb", + "Q95676212": "pWb", + "Q95676232": "yWb", + "Q95676243": "hWb", + "Q95676250": "daWb", + "Q95676257": "PS", + "Q95676260": "YS", + "Q95676273": "zS", + "Q95676275": "fS", + "Q95676279": "yS", + "Q95676287": "hS", + "Q95676291": "daS", + "Q95676297": "dS", + "Q95676298": "ES", + "Q95720731": "YGy", + "Q95720734": "TGy", + "Q95720736": "fGy", + "Q95720739": "yGy", + "Q95720741": "zGy", + "Q95720742": "EGy", + "Q95720746": "hGy", + "Q95720749": "mlx", + "Q95720758": "μlx", + "Q95720773": "dalx", + "Q95720777": "hlx", + "Q95720781": "dlx", + "Q95720786": "clx", + "Q95857671": "zSv", + "Q95859071": "fSv", + "Q95860960": "daSv", + "Q95861107": "hSv", + "Q95861296": "dSv", + "Q95862182": "ESv", + "Q95863358": "cSv", + "Q95863591": "ySv", + "Q95863894": "pSv", + "Q95864194": "zBq", + "Q95864378": "fBq", + "Q95864695": "daBq", + "Q95864940": "hBq", + "Q95865286": "dBq", + "Q95865530": "EBq", + "Q95865716": "cBq", + "Q95865877": "yBq", + "Q95866173": "pBq", + "Q95866344": "nBq", + "Q95866767": "mBq", + "Q95867993": "mN", + "Q95948345": "crad", + "Q95948364": "drad", + "Q95948734": "daN", + "Q95948739": "hN", + "Q95948747": "dN", + "Q95976839": "Plm", + "Q95976853": "Glm", + "Q95976869": "Ylm", + "Q95976889": "ylm", + "Q95976917": "dlm", + "Q95976919": "Elm", + "Q95976921": "nT", + "Q95993516": "TN", + "Q95993522": "nN", + "Q95993524": "fN", + "Q95993526": "yN", + "Q95993528": "zN", + "Q95993530": "EN", + "Q95993532": "pN", + "Q95993537": "μrad", + "Q95993542": "nrad", + "Q95993547": "frad", + "Q95993553": "prad", + "Q95993554": "darad", + "Q95993557": "hrad", + "Q95993619": "pT", + "Q96025401": "daT", + "Q96025405": "Trad", + "Q96025407": "Zrad", + "Q96025409": "zrad", + "Q96025413": "yrad", + "Q96025414": "Erad", + "Q96025419": "Ylx", + "Q96025422": "Glx", + "Q96025427": "Plx", + "Q96025431": "Mlx", + "Q96025433": "Tlx", + "Q96025435": "nlx", + "Q96025441": "flx", + "Q96050953": "GH", + "Q96051010": "PH", + "Q96051029": "YH", + "Q96051052": "cH", + "Q96051074": "TH", + "Q96051106": "MH", + "Q96051123": "kH", + "Q96051126": "fH", + "Q96051133": "yH", + "Q96051139": "hH", + "Q96051142": "dH", + "Q96051144": "EH", + "Q96051150": "pH", + "Q96051160": "daH", + "Q96051186": "zH", + "Q96051199": "aH", + "Q96051245": "ylx", + "Q96051267": "Elx", + "Q96051282": "plx", + "Q96051312": "zlx", + "Q96070067": "PT", + "Q96070074": "YT", + "Q96070076": "GT", + "Q96070087": "cT", + "Q96070103": "MT", + "Q96070125": "hT", + "Q96070145": "fT", + "Q96070174": "TT", + "Q96070195": "zT", + "Q96070247": "yT", + "Q96070254": "dT", + "Q96070264": "ET", + "Q96070276": "m°C", + "Q96070318": "dsr", + "Q96070329": "nsr", + "Q96070341": "psr", + "Q96095866": "fsr", + "Q96095897": "zsr", + "Q96095917": "ysr", + "Q96095927": "dasr", + "Q96095928": "hsr", + "Q96095931": "ksr", + "Q96095933": "Msr", + "Q96095939": "Gsr", + "Q96095941": "μ°C", + "Q96095955": "n°C", + "Q96095960": "k°C", + "Q96106290": "Tsr", + "Q96106298": "Psr", + "Q96106311": "Esr", + "Q96106319": "Zsr", + "Q96106332": "Ysr", + "Q96106346": "c°C", + "Q96106360": "d°C", + "Q96106368": "da°C", + "Q96106385": "h°C", + "Q96106393": "M°C", + "Q96236286": "G°C", + "Q97059641": "p°C", + "Q97059652": "T°C", + "Q97143826": "P°C", + "Q97143831": "y°C", + "Q97143835": "f°C", + "Q97143838": "Z°C", + "Q97143842": "E°C", + "Q97143843": "z°C", + "Q97143849": "Y°C", + "Q97143851": "a°C", + "Q98538634": "eV/m²", + "Q98635536": "eV/m", + "Q98642859": "eV m²/kg", + "Q11229": "%", + "Q11570": "kg", + "Q11573": "m", + "Q11574": "s", + "Q11579": "K", + "Q11582": "L", + "Q12129": "pc", + "Q12438": "N", + "Q16068": "DM", + "Q1811": "ua", + "Q20764": "Myr", + "Q2101": "e", + "Q25235": "h", + "Q25236": "W", + "Q25250": "V", + "Q25267": "°C", + "Q25269": "J", + "Q25272": "A", + "Q25343": "m²", + "Q25406": "C", + "Q25517": "m³", + "Q33680": "rad", + "Q35852": "ha", + "Q36384": "equiv", + "Q3710": "ft", + "Q39274": "Sv", + "Q39369": "Hz", + "Q41509": "mol", + "Q41803": "g", + "Q42289": "°F", + "Q4406": "TV$", + "Q44395": "Pa", + "Q4587": "Le", + "Q4588": "WS$", + "Q4592": "F$", + "Q4596": "Rs", + "Q4597": "$", + "Q47083": "Ω", + "Q48013": "oz", + "Q50094": "Np", + "Q50098": "B", + "Q531": "ly", + "Q5329": "dB", + "Q573": "d", + "Q577": "a", + "Q7727": "min", + "Q8799": "B" +}
\ No newline at end of file diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index 7ce54f056..f8bc44e46 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -12,28 +12,53 @@ DuckDuckGo (definitions) import json from urllib.parse import urlencode from lxml import html -from re import compile + +from searx import logger +from searx.data import WIKIDATA_UNITS from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases -from searx.utils import extract_text, html_to_text, match_language +from searx.utils import extract_text, html_to_text, match_language, get_string_replaces_function +from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom + + +logger = logger.getChild('duckduckgo_definitions') -url = 'https://api.duckduckgo.com/'\ +URL = 'https://api.duckduckgo.com/'\ + '?{query}&format=json&pretty=0&no_redirect=1&d=1' -http_regex = compile(r'^http:') +WIKIDATA_PREFIX = [ + 'http://www.wikidata.org/entity/', + 'https://www.wikidata.org/entity/' +] + +replace_http_by_https = get_string_replaces_function({'http:': 'https:'}) + + +def is_broken_text(text): + """ duckduckgo may return something like "<a href="xxxx">http://somewhere Related website<a/>" + The href URL is broken, the "Related website" may contains some HTML. -def result_to_text(url, text, htmlResult): + The best solution seems to ignore these results. + """ + return text.startswith('http') and ' ' in text + + +def result_to_text(text, htmlResult): # TODO : remove result ending with "Meaning" or "Category" + result = None dom = html.fromstring(htmlResult) a = dom.xpath('//a') if len(a) >= 1: - return extract_text(a[0]) + result = extract_text(a[0]) else: - return text + result = text + if not is_broken_text(result): + return result + return None def request(query, params): - params['url'] = url.format(query=urlencode({'q': query})) + params['url'] = URL.format(query=urlencode({'q': query})) language = match_language(params['language'], supported_languages, language_aliases) language = language.split('-')[0] params['headers']['Accept-Language'] = language @@ -45,6 +70,14 @@ def response(resp): search_res = json.loads(resp.text) + # search_res.get('Entity') possible values (not exhaustive) : + # * continent / country / department / location / waterfall + # * actor / musician / artist + # * book / performing art / film / television / media franchise / concert tour / playwright + # * prepared food + # * website / software / os / programming language / file format / software engineer + # * compagny + content = '' heading = search_res.get('Heading', '') attributes = [] @@ -55,7 +88,8 @@ def response(resp): # add answer if there is one answer = search_res.get('Answer', '') if answer: - if search_res.get('AnswerType', '') not in ['calc']: + logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer) + if search_res.get('AnswerType') not in ['calc', 'ip']: results.append({'answer': html_to_text(answer)}) # add infobox @@ -66,42 +100,36 @@ def response(resp): content = content + search_res.get('Abstract', '') # image - image = search_res.get('Image', '') + image = search_res.get('Image') image = None if image == '' else image - # attributes - if 'Infobox' in search_res: - infobox = search_res.get('Infobox', None) - if 'content' in infobox: - for info in infobox.get('content'): - attributes.append({'label': info.get('label'), - 'value': info.get('value')}) - # urls + # Official website, Wikipedia page for ddg_result in search_res.get('Results', []): - if 'FirstURL' in ddg_result: - firstURL = ddg_result.get('FirstURL', '') - text = ddg_result.get('Text', '') + firstURL = ddg_result.get('FirstURL') + text = ddg_result.get('Text') + if firstURL is not None and text is not None: urls.append({'title': text, 'url': firstURL}) results.append({'title': heading, 'url': firstURL}) # related topics for ddg_result in search_res.get('RelatedTopics', []): if 'FirstURL' in ddg_result: - suggestion = result_to_text(ddg_result.get('FirstURL', None), - ddg_result.get('Text', None), - ddg_result.get('Result', None)) - if suggestion != heading: - results.append({'suggestion': suggestion}) + firstURL = ddg_result.get('FirstURL') + text = ddg_result.get('Text') + if not is_broken_text(text): + suggestion = result_to_text(text, + ddg_result.get('Result')) + if suggestion != heading and suggestion is not None: + results.append({'suggestion': suggestion}) elif 'Topics' in ddg_result: suggestions = [] relatedTopics.append({'name': ddg_result.get('Name', ''), - 'suggestions': suggestions}) + 'suggestions': suggestions}) for topic_result in ddg_result.get('Topics', []): - suggestion = result_to_text(topic_result.get('FirstURL', None), - topic_result.get('Text', None), - topic_result.get('Result', None)) - if suggestion != heading: + suggestion = result_to_text(topic_result.get('Text'), + topic_result.get('Result')) + if suggestion != heading and suggestion is not None: suggestions.append(suggestion) # abstract @@ -110,7 +138,10 @@ def response(resp): # add as result ? problem always in english infobox_id = abstractURL urls.append({'title': search_res.get('AbstractSource'), - 'url': abstractURL}) + 'url': abstractURL, + 'official': True}) + results.append({'url': abstractURL, + 'title': heading}) # definition definitionURL = search_res.get('DefinitionURL', '') @@ -118,53 +149,107 @@ def response(resp): # add as result ? as answer ? problem always in english infobox_id = definitionURL urls.append({'title': search_res.get('DefinitionSource'), - 'url': definitionURL}) + 'url': definitionURL}) # to merge with wikidata's infobox if infobox_id: - infobox_id = http_regex.sub('https:', infobox_id) - - # entity - entity = search_res.get('Entity', None) - # TODO continent / country / department / location / waterfall / - # mountain range : - # link to map search, get weather, near by locations - # TODO musician : link to music search - # TODO concert tour : ?? - # TODO film / actor / television / media franchise : - # links to IMDB / rottentomatoes (or scrap result) - # TODO music : link tu musicbrainz / last.fm - # TODO book : ?? - # TODO artist / playwright : ?? - # TODO compagny : ?? - # TODO software / os : ?? - # TODO software engineer : ?? - # TODO prepared food : ?? - # TODO website : ?? - # TODO performing art : ?? - # TODO prepared food : ?? - # TODO programming language : ?? - # TODO file format : ?? + infobox_id = replace_http_by_https(infobox_id) + + # attributes + # some will be converted to urls + if 'Infobox' in search_res: + infobox = search_res.get('Infobox') + if 'content' in infobox: + osm_zoom = 17 + coordinates = None + for info in infobox.get('content'): + data_type = info.get('data_type') + data_label = info.get('label') + data_value = info.get('value') + + # Workaround: ddg may return a double quote + if data_value == '""': + continue + + # Is it an external URL ? + # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile + # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id + # * netflix_id + external_url = get_external_url(data_type, data_value) + if external_url is not None: + urls.append({'title': data_label, + 'url': external_url}) + elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']: + # ignore instance: Wikidata value from "Instance Of" (Qxxxx) + # ignore wiki_maps_trigger: reference to a javascript + # ignore google_play_artist_id: service shutdown + pass + elif data_type == 'string' and data_label == 'Website': + # There is already an URL for the website + pass + elif data_type == 'area': + attributes.append({'label': data_label, + 'value': area_to_str(data_value), + 'entity': 'P2046'}) + osm_zoom = area_to_osm_zoom(data_value.get('amount')) + elif data_type == 'coordinates': + if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2': + # coordinate on Earth + # get the zoom information from the area + coordinates = info + else: + # coordinate NOT on Earth + attributes.append({'label': data_label, + 'value': data_value, + 'entity': 'P625'}) + elif data_type == 'string': + attributes.append({'label': data_label, + 'value': data_value}) + + if coordinates: + data_label = coordinates.get('label') + data_value = coordinates.get('value') + latitude = data_value.get('latitude') + longitude = data_value.get('longitude') + url = get_earth_coordinates_url(latitude, longitude, osm_zoom) + urls.append({'title': 'OpenStreetMap', + 'url': url, + 'entity': 'P625'}) if len(heading) > 0: # TODO get infobox.meta.value where .label='article_title' if image is None and len(attributes) == 0 and len(urls) == 1 and\ len(relatedTopics) == 0 and len(content) == 0: - results.append({ - 'url': urls[0]['url'], - 'title': heading, - 'content': content - }) + results.append({'url': urls[0]['url'], + 'title': heading, + 'content': content}) else: - results.append({ - 'infobox': heading, - 'id': infobox_id, - 'entity': entity, - 'content': content, - 'img_src': image, - 'attributes': attributes, - 'urls': urls, - 'relatedTopics': relatedTopics - }) + results.append({'infobox': heading, + 'id': infobox_id, + 'content': content, + 'img_src': image, + 'attributes': attributes, + 'urls': urls, + 'relatedTopics': relatedTopics}) return results + + +def unit_to_str(unit): + for prefix in WIKIDATA_PREFIX: + if unit.startswith(prefix): + wikidata_entity = unit[len(prefix):] + return WIKIDATA_UNITS.get(wikidata_entity, unit) + return unit + + +def area_to_str(area): + """parse {'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}""" + unit = unit_to_str(area.get('unit')) + if unit is not None: + try: + amount = float(area.get('amount')) + return '{} {}'.format(amount, unit) + except ValueError: + pass + return '{} {}'.format(area.get('amount', ''), area.get('unit', '')) diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index c557f4e59..01e873de9 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -3,501 +3,686 @@ Wikidata @website https://wikidata.org - @provide-api yes (https://wikidata.org/w/api.php) + @provide-api yes (https://query.wikidata.org/) - @using-api partially (most things require scraping) - @results JSON, HTML - @stable no (html can change) + @using-api yes + @results JSON + @stable yes @parse url, infobox """ -from searx import logger -from searx.poolrequests import get -from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url -from searx.utils import extract_text, match_language, eval_xpath from urllib.parse import urlencode from json import loads -from lxml.html import fromstring -from lxml import etree + +from dateutil.parser import isoparse +from babel.dates import format_datetime, format_date, format_time, get_datetime_format + +from searx import logger +from searx.data import WIKIDATA_UNITS +from searx.poolrequests import post, get +from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url +from searx.utils import match_language, searx_useragent, get_string_replaces_function +from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom logger = logger.getChild('wikidata') -result_count = 1 - -# urls -wikidata_host = 'https://www.wikidata.org' -url_search = wikidata_host \ - + '/w/index.php?{query}&ns0=1' - -wikidata_api = wikidata_host + '/w/api.php' -url_detail = wikidata_api\ - + '?action=parse&format=json&{query}'\ - + '&redirects=1&prop=text%7Cdisplaytitle%7Cparsewarnings'\ - + '&disableeditsection=1&preview=1§ionpreview=1&disabletoc=1&utf8=1&formatversion=2' - -url_map = 'https://www.openstreetmap.org/'\ - + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M' -url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400' - -# xpaths -div_ids_xpath = '//div[@id]' -wikidata_ids_xpath = '//ul[@class="mw-search-results"]/li//a/@href' -title_xpath = '//*[contains(@class,"wikibase-title-label")]' -description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]' -label_xpath = './/div[contains(@class,"wikibase-statementgroupview-property-label")]/a' -url_xpath = './/a[contains(@class,"external free") or contains(@class, "wb-external-id")]' -wikilink_xpath = './/ul[contains(@class,"wikibase-sitelinklistview-listview")]'\ - + '/li[contains(@data-wb-siteid,"{wikiid}")]//a/@href' -property_row_xpath = './/div[contains(@class,"wikibase-statementview")]' -preferred_rank_xpath = './/span[contains(@class,"wikibase-rankselector-preferred")]' -value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\ - + '/*/div[contains(@class,"wikibase-snakview-value")]' -language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator")]' -calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]' -media_xpath = value_xpath + '//div[contains(@class,"commons-media-caption")]//a' - - -def get_id_cache(result): - id_cache = {} - for e in eval_xpath(result, div_ids_xpath): - id = e.get('id') - if id.startswith('P'): - id_cache[id] = e - return id_cache +# SPARQL +SPARQL_ENDPOINT_URL = 'https://query.wikidata.org/sparql' +SPARQL_EXPLAIN_URL = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql?explain' +WIKIDATA_PROPERTIES = { + 'P434': 'MusicBrainz', + 'P435': 'MusicBrainz', + 'P436': 'MusicBrainz', + 'P966': 'MusicBrainz', + 'P345': 'IMDb', + 'P2397': 'YouTube', + 'P1651': 'YouTube', + 'P2002': 'Twitter', + 'P2013': 'Facebook', + 'P2003': 'Instagram', +} + +# SERVICE wikibase:mwapi : https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI +# SERVICE wikibase:label: https://en.wikibooks.org/wiki/SPARQL/SERVICE_-_Label#Manual_Label_SERVICE +# https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates +# https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format#Data_model +# optmization: +# * https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization +# * https://github.com/blazegraph/database/wiki/QueryHints +QUERY_TEMPLATE = """ +SELECT ?item ?itemLabel ?itemDescription ?lat ?long %SELECT% +WHERE +{ + SERVICE wikibase:mwapi { + bd:serviceParam wikibase:endpoint "www.wikidata.org"; + wikibase:api "EntitySearch"; + wikibase:limit 1; + mwapi:search "%QUERY%"; + mwapi:language "%LANGUAGE%". + ?item wikibase:apiOutputItem mwapi:item. + } + + %WHERE% + + SERVICE wikibase:label { + bd:serviceParam wikibase:language "%LANGUAGE%,en". + ?item rdfs:label ?itemLabel . + ?item schema:description ?itemDescription . + %WIKIBASE_LABELS% + } + +} +GROUP BY ?item ?itemLabel ?itemDescription ?lat ?long %GROUP_BY% +""" -def request(query, params): - params['url'] = url_search.format( - query=urlencode({'search': query})) - return params +# Get the calendar names and the property names +QUERY_PROPERTY_NAMES = """ +SELECT ?item ?name +WHERE { + { + SELECT ?item + WHERE { ?item wdt:P279* wd:Q12132 } + } UNION { + VALUES ?item { %ATTRIBUTES% } + } + OPTIONAL { ?item rdfs:label ?name. } +} +""" -def response(resp): - results = [] - htmlparser = etree.HTMLParser() - html = fromstring(resp.content.decode(), parser=htmlparser) - search_results = eval_xpath(html, wikidata_ids_xpath) +# https://www.w3.org/TR/sparql11-query/#rSTRING_LITERAL1 +# https://lists.w3.org/Archives/Public/public-rdf-dawg/2011OctDec/0175.html +sparql_string_escape = get_string_replaces_function({'\t': '\\\t', + '\n': '\\\n', + '\r': '\\\r', + '\b': '\\\b', + '\f': '\\\f', + '\"': '\\\"', + '\'': '\\\'', + '\\': '\\\\'}) + +replace_http_by_https = get_string_replaces_function({'http:': 'https:'}) + + +def get_headers(): + # user agent: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits + return { + 'Accept': 'application/sparql-results+json', + 'User-Agent': searx_useragent() + } + + +def get_label_for_entity(entity_id, language): + name = WIKIDATA_PROPERTIES.get(entity_id) + if name is None: + name = WIKIDATA_PROPERTIES.get((entity_id, language)) + if name is None: + name = WIKIDATA_PROPERTIES.get((entity_id, language.split('-')[0])) + if name is None: + name = WIKIDATA_PROPERTIES.get((entity_id, 'en')) + if name is None: + name = entity_id + return name + + +def send_wikidata_query(query, method='GET'): + if method == 'GET': + # query will be cached by wikidata + http_response = get(SPARQL_ENDPOINT_URL + '?' + urlencode({'query': query}), headers=get_headers()) + else: + # query won't be cached by wikidata + http_response = post(SPARQL_ENDPOINT_URL, data={'query': query}, headers=get_headers()) + if http_response.status_code != 200: + logger.debug('SPARQL endpoint error %s', http_response.content.decode()) + logger.debug('request time %s', str(http_response.elapsed)) + http_response.raise_for_status() + return loads(http_response.content.decode()) + - if resp.search_params['language'].split('-')[0] == 'all': +def request(query, params): + language = params['language'].split('-')[0] + if language == 'all': language = 'en' else: - language = match_language(resp.search_params['language'], supported_languages, language_aliases).split('-')[0] + language = match_language(params['language'], supported_languages, language_aliases).split('-')[0] + + query, attributes = get_query(query, language) - # TODO: make requests asynchronous to avoid timeout when result_count > 1 - for search_result in search_results[:result_count]: - wikidata_id = search_result.split('/')[-1] - url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language})) - htmlresponse = get(url) - jsonresponse = loads(htmlresponse.content.decode()) - results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'], htmlparser) + params['method'] = 'POST' + params['url'] = SPARQL_ENDPOINT_URL + params['data'] = {'query': query} + params['headers'] = get_headers() + + params['language'] = language + params['attributes'] = attributes + return params + + +def response(resp): + results = [] + if resp.status_code != 200: + logger.debug('SPARQL endpoint error %s', resp.content.decode()) + resp.raise_for_status() + jsonresponse = loads(resp.content.decode()) + + language = resp.search_params['language'].lower() + attributes = resp.search_params['attributes'] + + seen_entities = set() + + for result in jsonresponse.get('results', {}).get('bindings', []): + attribute_result = {key: value['value'] for key, value in result.items()} + entity_url = attribute_result['item'] + if entity_url not in seen_entities: + seen_entities.add(entity_url) + results += get_results(attribute_result, attributes, language) + else: + logger.debug('The SPARQL request returns duplicate entities: %s', str(attribute_result)) return results -def getDetail(jsonresponse, wikidata_id, language, locale, htmlparser): +def get_results(attribute_result, attributes, language): results = [] - urls = [] - attributes = [] + infobox_title = attribute_result.get('itemLabel') + infobox_id = attribute_result['item'] + infobox_id_lang = None + infobox_urls = [] + infobox_attributes = [] + infobox_content = attribute_result.get('itemDescription') + img_src = None + img_src_priority = 100 + + for attribute in attributes: + value = attribute.get_str(attribute_result, language) + if value is not None and value != '': + attribute_type = type(attribute) + + if attribute_type in (WDURLAttribute, WDArticle): + # get_select() method : there is group_concat(distinct ...;separator=", ") + # split the value here + for url in value.split(', '): + infobox_urls.append({'title': attribute.get_label(language), 'url': url, **attribute.kwargs}) + # "normal" results (not infobox) include official website and Wikipedia links. + if attribute.kwargs.get('official') or attribute_type == WDArticle: + results.append({'title': infobox_title, 'url': url}) + # update the infobox_id with the wikipedia URL + # first the local wikipedia URL, and as fallback the english wikipedia URL + if attribute_type == WDArticle\ + and ((attribute.language == 'en' and infobox_id_lang is None) + or attribute.language != 'en'): + infobox_id_lang = attribute.language + infobox_id = url + elif attribute_type == WDImageAttribute: + # this attribute is an image. + # replace the current image only the priority is lower + # (the infobox contain only one image). + if attribute.priority < img_src_priority: + img_src = value + img_src_priority = attribute.priority + elif attribute_type == WDGeoAttribute: + # geocoordinate link + # use the area to get the OSM zoom + # Note: ignre the unit (must be km² otherwise the calculation is wrong) + # Should use normalized value p:P2046/psn:P2046/wikibase:quantityAmount + area = attribute_result.get('P2046') + osm_zoom = area_to_osm_zoom(area) if area else 19 + url = attribute.get_str(attribute_result, language, osm_zoom=osm_zoom) + if url: + infobox_urls.append({'title': attribute.get_label(language), + 'url': url, + 'entity': attribute.name}) + else: + infobox_attributes.append({'label': attribute.get_label(language), + 'value': value, + 'entity': attribute.name}) + + if infobox_id: + infobox_id = replace_http_by_https(infobox_id) - title = jsonresponse.get('parse', {}).get('displaytitle', {}) - result = jsonresponse.get('parse', {}).get('text', {}) - - if not title or not result: - return results - - title = fromstring(title, parser=htmlparser) - for elem in eval_xpath(title, language_fallback_xpath): - elem.getparent().remove(elem) - title = extract_text(eval_xpath(title, title_xpath)) - - result = fromstring(result, parser=htmlparser) - for elem in eval_xpath(result, language_fallback_xpath): - elem.getparent().remove(elem) - - description = extract_text(eval_xpath(result, description_xpath)) - - id_cache = get_id_cache(result) - - # URLS - - # official website - add_url(urls, result, id_cache, 'P856', results=results) - - # wikipedia - wikipedia_link_count = 0 - wikipedia_link = get_wikilink(result, language + 'wiki') - if wikipedia_link: - wikipedia_link_count += 1 - urls.append({'title': 'Wikipedia (' + language + ')', - 'url': wikipedia_link}) - - if language != 'en': - wikipedia_en_link = get_wikilink(result, 'enwiki') - if wikipedia_en_link: - wikipedia_link_count += 1 - urls.append({'title': 'Wikipedia (en)', - 'url': wikipedia_en_link}) - - # TODO: get_wiki_firstlanguage - # if wikipedia_link_count == 0: - - # more wikis - add_url(urls, result, id_cache, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage') - add_url(urls, result, id_cache, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote') - add_url(urls, result, id_cache, default_label='Wikimedia Commons', link_type='commonswiki') - - add_url(urls, result, id_cache, 'P625', 'OpenStreetMap', link_type='geo') - - # musicbrainz - add_url(urls, result, id_cache, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/') - add_url(urls, result, id_cache, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/') - add_url(urls, result, id_cache, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/') - add_url(urls, result, id_cache, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/') - - # IMDb - add_url(urls, result, id_cache, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb') - # source code repository - add_url(urls, result, id_cache, 'P1324') - # blog - add_url(urls, result, id_cache, 'P1581') - # social media links - add_url(urls, result, id_cache, 'P2397', 'YouTube', 'https://www.youtube.com/channel/') - add_url(urls, result, id_cache, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=') - add_url(urls, result, id_cache, 'P2002', 'Twitter', 'https://twitter.com/') - add_url(urls, result, id_cache, 'P2013', 'Facebook', 'https://facebook.com/') - add_url(urls, result, id_cache, 'P2003', 'Instagram', 'https://instagram.com/') - - urls.append({'title': 'Wikidata', - 'url': 'https://www.wikidata.org/wiki/' - + wikidata_id + '?uselang=' + language}) - - # INFOBOX ATTRIBUTES (ROWS) - - # DATES - # inception date - add_attribute(attributes, id_cache, 'P571', date=True) - # dissolution date - add_attribute(attributes, id_cache, 'P576', date=True) - # start date - add_attribute(attributes, id_cache, 'P580', date=True) - # end date - add_attribute(attributes, id_cache, 'P582', date=True) - # date of birth - add_attribute(attributes, id_cache, 'P569', date=True) - # date of death - add_attribute(attributes, id_cache, 'P570', date=True) - # date of spacecraft launch - add_attribute(attributes, id_cache, 'P619', date=True) - # date of spacecraft landing - add_attribute(attributes, id_cache, 'P620', date=True) - - # nationality - add_attribute(attributes, id_cache, 'P27') - # country of origin - add_attribute(attributes, id_cache, 'P495') - # country - add_attribute(attributes, id_cache, 'P17') - # headquarters - add_attribute(attributes, id_cache, 'Q180') - - # PLACES - # capital - add_attribute(attributes, id_cache, 'P36', trim=True) - # head of state - add_attribute(attributes, id_cache, 'P35', trim=True) - # head of government - add_attribute(attributes, id_cache, 'P6', trim=True) - # type of government - add_attribute(attributes, id_cache, 'P122') - # official language - add_attribute(attributes, id_cache, 'P37') - # population - add_attribute(attributes, id_cache, 'P1082', trim=True) - # area - add_attribute(attributes, id_cache, 'P2046') - # currency - add_attribute(attributes, id_cache, 'P38', trim=True) - # heigth (building) - add_attribute(attributes, id_cache, 'P2048') - - # MEDIA - # platform (videogames) - add_attribute(attributes, id_cache, 'P400') - # author - add_attribute(attributes, id_cache, 'P50') - # creator - add_attribute(attributes, id_cache, 'P170') - # director - add_attribute(attributes, id_cache, 'P57') - # performer - add_attribute(attributes, id_cache, 'P175') - # developer - add_attribute(attributes, id_cache, 'P178') - # producer - add_attribute(attributes, id_cache, 'P162') - # manufacturer - add_attribute(attributes, id_cache, 'P176') - # screenwriter - add_attribute(attributes, id_cache, 'P58') - # production company - add_attribute(attributes, id_cache, 'P272') - # record label - add_attribute(attributes, id_cache, 'P264') - # publisher - add_attribute(attributes, id_cache, 'P123') - # original network - add_attribute(attributes, id_cache, 'P449') - # distributor - add_attribute(attributes, id_cache, 'P750') - # composer - add_attribute(attributes, id_cache, 'P86') - # publication date - add_attribute(attributes, id_cache, 'P577', date=True) - # genre - add_attribute(attributes, id_cache, 'P136') - # original language - add_attribute(attributes, id_cache, 'P364') - # isbn - add_attribute(attributes, id_cache, 'Q33057') - # software license - add_attribute(attributes, id_cache, 'P275') - # programming language - add_attribute(attributes, id_cache, 'P277') - # version - add_attribute(attributes, id_cache, 'P348', trim=True) - # narrative location - add_attribute(attributes, id_cache, 'P840') - - # LANGUAGES - # number of speakers - add_attribute(attributes, id_cache, 'P1098') - # writing system - add_attribute(attributes, id_cache, 'P282') - # regulatory body - add_attribute(attributes, id_cache, 'P1018') - # language code - add_attribute(attributes, id_cache, 'P218') - - # OTHER - # ceo - add_attribute(attributes, id_cache, 'P169', trim=True) - # founder - add_attribute(attributes, id_cache, 'P112') - # legal form (company/organization) - add_attribute(attributes, id_cache, 'P1454') - # operator - add_attribute(attributes, id_cache, 'P137') - # crew members (tripulation) - add_attribute(attributes, id_cache, 'P1029') - # taxon - add_attribute(attributes, id_cache, 'P225') - # chemical formula - add_attribute(attributes, id_cache, 'P274') - # winner (sports/contests) - add_attribute(attributes, id_cache, 'P1346') - # number of deaths - add_attribute(attributes, id_cache, 'P1120') - # currency code - add_attribute(attributes, id_cache, 'P498') - - image = add_image(id_cache) - - if len(attributes) == 0 and len(urls) == 2 and len(description) == 0: + # add the wikidata URL at the end + infobox_urls.append({'title': 'Wikidata', 'url': attribute_result['item']}) + + if img_src is None and len(infobox_attributes) == 0 and len(infobox_urls) == 1 and\ + len(infobox_content) == 0: results.append({ - 'url': urls[0]['url'], - 'title': title, - 'content': description - }) + 'url': infobox_urls[0]['url'], + 'title': infobox_title, + 'content': infobox_content + }) else: results.append({ - 'infobox': title, - 'id': wikipedia_link, - 'content': description, - 'img_src': image, - 'attributes': attributes, - 'urls': urls - }) - + 'infobox': infobox_title, + 'id': infobox_id, + 'content': infobox_content, + 'img_src': img_src, + 'urls': infobox_urls, + 'attributes': infobox_attributes + }) return results -# only returns first match -def add_image(id_cache): - # P15: route map, P242: locator map, P154: logo, P18: image, P242: map, P41: flag, P2716: collage, P2910: icon - property_ids = ['P15', 'P242', 'P154', 'P18', 'P242', 'P41', 'P2716', 'P2910'] +def get_query(query, language): + attributes = get_attributes(language) + select = [a.get_select() for a in attributes] + where = list(filter(lambda s: len(s) > 0, [a.get_where() for a in attributes])) + wikibase_label = list(filter(lambda s: len(s) > 0, [a.get_wikibase_label() for a in attributes])) + group_by = list(filter(lambda s: len(s) > 0, [a.get_group_by() for a in attributes])) + query = QUERY_TEMPLATE\ + .replace('%QUERY%', sparql_string_escape(query))\ + .replace('%SELECT%', ' '.join(select))\ + .replace('%WHERE%', '\n '.join(where))\ + .replace('%WIKIBASE_LABELS%', '\n '.join(wikibase_label))\ + .replace('%GROUP_BY%', ' '.join(group_by))\ + .replace('%LANGUAGE%', language) + return query, attributes - for property_id in property_ids: - image = id_cache.get(property_id, None) - if image is not None: - image_name = eval_xpath(image, media_xpath) - image_src = url_image.replace('{filename}', extract_text(image_name[0])) - return image_src +def get_attributes(language): + attributes = [] -# setting trim will only returned high ranked rows OR the first row -def add_attribute(attributes, id_cache, property_id, default_label=None, date=False, trim=False): - attribute = id_cache.get(property_id, None) - if attribute is not None: + def add_value(name): + attributes.append(WDAttribute(name)) + + def add_amount(name): + attributes.append(WDAmountAttribute(name)) + + def add_label(name): + attributes.append(WDLabelAttribute(name)) + + def add_url(name, url_id=None, **kwargs): + attributes.append(WDURLAttribute(name, url_id, kwargs)) + + def add_image(name, url_id=None, priority=1): + attributes.append(WDImageAttribute(name, url_id, priority)) + + def add_date(name): + attributes.append(WDDateAttribute(name)) + + # Dates + for p in ['P571', # inception date + 'P576', # dissolution date + 'P580', # start date + 'P582', # end date + 'P569', # date of birth + 'P570', # date of death + 'P619', # date of spacecraft launch + 'P620']: # date of spacecraft landing + add_date(p) + + for p in ['P27', # country of citizenship + 'P495', # country of origin + 'P17', # country + 'P159']: # headquarters location + add_label(p) + + # Places + for p in ['P36', # capital + 'P35', # head of state + 'P6', # head of government + 'P122', # basic form of government + 'P37']: # official language + add_label(p) + + add_value('P1082') # population + add_amount('P2046') # area + add_amount('P281') # postal code + add_label('P38') # currency + add_amount('P2048') # heigth (building) + + # Media + for p in ['P400', # platform (videogames, computing) + 'P50', # author + 'P170', # creator + 'P57', # director + 'P175', # performer + 'P178', # developer + 'P162', # producer + 'P176', # manufacturer + 'P58', # screenwriter + 'P272', # production company + 'P264', # record label + 'P123', # publisher + 'P449', # original network + 'P750', # distributed by + 'P86']: # composer + add_label(p) + + add_date('P577') # publication date + add_label('P136') # genre (music, film, artistic...) + add_label('P364') # original language + add_value('P212') # ISBN-13 + add_value('P957') # ISBN-10 + add_label('P275') # copyright license + add_label('P277') # programming language + add_value('P348') # version + add_label('P840') # narrative location + + # Languages + add_value('P1098') # number of speakers + add_label('P282') # writing system + add_label('P1018') # language regulatory body + add_value('P218') # language code (ISO 639-1) + + # Other + add_label('P169') # ceo + add_label('P112') # founded by + add_label('P1454') # legal form (company, organization) + add_label('P137') # operator (service, facility, ...) + add_label('P1029') # crew members (tripulation) + add_label('P225') # taxon name + add_value('P274') # chemical formula + add_label('P1346') # winner (sports, contests, ...) + add_value('P1120') # number of deaths + add_value('P498') # currency code (ISO 4217) + + # URL + add_url('P856', official=True) # official website + attributes.append(WDArticle(language)) # wikipedia (user language) + if not language.startswith('en'): + attributes.append(WDArticle('en')) # wikipedia (english) + + add_url('P1324') # source code repository + add_url('P1581') # blog + add_url('P434', url_id='musicbrainz_artist') + add_url('P435', url_id='musicbrainz_work') + add_url('P436', url_id='musicbrainz_release_group') + add_url('P966', url_id='musicbrainz_label') + add_url('P345', url_id='imdb_id') + add_url('P2397', url_id='youtube_channel') + add_url('P1651', url_id='youtube_video') + add_url('P2002', url_id='twitter_profile') + add_url('P2013', url_id='facebook_profile') + add_url('P2003', url_id='instagram_profile') + + # Map + attributes.append(WDGeoAttribute('P625')) + + # Image + add_image('P15', priority=1, url_id='wikimedia_image') # route map + add_image('P242', priority=2, url_id='wikimedia_image') # locator map + add_image('P154', priority=3, url_id='wikimedia_image') # logo + add_image('P18', priority=4, url_id='wikimedia_image') # image + add_image('P41', priority=5, url_id='wikimedia_image') # flag + add_image('P2716', priority=6, url_id='wikimedia_image') # collage + add_image('P2910', priority=7, url_id='wikimedia_image') # icon + + return attributes + + +class WDAttribute: + + __slots__ = 'name', + + def __init__(self, name): + self.name = name + + def get_select(self): + return '(group_concat(distinct ?{name};separator=", ") as ?{name}s)'.replace('{name}', self.name) + + def get_label(self, language): + return get_label_for_entity(self.name, language) + + def get_where(self): + return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace('{name}', self.name) + + def get_wikibase_label(self): + return "" + + def get_group_by(self): + return "" + + def get_str(self, result, language): + return result.get(self.name + 's') - if default_label: - label = default_label - else: - label = extract_text(eval_xpath(attribute, label_xpath)) - label = label[0].upper() + label[1:] - - if date: - trim = True - # remove calendar name - calendar_name = eval_xpath(attribute, calendar_name_xpath) - for calendar in calendar_name: - calendar.getparent().remove(calendar) - - concat_values = "" - values = [] - first_value = None - for row in eval_xpath(attribute, property_row_xpath): - if not first_value or not trim or eval_xpath(row, preferred_rank_xpath): - value = eval_xpath(row, value_xpath) - if not value: - continue - value = extract_text(value) - - # save first value in case no ranked row is found - if trim and not first_value: - first_value = value - else: - # to avoid duplicate values - if value not in values: - concat_values += value + ", " - values.append(value) - - if trim and not values: - attributes.append({'label': label, - 'value': first_value}) - else: - attributes.append({'label': label, - 'value': concat_values[:-2]}) + def __repr__(self): + return '<' + str(type(self).__name__) + ':' + self.name + '>' -# requires property_id unless it's a wiki link (defined in link_type) -def add_url(urls, result, id_cache, property_id=None, default_label=None, url_prefix=None, results=None, - link_type=None, only_first=True): - links = [] +class WDAmountAttribute(WDAttribute): - # wiki links don't have property in wikidata page - if link_type and 'wiki' in link_type: - links.append(get_wikilink(result, link_type)) - else: - dom_element = id_cache.get(property_id, None) - if dom_element is not None: - if not default_label: - label = extract_text(eval_xpath(dom_element, label_xpath)) - label = label[0].upper() + label[1:] + def get_select(self): + return '?{name} ?{name}Unit'.replace('{name}', self.name) - if link_type == 'geo': - links.append(get_geolink(dom_element)) + def get_where(self): + return """ OPTIONAL { ?item p:{name} ?{name}Node . + ?{name}Node rdf:type wikibase:BestRank ; ps:{name} ?{name} . + OPTIONAL { ?{name}Node psv:{name}/wikibase:quantityUnit ?{name}Unit. } }""".replace('{name}', self.name) - elif link_type == 'imdb': - links.append(get_imdblink(dom_element, url_prefix)) + def get_group_by(self): + return self.get_select() - else: - url_results = eval_xpath(dom_element, url_xpath) - for link in url_results: - if link is not None: - if url_prefix: - link = url_prefix + extract_text(link) - else: - link = extract_text(link) - links.append(link) - - # append urls - for url in links: - if url is not None: - u = {'title': default_label or label, 'url': url} - if property_id == 'P856': - u['official'] = True - u['domain'] = url.split('/')[2] - urls.append(u) - if results is not None: - results.append(u) - if only_first: - break - - -def get_imdblink(result, url_prefix): - imdb_id = eval_xpath(result, value_xpath) - if imdb_id: - imdb_id = extract_text(imdb_id) - id_prefix = imdb_id[:2] - if id_prefix == 'tt': - url = url_prefix + 'title/' + imdb_id - elif id_prefix == 'nm': - url = url_prefix + 'name/' + imdb_id - elif id_prefix == 'ch': - url = url_prefix + 'character/' + imdb_id - elif id_prefix == 'co': - url = url_prefix + 'company/' + imdb_id - elif id_prefix == 'ev': - url = url_prefix + 'event/' + imdb_id - else: - url = None - return url + def get_str(self, result, language): + value = result.get(self.name) + unit = result.get(self.name + "Unit") + if unit is not None: + unit = unit.replace('http://www.wikidata.org/entity/', '') + return value + " " + get_label_for_entity(unit, language) + return value -def get_geolink(result): - coordinates = eval_xpath(result, value_xpath) - if not coordinates: - return None - coordinates = extract_text(coordinates[0]) - latitude, longitude = coordinates.split(',') - - # convert to decimal - lat = int(latitude[:latitude.find('°')]) - if latitude.find('\'') >= 0: - lat += int(latitude[latitude.find('°') + 1:latitude.find('\'')] or 0) / 60.0 - if latitude.find('"') >= 0: - lat += float(latitude[latitude.find('\'') + 1:latitude.find('"')] or 0) / 3600.0 - if latitude.find('S') >= 0: - lat *= -1 - lon = int(longitude[:longitude.find('°')]) - if longitude.find('\'') >= 0: - lon += int(longitude[longitude.find('°') + 1:longitude.find('\'')] or 0) / 60.0 - if longitude.find('"') >= 0: - lon += float(longitude[longitude.find('\'') + 1:longitude.find('"')] or 0) / 3600.0 - if longitude.find('W') >= 0: - lon *= -1 - - # TODO: get precision - precision = 0.0002 - # there is no zoom information, deduce from precision (error prone) - # samples : - # 13 --> 5 - # 1 --> 6 - # 0.016666666666667 --> 9 - # 0.00027777777777778 --> 19 - # wolframalpha : - # quadratic fit { {13, 5}, {1, 6}, {0.0166666, 9}, {0.0002777777,19}} - # 14.1186-8.8322 x+0.625447 x^2 - if precision < 0.0003: - zoom = 19 - else: - zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447) +class WDArticle(WDAttribute): + + __slots__ = 'language', 'kwargs' + + def __init__(self, language, kwargs=None): + super().__init__('wikipedia') + self.language = language + self.kwargs = kwargs or {} + + def get_label(self, language): + # language parameter is ignored + return "Wikipedia ({language})".replace('{language}', self.language) + + def get_select(self): + return "?article{language} ?articleName{language}".replace('{language}', self.language) + + def get_where(self): + return """OPTIONAL { ?article{language} schema:about ?item ; + schema:inLanguage "{language}" ; + schema:isPartOf <https://{language}.wikipedia.org/> ; + schema:name ?articleName{language} . }""".replace('{language}', self.language) + + def get_group_by(self): + return self.get_select() + + def get_str(self, result, language): + key = 'article{language}'.replace('{language}', self.language) + return result.get(key) + + +class WDLabelAttribute(WDAttribute): + + def get_select(self): + return '(group_concat(distinct ?{name}Label;separator=", ") as ?{name}Labels)'.replace('{name}', self.name) - url = url_map\ - .replace('{latitude}', str(lat))\ - .replace('{longitude}', str(lon))\ - .replace('{zoom}', str(zoom)) + def get_where(self): + return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace('{name}', self.name) - return url + def get_wikibase_label(self): + return "?{name} rdfs:label ?{name}Label .".replace('{name}', self.name) + def get_str(self, result, language): + return result.get(self.name + 'Labels') -def get_wikilink(result, wikiid): - url = eval_xpath(result, wikilink_xpath.replace('{wikiid}', wikiid)) - if not url: + +class WDURLAttribute(WDAttribute): + + HTTP_WIKIMEDIA_IMAGE = 'http://commons.wikimedia.org/wiki/Special:FilePath/' + + __slots__ = 'url_id', 'kwargs' + + def __init__(self, name, url_id=None, kwargs=None): + super().__init__(name) + self.url_id = url_id + self.kwargs = kwargs + + def get_str(self, result, language): + value = result.get(self.name + 's') + if self.url_id and value is not None and value != '': + value = value.split(',')[0] + url_id = self.url_id + if value.startswith(WDURLAttribute.HTTP_WIKIMEDIA_IMAGE): + value = value[len(WDURLAttribute.HTTP_WIKIMEDIA_IMAGE):] + url_id = 'wikimedia_image' + return get_external_url(url_id, value) + return value + + +class WDGeoAttribute(WDAttribute): + + def get_label(self, language): + return "OpenStreetMap" + + def get_select(self): + return "?{name}Lat ?{name}Long".replace('{name}', self.name) + + def get_where(self): + return """OPTIONAL { ?item p:{name}/psv:{name} [ + wikibase:geoLatitude ?{name}Lat ; + wikibase:geoLongitude ?{name}Long ] }""".replace('{name}', self.name) + + def get_group_by(self): + return self.get_select() + + def get_str(self, result, language, osm_zoom=19): + latitude = result.get(self.name + 'Lat') + longitude = result.get(self.name + 'Long') + if latitude and longitude: + return get_earth_coordinates_url(latitude, longitude, osm_zoom) return None - url = url[0] - if url.startswith('http://'): - url = url.replace('http://', 'https://') - elif url.startswith('//'): - url = 'https:' + url - return url + + +class WDImageAttribute(WDURLAttribute): + + __slots__ = 'priority', + + def __init__(self, name, url_id=None, priority=100): + super().__init__(name, url_id) + self.priority = priority + + +class WDDateAttribute(WDAttribute): + + def get_select(self): + return '?{name} ?{name}timePrecision ?{name}timeZone ?{name}timeCalendar'.replace('{name}', self.name) + + def get_where(self): + # To remove duplicate, add + # FILTER NOT EXISTS { ?item p:{name}/psv:{name}/wikibase:timeValue ?{name}bis FILTER (?{name}bis < ?{name}) } + # this filter is too slow, so the response function ignore duplicate results + # (see the seen_entities variable) + return """OPTIONAL { ?item p:{name}/psv:{name} [ + wikibase:timeValue ?{name} ; + wikibase:timePrecision ?{name}timePrecision ; + wikibase:timeTimezone ?{name}timeZone ; + wikibase:timeCalendarModel ?{name}timeCalendar ] . } + hint:Prior hint:rangeSafe true;""".replace('{name}', self.name) + + def get_group_by(self): + return self.get_select() + + def format_8(self, value, locale): + # precision: less than a year + return value + + def format_9(self, value, locale): + year = int(value) + # precision: year + if year < 1584: + if year < 0: + return str(year - 1) + return str(year) + timestamp = isoparse(value) + return format_date(timestamp, format='yyyy', locale=locale) + + def format_10(self, value, locale): + # precision: month + timestamp = isoparse(value) + return format_date(timestamp, format='MMMM y', locale=locale) + + def format_11(self, value, locale): + # precision: day + timestamp = isoparse(value) + return format_date(timestamp, format='full', locale=locale) + + def format_13(self, value, locale): + timestamp = isoparse(value) + # precision: minute + return get_datetime_format(format, locale=locale) \ + .replace("'", "") \ + .replace('{0}', format_time(timestamp, 'full', tzinfo=None, + locale=locale)) \ + .replace('{1}', format_date(timestamp, 'short', locale=locale)) + + def format_14(self, value, locale): + # precision: second. + return format_datetime(isoparse(value), format='full', locale=locale) + + DATE_FORMAT = { + '0': ('format_8', 1000000000), + '1': ('format_8', 100000000), + '2': ('format_8', 10000000), + '3': ('format_8', 1000000), + '4': ('format_8', 100000), + '5': ('format_8', 10000), + '6': ('format_8', 1000), + '7': ('format_8', 100), + '8': ('format_8', 10), + '9': ('format_9', 1), # year + '10': ('format_10', 1), # month + '11': ('format_11', 0), # day + '12': ('format_13', 0), # hour (not supported by babel, display minute) + '13': ('format_13', 0), # minute + '14': ('format_14', 0) # second + } + + def get_str(self, result, language): + value = result.get(self.name) + if value == '' or value is None: + return None + precision = result.get(self.name + 'timePrecision') + date_format = WDDateAttribute.DATE_FORMAT.get(precision) + if date_format is not None: + format_method = getattr(self, date_format[0]) + precision = date_format[1] + try: + if precision >= 1: + t = value.split('-') + if value.startswith('-'): + value = '-' + t[1] + else: + value = t[0] + return format_method(value, language) + except Exception: + return value + return value + + +def debug_explain_wikidata_query(query, method='GET'): + if method == 'GET': + http_response = get(SPARQL_EXPLAIN_URL + '&' + urlencode({'query': query}), headers=get_headers()) + else: + http_response = post(SPARQL_EXPLAIN_URL, data={'query': query}, headers=get_headers()) + http_response.raise_for_status() + return http_response.content + + +def init(engine_settings=None): + # WIKIDATA_PROPERTIES : add unit symbols + WIKIDATA_PROPERTIES.update(WIKIDATA_UNITS) + + # WIKIDATA_PROPERTIES : add property labels + wikidata_property_names = [] + for attribute in get_attributes('en'): + if type(attribute) in (WDAttribute, WDAmountAttribute, WDURLAttribute, WDDateAttribute, WDLabelAttribute): + if attribute.name not in WIKIDATA_PROPERTIES: + wikidata_property_names.append("wd:" + attribute.name) + query = QUERY_PROPERTY_NAMES.replace('%ATTRIBUTES%', " ".join(wikidata_property_names)) + jsonresponse = send_wikidata_query(query) + for result in jsonresponse.get('results', {}).get('bindings', {}): + name = result['name']['value'] + lang = result['name']['xml:lang'] + entity_id = result['item']['value'].replace('http://www.wikidata.org/entity/', '') + WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize() diff --git a/searx/external_urls.py b/searx/external_urls.py new file mode 100644 index 000000000..da58b8f54 --- /dev/null +++ b/searx/external_urls.py @@ -0,0 +1,77 @@ +import math + +from searx.data import EXTERNAL_URLS + + +IMDB_PREFIX_TO_URL_ID = { + 'tt': 'imdb_title', + 'mn': 'imdb_name', + 'ch': 'imdb_character', + 'co': 'imdb_company', + 'ev': 'imdb_event' +} + + +def get_imdb_url_id(imdb_item_id): + id_prefix = imdb_item_id[:2] + return IMDB_PREFIX_TO_URL_ID.get(id_prefix) + + +def get_external_url(url_id, item_id, alternative="default"): + """Return an external URL or None if url_id is not found. + + url_id can take value from data/external_urls.json + The "imdb_id" value is automaticaly converted according to the item_id value. + + If item_id is None, the raw URL with the $1 is returned. + """ + if url_id == 'imdb_id' and item_id is not None: + url_id = get_imdb_url_id(item_id) + + url_description = EXTERNAL_URLS.get(url_id) + if url_description: + url_template = url_description["urls"].get(alternative) + if url_template is not None: + if item_id is not None: + return url_template.replace('$1', item_id) + else: + return url_template + return None + + +def get_earth_coordinates_url(latitude, longitude, osm_zoom, alternative='default'): + url = get_external_url('map', None, alternative)\ + .replace('${latitude}', str(latitude))\ + .replace('${longitude}', str(longitude))\ + .replace('${zoom}', str(osm_zoom)) + return url + + +def area_to_osm_zoom(area): + """Convert an area in km² into an OSM zoom. Less reliable if the shape is not round. + + logarithm regression using these data: + * 9596961 -> 4 (China) + * 3287263 -> 5 (India) + * 643801 -> 6 (France) + * 6028 -> 9 + * 1214 -> 10 + * 891 -> 12 + * 12 -> 13 + + In WolframAlpha: + >>> log fit {9596961,15},{3287263, 14},{643801,13},{6028,10},{1214,9},{891,7},{12,6} + + with 15 = 19-4 (China); 14 = 19-5 (India) and so on + + Args: + area (int,float,str): area in km² + + Returns: + int: OSM zoom or 19 in area is not a number + """ + try: + amount = float(area) + return max(0, min(19, round(19 - 0.688297 * math.log(226.878 * amount)))) + except ValueError: + return 19 diff --git a/searx/results.py b/searx/results.py index e4cad2e24..34a94511a 100644 --- a/searx/results.py +++ b/searx/results.py @@ -20,6 +20,18 @@ def result_content_len(content): def compare_urls(url_a, url_b): + """Lazy compare between two URL. + "www.example.com" and "example.com" are equals. + "www.example.com/path/" and "www.example.com/path" are equals. + "https://www.example.com/" and "http://www.example.com/" are equals. + + Args: + url_a (ParseResult): first URL + url_b (ParseResult): second URL + + Returns: + bool: True if url_a and url_b are equals + """ # ignore www. in comparison if url_a.netloc.startswith('www.'): host_a = url_a.netloc.replace('www.', '', 1) @@ -68,8 +80,10 @@ def merge_two_infoboxes(infobox1, infobox2): for url2 in infobox2.get('urls', []): unique_url = True parsed_url2 = urlparse(url2.get('url', '')) + entity_url2 = url2.get('entity') for url1 in urls1: - if compare_urls(urlparse(url1.get('url', '')), parsed_url2): + if (entity_url2 is not None and url1.get('entity') == entity_url2)\ + or compare_urls(urlparse(url1.get('url', '')), parsed_url2): unique_url = False break if unique_url: @@ -86,18 +100,22 @@ def merge_two_infoboxes(infobox1, infobox2): infobox1['img_src'] = img2 if 'attributes' in infobox2: - attributes1 = infobox1.get('attributes', None) + attributes1 = infobox1.get('attributes') if attributes1 is None: - attributes1 = [] - infobox1['attributes'] = attributes1 + infobox1['attributes'] = attributes1 = [] attributeSet = set() - for attribute in infobox1.get('attributes', []): - if attribute.get('label', None) not in attributeSet: - attributeSet.add(attribute.get('label', None)) + for attribute in attributes1: + label = attribute.get('label') + if label not in attributeSet: + attributeSet.add(label) + entity = attribute.get('entity') + if entity not in attributeSet: + attributeSet.add(entity) for attribute in infobox2.get('attributes', []): - if attribute.get('label', None) not in attributeSet: + if attribute.get('label') not in attributeSet\ + and attribute.get('entity') not in attributeSet: attributes1.append(attribute) if 'content' in infobox2: diff --git a/searx/templates/oscar/infobox.html b/searx/templates/oscar/infobox.html index 5ba4aa5f0..8a12b8074 100644 --- a/searx/templates/oscar/infobox.html +++ b/searx/templates/oscar/infobox.html @@ -25,11 +25,7 @@ {%- if attribute.image -%} <td><img class="img-responsive" src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td> {%- else -%} - {% if attribute.label == 'Instance of' %} - <td><bdi><a href="https://wikidata.org/wiki/{{ attribute.value.id }}">{{ attribute.value.id }}</a></bdi></td> - {% else %} - <td><bdi>{{ attribute.value }}</bdi></td> - {%- endif -%} + <td><bdi>{{ attribute.value }}</bdi></td> {%- endif -%} </tr> {% endfor -%} diff --git a/searx/templates/simple/infobox.html b/searx/templates/simple/infobox.html index 50b568919..08daa5038 100644 --- a/searx/templates/simple/infobox.html +++ b/searx/templates/simple/infobox.html @@ -1,7 +1,6 @@ <aside class="infobox"> <h2><bdi>{{ infobox.infobox }}</bdi></h2> {% if infobox.img_src %}<img src="{{ image_proxify(infobox.img_src) }}" title="{{ infobox.infobox|striptags }}" alt="{{ infobox.infobox|striptags }}" />{% endif %} - <p><bdi>{{ infobox.entity }}</bdi></p> <p><bdi>{{ infobox.content | safe }}</bdi></p> {% if infobox.attributes %} <div class="attributes"> diff --git a/searx/utils.py b/searx/utils.py index 1c10585cf..9e43c67f0 100644 --- a/searx/utils.py +++ b/searx/utils.py @@ -481,6 +481,16 @@ def ecma_unescape(s): return s +def get_string_replaces_function(replaces): + rep = {re.escape(k): v for k, v in replaces.items()} + pattern = re.compile("|".join(rep.keys())) + + def f(text): + return pattern.sub(lambda m: rep[re.escape(m.group(0))], text) + + return f + + def get_engine_from_settings(name): """Return engine configuration from settings.yml of a given engine name""" diff --git a/utils/fetch_wikidata_units.py b/utils/fetch_wikidata_units.py new file mode 100644 index 000000000..69505968e --- /dev/null +++ b/utils/fetch_wikidata_units.py @@ -0,0 +1,47 @@ +#!/usr/bin/env python + +import json +import collections + +# set path +from sys import path +from os.path import realpath, dirname, join +path.append(realpath(dirname(realpath(__file__)) + '/../')) + +from searx import searx_dir +from searx.engines.wikidata import send_wikidata_query + + +SARQL_REQUEST = """ +SELECT DISTINCT ?item ?symbol ?P2370 ?P2370Unit ?P2442 ?P2442Unit +WHERE +{ +?item wdt:P31/wdt:P279 wd:Q47574. +?item wdt:P5061 ?symbol. +FILTER(LANG(?symbol) = "en"). +} +ORDER BY ?item +""" + + +def get_data(): + def get_key(unit): + return unit['item']['value'].replace('http://www.wikidata.org/entity/', '') + + def get_value(unit): + return unit['symbol']['value'] + + result = send_wikidata_query(SARQL_REQUEST) + if result is not None: + # sort the unit by entity name + # so different fetchs keep the file unchanged. + list(result['results']['bindings']).sort(key=get_key) + return collections.OrderedDict([(get_key(unit), get_value(unit)) for unit in result['results']['bindings']]) + + +def get_wikidata_units_filename(): + return join(join(searx_dir, "data"), "wikidata_units.json") + + +with open(get_wikidata_units_filename(), 'w') as f: + json.dump(get_data(), f, indent=4, ensure_ascii=False) |