summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorNoémi Ványi <kvch@users.noreply.github.com>2020-10-28 22:36:29 +0100
committerGitHub <noreply@github.com>2020-10-28 22:36:29 +0100
commit10ddd421f22c993a8cd3f4a02798dc3335c59709 (patch)
tree18acb415e3394a91e01ccbae1d757504792729b9
parentd3d50eff665f03c16adcb26a774b25b4fd5ade08 (diff)
parent95bd6033fad53b584ae5be54f2229a6edfb5b6a2 (diff)
downloadsearxng-10ddd421f22c993a8cd3f4a02798dc3335c59709.tar.gz
searxng-10ddd421f22c993a8cd3f4a02798dc3335c59709.zip
Merge pull request #2224 from dalf/update-infobox-engines
[enh] update infobox engines
-rw-r--r--searx/data/__init__.py5
-rw-r--r--searx/data/external_urls.json156
-rw-r--r--searx/data/wikidata_units.json1006
-rw-r--r--searx/engines/duckduckgo_definitions.py227
-rw-r--r--searx/engines/wikidata.py1089
-rw-r--r--searx/external_urls.py77
-rw-r--r--searx/results.py34
-rw-r--r--searx/templates/oscar/infobox.html6
-rw-r--r--searx/templates/simple/infobox.html1
-rw-r--r--searx/utils.py10
-rw-r--r--utils/fetch_wikidata_units.py47
11 files changed, 2120 insertions, 538 deletions
diff --git a/searx/data/__init__.py b/searx/data/__init__.py
index 1116e5d47..55a254b13 100644
--- a/searx/data/__init__.py
+++ b/searx/data/__init__.py
@@ -2,7 +2,8 @@ import json
from pathlib import Path
-__init__ = ['ENGINES_LANGUGAGES', 'CURRENCIES', 'USER_AGENTS', 'bangs_loader', 'ahmia_blacklist_loader']
+__init__ = ['ENGINES_LANGUGAGES', 'CURRENCIES', 'USER_AGENTS', 'EXTERNAL_URLS', 'WIKIDATA_UNITS',
+ 'bangs_loader', 'ahmia_blacklist_loader']
data_dir = Path(__file__).parent
@@ -24,3 +25,5 @@ def ahmia_blacklist_loader():
ENGINES_LANGUAGES = load('engines_languages.json')
CURRENCIES = load('currencies.json')
USER_AGENTS = load('useragents.json')
+EXTERNAL_URLS = load('external_urls.json')
+WIKIDATA_UNITS = load('wikidata_units.json')
diff --git a/searx/data/external_urls.json b/searx/data/external_urls.json
new file mode 100644
index 000000000..75b153aba
--- /dev/null
+++ b/searx/data/external_urls.json
@@ -0,0 +1,156 @@
+{
+ "facebook_profile": {
+ "category_name": "Facebook",
+ "url_name": "Facebook profile",
+ "urls": {
+ "default": "https://facebook.com/$1"
+ }
+ },
+ "youtube_channel": {
+ "category_name": "YouTube",
+ "url_name": "YouTube channel",
+ "urls": {
+ "default": "https://www.youtube.com/channel/$1"
+ }
+ },
+ "youtube_video": {
+ "category_name": "YouTube",
+ "url_name": "YouTube video",
+ "urls": {
+ "default": "https://www.youtube.com/watch?v=$1"
+ }
+ },
+ "twitter_profile": {
+ "category_name": "Twitter",
+ "url_name": "Twitter profile",
+ "urls": {
+ "default": "https://twitter.com/$1"
+ }
+ },
+ "instagram_profile": {
+ "category_name": "Instagram",
+ "url_name": "Instagram profile",
+ "urls": {
+ "default": "https://www.instagram.com/$1"
+ }
+ },
+ "imdb_title": {
+ "category_name": "IMDB",
+ "url_name": "IMDB title",
+ "urls": {
+ "default": "https://www.imdb.com/title/$1"
+ }
+ },
+ "imdb_name": {
+ "category_name": "IMDB",
+ "url_name": "IMDB name",
+ "urls": {
+ "default": "https://www.imdb.com/name/$1"
+ }
+ },
+ "imdb_character": {
+ "category_name": "IMDB",
+ "url_name": "IMDB character",
+ "urls": {
+ "default": "https://www.imdb.com/character/$1"
+ }
+ },
+ "imdb_company": {
+ "category_name": "IMDB",
+ "url_name": "IMDB company",
+ "urls": {
+ "default": "https://www.imdb.com/company/$1"
+ }
+ },
+ "imdb_event": {
+ "category_name": "IMDB",
+ "url_name": "IMDB event",
+ "urls": {
+ "default": "https://www.imdb.com/event/$1"
+ }
+ },
+ "rotten_tomatoes": {
+ "category_name": "Rotten tomatoes",
+ "url_name": "Rotten tomatoes title",
+ "urls": {
+ "default": "https://www.rottentomatoes.com/$1"
+ }
+ },
+ "spotify_artist_id": {
+ "category_name": "Spotify",
+ "url_name": "Spotify artist",
+ "urls": {
+ "default": "https://open.spotify.com/artist/$1"
+ }
+ },
+ "itunes_artist_id": {
+ "category_name": "iTunes",
+ "url_name": "iTunes artist",
+ "urls": {
+ "default": "https://music.apple.com/us/artist/$1"
+ }
+ },
+ "soundcloud_id": {
+ "category_name": "Soundcloud",
+ "url_name": "Soundcloud artist",
+ "urls": {
+ "default": "https://soundcloud.com/$1"
+ }
+ },
+ "netflix_id": {
+ "category_name": "Netflix",
+ "url_name": "Netflix movie",
+ "urls": {
+ "default": "https://www.netflix.com/watch/$1"
+ }
+ },
+ "github_profile": {
+ "category_name": "Github",
+ "url_name": "Github profile",
+ "urls": {
+ "default": "https://www.github.com/$1"
+ }
+ },
+ "musicbrainz_artist": {
+ "category_name": "Musicbrainz",
+ "url_name": "Musicbrainz artist",
+ "urls": {
+ "default": "http://musicbrainz.org/artist/$1"
+ }
+ },
+ "musicbrainz_work": {
+ "category_name": "Musicbrainz",
+ "url_name": "Musicbrainz work",
+ "urls": {
+ "default": "http://musicbrainz.org/work/$1"
+ }
+ },
+ "musicbrainz_release_group": {
+ "category_name": "Musicbrainz",
+ "url_name": "Musicbrainz release group",
+ "urls": {
+ "default": "http://musicbrainz.org/release-group/$1"
+ }
+ },
+ "musicbrainz_label": {
+ "category_name": "Musicbrainz",
+ "url_name": "Musicbrainz label",
+ "urls": {
+ "default": "http://musicbrainz.org/label/$1"
+ }
+ },
+ "wikimedia_image": {
+ "category_name": "Wikipedia",
+ "url_name": "Wikipedia image",
+ "urls": {
+ "default": "https://commons.wikimedia.org/wiki/Special:FilePath/$1?width=500&height=400"
+ }
+ },
+ "map": {
+ "category_name": "Map",
+ "url_name": "geo map",
+ "urls": {
+ "default": "https://www.openstreetmap.org/?lat=${latitude}&lon=${longitude}&zoom=${zoom}&layers=M"
+ }
+ }
+} \ No newline at end of file
diff --git a/searx/data/wikidata_units.json b/searx/data/wikidata_units.json
new file mode 100644
index 000000000..966e5e812
--- /dev/null
+++ b/searx/data/wikidata_units.json
@@ -0,0 +1,1006 @@
+{
+ "Q199": "1",
+ "Q100149279": "°We",
+ "Q100995": "lb",
+ "Q1022113": "cm³",
+ "Q102573": "Bq",
+ "Q103246": "Sv",
+ "Q103510": "bar",
+ "Q10380431": "TJ",
+ "Q1040401": "das",
+ "Q1040427": "hs",
+ "Q1042866": "Zibit",
+ "Q1050958": "inHg",
+ "Q1051665": "m/s²",
+ "Q1052397": "rad",
+ "Q1054140": "Mm",
+ "Q10543042": "Ym",
+ "Q1057069": "hg",
+ "Q1063756": "rad/s",
+ "Q1063786": "in²",
+ "Q1065153": "mrad",
+ "Q1066138": "Ps",
+ "Q1067722": "Fg",
+ "Q1069725": "p.",
+ "Q1084321": "Tb/s",
+ "Q1086691": "fg",
+ "Q1091257": "tex",
+ "Q1092296": "a",
+ "Q1104069": "CAD$",
+ "Q11061003": "μm²",
+ "Q11061005": "nm²",
+ "Q1131660": "st",
+ "Q1137675": "cr",
+ "Q1140444": "Zbit",
+ "Q1140577": "Ybit",
+ "Q1152074": "Pbit",
+ "Q1152323": "Tbit",
+ "Q1165799": "mil",
+ "Q11776930": "Mg",
+ "Q11830636": "psf",
+ "Q11929860": "kpc",
+ "Q1194225": "lbf",
+ "Q1194580": "Mibit",
+ "Q1195111": "Ebit",
+ "Q1196837": "ω_P",
+ "Q1197459": "Ms",
+ "Q11982285": "Em³",
+ "Q11982288": "Zm³",
+ "Q11982289": "Tm³",
+ "Q12011178": "Zs",
+ "Q1204894": "Gibit",
+ "Q12257695": "Eb/s",
+ "Q12257696": "EB/s",
+ "Q12261466": "kB/s",
+ "Q12265780": "Pb/s",
+ "Q12265783": "PB/s",
+ "Q12269121": "Yb/s",
+ "Q12269122": "YB/s",
+ "Q12269308": "Zb/s",
+ "Q12269309": "ZB/s",
+ "Q1247300": "cm H₂O",
+ "Q12714022": "sh cwt",
+ "Q12789864": "GeV",
+ "Q12874593": "W h",
+ "Q128822": "kn",
+ "Q13035094": "J/mol",
+ "Q130964": "cal",
+ "Q131255": "F",
+ "Q13147228": "g/cm³",
+ "Q1322380": "Ts",
+ "Q1323615": "oz t",
+ "Q132643": "kr",
+ "Q13400897": "g",
+ "Q13479685": "mm wg",
+ "Q1351253": "Eibit",
+ "Q1351334": "Pibit",
+ "Q13542672": "Ry",
+ "Q13548586": "THz",
+ "Q13582667": "kgf/cm²",
+ "Q1361854": "dwt",
+ "Q1363007": "slug",
+ "Q1374438": "ks",
+ "Q13753469": "MB/s",
+ "Q1377051": "Gs",
+ "Q1394540": "bm",
+ "Q1396128": "F",
+ "Q1413142": "Gb",
+ "Q14158377": "A_P",
+ "Q14623803": "MDa",
+ "Q14623804": "kDa",
+ "Q1472674": "Sv",
+ "Q14754979": "Zg",
+ "Q14786969": "MJ",
+ "Q14913554": "Ys",
+ "Q14914907": "th",
+ "Q14916719": "Gpc",
+ "Q14923662": "Pm³",
+ "Q1511773": "LSd",
+ "Q15120301": "l atm",
+ "Q1542309": "xu",
+ "Q1545979": "ft³",
+ "Q1550511": "yd²",
+ "Q15551713": "Sh",
+ "Q1569733": "St",
+ "Q15784325": "apc",
+ "Q160680": "Br",
+ "Q160857": "hp",
+ "Q1628990": "hph",
+ "Q163343": "T",
+ "Q163354": "H",
+ "Q1640501": "hyl",
+ "Q1645498": "μg",
+ "Q16859309": "lb·ft",
+ "Q169893": "S",
+ "Q170804": "Wb",
+ "Q17093295": "m/h",
+ "Q17255465": "v_P",
+ "Q173117": "R$",
+ "Q1741429": "kpm",
+ "Q174467": "Lm",
+ "Q174728": "cm",
+ "Q174789": "mm",
+ "Q175821": "μm",
+ "Q1768929": "p",
+ "Q1770733": "Tg",
+ "Q1772386": "dg",
+ "Q177493": "Gs",
+ "Q177612": "sr",
+ "Q1777507": "fs",
+ "Q177974": "atm",
+ "Q178506": "bbl",
+ "Q178674": "nm",
+ "Q1793863": "sn",
+ "Q179836": "lx",
+ "Q180154": "km/h",
+ "Q180892": "M☉",
+ "Q1815100": "cl",
+ "Q182098": "kWh",
+ "Q1823150": "μW",
+ "Q182429": "m/s",
+ "Q1826195": "dl",
+ "Q18413919": "cm/s",
+ "Q184172": "FF",
+ "Q185078": "a",
+ "Q185153": "erg",
+ "Q185648": "Torr",
+ "Q185759": "span",
+ "Q1872619": "zs",
+ "Q189097": "₧",
+ "Q190095": "Gy",
+ "Q190951": "S$",
+ "Q191118": "t",
+ "Q1913097": "fg",
+ "Q1916026": "μV",
+ "Q192027": "Bd",
+ "Q192274": "pm",
+ "Q193098": "KD",
+ "Q1935515": "mA s",
+ "Q19392152": "TL",
+ "Q193933": "dpt",
+ "Q194339": "B$",
+ "Q1970718": "mam",
+ "Q1972579": "pdl",
+ "Q199462": "LE",
+ "Q199471": "Afs",
+ "Q200323": "dm",
+ "Q200337": "Kz",
+ "Q201880": "LL",
+ "Q201933": "dyn",
+ "Q2029156": "quad",
+ "Q2029519": "hl",
+ "Q203567": "₦",
+ "Q2042279": "m H₂O",
+ "Q204737": "៛",
+ "Q2051195": "GWh",
+ "Q2055118": "ppb",
+ "Q2064166": "fc",
+ "Q206600": "ރ",
+ "Q20706220": "cmm",
+ "Q20706221": "dmm",
+ "Q2080811": "vol%",
+ "Q208526": "NT$",
+ "Q208528": "gon",
+ "Q208634": "kat",
+ "Q208788": "fm",
+ "Q209351": "b",
+ "Q209426": "′",
+ "Q21006887": "ppm",
+ "Q2100949": "P",
+ "Q21014455": "m/min",
+ "Q210472": "B/.",
+ "Q21061369": "g/kg",
+ "Q21062777": "MPa",
+ "Q21064807": "kPa",
+ "Q21064845": "mol/L",
+ "Q21075844": "ml/l",
+ "Q21077820": "mg/m³",
+ "Q21091747": "mg/kg",
+ "Q211256": "mph",
+ "Q211580": "BTU (th)",
+ "Q212120": "A h",
+ "Q2140397": "in³",
+ "Q214377": "ell",
+ "Q2143992": "kHz",
+ "Q21489891": "nm³",
+ "Q21489892": "Gm³",
+ "Q21489893": "Mm³",
+ "Q21489894": "μm³",
+ "Q21500224": "mas",
+ "Q2151240": "mag",
+ "Q215571": "N m",
+ "Q21604951": "g/m³",
+ "Q2165290": "yd³",
+ "Q216880": "kp",
+ "Q217208": "a",
+ "Q2175964": "dm³",
+ "Q218593": "in",
+ "Q2199357": "dec",
+ "Q22137107": "mas/y",
+ "Q2215478": "ppt",
+ "Q2221356": "mW h",
+ "Q22350885": "da",
+ "Q2243141": "Gb/s",
+ "Q2254856": "ca",
+ "Q22673229": "ft/min",
+ "Q2269250": "kb/s",
+ "Q2282891": "μl",
+ "Q2282906": "ng",
+ "Q229354": "Ci",
+ "Q232291": "mi²",
+ "Q2332346": "ml",
+ "Q23823681": "TW",
+ "Q23925410": "gal (UK)",
+ "Q23925413": "gal (US)",
+ "Q23931040": "dam²",
+ "Q23931103": "nmi²",
+ "Q2414435": "$b.",
+ "Q242988": "Lib$",
+ "Q2438073": "ag",
+ "Q2448803": "mV",
+ "Q2451296": "μF",
+ "Q246868": "lot",
+ "Q2474258": "mSv",
+ "Q2483628": "as",
+ "Q2489298": "cm²",
+ "Q249439": "q_P",
+ "Q2518569": "nSv",
+ "Q253276": "mi",
+ "Q25472681": "GB/s",
+ "Q25472693": "TB/s",
+ "Q25499149": "oct",
+ "Q25511288": "mb",
+ "Q2553708": "MV",
+ "Q2554092": "kV",
+ "Q259502": "AU$",
+ "Q260126": "rem",
+ "Q2612219": "Pg",
+ "Q261247": "ct",
+ "Q2619500": "foe",
+ "Q2636421": "nH",
+ "Q2637946": "dal",
+ "Q2642547": "ha",
+ "Q2652700": "Osm",
+ "Q2655272": "Eg",
+ "Q2659078": "TW h",
+ "Q2670039": "₶",
+ "Q26708069": "kcal",
+ "Q267391": "K",
+ "Q2679083": "μH",
+ "Q2682463": "nF",
+ "Q2691798": "cg",
+ "Q271206": "sud£",
+ "Q2737347": "mm²",
+ "Q2739114": "μSv",
+ "Q275112": "Bz$",
+ "Q2756030": "pF",
+ "Q2757753": "PW h",
+ "Q2762458": "ys",
+ "Q27864215": "μW h",
+ "Q2793566": "GV",
+ "Q27949241": "R",
+ "Q2799294": "Gg",
+ "Q281096": "cd/m²",
+ "Q28719934": "keV",
+ "Q28924752": "g/mol",
+ "Q28924753": "kg/mol",
+ "Q2924137": "mH",
+ "Q296936": "toe",
+ "Q29924639": "kVA",
+ "Q30001811": "aBq",
+ "Q30001813": "aC",
+ "Q30001814": "aHz",
+ "Q30001815": "aJ",
+ "Q30001816": "akat",
+ "Q30001818": "aL",
+ "Q30001819": "alm",
+ "Q30001820": "alx",
+ "Q30001822": "aN",
+ "Q30001823": "aΩ",
+ "Q30001825": "aPa",
+ "Q30001826": "arad",
+ "Q30001827": "aS",
+ "Q30001828": "aSv",
+ "Q30001829": "asr",
+ "Q30001830": "aT",
+ "Q30001831": "aV",
+ "Q30001832": "aW",
+ "Q30001833": "aWb",
+ "Q3013059": "kyr",
+ "Q3194304": "kbit",
+ "Q3207456": "mW",
+ "Q321017": "R",
+ "Q3221356": "ym",
+ "Q3239557": "pg",
+ "Q3241121": "mg",
+ "Q324923": "Hart",
+ "Q3249364": "cs",
+ "Q3251645": "ds",
+ "Q3267417": "Tm",
+ "Q3270676": "zm",
+ "Q32750621": "liq pt (US)",
+ "Q32750759": "fl oz (US)",
+ "Q32750816": "bu (US)",
+ "Q32751272": "dry pt (US)",
+ "Q32751296": "bbl (US)",
+ "Q3276763": "GHz",
+ "Q3277907": "Em",
+ "Q3277915": "Zm",
+ "Q3277919": "Pm",
+ "Q3312063": "fL",
+ "Q3320608": "kW",
+ "Q3331719": "dm²",
+ "Q3332689": "ToR",
+ "Q3332814": "Mbit",
+ "Q3396758": "daa",
+ "Q3414243": "rps",
+ "Q3421309": "R_J",
+ "Q3495543": "mbar",
+ "Q355198": "px",
+ "Q3674704": "km/s",
+ "Q3675550": "mm³",
+ "Q3712659": "$",
+ "Q376660": "nat",
+ "Q37732658": "°R",
+ "Q3773454": "Mpc",
+ "Q3815076": "Kibit",
+ "Q3833309": "£",
+ "Q3858002": "mA h",
+ "Q3867152": "ft/s²",
+ "Q389062": "Tibit",
+ "Q3902688": "pl",
+ "Q3902709": "ps",
+ "Q39360235": "US lea",
+ "Q39360471": "nl",
+ "Q39362962": "µin",
+ "Q39363132": "UK lg",
+ "Q39363209": "UK nl",
+ "Q39380159": "US nmi",
+ "Q39462789": "µin²",
+ "Q39467934": "kgf/m²",
+ "Q39469927": "N/m²",
+ "Q39617688": "cwt long",
+ "Q39617818": "t lb",
+ "Q39628023": "y",
+ "Q39699418": "cm/s²",
+ "Q39708248": "S",
+ "Q39709980": "bd",
+ "Q39710113": "bhp EDR",
+ "Q3972226": "kL",
+ "Q4041686": "iwg",
+ "Q4068266": "Ʒ",
+ "Q4176683": "aC",
+ "Q420266": "oz. fl.",
+ "Q42319606": "people/m²",
+ "Q4243638": "km³",
+ "Q4456994": "mF",
+ "Q469356": "tn. sh.",
+ "Q476572": "Ha",
+ "Q482798": "yd",
+ "Q483261": "Da",
+ "Q483725": "A.M.",
+ "Q484092": "lm",
+ "Q4861171": "H",
+ "Q494083": "fur",
+ "Q4989854": "kJ",
+ "Q500515": "Gal",
+ "Q5042194": "£",
+ "Q50808017": "kg m²",
+ "Q5139563": "hPa",
+ "Q514845": "pz",
+ "Q5195628": "hm³",
+ "Q5198770": "dam³",
+ "Q524410": "byr",
+ "Q53393488": "PHz",
+ "Q53393490": "EHz",
+ "Q53393494": "ZHz",
+ "Q53393498": "YHz",
+ "Q53393659": "ML",
+ "Q53393664": "GL",
+ "Q53393674": "ZL",
+ "Q53393678": "YL",
+ "Q53393771": "yL",
+ "Q53393868": "GJ",
+ "Q53393886": "PJ",
+ "Q53393890": "EJ",
+ "Q53448786": "yHz",
+ "Q53448790": "zHz",
+ "Q53448794": "fHz",
+ "Q53448797": "pHz",
+ "Q53448801": "nHz",
+ "Q53448806": "μHz",
+ "Q53448808": "mHz",
+ "Q53448813": "cHz",
+ "Q53448817": "dHz",
+ "Q53448820": "daHz",
+ "Q53448826": "hHz",
+ "Q53448828": "yJ",
+ "Q53448832": "zJ",
+ "Q53448842": "pJ",
+ "Q53448844": "nJ",
+ "Q53448847": "μJ",
+ "Q53448851": "mJ",
+ "Q53448856": "cJ",
+ "Q53448860": "dJ",
+ "Q53448864": "daJ",
+ "Q53448875": "hJ",
+ "Q53448879": "yPa",
+ "Q53448883": "zPa",
+ "Q53448886": "fPa",
+ "Q53448892": "pPa",
+ "Q53448897": "nPa",
+ "Q53448900": "μPa",
+ "Q53448906": "mPa",
+ "Q53448909": "cPa",
+ "Q53448914": "dPa",
+ "Q53448918": "daPa",
+ "Q53448922": "GPa",
+ "Q53448927": "TPa",
+ "Q53448931": "PPa",
+ "Q53448936": "EPa",
+ "Q53448939": "ZPa",
+ "Q53448943": "YPa",
+ "Q53448949": "yV",
+ "Q53448952": "zV",
+ "Q53448957": "fV",
+ "Q53448960": "pV",
+ "Q53448965": "nV",
+ "Q53448969": "cV",
+ "Q53448973": "dV",
+ "Q53448977": "daV",
+ "Q53448981": "hV",
+ "Q53448985": "TV",
+ "Q53448990": "PV",
+ "Q53448994": "EV",
+ "Q53448996": "ZV",
+ "Q53449001": "YV",
+ "Q53449006": "yW",
+ "Q53449008": "zW",
+ "Q53449013": "fW",
+ "Q53449018": "pW",
+ "Q53449021": "nW",
+ "Q53449025": "cW",
+ "Q53449029": "dW",
+ "Q53449033": "daW",
+ "Q53449036": "hW",
+ "Q53449040": "PW",
+ "Q53449045": "EW",
+ "Q53449049": "ZW",
+ "Q53449054": "YW",
+ "Q53561461": "wf",
+ "Q53561822": "wf",
+ "Q53651160": "zm³",
+ "Q53651201": "Ym³",
+ "Q53651356": "ym³",
+ "Q53651512": "pm³",
+ "Q53651713": "fm³",
+ "Q536785": "ρ_P",
+ "Q53951982": "Mt",
+ "Q53952048": "kt",
+ "Q54006645": "ZWb",
+ "Q54081925": "ZSv",
+ "Q54082468": "ZS",
+ "Q54083144": "ZΩ",
+ "Q54083318": "ZN",
+ "Q54083566": "Zlm",
+ "Q54083579": "Zlx",
+ "Q54083712": "ZBq",
+ "Q54083746": "ZC",
+ "Q54083766": "ZF",
+ "Q54083779": "ZGy",
+ "Q54083795": "ZH",
+ "Q54083813": "Zkat",
+ "Q5409016": "MVA",
+ "Q5465723": "ft-pdl",
+ "Q549389": "bit/s",
+ "Q550341": "V A",
+ "Q552299": "ch",
+ "Q55442349": "U/L",
+ "Q55726194": "mg/L",
+ "Q56156859": "mmol",
+ "Q56156949": "μmol",
+ "Q56157046": "nmol",
+ "Q56157048": "pmol",
+ "Q56160603": "fmol",
+ "Q56302633": "UM",
+ "Q56317116": "mgal",
+ "Q56317622": "Q_P",
+ "Q56318907": "kbar",
+ "Q56349362": "Bs.S",
+ "Q56402798": "kN",
+ "Q5711261": "am³",
+ "Q581432": "‴",
+ "Q5879479": "GW",
+ "Q6003257": "am",
+ "Q6009164": "MW h",
+ "Q6014364": "in/s",
+ "Q603071": "E°",
+ "Q605704": "doz",
+ "Q60742631": "AU/yr",
+ "Q608697": "Mx",
+ "Q610135": "G",
+ "Q613726": "Yg",
+ "Q6170164": "yg",
+ "Q6171168": "zg",
+ "Q61756607": "yd",
+ "Q61793198": "rd",
+ "Q61794766": "ch (US survey)",
+ "Q61994988": "Wth",
+ "Q61995006": "KWth",
+ "Q626299": "psi",
+ "Q630369": "var",
+ "Q636200": "U",
+ "Q640907": "sb",
+ "Q6414556": "kip",
+ "Q648908": "bya",
+ "Q64996135": "gal (US)/min",
+ "Q65028392": "mm/yr",
+ "Q651336": "M_J",
+ "Q6517513": "dag",
+ "Q667419": "UK t",
+ "Q681996": "M⊕",
+ "Q685662": "p_P",
+ "Q6859652": "mm Hg",
+ "Q686163": "$",
+ "Q68725821": "°Rø",
+ "Q68726230": "°De",
+ "Q68726625": "°N",
+ "Q69362731": "°C",
+ "Q69363953": "K",
+ "Q693944": "gr",
+ "Q6982035": "MW",
+ "Q69878540": "fl oz (UK)",
+ "Q70378044": "dmol",
+ "Q70378549": "dK",
+ "Q70393458": "kmol",
+ "Q70395375": "Tmol",
+ "Q70395643": "Mmol",
+ "Q70395830": "kK",
+ "Q70396179": "mK",
+ "Q70397275": "μK",
+ "Q70397725": "cmol",
+ "Q70397932": "cK",
+ "Q70398457": "nK",
+ "Q70398619": "MK",
+ "Q70398813": "Gmol",
+ "Q70398991": "GK",
+ "Q70440025": "daK",
+ "Q70440438": "hK",
+ "Q70440620": "damol",
+ "Q70440823": "hmol",
+ "Q70443020": "EK",
+ "Q70443154": "yK",
+ "Q70443282": "zK",
+ "Q70443367": "fK",
+ "Q70443453": "TK",
+ "Q70443757": "pK",
+ "Q70443901": "YK",
+ "Q70444029": "PK",
+ "Q70444141": "Emol",
+ "Q70444284": "ymol",
+ "Q70444386": "zmol",
+ "Q70444514": "Ymol",
+ "Q70444609": "Pmol",
+ "Q712226": "km²",
+ "Q72081071": "MeV",
+ "Q723733": "ms",
+ "Q730251": "ft·lbf",
+ "Q732707": "MHz",
+ "Q73408": "K",
+ "Q7350781": "Mb/s",
+ "Q743895": "bpm",
+ "Q748716": "ft/s",
+ "Q750178": "‱",
+ "Q752197": "kJ/mol",
+ "Q7672057": "TU",
+ "Q777017": "dBm",
+ "Q78754556": "rot",
+ "Q78756901": "rev",
+ "Q78757683": "windings",
+ "Q79726": "kB",
+ "Q79735": "MB",
+ "Q79738": "GB",
+ "Q79741": "TB",
+ "Q79744": "PB",
+ "Q79745": "EB",
+ "Q79747": "ZB",
+ "Q7974920": "W s",
+ "Q79752": "YB",
+ "Q79756": "KiB",
+ "Q79758": "MiB",
+ "Q79765": "GiB",
+ "Q79769": "TiB",
+ "Q79774": "PiB",
+ "Q79777": "EiB",
+ "Q79779": "ZiB",
+ "Q79781": "YiB",
+ "Q80237579": "J/nm",
+ "Q809678": "Ba",
+ "Q81062869": "W/nm",
+ "Q81073100": "W/(sr nm)",
+ "Q81292": "acre",
+ "Q81454": "Å",
+ "Q8229770": "B/s",
+ "Q828224": "km",
+ "Q829073": "\"",
+ "Q83216": "cd",
+ "Q83327": "eV",
+ "Q834105": "g/L",
+ "Q835916": "IU",
+ "Q838801": "ns",
+ "Q842015": "μs",
+ "Q842981": "thm (US)",
+ "Q844211": "kg/m³",
+ "Q844338": "hm",
+ "Q844976": "Oe",
+ "Q845958": "¥",
+ "Q848856": "dam",
+ "Q851872": "o",
+ "Q854546": "Gm",
+ "Q855161": "Yibit",
+ "Q856240": "ft³/min",
+ "Q857027": "ft²",
+ "Q85854198": "MN",
+ "Q864818": "abA",
+ "Q87262709": "kΩ",
+ "Q87416053": "MΩ",
+ "Q88296091": "tsp",
+ "Q9026416": "MWth",
+ "Q9048643": "nl",
+ "Q905912": "L",
+ "Q906223": "Es",
+ "Q909066": "at",
+ "Q911730": "nx",
+ "Q914151": "P_P",
+ "Q915169": "F_P",
+ "Q93318": "nmi",
+ "Q940052": "q",
+ "Q94076025": "dalm",
+ "Q94076717": "dakat",
+ "Q942092": "BWI$",
+ "Q94414053": "Prad",
+ "Q94414499": "PC",
+ "Q94415026": "Grad",
+ "Q94415255": "GC",
+ "Q94415438": "Yrad",
+ "Q94415526": "YC",
+ "Q94415782": "Mrad",
+ "Q94416260": "GN",
+ "Q94416535": "cN",
+ "Q94416879": "YN",
+ "Q94417138": "PN",
+ "Q94417481": "μGy",
+ "Q94417583": "μS",
+ "Q94417598": "μT",
+ "Q94417933": "μlm",
+ "Q94418102": "μN",
+ "Q94418220": "μsr",
+ "Q94418481": "μBq",
+ "Q94479580": "GΩ",
+ "Q94480021": "PΩ",
+ "Q94480081": "YΩ",
+ "Q94480128": "cΩ",
+ "Q94480131": "TΩ",
+ "Q94480136": "pΩ",
+ "Q94480254": "nΩ",
+ "Q94480476": "dΩ",
+ "Q94480633": "EΩ",
+ "Q94480967": "daΩ",
+ "Q94481176": "hΩ",
+ "Q94481339": "fΩ",
+ "Q94481646": "yΩ",
+ "Q94487174": "zΩ",
+ "Q94487366": "mΩ",
+ "Q94487561": "μΩ",
+ "Q94487750": "kGy",
+ "Q94488007": "klx",
+ "Q94488361": "MF",
+ "Q94488759": "GBq",
+ "Q94489041": "PBq",
+ "Q94489223": "YBq",
+ "Q94489429": "MBq",
+ "Q94489465": "kBq",
+ "Q94489476": "TBq",
+ "Q94489494": "kWb",
+ "Q94489520": "kS",
+ "Q94490951": "klm",
+ "Q94491129": "kkat",
+ "Q94634634": "cC",
+ "Q94634655": "MC",
+ "Q94634666": "kC",
+ "Q94634677": "TC",
+ "Q94634684": "μC",
+ "Q94634699": "mC",
+ "Q94693759": "csr",
+ "Q94693773": "msr",
+ "Q94693786": "mWb",
+ "Q94693805": "μWb",
+ "Q94693819": "GS",
+ "Q94693849": "cS",
+ "Q94693918": "MS",
+ "Q94694019": "TS",
+ "Q94694096": "pS",
+ "Q94694154": "nS",
+ "Q94694206": "mS",
+ "Q94731530": "mlm",
+ "Q94731808": "mkat",
+ "Q94731887": "μkat",
+ "Q94732218": "nkat",
+ "Q94732627": "pkat",
+ "Q94733432": "fkat",
+ "Q94733760": "cGy",
+ "Q94734107": "dGy",
+ "Q94734232": "mGy",
+ "Q94734359": "daGy",
+ "Q94734468": "aGy",
+ "Q94734527": "pGy",
+ "Q94734593": "nGy",
+ "Q94734689": "kT",
+ "Q94734788": "mT",
+ "Q94939947": "Gkat",
+ "Q94940018": "Pkat",
+ "Q94940081": "ykat",
+ "Q94940160": "dkat",
+ "Q94940232": "Ekat",
+ "Q94940295": "Ykat",
+ "Q94940582": "Tkat",
+ "Q94940892": "hkat",
+ "Q94941461": "zkat",
+ "Q94942602": "MGy",
+ "Q94942863": "GGy",
+ "Q94986863": "YWb",
+ "Q94986889": "PWb",
+ "Q94986906": "cWb",
+ "Q94986920": "GWb",
+ "Q94986942": "MWb",
+ "Q94986962": "TWb",
+ "Q95178536": "Mlm",
+ "Q95178777": "Tlm",
+ "Q95178881": "clm",
+ "Q95179024": "plm",
+ "Q95179137": "nlm",
+ "Q95179382": "hlm",
+ "Q95179467": "flm",
+ "Q95179608": "zlm",
+ "Q95179695": "Mkat",
+ "Q95179788": "ckat",
+ "Q95179882": "PGy",
+ "Q95377836": "PF",
+ "Q95377853": "YF",
+ "Q95378017": "kF",
+ "Q95378296": "TF",
+ "Q95379145": "cF",
+ "Q95379382": "GF",
+ "Q95379491": "daC",
+ "Q95379580": "hC",
+ "Q95379588": "dC",
+ "Q95379596": "EC",
+ "Q95445986": "nC",
+ "Q95446327": "pC",
+ "Q95446670": "fC",
+ "Q95447079": "zC",
+ "Q95447237": "yC",
+ "Q95447253": "fF",
+ "Q95447263": "zF",
+ "Q95447276": "aF",
+ "Q95447555": "dF",
+ "Q95447863": "EF",
+ "Q95448262": "yF",
+ "Q95448479": "hF",
+ "Q95448689": "daF",
+ "Q95448950": "kSv",
+ "Q95559229": "GSv",
+ "Q95559368": "YSv",
+ "Q95559441": "MSv",
+ "Q95559576": "TSv",
+ "Q95559603": "PSv",
+ "Q95609154": "nWb",
+ "Q95609210": "fWb",
+ "Q95609261": "zWb",
+ "Q95609291": "dWb",
+ "Q95609317": "EWb",
+ "Q95676212": "pWb",
+ "Q95676232": "yWb",
+ "Q95676243": "hWb",
+ "Q95676250": "daWb",
+ "Q95676257": "PS",
+ "Q95676260": "YS",
+ "Q95676273": "zS",
+ "Q95676275": "fS",
+ "Q95676279": "yS",
+ "Q95676287": "hS",
+ "Q95676291": "daS",
+ "Q95676297": "dS",
+ "Q95676298": "ES",
+ "Q95720731": "YGy",
+ "Q95720734": "TGy",
+ "Q95720736": "fGy",
+ "Q95720739": "yGy",
+ "Q95720741": "zGy",
+ "Q95720742": "EGy",
+ "Q95720746": "hGy",
+ "Q95720749": "mlx",
+ "Q95720758": "μlx",
+ "Q95720773": "dalx",
+ "Q95720777": "hlx",
+ "Q95720781": "dlx",
+ "Q95720786": "clx",
+ "Q95857671": "zSv",
+ "Q95859071": "fSv",
+ "Q95860960": "daSv",
+ "Q95861107": "hSv",
+ "Q95861296": "dSv",
+ "Q95862182": "ESv",
+ "Q95863358": "cSv",
+ "Q95863591": "ySv",
+ "Q95863894": "pSv",
+ "Q95864194": "zBq",
+ "Q95864378": "fBq",
+ "Q95864695": "daBq",
+ "Q95864940": "hBq",
+ "Q95865286": "dBq",
+ "Q95865530": "EBq",
+ "Q95865716": "cBq",
+ "Q95865877": "yBq",
+ "Q95866173": "pBq",
+ "Q95866344": "nBq",
+ "Q95866767": "mBq",
+ "Q95867993": "mN",
+ "Q95948345": "crad",
+ "Q95948364": "drad",
+ "Q95948734": "daN",
+ "Q95948739": "hN",
+ "Q95948747": "dN",
+ "Q95976839": "Plm",
+ "Q95976853": "Glm",
+ "Q95976869": "Ylm",
+ "Q95976889": "ylm",
+ "Q95976917": "dlm",
+ "Q95976919": "Elm",
+ "Q95976921": "nT",
+ "Q95993516": "TN",
+ "Q95993522": "nN",
+ "Q95993524": "fN",
+ "Q95993526": "yN",
+ "Q95993528": "zN",
+ "Q95993530": "EN",
+ "Q95993532": "pN",
+ "Q95993537": "μrad",
+ "Q95993542": "nrad",
+ "Q95993547": "frad",
+ "Q95993553": "prad",
+ "Q95993554": "darad",
+ "Q95993557": "hrad",
+ "Q95993619": "pT",
+ "Q96025401": "daT",
+ "Q96025405": "Trad",
+ "Q96025407": "Zrad",
+ "Q96025409": "zrad",
+ "Q96025413": "yrad",
+ "Q96025414": "Erad",
+ "Q96025419": "Ylx",
+ "Q96025422": "Glx",
+ "Q96025427": "Plx",
+ "Q96025431": "Mlx",
+ "Q96025433": "Tlx",
+ "Q96025435": "nlx",
+ "Q96025441": "flx",
+ "Q96050953": "GH",
+ "Q96051010": "PH",
+ "Q96051029": "YH",
+ "Q96051052": "cH",
+ "Q96051074": "TH",
+ "Q96051106": "MH",
+ "Q96051123": "kH",
+ "Q96051126": "fH",
+ "Q96051133": "yH",
+ "Q96051139": "hH",
+ "Q96051142": "dH",
+ "Q96051144": "EH",
+ "Q96051150": "pH",
+ "Q96051160": "daH",
+ "Q96051186": "zH",
+ "Q96051199": "aH",
+ "Q96051245": "ylx",
+ "Q96051267": "Elx",
+ "Q96051282": "plx",
+ "Q96051312": "zlx",
+ "Q96070067": "PT",
+ "Q96070074": "YT",
+ "Q96070076": "GT",
+ "Q96070087": "cT",
+ "Q96070103": "MT",
+ "Q96070125": "hT",
+ "Q96070145": "fT",
+ "Q96070174": "TT",
+ "Q96070195": "zT",
+ "Q96070247": "yT",
+ "Q96070254": "dT",
+ "Q96070264": "ET",
+ "Q96070276": "m°C",
+ "Q96070318": "dsr",
+ "Q96070329": "nsr",
+ "Q96070341": "psr",
+ "Q96095866": "fsr",
+ "Q96095897": "zsr",
+ "Q96095917": "ysr",
+ "Q96095927": "dasr",
+ "Q96095928": "hsr",
+ "Q96095931": "ksr",
+ "Q96095933": "Msr",
+ "Q96095939": "Gsr",
+ "Q96095941": "μ°C",
+ "Q96095955": "n°C",
+ "Q96095960": "k°C",
+ "Q96106290": "Tsr",
+ "Q96106298": "Psr",
+ "Q96106311": "Esr",
+ "Q96106319": "Zsr",
+ "Q96106332": "Ysr",
+ "Q96106346": "c°C",
+ "Q96106360": "d°C",
+ "Q96106368": "da°C",
+ "Q96106385": "h°C",
+ "Q96106393": "M°C",
+ "Q96236286": "G°C",
+ "Q97059641": "p°C",
+ "Q97059652": "T°C",
+ "Q97143826": "P°C",
+ "Q97143831": "y°C",
+ "Q97143835": "f°C",
+ "Q97143838": "Z°C",
+ "Q97143842": "E°C",
+ "Q97143843": "z°C",
+ "Q97143849": "Y°C",
+ "Q97143851": "a°C",
+ "Q98538634": "eV/m²",
+ "Q98635536": "eV/m",
+ "Q98642859": "eV m²/kg",
+ "Q11229": "%",
+ "Q11570": "kg",
+ "Q11573": "m",
+ "Q11574": "s",
+ "Q11579": "K",
+ "Q11582": "L",
+ "Q12129": "pc",
+ "Q12438": "N",
+ "Q16068": "DM",
+ "Q1811": "ua",
+ "Q20764": "Myr",
+ "Q2101": "e",
+ "Q25235": "h",
+ "Q25236": "W",
+ "Q25250": "V",
+ "Q25267": "°C",
+ "Q25269": "J",
+ "Q25272": "A",
+ "Q25343": "m²",
+ "Q25406": "C",
+ "Q25517": "m³",
+ "Q33680": "rad",
+ "Q35852": "ha",
+ "Q36384": "equiv",
+ "Q3710": "ft",
+ "Q39274": "Sv",
+ "Q39369": "Hz",
+ "Q41509": "mol",
+ "Q41803": "g",
+ "Q42289": "°F",
+ "Q4406": "TV$",
+ "Q44395": "Pa",
+ "Q4587": "Le",
+ "Q4588": "WS$",
+ "Q4592": "F$",
+ "Q4596": "Rs",
+ "Q4597": "$",
+ "Q47083": "Ω",
+ "Q48013": "oz",
+ "Q50094": "Np",
+ "Q50098": "B",
+ "Q531": "ly",
+ "Q5329": "dB",
+ "Q573": "d",
+ "Q577": "a",
+ "Q7727": "min",
+ "Q8799": "B"
+} \ No newline at end of file
diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py
index 7ce54f056..f8bc44e46 100644
--- a/searx/engines/duckduckgo_definitions.py
+++ b/searx/engines/duckduckgo_definitions.py
@@ -12,28 +12,53 @@ DuckDuckGo (definitions)
import json
from urllib.parse import urlencode
from lxml import html
-from re import compile
+
+from searx import logger
+from searx.data import WIKIDATA_UNITS
from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url, language_aliases
-from searx.utils import extract_text, html_to_text, match_language
+from searx.utils import extract_text, html_to_text, match_language, get_string_replaces_function
+from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
+
+
+logger = logger.getChild('duckduckgo_definitions')
-url = 'https://api.duckduckgo.com/'\
+URL = 'https://api.duckduckgo.com/'\
+ '?{query}&format=json&pretty=0&no_redirect=1&d=1'
-http_regex = compile(r'^http:')
+WIKIDATA_PREFIX = [
+ 'http://www.wikidata.org/entity/',
+ 'https://www.wikidata.org/entity/'
+]
+
+replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
+
+
+def is_broken_text(text):
+ """ duckduckgo may return something like "<a href="xxxx">http://somewhere Related website<a/>"
The href URL is broken, the "Related website" may contain some HTML.
-def result_to_text(url, text, htmlResult):
+ The best solution seems to ignore these results.
+ """
+ return text.startswith('http') and ' ' in text
+
+
+def result_to_text(text, htmlResult):
# TODO : remove result ending with "Meaning" or "Category"
+ result = None
dom = html.fromstring(htmlResult)
a = dom.xpath('//a')
if len(a) >= 1:
- return extract_text(a[0])
+ result = extract_text(a[0])
else:
- return text
+ result = text
+ if not is_broken_text(result):
+ return result
+ return None
def request(query, params):
- params['url'] = url.format(query=urlencode({'q': query}))
+ params['url'] = URL.format(query=urlencode({'q': query}))
language = match_language(params['language'], supported_languages, language_aliases)
language = language.split('-')[0]
params['headers']['Accept-Language'] = language
@@ -45,6 +70,14 @@ def response(resp):
search_res = json.loads(resp.text)
+ # search_res.get('Entity') possible values (not exhaustive) :
+ # * continent / country / department / location / waterfall
+ # * actor / musician / artist
+ # * book / performing art / film / television / media franchise / concert tour / playwright
+ # * prepared food
+ # * website / software / os / programming language / file format / software engineer
+ # * company
+
content = ''
heading = search_res.get('Heading', '')
attributes = []
@@ -55,7 +88,8 @@ def response(resp):
# add answer if there is one
answer = search_res.get('Answer', '')
if answer:
- if search_res.get('AnswerType', '') not in ['calc']:
+ logger.debug('AnswerType="%s" Answer="%s"', search_res.get('AnswerType'), answer)
+ if search_res.get('AnswerType') not in ['calc', 'ip']:
results.append({'answer': html_to_text(answer)})
# add infobox
@@ -66,42 +100,36 @@ def response(resp):
content = content + search_res.get('Abstract', '')
# image
- image = search_res.get('Image', '')
+ image = search_res.get('Image')
image = None if image == '' else image
- # attributes
- if 'Infobox' in search_res:
- infobox = search_res.get('Infobox', None)
- if 'content' in infobox:
- for info in infobox.get('content'):
- attributes.append({'label': info.get('label'),
- 'value': info.get('value')})
-
# urls
+ # Official website, Wikipedia page
for ddg_result in search_res.get('Results', []):
- if 'FirstURL' in ddg_result:
- firstURL = ddg_result.get('FirstURL', '')
- text = ddg_result.get('Text', '')
+ firstURL = ddg_result.get('FirstURL')
+ text = ddg_result.get('Text')
+ if firstURL is not None and text is not None:
urls.append({'title': text, 'url': firstURL})
results.append({'title': heading, 'url': firstURL})
# related topics
for ddg_result in search_res.get('RelatedTopics', []):
if 'FirstURL' in ddg_result:
- suggestion = result_to_text(ddg_result.get('FirstURL', None),
- ddg_result.get('Text', None),
- ddg_result.get('Result', None))
- if suggestion != heading:
- results.append({'suggestion': suggestion})
+ firstURL = ddg_result.get('FirstURL')
+ text = ddg_result.get('Text')
+ if not is_broken_text(text):
+ suggestion = result_to_text(text,
+ ddg_result.get('Result'))
+ if suggestion != heading and suggestion is not None:
+ results.append({'suggestion': suggestion})
elif 'Topics' in ddg_result:
suggestions = []
relatedTopics.append({'name': ddg_result.get('Name', ''),
- 'suggestions': suggestions})
+ 'suggestions': suggestions})
for topic_result in ddg_result.get('Topics', []):
- suggestion = result_to_text(topic_result.get('FirstURL', None),
- topic_result.get('Text', None),
- topic_result.get('Result', None))
- if suggestion != heading:
+ suggestion = result_to_text(topic_result.get('Text'),
+ topic_result.get('Result'))
+ if suggestion != heading and suggestion is not None:
suggestions.append(suggestion)
# abstract
@@ -110,7 +138,10 @@ def response(resp):
# add as result ? problem always in english
infobox_id = abstractURL
urls.append({'title': search_res.get('AbstractSource'),
- 'url': abstractURL})
+ 'url': abstractURL,
+ 'official': True})
+ results.append({'url': abstractURL,
+ 'title': heading})
# definition
definitionURL = search_res.get('DefinitionURL', '')
@@ -118,53 +149,107 @@ def response(resp):
# add as result ? as answer ? problem always in english
infobox_id = definitionURL
urls.append({'title': search_res.get('DefinitionSource'),
- 'url': definitionURL})
+ 'url': definitionURL})
# to merge with wikidata's infobox
if infobox_id:
- infobox_id = http_regex.sub('https:', infobox_id)
-
- # entity
- entity = search_res.get('Entity', None)
- # TODO continent / country / department / location / waterfall /
- # mountain range :
- # link to map search, get weather, near by locations
- # TODO musician : link to music search
- # TODO concert tour : ??
- # TODO film / actor / television / media franchise :
- # links to IMDB / rottentomatoes (or scrap result)
- # TODO music : link tu musicbrainz / last.fm
- # TODO book : ??
- # TODO artist / playwright : ??
- # TODO compagny : ??
- # TODO software / os : ??
- # TODO software engineer : ??
- # TODO prepared food : ??
- # TODO website : ??
- # TODO performing art : ??
- # TODO prepared food : ??
- # TODO programming language : ??
- # TODO file format : ??
+ infobox_id = replace_http_by_https(infobox_id)
+
+ # attributes
+ # some will be converted to urls
+ if 'Infobox' in search_res:
+ infobox = search_res.get('Infobox')
+ if 'content' in infobox:
+ osm_zoom = 17
+ coordinates = None
+ for info in infobox.get('content'):
+ data_type = info.get('data_type')
+ data_label = info.get('label')
+ data_value = info.get('value')
+
+ # Workaround: ddg may return a double quote
+ if data_value == '""':
+ continue
+
+ # Is it an external URL ?
+ # * imdb_id / facebook_profile / youtube_channel / youtube_video / twitter_profile
+ # * instagram_profile / rotten_tomatoes / spotify_artist_id / itunes_artist_id / soundcloud_id
+ # * netflix_id
+ external_url = get_external_url(data_type, data_value)
+ if external_url is not None:
+ urls.append({'title': data_label,
+ 'url': external_url})
+ elif data_type in ['instance', 'wiki_maps_trigger', 'google_play_artist_id']:
+ # ignore instance: Wikidata value from "Instance Of" (Qxxxx)
+ # ignore wiki_maps_trigger: reference to a javascript
+ # ignore google_play_artist_id: service shutdown
+ pass
+ elif data_type == 'string' and data_label == 'Website':
+ # There is already a URL for the website
+ pass
+ elif data_type == 'area':
+ attributes.append({'label': data_label,
+ 'value': area_to_str(data_value),
+ 'entity': 'P2046'})
+ osm_zoom = area_to_osm_zoom(data_value.get('amount'))
+ elif data_type == 'coordinates':
+ if data_value.get('globe') == 'http://www.wikidata.org/entity/Q2':
+ # coordinate on Earth
+ # get the zoom information from the area
+ coordinates = info
+ else:
+ # coordinate NOT on Earth
+ attributes.append({'label': data_label,
+ 'value': data_value,
+ 'entity': 'P625'})
+ elif data_type == 'string':
+ attributes.append({'label': data_label,
+ 'value': data_value})
+
+ if coordinates:
+ data_label = coordinates.get('label')
+ data_value = coordinates.get('value')
+ latitude = data_value.get('latitude')
+ longitude = data_value.get('longitude')
+ url = get_earth_coordinates_url(latitude, longitude, osm_zoom)
+ urls.append({'title': 'OpenStreetMap',
+ 'url': url,
+ 'entity': 'P625'})
if len(heading) > 0:
# TODO get infobox.meta.value where .label='article_title'
if image is None and len(attributes) == 0 and len(urls) == 1 and\
len(relatedTopics) == 0 and len(content) == 0:
- results.append({
- 'url': urls[0]['url'],
- 'title': heading,
- 'content': content
- })
+ results.append({'url': urls[0]['url'],
+ 'title': heading,
+ 'content': content})
else:
- results.append({
- 'infobox': heading,
- 'id': infobox_id,
- 'entity': entity,
- 'content': content,
- 'img_src': image,
- 'attributes': attributes,
- 'urls': urls,
- 'relatedTopics': relatedTopics
- })
+ results.append({'infobox': heading,
+ 'id': infobox_id,
+ 'content': content,
+ 'img_src': image,
+ 'attributes': attributes,
+ 'urls': urls,
+ 'relatedTopics': relatedTopics})
return results
+
+
+def unit_to_str(unit):
+    """Translate a Wikidata unit URL through WIKIDATA_UNITS.
+
+    The first WIKIDATA_PREFIX entry that `unit` starts with is stripped and the
+    remainder is looked up in WIKIDATA_UNITS. `unit` is returned unchanged when
+    no prefix matches or the entity is unknown; None is returned for None.
+    """
+    if unit is None:
+        # callers compare the result with None, so a missing unit must not raise
+        return None
+    for prefix in WIKIDATA_PREFIX:
+        if unit.startswith(prefix):
+            wikidata_entity = unit[len(prefix):]
+            return WIKIDATA_UNITS.get(wikidata_entity, unit)
+    return unit
+
+
+def area_to_str(area):
+    """Format an area value as '<amount> <unit>'.
+
+    `area` looks like
+    {'unit': 'http://www.wikidata.org/entity/Q712226', 'amount': '+20.99'}.
+    The amount is converted to float and the unit resolved with unit_to_str().
+    When the unit or the amount is missing, or the amount is not numeric, the
+    raw amount and unit found in `area` are returned instead.
+    """
+    raw_unit = area.get('unit')
+    # a missing unit goes straight to the raw fallback below
+    unit = unit_to_str(raw_unit) if raw_unit is not None else None
+    if unit is not None:
+        try:
+            amount = float(area.get('amount'))
+            return '{} {}'.format(amount, unit)
+        except (TypeError, ValueError):
+            # TypeError: no 'amount' (None); ValueError: 'amount' is not a number
+            pass
+    return '{} {}'.format(area.get('amount', ''), area.get('unit', ''))
diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py
index c557f4e59..01e873de9 100644
--- a/searx/engines/wikidata.py
+++ b/searx/engines/wikidata.py
@@ -3,501 +3,686 @@
Wikidata
@website https://wikidata.org
- @provide-api yes (https://wikidata.org/w/api.php)
+ @provide-api yes (https://query.wikidata.org/)
- @using-api partially (most things require scraping)
- @results JSON, HTML
- @stable no (html can change)
+ @using-api yes
+ @results JSON
+ @stable yes
@parse url, infobox
"""
-from searx import logger
-from searx.poolrequests import get
-from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
-from searx.utils import extract_text, match_language, eval_xpath
from urllib.parse import urlencode
from json import loads
-from lxml.html import fromstring
-from lxml import etree
+
+from dateutil.parser import isoparse
+from babel.dates import format_datetime, format_date, format_time, get_datetime_format
+
+from searx import logger
+from searx.data import WIKIDATA_UNITS
+from searx.poolrequests import post, get
+from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
+from searx.utils import match_language, searx_useragent, get_string_replaces_function
+from searx.external_urls import get_external_url, get_earth_coordinates_url, area_to_osm_zoom
logger = logger.getChild('wikidata')
-result_count = 1
-
-# urls
-wikidata_host = 'https://www.wikidata.org'
-url_search = wikidata_host \
- + '/w/index.php?{query}&ns0=1'
-
-wikidata_api = wikidata_host + '/w/api.php'
-url_detail = wikidata_api\
- + '?action=parse&format=json&{query}'\
- + '&redirects=1&prop=text%7Cdisplaytitle%7Cparsewarnings'\
- + '&disableeditsection=1&preview=1&sectionpreview=1&disabletoc=1&utf8=1&formatversion=2'
-
-url_map = 'https://www.openstreetmap.org/'\
- + '?lat={latitude}&lon={longitude}&zoom={zoom}&layers=M'
-url_image = 'https://commons.wikimedia.org/wiki/Special:FilePath/{filename}?width=500&height=400'
-
-# xpaths
-div_ids_xpath = '//div[@id]'
-wikidata_ids_xpath = '//ul[@class="mw-search-results"]/li//a/@href'
-title_xpath = '//*[contains(@class,"wikibase-title-label")]'
-description_xpath = '//div[contains(@class,"wikibase-entitytermsview-heading-description")]'
-label_xpath = './/div[contains(@class,"wikibase-statementgroupview-property-label")]/a'
-url_xpath = './/a[contains(@class,"external free") or contains(@class, "wb-external-id")]'
-wikilink_xpath = './/ul[contains(@class,"wikibase-sitelinklistview-listview")]'\
- + '/li[contains(@data-wb-siteid,"{wikiid}")]//a/@href'
-property_row_xpath = './/div[contains(@class,"wikibase-statementview")]'
-preferred_rank_xpath = './/span[contains(@class,"wikibase-rankselector-preferred")]'
-value_xpath = './/div[contains(@class,"wikibase-statementview-mainsnak")]'\
- + '/*/div[contains(@class,"wikibase-snakview-value")]'
-language_fallback_xpath = '//sup[contains(@class,"wb-language-fallback-indicator")]'
-calendar_name_xpath = './/sup[contains(@class,"wb-calendar-name")]'
-media_xpath = value_xpath + '//div[contains(@class,"commons-media-caption")]//a'
-
-
-def get_id_cache(result):
- id_cache = {}
- for e in eval_xpath(result, div_ids_xpath):
- id = e.get('id')
- if id.startswith('P'):
- id_cache[id] = e
- return id_cache
+# SPARQL
+SPARQL_ENDPOINT_URL = 'https://query.wikidata.org/sparql'
+SPARQL_EXPLAIN_URL = 'https://query.wikidata.org/bigdata/namespace/wdq/sparql?explain'
+WIKIDATA_PROPERTIES = {
+ 'P434': 'MusicBrainz',
+ 'P435': 'MusicBrainz',
+ 'P436': 'MusicBrainz',
+ 'P966': 'MusicBrainz',
+ 'P345': 'IMDb',
+ 'P2397': 'YouTube',
+ 'P1651': 'YouTube',
+ 'P2002': 'Twitter',
+ 'P2013': 'Facebook',
+ 'P2003': 'Instagram',
+}
+
+# SERVICE wikibase:mwapi : https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual/MWAPI
+# SERVICE wikibase:label: https://en.wikibooks.org/wiki/SPARQL/SERVICE_-_Label#Manual_Label_SERVICE
+# https://en.wikibooks.org/wiki/SPARQL/WIKIDATA_Precision,_Units_and_Coordinates
+# https://www.mediawiki.org/wiki/Wikibase/Indexing/RDF_Dump_Format#Data_model
+# optimization:
+# * https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/query_optimization
+# * https://github.com/blazegraph/database/wiki/QueryHints
+QUERY_TEMPLATE = """
+SELECT ?item ?itemLabel ?itemDescription ?lat ?long %SELECT%
+WHERE
+{
+ SERVICE wikibase:mwapi {
+ bd:serviceParam wikibase:endpoint "www.wikidata.org";
+ wikibase:api "EntitySearch";
+ wikibase:limit 1;
+ mwapi:search "%QUERY%";
+ mwapi:language "%LANGUAGE%".
+ ?item wikibase:apiOutputItem mwapi:item.
+ }
+
+ %WHERE%
+
+ SERVICE wikibase:label {
+ bd:serviceParam wikibase:language "%LANGUAGE%,en".
+ ?item rdfs:label ?itemLabel .
+ ?item schema:description ?itemDescription .
+ %WIKIBASE_LABELS%
+ }
+
+}
+GROUP BY ?item ?itemLabel ?itemDescription ?lat ?long %GROUP_BY%
+"""
-def request(query, params):
- params['url'] = url_search.format(
- query=urlencode({'search': query}))
- return params
+# Get the calendar names and the property names
+QUERY_PROPERTY_NAMES = """
+SELECT ?item ?name
+WHERE {
+ {
+ SELECT ?item
+ WHERE { ?item wdt:P279* wd:Q12132 }
+ } UNION {
+ VALUES ?item { %ATTRIBUTES% }
+ }
+ OPTIONAL { ?item rdfs:label ?name. }
+}
+"""
-def response(resp):
- results = []
- htmlparser = etree.HTMLParser()
- html = fromstring(resp.content.decode(), parser=htmlparser)
- search_results = eval_xpath(html, wikidata_ids_xpath)
+# https://www.w3.org/TR/sparql11-query/#rSTRING_LITERAL1
+# https://lists.w3.org/Archives/Public/public-rdf-dawg/2011OctDec/0175.html
+# Escape a text for use inside a SPARQL string literal.
+# An escape (ECHAR) is a backslash followed by one of t, n, r, b, f, ", ' or \ :
+# the control characters are replaced by backslash + letter, never by
+# backslash + the raw control character (which is not a valid literal).
+sparql_string_escape = get_string_replaces_function({'\t': '\\t',
+                                                     '\n': '\\n',
+                                                     '\r': '\\r',
+                                                     '\b': '\\b',
+                                                     '\f': '\\f',
+                                                     '\"': '\\\"',
+                                                     '\'': '\\\'',
+                                                     '\\': '\\\\'})
+
+replace_http_by_https = get_string_replaces_function({'http:': 'https:'})
+
+
+def get_headers():
+    """Return the HTTP headers for a SPARQL request: JSON results and the searx user agent."""
+    # user agent: https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits
+    headers = {'Accept': 'application/sparql-results+json'}
+    headers['User-Agent'] = searx_useragent()
+    return headers
+
+
+def get_label_for_entity(entity_id, language):
+    """Return the label to display for `entity_id`.
+
+    WIKIDATA_PROPERTIES is probed with the bare id, then with (id, language),
+    (id, part of language before the first '-') and (id, 'en'). The first
+    entry found wins; the id itself is returned when there is none.
+    """
+    label = WIKIDATA_PROPERTIES.get(entity_id)
+    if label is not None:
+        return label
+    label = WIKIDATA_PROPERTIES.get((entity_id, language))
+    if label is not None:
+        return label
+    label = WIKIDATA_PROPERTIES.get((entity_id, language.split('-')[0]))
+    if label is not None:
+        return label
+    label = WIKIDATA_PROPERTIES.get((entity_id, 'en'))
+    return entity_id if label is None else label
+
+
+def send_wikidata_query(query, method='GET'):
+    """Send `query` to SPARQL_ENDPOINT_URL and return the decoded JSON answer.
+
+    With method='GET' the query travels in the URL; any other value sends it
+    as POST form data. A non-200 answer is logged at debug level, then
+    raise_for_status() is called on the response before the body is parsed.
+    """
+    if method != 'GET':
+        # query won't be cached by wikidata
+        http_response = post(SPARQL_ENDPOINT_URL, data={'query': query}, headers=get_headers())
+    else:
+        # query will be cached by wikidata
+        url = SPARQL_ENDPOINT_URL + '?' + urlencode({'query': query})
+        http_response = get(url, headers=get_headers())
+    failed = http_response.status_code != 200
+    if failed:
+        logger.debug('SPARQL endpoint error %s', http_response.content.decode())
+    logger.debug('request time %s', str(http_response.elapsed))
+    http_response.raise_for_status()
+    body = http_response.content.decode()
+    return loads(body)
+
- if resp.search_params['language'].split('-')[0] == 'all':
+def request(query, params):
+ language = params['language'].split('-')[0]
+ if language == 'all':
language = 'en'
else:
- language = match_language(resp.search_params['language'], supported_languages, language_aliases).split('-')[0]
+ language = match_language(params['language'], supported_languages, language_aliases).split('-')[0]
+
+ query, attributes = get_query(query, language)
- # TODO: make requests asynchronous to avoid timeout when result_count > 1
- for search_result in search_results[:result_count]:
- wikidata_id = search_result.split('/')[-1]
- url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
- htmlresponse = get(url)
- jsonresponse = loads(htmlresponse.content.decode())
- results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'], htmlparser)
+ params['method'] = 'POST'
+ params['url'] = SPARQL_ENDPOINT_URL
+ params['data'] = {'query': query}
+ params['headers'] = get_headers()
+
+ params['language'] = language
+ params['attributes'] = attributes
+ return params
+
+
+def response(resp):
+    """Convert the SPARQL JSON answer into searx results.
+
+    Each binding row is reduced to {variable: value} and handed to
+    get_results() together with the attributes and language stored in
+    resp.search_params; a row whose 'item' was already handled is only logged.
+    """
+    results = []
+    if resp.status_code != 200:
+        logger.debug('SPARQL endpoint error %s', resp.content.decode())
+    resp.raise_for_status()
+    jsonresponse = loads(resp.content.decode())
+
+    language = resp.search_params['language'].lower()
+    attributes = resp.search_params['attributes']
+
+    seen_entities = set()
+
+    for binding in jsonresponse.get('results', {}).get('bindings', []):
+        # keep only the 'value' field of every cell of the row
+        row = {name: cell['value'] for name, cell in binding.items()}
+        entity_url = row['item']
+        if entity_url in seen_entities:
+            logger.debug('The SPARQL request returns duplicate entities: %s', str(row))
+            continue
+        seen_entities.add(entity_url)
+        results += get_results(row, attributes, language)
     return results
-def getDetail(jsonresponse, wikidata_id, language, locale, htmlparser):
+def get_results(attribute_result, attributes, language):
results = []
- urls = []
- attributes = []
+ infobox_title = attribute_result.get('itemLabel')
+ infobox_id = attribute_result['item']
+ infobox_id_lang = None
+ infobox_urls = []
+ infobox_attributes = []
+ infobox_content = attribute_result.get('itemDescription')
+ img_src = None
+ img_src_priority = 100
+
+ for attribute in attributes:
+ value = attribute.get_str(attribute_result, language)
+ if value is not None and value != '':
+ attribute_type = type(attribute)
+
+ if attribute_type in (WDURLAttribute, WDArticle):
+ # get_select() method : there is group_concat(distinct ...;separator=", ")
+ # split the value here
+ for url in value.split(', '):
+ infobox_urls.append({'title': attribute.get_label(language), 'url': url, **attribute.kwargs})
+ # "normal" results (not infobox) include official website and Wikipedia links.
+ if attribute.kwargs.get('official') or attribute_type == WDArticle:
+ results.append({'title': infobox_title, 'url': url})
+ # update the infobox_id with the wikipedia URL
+ # first the local wikipedia URL, and as fallback the english wikipedia URL
+ if attribute_type == WDArticle\
+ and ((attribute.language == 'en' and infobox_id_lang is None)
+ or attribute.language != 'en'):
+ infobox_id_lang = attribute.language
+ infobox_id = url
+ elif attribute_type == WDImageAttribute:
+ # this attribute is an image.
+ # replace the current image only if the priority is lower
+ # (the infobox contains only one image).
+ if attribute.priority < img_src_priority:
+ img_src = value
+ img_src_priority = attribute.priority
+ elif attribute_type == WDGeoAttribute:
+ # geocoordinate link
+ # use the area to get the OSM zoom
+ # Note: ignore the unit (must be km² otherwise the calculation is wrong)
+ # Should use normalized value p:P2046/psn:P2046/wikibase:quantityAmount
+ area = attribute_result.get('P2046')
+ osm_zoom = area_to_osm_zoom(area) if area else 19
+ url = attribute.get_str(attribute_result, language, osm_zoom=osm_zoom)
+ if url:
+ infobox_urls.append({'title': attribute.get_label(language),
+ 'url': url,
+ 'entity': attribute.name})
+ else:
+ infobox_attributes.append({'label': attribute.get_label(language),
+ 'value': value,
+ 'entity': attribute.name})
+
+ if infobox_id:
+ infobox_id = replace_http_by_https(infobox_id)
- title = jsonresponse.get('parse', {}).get('displaytitle', {})
- result = jsonresponse.get('parse', {}).get('text', {})
-
- if not title or not result:
- return results
-
- title = fromstring(title, parser=htmlparser)
- for elem in eval_xpath(title, language_fallback_xpath):
- elem.getparent().remove(elem)
- title = extract_text(eval_xpath(title, title_xpath))
-
- result = fromstring(result, parser=htmlparser)
- for elem in eval_xpath(result, language_fallback_xpath):
- elem.getparent().remove(elem)
-
- description = extract_text(eval_xpath(result, description_xpath))
-
- id_cache = get_id_cache(result)
-
- # URLS
-
- # official website
- add_url(urls, result, id_cache, 'P856', results=results)
-
- # wikipedia
- wikipedia_link_count = 0
- wikipedia_link = get_wikilink(result, language + 'wiki')
- if wikipedia_link:
- wikipedia_link_count += 1
- urls.append({'title': 'Wikipedia (' + language + ')',
- 'url': wikipedia_link})
-
- if language != 'en':
- wikipedia_en_link = get_wikilink(result, 'enwiki')
- if wikipedia_en_link:
- wikipedia_link_count += 1
- urls.append({'title': 'Wikipedia (en)',
- 'url': wikipedia_en_link})
-
- # TODO: get_wiki_firstlanguage
- # if wikipedia_link_count == 0:
-
- # more wikis
- add_url(urls, result, id_cache, default_label='Wikivoyage (' + language + ')', link_type=language + 'wikivoyage')
- add_url(urls, result, id_cache, default_label='Wikiquote (' + language + ')', link_type=language + 'wikiquote')
- add_url(urls, result, id_cache, default_label='Wikimedia Commons', link_type='commonswiki')
-
- add_url(urls, result, id_cache, 'P625', 'OpenStreetMap', link_type='geo')
-
- # musicbrainz
- add_url(urls, result, id_cache, 'P434', 'MusicBrainz', 'http://musicbrainz.org/artist/')
- add_url(urls, result, id_cache, 'P435', 'MusicBrainz', 'http://musicbrainz.org/work/')
- add_url(urls, result, id_cache, 'P436', 'MusicBrainz', 'http://musicbrainz.org/release-group/')
- add_url(urls, result, id_cache, 'P966', 'MusicBrainz', 'http://musicbrainz.org/label/')
-
- # IMDb
- add_url(urls, result, id_cache, 'P345', 'IMDb', 'https://www.imdb.com/', link_type='imdb')
- # source code repository
- add_url(urls, result, id_cache, 'P1324')
- # blog
- add_url(urls, result, id_cache, 'P1581')
- # social media links
- add_url(urls, result, id_cache, 'P2397', 'YouTube', 'https://www.youtube.com/channel/')
- add_url(urls, result, id_cache, 'P1651', 'YouTube', 'https://www.youtube.com/watch?v=')
- add_url(urls, result, id_cache, 'P2002', 'Twitter', 'https://twitter.com/')
- add_url(urls, result, id_cache, 'P2013', 'Facebook', 'https://facebook.com/')
- add_url(urls, result, id_cache, 'P2003', 'Instagram', 'https://instagram.com/')
-
- urls.append({'title': 'Wikidata',
- 'url': 'https://www.wikidata.org/wiki/'
- + wikidata_id + '?uselang=' + language})
-
- # INFOBOX ATTRIBUTES (ROWS)
-
- # DATES
- # inception date
- add_attribute(attributes, id_cache, 'P571', date=True)
- # dissolution date
- add_attribute(attributes, id_cache, 'P576', date=True)
- # start date
- add_attribute(attributes, id_cache, 'P580', date=True)
- # end date
- add_attribute(attributes, id_cache, 'P582', date=True)
- # date of birth
- add_attribute(attributes, id_cache, 'P569', date=True)
- # date of death
- add_attribute(attributes, id_cache, 'P570', date=True)
- # date of spacecraft launch
- add_attribute(attributes, id_cache, 'P619', date=True)
- # date of spacecraft landing
- add_attribute(attributes, id_cache, 'P620', date=True)
-
- # nationality
- add_attribute(attributes, id_cache, 'P27')
- # country of origin
- add_attribute(attributes, id_cache, 'P495')
- # country
- add_attribute(attributes, id_cache, 'P17')
- # headquarters
- add_attribute(attributes, id_cache, 'Q180')
-
- # PLACES
- # capital
- add_attribute(attributes, id_cache, 'P36', trim=True)
- # head of state
- add_attribute(attributes, id_cache, 'P35', trim=True)
- # head of government
- add_attribute(attributes, id_cache, 'P6', trim=True)
- # type of government
- add_attribute(attributes, id_cache, 'P122')
- # official language
- add_attribute(attributes, id_cache, 'P37')
- # population
- add_attribute(attributes, id_cache, 'P1082', trim=True)
- # area
- add_attribute(attributes, id_cache, 'P2046')
- # currency
- add_attribute(attributes, id_cache, 'P38', trim=True)
- # heigth (building)
- add_attribute(attributes, id_cache, 'P2048')
-
- # MEDIA
- # platform (videogames)
- add_attribute(attributes, id_cache, 'P400')
- # author
- add_attribute(attributes, id_cache, 'P50')
- # creator
- add_attribute(attributes, id_cache, 'P170')
- # director
- add_attribute(attributes, id_cache, 'P57')
- # performer
- add_attribute(attributes, id_cache, 'P175')
- # developer
- add_attribute(attributes, id_cache, 'P178')
- # producer
- add_attribute(attributes, id_cache, 'P162')
- # manufacturer
- add_attribute(attributes, id_cache, 'P176')
- # screenwriter
- add_attribute(attributes, id_cache, 'P58')
- # production company
- add_attribute(attributes, id_cache, 'P272')
- # record label
- add_attribute(attributes, id_cache, 'P264')
- # publisher
- add_attribute(attributes, id_cache, 'P123')
- # original network
- add_attribute(attributes, id_cache, 'P449')
- # distributor
- add_attribute(attributes, id_cache, 'P750')
- # composer
- add_attribute(attributes, id_cache, 'P86')
- # publication date
- add_attribute(attributes, id_cache, 'P577', date=True)
- # genre
- add_attribute(attributes, id_cache, 'P136')
- # original language
- add_attribute(attributes, id_cache, 'P364')
- # isbn
- add_attribute(attributes, id_cache, 'Q33057')
- # software license
- add_attribute(attributes, id_cache, 'P275')
- # programming language
- add_attribute(attributes, id_cache, 'P277')
- # version
- add_attribute(attributes, id_cache, 'P348', trim=True)
- # narrative location
- add_attribute(attributes, id_cache, 'P840')
-
- # LANGUAGES
- # number of speakers
- add_attribute(attributes, id_cache, 'P1098')
- # writing system
- add_attribute(attributes, id_cache, 'P282')
- # regulatory body
- add_attribute(attributes, id_cache, 'P1018')
- # language code
- add_attribute(attributes, id_cache, 'P218')
-
- # OTHER
- # ceo
- add_attribute(attributes, id_cache, 'P169', trim=True)
- # founder
- add_attribute(attributes, id_cache, 'P112')
- # legal form (company/organization)
- add_attribute(attributes, id_cache, 'P1454')
- # operator
- add_attribute(attributes, id_cache, 'P137')
- # crew members (tripulation)
- add_attribute(attributes, id_cache, 'P1029')
- # taxon
- add_attribute(attributes, id_cache, 'P225')
- # chemical formula
- add_attribute(attributes, id_cache, 'P274')
- # winner (sports/contests)
- add_attribute(attributes, id_cache, 'P1346')
- # number of deaths
- add_attribute(attributes, id_cache, 'P1120')
- # currency code
- add_attribute(attributes, id_cache, 'P498')
-
- image = add_image(id_cache)
-
- if len(attributes) == 0 and len(urls) == 2 and len(description) == 0:
+ # add the wikidata URL at the end
+ infobox_urls.append({'title': 'Wikidata', 'url': attribute_result['item']})
+
+ if img_src is None and len(infobox_attributes) == 0 and len(infobox_urls) == 1 and\
+ len(infobox_content) == 0:
results.append({
- 'url': urls[0]['url'],
- 'title': title,
- 'content': description
- })
+ 'url': infobox_urls[0]['url'],
+ 'title': infobox_title,
+ 'content': infobox_content
+ })
else:
results.append({
- 'infobox': title,
- 'id': wikipedia_link,
- 'content': description,
- 'img_src': image,
- 'attributes': attributes,
- 'urls': urls
- })
-
+ 'infobox': infobox_title,
+ 'id': infobox_id,
+ 'content': infobox_content,
+ 'img_src': img_src,
+ 'urls': infobox_urls,
+ 'attributes': infobox_attributes
+ })
return results
-# only returns first match
-def add_image(id_cache):
- # P15: route map, P242: locator map, P154: logo, P18: image, P242: map, P41: flag, P2716: collage, P2910: icon
- property_ids = ['P15', 'P242', 'P154', 'P18', 'P242', 'P41', 'P2716', 'P2910']
+def get_query(query, language):
+    """Build the SPARQL request for the user text `query` in `language`.
+
+    Returns a tuple (sparql_text, attributes). `attributes` is the list from
+    get_attributes(language); the select / where / label / group-by fragments
+    of its items are spliced into QUERY_TEMPLATE (empty fragments are skipped
+    for all but the select list).
+    """
+    attributes = get_attributes(language)
+    select = [a.get_select() for a in attributes]
+    where = [s for s in (a.get_where() for a in attributes) if len(s) > 0]
+    wikibase_label = [s for s in (a.get_wikibase_label() for a in attributes) if len(s) > 0]
+    group_by = [s for s in (a.get_group_by() for a in attributes) if len(s) > 0]
+    # The escaped user text is substituted last: inserted first, a placeholder
+    # name typed by the user (e.g. "%WHERE%") would be expanded by the
+    # following replaces.
+    sparql = QUERY_TEMPLATE\
+        .replace('%SELECT%', ' '.join(select))\
+        .replace('%WHERE%', '\n  '.join(where))\
+        .replace('%WIKIBASE_LABELS%', '\n      '.join(wikibase_label))\
+        .replace('%GROUP_BY%', ' '.join(group_by))\
+        .replace('%LANGUAGE%', language)\
+        .replace('%QUERY%', sparql_string_escape(query))
+    return sparql, attributes
- for property_id in property_ids:
- image = id_cache.get(property_id, None)
- if image is not None:
- image_name = eval_xpath(image, media_xpath)
- image_src = url_image.replace('{filename}', extract_text(image_name[0]))
- return image_src
+def get_attributes(language):
+ attributes = []
-# setting trim will only returned high ranked rows OR the first row
-def add_attribute(attributes, id_cache, property_id, default_label=None, date=False, trim=False):
- attribute = id_cache.get(property_id, None)
- if attribute is not None:
+ def add_value(name):
+ attributes.append(WDAttribute(name))
+
+ def add_amount(name):
+ attributes.append(WDAmountAttribute(name))
+
+ def add_label(name):
+ attributes.append(WDLabelAttribute(name))
+
+ def add_url(name, url_id=None, **kwargs):
+ attributes.append(WDURLAttribute(name, url_id, kwargs))
+
+ def add_image(name, url_id=None, priority=1):
+ attributes.append(WDImageAttribute(name, url_id, priority))
+
+ def add_date(name):
+ attributes.append(WDDateAttribute(name))
+
+ # Dates
+ for p in ['P571', # inception date
+ 'P576', # dissolution date
+ 'P580', # start date
+ 'P582', # end date
+ 'P569', # date of birth
+ 'P570', # date of death
+ 'P619', # date of spacecraft launch
+ 'P620']: # date of spacecraft landing
+ add_date(p)
+
+ for p in ['P27', # country of citizenship
+ 'P495', # country of origin
+ 'P17', # country
+ 'P159']: # headquarters location
+ add_label(p)
+
+ # Places
+ for p in ['P36', # capital
+ 'P35', # head of state
+ 'P6', # head of government
+ 'P122', # basic form of government
+ 'P37']: # official language
+ add_label(p)
+
+ add_value('P1082') # population
+ add_amount('P2046') # area
+ add_amount('P281') # postal code
+ add_label('P38') # currency
+ add_amount('P2048') # heigth (building)
+
+ # Media
+ for p in ['P400', # platform (videogames, computing)
+ 'P50', # author
+ 'P170', # creator
+ 'P57', # director
+ 'P175', # performer
+ 'P178', # developer
+ 'P162', # producer
+ 'P176', # manufacturer
+ 'P58', # screenwriter
+ 'P272', # production company
+ 'P264', # record label
+ 'P123', # publisher
+ 'P449', # original network
+ 'P750', # distributed by
+ 'P86']: # composer
+ add_label(p)
+
+ add_date('P577') # publication date
+ add_label('P136') # genre (music, film, artistic...)
+ add_label('P364') # original language
+ add_value('P212') # ISBN-13
+ add_value('P957') # ISBN-10
+ add_label('P275') # copyright license
+ add_label('P277') # programming language
+ add_value('P348') # version
+ add_label('P840') # narrative location
+
+ # Languages
+ add_value('P1098') # number of speakers
+ add_label('P282') # writing system
+ add_label('P1018') # language regulatory body
+ add_value('P218') # language code (ISO 639-1)
+
+ # Other
+ add_label('P169') # ceo
+ add_label('P112') # founded by
+ add_label('P1454') # legal form (company, organization)
+ add_label('P137') # operator (service, facility, ...)
+ add_label('P1029') # crew members (tripulation)
+ add_label('P225') # taxon name
+ add_value('P274') # chemical formula
+ add_label('P1346') # winner (sports, contests, ...)
+ add_value('P1120') # number of deaths
+ add_value('P498') # currency code (ISO 4217)
+
+ # URL
+ add_url('P856', official=True) # official website
+ attributes.append(WDArticle(language)) # wikipedia (user language)
+ if not language.startswith('en'):
+ attributes.append(WDArticle('en')) # wikipedia (english)
+
+ add_url('P1324') # source code repository
+ add_url('P1581') # blog
+ add_url('P434', url_id='musicbrainz_artist')
+ add_url('P435', url_id='musicbrainz_work')
+ add_url('P436', url_id='musicbrainz_release_group')
+ add_url('P966', url_id='musicbrainz_label')
+ add_url('P345', url_id='imdb_id')
+ add_url('P2397', url_id='youtube_channel')
+ add_url('P1651', url_id='youtube_video')
+ add_url('P2002', url_id='twitter_profile')
+ add_url('P2013', url_id='facebook_profile')
+ add_url('P2003', url_id='instagram_profile')
+
+ # Map
+ attributes.append(WDGeoAttribute('P625'))
+
+ # Image
+ add_image('P15', priority=1, url_id='wikimedia_image') # route map
+ add_image('P242', priority=2, url_id='wikimedia_image') # locator map
+ add_image('P154', priority=3, url_id='wikimedia_image') # logo
+ add_image('P18', priority=4, url_id='wikimedia_image') # image
+ add_image('P41', priority=5, url_id='wikimedia_image') # flag
+ add_image('P2716', priority=6, url_id='wikimedia_image') # collage
+ add_image('P2910', priority=7, url_id='wikimedia_image') # icon
+
+ return attributes
+
+
+class WDAttribute:
+
+ __slots__ = 'name',
+
+ def __init__(self, name):
+ self.name = name
+
+ def get_select(self):
+ return '(group_concat(distinct ?{name};separator=", ") as ?{name}s)'.replace('{name}', self.name)
+
+ def get_label(self, language):
+ return get_label_for_entity(self.name, language)
+
+ def get_where(self):
+ return "OPTIONAL { ?item wdt:{name} ?{name} . }".replace('{name}', self.name)
+
+ def get_wikibase_label(self):
+ return ""
+
+ def get_group_by(self):
+ return ""
+
+ def get_str(self, result, language):
+ return result.get(self.name + 's')
- if default_label:
- label = default_label
- else:
- label = extract_text(eval_xpath(attribute, label_xpath))
- label = label[0].upper() + label[1:]
-
- if date:
- trim = True
- # remove calendar name
- calendar_name = eval_xpath(attribute, calendar_name_xpath)
- for calendar in calendar_name:
- calendar.getparent().remove(calendar)
-
- concat_values = ""
- values = []
- first_value = None
- for row in eval_xpath(attribute, property_row_xpath):
- if not first_value or not trim or eval_xpath(row, preferred_rank_xpath):
- value = eval_xpath(row, value_xpath)
- if not value:
- continue
- value = extract_text(value)
-
- # save first value in case no ranked row is found
- if trim and not first_value:
- first_value = value
- else:
- # to avoid duplicate values
- if value not in values:
- concat_values += value + ", "
- values.append(value)
-
- if trim and not values:
- attributes.append({'label': label,
- 'value': first_value})
- else:
- attributes.append({'label': label,
- 'value': concat_values[:-2]})
+ def __repr__(self):
+ return '<' + str(type(self).__name__) + ':' + self.name + '>'
-# requires property_id unless it's a wiki link (defined in link_type)
-def add_url(urls, result, id_cache, property_id=None, default_label=None, url_prefix=None, results=None,
- link_type=None, only_first=True):
- links = []
+class WDAmountAttribute(WDAttribute):
- # wiki links don't have property in wikidata page
- if link_type and 'wiki' in link_type:
- links.append(get_wikilink(result, link_type))
- else:
- dom_element = id_cache.get(property_id, None)
- if dom_element is not None:
- if not default_label:
- label = extract_text(eval_xpath(dom_element, label_xpath))
- label = label[0].upper() + label[1:]
+ def get_select(self):
+ return '?{name} ?{name}Unit'.replace('{name}', self.name)
- if link_type == 'geo':
- links.append(get_geolink(dom_element))
+ def get_where(self):
+ return """ OPTIONAL { ?item p:{name} ?{name}Node .
+ ?{name}Node rdf:type wikibase:BestRank ; ps:{name} ?{name} .
+ OPTIONAL { ?{name}Node psv:{name}/wikibase:quantityUnit ?{name}Unit. } }""".replace('{name}', self.name)
- elif link_type == 'imdb':
- links.append(get_imdblink(dom_element, url_prefix))
+ def get_group_by(self):
+ # GROUP BY uses exactly the variables returned by get_select()
+ return self.get_select()
- else:
- url_results = eval_xpath(dom_element, url_xpath)
- for link in url_results:
- if link is not None:
- if url_prefix:
- link = url_prefix + extract_text(link)
- else:
- link = extract_text(link)
- links.append(link)
-
- # append urls
- for url in links:
- if url is not None:
- u = {'title': default_label or label, 'url': url}
- if property_id == 'P856':
- u['official'] = True
- u['domain'] = url.split('/')[2]
- urls.append(u)
- if results is not None:
- results.append(u)
- if only_first:
- break
-
-
-def get_imdblink(result, url_prefix):
- imdb_id = eval_xpath(result, value_xpath)
- if imdb_id:
- imdb_id = extract_text(imdb_id)
- id_prefix = imdb_id[:2]
- if id_prefix == 'tt':
- url = url_prefix + 'title/' + imdb_id
- elif id_prefix == 'nm':
- url = url_prefix + 'name/' + imdb_id
- elif id_prefix == 'ch':
- url = url_prefix + 'character/' + imdb_id
- elif id_prefix == 'co':
- url = url_prefix + 'company/' + imdb_id
- elif id_prefix == 'ev':
- url = url_prefix + 'event/' + imdb_id
- else:
- url = None
- return url
+    def get_str(self, result, language):
+        """Return the amount, followed by the label of its unit when a unit is present."""
+        amount = result.get(self.name)
+        unit_uri = result.get(self.name + "Unit")
+        if unit_uri is None:
+            return amount
+        # keep only the entity id of the unit URI before looking up its label
+        unit_id = unit_uri.replace('http://www.wikidata.org/entity/', '')
+        return amount + " " + get_label_for_entity(unit_id, language)
-def get_geolink(result):
- coordinates = eval_xpath(result, value_xpath)
- if not coordinates:
- return None
- coordinates = extract_text(coordinates[0])
- latitude, longitude = coordinates.split(',')
-
- # convert to decimal
- lat = int(latitude[:latitude.find('°')])
- if latitude.find('\'') >= 0:
- lat += int(latitude[latitude.find('°') + 1:latitude.find('\'')] or 0) / 60.0
- if latitude.find('"') >= 0:
- lat += float(latitude[latitude.find('\'') + 1:latitude.find('"')] or 0) / 3600.0
- if latitude.find('S') >= 0:
- lat *= -1
- lon = int(longitude[:longitude.find('°')])
- if longitude.find('\'') >= 0:
- lon += int(longitude[longitude.find('°') + 1:longitude.find('\'')] or 0) / 60.0
- if longitude.find('"') >= 0:
- lon += float(longitude[longitude.find('\'') + 1:longitude.find('"')] or 0) / 3600.0
- if longitude.find('W') >= 0:
- lon *= -1
-
- # TODO: get precision
- precision = 0.0002
- # there is no zoom information, deduce from precision (error prone)
- # samples :
- # 13 --> 5
- # 1 --> 6
- # 0.016666666666667 --> 9
- # 0.00027777777777778 --> 19
- # wolframalpha :
- # quadratic fit { {13, 5}, {1, 6}, {0.0166666, 9}, {0.0002777777,19}}
- # 14.1186-8.8322 x+0.625447 x^2
- if precision < 0.0003:
- zoom = 19
- else:
- zoom = int(15 - precision * 8.8322 + precision * precision * 0.625447)
+class WDArticle(WDAttribute):
+    """Wikipedia article about the item, in one given language."""
+
+    __slots__ = 'language', 'kwargs'
+
+    def __init__(self, language, kwargs=None):
+        super().__init__('wikipedia')
+        self.language = language
+        self.kwargs = kwargs or {}
+
+    def _with_language(self, template):
+        """Replace the ``{language}`` placeholder of ``template`` with the article language."""
+        return template.replace('{language}', self.language)
+
+    def get_label(self, language):
+        # the language parameter is ignored: the label shows the article language
+        return self._with_language("Wikipedia ({language})")
+
+    def get_select(self):
+        return self._with_language("?article{language} ?articleName{language}")
+
+    def get_where(self):
+        return self._with_language("""OPTIONAL { ?article{language} schema:about ?item ;
+ schema:inLanguage "{language}" ;
+ schema:isPartOf <https://{language}.wikipedia.org/> ;
+ schema:name ?articleName{language} . }""")
+
+    def get_group_by(self):
+        # GROUP BY uses exactly the variables returned by get_select()
+        return self.get_select()
+
+    def get_str(self, result, language):
+        # the article URL is stored under 'article<language>' in the result
+        return result.get(self._with_language('article{language}'))
+
+
+class WDLabelAttribute(WDAttribute):
+
+    def get_select(self):
+        """Return the SELECT expression concatenating the distinct labels with ", "."""
+        expression = '(group_concat(distinct ?{name}Label;separator=", ") as ?{name}Labels)'
+        return expression.replace('{name}', self.name)
- url = url_map\
- .replace('{latitude}', str(lat))\
- .replace('{longitude}', str(lon))\
- .replace('{zoom}', str(zoom))
+    def get_where(self):
+        """Return the optional WHERE clause binding the property value."""
+        clause = "OPTIONAL { ?item wdt:{name} ?{name} . }"
+        return clause.replace('{name}', self.name)
- return url
+    def get_wikibase_label(self):
+        """Return the triple pattern binding the ``rdfs:label`` of the property value."""
+        pattern = "?{name} rdfs:label ?{name}Label ."
+        return pattern.replace('{name}', self.name)
+    def get_str(self, result, language):
+        """Return the concatenated labels stored under ``<name>Labels``, or None."""
+        key = self.name + 'Labels'
+        return result.get(key)
-def get_wikilink(result, wikiid):
- url = eval_xpath(result, wikilink_xpath.replace('{wikiid}', wikiid))
- if not url:
+
+class WDURLAttribute(WDAttribute):
+    """Attribute whose value is a URL, or an identifier that is turned into a URL."""
+
+    HTTP_WIKIMEDIA_IMAGE = 'http://commons.wikimedia.org/wiki/Special:FilePath/'
+
+    __slots__ = 'url_id', 'kwargs'
+
+    def __init__(self, name, url_id=None, kwargs=None):
+        super().__init__(name)
+        self.url_id = url_id
+        self.kwargs = kwargs
+
+    def get_str(self, result, language):
+        """Return the value stored under ``<name>s``, converted to an external URL when url_id is set.
+
+        Without url_id, or when the value is missing or empty, the stored value is returned as is.
+        """
+        raw = result.get(self.name + 's')
+        if not self.url_id or raw is None or raw == '':
+            return raw
+        # only the part before the first comma is used
+        item_id = raw.split(',')[0]
+        image_prefix = WDURLAttribute.HTTP_WIKIMEDIA_IMAGE
+        if item_id.startswith(image_prefix):
+            # Wikimedia file paths always use the 'wikimedia_image' URL, whatever url_id is
+            return get_external_url('wikimedia_image', item_id[len(image_prefix):])
+        return get_external_url(self.url_id, item_id)
+
+
+class WDGeoAttribute(WDAttribute):
+ # Attribute made of a latitude and a longitude, rendered as a map URL.
+
+ def get_label(self, language):
+ # fixed label: the language parameter is not used
+ return "OpenStreetMap"
+
+ def get_select(self):
+ return "?{name}Lat ?{name}Long".replace('{name}', self.name)
+
+ def get_where(self):
+ # binds the geoLatitude and geoLongitude of the property value node
+ return """OPTIONAL { ?item p:{name}/psv:{name} [
+ wikibase:geoLatitude ?{name}Lat ;
+ wikibase:geoLongitude ?{name}Long ] }""".replace('{name}', self.name)
+
+ def get_group_by(self):
+ # GROUP BY uses exactly the variables returned by get_select()
+ return self.get_select()
+
+ def get_str(self, result, language, osm_zoom=19):
+ # Returns the URL built by get_earth_coordinates_url, or None unless both
+ # the latitude and the longitude are present and truthy in the result.
+ latitude = result.get(self.name + 'Lat')
+ longitude = result.get(self.name + 'Long')
+ if latitude and longitude:
+ return get_earth_coordinates_url(latitude, longitude, osm_zoom)
return None
- url = url[0]
- if url.startswith('http://'):
- url = url.replace('http://', 'https://')
- elif url.startswith('//'):
- url = 'https:' + url
- return url
+
+
+class WDImageAttribute(WDURLAttribute):
+    """URL attribute that additionally carries a ``priority`` value (100 by default)."""
+
+    __slots__ = ('priority',)
+
+    def __init__(self, name, url_id=None, priority=100):
+        super().__init__(name, url_id)
+        self.priority = priority
+
+
+class WDDateAttribute(WDAttribute):
+    """Date attribute, formatted according to the time precision returned with the value."""
+
+    def get_select(self):
+        variables = '?{name} ?{name}timePrecision ?{name}timeZone ?{name}timeCalendar'
+        return variables.replace('{name}', self.name)
+
+    def get_where(self):
+        # Duplicates could be removed by adding
+        # FILTER NOT EXISTS { ?item p:{name}/psv:{name}/wikibase:timeValue ?{name}bis FILTER (?{name}bis < ?{name}) }
+        # but this filter is too slow, so the response function ignores duplicate results instead
+        # (see the seen_entities variable)
+        clause = """OPTIONAL { ?item p:{name}/psv:{name} [
+ wikibase:timeValue ?{name} ;
+ wikibase:timePrecision ?{name}timePrecision ;
+ wikibase:timeTimezone ?{name}timeZone ;
+ wikibase:timeCalendarModel ?{name}timeCalendar ] . }
+ hint:Prior hint:rangeSafe true;"""
+        return clause.replace('{name}', self.name)
+
+    def get_group_by(self):
+        # GROUP BY uses exactly the variables returned by get_select()
+        return self.get_select()
+
+    def format_8(self, value, locale):
+        # precision: less than a year -> the value is returned unchanged
+        return value
+
+    def format_9(self, value, locale):
+        # precision: year
+        year = int(value)
+        if year >= 1584:
+            return format_date(isoparse(value), format='yyyy', locale=locale)
+        # years before 1584 are returned as plain numbers; negative years are shifted by one
+        return str(year - 1) if year < 0 else str(year)
+
+    def format_10(self, value, locale):
+        # precision: month
+        return format_date(isoparse(value), format='MMMM y', locale=locale)
+
+    def format_11(self, value, locale):
+        # precision: day
+        return format_date(isoparse(value), format='full', locale=locale)
+
+    def format_13(self, value, locale):
+        # precision: minute
+        timestamp = isoparse(value)
+        # NOTE(review): `format` is not a local name here; unless the module defines one,
+        # this passes the `format` builtin to get_datetime_format - confirm the intended format name.
+        pattern = get_datetime_format(format, locale=locale).replace("'", "")
+        pattern = pattern.replace('{0}', format_time(timestamp, 'full', tzinfo=None,
+                                                     locale=locale))
+        return pattern.replace('{1}', format_date(timestamp, 'short', locale=locale))
+
+    def format_14(self, value, locale):
+        # precision: second.
+        timestamp = isoparse(value)
+        return format_datetime(timestamp, format='full', locale=locale)
+
+    # time precision -> (name of the formatting method, second field);
+    # get_str reduces the value to its year part when the second field is >= 1
+    DATE_FORMAT = {
+        '0': ('format_8', 1000000000),
+        '1': ('format_8', 100000000),
+        '2': ('format_8', 10000000),
+        '3': ('format_8', 1000000),
+        '4': ('format_8', 100000),
+        '5': ('format_8', 10000),
+        '6': ('format_8', 1000),
+        '7': ('format_8', 100),
+        '8': ('format_8', 10),
+        '9': ('format_9', 1),  # year
+        '10': ('format_10', 1),  # month
+        '11': ('format_11', 0),  # day
+        '12': ('format_13', 0),  # hour (not supported by babel, display minute)
+        '13': ('format_13', 0),  # minute
+        '14': ('format_14', 0),  # second
+    }
+
+    def get_str(self, result, language):
+        """Return the formatted date, the raw value if it cannot be formatted, or None if missing."""
+        value = result.get(self.name)
+        if value is None or value == '':
+            return None
+        date_format = WDDateAttribute.DATE_FORMAT.get(result.get(self.name + 'timePrecision'))
+        if date_format is None:
+            # unknown precision: return the value as it is
+            return value
+        method_name, year_only = date_format
+        format_method = getattr(self, method_name)
+        try:
+            if year_only >= 1:
+                # keep only the year, including the leading '-' of a negative year
+                parts = value.split('-')
+                value = '-' + parts[1] if value.startswith('-') else parts[0]
+            return format_method(value, language)
+        except Exception:
+            # best effort: fall back to the (possibly shortened) value
+            return value
+
+
+def debug_explain_wikidata_query(query, method='GET'):
+    """Send ``query`` to SPARQL_EXPLAIN_URL and return the raw content of the response.
+
+    The query is sent in the URL when method is 'GET', in the POST data otherwise.
+    An HTTP error status raises through ``raise_for_status()``.
+    """
+    if method == 'GET':
+        explain_url = SPARQL_EXPLAIN_URL + '&' + urlencode({'query': query})
+        http_response = get(explain_url, headers=get_headers())
+    else:
+        form_data = {'query': query}
+        http_response = post(SPARQL_EXPLAIN_URL, data=form_data, headers=get_headers())
+    http_response.raise_for_status()
+    return http_response.content
+
+
+def init(engine_settings=None):
+    """Fill WIKIDATA_PROPERTIES with the unit symbols and the property labels.
+
+    The names of the properties that are not already in WIKIDATA_PROPERTIES are
+    requested with QUERY_PROPERTY_NAMES and stored under ``(entity_id, language)`` keys.
+    """
+    # WIKIDATA_PROPERTIES : add unit symbols
+    WIKIDATA_PROPERTIES.update(WIKIDATA_UNITS)
+
+    # WIKIDATA_PROPERTIES : add property labels
+    # exact type match: subclasses of the listed classes are not selected
+    labelled_types = (WDAttribute, WDAmountAttribute, WDURLAttribute, WDDateAttribute, WDLabelAttribute)
+    wikidata_property_names = [
+        "wd:" + attribute.name
+        for attribute in get_attributes('en')
+        if type(attribute) in labelled_types and attribute.name not in WIKIDATA_PROPERTIES
+    ]
+    query = QUERY_PROPERTY_NAMES.replace('%ATTRIBUTES%', " ".join(wikidata_property_names))
+    bindings = send_wikidata_query(query).get('results', {}).get('bindings', {})
+    for binding in bindings:
+        label = binding['name']
+        name, lang = label['value'], label['xml:lang']
+        entity_id = binding['item']['value'].replace('http://www.wikidata.org/entity/', '')
+        WIKIDATA_PROPERTIES[(entity_id, lang)] = name.capitalize()
diff --git a/searx/external_urls.py b/searx/external_urls.py
new file mode 100644
index 000000000..da58b8f54
--- /dev/null
+++ b/searx/external_urls.py
@@ -0,0 +1,77 @@
+import math
+
+from searx.data import EXTERNAL_URLS
+
+
+# Maps the two-letter prefix of an IMDb identifier to a url_id.
+# IMDb person identifiers start with 'nm' (the previous 'mn' key never matched them).
+IMDB_PREFIX_TO_URL_ID = {
+    'tt': 'imdb_title',
+    'nm': 'imdb_name',
+    'ch': 'imdb_character',
+    'co': 'imdb_company',
+    'ev': 'imdb_event'
+}
+
+
+def get_imdb_url_id(imdb_item_id):
+    """Return the url_id matching the two-letter prefix of imdb_item_id, or None if unknown."""
+    id_prefix = imdb_item_id[:2]
+    return IMDB_PREFIX_TO_URL_ID.get(id_prefix)
+
+
+def get_external_url(url_id, item_id, alternative="default"):
+    """Return an external URL or None if url_id is not found.
+
+    url_id can take value from data/external_urls.json
+    The "imdb_id" value is automatically converted according to the item_id value.
+
+    If item_id is None, the raw URL with the $1 is returned.
+    """
+    if url_id == 'imdb_id' and item_id is not None:
+        url_id = get_imdb_url_id(item_id)
+
+    url_description = EXTERNAL_URLS.get(url_id)
+    if not url_description:
+        return None
+    url_template = url_description["urls"].get(alternative)
+    if url_template is None or item_id is None:
+        # unknown alternative (None), or no item: the raw template is returned
+        return url_template
+    return url_template.replace('$1', item_id)
+
+
+def get_earth_coordinates_url(latitude, longitude, osm_zoom, alternative='default'):
+    """Return the 'map' external URL filled with the coordinates and the zoom.
+
+    The ``${latitude}``, ``${longitude}`` and ``${zoom}`` placeholders of the
+    template are replaced by the string form of the arguments.
+
+    Returns None when get_external_url has no 'map' URL for ``alternative``.
+    """
+    url_template = get_external_url('map', None, alternative)
+    if url_template is None:
+        # no template to fill: avoid calling .replace on None
+        return None
+    return url_template\
+        .replace('${latitude}', str(latitude))\
+        .replace('${longitude}', str(longitude))\
+        .replace('${zoom}', str(osm_zoom))
+
+
+def area_to_osm_zoom(area):
+    """Convert an area in km² into an OSM zoom. Less reliable if the shape is not round.
+
+    logarithm regression using these data:
+     * 9596961 -> 4 (China)
+     * 3287263 -> 5 (India)
+     * 643801 -> 6 (France)
+     * 6028 -> 9
+     * 1214 -> 10
+     * 891 -> 12
+     * 12 -> 13
+
+    In WolframAlpha:
+        >>> log fit {9596961,15},{3287263, 14},{643801,13},{6028,10},{1214,9},{891,7},{12,6}
+
+    with 15 = 19-4 (China); 14 = 19-5 (India) and so on
+
+    Args:
+        area (int,float,str): area in km²
+
+    Returns:
+        int: OSM zoom (between 0 and 19), or 19 if area is not a usable number
+        (None, non-numeric text, zero, negative, NaN or infinite)
+    """
+    try:
+        amount = float(area)
+        return max(0, min(19, round(19 - 0.688297 * math.log(226.878 * amount))))
+    except (TypeError, ValueError, OverflowError):
+        # TypeError: float(None) and other non-convertible types
+        # ValueError: non-numeric text, log of zero or of a negative number, round(nan)
+        # OverflowError: round() of an infinite value
+        return 19
diff --git a/searx/results.py b/searx/results.py
index e4cad2e24..34a94511a 100644
--- a/searx/results.py
+++ b/searx/results.py
@@ -20,6 +20,18 @@ def result_content_len(content):
def compare_urls(url_a, url_b):
+ """Lazy compare between two URL.
+ "www.example.com" and "example.com" are equals.
+ "www.example.com/path/" and "www.example.com/path" are equals.
+ "https://www.example.com/" and "http://www.example.com/" are equals.
+
+ Args:
+ url_a (ParseResult): first URL
+ url_b (ParseResult): second URL
+
+ Returns:
+ bool: True if url_a and url_b are equals
+ """
# ignore www. in comparison
if url_a.netloc.startswith('www.'):
host_a = url_a.netloc.replace('www.', '', 1)
@@ -68,8 +80,10 @@ def merge_two_infoboxes(infobox1, infobox2):
for url2 in infobox2.get('urls', []):
unique_url = True
parsed_url2 = urlparse(url2.get('url', ''))
+ entity_url2 = url2.get('entity')
for url1 in urls1:
- if compare_urls(urlparse(url1.get('url', '')), parsed_url2):
+ if (entity_url2 is not None and url1.get('entity') == entity_url2)\
+ or compare_urls(urlparse(url1.get('url', '')), parsed_url2):
unique_url = False
break
if unique_url:
@@ -86,18 +100,22 @@ def merge_two_infoboxes(infobox1, infobox2):
infobox1['img_src'] = img2
if 'attributes' in infobox2:
- attributes1 = infobox1.get('attributes', None)
+ attributes1 = infobox1.get('attributes')
if attributes1 is None:
- attributes1 = []
- infobox1['attributes'] = attributes1
+ infobox1['attributes'] = attributes1 = []
attributeSet = set()
- for attribute in infobox1.get('attributes', []):
- if attribute.get('label', None) not in attributeSet:
- attributeSet.add(attribute.get('label', None))
+ for attribute in attributes1:
+ label = attribute.get('label')
+ if label not in attributeSet:
+ attributeSet.add(label)
+ entity = attribute.get('entity')
+ if entity not in attributeSet:
+ attributeSet.add(entity)
for attribute in infobox2.get('attributes', []):
- if attribute.get('label', None) not in attributeSet:
+ if attribute.get('label') not in attributeSet\
+ and attribute.get('entity') not in attributeSet:
attributes1.append(attribute)
if 'content' in infobox2:
diff --git a/searx/templates/oscar/infobox.html b/searx/templates/oscar/infobox.html
index 5ba4aa5f0..8a12b8074 100644
--- a/searx/templates/oscar/infobox.html
+++ b/searx/templates/oscar/infobox.html
@@ -25,11 +25,7 @@
{%- if attribute.image -%}
<td><img class="img-responsive" src="{{ image_proxify(attribute.image.src) }}" alt="{{ attribute.image.alt }}" /></td>
{%- else -%}
- {% if attribute.label == 'Instance of' %}
- <td><bdi><a href="https://wikidata.org/wiki/{{ attribute.value.id }}">{{ attribute.value.id }}</a></bdi></td>
- {% else %}
- <td><bdi>{{ attribute.value }}</bdi></td>
- {%- endif -%}
+ <td><bdi>{{ attribute.value }}</bdi></td>
{%- endif -%}
</tr>
{% endfor -%}
diff --git a/searx/templates/simple/infobox.html b/searx/templates/simple/infobox.html
index 50b568919..08daa5038 100644
--- a/searx/templates/simple/infobox.html
+++ b/searx/templates/simple/infobox.html
@@ -1,7 +1,6 @@
<aside class="infobox">
<h2><bdi>{{ infobox.infobox }}</bdi></h2>
{% if infobox.img_src %}<img src="{{ image_proxify(infobox.img_src) }}" title="{{ infobox.infobox|striptags }}" alt="{{ infobox.infobox|striptags }}" />{% endif %}
- <p><bdi>{{ infobox.entity }}</bdi></p>
<p><bdi>{{ infobox.content | safe }}</bdi></p>
{% if infobox.attributes %}
<div class="attributes">
diff --git a/searx/utils.py b/searx/utils.py
index 1c10585cf..9e43c67f0 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -481,6 +481,16 @@ def ecma_unescape(s):
return s
+def get_string_replaces_function(replaces):
+    """Return a function that applies every substitution of ``replaces`` in one pass.
+
+    Args:
+        replaces (dict): maps each text to search (taken literally, not as a
+            regular expression) to its replacement text.
+
+    Returns:
+        A function taking a text and returning it with the keys of ``replaces``
+        replaced by their values. With an empty ``replaces`` the text is
+        returned unchanged.
+    """
+    if not replaces:
+        # an empty alternation matches the empty string everywhere,
+        # and the lookup of '' in the table would raise KeyError
+        return lambda text: text
+
+    # copy the mapping so later changes of ``replaces`` do not affect the function
+    table = dict(replaces)
+    pattern = re.compile("|".join(re.escape(key) for key in table))
+
+    def f(text):
+        # the matched text is one of the keys, since every key is escaped in the pattern
+        return pattern.sub(lambda m: table[m.group(0)], text)
+
+    return f
+
+
def get_engine_from_settings(name):
"""Return engine configuration from settings.yml of a given engine name"""
diff --git a/utils/fetch_wikidata_units.py b/utils/fetch_wikidata_units.py
new file mode 100644
index 000000000..69505968e
--- /dev/null
+++ b/utils/fetch_wikidata_units.py
@@ -0,0 +1,47 @@
+#!/usr/bin/env python
+
+import json
+import collections
+
+# set path
+from sys import path
+from os.path import realpath, dirname, join
+path.append(realpath(dirname(realpath(__file__)) + '/../'))
+
+from searx import searx_dir
+from searx.engines.wikidata import send_wikidata_query
+
+
+SARQL_REQUEST = """
+SELECT DISTINCT ?item ?symbol ?P2370 ?P2370Unit ?P2442 ?P2442Unit
+WHERE
+{
+?item wdt:P31/wdt:P279 wd:Q47574.
+?item wdt:P5061 ?symbol.
+FILTER(LANG(?symbol) = "en").
+}
+ORDER BY ?item
+"""
+
+
+def get_data():
+    """Fetch the unit symbols and return them as an ordered ``entity id -> symbol`` mapping.
+
+    Returns None when send_wikidata_query returns None.
+    """
+    def get_key(unit):
+        return unit['item']['value'].replace('http://www.wikidata.org/entity/', '')
+
+    def get_value(unit):
+        return unit['symbol']['value']
+
+    result = send_wikidata_query(SARQL_REQUEST)
+    if result is None:
+        return None
+    # sort the units by entity name so that successive fetches keep the file unchanged;
+    # sorted() returns the sorted list (sorting a temporary copy in place had no effect)
+    units = sorted(result['results']['bindings'], key=get_key)
+    return collections.OrderedDict((get_key(unit), get_value(unit)) for unit in units)
+
+
+def get_wikidata_units_filename():
+    """Return the path of ``wikidata_units.json`` inside the ``data`` directory of searx."""
+    return join(searx_dir, "data", "wikidata_units.json")
+
+
+# ensure_ascii=False writes the unit symbols as they are, so the file encoding is
+# set explicitly instead of depending on the platform default
+with open(get_wikidata_units_filename(), 'w', encoding='utf-8') as f:
+    json.dump(get_data(), f, indent=4, ensure_ascii=False)