diff options
Diffstat (limited to 'searx/engines')
70 files changed, 178 insertions, 195 deletions
diff --git a/searx/engines/1337x.py b/searx/engines/1337x.py index c6bc3cb6d..0de04bd95 100644 --- a/searx/engines/1337x.py +++ b/searx/engines/1337x.py @@ -1,8 +1,7 @@ -from urllib import quote from lxml import html from searx.engines.xpath import extract_text from searx.utils import get_torrent_size -from urlparse import urljoin +from searx.url_utils import quote, urljoin url = 'https://1337x.to/' search_url = url + 'search/{search_term}/{pageno}/' diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index 77184a282..023ec409a 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -72,12 +72,11 @@ def load_engine(engine_data): if engine_data['categories'] == 'none': engine.categories = [] else: - engine.categories = map( - str.strip, engine_data['categories'].split(',')) + engine.categories = list(map(str.strip, engine_data['categories'].split(','))) continue setattr(engine, param_name, engine_data[param_name]) - for arg_name, arg_value in engine_default_args.iteritems(): + for arg_name, arg_value in engine_default_args.items(): if not hasattr(engine, arg_name): setattr(engine, arg_name, arg_value) diff --git a/searx/engines/archlinux.py b/searx/engines/archlinux.py index dca825790..cad06f8c6 100644 --- a/searx/engines/archlinux.py +++ b/searx/engines/archlinux.py @@ -11,10 +11,9 @@ @parse url, title """ -from urlparse import urljoin -from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text +from searx.url_utils import urlencode, urljoin # engine dependent config categories = ['it'] diff --git a/searx/engines/base.py b/searx/engines/base.py index a552453ce..ff006a3bc 100755 --- a/searx/engines/base.py +++ b/searx/engines/base.py @@ -14,10 +14,10 @@ """ from lxml import etree -from urllib import urlencode -from searx.utils import searx_useragent from datetime import datetime import re +from searx.url_utils import urlencode +from searx.utils import searx_useragent categories = ['science'] @@ -73,7 +73,7 @@ def request(query, params): def response(resp): results = [] - search_results = etree.XML(resp.content) + search_results = etree.XML(resp.text) for entry in search_results.xpath('./result/doc'): content = "No description available" diff --git a/searx/engines/bing.py b/searx/engines/bing.py index 4e7ead82d..052d567ea 100644 --- a/searx/engines/bing.py +++ b/searx/engines/bing.py @@ -13,9 +13,9 @@ @todo publishedDate """ -from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text +from searx.url_utils import urlencode # engine dependent config categories = ['general'] diff --git a/searx/engines/bing_images.py b/searx/engines/bing_images.py index 97f6dca37..e79740e50 100644 --- a/searx/engines/bing_images.py +++ b/searx/engines/bing_images.py @@ -15,11 +15,11 @@ limited response to 10 images """ -from urllib import urlencode from lxml import html from json import loads import re from searx.engines.bing import _fetch_supported_languages, supported_languages_url +from searx.url_utils import urlencode # engine dependent config categories = ['images'] diff --git a/searx/engines/bing_news.py b/searx/engines/bing_news.py index 765bcd38e..8e3cc517e 100644 --- a/searx/engines/bing_news.py +++ b/searx/engines/bing_news.py @@ -11,13 +11,12 @@ @parse url, title, content, publishedDate, thumbnail """ -from urllib import urlencode -from urlparse import urlparse, parse_qsl from datetime import datetime from dateutil import parser from lxml import etree from searx.utils import list_get from searx.engines.bing import _fetch_supported_languages, supported_languages_url +from searx.url_utils import urlencode, urlparse, parse_qsl # engine dependent config categories = ['news'] @@ -86,7 +85,7 @@ def request(query, params): def response(resp): results = [] - rss = etree.fromstring(resp.content) + rss = etree.fromstring(resp.text) ns = rss.nsmap diff --git a/searx/engines/blekko_images.py b/searx/engines/blekko_images.py index c0664f390..f71645634 100644 --- a/searx/engines/blekko_images.py +++ b/searx/engines/blekko_images.py @@ -11,7 +11,7 @@ """ from json import loads -from urllib import urlencode +from searx.url_utils import urlencode # engine dependent config categories = ['images'] diff --git a/searx/engines/btdigg.py b/searx/engines/btdigg.py index 33c8355de..40438673f 100644 --- a/searx/engines/btdigg.py +++ b/searx/engines/btdigg.py @@ -10,11 +10,10 @@ @parse url, title, content, seed, leech, magnetlink """ -from urlparse import urljoin -from urllib import quote from lxml import html from operator import itemgetter from searx.engines.xpath import extract_text +from searx.url_utils import quote, urljoin from searx.utils import get_torrent_size # engine dependent config @@ -38,7 +37,7 @@ def request(query, params): def response(resp): results = [] - dom = html.fromstring(resp.content) + dom = html.fromstring(resp.text) search_res = dom.xpath('//div[@id="search_res"]/table/tr') diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py index bc839cfb5..1218d4849 100644 --- a/searx/engines/currency_convert.py +++ b/searx/engines/currency_convert.py @@ -1,21 +1,25 @@ -from datetime import datetime +import json import re import os -import json +import sys import unicodedata +from datetime import datetime + +if sys.version_info[0] == 3: + unicode = str categories = [] url = 'https://download.finance.yahoo.com/d/quotes.csv?e=.csv&f=sl1d1t1&s={query}=X' weight = 100 -parser_re = re.compile(u'.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I) # noqa +parser_re = re.compile(b'.*?(\\d+(?:\\.\\d+)?) ([^.0-9]+) (?:in|to) ([^.0-9]+)', re.I) db = 1 def normalize_name(name): - name = name.lower().replace('-', ' ').rstrip('s') + name = name.decode('utf-8').lower().replace('-', ' ').rstrip('s') name = re.sub(' +', ' ', name) return unicodedata.normalize('NFKD', name).lower() @@ -35,7 +39,7 @@ def iso4217_to_name(iso4217, language): def request(query, params): - m = parser_re.match(unicode(query, 'utf8')) + m = parser_re.match(query) if not m: # wrong query return params diff --git a/searx/engines/dailymotion.py b/searx/engines/dailymotion.py index 8c69aafe0..fad7e596c 100644 --- a/searx/engines/dailymotion.py +++ b/searx/engines/dailymotion.py @@ -12,10 +12,9 @@ @todo set content-parameter with correct data """ -from urllib import urlencode from json import loads from datetime import datetime -from requests import get +from searx.url_utils import urlencode # engine dependent config categories = ['videos'] diff --git a/searx/engines/deezer.py b/searx/engines/deezer.py index 3db1af3d2..af63478fb 100644 --- a/searx/engines/deezer.py +++ b/searx/engines/deezer.py @@ -11,7 +11,7 @@ """ from json import loads -from urllib import urlencode +from searx.url_utils import urlencode # engine dependent config categories = ['music'] @@ -30,8 +30,7 @@ embedded_url = '<iframe scrolling="no" frameborder="0" allowTransparency="true" def request(query, params): offset = (params['pageno'] - 1) * 25 - params['url'] = search_url.format(query=urlencode({'q': query}), - offset=offset) + params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset) return params diff --git a/searx/engines/deviantart.py b/searx/engines/deviantart.py index a24b75b8a..bb85c6dc5 100644 --- a/searx/engines/deviantart.py +++ b/searx/engines/deviantart.py @@ -12,10 +12,10 @@ @todo rewrite to api """ -from urllib import urlencode from lxml import html import re from searx.engines.xpath import extract_text +from searx.url_utils import urlencode # engine dependent config categories = ['images'] diff --git a/searx/engines/dictzone.py b/searx/engines/dictzone.py index 20a9a8980..7c3478629 100644 --- a/searx/engines/dictzone.py +++ b/searx/engines/dictzone.py @@ -10,20 +10,20 @@ """ import re -from urlparse import urljoin from lxml import html from searx.utils import is_valid_lang +from searx.url_utils import urljoin categories = ['general'] url = u'http://dictzone.com/{from_lang}-{to_lang}-dictionary/{query}' weight = 100 -parser_re = re.compile(u'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I) +parser_re = re.compile(b'.*?([a-z]+)-([a-z]+) ([^ ]+)$', re.I) results_xpath = './/table[@id="r"]/tr' def request(query, params): - m = parser_re.match(unicode(query, 'utf8')) + m = parser_re.match(query) if not m: return params diff --git a/searx/engines/digbt.py b/searx/engines/digbt.py index b55d7747a..ff2f94593 100644 --- a/searx/engines/digbt.py +++ b/searx/engines/digbt.py @@ -10,10 +10,14 @@ @parse url, title, content, magnetlink """ -from urlparse import urljoin +from sys import version_info from lxml import html from searx.engines.xpath import extract_text from searx.utils import get_torrent_size +from searx.url_utils import urljoin + +if version_info[0] == 3: + unicode = str categories = ['videos', 'music', 'files'] paging = True @@ -31,7 +35,7 @@ def request(query, params): def response(resp): - dom = html.fromstring(resp.content) + dom = html.fromstring(resp.text) search_res = dom.xpath('.//td[@class="x-item"]') if not search_res: diff --git a/searx/engines/digg.py b/searx/engines/digg.py index 238b466a0..606747a4d 100644 --- a/searx/engines/digg.py +++ b/searx/engines/digg.py @@ -10,10 +10,10 @@ @parse url, title, content, publishedDate, thumbnail """ -from urllib import quote_plus +from dateutil import parser from json import loads from lxml import html -from dateutil import parser +from searx.url_utils import quote_plus # engine dependent config categories = ['news', 'social media'] diff --git a/searx/engines/doku.py b/searx/engines/doku.py index 93867fd0d..a391be444 100644 --- a/searx/engines/doku.py +++ b/searx/engines/doku.py @@ -9,9 +9,9 @@ # @stable yes # @parse (general) url, title, content -from urllib import urlencode from lxml.html import fromstring from searx.engines.xpath import extract_text +from searx.url_utils import urlencode # engine dependent config categories = ['general'] # TODO , 'images', 'music', 'videos', 'files' diff --git a/searx/engines/duckduckgo.py b/searx/engines/duckduckgo.py index 1ae484123..1872ab7d4 100644 --- a/searx/engines/duckduckgo.py +++ b/searx/engines/duckduckgo.py @@ -13,11 +13,11 @@ @todo rewrite to api """ -from urllib import urlencode from lxml.html import fromstring from requests import get from json import loads from searx.engines.xpath import extract_text +from searx.url_utils import urlencode # engine dependent config categories = ['general'] diff --git a/searx/engines/duckduckgo_definitions.py b/searx/engines/duckduckgo_definitions.py index dd3f12e1e..21c6a6578 100644 --- a/searx/engines/duckduckgo_definitions.py +++ b/searx/engines/duckduckgo_definitions.py @@ -1,10 +1,10 @@ import json -from urllib import urlencode -from re import compile, sub from lxml import html -from searx.utils import html_to_text +from re import compile from searx.engines.xpath import extract_text from searx.engines.duckduckgo import _fetch_supported_languages, supported_languages_url +from searx.url_utils import urlencode +from searx.utils import html_to_text url = 'https://api.duckduckgo.com/'\ + '?{query}&format=json&pretty=0&no_redirect=1&d=1' diff --git a/searx/engines/faroo.py b/searx/engines/faroo.py index 9fa244e77..e24d1b7dc 100644 --- a/searx/engines/faroo.py +++ b/searx/engines/faroo.py @@ -10,10 +10,10 @@ @parse url, title, content, publishedDate, img_src """ -from urllib import urlencode from json import loads import datetime from searx.utils import searx_useragent +from searx.url_utils import urlencode # engine dependent config categories = ['general', 'news'] diff --git a/searx/engines/fdroid.py b/searx/engines/fdroid.py index 6d470a4eb..a6b01a8ee 100644 --- a/searx/engines/fdroid.py +++ b/searx/engines/fdroid.py @@ -9,9 +9,9 @@ @parse url, title, content """ -from urllib import urlencode -from searx.engines.xpath import extract_text from lxml import html +from searx.engines.xpath import extract_text +from searx.url_utils import urlencode # engine dependent config categories = ['files'] @@ -24,8 +24,7 @@ search_url = base_url + 'repository/browse/?{query}' # do search-request def request(query, params): - query = urlencode({'fdfilter': query, - 'fdpage': params['pageno']}) + query = urlencode({'fdfilter': query, 'fdpage': params['pageno']}) params['url'] = search_url.format(query=query) return params diff --git a/searx/engines/filecrop.py b/searx/engines/filecrop.py index 71665bd4e..ed57a6bf3 100644 --- a/searx/engines/filecrop.py +++ b/searx/engines/filecrop.py @@ -1,5 +1,9 @@ -from urllib import urlencode -from HTMLParser import HTMLParser +from searx.url_utils import urlencode + +try: + from HTMLParser import HTMLParser +except: + from html.parser import HTMLParser url = 'http://www.filecrop.com/' search_url = url + '/search.php?{query}&size_i=0&size_f=100000000&engine_r=1&engine_d=1&engine_e=1&engine_4=1&engine_m=1&pos={index}' # noqa @@ -73,8 +77,7 @@ class FilecropResultParser(HTMLParser): def request(query, params): index = 1 + (params['pageno'] - 1) * 30 - params['url'] = search_url.format(query=urlencode({'w': query}), - index=index) + params['url'] = search_url.format(query=urlencode({'w': query}), index=index) return params diff --git a/searx/engines/flickr.py b/searx/engines/flickr.py index 5ce1160e9..de1769370 100644 --- a/searx/engines/flickr.py +++ b/searx/engines/flickr.py @@ -13,8 +13,8 @@ More info on api-key : https://www.flickr.com/services/apps/create/ """ -from urllib import urlencode from json import loads +from searx.url_utils import urlencode categories = ['images'] diff --git a/searx/engines/flickr_noapi.py b/searx/engines/flickr_noapi.py index 3c0ec7b70..08f07f7ce 100644 --- a/searx/engines/flickr_noapi.py +++ b/searx/engines/flickr_noapi.py @@ -12,11 +12,11 @@ @parse url, title, thumbnail, img_src """ -from urllib import urlencode from json import loads from time import time import re from searx.engines import logger +from searx.url_utils import urlencode logger = logger.getChild('flickr-noapi') diff --git a/searx/engines/framalibre.py b/searx/engines/framalibre.py index e8d1d8aa7..f2eecdc73 100644 --- a/searx/engines/framalibre.py +++ b/searx/engines/framalibre.py @@ -10,12 +10,10 @@ @parse url, title, content, thumbnail, img_src """ -from urlparse import urljoin from cgi import escape -from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text -from dateutil import parser +from searx.url_utils import urljoin, urlencode # engine dependent config categories = ['it'] diff --git a/searx/engines/frinkiac.py b/searx/engines/frinkiac.py index a9383f862..a67b42dbe 100644 --- a/searx/engines/frinkiac.py +++ b/searx/engines/frinkiac.py @@ -10,7 +10,7 @@ Frinkiac (Images) """ from json import loads -from urllib import urlencode +from searx.url_utils import urlencode categories = ['images'] diff --git a/searx/engines/gigablast.py b/searx/engines/gigablast.py index 0c1d7f613..37933c69b 100644 --- a/searx/engines/gigablast.py +++ b/searx/engines/gigablast.py @@ -11,10 +11,9 @@ """ from json import loads -from random import randint from time import time -from urllib import urlencode from lxml.html import fromstring +from searx.url_utils import urlencode # engine dependent config categories = ['general'] diff --git a/searx/engines/github.py b/searx/engines/github.py index 7adef3be9..eaa00da4f 100644 --- a/searx/engines/github.py +++ b/searx/engines/github.py @@ -10,8 +10,8 @@ @parse url, title, content """ -from urllib import urlencode from json import loads +from searx.url_utils import urlencode # engine dependent config categories = ['it'] diff --git a/searx/engines/google.py b/searx/engines/google.py index e14e9e702..934f5c29a 100644 --- a/searx/engines/google.py +++ b/searx/engines/google.py @@ -9,11 +9,10 @@ # @parse url, title, content, suggestion import re -from urllib import urlencode -from urlparse import urlparse, parse_qsl from lxml import html, etree from searx.engines.xpath import extract_text, extract_url -from searx.search import logger +from searx import logger +from searx.url_utils import urlencode, urlparse, parse_qsl logger = logger.getChild('google engine') diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index 9a3c71c7e..9692f4b82 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -11,9 +11,9 @@ """ from datetime import date, timedelta -from urllib import urlencode from json import loads from lxml import html +from searx.url_utils import urlencode # engine dependent config diff --git a/searx/engines/google_news.py b/searx/engines/google_news.py index 6b79ff5c8..7344b5289 100644 --- a/searx/engines/google_news.py +++ b/searx/engines/google_news.py @@ -11,9 +11,8 @@ """ from lxml import html -from urllib import urlencode -from json import loads from searx.engines.google import _fetch_supported_languages, supported_languages_url +from searx.url_utils import urlencode # search-url categories = ['news'] diff --git a/searx/engines/ina.py b/searx/engines/ina.py index 86a39782b..37a05f099 100644 --- a/searx/engines/ina.py +++ b/searx/engines/ina.py @@ -12,11 +12,15 @@ # @todo embedded (needs some md5 from video page) from json import loads -from urllib import urlencode from lxml import html -from HTMLParser import HTMLParser -from searx.engines.xpath import extract_text from dateutil import parser +from searx.engines.xpath import extract_text +from searx.url_utils import urlencode + +try: + from HTMLParser import HTMLParser +except: + from html.parser import HTMLParser # engine dependent config categories = ['videos'] diff --git a/searx/engines/json_engine.py b/searx/engines/json_engine.py index 4604c3cac..67d6a5a65 100644 --- a/searx/engines/json_engine.py +++ b/searx/engines/json_engine.py @@ -1,11 +1,16 @@ -from urllib import urlencode -from json import loads from collections import Iterable +from json import loads +from sys import version_info +from searx.url_utils import urlencode + +if version_info[0] == 3: + unicode = str search_url = None url_query = None content_query = None title_query = None +paging = False suggestion_query = '' results_query = '' @@ -20,7 +25,7 @@ first_page_num = 1 def iterate(iterable): if type(iterable) == dict: - it = iterable.iteritems() + it = iterable.items() else: it = enumerate(iterable) diff --git a/searx/engines/kickass.py b/searx/engines/kickass.py index 059fa2a66..5e897c96f 100644 --- a/searx/engines/kickass.py +++ b/searx/engines/kickass.py @@ -10,12 +10,11 @@ @parse url, title, content, seed, leech, magnetlink """ -from urlparse import urljoin -from urllib import quote from lxml import html from operator import itemgetter from searx.engines.xpath import extract_text from searx.utils import get_torrent_size, convert_str_to_int +from searx.url_utils import quote, urljoin # engine dependent config categories = ['videos', 'music', 'files'] diff --git a/searx/engines/mediawiki.py b/searx/engines/mediawiki.py index 93d98d3aa..5a70204b1 100644 --- a/searx/engines/mediawiki.py +++ b/searx/engines/mediawiki.py @@ -14,7 +14,7 @@ from json import loads from string import Formatter -from urllib import urlencode, quote +from searx.url_utils import urlencode, quote # engine dependent config categories = ['general'] diff --git a/searx/engines/mixcloud.py b/searx/engines/mixcloud.py index 312d297eb..470c007ea 100644 --- a/searx/engines/mixcloud.py +++ b/searx/engines/mixcloud.py @@ -11,8 +11,8 @@ """ from json import loads -from urllib import urlencode from dateutil import parser +from searx.url_utils import urlencode # engine dependent config categories = ['music'] diff --git a/searx/engines/nyaa.py b/searx/engines/nyaa.py index 4ca5b3171..272c712c4 100644 --- a/searx/engines/nyaa.py +++ b/searx/engines/nyaa.py @@ -9,9 +9,9 @@ @parse url, title, content, seed, leech, torrentfile """ -from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text +from searx.url_utils import urlencode # engine dependent config categories = ['files', 'images', 'videos', 'music'] diff --git a/searx/engines/openstreetmap.py b/searx/engines/openstreetmap.py index 01ca7d42d..733ba6203 100644 --- a/searx/engines/openstreetmap.py +++ b/searx/engines/openstreetmap.py @@ -11,7 +11,6 @@ """ from json import loads -from searx.utils import searx_useragent # engine dependent config categories = ['map'] @@ -27,9 +26,6 @@ result_base_url = 'https://openstreetmap.org/{osm_type}/{osm_id}' def request(query, params): params['url'] = base_url + search_string.format(query=query) - # using searx User-Agent - params['headers']['User-Agent'] = searx_useragent() - return params diff --git a/searx/engines/photon.py b/searx/engines/photon.py index a029bbfef..15236f680 100644 --- a/searx/engines/photon.py +++ b/searx/engines/photon.py @@ -10,9 +10,9 @@ @parse url, title """ -from urllib import urlencode from json import loads from searx.utils import searx_useragent +from searx.url_utils import urlencode # engine dependent config categories = ['map'] diff --git a/searx/engines/piratebay.py b/searx/engines/piratebay.py index ca21a3bb2..a5af8d824 100644 --- a/searx/engines/piratebay.py +++ b/searx/engines/piratebay.py @@ -8,11 +8,10 @@ # @stable yes (HTML can change) # @parse url, title, content, seed, leech, magnetlink -from urlparse import urljoin -from urllib import quote from lxml import html from operator import itemgetter from searx.engines.xpath import extract_text +from searx.url_utils import quote, urljoin # engine dependent config categories = ['videos', 'music', 'files'] diff --git a/searx/engines/qwant.py b/searx/engines/qwant.py index 1fc4630fa..cb097eb38 100644 --- a/searx/engines/qwant.py +++ b/searx/engines/qwant.py @@ -12,9 +12,8 @@ from datetime import datetime from json import loads -from urllib import urlencode - from searx.utils import html_to_text +from searx.url_utils import urlencode # engine dependent config categories = None diff --git a/searx/engines/reddit.py b/searx/engines/reddit.py index b29792a3a..d19724906 100644 --- a/searx/engines/reddit.py +++ b/searx/engines/reddit.py @@ -11,9 +11,8 @@ """ import json -from urllib import urlencode -from urlparse import urlparse, urljoin from datetime import datetime +from searx.url_utils import urlencode, urljoin, urlparse # engine dependent config categories = ['general', 'images', 'news', 'social media'] @@ -26,8 +25,7 @@ search_url = base_url + 'search.json?{query}' # do search-request def request(query, params): - query = urlencode({'q': query, - 'limit': page_size}) + query = urlencode({'q': query, 'limit': page_size}) params['url'] = search_url.format(query=query) return params diff --git a/searx/engines/scanr_structures.py b/searx/engines/scanr_structures.py index ad78155ac..72fd2b3c9 100644 --- a/searx/engines/scanr_structures.py +++ b/searx/engines/scanr_structures.py @@ -10,9 +10,7 @@ @parse url, title, content, img_src """ -from urllib import urlencode from json import loads, dumps -from dateutil import parser from searx.utils import html_to_text # engine dependent config @@ -48,7 +46,7 @@ def response(resp): search_res = loads(resp.text) # return empty array if there are no results - if search_res.get('total') < 1: + if search_res.get('total', 0) < 1: return [] # parse results diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py index be7a6d385..789e8e7a9 100644 --- a/searx/engines/searchcode_code.py +++ b/searx/engines/searchcode_code.py @@ -10,8 +10,8 @@ @parse url, title, content """ -from urllib import urlencode from json import loads +from searx.url_utils import urlencode # engine dependent config @@ -31,8 +31,7 @@ code_endings = {'cs': 'c#', # do search-request def request(query, params): - params['url'] = search_url.format(query=urlencode({'q': query}), - pageno=params['pageno'] - 1) + params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno'] - 1) return params diff --git a/searx/engines/searchcode_doc.py b/searx/engines/searchcode_doc.py index 99e10be62..4b8e9a84a 100644 --- a/searx/engines/searchcode_doc.py +++ b/searx/engines/searchcode_doc.py @@ -10,8 +10,8 @@ @parse url, title, content """ -from urllib import urlencode from json import loads +from searx.url_utils import urlencode # engine dependent config categories = ['it'] @@ -24,8 +24,7 @@ search_url = url + 'api/search_IV/?{query}&p={pageno}' # do search-request def request(query, params): - params['url'] = search_url.format(query=urlencode({'q': query}), - pageno=params['pageno'] - 1) + params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno'] - 1) return params diff --git a/searx/engines/seedpeer.py b/searx/engines/seedpeer.py index e1309a9b5..3770dacac 100644 --- a/searx/engines/seedpeer.py +++ b/searx/engines/seedpeer.py @@ -8,11 +8,9 @@ # @stable yes (HTML can change) # @parse url, title, content, seed, leech, magnetlink -from urlparse import urljoin -from urllib import quote from lxml import html from operator import itemgetter -from searx.engines.xpath import extract_text +from searx.url_utils import quote, urljoin url = 'http://www.seedpeer.eu/' diff --git a/searx/engines/soundcloud.py b/searx/engines/soundcloud.py index 62b03ac03..41b40da61 100644 --- a/searx/engines/soundcloud.py +++ b/searx/engines/soundcloud.py @@ -11,13 +11,17 @@ """ import re -from StringIO import StringIO from json import loads -from lxml import etree -from urllib import urlencode, quote_plus +from lxml import html from dateutil import parser from searx import logger from searx.poolrequests import get as http_get +from searx.url_utils import quote_plus, urlencode + +try: + from cStringIO import StringIO +except: + from io import StringIO # engine dependent config categories = ['music'] @@ -36,14 +40,15 @@ embedded_url = '<iframe width="100%" height="166" ' +\ 'scrolling="no" frameborder="no" ' +\ 'data-src="https://w.soundcloud.com/player/?url={uri}"></iframe>' +cid_re = re.compile(r'client_id:"([^"]*)"', re.I | re.U) + def get_client_id(): response = http_get("https://soundcloud.com") - rx_namespace = {"re": "http://exslt.org/regular-expressions"} if response.ok: - tree = etree.parse(StringIO(response.content), etree.HTMLParser()) - script_tags = tree.xpath("//script[re:match(@src, '(.*app.*js)')]", namespaces=rx_namespace) + tree = html.fromstring(response.content) + script_tags = tree.xpath("//script[contains(@src, '/assets/app')]") app_js_urls = [script_tag.get('src') for script_tag in script_tags if script_tag is not None] # extracts valid app_js urls from soundcloud.com content @@ -51,7 +56,7 @@ def get_client_id(): # gets app_js and searches for the clientid response = http_get(app_js_url) if response.ok: - cids = re.search(r'client_id:"([^"]*)"', response.content, re.M | re.I) + cids = cid_re.search(response.text) if cids is not None and len(cids.groups()): return cids.groups()[0] logger.warning("Unable to fetch guest client_id from SoundCloud, check parser!") diff --git a/searx/engines/spotify.py b/searx/engines/spotify.py index 249ba91ef..aed756be3 100644 --- a/searx/engines/spotify.py +++ b/searx/engines/spotify.py @@ -11,7 +11,7 @@ """ from json import loads -from urllib import urlencode +from searx.url_utils import urlencode # engine dependent config categories = ['music'] @@ -29,8 +29,7 @@ embedded_url = '<iframe data-src="https://embed.spotify.com/?uri=spotify:track:{ def request(query, params): offset = (params['pageno'] - 1) * 20 - params['url'] = search_url.format(query=urlencode({'q': query}), - offset=offset) + params['url'] = search_url.format(query=urlencode({'q': query}), offset=offset) return params diff --git a/searx/engines/stackoverflow.py b/searx/engines/stackoverflow.py index 5e7ab2901..25875aa15 100644 --- a/searx/engines/stackoverflow.py +++ b/searx/engines/stackoverflow.py @@ -10,10 +10,9 @@ @parse url, title, content """ -from urlparse import urljoin -from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text +from searx.url_utils import urlencode, urljoin # engine dependent config categories = ['it'] @@ -31,8 +30,7 @@ content_xpath = './/div[@class="excerpt"]' # do search-request def request(query, params): - params['url'] = search_url.format(query=urlencode({'q': query}), - pageno=params['pageno']) + params['url'] = search_url.format(query=urlencode({'q': query}), pageno=params['pageno']) return params diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py index 54aafdee5..314b7b9a8 100644 --- a/searx/engines/startpage.py +++ b/searx/engines/startpage.py @@ -56,7 +56,7 @@ def request(query, params): def response(resp): results = [] - dom = html.fromstring(resp.content) + dom = html.fromstring(resp.text) # parse results for result in dom.xpath(results_xpath): diff --git a/searx/engines/subtitleseeker.py b/searx/engines/subtitleseeker.py index 77b010c3f..2cbc991b3 100644 --- a/searx/engines/subtitleseeker.py +++ b/searx/engines/subtitleseeker.py @@ -10,10 +10,10 @@ @parse url, title, content """ -from urllib import quote_plus from lxml import html from searx.languages import language_codes from searx.engines.xpath import extract_text +from searx.url_utils import quote_plus # engine dependent config categories = ['videos'] diff --git a/searx/engines/swisscows.py b/searx/engines/swisscows.py index dd398857f..e9c13ca24 100644 --- a/searx/engines/swisscows.py +++ b/searx/engines/swisscows.py @@ -11,9 +11,9 @@ """ from json import loads -from urllib import urlencode, unquote import re from lxml.html import fromstring +from searx.url_utils import unquote, urlencode # engine dependent config categories = ['general', 'images'] @@ -27,10 +27,10 @@ search_string = '?{query}&page={page}' supported_languages_url = base_url # regex -regex_json = re.compile(r'initialData: {"Request":(.|\n)*},\s*environment') -regex_json_remove_start = re.compile(r'^initialData:\s*') -regex_json_remove_end = re.compile(r',\s*environment$') -regex_img_url_remove_start = re.compile(r'^https?://i\.swisscows\.ch/\?link=') +regex_json = re.compile(b'initialData: {"Request":(.|\n)*},\s*environment') +regex_json_remove_start = re.compile(b'^initialData:\s*') +regex_json_remove_end = re.compile(b',\s*environment$') +regex_img_url_remove_start = re.compile(b'^https?://i\.swisscows\.ch/\?link=') # do search-request @@ -45,10 +45,9 @@ def request(query, params): ui_language = params['language'].split('-')[0] search_path = search_string.format( - query=urlencode({'query': query, - 'uiLanguage': ui_language, - 'region': region}), - page=params['pageno']) + query=urlencode({'query': query, 'uiLanguage': ui_language, 'region': region}), + page=params['pageno'] + ) # image search query is something like 'image?{query}&page={page}' if params['category'] == 'images': @@ -63,14 +62,14 @@ def request(query, params): def response(resp): results = [] - json_regex = regex_json.search(resp.content) + json_regex = regex_json.search(resp.text) # check if results are returned if not json_regex: return [] - json_raw = regex_json_remove_end.sub('', regex_json_remove_start.sub('', json_regex.group())) - json = loads(json_raw) + json_raw = regex_json_remove_end.sub(b'', regex_json_remove_start.sub(b'', json_regex.group())) + json = loads(json_raw.decode('utf-8')) # parse results for result in json['Results'].get('items', []): @@ -78,7 +77,7 @@ def response(resp): # parse image results if result.get('ContentType', '').startswith('image'): - img_url = unquote(regex_img_url_remove_start.sub('', result['Url'])) + img_url = unquote(regex_img_url_remove_start.sub(b'', result['Url'].encode('utf-8')).decode('utf-8')) # append result results.append({'url': result['SourceUrl'], @@ -100,7 +99,7 @@ def response(resp): # parse images for result in json.get('Images', []): # decode image url - img_url = unquote(regex_img_url_remove_start.sub('', result['Url'])) + img_url = unquote(regex_img_url_remove_start.sub(b'', result['Url'].encode('utf-8')).decode('utf-8')) # append result results.append({'url': result['SourceUrl'], diff --git a/searx/engines/tokyotoshokan.py b/searx/engines/tokyotoshokan.py index 52b2cbe07..9a6b5e57d 100644 --- a/searx/engines/tokyotoshokan.py +++ b/searx/engines/tokyotoshokan.py @@ -11,11 +11,11 @@ """ import re -from urllib import urlencode from lxml import html from searx.engines.xpath import extract_text from datetime import datetime from searx.engines.nyaa import int_or_zero, get_filesize_mul +from searx.url_utils import urlencode # engine dependent config categories = ['files', 'videos', 'music'] @@ -28,8 +28,7 @@ search_url = base_url + 'search.php?{query}' # do search-request def request(query, params): - query = urlencode({'page': params['pageno'], - 'terms': query}) + query = urlencode({'page': params['pageno'], 'terms': query}) params['url'] = search_url.format(query=query) return params @@ -50,7 +49,7 @@ def response(resp): size_re = re.compile(r'Size:\s*([\d.]+)(TB|GB|MB|B)', re.IGNORECASE) # processing the results, two rows at a time - for i in xrange(0, len(rows), 2): + for i in range(0, len(rows), 2): # parse the first row name_row = rows[i] @@ -79,14 +78,14 @@ def response(resp): groups = size_re.match(item).groups() multiplier = get_filesize_mul(groups[1]) params['filesize'] = int(multiplier * float(groups[0])) - except Exception as e: + except: pass elif item.startswith('Date:'): try: # Date: 2016-02-21 21:44 UTC date = datetime.strptime(item, 'Date: %Y-%m-%d %H:%M UTC') params['publishedDate'] = date - except Exception as e: + except: pass elif item.startswith('Comment:'): params['content'] = item diff --git a/searx/engines/torrentz.py b/searx/engines/torrentz.py index f9c832651..dda56fc22 100644 --- a/searx/engines/torrentz.py +++ b/searx/engines/torrentz.py @@ -12,11 +12,11 @@ """ import re -from urllib import urlencode from lxml import html -from searx.engines.xpath import extract_text from datetime import datetime from searx.engines.nyaa import int_or_zero, get_filesize_mul +from searx.engines.xpath import extract_text +from searx.url_utils import urlencode # engine dependent config categories = ['files', 'videos', 'music'] @@ -70,7 +70,7 @@ def response(resp): size_str = result.xpath('./dd/span[@class="s"]/text()')[0] size, suffix = size_str.split() params['filesize'] = int(size) * get_filesize_mul(suffix) - except Exception as e: + except: pass # does our link contain a valid SHA1 sum? @@ -84,7 +84,7 @@ def response(resp): # Fri, 25 Mar 2016 16:29:01 date = datetime.strptime(date_str, '%a, %d %b %Y %H:%M:%S') params['publishedDate'] = date - except Exception as e: + except: pass results.append(params) diff --git a/searx/engines/translated.py b/searx/engines/translated.py index e78db0d8e..5c7b17033 100644 --- a/searx/engines/translated.py +++ b/searx/engines/translated.py @@ -9,8 +9,12 @@ @parse url, title, content """ import re +from sys import version_info from searx.utils import is_valid_lang +if version_info[0] == 3: + unicode = str + categories = ['general'] url = u'http://api.mymemory.translated.net/get?q={query}&langpair={from_lang}|{to_lang}{key}' web_url = u'http://mymemory.translated.net/en/{from_lang}/{to_lang}/{query}' diff --git a/searx/engines/twitter.py b/searx/engines/twitter.py index 6cca05f70..038cef47f 100644 --- a/searx/engines/twitter.py +++ b/searx/engines/twitter.py @@ -12,11 +12,10 @@ @todo publishedDate """ -from urlparse import urljoin -from urllib import urlencode from lxml import html from datetime import datetime from searx.engines.xpath import extract_text +from searx.url_utils import urlencode, urljoin # engine dependent config categories = ['social media'] diff --git a/searx/engines/vimeo.py b/searx/engines/vimeo.py index 5d5310544..1408be8df 100644 --- a/searx/engines/vimeo.py +++ b/searx/engines/vimeo.py @@ -13,8 +13,8 @@ # @todo set content-parameter with correct data from json import loads -from urllib import urlencode from dateutil import parser +from searx.url_utils import urlencode # engine dependent config categories = ['videos'] diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 3f849bc7d..be217463c 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -14,12 +14,11 @@ from searx import logger from searx.poolrequests import get from searx.engines.xpath import extract_text -from searx.utils import format_date_by_locale from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url +from searx.url_utils import urlencode from json import loads from lxml.html import fromstring -from urllib import urlencode logger = logger.getChild('wikidata') result_count = 1 @@ -62,14 +61,13 @@ def request(query, params): language = 'en' params['url'] = url_search.format( - query=urlencode({'label': query, - 'language': language})) + query=urlencode({'label': query, 'language': language})) return params def response(resp): results = [] - html = fromstring(resp.content) + html = fromstring(resp.text) wikidata_ids = html.xpath(wikidata_ids_xpath) language = resp.search_params['language'].split('-')[0] @@ -78,10 +76,9 @@ def response(resp): # TODO: make requests asynchronous to avoid timeout when result_count > 1 for wikidata_id in wikidata_ids[:result_count]: - url = url_detail.format(query=urlencode({'page': wikidata_id, - 'uselang': language})) + url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language})) htmlresponse = get(url) - jsonresponse = loads(htmlresponse.content) + jsonresponse = loads(htmlresponse.text) results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language']) return results diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py index 3af8f1c71..db2fdc000 100644 --- a/searx/engines/wikipedia.py +++ b/searx/engines/wikipedia.py @@ -11,13 +11,12 @@ """ from json import loads -from urllib import urlencode, quote from lxml.html import fromstring - +from searx.url_utils import quote, urlencode # search-url -base_url = 'https://{language}.wikipedia.org/' -search_postfix = 'w/api.php?'\ +base_url = u'https://{language}.wikipedia.org/' +search_url = base_url + u'w/api.php?'\ 'action=query'\ '&format=json'\ '&{query}'\ @@ -37,16 +36,16 @@ def url_lang(lang): else: language = lang - return base_url.format(language=language) + return language # do search-request def request(query, params): if query.islower(): - query += '|' + query.title() + query = u'{0}|{1}'.format(query.decode('utf-8'), query.decode('utf-8').title()).encode('utf-8') - params['url'] = url_lang(params['language']) \ - + search_postfix.format(query=urlencode({'titles': query})) + params['url'] = search_url.format(query=urlencode({'titles': query}), + language=url_lang(params['language'])) return params @@ -78,7 +77,7 @@ def extract_first_paragraph(content, title, image): def response(resp): results = [] - search_result = loads(resp.content) + search_result = loads(resp.text) # wikipedia article's unique id # first valid id is assumed to be the requested article @@ -99,11 +98,9 @@ def response(resp): extract = page.get('extract') summary = extract_first_paragraph(extract, title, image) - if not summary: - return [] # link to wikipedia article - wikipedia_link = url_lang(resp.search_params['language']) \ + wikipedia_link = base_url.format(language=url_lang(resp.search_params['language'])) \ + 'wiki/' + quote(title.replace(' ', '_').encode('utf8')) results.append({'url': wikipedia_link, 'title': title}) diff --git a/searx/engines/wolframalpha_api.py b/searx/engines/wolframalpha_api.py index e743c8f56..595c6b7de 100644 --- a/searx/engines/wolframalpha_api.py +++ b/searx/engines/wolframalpha_api.py @@ -8,8 +8,8 @@ # @stable yes # @parse url, infobox -from urllib import urlencode from lxml import etree +from searx.url_utils import urlencode # search-url search_url = 'https://api.wolframalpha.com/v2/query?appid={api_key}&{query}' @@ -37,8 +37,7 @@ image_pods = {'VisualRepresentation', # do search-request def request(query, params): - params['url'] = search_url.format(query=urlencode({'input': query}), - api_key=api_key) + params['url'] = search_url.format(query=urlencode({'input': query}), api_key=api_key) params['headers']['Referer'] = site_url.format(query=urlencode({'i': query})) return params @@ -56,7 +55,7 @@ def replace_pua_chars(text): u'\uf74e': 'i', # imaginary number u'\uf7d9': '='} # equals sign - for k, v in pua_chars.iteritems(): + for k, v in pua_chars.items(): text = text.replace(k, v) return text @@ -66,7 +65,7 @@ def replace_pua_chars(text): def response(resp): results = [] - search_results = etree.XML(resp.content) + search_results = etree.XML(resp.text) # return empty array if there are no results if search_results.xpath(failure_xpath): @@ -120,10 +119,10 @@ def response(resp): # append infobox results.append({'infobox': infobox_title, 'attributes': result_chunks, - 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer'].decode('utf8')}]}) + 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}]}) # append link to site - results.append({'url': resp.request.headers['Referer'].decode('utf8'), + results.append({'url': resp.request.headers['Referer'], 'title': title, 'content': result_content}) diff --git a/searx/engines/wolframalpha_noapi.py b/searx/engines/wolframalpha_noapi.py index 1534501b3..2a8642f92 100644 --- a/searx/engines/wolframalpha_noapi.py +++ b/searx/engines/wolframalpha_noapi.py @@ -10,10 +10,9 @@ from json import loads from time import time -from urllib import urlencode -from lxml.etree import XML from searx.poolrequests import get as http_get +from searx.url_utils import urlencode # search-url url = 'https://www.wolframalpha.com/' @@ -62,7 +61,7 @@ obtain_token() # do search-request def request(query, params): # obtain token if last update was more than an hour - if time() - token['last_updated'] > 3600: + if time() - (token['last_updated'] or 0) > 3600: obtain_token() params['url'] = search_url.format(query=urlencode({'input': query}), token=token['value']) params['headers']['Referer'] = referer_url.format(query=urlencode({'i': query})) @@ -112,9 +111,9 @@ def response(resp): results.append({'infobox': infobox_title, 'attributes': result_chunks, - 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer'].decode('utf8')}]}) + 'urls': [{'title': 'Wolfram|Alpha', 'url': resp.request.headers['Referer']}]}) - results.append({'url': resp.request.headers['Referer'].decode('utf8'), + results.append({'url': resp.request.headers['Referer'], 'title': 'Wolfram|Alpha (' + infobox_title + ')', 'content': result_content}) diff --git a/searx/engines/www1x.py b/searx/engines/www1x.py index 1269a5422..508803240 100644 --- a/searx/engines/www1x.py +++ b/searx/engines/www1x.py @@ -10,11 +10,9 @@ @parse url, title, thumbnail, img_src, content """ -from urllib import urlencode -from urlparse import urljoin from lxml import html -import string import re +from searx.url_utils import urlencode, urljoin # engine dependent config categories = ['images'] @@ -55,7 +53,7 @@ def response(resp): cur_element += result_part # fix xml-error - cur_element = string.replace(cur_element, '"></a>', '"/></a>') + cur_element = cur_element.replace('"></a>', '"/></a>') dom = html.fromstring(cur_element) link = dom.xpath('//a')[0] diff --git a/searx/engines/www500px.py b/searx/engines/www500px.py index 546521ba3..7a2015ae9 100644 --- a/searx/engines/www500px.py +++ b/searx/engines/www500px.py @@ -13,8 +13,7 @@ """ from json import loads -from urllib import urlencode -from urlparse import urljoin +from searx.url_utils import urlencode, urljoin # engine dependent config categories = ['images'] diff --git a/searx/engines/xpath.py b/searx/engines/xpath.py index 0d39b28a8..f466697bd 100644 --- a/searx/engines/xpath.py +++ b/searx/engines/xpath.py @@ -1,13 +1,13 @@ from lxml import html -from urllib import urlencode, unquote -from urlparse import urlparse, urljoin from lxml.etree import _ElementStringResult, _ElementUnicodeResult from searx.utils import html_to_text +from searx.url_utils import unquote, urlencode, urljoin, urlparse search_url = None url_xpath = None content_xpath = None title_xpath = None +paging = False suggestion_xpath = '' results_xpath = '' diff --git a/searx/engines/yacy.py b/searx/engines/yacy.py index 7b1b6b35d..a62a1296e 100644 --- a/searx/engines/yacy.py +++ b/searx/engines/yacy.py @@ -13,8 +13,8 @@ # @todo parse video, audio and file results from json import loads -from urllib import urlencode from dateutil import parser +from searx.url_utils import urlencode from searx.utils import html_to_text diff --git a/searx/engines/yahoo.py b/searx/engines/yahoo.py index 5c62c2ed8..5387aaf54 100644 --- a/searx/engines/yahoo.py +++ b/searx/engines/yahoo.py @@ -11,10 +11,9 @@ @parse url, title, content, suggestion """ -from urllib import urlencode -from urlparse import unquote from lxml import html from searx.engines.xpath import extract_text, extract_url +from searx.url_utils import unquote, urlencode # engine dependent config categories = ['general'] diff --git a/searx/engines/yahoo_news.py b/searx/engines/yahoo_news.py index 1a0fd28f5..ae54a4acd 100644 --- a/searx/engines/yahoo_news.py +++ b/searx/engines/yahoo_news.py @@ -9,13 +9,13 @@ # @stable no (HTML can change) # @parse url, title, content, publishedDate -from urllib import urlencode +import re +from datetime import datetime, timedelta from lxml import html from searx.engines.xpath import extract_text, extract_url from searx.engines.yahoo import parse_url, _fetch_supported_languages, supported_languages_url -from datetime import datetime, timedelta -import re from dateutil import parser +from searx.url_utils import urlencode # engine dependent config categories = ['news'] diff --git a/searx/engines/yandex.py b/searx/engines/yandex.py index 65aee28b8..1c789f6cb 100644 --- a/searx/engines/yandex.py +++ b/searx/engines/yandex.py @@ -9,9 +9,9 @@ @parse url, title, content """ -from urllib import urlencode from lxml import html -from searx.search import logger +from searx import logger +from searx.url_utils import urlencode logger = logger.getChild('yandex engine') diff --git a/searx/engines/youtube_api.py b/searx/engines/youtube_api.py index 1dfca5166..6de18aa2c 100644 --- a/searx/engines/youtube_api.py +++ b/searx/engines/youtube_api.py @@ -9,8 +9,8 @@ # @parse url, title, content, publishedDate, thumbnail, embedded from json import loads -from urllib import urlencode from dateutil import parser +from searx.url_utils import urlencode # engine dependent config categories = ['videos', 'music'] diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py index 9b7ca64c8..9f01841f6 100644 --- a/searx/engines/youtube_noapi.py +++ b/searx/engines/youtube_noapi.py @@ -8,10 +8,10 @@ # @stable no # @parse url, title, content, publishedDate, thumbnail, embedded -from urllib import quote_plus from lxml import html from searx.engines.xpath import extract_text from searx.utils import list_get +from searx.url_utils import quote_plus # engine dependent config categories = ['videos', 'music'] |