summaryrefslogtreecommitdiff
path: root/searx
diff options
context:
space:
mode:
authorAdam Tauber <asciimoo@gmail.com>2014-12-19 22:40:37 +0100
committerAdam Tauber <asciimoo@gmail.com>2014-12-19 22:40:37 +0100
commitf14177381433618a5b4f5bcff83e4e1a19487f02 (patch)
treef349bb5f1ab15a8d6d3ce33e713e24c24028ade4 /searx
parent813247b37ab00a1496468df4cff33199ae04d6b4 (diff)
downloadsearxng-f14177381433618a5b4f5bcff83e4e1a19487f02.tar.gz
searxng-f14177381433618a5b4f5bcff83e4e1a19487f02.zip
[mod][fix] https rewrite refactor ++ fixes
Diffstat (limited to 'searx')
-rw-r--r--searx/https_rewrite.py68
-rw-r--r--searx/https_rules/Soundcloud.xml2
-rw-r--r--searx/webapp.py59
3 files changed, 68 insertions, 61 deletions
diff --git a/searx/https_rewrite.py b/searx/https_rewrite.py
index 9faf3599d..408474a44 100644
--- a/searx/https_rewrite.py
+++ b/searx/https_rewrite.py
@@ -16,6 +16,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
'''
import re
+from urlparse import urlparse
from lxml import etree
from os import listdir
from os.path import isfile, isdir, join
@@ -86,15 +87,23 @@ def load_single_https_ruleset(filepath):
# TODO hack, which convert a javascript regex group
# into a valid python regex group
- rule_from = ruleset.attrib.get('from').replace('$', '\\')
- rule_to = ruleset.attrib.get('to').replace('$', '\\')
+ rule_from = ruleset.attrib['from'].replace('$', '\\')
+ if rule_from.endswith('\\'):
+ rule_from = rule_from[:-1]+'$'
+ rule_to = ruleset.attrib['to'].replace('$', '\\')
+ if rule_to.endswith('\\'):
+ rule_to = rule_to[:-1]+'$'
# TODO, not working yet because of the hack above,
# currently doing that in webapp.py
# rule_from_rgx = re.compile(rule_from, re.I)
# append rule
- rules.append((rule_from, rule_to))
+ try:
+ rules.append((re.compile(rule_from, re.I | re.U), rule_to))
+ except:
+ # TODO log regex error
+ continue
# this child define an exclusion
elif ruleset.tag == 'exclusion':
@@ -143,3 +152,56 @@ def load_https_rules(rules_path):
https_rules.append(ruleset)
print(' * {n} https-rules loaded'.format(n=len(https_rules)))
+
+
+
+def https_url_rewrite(result):
+ skip_https_rewrite = False
+ # check if HTTPS rewrite is possible
+ for target, rules, exclusions in https_rules:
+
+ # check if target regex match with url
+ if target.match(result['parsed_url'].netloc):
+ # process exclusions
+ for exclusion in exclusions:
+ # check if exclusion match with url
+ if exclusion.match(result['url']):
+ skip_https_rewrite = True
+ break
+
+ # skip https rewrite if required
+ if skip_https_rewrite:
+ break
+
+ # process rules
+ for rule in rules:
+ try:
+ new_result_url = rule[0].sub(rule[1], result['url'])
+ except:
+ break
+
+ # parse new url
+ new_parsed_url = urlparse(new_result_url)
+
+ # continiue if nothing was rewritten
+ if result['url'] == new_result_url:
+ continue
+
+ # get domainname from result
+ # TODO, does only work correct with TLD's like
+ # asdf.com, not for asdf.com.de
+ # TODO, using publicsuffix instead of this rewrite rule
+ old_result_domainname = '.'.join(
+ result['parsed_url'].hostname.split('.')[-2:])
+ new_result_domainname = '.'.join(
+ new_parsed_url.hostname.split('.')[-2:])
+
+ # check if rewritten hostname is the same,
+ # to protect against wrong or malicious rewrite rules
+ if old_result_domainname == new_result_domainname:
+ # set new url
+ result['url'] = new_result_url
+
+ # target has matched, do not search over the other rules
+ break
+ return result
diff --git a/searx/https_rules/Soundcloud.xml b/searx/https_rules/Soundcloud.xml
index 0baa5832b..6958e8cbc 100644
--- a/searx/https_rules/Soundcloud.xml
+++ b/searx/https_rules/Soundcloud.xml
@@ -89,7 +89,7 @@
<rule from="^http://([aiw]\d|api|wis)\.sndcdn\.com/"
to="https://$1.sndcdn.com/" />
- <rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.)?soundcloud\.com/"
+ <rule from="^http://((?:api|backstage|blog|connect|developers|ec-media|eventlogger|help-assets|media|visuals|w|www)\.|)soundcloud\.com/"
to="https://$1soundcloud.com/" />
<rule from="^https?://scbackstage\.wpengine\.netdna-cdn\.com/"
diff --git a/searx/webapp.py b/searx/webapp.py
index a2a135e9a..915fb3564 100644
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -41,15 +41,12 @@ from searx.utils import (
UnicodeWriter, highlight_content, html_to_text, get_themes
)
from searx.version import VERSION_STRING
-from searx.https_rewrite import https_rules
from searx.languages import language_codes
+from searx.https_rewrite import https_url_rewrite
from searx.search import Search
from searx.query import Query
from searx.autocomplete import backends as autocomplete_backends
-from urlparse import urlparse
-import re
-
static_path, templates_path, themes =\
get_themes(settings['themes_path']
@@ -215,59 +212,7 @@ def index():
if settings['server']['https_rewrite']\
and result['parsed_url'].scheme == 'http':
- skip_https_rewrite = False
-
- # check if HTTPS rewrite is possible
- for target, rules, exclusions in https_rules:
-
- # check if target regex match with url
- if target.match(result['url']):
- # process exclusions
- for exclusion in exclusions:
- # check if exclusion match with url
- if exclusion.match(result['url']):
- skip_https_rewrite = True
- break
-
- # skip https rewrite if required
- if skip_https_rewrite:
- break
-
- # process rules
- for rule in rules:
- try:
- # TODO, precompile rule
- p = re.compile(rule[0])
-
- # rewrite url if possible
- new_result_url = p.sub(rule[1], result['url'])
- except:
- break
-
- # parse new url
- new_parsed_url = urlparse(new_result_url)
-
- # continiue if nothing was rewritten
- if result['url'] == new_result_url:
- continue
-
- # get domainname from result
- # TODO, does only work correct with TLD's like
- # asdf.com, not for asdf.com.de
- # TODO, using publicsuffix instead of this rewrite rule
- old_result_domainname = '.'.join(
- result['parsed_url'].hostname.split('.')[-2:])
- new_result_domainname = '.'.join(
- new_parsed_url.hostname.split('.')[-2:])
-
- # check if rewritten hostname is the same,
- # to protect against wrong or malicious rewrite rules
- if old_result_domainname == new_result_domainname:
- # set new url
- result['url'] = new_result_url
-
- # target has matched, do not search over the other rules
- break
+ result = https_url_rewrite(result)
if search.request_data.get('format', 'html') == 'html':
if 'content' in result: