Diffstat (limited to 'searx/engines/startpage.py')
-rw-r--r--  searx/engines/startpage.py  53
1 file changed, 46 insertions, 7 deletions
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 9d5b4befe..a91cafa00 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -12,6 +12,8 @@
 
 from lxml import html
 from cgi import escape
+from dateutil import parser
+from datetime import datetime, timedelta
 import re
 from searx.engines.xpath import extract_text
 
@@ -66,20 +68,57 @@ def response(resp):
         url = link.attrib.get('href')
 
         # block google-ad url's
-        if re.match("^http(s|)://www.google.[a-z]+/aclk.*$", url):
+        if re.match("^http(s|)://(www\.)?google\.[a-z]+/aclk.*$", url):
+            continue
+
+        # block startpage search url's
+        if re.match("^http(s|)://(www\.)?startpage\.com/do/search\?.*$", url):
+            continue
+
+        # block ixquick search url's
+        if re.match("^http(s|)://(www\.)?ixquick\.com/do/search\?.*$", url):
             continue
 
         title = escape(extract_text(link))
 
-        if result.xpath('./p[@class="desc"]'):
-            content = escape(extract_text(result.xpath('./p[@class="desc"]')))
+        if result.xpath('./p[@class="desc clk"]'):
+            content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
         else:
             content = ''
 
-        # append result
-        results.append({'url': url,
-                        'title': title,
-                        'content': content})
+        published_date = None
+
+        # check if search result starts with something like: "2 Sep 2014 ... "
+        if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+            date_pos = content.find('...')+4
+            date_string = content[0:date_pos-5]
+            published_date = parser.parse(date_string, dayfirst=True)
+
+            # fix content string
+            content = content[date_pos:]
+
+        # check if search result starts with something like: "5 days ago ... "
+        elif re.match("^[0-9]+ days? ago \.\.\. ", content):
+            date_pos = content.find('...')+4
+            date_string = content[0:date_pos-5]
+
+            # calculate datetime
+            published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
+
+            # fix content string
+            content = content[date_pos:]
+
+        if published_date:
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content,
+                            'publishedDate': published_date})
+        else:
+            # append result
+            results.append({'url': url,
+                            'title': title,
+                            'content': content})
 
     # return results
     return results
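The tightened URL filters above also accept hosts without a leading "www." and additionally skip Startpage's and Ixquick's own search pages. A small standalone sketch (not part of the commit; the sample URLs are invented) of how those patterns behave:

import re

# Raw-string equivalents of the filter patterns in the commit.
blocked = [
    r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$",           # google ad redirects
    r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$",     # startpage search pages
    r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$",       # ixquick search pages
]

# Invented sample URLs, only for illustration.
urls = [
    "https://www.google.de/aclk?sa=l&ai=xyz",
    "https://startpage.com/do/search?query=searx",
    "https://example.org/article",
]

for url in urls:
    skipped = any(re.match(pattern, url) for pattern in blocked)
    print(url, "-> skipped" if skipped else "-> kept")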
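The first date branch recognizes a leading absolute date such as "2 Sep 2014 ... ", parses it with dateutil and strips it from the snippet. A rough standalone sketch of that behavior, using an invented content string:

import re
from dateutil import parser

# Invented example of a Startpage snippet with an absolute-date prefix.
content = "2 Sep 2014 ... Example snippet text returned by Startpage."

if re.match(r"^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
    date_pos = content.find('...') + 4        # index just past "... "
    date_string = content[0:date_pos - 5]     # "2 Sep 2014"
    published_date = parser.parse(date_string, dayfirst=True)
    content = content[date_pos:]              # snippet without the date prefix

print(published_date)  # 2014-09-02 00:00:00
print(content)         # Example snippet text returned by Startpage.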
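The second branch handles relative prefixes such as "5 days ago ... ", where no absolute date is given, so the published date is approximated by subtracting the day count from the current time. Again a rough sketch with an invented snippet:

import re
from datetime import datetime, timedelta

# Invented example of a Startpage snippet with a relative-date prefix.
content = "5 days ago ... Another snippet from a Startpage result."

if re.match(r"^[0-9]+ days? ago \.\.\. ", content):
    date_pos = content.find('...') + 4
    date_string = content[0:date_pos - 5]     # "5 days ago"
    days = int(re.match(r'\d+', date_string).group())
    published_date = datetime.now() - timedelta(days=days)
    content = content[date_pos:]

print(published_date)  # roughly five days before the current time
print(content)         # Another snippet from a Startpage result.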