summary | refs | log | tree | commit | diff
path: root/searx/engines/startpage.py
diff options
context:
space:
mode:
author Thomas Pointhuber <thomas.pointhuber@gmx.at> 2015-10-24 16:15:30 +0200
committer Thomas Pointhuber <thomas.pointhuber@gmx.at> 2015-10-24 16:19:47 +0200
commit 4508c966677708a2926afb1d05f134f252d8f93a (patch)
tree b40dd5604d03706bc333f86c6409891f1a17f389 /searx/engines/startpage.py
parent a959977ab445bca91059d98bf8ca822fffc51fdf (diff)
download searxng-4508c966677708a2926afb1d05f134f252d8f93a.tar.gz
searxng-4508c966677708a2926afb1d05f134f252d8f93a.zip
[enh] fix content fetching, parse published date from description
Diffstat (limited to 'searx/engines/startpage.py')
-rw-r--r-- searx/engines/startpage.py 43
1 file changed, 37 insertions, 6 deletions
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index 7d58f7f01..a91cafa00 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -12,6 +12,8 @@
from lxml import html
from cgi import escape
+from dateutil import parser
+from datetime import datetime, timedelta
import re
from searx.engines.xpath import extract_text
@@ -79,15 +81,44 @@ def response(resp):
title = escape(extract_text(link))
- if result.xpath('./p[@class="desc"]'):
- content = escape(extract_text(result.xpath('./p[@class="desc"]')))
+ if result.xpath('./p[@class="desc clk"]'):
+ content = escape(extract_text(result.xpath('./p[@class="desc clk"]')))
else:
content = ''
- # append result
- results.append({'url': url,
- 'title': title,
- 'content': content})
+ published_date = None
+
+ # check if search result starts with something like: "2 Sep 2014 ... "
+ if re.match("^([1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4} \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+ published_date = parser.parse(date_string, dayfirst=True)
+
+ # fix content string
+ content = content[date_pos:]
+
+ # check if search result starts with something like: "5 days ago ... "
+ elif re.match("^[0-9]+ days? ago \.\.\. ", content):
+ date_pos = content.find('...')+4
+ date_string = content[0:date_pos-5]
+
+ # calculate datetime
+ published_date = datetime.now() - timedelta(days=int(re.match(r'\d+', date_string).group()))
+
+ # fix content string
+ content = content[date_pos:]
+
+ if published_date:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content,
+ 'publishedDate': published_date})
+ else:
+ # append result
+ results.append({'url': url,
+ 'title': title,
+ 'content': content})
# return results
return results