diff options
author | Markus Heiser <markus.heiser@darmarit.de> | 2022-01-06 18:29:04 +0100 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarit.de> | 2022-01-10 11:22:38 +0100 |
commit | 2f4e567e904278f19c4c392fb9a222fcf0afec1c (patch) | |
tree | e720627732b243287c9ca374aff7defac623393f /searx/engines/startpage.py | |
parent | 1cbcddb3f703b0ee076ac5ce0b514246a21472ec (diff) | |
download | searxng-2f4e567e904278f19c4c392fb9a222fcf0afec1c.tar.gz searxng-2f4e567e904278f19c4c392fb9a222fcf0afec1c.zip |
[fix] Get an actual `sc` argument from startpage's home page.
Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
Diffstat (limited to 'searx/engines/startpage.py')
-rw-r--r-- | searx/engines/startpage.py | 39 |
1 files changed, 38 insertions, 1 deletions
```diff
diff --git a/searx/engines/startpage.py b/searx/engines/startpage.py
index eaa157705..f5448dd47 100644
--- a/searx/engines/startpage.py
+++ b/searx/engines/startpage.py
@@ -5,6 +5,7 @@
 """
 
 import re
+from time import time
 from urllib.parse import urlencode
 from unicodedata import normalize, combining
 
@@ -15,6 +16,7 @@ from lxml import html
 from babel import Locale
 from babel.localedata import locale_identifiers
 
+from searx import network
 from searx.utils import extract_text, eval_xpath, match_language
 
 # about
@@ -47,6 +49,41 @@ results_xpath = '//div[@class="w-gl__result__main"]'
 link_xpath = './/a[@class="w-gl__result-title result-link"]'
 content_xpath = './/p[@class="w-gl__description"]'
 
+# timestamp of the last fetch of 'sc' code
+sc_code_ts = 0
+sc_code = ''
+
+
+def get_sc_code(headers):
+    """Get an actual `sc` argument from startpage's home page.
+
+    Startpage puts a `sc` argument on every link. Without this argument
+    startpage considers the request is from a bot. We do not know what is
+    encoded in the value of the `sc` argument, but it seems to be a kind of a
+    *time-stamp*. This *time-stamp* is valid for a few hours.
+
+    This function scrap a new *time-stamp* from startpage's home page every hour
+    (3000 sec).
+
+    """
+
+    global sc_code_ts, sc_code  # pylint: disable=global-statement
+
+    if time() > (sc_code_ts + 3000):
+        logger.debug("query new sc time-stamp ...")
+
+        resp = network.get(base_url, headers=headers)
+        dom = html.fromstring(resp.text)
+
+        # href --> '/?sc=adrKJMgF8xwp20'
+        href = eval_xpath(dom, '//a[@class="footer-home__logo"]')[0].get('href')
+
+        sc_code = href[5:]
+        sc_code_ts = time()
+        logger.debug("new value is: %s", sc_code)
+
+    return sc_code
+
 
 # do search-request
 def request(query, params):
@@ -56,7 +93,7 @@ def request(query, params):
         'page': params['pageno'],
         'cat': 'web',
         # 'abp': "-1",
-        'sc': 'Mj4jZy61QETj20',
+        'sc': get_sc_code(params['headers']),
     }
 
     # set language if specified
```