-rw-r--r--   .github/workflows/data-update.yml |  2
-rw-r--r--   AUTHORS.rst                       |  2
-rw-r--r--   searx/engines/apkmirror.py        |  2
-rw-r--r--   searx/engines/loc.py              | 68
-rw-r--r--   searx/engines/wikipedia.py        |  9
-rw-r--r--   searx/settings.yml                | 32
-rw-r--r--   searx/webutils.py                 |  5
-rw-r--r--   tests/unit/test_webutils.py       | 22
8 files changed, 112 insertions, 30 deletions
diff --git a/.github/workflows/data-update.yml b/.github/workflows/data-update.yml
index a97169767..70e491153 100644
--- a/.github/workflows/data-update.yml
+++ b/.github/workflows/data-update.yml
@@ -7,7 +7,7 @@ jobs:
   updateData:
     name: Update data
     runs-on: ubuntu-20.04
-    if: env.DATA_PR_TOKEN != null
+    if: secrets.DATA_PR_TOKEN != null
     steps:
       - name: Checkout
         uses: actions/checkout@v2
diff --git a/AUTHORS.rst b/AUTHORS.rst
index 036ae0fe1..b44a10b55 100644
--- a/AUTHORS.rst
+++ b/AUTHORS.rst
@@ -154,6 +154,6 @@ generally made searx better:
 - @mrwormo
 - Xiaoyu WEI @xywei
 - @joshu9h
-
+- Daniel Hones
diff --git a/searx/engines/apkmirror.py b/searx/engines/apkmirror.py
index a4c66e891..a9ddd711a 100644
--- a/searx/engines/apkmirror.py
+++ b/searx/engines/apkmirror.py
@@ -45,7 +45,7 @@ def response(resp):
     dom = html.fromstring(resp.text)
 
     # parse results
-    for result in eval_xpath_list(dom, './/div[@id="content"]/div[@class="listWidget"]/div[@class="appRow"]'):
+    for result in eval_xpath_list(dom, './/div[@id="content"]/div[@class="listWidget"]//div[@class="appRow"]'):
 
         link = eval_xpath_getindex(result, './/h5/a', 0)
         url = base_url + link.attrib.get('href') + '#downloads'
diff --git a/searx/engines/loc.py b/searx/engines/loc.py
new file mode 100644
index 000000000..5c09ceff2
--- /dev/null
+++ b/searx/engines/loc.py
@@ -0,0 +1,68 @@
+# SPDX-License-Identifier: AGPL-3.0-or-later
+"""
+
+ Library of Congress : images from Prints and Photographs Online Catalog
+
+"""
+
+from json import loads
+from urllib.parse import urlencode
+
+
+about = {
+    "website": 'https://www.loc.gov/pictures/',
+    "wikidata_id": 'Q131454',
+    "official_api_documentation": 'https://www.loc.gov/pictures/api',
+    "use_official_api": True,
+    "require_api_key": False,
+    "results": 'JSON',
+}
+
+categories = ['images']
+
+paging = True
+
+base_url = 'https://loc.gov/pictures/search/?'
+search_string = "&sp={page}&{query}&fo=json"
+
+IMG_SRC_FIXES = {
+    'https://tile.loc.gov/storage-services/': 'https://tile.loc.gov/storage-services/',
+    'https://loc.gov/pictures/static/images/': 'https://tile.loc.gov/storage-services/',
+    'https://www.loc.gov/pictures/cdn/': 'https://tile.loc.gov/storage-services/',
+}
+
+
+def request(query, params):
+
+    search_path = search_string.format(
+        query=urlencode({'q': query}),
+        page=params['pageno'])
+
+    params['url'] = base_url + search_path
+
+    return params
+
+
+def response(resp):
+    results = []
+
+    json_data = loads(resp.text)
+
+    for result in json_data['results']:
+        img_src = result['image']['full']
+        for url_prefix, url_replace in IMG_SRC_FIXES.items():
+            if img_src.startswith(url_prefix):
+                img_src = img_src.replace(url_prefix, url_replace)
+                break
+        else:
+            img_src = result['image']['thumb']
+        results.append({
+            'url': result['links']['item'],
+            'title': result['title'],
+            'img_src': img_src,
+            'thumbnail_src': result['image']['thumb'],
+            'author': result['creator'],
+            'template': 'images.html'
+        })
+
+    return results
diff --git a/searx/engines/wikipedia.py b/searx/engines/wikipedia.py
index eff301145..c8e589e64 100644
--- a/searx/engines/wikipedia.py
+++ b/searx/engines/wikipedia.py
@@ -22,6 +22,7 @@ about = {
 # search-url
 search_url = 'https://{language}.wikipedia.org/api/rest_v1/page/summary/{title}'
 supported_languages_url = 'https://meta.wikimedia.org/wiki/List_of_Wikipedias'
+language_variants = {"zh": ("zh-cn", "zh-hk", "zh-mo", "zh-my", "zh-sg", "zh-tw")}
 
 
 # set language in base_url
@@ -37,8 +38,12 @@ def request(query, params):
     if query.islower():
         query = query.title()
 
+    language = url_lang(params['language'])
     params['url'] = search_url.format(title=quote(query),
-                                      language=url_lang(params['language']))
+                                      language=language)
+
+    if params['language'].lower() in language_variants.get(language, []):
+        params['headers']['Accept-Language'] = params['language'].lower()
 
     params['headers']['User-Agent'] = searx_useragent()
     params['raise_for_httperror'] = False
@@ -60,7 +65,7 @@ def response(resp):
     if api_result.get('type') != 'standard':
         return []
 
-    title = api_result['title']
+    title = api_result['displaytitle']
     wikipedia_link = api_result['content_urls']['desktop']['page']
     results.append({'url': wikipedia_link, 'title': title})
diff --git a/searx/settings.yml b/searx/settings.yml
index f03d8aff3..87008eb20 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -583,25 +583,6 @@ engines:
-  - name : google play music
-    engine : xpath
-    search_url : https://play.google.com/store/search?q={query}&c=music
-    results_xpath : '//div[@class="WHE7ib mpg5gc"]'
-    title_xpath : './/div[@class="RZEgze"]//div[@title and not(@title="")]/a'
-    url_xpath : './/div[@class="RZEgze"]//div[@title and not(@title="")]/a/@href'
-    content_xpath : './/div[@class="RZEgze"]//a[@class="mnKHRc"]'
-    thumbnail_xpath : './/div[@class="uzcko"]/div/span[1]//img/@data-src'
-    categories : music
-    shortcut : gps
-    disabled : True
-    about:
-      website: https://play.google.com/
-      wikidata_id: Q79576
-      official_api_documentation:
-      use_official_api: false
-      require_api_key: false
-      results: HTML
-
   - name : geektimes
     engine : xpath
     paging : True
@@ -698,6 +679,11 @@ engines:
       require_api_key: false
      results: HTML
 
+  - name : library of congress
+    engine : loc
+    shortcut : loc
+    categories : images
+
   - name : lobste.rs
     engine : xpath
     search_url : https://lobste.rs/search?utf8=%E2%9C%93&q={query}&what=stories&order=relevance
@@ -1229,11 +1215,9 @@ engines:
     engine: xpath
     paging : True
     search_url : https://search.naver.com/search.naver?where=webkr&sm=osp_hty&ie=UTF-8&query={query}&start={pageno}
-    results_xpath: /html/body//ul[@id="elThumbnailResultArea"]/li
-    url_xpath : ./dl/dt/a[@class="title_link"]/@href
-    title_xpath : ./dl/dt/a[@class="title_link"]
-    content_xpath : ./dl/dd[@class="sh_web_passage"]
-    suggestion_xpath : /html/body//div[@class="sp_keyword section"]//a
+    url_xpath : //a[@class="link_tit"]/@href
+    title_xpath : //a[@class="link_tit"]
+    content_xpath : //a[@class="total_dsc"]/div
     first_page_num : 1
     page_size : 10
     disabled : True
diff --git a/searx/webutils.py b/searx/webutils.py
index 8be8fcecd..2464a097f 100644
--- a/searx/webutils.py
+++ b/searx/webutils.py
@@ -119,7 +119,10 @@ def highlight_content(content, query):
     else:
         regex_parts = []
         for chunk in query.split():
-            if len(chunk) == 1:
+            chunk = chunk.replace('"', '')
+            if len(chunk) == 0:
+                continue
+            elif len(chunk) == 1:
                 regex_parts.append('\\W+{0}\\W+'.format(re.escape(chunk)))
             else:
                 regex_parts.append('{0}'.format(re.escape(chunk)))
diff --git a/tests/unit/test_webutils.py b/tests/unit/test_webutils.py
index aa464688b..023374b04 100644
--- a/tests/unit/test_webutils.py
+++ b/tests/unit/test_webutils.py
@@ -34,6 +34,28 @@ class TestWebUtils(SearxTestCase):
         query = 'a test'
         self.assertEqual(webutils.highlight_content(content, query), content)
 
+        data = (
+            ('" test "',
+             'a test string',
+             'a <span class="highlight">test</span> string'),
+            ('"a"',
+             'this is a test string',
+             'this is<span class="highlight"> a </span>test string'),
+            ('a test',
+             'this is a test string that matches entire query',
+             'this is <span class="highlight">a test</span> string that matches entire query'),
+            ('this a test',
+             'this is a string to test.',
+             ('<span class="highlight">this</span> is<span class="highlight"> a </span>'
+              'string to <span class="highlight">test</span>.')),
+            ('match this "exact phrase"',
+             'this string contains the exact phrase we want to match',
+             ('<span class="highlight">this</span> string contains the <span class="highlight">exact</span>'
+              ' <span class="highlight">phrase</span> we want to <span class="highlight">match</span>'))
+        )
+        for query, content, expected in data:
+            self.assertEqual(webutils.highlight_content(content, query), expected)
+
 
 class TestUnicodeWriter(SearxTestCase):
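
A quick way to poke at the new loc.py engine is to load the module directly from a checkout of this tree and drive its request()/response() pair offline; the sketch below is only an illustration, not part of the commit. The JSON payload and query values are invented; only the field names (results, image.full, image.thumb, links.item, title, creator) and the request/response contract come from the code added above.

# Minimal sketch: load searx/engines/loc.py by path (it only needs the
# standard library) so the rest of the searx package is not imported.
# Run from the repository root; the payload below is made up.
import importlib.util
import json

spec = importlib.util.spec_from_file_location('loc', 'searx/engines/loc.py')
loc = importlib.util.module_from_spec(spec)
spec.loader.exec_module(loc)


class FakeResponse:
    """Stand-in for the HTTP response object the engine normally receives."""
    def __init__(self, payload):
        self.text = json.dumps(payload)


# request() only has to fill in params['url'] from the query and page number.
params = loc.request('fire truck', {'pageno': 1})
print(params['url'])
# -> https://loc.gov/pictures/search/?&sp=1&q=fire+truck&fo=json

# response() turns the Prints and Photographs JSON into image results; an
# img_src matching a prefix in IMG_SRC_FIXES gets that prefix rewritten,
# anything else falls back to the thumbnail (the for/else in loc.py).
payload = {
    'results': [{
        'image': {
            'full': 'https://tile.loc.gov/storage-services/service/pnp/example/0001f.jpg',
            'thumb': 'https://tile.loc.gov/storage-services/service/pnp/example/0001t.gif',
        },
        'links': {'item': 'https://www.loc.gov/pictures/item/00000001/'},
        'title': 'Example item',
        'creator': 'Example creator',
    }]
}
for result in loc.response(FakeResponse(payload)):
    print(result['title'], '->', result['img_src'])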