[mod] json_engine: add content_html_to_text and title_html_to_text

Some JSON APIs return HTML in either the title or the content. This commit adds two new parameters to the json_engine: content_html_to_text and title_html_to_text, both False by default. When one of them is True, searx.utils.html_to_text is applied to the corresponding field to remove the HTML tags. Update the crossref, openairedatasets and openairepublications engines.
author: Alexandre Flament <alex@al-f.net> 2021-02-10 16:40:03 +0100
committer: Alexandre Flament <alex@al-f.net> 2021-02-10 16:42:11 +0100
commit: ff84a1af35c04855ae6e5c2463b978111d8c9fb1 (patch)
tree: 16a3f60522b02539de1f7be0f03d9ff7380e990b /searx/engines/json_engine.py
parent: 436d366448131088a9cddcedc9b789a44b38ee97 (diff)
download: searxng-ff84a1af35c04855ae6e5c2463b978111d8c9fb1.tar.gz
searxng-ff84a1af35c04855ae6e5c2463b978111d8c9fb1.zip
1 files changed, 15 insertions, 5 deletions
diff --git a/searx/engines/json_engine.py b/searx/engines/json_engine.py
index f4a5ff6d2..8a04d34b2 100644
--- a/searx/engines/json_engine.py
+++ b/searx/engines/json_engine.py
@@ -3,13 +3,15 @@
 from collections.abc import Iterable
 from json import loads
 from urllib.parse import urlencode
-from searx.utils import to_string
+from searx.utils import to_string, html_to_text
 
 
 search_url = None
 url_query = None
 content_query = None
 title_query = None
+content_html_to_text = False
+title_html_to_text = False
 paging = False
 suggestion_query = ''
 results_query = ''
@@ -92,9 +94,17 @@ def request(query, params):
     return params
 
 
+def identity(arg):
+    return arg
+
+
 def response(resp):
     results = []
     json = loads(resp.text)
+
+    title_filter = html_to_text if title_html_to_text else identity
+    content_filter = html_to_text if content_html_to_text else identity
+
     if results_query:
         rs = query(json, results_query)
         if not len(rs):
@@ -111,8 +121,8 @@ def response(resp):
                 content = ""
             results.append({
                 'url': to_string(url),
-                'title': to_string(title),
-                'content': to_string(content),
+                'title': title_filter(to_string(title)),
+                'content': content_filter(to_string(content)),
             })
     else:
         for url, title, content in zip(
@@ -122,8 +132,8 @@ def response(resp):
         ):
             results.append({
                 'url': to_string(url),
-                'title': to_string(title),
-                'content': to_string(content),
+                'title': title_filter(to_string(title)),
+                'content': content_filter(to_string(content)),
             })
 
     if not suggestion_query:
author	Alexandre Flament <alex@al-f.net>	2021-02-10 16:40:03 +0100
committer	Alexandre Flament <alex@al-f.net>	2021-02-10 16:42:11 +0100
commit	ff84a1af35c04855ae6e5c2463b978111d8c9fb1 (patch)
tree	16a3f60522b02539de1f7be0f03d9ff7380e990b /searx/engines/json_engine.py
parent	436d366448131088a9cddcedc9b789a44b38ee97 (diff)
download	searxng-ff84a1af35c04855ae6e5c2463b978111d8c9fb1.tar.gz searxng-ff84a1af35c04855ae6e5c2463b978111d8c9fb1.zip