author    Bnyro <bnyro@tutanota.com>                 2023-09-08 08:40:22 +0200
committer Markus Heiser <markus.heiser@darmarIT.de>  2023-09-08 11:47:13 +0200
commit    a3d7e9c285d8c5947c9cb67b1587e7732ac4bc90 (patch)
tree      a4cf8c00e667f9cf558f0cf44b1d72248b2e1e1d /searx
parent    668b1d55abb7fe0b8afea8e96de7e844e679586b (diff)
[mod] utils.py: add markdown_to_text helper function
Diffstat (limited to 'searx')
 searx/engines/lemmy.py | 18 ++++++------------
 searx/utils.py         | 24 ++++++++++++++++++++++++
 2 files changed, 30 insertions(+), 12 deletions(-)
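
The change replaces the engine-local _format_content() in lemmy.py with a shared markdown_to_text() helper in searx/utils.py, so other engines can reuse the same Markdown-to-plain-text conversion. A minimal usage sketch, assuming a working SearXNG checkout with markdown-it-py installed; the input string and printed result are illustrative, not taken from the Lemmy API:

    # markdown_to_text() renders Markdown to HTML with markdown-it-py and then
    # strips the markup with the existing html_to_text() helper.
    from searx.utils import markdown_to_text

    description = "**Lemmy** is a [link aggregator](https://join-lemmy.org) for the fediverse."
    print(markdown_to_text(description))
    # expected (illustrative): Lemmy is a link aggregator for the fediverse.
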
diff --git a/searx/engines/lemmy.py b/searx/engines/lemmy.py
index 8c1b22151..bc3cc9cf6 100644
--- a/searx/engines/lemmy.py
+++ b/searx/engines/lemmy.py
@@ -42,10 +42,9 @@ Implementations
from datetime import datetime
from urllib.parse import urlencode
-from markdown_it import MarkdownIt
from flask_babel import gettext
-from searx.utils import html_to_text
+from searx.utils import markdown_to_text
about = {
"website": 'https://lemmy.ml/',
@@ -78,11 +77,6 @@ def request(query, params):
return params
-def _format_content(content):
- html = MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(content)
- return html_to_text(html)
-
-
def _get_communities(json):
results = []
@@ -97,7 +91,7 @@ def _get_communities(json):
{
'url': result['community']['actor_id'],
'title': result['community']['title'],
- 'content': _format_content(result['community'].get('description', '')),
+ 'content': markdown_to_text(result['community'].get('description', '')),
'img_src': result['community'].get('icon', result['community'].get('banner')),
'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'),
'metadata': metadata,
@@ -114,7 +108,7 @@ def _get_users(json):
{
'url': result['person']['actor_id'],
'title': result['person']['name'],
- 'content': _format_content(result['person'].get('bio', '')),
+ 'content': markdown_to_text(result['person'].get('bio', '')),
}
)
@@ -140,7 +134,7 @@ def _get_posts(json):
content = result['post'].get('body', '').strip()
if content:
- content = _format_content(content)
+ content = markdown_to_text(content)
results.append(
{
@@ -164,7 +158,7 @@ def _get_comments(json):
content = result['comment'].get('content', '').strip()
if content:
- content = _format_content(content)
+ content = markdown_to_text(content)
metadata = (
f"&#x25B2; {result['counts']['upvotes']} &#x25BC; {result['counts']['downvotes']}"
@@ -176,7 +170,7 @@ def _get_comments(json):
{
'url': result['comment']['ap_id'],
'title': result['post']['name'],
- 'content': _format_content(result['comment']['content']),
+ 'content': markdown_to_text(result['comment']['content']),
'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
'metadata': metadata,
}
diff --git a/searx/utils.py b/searx/utils.py
index 161983011..7ddd2305a 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -15,6 +15,7 @@ from os.path import splitext, join
from random import choice
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
+from markdown_it import MarkdownIt
from lxml import html
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@@ -158,6 +159,29 @@ def html_to_text(html_str: str) -> str:
return s.get_text()
+def markdown_to_text(markdown_str: str) -> str:
+ """Extract text from a Markdown string
+
+ Args:
+ * markdown_str (str): string Markdown
+
+ Returns:
+ * str: extracted text
+
+ Examples:
+ >>> markdown_to_text('[example](https://example.com)')
+ 'example'
+
+ >>> markdown_to_text('## Headline')
+ 'Headline'
+ """
+
+ html_str = (
+ MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(markdown_str)
+ )
+ return html_to_text(html_str)
+
+
def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
"""Extract text from a lxml result