summary refs log tree commit diff
path: root/searx/utils.py
diff options
context:
space:
mode:
author Bnyro <bnyro@tutanota.com> 2023-09-08 08:40:22 +0200
committer Markus Heiser <markus.heiser@darmarIT.de> 2023-09-08 11:47:13 +0200
commit a3d7e9c285d8c5947c9cb67b1587e7732ac4bc90 (patch)
tree a4cf8c00e667f9cf558f0cf44b1d72248b2e1e1d /searx/utils.py
parent 668b1d55abb7fe0b8afea8e96de7e844e679586b (diff)
download searxng-a3d7e9c285d8c5947c9cb67b1587e7732ac4bc90.tar.gz
searxng-a3d7e9c285d8c5947c9cb67b1587e7732ac4bc90.zip
[mod] utils.py: add markdown_to_text helper function
Diffstat (limited to 'searx/utils.py')
-rw-r--r-- searx/utils.py 24
1 files changed, 24 insertions, 0 deletions
diff --git a/searx/utils.py b/searx/utils.py
index 161983011..7ddd2305a 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -15,6 +15,7 @@ from os.path import splitext, join
from random import choice
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
+from markdown_it import MarkdownIt
from lxml import html
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@@ -158,6 +159,29 @@ def html_to_text(html_str: str) -> str:
return s.get_text()
+def markdown_to_text(markdown_str: str) -> str:
+ """Extract the plain text from a Markdown string.
+
+ Args:
+ * markdown_str (str): Markdown source; it is rendered to HTML and then passed to ``html_to_text``
+
+ Returns:
+ * str: the text that ``html_to_text`` returns for the rendered HTML
+
+ Examples:
+ >>> markdown_to_text('[example](https://example.com)')
+ 'example'
+
+ >>> markdown_to_text('## Headline')
+ 'Headline'
+ """
+ # Render with the "commonmark" preset, typographer option on, "replacements" and "smartquotes" rules enabled.
+ html_str = (
+ MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(markdown_str)
+ )
+ return html_to_text(html_str)
+
+
def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
"""Extract text from a lxml result