diff options
author | Bnyro <bnyro@tutanota.com> | 2023-09-08 08:40:22 +0200 |
---|---|---|
committer | Markus Heiser <markus.heiser@darmarIT.de> | 2023-09-08 11:47:13 +0200 |
commit | a3d7e9c285d8c5947c9cb67b1587e7732ac4bc90 (patch) | |
tree | a4cf8c00e667f9cf558f0cf44b1d72248b2e1e1d /searx/utils.py | |
parent | 668b1d55abb7fe0b8afea8e96de7e844e679586b (diff) | |
download | searxng-a3d7e9c285d8c5947c9cb67b1587e7732ac4bc90.tar.gz searxng-a3d7e9c285d8c5947c9cb67b1587e7732ac4bc90.zip |
[mod] utils.py: add markdown_to_text helper function
Diffstat (limited to 'searx/utils.py')
-rw-r--r-- | searx/utils.py | 24 |
1 file changed, 24 insertions, 0 deletions
```diff
diff --git a/searx/utils.py b/searx/utils.py
index 161983011..7ddd2305a 100644
--- a/searx/utils.py
+++ b/searx/utils.py
@@ -15,6 +15,7 @@ from os.path import splitext, join
 from random import choice
 from html.parser import HTMLParser
 from urllib.parse import urljoin, urlparse
+from markdown_it import MarkdownIt
 
 from lxml import html
 from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@@ -158,6 +159,29 @@ def html_to_text(html_str: str) -> str:
     return s.get_text()
 
 
+def markdown_to_text(markdown_str: str) -> str:
+    """Extract text from a Markdown string
+
+    Args:
+        * markdown_str (str): string Markdown
+
+    Returns:
+        * str: extracted text
+
+    Examples:
+        >>> markdown_to_text('[example](https://example.com)')
+        'example'
+
+        >>> markdown_to_text('## Headline')
+        'Headline'
+    """
+
+    html_str = (
+        MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(markdown_str)
+    )
+    return html_to_text(html_str)
+
+
 def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
     """Extract text from a lxml result
 
```