summary | refs | log | tree | commit | diff
path: root/tests
diff options
context:
space:
mode:
author Alexandre Flament <alex@al-f.net> 2020-10-02 18:13:56 +0200
committer Alexandre Flament <alex@al-f.net> 2020-10-02 18:13:56 +0200
commit 2006eb468087c045c46c7d9e1d771e8ab2dfed7b (patch)
tree 837ec4f49ee9fbc3637f3e24c015bc2542d8983e /tests
parent ecb9f28869f081dc75182053c727523ed7e20d12 (diff)
download searxng-2006eb468087c045c46c7d9e1d771e8ab2dfed7b.tar.gz
searxng-2006eb468087c045c46c7d9e1d771e8ab2dfed7b.zip
[mod] move extract_text, extract_url to searx.utils
Diffstat (limited to 'tests')
-rw-r--r-- tests/unit/test_utils.py 49
1 files changed, 45 insertions, 4 deletions
diff --git a/tests/unit/test_utils.py b/tests/unit/test_utils.py
index 69f5ef92a..f3a98ad71 100644
--- a/tests/unit/test_utils.py
+++ b/tests/unit/test_utils.py
@@ -1,4 +1,7 @@
# -*- coding: utf-8 -*-
+import lxml.etree
+from lxml import html
+
from searx.testing import SearxTestCase
from searx import utils
@@ -16,7 +19,30 @@ class TestUtils(SearxTestCase):
self.assertTrue(utils.searx_useragent().startswith('searx'))
def test_html_to_text(self):
- html = """
+ html_str = """
+ <a href="/testlink" class="link_access_account">
+ <style>
+ .toto {
+ color: red;
+ }
+ </style>
+ <span class="toto">
+ <span>
+ <img src="test.jpg" />
+ </span>
+ </span>
+ <span class="titi">
+ Test text
+ </span>
+ <script>value='dummy';</script>
+ </a>
+ """
+ self.assertIsInstance(utils.html_to_text(html_str), str)
+ self.assertIsNotNone(utils.html_to_text(html_str))
+ self.assertEqual(utils.html_to_text(html_str), "Test text")
+
+ def test_extract_text(self):
+ html_str = """
<a href="/testlink" class="link_access_account">
<span class="toto">
<span>
@@ -28,9 +54,24 @@ class TestUtils(SearxTestCase):
</span>
</a>
"""
- self.assertIsInstance(utils.html_to_text(html), str)
- self.assertIsNotNone(utils.html_to_text(html))
- self.assertEqual(utils.html_to_text(html), "Test text")
+ dom = html.fromstring(html_str)
+ self.assertEqual(utils.extract_text(dom), 'Test text')
+ self.assertEqual(utils.extract_text(dom.xpath('//span')), 'Test text')
+ self.assertEqual(utils.extract_text(dom.xpath('//img/@src')), 'test.jpg')
+ self.assertEqual(utils.extract_text(dom.xpath('//unexistingtag')), '')
+
+ def test_extract_url(self):
+ def f(html_str, search_url):
+ return utils.extract_url(html.fromstring(html_str), search_url)
+ self.assertEqual(f('<span id="42">https://example.com</span>', 'http://example.com/'), 'https://example.com/')
+ self.assertEqual(f('https://example.com', 'http://example.com/'), 'https://example.com/')
+ self.assertEqual(f('//example.com', 'http://example.com/'), 'http://example.com/')
+ self.assertEqual(f('//example.com', 'https://example.com/'), 'https://example.com/')
+ self.assertEqual(f('/path?a=1', 'https://example.com'), 'https://example.com/path?a=1')
+ with self.assertRaises(lxml.etree.ParserError):
+ f('', 'https://example.com')
+ with self.assertRaises(Exception):
+ utils.extract_url([], 'https://example.com')
def test_html_to_text_invalid(self):
html = '<p><b>Lorem ipsum</i>dolor sit amet</p>'