summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAlexandre Flament <alex@al-f.net>2023-01-17 23:24:04 +0100
committerGitHub <noreply@github.com>2023-01-17 23:24:04 +0100
commit6d72ef3cbec5246fd52498932760ea2f975a3112 (patch)
tree9414146f13f65fa476f09bf0337e3ccce175062f
parent13b0c251c45c3d14700723b25b601be56178e8df (diff)
parent99b5272d9a17ffd813fc8c0b2f3cae3201d2398e (diff)
downloadsearxng-6d72ef3cbec5246fd52498932760ea2f975a3112.tar.gz
searxng-6d72ef3cbec5246fd52498932760ea2f975a3112.zip
Merge pull request #2109 from ahmad-alkadri/fix/highlight-full-word
Standalone words highlighting for query result in non-CJK characters
-rw-r--r--searx/webutils.py69
-rw-r--r--tests/unit/test_webutils.py19
2 files changed, 63 insertions, 25 deletions
diff --git a/searx/webutils.py b/searx/webutils.py
index 35f4401d2..7b9a8045c 100644
--- a/searx/webutils.py
+++ b/searx/webutils.py
@@ -113,31 +113,68 @@ def prettify_url(url, max_length=74):
return url
+def contains_cjko(s: str) -> bool:
+ """This function checks whether or not a string contains Chinese, Japanese,
+ or Korean characters. It employs regex and uses Unicode escape sequences to
+ match any character in a set of Unicode ranges.
+
+ Args:
+ s (str): string to be checked.
+
+ Returns:
+ bool: True if the input s contains the characters and False otherwise.
+ """
+ unicode_ranges = (
+ '\u4e00-\u9fff' # Chinese characters
+ '\u3040-\u309f' # Japanese hiragana
+ '\u30a0-\u30ff' # Japanese katakana
+ '\u4e00-\u9faf' # Japanese kanji
+ '\uac00-\ud7af' # Korean hangul syllables
+ '\u1100-\u11ff' # Korean hangul jamo
+ )
+ return bool(re.search(fr'[{unicode_ranges}]', s))
+
+
+def regex_highlight_cjk(word: str) -> str:
+ """Generate the regex pattern to match for a given word according
+ to whether the word contains CJK characters.
+ If the word contains no CJK character, the regex pattern matches it
+ only as a standalone word (word boundary before, no word character
+ after); if it does, the pattern matches every occurrence of the word
+ throughout the text, regardless of the surrounding characters.
+
+ Args:
+ word (str): the word to be matched with regex pattern.
+
+ Returns:
+ str: the regex pattern for the word.
+ """
+ rword = re.escape(word)
+ if contains_cjko(rword):
+ return fr'({rword})'
+ else:
+ return fr'\b({rword})(?!\w)'
+
+
def highlight_content(content, query):
if not content:
return None
+
# ignoring html contents
# TODO better html content detection
if content.find('<') != -1:
return content
- if content.lower().find(query.lower()) > -1:
- query_regex = '({0})'.format(re.escape(query))
- content = re.sub(query_regex, '<span class="highlight">\\1</span>', content, flags=re.I | re.U)
- else:
- regex_parts = []
- for chunk in query.split():
- chunk = chunk.replace('"', '')
- if len(chunk) == 0:
- continue
- elif len(chunk) == 1:
- regex_parts.append('\\W+{0}\\W+'.format(re.escape(chunk)))
- else:
- regex_parts.append('{0}'.format(re.escape(chunk)))
- query_regex = '({0})'.format('|'.join(regex_parts))
- content = re.sub(query_regex, '<span class="highlight">\\1</span>', content, flags=re.I | re.U)
-
+ querysplit = query.split()
+ queries = []
+ for qs in querysplit:
+ qs = qs.replace("'", "").replace('"', '').replace(" ", "")
+ if len(qs) > 0:
+ queries.extend(re.findall(regex_highlight_cjk(qs), content, flags=re.I | re.U))
+ if len(queries) > 0:
+ for q in set(queries):
+ content = re.sub(regex_highlight_cjk(q), f'<span class="highlight">{q}</span>', content)
return content
diff --git a/tests/unit/test_webutils.py b/tests/unit/test_webutils.py
index 31a0f86ce..acf1aeeb7 100644
--- a/tests/unit/test_webutils.py
+++ b/tests/unit/test_webutils.py
@@ -28,32 +28,33 @@ class TestWebUtils(SearxTestCase):
content = 'a'
query = 'test'
- self.assertEqual(webutils.highlight_content(content, query), content)
+ self.assertEqual(webutils.highlight_content(content, query), 'a')
query = 'a test'
- self.assertEqual(webutils.highlight_content(content, query), content)
+ self.assertEqual(webutils.highlight_content(content, query), '<span class="highlight">a</span>')
data = (
('" test "', 'a test string', 'a <span class="highlight">test</span> string'),
- ('"a"', 'this is a test string', 'this is<span class="highlight"> a </span>test string'),
+ ('"a"', 'this is a test string', 'this is <span class="highlight">a</span> test string'),
(
'a test',
'this is a test string that matches entire query',
- 'this is <span class="highlight">a test</span> string that matches entire query',
+ 'this is <span class="highlight">a</span> <span class="highlight">test</span> string that matches entire query',
),
(
'this a test',
'this is a string to test.',
(
- '<span class="highlight">this</span> is<span class="highlight"> a </span>'
- 'string to <span class="highlight">test</span>.'
+ '<span class="highlight">this</span> is <span class="highlight">a</span> string to <span class="highlight">test</span>.'
),
),
(
'match this "exact phrase"',
'this string contains the exact phrase we want to match',
- (
- '<span class="highlight">this</span> string contains the <span class="highlight">exact</span>'
- ' <span class="highlight">phrase</span> we want to <span class="highlight">match</span>'
+ ''.join(
+ [
+ '<span class="highlight">this</span> string contains the <span class="highlight">exact</span> ',
+ '<span class="highlight">phrase</span> we want to <span class="highlight">match</span>',
+ ]
),
),
)