Merge pull request #2109 from ahmad-alkadri/fix/highlight-full-word

Standalone words highlighting for query result in non-CJK characters
This commit is contained in:
Alexandre Flament 2023-01-17 23:24:04 +01:00 committed by GitHub
commit 6d72ef3cbe
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 63 additions and 25 deletions

View file

@ -113,31 +113,68 @@ def prettify_url(url, max_length=74):
return url return url
def contains_cjko(s: str) -> bool:
    """Return whether a string contains Chinese, Japanese or Korean characters.

    The check is a single regular-expression search against a character class
    built from the Unicode ranges listed below.

    Args:
        s (str): string to be checked.

    Returns:
        bool: True if ``s`` contains at least one character from the listed
        ranges, False otherwise (including for the empty string).
    """
    unicode_ranges = (
        '\u4e00-\u9fff'  # CJK unified ideographs (Chinese characters, Japanese kanji)
        '\u3400-\u4dbf'  # CJK unified ideographs, extension A
        '\u3040-\u309f'  # Japanese hiragana
        '\u30a0-\u30ff'  # Japanese katakana
        '\uac00-\ud7af'  # Korean hangul syllables
        '\u1100-\u11ff'  # Korean hangul jamo
        '\u3130-\u318f'  # Korean hangul compatibility jamo
    )
    return bool(re.search(f'[{unicode_ranges}]', s))
def regex_highlight_cjk(word: str) -> str:
    """Build the regex pattern used to highlight ``word`` in a text.

    The pattern depends on whether the word contains CJK characters:

    - If it does, the pattern matches every occurrence of the word, even
      inside a longer run of characters.
    - If it does not, the pattern matches the word only when it stands
      alone, i.e. when it is neither preceded nor followed by a word
      character.

    In both cases the word itself is the pattern's single capture group.

    Args:
        word (str): the word to be matched; it is escaped, so regex
            metacharacters in it are matched literally.

    Returns:
        str: the regex pattern for the word.
    """
    rword = re.escape(word)
    if contains_cjko(word):
        return fr'({rword})'
    # Lookarounds instead of ``\b``: ``\b`` in front of a word that starts
    # with a non-word character (".net", "#tag") would demand a word
    # character right before it, which is the opposite of "standalone".
    return fr'(?<!\w)({rword})(?!\w)'
def highlight_content(content, query):
    """Wrap the query terms found in ``content`` in highlight ``<span>`` tags.

    The query is split on whitespace; single and double quotes are removed
    from each term and empty terms are dropped.  Each remaining term is turned
    into a pattern by ``regex_highlight_cjk`` and matched case-insensitively.
    Every match is replaced by ``<span class="highlight">…</span>`` around the
    text exactly as it appears in ``content``.

    Args:
        content: the text to highlight.
        query: the search query whose terms should be highlighted.

    Returns:
        ``None`` if ``content`` is empty or ``None``; ``content`` unchanged if
        it contains ``<`` or if no usable term is left in the query; otherwise
        the highlighted text.
    """
    if not content:
        return None

    # ignoring html contents
    # TODO better html content detection
    if '<' in content:
        return content

    terms = set()
    for chunk in query.split():
        term = chunk.replace("'", "").replace('"', '')
        if term:
            terms.add(term)
    if not terms:
        return content

    # Longest terms first, so that a term wins over its own prefix when both
    # could match at the same position; the secondary key keeps the pattern
    # order (and thus the result) deterministic.
    ordered_terms = sorted(terms, key=lambda term: (-len(term), term))
    pattern = re.compile('|'.join(regex_highlight_cjk(term) for term in ordered_terms), flags=re.I | re.U)

    # One single pass over the original text: matches can never land inside
    # markup inserted for another term.  A function replacement is used so
    # that backslashes in the matched text are not read as template escapes.
    return pattern.sub(lambda match: f'<span class="highlight">{match.group(0)}</span>', content)

View file

@ -28,32 +28,33 @@ class TestWebUtils(SearxTestCase):
content = 'a' content = 'a'
query = 'test' query = 'test'
self.assertEqual(webutils.highlight_content(content, query), content) self.assertEqual(webutils.highlight_content(content, query), 'a')
query = 'a test' query = 'a test'
self.assertEqual(webutils.highlight_content(content, query), content) self.assertEqual(webutils.highlight_content(content, query), '<span class="highlight">a</span>')
data = ( data = (
('" test "', 'a test string', 'a <span class="highlight">test</span> string'), ('" test "', 'a test string', 'a <span class="highlight">test</span> string'),
('"a"', 'this is a test string', 'this is<span class="highlight"> a </span>test string'), ('"a"', 'this is a test string', 'this is <span class="highlight">a</span> test string'),
( (
'a test', 'a test',
'this is a test string that matches entire query', 'this is a test string that matches entire query',
'this is <span class="highlight">a test</span> string that matches entire query', 'this is <span class="highlight">a</span> <span class="highlight">test</span> string that matches entire query',
), ),
( (
'this a test', 'this a test',
'this is a string to test.', 'this is a string to test.',
( (
'<span class="highlight">this</span> is<span class="highlight"> a </span>' '<span class="highlight">this</span> is <span class="highlight">a</span> string to <span class="highlight">test</span>.'
'string to <span class="highlight">test</span>.'
), ),
), ),
( (
'match this "exact phrase"', 'match this "exact phrase"',
'this string contains the exact phrase we want to match', 'this string contains the exact phrase we want to match',
( ''.join(
'<span class="highlight">this</span> string contains the <span class="highlight">exact</span>' [
' <span class="highlight">phrase</span> we want to <span class="highlight">match</span>' '<span class="highlight">this</span> string contains the <span class="highlight">exact</span> ',
'<span class="highlight">phrase</span> we want to <span class="highlight">match</span>',
]
), ),
), ),
) )