From 6c421110b57c695e9c0a0d9212bc271d701d17da Mon Sep 17 00:00:00 2001 From: ahmad-alkadri Date: Sat, 14 Jan 2023 23:00:08 +0000 Subject: [PATCH 1/2] Perso branch - added cjk check and enclosement --- searx/webutils.py | 66 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 50 insertions(+), 16 deletions(-) diff --git a/searx/webutils.py b/searx/webutils.py index 35f4401d2..150b376fa 100644 --- a/searx/webutils.py +++ b/searx/webutils.py @@ -113,31 +113,65 @@ def prettify_url(url, max_length=74): return url +def contains_cjko(s: str) -> bool: + """This function checks whether or not a string contains Chinese, Japanese, + or Korean characters. It employs regex and uses the u escape sequence to + match any character in a set of Unicode ranges. + + Args: + s (str): string to be checked. + + Returns: + bool: True if the input s contains the characters and False otherwise. + """ + unicode_ranges = ('\u4e00-\u9fff' # Chinese characters + '\u3040-\u309f' # Japanese hiragana + '\u30a0-\u30ff' # Japanese katakana + '\u4e00-\u9faf' # Japanese kanji + '\uac00-\ud7af' # Korean hangul syllables + '\u1100-\u11ff' # Korean hangul jamo + ) + return bool(re.search(fr'[{unicode_ranges}]', s)) + + +def regex_highlight_cjk(word: str) -> str: + """Generate the regex pattern to match for a given word according + to whether or not the word contains CJK characters. + If the word contains CJK characters, the regex pattern will match + any occurrence of the word throughout the text, ignoring word + boundaries; if not, it will match only a standalone word, i.e. one + that starts at a word boundary and is not followed by a word character. + + Args: + word (str): the word to be matched with regex pattern. + + Returns: + str: the regex pattern for the word. 
+ """ + rword = re.escape(word) + if contains_cjko(rword): + return fr'({rword})' + else: + return fr'\b({rword})(?!\w)' + + def highlight_content(content, query): if not content: return None + # ignoring html contents # TODO better html content detection if content.find('<') != -1: return content - if content.lower().find(query.lower()) > -1: - query_regex = '({0})'.format(re.escape(query)) - content = re.sub(query_regex, '\\1', content, flags=re.I | re.U) - else: - regex_parts = [] - for chunk in query.split(): - chunk = chunk.replace('"', '') - if len(chunk) == 0: - continue - elif len(chunk) == 1: - regex_parts.append('\\W+{0}\\W+'.format(re.escape(chunk))) - else: - regex_parts.append('{0}'.format(re.escape(chunk))) - query_regex = '({0})'.format('|'.join(regex_parts)) - content = re.sub(query_regex, '\\1', content, flags=re.I | re.U) - + querysplit = query.split() + queries = [] + for qs in querysplit: + queries.extend(re.findall(regex_highlight_cjk(qs), content, flags=re.I | re.U)) + if len(queries) > 0: + for q in set(queries): + content = re.sub(regex_highlight_cjk(q), f'{q}', content) return content From 99b5272d9a17ffd813fc8c0b2f3cae3201d2398e Mon Sep 17 00:00:00 2001 From: ahmad-alkadri Date: Sun, 15 Jan 2023 15:08:11 +0000 Subject: [PATCH 2/2] A little fix and modified the testing for content highlight --- searx/webutils.py | 19 +++++++++++-------- tests/unit/test_webutils.py | 19 ++++++++++--------- 2 files changed, 21 insertions(+), 17 deletions(-) diff --git a/searx/webutils.py b/searx/webutils.py index 150b376fa..7b9a8045c 100644 --- a/searx/webutils.py +++ b/searx/webutils.py @@ -124,13 +124,14 @@ def contains_cjko(s: str) -> bool: Returns: bool: True if the input s contains the characters and False otherwise. 
""" - unicode_ranges = ('\u4e00-\u9fff' # Chinese characters - '\u3040-\u309f' # Japanese hiragana - '\u30a0-\u30ff' # Japanese katakana - '\u4e00-\u9faf' # Japanese kanji - '\uac00-\ud7af' # Korean hangul syllables - '\u1100-\u11ff' # Korean hangul jamo - ) + unicode_ranges = ( + '\u4e00-\u9fff' # Chinese characters + '\u3040-\u309f' # Japanese hiragana + '\u30a0-\u30ff' # Japanese katakana + '\u4e00-\u9faf' # Japanese kanji + '\uac00-\ud7af' # Korean hangul syllables + '\u1100-\u11ff' # Korean hangul jamo + ) return bool(re.search(fr'[{unicode_ranges}]', s)) @@ -168,7 +169,9 @@ def highlight_content(content, query): querysplit = query.split() queries = [] for qs in querysplit: - queries.extend(re.findall(regex_highlight_cjk(qs), content, flags=re.I | re.U)) + qs = qs.replace("'", "").replace('"', '').replace(" ", "") + if len(qs) > 0: + queries.extend(re.findall(regex_highlight_cjk(qs), content, flags=re.I | re.U)) if len(queries) > 0: for q in set(queries): content = re.sub(regex_highlight_cjk(q), f'{q}', content) diff --git a/tests/unit/test_webutils.py b/tests/unit/test_webutils.py index 31a0f86ce..acf1aeeb7 100644 --- a/tests/unit/test_webutils.py +++ b/tests/unit/test_webutils.py @@ -28,32 +28,33 @@ class TestWebUtils(SearxTestCase): content = 'a' query = 'test' - self.assertEqual(webutils.highlight_content(content, query), content) + self.assertEqual(webutils.highlight_content(content, query), 'a') query = 'a test' - self.assertEqual(webutils.highlight_content(content, query), content) + self.assertEqual(webutils.highlight_content(content, query), 'a') data = ( ('" test "', 'a test string', 'a test string'), - ('"a"', 'this is a test string', 'this is a test string'), + ('"a"', 'this is a test string', 'this is a test string'), ( 'a test', 'this is a test string that matches entire query', - 'this is a test string that matches entire query', + 'this is a test string that matches entire query', ), ( 'this a test', 'this is a string to test.', ( - 'this is a 
' - 'string to test.' + 'this is a string to test.' ), ), ( 'match this "exact phrase"', 'this string contains the exact phrase we want to match', - ( - 'this string contains the exact' - ' phrase we want to match' + ''.join( + [ + 'this string contains the exact ', + 'phrase we want to match', + ] ), ), )