diff --git a/AUTHORS.rst b/AUTHORS.rst index ef0800bb0..23d8d6db6 100644 --- a/AUTHORS.rst +++ b/AUTHORS.rst @@ -178,3 +178,4 @@ features or generally made SearXNG better: - `Bearz314 `_ - Tommaso Colella `` - @AgentScrubbles +- Filip Mikina `` diff --git a/docs/dev/engines/online/github_code.rst b/docs/dev/engines/online/github_code.rst new file mode 100644 index 000000000..12082f29f --- /dev/null +++ b/docs/dev/engines/online/github_code.rst @@ -0,0 +1,8 @@ +.. _github code engine: + +=========== +Github Code +=========== + +.. automodule:: searx.engines.github_code + :members: diff --git a/searx/engines/github_code.py b/searx/engines/github_code.py new file mode 100644 index 000000000..4bafe9c0d --- /dev/null +++ b/searx/engines/github_code.py @@ -0,0 +1,272 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +"""GitHub code search with `search syntax`_ as described in `Constructing a +search query`_ in the documentation of GitHub's REST API. + +.. _search syntax: + https://docs.github.com/en/search-github/getting-started-with-searching-on-github/understanding-the-search-syntax +.. _Constructing a search query: + https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#constructing-a-search-query +.. _Github REST API for code search: + https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code +.. _Github REST API auth for code search: + https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code--fine-grained-access-tokens + +Configuration +============= + +The engine has the following mandatory setting: + +- :py:obj:`ghc_auth` + Change the authentication method used when using the API, defaults to none. + +Optional settings are: + +- :py:obj:`ghc_highlight_matching_lines` + Control the highlighting of the matched text (turns off/on). +- :py:obj:`ghc_strip_new_lines` + Strip new lines at the start or end of each code fragment. 
+- :py:obj:`ghc_strip_whitespace` + Strip any whitespace at the start or end of each code fragment. +- :py:obj:`ghc_insert_block_separator` + Add a `...` between each code fragment before merging them. + +.. code:: yaml + + - name: github code + engine: github_code + shortcut: ghc + ghc_auth: + type: "none" + + - name: github code + engine: github_code + shortcut: ghc + ghc_auth: + type: "personal_access_token" + token: "" + ghc_highlight_matching_lines: true + ghc_strip_whitespace: true + ghc_strip_new_lines: true + + + - name: github code + engine: github_code + shortcut: ghc + ghc_auth: + type: "bearer" + token: "" + +Implementation +=============== + +GitHub does not return the code line indices alongside the code fragment in the +search API. Since these are not super important for the user experience all the +code lines are just relabeled (starting from 1) and appended (a disjoint set of +code blocks in a single file might be returned from the API). +""" + +from __future__ import annotations + +import typing as t +from urllib.parse import urlencode, urlparse + +from pygments.lexers import guess_lexer_for_filename +from pygments.util import ClassNotFound +from searx.result_types import EngineResults +from searx.extended_types import SXNG_Response +from searx.network import raise_for_httperror + +# about +about = { + "website": 'https://github.com/', + "wikidata_id": 'Q364', + "official_api_documentation": 'https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code', + "use_official_api": True, + "require_api_key": False, + "results": 'JSON', +} + +# engine dependent config +categories = ['code'] + + +search_url = 'https://api.github.com/search/code?sort=indexed&{query}&{page}' +# https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#text-match-metadata +accept_header = 'application/vnd.github.text-match+json' +paging = True + +ghc_auth = { + "type": "none", + "token": "", +} +"""Change the method of authenticating to the 
GitHub API. + +``type`` needs to be one of ``none``, ``personal_access_token``, or ``bearer``. +When type is not ``none`` a token is expected to be passed as well in +``ghc_auth.token``. + +If there are any privacy concerns about generating a token, one can use the API +without authentication. The calls will be heavily rate limited, this is what the +API returns on such calls:: + + API rate limit exceeded for . + (But here's the good news: Authenticated requests get a higher rate limit) + +The personal access token or a bearer for an org or a group can be generated in +the `GitHub settings`_. + +.. _GitHub settings: + https://docs.github.com/en/rest/search/search?apiVersion=2022-11-28#search-code--fine-grained-access-tokens +""" + +ghc_highlight_matching_lines = True +"""Highlight the matching code lines.""" + +ghc_strip_new_lines = True +"""Strip leading and trailing newlines for each returned fragment. +Single file might return multiple code fragments. +""" + +ghc_strip_whitespace = False +"""Strip all leading and trailing whitespace for each returned fragment. +Single file might return multiple code fragments. Enabling this might break +code indentation. +""" + +ghc_api_version = "2022-11-28" +"""The version of the GitHub REST API. +""" + +ghc_insert_block_separator = False +"""Each file possibly consists of more than one code block that matches the +search, if this is set to true, the blocks will be separated with a ``...`` line. +This might break the lexer and thus result in the lack of code highlighting. +""" + + +def request(query: str, params: dict[str, t.Any]) -> None: + + params['url'] = search_url.format(query=urlencode({'q': query}), page=urlencode({'page': params['pageno']})) + params['headers']['Accept'] = accept_header + params['headers']['X-GitHub-Api-Version'] = ghc_api_version + + if ghc_auth['type'] == "none": + # Without the auth header the query fails, so add a dummy instead. + # Queries without auth are heavily rate limited. 
+ params['headers']['Authorization'] = "placeholder" + if ghc_auth['type'] == "personal_access_token": + params['headers']['Authorization'] = f"token {ghc_auth['token']}" + if ghc_auth['type'] == "bearer": + params['headers']['Authorization'] = f"Bearer {ghc_auth['token']}" + + params['raise_for_httperror'] = False + + +def get_code_language_name(filename: str, code_snippet: str) -> str | None: + """Returns a code language name by pulling information from the filename if + possible otherwise by scanning the passed code snippet. In case there is any + parsing error just default to no syntax highlighting.""" + try: + lexer = guess_lexer_for_filename(filename, _text=code_snippet) + if lexer is None: + return None + code_name_aliases = lexer.aliases + if len(code_name_aliases) == 0: + return None + return code_name_aliases[0] + except ClassNotFound: + return None + + +def extract_code(code_matches: list[dict[str, t.Any]]) -> tuple[list[str], set[int]]: + """ + Iterate over multiple possible matches, for each extract a code fragment. + GitHub additionally sends context for _word_ highlights; pygments supports + highlighting lines, as such we calculate which lines to highlight while + traversing the text. 
+ """ + lines: list[str] = [] + highlighted_lines_index: set[int] = set() + + for i, match in enumerate(code_matches): + if i > 0 and ghc_insert_block_separator: + lines.append("...") + buffer: list[str] = [] + highlight_groups = [highlight_group['indices'] for highlight_group in match['matches']] + + code: str = match['fragment'] + original_code_lenght = len(code) + + if ghc_strip_whitespace: + code = code.lstrip() + if ghc_strip_new_lines: + code = code.lstrip("\n") + + offset = original_code_lenght - len(code) + + if ghc_strip_whitespace: + code = code.rstrip() + if ghc_strip_new_lines: + code = code.rstrip("\n") + + for i, letter in enumerate(code): + if len(highlight_groups) > 0: + # the API ensures these are sorted already, and we have a + # guaranteed match in the code (all indices are in the range 0 + # and len(fragment)), so only check the first highlight group + [after, before] = highlight_groups[0] + if after <= (i + offset) < before: + # pygments enumerates lines from 1, highlight the next line + highlighted_lines_index.add(len(lines) + 1) + highlight_groups.pop(0) + + if letter == "\n": + lines.append("".join(buffer)) + buffer = [] + continue + + buffer.append(letter) + lines.append("".join(buffer)) + return lines, highlighted_lines_index + + +def response(resp: SXNG_Response) -> EngineResults: + results = EngineResults() + + if resp.status_code == 422: + # on a invalid search term the status code 422 "Unprocessable Content" + # is returned / e.g. 
search term is "user: foo" instead of "user:foo" + return results + # raise for other errors + raise_for_httperror(resp) + + for item in resp.json().get('items', []): + repo = item['repository'] + text_matches = item['text_matches'] + # ensure picking only the code contents in the blob + code_matches = [ + match for match in text_matches if match["object_type"] == "FileContent" and match["property"] == "content" + ] + lines, highlighted_lines_index = extract_code(code_matches) + if not ghc_highlight_matching_lines: + highlighted_lines_index: set[int] = set() + + code_snippet = "\n".join(lines) + + kwargs: dict[str, t.Any] = { + 'template': 'code.html', + 'url': item['html_url'], + 'title': f"{repo['full_name']} · {item['path']}", + 'content': repo['description'], + 'repository': repo['html_url'], + 'codelines': [(i + 1, line) for (i, line) in enumerate(lines)], + 'hl_lines': highlighted_lines_index, + 'code_language': get_code_language_name(filename=item['name'], code_snippet=code_snippet), + # important to set for highlighting + 'strip_whitespace': ghc_strip_whitespace, + 'strip_new_lines': ghc_strip_new_lines, + 'parsed_url': urlparse(item['html_url']), + } + results.add(results.types.LegacyResult(**kwargs)) + + return results diff --git a/searx/engines/searchcode_code.py b/searx/engines/searchcode_code.py index 7cfe2ce71..2196b0ad2 100644 --- a/searx/engines/searchcode_code.py +++ b/searx/engines/searchcode_code.py @@ -70,6 +70,8 @@ def response(resp): 'codelines': sorted(lines.items()), 'code_language': code_language, 'template': 'code.html', + 'strip_whitespace': True, + 'strip_new_lines': True, } ) diff --git a/searx/settings.yml b/searx/settings.yml index d21192651..0cd293d7e 100644 --- a/searx/settings.yml +++ b/searx/settings.yml @@ -983,6 +983,24 @@ engines: engine: github shortcut: gh + - name: github code + engine: github_code + shortcut: ghc + disabled: true + ghc_auth: + # type is one of: + # * none + # * personal_access_token + # * bearer + # When none 
is passed, the token is not required. + type: "none" + token: "token" + # specify whether to highlight the matching lines to the query + ghc_highlight_matching_lines: true + ghc_strip_new_lines: true + ghc_strip_whitespace: false + timeout: 10.0 + - name: codeberg # https://docs.searxng.org/dev/engines/online/gitea.html engine: gitea diff --git a/searx/templates/simple/result_templates/code.html b/searx/templates/simple/result_templates/code.html index 49326aed5..bcde94358 100644 --- a/searx/templates/simple/result_templates/code.html +++ b/searx/templates/simple/result_templates/code.html @@ -25,7 +25,7 @@ {%- endif -%}
- {{- result.codelines|code_highlighter(result.code_language)|safe -}} + {{- result.codelines|code_highlighter(result.code_language, result.hl_lines, result.strip_whitespace, result.strip_new_lines)|safe -}}
{{- result_sub_footer(result) -}} diff --git a/searx/webapp.py b/searx/webapp.py index 2dd7ddb08..9b590eeab 100755 --- a/searx/webapp.py +++ b/searx/webapp.py @@ -181,24 +181,32 @@ def _get_locale_rfc5646(locale): # code-highlighter @app.template_filter('code_highlighter') -def code_highlighter(codelines, language=None): +def code_highlighter(codelines, language=None, hl_lines=None, strip_whitespace=True, strip_new_lines=True): if not language: language = 'text' try: - # find lexer by programming language - lexer = get_lexer_by_name(language, stripall=True) + lexer = get_lexer_by_name(language, stripall=strip_whitespace, stripnl=strip_new_lines) except Exception as e: # pylint: disable=broad-except logger.warning("pygments lexer: %s " % e) # if lexer is not found, using default one - lexer = get_lexer_by_name('text', stripall=True) + lexer = get_lexer_by_name('text', stripall=strip_whitespace, stripnl=strip_new_lines) html_code = '' tmp_code = '' last_line = None line_code_start = None + def offset_hl_lines(hl_lines, start): + """ + hl_lines in pygments are expected to be relative to the input + """ + if hl_lines is None: + return None + + return [line - start + 1 for line in hl_lines] + # parse lines for line, code in codelines: if not last_line: @@ -208,7 +216,12 @@ def code_highlighter(codelines, language=None): if last_line is not None and last_line + 1 != line: # highlight last codepart - formatter = HtmlFormatter(linenos='inline', linenostart=line_code_start, cssclass="code-highlight") + formatter = HtmlFormatter( + linenos='inline', + linenostart=line_code_start, + cssclass="code-highlight", + hl_lines=offset_hl_lines(hl_lines, line_code_start), + ) html_code = html_code + highlight(tmp_code, lexer, formatter) # reset conditions for next codepart @@ -222,7 +235,12 @@ def code_highlighter(codelines, language=None): last_line = line # highlight last codepart - formatter = HtmlFormatter(linenos='inline', linenostart=line_code_start, cssclass="code-highlight") + 
formatter = HtmlFormatter( + linenos='inline', + linenostart=line_code_start, + cssclass="code-highlight", + hl_lines=offset_hl_lines(hl_lines, line_code_start), + ) html_code = html_code + highlight(tmp_code, lexer, formatter) return html_code diff --git a/tests/unit/settings/test_github_code.yml b/tests/unit/settings/test_github_code.yml new file mode 100644 index 000000000..2cf039138 --- /dev/null +++ b/tests/unit/settings/test_github_code.yml @@ -0,0 +1,13 @@ +# This SearXNG setup is used in unit tests + +use_default_settings: + + engines: + keep_only: [] + +engines: + + - name: github code + engine: github_code + shortcut: "ghc" + disabled: true diff --git a/tests/unit/test_engine_github_code.py b/tests/unit/test_engine_github_code.py new file mode 100644 index 000000000..d10081f28 --- /dev/null +++ b/tests/unit/test_engine_github_code.py @@ -0,0 +1,170 @@ +# SPDX-License-Identifier: AGPL-3.0-or-later +# pylint: disable=missing-module-docstring,disable=missing-class-docstring + +import logging +from unittest.mock import Mock +from urllib.parse import urlparse +from parameterized import parameterized + +import searx.engines +from tests import SearxTestCase +from searx.result_types import EngineResults + + +class GithubCodeTests(SearxTestCase): + + TEST_SETTINGS = "test_github_code.yml" + + def setUp(self): + super().setUp() + self.ghc = searx.engines.engines['github code'] + self.ghc.logger.setLevel(logging.INFO) + + def tearDown(self): + searx.search.load_engines([]) + + @parameterized.expand( + [ + [ + [ + { + "fragment": " - [Tab management](#tab-management)\n - [Buffer/window management]" + "(#bufferwindow-management)\n- [🎨 Highlights](#-highlights)", + "matches": [{"indices": [47, 53], "text": "Buffer"}, {"indices": [74, 80], "text": "buffer"}], + }, + { + "fragment": "To conditionally activate plugins, the best solution is to use the\n" + "[LazyVim VSCode extra](https://www.lazyvim.org/extras/vscode). 
However, " + "`packer.nvim` and `lazy.nvim` have built-in\nsupport for " + "`cond = vim.g.vscode` and `vim-plug` has a", + "matches": [ + {"indices": [68, 75], "text": "LazyVim"}, + {"indices": [102, 109], "text": "lazyvim"}, + ], + }, + ], + [ + " - [Tab management](#tab-management)", + " - [Buffer/window management](#bufferwindow-management)", + "- [🎨 Highlights](#-highlights)", + "To conditionally activate plugins, the best solution is to use the", + "[LazyVim VSCode extra](https://www.lazyvim.org/extras/vscode)." + " However, `packer.nvim` and `lazy.nvim` have built-in", + "support for `cond = vim.g.vscode` and `vim-plug` has a", + ], + {2, 5}, + ], + [ + [ + { + "fragment": "\n| `uf` | Toggle format (global) |\n" + "| `uF` | Toggle format (buffer) |\n" + "| `us` | Toggle spelling |\n", + "matches": [{"indices": [74, 80], "text": "buffer"}], + }, + ], + [ + "| `uf` | Toggle format (global) |", + "| `uF` | Toggle format (buffer) |", + "| `us` | Toggle spelling |", + ], + {2}, + ], + [ + [ + { + "fragment": "\n\n\n1\n2\n3\n4", + "matches": [{"indices": [3, 4], "text": "1"}], + }, + ], + [ + "1", + "2", + "3", + "4", + ], + {1}, + ], + [ + [ + { + "fragment": "placeholder", + "matches": [], + }, + ], + [ + "placeholder", + ], + set(), + ], + ] + ) + def test_code_extraction(self, code_matches, expected_code, expected_highlighted_lines): + code, highlights = self.ghc.extract_code(code_matches=code_matches) + self.assertEqual(code, expected_code) + self.assertEqual(highlights, expected_highlighted_lines) + + def test_transforms_response(self): + response = Mock() + response.json.return_value = { + "items": [ + { + "name": "TODO.md", + "path": "TODO.md", + "html_url": "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md", + "repository": { + "full_name": "folke/dot", + "html_url": "https://github.com/folke/dot", + "description": "☕️ My Dot Files", + }, + "text_matches": [ + { + "object_type": "FileContent", + "property": "content", + 
"fragment": "- [x] windows picker\n" + "- [x] toggle cwd / root (LazyVim)\n" + "- [x] dynamic workspace symbol", + "matches": [{"indices": [46, 53], "text": "LazyVim"}], + }, + { + "object_type": "FileContent", + "property": "content", + "fragment": "- [x] smart stops working after custom\n" + "- [x] edit in empty buffer\n" + "- [x] support toggling line nr for preview", + "matches": [{"indices": [59, 65], "text": "buffer"}, {"indices": [89, 93], "text": "line"}], + }, + ], + } + ] + } + response.status_code = 200 + results = self.ghc.response(response) + expected_results = EngineResults() + expected_results.add( + expected_results.types.LegacyResult( + **{ + 'url': "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md", + 'title': "folke/dot · TODO.md", + 'content': "☕️ My Dot Files", + 'repository': "https://github.com/folke/dot", + 'codelines': [ + (1, "- [x] windows picker"), + (2, "- [x] toggle cwd / root (LazyVim)"), + (3, "- [x] dynamic workspace symbol"), + (4, "- [x] smart stops working after custom"), + (5, "- [x] edit in empty buffer"), + (6, "- [x] support toggling line nr for preview"), + ], + 'hl_lines': {2, 5, 6}, + 'code_language': "markdown", + 'template': 'code.html', + 'strip_whitespace': False, + 'strip_new_lines': True, + 'parsed_url': urlparse( + "https://github.com/folke/dot/blob/3140f4f5720c3cc6b5034c624eb7706f8533a82c/TODO.md" + ), + } + ) + ) + self.assertEqual(results, expected_results)