From 742303d03011d01fc8ca9eb3a03468c44968950f Mon Sep 17 00:00:00 2001
From: Markus Heiser
Date: Sun, 28 Apr 2024 18:28:30 +0200
Subject: [PATCH] [mod] improve unit converter plugin
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- l10n support: parse and format decimal numbers by babel
- ability to add additional units
- improved unit detection (symbols are not unique)
- support for alias units (0,010C to F --> 32,018 °F)

Signed-off-by: Markus Heiser
---
 docs/src/searx.plugins.unit_converter.rst |   9 +
 searx/plugins/unit_converter.py           | 264 ++++++++++++++++++----
 2 files changed, 233 insertions(+), 40 deletions(-)
 create mode 100644 docs/src/searx.plugins.unit_converter.rst

diff --git a/docs/src/searx.plugins.unit_converter.rst b/docs/src/searx.plugins.unit_converter.rst
new file mode 100644
index 000000000..48d495d80
--- /dev/null
+++ b/docs/src/searx.plugins.unit_converter.rst
@@ -0,0 +1,9 @@
+.. _unit converter plugin:
+
+=====================
+Unit converter plugin
+=====================
+
+.. automodule:: searx.plugins.unit_converter
+   :members:
+
diff --git a/searx/plugins/unit_converter.py b/searx/plugins/unit_converter.py
index dd515aa72..6ac25b8d8 100644
--- a/searx/plugins/unit_converter.py
+++ b/searx/plugins/unit_converter.py
@@ -1,10 +1,29 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
-"""Calculate mathematical expressions using ack#eval
+"""A plugin for converting measured values from one unit to another unit (a
+unit converter).
+
+The plugin looks up the symbols (given in the query term) in a list of
+converters, each converter is one item in the list (compare
+:py:obj:`ADDITIONAL_UNITS`). If the symbols are ambiguous, the matching units
+of measurement are evaluated. The weighting in the evaluation results from the
+sorting of the :py:obj:`list of unit converters<symbol_to_si>`.
+
+Enable in ``settings.yml``:
+
+.. code:: yaml
+
+  enabled_plugins:
+    ..
+    - 'Unit converter plugin'
+
 """
 
-from flask_babel import gettext
+import re
+import babel.numbers
+from flask_babel import gettext, get_locale
+
+from searx import data
 
-from searx.data import WIKIDATA_UNITS
 
 name = "Unit converter plugin"
 description = gettext("Convert between units")
@@ -12,47 +31,212 @@ default_on = True
 
 CONVERT_KEYWORDS = ["in", "to", "as"]
 
-
-def _convert(from_value, source_si_factor, target_si_factor):
-    return from_value * source_si_factor / target_si_factor
+# inspired from https://stackoverflow.com/a/42475086
+RE_MEASURE = r'''
+(?P<sign>[-+]?)         # +/- or nothing for positive
+(\s*)                   # separator: white space or nothing
+(?P<number>[\d\.,]*)    # number: 1,000.00 (en) or 1.000,00 (de)
+(?P<E>[eE][-+]?\d+)?    # scientific notation: e(+/-)2 (*10^2)
+(\s*)                   # separator: white space or nothing
+(?P<unit>\S+)           # unit of measure
+'''
 
 
-def _parse_text_and_convert(search, splitted_query):
-    if len(splitted_query) != 2 or splitted_query[0].strip() == "" or splitted_query[1].strip() == "":
+ADDITIONAL_UNITS = [
+    {
+        "si_name": "Q11579",
+        "symbol": "°C",
+        "to_si": lambda val: val + 273.15,
+        "from_si": lambda val: val - 273.15,
+    },
+    {
+        "si_name": "Q11579",
+        "symbol": "°F",
+        "to_si": lambda val: (val + 459.67) * 5 / 9,
+        "from_si": lambda val: (val * 9 / 5) - 459.67,
+    },
+]
+"""Additional items to convert from a measure unit to a SI unit (vice versa).
+
+.. code:: python
+
+    {
+        "si_name": "Q11579",                 # Wikidata item ID of the SI unit (Kelvin)
+        "symbol": "°C",                      # symbol of the measure unit
+        "to_si": lambda val: val + 273.15,   # convert measure value (val) to SI unit
+        "from_si": lambda val: val - 273.15, # convert SI value (val) measure unit
+    },
+    {
+        "si_name": "Q11573",
+        "symbol": "mi",
+        "to_si": 1609.344,                   # convert measure value (val) to SI unit
+        "from_si": 1 / 1609.344              # convert SI value (val) measure unit
+    },
+
+The values of ``to_si`` and ``from_si`` can be of :py:obj:`float` (a multiplier)
+or a callable_ (val in / converted value returned).
+
+.. _callable: https://docs.python.org/3/glossary.html#term-callable
+"""
+
+
+ALIAS_SYMBOLS = {
+    '°C': ('C',),
+    '°F': ('F',),
+    'mi': ('L',),
+}
+"""Alias symbols for known unit of measure symbols / by example::
+
+    '°C': ('C', ...),  # list of alias symbols for °C (Q69362731)
+    '°F': ('F', ...),  # list of alias symbols for °F (Q99490479)
+    'mi': ('L',),      # list of alias symbols for mi (Q253276)
+"""
+
+
+SYMBOL_TO_SI = []
+
+
+def symbol_to_si():
+    """Generates a list of tuples, each tuple is a measure unit and the fields
+    in the tuple are:
+
+    0. Symbol of the measure unit (e.g. 'mi' for measure unit 'miles' Q253276)
+
+    1. SI name of the measure unit (e.g. Q11573 for SI unit 'metre')
+
+    2. ``from_si``: factor or callable to get the measure value from the SI value (e.g. 1 / 1609.344 for 'mi')
+
+    3. ``to_si``: factor or callable to get the SI value from the measure value (e.g. 1609.344 for 'mi')
+
+    4. Symbol of the origin measure unit (differs from field 0 only for alias symbols, e.g. '°C' for alias 'C')
+
+    The returned list is sorted, the first items are created from
+    ``WIKIDATA_UNITS``, the second group of items is built from
+    :py:obj:`ADDITIONAL_UNITS` and items created from :py:obj:`ALIAS_SYMBOLS`.
+
+    If you search this list for a symbol, then a match with a symbol from
+    Wikidata has the highest weighting (first hit in the list), followed by the
+    symbols from the :py:obj:`ADDITIONAL_UNITS` and the lowest weighting is
+    given to the symbols resulting from the aliases :py:obj:`ALIAS_SYMBOLS`.
+
+    """
+
+    global SYMBOL_TO_SI  # pylint: disable=global-statement
+    if SYMBOL_TO_SI:
+        return SYMBOL_TO_SI
+
+    # filter out units which can't be normalized to a SI unit and filter out
+    # units without a symbol / arcsecond does not have a symbol
+    # https://www.wikidata.org/wiki/Q829073
+
+    for item in data.WIKIDATA_UNITS.values():
+        if item['to_si_factor'] and item['symbol']:
+            SYMBOL_TO_SI.append(
+                (
+                    item['symbol'],
+                    item['si_name'],
+                    1 / item['to_si_factor'],  # from_si
+                    item['to_si_factor'],  # to_si
+                    item['symbol'],
+                )
+            )
+
+    for item in ADDITIONAL_UNITS:
+        SYMBOL_TO_SI.append(
+            (
+                item['symbol'],
+                item['si_name'],
+                item['from_si'],
+                item['to_si'],
+                item['symbol'],
+            )
+        )
+
+    alias_items = []
+    for item in SYMBOL_TO_SI:
+        for alias in ALIAS_SYMBOLS.get(item[0], ()):
+            alias_items.append(
+                (
+                    alias,
+                    item[1],
+                    item[2],  # from_si
+                    item[3],  # to_si
+                    item[0],  # origin unit
+                )
+            )
+    SYMBOL_TO_SI = SYMBOL_TO_SI + alias_items
+    return SYMBOL_TO_SI
+
+
+def _parse_text_and_convert(search, from_query, to_query):
+    """Convert the measure in ``from_query`` to unit ``to_query`` and store the result as 'conversion' answer."""
+    # pylint: disable=too-many-branches, too-many-locals
+
+    if not (from_query and to_query):
         return
 
-    from_value = ""
-    from_unit_key = ""
-
-    # only parse digits as value that belong together
-    read_alpha = False
-    for c in splitted_query[0]:
-        if not read_alpha and (c in ("-", ".") or str.isdigit(c)):
-            from_value += c
-            read_alpha = True
-        elif c != " ":
-            from_unit_key += c
-
-    to_unit_key = splitted_query[1].strip()
-
-    from_unit = None
-    to_unit = None
-
-    for unit in WIKIDATA_UNITS.values():
-        if unit['symbol'] == from_unit_key:
-            from_unit = unit
-
-        if unit['symbol'] == to_unit_key:
-            to_unit = unit
-
-        if from_unit and to_unit:
-            break
-
-    if from_unit is None or to_unit is None or to_unit.get('si_name') != from_unit.get('si_name'):
+    measured = re.match(RE_MEASURE, from_query, re.VERBOSE)
+    if not (measured and measured.group('number') and measured.group('unit')):
         return
 
-    result = _convert(float(from_value), from_unit['to_si_factor'], to_unit['to_si_factor'])
-    search.result_container.answers['conversion'] = {'answer': f"{result:g} {to_unit['symbol']}"}
+    # Symbols are not unique, if there are several hits for the from-unit, then
+    # the correct one must be determined by comparing it with the to-unit
+    # https://github.com/searxng/searxng/pull/3378#issuecomment-2080974863
+
+    # first: collecting possible units
+
+    source_list, target_list = [], []
+
+    for symbol, si_name, from_si, to_si, orig_symbol in symbol_to_si():
+
+        if symbol == measured.group('unit'):
+            source_list.append((si_name, to_si))
+        if symbol == to_query:
+            target_list.append((si_name, from_si, orig_symbol))
+
+    if not (source_list and target_list):
+        return
+
+    source_to_si = target_from_si = target_symbol = None
+
+    # second: find the right unit by comparing list of from-units with list of to-units
+
+    for source in source_list:
+        for target in target_list:
+            if source[0] == target[0]:  # compare si_name
+                source_to_si = source[1]
+                target_from_si = target[1]
+                target_symbol = target[2]
+
+    if not (source_to_si and target_from_si):
+        return
+
+    _locale = get_locale() or 'en_US'
+
+    value = measured.group('sign') + measured.group('number') + (measured.group('E') or '')
+    value = babel.numbers.parse_decimal(value, locale=_locale)
+
+    # convert value to SI unit
+
+    if isinstance(source_to_si, (float, int)):
+        value = float(value) * source_to_si
+    else:
+        value = source_to_si(float(value))
+
+    # convert value from SI unit to target unit
+
+    if isinstance(target_from_si, (float, int)):
+        value = float(value) * target_from_si
+    else:
+        value = target_from_si(float(value))
+
+    if measured.group('E'):
+        # when incoming notation is scientific, outgoing notation is scientific
+        result = babel.numbers.format_scientific(value, locale=_locale)
+    else:
+        result = babel.numbers.format_decimal(value, locale=_locale, format='#,##0.##########;-#')
+
+    search.result_container.answers['conversion'] = {'answer': f'{result} {target_symbol}'}
 
 
 def post_search(_request, search):
@@ -69,8 +253,8 @@ def post_search(_request, search):
     for query_part in query_parts:
         for keyword in CONVERT_KEYWORDS:
             if query_part == keyword:
-                keyword_split = query.split(keyword, 1)
-                _parse_text_and_convert(search, keyword_split)
+                from_query, to_query = query.split(keyword, 1)
+                _parse_text_and_convert(search, from_query.strip(), to_query.strip())
                 return True
 
     return True