LibreTranslate/app/language.py

from argostranslate import translate
from polyglot.detect.base import Detector

languages = translate.load_installed_languages()
__lang_codes = [l.code for l in languages]


def detect_languages(text):
    # detect batch processing
    if isinstance(text, list):
        is_batch = True
    else:
        is_batch = False
        text = [text]

    # get the candidates
    candidates = []
    for t in text:
        candidates.extend(Detector(t).languages)

    # total read bytes of the provided text
    read_bytes_total = sum(c.read_bytes for c in candidates)

    # only use candidates that are supported by argostranslate
    candidate_langs = list(filter(lambda l: l.read_bytes != 0 and l.code in __lang_codes, candidates))

    # this happens if no language could be detected
    if not candidate_langs:
        # use language "en" by default but with zero confidence
        return [
            {
                'confidence': 0.0,
                'language': "en"
            }
        ]

    # for multiple occurrences of the same language (can happen on batch detection)
    # calculate the average confidence for each language
    if is_batch:
        temp_average_list = []
        for lang_code in __lang_codes:
            # get all candidates for a specific language
            lc = list(filter(lambda l: l.code == lang_code, candidate_langs))
            if len(lc) > 1:
                # if more than one is present, calculate the average confidence
                lang = lc[0]
                lang.confidence = sum(l.confidence for l in lc) / len(lc)
                lang.read_bytes = sum(l.read_bytes for l in lc)
                temp_average_list.append(lang)
            elif lc:
                # otherwise just add it to the temporary list
                temp_average_list.append(lc[0])

        if temp_average_list:
            # replace the list
            candidate_langs = temp_average_list

    # sort the candidates descending based on the detected confidence
    candidate_langs.sort(key=lambda l: (l.confidence * l.read_bytes) / read_bytes_total, reverse=True)

    return [
        {
            'confidence': l.confidence,
            'language': l.code
        }
        for l in candidate_langs
    ]
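

# --- usage sketch (not part of the original module) -------------------------
# detect_languages() accepts a single string or a list of strings and returns
# a list of {'confidence': float, 'language': code} dicts, sorted by the
# read_bytes-weighted confidence computed above. This assumes at least the
# relevant Argos models (e.g. English and Spanish) are installed; the numbers
# shown in the comments are illustrative, not real output.
if __name__ == "__main__":
    print(detect_languages("Hello, how are you today?"))
    # e.g. [{'confidence': 98.0, 'language': 'en'}]

    print(detect_languages(["Hello, how are you today?", "Hola, ¿cómo estás hoy?"]))
    # e.g. [{'confidence': 97.0, 'language': 'en'}, {'confidence': 95.0, 'language': 'es'}]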