mirror of
https://github.com/LibreTranslate/LibreTranslate.git
synced 2024-06-02 12:10:01 +00:00
format detect.py
This commit is contained in:
parent
e7f29d2194
commit
df6e63fbe2
|
@ -1,71 +1,96 @@
|
|||
# Originally adapted from https://github.com/aboSamoor/polyglot/blob/master/polyglot/base.py
|
||||
|
||||
from abc import ABC
|
||||
from collections.abc import Iterable
|
||||
|
||||
import pycld2 as cld2
|
||||
|
||||
class UnknownLanguage(Exception):
    """Exception type for an unknown-language condition.

    Adds no state or behaviour on top of ``Exception``.
    """
|
||||
|
||||
|
||||
class Language(object):
    """A single language candidate with its code, name and detection stats."""

    def __init__(self, choice: "tuple[str, str, float | int, int]") -> None:
        """Build a candidate from a ``(name, code, confidence, bytesize)`` tuple.

        Args:
            choice: four-item tuple unpacked in the order name, code,
                confidence, bytesize. ``confidence`` is coerced with
                ``float()`` and ``bytesize`` with ``int()``.

        Raises:
            ValueError: if ``choice`` does not hold exactly four items, or a
                numeric field cannot be converted.
        """
        name, code, confidence, bytesize = choice
        self.code = code
        self.name = name
        self.confidence = float(confidence)
        self.read_bytes = int(bytesize)

    def __str__(self) -> str:
        """Return a fixed-width, single-line summary of the candidate."""
        return (
            "name: {:<12}code: {:<9}confidence: {:>5.1f} "
            "read bytes:{:>6}".format(
                self.name, self.code, self.confidence, self.read_bytes
            )
        )

    @staticmethod
    def from_code(code: str) -> "Language":
        """Create a candidate for ``code`` with empty name, confidence 100 and 0 bytes read."""
        return Language(("", code, 100, 0))
|
||||
|
||||
|
||||
class BaseDetector(ABC):
    """Detect the language used in a snippet of text."""

    def __init__(
        self,
        text: str,
        quiet: bool = False,
        allowed_languages: "Iterable[str] | None" = None,
    ) -> None:
        """Run detection on `text` and keep only the allowed candidates.

        Args:
            text (string): unicode string.
            quiet (bool): if true, exceptions will be silenced. It is only
                stored here; subclasses decide how to honour it.
            allowed_languages: values compared against each candidate's
                ``code``. When ``None``, ``self.supported_languages()`` is used.
        """
        # NOTE(review): candidates are filtered on ``lang.code``, but
        # ``Detector.supported_languages()`` returns capitalized language
        # *names*. With the default (None) the allowed set may therefore match
        # no candidate at all - confirm which representation is intended.
        self.allowed_languages: "frozenset[str]" = frozenset(
            self.supported_languages() if allowed_languages is None else allowed_languages
        )

        # False if the detector used Best Effort strategy in detection.
        self.reliable: bool = True
        # If true, exceptions will be silenced.
        self.quiet: bool = quiet

        self.languages: "list[Language]" = [
            lang
            for lang in self.detect(text)
            if lang.code in self.allowed_languages
        ]
        # Best candidate, or None when nothing survived the filter.
        self.language: "Language | None" = self.languages[0] if self.languages else None

        # Guard against the empty case: dereferencing ``None.confidence`` used
        # to raise AttributeError. No candidate at all is treated as unreliable.
        # NOTE(review): ``Language.from_code`` uses a confidence of 100, so the
        # 0.4 cutoff looks like it may be on a different scale - confirm.
        if self.language is None or self.language.confidence < 0.4:
            self.reliable = False

    def __str__(self) -> str:
        """Return the reliability flag followed by one numbered line per candidate."""
        text = "Prediction is reliable: {}\n".format(self.reliable)
        text += "\n".join(
            "Language {}: {}".format(position, str(lang))
            for position, lang in enumerate(self.languages, start=1)
        )
        return text

    @staticmethod
    def supported_languages() -> "list[str]":
        """Returns a list of the languages that this Detector can detect."""
        raise NotImplementedError()

    def detect(self, text: str) -> "list[Language]":
        """Decide which language is used to write the text."""
        raise NotImplementedError()
|
||||
|
||||
|
||||
class Detector(BaseDetector):
|
||||
""" Detect the language used in a snippet of text."""
|
||||
"""Detect the language used in a snippet of text."""
|
||||
|
||||
@staticmethod
def supported_languages() -> "list[str]":
    """Returns a list of the languages that can be detected by pycld2.

    Each entry is the ``name`` half of a ``cld2.LANGUAGES`` pair passed
    through ``str.capitalize()``; names starting with ``"X_"`` are left out.
    """
    return [
        name.capitalize()
        for name, _code in cld2.LANGUAGES
        if not name.startswith("X_")
    ]
|
||||
|
||||
def detect(self, text) -> "list[Language]":
|
||||
def detect(self, text: str) -> "list[Language]":
|
||||
"""Decide which language is used to write the text.
|
||||
The method tries first to detect the language with high reliability. If
|
||||
that is not possible, the method switches to the best effort strategy.
|
||||
|
|
Loading…
Reference in a new issue