LanguageDetector: strip non-language text to (hopefully) improve accuracy

This commit is contained in:
Alex Gleason 2022-11-08 17:06:16 -06:00 committed by marcin mikołajczak
parent 17d885fed8
commit 8bec926beb

View file

@ -15,10 +15,18 @@ defmodule Pleroma.Language.LanguageDetector do
end
end
# Reduces an HTML fragment to the plain text that is worth running language
# detection on.
#
# Elements matching the selector below (mentions, hashtags, links, inline
# quote/recipient markers, code and preformatted blocks) are removed before
# the text is extracted, since they are not natural-language content.
#
# The remaining text nodes are joined with a single space. With the default
# empty separator, `Floki.text/2` glues adjacent nodes together, so
# `<p>foo</p><p>bar</p>` would come out as "foobar" and throw off any
# whitespace-based word counting done on the result.
#
# Raises if `Floki.parse_fragment!/1` cannot parse `text`.
defp prepare_text(text) do
  text
  |> Floki.parse_fragment!()
  |> Floki.filter_out(
    ".h-card, .mention, .hashtag, .u-url, .quote-inline, .recipients-inline, code, pre"
  )
  |> Floki.text(sep: " ")
end
def detect(text) do
provider = get_provider()
{:ok, text} = text |> FastSanitize.strip_tags()
text = prepare_text(text)
word_count = text |> String.split(~r/\s+/) |> Enum.count()
if word_count < @words_threshold or !provider or !provider.configured? do