LanguageDetector: strip non-language text to (hopefully) improve accuracy

2024-06-24 08:00:37 +00:00 · 2022-11-08 17:06:16 -06:00 · 2022-11-08 17:06:16 -06:00 · 8bec926beb
parent 17d885fed8
commit 8bec926beb
1 changed file with 9 additions and 1 deletion
--- a/lib/pleroma/language/language_detector.ex
+++ b/lib/pleroma/language/language_detector.ex
@@ -15,10 +15,18 @@ defmodule Pleroma.Language.LanguageDetector do
    end
  end

+  # Parses `text` as an HTML fragment, drops elements matching the selectors below, and returns the remaining text.
+  defp prepare_text(text) do
+    text
+    |> Floki.parse_fragment!()
+    |> Floki.filter_out(".h-card, .mention, .hashtag, .u-url, .quote-inline, .recipients-inline, code, pre")
+    |> Floki.text()
+  end
+
+
  def detect(text) do
    provider = get_provider()

-    {:ok, text} = text |> FastSanitize.strip_tags()
+    text = prepare_text(text)
    word_count = text |> String.split(~r/\s+/) |> Enum.count()

    if word_count < @words_threshold or !provider or !provider.configured? do