From 7763b9a87fe534bd85892884fdbb4bbb6b31c982 Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Wed, 19 Mar 2025 10:29:45 -0700 Subject: [PATCH] Truncate the length of Rich Media title and description fields Some sites like Instagram are serving obnoxiously long metadata fields --- changelog.d/truncate-rich-media.change | 1 + lib/pleroma/web/rich_media/parser.ex | 13 +++ .../rich_media/instagram_longtext.html | 90 +++++++++++++++++++ test/pleroma/web/rich_media/parser_test.exs | 7 ++ test/support/http_request_mock.ex | 8 +- 5 files changed, 118 insertions(+), 1 deletion(-) create mode 100644 changelog.d/truncate-rich-media.change create mode 100644 test/fixtures/rich_media/instagram_longtext.html diff --git a/changelog.d/truncate-rich-media.change b/changelog.d/truncate-rich-media.change new file mode 100644 index 000000000..1df064be1 --- /dev/null +++ b/changelog.d/truncate-rich-media.change @@ -0,0 +1 @@ +Truncate the length of Rich Media title and description fields diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex index a3a522d7a..9c8ec7a9f 100644 --- a/lib/pleroma/web/rich_media/parser.ex +++ b/lib/pleroma/web/rich_media/parser.ex @@ -4,6 +4,7 @@ defmodule Pleroma.Web.RichMedia.Parser do alias Pleroma.Web.RichMedia.Helpers + import Pleroma.Web.Metadata.Utils, only: [scrub_html_and_truncate: 2] require Logger @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config) @@ -63,8 +64,20 @@ defmodule Pleroma.Web.RichMedia.Parser do not match?({:ok, _}, Jason.encode(%{key => val})) end) |> Map.new() + |> truncate_title() + |> truncate_desc() end + defp truncate_title(%{"title" => title} = data) when is_binary(title), + do: %{data | "title" => scrub_html_and_truncate(title, 120)} + + defp truncate_title(data), do: data + + defp truncate_desc(%{"description" => desc} = data) when is_binary(desc), + do: %{data | "description" => scrub_html_and_truncate(desc, 200)} + + defp truncate_desc(data), do: data + @spec validate_page_url(URI.t() | binary()) :: :ok | :error defp validate_page_url(page_url) when is_binary(page_url) do validate_tld = @config_impl.get([Pleroma.Formatter, :validate_tld]) diff --git a/test/fixtures/rich_media/instagram_longtext.html b/test/fixtures/rich_media/instagram_longtext.html new file mode 100644 index 000000000..e833f408c --- /dev/null +++ b/test/fixtures/rich_media/instagram_longtext.html @@ -0,0 +1,90 @@ + + + + + + + + + + + + + + + + +CAPTURE THE ATLAS | ✨ A Once-in-a-Lifetime Shot: Total Lunar Eclipse + Aurora Substorm! 🔴💚 + +Last Thursday night, under the freezing skies of Northern Alaska, I... | Instagram + +
+
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/test/pleroma/web/rich_media/parser_test.exs b/test/pleroma/web/rich_media/parser_test.exs index 20f61badc..1f01d657a 100644 --- a/test/pleroma/web/rich_media/parser_test.exs +++ b/test/pleroma/web/rich_media/parser_test.exs @@ -61,6 +61,13 @@ defmodule Pleroma.Web.RichMedia.ParserTest do }} end + test "truncates title and description fields" do + {:ok, parsed} = Parser.parse("https://instagram.com/longtext") + + assert String.length(parsed["title"]) == 120 + assert String.length(parsed["description"]) == 200 + end + test "parses OEmbed and filters HTML tags" do assert Parser.parse("https://example.com/oembed") == {:ok, diff --git a/test/support/http_request_mock.ex b/test/support/http_request_mock.ex index 1c472fca9..a8f954af9 100644 --- a/test/support/http_request_mock.ex +++ b/test/support/http_request_mock.ex @@ -1494,6 +1494,11 @@ defmodule HttpRequestMock do {:ok, %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/twitter_card.html")}} end + def get("https://instagram.com/longtext", _, _, _) do + {:ok, + %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/instagram_longtext.html")}} + end + def get("https://example.com/non-ogp", _, _, _) do {:ok, %Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/non_ogp_embed.html")}} @@ -1720,7 +1725,8 @@ defmodule HttpRequestMock do "https://example.com/twitter-card", "https://google.com/", "https://pleroma.local/notice/9kCP7V", - "https://yahoo.com/" + "https://yahoo.com/", + "https://instagram.com/longtext" ] def head(url, _query, _body, _headers) when url in @rich_media_mocks do