From 9f2319e50dc0516bde4bfa3b117ec4792e553bd2 Mon Sep 17 00:00:00 2001 From: Mark Felder Date: Tue, 6 Feb 2024 16:54:52 -0500 Subject: [PATCH] RichMedia.Helpers: move the validate_page_url/1 function to the Parser module This will ensure that the page validation happens in Parser.parse/1 so it can be called from anywhere and still filter invalid URLs. --- changelog.d/rich_media_tests.skip | 0 lib/pleroma/web/rich_media/helpers.ex | 43 ------------------------- lib/pleroma/web/rich_media/parser.ex | 46 ++++++++++++++++++++++++++- 3 files changed, 45 insertions(+), 44 deletions(-) create mode 100644 changelog.d/rich_media_tests.skip diff --git a/changelog.d/rich_media_tests.skip b/changelog.d/rich_media_tests.skip new file mode 100644 index 000000000..e69de29bb diff --git a/lib/pleroma/web/rich_media/helpers.ex b/lib/pleroma/web/rich_media/helpers.ex index 1501776d9..a711dc436 100644 --- a/lib/pleroma/web/rich_media/helpers.ex +++ b/lib/pleroma/web/rich_media/helpers.ex @@ -18,53 +18,10 @@ defmodule Pleroma.Web.RichMedia.Helpers do recv_timeout: 2_000 ] - @spec validate_page_url(URI.t() | binary()) :: :ok | :error - defp validate_page_url(page_url) when is_binary(page_url) do - validate_tld = @config_impl.get([Pleroma.Formatter, :validate_tld]) - - page_url - |> Linkify.Parser.url?(validate_tld: validate_tld) - |> parse_uri(page_url) - end - - defp validate_page_url(%URI{host: host, scheme: "https"}) do - cond do - Linkify.Parser.ip?(host) -> - :error - - host in @config_impl.get([:rich_media, :ignore_hosts], []) -> - :error - - get_tld(host) in @config_impl.get([:rich_media, :ignore_tld], []) -> - :error - - true -> - :ok - end - end - - defp validate_page_url(_), do: :error - - defp parse_uri(true, url) do - url - |> URI.parse() - |> validate_page_url - end - - defp parse_uri(_, _), do: :error - - defp get_tld(host) do - host - |> String.split(".") - |> Enum.reverse() - |> hd - end - def fetch_data_for_object(object) do with true <- @config_impl.get([:rich_media, :enabled]), {:ok, page_url} <- HTML.extract_first_external_url_from_object(object), - :ok <- validate_page_url(page_url), {:ok, rich_media} <- Parser.parse(page_url) do %{page_url: page_url, rich_media: rich_media} else diff --git a/lib/pleroma/web/rich_media/parser.ex b/lib/pleroma/web/rich_media/parser.ex index a791b2bab..a73fbc4b9 100644 --- a/lib/pleroma/web/rich_media/parser.ex +++ b/lib/pleroma/web/rich_media/parser.ex @@ -6,6 +6,7 @@ defmodule Pleroma.Web.RichMedia.Parser do require Logger @cachex Pleroma.Config.get([:cachex, :provider], Cachex) + @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config) defp parsers do Pleroma.Config.get([:rich_media, :parsers]) @@ -15,7 +16,8 @@ defmodule Pleroma.Web.RichMedia.Parser do @spec parse(String.t()) :: {:ok, map()} | {:error, any()} def parse(url) do - with {:ok, data} <- get_cached_or_parse(url), + with :ok <- validate_page_url(url), + {:ok, data} <- get_cached_or_parse(url), {:ok, _} <- set_ttl_based_on_image(data, url) do {:ok, data} end @@ -161,4 +163,46 @@ defmodule Pleroma.Web.RichMedia.Parser do end) |> Map.new() end + + @spec validate_page_url(URI.t() | binary()) :: :ok | :error + defp validate_page_url(page_url) when is_binary(page_url) do + validate_tld = @config_impl.get([Pleroma.Formatter, :validate_tld]) + + page_url + |> Linkify.Parser.url?(validate_tld: validate_tld) + |> parse_uri(page_url) + end + + defp validate_page_url(%URI{host: host, scheme: "https"}) do + cond do + Linkify.Parser.ip?(host) -> + :error + + host in @config_impl.get([:rich_media, :ignore_hosts], []) -> + :error + + get_tld(host) in @config_impl.get([:rich_media, :ignore_tld], []) -> + :error + + true -> + :ok + end + end + + defp validate_page_url(_), do: :error + + defp parse_uri(true, url) do + url + |> URI.parse() + |> validate_page_url + end + + defp parse_uri(_, _), do: :error + + defp get_tld(host) do + host + |> String.split(".") + |> Enum.reverse() + |> hd + end end