Merge branch 'oban/rich-media-invalid' into 'develop'

Rework some Rich Media functionality for better error handling

See merge request pleroma/pleroma!4182
This commit is contained in:
feld 2024-07-17 17:50:17 +00:00
commit 4544505761
8 changed files with 74 additions and 50 deletions

View file

@ -0,0 +1 @@
Prevent Rich Media backfill jobs from retrying in cases where it is likely they will fail again.

View file

@ -4,6 +4,7 @@
defmodule Pleroma.Web.RichMedia.Backfill do defmodule Pleroma.Web.RichMedia.Backfill do
alias Pleroma.Web.RichMedia.Card alias Pleroma.Web.RichMedia.Card
alias Pleroma.Web.RichMedia.Helpers
alias Pleroma.Web.RichMedia.Parser alias Pleroma.Web.RichMedia.Parser
alias Pleroma.Web.RichMedia.Parser.TTL alias Pleroma.Web.RichMedia.Parser.TTL
alias Pleroma.Workers.RichMediaWorker alias Pleroma.Workers.RichMediaWorker
@ -16,8 +17,7 @@ defmodule Pleroma.Web.RichMedia.Backfill do
Pleroma.Web.ActivityPub.ActivityPub Pleroma.Web.ActivityPub.ActivityPub
) )
@spec run(map()) :: @spec run(map()) :: :ok | Parser.parse_errors() | Helpers.get_errors()
:ok | {:error, {:invalid_metadata, any()} | :body_too_large | {:content, any()} | any()}
def run(%{"url" => url} = args) do def run(%{"url" => url} = args) do
url_hash = Card.url_to_hash(url) url_hash = Card.url_to_hash(url)
@ -33,22 +33,16 @@ defmodule Pleroma.Web.RichMedia.Backfill do
end end
warm_cache(url_hash, card) warm_cache(url_hash, card)
:ok
{:error, {:invalid_metadata, fields}} -> {:error, type} = error
Logger.debug("Rich media incomplete or invalid metadata for #{url}: #{inspect(fields)}") when type in [:invalid_metadata, :body_too_large, :content_type, :validate] ->
negative_cache(url_hash) negative_cache(url_hash)
error
{:error, :body_too_large} -> {:error, type} = error
Logger.error("Rich media error for #{url}: :body_too_large") when type in [:get, :head] ->
negative_cache(url_hash) error
{:error, {:content_type, type}} ->
Logger.debug("Rich media error for #{url}: :content_type is #{type}")
negative_cache(url_hash)
e ->
Logger.debug("Rich media error for #{url}: #{inspect(e)}")
{:error, e}
end end
end end

View file

@ -5,26 +5,38 @@
defmodule Pleroma.Web.RichMedia.Helpers do defmodule Pleroma.Web.RichMedia.Helpers do
alias Pleroma.Config alias Pleroma.Config
require Logger
@type get_errors :: {:error, :body_too_large | :content_type | :head | :get}
@spec rich_media_get(String.t()) :: {:ok, String.t()} | get_errors()
def rich_media_get(url) do def rich_media_get(url) do
headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}] headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}]
head_check = with {_, {:ok, %Tesla.Env{status: 200, headers: headers}}} <-
case Pleroma.HTTP.head(url, headers, http_options()) do {:head, Pleroma.HTTP.head(url, headers, http_options())},
# If the HEAD request didn't reach the server for whatever reason, {_, :ok} <- {:content_type, check_content_type(headers)},
# we assume the GET that comes right after won't either {_, :ok} <- {:content_length, check_content_length(headers)},
{:error, _} = e -> {_, {:ok, %Tesla.Env{status: 200, body: body}}} <-
e {:get, Pleroma.HTTP.get(url, headers, http_options())} do
{:ok, body}
else
{:head, _} ->
Logger.debug("Rich media error for #{url}: HTTP HEAD failed")
{:error, :head}
{:ok, %Tesla.Env{status: 200, headers: headers}} -> {:content_type, {_, type}} ->
with :ok <- check_content_type(headers), Logger.debug("Rich media error for #{url}: content-type is #{type}")
:ok <- check_content_length(headers), {:error, :content_type}
do: :ok
_ -> {:content_length, {_, length}} ->
:ok Logger.debug("Rich media error for #{url}: content-length is #{length}")
{:error, :body_too_large}
{:get, _} ->
Logger.debug("Rich media error for #{url}: HTTP GET failed")
{:error, :get}
end end
with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, http_options())
end end
defp check_content_type(headers) do defp check_content_type(headers) do
@ -32,7 +44,7 @@ defmodule Pleroma.Web.RichMedia.Helpers do
{_, content_type} -> {_, content_type} ->
case Plug.Conn.Utils.media_type(content_type) do case Plug.Conn.Utils.media_type(content_type) do
{:ok, "text", "html", _} -> :ok {:ok, "text", "html", _} -> :ok
_ -> {:error, {:content_type, content_type}} _ -> {:error, content_type}
end end
_ -> _ ->
@ -47,7 +59,7 @@ defmodule Pleroma.Web.RichMedia.Helpers do
{_, maybe_content_length} -> {_, maybe_content_length} ->
case Integer.parse(maybe_content_length) do case Integer.parse(maybe_content_length) do
{content_length, ""} when content_length <= max_body -> :ok {content_length, ""} when content_length <= max_body -> :ok
{_, ""} -> {:error, :body_too_large} {_, ""} -> {:error, maybe_content_length}
_ -> :ok _ -> :ok
end end

View file

@ -3,6 +3,7 @@
# SPDX-License-Identifier: AGPL-3.0-only # SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.Parser do defmodule Pleroma.Web.RichMedia.Parser do
alias Pleroma.Web.RichMedia.Helpers
require Logger require Logger
@config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config) @config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
@ -11,24 +12,26 @@ defmodule Pleroma.Web.RichMedia.Parser do
Pleroma.Config.get([:rich_media, :parsers]) Pleroma.Config.get([:rich_media, :parsers])
end end
def parse(nil), do: nil @type parse_errors :: {:error, :rich_media_disabled | :validate}
@spec parse(String.t()) :: {:ok, map()} | {:error, any()} @spec parse(String.t()) ::
def parse(url) do {:ok, map()} | parse_errors() | Helpers.get_errors()
def parse(url) when is_binary(url) do
with {_, true} <- {:config, @config_impl.get([:rich_media, :enabled])}, with {_, true} <- {:config, @config_impl.get([:rich_media, :enabled])},
:ok <- validate_page_url(url), {_, :ok} <- {:validate, validate_page_url(url)},
{:ok, data} <- parse_url(url) do {_, {:ok, data}} <- {:parse, parse_url(url)} do
data = Map.put(data, "url", url) data = Map.put(data, "url", url)
{:ok, data} {:ok, data}
else else
{:config, _} -> {:error, :rich_media_disabled} {:config, _} -> {:error, :rich_media_disabled}
e -> e {:validate, _} -> {:error, :validate}
{:parse, error} -> error
end end
end end
defp parse_url(url) do defp parse_url(url) do
with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url), with {:ok, body} <- Helpers.rich_media_get(url),
{:ok, html} <- Floki.parse_document(html) do {:ok, html} <- Floki.parse_document(body) do
html html
|> maybe_parse() |> maybe_parse()
|> clean_parsed_data() |> clean_parsed_data()
@ -50,8 +53,8 @@ defmodule Pleroma.Web.RichMedia.Parser do
{:ok, data} {:ok, data}
end end
defp check_parsed_data(data) do defp check_parsed_data(_data) do
{:error, {:invalid_metadata, data}} {:error, :invalid_metadata}
end end
defp clean_parsed_data(data) do defp clean_parsed_data(data) do

View file

@ -22,7 +22,7 @@ defmodule Pleroma.Web.RichMedia.Parsers.OEmbed do
end end
defp get_oembed_data(url) do defp get_oembed_data(url) do
with {:ok, %Tesla.Env{body: json}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url) do with {:ok, json} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url) do
Jason.decode(json) Jason.decode(json)
end end
end end

View file

@ -14,7 +14,21 @@ defmodule Pleroma.Workers.RichMediaWorker do
end end
def perform(%Job{args: %{"op" => "backfill", "url" => _url} = args}) do def perform(%Job{args: %{"op" => "backfill", "url" => _url} = args}) do
Backfill.run(args) case Backfill.run(args) do
:ok ->
:ok
{:error, type}
when type in [:invalid_metadata, :body_too_large, :content_type, :validate] ->
{:cancel, type}
{:error, type}
when type in [:get, :head] ->
{:error, type}
error ->
{:error, error}
end
end end
@impl Oban.Worker @impl Oban.Worker

View file

@ -20,7 +20,7 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
end end
test "doesn't just add a title" do test "doesn't just add a title" do
assert {:error, {:invalid_metadata, _}} = Parser.parse("https://example.com/non-ogp") assert {:error, :invalid_metadata} = Parser.parse("https://example.com/non-ogp")
end end
test "parses ogp" do test "parses ogp" do
@ -96,7 +96,7 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
end end
test "returns error if getting page was not successful" do test "returns error if getting page was not successful" do
assert {:error, :overload} = Parser.parse("https://example.com/error") assert {:error, :get} = Parser.parse("https://example.com/error")
end end
test "does a HEAD request to check if the body is too large" do test "does a HEAD request to check if the body is too large" do
@ -104,17 +104,17 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
end end
test "does a HEAD request to check if the body is html" do test "does a HEAD request to check if the body is html" do
assert {:error, {:content_type, _}} = Parser.parse("https://example.com/pdf-file") assert {:error, :content_type} = Parser.parse("https://example.com/pdf-file")
end end
test "refuses to crawl incomplete URLs" do test "refuses to crawl incomplete URLs" do
url = "example.com/ogp" url = "example.com/ogp"
assert :error == Parser.parse(url) assert {:error, :validate} == Parser.parse(url)
end end
test "refuses to crawl malformed URLs" do test "refuses to crawl malformed URLs" do
url = "example.com[]/ogp" url = "example.com[]/ogp"
assert :error == Parser.parse(url) assert {:error, :validate} == Parser.parse(url)
end end
test "refuses to crawl URLs of private network from posts" do test "refuses to crawl URLs of private network from posts" do
@ -126,7 +126,7 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
"https://pleroma.local/notice/9kCP7V" "https://pleroma.local/notice/9kCP7V"
] ]
|> Enum.each(fn url -> |> Enum.each(fn url ->
assert :error == Parser.parse(url) assert {:error, :validate} == Parser.parse(url)
end) end)
end end

View file

@ -1724,7 +1724,7 @@ defmodule HttpRequestMock do
] ]
def head(url, _query, _body, _headers) when url in @rich_media_mocks do def head(url, _query, _body, _headers) when url in @rich_media_mocks do
{:ok, %Tesla.Env{status: 404, body: ""}} {:ok, %Tesla.Env{status: 200, body: ""}}
end end
def head("https://example.com/pdf-file", _, _, _) do def head("https://example.com/pdf-file", _, _, _) do