mirror of
https://git.pleroma.social/pleroma/pleroma.git
synced 2025-01-03 13:58:41 +00:00
Merge branch 'oban/rich-media-invalid' into 'develop'
Rework some Rich Media functionality for better error handling. See merge request pleroma/pleroma!4182
This commit is contained in:
commit
4544505761
8 changed files with 74 additions and 50 deletions
1
changelog.d/oban-rich-media-errors.fix
Normal file
1
changelog.d/oban-rich-media-errors.fix
Normal file
|
@ -0,0 +1 @@
|
||||||
|
Prevent Rich Media backfill jobs from retrying in cases where it is likely they will fail again.
|
|
@ -4,6 +4,7 @@
|
||||||
|
|
||||||
defmodule Pleroma.Web.RichMedia.Backfill do
|
defmodule Pleroma.Web.RichMedia.Backfill do
|
||||||
alias Pleroma.Web.RichMedia.Card
|
alias Pleroma.Web.RichMedia.Card
|
||||||
|
alias Pleroma.Web.RichMedia.Helpers
|
||||||
alias Pleroma.Web.RichMedia.Parser
|
alias Pleroma.Web.RichMedia.Parser
|
||||||
alias Pleroma.Web.RichMedia.Parser.TTL
|
alias Pleroma.Web.RichMedia.Parser.TTL
|
||||||
alias Pleroma.Workers.RichMediaWorker
|
alias Pleroma.Workers.RichMediaWorker
|
||||||
|
@ -16,8 +17,7 @@ defmodule Pleroma.Web.RichMedia.Backfill do
|
||||||
Pleroma.Web.ActivityPub.ActivityPub
|
Pleroma.Web.ActivityPub.ActivityPub
|
||||||
)
|
)
|
||||||
|
|
||||||
@spec run(map()) ::
|
@spec run(map()) :: :ok | Parser.parse_errors() | Helpers.get_errors()
|
||||||
:ok | {:error, {:invalid_metadata, any()} | :body_too_large | {:content, any()} | any()}
|
|
||||||
def run(%{"url" => url} = args) do
|
def run(%{"url" => url} = args) do
|
||||||
url_hash = Card.url_to_hash(url)
|
url_hash = Card.url_to_hash(url)
|
||||||
|
|
||||||
|
@ -33,22 +33,16 @@ defmodule Pleroma.Web.RichMedia.Backfill do
|
||||||
end
|
end
|
||||||
|
|
||||||
warm_cache(url_hash, card)
|
warm_cache(url_hash, card)
|
||||||
|
:ok
|
||||||
|
|
||||||
{:error, {:invalid_metadata, fields}} ->
|
{:error, type} = error
|
||||||
Logger.debug("Rich media incomplete or invalid metadata for #{url}: #{inspect(fields)}")
|
when type in [:invalid_metadata, :body_too_large, :content_type, :validate] ->
|
||||||
negative_cache(url_hash)
|
negative_cache(url_hash)
|
||||||
|
error
|
||||||
|
|
||||||
{:error, :body_too_large} ->
|
{:error, type} = error
|
||||||
Logger.error("Rich media error for #{url}: :body_too_large")
|
when type in [:get, :head] ->
|
||||||
negative_cache(url_hash)
|
error
|
||||||
|
|
||||||
{:error, {:content_type, type}} ->
|
|
||||||
Logger.debug("Rich media error for #{url}: :content_type is #{type}")
|
|
||||||
negative_cache(url_hash)
|
|
||||||
|
|
||||||
e ->
|
|
||||||
Logger.debug("Rich media error for #{url}: #{inspect(e)}")
|
|
||||||
{:error, e}
|
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -5,26 +5,38 @@
|
||||||
defmodule Pleroma.Web.RichMedia.Helpers do
|
defmodule Pleroma.Web.RichMedia.Helpers do
|
||||||
alias Pleroma.Config
|
alias Pleroma.Config
|
||||||
|
|
||||||
|
require Logger
|
||||||
|
|
||||||
|
@type get_errors :: {:error, :body_too_large | :content_type | :head | :get}
|
||||||
|
|
||||||
|
@spec rich_media_get(String.t()) :: {:ok, String.t()} | get_errors()
|
||||||
def rich_media_get(url) do
|
def rich_media_get(url) do
|
||||||
headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}]
|
headers = [{"user-agent", Pleroma.Application.user_agent() <> "; Bot"}]
|
||||||
|
|
||||||
head_check =
|
with {_, {:ok, %Tesla.Env{status: 200, headers: headers}}} <-
|
||||||
case Pleroma.HTTP.head(url, headers, http_options()) do
|
{:head, Pleroma.HTTP.head(url, headers, http_options())},
|
||||||
# If the HEAD request didn't reach the server for whatever reason,
|
{_, :ok} <- {:content_type, check_content_type(headers)},
|
||||||
# we assume the GET that comes right after won't either
|
{_, :ok} <- {:content_length, check_content_length(headers)},
|
||||||
{:error, _} = e ->
|
{_, {:ok, %Tesla.Env{status: 200, body: body}}} <-
|
||||||
e
|
{:get, Pleroma.HTTP.get(url, headers, http_options())} do
|
||||||
|
{:ok, body}
|
||||||
|
else
|
||||||
|
{:head, _} ->
|
||||||
|
Logger.debug("Rich media error for #{url}: HTTP HEAD failed")
|
||||||
|
{:error, :head}
|
||||||
|
|
||||||
{:ok, %Tesla.Env{status: 200, headers: headers}} ->
|
{:content_type, {_, type}} ->
|
||||||
with :ok <- check_content_type(headers),
|
Logger.debug("Rich media error for #{url}: content-type is #{type}")
|
||||||
:ok <- check_content_length(headers),
|
{:error, :content_type}
|
||||||
do: :ok
|
|
||||||
|
|
||||||
_ ->
|
{:content_length, {_, length}} ->
|
||||||
:ok
|
Logger.debug("Rich media error for #{url}: content-length is #{length}")
|
||||||
|
{:error, :body_too_large}
|
||||||
|
|
||||||
|
{:get, _} ->
|
||||||
|
Logger.debug("Rich media error for #{url}: HTTP GET failed")
|
||||||
|
{:error, :get}
|
||||||
end
|
end
|
||||||
|
|
||||||
with :ok <- head_check, do: Pleroma.HTTP.get(url, headers, http_options())
|
|
||||||
end
|
end
|
||||||
|
|
||||||
defp check_content_type(headers) do
|
defp check_content_type(headers) do
|
||||||
|
@ -32,7 +44,7 @@ defmodule Pleroma.Web.RichMedia.Helpers do
|
||||||
{_, content_type} ->
|
{_, content_type} ->
|
||||||
case Plug.Conn.Utils.media_type(content_type) do
|
case Plug.Conn.Utils.media_type(content_type) do
|
||||||
{:ok, "text", "html", _} -> :ok
|
{:ok, "text", "html", _} -> :ok
|
||||||
_ -> {:error, {:content_type, content_type}}
|
_ -> {:error, content_type}
|
||||||
end
|
end
|
||||||
|
|
||||||
_ ->
|
_ ->
|
||||||
|
@ -47,7 +59,7 @@ defmodule Pleroma.Web.RichMedia.Helpers do
|
||||||
{_, maybe_content_length} ->
|
{_, maybe_content_length} ->
|
||||||
case Integer.parse(maybe_content_length) do
|
case Integer.parse(maybe_content_length) do
|
||||||
{content_length, ""} when content_length <= max_body -> :ok
|
{content_length, ""} when content_length <= max_body -> :ok
|
||||||
{_, ""} -> {:error, :body_too_large}
|
{_, ""} -> {:error, maybe_content_length}
|
||||||
_ -> :ok
|
_ -> :ok
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -3,6 +3,7 @@
|
||||||
# SPDX-License-Identifier: AGPL-3.0-only
|
# SPDX-License-Identifier: AGPL-3.0-only
|
||||||
|
|
||||||
defmodule Pleroma.Web.RichMedia.Parser do
|
defmodule Pleroma.Web.RichMedia.Parser do
|
||||||
|
alias Pleroma.Web.RichMedia.Helpers
|
||||||
require Logger
|
require Logger
|
||||||
|
|
||||||
@config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
|
@config_impl Application.compile_env(:pleroma, [__MODULE__, :config_impl], Pleroma.Config)
|
||||||
|
@ -11,24 +12,26 @@ defmodule Pleroma.Web.RichMedia.Parser do
|
||||||
Pleroma.Config.get([:rich_media, :parsers])
|
Pleroma.Config.get([:rich_media, :parsers])
|
||||||
end
|
end
|
||||||
|
|
||||||
def parse(nil), do: nil
|
@type parse_errors :: {:error, :rich_media_disabled | :validate}
|
||||||
|
|
||||||
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
|
@spec parse(String.t()) ::
|
||||||
def parse(url) do
|
{:ok, map()} | parse_errors() | Helpers.get_errors()
|
||||||
|
def parse(url) when is_binary(url) do
|
||||||
with {_, true} <- {:config, @config_impl.get([:rich_media, :enabled])},
|
with {_, true} <- {:config, @config_impl.get([:rich_media, :enabled])},
|
||||||
:ok <- validate_page_url(url),
|
{_, :ok} <- {:validate, validate_page_url(url)},
|
||||||
{:ok, data} <- parse_url(url) do
|
{_, {:ok, data}} <- {:parse, parse_url(url)} do
|
||||||
data = Map.put(data, "url", url)
|
data = Map.put(data, "url", url)
|
||||||
{:ok, data}
|
{:ok, data}
|
||||||
else
|
else
|
||||||
{:config, _} -> {:error, :rich_media_disabled}
|
{:config, _} -> {:error, :rich_media_disabled}
|
||||||
e -> e
|
{:validate, _} -> {:error, :validate}
|
||||||
|
{:parse, error} -> error
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
defp parse_url(url) do
|
defp parse_url(url) do
|
||||||
with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),
|
with {:ok, body} <- Helpers.rich_media_get(url),
|
||||||
{:ok, html} <- Floki.parse_document(html) do
|
{:ok, html} <- Floki.parse_document(body) do
|
||||||
html
|
html
|
||||||
|> maybe_parse()
|
|> maybe_parse()
|
||||||
|> clean_parsed_data()
|
|> clean_parsed_data()
|
||||||
|
@ -50,8 +53,8 @@ defmodule Pleroma.Web.RichMedia.Parser do
|
||||||
{:ok, data}
|
{:ok, data}
|
||||||
end
|
end
|
||||||
|
|
||||||
defp check_parsed_data(data) do
|
defp check_parsed_data(_data) do
|
||||||
{:error, {:invalid_metadata, data}}
|
{:error, :invalid_metadata}
|
||||||
end
|
end
|
||||||
|
|
||||||
defp clean_parsed_data(data) do
|
defp clean_parsed_data(data) do
|
||||||
|
|
|
@ -22,7 +22,7 @@ defmodule Pleroma.Web.RichMedia.Parsers.OEmbed do
|
||||||
end
|
end
|
||||||
|
|
||||||
defp get_oembed_data(url) do
|
defp get_oembed_data(url) do
|
||||||
with {:ok, %Tesla.Env{body: json}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url) do
|
with {:ok, json} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url) do
|
||||||
Jason.decode(json)
|
Jason.decode(json)
|
||||||
end
|
end
|
||||||
end
|
end
|
||||||
|
|
|
@ -14,7 +14,21 @@ defmodule Pleroma.Workers.RichMediaWorker do
|
||||||
end
|
end
|
||||||
|
|
||||||
def perform(%Job{args: %{"op" => "backfill", "url" => _url} = args}) do
|
def perform(%Job{args: %{"op" => "backfill", "url" => _url} = args}) do
|
||||||
Backfill.run(args)
|
case Backfill.run(args) do
|
||||||
|
:ok ->
|
||||||
|
:ok
|
||||||
|
|
||||||
|
{:error, type}
|
||||||
|
when type in [:invalid_metadata, :body_too_large, :content_type, :validate] ->
|
||||||
|
{:cancel, type}
|
||||||
|
|
||||||
|
{:error, type}
|
||||||
|
when type in [:get, :head] ->
|
||||||
|
{:error, type}
|
||||||
|
|
||||||
|
error ->
|
||||||
|
{:error, error}
|
||||||
|
end
|
||||||
end
|
end
|
||||||
|
|
||||||
@impl Oban.Worker
|
@impl Oban.Worker
|
||||||
|
|
|
@ -20,7 +20,7 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
|
||||||
end
|
end
|
||||||
|
|
||||||
test "doesn't just add a title" do
|
test "doesn't just add a title" do
|
||||||
assert {:error, {:invalid_metadata, _}} = Parser.parse("https://example.com/non-ogp")
|
assert {:error, :invalid_metadata} = Parser.parse("https://example.com/non-ogp")
|
||||||
end
|
end
|
||||||
|
|
||||||
test "parses ogp" do
|
test "parses ogp" do
|
||||||
|
@ -96,7 +96,7 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
|
||||||
end
|
end
|
||||||
|
|
||||||
test "returns error if getting page was not successful" do
|
test "returns error if getting page was not successful" do
|
||||||
assert {:error, :overload} = Parser.parse("https://example.com/error")
|
assert {:error, :get} = Parser.parse("https://example.com/error")
|
||||||
end
|
end
|
||||||
|
|
||||||
test "does a HEAD request to check if the body is too large" do
|
test "does a HEAD request to check if the body is too large" do
|
||||||
|
@ -104,17 +104,17 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
|
||||||
end
|
end
|
||||||
|
|
||||||
test "does a HEAD request to check if the body is html" do
|
test "does a HEAD request to check if the body is html" do
|
||||||
assert {:error, {:content_type, _}} = Parser.parse("https://example.com/pdf-file")
|
assert {:error, :content_type} = Parser.parse("https://example.com/pdf-file")
|
||||||
end
|
end
|
||||||
|
|
||||||
test "refuses to crawl incomplete URLs" do
|
test "refuses to crawl incomplete URLs" do
|
||||||
url = "example.com/ogp"
|
url = "example.com/ogp"
|
||||||
assert :error == Parser.parse(url)
|
assert {:error, :validate} == Parser.parse(url)
|
||||||
end
|
end
|
||||||
|
|
||||||
test "refuses to crawl malformed URLs" do
|
test "refuses to crawl malformed URLs" do
|
||||||
url = "example.com[]/ogp"
|
url = "example.com[]/ogp"
|
||||||
assert :error == Parser.parse(url)
|
assert {:error, :validate} == Parser.parse(url)
|
||||||
end
|
end
|
||||||
|
|
||||||
test "refuses to crawl URLs of private network from posts" do
|
test "refuses to crawl URLs of private network from posts" do
|
||||||
|
@ -126,7 +126,7 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
|
||||||
"https://pleroma.local/notice/9kCP7V"
|
"https://pleroma.local/notice/9kCP7V"
|
||||||
]
|
]
|
||||||
|> Enum.each(fn url ->
|
|> Enum.each(fn url ->
|
||||||
assert :error == Parser.parse(url)
|
assert {:error, :validate} == Parser.parse(url)
|
||||||
end)
|
end)
|
||||||
end
|
end
|
||||||
|
|
||||||
|
|
|
@ -1724,7 +1724,7 @@ defmodule HttpRequestMock do
|
||||||
]
|
]
|
||||||
|
|
||||||
def head(url, _query, _body, _headers) when url in @rich_media_mocks do
|
def head(url, _query, _body, _headers) when url in @rich_media_mocks do
|
||||||
{:ok, %Tesla.Env{status: 404, body: ""}}
|
{:ok, %Tesla.Env{status: 200, body: ""}}
|
||||||
end
|
end
|
||||||
|
|
||||||
def head("https://example.com/pdf-file", _, _, _) do
|
def head("https://example.com/pdf-file", _, _, _) do
|
||||||
|
|
Loading…
Reference in a new issue