Merge branch 'fix/2047-rich-media-parser' into 'develop'

RichMedia parser fix

Closes #2047

See merge request pleroma/pleroma!2941
This commit is contained in:
rinpatch 2020-09-02 09:38:43 +00:00
commit 47ff425cfd
4 changed files with 49 additions and 38 deletions

View file

@ -3,6 +3,8 @@
# SPDX-License-Identifier: AGPL-3.0-only # SPDX-License-Identifier: AGPL-3.0-only
defmodule Pleroma.Web.RichMedia.Parser do defmodule Pleroma.Web.RichMedia.Parser do
require Logger
defp parsers do defp parsers do
Pleroma.Config.get([:rich_media, :parsers]) Pleroma.Config.get([:rich_media, :parsers])
end end
@ -10,18 +12,19 @@ defmodule Pleroma.Web.RichMedia.Parser do
def parse(nil), do: {:error, "No URL provided"} def parse(nil), do: {:error, "No URL provided"}
if Pleroma.Config.get(:env) == :test do if Pleroma.Config.get(:env) == :test do
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
def parse(url), do: parse_url(url) def parse(url), do: parse_url(url)
else else
@spec parse(String.t()) :: {:ok, map()} | {:error, any()}
def parse(url) do def parse(url) do
try do Cachex.fetch!(:rich_media_cache, url, fn _ ->
Cachex.fetch!(:rich_media_cache, url, fn _ -> with {:ok, data} <- parse_url(url) do
{:commit, parse_url(url)} {:commit, {:ok, data}}
end) else
|> set_ttl_based_on_image(url) error -> {:ignore, error}
rescue end
e -> end)
{:error, "Cachex error: #{inspect(e)}"} |> set_ttl_based_on_image(url)
end
end end
end end
@ -47,9 +50,11 @@ defmodule Pleroma.Web.RichMedia.Parser do
config :pleroma, :rich_media, config :pleroma, :rich_media,
ttl_setters: [MyModule] ttl_setters: [MyModule]
""" """
@spec set_ttl_based_on_image({:ok, map()} | {:error, any()}, String.t()) ::
{:ok, map()} | {:error, any()}
def set_ttl_based_on_image({:ok, data}, url) do def set_ttl_based_on_image({:ok, data}, url) do
with {:ok, nil} <- Cachex.ttl(:rich_media_cache, url), with {:ok, nil} <- Cachex.ttl(:rich_media_cache, url),
ttl when is_number(ttl) <- get_ttl_from_image(data, url) do {:ok, ttl} when is_number(ttl) <- get_ttl_from_image(data, url) do
Cachex.expire_at(:rich_media_cache, url, ttl * 1000) Cachex.expire_at(:rich_media_cache, url, ttl * 1000)
{:ok, data} {:ok, data}
else else
@ -58,8 +63,14 @@ defmodule Pleroma.Web.RichMedia.Parser do
end end
end end
def set_ttl_based_on_image({:error, _} = error, _) do
Logger.error("parsing error: #{inspect(error)}")
error
end
defp get_ttl_from_image(data, url) do defp get_ttl_from_image(data, url) do
Pleroma.Config.get([:rich_media, :ttl_setters]) [:rich_media, :ttl_setters]
|> Pleroma.Config.get()
|> Enum.reduce({:ok, nil}, fn |> Enum.reduce({:ok, nil}, fn
module, {:ok, _ttl} -> module, {:ok, _ttl} ->
module.ttl(data, url) module.ttl(data, url)
@ -70,23 +81,16 @@ defmodule Pleroma.Web.RichMedia.Parser do
end end
defp parse_url(url) do defp parse_url(url) do
try do with {:ok, %Tesla.Env{body: html}} <- Pleroma.Web.RichMedia.Helpers.rich_media_get(url),
{:ok, %Tesla.Env{body: html}} = Pleroma.Web.RichMedia.Helpers.rich_media_get(url) {:ok, html} <- Floki.parse_document(html) do
html html
|> parse_html()
|> maybe_parse() |> maybe_parse()
|> Map.put("url", url) |> Map.put("url", url)
|> clean_parsed_data() |> clean_parsed_data()
|> check_parsed_data() |> check_parsed_data()
rescue
e ->
{:error, "Parsing error: #{inspect(e)} #{inspect(__STACKTRACE__)}"}
end end
end end
defp parse_html(html), do: Floki.parse_document!(html)
defp maybe_parse(html) do defp maybe_parse(html) do
Enum.reduce_while(parsers(), %{}, fn parser, acc -> Enum.reduce_while(parsers(), %{}, fn parser, acc ->
case parser.parse(html, acc) do case parser.parse(html, acc) do

View file

@ -10,20 +10,15 @@ defmodule Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl do
|> parse_query_params() |> parse_query_params()
|> format_query_params() |> format_query_params()
|> get_expiration_timestamp() |> get_expiration_timestamp()
else
{:error, "Not aws signed url #{inspect(image)}"}
end end
end end
defp is_aws_signed_url(""), do: nil defp is_aws_signed_url(image) when is_binary(image) and image != "" do
defp is_aws_signed_url(nil), do: nil
defp is_aws_signed_url(image) when is_binary(image) do
%URI{host: host, query: query} = URI.parse(image) %URI{host: host, query: query} = URI.parse(image)
if String.contains?(host, "amazonaws.com") and String.contains?(query, "X-Amz-Expires") do String.contains?(host, "amazonaws.com") and String.contains?(query, "X-Amz-Expires")
image
else
nil
end
end end
defp is_aws_signed_url(_), do: nil defp is_aws_signed_url(_), do: nil
@ -46,6 +41,6 @@ defmodule Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl do
|> Map.get("X-Amz-Date") |> Map.get("X-Amz-Date")
|> Timex.parse("{ISO:Basic:Z}") |> Timex.parse("{ISO:Basic:Z}")
Timex.to_unix(date) + String.to_integer(Map.get(params, "X-Amz-Expires")) {:ok, Timex.to_unix(date) + String.to_integer(Map.get(params, "X-Amz-Expires"))}
end end
end end

View file

@ -21,7 +21,7 @@ defmodule Pleroma.Web.RichMedia.TTL.AwsSignedUrlTest do
expire_time = expire_time =
Timex.parse!(timestamp, "{ISO:Basic:Z}") |> Timex.to_unix() |> Kernel.+(valid_till) Timex.parse!(timestamp, "{ISO:Basic:Z}") |> Timex.to_unix() |> Kernel.+(valid_till)
assert expire_time == Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl.ttl(metadata, url) assert {:ok, expire_time} == Pleroma.Web.RichMedia.Parser.TTL.AwsSignedUrl.ttl(metadata, url)
end end
test "s3 signed url is parsed and correct ttl is set for rich media" do test "s3 signed url is parsed and correct ttl is set for rich media" do

View file

@ -5,6 +5,8 @@
defmodule Pleroma.Web.RichMedia.ParserTest do defmodule Pleroma.Web.RichMedia.ParserTest do
use ExUnit.Case, async: true use ExUnit.Case, async: true
alias Pleroma.Web.RichMedia.Parser
setup do setup do
Tesla.Mock.mock(fn Tesla.Mock.mock(fn
%{ %{
@ -48,23 +50,29 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
%{method: :get, url: "http://example.com/empty"} -> %{method: :get, url: "http://example.com/empty"} ->
%Tesla.Env{status: 200, body: "hello"} %Tesla.Env{status: 200, body: "hello"}
%{method: :get, url: "http://example.com/malformed"} ->
%Tesla.Env{status: 200, body: File.read!("test/fixtures/rich_media/malformed-data.html")}
%{method: :get, url: "http://example.com/error"} ->
{:error, :overload}
end) end)
:ok :ok
end end
test "returns error when no metadata present" do test "returns error when no metadata present" do
assert {:error, _} = Pleroma.Web.RichMedia.Parser.parse("http://example.com/empty") assert {:error, _} = Parser.parse("http://example.com/empty")
end end
test "doesn't just add a title" do test "doesn't just add a title" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/non-ogp") == assert Parser.parse("http://example.com/non-ogp") ==
{:error, {:error,
"Found metadata was invalid or incomplete: %{\"url\" => \"http://example.com/non-ogp\"}"} "Found metadata was invalid or incomplete: %{\"url\" => \"http://example.com/non-ogp\"}"}
end end
test "parses ogp" do test "parses ogp" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/ogp") == assert Parser.parse("http://example.com/ogp") ==
{:ok, {:ok,
%{ %{
"image" => "http://ia.media-imdb.com/images/rock.jpg", "image" => "http://ia.media-imdb.com/images/rock.jpg",
@ -77,7 +85,7 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
end end
test "falls back to <title> when ogp:title is missing" do test "falls back to <title> when ogp:title is missing" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/ogp-missing-title") == assert Parser.parse("http://example.com/ogp-missing-title") ==
{:ok, {:ok,
%{ %{
"image" => "http://ia.media-imdb.com/images/rock.jpg", "image" => "http://ia.media-imdb.com/images/rock.jpg",
@ -90,7 +98,7 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
end end
test "parses twitter card" do test "parses twitter card" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/twitter-card") == assert Parser.parse("http://example.com/twitter-card") ==
{:ok, {:ok,
%{ %{
"card" => "summary", "card" => "summary",
@ -103,7 +111,7 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
end end
test "parses OEmbed" do test "parses OEmbed" do
assert Pleroma.Web.RichMedia.Parser.parse("http://example.com/oembed") == assert Parser.parse("http://example.com/oembed") ==
{:ok, {:ok,
%{ %{
"author_name" => "bees", "author_name" => "bees",
@ -132,6 +140,10 @@ defmodule Pleroma.Web.RichMedia.ParserTest do
end end
test "rejects invalid OGP data" do test "rejects invalid OGP data" do
assert {:error, _} = Pleroma.Web.RichMedia.Parser.parse("http://example.com/malformed") assert {:error, _} = Parser.parse("http://example.com/malformed")
end
test "returns error if getting page was not successful" do
assert {:error, :overload} = Parser.parse("http://example.com/error")
end end
end end