Merge branch 'feat/floki-fasthtml' into 'develop'

Make Floki use fast_html

See merge request pleroma/pleroma!2194
This commit is contained in:
lain 2020-02-11 13:22:35 +00:00
commit 3fee859b60
9 changed files with 35 additions and 18 deletions

View file

@ -612,6 +612,8 @@ config :pleroma, :modules, runtime_dir: "instance/modules"
config :pleroma, configurable_from_database: false config :pleroma, configurable_from_database: false
config :floki, :html_parser, Floki.HTMLParser.FastHtml
# Import environment specific config. This must remain at the bottom # Import environment specific config. This must remain at the bottom
# of this file so it overrides the configuration defined above. # of this file so it overrides the configuration defined above.
import_config "#{Mix.env()}.exs" import_config "#{Mix.env()}.exs"

View file

@ -108,6 +108,7 @@ defmodule Pleroma.HTML do
Cachex.fetch!(:scrubber_cache, key, fn _key -> Cachex.fetch!(:scrubber_cache, key, fn _key ->
result = result =
content content
|> Floki.parse_fragment!()
|> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"]") |> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"]")
|> Floki.attribute("a", "href") |> Floki.attribute("a", "href")
|> Enum.at(0) |> Enum.at(0)

View file

@ -17,6 +17,7 @@ defmodule Pleroma.Web.ActivityPub.MRF.AntiLinkSpamPolicy do
# does the post contain links? # does the post contain links?
defp contains_links?(%{"content" => content} = _object) do defp contains_links?(%{"content" => content} = _object) do
content content
|> Floki.parse_fragment!()
|> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"],a.zrl") |> Floki.filter_out("a.mention,a.hashtag,a[rel~=\"tag\"],a.zrl")
|> Floki.attribute("a", "href") |> Floki.attribute("a", "href")
|> length() > 0 |> length() > 0

View file

@ -8,8 +8,10 @@ defmodule Pleroma.Web.Metadata.Providers.RelMe do
@impl Provider @impl Provider
def build_tags(%{user: user}) do def build_tags(%{user: user}) do
(Floki.attribute(user.bio, "link[rel~=me]", "href") ++ bio_tree = Floki.parse_fragment!(user.bio)
Floki.attribute(user.bio, "a[rel~=me]", "href"))
(Floki.attribute(bio_tree, "link[rel~=me]", "href") ++
Floki.attribute(bio_tree, "a[rel~=me]", "href"))
|> Enum.map(fn link -> |> Enum.map(fn link ->
{:link, [rel: "me", href: link], []} {:link, [rel: "me", href: link], []}
end) end)

View file

@ -27,9 +27,10 @@ defmodule Pleroma.Web.RelMe do
defp parse_url(url) do defp parse_url(url) do
with {:ok, %Tesla.Env{body: html, status: status}} when status in 200..299 <- with {:ok, %Tesla.Env{body: html, status: status}} when status in 200..299 <-
Pleroma.HTTP.get(url, [], adapter: @hackney_options), Pleroma.HTTP.get(url, [], adapter: @hackney_options),
{:ok, html_tree} <- Floki.parse_document(html),
data <- data <-
Floki.attribute(html, "link[rel~=me]", "href") ++ Floki.attribute(html_tree, "link[rel~=me]", "href") ++
Floki.attribute(html, "a[rel~=me]", "href") do Floki.attribute(html_tree, "a[rel~=me]", "href") do
{:ok, data} {:ok, data}
end end
rescue rescue

View file

@ -81,18 +81,18 @@ defmodule Pleroma.Web.RichMedia.Parser do
{:ok, %Tesla.Env{body: html}} = Pleroma.HTTP.get(url, [], adapter: @hackney_options) {:ok, %Tesla.Env{body: html}} = Pleroma.HTTP.get(url, [], adapter: @hackney_options)
html html
|> parse_html |> parse_html()
|> maybe_parse() |> maybe_parse()
|> Map.put(:url, url) |> Map.put(:url, url)
|> clean_parsed_data() |> clean_parsed_data()
|> check_parsed_data() |> check_parsed_data()
rescue rescue
e -> e ->
{:error, "Parsing error: #{inspect(e)}"} {:error, "Parsing error: #{inspect(e)} #{inspect(__STACKTRACE__)}"}
end end
end end
defp parse_html(html), do: Floki.parse(html) defp parse_html(html), do: Floki.parse_document!(html)
defp maybe_parse(html) do defp maybe_parse(html) do
Enum.reduce_while(parsers(), %{}, fn parser, acc -> Enum.reduce_while(parsers(), %{}, fn parser, acc ->

View file

@ -139,7 +139,7 @@ defmodule Pleroma.Mixfile do
{:phoenix_swoosh, "~> 0.2"}, {:phoenix_swoosh, "~> 0.2"},
{:gen_smtp, "~> 0.13"}, {:gen_smtp, "~> 0.13"},
{:websocket_client, git: "https://github.com/jeremyong/websocket_client.git", only: :test}, {:websocket_client, git: "https://github.com/jeremyong/websocket_client.git", only: :test},
{:floki, "~> 0.23.0"}, {:floki, "~> 0.25"},
{:ex_syslogger, github: "slashmili/ex_syslogger", tag: "1.4.0"}, {:ex_syslogger, github: "slashmili/ex_syslogger", tag: "1.4.0"},
{:timex, "~> 3.5"}, {:timex, "~> 3.5"},
{:ueberauth, "~> 0.4"}, {:ueberauth, "~> 0.4"},

View file

@ -37,16 +37,16 @@
"ex_machina": {:hex, :ex_machina, "2.3.0", "92a5ad0a8b10ea6314b876a99c8c9e3f25f4dde71a2a835845b136b9adaf199a", [:mix], [{:ecto, "~> 2.2 or ~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}, {:ecto_sql, "~> 3.0", [hex: :ecto_sql, repo: "hexpm", optional: true]}], "hexpm"}, "ex_machina": {:hex, :ex_machina, "2.3.0", "92a5ad0a8b10ea6314b876a99c8c9e3f25f4dde71a2a835845b136b9adaf199a", [:mix], [{:ecto, "~> 2.2 or ~> 3.0", [hex: :ecto, repo: "hexpm", optional: true]}, {:ecto_sql, "~> 3.0", [hex: :ecto_sql, repo: "hexpm", optional: true]}], "hexpm"},
"ex_syslogger": {:git, "https://github.com/slashmili/ex_syslogger.git", "f3963399047af17e038897c69e20d552e6899e1d", [tag: "1.4.0"]}, "ex_syslogger": {:git, "https://github.com/slashmili/ex_syslogger.git", "f3963399047af17e038897c69e20d552e6899e1d", [tag: "1.4.0"]},
"excoveralls": {:hex, :excoveralls, "0.12.1", "a553c59f6850d0aff3770e4729515762ba7c8e41eedde03208182a8dc9d0ce07", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm"}, "excoveralls": {:hex, :excoveralls, "0.12.1", "a553c59f6850d0aff3770e4729515762ba7c8e41eedde03208182a8dc9d0ce07", [:mix], [{:hackney, "~> 1.0", [hex: :hackney, repo: "hexpm", optional: false]}, {:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}], "hexpm"},
"fast_html": {:hex, :fast_html, "0.99.4", "d80812664f0429607e1d880fba0ef04da87a2e4fa596701bcaae17953535695c", [:make, :mix], [], "hexpm"}, "fast_html": {:hex, :fast_html, "1.0.2", "b2a32022741699421e90762ce904cacb4faf12c10129acc3674262dd7fa5d2b6", [:make, :mix], [], "hexpm"},
"fast_sanitize": {:hex, :fast_sanitize, "0.1.4", "6c2e7203ca2f8275527a3021ba6e9d5d4ee213a47dc214a97c128737c9e56df1", [:mix], [{:fast_html, "~> 0.99", [hex: :fast_html, repo: "hexpm", optional: false]}, {:plug, "~> 1.8", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm"}, "fast_sanitize": {:hex, :fast_sanitize, "0.1.7", "2a7cd8734c88a2de6de55022104f8a3b87f1fdbe8bbf131d9049764b53d50d0d", [:mix], [{:fast_html, "~> 1.0", [hex: :fast_html, repo: "hexpm", optional: false]}, {:plug, "~> 1.8", [hex: :plug, repo: "hexpm", optional: false]}], "hexpm"},
"flake_id": {:hex, :flake_id, "0.1.0", "7716b086d2e405d09b647121a166498a0d93d1a623bead243e1f74216079ccb3", [:mix], [{:base62, "~> 1.2", [hex: :base62, repo: "hexpm", optional: false]}, {:ecto, ">= 2.0.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm"}, "flake_id": {:hex, :flake_id, "0.1.0", "7716b086d2e405d09b647121a166498a0d93d1a623bead243e1f74216079ccb3", [:mix], [{:base62, "~> 1.2", [hex: :base62, repo: "hexpm", optional: false]}, {:ecto, ">= 2.0.0", [hex: :ecto, repo: "hexpm", optional: true]}], "hexpm"},
"floki": {:hex, :floki, "0.23.1", "e100306ce7d8841d70a559748e5091542e2cfc67ffb3ade92b89a8435034dab1", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm"}, "floki": {:hex, :floki, "0.25.0", "b1c9ddf5f32a3a90b43b76f3386ca054325dc2478af020e87b5111c19f2284ac", [:mix], [{:html_entities, "~> 0.5.0", [hex: :html_entities, repo: "hexpm", optional: false]}], "hexpm"},
"gen_smtp": {:hex, :gen_smtp, "0.15.0", "9f51960c17769b26833b50df0b96123605a8024738b62db747fece14eb2fbfcc", [:rebar3], [], "hexpm"}, "gen_smtp": {:hex, :gen_smtp, "0.15.0", "9f51960c17769b26833b50df0b96123605a8024738b62db747fece14eb2fbfcc", [:rebar3], [], "hexpm"},
"gen_stage": {:hex, :gen_stage, "0.14.3", "d0c66f1c87faa301c1a85a809a3ee9097a4264b2edf7644bf5c123237ef732bf", [:mix], [], "hexpm"}, "gen_stage": {:hex, :gen_stage, "0.14.3", "d0c66f1c87faa301c1a85a809a3ee9097a4264b2edf7644bf5c123237ef732bf", [:mix], [], "hexpm"},
"gen_state_machine": {:hex, :gen_state_machine, "2.0.5", "9ac15ec6e66acac994cc442dcc2c6f9796cf380ec4b08267223014be1c728a95", [:mix], [], "hexpm"}, "gen_state_machine": {:hex, :gen_state_machine, "2.0.5", "9ac15ec6e66acac994cc442dcc2c6f9796cf380ec4b08267223014be1c728a95", [:mix], [], "hexpm"},
"gettext": {:hex, :gettext, "0.17.1", "8baab33482df4907b3eae22f719da492cee3981a26e649b9c2be1c0192616962", [:mix], [], "hexpm"}, "gettext": {:hex, :gettext, "0.17.1", "8baab33482df4907b3eae22f719da492cee3981a26e649b9c2be1c0192616962", [:mix], [], "hexpm"},
"hackney": {:hex, :hackney, "1.15.2", "07e33c794f8f8964ee86cebec1a8ed88db5070e52e904b8f12209773c1036085", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.5", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"}, "hackney": {:hex, :hackney, "1.15.2", "07e33c794f8f8964ee86cebec1a8ed88db5070e52e904b8f12209773c1036085", [:rebar3], [{:certifi, "2.5.1", [hex: :certifi, repo: "hexpm", optional: false]}, {:idna, "6.0.0", [hex: :idna, repo: "hexpm", optional: false]}, {:metrics, "1.0.1", [hex: :metrics, repo: "hexpm", optional: false]}, {:mimerl, "~>1.1", [hex: :mimerl, repo: "hexpm", optional: false]}, {:ssl_verify_fun, "1.1.5", [hex: :ssl_verify_fun, repo: "hexpm", optional: false]}], "hexpm"},
"html_entities": {:hex, :html_entities, "0.5.0", "40f5c5b9cbe23073b48a4e69c67b6c11974f623a76165e2b92d098c0e88ccb1d", [:mix], [], "hexpm"}, "html_entities": {:hex, :html_entities, "0.5.1", "1c9715058b42c35a2ab65edc5b36d0ea66dd083767bef6e3edb57870ef556549", [:mix], [], "hexpm"},
"html_sanitize_ex": {:hex, :html_sanitize_ex, "1.3.0", "f005ad692b717691203f940c686208aa3d8ffd9dd4bb3699240096a51fa9564e", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm"}, "html_sanitize_ex": {:hex, :html_sanitize_ex, "1.3.0", "f005ad692b717691203f940c686208aa3d8ffd9dd4bb3699240096a51fa9564e", [:mix], [{:mochiweb, "~> 2.15", [hex: :mochiweb, repo: "hexpm", optional: false]}], "hexpm"},
"http_signatures": {:git, "https://git.pleroma.social/pleroma/http_signatures.git", "293d77bb6f4a67ac8bde1428735c3b42f22cbb30", [ref: "293d77bb6f4a67ac8bde1428735c3b42f22cbb30"]}, "http_signatures": {:git, "https://git.pleroma.social/pleroma/http_signatures.git", "293d77bb6f4a67ac8bde1428735c3b42f22cbb30", [ref: "293d77bb6f4a67ac8bde1428735c3b42f22cbb30"]},
"httpoison": {:hex, :httpoison, "1.6.1", "2ce5bf6e535cd0ab02e905ba8c276580bab80052c5c549f53ddea52d72e81f33", [:mix], [{:hackney, "~> 1.15 and >= 1.15.2", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"}, "httpoison": {:hex, :httpoison, "1.6.1", "2ce5bf6e535cd0ab02e905ba8c276580bab80052c5c549f53ddea52d72e81f33", [:mix], [{:hackney, "~> 1.15 and >= 1.15.2", [hex: :hackney, repo: "hexpm", optional: false]}], "hexpm"},

View file

@ -7,11 +7,14 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
alias Pleroma.Web.RichMedia.Parsers.TwitterCard alias Pleroma.Web.RichMedia.Parsers.TwitterCard
test "returns error when html not contains twitter card" do test "returns error when html not contains twitter card" do
assert TwitterCard.parse("", %{}) == {:error, "No twitter card metadata found"} assert TwitterCard.parse([{"html", [], [{"head", [], []}, {"body", [], []}]}], %{}) ==
{:error, "No twitter card metadata found"}
end end
test "parses twitter card with only name attributes" do test "parses twitter card with only name attributes" do
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html") html =
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers3.html")
|> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,
@ -26,7 +29,9 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
end end
test "parses twitter card with only property attributes" do test "parses twitter card with only property attributes" do
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html") html =
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers2.html")
|> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,
@ -45,7 +50,9 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
end end
test "parses twitter card with name & property attributes" do test "parses twitter card with name & property attributes" do
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html") html =
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers.html")
|> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,
@ -73,7 +80,8 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
"YTQ5MF9EQVIgZXhodW1hdGlvbiBvZiBNYXJnYXJldCBDb3JiaW4gZ3JhdmUgMTkyNi5qcGciXSxbInAiLCJjb252ZXJ0IiwiIl0sWyJwIiwiY29udmVydCIsIi1xdWFsaXR5IDgxIC1hdXRvLW9" <> "YTQ5MF9EQVIgZXhodW1hdGlvbiBvZiBNYXJnYXJldCBDb3JiaW4gZ3JhdmUgMTkyNi5qcGciXSxbInAiLCJjb252ZXJ0IiwiIl0sWyJwIiwiY29udmVydCIsIi1xdWFsaXR5IDgxIC1hdXRvLW9" <>
"yaWVudCJdLFsicCIsInRodW1iIiwiNjAweD4iXV0/DAR%20exhumation%20of%20Margaret%20Corbin%20grave%201926.jpg" "yaWVudCJdLFsicCIsInRodW1iIiwiNjAweD4iXV0/DAR%20exhumation%20of%20Margaret%20Corbin%20grave%201926.jpg"
html = File.read!("test/fixtures/margaret-corbin-grave-west-point.html") html =
File.read!("test/fixtures/margaret-corbin-grave-west-point.html") |> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,
@ -87,7 +95,9 @@ defmodule Pleroma.Web.RichMedia.Parsers.TwitterCardTest do
end end
test "takes first founded title in html head if there is html markup error" do test "takes first founded title in html head if there is html markup error" do
html = File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html") html =
File.read!("test/fixtures/nypd-facial-recognition-children-teenagers4.html")
|> Floki.parse_document!()
assert TwitterCard.parse(html, %{}) == assert TwitterCard.parse(html, %{}) ==
{:ok, {:ok,