takahe/tests/core/test_html.py
Corry Haines cfe18932b8
Match sanitizing for posts to Mastodon (#422)
Creates a filter for REWRITTEN_TAGS that converts them to `p` rather than ripping them out entirely, and formats `ul` as a break-separated list

Both changes align sanitization to Mastodon's "strict" sanitizer at https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L73

I don't love the complexity of the Filter, but Bleach doesn't give us great options to work with. The code operates within an iterator without the useful "sibling" methods that Ruby's equivalent has. Also, Bleach runs filters _after_ sanitizing (unlike Ruby's sanitizer, which runs them before), so we have to pass all the elements through the sanitizer, then rewrite them after the fact.
2023-01-15 22:32:04 -07:00

128 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

from unittest.mock import Mock
import pytest
from core.html import ContentRenderer, html_to_plaintext, sanitize_html
def test_html_to_plaintext():
    """
    Flattening HTML to plain text should turn <br> into a single newline
    and the gap between paragraphs into a blank line.
    """
    # Each entry is (HTML input, expected plain-text output).
    cases = [
        ("<p>Hi!</p>", "Hi!"),
        ("<p>Hi!<br>There</p>", "Hi!\nThere"),
        ("<p>Hi!</p>\n\n<p>How are you?</p>", "Hi!\n\nHow are you?"),
        (
            "<p>Hi!</p>\n\n<p>How are<br> you?</p><p>today</p>",
            "Hi!\n\nHow are\n you?\n\ntoday",
        ),
    ]
    for markup, expected in cases:
        assert html_to_plaintext(markup) == expected
def test_sanitize_post():
    """
    Sanitizing should leave plain paragraphs, escaped entities, bare
    domains and fediverse handles untouched, and wrap protocol-prefixed
    URLs in a nofollow link.
    """
    # Each entry is (HTML input, expected sanitized output).
    cases = [
        ("<p>Hello!</p>", "<p>Hello!</p>"),
        ("<p>It&#39;s great</p>", "<p>It&#39;s great</p>"),
        # Note that we only want to linkify things with protocol prefixes to
        # prevent too many false positives.
        ("<p>test.com</p>", "<p>test.com</p>"),
        (
            "<p>https://test.com</p>",
            '<p><a href="https://test.com" rel="nofollow">https://test.com</a></p>',
        ),
        (
            "<p>@someone@subdomain.some-domain.com</p>",
            "<p>@someone@subdomain.some-domain.com</p>",
        ),
    ]
    for markup, expected in cases:
        assert sanitize_html(markup) == expected
@pytest.mark.django_db
def test_link_preservation():
    """
    Rendering a post should:
    - keep links that arrived from other servers
    - turn mentions and hashtags into links
    - do all of the above without one rewrite clobbering another
    """
    # Stand-in for a mention attached to the post.
    mention = Mock(username="andrew", domain_id="aeracode.org")
    mention.urls.view = "/@andrew@aeracode.org/"

    # Stand-in for the post itself: one mention, no custom emoji.
    post = Mock()
    post.mentions.all.return_value = [mention]
    post.author.domain.uri_domain = "example.com"
    post.emojis.all.return_value = []

    source = 'Hello @andrew, I want to link to this <span>#</span>hashtag: <a href="http://example.com/@andrew/#notahashtag">here</a> and rewrite <a href="https://example.com/tags/thishashtag/">#thishashtag</a>'
    expected = 'Hello <a href="/@andrew@aeracode.org/">@andrew</a>, I want to link to this <a href="/tags/hashtag/" class="hashtag">#hashtag</a>: <a href="http://example.com/@andrew/#notahashtag" rel="nofollow">here</a> and rewrite <a href="/tags/thishashtag/" class="hashtag">#thishashtag</a>'

    rendered = ContentRenderer(local=True).render_post(source, post)
    assert rendered == expected
@pytest.mark.django_db
def test_list_rendering():
    """
    We want to:
    - Rewrite an incoming <ul> into a single <p> instead of dropping it
    - Separate the former <li> items with <br>
    - Leave the surrounding paragraphs untouched
    """
    renderer = ContentRenderer(local=True)
    # The mention is unrelated to the list itself; it only populates the
    # post's mention list for the renderer.
    fake_mention = Mock()
    fake_mention.username = "andrew"
    fake_mention.domain_id = "aeracode.org"
    fake_mention.urls.view = "/@andrew@aeracode.org/"
    fake_post = Mock()
    fake_post.mentions.all.return_value = [fake_mention]
    fake_post.author.domain.uri_domain = "example.com"
    fake_post.emojis.all.return_value = []
    assert (
        renderer.render_post(
            "<p>Ok. The roster so far is:</p><ul><li>Infosec.exchange (mastodon)</li><li>pixel.Infosec.exchange (pixelfed)</li><li>video.Infosec.exchange (peertube)</li><li>relay.Infosec.exchange (activitypub relay)</li><li>risky.af (alt mastodon)</li></ul><p>Whats next? I think I promised some people here bookwyrm</p>",
            fake_post,
        )
        == "<p>Ok. The roster so far is:</p><p>Infosec.exchange (mastodon)<br>pixel.Infosec.exchange (pixelfed)<br>video.Infosec.exchange (peertube)<br>relay.Infosec.exchange (activitypub relay)<br>risky.af (alt mastodon)</p><p>Whats next? I think I promised some people here bookwyrm</p>"
    )
@pytest.mark.django_db
def test_link_mixcase_mentions():
    """
    Handles in the post text that differ from a mention's stored
    username only by letter case should still be linked to that mention,
    while a handle with no matching mention stays as plain text.
    """
    # Mention whose stored username is capitalised.
    local_style = Mock(username="Manfre", domain_id="manfre.net")
    local_style.urls.view = "/@Manfre@manfre.net/"

    # Mention on another domain whose view URL is absolute.
    remote_style = Mock(username="manfre", domain_id="takahe.social")
    remote_style.urls.view = "https://takahe.social/@manfre@takahe.social/"

    # Mention with no username or domain set.
    unresolved = Mock(username=None, domain_id=None)
    unresolved.urls.view = "/None@None/"

    post = Mock()
    post.mentions.all.return_value = [local_style, remote_style, unresolved]
    post.author.domain.uri_domain = "example.com"
    post.emojis.all.return_value = []

    source = "@Manfre@manfre.net @mAnFrE@takahe.social @manfre@manfre.net @unfetched@manfre.net"
    expected = (
        '<a href="/@Manfre@manfre.net/">@Manfre</a> '
        '<a href="https://takahe.social/@manfre@takahe.social/">@mAnFrE@takahe.social</a> '
        '<a href="/@Manfre@manfre.net/">@manfre</a> '
        "@unfetched@manfre.net"
    )

    rendered = ContentRenderer(local=True).render_post(source, post)
    assert rendered == expected