Match sanitizing for posts to Mastodon (#422)

Creates a filter for REWRITTEN_TAGS that converts them to `p` rather than ripping them out entirely, and formats `ul` as a break-separated list.

Both changes align sanitization to Mastodon's "strict" sanitizer at https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L73

I don't love the complexity of the Filter, but Bleach doesn't give us great options to work with. The code operates within an iterator, without the useful "sibling" methods that the Ruby equivalent has. Also, Bleach runs filters _after_ sanitizing (unlike the Ruby sanitizer, which runs them before), so we have to pass all the elements through the sanitizer and then rewrite them after the fact.
This commit is contained in:
Corry Haines 2023-01-15 21:32:04 -08:00 committed by GitHub
parent b721833b4f
commit cfe18932b8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 90 additions and 2 deletions

View file

@ -2,6 +2,7 @@ import re
from functools import partial from functools import partial
import bleach import bleach
from bleach.html5lib_shim import Filter
from bleach.linkifier import LinkifyFilter from bleach.linkifier import LinkifyFilter
from django.utils.safestring import mark_safe from django.utils.safestring import mark_safe
@ -16,6 +17,66 @@ url_regex = re.compile(
re.IGNORECASE | re.VERBOSE | re.UNICODE, re.IGNORECASE | re.VERBOSE | re.UNICODE,
) )
# Tags that are kept in the output as-is.
# NOTE: both constants stay plain lists because they are joined with `+`
# when the bleach Cleaner is configured.
ALLOWED_TAGS = ["br", "p", "a"]

# Tags that are let through the sanitizer only so that
# MastodonStrictTagFilter can rewrite them afterwards (headings h1-h6,
# quote/preformatted blocks and list markup).
REWRITTEN_TAGS = [
    *(f"h{level}" for level in range(1, 7)),
    "blockquote",
    "pre",
    "ul",
    "ol",
    "li",
]
class MastodonStrictTagFilter(Filter):
    """
    Implements Python equivalent of Mastodon tag rewriter

    Clone of https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L55

    Broadly this replaces all REWRITTEN_TAGS with `p` except for lists where it formats it into `<br>` lists

    Token handling, for start/end tags whose name is in REWRITTEN_TAGS:
    - `li` start and end tags are dropped; a `br` is emitted in front of
      every `li` that follows an already-closed `li` of the same list.
    - every other rewritten tag (including `ul` and `ol`) is renamed to `p`.
    All other tokens are passed through unchanged.
    """

    def __iter__(self):
        # Set when an `li` has closed; the next `li` start tag then emits a
        # break first. Cleared when the enclosing list closes so that the
        # last item of a list is never followed by a break.
        li_pending_break = False
        break_token = {
            "name": "br",
            "data": {},
            "type": "StartTag",
        }

        for token in Filter.__iter__(self):
            # Only start/end tags of rewritten elements need any work;
            # text, comments and allowed tags pass straight through.
            if token.get("name") not in REWRITTEN_TAGS or token["type"] not in (
                "StartTag",
                "EndTag",
            ):
                yield token
                continue

            name = token["name"]

            if name == "li":
                if token["type"] == "StartTag":
                    if li_pending_break:
                        # Another `li` appeared, so break after the last one
                        yield break_token
                else:
                    # Track that an `li` closed so we know a break should be considered
                    li_pending_break = True
                # The `li` tag itself is never emitted
                continue

            if token["type"] == "EndTag" and name in ("ul", "ol"):
                # The list is over: the last `li` gets no break because
                # Mastodon doesn't add one. This must cover `ol` as well as
                # `ul`, otherwise a closed `ol` leaves the flag set and the
                # first item of a later list gets a stray leading break.
                li_pending_break = False

            token["name"] = "p"
            yield token
def allow_a(tag: str, name: str, value: str): def allow_a(tag: str, name: str, value: str):
if name in ["href", "title", "class"]: if name in ["href", "title", "class"]:
@ -34,12 +95,12 @@ def sanitize_html(post_html: str) -> str:
Only allows a, br, p and span tags, and class attributes. Only allows a, br, p and span tags, and class attributes.
""" """
cleaner = bleach.Cleaner( cleaner = bleach.Cleaner(
tags=["br", "p", "a"], tags=ALLOWED_TAGS + REWRITTEN_TAGS,
attributes={ # type:ignore attributes={ # type:ignore
"a": allow_a, "a": allow_a,
"p": ["class"], "p": ["class"],
}, },
filters=[partial(LinkifyFilter, url_re=url_regex)], filters=[partial(LinkifyFilter, url_re=url_regex), MastodonStrictTagFilter],
strip=True, strip=True,
) )
return mark_safe(cleaner.clean(post_html)) return mark_safe(cleaner.clean(post_html))

View file

@ -64,6 +64,33 @@ def test_link_preservation():
) )
@pytest.mark.django_db
def test_list_rendering():
    """
    Rendering a post that contains a `ul` list should:
    - Keep the surrounding paragraphs as they are
    - Collapse the list into a single `p`
    - Separate the list items with `br`, with no break after the last one
    """
    source_html = (
        "<p>Ok. The roster so far is:</p>"
        "<ul>"
        "<li>Infosec.exchange (mastodon)</li>"
        "<li>pixel.Infosec.exchange (pixelfed)</li>"
        "<li>video.Infosec.exchange (peertube)</li>"
        "<li>relay.Infosec.exchange (activitypub relay)</li>"
        "<li>risky.af (alt mastodon)</li>"
        "</ul>"
        "<p>Whats next? I think I promised some people here bookwyrm</p>"
    )
    expected_html = (
        "<p>Ok. The roster so far is:</p>"
        "<p>"
        "Infosec.exchange (mastodon)<br>"
        "pixel.Infosec.exchange (pixelfed)<br>"
        "video.Infosec.exchange (peertube)<br>"
        "relay.Infosec.exchange (activitypub relay)<br>"
        "risky.af (alt mastodon)"
        "</p>"
        "<p>Whats next? I think I promised some people here bookwyrm</p>"
    )

    # A single mention attached to the post being rendered
    mention = Mock(username="andrew", domain_id="aeracode.org")
    mention.urls.view = "/@andrew@aeracode.org/"

    # Minimal stand-in for a post: one mention, no emojis
    post = Mock()
    post.author.domain.uri_domain = "example.com"
    post.emojis.all.return_value = []
    post.mentions.all.return_value = [mention]

    renderer = ContentRenderer(local=True)
    rendered = renderer.render_post(source_html, post)

    assert rendered == expected_html
@pytest.mark.django_db @pytest.mark.django_db
def test_link_mixcase_mentions(): def test_link_mixcase_mentions():
renderer = ContentRenderer(local=True) renderer = ContentRenderer(local=True)