takahe/core/html.py
Corry Haines cfe18932b8
Match sanitizing for posts to Mastodon (#422)
Creates a filter for REWRITTEN_TAGS that converts them to `p` rather than ripping them out entirely, and formats `ul` as a break-separated list

Both changes align sanitization with Mastodon's "strict" sanitizer at https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L73

I don't love the complexity of the Filter, but Bleach doesn't give us great options to work with. The code operates within an iterator without the useful "sibling" methods that Ruby's equivalent has. Also, Bleach runs filters _after_ sanitizing (unlike Ruby's which runs before) so we have to pass all the elements through the sanitizer, then rewrite them after the fact.
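For example, `<ul><li>a</li><li>b</li></ul>` now comes out as `<p>a<br>b</p>`, and `<h1>Title</h1>` as `<p>Title</p>`, rather than being stripped down to bare text.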
2023-01-15 22:32:04 -07:00

import re
from functools import partial
import bleach
from bleach.html5lib_shim import Filter
from bleach.linkifier import LinkifyFilter
from django.utils.safestring import mark_safe

url_regex = re.compile(
r"""\(* # Match any opening parentheses.
\b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?) # http://
([\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?
(?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
# /path/zz (excluding "unsafe" chars from RFC 1738,
# except for # and ~, which happen in practice)
""",
re.IGNORECASE | re.VERBOSE | re.UNICODE,
)
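
# Illustrative notes on the pattern (not exhaustive): it requires an explicit
# http:// or https:// scheme, so "https://sub.example.com:8080/a?b=c" matches
# while a bare "example.com" or an email address does not.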

ALLOWED_TAGS = ["br", "p", "a"]
REWRITTEN_TAGS = [
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"blockquote",
"pre",
"ul",
"ol",
"li",
]


class MastodonStrictTagFilter(Filter):
"""
Implements Python equivalent of Mastodon tag rewriter
Clone of https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L55
    Broadly this replaces all REWRITTEN_TAGS with `p`, except for list items,
    which it joins into `<br>`-separated runs
    """

    def __iter__(self):
li_pending_break = False
break_token = {
"name": "br",
"data": {},
"type": "StartTag",
}

        for token in Filter.__iter__(self):
if token.get("name") not in REWRITTEN_TAGS or token["type"] not in [
"StartTag",
"EndTag",
]:
yield token
continue
if token["type"] == "StartTag":
if token["name"] == "li":
if li_pending_break:
# Another `li` appeared, so break after the last one
yield break_token
continue
token["name"] = "p"
elif token["type"] == "EndTag":
if token["name"] == "li":
# Track that an `li` closed so we know a break should be considered
li_pending_break = True
continue
if token["name"] == "ul":
                    # The list ended, so cancel any pending break (Mastodon omits it after the final `li`)
li_pending_break = False
token["name"] = "p"

            yield token
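
# For example (illustrative): '<blockquote>q</blockquote>' is rewritten to
# '<p>q</p>'; only `li` gets special treatment, emitting a `<br>` between
# consecutive items instead of a wrapping tag.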


def allow_a(tag: str, name: str, value: str):
if name in ["href", "title", "class"]:
return True
elif name == "rel":
# Only allow rel attributes with a small subset of values
# (we're defending against, for example, rel=me)
rel_values = value.split()
if all(v in ["nofollow", "noopener", "noreferrer", "tag"] for v in rel_values):
return True
return False
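
# e.g. (illustrative): allow_a("a", "rel", "nofollow noopener") -> True, but
# allow_a("a", "rel", "me") -> False and allow_a("a", "onclick", "x") -> False.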


def sanitize_html(post_html: str) -> str:
"""
Only allows a, br, p and span tags, and class attributes.
"""
cleaner = bleach.Cleaner(
tags=ALLOWED_TAGS + REWRITTEN_TAGS,
attributes={ # type:ignore
"a": allow_a,
"p": ["class"],
},
filters=[partial(LinkifyFilter, url_re=url_regex), MastodonStrictTagFilter],
strip=True,
)
return mark_safe(cleaner.clean(post_html))
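
# A minimal sketch of the behaviour (illustrative):
#   sanitize_html('<ul><li>a</li><li>b</li></ul>')  # -> '<p>a<br>b</p>'
#   sanitize_html('<p onclick="x()">hi</p>')        # -> '<p>hi</p>'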


def strip_html(post_html: str, *, linkify: bool = True) -> str:
"""
    Strips all tags from the text, then linkifies it if `linkify` is True.
"""
cleaner = bleach.Cleaner(
tags=[],
strip=True,
filters=[partial(LinkifyFilter, url_re=url_regex)] if linkify else [],
)
return mark_safe(cleaner.clean(post_html))
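
# e.g. (illustrative): strip_html("<b>See https://example.com</b>") drops the
# tags and, with linkify=True, wraps the bare URL in an <a> tag.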


def html_to_plaintext(post_html: str) -> str:
"""
Tries to do the inverse of the linebreaks filter.
"""
# TODO: Handle HTML entities
# Remove all newlines, then replace br with a newline and /p with two (one comes from bleach)
post_html = post_html.replace("\n", "").replace("<br>", "\n").replace("</p>", "\n")
# Remove all other HTML and return
cleaner = bleach.Cleaner(tags=[], strip=True, filters=[])
return cleaner.clean(post_html).strip()
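
# e.g. (illustrative): html_to_plaintext("<p>Hi<br>there</p>") -> "Hi\nthere"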


class ContentRenderer:
"""
Renders HTML for posts, identity fields, and more.
The `local` parameter affects whether links are absolute (False) or relative (True)
"""

    def __init__(self, local: bool):
self.local = local

    def render_post(self, html: str, post) -> str:
"""
Given post HTML, normalises it and renders it for presentation.
"""
if not html:
return ""
html = sanitize_html(html)
html = self.linkify_mentions(html, post=post)
html = self.linkify_hashtags(html, identity=post.author)
if self.local:
html = self.imageify_emojis(
html,
identity=post.author,
emojis=post.emojis.all(),
)
html = self.remove_extra_newlines(html)
return mark_safe(html)
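
    # Typical use (illustrative sketch; assumes `post` is an activities Post
    # with `content`, `mentions`, and `emojis` attributes):
    #   ContentRenderer(local=True).render_post(post.content, post)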

    def render_identity_summary(self, html: str, identity, strip: bool = False) -> str:
"""
Given identity summary HTML, normalises it and renders it for presentation.
"""
if not html:
return ""
if strip:
html = strip_html(html)
else:
html = sanitize_html(html)
html = self.linkify_hashtags(html, identity=identity)
if self.local:
html = self.imageify_emojis(html, identity=identity)
html = self.remove_extra_newlines(html)
return mark_safe(html)

    def render_identity_data(self, html: str, identity, strip: bool = False) -> str:
"""
Given name/basic value HTML, normalises it and renders it for presentation.
"""
if not html:
return ""
if strip:
html = strip_html(html)
else:
html = sanitize_html(html)
if self.local:
html = self.imageify_emojis(html, identity=identity)
html = self.remove_extra_newlines(html)
return mark_safe(html)

    def linkify_mentions(self, html: str, post) -> str:
"""
Links mentions _in the context of the post_ - as in, using the mentions
property as the only source (as we might be doing this without other
DB access allowed)
"""
from activities.models import Post
possible_matches = {}
for mention in post.mentions.all():
if self.local:
url = str(mention.urls.view)
else:
url = mention.absolute_profile_uri()
# Might not have fetched it (yet)
if mention.username:
username = mention.username.lower()
possible_matches[username] = url
possible_matches[f"{username}@{mention.domain_id}"] = url
collapse_name: dict[str, str] = {}

        def replacer(match):
precursor = match.group(1)
handle = match.group(2)
if "@" in handle:
short_handle = handle.split("@", 1)[0]
else:
short_handle = handle
handle_hash = handle.lower()
short_hash = short_handle.lower()
if handle_hash in possible_matches:
if short_hash not in collapse_name:
collapse_name[short_hash] = handle_hash
elif collapse_name.get(short_hash) != handle_hash:
short_handle = handle
return f'{precursor}<a href="{possible_matches[handle_hash]}">@{short_handle}</a>'
else:
return match.group()
return Post.mention_regex.sub(replacer, html)
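
    # e.g. (illustrative): with mentions @alice@a.example and @alice@b.example,
    # the first link collapses to "@alice" and the second keeps its full handle,
    # since the short name is no longer unambiguous.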

    def linkify_hashtags(self, html: str, identity) -> str:
from activities.models import Hashtag

        def replacer(attrs, new=False):
# See if the text in this link looks like a hashtag
if not Hashtag.hashtag_regex.match(attrs.get("_text", "")):
return attrs
hashtag = attrs["_text"].strip().lstrip("#")
attrs[None, "class"] = "hashtag"
if (None, "rel") in attrs:
del attrs[None, "rel"]
if self.local:
attrs[None, "href"] = f"/tags/{hashtag.lower()}/"
else:
attrs[
None, "href"
] = f"https://{identity.domain.uri_domain}/tags/{hashtag.lower()}/"
return attrs
linker = bleach.linkifier.Linker(
url_re=Hashtag.hashtag_regex, callbacks=[replacer]
)
return linker.linkify(html)
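
    # e.g. (illustrative, local=True): "#Takahe" becomes roughly
    #   <a class="hashtag" href="/tags/takahe/">#Takahe</a>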

    def imageify_emojis(
self, html: str, identity, include_local: bool = True, emojis=None
    ) -> str:
"""
Find :emoji: in content and convert to <img>. If include_local is True,
the local emoji will be used as a fallback for any shortcodes not defined
by emojis.
"""
from activities.models import Emoji
# If precached emojis were passed, prep them
cached_emojis = {}
if emojis:
for emoji in emojis:
cached_emojis[emoji.shortcode] = emoji

        def replacer(match):
shortcode = match.group(1).lower()
if shortcode in cached_emojis:
return cached_emojis[shortcode].as_html()
emoji = Emoji.get_by_domain(shortcode, identity.domain)
if emoji and emoji.is_usable:
return emoji.as_html()
elif not emoji and include_local:
emoji = Emoji.get_by_domain(shortcode, None)
if emoji:
return emoji.as_html()
return match.group()
return Emoji.emoji_regex.sub(replacer, html)
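
    # e.g. (illustrative): ":party_blob:" is replaced with the matching Emoji's
    # <img> markup via as_html(); unknown shortcodes are left untouched.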

    def remove_extra_newlines(self, html: str) -> str:
"""
Some clients are sensitive to extra newlines even though it's HTML
"""
# TODO: More intelligent way to strip these?
return html.replace("\n", "")