import re
from functools import partial

import bleach
import bleach.callbacks
from bleach.html5lib_shim import Filter
from bleach.linkifier import LinkifyFilter
from django.utils.safestring import mark_safe

url_regex = re.compile(
    r"""\(*  # Match any opening parentheses.
    \b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?)  # http://
    ([\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b  # xx.yy.tld(:##)?
    (?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
        # /path/zz (excluding "unsafe" chars from RFC 1738,
        # except for # and ~, which happen in practice)
    """,
    re.IGNORECASE | re.VERBOSE | re.UNICODE,
)
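
# A rough sketch of what the pattern above matches (illustrative outputs,
# not verified test results):
#
#   url_regex.search("see https://example.com/path for details").group()
#   # -> 'https://example.com/path'
#   url_regex.search("no scheme, so example.com is not matched")
#   # -> None (unlike bleach's default, the http(s):// scheme is required)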

ALLOWED_TAGS = ["br", "p", "a"]
REWRITTEN_TAGS = [
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "blockquote",
    "pre",
    "ul",
    "ol",
    "li",
]


class MastodonStrictTagFilter(Filter):
    """
    Implements a Python equivalent of Mastodon's tag rewriter.

    Clone of https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L55

    Broadly, this replaces all REWRITTEN_TAGS with `p`, except for lists,
    which it flattens into `<br>`-separated paragraphs.
    """

    def __iter__(self):
        li_pending_break = False
        break_token = {
            "name": "br",
            "data": {},
            "type": "StartTag",
        }

        for token in Filter.__iter__(self):
            if token.get("name") not in REWRITTEN_TAGS or token["type"] not in [
                "StartTag",
                "EndTag",
            ]:
                yield token
                continue

            if token["type"] == "StartTag":
                if token["name"] == "li":
                    if li_pending_break:
                        # Another `li` appeared, so break after the last one
                        yield break_token
                    continue
                token["name"] = "p"
            elif token["type"] == "EndTag":
                if token["name"] == "li":
                    # Track that an `li` closed so we know a break should be considered
                    li_pending_break = True
                    continue
                if token["name"] == "ul":
                    # The list itself is closing, so drop any pending break
                    # (Mastodon doesn't emit one after the final `li` either)
                    li_pending_break = False
                token["name"] = "p"

            yield token
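
# A minimal sketch of the rewrite this filter performs (illustrative; exact
# attribute/whitespace output depends on bleach's serializer):
#
#   cleaner = bleach.Cleaner(
#       tags=ALLOWED_TAGS + REWRITTEN_TAGS,
#       strip=True,
#       filters=[MastodonStrictTagFilter],
#   )
#   cleaner.clean("<ul><li>one</li><li>two</li></ul>")
#   # -> '<p>one<br>two</p>'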


class UnlinkifyFilter(Filter):
    """
    Forcibly replaces link text with the href.

    This is intended to be used when stripping <a> tags to preserve the link
    location at the expense of the link text.
    """

    def __iter__(self):
        discarding_a_text = False
        for token in Filter.__iter__(self):
            if token.get("name") == "a":
                if token["type"] == "EndTag":
                    discarding_a_text = False
                    continue
                href = token["data"].get((None, "href"))

                # If the <a> has an href, we use it and throw away all content
                # within the <a>...</a>. If the href is missing or empty, fall
                # through and keep whatever text is within the <a>...</a>.
                if href:
                    yield {"data": href, "type": "Characters"}
                    discarding_a_text = True
                    continue
            elif not discarding_a_text:
                yield token
            # else: throw away tokens until we're out of the <a>
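
# Illustrative behavior (expected, not verified, output):
#
#   cleaner = bleach.Cleaner(tags=["a"], strip=True, filters=[UnlinkifyFilter])
#   cleaner.clean('<a href="https://example.com/">click here</a>')
#   # -> 'https://example.com/'  (the link text "click here" is discarded)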


def allow_a(tag: str, name: str, value: str):
    if name in ["href", "title", "class"]:
        return True
    elif name == "rel":
        # Only allow rel attributes with a small subset of values
        # (we're defending against, for example, rel=me)
        rel_values = value.split()
        if all(v in ["nofollow", "noopener", "noreferrer", "tag"] for v in rel_values):
            return True
    return False
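
# For example (illustrative):
#
#   allow_a("a", "href", "https://example.com/")  # -> True
#   allow_a("a", "rel", "nofollow noopener")      # -> True
#   allow_a("a", "rel", "me")                     # -> False (rel=me rejected)
#   allow_a("a", "onclick", "alert(1)")           # -> False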


def shorten_link_text(attrs, new=False):
    """
    Applies Mastodon's link shortening behavior, where URL link text is
    shortened by removing the scheme and showing only the first 30 chars.

    Orig:
        <a>https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened</a>

    Becomes:
        <a>social.example.com/a-long/path</a>
    """
    text = attrs.get("_text")
    if not text:
        text = attrs.get((None, "href"))
    if text and "://" in text and len(text) > 30:
        text = text.split("://", 1)[-1]
        attrs["_text"] = text[:30]
        if len(text) > 30:
            # Flag visually-truncated links with an "ellipsis" class
            attrs[(None, "class")] = " ".join(
                filter(None, [attrs.pop((None, "class"), ""), "ellipsis"])
            )
        # Add the full URL into the title for easier user inspection
        attrs[(None, "title")] = attrs.get((None, "href"))

    return attrs


linkify_callbacks = [bleach.callbacks.nofollow, shorten_link_text]


def sanitize_html(post_html: str) -> str:
    """
    Only allows a, br and p tags directly, plus the REWRITTEN_TAGS (which
    MastodonStrictTagFilter rewrites to p), with a limited attribute set.
    Also linkifies bare URLs.
    """
    cleaner = bleach.Cleaner(
        tags=ALLOWED_TAGS + REWRITTEN_TAGS,
        attributes={  # type:ignore
            "a": allow_a,
            "p": ["class"],
        },
        filters=[
            partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks),
            MastodonStrictTagFilter,
        ],
        strip=True,
    )
    return mark_safe(cleaner.clean(post_html))
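
# Illustrative (expected, not verified, output):
#
#   sanitize_html("<h1>Hello</h1><div>world</div>")
#   # -> '<p>Hello</p>world'  (h1 is rewritten to p; div is stripped)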


def strip_html(post_html: str, *, linkify: bool = True) -> str:
    """
    Strips all tags from the text, then linkifies it (or, if linkify is
    False, runs UnlinkifyFilter instead).
    """
    cleaner = bleach.Cleaner(
        tags=[],
        strip=True,
        filters=[partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks)]
        if linkify
        else [UnlinkifyFilter],
    )
    return mark_safe(cleaner.clean(post_html))
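
# Roughly (illustrative; attribute order depends on bleach's serializer):
#
#   strip_html("<b>See</b> https://example.com/x")
#   # -> 'See <a href="https://example.com/x" rel="nofollow">https://example.com/x</a>'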


def html_to_plaintext(post_html: str) -> str:
    """
    Tries to do the inverse of the linebreaks filter.
    """
    # TODO: Handle HTML entities
    # Remove all newlines, then replace br with a newline and /p with two
    # (one comes from bleach)
    post_html = post_html.replace("\n", "").replace("<br>", "\n").replace("</p>", "\n")
    # Remove all other HTML and return
    cleaner = bleach.Cleaner(tags=["a"], strip=True, filters=[UnlinkifyFilter])
    return cleaner.clean(post_html).strip()
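
# Illustrative (exact newline count depends on bleach's serializer):
#
#   html_to_plaintext("<p>Hi there<br>world</p><p>Bye</p>")
#   # -> 'Hi there\nworld\nBye'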


class ContentRenderer:
    """
    Renders HTML for posts, identity fields, and more.

    The `local` parameter controls whether generated links are relative
    (True) or absolute (False).
    """

    def __init__(self, local: bool):
        self.local = local

    def render_post(self, html: str, post) -> str:
        """
        Given post HTML, normalises it and renders it for presentation.
        """
        if not html:
            return ""
        html = sanitize_html(html)
        html = self.linkify_mentions(html, post=post)
        html = self.linkify_hashtags(html, identity=post.author)
        if self.local:
            html = self.imageify_emojis(
                html,
                identity=post.author,
                emojis=post.emojis.all(),
            )
        html = self.remove_extra_newlines(html)
        return mark_safe(html)
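
    # Illustrative usage (assumes `post` is a Post model instance with a
    # `content` HTML field, as in activities.models):
    #
    #   renderer = ContentRenderer(local=True)
    #   safe_html = renderer.render_post(post.content, post)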

    def render_identity_summary(self, html: str, identity, strip: bool = False) -> str:
        """
        Given identity summary HTML, normalises it and renders it for presentation.
        """
        if not html:
            return ""
        if strip:
            html = strip_html(html)
        else:
            html = sanitize_html(html)
        html = self.linkify_hashtags(html, identity=identity)
        if self.local:
            html = self.imageify_emojis(html, identity=identity)
        html = self.remove_extra_newlines(html)
        return mark_safe(html)

    def render_identity_data(self, html: str, identity, strip: bool = False) -> str:
        """
        Given name/basic value HTML, normalises it and renders it for presentation.
        """
        if not html:
            return ""
        if strip:
            html = strip_html(html)
        else:
            html = sanitize_html(html)
        if self.local:
            html = self.imageify_emojis(html, identity=identity)
        html = self.remove_extra_newlines(html)
        return mark_safe(html)

    def linkify_mentions(self, html: str, post) -> str:
        """
        Links mentions _in the context of the post_ - that is, using the
        post's mentions property as the only source (as we might be doing
        this without other DB access allowed).
        """
        from activities.models import Post

        possible_matches = {}
        for mention in post.mentions.all():
            if self.local:
                url = str(mention.urls.view)
            else:
                url = mention.absolute_profile_uri()
            # Might not have fetched it (yet)
            if mention.username:
                username = mention.username.lower()
                possible_matches[username] = url
                possible_matches[f"{username}@{mention.domain_id}"] = url

        # Maps each short handle to the first full handle that claimed it,
        # so ambiguous short handles can be kept fully qualified
        collapse_name: dict[str, str] = {}

        def replacer(match):
            precursor = match.group(1)
            handle = match.group(2)
            if "@" in handle:
                short_handle = handle.split("@", 1)[0]
            else:
                short_handle = handle
            handle_hash = handle.lower()
            short_hash = short_handle.lower()
            if handle_hash in possible_matches:
                if short_hash not in collapse_name:
                    collapse_name[short_hash] = handle_hash
                elif collapse_name.get(short_hash) != handle_hash:
                    # A different full handle already collapsed to this short
                    # form, so keep this one fully qualified
                    short_handle = handle
                return f'{precursor}<a href="{possible_matches[handle_hash]}">@{short_handle}</a>'
            else:
                return match.group()

        return Post.mention_regex.sub(replacer, html)
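
    # For example (illustrative): with mentions of takahe@jointakahe.org and
    # takahe@example.com in the same post, the first occurrence renders with
    # the short text "@takahe" and the second keeps its domain, because the
    # short form has become ambiguous.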

    def linkify_hashtags(self, html, identity) -> str:
        from activities.models import Hashtag

        def replacer(attrs, new=False):
            # See if the text in this link looks like a hashtag
            if not Hashtag.hashtag_regex.match(attrs.get("_text", "")):
                return attrs
            hashtag = attrs["_text"].strip().lstrip("#")
            attrs[None, "class"] = "hashtag"
            if (None, "rel") in attrs:
                del attrs[None, "rel"]
            if self.local:
                attrs[None, "href"] = f"/tags/{hashtag.lower()}/"
            else:
                attrs[
                    None, "href"
                ] = f"https://{identity.domain.uri_domain}/tags/{hashtag.lower()}/"
            return attrs

        linker = bleach.linkifier.Linker(
            url_re=Hashtag.hashtag_regex, callbacks=[replacer]
        )
        return linker.linkify(html)
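
    # Roughly, with local=True (illustrative; depends on Hashtag.hashtag_regex):
    #
    #   renderer.linkify_hashtags("I like #python", identity)
    #   # -> 'I like <a class="hashtag" href="/tags/python/">#python</a>'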

    def imageify_emojis(
        self, html: str, identity, include_local: bool = True, emojis=None
    ):
        """
        Find :emoji: in content and convert to <img>. If include_local is True,
        the local emoji will be used as a fallback for any shortcodes not defined
        by emojis.
        """
        from activities.models import Emoji

        # If precached emojis were passed, prep them
        cached_emojis = {}
        if emojis:
            for emoji in emojis:
                cached_emojis[emoji.shortcode] = emoji

        def replacer(match):
            shortcode = match.group(1).lower()
            if shortcode in cached_emojis:
                return cached_emojis[shortcode].as_html()

            emoji = Emoji.get_by_domain(shortcode, identity.domain)
            if emoji and emoji.is_usable:
                return emoji.as_html()
            elif not emoji and include_local:
                emoji = Emoji.get_by_domain(shortcode, None)
                if emoji:
                    return emoji.as_html()

            return match.group()

        return Emoji.emoji_regex.sub(replacer, html)
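
    # Illustrative (Emoji.as_html() is assumed to return an <img> tag for the
    # custom emoji image):
    #
    #   renderer.imageify_emojis("Hello :wave:", identity)
    #   # -> 'Hello <img ...>' if a usable :wave: emoji exists, else unchanged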

    def remove_extra_newlines(self, html: str) -> str:
        """
        Some clients are sensitive to extra newlines, even though it's HTML.
        """
        # TODO: More intelligent way to strip these?
        return html.replace("\n", "")