import re from functools import partial import bleach import bleach.callbacks from bleach.html5lib_shim import Filter from bleach.linkifier import LinkifyFilter from django.utils.safestring import mark_safe url_regex = re.compile( r"""\(* # Match any opening parentheses. \b(?"]*)? # /path/zz (excluding "unsafe" chars from RFC 1738, # except for # and ~, which happen in practice) """, re.IGNORECASE | re.VERBOSE | re.UNICODE, ) ALLOWED_TAGS = ["br", "p", "a"] REWRITTEN_TAGS = [ "h1", "h2", "h3", "h4", "h5", "h6", "blockquote", "pre", "ul", "ol", "li", ] class MastodonStrictTagFilter(Filter): """ Implements Python equivalent of Mastodon tag rewriter Clone of https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L55 Broadly this replaces all REWRITTEN_TAGS with `p` except for lists where it formats it into `
` lists """ def __iter__(self): li_pending_break = False break_token = { "name": "br", "data": {}, "type": "StartTag", } for token in Filter.__iter__(self): if token.get("name") not in REWRITTEN_TAGS or token["type"] not in [ "StartTag", "EndTag", ]: yield token continue if token["type"] == "StartTag": if token["name"] == "li": if li_pending_break: # Another `li` appeared, so break after the last one yield break_token continue token["name"] = "p" elif token["type"] == "EndTag": if token["name"] == "li": # Track that an `li` closed so we know a break should be considered li_pending_break = True continue if token["name"] == "ul": # If the last `li` happened, then don't add a break because Mastodon doesn't li_pending_break = False token["name"] = "p" yield token class UnlinkifyFilter(Filter): """ Forcibly replaces link text with the href. This is intented to be used when stripping tags to preserve the link location at the expense of the link text. """ def __iter__(self): discarding_a_text = False for token in Filter.__iter__(self): if token.get("name") == "a": if token["type"] == "EndTag": discarding_a_text = False continue href = token["data"].get((None, "href")) # If has an href, we use it and throw away all content # within the .... If href missing or empty, try to find # text within the ... if href: yield {"data": href, "type": "Characters"} discarding_a_text = True continue elif not discarding_a_text: yield token # else: throw away tokens until we're out of the def allow_a(tag: str, name: str, value: str): if name in ["href", "title", "class"]: return True elif name == "rel": # Only allow rel attributes with a small subset of values # (we're defending against, for example, rel=me) rel_values = value.split() if all(v in ["nofollow", "noopener", "noreferrer", "tag"] for v in rel_values): return True return False def shorten_link_text(attrs, new=False): """ Applies Mastodon's link shortening behavior where URL text links are shortened by removing the scheme and only showing the first 30 chars. Orig: https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened Becomes: social.example.com/a-long/path """ text = attrs.get("_text") if not text: text = attrs.get((None, "href")) if text and "://" in text and len(text) > 30: text = text.split("://", 1)[-1] attrs["_text"] = text[:30] if len(text) > 30: attrs[(None, "class")] = " ".join( filter(None, [attrs.pop((None, "class"), ""), "ellipsis"]) ) # Add the full URL in to title for easier user inspection attrs[(None, "title")] = attrs.get((None, "href")) return attrs linkify_callbacks = [bleach.callbacks.nofollow, shorten_link_text] def sanitize_html(post_html: str) -> str: """ Only allows a, br, p and span tags, and class attributes. """ cleaner = bleach.Cleaner( tags=ALLOWED_TAGS + REWRITTEN_TAGS, attributes={ # type:ignore "a": allow_a, "p": ["class"], }, filters=[ partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks), MastodonStrictTagFilter, ], strip=True, ) return mark_safe(cleaner.clean(post_html)) def strip_html(post_html: str, *, linkify: bool = True) -> str: """ Strips all tags from the text, then linkifies it. """ cleaner = bleach.Cleaner( tags=[], strip=True, filters=[partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks)] if linkify else [UnlinkifyFilter], ) return mark_safe(cleaner.clean(post_html)) def html_to_plaintext(post_html: str) -> str: """ Tries to do the inverse of the linebreaks filter. """ # TODO: Handle HTML entities # Remove all newlines, then replace br with a newline and /p with two (one comes from bleach) post_html = post_html.replace("\n", "").replace("
", "\n").replace("

", "\n") # Remove all other HTML and return cleaner = bleach.Cleaner(tags=["a"], strip=True, filters=[UnlinkifyFilter]) return cleaner.clean(post_html).strip() class ContentRenderer: """ Renders HTML for posts, identity fields, and more. The `local` parameter affects whether links are absolute (False) or relative (True) """ def __init__(self, local: bool): self.local = local def render_post(self, html: str, post) -> str: """ Given post HTML, normalises it and renders it for presentation. """ if not html: return "" html = sanitize_html(html) html = self.linkify_mentions(html, post=post) html = self.linkify_hashtags(html, identity=post.author) if self.local: html = self.imageify_emojis( html, identity=post.author, emojis=post.emojis.all(), ) html = self.remove_extra_newlines(html) return mark_safe(html) def render_identity_summary(self, html: str, identity, strip: bool = False) -> str: """ Given identity summary HTML, normalises it and renders it for presentation. """ if not html: return "" if strip: html = strip_html(html) else: html = sanitize_html(html) html = self.linkify_hashtags(html, identity=identity) if self.local: html = self.imageify_emojis(html, identity=identity) html = self.remove_extra_newlines(html) return mark_safe(html) def render_identity_data(self, html: str, identity, strip: bool = False) -> str: """ Given name/basic value HTML, normalises it and renders it for presentation. """ if not html: return "" if strip: html = strip_html(html) else: html = sanitize_html(html) if self.local: html = self.imageify_emojis(html, identity=identity) html = self.remove_extra_newlines(html) return mark_safe(html) def linkify_mentions(self, html: str, post) -> str: """ Links mentions _in the context of the post_ - as in, using the mentions property as the only source (as we might be doing this without other DB access allowed) """ from activities.models import Post possible_matches = {} for mention in post.mentions.all(): if self.local: url = str(mention.urls.view) else: url = mention.absolute_profile_uri() # Might not have fetched it (yet) if mention.username: username = mention.username.lower() possible_matches[username] = url possible_matches[f"{username}@{mention.domain_id}"] = url collapse_name: dict[str, str] = {} def replacer(match): precursor = match.group(1) handle = match.group(2) if "@" in handle: short_handle = handle.split("@", 1)[0] else: short_handle = handle handle_hash = handle.lower() short_hash = short_handle.lower() if handle_hash in possible_matches: if short_hash not in collapse_name: collapse_name[short_hash] = handle_hash elif collapse_name.get(short_hash) != handle_hash: short_handle = handle return f'{precursor}@{short_handle}' else: return match.group() return Post.mention_regex.sub(replacer, html) def linkify_hashtags(self, html, identity) -> str: from activities.models import Hashtag def replacer(attrs, new=False): # See if the text in this link looks like a hashtag if not Hashtag.hashtag_regex.match(attrs.get("_text", "")): return attrs hashtag = attrs["_text"].strip().lstrip("#") attrs[None, "class"] = "hashtag" if (None, "rel") in attrs: del attrs[None, "rel"] if self.local: attrs[None, "href"] = f"/tags/{hashtag.lower()}/" else: attrs[ None, "href" ] = f"https://{identity.domain.uri_domain}/tags/{hashtag.lower()}/" return attrs linker = bleach.linkifier.Linker( url_re=Hashtag.hashtag_regex, callbacks=[replacer] ) return linker.linkify(html) def imageify_emojis( self, html: str, identity, include_local: bool = True, emojis=None ): """ Find :emoji: in content and convert to . If include_local is True, the local emoji will be used as a fallback for any shortcodes not defined by emojis. """ from activities.models import Emoji # If precached emojis were passed, prep them cached_emojis = {} if emojis: for emoji in emojis: cached_emojis[emoji.shortcode] = emoji def replacer(match): shortcode = match.group(1).lower() if shortcode in cached_emojis: return cached_emojis[shortcode].as_html() emoji = Emoji.get_by_domain(shortcode, identity.domain) if emoji and emoji.is_usable: return emoji.as_html() elif not emoji and include_local: emoji = Emoji.get_by_domain(shortcode, None) if emoji: return emoji.as_html() return match.group() return Emoji.emoji_regex.sub(replacer, html) def remove_extra_newlines(self, html: str) -> str: """ Some clients are sensitive to extra newlines even though it's HTML """ # TODO: More intelligent way to strip these? return html.replace("\n", "")