2022-12-20 13:10:35 +00:00
|
|
|
import re
|
|
|
|
from functools import partial
|
|
|
|
|
2022-11-12 05:02:43 +00:00
|
|
|
import bleach
|
2022-11-14 02:03:43 +00:00
|
|
|
from bleach.linkifier import LinkifyFilter
|
2022-11-12 05:02:43 +00:00
|
|
|
from django.utils.safestring import mark_safe
|
|
|
|
|
2022-12-20 13:10:35 +00:00
|
|
|
# URL pattern handed to bleach's linkifier in place of its default one.
# Only http:// and https:// URLs (optionally with user[:password]@) match;
# bare domains and other schemes are left as plain text.
url_regex = re.compile(
    r"""\(*  # Match any opening parentheses.
    \b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?)  # http://
    ([\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b  # xx.yy.tld(:##)?
    (?:[/?][^\s\{\}\|\\\^\[\]`<>"]*)?
        # /path/zz (excluding "unsafe" chars from RFC 1738,
        # except for # and ~, which happen in practice)
    """,
    re.IGNORECASE | re.VERBOSE | re.UNICODE,
)
|
|
|
|
|
2022-11-12 05:02:43 +00:00
|
|
|
|
2022-11-14 02:03:43 +00:00
|
|
|
# Attributes that are always kept on <a> tags.
_ALLOWED_A_ATTRIBUTES = frozenset({"href", "title", "class"})

# The only tokens permitted inside an <a rel="..."> attribute.
_ALLOWED_REL_VALUES = frozenset({"nofollow", "noopener", "noreferrer", "tag"})


def allow_a(tag: str, name: str, value: str) -> bool:
    """
    Attribute filter for <a> tags, used as a bleach attribute callable.

    Returns True when the attribute called `name` (with content `value`)
    may be kept, False when it must be dropped. `tag` is accepted to match
    the callable signature but is not inspected.
    """
    if name in _ALLOWED_A_ATTRIBUTES:
        return True
    if name == "rel":
        # Only allow rel attributes with a small subset of values
        # (we're defending against, for example, rel=me).
        # An empty rel has no tokens and therefore passes.
        return all(token in _ALLOWED_REL_VALUES for token in value.split())
    return False
|
|
|
|
|
|
|
|
|
2022-12-20 11:39:45 +00:00
|
|
|
def sanitize_html(post_html: str) -> str:
    """
    Only allows a, br and p tags.

    <p> may keep a class attribute; attributes on <a> are decided by
    allow_a. Disallowed markup is stripped (strip=True) rather than
    escaped, and the text is run through bleach's LinkifyFilter using
    url_regex. The cleaned result is returned marked as safe.
    """
    cleaner = bleach.Cleaner(
        tags=["br", "p", "a"],
        attributes={  # type:ignore
            "a": allow_a,
            "p": ["class"],
        },
        filters=[partial(LinkifyFilter, url_re=url_regex)],
        strip=True,
    )
    return mark_safe(cleaner.clean(post_html))
|
2022-11-22 04:18:13 +00:00
|
|
|
|
|
|
|
|
2022-12-28 00:42:30 +00:00
|
|
|
def strip_html(post_html: str, *, linkify: bool = True) -> str:
    """
    Strips all tags from the text, then (unless linkify is False)
    linkifies it.

    No tags are allowed (tags=[]) and disallowed markup is stripped rather
    than escaped. When `linkify` is true the LinkifyFilter is applied with
    url_regex; otherwise no filters run. The result is returned marked as
    safe.
    """
    cleaner = bleach.Cleaner(
        tags=[],
        strip=True,
        filters=[partial(LinkifyFilter, url_re=url_regex)] if linkify else [],
    )
    return mark_safe(cleaner.clean(post_html))
|
2022-11-27 19:09:08 +00:00
|
|
|
|
|
|
|
|
|
|
|
def html_to_plaintext(post_html: str) -> str:
    """
    Tries to do the inverse of the linebreaks filter.

    Existing newlines are dropped, <br> tags and closing </p> tags become
    newlines, every remaining tag is stripped, and surrounding whitespace
    is trimmed. The result is a plain str (it is not marked safe).
    """
    # TODO: Handle HTML entities
    # Remove all newlines, then replace br with a newline and /p with two (one comes from bleach)
    post_html = post_html.replace("\n", "")
    # Accept every spelling of the tag (<br>, <br/>, <br />, <BR>), not just
    # the literal "<br>", so line breaks are not silently lost.
    post_html = re.sub(r"<br\s*/?>", "\n", post_html, flags=re.IGNORECASE)
    post_html = post_html.replace("</p>", "\n")
    # Remove all other HTML and return
    cleaner = bleach.Cleaner(tags=[], strip=True, filters=[])
    return cleaner.clean(post_html).strip()
|
2022-12-20 11:39:45 +00:00
|
|
|
|
|
|
|
|
|
|
|
class ContentRenderer:
    """
    Renders HTML for posts, identity fields, and more.

    The `local` parameter affects whether links are absolute (False) or relative (True)
    """

    def __init__(self, local: bool):
        self.local = local

    def render_post(self, html: str, post) -> str:
        """
        Given post HTML, normalises it and renders it for presentation.

        The HTML is sanitised, mentions and hashtags are linkified, and
        (only when rendering locally) :emoji: shortcodes are turned into
        images using the post's own emojis as a cache. Empty input
        returns "".
        """
        if not html:
            return ""
        html = sanitize_html(html)
        html = self.linkify_mentions(html, post=post)
        html = self.linkify_hashtags(html, identity=post.author)
        if self.local:
            html = self.imageify_emojis(
                html,
                identity=post.author,
                emojis=post.emojis.all(),
            )
        html = self.remove_extra_newlines(html)
        return mark_safe(html)

    def render_identity_summary(self, html: str, identity, strip: bool = False) -> str:
        """
        Given identity summary HTML, normalises it and renders it for presentation.

        With `strip` true all tags are removed (strip_html); otherwise the
        HTML is sanitised. Hashtags are linkified, and emojis are rendered
        only when rendering locally. Empty input returns "".
        """
        if not html:
            return ""
        if strip:
            html = strip_html(html)
        else:
            html = sanitize_html(html)
        html = self.linkify_hashtags(html, identity=identity)
        if self.local:
            html = self.imageify_emojis(html, identity=identity)
        html = self.remove_extra_newlines(html)
        return mark_safe(html)

    def render_identity_data(self, html: str, identity, strip: bool = False) -> str:
        """
        Given name/basic value HTML, normalises it and renders it for presentation.

        Same pipeline as the identity summary, except that hashtags are
        not linkified. Empty input returns "".
        """
        if not html:
            return ""
        if strip:
            html = strip_html(html)
        else:
            html = sanitize_html(html)
        if self.local:
            html = self.imageify_emojis(html, identity=identity)
        html = self.remove_extra_newlines(html)
        return mark_safe(html)

    def linkify_mentions(self, html: str, post) -> str:
        """
        Links mentions _in the context of the post_ - as in, using the mentions
        property as the only source (as we might be doing this without other
        DB access allowed)
        """
        # Imported here rather than at module level, as in the original.
        from activities.models import Post

        # Lowercased "username" and "username@domain" -> profile URL
        possible_matches = {}
        for mention in post.mentions.all():
            if self.local:
                url = str(mention.urls.view)
            else:
                url = mention.absolute_profile_uri()
            # Might not have fetched it (yet)
            if mention.username:
                username = mention.username.lower()
                possible_matches[username] = url
                possible_matches[f"{username}@{mention.domain_id}"] = url

        # Remembers which full handle first claimed each short name, so a
        # second, different handle with the same short name is shown in full.
        collapse_name: dict[str, str] = {}

        def replacer(match):
            # Group 1 is the text preceding the mention, group 2 the handle.
            precursor = match.group(1)
            handle = match.group(2)
            if "@" in handle:
                short_handle = handle.split("@", 1)[0]
            else:
                short_handle = handle
            handle_hash = handle.lower()
            short_hash = short_handle.lower()
            if handle_hash in possible_matches:
                if short_hash not in collapse_name:
                    collapse_name[short_hash] = handle_hash
                elif collapse_name.get(short_hash) != handle_hash:
                    short_handle = handle
                return f'{precursor}<a href="{possible_matches[handle_hash]}">@{short_handle}</a>'
            else:
                # Not a mention recorded on this post; leave the text alone.
                return match.group()

        return Post.mention_regex.sub(replacer, html)

    def linkify_hashtags(self, html: str, identity) -> str:
        """
        Turns hashtag-looking text into links to the tag page.

        Links are relative ("/tags/<tag>/") when rendering locally and
        absolute on the identity's domain otherwise.
        """
        # Imported here rather than at module level, as in the original.
        from activities.models import Hashtag

        def replacer(attrs, new=False):
            # See if the text in this link looks like a hashtag
            if not Hashtag.hashtag_regex.match(attrs.get("_text", "")):
                return attrs
            hashtag = attrs["_text"].strip().lstrip("#")
            attrs[None, "class"] = "hashtag"
            if (None, "rel") in attrs:
                del attrs[None, "rel"]
            if self.local:
                attrs[None, "href"] = f"/tags/{hashtag.lower()}/"
            else:
                attrs[
                    None, "href"
                ] = f"https://{identity.domain.uri_domain}/tags/{hashtag.lower()}/"
            return attrs

        linker = bleach.linkifier.Linker(
            url_re=Hashtag.hashtag_regex, callbacks=[replacer]
        )
        return linker.linkify(html)

    def imageify_emojis(
        self, html: str, identity, include_local: bool = True, emojis=None
    ) -> str:
        """
        Find :emoji: in content and convert to <img>. If include_local is True,
        the local emoji will be used as a fallback for any shortcodes not defined
        by emojis.
        """
        # Imported here rather than at module level, as in the original.
        from activities.models import Emoji

        # If precached emojis were passed, prep them
        cached_emojis = {}
        if emojis:
            for emoji in emojis:
                cached_emojis[emoji.shortcode] = emoji

        def replacer(match):
            shortcode = match.group(1).lower()
            if shortcode in cached_emojis:
                return cached_emojis[shortcode].as_html()

            emoji = Emoji.get_by_domain(shortcode, identity.domain)
            if emoji and emoji.is_usable:
                return emoji.as_html()
            elif not emoji and include_local:
                # Nothing known for the identity's domain; retry the lookup
                # with no domain.
                emoji = Emoji.get_by_domain(shortcode, None)
                if emoji:
                    return emoji.as_html()

            # No usable emoji found: keep the original :shortcode: text.
            return match.group()

        return Emoji.emoji_regex.sub(replacer, html)

    def remove_extra_newlines(self, html: str) -> str:
        """
        Some clients are sensitive to extra newlines even though it's HTML
        """
        # TODO: More intelligent way to strip these?
        return html.replace("\n", "")
|