mirror of
https://github.com/jointakahe/takahe.git
synced 2024-11-22 15:21:01 +00:00
a6922cb9d6
This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place.
366 lines
12 KiB
Python
366 lines
12 KiB
Python
import html
|
|
import re
|
|
from html.parser import HTMLParser
|
|
|
|
from django.utils.safestring import mark_safe
|
|
|
|
|
|
class FediverseHtmlParser(HTMLParser):
|
|
"""
|
|
A custom HTML parser that only allows a certain tag subset and behaviour:
|
|
- br, p tags are passed through
|
|
- a tags are passed through if they're not hashtags or mentions
|
|
- Another set of tags are converted to p
|
|
|
|
It also linkifies URLs, mentions, hashtags, and imagifies emoji.
|
|
"""
|
|
|
|
REWRITE_TO_P = [
|
|
"p",
|
|
"h1",
|
|
"h2",
|
|
"h3",
|
|
"h4",
|
|
"h5",
|
|
"h6",
|
|
"blockquote",
|
|
"pre",
|
|
"ul",
|
|
"ol",
|
|
]
|
|
|
|
REWRITE_TO_BR = [
|
|
"br",
|
|
"li",
|
|
]
|
|
|
|
MENTION_REGEX = re.compile(
|
|
r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
|
|
)
|
|
|
|
HASHTAG_REGEX = re.compile(r"\B#([a-zA-Z0-9(_)]+\b)(?!;)")
|
|
|
|
EMOJI_REGEX = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")
|
|
|
|
URL_REGEX = re.compile(
|
|
r"""(\(* # Match any opening parentheses.
|
|
\b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?) # http://
|
|
(?:[\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?
|
|
(?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?)
|
|
# /path/zz (excluding "unsafe" chars from RFC 1738,
|
|
# except for # and ~, which happen in practice)
|
|
""",
|
|
re.IGNORECASE | re.VERBOSE | re.UNICODE,
|
|
)
|
|
|
|
def __init__(
|
|
self,
|
|
html: str,
|
|
uri_domain: str | None = None,
|
|
mentions: list | None = None,
|
|
find_mentions: bool = False,
|
|
find_hashtags: bool = False,
|
|
find_emojis: bool = False,
|
|
emoji_domain=None,
|
|
):
|
|
super().__init__()
|
|
self.uri_domain = uri_domain
|
|
self.emoji_domain = emoji_domain
|
|
self.find_mentions = find_mentions
|
|
self.find_hashtags = find_hashtags
|
|
self.find_emojis = find_emojis
|
|
self.calculate_mentions(mentions)
|
|
self._data_buffer = ""
|
|
self.html_output = ""
|
|
self.text_output = ""
|
|
self.emojis: set[str] = set()
|
|
self.mentions: set[str] = set()
|
|
self.hashtags: set[str] = set()
|
|
self._pending_a: dict | None = None
|
|
self._fresh_p = False
|
|
self.feed(html.replace("\n", ""))
|
|
self.flush_data()
|
|
|
|
def calculate_mentions(self, mentions: list | None):
|
|
"""
|
|
Prepares a set of content that we expect to see mentions look like
|
|
(this imp)
|
|
"""
|
|
self.mention_matches: dict[str, str] = {}
|
|
self.mention_aliases: dict[str, str] = {}
|
|
for mention in mentions or []:
|
|
if self.uri_domain:
|
|
url = mention.absolute_profile_uri()
|
|
else:
|
|
url = str(mention.urls.view)
|
|
if mention.username:
|
|
username = mention.username.lower()
|
|
domain = mention.domain_id.lower()
|
|
self.mention_matches[f"{username}"] = url
|
|
self.mention_matches[f"{username}@{domain}"] = url
|
|
|
|
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
|
if tag in self.REWRITE_TO_P:
|
|
self.flush_data()
|
|
self.html_output += "<p>"
|
|
elif tag in self.REWRITE_TO_BR:
|
|
self.flush_data()
|
|
if not self._fresh_p:
|
|
self.html_output += "<br>"
|
|
self.text_output += "\n"
|
|
elif tag == "a":
|
|
self.flush_data()
|
|
self._pending_a = {"attrs": dict(attrs), "content": ""}
|
|
self._fresh_p = tag in self.REWRITE_TO_P
|
|
|
|
def handle_endtag(self, tag: str) -> None:
|
|
self._fresh_p = False
|
|
if tag in self.REWRITE_TO_P:
|
|
self.flush_data()
|
|
self.html_output += "</p>"
|
|
self.text_output += "\n\n"
|
|
elif tag == "a":
|
|
if self._pending_a:
|
|
href = self._pending_a["attrs"].get("href")
|
|
content = self._pending_a["content"].strip()
|
|
# Is it a mention?
|
|
if content.lower().lstrip("@") in self.mention_matches:
|
|
self.html_output += self.create_mention(content)
|
|
self.text_output += content
|
|
# Is it a hashtag?
|
|
elif self.HASHTAG_REGEX.match(content):
|
|
self.html_output += self.create_hashtag(content)
|
|
self.text_output += content
|
|
elif content:
|
|
# Shorten the link if we need to
|
|
self.html_output += self.create_link(href, content)
|
|
self.text_output += href
|
|
self._pending_a = None
|
|
|
|
def handle_data(self, data: str) -> None:
|
|
self._fresh_p = False
|
|
if self._pending_a:
|
|
self._pending_a["content"] += data
|
|
else:
|
|
self._data_buffer += data
|
|
|
|
def flush_data(self) -> None:
|
|
"""
|
|
We collect data segments until we encounter a tag we care about,
|
|
so we can treat <span>#</span>hashtag as #hashtag
|
|
"""
|
|
self.text_output += self._data_buffer
|
|
self.html_output += self.linkify(self._data_buffer)
|
|
self._data_buffer = ""
|
|
|
|
def create_link(self, href, content):
|
|
"""
|
|
Generates a link, doing optional shortening.
|
|
|
|
All return values from this function should be HTML-safe.
|
|
"""
|
|
looks_like_link = bool(self.URL_REGEX.match(content))
|
|
if looks_like_link:
|
|
content = content.split("://", 1)[1]
|
|
if looks_like_link and len(content) > 30:
|
|
return f'<a href="{html.escape(href)}" rel="nofollow" class="ellipsis" title="{html.escape(content)}">{html.escape(content[:30])}</a>'
|
|
else:
|
|
return f'<a href="{html.escape(href)}" rel="nofollow">{html.escape(content)}</a>'
|
|
|
|
def create_mention(self, handle) -> str:
|
|
"""
|
|
Generates a mention link. Handle should have a leading @.
|
|
|
|
All return values from this function should be HTML-safe
|
|
"""
|
|
handle = handle.lstrip("@")
|
|
if "@" in handle:
|
|
short_handle = handle.split("@", 1)[0]
|
|
else:
|
|
short_handle = handle
|
|
handle_hash = handle.lower()
|
|
short_hash = short_handle.lower()
|
|
self.mentions.add(handle_hash)
|
|
url = self.mention_matches.get(handle_hash)
|
|
if url:
|
|
if short_hash not in self.mention_aliases:
|
|
self.mention_aliases[short_hash] = handle_hash
|
|
elif self.mention_aliases.get(short_hash) != handle_hash:
|
|
short_handle = handle
|
|
return f'<a href="{html.escape(url)}">@{html.escape(short_handle)}</a>'
|
|
else:
|
|
return "@" + html.escape(handle)
|
|
|
|
def create_hashtag(self, hashtag) -> str:
|
|
"""
|
|
Generates a hashtag link. Hashtag does not need to start with #
|
|
|
|
All return values from this function should be HTML-safe
|
|
"""
|
|
hashtag = hashtag.lstrip("#")
|
|
self.hashtags.add(hashtag.lower())
|
|
if self.uri_domain:
|
|
return f'<a href="https://{self.uri_domain}/tags/{hashtag.lower()}/">#{hashtag}</a>'
|
|
else:
|
|
return f'<a href="/tags/{hashtag.lower()}/">#{hashtag}</a>'
|
|
|
|
def create_emoji(self, shortcode) -> str:
|
|
"""
|
|
Generates an emoji <img> tag
|
|
|
|
All return values from this function should be HTML-safe
|
|
"""
|
|
from activities.models import Emoji
|
|
|
|
emoji = Emoji.get_by_domain(shortcode, self.emoji_domain)
|
|
if emoji and emoji.is_usable:
|
|
self.emojis.add(shortcode)
|
|
return emoji.as_html()
|
|
return f":{shortcode}:"
|
|
|
|
def linkify(self, data):
|
|
"""
|
|
Linkifies some content that is plaintext.
|
|
|
|
Handles URLs first, then mentions. Note that this takes great care to
|
|
keep track of what is HTML and what needs to be escaped.
|
|
"""
|
|
# Split the string by the URL regex so we know what to escape and what
|
|
# not to escape.
|
|
bits = self.URL_REGEX.split(data)
|
|
result = ""
|
|
# Even indices are data we should pass though, odd indices are links
|
|
for i, bit in enumerate(bits):
|
|
# A link!
|
|
if i % 2 == 1:
|
|
result += self.create_link(bit, bit)
|
|
# Not a link
|
|
elif self.mention_matches or self.find_mentions:
|
|
result += self.linkify_mentions(bit)
|
|
elif self.find_hashtags:
|
|
result += self.linkify_hashtags(bit)
|
|
elif self.find_emojis:
|
|
result += self.linkify_emoji(bit)
|
|
else:
|
|
result += html.escape(bit)
|
|
return result
|
|
|
|
def linkify_mentions(self, data):
|
|
"""
|
|
Linkifies mentions
|
|
"""
|
|
bits = self.MENTION_REGEX.split(data)
|
|
result = ""
|
|
for i, bit in enumerate(bits):
|
|
# Mention content
|
|
if i % 3 == 2:
|
|
result += self.create_mention(bit)
|
|
# Not part of a mention (0) or mention preamble (1)
|
|
elif self.find_hashtags:
|
|
result += self.linkify_hashtags(bit)
|
|
elif self.find_emojis:
|
|
result += self.linkify_emoji(bit)
|
|
else:
|
|
result += html.escape(bit)
|
|
return result
|
|
|
|
def linkify_hashtags(self, data):
|
|
"""
|
|
Linkifies hashtags
|
|
"""
|
|
bits = self.HASHTAG_REGEX.split(data)
|
|
result = ""
|
|
for i, bit in enumerate(bits):
|
|
# Not part of a hashtag
|
|
if i % 2 == 0:
|
|
if self.find_emojis:
|
|
result += self.linkify_emoji(bit)
|
|
else:
|
|
result += html.escape(bit)
|
|
# Hashtag content
|
|
else:
|
|
result += self.create_hashtag(bit)
|
|
return result
|
|
|
|
def linkify_emoji(self, data):
|
|
"""
|
|
Linkifies emoji
|
|
"""
|
|
bits = self.EMOJI_REGEX.split(data)
|
|
result = ""
|
|
for i, bit in enumerate(bits):
|
|
# Not part of an emoji
|
|
if i % 2 == 0:
|
|
result += html.escape(bit)
|
|
# Emoji content
|
|
else:
|
|
result += self.create_emoji(bit)
|
|
return result
|
|
|
|
@property
|
|
def html(self):
|
|
return self.html_output.strip()
|
|
|
|
@property
|
|
def plain_text(self):
|
|
return self.text_output.strip()
|
|
|
|
|
|
class ContentRenderer:
|
|
"""
|
|
Renders HTML for posts, identity fields, and more.
|
|
|
|
The `local` parameter affects whether links are absolute (False) or relative (True)
|
|
"""
|
|
|
|
def __init__(self, local: bool):
|
|
self.local = local
|
|
|
|
def render_post(self, html: str, post) -> str:
|
|
"""
|
|
Given post HTML, normalises it and renders it for presentation.
|
|
"""
|
|
if not html:
|
|
return ""
|
|
parser = FediverseHtmlParser(
|
|
html,
|
|
mentions=post.mentions.all(),
|
|
uri_domain=(None if self.local else post.author.domain.uri_domain),
|
|
find_hashtags=True,
|
|
find_emojis=True,
|
|
emoji_domain=post.author.domain,
|
|
)
|
|
return mark_safe(parser.html)
|
|
|
|
def render_identity_summary(self, html: str, identity) -> str:
|
|
"""
|
|
Given identity summary HTML, normalises it and renders it for presentation.
|
|
"""
|
|
if not html:
|
|
return ""
|
|
parser = FediverseHtmlParser(
|
|
html,
|
|
uri_domain=(None if self.local else identity.domain.uri_domain),
|
|
find_hashtags=True,
|
|
find_emojis=True,
|
|
emoji_domain=identity.domain,
|
|
)
|
|
return mark_safe(parser.html)
|
|
|
|
def render_identity_data(self, html: str, identity, strip: bool = False) -> str:
|
|
"""
|
|
Given name/basic value HTML, normalises it and renders it for presentation.
|
|
"""
|
|
if not html:
|
|
return ""
|
|
parser = FediverseHtmlParser(
|
|
html,
|
|
uri_domain=(None if self.local else identity.domain.uri_domain),
|
|
find_hashtags=False,
|
|
find_emojis=True,
|
|
emoji_domain=identity.domain,
|
|
)
|
|
if strip:
|
|
return mark_safe(parser.html)
|
|
else:
|
|
return mark_safe(parser.html)
|