mirror of
https://github.com/jointakahe/takahe.git
synced 2024-11-21 23:01:00 +00:00
Move to a new HTML parser/stripper
This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place.
This commit is contained in:
parent
93c0af992b
commit
a6922cb9d6
14 changed files with 503 additions and 562 deletions
|
@ -48,13 +48,7 @@ repos:
|
|||
- id: mypy
|
||||
exclude: "^tests/"
|
||||
additional_dependencies:
|
||||
[
|
||||
types-pyopenssl,
|
||||
types-bleach,
|
||||
types-mock,
|
||||
types-cachetools,
|
||||
types-python-dateutil,
|
||||
]
|
||||
[types-pyopenssl, types-mock, types-cachetools, types-python-dateutil]
|
||||
|
||||
- repo: https://github.com/rtts/djhtml
|
||||
rev: v1.5.2
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
from asgiref.sync import async_to_sync
|
||||
from django.contrib import admin
|
||||
from django.db import models
|
||||
from django.utils.safestring import mark_safe
|
||||
|
@ -165,7 +164,6 @@ class PostAdmin(admin.ModelAdmin):
|
|||
list_filter = ("type", "local", "visibility", "state", "created")
|
||||
raw_id_fields = ["emojis"]
|
||||
autocomplete_fields = ["to", "mentions", "author"]
|
||||
actions = ["reparse_hashtags"]
|
||||
search_fields = ["content", "search_handle", "search_service_handle"]
|
||||
inlines = [PostAttachmentInline]
|
||||
readonly_fields = ["created", "updated", "state_changed", "object_json"]
|
||||
|
@ -183,13 +181,6 @@ class PostAdmin(admin.ModelAdmin):
|
|||
)
|
||||
return super().get_search_results(request, queryset, search_term)
|
||||
|
||||
@admin.action(description="Reprocess content for hashtags")
|
||||
def reparse_hashtags(self, request, queryset):
|
||||
for instance in queryset:
|
||||
instance.hashtags = Hashtag.hashtags_from_content(instance.content) or None
|
||||
instance.save()
|
||||
async_to_sync(instance.ensure_hashtags)()
|
||||
|
||||
@admin.display(description="ActivityPub JSON")
|
||||
def object_json(self, instance):
|
||||
return instance.to_ap()
|
||||
|
|
|
@ -1,5 +1,4 @@
|
|||
import mimetypes
|
||||
import re
|
||||
from functools import partial
|
||||
from typing import ClassVar
|
||||
|
||||
|
@ -14,7 +13,7 @@ from django.db import models
|
|||
from django.utils.safestring import mark_safe
|
||||
|
||||
from core.files import get_remote_file
|
||||
from core.html import strip_html
|
||||
from core.html import FediverseHtmlParser
|
||||
from core.ld import format_ld_date
|
||||
from core.models import Config
|
||||
from core.uploads import upload_emoji_namer
|
||||
|
@ -134,8 +133,6 @@ class Emoji(StatorModel):
|
|||
admin_disable = "{admin}{self.pk}/disable/"
|
||||
admin_copy = "{admin}{self.pk}/copy/"
|
||||
|
||||
emoji_regex = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")
|
||||
|
||||
def delete(self, using=None, keep_parents=False):
|
||||
if self.file:
|
||||
self.file.delete()
|
||||
|
@ -242,7 +239,9 @@ class Emoji(StatorModel):
|
|||
Return a parsed and sanitized list of emoji found in content without
|
||||
the surrounding ':'.
|
||||
"""
|
||||
emoji_hits = cls.emoji_regex.findall(strip_html(content))
|
||||
emoji_hits = FediverseHtmlParser(
|
||||
content, find_emojis=True, emoji_domain=domain
|
||||
).emojis
|
||||
emojis = sorted({emoji.lower() for emoji in emoji_hits})
|
||||
return list(
|
||||
cls.objects.filter(local=(domain is None) or domain.local)
|
||||
|
|
|
@ -6,7 +6,6 @@ from asgiref.sync import sync_to_async
|
|||
from django.db import models
|
||||
from django.utils import timezone
|
||||
|
||||
from core.html import strip_html
|
||||
from core.models import Config
|
||||
from stator.models import State, StateField, StateGraph, StatorModel
|
||||
|
||||
|
@ -167,16 +166,6 @@ class Hashtag(StatorModel):
|
|||
results[date(year, month, day)] = val
|
||||
return dict(sorted(results.items(), reverse=True)[:num])
|
||||
|
||||
@classmethod
|
||||
def hashtags_from_content(cls, content) -> list[str]:
|
||||
"""
|
||||
Return a parsed and sanitized of hashtags found in content without
|
||||
leading '#'.
|
||||
"""
|
||||
hashtag_hits = cls.hashtag_regex.findall(strip_html(content))
|
||||
hashtags = sorted({tag.lower() for tag in hashtag_hits})
|
||||
return list(hashtags)
|
||||
|
||||
def to_mastodon_json(self):
|
||||
return {
|
||||
"name": self.hashtag,
|
||||
|
|
|
@ -2,7 +2,6 @@ import datetime
|
|||
import hashlib
|
||||
import json
|
||||
import mimetypes
|
||||
import re
|
||||
import ssl
|
||||
from collections.abc import Iterable
|
||||
from typing import Optional
|
||||
|
@ -26,7 +25,7 @@ from activities.models.post_types import (
|
|||
PostTypeDataEncoder,
|
||||
)
|
||||
from core.exceptions import capture_message
|
||||
from core.html import ContentRenderer, strip_html
|
||||
from core.html import ContentRenderer, FediverseHtmlParser
|
||||
from core.ld import (
|
||||
canonicalise,
|
||||
format_ld_date,
|
||||
|
@ -374,10 +373,6 @@ class Post(StatorModel):
|
|||
def clean_type_data(self, value):
|
||||
PostTypeData.parse_obj(value)
|
||||
|
||||
mention_regex = re.compile(
|
||||
r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
|
||||
)
|
||||
|
||||
def _safe_content_note(self, *, local: bool = True):
|
||||
return ContentRenderer(local=local).render_post(self.content, self)
|
||||
|
||||
|
@ -474,12 +469,12 @@ class Post(StatorModel):
|
|||
# Maintain local-only for replies
|
||||
if reply_to.visibility == reply_to.Visibilities.local_only:
|
||||
visibility = reply_to.Visibilities.local_only
|
||||
# Find hashtags in this post
|
||||
hashtags = Hashtag.hashtags_from_content(content) or None
|
||||
# Find emoji in this post
|
||||
emojis = Emoji.emojis_from_content(content, None)
|
||||
# Strip all HTML and apply linebreaks filter
|
||||
content = linebreaks_filter(strip_html(content))
|
||||
# Strip all unwanted HTML and apply linebreaks filter, grabbing hashtags on the way
|
||||
parser = FediverseHtmlParser(linebreaks_filter(content), find_hashtags=True)
|
||||
content = parser.html
|
||||
hashtags = sorted(parser.hashtags) or None
|
||||
# Make the Post object
|
||||
post = cls.objects.create(
|
||||
author=author,
|
||||
|
@ -512,12 +507,13 @@ class Post(StatorModel):
|
|||
):
|
||||
with transaction.atomic():
|
||||
# Strip all HTML and apply linebreaks filter
|
||||
self.content = linebreaks_filter(strip_html(content))
|
||||
parser = FediverseHtmlParser(linebreaks_filter(content))
|
||||
self.content = parser.html
|
||||
self.hashtags = sorted(parser.hashtags) or None
|
||||
self.summary = summary or None
|
||||
self.sensitive = bool(summary)
|
||||
self.visibility = visibility
|
||||
self.edited = timezone.now()
|
||||
self.hashtags = Hashtag.hashtags_from_content(content) or None
|
||||
self.mentions.set(self.mentions_from_content(content, self.author))
|
||||
self.emojis.set(Emoji.emojis_from_content(content, None))
|
||||
self.attachments.set(attachments or [])
|
||||
|
@ -525,9 +521,9 @@ class Post(StatorModel):
|
|||
|
||||
@classmethod
|
||||
def mentions_from_content(cls, content, author) -> set[Identity]:
|
||||
mention_hits = cls.mention_regex.findall(content)
|
||||
mention_hits = FediverseHtmlParser(content, find_mentions=True).mentions
|
||||
mentions = set()
|
||||
for precursor, handle in mention_hits:
|
||||
for handle in mention_hits:
|
||||
handle = handle.lower()
|
||||
if "@" in handle:
|
||||
username, domain = handle.split("@", 1)
|
||||
|
|
|
@ -14,7 +14,7 @@ from activities.models import (
|
|||
TimelineEvent,
|
||||
)
|
||||
from core.files import blurhash_image, resize_image
|
||||
from core.html import html_to_plaintext
|
||||
from core.html import FediverseHtmlParser
|
||||
from core.models import Config
|
||||
from users.decorators import identity_required
|
||||
|
||||
|
@ -112,7 +112,7 @@ class Compose(FormView):
|
|||
{
|
||||
"reply_to": self.reply_to.pk if self.reply_to else "",
|
||||
"visibility": self.post_obj.visibility,
|
||||
"text": html_to_plaintext(self.post_obj.content),
|
||||
"text": FediverseHtmlParser(self.post_obj.content).plain_text,
|
||||
"content_warning": self.post_obj.summary,
|
||||
}
|
||||
)
|
||||
|
|
642
core/html.py
642
core/html.py
|
@ -1,199 +1,309 @@
|
|||
import html
|
||||
import re
|
||||
from functools import partial
|
||||
from html.parser import HTMLParser
|
||||
|
||||
import bleach
|
||||
import bleach.callbacks
|
||||
from bleach.html5lib_shim import Filter
|
||||
from bleach.linkifier import LinkifyFilter
|
||||
from django.utils.safestring import mark_safe
|
||||
|
||||
url_regex = re.compile(
|
||||
r"""\(* # Match any opening parentheses.
|
||||
\b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?) # http://
|
||||
([\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?
|
||||
(?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
|
||||
|
||||
class FediverseHtmlParser(HTMLParser):
|
||||
"""
|
||||
A custom HTML parser that only allows a certain tag subset and behaviour:
|
||||
- br, p tags are passed through
|
||||
- a tags are passed through if they're not hashtags or mentions
|
||||
- Another set of tags are converted to p
|
||||
|
||||
It also linkifies URLs, mentions, hashtags, and imagifies emoji.
|
||||
"""
|
||||
|
||||
REWRITE_TO_P = [
|
||||
"p",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"blockquote",
|
||||
"pre",
|
||||
"ul",
|
||||
"ol",
|
||||
]
|
||||
|
||||
REWRITE_TO_BR = [
|
||||
"br",
|
||||
"li",
|
||||
]
|
||||
|
||||
MENTION_REGEX = re.compile(
|
||||
r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
|
||||
)
|
||||
|
||||
HASHTAG_REGEX = re.compile(r"\B#([a-zA-Z0-9(_)]+\b)(?!;)")
|
||||
|
||||
EMOJI_REGEX = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")
|
||||
|
||||
URL_REGEX = re.compile(
|
||||
r"""(\(* # Match any opening parentheses.
|
||||
\b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?) # http://
|
||||
(?:[\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?
|
||||
(?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?)
|
||||
# /path/zz (excluding "unsafe" chars from RFC 1738,
|
||||
# except for # and ~, which happen in practice)
|
||||
""",
|
||||
re.IGNORECASE | re.VERBOSE | re.UNICODE,
|
||||
)
|
||||
|
||||
ALLOWED_TAGS = ["br", "p", "a"]
|
||||
REWRITTEN_TAGS = [
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"blockquote",
|
||||
"pre",
|
||||
"ul",
|
||||
"ol",
|
||||
"li",
|
||||
]
|
||||
|
||||
|
||||
class MastodonStrictTagFilter(Filter):
|
||||
"""
|
||||
Implements Python equivalent of Mastodon tag rewriter
|
||||
|
||||
Clone of https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L55
|
||||
|
||||
Broadly this replaces all REWRITTEN_TAGS with `p` except for lists where it formats it into `<br>` lists
|
||||
"""
|
||||
|
||||
def __iter__(self):
|
||||
li_pending_break = False
|
||||
break_token = {
|
||||
"name": "br",
|
||||
"data": {},
|
||||
"type": "StartTag",
|
||||
}
|
||||
|
||||
for token in Filter.__iter__(self):
|
||||
if token.get("name") not in REWRITTEN_TAGS or token["type"] not in [
|
||||
"StartTag",
|
||||
"EndTag",
|
||||
]:
|
||||
yield token
|
||||
continue
|
||||
|
||||
if token["type"] == "StartTag":
|
||||
if token["name"] == "li":
|
||||
if li_pending_break:
|
||||
# Another `li` appeared, so break after the last one
|
||||
yield break_token
|
||||
continue
|
||||
token["name"] = "p"
|
||||
elif token["type"] == "EndTag":
|
||||
if token["name"] == "li":
|
||||
# Track that an `li` closed so we know a break should be considered
|
||||
li_pending_break = True
|
||||
continue
|
||||
if token["name"] == "ul":
|
||||
# If the last `li` happened, then don't add a break because Mastodon doesn't
|
||||
li_pending_break = False
|
||||
token["name"] = "p"
|
||||
|
||||
yield token
|
||||
|
||||
|
||||
class UnlinkifyFilter(Filter):
|
||||
"""
|
||||
Forcibly replaces link text with the href.
|
||||
|
||||
This is intented to be used when stripping <a> tags to preserve the link
|
||||
location at the expense of the link text.
|
||||
"""
|
||||
|
||||
def __iter__(self):
|
||||
discarding_a_text = False
|
||||
for token in Filter.__iter__(self):
|
||||
if token.get("name") == "a":
|
||||
if token["type"] == "EndTag":
|
||||
discarding_a_text = False
|
||||
continue
|
||||
href = token["data"].get((None, "href"))
|
||||
|
||||
# If <a> has an href, we use it and throw away all content
|
||||
# within the <a>...</a>. If href missing or empty, try to find
|
||||
# text within the <a>...</a>
|
||||
if href:
|
||||
yield {"data": href, "type": "Characters"}
|
||||
discarding_a_text = True
|
||||
continue
|
||||
elif not discarding_a_text:
|
||||
yield token
|
||||
# else: throw away tokens until we're out of the <a>
|
||||
|
||||
|
||||
def allow_a(tag: str, name: str, value: str):
|
||||
if name in ["href", "title", "class"]:
|
||||
return True
|
||||
elif name == "rel":
|
||||
# Only allow rel attributes with a small subset of values
|
||||
# (we're defending against, for example, rel=me)
|
||||
rel_values = value.split()
|
||||
if all(v in ["nofollow", "noopener", "noreferrer", "tag"] for v in rel_values):
|
||||
return True
|
||||
return False
|
||||
|
||||
|
||||
def shorten_link_text(attrs, new=False):
|
||||
"""
|
||||
Applies Mastodon's link shortening behavior where URL text links are
|
||||
shortened by removing the scheme and only showing the first 30 chars.
|
||||
|
||||
Orig:
|
||||
<a>https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened</a>
|
||||
|
||||
Becomes:
|
||||
<a>social.example.com/a-long/path</a>
|
||||
|
||||
"""
|
||||
text = attrs.get("_text")
|
||||
if not text:
|
||||
text = attrs.get((None, "href"))
|
||||
if text and "://" in text and len(text) > 30:
|
||||
text = text.split("://", 1)[-1]
|
||||
attrs["_text"] = text[:30]
|
||||
if len(text) > 30:
|
||||
attrs[(None, "class")] = " ".join(
|
||||
filter(None, [attrs.pop((None, "class"), ""), "ellipsis"])
|
||||
)
|
||||
# Add the full URL in to title for easier user inspection
|
||||
attrs[(None, "title")] = attrs.get((None, "href"))
|
||||
|
||||
return attrs
|
||||
|
||||
|
||||
linkify_callbacks = [bleach.callbacks.nofollow, shorten_link_text]
|
||||
|
||||
|
||||
def sanitize_html(post_html: str) -> str:
|
||||
"""
|
||||
Only allows a, br, p and span tags, and class attributes.
|
||||
"""
|
||||
cleaner = bleach.Cleaner(
|
||||
tags=ALLOWED_TAGS + REWRITTEN_TAGS,
|
||||
attributes={ # type:ignore
|
||||
"a": allow_a,
|
||||
"p": ["class"],
|
||||
},
|
||||
filters=[
|
||||
partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks),
|
||||
MastodonStrictTagFilter,
|
||||
],
|
||||
strip=True,
|
||||
""",
|
||||
re.IGNORECASE | re.VERBOSE | re.UNICODE,
|
||||
)
|
||||
return mark_safe(cleaner.clean(post_html))
|
||||
|
||||
def __init__(
    self,
    html: str,
    uri_domain: str | None = None,
    mentions: list | None = None,
    find_mentions: bool = False,
    find_hashtags: bool = False,
    find_emojis: bool = False,
    emoji_domain=None,
):
    """
    Parses the given HTML straight away; once construction returns, the
    results are available on html_output/text_output and on the emojis,
    mentions and hashtags sets.

    - html: the markup to parse (newlines are removed before parsing)
    - uri_domain: when set, mention and hashtag links are generated as
      absolute URLs for that domain rather than local paths
    - mentions: objects describing the identities mentioned in the content
      (see calculate_mentions for the attributes that are read)
    - find_mentions / find_hashtags / find_emojis: enable searching plain
      text for @mentions, #hashtags and :emoji: respectively
    - emoji_domain: passed to the emoji lookup in create_emoji
    """
    super().__init__()
    self.uri_domain = uri_domain
    self.emoji_domain = emoji_domain
    self.find_mentions = find_mentions
    self.find_hashtags = find_hashtags
    self.find_emojis = find_emojis
    self.calculate_mentions(mentions)
    self._data_buffer = ""
    self.html_output = ""
    self.text_output = ""
    self.emojis: set[str] = set()
    self.mentions: set[str] = set()
    self.hashtags: set[str] = set()
    self._pending_a: dict | None = None
    self._fresh_p = False
    self.feed(html.replace("\n", ""))
    # HTMLParser holds back trailing input it cannot classify yet - for
    # example a dangling "<", or text ending in something that could be a
    # half-received character reference such as "AT&T". close() forces
    # that remainder through as data so it is not silently dropped.
    self.close()
    self.flush_data()
|
||||
|
||||
def strip_html(post_html: str, *, linkify: bool = True) -> str:
|
||||
"""
|
||||
Strips all tags from the text, then linkifies it.
|
||||
"""
|
||||
cleaner = bleach.Cleaner(
|
||||
tags=[],
|
||||
strip=True,
|
||||
filters=[partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks)]
|
||||
if linkify
|
||||
else [UnlinkifyFilter],
|
||||
)
|
||||
return mark_safe(cleaner.clean(post_html))
|
||||
def calculate_mentions(self, mentions: list | None):
|
||||
"""
|
||||
Prepares a set of content that we expect to see mentions look like
|
||||
(this imp)
|
||||
"""
|
||||
self.mention_matches: dict[str, str] = {}
|
||||
self.mention_aliases: dict[str, str] = {}
|
||||
for mention in mentions or []:
|
||||
if self.uri_domain:
|
||||
url = mention.absolute_profile_uri()
|
||||
else:
|
||||
url = str(mention.urls.view)
|
||||
if mention.username:
|
||||
username = mention.username.lower()
|
||||
domain = mention.domain_id.lower()
|
||||
self.mention_matches[f"{username}"] = url
|
||||
self.mention_matches[f"{username}@{domain}"] = url
|
||||
|
||||
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
||||
if tag in self.REWRITE_TO_P:
|
||||
self.flush_data()
|
||||
self.html_output += "<p>"
|
||||
elif tag in self.REWRITE_TO_BR:
|
||||
self.flush_data()
|
||||
if not self._fresh_p:
|
||||
self.html_output += "<br>"
|
||||
self.text_output += "\n"
|
||||
elif tag == "a":
|
||||
self.flush_data()
|
||||
self._pending_a = {"attrs": dict(attrs), "content": ""}
|
||||
self._fresh_p = tag in self.REWRITE_TO_P
|
||||
|
||||
def html_to_plaintext(post_html: str) -> str:
|
||||
"""
|
||||
Tries to do the inverse of the linebreaks filter.
|
||||
"""
|
||||
# TODO: Handle HTML entities
|
||||
# Remove all newlines, then replace br with a newline and /p with two (one comes from bleach)
|
||||
post_html = post_html.replace("\n", "").replace("<br>", "\n").replace("</p>", "\n")
|
||||
# Remove all other HTML and return
|
||||
cleaner = bleach.Cleaner(tags=["a"], strip=True, filters=[UnlinkifyFilter])
|
||||
return cleaner.clean(post_html).strip()
|
||||
def handle_endtag(self, tag: str) -> None:
    """
    Handles a closing tag.

    Paragraph-like tags close the current <p>. A closing </a> resolves the
    pending link collected since the matching <a>: it becomes a mention if
    its text is a known handle, a hashtag if its text looks like one, and
    a normal (possibly shortened) link otherwise. Other tags are ignored.
    """
    self._fresh_p = False
    if tag in self.REWRITE_TO_P:
        self.flush_data()
        self.html_output += "</p>"
        self.text_output += "\n\n"
    elif tag == "a":
        if self._pending_a:
            href = self._pending_a["attrs"].get("href")
            content = self._pending_a["content"].strip()
            # Is it a mention?
            if content.lower().lstrip("@") in self.mention_matches:
                self.html_output += self.create_mention(content)
                self.text_output += content
            # Is it a hashtag?
            elif self.HASHTAG_REGEX.match(content):
                self.html_output += self.create_hashtag(content)
                self.text_output += content
            elif content and href:
                # Shorten the link if we need to
                self.html_output += self.create_link(href, content)
                self.text_output += href
            elif content:
                # <a> without an href (or with a valueless one, which the
                # parser reports as None): there is nothing to link to, so
                # keep the text instead of failing on the missing href.
                self.html_output += html.escape(content)
                self.text_output += content
        self._pending_a = None
|
||||
|
||||
def handle_data(self, data: str) -> None:
    """
    Receives a run of text. Inside a pending <a> it is appended to that
    link's content; otherwise it goes into the data buffer to be written
    out by the next flush_data call.
    """
    self._fresh_p = False
    anchor = self._pending_a
    if not anchor:
        self._data_buffer += data
        return
    anchor["content"] += data
|
||||
|
||||
def flush_data(self) -> None:
    """
    Writes out the text gathered so far and empties the buffer.

    Text is buffered across tags we don't care about so that something
    like <span>#</span>hashtag is seen as the single string #hashtag. The
    buffer goes to the plain-text output as-is and to the HTML output via
    linkify.
    """
    pending = self._data_buffer
    self.text_output += pending
    self.html_output += self.linkify(pending)
    self._data_buffer = ""
|
||||
|
||||
def create_link(self, href, content):
    """
    Generates a link, doing optional shortening.

    If the link text itself looks like a URL, its scheme is dropped, and
    if what remains is longer than 30 characters only the first 30 are
    shown (with the "ellipsis" class set and the unshortened text in the
    title attribute). Any other link text is kept as it is.

    If href is None there is nothing to link to, and the escaped text is
    returned on its own.

    All return values from this function should be HTML-safe.
    """
    if href is None:
        # html.escape(None) would raise; fall back to plain escaped text
        return html.escape(content)
    label = content
    extra_attrs = ""
    if self.URL_REGEX.match(content):
        # A URL_REGEX match always contains "://", so index 1 exists
        label = content.split("://", 1)[1]
        if len(label) > 30:
            extra_attrs = f' class="ellipsis" title="{html.escape(label)}"'
            label = label[:30]
    return f'<a href="{html.escape(href)}" rel="nofollow"{extra_attrs}>{html.escape(label)}</a>'
|
||||
|
||||
def create_mention(self, handle) -> str:
    """
    Generates a mention link and records the lowercased handle in
    self.mentions. Leading @ characters on the handle are ignored.

    A known handle is linked and shown by its username alone, unless that
    username has already been used for a different full handle in this
    content, in which case the full handle is shown. An unknown handle is
    returned as escaped text.

    All return values from this function should be HTML-safe
    """
    bare = handle.lstrip("@")
    local_part = bare.partition("@")[0]
    key = bare.lower()
    self.mentions.add(key)
    target = self.mention_matches.get(key)
    if not target:
        return "@" + html.escape(bare)
    # The first full handle seen for a username owns the short form
    owner = self.mention_aliases.setdefault(local_part.lower(), key)
    shown = local_part if owner == key else bare
    return f'<a href="{html.escape(target)}">@{html.escape(shown)}</a>'
|
||||
|
||||
def create_hashtag(self, hashtag) -> str:
    """
    Generates a hashtag link and records the lowercased tag in
    self.hashtags. Hashtag does not need to start with #

    The link is absolute when a uri_domain is set and a local /tags/ path
    otherwise.

    All return values from this function should be HTML-safe
    """
    hashtag = hashtag.lstrip("#")
    self.hashtags.add(hashtag.lower())
    # The value is not guaranteed to be a clean tag: when it comes from the
    # text of an <a> element only its beginning has been checked against
    # the hashtag pattern. Escape it everywhere it is interpolated (a no-op
    # for the characters a real hashtag can contain).
    slug = html.escape(hashtag.lower())
    label = html.escape(hashtag)
    if self.uri_domain:
        return f'<a href="https://{self.uri_domain}/tags/{slug}/">#{label}</a>'
    else:
        return f'<a href="/tags/{slug}/">#{label}</a>'
|
||||
|
||||
def create_emoji(self, shortcode) -> str:
    """
    Generates an emoji <img> tag for the shortcode, looked up against
    self.emoji_domain, and records the shortcode in self.emojis.

    If no usable emoji is found, the original :shortcode: text is returned
    and nothing is recorded.

    All return values from this function should be HTML-safe
    """
    from activities.models import Emoji

    found = Emoji.get_by_domain(shortcode, self.emoji_domain)
    if not (found and found.is_usable):
        return f":{shortcode}:"
    self.emojis.add(shortcode)
    return found.as_html()
|
||||
|
||||
def linkify(self, data):
    """
    Linkifies some content that is plaintext.

    Handles URLs first; the text between URLs is then passed on to the
    mention, hashtag or emoji linkifier (whichever is the first one
    enabled), or simply escaped if none are. Note that this takes great
    care to keep track of what is HTML and what needs to be escaped.
    """
    # Split the string by the URL regex so we know what to escape and what
    # not to escape.
    bits = self.URL_REGEX.split(data)
    result = ""
    # Even indices are data we should pass through, odd indices are links
    for i, bit in enumerate(bits):
        # A link!
        if i % 2 == 1:
            # The URL pattern also captures any opening parentheses in
            # front of the URL, and a closing one usually ends up matched
            # as part of the path. Peel the wrapping parentheses off so
            # "(https://example.com/x)" links to the URL itself, and put
            # them back around the link as plain text.
            url, prefix, suffix = bit, "", ""
            while url.startswith("("):
                prefix += "("
                url = url[1:]
                if url.endswith(")"):
                    suffix = ")" + suffix
                    url = url[:-1]
            # A trailing ")" with no "(" anywhere in the URL belongs to the
            # surrounding sentence, not the link
            while url.endswith(")") and "(" not in url:
                suffix = ")" + suffix
                url = url[:-1]
            result += prefix + self.create_link(url, url) + suffix
        # Not a link
        elif self.mention_matches or self.find_mentions:
            result += self.linkify_mentions(bit)
        elif self.find_hashtags:
            result += self.linkify_hashtags(bit)
        elif self.find_emojis:
            result += self.linkify_emoji(bit)
        else:
            result += html.escape(bit)
    return result
|
||||
|
||||
def linkify_mentions(self, data):
    """
    Linkifies mentions in a piece of plain text. Everything that is not a
    mention handle is passed on to the hashtag or emoji linkifier
    (whichever is the first one enabled), or escaped if neither is.
    """
    if self.find_hashtags:
        render_other = self.linkify_hashtags
    elif self.find_emojis:
        render_other = self.linkify_emoji
    else:
        render_other = html.escape
    # The pattern has two groups, so the split result repeats in threes:
    # surrounding text (0), the character before the @ (1), the handle (2)
    pieces = [
        self.create_mention(chunk) if pos % 3 == 2 else render_other(chunk)
        for pos, chunk in enumerate(self.MENTION_REGEX.split(data))
    ]
    return "".join(pieces)
|
||||
|
||||
def linkify_hashtags(self, data):
    """
    Linkifies hashtags in a piece of plain text. The text around them is
    passed to the emoji linkifier when that is enabled, and escaped
    otherwise.
    """
    render_text = self.linkify_emoji if self.find_emojis else html.escape
    out = []
    # Splitting on a one-group pattern alternates text (even) and tag (odd)
    for pos, chunk in enumerate(self.HASHTAG_REGEX.split(data)):
        if pos % 2:
            out.append(self.create_hashtag(chunk))
        else:
            out.append(render_text(chunk))
    return "".join(out)
|
||||
|
||||
def linkify_emoji(self, data):
    """
    Replaces :shortcode: sequences in a piece of plain text with whatever
    create_emoji returns for them, escaping the text in between.
    """
    out = []
    # Splitting on a one-group pattern alternates text (even) and
    # shortcode (odd)
    for pos, chunk in enumerate(self.EMOJI_REGEX.split(data)):
        if pos % 2:
            out.append(self.create_emoji(chunk))
        else:
            out.append(html.escape(chunk))
    return "".join(out)
|
||||
|
||||
@property
def html(self):
    """
    The generated HTML, with leading and trailing whitespace removed.
    """
    rendered = self.html_output
    return rendered.strip()
|
||||
|
||||
@property
def plain_text(self):
    """
    The plain-text version of the content, with leading and trailing
    whitespace removed.
    """
    rendered = self.text_output
    return rendered.strip()
|
||||
|
||||
|
||||
class ContentRenderer:
|
||||
|
@ -212,33 +322,30 @@ class ContentRenderer:
|
|||
"""
|
||||
if not html:
|
||||
return ""
|
||||
html = sanitize_html(html)
|
||||
html = self.linkify_mentions(html, post=post)
|
||||
html = self.linkify_hashtags(html, identity=post.author)
|
||||
if self.local:
|
||||
html = self.imageify_emojis(
|
||||
html,
|
||||
identity=post.author,
|
||||
emojis=post.emojis.all(),
|
||||
)
|
||||
html = self.remove_extra_newlines(html)
|
||||
return mark_safe(html)
|
||||
parser = FediverseHtmlParser(
|
||||
html,
|
||||
mentions=post.mentions.all(),
|
||||
uri_domain=(None if self.local else post.author.domain.uri_domain),
|
||||
find_hashtags=True,
|
||||
find_emojis=True,
|
||||
emoji_domain=post.author.domain,
|
||||
)
|
||||
return mark_safe(parser.html)
|
||||
|
||||
def render_identity_summary(self, html: str, identity, strip: bool = False) -> str:
|
||||
def render_identity_summary(self, html: str, identity) -> str:
|
||||
"""
|
||||
Given identity summary HTML, normalises it and renders it for presentation.
|
||||
"""
|
||||
if not html:
|
||||
return ""
|
||||
if strip:
|
||||
html = strip_html(html)
|
||||
else:
|
||||
html = sanitize_html(html)
|
||||
html = self.linkify_hashtags(html, identity=identity)
|
||||
if self.local:
|
||||
html = self.imageify_emojis(html, identity=identity)
|
||||
html = self.remove_extra_newlines(html)
|
||||
return mark_safe(html)
|
||||
parser = FediverseHtmlParser(
|
||||
html,
|
||||
uri_domain=(None if self.local else identity.domain.uri_domain),
|
||||
find_hashtags=True,
|
||||
find_emojis=True,
|
||||
emoji_domain=identity.domain,
|
||||
)
|
||||
return mark_safe(parser.html)
|
||||
|
||||
def render_identity_data(self, html: str, identity, strip: bool = False) -> str:
|
||||
"""
|
||||
|
@ -246,117 +353,14 @@ class ContentRenderer:
|
|||
"""
|
||||
if not html:
|
||||
return ""
|
||||
if strip:
|
||||
html = strip_html(html)
|
||||
else:
|
||||
html = sanitize_html(html)
|
||||
if self.local:
|
||||
html = self.imageify_emojis(html, identity=identity)
|
||||
html = self.remove_extra_newlines(html)
|
||||
return mark_safe(html)
|
||||
|
||||
def linkify_mentions(self, html: str, post) -> str:
|
||||
"""
|
||||
Links mentions _in the context of the post_ - as in, using the mentions
|
||||
property as the only source (as we might be doing this without other
|
||||
DB access allowed)
|
||||
"""
|
||||
from activities.models import Post
|
||||
|
||||
possible_matches = {}
|
||||
for mention in post.mentions.all():
|
||||
if self.local:
|
||||
url = str(mention.urls.view)
|
||||
else:
|
||||
url = mention.absolute_profile_uri()
|
||||
# Might not have fetched it (yet)
|
||||
if mention.username:
|
||||
username = mention.username.lower()
|
||||
possible_matches[username] = url
|
||||
possible_matches[f"{username}@{mention.domain_id}"] = url
|
||||
|
||||
collapse_name: dict[str, str] = {}
|
||||
|
||||
def replacer(match):
|
||||
precursor = match.group(1)
|
||||
handle = match.group(2)
|
||||
if "@" in handle:
|
||||
short_handle = handle.split("@", 1)[0]
|
||||
else:
|
||||
short_handle = handle
|
||||
handle_hash = handle.lower()
|
||||
short_hash = short_handle.lower()
|
||||
if handle_hash in possible_matches:
|
||||
if short_hash not in collapse_name:
|
||||
collapse_name[short_hash] = handle_hash
|
||||
elif collapse_name.get(short_hash) != handle_hash:
|
||||
short_handle = handle
|
||||
return f'{precursor}<a href="{possible_matches[handle_hash]}">@{short_handle}</a>'
|
||||
else:
|
||||
return match.group()
|
||||
|
||||
return Post.mention_regex.sub(replacer, html)
|
||||
|
||||
def linkify_hashtags(self, html, identity) -> str:
|
||||
from activities.models import Hashtag
|
||||
|
||||
def replacer(attrs, new=False):
|
||||
# See if the text in this link looks like a hashtag
|
||||
if not Hashtag.hashtag_regex.match(attrs.get("_text", "")):
|
||||
return attrs
|
||||
hashtag = attrs["_text"].strip().lstrip("#")
|
||||
attrs[None, "class"] = "hashtag"
|
||||
if (None, "rel") in attrs:
|
||||
del attrs[None, "rel"]
|
||||
if self.local:
|
||||
attrs[None, "href"] = f"/tags/{hashtag.lower()}/"
|
||||
else:
|
||||
attrs[
|
||||
None, "href"
|
||||
] = f"https://{identity.domain.uri_domain}/tags/{hashtag.lower()}/"
|
||||
return attrs
|
||||
|
||||
linker = bleach.linkifier.Linker(
|
||||
url_re=Hashtag.hashtag_regex, callbacks=[replacer]
|
||||
parser = FediverseHtmlParser(
|
||||
html,
|
||||
uri_domain=(None if self.local else identity.domain.uri_domain),
|
||||
find_hashtags=False,
|
||||
find_emojis=True,
|
||||
emoji_domain=identity.domain,
|
||||
)
|
||||
return linker.linkify(html)
|
||||
|
||||
def imageify_emojis(
|
||||
self, html: str, identity, include_local: bool = True, emojis=None
|
||||
):
|
||||
"""
|
||||
Find :emoji: in content and convert to <img>. If include_local is True,
|
||||
the local emoji will be used as a fallback for any shortcodes not defined
|
||||
by emojis.
|
||||
"""
|
||||
from activities.models import Emoji
|
||||
|
||||
# If precached emojis were passed, prep them
|
||||
cached_emojis = {}
|
||||
if emojis:
|
||||
for emoji in emojis:
|
||||
cached_emojis[emoji.shortcode] = emoji
|
||||
|
||||
def replacer(match):
|
||||
shortcode = match.group(1).lower()
|
||||
if shortcode in cached_emojis:
|
||||
return cached_emojis[shortcode].as_html()
|
||||
|
||||
emoji = Emoji.get_by_domain(shortcode, identity.domain)
|
||||
if emoji and emoji.is_usable:
|
||||
return emoji.as_html()
|
||||
elif not emoji and include_local:
|
||||
emoji = Emoji.get_by_domain(shortcode, None)
|
||||
if emoji:
|
||||
return emoji.as_html()
|
||||
|
||||
return match.group()
|
||||
|
||||
return Emoji.emoji_regex.sub(replacer, html)
|
||||
|
||||
def remove_extra_newlines(self, html: str) -> str:
|
||||
"""
|
||||
Some clients are sensitive to extra newlines even though it's HTML
|
||||
"""
|
||||
# TODO: More intelligent way to strip these?
|
||||
return html.replace("\n", "")
|
||||
if strip:
|
||||
return mark_safe(parser.html)
|
||||
else:
|
||||
return mark_safe(parser.html)
|
||||
|
|
|
@ -1,4 +1,3 @@
|
|||
bleach~=5.0.1
|
||||
blurhash-python~=1.1.3
|
||||
cachetools~=5.2.0
|
||||
cryptography~=39.0
|
||||
|
|
|
@ -1,44 +0,0 @@
|
|||
from activities.models import Hashtag
|
||||
from core.html import ContentRenderer
|
||||
|
||||
|
||||
def test_hashtag_from_content():
|
||||
assert Hashtag.hashtags_from_content("#hashtag") == ["hashtag"]
|
||||
assert Hashtag.hashtags_from_content("a#hashtag") == []
|
||||
assert Hashtag.hashtags_from_content("Text #with #hashtag in it") == [
|
||||
"hashtag",
|
||||
"with",
|
||||
]
|
||||
assert Hashtag.hashtags_from_content("#hashtag.") == ["hashtag"]
|
||||
assert Hashtag.hashtags_from_content("More text\n#one # two ##three #hashtag!") == [
|
||||
"hashtag",
|
||||
"one",
|
||||
"three",
|
||||
]
|
||||
assert Hashtag.hashtags_from_content("my #html loves   entities") == ["html"]
|
||||
assert Hashtag.hashtags_from_content("<span class='hash'>#</span>tag") == ["tag"]
|
||||
|
||||
|
||||
def test_linkify_hashtag():
|
||||
linkify = lambda html: ContentRenderer(local=True).linkify_hashtags(html, None)
|
||||
|
||||
assert linkify("# hashtag") == "# hashtag"
|
||||
assert (
|
||||
linkify('<a href="/url/with#anchor">Text</a>')
|
||||
== '<a href="/url/with#anchor">Text</a>'
|
||||
)
|
||||
assert (
|
||||
linkify("#HashTag") == '<a href="/tags/hashtag/" class="hashtag">#HashTag</a>'
|
||||
)
|
||||
assert (
|
||||
linkify(
|
||||
"""A longer text #bigContent
|
||||
with #tags, linebreaks, and
|
||||
maybe a few <a href="https://awesome.sauce/about#spicy">links</a>
|
||||
#allTheTags #AllTheTags #ALLTHETAGS"""
|
||||
)
|
||||
== """A longer text <a href="/tags/bigcontent/" class="hashtag">#bigContent</a>
|
||||
with <a href="/tags/tags/" class="hashtag">#tags</a>, linebreaks, and
|
||||
maybe a few <a href="https://awesome.sauce/about#spicy">links</a>
|
||||
<a href="/tags/allthetags/" class="hashtag">#allTheTags</a> <a href="/tags/allthetags/" class="hashtag">#AllTheTags</a> <a href="/tags/allthetags/" class="hashtag">#ALLTHETAGS</a>"""
|
||||
)
|
|
@ -1,5 +1,7 @@
|
|||
import pytest
|
||||
|
||||
from activities.models import Post
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_post_status(api_token, identity, client):
|
||||
|
@ -15,3 +17,44 @@ def test_post_status(api_token, identity, client):
|
|||
).json()
|
||||
assert response["content"] == "<p>Hello, world!</p>"
|
||||
assert response["visibility"] == "unlisted"
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_mention_format(api_token, identity, remote_identity, client):
|
||||
"""
|
||||
Ensures mentions work, and only have one link around them.
|
||||
"""
|
||||
# Make a local post and check it
|
||||
response = client.post(
|
||||
"/api/v1/statuses",
|
||||
HTTP_AUTHORIZATION=f"Bearer {api_token.token}",
|
||||
HTTP_ACCEPT="application/json",
|
||||
content_type="application/json",
|
||||
data={
|
||||
"status": "Hello, @test!",
|
||||
"visibility": "unlisted",
|
||||
},
|
||||
).json()
|
||||
assert (
|
||||
response["content"]
|
||||
== '<p>Hello, <a href="https://example.com/@test/">@test</a>!</p>'
|
||||
)
|
||||
assert response["visibility"] == "unlisted"
|
||||
|
||||
# Make a remote post and check it
|
||||
post = Post.objects.create(
|
||||
local=False,
|
||||
author=remote_identity,
|
||||
content='<p>Hey <a href="https://example.com/@test/" class="u-url mention" rel="nofollow">@test</a></p>',
|
||||
object_uri="https://remote.test/status/12345",
|
||||
)
|
||||
post.mentions.add(identity)
|
||||
response = client.get(
|
||||
f"/api/v1/statuses/{post.id}",
|
||||
HTTP_AUTHORIZATION=f"Bearer {api_token.token}",
|
||||
HTTP_ACCEPT="application/json",
|
||||
content_type="application/json",
|
||||
).json()
|
||||
assert (
|
||||
response["text"] == '<p>Hey <a href="https://example.com/@test/">@test</a></p>'
|
||||
)
|
||||
|
|
|
@ -1,155 +1,117 @@
|
|||
from unittest.mock import Mock
|
||||
|
||||
import pytest
|
||||
|
||||
from core.html import ContentRenderer, html_to_plaintext, sanitize_html
|
||||
|
||||
|
||||
def test_html_to_plaintext():
|
||||
|
||||
assert html_to_plaintext("<p>Hi!</p>") == "Hi!"
|
||||
assert html_to_plaintext("<p>Hi!<br>There</p>") == "Hi!\nThere"
|
||||
assert (
|
||||
html_to_plaintext("<p>Hi!</p>\n\n<p>How are you?</p>") == "Hi!\n\nHow are you?"
|
||||
)
|
||||
|
||||
assert (
|
||||
html_to_plaintext("<p>Hi!</p>\n\n<p>How are<br> you?</p><p>today</p>")
|
||||
== "Hi!\n\nHow are\n you?\n\ntoday"
|
||||
)
|
||||
|
||||
assert (
|
||||
html_to_plaintext(
|
||||
'<p><a href="https://fedi.takahe.social/with/a/long/path">'
|
||||
'<b>The</b> <img src="takahe.png"> Link</a> '
|
||||
'<a href="">Empty href</a> '
|
||||
"<a>Empty A</a></p>"
|
||||
)
|
||||
== "https://fedi.takahe.social/with/a/long/path Empty href Empty A"
|
||||
)
|
||||
|
||||
|
||||
def test_sanitize_post():
|
||||
|
||||
assert sanitize_html("<p>Hello!</p>") == "<p>Hello!</p>"
|
||||
assert sanitize_html("<p>It's great</p>") == "<p>It's great</p>"
|
||||
|
||||
# Note that we only want to linkify things with protocol prefixes to prevent
|
||||
# too many false positives.
|
||||
assert sanitize_html("<p>test.com</p>") == "<p>test.com</p>"
|
||||
assert (
|
||||
sanitize_html("<p>https://test.com</p>")
|
||||
== '<p><a href="https://test.com" rel="nofollow">https://test.com</a></p>'
|
||||
)
|
||||
assert (
|
||||
sanitize_html("<p>@someone@subdomain.some-domain.com</p>")
|
||||
== "<p>@someone@subdomain.some-domain.com</p>"
|
||||
)
|
||||
|
||||
|
||||
def test_shorten_url():
|
||||
full_url = (
|
||||
"https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened"
|
||||
)
|
||||
assert (
|
||||
sanitize_html(f"<p>{full_url}</p>")
|
||||
== f'<p><a href="{full_url}" rel="nofollow" class="ellipsis" title="{full_url}">social.example.com/a-long/path</a></p>'
|
||||
)
|
||||
|
||||
assert (
|
||||
sanitize_html(
|
||||
f'<p><a href="{full_url}">This is a long link text, but cannot be shortened as a URL</a></p>'
|
||||
)
|
||||
== f'<p><a href="{full_url}" rel="nofollow">This is a long link text, but cannot be shortened as a URL</a></p>'
|
||||
)
|
||||
from core.html import FediverseHtmlParser
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_link_preservation():
|
||||
def test_parser(identity):
|
||||
"""
|
||||
We want to:
|
||||
- Preserve incoming links from other servers
|
||||
- Linkify mentions and hashtags
|
||||
- Not have these all step on each other!
|
||||
Validates the HtmlParser in its various output modes
|
||||
"""
|
||||
renderer = ContentRenderer(local=True)
|
||||
fake_mention = Mock()
|
||||
fake_mention.username = "andrew"
|
||||
fake_mention.domain_id = "aeracode.org"
|
||||
fake_mention.urls.view = "/@andrew@aeracode.org/"
|
||||
fake_post = Mock()
|
||||
fake_post.mentions.all.return_value = [fake_mention]
|
||||
fake_post.author.domain.uri_domain = "example.com"
|
||||
fake_post.emojis.all.return_value = []
|
||||
|
||||
# Basic tag allowance
|
||||
parser = FediverseHtmlParser("<p>Hello!</p><script></script>")
|
||||
assert parser.html == "<p>Hello!</p>"
|
||||
assert parser.plain_text == "Hello!"
|
||||
|
||||
# Newline erasure
|
||||
parser = FediverseHtmlParser("<p>Hi!</p>\n\n<p>How are you?</p>")
|
||||
assert parser.html == "<p>Hi!</p><p>How are you?</p>"
|
||||
assert parser.plain_text == "Hi!\n\nHow are you?"
|
||||
|
||||
# Trying to be evil
|
||||
parser = FediverseHtmlParser("<scri<span></span>pt>")
|
||||
assert "<scr" not in parser.html
|
||||
parser = FediverseHtmlParser("<scri #hashtag pt>")
|
||||
assert "<scr" not in parser.html
|
||||
|
||||
# Entities are escaped
|
||||
parser = FediverseHtmlParser("<p>It&#39;s great</p>", find_hashtags=True)
|
||||
assert parser.html == "<p>It&#x27;s great</p>"
|
||||
assert parser.plain_text == "It's great"
|
||||
assert parser.hashtags == set()
|
||||
|
||||
# Linkify works, but only with protocol prefixes
|
||||
parser = FediverseHtmlParser("<p>test.com</p>")
|
||||
assert parser.html == "<p>test.com</p>"
|
||||
assert parser.plain_text == "test.com"
|
||||
parser = FediverseHtmlParser("<p>https://test.com</p>")
|
||||
assert (
|
||||
renderer.render_post(
|
||||
'Hello @andrew, I want to link to this <span>#</span>hashtag: <a href="http://example.com/@andrew/#notahashtag">here</a> and rewrite <a href="https://example.com/tags/thishashtag/">#thishashtag</a>',
|
||||
fake_post,
|
||||
)
|
||||
== 'Hello <a href="/@andrew@aeracode.org/">@andrew</a>, I want to link to this <a href="/tags/hashtag/" class="hashtag">#hashtag</a>: <a href="http://example.com/@andrew/#notahashtag" rel="nofollow">here</a> and rewrite <a href="/tags/thishashtag/" class="hashtag">#thishashtag</a>'
|
||||
parser.html == '<p><a href="https://test.com" rel="nofollow">test.com</a></p>'
|
||||
)
|
||||
assert parser.plain_text == "https://test.com"
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_list_rendering():
|
||||
"""
|
||||
We want to:
|
||||
- Preserve incoming links from other servers
|
||||
- Linkify mentions and hashtags
|
||||
- Not have these all step on each other!
|
||||
"""
|
||||
renderer = ContentRenderer(local=True)
|
||||
fake_mention = Mock()
|
||||
fake_mention.username = "andrew"
|
||||
fake_mention.domain_id = "aeracode.org"
|
||||
fake_mention.urls.view = "/@andrew@aeracode.org/"
|
||||
fake_post = Mock()
|
||||
fake_post.mentions.all.return_value = [fake_mention]
|
||||
fake_post.author.domain.uri_domain = "example.com"
|
||||
fake_post.emojis.all.return_value = []
|
||||
|
||||
# Links are preserved
|
||||
parser = FediverseHtmlParser("<a href='https://takahe.social'>takahe social</a>")
|
||||
assert (
|
||||
renderer.render_post(
|
||||
"<p>Ok. The roster so far is:</p><ul><li>Infosec.exchange (mastodon)</li><li>pixel.Infosec.exchange (pixelfed)</li><li>video.Infosec.exchange (peertube)</li><li>relay.Infosec.exchange (activitypub relay)</li><li>risky.af (alt mastodon)</li></ul><p>What’s next? I think I promised some people here bookwyrm</p>",
|
||||
fake_post,
|
||||
)
|
||||
== "<p>Ok. The roster so far is:</p><p>Infosec.exchange (mastodon)<br>pixel.Infosec.exchange (pixelfed)<br>video.Infosec.exchange (peertube)<br>relay.Infosec.exchange (activitypub relay)<br>risky.af (alt mastodon)</p><p>What’s next? I think I promised some people here bookwyrm</p>"
|
||||
parser.html
|
||||
== '<a href="https://takahe.social" rel="nofollow">takahe social</a>'
|
||||
)
|
||||
assert parser.plain_text == "https://takahe.social"
|
||||
|
||||
# Very long links are shortened
|
||||
full_url = "https://social.example.com/a-long/path/that-should-be-shortened"
|
||||
parser = FediverseHtmlParser(f"<p>{full_url}</p>")
|
||||
assert (
|
||||
parser.html
|
||||
== f'<p><a href="{full_url}" rel="nofollow" class="ellipsis" title="{full_url.removeprefix("https://")}">social.example.com/a-long/path</a></p>'
|
||||
)
|
||||
assert (
|
||||
parser.plain_text
|
||||
== "https://social.example.com/a-long/path/that-should-be-shortened"
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.django_db
|
||||
def test_link_mixcase_mentions():
|
||||
renderer = ContentRenderer(local=True)
|
||||
fake_mention = Mock()
|
||||
fake_mention.username = "Manfre"
|
||||
fake_mention.domain_id = "manfre.net"
|
||||
fake_mention.urls.view = "/@Manfre@manfre.net/"
|
||||
fake_mention2 = Mock()
|
||||
fake_mention2.username = "manfre"
|
||||
fake_mention2.domain_id = "takahe.social"
|
||||
fake_mention2.urls.view = "https://takahe.social/@manfre@takahe.social/"
|
||||
|
||||
unfetched_mention = Mock()
|
||||
unfetched_mention.username = None
|
||||
unfetched_mention.domain_id = None
|
||||
unfetched_mention.urls.view = "/None@None/"
|
||||
|
||||
fake_post = Mock()
|
||||
fake_post.mentions.all.return_value = [
|
||||
fake_mention,
|
||||
fake_mention2,
|
||||
unfetched_mention,
|
||||
]
|
||||
fake_post.author.domain.uri_domain = "example.com"
|
||||
fake_post.emojis.all.return_value = []
|
||||
|
||||
assert renderer.render_post(
|
||||
"@Manfre@manfre.net @mAnFrE@takahe.social @manfre@manfre.net @unfetched@manfre.net",
|
||||
fake_post,
|
||||
) == (
|
||||
'<a href="/@Manfre@manfre.net/">@Manfre</a> '
|
||||
'<a href="https://takahe.social/@manfre@takahe.social/">@mAnFrE@takahe.social</a> '
|
||||
'<a href="/@Manfre@manfre.net/">@manfre</a> '
|
||||
"@unfetched@manfre.net"
|
||||
# Make sure things that look like mentions are left alone with no mentions supplied.
|
||||
parser = FediverseHtmlParser(
|
||||
"<p>@test@example.com</p>",
|
||||
find_mentions=True,
|
||||
find_hashtags=True,
|
||||
find_emojis=True,
|
||||
)
|
||||
assert parser.html == "<p>@test@example.com</p>"
|
||||
assert parser.plain_text == "@test@example.com"
|
||||
assert parser.mentions == {"test@example.com"}
|
||||
|
||||
# Make sure mentions work when there is a mention supplied
|
||||
parser = FediverseHtmlParser(
|
||||
"<p>@test@example.com</p>",
|
||||
mentions=[identity],
|
||||
find_hashtags=True,
|
||||
find_emojis=True,
|
||||
)
|
||||
assert parser.html == '<p><a href="/@test@example.com/">@test</a></p>'
|
||||
assert parser.plain_text == "@test@example.com"
|
||||
assert parser.mentions == {"test@example.com"}
|
||||
|
||||
# Ensure mentions are case insensitive
|
||||
parser = FediverseHtmlParser(
|
||||
"<p>@TeSt@ExamPle.com</p>",
|
||||
mentions=[identity],
|
||||
find_hashtags=True,
|
||||
find_emojis=True,
|
||||
)
|
||||
assert parser.html == '<p><a href="/@test@example.com/">@TeSt</a></p>'
|
||||
assert parser.plain_text == "@TeSt@ExamPle.com"
|
||||
assert parser.mentions == {"test@example.com"}
|
||||
|
||||
# Ensure hashtags are linked, even through spans, but not within hrefs
|
||||
parser = FediverseHtmlParser(
|
||||
'<a href="http://example.com#notahashtag">something</a> <span>#</span>hashtag <a href="https://example.com/tags/hashtagtwo/">#hashtagtwo</a>',
|
||||
find_hashtags=True,
|
||||
find_emojis=True,
|
||||
)
|
||||
assert (
|
||||
parser.html
|
||||
== '<a href="http://example.com#notahashtag" rel="nofollow">something</a> <a href="/tags/hashtag/">#hashtag</a> <a href="/tags/hashtagtwo/">#hashtagtwo</a>'
|
||||
)
|
||||
assert parser.plain_text == "http://example.com#notahashtag #hashtag #hashtagtwo"
|
||||
assert parser.hashtags == {"hashtag", "hashtagtwo"}
|
||||
|
||||
# Ensure lists are rendered reasonably
|
||||
parser = FediverseHtmlParser(
|
||||
"<p>List:</p><ul><li>One</li><li>Two</li><li>Three</li></ul><p>End!</p>",
|
||||
find_hashtags=True,
|
||||
find_emojis=True,
|
||||
)
|
||||
assert parser.html == "<p>List:</p><p>One<br>Two<br>Three</p><p>End!</p>"
|
||||
assert parser.plain_text == "List:\n\nOne\nTwo\nThree\n\nEnd!"
|
||||
|
|
|
@ -13,7 +13,7 @@ from django.utils.functional import lazy
|
|||
from lxml import etree
|
||||
|
||||
from core.exceptions import ActorMismatchError, capture_message
|
||||
from core.html import ContentRenderer, html_to_plaintext, strip_html
|
||||
from core.html import ContentRenderer, FediverseHtmlParser
|
||||
from core.ld import (
|
||||
canonicalise,
|
||||
format_ld_date,
|
||||
|
@ -530,8 +530,8 @@ class Identity(StatorModel):
|
|||
response["attachment"] = [
|
||||
{
|
||||
"type": "http://schema.org#PropertyValue",
|
||||
"name": strip_html(item["name"], linkify=False),
|
||||
"value": strip_html(item["value"]),
|
||||
"name": FediverseHtmlParser(item["name"]).plain_text,
|
||||
"value": FediverseHtmlParser(item["value"]).html,
|
||||
}
|
||||
for item in self.metadata
|
||||
]
|
||||
|
@ -781,7 +781,9 @@ class Identity(StatorModel):
|
|||
self.metadata.append(
|
||||
{
|
||||
"name": attachment.get("name"),
|
||||
"value": strip_html(attachment.get("http://schema.org#value")),
|
||||
"value": FediverseHtmlParser(
|
||||
attachment.get("http://schema.org#value")
|
||||
).html,
|
||||
}
|
||||
)
|
||||
# Now go do webfinger with that info to see if we can get a canonical domain
|
||||
|
@ -903,12 +905,14 @@ class Identity(StatorModel):
|
|||
Post.Visibilities.mentioned: "direct",
|
||||
}
|
||||
result["source"] = {
|
||||
"note": html_to_plaintext(self.summary) if self.summary else "",
|
||||
"note": FediverseHtmlParser(self.summary).plain_text
|
||||
if self.summary
|
||||
else "",
|
||||
"fields": (
|
||||
[
|
||||
{
|
||||
"name": m["name"],
|
||||
"value": strip_html(m["value"], linkify=False),
|
||||
"value": FediverseHtmlParser(m["value"]).plain_text,
|
||||
"verified_at": None,
|
||||
}
|
||||
for m in self.metadata
|
||||
|
|
|
@ -3,7 +3,7 @@ from django.template.defaultfilters import linebreaks_filter
|
|||
|
||||
from activities.models import FanOut
|
||||
from core.files import resize_image
|
||||
from core.html import strip_html
|
||||
from core.html import FediverseHtmlParser
|
||||
from users.models import (
|
||||
Block,
|
||||
BlockStates,
|
||||
|
@ -211,7 +211,7 @@ class IdentityService:
|
|||
Safely sets a summary and turns linebreaks into HTML
|
||||
"""
|
||||
if summary:
|
||||
self.identity.summary = linebreaks_filter(strip_html(summary))
|
||||
self.identity.summary = FediverseHtmlParser(linebreaks_filter(summary)).html
|
||||
else:
|
||||
self.identity.summary = None
|
||||
self.identity.save()
|
||||
|
|
|
@ -4,7 +4,7 @@ from django.shortcuts import redirect
|
|||
from django.utils.decorators import method_decorator
|
||||
from django.views.generic import FormView
|
||||
|
||||
from core.html import html_to_plaintext
|
||||
from core.html import FediverseHtmlParser
|
||||
from core.models.config import Config
|
||||
from users.decorators import identity_required
|
||||
from users.models import IdentityStates
|
||||
|
@ -65,7 +65,11 @@ class ProfilePage(FormView):
|
|||
identity = self.request.identity
|
||||
return {
|
||||
"name": identity.name,
|
||||
"summary": html_to_plaintext(identity.summary) if identity.summary else "",
|
||||
"summary": (
|
||||
FediverseHtmlParser(identity.summary).plain_text
|
||||
if identity.summary
|
||||
else ""
|
||||
),
|
||||
"icon": identity.icon and identity.icon.url,
|
||||
"image": identity.image and identity.image.url,
|
||||
"discoverable": identity.discoverable,
|
||||
|
|
Loading…
Reference in a new issue