Mirror of https://github.com/jointakahe/takahe.git, synced 2024-11-25 08:41:00 +00:00
Move to a new HTML parser/stripper
This removes the use of the EOL'd Bleach, and also integrates hashtag, mention and emoji searching into one single place.
commit a6922cb9d6, parent 93c0af992b
14 changed files with 503 additions and 562 deletions
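For orientation before the diffs: the replacement API is a single class, core.html.FediverseHtmlParser, defined in full in the core/html.py hunk below. A minimal usage sketch, with illustrative content and the outputs that the parser's rules imply:

    from core.html import FediverseHtmlParser

    # One pass sanitizes the HTML and collects hashtags/mentions/emoji,
    # replacing the separate Bleach-based sanitize/strip/linkify helpers.
    parser = FediverseHtmlParser(
        "<p>Hi @andrew, see #takahe</p>",
        find_mentions=True,
        find_hashtags=True,
    )
    parser.html        # sanitized, linkified HTML
    parser.plain_text  # the same content rendered back to plain text
    parser.mentions    # {"andrew"}
    parser.hashtags    # {"takahe"}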
@@ -48,13 +48,7 @@ repos:
       - id: mypy
         exclude: "^tests/"
         additional_dependencies:
-          [
-            types-pyopenssl,
-            types-bleach,
-            types-mock,
-            types-cachetools,
-            types-python-dateutil,
-          ]
+          [types-pyopenssl, types-mock, types-cachetools, types-python-dateutil]
 
   - repo: https://github.com/rtts/djhtml
     rev: v1.5.2
@@ -1,4 +1,3 @@
-from asgiref.sync import async_to_sync
 from django.contrib import admin
 from django.db import models
 from django.utils.safestring import mark_safe
@@ -165,7 +164,6 @@ class PostAdmin(admin.ModelAdmin):
     list_filter = ("type", "local", "visibility", "state", "created")
     raw_id_fields = ["emojis"]
     autocomplete_fields = ["to", "mentions", "author"]
-    actions = ["reparse_hashtags"]
     search_fields = ["content", "search_handle", "search_service_handle"]
     inlines = [PostAttachmentInline]
     readonly_fields = ["created", "updated", "state_changed", "object_json"]
@@ -183,13 +181,6 @@ class PostAdmin(admin.ModelAdmin):
         )
         return super().get_search_results(request, queryset, search_term)
 
-    @admin.action(description="Reprocess content for hashtags")
-    def reparse_hashtags(self, request, queryset):
-        for instance in queryset:
-            instance.hashtags = Hashtag.hashtags_from_content(instance.content) or None
-            instance.save()
-            async_to_sync(instance.ensure_hashtags)()
-
     @admin.display(description="ActivityPub JSON")
     def object_json(self, instance):
         return instance.to_ap()
@@ -1,5 +1,4 @@
 import mimetypes
-import re
 from functools import partial
 from typing import ClassVar
 
@@ -14,7 +13,7 @@ from django.db import models
 from django.utils.safestring import mark_safe
 
 from core.files import get_remote_file
-from core.html import strip_html
+from core.html import FediverseHtmlParser
 from core.ld import format_ld_date
 from core.models import Config
 from core.uploads import upload_emoji_namer
@@ -134,8 +133,6 @@ class Emoji(StatorModel):
     admin_disable = "{admin}{self.pk}/disable/"
     admin_copy = "{admin}{self.pk}/copy/"
 
-    emoji_regex = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")
-
     def delete(self, using=None, keep_parents=False):
         if self.file:
             self.file.delete()
@@ -242,7 +239,9 @@ class Emoji(StatorModel):
         Return a parsed and sanitized of emoji found in content without
         the surrounding ':'.
         """
-        emoji_hits = cls.emoji_regex.findall(strip_html(content))
+        emoji_hits = FediverseHtmlParser(
+            content, find_emojis=True, emoji_domain=domain
+        ).emojis
         emojis = sorted({emoji.lower() for emoji in emoji_hits})
         return list(
             cls.objects.filter(local=(domain is None) or domain.local)
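One behavioural consequence worth noting, as far as the new code reads: the parser's create_emoji() only records shortcodes that resolve to a usable Emoji, so emojis_from_content() now scans via the parser rather than running emoji_regex over stripped HTML. A hypothetical call:

    # Hypothetical content; the shortcode scan now happens inside the parser.
    Emoji.emojis_from_content("<p>Hello :party:!</p>", None)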
@@ -6,7 +6,6 @@ from asgiref.sync import sync_to_async
 from django.db import models
 from django.utils import timezone
 
-from core.html import strip_html
 from core.models import Config
 from stator.models import State, StateField, StateGraph, StatorModel
 
|
@ -167,16 +166,6 @@ class Hashtag(StatorModel):
|
||||||
results[date(year, month, day)] = val
|
results[date(year, month, day)] = val
|
||||||
return dict(sorted(results.items(), reverse=True)[:num])
|
return dict(sorted(results.items(), reverse=True)[:num])
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def hashtags_from_content(cls, content) -> list[str]:
|
|
||||||
"""
|
|
||||||
Return a parsed and sanitized of hashtags found in content without
|
|
||||||
leading '#'.
|
|
||||||
"""
|
|
||||||
hashtag_hits = cls.hashtag_regex.findall(strip_html(content))
|
|
||||||
hashtags = sorted({tag.lower() for tag in hashtag_hits})
|
|
||||||
return list(hashtags)
|
|
||||||
|
|
||||||
def to_mastodon_json(self):
|
def to_mastodon_json(self):
|
||||||
return {
|
return {
|
||||||
"name": self.hashtag,
|
"name": self.hashtag,
|
||||||
|
|
|
@ -2,7 +2,6 @@ import datetime
|
||||||
import hashlib
|
import hashlib
|
||||||
import json
|
import json
|
||||||
import mimetypes
|
import mimetypes
|
||||||
import re
|
|
||||||
import ssl
|
import ssl
|
||||||
from collections.abc import Iterable
|
from collections.abc import Iterable
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
|
@ -26,7 +25,7 @@ from activities.models.post_types import (
|
||||||
PostTypeDataEncoder,
|
PostTypeDataEncoder,
|
||||||
)
|
)
|
||||||
from core.exceptions import capture_message
|
from core.exceptions import capture_message
|
||||||
from core.html import ContentRenderer, strip_html
|
from core.html import ContentRenderer, FediverseHtmlParser
|
||||||
from core.ld import (
|
from core.ld import (
|
||||||
canonicalise,
|
canonicalise,
|
||||||
format_ld_date,
|
format_ld_date,
|
||||||
|
@ -374,10 +373,6 @@ class Post(StatorModel):
|
||||||
def clean_type_data(self, value):
|
def clean_type_data(self, value):
|
||||||
PostTypeData.parse_obj(value)
|
PostTypeData.parse_obj(value)
|
||||||
|
|
||||||
mention_regex = re.compile(
|
|
||||||
r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
|
|
||||||
)
|
|
||||||
|
|
||||||
def _safe_content_note(self, *, local: bool = True):
|
def _safe_content_note(self, *, local: bool = True):
|
||||||
return ContentRenderer(local=local).render_post(self.content, self)
|
return ContentRenderer(local=local).render_post(self.content, self)
|
||||||
|
|
||||||
|
@@ -474,12 +469,12 @@ class Post(StatorModel):
             # Maintain local-only for replies
             if reply_to.visibility == reply_to.Visibilities.local_only:
                 visibility = reply_to.Visibilities.local_only
-        # Find hashtags in this post
-        hashtags = Hashtag.hashtags_from_content(content) or None
         # Find emoji in this post
         emojis = Emoji.emojis_from_content(content, None)
-        # Strip all HTML and apply linebreaks filter
-        content = linebreaks_filter(strip_html(content))
+        # Strip all unwanted HTML and apply linebreaks filter, grabbing hashtags on the way
+        parser = FediverseHtmlParser(linebreaks_filter(content), find_hashtags=True)
+        content = parser.html
+        hashtags = sorted(parser.hashtags) or None
         # Make the Post object
         post = cls.objects.create(
             author=author,
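The ordering here is worth noting: linebreaks_filter() now runs on the raw content first, and the parser then sanitizes its output and collects hashtags in the same pass. A small sketch of the resulting flow, with the output the parser rules above imply:

    from django.template.defaultfilters import linebreaks_filter
    from core.html import FediverseHtmlParser

    parser = FediverseHtmlParser(linebreaks_filter("Hello #world"), find_hashtags=True)
    parser.html      # '<p>Hello <a href="/tags/world/">#world</a></p>'
    parser.hashtags  # {'world'}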
@@ -512,12 +507,13 @@ class Post(StatorModel):
     ):
         with transaction.atomic():
             # Strip all HTML and apply linebreaks filter
-            self.content = linebreaks_filter(strip_html(content))
+            parser = FediverseHtmlParser(linebreaks_filter(content))
+            self.content = parser.html
+            self.hashtags = sorted(parser.hashtags) or None
             self.summary = summary or None
             self.sensitive = bool(summary)
             self.visibility = visibility
             self.edited = timezone.now()
-            self.hashtags = Hashtag.hashtags_from_content(content) or None
             self.mentions.set(self.mentions_from_content(content, self.author))
             self.emojis.set(Emoji.emojis_from_content(content, None))
             self.attachments.set(attachments or [])
@@ -525,9 +521,9 @@ class Post(StatorModel):
 
     @classmethod
     def mentions_from_content(cls, content, author) -> set[Identity]:
-        mention_hits = cls.mention_regex.findall(content)
+        mention_hits = FediverseHtmlParser(content, find_mentions=True).mentions
         mentions = set()
-        for precursor, handle in mention_hits:
+        for handle in mention_hits:
             handle = handle.lower()
             if "@" in handle:
                 username, domain = handle.split("@", 1)
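The old regex findall returned (precursor, handle) tuples; the parser instead reports a set of bare handles, lowercased and without the leading "@". For example:

    from core.html import FediverseHtmlParser

    FediverseHtmlParser("Hi @test@example.com", find_mentions=True).mentions
    # {'test@example.com'}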
@ -14,7 +14,7 @@ from activities.models import (
|
||||||
TimelineEvent,
|
TimelineEvent,
|
||||||
)
|
)
|
||||||
from core.files import blurhash_image, resize_image
|
from core.files import blurhash_image, resize_image
|
||||||
from core.html import html_to_plaintext
|
from core.html import FediverseHtmlParser
|
||||||
from core.models import Config
|
from core.models import Config
|
||||||
from users.decorators import identity_required
|
from users.decorators import identity_required
|
||||||
|
|
||||||
|
@@ -112,7 +112,7 @@ class Compose(FormView):
             {
                 "reply_to": self.reply_to.pk if self.reply_to else "",
                 "visibility": self.post_obj.visibility,
-                "text": html_to_plaintext(self.post_obj.content),
+                "text": FediverseHtmlParser(self.post_obj.content).plain_text,
                 "content_warning": self.post_obj.summary,
             }
         )
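plain_text replaces html_to_plaintext() for round-tripping stored HTML back into editable text; per the parser rules below, a closing </p> becomes a double newline and <br> a single one:

    from core.html import FediverseHtmlParser

    FediverseHtmlParser("<p>Hi!</p><p>How are<br>you?</p>").plain_text
    # 'Hi!\n\nHow are\nyou?'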
core/html.py (642 changed lines)
@ -1,199 +1,309 @@
|
||||||
|
import html
|
||||||
import re
|
import re
|
||||||
from functools import partial
|
from html.parser import HTMLParser
|
||||||
|
|
||||||
import bleach
|
|
||||||
import bleach.callbacks
|
|
||||||
from bleach.html5lib_shim import Filter
|
|
||||||
from bleach.linkifier import LinkifyFilter
|
|
||||||
from django.utils.safestring import mark_safe
|
from django.utils.safestring import mark_safe
|
||||||
|
|
||||||
url_regex = re.compile(
|
|
||||||
r"""\(* # Match any opening parentheses.
|
class FediverseHtmlParser(HTMLParser):
|
||||||
\b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?) # http://
|
"""
|
||||||
([\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?
|
A custom HTML parser that only allows a certain tag subset and behaviour:
|
||||||
(?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?
|
- br, p tags are passed through
|
||||||
|
- a tags are passed through if they're not hashtags or mentions
|
||||||
|
- Another set of tags are converted to p
|
||||||
|
|
||||||
|
It also linkifies URLs, mentions, hashtags, and imagifies emoji.
|
||||||
|
"""
|
||||||
|
|
||||||
|
REWRITE_TO_P = [
|
||||||
|
"p",
|
||||||
|
"h1",
|
||||||
|
"h2",
|
||||||
|
"h3",
|
||||||
|
"h4",
|
||||||
|
"h5",
|
||||||
|
"h6",
|
||||||
|
"blockquote",
|
||||||
|
"pre",
|
||||||
|
"ul",
|
||||||
|
"ol",
|
||||||
|
]
|
||||||
|
|
||||||
|
REWRITE_TO_BR = [
|
||||||
|
"br",
|
||||||
|
"li",
|
||||||
|
]
|
||||||
|
|
||||||
|
MENTION_REGEX = re.compile(
|
||||||
|
r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
|
||||||
|
)
|
||||||
|
|
||||||
|
HASHTAG_REGEX = re.compile(r"\B#([a-zA-Z0-9(_)]+\b)(?!;)")
|
||||||
|
|
||||||
|
EMOJI_REGEX = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")
|
||||||
|
|
||||||
|
URL_REGEX = re.compile(
|
||||||
|
r"""(\(* # Match any opening parentheses.
|
||||||
|
\b(?<![@.])(?:https?://(?:(?:\w+:)?\w+@)?) # http://
|
||||||
|
(?:[\w-]+\.)+(?:[\w-]+)(?:\:[0-9]+)?(?!\.\w)\b # xx.yy.tld(:##)?
|
||||||
|
(?:[/?][^\s\{{\}}\|\\\^\[\]`<>"]*)?)
|
||||||
# /path/zz (excluding "unsafe" chars from RFC 1738,
|
# /path/zz (excluding "unsafe" chars from RFC 1738,
|
||||||
# except for # and ~, which happen in practice)
|
# except for # and ~, which happen in practice)
|
||||||
""",
|
""",
|
||||||
re.IGNORECASE | re.VERBOSE | re.UNICODE,
|
re.IGNORECASE | re.VERBOSE | re.UNICODE,
|
||||||
)
|
|
||||||
|
|
||||||
ALLOWED_TAGS = ["br", "p", "a"]
|
|
||||||
REWRITTEN_TAGS = [
|
|
||||||
"h1",
|
|
||||||
"h2",
|
|
||||||
"h3",
|
|
||||||
"h4",
|
|
||||||
"h5",
|
|
||||||
"h6",
|
|
||||||
"blockquote",
|
|
||||||
"pre",
|
|
||||||
"ul",
|
|
||||||
"ol",
|
|
||||||
"li",
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
class MastodonStrictTagFilter(Filter):
|
|
||||||
"""
|
|
||||||
Implements Python equivalent of Mastodon tag rewriter
|
|
||||||
|
|
||||||
Clone of https://github.com/mastodon/mastodon/blob/main/lib/sanitize_ext/sanitize_config.rb#L55
|
|
||||||
|
|
||||||
Broadly this replaces all REWRITTEN_TAGS with `p` except for lists where it formats it into `<br>` lists
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
li_pending_break = False
|
|
||||||
break_token = {
|
|
||||||
"name": "br",
|
|
||||||
"data": {},
|
|
||||||
"type": "StartTag",
|
|
||||||
}
|
|
||||||
|
|
||||||
for token in Filter.__iter__(self):
|
|
||||||
if token.get("name") not in REWRITTEN_TAGS or token["type"] not in [
|
|
||||||
"StartTag",
|
|
||||||
"EndTag",
|
|
||||||
]:
|
|
||||||
yield token
|
|
||||||
continue
|
|
||||||
|
|
||||||
if token["type"] == "StartTag":
|
|
||||||
if token["name"] == "li":
|
|
||||||
if li_pending_break:
|
|
||||||
# Another `li` appeared, so break after the last one
|
|
||||||
yield break_token
|
|
||||||
continue
|
|
||||||
token["name"] = "p"
|
|
||||||
elif token["type"] == "EndTag":
|
|
||||||
if token["name"] == "li":
|
|
||||||
# Track that an `li` closed so we know a break should be considered
|
|
||||||
li_pending_break = True
|
|
||||||
continue
|
|
||||||
if token["name"] == "ul":
|
|
||||||
# If the last `li` happened, then don't add a break because Mastodon doesn't
|
|
||||||
li_pending_break = False
|
|
||||||
token["name"] = "p"
|
|
||||||
|
|
||||||
yield token
|
|
||||||
|
|
||||||
|
|
||||||
class UnlinkifyFilter(Filter):
|
|
||||||
"""
|
|
||||||
Forcibly replaces link text with the href.
|
|
||||||
|
|
||||||
This is intented to be used when stripping <a> tags to preserve the link
|
|
||||||
location at the expense of the link text.
|
|
||||||
"""
|
|
||||||
|
|
||||||
def __iter__(self):
|
|
||||||
discarding_a_text = False
|
|
||||||
for token in Filter.__iter__(self):
|
|
||||||
if token.get("name") == "a":
|
|
||||||
if token["type"] == "EndTag":
|
|
||||||
discarding_a_text = False
|
|
||||||
continue
|
|
||||||
href = token["data"].get((None, "href"))
|
|
||||||
|
|
||||||
# If <a> has an href, we use it and throw away all content
|
|
||||||
# within the <a>...</a>. If href missing or empty, try to find
|
|
||||||
# text within the <a>...</a>
|
|
||||||
if href:
|
|
||||||
yield {"data": href, "type": "Characters"}
|
|
||||||
discarding_a_text = True
|
|
||||||
continue
|
|
||||||
elif not discarding_a_text:
|
|
||||||
yield token
|
|
||||||
# else: throw away tokens until we're out of the <a>
|
|
||||||
|
|
||||||
|
|
||||||
def allow_a(tag: str, name: str, value: str):
|
|
||||||
if name in ["href", "title", "class"]:
|
|
||||||
return True
|
|
||||||
elif name == "rel":
|
|
||||||
# Only allow rel attributes with a small subset of values
|
|
||||||
# (we're defending against, for example, rel=me)
|
|
||||||
rel_values = value.split()
|
|
||||||
if all(v in ["nofollow", "noopener", "noreferrer", "tag"] for v in rel_values):
|
|
||||||
return True
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def shorten_link_text(attrs, new=False):
|
|
||||||
"""
|
|
||||||
Applies Mastodon's link shortening behavior where URL text links are
|
|
||||||
shortened by removing the scheme and only showing the first 30 chars.
|
|
||||||
|
|
||||||
Orig:
|
|
||||||
<a>https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened</a>
|
|
||||||
|
|
||||||
Becomes:
|
|
||||||
<a>social.example.com/a-long/path</a>
|
|
||||||
|
|
||||||
"""
|
|
||||||
text = attrs.get("_text")
|
|
||||||
if not text:
|
|
||||||
text = attrs.get((None, "href"))
|
|
||||||
if text and "://" in text and len(text) > 30:
|
|
||||||
text = text.split("://", 1)[-1]
|
|
||||||
attrs["_text"] = text[:30]
|
|
||||||
if len(text) > 30:
|
|
||||||
attrs[(None, "class")] = " ".join(
|
|
||||||
filter(None, [attrs.pop((None, "class"), ""), "ellipsis"])
|
|
||||||
)
|
|
||||||
# Add the full URL in to title for easier user inspection
|
|
||||||
attrs[(None, "title")] = attrs.get((None, "href"))
|
|
||||||
|
|
||||||
return attrs
|
|
||||||
|
|
||||||
|
|
||||||
linkify_callbacks = [bleach.callbacks.nofollow, shorten_link_text]
|
|
||||||
|
|
||||||
|
|
||||||
def sanitize_html(post_html: str) -> str:
|
|
||||||
"""
|
|
||||||
Only allows a, br, p and span tags, and class attributes.
|
|
||||||
"""
|
|
||||||
cleaner = bleach.Cleaner(
|
|
||||||
tags=ALLOWED_TAGS + REWRITTEN_TAGS,
|
|
||||||
attributes={ # type:ignore
|
|
||||||
"a": allow_a,
|
|
||||||
"p": ["class"],
|
|
||||||
},
|
|
||||||
filters=[
|
|
||||||
partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks),
|
|
||||||
MastodonStrictTagFilter,
|
|
||||||
],
|
|
||||||
strip=True,
|
|
||||||
)
|
)
|
||||||
return mark_safe(cleaner.clean(post_html))
|
|
||||||
|
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
html: str,
|
||||||
|
uri_domain: str | None = None,
|
||||||
|
mentions: list | None = None,
|
||||||
|
find_mentions: bool = False,
|
||||||
|
find_hashtags: bool = False,
|
||||||
|
find_emojis: bool = False,
|
||||||
|
emoji_domain=None,
|
||||||
|
):
|
||||||
|
super().__init__()
|
||||||
|
self.uri_domain = uri_domain
|
||||||
|
self.emoji_domain = emoji_domain
|
||||||
|
self.find_mentions = find_mentions
|
||||||
|
self.find_hashtags = find_hashtags
|
||||||
|
self.find_emojis = find_emojis
|
||||||
|
self.calculate_mentions(mentions)
|
||||||
|
self._data_buffer = ""
|
||||||
|
self.html_output = ""
|
||||||
|
self.text_output = ""
|
||||||
|
self.emojis: set[str] = set()
|
||||||
|
self.mentions: set[str] = set()
|
||||||
|
self.hashtags: set[str] = set()
|
||||||
|
self._pending_a: dict | None = None
|
||||||
|
self._fresh_p = False
|
||||||
|
self.feed(html.replace("\n", ""))
|
||||||
|
self.flush_data()
|
||||||
|
|
||||||
def strip_html(post_html: str, *, linkify: bool = True) -> str:
|
def calculate_mentions(self, mentions: list | None):
|
||||||
"""
|
"""
|
||||||
Strips all tags from the text, then linkifies it.
|
Prepares a set of content that we expect to see mentions look like
|
||||||
"""
|
(this imp)
|
||||||
cleaner = bleach.Cleaner(
|
"""
|
||||||
tags=[],
|
self.mention_matches: dict[str, str] = {}
|
||||||
strip=True,
|
self.mention_aliases: dict[str, str] = {}
|
||||||
filters=[partial(LinkifyFilter, url_re=url_regex, callbacks=linkify_callbacks)]
|
for mention in mentions or []:
|
||||||
if linkify
|
if self.uri_domain:
|
||||||
else [UnlinkifyFilter],
|
url = mention.absolute_profile_uri()
|
||||||
)
|
else:
|
||||||
return mark_safe(cleaner.clean(post_html))
|
url = str(mention.urls.view)
|
||||||
|
if mention.username:
|
||||||
|
username = mention.username.lower()
|
||||||
|
domain = mention.domain_id.lower()
|
||||||
|
self.mention_matches[f"{username}"] = url
|
||||||
|
self.mention_matches[f"{username}@{domain}"] = url
|
||||||
|
|
||||||
|
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
|
||||||
|
if tag in self.REWRITE_TO_P:
|
||||||
|
self.flush_data()
|
||||||
|
self.html_output += "<p>"
|
||||||
|
elif tag in self.REWRITE_TO_BR:
|
||||||
|
self.flush_data()
|
||||||
|
if not self._fresh_p:
|
||||||
|
self.html_output += "<br>"
|
||||||
|
self.text_output += "\n"
|
||||||
|
elif tag == "a":
|
||||||
|
self.flush_data()
|
||||||
|
self._pending_a = {"attrs": dict(attrs), "content": ""}
|
||||||
|
self._fresh_p = tag in self.REWRITE_TO_P
|
||||||
|
|
||||||
def html_to_plaintext(post_html: str) -> str:
|
def handle_endtag(self, tag: str) -> None:
|
||||||
"""
|
self._fresh_p = False
|
||||||
Tries to do the inverse of the linebreaks filter.
|
if tag in self.REWRITE_TO_P:
|
||||||
"""
|
self.flush_data()
|
||||||
# TODO: Handle HTML entities
|
self.html_output += "</p>"
|
||||||
# Remove all newlines, then replace br with a newline and /p with two (one comes from bleach)
|
self.text_output += "\n\n"
|
||||||
post_html = post_html.replace("\n", "").replace("<br>", "\n").replace("</p>", "\n")
|
elif tag == "a":
|
||||||
# Remove all other HTML and return
|
if self._pending_a:
|
||||||
cleaner = bleach.Cleaner(tags=["a"], strip=True, filters=[UnlinkifyFilter])
|
href = self._pending_a["attrs"].get("href")
|
||||||
return cleaner.clean(post_html).strip()
|
content = self._pending_a["content"].strip()
|
||||||
|
# Is it a mention?
|
||||||
|
if content.lower().lstrip("@") in self.mention_matches:
|
||||||
|
self.html_output += self.create_mention(content)
|
||||||
|
self.text_output += content
|
||||||
|
# Is it a hashtag?
|
||||||
|
elif self.HASHTAG_REGEX.match(content):
|
||||||
|
self.html_output += self.create_hashtag(content)
|
||||||
|
self.text_output += content
|
||||||
|
elif content:
|
||||||
|
# Shorten the link if we need to
|
||||||
|
self.html_output += self.create_link(href, content)
|
||||||
|
self.text_output += href
|
||||||
|
self._pending_a = None
|
||||||
|
|
||||||
|
def handle_data(self, data: str) -> None:
|
||||||
|
self._fresh_p = False
|
||||||
|
if self._pending_a:
|
||||||
|
self._pending_a["content"] += data
|
||||||
|
else:
|
||||||
|
self._data_buffer += data
|
||||||
|
|
||||||
|
def flush_data(self) -> None:
|
||||||
|
"""
|
||||||
|
We collect data segments until we encounter a tag we care about,
|
||||||
|
so we can treat <span>#</span>hashtag as #hashtag
|
||||||
|
"""
|
||||||
|
self.text_output += self._data_buffer
|
||||||
|
self.html_output += self.linkify(self._data_buffer)
|
||||||
|
self._data_buffer = ""
|
||||||
|
|
||||||
|
def create_link(self, href, content):
|
||||||
|
"""
|
||||||
|
Generates a link, doing optional shortening.
|
||||||
|
|
||||||
|
All return values from this function should be HTML-safe.
|
||||||
|
"""
|
||||||
|
looks_like_link = bool(self.URL_REGEX.match(content))
|
||||||
|
if looks_like_link:
|
||||||
|
content = content.split("://", 1)[1]
|
||||||
|
if looks_like_link and len(content) > 30:
|
||||||
|
return f'<a href="{html.escape(href)}" rel="nofollow" class="ellipsis" title="{html.escape(content)}">{html.escape(content[:30])}</a>'
|
||||||
|
else:
|
||||||
|
return f'<a href="{html.escape(href)}" rel="nofollow">{html.escape(content)}</a>'
|
||||||
|
|
||||||
|
def create_mention(self, handle) -> str:
|
||||||
|
"""
|
||||||
|
Generates a mention link. Handle should have a leading @.
|
||||||
|
|
||||||
|
All return values from this function should be HTML-safe
|
||||||
|
"""
|
||||||
|
handle = handle.lstrip("@")
|
||||||
|
if "@" in handle:
|
||||||
|
short_handle = handle.split("@", 1)[0]
|
||||||
|
else:
|
||||||
|
short_handle = handle
|
||||||
|
handle_hash = handle.lower()
|
||||||
|
short_hash = short_handle.lower()
|
||||||
|
self.mentions.add(handle_hash)
|
||||||
|
url = self.mention_matches.get(handle_hash)
|
||||||
|
if url:
|
||||||
|
if short_hash not in self.mention_aliases:
|
||||||
|
self.mention_aliases[short_hash] = handle_hash
|
||||||
|
elif self.mention_aliases.get(short_hash) != handle_hash:
|
||||||
|
short_handle = handle
|
||||||
|
return f'<a href="{html.escape(url)}">@{html.escape(short_handle)}</a>'
|
||||||
|
else:
|
||||||
|
return "@" + html.escape(handle)
|
||||||
|
|
||||||
|
def create_hashtag(self, hashtag) -> str:
|
||||||
|
"""
|
||||||
|
Generates a hashtag link. Hashtag does not need to start with #
|
||||||
|
|
||||||
|
All return values from this function should be HTML-safe
|
||||||
|
"""
|
||||||
|
hashtag = hashtag.lstrip("#")
|
||||||
|
self.hashtags.add(hashtag.lower())
|
||||||
|
if self.uri_domain:
|
||||||
|
return f'<a href="https://{self.uri_domain}/tags/{hashtag.lower()}/">#{hashtag}</a>'
|
||||||
|
else:
|
||||||
|
return f'<a href="/tags/{hashtag.lower()}/">#{hashtag}</a>'
|
||||||
|
|
||||||
|
def create_emoji(self, shortcode) -> str:
|
||||||
|
"""
|
||||||
|
Generates an emoji <img> tag
|
||||||
|
|
||||||
|
All return values from this function should be HTML-safe
|
||||||
|
"""
|
||||||
|
from activities.models import Emoji
|
||||||
|
|
||||||
|
emoji = Emoji.get_by_domain(shortcode, self.emoji_domain)
|
||||||
|
if emoji and emoji.is_usable:
|
||||||
|
self.emojis.add(shortcode)
|
||||||
|
return emoji.as_html()
|
||||||
|
return f":{shortcode}:"
|
||||||
|
|
||||||
|
def linkify(self, data):
|
||||||
|
"""
|
||||||
|
Linkifies some content that is plaintext.
|
||||||
|
|
||||||
|
Handles URLs first, then mentions. Note that this takes great care to
|
||||||
|
keep track of what is HTML and what needs to be escaped.
|
||||||
|
"""
|
||||||
|
# Split the string by the URL regex so we know what to escape and what
|
||||||
|
# not to escape.
|
||||||
|
bits = self.URL_REGEX.split(data)
|
||||||
|
result = ""
|
||||||
|
# Even indices are data we should pass though, odd indices are links
|
||||||
|
for i, bit in enumerate(bits):
|
||||||
|
# A link!
|
||||||
|
if i % 2 == 1:
|
||||||
|
result += self.create_link(bit, bit)
|
||||||
|
# Not a link
|
||||||
|
elif self.mention_matches or self.find_mentions:
|
||||||
|
result += self.linkify_mentions(bit)
|
||||||
|
elif self.find_hashtags:
|
||||||
|
result += self.linkify_hashtags(bit)
|
||||||
|
elif self.find_emojis:
|
||||||
|
result += self.linkify_emoji(bit)
|
||||||
|
else:
|
||||||
|
result += html.escape(bit)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def linkify_mentions(self, data):
|
||||||
|
"""
|
||||||
|
Linkifies mentions
|
||||||
|
"""
|
||||||
|
bits = self.MENTION_REGEX.split(data)
|
||||||
|
result = ""
|
||||||
|
for i, bit in enumerate(bits):
|
||||||
|
# Mention content
|
||||||
|
if i % 3 == 2:
|
||||||
|
result += self.create_mention(bit)
|
||||||
|
# Not part of a mention (0) or mention preamble (1)
|
||||||
|
elif self.find_hashtags:
|
||||||
|
result += self.linkify_hashtags(bit)
|
||||||
|
elif self.find_emojis:
|
||||||
|
result += self.linkify_emoji(bit)
|
||||||
|
else:
|
||||||
|
result += html.escape(bit)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def linkify_hashtags(self, data):
|
||||||
|
"""
|
||||||
|
Linkifies hashtags
|
||||||
|
"""
|
||||||
|
bits = self.HASHTAG_REGEX.split(data)
|
||||||
|
result = ""
|
||||||
|
for i, bit in enumerate(bits):
|
||||||
|
# Not part of a hashtag
|
||||||
|
if i % 2 == 0:
|
||||||
|
if self.find_emojis:
|
||||||
|
result += self.linkify_emoji(bit)
|
||||||
|
else:
|
||||||
|
result += html.escape(bit)
|
||||||
|
# Hashtag content
|
||||||
|
else:
|
||||||
|
result += self.create_hashtag(bit)
|
||||||
|
return result
|
||||||
|
|
||||||
|
def linkify_emoji(self, data):
|
||||||
|
"""
|
||||||
|
Linkifies emoji
|
||||||
|
"""
|
||||||
|
bits = self.EMOJI_REGEX.split(data)
|
||||||
|
result = ""
|
||||||
|
for i, bit in enumerate(bits):
|
||||||
|
# Not part of an emoji
|
||||||
|
if i % 2 == 0:
|
||||||
|
result += html.escape(bit)
|
||||||
|
# Emoji content
|
||||||
|
else:
|
||||||
|
result += self.create_emoji(bit)
|
||||||
|
return result
|
||||||
|
|
||||||
|
@property
|
||||||
|
def html(self):
|
||||||
|
return self.html_output.strip()
|
||||||
|
|
||||||
|
@property
|
||||||
|
def plain_text(self):
|
||||||
|
return self.text_output.strip()
|
||||||
|
|
||||||
|
|
||||||
class ContentRenderer:
|
class ContentRenderer:
|
||||||
|
@@ -212,33 +322,30 @@ class ContentRenderer:
         """
         if not html:
             return ""
-        html = sanitize_html(html)
-        html = self.linkify_mentions(html, post=post)
-        html = self.linkify_hashtags(html, identity=post.author)
-        if self.local:
-            html = self.imageify_emojis(
-                html,
-                identity=post.author,
-                emojis=post.emojis.all(),
-            )
-        html = self.remove_extra_newlines(html)
-        return mark_safe(html)
+        parser = FediverseHtmlParser(
+            html,
+            mentions=post.mentions.all(),
+            uri_domain=(None if self.local else post.author.domain.uri_domain),
+            find_hashtags=True,
+            find_emojis=True,
+            emoji_domain=post.author.domain,
+        )
+        return mark_safe(parser.html)
 
-    def render_identity_summary(self, html: str, identity, strip: bool = False) -> str:
+    def render_identity_summary(self, html: str, identity) -> str:
         """
         Given identity summary HTML, normalises it and renders it for presentation.
         """
         if not html:
             return ""
-        if strip:
-            html = strip_html(html)
-        else:
-            html = sanitize_html(html)
-        html = self.linkify_hashtags(html, identity=identity)
-        if self.local:
-            html = self.imageify_emojis(html, identity=identity)
-        html = self.remove_extra_newlines(html)
-        return mark_safe(html)
+        parser = FediverseHtmlParser(
+            html,
+            uri_domain=(None if self.local else identity.domain.uri_domain),
+            find_hashtags=True,
+            find_emojis=True,
+            emoji_domain=identity.domain,
+        )
+        return mark_safe(parser.html)
 
     def render_identity_data(self, html: str, identity, strip: bool = False) -> str:
         """
@@ -246,117 +353,14 @@ class ContentRenderer:
         """
         if not html:
             return ""
-        if strip:
-            html = strip_html(html)
-        else:
-            html = sanitize_html(html)
-        if self.local:
-            html = self.imageify_emojis(html, identity=identity)
-        html = self.remove_extra_newlines(html)
-        return mark_safe(html)
-
-    def linkify_mentions(self, html: str, post) -> str:
-        """
-        Links mentions _in the context of the post_ - as in, using the mentions
-        property as the only source (as we might be doing this without other
-        DB access allowed)
-        """
-        from activities.models import Post
-
-        possible_matches = {}
-        for mention in post.mentions.all():
-            if self.local:
-                url = str(mention.urls.view)
-            else:
-                url = mention.absolute_profile_uri()
-            # Might not have fetched it (yet)
-            if mention.username:
-                username = mention.username.lower()
-                possible_matches[username] = url
-                possible_matches[f"{username}@{mention.domain_id}"] = url
-
-        collapse_name: dict[str, str] = {}
-
-        def replacer(match):
-            precursor = match.group(1)
-            handle = match.group(2)
-            if "@" in handle:
-                short_handle = handle.split("@", 1)[0]
-            else:
-                short_handle = handle
-            handle_hash = handle.lower()
-            short_hash = short_handle.lower()
-            if handle_hash in possible_matches:
-                if short_hash not in collapse_name:
-                    collapse_name[short_hash] = handle_hash
-                elif collapse_name.get(short_hash) != handle_hash:
-                    short_handle = handle
-                return f'{precursor}<a href="{possible_matches[handle_hash]}">@{short_handle}</a>'
-            else:
-                return match.group()
-
-        return Post.mention_regex.sub(replacer, html)
-
-    def linkify_hashtags(self, html, identity) -> str:
-        from activities.models import Hashtag
-
-        def replacer(attrs, new=False):
-            # See if the text in this link looks like a hashtag
-            if not Hashtag.hashtag_regex.match(attrs.get("_text", "")):
-                return attrs
-            hashtag = attrs["_text"].strip().lstrip("#")
-            attrs[None, "class"] = "hashtag"
-            if (None, "rel") in attrs:
-                del attrs[None, "rel"]
-            if self.local:
-                attrs[None, "href"] = f"/tags/{hashtag.lower()}/"
-            else:
-                attrs[
-                    None, "href"
-                ] = f"https://{identity.domain.uri_domain}/tags/{hashtag.lower()}/"
-            return attrs
-
-        linker = bleach.linkifier.Linker(
-            url_re=Hashtag.hashtag_regex, callbacks=[replacer]
-        )
-        return linker.linkify(html)
-
-    def imageify_emojis(
-        self, html: str, identity, include_local: bool = True, emojis=None
-    ):
-        """
-        Find :emoji: in content and convert to <img>. If include_local is True,
-        the local emoji will be used as a fallback for any shortcodes not defined
-        by emojis.
-        """
-        from activities.models import Emoji
-
-        # If precached emojis were passed, prep them
-        cached_emojis = {}
-        if emojis:
-            for emoji in emojis:
-                cached_emojis[emoji.shortcode] = emoji
-
-        def replacer(match):
-            shortcode = match.group(1).lower()
-            if shortcode in cached_emojis:
-                return cached_emojis[shortcode].as_html()
-
-            emoji = Emoji.get_by_domain(shortcode, identity.domain)
-            if emoji and emoji.is_usable:
-                return emoji.as_html()
-            elif not emoji and include_local:
-                emoji = Emoji.get_by_domain(shortcode, None)
-                if emoji:
-                    return emoji.as_html()
-
-            return match.group()
-
-        return Emoji.emoji_regex.sub(replacer, html)
-
-    def remove_extra_newlines(self, html: str) -> str:
-        """
-        Some clients are sensitive to extra newlines even though it's HTML
-        """
-        # TODO: More intelligent way to strip these?
-        return html.replace("\n", "")
+        parser = FediverseHtmlParser(
+            html,
+            uri_domain=(None if self.local else identity.domain.uri_domain),
+            find_hashtags=False,
+            find_emojis=True,
+            emoji_domain=identity.domain,
+        )
+        if strip:
+            return mark_safe(parser.html)
+        else:
+            return mark_safe(parser.html)
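The linkify helpers above all lean on one property of re.split(): when the pattern contains capturing groups, the captured text is interleaved into the result list. URL_REGEX wraps its whole pattern in one group, so even indices are the text between matches and odd indices are the matches; MENTION_REGEX has two groups, so its stride is 3 and the handle sits at i % 3 == 2. A standalone illustration with a simplified, hypothetical regex:

    import re

    bits = re.compile(r"(https?://\S+)").split("see https://example.com today")
    # ['see ', 'https://example.com', ' today']
    # even index: surrounding text (escape it); odd index: captured URL (linkify it)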
@@ -1,4 +1,3 @@
-bleach~=5.0.1
 blurhash-python~=1.1.3
 cachetools~=5.2.0
 cryptography~=39.0
@@ -1,44 +0,0 @@
-from activities.models import Hashtag
-from core.html import ContentRenderer
-
-
-def test_hashtag_from_content():
-    assert Hashtag.hashtags_from_content("#hashtag") == ["hashtag"]
-    assert Hashtag.hashtags_from_content("a#hashtag") == []
-    assert Hashtag.hashtags_from_content("Text #with #hashtag in it") == [
-        "hashtag",
-        "with",
-    ]
-    assert Hashtag.hashtags_from_content("#hashtag.") == ["hashtag"]
-    assert Hashtag.hashtags_from_content("More text\n#one # two ##three #hashtag!") == [
-        "hashtag",
-        "one",
-        "three",
-    ]
-    assert Hashtag.hashtags_from_content("my #html loves &nbsp; entities") == ["html"]
-    assert Hashtag.hashtags_from_content("<span class='hash'>#</span>tag") == ["tag"]
-
-
-def test_linkify_hashtag():
-    linkify = lambda html: ContentRenderer(local=True).linkify_hashtags(html, None)
-
-    assert linkify("# hashtag") == "# hashtag"
-    assert (
-        linkify('<a href="/url/with#anchor">Text</a>')
-        == '<a href="/url/with#anchor">Text</a>'
-    )
-    assert (
-        linkify("#HashTag") == '<a href="/tags/hashtag/" class="hashtag">#HashTag</a>'
-    )
-    assert (
-        linkify(
-            """A longer text #bigContent
-with #tags, linebreaks, and
-maybe a few <a href="https://awesome.sauce/about#spicy">links</a>
-#allTheTags #AllTheTags #ALLTHETAGS"""
-        )
-        == """A longer text <a href="/tags/bigcontent/" class="hashtag">#bigContent</a>
-with <a href="/tags/tags/" class="hashtag">#tags</a>, linebreaks, and
-maybe a few <a href="https://awesome.sauce/about#spicy">links</a>
-<a href="/tags/allthetags/" class="hashtag">#allTheTags</a> <a href="/tags/allthetags/" class="hashtag">#AllTheTags</a> <a href="/tags/allthetags/" class="hashtag">#ALLTHETAGS</a>"""
-    )
@@ -1,5 +1,7 @@
 import pytest
 
+from activities.models import Post
+
 
 @pytest.mark.django_db
 def test_post_status(api_token, identity, client):
@@ -15,3 +17,44 @@ def test_post_status(api_token, identity, client):
     ).json()
     assert response["content"] == "<p>Hello, world!</p>"
     assert response["visibility"] == "unlisted"
+
+
+@pytest.mark.django_db
+def test_mention_format(api_token, identity, remote_identity, client):
+    """
+    Ensures mentions work, and only have one link around them.
+    """
+    # Make a local post and check it
+    response = client.post(
+        "/api/v1/statuses",
+        HTTP_AUTHORIZATION=f"Bearer {api_token.token}",
+        HTTP_ACCEPT="application/json",
+        content_type="application/json",
+        data={
+            "status": "Hello, @test!",
+            "visibility": "unlisted",
+        },
+    ).json()
+    assert (
+        response["content"]
+        == '<p>Hello, <a href="https://example.com/@test/">@test</a>!</p>'
+    )
+    assert response["visibility"] == "unlisted"
+
+    # Make a remote post and check it
+    post = Post.objects.create(
+        local=False,
+        author=remote_identity,
+        content='<p>Hey <a href="https://example.com/@test/" class="u-url mention" rel="nofollow">@test</a></p>',
+        object_uri="https://remote.test/status/12345",
+    )
+    post.mentions.add(identity)
+    response = client.get(
+        f"/api/v1/statuses/{post.id}",
+        HTTP_AUTHORIZATION=f"Bearer {api_token.token}",
+        HTTP_ACCEPT="application/json",
+        content_type="application/json",
+    ).json()
+    assert (
+        response["text"] == '<p>Hey <a href="https://example.com/@test/">@test</a></p>'
+    )
@@ -1,155 +1,117 @@
-from unittest.mock import Mock
-
 import pytest
 
-from core.html import ContentRenderer, html_to_plaintext, sanitize_html
-
-
-def test_html_to_plaintext():
-
-    assert html_to_plaintext("<p>Hi!</p>") == "Hi!"
-    assert html_to_plaintext("<p>Hi!<br>There</p>") == "Hi!\nThere"
-    assert (
-        html_to_plaintext("<p>Hi!</p>\n\n<p>How are you?</p>") == "Hi!\n\nHow are you?"
-    )
-
-    assert (
-        html_to_plaintext("<p>Hi!</p>\n\n<p>How are<br> you?</p><p>today</p>")
-        == "Hi!\n\nHow are\n you?\n\ntoday"
-    )
-
-    assert (
-        html_to_plaintext(
-            '<p><a href="https://fedi.takahe.social/with/a/long/path">'
-            '<b>The</b> <img src="takahe.png"> Link</a> '
-            '<a href="">Empty href</a> '
-            "<a>Empty A</a></p>"
-        )
-        == "https://fedi.takahe.social/with/a/long/path Empty href Empty A"
-    )
-
-
-def test_sanitize_post():
-
-    assert sanitize_html("<p>Hello!</p>") == "<p>Hello!</p>"
-    assert sanitize_html("<p>It&#39;s great</p>") == "<p>It&#39;s great</p>"
-
-    # Note that we only want to linkify things with protocol prefixes to prevent
-    # too many false positives.
-    assert sanitize_html("<p>test.com</p>") == "<p>test.com</p>"
-    assert (
-        sanitize_html("<p>https://test.com</p>")
-        == '<p><a href="https://test.com" rel="nofollow">https://test.com</a></p>'
-    )
-    assert (
-        sanitize_html("<p>@someone@subdomain.some-domain.com</p>")
-        == "<p>@someone@subdomain.some-domain.com</p>"
-    )
-
-
-def test_shorten_url():
-    full_url = (
-        "https://social.example.com/a-long/path/2023/01/16/that-should-be-shortened"
-    )
-    assert (
-        sanitize_html(f"<p>{full_url}</p>")
-        == f'<p><a href="{full_url}" rel="nofollow" class="ellipsis" title="{full_url}">social.example.com/a-long/path</a></p>'
-    )
-
-    assert (
-        sanitize_html(
-            f'<p><a href="{full_url}">This is a long link text, but cannot be shortened as a URL</a></p>'
-        )
-        == f'<p><a href="{full_url}" rel="nofollow">This is a long link text, but cannot be shortened as a URL</a></p>'
-    )
-
-
-@pytest.mark.django_db
-def test_link_preservation():
-    """
-    We want to:
-    - Preserve incoming links from other servers
-    - Linkify mentions and hashtags
-    - Not have these all step on each other!
-    """
-    renderer = ContentRenderer(local=True)
-    fake_mention = Mock()
-    fake_mention.username = "andrew"
-    fake_mention.domain_id = "aeracode.org"
-    fake_mention.urls.view = "/@andrew@aeracode.org/"
-    fake_post = Mock()
-    fake_post.mentions.all.return_value = [fake_mention]
-    fake_post.author.domain.uri_domain = "example.com"
-    fake_post.emojis.all.return_value = []
-
-    assert (
-        renderer.render_post(
-            'Hello @andrew, I want to link to this <span>#</span>hashtag: <a href="http://example.com/@andrew/#notahashtag">here</a> and rewrite <a href="https://example.com/tags/thishashtag/">#thishashtag</a>',
-            fake_post,
-        )
-        == 'Hello <a href="/@andrew@aeracode.org/">@andrew</a>, I want to link to this <a href="/tags/hashtag/" class="hashtag">#hashtag</a>: <a href="http://example.com/@andrew/#notahashtag" rel="nofollow">here</a> and rewrite <a href="/tags/thishashtag/" class="hashtag">#thishashtag</a>'
-    )
-
-
-@pytest.mark.django_db
-def test_list_rendering():
-    """
-    We want to:
-    - Preserve incoming links from other servers
-    - Linkify mentions and hashtags
-    - Not have these all step on each other!
-    """
-    renderer = ContentRenderer(local=True)
-    fake_mention = Mock()
-    fake_mention.username = "andrew"
-    fake_mention.domain_id = "aeracode.org"
-    fake_mention.urls.view = "/@andrew@aeracode.org/"
-    fake_post = Mock()
-    fake_post.mentions.all.return_value = [fake_mention]
-    fake_post.author.domain.uri_domain = "example.com"
-    fake_post.emojis.all.return_value = []
-
-    assert (
-        renderer.render_post(
-            "<p>Ok. The roster so far is:</p><ul><li>Infosec.exchange (mastodon)</li><li>pixel.Infosec.exchange (pixelfed)</li><li>video.Infosec.exchange (peertube)</li><li>relay.Infosec.exchange (activitypub relay)</li><li>risky.af (alt mastodon)</li></ul><p>What’s next? I think I promised some people here bookwyrm</p>",
-            fake_post,
-        )
-        == "<p>Ok. The roster so far is:</p><p>Infosec.exchange (mastodon)<br>pixel.Infosec.exchange (pixelfed)<br>video.Infosec.exchange (peertube)<br>relay.Infosec.exchange (activitypub relay)<br>risky.af (alt mastodon)</p><p>What’s next? I think I promised some people here bookwyrm</p>"
-    )
-
-
-@pytest.mark.django_db
-def test_link_mixcase_mentions():
-    renderer = ContentRenderer(local=True)
-    fake_mention = Mock()
-    fake_mention.username = "Manfre"
-    fake_mention.domain_id = "manfre.net"
-    fake_mention.urls.view = "/@Manfre@manfre.net/"
-    fake_mention2 = Mock()
-    fake_mention2.username = "manfre"
-    fake_mention2.domain_id = "takahe.social"
-    fake_mention2.urls.view = "https://takahe.social/@manfre@takahe.social/"
-
-    unfetched_mention = Mock()
-    unfetched_mention.username = None
-    unfetched_mention.domain_id = None
-    unfetched_mention.urls.view = "/None@None/"
-
-    fake_post = Mock()
-    fake_post.mentions.all.return_value = [
-        fake_mention,
-        fake_mention2,
-        unfetched_mention,
-    ]
-    fake_post.author.domain.uri_domain = "example.com"
-    fake_post.emojis.all.return_value = []
-
-    assert renderer.render_post(
-        "@Manfre@manfre.net @mAnFrE@takahe.social @manfre@manfre.net @unfetched@manfre.net",
-        fake_post,
-    ) == (
-        '<a href="/@Manfre@manfre.net/">@Manfre</a> '
-        '<a href="https://takahe.social/@manfre@takahe.social/">@mAnFrE@takahe.social</a> '
-        '<a href="/@Manfre@manfre.net/">@manfre</a> '
-        "@unfetched@manfre.net"
-    )
+from core.html import FediverseHtmlParser
+
+
+@pytest.mark.django_db
+def test_parser(identity):
+    """
+    Validates the HtmlParser in its various output modes
+    """
+    # Basic tag allowance
+    parser = FediverseHtmlParser("<p>Hello!</p><script></script>")
+    assert parser.html == "<p>Hello!</p>"
+    assert parser.plain_text == "Hello!"
+
+    # Newline erasure
+    parser = FediverseHtmlParser("<p>Hi!</p>\n\n<p>How are you?</p>")
+    assert parser.html == "<p>Hi!</p><p>How are you?</p>"
+    assert parser.plain_text == "Hi!\n\nHow are you?"
+
+    # Trying to be evil
+    parser = FediverseHtmlParser("<scri<span></span>pt>")
+    assert "<scr" not in parser.html
+    parser = FediverseHtmlParser("<scri #hashtag pt>")
+    assert "<scr" not in parser.html
+
+    # Entities are escaped
+    parser = FediverseHtmlParser("<p>It&#39;s great</p>", find_hashtags=True)
+    assert parser.html == "<p>It&#x27;s great</p>"
+    assert parser.plain_text == "It's great"
+    assert parser.hashtags == set()
+
+    # Linkify works, but only with protocol prefixes
+    parser = FediverseHtmlParser("<p>test.com</p>")
+    assert parser.html == "<p>test.com</p>"
+    assert parser.plain_text == "test.com"
+    parser = FediverseHtmlParser("<p>https://test.com</p>")
+    assert (
+        parser.html == '<p><a href="https://test.com" rel="nofollow">test.com</a></p>'
+    )
+    assert parser.plain_text == "https://test.com"
+
+    # Links are preserved
+    parser = FediverseHtmlParser("<a href='https://takahe.social'>takahe social</a>")
+    assert (
+        parser.html
+        == '<a href="https://takahe.social" rel="nofollow">takahe social</a>'
+    )
+    assert parser.plain_text == "https://takahe.social"
+
+    # Very long links are shortened
+    full_url = "https://social.example.com/a-long/path/that-should-be-shortened"
+    parser = FediverseHtmlParser(f"<p>{full_url}</p>")
+    assert (
+        parser.html
+        == f'<p><a href="{full_url}" rel="nofollow" class="ellipsis" title="{full_url.removeprefix("https://")}">social.example.com/a-long/path</a></p>'
+    )
+    assert (
+        parser.plain_text
+        == "https://social.example.com/a-long/path/that-should-be-shortened"
+    )
+
+    # Make sure things that look like mentions are left alone with no mentions supplied.
+    parser = FediverseHtmlParser(
+        "<p>@test@example.com</p>",
+        find_mentions=True,
+        find_hashtags=True,
+        find_emojis=True,
+    )
+    assert parser.html == "<p>@test@example.com</p>"
+    assert parser.plain_text == "@test@example.com"
+    assert parser.mentions == {"test@example.com"}
+
+    # Make sure mentions work when there is a mention supplied
+    parser = FediverseHtmlParser(
+        "<p>@test@example.com</p>",
+        mentions=[identity],
+        find_hashtags=True,
+        find_emojis=True,
+    )
+    assert parser.html == '<p><a href="/@test@example.com/">@test</a></p>'
+    assert parser.plain_text == "@test@example.com"
+    assert parser.mentions == {"test@example.com"}
+
+    # Ensure mentions are case insensitive
+    parser = FediverseHtmlParser(
+        "<p>@TeSt@ExamPle.com</p>",
+        mentions=[identity],
+        find_hashtags=True,
+        find_emojis=True,
+    )
+    assert parser.html == '<p><a href="/@test@example.com/">@TeSt</a></p>'
+    assert parser.plain_text == "@TeSt@ExamPle.com"
+    assert parser.mentions == {"test@example.com"}
+
+    # Ensure hashtags are linked, even through spans, but not within hrefs
+    parser = FediverseHtmlParser(
+        '<a href="http://example.com#notahashtag">something</a> <span>#</span>hashtag <a href="https://example.com/tags/hashtagtwo/">#hashtagtwo</a>',
+        find_hashtags=True,
+        find_emojis=True,
+    )
+    assert (
+        parser.html
+        == '<a href="http://example.com#notahashtag" rel="nofollow">something</a> <a href="/tags/hashtag/">#hashtag</a> <a href="/tags/hashtagtwo/">#hashtagtwo</a>'
+    )
+    assert parser.plain_text == "http://example.com#notahashtag #hashtag #hashtagtwo"
+    assert parser.hashtags == {"hashtag", "hashtagtwo"}
+
+    # Ensure lists are rendered reasonably
+    parser = FediverseHtmlParser(
+        "<p>List:</p><ul><li>One</li><li>Two</li><li>Three</li></ul><p>End!</p>",
+        find_hashtags=True,
+        find_emojis=True,
+    )
+    assert parser.html == "<p>List:</p><p>One<br>Two<br>Three</p><p>End!</p>"
+    assert parser.plain_text == "List:\n\nOne\nTwo\nThree\n\nEnd!"
@@ -13,7 +13,7 @@ from django.utils.functional import lazy
 from lxml import etree
 
 from core.exceptions import ActorMismatchError, capture_message
-from core.html import ContentRenderer, html_to_plaintext, strip_html
+from core.html import ContentRenderer, FediverseHtmlParser
 from core.ld import (
     canonicalise,
     format_ld_date,
@@ -530,8 +530,8 @@ class Identity(StatorModel):
             response["attachment"] = [
                 {
                     "type": "http://schema.org#PropertyValue",
-                    "name": strip_html(item["name"], linkify=False),
-                    "value": strip_html(item["value"]),
+                    "name": FediverseHtmlParser(item["name"]).plain_text,
+                    "value": FediverseHtmlParser(item["value"]).html,
                 }
                 for item in self.metadata
             ]
@@ -781,7 +781,9 @@ class Identity(StatorModel):
                 self.metadata.append(
                     {
                         "name": attachment.get("name"),
-                        "value": strip_html(attachment.get("http://schema.org#value")),
+                        "value": FediverseHtmlParser(
+                            attachment.get("http://schema.org#value")
+                        ).html,
                     }
                 )
         # Now go do webfinger with that info to see if we can get a canonical domain
@@ -903,12 +905,14 @@ class Identity(StatorModel):
             Post.Visibilities.mentioned: "direct",
         }
         result["source"] = {
-            "note": html_to_plaintext(self.summary) if self.summary else "",
+            "note": FediverseHtmlParser(self.summary).plain_text
+            if self.summary
+            else "",
             "fields": (
                 [
                     {
                         "name": m["name"],
-                        "value": strip_html(m["value"], linkify=False),
+                        "value": FediverseHtmlParser(m["value"]).plain_text,
                         "verified_at": None,
                     }
                     for m in self.metadata
@@ -3,7 +3,7 @@ from django.template.defaultfilters import linebreaks_filter
 
 from activities.models import FanOut
 from core.files import resize_image
-from core.html import strip_html
+from core.html import FediverseHtmlParser
 from users.models import (
     Block,
     BlockStates,
@@ -211,7 +211,7 @@ class IdentityService:
         Safely sets a summary and turns linebreaks into HTML
         """
         if summary:
-            self.identity.summary = linebreaks_filter(strip_html(summary))
+            self.identity.summary = FediverseHtmlParser(linebreaks_filter(summary)).html
         else:
             self.identity.summary = None
         self.identity.save()
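Same pattern as the Post changes: run the linebreaks filter first, then let the parser sanitize the result. For a two-line summary, the parser rules in core/html.py above imply:

    from django.template.defaultfilters import linebreaks_filter
    from core.html import FediverseHtmlParser

    FediverseHtmlParser(linebreaks_filter("Line one\nLine two")).html
    # '<p>Line one<br>Line two</p>'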
@@ -4,7 +4,7 @@ from django.shortcuts import redirect
 from django.utils.decorators import method_decorator
 from django.views.generic import FormView
 
-from core.html import html_to_plaintext
+from core.html import FediverseHtmlParser
 from core.models.config import Config
 from users.decorators import identity_required
 from users.models import IdentityStates
@@ -65,7 +65,11 @@ class ProfilePage(FormView):
         identity = self.request.identity
         return {
             "name": identity.name,
-            "summary": html_to_plaintext(identity.summary) if identity.summary else "",
+            "summary": (
+                FediverseHtmlParser(identity.summary).plain_text
+                if identity.summary
+                else ""
+            ),
             "icon": identity.icon and identity.icon.url,
             "image": identity.image and identity.image.url,
             "discoverable": identity.discoverable,