Significantly better hashtag link parsing

Fixes #203
This commit is contained in:
Andrew Godwin 2022-12-20 13:55:14 +00:00
parent 4635874c12
commit 51d34eda9c
3 changed files with 50 additions and 11 deletions

View file

@ -731,7 +731,7 @@ class Post(StatorModel):
if tag["type"].lower() == "mention":
mention_identity = Identity.by_actor_uri(tag["href"], create=True)
post.mentions.add(mention_identity)
elif tag["type"].lower() == "hashtag":
elif tag["type"].lower() in ["_:hashtag", "hashtag"]:
post.hashtags.append(tag["name"].lower().lstrip("#"))
elif tag["type"].lower() in ["toot:emoji", "emoji"]:
emoji = Emoji.by_ap_tag(post.author.domain, tag, create=True)

View file

@ -34,11 +34,10 @@ def sanitize_html(post_html: str) -> str:
Only allows a, br, p and span tags, and class attributes.
"""
cleaner = bleach.Cleaner(
tags=["br", "p"],
tags=["br", "p", "a"],
attributes={ # type:ignore
"a": allow_a,
"p": ["class"],
"span": ["class"],
},
filters=[partial(LinkifyFilter, url_re=url_regex)],
strip=True,
@ -148,16 +147,26 @@ class ContentRenderer:
def linkify_hashtags(self, html, identity) -> str:
from activities.models import Hashtag
def replacer(match):
hashtag = match.group(1)
def replacer(attrs, new=False):
# See if the text in this link looks like a hashtag
if not Hashtag.hashtag_regex.match(attrs.get("_text", "")):
return attrs
hashtag = attrs["_text"].strip().lstrip("#")
attrs[None, "class"] = "hashtag"
if (None, "rel") in attrs:
del attrs[None, "rel"]
if self.local:
return (
f'<a class="hashtag" href="/tags/{hashtag.lower()}/">#{hashtag}</a>'
)
attrs[None, "href"] = f"/tags/{hashtag.lower()}/"
else:
return f'<a class="hashtag" href="https://{identity.domain.uri_domain}/tags/{hashtag.lower()}/">#{hashtag}</a>'
attrs[
None, "href"
] = f"https://{identity.domain.uri_domain}/tags/{hashtag.lower()}/"
return attrs
return Hashtag.hashtag_regex.sub(replacer, html)
linker = bleach.linkifier.Linker(
url_re=Hashtag.hashtag_regex, callbacks=[replacer]
)
return linker.linkify(html)
def imageify_emojis(self, html: str, identity, include_local: bool = True):
"""

View file

@ -1,4 +1,8 @@
from core.html import html_to_plaintext, sanitize_html
from unittest.mock import Mock
import pytest
from core.html import ContentRenderer, html_to_plaintext, sanitize_html
def test_html_to_plaintext():
@ -31,3 +35,29 @@ def test_sanitize_post():
sanitize_html("<p>@someone@subdomain.some-domain.com</p>")
== "<p>@someone@subdomain.some-domain.com</p>"
)
@pytest.mark.django_db
def test_link_preservation(emoji_locals):
    """
    Checks that rendering a post linkifies without damaging existing links.

    The rendered output should:
    - Keep links that arrived from other servers
    - Turn mentions and hashtags into links
    - Do all of that without one step clobbering another
    """
    # Stand-in for the identity that "@andrew" should resolve to.
    mentioned = Mock()
    mentioned.username = "andrew"
    mentioned.domain_id = "aeracode.org"
    mentioned.urls.view = "/@andrew@aeracode.org/"

    # Stand-in for the post: exposes the mention list and the author's domain.
    post_stub = Mock()
    post_stub.mentions.all.return_value = [mentioned]
    post_stub.author.domain.uri_domain = "example.com"

    # Input covers: a bare mention, a hashtag whose "#" is wrapped in a span,
    # an ordinary link whose URL fragment merely looks like a hashtag, and an
    # existing hashtag link pointing at another server.
    source_html = 'Hello @andrew, I want to link to this <span>#</span>hashtag: <a href="http://example.com/@andrew/#notahashtag">here</a> and rewrite <a href="https://example.com/tags/thishashtag/">#thishashtag</a>'

    # Expected: mention and both hashtags become local links, while the
    # ordinary link keeps its href and only gains rel="nofollow".
    expected_html = 'Hello <a href="/@andrew@aeracode.org/">@andrew</a>, I want to link to this <a href="/tags/hashtag/" class="hashtag">#hashtag</a>: <a href="http://example.com/@andrew/#notahashtag" rel="nofollow">here</a> and rewrite <a href="/tags/thishashtag/" class="hashtag">#thishashtag</a>'

    rendered = ContentRenderer(local=True).render_post(source_html, post_stub)
    assert rendered == expected_html