Allow Unicode characters in hashtags (#659)

This commit is contained in:
Henri Dickson 2023-11-19 11:58:20 -05:00 committed by GitHub
parent b122e2beda
commit 5267e4108c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 12 additions and 1 deletion

View file

@ -38,7 +38,7 @@ class FediverseHtmlParser(HTMLParser):
r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)" r"(^|[^\w\d\-_/])@([\w\d\-_]+(?:@[\w\d\-_\.]+[\w\d\-_]+)?)"
) )
HASHTAG_REGEX = re.compile(r"\B#([a-zA-Z0-9(_)]+\b)(?!;)") HASHTAG_REGEX = re.compile(r"\B#([\w()]+\b)(?!;)")
EMOJI_REGEX = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B") EMOJI_REGEX = re.compile(r"\B:([a-zA-Z0-9(_)-]+):\B")

View file

@ -1,4 +1,5 @@
import pytest import pytest
from django.template.defaultfilters import linebreaks_filter
from core.html import FediverseHtmlParser from core.html import FediverseHtmlParser
@ -101,6 +102,16 @@ def test_parser(identity):
assert parser.plain_text == "@TeSt@ExamPle.com" assert parser.plain_text == "@TeSt@ExamPle.com"
assert parser.mentions == {"test@example.com"} assert parser.mentions == {"test@example.com"}
# Ensure hashtags are parsed and linkified in local posts
parser = FediverseHtmlParser(
linebreaks_filter("#tag1-x,#tag2 #标签。"), find_hashtags=True
)
assert (
parser.html
== '<p><a href="/tags/tag1/" rel="tag">#tag1</a>-x,<a href="/tags/tag2/" rel="tag">#tag2</a> <a href="/tags/标签/" rel="tag">#标签</a>。</p>'
)
assert parser.hashtags == {"tag1", "tag2", "标签"}
# Ensure hashtags are linked, even through spans, but not within hrefs # Ensure hashtags are linked, even through spans, but not within hrefs
parser = FediverseHtmlParser( parser = FediverseHtmlParser(
'<a href="http://example.com#notahashtag">something</a> <span>#</span>hashtag <a href="https://example.com/tags/hashtagtwo/">#hashtagtwo</a>', '<a href="http://example.com#notahashtag">something</a> <span>#</span>hashtag <a href="https://example.com/tags/hashtagtwo/">#hashtagtwo</a>',