Enable finding existing hashtags case-insensitively

We should store hashtags case-sensitively, but ensure that an existing
hashtag with different casing is found and re-used. For example,
an existing #BookWyrm hashtag will be found and used even if the
status content uses #bookwyrm.
This commit is contained in:
Christof Dorner 2023-02-21 17:17:53 +01:00
parent f3334b1550
commit 9ca9883e0b
4 changed files with 41 additions and 27 deletions

View file

@ -65,6 +65,7 @@ class Note(ActivityObject):
rf'(<a href=")[^"]*(" data-mention="hashtag">{hashtag.name}</a>)', rf'(<a href=")[^"]*(" data-mention="hashtag">{hashtag.name}</a>)',
rf"\1{hashtag.remote_id}\2", rf"\1{hashtag.remote_id}\2",
instance.content, instance.content,
flags=re.IGNORECASE,
) )
if instance.content != updated_content: if instance.content != updated_content:
instance.content = updated_content instance.content = updated_content

View file

@ -34,7 +34,7 @@ class Note(TestCase):
inReplyToBook=self.book.remote_id, inReplyToBook=self.book.remote_id,
content="<p>This is interesting " content="<p>This is interesting "
+ '<a href="https://test-instance.org/hashtag/2" data-mention="hashtag">' + '<a href="https://test-instance.org/hashtag/2" data-mention="hashtag">'
+ "#BookClub</a></p>", + "#bookclub</a></p>",
published="2023-02-17T23:12:59.398030+00:00", published="2023-02-17T23:12:59.398030+00:00",
to=[], to=[],
cc=[], cc=[],
@ -60,5 +60,5 @@ class Note(TestCase):
instance.content, instance.content,
"<p>This is interesting " "<p>This is interesting "
+ f'<a href="{hashtag.remote_id}" data-mention="hashtag">' + f'<a href="{hashtag.remote_id}" data-mention="hashtag">'
+ "#BookClub</a></p>", + "#bookclub</a></p>",
) )

View file

@ -6,7 +6,7 @@ from django.test import TestCase, TransactionTestCase
from django.test.client import RequestFactory from django.test.client import RequestFactory
from bookwyrm import forms, models, views from bookwyrm import forms, models, views
from bookwyrm.views.status import find_mentions, find_hashtags from bookwyrm.views.status import find_mentions, find_or_create_hashtags
from bookwyrm.settings import DOMAIN from bookwyrm.settings import DOMAIN
from bookwyrm.tests.validate_html import validate_html from bookwyrm.tests.validate_html import validate_html
@ -339,7 +339,8 @@ class StatusViews(TestCase):
view = views.CreateStatus.as_view() view = views.CreateStatus.as_view()
form = forms.CommentForm( form = forms.CommentForm(
{ {
"content": "this is an #existing hashtag, this one is #new.", "content": "this is an #EXISTING hashtag but all uppercase, "
+ "this one is #NewTag.",
"user": self.local_user.id, "user": self.local_user.id,
"book": self.book.id, "book": self.book.id,
"privacy": "public", "privacy": "public",
@ -356,44 +357,45 @@ class StatusViews(TestCase):
self.assertEqual(list(status.mention_hashtags.all()), list(hashtags)) self.assertEqual(list(status.mention_hashtags.all()), list(hashtags))
hashtag_exising = models.Hashtag.objects.filter(name="#existing").first() hashtag_exising = models.Hashtag.objects.filter(name="#existing").first()
hashtag_new = models.Hashtag.objects.filter(name="#new").first() hashtag_new = models.Hashtag.objects.filter(name="#NewTag").first()
self.assertEqual( self.assertEqual(
status.content, status.content,
"<p>this is an " "<p>this is an "
+ f'<a href="{hashtag_exising.remote_id}" data-mention="hashtag">' + f'<a href="{hashtag_exising.remote_id}" data-mention="hashtag">'
+ "#existing</a> hashtag, this one is " + "#EXISTING</a> hashtag but all uppercase, this one is "
+ f'<a href="{hashtag_new.remote_id}" data-mention="hashtag">#new</a>.</p>', + f'<a href="{hashtag_new.remote_id}" data-mention="hashtag">'
+ "#NewTag</a>.</p>",
) )
def test_find_hashtags(self, *_): def test_find_or_create_hashtags(self, *_):
"""detect and look up #hashtags""" """detect and look up #hashtags"""
result = find_hashtags("no hashtag to be found here") result = find_or_create_hashtags("no hashtag to be found here")
self.assertEqual(result, {}) self.assertEqual(result, {})
result = find_hashtags("#existing") result = find_or_create_hashtags("#existing")
self.assertEqual(result["#existing"], self.existing_hashtag) self.assertEqual(result["#existing"], self.existing_hashtag)
result = find_hashtags("leading text #existing") result = find_or_create_hashtags("leading text #existing")
self.assertEqual(result["#existing"], self.existing_hashtag) self.assertEqual(result["#existing"], self.existing_hashtag)
result = find_hashtags("leading #existing trailing") result = find_or_create_hashtags("leading #existing trailing")
self.assertEqual(result["#existing"], self.existing_hashtag) self.assertEqual(result["#existing"], self.existing_hashtag)
self.assertIsNone(models.Hashtag.objects.filter(name="new").first()) self.assertIsNone(models.Hashtag.objects.filter(name="new").first())
result = find_hashtags("leading #new trailing") result = find_or_create_hashtags("leading #new trailing")
new_hashtag = models.Hashtag.objects.filter(name="#new").first() new_hashtag = models.Hashtag.objects.filter(name="#new").first()
self.assertIsNotNone(new_hashtag) self.assertIsNotNone(new_hashtag)
self.assertEqual(result["#new"], new_hashtag) self.assertEqual(result["#new"], new_hashtag)
result = find_hashtags("leading #existing #new trailing") result = find_or_create_hashtags("leading #existing #new trailing")
self.assertEqual(result["#existing"], self.existing_hashtag) self.assertEqual(result["#existing"], self.existing_hashtag)
self.assertEqual(result["#new"], new_hashtag) self.assertEqual(result["#new"], new_hashtag)
result = find_hashtags("#Braunbär") result = find_or_create_hashtags("#Braunbär")
hashtag = models.Hashtag.objects.filter(name="#Braunbär").first() hashtag = models.Hashtag.objects.filter(name="#Braunbär").first()
self.assertEqual(result["#Braunbär"], hashtag) self.assertEqual(result["#Braunbär"], hashtag)
result = find_hashtags("#ひぐま") result = find_or_create_hashtags("#ひぐま")
hashtag = models.Hashtag.objects.filter(name="#ひぐま").first() hashtag = models.Hashtag.objects.filter(name="#ひぐま").first()
self.assertEqual(result["#ひぐま"], hashtag) self.assertEqual(result["#ひぐま"], hashtag)

View file

@ -116,7 +116,7 @@ class CreateStatus(View):
status.mention_users.add(status.reply_parent.user) status.mention_users.add(status.reply_parent.user)
# inspect the text for hashtags # inspect the text for hashtags
for (mention_text, mention_hashtag) in find_hashtags(content).items(): for (mention_text, mention_hashtag) in find_or_create_hashtags(content).items():
# add them to status mentions fk # add them to status mentions fk
status.mention_hashtags.add(mention_hashtag) status.mention_hashtags.add(mention_hashtag)
@ -250,25 +250,36 @@ def find_mentions(user, content):
return username_dict return username_dict
def find_hashtags(content): def find_or_create_hashtags(content):
"""detect #hashtags in raw status content""" """detect #hashtags in raw status content
it stores hashtags case-sensitive, but ensures that an existing
hashtag with different case are found and re-used. for example,
an existing #BookWyrm hashtag will be found and used even if the
status content is using #bookwyrm.
"""
if not content: if not content:
return {} return {}
hashtags = re.findall(regex.HASHTAG, content) found_hashtags = {t.lower(): t for t in re.findall(regex.HASHTAG, content)}
if len(hashtags) == 0: if len(found_hashtags) == 0:
return {} return {}
known_tags = models.Hashtag.objects.filter(Q(name__in=hashtags)).distinct() known_hashtags = {
hashtag_dict = {t.name: t for t in known_tags} t.name.lower(): t
for t in models.Hashtag.objects.filter(
Q(name__in=found_hashtags.keys())
).distinct()
}
not_found = set(hashtags) - set(hashtag_dict.keys()) not_found = found_hashtags.keys() - known_hashtags.keys()
for tag_name in not_found: for lower_name in not_found:
tag_name = found_hashtags[lower_name]
mention_hashtag = models.Hashtag(name=tag_name) mention_hashtag = models.Hashtag(name=tag_name)
mention_hashtag.save() mention_hashtag.save()
hashtag_dict[mention_hashtag.name] = mention_hashtag known_hashtags[lower_name] = mention_hashtag
return hashtag_dict return {found_hashtags[k]: v for k, v in known_hashtags.items()}
def format_links(content): def format_links(content):