Enable finding existing hashtags case-insensitively

We should store hashtags case-sensitively, but ensure that an existing hashtag that differs only in case is found and re-used. For example, an existing #BookWyrm hashtag will be found and used even if the status content uses #bookwyrm.
2024-11-26 19:41:11 +00:00 · 2023-02-21 17:17:53 +01:00 · 2023-02-21 17:17:53 +01:00 · 9ca9883e0b
commit 9ca9883e0b
parent f3334b1550
4 changed files with 41 additions and 27 deletions
--- a/bookwyrm/activitypub/note.py
+++ b/bookwyrm/activitypub/note.py
@ -65,6 +65,7 @@ class Note(ActivityObject):
                rf'(<a href=")[^"]*(" data-mention="hashtag">{hashtag.name}</a>)',
                rf"\1{hashtag.remote_id}\2",
                instance.content,
+                flags=re.IGNORECASE,
            )
            if instance.content != updated_content:
                instance.content = updated_content
--- a/bookwyrm/tests/activitypub/test_note.py
+++ b/bookwyrm/tests/activitypub/test_note.py
@ -34,7 +34,7 @@ class Note(TestCase):
            inReplyToBook=self.book.remote_id,
            content="<p>This is interesting "
            + '<a href="https://test-instance.org/hashtag/2" data-mention="hashtag">'
-            + "#BookClub</a></p>",
+            + "#bookclub</a></p>",
            published="2023-02-17T23:12:59.398030+00:00",
            to=[],
            cc=[],
@ -60,5 +60,5 @@ class Note(TestCase):
            instance.content,
            "<p>This is interesting "
            + f'<a href="{hashtag.remote_id}" data-mention="hashtag">'
-            + "#BookClub</a></p>",
+            + "#bookclub</a></p>",
        )
--- a/bookwyrm/tests/views/test_status.py
+++ b/bookwyrm/tests/views/test_status.py
@ -6,7 +6,7 @@ from django.test import TestCase, TransactionTestCase
 from django.test.client import RequestFactory

 from bookwyrm import forms, models, views
-from bookwyrm.views.status import find_mentions, find_hashtags
+from bookwyrm.views.status import find_mentions, find_or_create_hashtags
 from bookwyrm.settings import DOMAIN

 from bookwyrm.tests.validate_html import validate_html
@ -339,7 +339,8 @@ class StatusViews(TestCase):
        view = views.CreateStatus.as_view()
        form = forms.CommentForm(
            {
-                "content": "this is an #existing hashtag, this one is #new.",
+                "content": "this is an #EXISTING hashtag but all uppercase, "
+                + "this one is #NewTag.",
                "user": self.local_user.id,
                "book": self.book.id,
                "privacy": "public",
@ -356,44 +357,45 @@ class StatusViews(TestCase):
        self.assertEqual(list(status.mention_hashtags.all()), list(hashtags))

        hashtag_exising = models.Hashtag.objects.filter(name="#existing").first()
-        hashtag_new = models.Hashtag.objects.filter(name="#new").first()
+        hashtag_new = models.Hashtag.objects.filter(name="#NewTag").first()
        self.assertEqual(
            status.content,
            "<p>this is an "
            + f'<a href="{hashtag_exising.remote_id}" data-mention="hashtag">'
-            + "#existing</a> hashtag, this one is "
-            + f'<a href="{hashtag_new.remote_id}" data-mention="hashtag">#new</a>.</p>',
+            + "#EXISTING</a> hashtag but all uppercase, this one is "
+            + f'<a href="{hashtag_new.remote_id}" data-mention="hashtag">'
+            + "#NewTag</a>.</p>",
        )

-    def test_find_hashtags(self, *_):
+    def test_find_or_create_hashtags(self, *_):
        """detect and look up #hashtags"""
-        result = find_hashtags("no hashtag to be found here")
+        result = find_or_create_hashtags("no hashtag to be found here")
        self.assertEqual(result, {})

-        result = find_hashtags("#existing")
+        result = find_or_create_hashtags("#existing")
        self.assertEqual(result["#existing"], self.existing_hashtag)

-        result = find_hashtags("leading text #existing")
+        result = find_or_create_hashtags("leading text #existing")
        self.assertEqual(result["#existing"], self.existing_hashtag)

-        result = find_hashtags("leading #existing trailing")
+        result = find_or_create_hashtags("leading #existing trailing")
        self.assertEqual(result["#existing"], self.existing_hashtag)

        self.assertIsNone(models.Hashtag.objects.filter(name="new").first())
-        result = find_hashtags("leading #new trailing")
+        result = find_or_create_hashtags("leading #new trailing")
        new_hashtag = models.Hashtag.objects.filter(name="#new").first()
        self.assertIsNotNone(new_hashtag)
        self.assertEqual(result["#new"], new_hashtag)

-        result = find_hashtags("leading #existing #new trailing")
+        result = find_or_create_hashtags("leading #existing #new trailing")
        self.assertEqual(result["#existing"], self.existing_hashtag)
        self.assertEqual(result["#new"], new_hashtag)

-        result = find_hashtags("#Braunbär")
+        result = find_or_create_hashtags("#Braunbär")
        hashtag = models.Hashtag.objects.filter(name="#Braunbär").first()
        self.assertEqual(result["#Braunbär"], hashtag)

-        result = find_hashtags("#ひぐま")
+        result = find_or_create_hashtags("#ひぐま")
        hashtag = models.Hashtag.objects.filter(name="#ひぐま").first()
        self.assertEqual(result["#ひぐま"], hashtag)

--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@ -116,7 +116,7 @@ class CreateStatus(View):
            status.mention_users.add(status.reply_parent.user)

        # inspect the text for hashtags
-        for (mention_text, mention_hashtag) in find_hashtags(content).items():
+        for (mention_text, mention_hashtag) in find_or_create_hashtags(content).items():
            # add them to status mentions fk
            status.mention_hashtags.add(mention_hashtag)

@ -250,25 +250,36 @@ def find_mentions(user, content):
    return username_dict


-def find_hashtags(content):
-    """detect #hashtags in raw status content"""
+def find_or_create_hashtags(content):
+    """detect #hashtags in raw status content
+
+    hashtags are stored case-sensitively, but an existing hashtag that
+    differs only in case is found and re-used. for example, an
+    existing #BookWyrm hashtag will be found and used even if the
+    status content is using #bookwyrm.
+    """
    if not content:
        return {}

-    hashtags = re.findall(regex.HASHTAG, content)
-    if len(hashtags) == 0:
+    found_hashtags = {t.lower(): t for t in re.findall(regex.HASHTAG, content)}
+    if len(found_hashtags) == 0:
        return {}

-    known_tags = models.Hashtag.objects.filter(Q(name__in=hashtags)).distinct()
-    hashtag_dict = {t.name: t for t in known_tags}
+    known_hashtags = {
+        t.name.lower(): t
+        for t in models.Hashtag.objects.filter(
+            Q(name__in=found_hashtags.keys())
+        ).distinct()
+    }

-    not_found = set(hashtags) - set(hashtag_dict.keys())
-    for tag_name in not_found:
+    not_found = found_hashtags.keys() - known_hashtags.keys()
+    for lower_name in not_found:
+        tag_name = found_hashtags[lower_name]
        mention_hashtag = models.Hashtag(name=tag_name)
        mention_hashtag.save()
-        hashtag_dict[mention_hashtag.name] = mention_hashtag
+        known_hashtags[lower_name] = mention_hashtag

-    return hashtag_dict
+    return {found_hashtags[k]: v for k, v in known_hashtags.items()}


 def format_links(content):