From c68304a99beb1f6a732b4150429eb2c73f1b7826 Mon Sep 17 00:00:00 2001
From: Christof Dorner
Date: Sat, 17 Dec 2022 20:13:57 +0100
Subject: [PATCH] Parse hashtags and store them in CreateStatus view

---
 bookwyrm/activitypub/__init__.py            |  2 +-
 bookwyrm/activitypub/base_activity.py       |  7 +++
 bookwyrm/migrations/0171_hashtag_support.py | 55 +++++++++++++++++++
 bookwyrm/models/__init__.py                 |  2 +
 bookwyrm/models/fields.py                   |  5 ++
 bookwyrm/models/hashtag.py                  | 19 +++++++
 bookwyrm/models/status.py                   |  1 +
 bookwyrm/tests/views/test_status.py         | 61 ++++++++++++++++++++-
 bookwyrm/utils/regex.py                     |  1 +
 bookwyrm/views/status.py                    | 36 ++++++++++++
 10 files changed, 187 insertions(+), 2 deletions(-)
 create mode 100644 bookwyrm/migrations/0171_hashtag_support.py
 create mode 100644 bookwyrm/models/hashtag.py

diff --git a/bookwyrm/activitypub/__init__.py b/bookwyrm/activitypub/__init__.py
index bfb22fa32..05ca44476 100644
--- a/bookwyrm/activitypub/__init__.py
+++ b/bookwyrm/activitypub/__init__.py
@@ -3,7 +3,7 @@ import inspect
 import sys
 
 from .base_activity import ActivityEncoder, Signature, naive_parse
-from .base_activity import Link, Mention
+from .base_activity import Link, Mention, Hashtag
 from .base_activity import ActivitySerializerError, resolve_remote_id
 from .image import Document, Image
 from .note import Note, GeneratedNote, Article, Comment, Quotation
diff --git a/bookwyrm/activitypub/base_activity.py b/bookwyrm/activitypub/base_activity.py
index 6751f9c8e..ebc06045d 100644
--- a/bookwyrm/activitypub/base_activity.py
+++ b/bookwyrm/activitypub/base_activity.py
@@ -426,3 +426,10 @@ class Mention(Link):
     """a subtype of Link for mentioning an actor"""
 
     type: str = "Mention"
+
+
+@dataclass(init=False)
+class Hashtag(Link):
+    """a subtype of Link for mentioning a hashtag"""
+
+    type: str = "Hashtag"
diff --git a/bookwyrm/migrations/0171_hashtag_support.py b/bookwyrm/migrations/0171_hashtag_support.py
new file mode 100644
index 000000000..b77162edb
--- /dev/null
+++ b/bookwyrm/migrations/0171_hashtag_support.py
@@ -0,0 +1,55 @@
+# Generated by Django 3.2.16 on 2022-12-17 19:28
+
+import bookwyrm.models.fields
+import django.contrib.postgres.operations
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ("bookwyrm", "0170_merge_0168_auto_20221205_2331_0169_auto_20221206_0902"),
+    ]
+
+    operations = [
+        # the citext extension has to exist before a CICharField column is created
+        django.contrib.postgres.operations.CITextExtension(),
+        migrations.CreateModel(
+            name="Hashtag",
+            fields=[
+                (
+                    "id",
+                    models.AutoField(
+                        auto_created=True,
+                        primary_key=True,
+                        serialize=False,
+                        verbose_name="ID",
+                    ),
+                ),
+                ("created_date", models.DateTimeField(auto_now_add=True)),
+                ("updated_date", models.DateTimeField(auto_now=True)),
+                (
+                    "remote_id",
+                    bookwyrm.models.fields.RemoteIdField(
+                        max_length=255,
+                        null=True,
+                        validators=[bookwyrm.models.fields.validate_remote_id],
+                    ),
+                ),
+                (
+                    "name",
+                    bookwyrm.models.fields.CICharField(max_length=256),
+                ),
+            ],
+            options={
+                "abstract": False,
+            },
+        ),
+        migrations.AddField(
+            model_name="status",
+            name="mention_hashtags",
+            field=bookwyrm.models.fields.TagField(
+                related_name="mention_hashtag", to="bookwyrm.Hashtag"
+            ),
+        ),
+    ]
diff --git a/bookwyrm/models/__init__.py b/bookwyrm/models/__init__.py
index ae7000162..f5b72f3e4 100644
--- a/bookwyrm/models/__init__.py
+++ b/bookwyrm/models/__init__.py
@@ -34,6 +34,8 @@ from .antispam import EmailBlocklist, IPBlocklist, AutoMod, automod_task
 
 from .notification import Notification
 
+from .hashtag import Hashtag
+
 cls_members = inspect.getmembers(sys.modules[__name__], inspect.isclass)
 activity_models = {
     c[1].activity_serializer.__name__: c[1]
diff --git a/bookwyrm/models/fields.py b/bookwyrm/models/fields.py
index a970e4124..f78f883f8 100644
--- a/bookwyrm/models/fields.py
+++ b/bookwyrm/models/fields.py
@@ -7,6 +7,7 @@ from urllib.parse import urljoin
 import dateutil.parser
 from dateutil.parser import ParserError
 from django.contrib.postgres.fields import ArrayField as DjangoArrayField
+from django.contrib.postgres.fields import CICharField as DjangoCICharField
 from django.core.exceptions import ValidationError
 from django.db import models
 from django.forms import ClearableFileInput, ImageField as DjangoImageField
@@ -546,6 +547,10 @@ class CharField(ActivitypubFieldMixin, models.CharField):
     """activitypub-aware char field"""
 
 
+class CICharField(ActivitypubFieldMixin, DjangoCICharField):
+    """activitypub-aware cichar field"""
+
+
 class URLField(ActivitypubFieldMixin, models.URLField):
     """activitypub-aware url field"""
 
diff --git a/bookwyrm/models/hashtag.py b/bookwyrm/models/hashtag.py
new file mode 100644
index 000000000..d1ede638c
--- /dev/null
+++ b/bookwyrm/models/hashtag.py
@@ -0,0 +1,19 @@
+""" model for tags """
+from bookwyrm import activitypub
+from .activitypub_mixin import ActivitypubMixin
+from .base_model import BookWyrmModel
+from .fields import CICharField
+
+
+class Hashtag(ActivitypubMixin, BookWyrmModel):
+    """a hashtag which can be used in statuses"""
+
+    name = CICharField(
+        max_length=256, blank=False, null=False, activitypub_field="name"
+    )
+
+    name_field = "name"
+    activity_serializer = activitypub.Hashtag
+
+    def __repr__(self):
+        return f"<{self.__class__.__name__} id={self.id} name={self.name}>"
diff --git a/bookwyrm/models/status.py b/bookwyrm/models/status.py
index e51c7b2a1..1fcc9ee75 100644
--- a/bookwyrm/models/status.py
+++ b/bookwyrm/models/status.py
@@ -34,6 +34,7 @@ class Status(OrderedCollectionPageMixin, BookWyrmModel):
     raw_content = models.TextField(blank=True, null=True)
     mention_users = fields.TagField("User", related_name="mention_user")
     mention_books = fields.TagField("Edition", related_name="mention_book")
+    mention_hashtags = fields.TagField("Hashtag", related_name="mention_hashtag")
     local = models.BooleanField(default=True)
     content_warning = fields.CharField(
         max_length=500, blank=True, null=True, activitypub_field="summary"
diff --git a/bookwyrm/tests/views/test_status.py b/bookwyrm/tests/views/test_status.py
index 203ec57dd..2ef20935b 100644
--- a/bookwyrm/tests/views/test_status.py
+++ b/bookwyrm/tests/views/test_status.py
@@ -6,7 +6,7 @@ from django.test import TestCase, TransactionTestCase
 from django.test.client import RequestFactory
 
 from bookwyrm import forms, models, views
-from bookwyrm.views.status import find_mentions
+from bookwyrm.views.status import find_mentions, find_hashtags
 from bookwyrm.settings import DOMAIN
 from bookwyrm.tests.validate_html import validate_html
 
@@ -95,6 +95,7 @@ class StatusViews(TestCase):
                 local=True,
                 localname="nutria",
             )
+        self.existing_hashtag = models.Hashtag.objects.create(name="#existing")
         with patch("bookwyrm.models.user.set_remote_server"):
             self.remote_user = models.User.objects.create_user(
                 "rat",
@@ -333,6 +334,64 @@ class StatusViews(TestCase):
         result = find_mentions(self.local_user, "@beep@beep.com")
         self.assertEqual(result, {})
 
+    def test_create_status_hashtags(self, *_):
+        """#mention a hashtag in a post"""
+        view = views.CreateStatus.as_view()
+        form = forms.CommentForm(
+            {
+                "content": "this is an #existing hashtag, this is a #new hashtag",
+                "user": self.local_user.id,
+                "book": self.book.id,
+                "privacy": "public",
+            }
+        )
+        request = self.factory.post("", form.data)
+        request.user = self.local_user
+
+        view(request, "comment")
+        status = models.Status.objects.get()
+
+        hashtags = models.Hashtag.objects.all()
+        self.assertEqual(len(hashtags), 2)
+        self.assertEqual(list(status.mention_hashtags.all()), list(hashtags))
+        # TODO: assert tag is linked to a page listing all statuses by tag
+
+    def test_find_hashtags(self, *_):
+        """detect and look up #hashtags"""
+        result = find_hashtags("no hashtag to be found here")
+        self.assertEqual(result, {})
+
+        result = find_hashtags("#existing")
+        self.assertEqual(result["#existing"], self.existing_hashtag)
+
+        result = find_hashtags("leading text #existing")
+        self.assertEqual(result["#existing"], self.existing_hashtag)
+
+        result = find_hashtags("leading #existing trailing")
+        self.assertEqual(result["#existing"], self.existing_hashtag)
+
+        result = find_hashtags("#EXISTING")
+        self.assertEqual(result["#EXISTING"], self.existing_hashtag)
+        self.assertEqual(models.Hashtag.objects.filter(name="#existing").count(), 1)
+
+        self.assertIsNone(models.Hashtag.objects.filter(name="#new").first())
+        result = find_hashtags("leading #new trailing")
+        new_hashtag = models.Hashtag.objects.filter(name="#new").first()
+        self.assertIsNotNone(new_hashtag)
+        self.assertEqual(result["#new"], new_hashtag)
+
+        result = find_hashtags("leading #existing #new trailing")
+        self.assertEqual(result["#existing"], self.existing_hashtag)
+        self.assertEqual(result["#new"], new_hashtag)
+
+        result = find_hashtags("#Braunbär")
+        hashtag = models.Hashtag.objects.filter(name="#Braunbär").first()
+        self.assertEqual(result["#Braunbär"], hashtag)
+
+        result = find_hashtags("#ひぐま")
+        hashtag = models.Hashtag.objects.filter(name="#ひぐま").first()
+        self.assertEqual(result["#ひぐま"], hashtag)
+
     def test_format_links_simple_url(self, *_):
         """find and format urls into a tags"""
         url = "http://www.fish.com/"
diff --git a/bookwyrm/utils/regex.py b/bookwyrm/utils/regex.py
index c8a475a3d..98bcde5ad 100644
--- a/bookwyrm/utils/regex.py
+++ b/bookwyrm/utils/regex.py
@@ -7,5 +7,6 @@ USERNAME = rf"{LOCALNAME}(@{DOMAIN})?"
 STRICT_USERNAME = rf"(\B{STRICT_LOCALNAME}(@{DOMAIN})?\b)"
 FULL_USERNAME = rf"{LOCALNAME}@{DOMAIN}\b"
 SLUG = r"/s/(?P<slug>[-_a-z0-9]*)"
+HASHTAG = r"(#[^!@#$%^&*(),.?\":{}|<>\s]+)"
 # should match (BookWyrm/1.0.0; or (BookWyrm/99.1.2;
 BOOKWYRM_USER_AGENT = r"\(BookWyrm/[0-9]+\.[0-9]+\.[0-9]+;"
diff --git a/bookwyrm/views/status.py b/bookwyrm/views/status.py
index 52f547cdd..b220c1240 100644
--- a/bookwyrm/views/status.py
+++ b/bookwyrm/views/status.py
@@ -115,6 +115,11 @@ class CreateStatus(View):
         if status.reply_parent:
             status.mention_users.add(status.reply_parent.user)
 
+        # inspect the text for hashtags and add them to status mentions fk
+        for mention_hashtag in find_hashtags(content).values():
+            status.mention_hashtags.add(mention_hashtag)
+        # TODO: turn each hashtag mention in the content into a link
+
         # deduplicate mentions
         status.mention_users.set(set(status.mention_users.all()))
 
@@ -237,6 +242,37 @@ def find_mentions(user, content):
     return username_dict
 
 
+def find_hashtags(content):
+    """detect #hashtags in raw status content
+
+    returns a dict that maps every hashtag, written the way it appears in
+    the content, to its Hashtag; tags that don't exist yet are created
+    """
+    if not content:
+        return {}
+
+    # drop repeated tags but keep the order in which they were written
+    hashtags = list(dict.fromkeys(re.findall(regex.HASHTAG, content)))
+    if not hashtags:
+        return {}
+
+    # Hashtag.name is case insensitive: "#Books" has to resolve to a stored
+    # "#books" rather than being saved as a second, duplicate hashtag
+    known_tags = {
+        t.name.lower(): t for t in models.Hashtag.objects.filter(name__in=hashtags)
+    }
+
+    hashtag_dict = {}
+    for tag_name in hashtags:
+        mention_hashtag = known_tags.get(tag_name.lower())
+        if mention_hashtag is None:
+            mention_hashtag = models.Hashtag.objects.create(name=tag_name)
+            known_tags[tag_name.lower()] = mention_hashtag
+        hashtag_dict[tag_name] = mention_hashtag
+
+    return hashtag_dict
+
+
 def format_links(content):
     """detect and format links"""
     validator = URLValidator()