Fix imports creating duplicate Editions

This fixes bugs at both the sending and receiving instances:
As the primary fix is at the sending end, a full fix will need to be collaborative at the network level.

**Fixing at the sending end: don't send duplicate GeneratedNotes**

The primary fix is to `handle_reading_status`: it calls `create_generated_note` which saves and broadcasts the GeneratedNote.
Before this fix, `handle_reading_status` then saved and broadcast the note again, creating two incoming GeneratedNotes
for all federated BookWyrm servers, and thus triggering simultaneous or near-simultaneous imports of the associated book
within the `tag` reference. This is then dereferenced, creating two identical Works, the first of which ends up empty
and the second of which ends up with two identical Editions.

**Fixing at the sending end: delay BookStatus broadcasts**

My testing doesn't indicate one way or the other whether this happens, but there's a theoretical race condition for incoming
editions that are ListItem or ShelfBook items. The Add activity needs to create the Edition and Work if they are not already
on the receiving instance, and so does the Comment or GeneratedNote. The fix in `ActivitypubMixin::broadcast` sets a delay on
these statuses to ensure that the Add activity is processed first, and thus the book will already exist when the status is
processed.

**Fixing at the receiving end: prevent dereferencing the incoming Edition**

The book attached to a ListItem or ShelfBook is an Edition. This means that when a receiving instance dereferences it, we
end up with a recursion: The Edition dereferences the parent Work, which dereferences all the child editions in celery tasks.
This includes the Edition that triggered the whole process in the first place.

I couldn't create a reproducable test of this, but I've seen evidence that this can cause a race condition where the incoming
Edition ends up being duplicated - the task to create child Editions checks for existence before the original triggering
Edition has saved.

The changes to Note, Fields, and `ActivityObject::to_model` are aimed at preventing this by allowing us to compare related
field ids to the original triggering remote_id before running `set_related_field` and excluding them if there is a match.
This commit is contained in:
Hugh Rundle 2024-11-17 09:24:49 +11:00
parent 13381b9b4d
commit 32b041d10c
No known key found for this signature in database
GPG key ID: A7E35779918253F9
6 changed files with 45 additions and 13 deletions

View file

@ -120,6 +120,7 @@ class ActivityObject:
save: bool = True,
overwrite: bool = True,
allow_external_connections: bool = True,
trigger=None,
) -> Optional[TBookWyrmModel]:
"""convert from an activity to a model instance. Args:
model: the django model that this object is being converted to
@ -133,6 +134,9 @@ class ActivityObject:
only update blank fields if false
allow_external_connections: look up missing data if true,
throw an exception if false and an external connection is needed
trigger: the object that originally triggered this
self.to_model. e.g. if this is a Work being dereferenced from
an incoming Edition
"""
model = model or get_model_from_type(self.type)
@ -223,6 +227,8 @@ class ActivityObject:
related_field_name = model_field.field.name
for item in values:
if trigger and item == trigger.remote_id:
continue
set_related_field.delay(
related_model.__name__,
instance.__class__.__name__,

View file

@ -50,6 +50,7 @@ class Note(ActivityObject):
save=True,
overwrite=True,
allow_external_connections=True,
trigger=None,
):
instance = super().to_model(
model, instance, allow_create, save, overwrite, allow_external_connections

View file

@ -129,7 +129,20 @@ class ActivitypubMixin:
def broadcast(self, activity, sender, software=None, queue=BROADCAST):
"""send out an activity"""
# if we're posting about ShelfBooks, set a delay to give the base activity
# time to add the book on remote servers first to avoid race conditions
countdown = (
10
if (
isinstance(activity, object)
and not isinstance(activity["object"], str)
and activity["object"].get("type", None) in ["GeneratedNote", "Comment"]
)
else 0
)
broadcast_task.apply_async(
countdown=countdown,
args=(
sender.id,
json.dumps(activity, cls=activitypub.ActivityEncoder),
@ -227,6 +240,7 @@ class ObjectMixin(ActivitypubMixin):
return
try:
# TODO: here is where we might use an ActivityPub extension instead
# do we have a "pure" activitypub version of this for mastodon?
if software != "bookwyrm" and hasattr(self, "pure_content"):
pure_activity = self.to_create_activity(user, pure=True)

View file

@ -86,7 +86,9 @@ class ActivitypubFieldMixin:
raise
value = getattr(data, "actor")
formatted = self.field_from_activity(
value, allow_external_connections=allow_external_connections
value,
allow_external_connections=allow_external_connections,
trigger=instance,
)
if formatted is None or formatted is MISSING or formatted == {}:
return False
@ -128,7 +130,7 @@ class ActivitypubFieldMixin:
return value
# pylint: disable=unused-argument
def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
"""formatter to convert activitypub into a model value"""
if value and hasattr(self, "activitypub_wrapper"):
value = value.get(self.activitypub_wrapper)
@ -150,7 +152,9 @@ class ActivitypubRelatedFieldMixin(ActivitypubFieldMixin):
self.load_remote = load_remote
super().__init__(*args, **kwargs)
def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
"""trigger: the object that triggered this deserialization.
For example the Edition for which self is the parent Work"""
if not value:
return None
@ -160,7 +164,7 @@ class ActivitypubRelatedFieldMixin(ActivitypubFieldMixin):
# only look in the local database
return related_model.find_existing(value.serialize())
# this is an activitypub object, which we can deserialize
return value.to_model(model=related_model)
return value.to_model(model=related_model, trigger=trigger)
try:
# make sure the value looks like a remote id
validate_remote_id(value)
@ -336,7 +340,7 @@ class ManyToManyField(ActivitypubFieldMixin, models.ManyToManyField):
return f"{value.instance.remote_id}/{self.name}"
return [i.remote_id for i in value.all()]
def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
if value is None or value is MISSING:
return None
if not isinstance(value, list):
@ -386,7 +390,7 @@ class TagField(ManyToManyField):
)
return tags
def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
if not isinstance(value, list):
# GoToSocial DMs and single-user mentions are
# sent as objects, not as an array of objects
@ -481,7 +485,7 @@ class ImageField(ActivitypubFieldMixin, models.ImageField):
return activitypub.Image(url=url, name=alt)
def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
image_slug = value
# when it's an inline image (User avatar/icon, Book cover), it's a json
# blob, but when it's an attached image, it's just a url
@ -538,7 +542,7 @@ class DateTimeField(ActivitypubFieldMixin, models.DateTimeField):
return None
return value.isoformat()
def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
missing_fields = datetime(1970, 1, 1) # "2022-10" => "2022-10-01"
try:
date_value = dateutil.parser.parse(value, default=missing_fields)
@ -556,7 +560,7 @@ class PartialDateField(ActivitypubFieldMixin, PartialDateModel):
def field_to_activity(self, value) -> str:
return value.partial_isoformat() if value else None
def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
# pylint: disable=no-else-return
try:
return from_partial_isoformat(value)
@ -584,7 +588,7 @@ class PartialDateField(ActivitypubFieldMixin, PartialDateModel):
class HtmlField(ActivitypubFieldMixin, models.TextField):
"""a text field for storing html"""
def field_from_activity(self, value, allow_external_connections=True):
def field_from_activity(self, value, allow_external_connections=True, trigger=None):
if not value or value == MISSING:
return None
return clean(value)

View file

@ -70,7 +70,9 @@ class ReadingViews(TestCase):
},
)
request.user = self.local_user
with patch("bookwyrm.models.activitypub_mixin.broadcast_task.apply_async"):
with patch(
"bookwyrm.models.activitypub_mixin.broadcast_task.apply_async"
) as mock:
views.ReadingStatus.as_view()(request, "start", self.book.id)
self.assertEqual(shelf.books.get(), self.book)
@ -86,6 +88,12 @@ class ReadingViews(TestCase):
self.assertEqual(readthrough.user, self.local_user)
self.assertEqual(readthrough.book, self.book)
# Three broadcast tasks:
# 1. Create Readthrough
# 2. Create post as pure_content (for non-BookWyrm)
# 3. Create post with book attached - this should only happen once!
self.assertEqual(len(mock.mock_calls), 3)
def test_start_reading_with_comment(self, *_):
"""begin a book"""
shelf = self.local_user.shelf_set.get(identifier=models.Shelf.READING)

View file

@ -156,8 +156,7 @@ def handle_reading_status(user, shelf, book, privacy):
# it's a non-standard shelf, don't worry about it
return
status = create_generated_note(user, message, mention_books=[book], privacy=privacy)
status.save()
create_generated_note(user, message, mention_books=[book], privacy=privacy)
def load_date_in_user_tz_as_utc(date_str: str, user: models.User) -> datetime: