BookDataModel: implement merge_into method

2025-06-05 23:38:53 +00:00 · 2024-02-22 10:27:38 +01:00 · 2024-02-22 10:27:38 +01:00 · 5e123972e8
commit 5e123972e8
parent b3753ab6da
5 changed files with 95 additions and 90 deletions
--- a/bookwyrm/management/commands/deduplicate_book_data.py
+++ b/bookwyrm/management/commands/deduplicate_book_data.py
@ -1,13 +1,14 @@
 """ PROCEED WITH CAUTION: uses deduplication fields to permanently
 merge book data objects """
+
 from django.core.management.base import BaseCommand
 from django.db.models import Count
 from bookwyrm import models
-from bookwyrm.management.merge import merge_objects


 def dedupe_model(model):
    """combine duplicate editions and update related models"""
+    print(f"deduplicating {model.__name__}:")
    fields = model._meta.get_fields()
    dedupe_fields = [
        f for f in fields if hasattr(f, "deduplication_field") and f.deduplication_field
@ -16,27 +17,27 @@ def dedupe_model(model):
        dupes = (
            model.objects.values(field.name)
            .annotate(Count(field.name))
-            .filter(**{"%s__count__gt" % field.name: 1})
+            .filter(**{f"{field.name}__count__gt": 1})
+            .exclude(**{field.name: ""})
+            .exclude(**{f"{field.name}__isnull": True})
        )

        for dupe in dupes:
            value = dupe[field.name]
-            if not value or value == "":
-                continue
            print("----------")
-            print(dupe)
            objs = model.objects.filter(**{field.name: value}).order_by("id")
            canonical = objs.first()
-            print("keeping", canonical.remote_id)
+            print(f"merging into {canonical.remote_id} based on {field.name} {value}:")
            for obj in objs[1:]:
-                print(obj.remote_id)
-                merge_objects(canonical, obj)
+                print(f"- {obj.remote_id}")
+                obj.merge_into(canonical)


 class Command(BaseCommand):
    """deduplicate allllll the book data models"""

    help = "merges duplicate book data"
+
    # pylint: disable=no-self-use,unused-argument
    def handle(self, *args, **options):
        """run deduplications"""
--- a/bookwyrm/management/merge.py
+++ b/bookwyrm/management/merge.py
@ -1,50 +0,0 @@
-from django.db.models import ManyToManyField
-
-
-def update_related(canonical, obj):
-    """update all the models with fk to the object being removed"""
-    # move related models to canonical
-    related_models = [
-        (r.remote_field.name, r.related_model) for r in canonical._meta.related_objects
-    ]
-    for (related_field, related_model) in related_models:
-        # Skip the ManyToMany fields that aren’t auto-created. These
-        # should have a corresponding OneToMany field in the model for
-        # the linking table anyway. If we update it through that model
-        # instead then we won’t lose the extra fields in the linking
-        # table.
-        related_field_obj = related_model._meta.get_field(related_field)
-        if isinstance(related_field_obj, ManyToManyField):
-            through = related_field_obj.remote_field.through
-            if not through._meta.auto_created:
-                continue
-        related_objs = related_model.objects.filter(**{related_field: obj})
-        for related_obj in related_objs:
-            print("replacing in", related_model.__name__, related_field, related_obj.id)
-            try:
-                setattr(related_obj, related_field, canonical)
-                related_obj.save()
-            except TypeError:
-                getattr(related_obj, related_field).add(canonical)
-                getattr(related_obj, related_field).remove(obj)
-
-
-def copy_data(canonical, obj):
-    """try to get the most data possible"""
-    for data_field in obj._meta.get_fields():
-        if not hasattr(data_field, "activitypub_field"):
-            continue
-        data_value = getattr(obj, data_field.name)
-        if not data_value:
-            continue
-        if not getattr(canonical, data_field.name):
-            print("setting data field", data_field.name, data_value)
-            setattr(canonical, data_field.name, data_value)
-    canonical.save()
-
-
-def merge_objects(canonical, obj):
-    copy_data(canonical, obj)
-    update_related(canonical, obj)
-    # remove the outdated entry
-    obj.delete()
--- a/bookwyrm/management/merge_command.py
+++ b/bookwyrm/management/merge_command.py
@ -1,4 +1,3 @@
-from bookwyrm.management.merge import merge_objects
 from django.core.management.base import BaseCommand


@ -26,4 +25,4 @@ class MergeCommand(BaseCommand):
            print("other book doesn’t exist!")
            return

-        merge_objects(canonical, other)
+        other.merge_into(canonical)
--- a/bookwyrm/models/author.py
+++ b/bookwyrm/models/author.py
@ -1,4 +1,5 @@
 """ database schema for info about authors """
+
 import re
 from typing import Tuple, Any

@ -9,13 +10,15 @@ from bookwyrm import activitypub
 from bookwyrm.settings import DOMAIN
 from bookwyrm.utils.db import format_trigger

-from .book import BookDataModel
+from .book import BookDataModel, MergedAuthor
 from . import fields


 class Author(BookDataModel):
    """basic biographic info"""

+    merged_model = MergedAuthor
+
    wikipedia_link = fields.CharField(
        max_length=255, blank=True, null=True, deduplication_field=True
    )
--- a/bookwyrm/models/book.py
+++ b/bookwyrm/models/book.py
@ -3,12 +3,13 @@
 from itertools import chain
 import re
 from typing import Any
+from typing_extensions import Self

 from django.contrib.postgres.search import SearchVectorField
 from django.contrib.postgres.indexes import GinIndex
 from django.core.cache import cache
 from django.db import models, transaction
-from django.db.models import Prefetch
+from django.db.models import Prefetch, ManyToManyField
 from django.dispatch import receiver
 from django.utils.translation import gettext_lazy as _
 from model_utils import FieldTracker
@ -109,10 +110,89 @@ class BookDataModel(ObjectMixin, BookWyrmModel):
        """only send book data updates to other bookwyrm instances"""
        super().broadcast(activity, sender, software=software, **kwargs)

+    def merge_into(self, canonical: Self) -> None:
+        """merge this entity into another entity
+
+        Empty fields on `canonical` are filled in from this entity, a
+        `merged_model` row is created that maps this entity's id to
+        `canonical`, every related object that points at this entity is
+        re-pointed at `canonical`, and finally this entity is deleted. All
+        database writes share one transaction, so a failure part-way through
+        rolls the whole merge back instead of leaving relations half moved.
+
+        Raises ValueError when `canonical` has the same id as this entity.
+        """
+        if canonical.id == self.id:
+            raise ValueError(f"Cannot merge {self} into itself")
+
+        with transaction.atomic():
+            canonical.absorb_data_from(self)
+            canonical.save()
+
+            self.merged_model.objects.create(deleted_id=self.id, merged_into=canonical)
+
+            # move related models to canonical
+            related_models = [
+                (r.remote_field.name, r.related_model)
+                for r in self._meta.related_objects
+            ]
+            # pylint: disable=protected-access
+            for related_field, related_model in related_models:
+                # Skip the ManyToMany fields that aren’t auto-created. These
+                # should have a corresponding OneToMany field in the model for
+                # the linking table anyway. If we update it through that model
+                # instead then we won’t lose the extra fields in the linking
+                # table.
+                related_field_obj = related_model._meta.get_field(related_field)
+                if isinstance(related_field_obj, ManyToManyField):
+                    through = related_field_obj.remote_field.through
+                    if not through._meta.auto_created:
+                        continue
+                related_objs = related_model.objects.filter(**{related_field: self})
+                for related_obj in related_objs:
+                    try:
+                        setattr(related_obj, related_field, canonical)
+                    except TypeError:
+                        # Django refuses direct assignment to a many-to-many
+                        # field, so swap the membership through its manager
+                        getattr(related_obj, related_field).add(canonical)
+                        getattr(related_obj, related_field).remove(self)
+                    else:
+                        # plain assignment worked: persist the re-pointed
+                        # relation. Kept outside the try so that a TypeError
+                        # raised by save() is not mistaken for the m2m case.
+                        related_obj.save()
+
+            self.delete()
+
+    def absorb_data_from(self, other: Self) -> None:
+        """copy values from `other` into whichever of our own fields are empty
+
+        Only fields that carry an `activitypub_field` attribute are looked at.
+        A field is filled when `other` has a truthy value for it and our own
+        value is falsy. Nothing is saved here; that is left to the caller.
+        """
+        candidate_names = (
+            field.name
+            for field in self._meta.get_fields()
+            if hasattr(field, "activitypub_field")
+        )
+        for name in candidate_names:
+            incoming = getattr(other, name)
+            # our own value is only inspected when there is something to copy
+            if incoming and not getattr(self, name):
+                setattr(self, name, incoming)
+
+
+class MergedBookDataModel(models.Model):
+    """a BookDataModel instance that has been merged into another instance. kept
+    to be able to redirect old URLs"""
+
+    # id of the instance that was merged away (`merge_into` stores `self.id`
+    # here right before deleting the instance); also this table's primary key
+    deleted_id = models.IntegerField(primary_key=True)
+
+    class Meta:
+        """abstract just like BookDataModel"""
+
+        abstract = True
+
+
+class MergedBook(MergedBookDataModel):
+    """a Book that has been merged into another one"""
+
+    # the surviving Book. PROTECT blocks deleting that Book while this record
+    # still points at it; the reverse accessor on Book is `absorbed`
+    merged_into = models.ForeignKey(
+        "Book", on_delete=models.PROTECT, related_name="absorbed"
+    )
+
+
+class MergedAuthor(MergedBookDataModel):
+    """an Author that has been merged into another one"""
+
+    # the surviving Author. PROTECT blocks deleting that Author while this
+    # record still points at it; the reverse accessor on Author is `absorbed`
+    merged_into = models.ForeignKey(
+        "Author", on_delete=models.PROTECT, related_name="absorbed"
+    )
+

 class Book(BookDataModel):
    """a generic book, which can mean either an edition or a work"""

+    merged_model = MergedBook
+
    connector = models.ForeignKey("Connector", on_delete=models.PROTECT, null=True)

    # book/work metadata
@ -456,34 +536,6 @@ class Edition(Book):
        return queryset


-class MergedBookDataModel(models.Model):
-    """a BookDataModel instance that has been merged into another instance. kept
-    to be able to redirect old URLs"""
-
-    deleted_id = models.IntegerField(primary_key=True)
-
-    class Meta:
-        """abstract just like BookDataModel"""
-
-        abstract = True
-
-
-class MergedAuthor(MergedBookDataModel):
-    """an Author that has been merged into another one"""
-
-    merged_into = models.ForeignKey(
-        "Author", on_delete=models.PROTECT, related_name="absorbed"
-    )
-
-
-class MergedBook(MergedBookDataModel):
-    """an Book that has been merged into another one"""
-
-    merged_into = models.ForeignKey(
-        "Book", on_delete=models.PROTECT, related_name="absorbed"
-    )
-
-
 def isbn_10_to_13(isbn_10):
    """convert an isbn 10 into an isbn 13"""
    isbn_10 = re.sub(r"[^0-9X]", "", isbn_10)