diff --git a/bookwyrm/management/commands/deduplicate_book_data.py b/bookwyrm/management/commands/deduplicate_book_data.py index dde7d133c..d2f4ef936 100644 --- a/bookwyrm/management/commands/deduplicate_book_data.py +++ b/bookwyrm/management/commands/deduplicate_book_data.py @@ -1,13 +1,14 @@ """ PROCEED WITH CAUTION: uses deduplication fields to permanently merge book data objects """ + from django.core.management.base import BaseCommand from django.db.models import Count from bookwyrm import models -from bookwyrm.management.merge import merge_objects def dedupe_model(model): """combine duplicate editions and update related models""" + print(f"deduplicating {model.__name__}:") fields = model._meta.get_fields() dedupe_fields = [ f for f in fields if hasattr(f, "deduplication_field") and f.deduplication_field @@ -16,27 +17,27 @@ def dedupe_model(model): dupes = ( model.objects.values(field.name) .annotate(Count(field.name)) - .filter(**{"%s__count__gt" % field.name: 1}) + .filter(**{f"{field.name}__count__gt": 1}) + .exclude(**{field.name: ""}) + .exclude(**{f"{field.name}__isnull": True}) ) for dupe in dupes: value = dupe[field.name] - if not value or value == "": - continue print("----------") - print(dupe) objs = model.objects.filter(**{field.name: value}).order_by("id") canonical = objs.first() - print("keeping", canonical.remote_id) + print(f"merging into {canonical.remote_id} based on {field.name} {value}:") for obj in objs[1:]: - print(obj.remote_id) - merge_objects(canonical, obj) + print(f"- {obj.remote_id}") + obj.merge_into(canonical) class Command(BaseCommand): """deduplicate allllll the book data models""" help = "merges duplicate book data" + # pylint: disable=no-self-use,unused-argument def handle(self, *args, **options): """run deduplications""" diff --git a/bookwyrm/management/merge.py b/bookwyrm/management/merge.py deleted file mode 100644 index f55229f18..000000000 --- a/bookwyrm/management/merge.py +++ /dev/null @@ -1,50 +0,0 @@ -from django.db.models import ManyToManyField - - -def update_related(canonical, obj): - """update all the models with fk to the object being removed""" - # move related models to canonical - related_models = [ - (r.remote_field.name, r.related_model) for r in canonical._meta.related_objects - ] - for (related_field, related_model) in related_models: - # Skip the ManyToMany fields that aren’t auto-created. These - # should have a corresponding OneToMany field in the model for - # the linking table anyway. If we update it through that model - # instead then we won’t lose the extra fields in the linking - # table. - related_field_obj = related_model._meta.get_field(related_field) - if isinstance(related_field_obj, ManyToManyField): - through = related_field_obj.remote_field.through - if not through._meta.auto_created: - continue - related_objs = related_model.objects.filter(**{related_field: obj}) - for related_obj in related_objs: - print("replacing in", related_model.__name__, related_field, related_obj.id) - try: - setattr(related_obj, related_field, canonical) - related_obj.save() - except TypeError: - getattr(related_obj, related_field).add(canonical) - getattr(related_obj, related_field).remove(obj) - - -def copy_data(canonical, obj): - """try to get the most data possible""" - for data_field in obj._meta.get_fields(): - if not hasattr(data_field, "activitypub_field"): - continue - data_value = getattr(obj, data_field.name) - if not data_value: - continue - if not getattr(canonical, data_field.name): - print("setting data field", data_field.name, data_value) - setattr(canonical, data_field.name, data_value) - canonical.save() - - -def merge_objects(canonical, obj): - copy_data(canonical, obj) - update_related(canonical, obj) - # remove the outdated entry - obj.delete() diff --git a/bookwyrm/management/merge_command.py b/bookwyrm/management/merge_command.py index 805dc73fa..2f3f90c86 100644 --- a/bookwyrm/management/merge_command.py +++ b/bookwyrm/management/merge_command.py @@ -1,4 +1,3 @@ -from bookwyrm.management.merge import merge_objects from django.core.management.base import BaseCommand @@ -26,4 +25,4 @@ class MergeCommand(BaseCommand): print("other book doesn’t exist!") return - merge_objects(canonical, other) + other.merge_into(canonical) diff --git a/bookwyrm/models/author.py b/bookwyrm/models/author.py index 154b00ccb..7f40f562c 100644 --- a/bookwyrm/models/author.py +++ b/bookwyrm/models/author.py @@ -1,4 +1,5 @@ """ database schema for info about authors """ + import re from typing import Tuple, Any @@ -9,13 +10,15 @@ from bookwyrm import activitypub from bookwyrm.settings import DOMAIN from bookwyrm.utils.db import format_trigger -from .book import BookDataModel +from .book import BookDataModel, MergedAuthor from . import fields class Author(BookDataModel): """basic biographic info""" + merged_model = MergedAuthor + wikipedia_link = fields.CharField( max_length=255, blank=True, null=True, deduplication_field=True ) diff --git a/bookwyrm/models/book.py b/bookwyrm/models/book.py index 7a4a8addb..c7235a3f5 100644 --- a/bookwyrm/models/book.py +++ b/bookwyrm/models/book.py @@ -3,12 +3,13 @@ from itertools import chain import re from typing import Any +from typing_extensions import Self from django.contrib.postgres.search import SearchVectorField from django.contrib.postgres.indexes import GinIndex from django.core.cache import cache from django.db import models, transaction -from django.db.models import Prefetch +from django.db.models import Prefetch, ManyToManyField from django.dispatch import receiver from django.utils.translation import gettext_lazy as _ from model_utils import FieldTracker @@ -109,10 +110,89 @@ class BookDataModel(ObjectMixin, BookWyrmModel): """only send book data updates to other bookwyrm instances""" super().broadcast(activity, sender, software=software, **kwargs) + def merge_into(self, canonical: Self) -> None: + """merge this entity into another entity""" + if canonical.id == self.id: + raise ValueError(f"Cannot merge {self} into itself") + + canonical.absorb_data_from(self) + canonical.save() + + self.merged_model.objects.create(deleted_id=self.id, merged_into=canonical) + + # move related models to canonical + related_models = [ + (r.remote_field.name, r.related_model) for r in self._meta.related_objects + ] + # pylint: disable=protected-access + for related_field, related_model in related_models: + # Skip the ManyToMany fields that aren’t auto-created. These + # should have a corresponding OneToMany field in the model for + # the linking table anyway. If we update it through that model + # instead then we won’t lose the extra fields in the linking + # table. + # pylint: disable=protected-access + related_field_obj = related_model._meta.get_field(related_field) + if isinstance(related_field_obj, ManyToManyField): + through = related_field_obj.remote_field.through + if not through._meta.auto_created: + continue + related_objs = related_model.objects.filter(**{related_field: self}) + for related_obj in related_objs: + try: + setattr(related_obj, related_field, canonical) + related_obj.save() + except TypeError: + getattr(related_obj, related_field).add(canonical) + getattr(related_obj, related_field).remove(self) + + self.delete() + + def absorb_data_from(self, other: Self) -> None: + """fill empty fields with values from another entity""" + for data_field in self._meta.get_fields(): + if not hasattr(data_field, "activitypub_field"): + continue + data_value = getattr(other, data_field.name) + if not data_value: + continue + if not getattr(self, data_field.name): + setattr(self, data_field.name, data_value) + + +class MergedBookDataModel(models.Model): + """a BookDataModel instance that has been merged into another instance. kept + to be able to redirect old URLs""" + + deleted_id = models.IntegerField(primary_key=True) + + class Meta: + """abstract just like BookDataModel""" + + abstract = True + + +class MergedBook(MergedBookDataModel): + """an Book that has been merged into another one""" + + merged_into = models.ForeignKey( + "Book", on_delete=models.PROTECT, related_name="absorbed" + ) + + +class MergedAuthor(MergedBookDataModel): + """an Author that has been merged into another one""" + + merged_into = models.ForeignKey( + "Author", on_delete=models.PROTECT, related_name="absorbed" + ) + class Book(BookDataModel): """a generic book, which can mean either an edition or a work""" + merged_model = MergedBook + connector = models.ForeignKey("Connector", on_delete=models.PROTECT, null=True) # book/work metadata @@ -456,34 +536,6 @@ class Edition(Book): return queryset -class MergedBookDataModel(models.Model): - """a BookDataModel instance that has been merged into another instance. kept - to be able to redirect old URLs""" - - deleted_id = models.IntegerField(primary_key=True) - - class Meta: - """abstract just like BookDataModel""" - - abstract = True - - -class MergedAuthor(MergedBookDataModel): - """an Author that has been merged into another one""" - - merged_into = models.ForeignKey( - "Author", on_delete=models.PROTECT, related_name="absorbed" - ) - - -class MergedBook(MergedBookDataModel): - """an Book that has been merged into another one""" - - merged_into = models.ForeignKey( - "Book", on_delete=models.PROTECT, related_name="absorbed" - ) - - def isbn_10_to_13(isbn_10): """convert an isbn 10 into an isbn 13""" isbn_10 = re.sub(r"[^0-9X]", "", isbn_10)