BookDataModel: implement merge_into method

This commit is contained in:
Bart Schuurmans 2024-02-22 10:27:38 +01:00
parent 6b672eda44
commit 37c7447555
4 changed files with 60 additions and 56 deletions

View file

@ -1,9 +1,9 @@
""" PROCEED WITH CAUTION: uses deduplication fields to permanently
merge book data objects """
from django.core.management.base import BaseCommand
from django.db.models import Count
from bookwyrm import models
from bookwyrm.management.merge import merge_objects
def dedupe_model(model):
@ -16,7 +16,8 @@ def dedupe_model(model):
dupes = (
model.objects.values(field.name)
.annotate(Count(field.name))
.filter(**{"%s__count__gt" % field.name: 1})
.filter(merged_into__isnull=True, **{"%s__count__gt" % field.name: 1})
.exclude(**{field.name: "", "%s__isnull" % field.name: True})
)
for dupe in dupes:
@ -30,13 +31,14 @@ def dedupe_model(model):
print("keeping", canonical.remote_id)
for obj in objs[1:]:
print(obj.remote_id)
merge_objects(canonical, obj)
obj.merge_into(canonical)
class Command(BaseCommand):
"""deduplicate allllll the book data models"""
help = "merges duplicate book data"
# pylint: disable=no-self-use,unused-argument
def handle(self, *args, **options):
"""run deduplications"""

View file

@ -1,50 +0,0 @@
from django.db.models import ManyToManyField
def update_related(canonical, obj):
    """Re-point every model instance that references ``obj`` at ``canonical``.

    Walks the reverse relations declared on ``canonical``'s model, looks up
    the rows that currently reference ``obj`` and rewrites each one so it
    references ``canonical`` instead. A line is printed for every row that
    gets rewritten.
    """
    # pylint: disable=protected-access
    for relation in canonical._meta.related_objects:
        field_name = relation.remote_field.name
        model = relation.related_model
        field = model._meta.get_field(field_name)

        # Many-to-many fields with a hand-written linking table are left
        # alone here: the linking model has its own foreign key to this
        # object, and rewriting through that relation keeps any extra
        # columns stored on the linking table.
        has_custom_through = (
            isinstance(field, ManyToManyField)
            and not field.remote_field.through._meta.auto_created
        )
        if has_custom_through:
            continue

        for referrer in model.objects.filter(**{field_name: obj}):
            print("replacing in", model.__name__, field_name, referrer.id)
            try:
                setattr(referrer, field_name, canonical)
                referrer.save()
            except TypeError:
                # direct assignment is refused for many-to-many fields, so
                # swap the entry through the related manager instead
                getattr(referrer, field_name).add(canonical)
                getattr(referrer, field_name).remove(obj)
def copy_data(canonical, obj):
    """Backfill blank fields on ``canonical`` from ``obj``, then save it.

    Only fields that carry an ``activitypub_field`` attribute are looked at.
    A value is copied when ``obj`` has a truthy value and ``canonical`` has a
    falsy one; existing data on ``canonical`` is never overwritten. Each
    copied field is reported with a printed line.
    """
    # pylint: disable=protected-access
    for field in obj._meta.get_fields():
        if not hasattr(field, "activitypub_field"):
            continue
        incoming = getattr(obj, field.name)
        # need something to copy, and an empty slot to copy it into
        if incoming and not getattr(canonical, field.name):
            print("setting data field", field.name, incoming)
            setattr(canonical, field.name, incoming)
    canonical.save()
def merge_objects(canonical, obj):
    """Fold ``obj`` into ``canonical`` and permanently delete ``obj``.

    The steps run in a fixed order: copy missing data onto ``canonical``,
    move references from ``obj`` to ``canonical``, then delete ``obj``.
    """
    copy_data(canonical, obj)
    update_related(canonical, obj)
    # the duplicate has been absorbed, so drop its row
    obj.delete()

View file

@ -1,4 +1,3 @@
from bookwyrm.management.merge import merge_objects
from django.core.management.base import BaseCommand
@ -26,4 +25,4 @@ class MergeCommand(BaseCommand):
print("other book doesnt exist!")
return
merge_objects(canonical, other)
other.merge_into(canonical)

View file

@ -3,12 +3,13 @@
from itertools import chain
import re
from typing import Any
from typing_extensions import Self
from django.contrib.postgres.search import SearchVectorField
from django.contrib.postgres.indexes import GinIndex
from django.core.cache import cache
from django.db import models, transaction
from django.db.models import Prefetch
from django.db.models import Prefetch, ManyToManyField
from django.dispatch import receiver
from django.utils.translation import gettext_lazy as _
from model_utils import FieldTracker
@ -112,6 +113,58 @@ class BookDataModel(ObjectMixin, BookWyrmModel):
"""only send book data updates to other bookwyrm instances"""
super().broadcast(activity, sender, software=software, **kwargs)
def merge_into(self, canonical: Self) -> None:
"""merge this entity into another entity"""
if canonical.id == self.id:
raise ValueError(f"Cannot merge {self} into itself")
if canonical.merged_into:
raise ValueError(
f"Cannot merge {self} into {canonical} because "
f"{canonical} has itself been merged into {canonical.merged_into}"
)
canonical.adopt_data_from(self)
canonical.save()
# move related models to canonical
related_models = [
(r.remote_field.name, r.related_model) for r in self._meta.related_objects
]
# pylint: disable=protected-access
for related_field, related_model in related_models:
# Skip the ManyToMany fields that arent auto-created. These
# should have a corresponding OneToMany field in the model for
# the linking table anyway. If we update it through that model
# instead then we wont lose the extra fields in the linking
# table.
# pylint: disable=protected-access
related_field_obj = related_model._meta.get_field(related_field)
if isinstance(related_field_obj, ManyToManyField):
through = related_field_obj.remote_field.through
if not through._meta.auto_created:
continue
related_objs = related_model.objects.filter(**{related_field: self})
for related_obj in related_objs:
try:
setattr(related_obj, related_field, canonical)
related_obj.save()
except TypeError:
getattr(related_obj, related_field).add(canonical)
getattr(related_obj, related_field).remove(self)
self.merged_into = canonical
self.save()
def adopt_data_from(self, other: Self) -> None:
"""fill empty fields with values from another entity"""
for data_field in self._meta.get_fields():
if not hasattr(data_field, "activitypub_field"):
continue
data_value = getattr(other, data_field.name)
if not data_value:
continue
if not getattr(self, data_field.name):
setattr(self, data_field.name, data_value)
class Book(BookDataModel):
"""a generic book, which can mean either an edition or a work"""