bookwyrm/bookwyrm/management/commands/deduplicate_book_data.py
Neil Roberts 2bbc9a16ad Fix deduplicating books that are on a shelf or in a list
Previously, when the deduplicate_book_data script tried to merge an
edition that was on a shelf or in a list, it would fail: adding the
canonical book to the shelf or the list did not set the extra fields of
the linking table for the “through” model of the field. These would end
up defaulting to NULL, which is not valid for some of the fields in
ShelfItem and ListItem, so postgres wouldn’t accept it.

To fix that, this patch makes it skip updating fields that have a
non-autogenerated linking table. The linking table would appear as a
separate model anyway so the book will be moved via that instead.

Fixes: #2817
2023-04-15 11:59:12 +02:00

90 lines
3.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

""" PROCEED WITH CAUTION: uses deduplication fields to permanently
merge book data objects """
from django.core.management.base import BaseCommand
from django.db.models import Count, ManyToManyField
from bookwyrm import models
def update_related(canonical, obj):
    """update all the models with fk to the object being removed

    Walks every reverse relation listed in ``canonical._meta.related_objects``
    and, for each related row that currently points at ``obj``, makes it point
    at ``canonical`` instead.

    canonical -- the instance that is being kept
    obj -- the duplicate instance that is about to be removed
    """
    # move related models to canonical
    for relation in canonical._meta.related_objects:
        related_field = relation.remote_field.name
        related_model = relation.related_model

        # Skip the ManyToMany fields that aren't auto-created. These
        # should have a corresponding OneToMany field in the model for
        # the linking table anyway. If we update it through that model
        # instead then we won't lose the extra fields in the linking
        # table.
        related_field_obj = related_model._meta.get_field(related_field)
        if isinstance(related_field_obj, ManyToManyField):
            through = related_field_obj.remote_field.through
            if not through._meta.auto_created:
                continue

        related_objs = related_model.objects.filter(**{related_field: obj})
        for related_obj in related_objs:
            print("replacing in", related_model.__name__, related_field, related_obj.id)
            try:
                # Only the assignment is guarded: Django rejects direct
                # assignment to a many-to-many field with a TypeError, and
                # that is the one case the fallback below is meant for.
                setattr(related_obj, related_field, canonical)
            except TypeError:
                # many-to-many: swap the entries through the relation manager
                manager = getattr(related_obj, related_field)
                manager.add(canonical)
                manager.remove(obj)
            else:
                # plain assignment worked, so persist it. Saving outside the
                # try block keeps a TypeError raised by save() from being
                # mistaken for the many-to-many case.
                related_obj.save()
def copy_data(canonical, obj):
    """fill the blanks on the canonical object using the duplicate's values"""
    # only fields that carry an ``activitypub_field`` attribute are considered
    candidate_fields = (
        field
        for field in obj._meta.get_fields()
        if hasattr(field, "activitypub_field")
    )
    for field in candidate_fields:
        name = field.name
        incoming = getattr(obj, name)
        # copy only when the duplicate has a value and canonical does not,
        # so existing canonical data is never overwritten
        if incoming and not getattr(canonical, name):
            print("setting data field", name, incoming)
            setattr(canonical, name, incoming)
    # one save at the end, whether or not anything was copied
    canonical.save()
def dedupe_model(model):
    """combine duplicate editions and update related models

    For every field on ``model`` whose ``deduplication_field`` attribute is
    truthy, finds values shared by more than one row. Within each such group
    the row with the lowest id is kept; every other row has its data copied
    onto the kept row, its related objects re-pointed, and is then deleted.

    model -- the model class to deduplicate
    """
    dedupe_fields = [
        f for f in model._meta.get_fields() if getattr(f, "deduplication_field", False)
    ]
    for field in dedupe_fields:
        count_lookup = "%s__count__gt" % field.name
        dupes = (
            model.objects.values(field.name)
            .annotate(Count(field.name))
            .filter(**{count_lookup: 1})
        )
        for dupe in dupes:
            value = dupe[field.name]
            # empty values (None, "") are not treated as a shared identifier
            if not value:
                continue
            print("----------")
            print(dupe)
            # Load the whole group once, lowest id first, so the kept row and
            # the rows to merge come from the same query result.
            matches = list(model.objects.filter(**{field.name: value}).order_by("id"))
            if not matches:
                # nothing matches any more (presumably the rows changed since
                # the counts above were computed), so there is nothing to merge
                continue
            canonical, duplicates = matches[0], matches[1:]
            print("keeping", canonical.remote_id)
            for obj in duplicates:
                print(obj.remote_id)
                copy_data(canonical, obj)
                update_related(canonical, obj)
                # remove the outdated entry
                obj.delete()
class Command(BaseCommand):
    """management command that merges duplicates across the book data models"""

    help = "merges duplicate book data"

    # pylint: disable=no-self-use,unused-argument
    def handle(self, *args, **options):
        """deduplicate editions, then works, then authors"""
        for book_data_model in (models.Edition, models.Work, models.Author):
            dedupe_model(book_data_model)