mirror of
https://github.com/bookwyrm-social/bookwyrm.git
synced 2024-06-20 06:00:35 +00:00
2bbc9a16ad
Previously, when the deduplicate_book_data script tried to merge an edition that was on a shelf or in a list, it would fail. When the canonical book was added to the shelf or the list, the extra fields of the linking table (the “through” model of the field) were not set. They would end up defaulting to NULL, which is not valid for some of the fields in ShelfItem and ListItem, so postgres wouldn’t accept it. To fix that, this patch makes the script skip updating fields that have a non-autogenerated linking table. The linking table appears as a separate model anyway, so the book will be moved via that instead. Fixes: #2817
90 lines
3.4 KiB
Python
90 lines
3.4 KiB
Python
""" PROCEED WITH CAUTION: uses deduplication fields to permanently
|
||
merge book data objects """
|
||
from django.core.management.base import BaseCommand
|
||
from django.db.models import Count, ManyToManyField
|
||
from bookwyrm import models
|
||
|
||
|
||
def update_related(canonical, obj):
    """update all the models with fk to the object being removed

    Every relation listed in ``canonical._meta.related_objects`` is searched
    for rows that point at ``obj``; each of those rows is re-pointed at
    ``canonical`` instead.

    Args:
        canonical: the object that is being kept
        obj: the duplicate whose references are moved over to ``canonical``
    """
    # (name of the field on the related model, the related model itself)
    related_models = [
        (r.remote_field.name, r.related_model) for r in canonical._meta.related_objects
    ]
    for related_field, related_model in related_models:
        # Skip the ManyToMany fields that aren't auto-created. These
        # should have a corresponding OneToMany field in the model for
        # the linking table anyway. If we update it through that model
        # instead then we won't lose the extra fields in the linking
        # table.
        related_field_obj = related_model._meta.get_field(related_field)
        if isinstance(related_field_obj, ManyToManyField):
            through = related_field_obj.remote_field.through
            if not through._meta.auto_created:
                continue
        related_objs = related_model.objects.filter(**{related_field: obj})
        for related_obj in related_objs:
            print("replacing in", related_model.__name__, related_field, related_obj.id)
            # Only the assignment is guarded: a TypeError raised by save()
            # must not be mistaken for "this field can't be assigned".
            try:
                setattr(related_obj, related_field, canonical)
            except TypeError:
                # the field rejects direct assignment, so swap the members
                # through the attribute's add/remove interface instead
                related_manager = getattr(related_obj, related_field)
                related_manager.add(canonical)
                related_manager.remove(obj)
            else:
                related_obj.save()
|
||
|
||
|
||
def copy_data(canonical, obj):
    """fill in the canonical object's empty fields from the duplicate

    Only fields that have an ``activitypub_field`` attribute are considered.
    A value is copied when the duplicate has a truthy value and the canonical
    object does not; the canonical object is saved after each copied field.
    """
    for field in obj._meta.get_fields():
        if not hasattr(field, "activitypub_field"):
            continue
        name = field.name
        incoming = getattr(obj, name)
        # nothing to offer, or the canonical object already has a value
        if not incoming or getattr(canonical, name):
            continue
        print("setting data field", name, incoming)
        setattr(canonical, name, incoming)
        canonical.save()
|
||
|
||
|
||
def dedupe_model(model):
    """merge objects of a model that share a deduplication field value

    For every field flagged with a truthy ``deduplication_field``, objects
    with the same non-empty value are grouped. The one with the lowest id is
    kept; the others have their data and related objects moved to it and are
    then deleted.
    """
    dedupe_fields = [
        field
        for field in model._meta.get_fields()
        if getattr(field, "deduplication_field", False)
    ]
    for field in dedupe_fields:
        name = field.name
        # values of this field that occur on more than one object
        repeated_values = (
            model.objects.values(name)
            .annotate(Count(name))
            .filter(**{f"{name}__count__gt": 1})
        )

        for dupe in repeated_values:
            value = dupe[name]
            # empty values aren't real identifiers, so they never match
            if not value:
                continue
            print("----------")
            print(dupe)
            matches = model.objects.filter(**{name: value}).order_by("id")
            canonical = matches.first()
            print("keeping", canonical.remote_id)
            for duplicate in matches[1:]:
                print(duplicate.remote_id)
                copy_data(canonical, duplicate)
                update_related(canonical, duplicate)
                # remove the outdated entry
                duplicate.delete()
|
||
|
||
|
||
class Command(BaseCommand):
    """management command that merges duplicated book data objects"""

    help = "merges duplicate book data"

    # pylint: disable=no-self-use,unused-argument
    def handle(self, *args, **options):
        """deduplicate editions, then works, then authors"""
        for model in (models.Edition, models.Work, models.Author):
            dedupe_model(model)
|