bookwyrm/bookwyrm/management/commands/deduplicate_book_data.py

80 lines
2.8 KiB
Python
Raw Normal View History

2021-03-08 16:49:10 +00:00
""" PROCEED WITH CAUTION: uses deduplication fields to permanently
merge book data objects """
from django.core.management.base import BaseCommand
from django.db.models import Count
from bookwyrm import models
2021-01-01 00:30:04 +00:00
def update_related(canonical, obj):
    """Re-point every object that references ``obj`` at ``canonical``.

    The relations to follow are taken from ``canonical._meta.related_objects``.
    For each relation, the related model is queried for rows whose relation
    field matches ``obj`` and each row is switched over to ``canonical``.
    Progress is printed to stdout.

    Args:
        canonical: the object being kept.
        obj: the duplicate that is about to be removed.
    """
    # (field name on the related model, related model class) for each relation
    related_models = [
        (r.remote_field.name, r.related_model) for r in canonical._meta.related_objects
    ]
    for related_field, related_model in related_models:
        related_objs = related_model.objects.filter(**{related_field: obj})
        for related_obj in related_objs:
            print("replacing in", related_model.__name__, related_field, related_obj.id)
            try:
                setattr(related_obj, related_field, canonical)
            except TypeError:
                # The field refused direct assignment -- presumably a
                # many-to-many relation -- so swap the members through the
                # add/remove interface of the attribute instead.
                relation = getattr(related_obj, related_field)
                relation.add(canonical)
                relation.remove(obj)
            else:
                # Only the assignment is guarded: a TypeError raised while
                # saving is a real failure and must not be mistaken for the
                # "cannot assign directly" case handled above.
                related_obj.save()


def copy_data(canonical, obj):
    """Fill blank fields on ``canonical`` with the values held by ``obj``.

    Only fields that carry an ``activitypub_field`` attribute are considered.
    A value is copied when ``obj`` has a truthy value for the field and
    ``canonical`` has a falsy one; values already present on ``canonical``
    are never overwritten. ``canonical`` is saved once at the end.

    Args:
        canonical: the object being kept; it receives the data.
        obj: the duplicate whose data is salvaged.
    """
    candidate_fields = (
        f for f in obj._meta.get_fields() if hasattr(f, "activitypub_field")
    )
    for field in candidate_fields:
        name = field.name
        incoming = getattr(obj, name)
        # nothing to offer, or canonical already has its own value
        if incoming and not getattr(canonical, name):
            print("setting data field", name, incoming)
            setattr(canonical, name, incoming)
    canonical.save()


def dedupe_model(model):
    """Merge rows of ``model`` that share a value in a deduplication field.

    Every field on the model with a truthy ``deduplication_field`` attribute
    is checked in turn. Rows are grouped by that field's value; for each
    non-empty value held by more than one row, the row with the lowest id is
    kept as canonical. Each other row has its data copied onto the canonical
    row (``copy_data``), the objects referencing it re-pointed
    (``update_related``), and is then deleted. Progress is printed to stdout.

    Args:
        model: the model class to deduplicate.
    """
    dedupe_fields = [
        f for f in model._meta.get_fields() if getattr(f, "deduplication_field", False)
    ]
    for field in dedupe_fields:
        # values of this field that occur on more than one row
        dupes = (
            model.objects.values(field.name)
            .annotate(Count(field.name))
            .filter(**{"%s__count__gt" % field.name: 1})
        )
        for dupe in dupes:
            value = dupe[field.name]
            # rows that all lack a value are not duplicates of each other
            if not value:
                continue
            print("----------")
            print(dupe)
            # Fetch the whole group once, oldest first, so the canonical row
            # and the rows to merge come from the same result set.
            matches = list(model.objects.filter(**{field.name: value}).order_by("id"))
            if not matches:
                continue
            canonical, *duplicates = matches
            print("keeping", canonical.remote_id)
            for obj in duplicates:
                print(obj.remote_id)
                copy_data(canonical, obj)
                update_related(canonical, obj)
                # remove the outdated entry
                obj.delete()


class Command(BaseCommand):
    """Management command that deduplicates all of the book data models."""

    help = "merges duplicate book data"

    # pylint: disable=no-self-use,unused-argument
    def handle(self, *args, **options):
        """Run deduplication on editions, then works, then authors."""
        for model in (models.Edition, models.Work, models.Author):
            dedupe_model(model)