diff --git a/bookwyrm/management/commands/deduplicate_book_data.py b/bookwyrm/management/commands/deduplicate_book_data.py new file mode 100644 index 00000000..044b2a98 --- /dev/null +++ b/bookwyrm/management/commands/deduplicate_book_data.py @@ -0,0 +1,83 @@ +''' PROCEED WITH CAUTION: uses deduplication fields to permanently +merge book data objects ''' +from django.core.management.base import BaseCommand +from django.db.models import Count +from bookwyrm import models + + +def update_related(canonical, obj): + ''' update all the models with fk to the object being removed ''' + # move related models to canonical + related_models = [ + (r.remote_field.name, r.related_model) for r in \ + canonical._meta.related_objects] + for (related_field, related_model) in related_models: + related_objs = related_model.objects.filter( + **{related_field: obj}) + for related_obj in related_objs: + print( + 'replacing in', + related_model.__name__, + related_field, + related_obj.id + ) + try: + setattr(related_obj, related_field, canonical) + related_obj.save() + except TypeError: + getattr(related_obj, related_field).add(canonical) + getattr(related_obj, related_field).remove(obj) + + +def copy_data(canonical, obj): + ''' try to get the most data possible ''' + for data_field in obj._meta.get_fields(): + if not hasattr(data_field, 'activitypub_field'): + continue + data_value = getattr(obj, data_field.name) + if not data_value: + continue + if not getattr(canonical, data_field.name): + print('setting data field', data_field.name, data_value) + setattr(canonical, data_field.name, data_value) + canonical.save() + + +def dedupe_model(model): + ''' combine duplicate editions and update related models ''' + fields = model._meta.get_fields() + dedupe_fields = [f for f in fields if \ + hasattr(f, 'deduplication_field') and f.deduplication_field] + for field in dedupe_fields: + dupes = model.objects.values(field.name).annotate( + Count(field.name) + ).filter(**{'%s__count__gt' % field.name: 1}) + + for dupe in dupes: + value = dupe[field.name] + if not value or value == '': + continue + print('----------') + print(dupe) + objs = model.objects.filter( + **{field.name: value} + ).order_by('id') + canonical = objs.first() + print('keeping', canonical.remote_id) + for obj in objs[1:]: + print(obj.remote_id) + copy_data(canonical, obj) + update_related(canonical, obj) + # remove the outdated entry + obj.delete() + + +class Command(BaseCommand): + ''' dedplucate allllll the book data models ''' + help = 'merges duplicate book data' + # pylint: disable=no-self-use,unused-argument + def handle(self, *args, **options): + ''' run deudplications ''' + dedupe_model(models.Edition) + dedupe_model(models.Work) + dedupe_model(models.Author)