''' PROCEED WITH CAUTION: uses deduplication fields to permanently
merge book data objects '''
from django.core.management.base import BaseCommand
from django.db.models import Count
from bookwyrm import models


def update_related(canonical, obj):
    ''' update all the models with fk to the object being removed

    For every reverse relation listed in ``canonical._meta.related_objects``,
    look up the rows of the related model whose relation field points at
    ``obj`` and point them at ``canonical`` instead.

    canonical: the instance that is being kept
    obj: the duplicate instance whose references are moved
    '''
    # (name of the field on the related model, the related model itself)
    related_models = [
        (relation.remote_field.name, relation.related_model)
        for relation in canonical._meta.related_objects
    ]
    for related_field, related_model in related_models:
        related_objs = related_model.objects.filter(**{related_field: obj})
        for related_obj in related_objs:
            print(
                'replacing in',
                related_model.__name__,
                related_field,
                related_obj.id
            )
            # Only the assignment is guarded: a TypeError raised by save()
            # must surface instead of being mistaken for the fallback case.
            try:
                setattr(related_obj, related_field, canonical)
            except TypeError:
                # Direct assignment was rejected, so the attribute is treated
                # as a relation manager (presumably a many-to-many field):
                # link the canonical object and unlink the duplicate.
                manager = getattr(related_obj, related_field)
                manager.add(canonical)
                manager.remove(obj)
            else:
                related_obj.save()


def copy_data(canonical, obj):
    ''' try to get the most data possible

    Copies a value from ``obj`` onto ``canonical`` for each field that has
    an ``activitypub_field`` attribute, but only when ``obj`` has a truthy
    value and ``canonical`` does not. ``canonical`` is saved once at the
    end, whether or not anything was copied.
    '''
    for data_field in obj._meta.get_fields():
        # fields without this attribute are skipped entirely
        if not hasattr(data_field, 'activitypub_field'):
            continue
        data_value = getattr(obj, data_field.name)
        if not data_value:
            continue
        # never overwrite data the canonical object already has
        if not getattr(canonical, data_field.name):
            print('setting data field', data_field.name, data_value)
            setattr(canonical, data_field.name, data_value)
    canonical.save()


def dedupe_model(model):
    ''' combine duplicate entries of a model and update related models

    For each field of ``model`` flagged with a truthy
    ``deduplication_field`` attribute, finds values shared by more than one
    row. The row with the lowest id is kept; every other row has its data
    copied over, its relations moved, and is then deleted.
    '''
    dedupe_fields = [
        field for field in model._meta.get_fields()
        if getattr(field, 'deduplication_field', False)
    ]
    for field in dedupe_fields:
        # group by the field's value and keep groups with more than one row
        dupes = model.objects.values(field.name).annotate(
            Count(field.name)
        ).filter(**{'%s__count__gt' % field.name: 1})

        for dupe in dupes:
            value = dupe[field.name]
            # empty values (None, '') are not identifiers, so sharing one
            # does not make two rows duplicates
            if not value:
                continue
            print('----------')
            print(dupe)
            objs = model.objects.filter(
                **{field.name: value}
            ).order_by('id')
            canonical = objs.first()
            print('keeping', canonical.remote_id)
            for obj in objs[1:]:
                print(obj.remote_id)
                copy_data(canonical, obj)
                update_related(canonical, obj)
                # remove the outdated entry
                obj.delete()


class Command(BaseCommand):
    ''' deduplicate all the book data models '''
    help = 'merges duplicate book data'
    # pylint: disable=no-self-use,unused-argument
    def handle(self, *args, **options):
        ''' run deduplications '''
        dedupe_model(models.Edition)
        dedupe_model(models.Work)
        dedupe_model(models.Author)