# Originally added in commit c5a3e57bb42e83e1c8bb24ff047c1adcdd4b53b4
# (Mouse Reeve, 2020-12-31, "Script for removing duplicates in book data")
# as bookwyrm/management/commands/deduplicate_book_data.py
''' PROCEED WITH CAUTION: uses deduplication fields to permanently
merge book data objects '''
from django.core.management.base import BaseCommand
from django.db import transaction
from django.db.models import Count
from bookwyrm import models


def _copy_missing_fields(canonical, obj, data_fields):
    ''' fill empty fields on `canonical` with the values found on `obj`

    Only fields in `data_fields` are considered, and a value is copied only
    when `obj` has a truthy value and `canonical` has a falsy one. The
    canonical object is modified in memory but NOT saved here.

    Returns True if at least one field was copied, False otherwise. '''
    changed = False
    for data_field in data_fields:
        data_value = getattr(obj, data_field.name)
        if not data_value:
            continue
        if not getattr(canonical, data_field.name):
            print('setting data field', data_field.name, data_value)
            setattr(canonical, data_field.name, data_value)
            changed = True
    return changed


def _move_related(canonical, obj, related_models):
    ''' re-point every object that references `obj` at `canonical`

    `related_models` is a list of (field name, model class) pairs describing
    the relations that point at the model being deduplicated. '''
    for related_field, related_model in related_models:
        related_objs = related_model.objects.filter(**{related_field: obj})
        for related_obj in related_objs:
            print('replacing in', related_model.__name__, related_obj)
            try:
                setattr(related_obj, related_field, canonical)
            except TypeError:
                # Django rejects direct assignment to a many-to-many
                # relation with a TypeError, so swap the entries in the
                # relation instead of assigning
                relation = getattr(related_obj, related_field)
                relation.add(canonical)
                relation.remove(obj)
            else:
                # only the assignment is guarded: a TypeError raised while
                # saving is a real error and must not be mistaken for the
                # many-to-many case
                related_obj.save()


def dedupe_model(model):
    ''' combine duplicate rows of `model` and update related models

    For every field on `model` flagged with a truthy `deduplication_field`
    attribute, rows sharing the same non-empty value are merged: the row
    with the lowest id is kept, missing data is copied onto it from the
    duplicates, related objects are moved over, and the duplicates are
    PERMANENTLY deleted. Progress is printed to stdout. '''
    fields = model._meta.get_fields()
    dedupe_fields = [
        f for f in fields if getattr(f, 'deduplication_field', False)]
    # these depend only on the model, so compute them once instead of once
    # per duplicate object
    data_fields = [f for f in fields if hasattr(f, 'activitypub_field')]
    related_models = [
        (r.remote_field.name, r.related_model)
        for r in model._meta.related_objects]

    for field in dedupe_fields:
        # the trailing order_by() clears any default ordering so it cannot
        # be added to the GROUP BY and split up groups of duplicates
        dupes = model.objects.values(field.name).annotate(
            Count(field.name)
        ).filter(**{'%s__count__gt' % field.name: 1}).order_by()

        for dupe in dupes:
            value = dupe[field.name]
            # empty values are "not set", not duplicates of one another
            if not value:
                continue
            print('----------')
            print(dupe)
            objs = model.objects.filter(
                **{field.name: value}
            ).order_by('id')
            # the oldest row (lowest id) survives the merge
            canonical = objs.first()
            print('keeping', canonical.remote_id, canonical.id)
            for obj in objs[1:]:
                print(obj.remote_id, obj.id)
                # merge each duplicate atomically: if anything fails, the
                # database is not left with relations half moved
                with transaction.atomic():
                    # try to get the most data possible
                    if _copy_missing_fields(canonical, obj, data_fields):
                        canonical.save()
                    # move related models to canonical before deleting
                    _move_related(canonical, obj, related_models)
                    obj.delete()


class Command(BaseCommand):
    ''' management command that merges duplicate book data '''
    help = 'merges duplicate book data'

    def handle(self, *args, **options):
        ''' run deduplications '''
        dedupe_model(models.Edition)
        dedupe_model(models.Work)
        dedupe_model(models.Author)