BookDataModel: implement merge_into method

This commit is contained in:
Bart Schuurmans 2024-02-22 10:27:38 +01:00
parent b3753ab6da
commit 5e123972e8
5 changed files with 95 additions and 90 deletions

View file

@ -1,13 +1,14 @@
""" PROCEED WITH CAUTION: uses deduplication fields to permanently
merge book data objects """
from django.core.management.base import BaseCommand
from django.db.models import Count
from bookwyrm import models
from bookwyrm.management.merge import merge_objects
def dedupe_model(model):
"""combine duplicate editions and update related models"""
print(f"deduplicating {model.__name__}:")
fields = model._meta.get_fields()
dedupe_fields = [
f for f in fields if hasattr(f, "deduplication_field") and f.deduplication_field
@ -16,27 +17,27 @@ def dedupe_model(model):
dupes = (
model.objects.values(field.name)
.annotate(Count(field.name))
.filter(**{"%s__count__gt" % field.name: 1})
.filter(**{f"{field.name}__count__gt": 1})
.exclude(**{field.name: ""})
.exclude(**{f"{field.name}__isnull": True})
)
for dupe in dupes:
value = dupe[field.name]
if not value or value == "":
continue
print("----------")
print(dupe)
objs = model.objects.filter(**{field.name: value}).order_by("id")
canonical = objs.first()
print("keeping", canonical.remote_id)
print(f"merging into {canonical.remote_id} based on {field.name} {value}:")
for obj in objs[1:]:
print(obj.remote_id)
merge_objects(canonical, obj)
print(f"- {obj.remote_id}")
obj.merge_into(canonical)
class Command(BaseCommand):
"""deduplicate allllll the book data models"""
help = "merges duplicate book data"
# pylint: disable=no-self-use,unused-argument
def handle(self, *args, **options):
"""run deduplications"""

View file

@ -1,50 +0,0 @@
from django.db.models import ManyToManyField
def update_related(canonical, obj):
"""update all the models with fk to the object being removed"""
# move related models to canonical
related_models = [
(r.remote_field.name, r.related_model) for r in canonical._meta.related_objects
]
for (related_field, related_model) in related_models:
# Skip the ManyToMany fields that arent auto-created. These
# should have a corresponding OneToMany field in the model for
# the linking table anyway. If we update it through that model
# instead then we wont lose the extra fields in the linking
# table.
related_field_obj = related_model._meta.get_field(related_field)
if isinstance(related_field_obj, ManyToManyField):
through = related_field_obj.remote_field.through
if not through._meta.auto_created:
continue
related_objs = related_model.objects.filter(**{related_field: obj})
for related_obj in related_objs:
print("replacing in", related_model.__name__, related_field, related_obj.id)
try:
setattr(related_obj, related_field, canonical)
related_obj.save()
except TypeError:
getattr(related_obj, related_field).add(canonical)
getattr(related_obj, related_field).remove(obj)
def copy_data(canonical, obj):
"""try to get the most data possible"""
for data_field in obj._meta.get_fields():
if not hasattr(data_field, "activitypub_field"):
continue
data_value = getattr(obj, data_field.name)
if not data_value:
continue
if not getattr(canonical, data_field.name):
print("setting data field", data_field.name, data_value)
setattr(canonical, data_field.name, data_value)
canonical.save()
def merge_objects(canonical, obj):
copy_data(canonical, obj)
update_related(canonical, obj)
# remove the outdated entry
obj.delete()

View file

@ -1,4 +1,3 @@
from bookwyrm.management.merge import merge_objects
from django.core.management.base import BaseCommand
@ -26,4 +25,4 @@ class MergeCommand(BaseCommand):
print("other book doesnt exist!")
return
merge_objects(canonical, other)
other.merge_into(canonical)

View file

@ -1,4 +1,5 @@
""" database schema for info about authors """
import re
from typing import Tuple, Any
@ -9,13 +10,15 @@ from bookwyrm import activitypub
from bookwyrm.settings import DOMAIN
from bookwyrm.utils.db import format_trigger
from .book import BookDataModel
from .book import BookDataModel, MergedAuthor
from . import fields
class Author(BookDataModel):
"""basic biographic info"""
merged_model = MergedAuthor
wikipedia_link = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True
)

View file

@ -3,12 +3,13 @@
from itertools import chain
import re
from typing import Any
from typing_extensions import Self
from django.contrib.postgres.search import SearchVectorField
from django.contrib.postgres.indexes import GinIndex
from django.core.cache import cache
from django.db import models, transaction
from django.db.models import Prefetch
from django.db.models import Prefetch, ManyToManyField
from django.dispatch import receiver
from django.utils.translation import gettext_lazy as _
from model_utils import FieldTracker
@ -109,10 +110,89 @@ class BookDataModel(ObjectMixin, BookWyrmModel):
"""only send book data updates to other bookwyrm instances"""
super().broadcast(activity, sender, software=software, **kwargs)
def merge_into(self, canonical: Self) -> None:
"""merge this entity into another entity"""
if canonical.id == self.id:
raise ValueError(f"Cannot merge {self} into itself")
canonical.absorb_data_from(self)
canonical.save()
self.merged_model.objects.create(deleted_id=self.id, merged_into=canonical)
# move related models to canonical
related_models = [
(r.remote_field.name, r.related_model) for r in self._meta.related_objects
]
# pylint: disable=protected-access
for related_field, related_model in related_models:
# Skip the ManyToMany fields that arent auto-created. These
# should have a corresponding OneToMany field in the model for
# the linking table anyway. If we update it through that model
# instead then we wont lose the extra fields in the linking
# table.
# pylint: disable=protected-access
related_field_obj = related_model._meta.get_field(related_field)
if isinstance(related_field_obj, ManyToManyField):
through = related_field_obj.remote_field.through
if not through._meta.auto_created:
continue
related_objs = related_model.objects.filter(**{related_field: self})
for related_obj in related_objs:
try:
setattr(related_obj, related_field, canonical)
related_obj.save()
except TypeError:
getattr(related_obj, related_field).add(canonical)
getattr(related_obj, related_field).remove(self)
self.delete()
def absorb_data_from(self, other: Self) -> None:
"""fill empty fields with values from another entity"""
for data_field in self._meta.get_fields():
if not hasattr(data_field, "activitypub_field"):
continue
data_value = getattr(other, data_field.name)
if not data_value:
continue
if not getattr(self, data_field.name):
setattr(self, data_field.name, data_value)
class MergedBookDataModel(models.Model):
"""a BookDataModel instance that has been merged into another instance. kept
to be able to redirect old URLs"""
deleted_id = models.IntegerField(primary_key=True)
class Meta:
"""abstract just like BookDataModel"""
abstract = True
class MergedBook(MergedBookDataModel):
"""an Book that has been merged into another one"""
merged_into = models.ForeignKey(
"Book", on_delete=models.PROTECT, related_name="absorbed"
)
class MergedAuthor(MergedBookDataModel):
"""an Author that has been merged into another one"""
merged_into = models.ForeignKey(
"Author", on_delete=models.PROTECT, related_name="absorbed"
)
class Book(BookDataModel):
"""a generic book, which can mean either an edition or a work"""
merged_model = MergedBook
connector = models.ForeignKey("Connector", on_delete=models.PROTECT, null=True)
# book/work metadata
@ -456,34 +536,6 @@ class Edition(Book):
return queryset
class MergedBookDataModel(models.Model):
"""a BookDataModel instance that has been merged into another instance. kept
to be able to redirect old URLs"""
deleted_id = models.IntegerField(primary_key=True)
class Meta:
"""abstract just like BookDataModel"""
abstract = True
class MergedAuthor(MergedBookDataModel):
"""an Author that has been merged into another one"""
merged_into = models.ForeignKey(
"Author", on_delete=models.PROTECT, related_name="absorbed"
)
class MergedBook(MergedBookDataModel):
"""an Book that has been merged into another one"""
merged_into = models.ForeignKey(
"Book", on_delete=models.PROTECT, related_name="absorbed"
)
def isbn_10_to_13(isbn_10):
"""convert an isbn 10 into an isbn 13"""
isbn_10 = re.sub(r"[^0-9X]", "", isbn_10)