BookDataModel: implement merge_into method

This commit is contained in:
Bart Schuurmans 2024-02-22 10:27:38 +01:00
parent b3753ab6da
commit 5e123972e8
5 changed files with 95 additions and 90 deletions

View file

@ -1,13 +1,14 @@
""" PROCEED WITH CAUTION: uses deduplication fields to permanently """ PROCEED WITH CAUTION: uses deduplication fields to permanently
merge book data objects """ merge book data objects """
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
from django.db.models import Count from django.db.models import Count
from bookwyrm import models from bookwyrm import models
from bookwyrm.management.merge import merge_objects
def dedupe_model(model): def dedupe_model(model):
"""combine duplicate editions and update related models""" """combine duplicate editions and update related models"""
print(f"deduplicating {model.__name__}:")
fields = model._meta.get_fields() fields = model._meta.get_fields()
dedupe_fields = [ dedupe_fields = [
f for f in fields if hasattr(f, "deduplication_field") and f.deduplication_field f for f in fields if hasattr(f, "deduplication_field") and f.deduplication_field
@ -16,27 +17,27 @@ def dedupe_model(model):
dupes = ( dupes = (
model.objects.values(field.name) model.objects.values(field.name)
.annotate(Count(field.name)) .annotate(Count(field.name))
.filter(**{"%s__count__gt" % field.name: 1}) .filter(**{f"{field.name}__count__gt": 1})
.exclude(**{field.name: ""})
.exclude(**{f"{field.name}__isnull": True})
) )
for dupe in dupes: for dupe in dupes:
value = dupe[field.name] value = dupe[field.name]
if not value or value == "":
continue
print("----------") print("----------")
print(dupe)
objs = model.objects.filter(**{field.name: value}).order_by("id") objs = model.objects.filter(**{field.name: value}).order_by("id")
canonical = objs.first() canonical = objs.first()
print("keeping", canonical.remote_id) print(f"merging into {canonical.remote_id} based on {field.name} {value}:")
for obj in objs[1:]: for obj in objs[1:]:
print(obj.remote_id) print(f"- {obj.remote_id}")
merge_objects(canonical, obj) obj.merge_into(canonical)
class Command(BaseCommand): class Command(BaseCommand):
"""deduplicate allllll the book data models""" """deduplicate allllll the book data models"""
help = "merges duplicate book data" help = "merges duplicate book data"
# pylint: disable=no-self-use,unused-argument # pylint: disable=no-self-use,unused-argument
def handle(self, *args, **options): def handle(self, *args, **options):
"""run deduplications""" """run deduplications"""

View file

@ -1,50 +0,0 @@
from django.db.models import ManyToManyField
def update_related(canonical, obj):
    """Repoint every row that references ``obj`` so it references ``canonical``.

    Walks the reverse relations declared on ``canonical``'s model. Plain
    relations are reassigned and saved; many-to-many relations that use an
    auto-created linking table are updated through add/remove.
    """
    for relation in canonical._meta.related_objects:
        field_name = relation.remote_field.name
        model_cls = relation.related_model
        field = model_cls._meta.get_field(field_name)

        # A ManyToMany field with a hand-written linking table is skipped:
        # that linking model should have its own relation to this model, and
        # updating through it keeps the extra columns of the linking table.
        has_custom_through = (
            isinstance(field, ManyToManyField)
            and not field.remote_field.through._meta.auto_created
        )
        if has_custom_through:
            continue

        for row in model_cls.objects.filter(**{field_name: obj}):
            print("replacing in", model_cls.__name__, field_name, row.id)
            try:
                setattr(row, field_name, canonical)
                row.save()
            except TypeError:
                # the field cannot be assigned directly; swap the members of
                # the relation instead
                manager = getattr(row, field_name)
                manager.add(canonical)
                manager.remove(obj)
def copy_data(canonical, obj):
    """Fill the gaps in ``canonical`` with values from ``obj``, then save it.

    Only fields that carry an ``activitypub_field`` attribute are looked at.
    A value is copied when ``obj`` has a truthy value and ``canonical``'s
    current value is falsy.
    """
    for field in obj._meta.get_fields():
        if not hasattr(field, "activitypub_field"):
            continue
        name = field.name
        incoming = getattr(obj, name)
        if incoming and not getattr(canonical, name):
            print("setting data field", name, incoming)
            setattr(canonical, name, incoming)
    canonical.save()
def merge_objects(canonical, obj):
    """Merge ``obj`` into ``canonical`` and delete ``obj``.

    Order matters: data is copied first, then references are moved, and only
    then is the outdated entry removed.
    """
    copy_data(canonical, obj)
    update_related(canonical, obj)
    obj.delete()

View file

@ -1,4 +1,3 @@
from bookwyrm.management.merge import merge_objects
from django.core.management.base import BaseCommand from django.core.management.base import BaseCommand
@ -26,4 +25,4 @@ class MergeCommand(BaseCommand):
print("other book doesnt exist!") print("other book doesnt exist!")
return return
merge_objects(canonical, other) other.merge_into(canonical)

View file

@ -1,4 +1,5 @@
""" database schema for info about authors """ """ database schema for info about authors """
import re import re
from typing import Tuple, Any from typing import Tuple, Any
@ -9,13 +10,15 @@ from bookwyrm import activitypub
from bookwyrm.settings import DOMAIN from bookwyrm.settings import DOMAIN
from bookwyrm.utils.db import format_trigger from bookwyrm.utils.db import format_trigger
from .book import BookDataModel from .book import BookDataModel, MergedAuthor
from . import fields from . import fields
class Author(BookDataModel): class Author(BookDataModel):
"""basic biographic info""" """basic biographic info"""
merged_model = MergedAuthor
wikipedia_link = fields.CharField( wikipedia_link = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True max_length=255, blank=True, null=True, deduplication_field=True
) )

View file

@ -3,12 +3,13 @@
from itertools import chain from itertools import chain
import re import re
from typing import Any from typing import Any
from typing_extensions import Self
from django.contrib.postgres.search import SearchVectorField from django.contrib.postgres.search import SearchVectorField
from django.contrib.postgres.indexes import GinIndex from django.contrib.postgres.indexes import GinIndex
from django.core.cache import cache from django.core.cache import cache
from django.db import models, transaction from django.db import models, transaction
from django.db.models import Prefetch from django.db.models import Prefetch, ManyToManyField
from django.dispatch import receiver from django.dispatch import receiver
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
from model_utils import FieldTracker from model_utils import FieldTracker
@ -109,10 +110,89 @@ class BookDataModel(ObjectMixin, BookWyrmModel):
"""only send book data updates to other bookwyrm instances""" """only send book data updates to other bookwyrm instances"""
super().broadcast(activity, sender, software=software, **kwargs) super().broadcast(activity, sender, software=software, **kwargs)
def merge_into(self, canonical: Self) -> None:
    """Merge this entity into ``canonical`` and then delete this entity.

    Steps, all performed inside a single database transaction:

    1. ``canonical`` absorbs data from this entity and is saved.
    2. A ``merged_model`` record is created that maps this entity's id to
       ``canonical``. ``merged_model`` is a class attribute that concrete
       subclasses are expected to set.
    3. Every related row pointing at this entity is repointed at
       ``canonical``.
    4. This entity is deleted.

    Args:
        canonical: the entity that survives the merge.

    Raises:
        ValueError: if ``canonical`` has the same id as this entity.
    """
    if canonical.id == self.id:
        raise ValueError(f"Cannot merge {self} into itself")

    # Run the whole merge atomically so that a failure part-way through
    # cannot leave related rows split between the two entities.
    # NOTE(review): side effects triggered by save() (if any) still run
    # before the transaction commits -- confirm that is acceptable.
    with transaction.atomic():
        canonical.absorb_data_from(self)
        canonical.save()

        self.merged_model.objects.create(deleted_id=self.id, merged_into=canonical)

        # move related models to canonical
        # pylint: disable=protected-access
        for relation in self._meta.related_objects:
            related_field = relation.remote_field.name
            related_model = relation.related_model

            # Skip the ManyToMany fields that aren't auto-created. These
            # should have a corresponding OneToMany field in the model for
            # the linking table anyway. If we update it through that model
            # instead then we won't lose the extra fields in the linking
            # table.
            related_field_obj = related_model._meta.get_field(related_field)
            if isinstance(related_field_obj, ManyToManyField):
                through = related_field_obj.remote_field.through
                if not through._meta.auto_created:
                    continue

            related_objs = related_model.objects.filter(**{related_field: self})
            for related_obj in related_objs:
                # Only the assignment is guarded: a TypeError here means the
                # field cannot be assigned directly (many-to-many). A
                # TypeError raised by save() is a real error and propagates.
                try:
                    setattr(related_obj, related_field, canonical)
                except TypeError:
                    manager = getattr(related_obj, related_field)
                    manager.add(canonical)
                    manager.remove(self)
                else:
                    related_obj.save()

        self.delete()
def absorb_data_from(self, other: Self) -> None:
    """Copy values from ``other`` into this entity's empty fields.

    Only fields that carry an ``activitypub_field`` attribute are considered.
    A field is filled when ``other`` has a truthy value for it and this
    entity's current value is falsy. Nothing is saved here; persisting the
    result is left to the caller.
    """
    candidate_fields = (
        field
        for field in self._meta.get_fields()
        if hasattr(field, "activitypub_field")
    )
    for field in candidate_fields:
        name = field.name
        incoming = getattr(other, name)
        if incoming and not getattr(self, name):
            setattr(self, name, incoming)
class MergedBookDataModel(models.Model):
    """Record of a BookDataModel instance that was merged into another one.

    The record is kept so that old URLs can be redirected.
    """

    deleted_id = models.IntegerField(primary_key=True)

    class Meta:
        """Abstract, just like BookDataModel."""

        abstract = True
class MergedBook(MergedBookDataModel):
    """A Book that has been merged into another one."""

    merged_into = models.ForeignKey(
        "Book",
        related_name="absorbed",
        on_delete=models.PROTECT,
    )
class MergedAuthor(MergedBookDataModel):
    """An Author that has been merged into another one."""

    merged_into = models.ForeignKey(
        "Author",
        related_name="absorbed",
        on_delete=models.PROTECT,
    )
class Book(BookDataModel): class Book(BookDataModel):
"""a generic book, which can mean either an edition or a work""" """a generic book, which can mean either an edition or a work"""
merged_model = MergedBook
connector = models.ForeignKey("Connector", on_delete=models.PROTECT, null=True) connector = models.ForeignKey("Connector", on_delete=models.PROTECT, null=True)
# book/work metadata # book/work metadata
@ -456,34 +536,6 @@ class Edition(Book):
return queryset return queryset
class MergedBookDataModel(models.Model):
    """Record of a BookDataModel instance that was merged into another one.

    The record is kept so that old URLs can be redirected.
    """

    deleted_id = models.IntegerField(primary_key=True)

    class Meta:
        """Abstract, just like BookDataModel."""

        abstract = True
class MergedAuthor(MergedBookDataModel):
    """An Author that has been merged into another one."""

    merged_into = models.ForeignKey(
        "Author",
        related_name="absorbed",
        on_delete=models.PROTECT,
    )
class MergedBook(MergedBookDataModel):
    """A Book that has been merged into another one."""

    merged_into = models.ForeignKey(
        "Book",
        related_name="absorbed",
        on_delete=models.PROTECT,
    )
def isbn_10_to_13(isbn_10): def isbn_10_to_13(isbn_10):
"""convert an isbn 10 into an isbn 13""" """convert an isbn 10 into an isbn 13"""
isbn_10 = re.sub(r"[^0-9X]", "", isbn_10) isbn_10 = re.sub(r"[^0-9X]", "", isbn_10)