mirror of
https://github.com/bookwyrm-social/bookwyrm.git
synced 2024-12-23 08:36:32 +00:00
BookDataModel: implement merge_into method
This commit is contained in:
parent
b3753ab6da
commit
5e123972e8
5 changed files with 95 additions and 90 deletions
|
@ -1,13 +1,14 @@
|
|||
""" PROCEED WITH CAUTION: uses deduplication fields to permanently
|
||||
merge book data objects """
|
||||
|
||||
from django.core.management.base import BaseCommand
|
||||
from django.db.models import Count
|
||||
from bookwyrm import models
|
||||
from bookwyrm.management.merge import merge_objects
|
||||
|
||||
|
||||
def dedupe_model(model):
|
||||
"""combine duplicate editions and update related models"""
|
||||
print(f"deduplicating {model.__name__}:")
|
||||
fields = model._meta.get_fields()
|
||||
dedupe_fields = [
|
||||
f for f in fields if hasattr(f, "deduplication_field") and f.deduplication_field
|
||||
|
@ -16,27 +17,27 @@ def dedupe_model(model):
|
|||
dupes = (
|
||||
model.objects.values(field.name)
|
||||
.annotate(Count(field.name))
|
||||
.filter(**{"%s__count__gt" % field.name: 1})
|
||||
.filter(**{f"{field.name}__count__gt": 1})
|
||||
.exclude(**{field.name: ""})
|
||||
.exclude(**{f"{field.name}__isnull": True})
|
||||
)
|
||||
|
||||
for dupe in dupes:
|
||||
value = dupe[field.name]
|
||||
if not value or value == "":
|
||||
continue
|
||||
print("----------")
|
||||
print(dupe)
|
||||
objs = model.objects.filter(**{field.name: value}).order_by("id")
|
||||
canonical = objs.first()
|
||||
print("keeping", canonical.remote_id)
|
||||
print(f"merging into {canonical.remote_id} based on {field.name} {value}:")
|
||||
for obj in objs[1:]:
|
||||
print(obj.remote_id)
|
||||
merge_objects(canonical, obj)
|
||||
print(f"- {obj.remote_id}")
|
||||
obj.merge_into(canonical)
|
||||
|
||||
|
||||
class Command(BaseCommand):
|
||||
"""deduplicate allllll the book data models"""
|
||||
|
||||
help = "merges duplicate book data"
|
||||
|
||||
# pylint: disable=no-self-use,unused-argument
|
||||
def handle(self, *args, **options):
|
||||
"""run deduplications"""
|
||||
|
|
|
@ -1,50 +0,0 @@
|
|||
from django.db.models import ManyToManyField
|
||||
|
||||
|
||||
def update_related(canonical, obj):
    """Re-point everything that references ``obj`` so it references ``canonical``.

    Walks the reverse relations declared on ``canonical``'s model, finds the
    rows of each related model that currently point at ``obj`` and swaps the
    reference over, printing one progress line per row touched.
    """
    links = [
        (relation.remote_field.name, relation.related_model)
        for relation in canonical._meta.related_objects
    ]
    for field_name, owner_model in links:
        field = owner_model._meta.get_field(field_name)
        # A many-to-many with an explicit (not auto-created) linking table is
        # left alone here: the linking model is expected to show up as its own
        # relation, and updating it through that model keeps the extra columns
        # stored on the linking table.
        if (
            isinstance(field, ManyToManyField)
            and not field.remote_field.through._meta.auto_created
        ):
            continue
        for owner in owner_model.objects.filter(**{field_name: obj}):
            print("replacing in", owner_model.__name__, field_name, owner.id)
            try:
                setattr(owner, field_name, canonical)
                owner.save()
            except TypeError:
                # the attribute cannot be assigned directly, so treat it as a
                # collection: add the survivor, then drop the duplicate
                getattr(owner, field_name).add(canonical)
                getattr(owner, field_name).remove(obj)
|
||||
|
||||
|
||||
def copy_data(canonical, obj):
    """Fill the empty fields of ``canonical`` with values taken from ``obj``.

    Only fields carrying an ``activitypub_field`` attribute are considered.
    A value is copied when ``obj`` has a truthy value and ``canonical`` has a
    falsy one; values already present on ``canonical`` are never overwritten.
    ``canonical`` is saved once at the end, whether or not anything changed.
    """
    candidates = (
        field
        for field in obj._meta.get_fields()
        if hasattr(field, "activitypub_field")
    )
    for field in candidates:
        incoming = getattr(obj, field.name)
        if incoming and not getattr(canonical, field.name):
            print("setting data field", field.name, incoming)
            setattr(canonical, field.name, incoming)
    canonical.save()
|
||||
|
||||
|
||||
def merge_objects(canonical, obj):
    """Merge ``obj`` into ``canonical`` and delete ``obj``.

    Calls ``copy_data`` and then ``update_related`` with the same pair of
    objects before deleting ``obj``. ``canonical`` is the object that is
    kept.
    """
    copy_data(canonical, obj)
    update_related(canonical, obj)
    # remove the outdated entry
    obj.delete()
|
|
@ -1,4 +1,3 @@
|
|||
from bookwyrm.management.merge import merge_objects
|
||||
from django.core.management.base import BaseCommand
|
||||
|
||||
|
||||
|
@ -26,4 +25,4 @@ class MergeCommand(BaseCommand):
|
|||
print("other book doesn’t exist!")
|
||||
return
|
||||
|
||||
merge_objects(canonical, other)
|
||||
other.merge_into(canonical)
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
""" database schema for info about authors """
|
||||
|
||||
import re
|
||||
from typing import Tuple, Any
|
||||
|
||||
|
@ -9,13 +10,15 @@ from bookwyrm import activitypub
|
|||
from bookwyrm.settings import DOMAIN
|
||||
from bookwyrm.utils.db import format_trigger
|
||||
|
||||
from .book import BookDataModel
|
||||
from .book import BookDataModel, MergedAuthor
|
||||
from . import fields
|
||||
|
||||
|
||||
class Author(BookDataModel):
|
||||
"""basic biographic info"""
|
||||
|
||||
merged_model = MergedAuthor
|
||||
|
||||
wikipedia_link = fields.CharField(
|
||||
max_length=255, blank=True, null=True, deduplication_field=True
|
||||
)
|
||||
|
|
|
@ -3,12 +3,13 @@
|
|||
from itertools import chain
|
||||
import re
|
||||
from typing import Any
|
||||
from typing_extensions import Self
|
||||
|
||||
from django.contrib.postgres.search import SearchVectorField
|
||||
from django.contrib.postgres.indexes import GinIndex
|
||||
from django.core.cache import cache
|
||||
from django.db import models, transaction
|
||||
from django.db.models import Prefetch
|
||||
from django.db.models import Prefetch, ManyToManyField
|
||||
from django.dispatch import receiver
|
||||
from django.utils.translation import gettext_lazy as _
|
||||
from model_utils import FieldTracker
|
||||
|
@ -109,10 +110,89 @@ class BookDataModel(ObjectMixin, BookWyrmModel):
|
|||
"""only send book data updates to other bookwyrm instances"""
|
||||
super().broadcast(activity, sender, software=software, **kwargs)
|
||||
|
||||
def merge_into(self, canonical: Self) -> None:
    """Merge this entity into ``canonical`` and delete this entity.

    Steps, all performed inside a single database transaction so that a
    failure part-way through does not leave a half-finished merge behind:

    1. empty fields on ``canonical`` are filled from this entity and
       ``canonical`` is saved;
    2. a ``merged_model`` row is created recording this entity's id and
       the entity it was merged into;
    3. every related object that points at this entity is re-pointed at
       ``canonical``;
    4. this entity is deleted.

    Note that the transaction only protects the database: values already
    copied onto the in-memory ``canonical`` instance are not reverted if
    the merge fails.

    Args:
        canonical: the entity that survives the merge.

    Raises:
        ValueError: if ``canonical`` has the same id as this entity.
    """
    if canonical.id == self.id:
        raise ValueError(f"Cannot merge {self} into itself")

    with transaction.atomic():
        canonical.absorb_data_from(self)
        canonical.save()

        self.merged_model.objects.create(deleted_id=self.id, merged_into=canonical)

        # move related models to canonical
        # pylint: disable=protected-access
        related_models = [
            (r.remote_field.name, r.related_model) for r in self._meta.related_objects
        ]
        for related_field, related_model in related_models:
            # Skip the ManyToMany fields that aren't auto-created. These
            # should have a corresponding OneToMany field in the model for
            # the linking table anyway. If we update it through that model
            # instead then we won't lose the extra fields in the linking
            # table.
            related_field_obj = related_model._meta.get_field(related_field)
            if isinstance(related_field_obj, ManyToManyField):
                through = related_field_obj.remote_field.through
                if not through._meta.auto_created:
                    continue
            related_objs = related_model.objects.filter(**{related_field: self})
            for related_obj in related_objs:
                try:
                    setattr(related_obj, related_field, canonical)
                except TypeError:
                    # Django rejects direct assignment to a many-to-many
                    # attribute with TypeError; swap the members instead.
                    getattr(related_obj, related_field).add(canonical)
                    getattr(related_obj, related_field).remove(self)
                else:
                    # Only the assignment is guarded: a TypeError raised by
                    # save() is a real error and must not be mistaken for
                    # the many-to-many case.
                    related_obj.save()

        self.delete()
|
||||
|
||||
def absorb_data_from(self, other: Self) -> None:
    """Fill this entity's empty fields with the values ``other`` has.

    Only fields carrying an ``activitypub_field`` attribute are considered.
    A value is taken over when ``other`` has a truthy value and this entity
    has a falsy one; existing values are never overwritten. Nothing is
    saved here -- persisting the result is left to the caller.
    """
    candidates = (
        field
        for field in self._meta.get_fields()
        if hasattr(field, "activitypub_field")
    )
    for field in candidates:
        incoming = getattr(other, field.name)
        if incoming and not getattr(self, field.name):
            setattr(self, field.name, incoming)
|
||||
|
||||
|
||||
class MergedBookDataModel(models.Model):
    """a BookDataModel instance that has been merged into another instance. kept
    to be able to redirect old URLs"""

    # id of the instance that was merged away; also this table's primary key,
    # so each id can be recorded at most once
    deleted_id = models.IntegerField(primary_key=True)

    class Meta:
        """abstract just like BookDataModel"""

        # no table of its own; concrete subclasses add the ``merged_into`` link
        abstract = True
|
||||
|
||||
|
||||
class MergedBook(MergedBookDataModel):
    """a Book that has been merged into another one"""

    # the Book that absorbed the deleted one; reachable from that Book as
    # ``absorbed``. PROTECT makes Django refuse to delete a Book while merge
    # records still point at it.
    merged_into = models.ForeignKey(
        "Book", on_delete=models.PROTECT, related_name="absorbed"
    )
|
||||
|
||||
|
||||
class MergedAuthor(MergedBookDataModel):
    """an Author that has been merged into another one"""

    # the Author that absorbed the deleted one; reachable from that Author as
    # ``absorbed``. PROTECT makes Django refuse to delete an Author while
    # merge records still point at it.
    merged_into = models.ForeignKey(
        "Author", on_delete=models.PROTECT, related_name="absorbed"
    )
|
||||
|
||||
|
||||
class Book(BookDataModel):
|
||||
"""a generic book, which can mean either an edition or a work"""
|
||||
|
||||
merged_model = MergedBook
|
||||
|
||||
connector = models.ForeignKey("Connector", on_delete=models.PROTECT, null=True)
|
||||
|
||||
# book/work metadata
|
||||
|
@ -456,34 +536,6 @@ class Edition(Book):
|
|||
return queryset
|
||||
|
||||
|
||||
class MergedBookDataModel(models.Model):
    """a BookDataModel instance that has been merged into another instance. kept
    to be able to redirect old URLs"""

    # integer id used as this table's primary key; presumably the id of the
    # instance that was merged away, going by the field name
    deleted_id = models.IntegerField(primary_key=True)

    class Meta:
        """abstract just like BookDataModel"""

        # no table of its own; concrete subclasses add the ``merged_into`` link
        abstract = True
|
||||
|
||||
|
||||
class MergedAuthor(MergedBookDataModel):
    """an Author that has been merged into another one"""

    # the Author that absorbed the deleted one; reachable from that Author as
    # ``absorbed``. PROTECT makes Django refuse to delete an Author while
    # merge records still point at it.
    merged_into = models.ForeignKey(
        "Author", on_delete=models.PROTECT, related_name="absorbed"
    )
|
||||
|
||||
|
||||
class MergedBook(MergedBookDataModel):
    """a Book that has been merged into another one"""

    # the Book that absorbed the deleted one; reachable from that Book as
    # ``absorbed``. PROTECT makes Django refuse to delete a Book while merge
    # records still point at it.
    merged_into = models.ForeignKey(
        "Book", on_delete=models.PROTECT, related_name="absorbed"
    )
|
||||
|
||||
|
||||
def isbn_10_to_13(isbn_10):
|
||||
"""convert an isbn 10 into an isbn 13"""
|
||||
isbn_10 = re.sub(r"[^0-9X]", "", isbn_10)
|
||||
|
|
Loading…
Reference in a new issue