mirror of
https://github.com/bookwyrm-social/bookwyrm.git
synced 2025-01-11 01:35:28 +00:00
BookDataModel: implement merge_into method
This commit is contained in:
parent
b3753ab6da
commit
5e123972e8
5 changed files with 95 additions and 90 deletions
|
@ -1,13 +1,14 @@
|
||||||
""" PROCEED WITH CAUTION: uses deduplication fields to permanently
|
""" PROCEED WITH CAUTION: uses deduplication fields to permanently
|
||||||
merge book data objects """
|
merge book data objects """
|
||||||
|
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
from django.db.models import Count
|
from django.db.models import Count
|
||||||
from bookwyrm import models
|
from bookwyrm import models
|
||||||
from bookwyrm.management.merge import merge_objects
|
|
||||||
|
|
||||||
|
|
||||||
def dedupe_model(model):
|
def dedupe_model(model):
|
||||||
"""combine duplicate editions and update related models"""
|
"""combine duplicate editions and update related models"""
|
||||||
|
print(f"deduplicating {model.__name__}:")
|
||||||
fields = model._meta.get_fields()
|
fields = model._meta.get_fields()
|
||||||
dedupe_fields = [
|
dedupe_fields = [
|
||||||
f for f in fields if hasattr(f, "deduplication_field") and f.deduplication_field
|
f for f in fields if hasattr(f, "deduplication_field") and f.deduplication_field
|
||||||
|
@ -16,27 +17,27 @@ def dedupe_model(model):
|
||||||
dupes = (
|
dupes = (
|
||||||
model.objects.values(field.name)
|
model.objects.values(field.name)
|
||||||
.annotate(Count(field.name))
|
.annotate(Count(field.name))
|
||||||
.filter(**{"%s__count__gt" % field.name: 1})
|
.filter(**{f"{field.name}__count__gt": 1})
|
||||||
|
.exclude(**{field.name: ""})
|
||||||
|
.exclude(**{f"{field.name}__isnull": True})
|
||||||
)
|
)
|
||||||
|
|
||||||
for dupe in dupes:
|
for dupe in dupes:
|
||||||
value = dupe[field.name]
|
value = dupe[field.name]
|
||||||
if not value or value == "":
|
|
||||||
continue
|
|
||||||
print("----------")
|
print("----------")
|
||||||
print(dupe)
|
|
||||||
objs = model.objects.filter(**{field.name: value}).order_by("id")
|
objs = model.objects.filter(**{field.name: value}).order_by("id")
|
||||||
canonical = objs.first()
|
canonical = objs.first()
|
||||||
print("keeping", canonical.remote_id)
|
print(f"merging into {canonical.remote_id} based on {field.name} {value}:")
|
||||||
for obj in objs[1:]:
|
for obj in objs[1:]:
|
||||||
print(obj.remote_id)
|
print(f"- {obj.remote_id}")
|
||||||
merge_objects(canonical, obj)
|
obj.merge_into(canonical)
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
"""deduplicate allllll the book data models"""
|
"""deduplicate allllll the book data models"""
|
||||||
|
|
||||||
help = "merges duplicate book data"
|
help = "merges duplicate book data"
|
||||||
|
|
||||||
# pylint: disable=no-self-use,unused-argument
|
# pylint: disable=no-self-use,unused-argument
|
||||||
def handle(self, *args, **options):
|
def handle(self, *args, **options):
|
||||||
"""run deduplications"""
|
"""run deduplications"""
|
||||||
|
|
|
@ -1,50 +0,0 @@
|
||||||
from django.db.models import ManyToManyField
|
|
||||||
|
|
||||||
|
|
||||||
def update_related(canonical, obj):
|
|
||||||
"""update all the models with fk to the object being removed"""
|
|
||||||
# move related models to canonical
|
|
||||||
related_models = [
|
|
||||||
(r.remote_field.name, r.related_model) for r in canonical._meta.related_objects
|
|
||||||
]
|
|
||||||
for (related_field, related_model) in related_models:
|
|
||||||
# Skip the ManyToMany fields that aren’t auto-created. These
|
|
||||||
# should have a corresponding OneToMany field in the model for
|
|
||||||
# the linking table anyway. If we update it through that model
|
|
||||||
# instead then we won’t lose the extra fields in the linking
|
|
||||||
# table.
|
|
||||||
related_field_obj = related_model._meta.get_field(related_field)
|
|
||||||
if isinstance(related_field_obj, ManyToManyField):
|
|
||||||
through = related_field_obj.remote_field.through
|
|
||||||
if not through._meta.auto_created:
|
|
||||||
continue
|
|
||||||
related_objs = related_model.objects.filter(**{related_field: obj})
|
|
||||||
for related_obj in related_objs:
|
|
||||||
print("replacing in", related_model.__name__, related_field, related_obj.id)
|
|
||||||
try:
|
|
||||||
setattr(related_obj, related_field, canonical)
|
|
||||||
related_obj.save()
|
|
||||||
except TypeError:
|
|
||||||
getattr(related_obj, related_field).add(canonical)
|
|
||||||
getattr(related_obj, related_field).remove(obj)
|
|
||||||
|
|
||||||
|
|
||||||
def copy_data(canonical, obj):
|
|
||||||
"""try to get the most data possible"""
|
|
||||||
for data_field in obj._meta.get_fields():
|
|
||||||
if not hasattr(data_field, "activitypub_field"):
|
|
||||||
continue
|
|
||||||
data_value = getattr(obj, data_field.name)
|
|
||||||
if not data_value:
|
|
||||||
continue
|
|
||||||
if not getattr(canonical, data_field.name):
|
|
||||||
print("setting data field", data_field.name, data_value)
|
|
||||||
setattr(canonical, data_field.name, data_value)
|
|
||||||
canonical.save()
|
|
||||||
|
|
||||||
|
|
||||||
def merge_objects(canonical, obj):
|
|
||||||
copy_data(canonical, obj)
|
|
||||||
update_related(canonical, obj)
|
|
||||||
# remove the outdated entry
|
|
||||||
obj.delete()
|
|
|
@ -1,4 +1,3 @@
|
||||||
from bookwyrm.management.merge import merge_objects
|
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
|
|
||||||
|
|
||||||
|
@ -26,4 +25,4 @@ class MergeCommand(BaseCommand):
|
||||||
print("other book doesn’t exist!")
|
print("other book doesn’t exist!")
|
||||||
return
|
return
|
||||||
|
|
||||||
merge_objects(canonical, other)
|
other.merge_into(canonical)
|
||||||
|
|
|
@ -1,4 +1,5 @@
|
||||||
""" database schema for info about authors """
|
""" database schema for info about authors """
|
||||||
|
|
||||||
import re
|
import re
|
||||||
from typing import Tuple, Any
|
from typing import Tuple, Any
|
||||||
|
|
||||||
|
@ -9,13 +10,15 @@ from bookwyrm import activitypub
|
||||||
from bookwyrm.settings import DOMAIN
|
from bookwyrm.settings import DOMAIN
|
||||||
from bookwyrm.utils.db import format_trigger
|
from bookwyrm.utils.db import format_trigger
|
||||||
|
|
||||||
from .book import BookDataModel
|
from .book import BookDataModel, MergedAuthor
|
||||||
from . import fields
|
from . import fields
|
||||||
|
|
||||||
|
|
||||||
class Author(BookDataModel):
|
class Author(BookDataModel):
|
||||||
"""basic biographic info"""
|
"""basic biographic info"""
|
||||||
|
|
||||||
|
merged_model = MergedAuthor
|
||||||
|
|
||||||
wikipedia_link = fields.CharField(
|
wikipedia_link = fields.CharField(
|
||||||
max_length=255, blank=True, null=True, deduplication_field=True
|
max_length=255, blank=True, null=True, deduplication_field=True
|
||||||
)
|
)
|
||||||
|
|
|
@ -3,12 +3,13 @@
|
||||||
from itertools import chain
|
from itertools import chain
|
||||||
import re
|
import re
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
from typing_extensions import Self
|
||||||
|
|
||||||
from django.contrib.postgres.search import SearchVectorField
|
from django.contrib.postgres.search import SearchVectorField
|
||||||
from django.contrib.postgres.indexes import GinIndex
|
from django.contrib.postgres.indexes import GinIndex
|
||||||
from django.core.cache import cache
|
from django.core.cache import cache
|
||||||
from django.db import models, transaction
|
from django.db import models, transaction
|
||||||
from django.db.models import Prefetch
|
from django.db.models import Prefetch, ManyToManyField
|
||||||
from django.dispatch import receiver
|
from django.dispatch import receiver
|
||||||
from django.utils.translation import gettext_lazy as _
|
from django.utils.translation import gettext_lazy as _
|
||||||
from model_utils import FieldTracker
|
from model_utils import FieldTracker
|
||||||
|
@ -109,10 +110,89 @@ class BookDataModel(ObjectMixin, BookWyrmModel):
|
||||||
"""only send book data updates to other bookwyrm instances"""
|
"""only send book data updates to other bookwyrm instances"""
|
||||||
super().broadcast(activity, sender, software=software, **kwargs)
|
super().broadcast(activity, sender, software=software, **kwargs)
|
||||||
|
|
||||||
|
def merge_into(self, canonical: Self) -> None:
    """merge this entity into another entity

    Fills the empty fields of `canonical` with this entity's data, records
    the merge in `self.merged_model`, re-points the objects that reference
    this entity at `canonical`, and finally deletes this entity.

    Raises ValueError when `canonical` has the same id as this entity.
    """
    if canonical.id == self.id:
        raise ValueError(f"Cannot merge {self} into itself")

    # All or nothing: if any step below fails, roll everything back so we
    # never keep a merge record for an entity that still exists, or end up
    # with relations split between the two entities.
    with transaction.atomic():
        canonical.absorb_data_from(self)
        canonical.save()

        # keep a record of the id that is about to disappear
        self.merged_model.objects.create(deleted_id=self.id, merged_into=canonical)

        # move related models to canonical
        # pylint: disable=protected-access
        related_models = [
            (r.remote_field.name, r.related_model)
            for r in self._meta.related_objects
        ]
        for related_field, related_model in related_models:
            # Skip the ManyToMany fields that aren’t auto-created. These
            # should have a corresponding OneToMany field in the model for
            # the linking table anyway. If we update it through that model
            # instead then we won’t lose the extra fields in the linking
            # table.
            related_field_obj = related_model._meta.get_field(related_field)
            if isinstance(related_field_obj, ManyToManyField):
                through = related_field_obj.remote_field.through
                if not through._meta.auto_created:
                    continue
            related_objs = related_model.objects.filter(**{related_field: self})
            for related_obj in related_objs:
                try:
                    setattr(related_obj, related_field, canonical)
                    related_obj.save()
                except TypeError:
                    # the attribute can't be assigned directly (presumably
                    # the auto-created many-to-many relations let through
                    # above), so swap the members of the relation instead
                    getattr(related_obj, related_field).add(canonical)
                    getattr(related_obj, related_field).remove(self)

        self.delete()
|
||||||
|
|
||||||
|
def absorb_data_from(self, other: Self) -> None:
    """take over values from `other` for every field that is empty here

    Only fields that carry an `activitypub_field` attribute are considered,
    and a value that is already set on this entity is never overwritten.
    """
    shared_fields = (
        candidate
        for candidate in self._meta.get_fields()
        if hasattr(candidate, "activitypub_field")
    )
    for candidate in shared_fields:
        incoming = getattr(other, candidate.name)
        # nothing to take over, or we already have a value of our own
        if incoming and not getattr(self, candidate.name):
            setattr(self, candidate.name, incoming)
|
||||||
|
|
||||||
|
|
||||||
|
class MergedBookDataModel(models.Model):
    """record of a BookDataModel instance that was merged into another
    instance; retained so that old URLs can still be redirected"""

    class Meta:
        """abstract, the same as BookDataModel"""

        abstract = True

    # id of the instance that was merged away; doubles as the primary key
    deleted_id = models.IntegerField(primary_key=True)
|
||||||
|
|
||||||
|
|
||||||
|
class MergedBook(MergedBookDataModel):
    """a Book that has been merged into another one"""

    # the Book that absorbed the merged one
    merged_into = models.ForeignKey(
        to="Book",
        related_name="absorbed",
        on_delete=models.PROTECT,
    )
|
||||||
|
|
||||||
|
|
||||||
|
class MergedAuthor(MergedBookDataModel):
    """an Author that has been merged into another one"""

    # the Author that absorbed the merged one
    merged_into = models.ForeignKey(
        to="Author",
        related_name="absorbed",
        on_delete=models.PROTECT,
    )
|
||||||
|
|
||||||
|
|
||||||
class Book(BookDataModel):
|
class Book(BookDataModel):
|
||||||
"""a generic book, which can mean either an edition or a work"""
|
"""a generic book, which can mean either an edition or a work"""
|
||||||
|
|
||||||
|
merged_model = MergedBook
|
||||||
|
|
||||||
connector = models.ForeignKey("Connector", on_delete=models.PROTECT, null=True)
|
connector = models.ForeignKey("Connector", on_delete=models.PROTECT, null=True)
|
||||||
|
|
||||||
# book/work metadata
|
# book/work metadata
|
||||||
|
@ -456,34 +536,6 @@ class Edition(Book):
|
||||||
return queryset
|
return queryset
|
||||||
|
|
||||||
|
|
||||||
class MergedBookDataModel(models.Model):
|
|
||||||
"""a BookDataModel instance that has been merged into another instance. kept
|
|
||||||
to be able to redirect old URLs"""
|
|
||||||
|
|
||||||
deleted_id = models.IntegerField(primary_key=True)
|
|
||||||
|
|
||||||
class Meta:
|
|
||||||
"""abstract just like BookDataModel"""
|
|
||||||
|
|
||||||
abstract = True
|
|
||||||
|
|
||||||
|
|
||||||
class MergedAuthor(MergedBookDataModel):
|
|
||||||
"""an Author that has been merged into another one"""
|
|
||||||
|
|
||||||
merged_into = models.ForeignKey(
|
|
||||||
"Author", on_delete=models.PROTECT, related_name="absorbed"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
class MergedBook(MergedBookDataModel):
|
|
||||||
"""an Book that has been merged into another one"""
|
|
||||||
|
|
||||||
merged_into = models.ForeignKey(
|
|
||||||
"Book", on_delete=models.PROTECT, related_name="absorbed"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def isbn_10_to_13(isbn_10):
|
def isbn_10_to_13(isbn_10):
|
||||||
"""convert an isbn 10 into an isbn 13"""
|
"""convert an isbn 10 into an isbn 13"""
|
||||||
isbn_10 = re.sub(r"[^0-9X]", "", isbn_10)
|
isbn_10 = re.sub(r"[^0-9X]", "", isbn_10)
|
||||||
|
|
Loading…
Reference in a new issue