forked from mirrors/bookwyrm
Adds deduplication fields
This commit is contained in:
parent
31a407d74a
commit
7c43fa1f7c
10 changed files with 174 additions and 73 deletions
|
@ -78,11 +78,11 @@ class ActivityObject:
|
|||
|
||||
# check for an existing instance, if we're not updating a known obj
|
||||
if not instance:
|
||||
instance = find_existing_by_remote_id(model, self.id) or model()
|
||||
# TODO: deduplicate books by identifiers
|
||||
instance = model.find_existing(self.serialize()) or model()
|
||||
|
||||
many_to_many_fields = {}
|
||||
for field in model._meta.get_fields():
|
||||
# check if it's an activitypub field
|
||||
if not hasattr(field, 'field_to_activity'):
|
||||
continue
|
||||
# call the formatter associated with the model field class
|
||||
|
@ -167,34 +167,25 @@ def set_related_field(
|
|||
if isinstance(data, str):
|
||||
item = resolve_remote_id(model, data, save=False)
|
||||
else:
|
||||
item = model.activity_serializer(**data)
|
||||
item = item.to_model(model, save=False)
|
||||
instance = find_existing_by_remote_id(origin_model, related_remote_id)
|
||||
# look for a match based on all the available data
|
||||
item = model.find_existing(data)
|
||||
if not item:
|
||||
# create a new model instance
|
||||
item = model.activity_serializer(**data)
|
||||
item = item.to_model(model, save=False)
|
||||
# this must exist because it's the object that triggered this function
|
||||
instance = origin_model.find_existing_by_remote_id(related_remote_id)
|
||||
if not instance:
|
||||
raise ValueError('Invalid related remote id: %s' % related_remote_id)
|
||||
|
||||
# edition.parent_work = instance, for example
|
||||
setattr(item, related_field_name, instance)
|
||||
item.save()
|
||||
|
||||
|
||||
def find_existing_by_remote_id(model, remote_id):
|
||||
''' check for an existing instance of this id in the db '''
|
||||
objects = model.objects
|
||||
if hasattr(model.objects, 'select_subclasses'):
|
||||
objects = objects.select_subclasses()
|
||||
|
||||
# first, check for an existing copy in the database
|
||||
result = objects.filter(
|
||||
remote_id=remote_id
|
||||
).first()
|
||||
|
||||
if not result and hasattr(model, 'origin_id'):
|
||||
result = objects.filter(
|
||||
origin_id=remote_id
|
||||
).first()
|
||||
return result
|
||||
|
||||
|
||||
def resolve_remote_id(model, remote_id, refresh=False, save=True):
|
||||
''' look up the remote_id in the database or load it remotely '''
|
||||
result = find_existing_by_remote_id(model, remote_id)
|
||||
''' take a remote_id and return an instance, creating if necessary '''
|
||||
result = model.find_existing_by_remote_id(remote_id)
|
||||
if result and not refresh:
|
||||
return result
|
||||
|
||||
|
@ -206,6 +197,11 @@ def resolve_remote_id(model, remote_id, refresh=False, save=True):
|
|||
'Could not connect to host for remote_id in %s model: %s' % \
|
||||
(model.__name__, remote_id))
|
||||
|
||||
# check for existing items with shared unique identifiers
|
||||
item = model.find_existing(data)
|
||||
if item:
|
||||
return item
|
||||
|
||||
item = model.activity_serializer(**data)
|
||||
# if we're refreshing, "result" will be set and we'll update it
|
||||
return item.to_model(model, instance=result, save=save)
|
||||
|
|
|
@ -4,5 +4,5 @@ from bookwyrm import models
|
|||
def site_settings(request):
|
||||
''' include the custom info about the site '''
|
||||
return {
|
||||
'site': models.SiteSettings.objects.get()
|
||||
'site': models.SiteSettings.objects.first()
|
||||
}
|
||||
|
|
|
@ -12,11 +12,11 @@ from . import fields
|
|||
class Author(ActivitypubMixin, BookWyrmModel):
|
||||
''' basic biographic info '''
|
||||
origin_id = models.CharField(max_length=255, null=True)
|
||||
''' copy of an author from OL '''
|
||||
openlibrary_key = fields.CharField(max_length=255, blank=True, null=True)
|
||||
openlibrary_key = fields.CharField(
|
||||
max_length=255, blank=True, null=True, deduplication_field=True)
|
||||
sync = models.BooleanField(default=True)
|
||||
last_sync_date = models.DateTimeField(default=timezone.now)
|
||||
wikipedia_link = fields.CharField(max_length=255, blank=True, null=True)
|
||||
wikipedia_link = fields.CharField(max_length=255, blank=True, null=True, deduplication_field=True)
|
||||
# idk probably other keys would be useful here?
|
||||
born = fields.DateTimeField(blank=True, null=True)
|
||||
died = fields.DateTimeField(blank=True, null=True)
|
||||
|
|
|
@ -1,5 +1,7 @@
|
|||
''' base model with default fields '''
|
||||
from base64 import b64encode
|
||||
from functools import reduce
|
||||
import operator
|
||||
from uuid import uuid4
|
||||
|
||||
from Crypto.PublicKey import RSA
|
||||
|
@ -7,6 +9,7 @@ from Crypto.Signature import pkcs1_15
|
|||
from Crypto.Hash import SHA256
|
||||
from django.core.paginator import Paginator
|
||||
from django.db import models
|
||||
from django.db.models import Q
|
||||
from django.dispatch import receiver
|
||||
|
||||
from bookwyrm import activitypub
|
||||
|
@ -64,6 +67,50 @@ class ActivitypubMixin:
|
|||
activity_serializer = lambda: {}
|
||||
reverse_unfurl = False
|
||||
|
||||
@classmethod
|
||||
def find_existing_by_remote_id(cls, remote_id):
|
||||
''' look up a remote id in the db '''
|
||||
return cls.find_existing({'id': remote_id})
|
||||
|
||||
@classmethod
|
||||
def find_existing(cls, data):
|
||||
''' compare data to fields that can be used for deduplation.
|
||||
This always includes remote_id, but can also be unique identifiers
|
||||
like an isbn for an edition '''
|
||||
filters = []
|
||||
for field in cls._meta.get_fields():
|
||||
if not hasattr(field, 'deduplication_field') or \
|
||||
not field.deduplication_field:
|
||||
continue
|
||||
|
||||
value = data.get(field.activitypub_field)
|
||||
if not value:
|
||||
continue
|
||||
filters.append({field.name: value})
|
||||
|
||||
if hasattr(cls, 'origin_id') and 'id' in data:
|
||||
# kinda janky, but this handles special case for books
|
||||
filters.append({'origin_id': data['id']})
|
||||
|
||||
if not filters:
|
||||
# if there are no deduplication fields, it will match the first
|
||||
# item no matter what. this shouldn't happen but just in case.
|
||||
return None
|
||||
|
||||
objects = cls.objects
|
||||
if hasattr(objects, 'select_subclasses'):
|
||||
objects = objects.select_subclasses()
|
||||
|
||||
# an OR operation on all the match fields
|
||||
match = objects.filter(
|
||||
reduce(
|
||||
operator.or_, (Q(**f) for f in filters)
|
||||
)
|
||||
)
|
||||
# there OUGHT to be only one match
|
||||
return match.first()
|
||||
|
||||
|
||||
def to_activity(self):
|
||||
''' convert from a model to an activity '''
|
||||
activity = {}
|
||||
|
|
|
@ -16,9 +16,12 @@ class Book(ActivitypubMixin, BookWyrmModel):
|
|||
''' a generic book, which can mean either an edition or a work '''
|
||||
origin_id = models.CharField(max_length=255, null=True, blank=True)
|
||||
# these identifiers apply to both works and editions
|
||||
openlibrary_key = fields.CharField(max_length=255, blank=True, null=True)
|
||||
librarything_key = fields.CharField(max_length=255, blank=True, null=True)
|
||||
goodreads_key = fields.CharField(max_length=255, blank=True, null=True)
|
||||
openlibrary_key = fields.CharField(
|
||||
max_length=255, blank=True, null=True, deduplication_field=True)
|
||||
librarything_key = fields.CharField(
|
||||
max_length=255, blank=True, null=True, deduplication_field=True)
|
||||
goodreads_key = fields.CharField(
|
||||
max_length=255, blank=True, null=True, deduplication_field=True)
|
||||
|
||||
# info about where the data comes from and where/if to sync
|
||||
sync = models.BooleanField(default=True)
|
||||
|
@ -83,7 +86,8 @@ class Book(ActivitypubMixin, BookWyrmModel):
|
|||
class Work(OrderedCollectionPageMixin, Book):
|
||||
''' a work (an abstract concept of a book that manifests in an edition) '''
|
||||
# library of congress catalog control number
|
||||
lccn = fields.CharField(max_length=255, blank=True, null=True)
|
||||
lccn = fields.CharField(
|
||||
max_length=255, blank=True, null=True, deduplication_field=True)
|
||||
# this has to be nullable but should never be null
|
||||
default_edition = fields.ForeignKey(
|
||||
'Edition',
|
||||
|
@ -103,10 +107,14 @@ class Work(OrderedCollectionPageMixin, Book):
|
|||
class Edition(Book):
|
||||
''' an edition of a book '''
|
||||
# these identifiers only apply to editions, not works
|
||||
isbn_10 = fields.CharField(max_length=255, blank=True, null=True)
|
||||
isbn_13 = fields.CharField(max_length=255, blank=True, null=True)
|
||||
oclc_number = fields.CharField(max_length=255, blank=True, null=True)
|
||||
asin = fields.CharField(max_length=255, blank=True, null=True)
|
||||
isbn_10 = fields.CharField(
|
||||
max_length=255, blank=True, null=True, deduplication_field=True)
|
||||
isbn_13 = fields.CharField(
|
||||
max_length=255, blank=True, null=True, deduplication_field=True)
|
||||
oclc_number = fields.CharField(
|
||||
max_length=255, blank=True, null=True, deduplication_field=True)
|
||||
asin = fields.CharField(
|
||||
max_length=255, blank=True, null=True, deduplication_field=True)
|
||||
pages = fields.IntegerField(blank=True, null=True)
|
||||
physical_format = fields.CharField(max_length=255, blank=True, null=True)
|
||||
publishers = fields.ArrayField(
|
||||
|
|
|
@ -28,7 +28,9 @@ def validate_remote_id(value):
|
|||
class ActivitypubFieldMixin:
|
||||
''' make a database field serializable '''
|
||||
def __init__(self, *args, \
|
||||
activitypub_field=None, activitypub_wrapper=None, **kwargs):
|
||||
activitypub_field=None, activitypub_wrapper=None,
|
||||
deduplication_field=False, **kwargs):
|
||||
self.deduplication_field = deduplication_field
|
||||
if activitypub_wrapper:
|
||||
self.activitypub_wrapper = activitypub_field
|
||||
self.activitypub_field = activitypub_wrapper
|
||||
|
@ -86,6 +88,8 @@ class RemoteIdField(ActivitypubFieldMixin, models.CharField):
|
|||
*args, max_length=max_length, validators=validators,
|
||||
**kwargs
|
||||
)
|
||||
# for this field, the default is true. false everywhere else.
|
||||
self.deduplication_field = kwargs.get('deduplication_field', True)
|
||||
|
||||
|
||||
class UsernameField(ActivitypubFieldMixin, models.CharField):
|
||||
|
|
|
@ -32,7 +32,9 @@ class User(OrderedCollectionPageMixin, AbstractUser):
|
|||
inbox = fields.RemoteIdField(unique=True)
|
||||
shared_inbox = fields.RemoteIdField(
|
||||
activitypub_field='sharedInbox',
|
||||
activitypub_wrapper='endpoints', null=True)
|
||||
activitypub_wrapper='endpoints',
|
||||
deduplication_field=False,
|
||||
null=True)
|
||||
federated_server = models.ForeignKey(
|
||||
'FederatedServer',
|
||||
on_delete=models.PROTECT,
|
||||
|
|
|
@ -11,7 +11,7 @@ import responses
|
|||
|
||||
from bookwyrm import activitypub
|
||||
from bookwyrm.activitypub.base_activity import ActivityObject, \
|
||||
find_existing_by_remote_id, resolve_remote_id, set_related_field
|
||||
resolve_remote_id, set_related_field
|
||||
from bookwyrm.activitypub import ActivitySerializerError
|
||||
from bookwyrm import models
|
||||
|
||||
|
@ -77,34 +77,6 @@ class BaseActivity(TestCase):
|
|||
self.assertEqual(serialized['id'], 'a')
|
||||
self.assertEqual(serialized['type'], 'b')
|
||||
|
||||
def test_find_existing_by_remote_id(self):
|
||||
''' attempt to match a remote id to an object in the db '''
|
||||
# uses a different remote id scheme
|
||||
# this isn't really part of this test directly but it's helpful to state
|
||||
self.assertEqual(self.book.origin_id, 'http://book.com/book')
|
||||
self.assertNotEqual(self.book.remote_id, 'http://book.com/book')
|
||||
|
||||
# uses subclasses
|
||||
models.Comment.objects.create(
|
||||
user=self.user, content='test status', book=self.book, \
|
||||
remote_id='https://comment.net')
|
||||
|
||||
result = find_existing_by_remote_id(models.User, 'hi')
|
||||
self.assertIsNone(result)
|
||||
|
||||
result = find_existing_by_remote_id(
|
||||
models.User, 'http://example.com/a/b')
|
||||
self.assertEqual(result, self.user)
|
||||
|
||||
# test using origin id
|
||||
result = find_existing_by_remote_id(
|
||||
models.Edition, 'http://book.com/book')
|
||||
self.assertEqual(result, self.book)
|
||||
|
||||
# test subclass match
|
||||
result = find_existing_by_remote_id(
|
||||
models.Status, 'https://comment.net')
|
||||
|
||||
@responses.activate
|
||||
def test_resolve_remote_id(self):
|
||||
''' look up or load remote data '''
|
||||
|
|
|
@ -11,13 +11,16 @@ from bookwyrm.models.base_model import ActivitypubMixin
|
|||
from bookwyrm.settings import DOMAIN
|
||||
|
||||
class BaseModel(TestCase):
|
||||
''' functionality shared across models '''
|
||||
def test_remote_id(self):
|
||||
''' these should be generated '''
|
||||
instance = base_model.BookWyrmModel()
|
||||
instance.id = 1
|
||||
expected = instance.get_remote_id()
|
||||
self.assertEqual(expected, 'https://%s/bookwyrmmodel/1' % DOMAIN)
|
||||
|
||||
def test_remote_id_with_user(self):
|
||||
''' format of remote id when there's a user object '''
|
||||
user = models.User.objects.create_user(
|
||||
'mouse', 'mouse@mouse.com', 'mouseword', local=True)
|
||||
instance = base_model.BookWyrmModel()
|
||||
|
@ -46,6 +49,7 @@ class BaseModel(TestCase):
|
|||
self.assertIsNone(instance.remote_id)
|
||||
|
||||
def test_to_create_activity(self):
|
||||
''' wrapper for ActivityPub "create" action '''
|
||||
user = models.User.objects.create_user(
|
||||
'mouse', 'mouse@mouse.com', 'mouseword', local=True)
|
||||
|
||||
|
@ -75,6 +79,7 @@ class BaseModel(TestCase):
|
|||
)
|
||||
|
||||
def test_to_delete_activity(self):
|
||||
''' wrapper for Delete activity '''
|
||||
user = models.User.objects.create_user(
|
||||
'mouse', 'mouse@mouse.com', 'mouseword', local=True)
|
||||
|
||||
|
@ -98,6 +103,7 @@ class BaseModel(TestCase):
|
|||
['https://www.w3.org/ns/activitystreams#Public'])
|
||||
|
||||
def test_to_update_activity(self):
|
||||
''' ditto above but for Update '''
|
||||
user = models.User.objects.create_user(
|
||||
'mouse', 'mouse@mouse.com', 'mouseword', local=True)
|
||||
|
||||
|
@ -121,6 +127,7 @@ class BaseModel(TestCase):
|
|||
self.assertEqual(activity['object'], {})
|
||||
|
||||
def test_to_undo_activity(self):
|
||||
''' and again, for Undo '''
|
||||
user = models.User.objects.create_user(
|
||||
'mouse', 'mouse@mouse.com', 'mouseword', local=True)
|
||||
|
||||
|
@ -140,12 +147,14 @@ class BaseModel(TestCase):
|
|||
|
||||
|
||||
def test_to_activity(self):
|
||||
''' model to ActivityPub json '''
|
||||
@dataclass(init=False)
|
||||
class TestActivity(ActivityObject):
|
||||
''' real simple mock '''
|
||||
type: str = 'Test'
|
||||
|
||||
class TestModel(ActivitypubMixin, base_model.BookWyrmModel):
|
||||
pass
|
||||
''' real simple mock model because BookWyrmModel is abstract '''
|
||||
|
||||
instance = TestModel()
|
||||
instance.remote_id = 'https://www.example.com/test'
|
||||
|
@ -155,3 +164,32 @@ class BaseModel(TestCase):
|
|||
self.assertIsInstance(activity, dict)
|
||||
self.assertEqual(activity['id'], 'https://www.example.com/test')
|
||||
self.assertEqual(activity['type'], 'Test')
|
||||
|
||||
|
||||
def test_find_existing_by_remote_id(self):
|
||||
''' attempt to match a remote id to an object in the db '''
|
||||
# uses a different remote id scheme
|
||||
# this isn't really part of this test directly but it's helpful to state
|
||||
self.assertEqual(self.book.origin_id, 'http://book.com/book')
|
||||
self.assertNotEqual(self.book.remote_id, 'http://book.com/book')
|
||||
|
||||
# uses subclasses
|
||||
models.Comment.objects.create(
|
||||
user=self.user, content='test status', book=self.book, \
|
||||
remote_id='https://comment.net')
|
||||
|
||||
result = models.User.find_existing_by_remote_id('hi')
|
||||
self.assertIsNone(result)
|
||||
|
||||
result = models.User.find_existing_by_remote_id(
|
||||
'http://example.com/a/b')
|
||||
self.assertEqual(result, self.user)
|
||||
|
||||
# test using origin id
|
||||
result = models.Edition.find_existing_by_remote_id(
|
||||
'http://book.com/book')
|
||||
self.assertEqual(result, self.book)
|
||||
|
||||
# test subclass match
|
||||
result = models.Status.find_existing_by_remote_id(
|
||||
'https://comment.net')
|
||||
|
|
|
@ -48,6 +48,7 @@ class ActivitypubFields(TestCase):
|
|||
instance = fields.ActivitypubFieldMixin()
|
||||
self.assertEqual(instance.field_to_activity('fish'), 'fish')
|
||||
self.assertEqual(instance.field_from_activity('fish'), 'fish')
|
||||
self.assertFalse(instance.deduplication_field)
|
||||
|
||||
instance = fields.ActivitypubFieldMixin(
|
||||
activitypub_wrapper='endpoints', activitypub_field='outbox'
|
||||
|
@ -70,6 +71,7 @@ class ActivitypubFields(TestCase):
|
|||
''' just sets some defaults on charfield '''
|
||||
instance = fields.RemoteIdField()
|
||||
self.assertEqual(instance.max_length, 255)
|
||||
self.assertTrue(instance.deduplication_field)
|
||||
|
||||
with self.assertRaises(ValidationError):
|
||||
instance.run_validators('http://www.example.com/dlfjg 23/x')
|
||||
|
@ -97,7 +99,7 @@ class ActivitypubFields(TestCase):
|
|||
self.assertEqual(instance.field_to_activity(item), 'https://e.b/c')
|
||||
|
||||
@responses.activate
|
||||
def test_foreign_key_from_activity(self):
|
||||
def test_foreign_key_from_activity_str(self):
|
||||
''' this is the important stuff '''
|
||||
instance = fields.ForeignKey(User, on_delete=models.CASCADE)
|
||||
|
||||
|
@ -117,15 +119,47 @@ class ActivitypubFields(TestCase):
|
|||
with patch('bookwyrm.models.user.set_remote_server.delay'):
|
||||
value = instance.field_from_activity(
|
||||
'https://example.com/user/mouse')
|
||||
self.assertIsInstance(value, User)
|
||||
self.assertEqual(value.remote_id, 'https://example.com/user/mouse')
|
||||
self.assertEqual(value.name, 'MOUSE?? MOUSE!!')
|
||||
|
||||
# test recieving activity json
|
||||
value = instance.field_from_activity(userdata)
|
||||
|
||||
def test_foreign_key_from_activity_dict(self):
|
||||
''' test recieving activity json '''
|
||||
instance = fields.ForeignKey(User, on_delete=models.CASCADE)
|
||||
|
||||
datafile = pathlib.Path(__file__).parent.joinpath(
|
||||
'../data/ap_user.json'
|
||||
)
|
||||
userdata = json.loads(datafile.read_bytes())
|
||||
# don't try to load the user icon
|
||||
del userdata['icon']
|
||||
with patch('bookwyrm.models.user.set_remote_server.delay'):
|
||||
value = instance.field_from_activity(userdata)
|
||||
self.assertIsInstance(value, User)
|
||||
self.assertEqual(value.remote_id, 'https://example.com/user/mouse')
|
||||
self.assertEqual(value.name, 'MOUSE?? MOUSE!!')
|
||||
# et cetera but we're not testing serializing user json
|
||||
|
||||
# test receiving a remote id of an object in the db
|
||||
|
||||
def test_foreign_key_from_activity_dict_existing(self):
|
||||
''' test receiving a dict of an existing object in the db '''
|
||||
instance = fields.ForeignKey(User, on_delete=models.CASCADE)
|
||||
datafile = pathlib.Path(__file__).parent.joinpath(
|
||||
'../data/ap_user.json'
|
||||
)
|
||||
userdata = json.loads(datafile.read_bytes())
|
||||
user = User.objects.create_user(
|
||||
'mouse', 'mouse@mouse.mouse', 'mouseword', local=True)
|
||||
user.remote_id = 'https://example.com/user/mouse'
|
||||
user.save()
|
||||
value = instance.field_from_activity(userdata)
|
||||
self.assertEqual(value, user)
|
||||
|
||||
|
||||
def test_foreign_key_from_activity_str_existing(self):
|
||||
''' test receiving a remote id of an existing object in the db '''
|
||||
instance = fields.ForeignKey(User, on_delete=models.CASCADE)
|
||||
user = User.objects.create_user(
|
||||
'mouse', 'mouse@mouse.mouse', 'mouseword', local=True)
|
||||
value = instance.field_from_activity(user.remote_id)
|
||||
|
|
Loading…
Reference in a new issue