Adds deduplication fields

This commit is contained in:
Mouse Reeve 2020-12-12 13:39:55 -08:00
parent 31a407d74a
commit 7c43fa1f7c
10 changed files with 174 additions and 73 deletions

View file

@ -78,11 +78,11 @@ class ActivityObject:
# check for an existing instance, if we're not updating a known obj # check for an existing instance, if we're not updating a known obj
if not instance: if not instance:
instance = find_existing_by_remote_id(model, self.id) or model() instance = model.find_existing(self.serialize()) or model()
# TODO: deduplicate books by identifiers
many_to_many_fields = {} many_to_many_fields = {}
for field in model._meta.get_fields(): for field in model._meta.get_fields():
# check if it's an activitypub field
if not hasattr(field, 'field_to_activity'): if not hasattr(field, 'field_to_activity'):
continue continue
# call the formatter associated with the model field class # call the formatter associated with the model field class
@ -167,34 +167,25 @@ def set_related_field(
if isinstance(data, str): if isinstance(data, str):
item = resolve_remote_id(model, data, save=False) item = resolve_remote_id(model, data, save=False)
else: else:
# look for a match based on all the available data
item = model.find_existing(data)
if not item:
# create a new model instance
item = model.activity_serializer(**data) item = model.activity_serializer(**data)
item = item.to_model(model, save=False) item = item.to_model(model, save=False)
instance = find_existing_by_remote_id(origin_model, related_remote_id) # this must exist because it's the object that triggered this function
instance = origin_model.find_existing_by_remote_id(related_remote_id)
if not instance:
raise ValueError('Invalid related remote id: %s' % related_remote_id)
# edition.parent_work = instance, for example
setattr(item, related_field_name, instance) setattr(item, related_field_name, instance)
item.save() item.save()
def find_existing_by_remote_id(model, remote_id):
''' check for an existing instance of this id in the db '''
objects = model.objects
if hasattr(model.objects, 'select_subclasses'):
objects = objects.select_subclasses()
# first, check for an existing copy in the database
result = objects.filter(
remote_id=remote_id
).first()
if not result and hasattr(model, 'origin_id'):
result = objects.filter(
origin_id=remote_id
).first()
return result
def resolve_remote_id(model, remote_id, refresh=False, save=True): def resolve_remote_id(model, remote_id, refresh=False, save=True):
''' look up the remote_id in the database or load it remotely ''' ''' take a remote_id and return an instance, creating if necessary '''
result = find_existing_by_remote_id(model, remote_id) result = model.find_existing_by_remote_id(remote_id)
if result and not refresh: if result and not refresh:
return result return result
@ -206,6 +197,11 @@ def resolve_remote_id(model, remote_id, refresh=False, save=True):
'Could not connect to host for remote_id in %s model: %s' % \ 'Could not connect to host for remote_id in %s model: %s' % \
(model.__name__, remote_id)) (model.__name__, remote_id))
# check for existing items with shared unique identifiers
item = model.find_existing(data)
if item:
return item
item = model.activity_serializer(**data) item = model.activity_serializer(**data)
# if we're refreshing, "result" will be set and we'll update it # if we're refreshing, "result" will be set and we'll update it
return item.to_model(model, instance=result, save=save) return item.to_model(model, instance=result, save=save)

View file

@ -4,5 +4,5 @@ from bookwyrm import models
def site_settings(request): def site_settings(request):
''' include the custom info about the site ''' ''' include the custom info about the site '''
return { return {
'site': models.SiteSettings.objects.get() 'site': models.SiteSettings.objects.first()
} }

View file

@ -12,11 +12,11 @@ from . import fields
class Author(ActivitypubMixin, BookWyrmModel): class Author(ActivitypubMixin, BookWyrmModel):
''' basic biographic info ''' ''' basic biographic info '''
origin_id = models.CharField(max_length=255, null=True) origin_id = models.CharField(max_length=255, null=True)
''' copy of an author from OL ''' openlibrary_key = fields.CharField(
openlibrary_key = fields.CharField(max_length=255, blank=True, null=True) max_length=255, blank=True, null=True, deduplication_field=True)
sync = models.BooleanField(default=True) sync = models.BooleanField(default=True)
last_sync_date = models.DateTimeField(default=timezone.now) last_sync_date = models.DateTimeField(default=timezone.now)
wikipedia_link = fields.CharField(max_length=255, blank=True, null=True) wikipedia_link = fields.CharField(max_length=255, blank=True, null=True, deduplication_field=True)
# idk probably other keys would be useful here? # idk probably other keys would be useful here?
born = fields.DateTimeField(blank=True, null=True) born = fields.DateTimeField(blank=True, null=True)
died = fields.DateTimeField(blank=True, null=True) died = fields.DateTimeField(blank=True, null=True)

View file

@ -1,5 +1,7 @@
''' base model with default fields ''' ''' base model with default fields '''
from base64 import b64encode from base64 import b64encode
from functools import reduce
import operator
from uuid import uuid4 from uuid import uuid4
from Crypto.PublicKey import RSA from Crypto.PublicKey import RSA
@ -7,6 +9,7 @@ from Crypto.Signature import pkcs1_15
from Crypto.Hash import SHA256 from Crypto.Hash import SHA256
from django.core.paginator import Paginator from django.core.paginator import Paginator
from django.db import models from django.db import models
from django.db.models import Q
from django.dispatch import receiver from django.dispatch import receiver
from bookwyrm import activitypub from bookwyrm import activitypub
@ -64,6 +67,50 @@ class ActivitypubMixin:
activity_serializer = lambda: {} activity_serializer = lambda: {}
reverse_unfurl = False reverse_unfurl = False
@classmethod
def find_existing_by_remote_id(cls, remote_id):
''' look up a remote id in the db '''
return cls.find_existing({'id': remote_id})
@classmethod
def find_existing(cls, data):
''' compare data to fields that can be used for deduplation.
This always includes remote_id, but can also be unique identifiers
like an isbn for an edition '''
filters = []
for field in cls._meta.get_fields():
if not hasattr(field, 'deduplication_field') or \
not field.deduplication_field:
continue
value = data.get(field.activitypub_field)
if not value:
continue
filters.append({field.name: value})
if hasattr(cls, 'origin_id') and 'id' in data:
# kinda janky, but this handles special case for books
filters.append({'origin_id': data['id']})
if not filters:
# if there are no deduplication fields, it will match the first
# item no matter what. this shouldn't happen but just in case.
return None
objects = cls.objects
if hasattr(objects, 'select_subclasses'):
objects = objects.select_subclasses()
# an OR operation on all the match fields
match = objects.filter(
reduce(
operator.or_, (Q(**f) for f in filters)
)
)
# there OUGHT to be only one match
return match.first()
def to_activity(self): def to_activity(self):
''' convert from a model to an activity ''' ''' convert from a model to an activity '''
activity = {} activity = {}

View file

@ -16,9 +16,12 @@ class Book(ActivitypubMixin, BookWyrmModel):
''' a generic book, which can mean either an edition or a work ''' ''' a generic book, which can mean either an edition or a work '''
origin_id = models.CharField(max_length=255, null=True, blank=True) origin_id = models.CharField(max_length=255, null=True, blank=True)
# these identifiers apply to both works and editions # these identifiers apply to both works and editions
openlibrary_key = fields.CharField(max_length=255, blank=True, null=True) openlibrary_key = fields.CharField(
librarything_key = fields.CharField(max_length=255, blank=True, null=True) max_length=255, blank=True, null=True, deduplication_field=True)
goodreads_key = fields.CharField(max_length=255, blank=True, null=True) librarything_key = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True)
goodreads_key = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True)
# info about where the data comes from and where/if to sync # info about where the data comes from and where/if to sync
sync = models.BooleanField(default=True) sync = models.BooleanField(default=True)
@ -83,7 +86,8 @@ class Book(ActivitypubMixin, BookWyrmModel):
class Work(OrderedCollectionPageMixin, Book): class Work(OrderedCollectionPageMixin, Book):
''' a work (an abstract concept of a book that manifests in an edition) ''' ''' a work (an abstract concept of a book that manifests in an edition) '''
# library of congress catalog control number # library of congress catalog control number
lccn = fields.CharField(max_length=255, blank=True, null=True) lccn = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True)
# this has to be nullable but should never be null # this has to be nullable but should never be null
default_edition = fields.ForeignKey( default_edition = fields.ForeignKey(
'Edition', 'Edition',
@ -103,10 +107,14 @@ class Work(OrderedCollectionPageMixin, Book):
class Edition(Book): class Edition(Book):
''' an edition of a book ''' ''' an edition of a book '''
# these identifiers only apply to editions, not works # these identifiers only apply to editions, not works
isbn_10 = fields.CharField(max_length=255, blank=True, null=True) isbn_10 = fields.CharField(
isbn_13 = fields.CharField(max_length=255, blank=True, null=True) max_length=255, blank=True, null=True, deduplication_field=True)
oclc_number = fields.CharField(max_length=255, blank=True, null=True) isbn_13 = fields.CharField(
asin = fields.CharField(max_length=255, blank=True, null=True) max_length=255, blank=True, null=True, deduplication_field=True)
oclc_number = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True)
asin = fields.CharField(
max_length=255, blank=True, null=True, deduplication_field=True)
pages = fields.IntegerField(blank=True, null=True) pages = fields.IntegerField(blank=True, null=True)
physical_format = fields.CharField(max_length=255, blank=True, null=True) physical_format = fields.CharField(max_length=255, blank=True, null=True)
publishers = fields.ArrayField( publishers = fields.ArrayField(

View file

@ -28,7 +28,9 @@ def validate_remote_id(value):
class ActivitypubFieldMixin: class ActivitypubFieldMixin:
''' make a database field serializable ''' ''' make a database field serializable '''
def __init__(self, *args, \ def __init__(self, *args, \
activitypub_field=None, activitypub_wrapper=None, **kwargs): activitypub_field=None, activitypub_wrapper=None,
deduplication_field=False, **kwargs):
self.deduplication_field = deduplication_field
if activitypub_wrapper: if activitypub_wrapper:
self.activitypub_wrapper = activitypub_field self.activitypub_wrapper = activitypub_field
self.activitypub_field = activitypub_wrapper self.activitypub_field = activitypub_wrapper
@ -86,6 +88,8 @@ class RemoteIdField(ActivitypubFieldMixin, models.CharField):
*args, max_length=max_length, validators=validators, *args, max_length=max_length, validators=validators,
**kwargs **kwargs
) )
# for this field, the default is true. false everywhere else.
self.deduplication_field = kwargs.get('deduplication_field', True)
class UsernameField(ActivitypubFieldMixin, models.CharField): class UsernameField(ActivitypubFieldMixin, models.CharField):

View file

@ -32,7 +32,9 @@ class User(OrderedCollectionPageMixin, AbstractUser):
inbox = fields.RemoteIdField(unique=True) inbox = fields.RemoteIdField(unique=True)
shared_inbox = fields.RemoteIdField( shared_inbox = fields.RemoteIdField(
activitypub_field='sharedInbox', activitypub_field='sharedInbox',
activitypub_wrapper='endpoints', null=True) activitypub_wrapper='endpoints',
deduplication_field=False,
null=True)
federated_server = models.ForeignKey( federated_server = models.ForeignKey(
'FederatedServer', 'FederatedServer',
on_delete=models.PROTECT, on_delete=models.PROTECT,

View file

@ -11,7 +11,7 @@ import responses
from bookwyrm import activitypub from bookwyrm import activitypub
from bookwyrm.activitypub.base_activity import ActivityObject, \ from bookwyrm.activitypub.base_activity import ActivityObject, \
find_existing_by_remote_id, resolve_remote_id, set_related_field resolve_remote_id, set_related_field
from bookwyrm.activitypub import ActivitySerializerError from bookwyrm.activitypub import ActivitySerializerError
from bookwyrm import models from bookwyrm import models
@ -77,34 +77,6 @@ class BaseActivity(TestCase):
self.assertEqual(serialized['id'], 'a') self.assertEqual(serialized['id'], 'a')
self.assertEqual(serialized['type'], 'b') self.assertEqual(serialized['type'], 'b')
def test_find_existing_by_remote_id(self):
''' attempt to match a remote id to an object in the db '''
# uses a different remote id scheme
# this isn't really part of this test directly but it's helpful to state
self.assertEqual(self.book.origin_id, 'http://book.com/book')
self.assertNotEqual(self.book.remote_id, 'http://book.com/book')
# uses subclasses
models.Comment.objects.create(
user=self.user, content='test status', book=self.book, \
remote_id='https://comment.net')
result = find_existing_by_remote_id(models.User, 'hi')
self.assertIsNone(result)
result = find_existing_by_remote_id(
models.User, 'http://example.com/a/b')
self.assertEqual(result, self.user)
# test using origin id
result = find_existing_by_remote_id(
models.Edition, 'http://book.com/book')
self.assertEqual(result, self.book)
# test subclass match
result = find_existing_by_remote_id(
models.Status, 'https://comment.net')
@responses.activate @responses.activate
def test_resolve_remote_id(self): def test_resolve_remote_id(self):
''' look up or load remote data ''' ''' look up or load remote data '''

View file

@ -11,13 +11,16 @@ from bookwyrm.models.base_model import ActivitypubMixin
from bookwyrm.settings import DOMAIN from bookwyrm.settings import DOMAIN
class BaseModel(TestCase): class BaseModel(TestCase):
''' functionality shared across models '''
def test_remote_id(self): def test_remote_id(self):
''' these should be generated '''
instance = base_model.BookWyrmModel() instance = base_model.BookWyrmModel()
instance.id = 1 instance.id = 1
expected = instance.get_remote_id() expected = instance.get_remote_id()
self.assertEqual(expected, 'https://%s/bookwyrmmodel/1' % DOMAIN) self.assertEqual(expected, 'https://%s/bookwyrmmodel/1' % DOMAIN)
def test_remote_id_with_user(self): def test_remote_id_with_user(self):
''' format of remote id when there's a user object '''
user = models.User.objects.create_user( user = models.User.objects.create_user(
'mouse', 'mouse@mouse.com', 'mouseword', local=True) 'mouse', 'mouse@mouse.com', 'mouseword', local=True)
instance = base_model.BookWyrmModel() instance = base_model.BookWyrmModel()
@ -46,6 +49,7 @@ class BaseModel(TestCase):
self.assertIsNone(instance.remote_id) self.assertIsNone(instance.remote_id)
def test_to_create_activity(self): def test_to_create_activity(self):
''' wrapper for ActivityPub "create" action '''
user = models.User.objects.create_user( user = models.User.objects.create_user(
'mouse', 'mouse@mouse.com', 'mouseword', local=True) 'mouse', 'mouse@mouse.com', 'mouseword', local=True)
@ -75,6 +79,7 @@ class BaseModel(TestCase):
) )
def test_to_delete_activity(self): def test_to_delete_activity(self):
''' wrapper for Delete activity '''
user = models.User.objects.create_user( user = models.User.objects.create_user(
'mouse', 'mouse@mouse.com', 'mouseword', local=True) 'mouse', 'mouse@mouse.com', 'mouseword', local=True)
@ -98,6 +103,7 @@ class BaseModel(TestCase):
['https://www.w3.org/ns/activitystreams#Public']) ['https://www.w3.org/ns/activitystreams#Public'])
def test_to_update_activity(self): def test_to_update_activity(self):
''' ditto above but for Update '''
user = models.User.objects.create_user( user = models.User.objects.create_user(
'mouse', 'mouse@mouse.com', 'mouseword', local=True) 'mouse', 'mouse@mouse.com', 'mouseword', local=True)
@ -121,6 +127,7 @@ class BaseModel(TestCase):
self.assertEqual(activity['object'], {}) self.assertEqual(activity['object'], {})
def test_to_undo_activity(self): def test_to_undo_activity(self):
''' and again, for Undo '''
user = models.User.objects.create_user( user = models.User.objects.create_user(
'mouse', 'mouse@mouse.com', 'mouseword', local=True) 'mouse', 'mouse@mouse.com', 'mouseword', local=True)
@ -140,12 +147,14 @@ class BaseModel(TestCase):
def test_to_activity(self): def test_to_activity(self):
''' model to ActivityPub json '''
@dataclass(init=False) @dataclass(init=False)
class TestActivity(ActivityObject): class TestActivity(ActivityObject):
''' real simple mock '''
type: str = 'Test' type: str = 'Test'
class TestModel(ActivitypubMixin, base_model.BookWyrmModel): class TestModel(ActivitypubMixin, base_model.BookWyrmModel):
pass ''' real simple mock model because BookWyrmModel is abstract '''
instance = TestModel() instance = TestModel()
instance.remote_id = 'https://www.example.com/test' instance.remote_id = 'https://www.example.com/test'
@ -155,3 +164,32 @@ class BaseModel(TestCase):
self.assertIsInstance(activity, dict) self.assertIsInstance(activity, dict)
self.assertEqual(activity['id'], 'https://www.example.com/test') self.assertEqual(activity['id'], 'https://www.example.com/test')
self.assertEqual(activity['type'], 'Test') self.assertEqual(activity['type'], 'Test')
def test_find_existing_by_remote_id(self):
''' attempt to match a remote id to an object in the db '''
# uses a different remote id scheme
# this isn't really part of this test directly but it's helpful to state
self.assertEqual(self.book.origin_id, 'http://book.com/book')
self.assertNotEqual(self.book.remote_id, 'http://book.com/book')
# uses subclasses
models.Comment.objects.create(
user=self.user, content='test status', book=self.book, \
remote_id='https://comment.net')
result = models.User.find_existing_by_remote_id('hi')
self.assertIsNone(result)
result = models.User.find_existing_by_remote_id(
'http://example.com/a/b')
self.assertEqual(result, self.user)
# test using origin id
result = models.Edition.find_existing_by_remote_id(
'http://book.com/book')
self.assertEqual(result, self.book)
# test subclass match
result = models.Status.find_existing_by_remote_id(
'https://comment.net')

View file

@ -48,6 +48,7 @@ class ActivitypubFields(TestCase):
instance = fields.ActivitypubFieldMixin() instance = fields.ActivitypubFieldMixin()
self.assertEqual(instance.field_to_activity('fish'), 'fish') self.assertEqual(instance.field_to_activity('fish'), 'fish')
self.assertEqual(instance.field_from_activity('fish'), 'fish') self.assertEqual(instance.field_from_activity('fish'), 'fish')
self.assertFalse(instance.deduplication_field)
instance = fields.ActivitypubFieldMixin( instance = fields.ActivitypubFieldMixin(
activitypub_wrapper='endpoints', activitypub_field='outbox' activitypub_wrapper='endpoints', activitypub_field='outbox'
@ -70,6 +71,7 @@ class ActivitypubFields(TestCase):
''' just sets some defaults on charfield ''' ''' just sets some defaults on charfield '''
instance = fields.RemoteIdField() instance = fields.RemoteIdField()
self.assertEqual(instance.max_length, 255) self.assertEqual(instance.max_length, 255)
self.assertTrue(instance.deduplication_field)
with self.assertRaises(ValidationError): with self.assertRaises(ValidationError):
instance.run_validators('http://www.example.com/dlfjg 23/x') instance.run_validators('http://www.example.com/dlfjg 23/x')
@ -97,7 +99,7 @@ class ActivitypubFields(TestCase):
self.assertEqual(instance.field_to_activity(item), 'https://e.b/c') self.assertEqual(instance.field_to_activity(item), 'https://e.b/c')
@responses.activate @responses.activate
def test_foreign_key_from_activity(self): def test_foreign_key_from_activity_str(self):
''' this is the important stuff ''' ''' this is the important stuff '''
instance = fields.ForeignKey(User, on_delete=models.CASCADE) instance = fields.ForeignKey(User, on_delete=models.CASCADE)
@ -117,15 +119,47 @@ class ActivitypubFields(TestCase):
with patch('bookwyrm.models.user.set_remote_server.delay'): with patch('bookwyrm.models.user.set_remote_server.delay'):
value = instance.field_from_activity( value = instance.field_from_activity(
'https://example.com/user/mouse') 'https://example.com/user/mouse')
self.assertIsInstance(value, User)
self.assertEqual(value.remote_id, 'https://example.com/user/mouse')
self.assertEqual(value.name, 'MOUSE?? MOUSE!!')
# test recieving activity json
def test_foreign_key_from_activity_dict(self):
''' test recieving activity json '''
instance = fields.ForeignKey(User, on_delete=models.CASCADE)
datafile = pathlib.Path(__file__).parent.joinpath(
'../data/ap_user.json'
)
userdata = json.loads(datafile.read_bytes())
# don't try to load the user icon
del userdata['icon']
with patch('bookwyrm.models.user.set_remote_server.delay'):
value = instance.field_from_activity(userdata) value = instance.field_from_activity(userdata)
self.assertIsInstance(value, User) self.assertIsInstance(value, User)
self.assertEqual(value.remote_id, 'https://example.com/user/mouse') self.assertEqual(value.remote_id, 'https://example.com/user/mouse')
self.assertEqual(value.name, 'MOUSE?? MOUSE!!') self.assertEqual(value.name, 'MOUSE?? MOUSE!!')
# et cetera but we're not testing serializing user json # et cetera but we're not testing serializing user json
# test receiving a remote id of an object in the db
def test_foreign_key_from_activity_dict_existing(self):
''' test receiving a dict of an existing object in the db '''
instance = fields.ForeignKey(User, on_delete=models.CASCADE)
datafile = pathlib.Path(__file__).parent.joinpath(
'../data/ap_user.json'
)
userdata = json.loads(datafile.read_bytes())
user = User.objects.create_user(
'mouse', 'mouse@mouse.mouse', 'mouseword', local=True)
user.remote_id = 'https://example.com/user/mouse'
user.save()
value = instance.field_from_activity(userdata)
self.assertEqual(value, user)
def test_foreign_key_from_activity_str_existing(self):
''' test receiving a remote id of an existing object in the db '''
instance = fields.ForeignKey(User, on_delete=models.CASCADE)
user = User.objects.create_user( user = User.objects.create_user(
'mouse', 'mouse@mouse.mouse', 'mouseword', local=True) 'mouse', 'mouse@mouse.mouse', 'mouseword', local=True)
value = instance.field_from_activity(user.remote_id) value = instance.field_from_activity(user.remote_id)