diff --git a/bookwyrm/activitypub/base_activity.py b/bookwyrm/activitypub/base_activity.py index c784c3c1..585f2449 100644 --- a/bookwyrm/activitypub/base_activity.py +++ b/bookwyrm/activitypub/base_activity.py @@ -78,11 +78,11 @@ class ActivityObject: # check for an existing instance, if we're not updating a known obj if not instance: - instance = find_existing_by_remote_id(model, self.id) or model() - # TODO: deduplicate books by identifiers + instance = model.find_existing(self.serialize()) or model() many_to_many_fields = {} for field in model._meta.get_fields(): + # check if it's an activitypub field if not hasattr(field, 'field_to_activity'): continue # call the formatter associated with the model field class @@ -167,34 +167,25 @@ def set_related_field( if isinstance(data, str): item = resolve_remote_id(model, data, save=False) else: - item = model.activity_serializer(**data) - item = item.to_model(model, save=False) - instance = find_existing_by_remote_id(origin_model, related_remote_id) + # look for a match based on all the available data + item = model.find_existing(data) + if not item: + # create a new model instance + item = model.activity_serializer(**data) + item = item.to_model(model, save=False) + # this must exist because it's the object that triggered this function + instance = origin_model.find_existing_by_remote_id(related_remote_id) + if not instance: + raise ValueError('Invalid related remote id: %s' % related_remote_id) + + # edition.parent_work = instance, for example setattr(item, related_field_name, instance) item.save() -def find_existing_by_remote_id(model, remote_id): - ''' check for an existing instance of this id in the db ''' - objects = model.objects - if hasattr(model.objects, 'select_subclasses'): - objects = objects.select_subclasses() - - # first, check for an existing copy in the database - result = objects.filter( - remote_id=remote_id - ).first() - - if not result and hasattr(model, 'origin_id'): - result = objects.filter( - origin_id=remote_id - ).first() - return result - - def resolve_remote_id(model, remote_id, refresh=False, save=True): - ''' look up the remote_id in the database or load it remotely ''' - result = find_existing_by_remote_id(model, remote_id) + ''' take a remote_id and return an instance, creating if necessary ''' + result = model.find_existing_by_remote_id(remote_id) if result and not refresh: return result @@ -206,6 +197,11 @@ def resolve_remote_id(model, remote_id, refresh=False, save=True): 'Could not connect to host for remote_id in %s model: %s' % \ (model.__name__, remote_id)) + # check for existing items with shared unique identifiers + item = model.find_existing(data) + if item: + return item + item = model.activity_serializer(**data) # if we're refreshing, "result" will be set and we'll update it return item.to_model(model, instance=result, save=save) diff --git a/bookwyrm/context_processors.py b/bookwyrm/context_processors.py index 72839dce..8422f3c3 100644 --- a/bookwyrm/context_processors.py +++ b/bookwyrm/context_processors.py @@ -4,5 +4,5 @@ from bookwyrm import models def site_settings(request): ''' include the custom info about the site ''' return { - 'site': models.SiteSettings.objects.get() + 'site': models.SiteSettings.objects.first() } diff --git a/bookwyrm/models/author.py b/bookwyrm/models/author.py index 6098447f..79973a37 100644 --- a/bookwyrm/models/author.py +++ b/bookwyrm/models/author.py @@ -12,11 +12,11 @@ from . import fields class Author(ActivitypubMixin, BookWyrmModel): ''' basic biographic info ''' origin_id = models.CharField(max_length=255, null=True) - ''' copy of an author from OL ''' - openlibrary_key = fields.CharField(max_length=255, blank=True, null=True) + openlibrary_key = fields.CharField( + max_length=255, blank=True, null=True, deduplication_field=True) sync = models.BooleanField(default=True) last_sync_date = models.DateTimeField(default=timezone.now) - wikipedia_link = fields.CharField(max_length=255, blank=True, null=True) + wikipedia_link = fields.CharField(max_length=255, blank=True, null=True, deduplication_field=True) # idk probably other keys would be useful here? born = fields.DateTimeField(blank=True, null=True) died = fields.DateTimeField(blank=True, null=True) diff --git a/bookwyrm/models/base_model.py b/bookwyrm/models/base_model.py index 1e437152..f44797ab 100644 --- a/bookwyrm/models/base_model.py +++ b/bookwyrm/models/base_model.py @@ -1,5 +1,7 @@ ''' base model with default fields ''' from base64 import b64encode +from functools import reduce +import operator from uuid import uuid4 from Crypto.PublicKey import RSA @@ -7,6 +9,7 @@ from Crypto.Signature import pkcs1_15 from Crypto.Hash import SHA256 from django.core.paginator import Paginator from django.db import models +from django.db.models import Q from django.dispatch import receiver from bookwyrm import activitypub @@ -64,6 +67,50 @@ class ActivitypubMixin: activity_serializer = lambda: {} reverse_unfurl = False + @classmethod + def find_existing_by_remote_id(cls, remote_id): + ''' look up a remote id in the db ''' + return cls.find_existing({'id': remote_id}) + + @classmethod + def find_existing(cls, data): + ''' compare data to fields that can be used for deduplation. + This always includes remote_id, but can also be unique identifiers + like an isbn for an edition ''' + filters = [] + for field in cls._meta.get_fields(): + if not hasattr(field, 'deduplication_field') or \ + not field.deduplication_field: + continue + + value = data.get(field.activitypub_field) + if not value: + continue + filters.append({field.name: value}) + + if hasattr(cls, 'origin_id') and 'id' in data: + # kinda janky, but this handles special case for books + filters.append({'origin_id': data['id']}) + + if not filters: + # if there are no deduplication fields, it will match the first + # item no matter what. this shouldn't happen but just in case. + return None + + objects = cls.objects + if hasattr(objects, 'select_subclasses'): + objects = objects.select_subclasses() + + # an OR operation on all the match fields + match = objects.filter( + reduce( + operator.or_, (Q(**f) for f in filters) + ) + ) + # there OUGHT to be only one match + return match.first() + + def to_activity(self): ''' convert from a model to an activity ''' activity = {} diff --git a/bookwyrm/models/book.py b/bookwyrm/models/book.py index 47b8b99e..bcd4bc04 100644 --- a/bookwyrm/models/book.py +++ b/bookwyrm/models/book.py @@ -16,9 +16,12 @@ class Book(ActivitypubMixin, BookWyrmModel): ''' a generic book, which can mean either an edition or a work ''' origin_id = models.CharField(max_length=255, null=True, blank=True) # these identifiers apply to both works and editions - openlibrary_key = fields.CharField(max_length=255, blank=True, null=True) - librarything_key = fields.CharField(max_length=255, blank=True, null=True) - goodreads_key = fields.CharField(max_length=255, blank=True, null=True) + openlibrary_key = fields.CharField( + max_length=255, blank=True, null=True, deduplication_field=True) + librarything_key = fields.CharField( + max_length=255, blank=True, null=True, deduplication_field=True) + goodreads_key = fields.CharField( + max_length=255, blank=True, null=True, deduplication_field=True) # info about where the data comes from and where/if to sync sync = models.BooleanField(default=True) @@ -83,7 +86,8 @@ class Book(ActivitypubMixin, BookWyrmModel): class Work(OrderedCollectionPageMixin, Book): ''' a work (an abstract concept of a book that manifests in an edition) ''' # library of congress catalog control number - lccn = fields.CharField(max_length=255, blank=True, null=True) + lccn = fields.CharField( + max_length=255, blank=True, null=True, deduplication_field=True) # this has to be nullable but should never be null default_edition = fields.ForeignKey( 'Edition', @@ -103,10 +107,14 @@ class Work(OrderedCollectionPageMixin, Book): class Edition(Book): ''' an edition of a book ''' # these identifiers only apply to editions, not works - isbn_10 = fields.CharField(max_length=255, blank=True, null=True) - isbn_13 = fields.CharField(max_length=255, blank=True, null=True) - oclc_number = fields.CharField(max_length=255, blank=True, null=True) - asin = fields.CharField(max_length=255, blank=True, null=True) + isbn_10 = fields.CharField( + max_length=255, blank=True, null=True, deduplication_field=True) + isbn_13 = fields.CharField( + max_length=255, blank=True, null=True, deduplication_field=True) + oclc_number = fields.CharField( + max_length=255, blank=True, null=True, deduplication_field=True) + asin = fields.CharField( + max_length=255, blank=True, null=True, deduplication_field=True) pages = fields.IntegerField(blank=True, null=True) physical_format = fields.CharField(max_length=255, blank=True, null=True) publishers = fields.ArrayField( diff --git a/bookwyrm/models/fields.py b/bookwyrm/models/fields.py index fb94e1a6..e6878fb9 100644 --- a/bookwyrm/models/fields.py +++ b/bookwyrm/models/fields.py @@ -28,7 +28,9 @@ def validate_remote_id(value): class ActivitypubFieldMixin: ''' make a database field serializable ''' def __init__(self, *args, \ - activitypub_field=None, activitypub_wrapper=None, **kwargs): + activitypub_field=None, activitypub_wrapper=None, + deduplication_field=False, **kwargs): + self.deduplication_field = deduplication_field if activitypub_wrapper: self.activitypub_wrapper = activitypub_field self.activitypub_field = activitypub_wrapper @@ -86,6 +88,8 @@ class RemoteIdField(ActivitypubFieldMixin, models.CharField): *args, max_length=max_length, validators=validators, **kwargs ) + # for this field, the default is true. false everywhere else. + self.deduplication_field = kwargs.get('deduplication_field', True) class UsernameField(ActivitypubFieldMixin, models.CharField): diff --git a/bookwyrm/models/user.py b/bookwyrm/models/user.py index 531b0da2..0ef9a2e6 100644 --- a/bookwyrm/models/user.py +++ b/bookwyrm/models/user.py @@ -32,7 +32,9 @@ class User(OrderedCollectionPageMixin, AbstractUser): inbox = fields.RemoteIdField(unique=True) shared_inbox = fields.RemoteIdField( activitypub_field='sharedInbox', - activitypub_wrapper='endpoints', null=True) + activitypub_wrapper='endpoints', + deduplication_field=False, + null=True) federated_server = models.ForeignKey( 'FederatedServer', on_delete=models.PROTECT, diff --git a/bookwyrm/tests/activitypub/test_base_activity.py b/bookwyrm/tests/activitypub/test_base_activity.py index 26d63b61..100ff6cf 100644 --- a/bookwyrm/tests/activitypub/test_base_activity.py +++ b/bookwyrm/tests/activitypub/test_base_activity.py @@ -11,7 +11,7 @@ import responses from bookwyrm import activitypub from bookwyrm.activitypub.base_activity import ActivityObject, \ - find_existing_by_remote_id, resolve_remote_id, set_related_field + resolve_remote_id, set_related_field from bookwyrm.activitypub import ActivitySerializerError from bookwyrm import models @@ -77,34 +77,6 @@ class BaseActivity(TestCase): self.assertEqual(serialized['id'], 'a') self.assertEqual(serialized['type'], 'b') - def test_find_existing_by_remote_id(self): - ''' attempt to match a remote id to an object in the db ''' - # uses a different remote id scheme - # this isn't really part of this test directly but it's helpful to state - self.assertEqual(self.book.origin_id, 'http://book.com/book') - self.assertNotEqual(self.book.remote_id, 'http://book.com/book') - - # uses subclasses - models.Comment.objects.create( - user=self.user, content='test status', book=self.book, \ - remote_id='https://comment.net') - - result = find_existing_by_remote_id(models.User, 'hi') - self.assertIsNone(result) - - result = find_existing_by_remote_id( - models.User, 'http://example.com/a/b') - self.assertEqual(result, self.user) - - # test using origin id - result = find_existing_by_remote_id( - models.Edition, 'http://book.com/book') - self.assertEqual(result, self.book) - - # test subclass match - result = find_existing_by_remote_id( - models.Status, 'https://comment.net') - @responses.activate def test_resolve_remote_id(self): ''' look up or load remote data ''' diff --git a/bookwyrm/tests/models/test_base_model.py b/bookwyrm/tests/models/test_base_model.py index a807c47f..966fe3e2 100644 --- a/bookwyrm/tests/models/test_base_model.py +++ b/bookwyrm/tests/models/test_base_model.py @@ -11,13 +11,16 @@ from bookwyrm.models.base_model import ActivitypubMixin from bookwyrm.settings import DOMAIN class BaseModel(TestCase): + ''' functionality shared across models ''' def test_remote_id(self): + ''' these should be generated ''' instance = base_model.BookWyrmModel() instance.id = 1 expected = instance.get_remote_id() self.assertEqual(expected, 'https://%s/bookwyrmmodel/1' % DOMAIN) def test_remote_id_with_user(self): + ''' format of remote id when there's a user object ''' user = models.User.objects.create_user( 'mouse', 'mouse@mouse.com', 'mouseword', local=True) instance = base_model.BookWyrmModel() @@ -46,6 +49,7 @@ class BaseModel(TestCase): self.assertIsNone(instance.remote_id) def test_to_create_activity(self): + ''' wrapper for ActivityPub "create" action ''' user = models.User.objects.create_user( 'mouse', 'mouse@mouse.com', 'mouseword', local=True) @@ -75,6 +79,7 @@ class BaseModel(TestCase): ) def test_to_delete_activity(self): + ''' wrapper for Delete activity ''' user = models.User.objects.create_user( 'mouse', 'mouse@mouse.com', 'mouseword', local=True) @@ -98,6 +103,7 @@ class BaseModel(TestCase): ['https://www.w3.org/ns/activitystreams#Public']) def test_to_update_activity(self): + ''' ditto above but for Update ''' user = models.User.objects.create_user( 'mouse', 'mouse@mouse.com', 'mouseword', local=True) @@ -121,6 +127,7 @@ class BaseModel(TestCase): self.assertEqual(activity['object'], {}) def test_to_undo_activity(self): + ''' and again, for Undo ''' user = models.User.objects.create_user( 'mouse', 'mouse@mouse.com', 'mouseword', local=True) @@ -140,12 +147,14 @@ class BaseModel(TestCase): def test_to_activity(self): + ''' model to ActivityPub json ''' @dataclass(init=False) class TestActivity(ActivityObject): + ''' real simple mock ''' type: str = 'Test' class TestModel(ActivitypubMixin, base_model.BookWyrmModel): - pass + ''' real simple mock model because BookWyrmModel is abstract ''' instance = TestModel() instance.remote_id = 'https://www.example.com/test' @@ -155,3 +164,32 @@ class BaseModel(TestCase): self.assertIsInstance(activity, dict) self.assertEqual(activity['id'], 'https://www.example.com/test') self.assertEqual(activity['type'], 'Test') + + + def test_find_existing_by_remote_id(self): + ''' attempt to match a remote id to an object in the db ''' + # uses a different remote id scheme + # this isn't really part of this test directly but it's helpful to state + self.assertEqual(self.book.origin_id, 'http://book.com/book') + self.assertNotEqual(self.book.remote_id, 'http://book.com/book') + + # uses subclasses + models.Comment.objects.create( + user=self.user, content='test status', book=self.book, \ + remote_id='https://comment.net') + + result = models.User.find_existing_by_remote_id('hi') + self.assertIsNone(result) + + result = models.User.find_existing_by_remote_id( + 'http://example.com/a/b') + self.assertEqual(result, self.user) + + # test using origin id + result = models.Edition.find_existing_by_remote_id( + 'http://book.com/book') + self.assertEqual(result, self.book) + + # test subclass match + result = models.Status.find_existing_by_remote_id( + 'https://comment.net') diff --git a/bookwyrm/tests/models/test_fields.py b/bookwyrm/tests/models/test_fields.py index 58d9e08c..a0a95ba9 100644 --- a/bookwyrm/tests/models/test_fields.py +++ b/bookwyrm/tests/models/test_fields.py @@ -48,6 +48,7 @@ class ActivitypubFields(TestCase): instance = fields.ActivitypubFieldMixin() self.assertEqual(instance.field_to_activity('fish'), 'fish') self.assertEqual(instance.field_from_activity('fish'), 'fish') + self.assertFalse(instance.deduplication_field) instance = fields.ActivitypubFieldMixin( activitypub_wrapper='endpoints', activitypub_field='outbox' @@ -70,6 +71,7 @@ class ActivitypubFields(TestCase): ''' just sets some defaults on charfield ''' instance = fields.RemoteIdField() self.assertEqual(instance.max_length, 255) + self.assertTrue(instance.deduplication_field) with self.assertRaises(ValidationError): instance.run_validators('http://www.example.com/dlfjg 23/x') @@ -97,7 +99,7 @@ class ActivitypubFields(TestCase): self.assertEqual(instance.field_to_activity(item), 'https://e.b/c') @responses.activate - def test_foreign_key_from_activity(self): + def test_foreign_key_from_activity_str(self): ''' this is the important stuff ''' instance = fields.ForeignKey(User, on_delete=models.CASCADE) @@ -117,15 +119,47 @@ class ActivitypubFields(TestCase): with patch('bookwyrm.models.user.set_remote_server.delay'): value = instance.field_from_activity( 'https://example.com/user/mouse') + self.assertIsInstance(value, User) + self.assertEqual(value.remote_id, 'https://example.com/user/mouse') + self.assertEqual(value.name, 'MOUSE?? MOUSE!!') - # test recieving activity json - value = instance.field_from_activity(userdata) + + def test_foreign_key_from_activity_dict(self): + ''' test recieving activity json ''' + instance = fields.ForeignKey(User, on_delete=models.CASCADE) + + datafile = pathlib.Path(__file__).parent.joinpath( + '../data/ap_user.json' + ) + userdata = json.loads(datafile.read_bytes()) + # don't try to load the user icon + del userdata['icon'] + with patch('bookwyrm.models.user.set_remote_server.delay'): + value = instance.field_from_activity(userdata) self.assertIsInstance(value, User) self.assertEqual(value.remote_id, 'https://example.com/user/mouse') self.assertEqual(value.name, 'MOUSE?? MOUSE!!') # et cetera but we're not testing serializing user json - # test receiving a remote id of an object in the db + + def test_foreign_key_from_activity_dict_existing(self): + ''' test receiving a dict of an existing object in the db ''' + instance = fields.ForeignKey(User, on_delete=models.CASCADE) + datafile = pathlib.Path(__file__).parent.joinpath( + '../data/ap_user.json' + ) + userdata = json.loads(datafile.read_bytes()) + user = User.objects.create_user( + 'mouse', 'mouse@mouse.mouse', 'mouseword', local=True) + user.remote_id = 'https://example.com/user/mouse' + user.save() + value = instance.field_from_activity(userdata) + self.assertEqual(value, user) + + + def test_foreign_key_from_activity_str_existing(self): + ''' test receiving a remote id of an existing object in the db ''' + instance = fields.ForeignKey(User, on_delete=models.CASCADE) user = User.objects.create_user( 'mouse', 'mouse@mouse.mouse', 'mouseword', local=True) value = instance.field_from_activity(user.remote_id)