diff --git a/bookwyrm/activitypub/book.py b/bookwyrm/activitypub/book.py index 93cd384fe..ee4b88515 100644 --- a/bookwyrm/activitypub/book.py +++ b/bookwyrm/activitypub/book.py @@ -50,7 +50,7 @@ class Work(Book): ''' work instance of a book object ''' lccn: str = '' defaultEdition: str = '' - editions: List[str] + editions: List[str] = field(default_factory=lambda: []) type: str = 'Work' @@ -58,9 +58,9 @@ class Work(Book): class Author(ActivityObject): ''' author of a book ''' name: str - born: str = '' - died: str = '' - aliases: str = '' + born: str = None + died: str = None + aliases: List[str] = field(default_factory=lambda: []) bio: str = '' openlibraryKey: str = '' wikipediaLink: str = '' diff --git a/bookwyrm/connectors/abstract_connector.py b/bookwyrm/connectors/abstract_connector.py index c9f1ad2e6..5afd10897 100644 --- a/bookwyrm/connectors/abstract_connector.py +++ b/bookwyrm/connectors/abstract_connector.py @@ -1,16 +1,14 @@ ''' functionality outline for a book data connector ''' from abc import ABC, abstractmethod from dataclasses import dataclass -import pytz from urllib3.exceptions import RequestError from django.db import transaction -from dateutil import parser import requests from requests import HTTPError from requests.exceptions import SSLError -from bookwyrm import models +from bookwyrm import activitypub, models class ConnectorException(HTTPError): @@ -38,7 +36,7 @@ class AbstractMinimalConnector(ABC): for field in self_fields: setattr(self, field, getattr(info, field)) - def search(self, query, min_confidence=None): + def search(self, query, min_confidence=None):# pylint: disable=unused-argument ''' free text search ''' resp = requests.get( '%s%s' % (self.search_url, query), @@ -72,9 +70,6 @@ class AbstractConnector(AbstractMinimalConnector): ''' generic book data connector ''' def __init__(self, identifier): super().__init__(identifier) - - self.key_mappings = [] - # fields we want to look for in book data to copy over # title we handle separately. self.book_mappings = [] @@ -88,217 +83,110 @@ class AbstractConnector(AbstractMinimalConnector): return True + @transaction.atomic def get_or_create_book(self, remote_id): - # try to load the book - book = models.Book.objects.select_subclasses().filter( - origin_id=remote_id - ).first() - if book: - if isinstance(book, models.Work): - return book.default_edition - return book + ''' translate arbitrary json into an Activitypub dataclass ''' + # first, check if we have the origin_id saved + existing = models.Edition.find_existing_by_remote_id(remote_id) or \ + models.Work.find_existing_by_remote_id(remote_id) + if existing: + if hasattr(existing, 'get_default_editon'): + return existing.get_default_editon() + return existing - # no book was found, so we start creating a new one + # load the json data = get_data(remote_id) - - work = None - edition = None + mapped_data = dict_from_mappings(data, self.book_mappings) if self.is_work_data(data): - work_data = data - # if we requested a work and there's already an edition, we're set - work = self.match_from_mappings(work_data, models.Work) - if work and work.default_edition: - return work.default_edition - - # no such luck, we need more information. try: - edition_data = self.get_edition_from_work_data(work_data) + edition_data = self.get_edition_from_work_data(data) except KeyError: # hack: re-use the work data as the edition data # this is why remote ids aren't necessarily unique edition_data = data + work_data = mapped_data else: - edition_data = data - edition = self.match_from_mappings(edition_data, models.Edition) - # no need to figure out about the work if we already know about it - if edition and edition.parent_work: - return edition - - # no such luck, we need more information. try: - work_data = self.get_work_from_edition_date(edition_data) + work_data = self.get_work_from_edition_data(data) + work_data = dict_from_mappings(work_data, self.book_mappings) except KeyError: - # remember this hack: re-use the work data as the edition data - work_data = data + work_data = mapped_data + edition_data = data if not work_data or not edition_data: raise ConnectorException('Unable to load book data: %s' % remote_id) - # at this point, we need to figure out the work, edition, or both - # atomic so that we don't save a work with no edition for vice versa - with transaction.atomic(): - if not work: - work_key = self.get_remote_id_from_data(work_data) - work = self.create_book(work_key, work_data, models.Work) + # create activitypub object + work_activity = activitypub.Work(**work_data) + # this will dedupe automatically + work = work_activity.to_model(models.Work) + for author in self.get_authors_from_data(data): + work.authors.add(author) + return self.create_edition_from_data(work, edition_data) - if not edition: - ed_key = self.get_remote_id_from_data(edition_data) - edition = self.create_book(ed_key, edition_data, models.Edition) - edition.parent_work = work - edition.save() - work.default_edition = edition - work.save() - # now's our change to fill in author gaps + def create_edition_from_data(self, work, edition_data): + ''' if we already have the work, we're ready ''' + mapped_data = dict_from_mappings(edition_data, self.book_mappings) + mapped_data['work'] = work.remote_id + edition_activity = activitypub.Edition(**mapped_data) + edition = edition_activity.to_model(models.Edition) + edition.connector = self.connector + edition.save() + + work.default_edition = edition + work.save() + + for author in self.get_authors_from_data(edition_data): + edition.authors.add(author) if not edition.authors.exists() and work.authors.exists(): - edition.authors.set(work.authors.all()) - edition.author_text = work.author_text - edition.save() - - if not edition: - raise ConnectorException('Unable to create book: %s' % remote_id) + edition.authors.add(work.authors.all()) return edition - def create_book(self, remote_id, data, model): - ''' create a work or edition from data ''' - book = model.objects.create( - origin_id=remote_id, - title=data['title'], - connector=self.connector, - ) - return self.update_book_from_data(book, data) + def get_or_create_author(self, remote_id): + ''' load that author ''' + existing = models.Author.find_existing_by_remote_id(remote_id) + if existing: + return existing + data = get_data(remote_id) - def update_book_from_data(self, book, data, update_cover=True): - ''' for creating a new book or syncing with data ''' - book = update_from_mappings(book, data, self.book_mappings) - - author_text = [] - for author in self.get_authors_from_data(data): - book.authors.add(author) - author_text.append(author.name) - book.author_text = ', '.join(author_text) - book.save() - - if not update_cover: - return book - - cover = self.get_cover_from_data(data) - if cover: - book.cover.save(*cover, save=True) - return book - - - def update_book(self, book, data=None): - ''' load new data ''' - if not book.sync and not book.sync_cover: - return book - - if not data: - key = getattr(book, self.key_name) - data = self.load_book_data(key) - - if book.sync: - book = self.update_book_from_data( - book, data, update_cover=book.sync_cover) - else: - cover = self.get_cover_from_data(data) - if cover: - book.cover.save(*cover, save=True) - - return book - - - def match_from_mappings(self, data, model): - ''' try to find existing copies of this book using various keys ''' - relevent_mappings = [m for m in self.key_mappings if \ - not m.model or model == m.model] - for mapping in relevent_mappings: - # check if this field is present in the data - value = data.get(mapping.remote_field) - if not value: - continue - - # extract the value in the right format - value = mapping.formatter(value) - - # search our database for a matching book - kwargs = {mapping.local_field: value} - match = model.objects.filter(**kwargs).first() - if match: - return match - return None - - - @abstractmethod - def get_remote_id_from_data(self, data): - ''' otherwise we won't properly set the remote_id in the db ''' + mapped_data = dict_from_mappings(data, self.author_mappings) + activity = activitypub.Author(**mapped_data) + # this will dedupe + return activity.to_model(models.Author) @abstractmethod def is_work_data(self, data): ''' differentiate works and editions ''' - @abstractmethod def get_edition_from_work_data(self, data): ''' every work needs at least one edition ''' - @abstractmethod def get_work_from_edition_date(self, data): ''' every edition needs a work ''' - @abstractmethod def get_authors_from_data(self, data): ''' load author data ''' - - @abstractmethod - def get_cover_from_data(self, data): - ''' load cover ''' - @abstractmethod def expand_book_data(self, book): ''' get more info on a book ''' -def update_from_mappings(obj, data, mappings): - ''' assign data to model with mappings ''' +def dict_from_mappings(data, mappings): + ''' create a dict in Activitypub format, using mappings supplies by + the subclass ''' + result = {} for mapping in mappings: - # check if this field is present in the data - value = data.get(mapping.remote_field) - if not value: - continue - - # extract the value in the right format - try: - value = mapping.formatter(value) - except: - continue - - # assign the formatted value to the model - obj.__setattr__(mapping.local_field, value) - return obj - - -def get_date(date_string): - ''' helper function to try to interpret dates ''' - if not date_string: - return None - - try: - return pytz.utc.localize(parser.parse(date_string)) - except ValueError: - pass - - try: - return parser.parse(date_string) - except ValueError: - return None + result[mapping.local_field] = mapping.get_value(data) + return result def get_data(url): @@ -349,11 +237,19 @@ class SearchResult: class Mapping: ''' associate a local database field with a field in an external dataset ''' - def __init__( - self, local_field, remote_field=None, formatter=None, model=None): + def __init__(self, local_field, remote_field=None, formatter=None): noop = lambda x: x self.local_field = local_field self.remote_field = remote_field or local_field self.formatter = formatter or noop - self.model = model + + def get_value(self, data): + ''' pull a field from incoming json and return the formatted version ''' + value = data.get(self.remote_field) + if not value: + return None + try: + return self.formatter(value) + except:# pylint: disable=bare-except + return None diff --git a/bookwyrm/connectors/openlibrary.py b/bookwyrm/connectors/openlibrary.py index 28eb1ea0a..74f76668c 100644 --- a/bookwyrm/connectors/openlibrary.py +++ b/bookwyrm/connectors/openlibrary.py @@ -1,13 +1,9 @@ ''' openlibrary data connector ''' import re -import requests - -from django.core.files.base import ContentFile from bookwyrm import models from .abstract_connector import AbstractConnector, SearchResult, Mapping -from .abstract_connector import ConnectorException -from .abstract_connector import get_date, get_data, update_from_mappings +from .abstract_connector import ConnectorException, get_data from .openlibrary_languages import languages @@ -17,62 +13,57 @@ class Connector(AbstractConnector): super().__init__(identifier) get_first = lambda a: a[0] - self.key_mappings = [ - Mapping('isbn_13', model=models.Edition, formatter=get_first), - Mapping('isbn_10', model=models.Edition, formatter=get_first), - Mapping('lccn', model=models.Work, formatter=get_first), + get_remote_id = lambda a: self.base_url + a + self.book_mappings = [ + Mapping('title'), + Mapping('id', remote_field='key', formatter=get_remote_id), Mapping( - 'oclc_number', - remote_field='oclc_numbers', - model=models.Edition, - formatter=get_first - ), - Mapping( - 'openlibrary_key', - remote_field='key', - formatter=get_openlibrary_key - ), - Mapping('goodreads_key'), - Mapping('asin'), - ] - - self.book_mappings = self.key_mappings + [ - Mapping('sort_title'), + 'cover', remote_field='covers', formatter=self.get_cover_url), + Mapping('sortTitle', remote_field='sort_title'), Mapping('subtitle'), Mapping('description', formatter=get_description), Mapping('languages', formatter=get_languages), Mapping('series', formatter=get_first), - Mapping('series_number'), + Mapping('seriesNumber', remote_field='series_number'), Mapping('subjects'), - Mapping('subject_places'), + Mapping('subjectPlaces'), + Mapping('isbn13', formatter=get_first), + Mapping('isbn10', formatter=get_first), + Mapping('lccn', formatter=get_first), Mapping( - 'first_published_date', - remote_field='first_publish_date', - formatter=get_date + 'oclcNumber', remote_field='oclc_numbers', + formatter=get_first ), Mapping( - 'published_date', - remote_field='publish_date', - formatter=get_date + 'openlibraryKey', remote_field='key', + formatter=get_openlibrary_key ), + Mapping('goodreadsKey', remote_field='goodreads_key'), + Mapping('asin'), Mapping( - 'pages', - model=models.Edition, - remote_field='number_of_pages' + 'firstPublishedDate', remote_field='first_publish_date', ), - Mapping('physical_format', model=models.Edition), + Mapping('publishedDate', remote_field='publish_date'), + Mapping('pages', remote_field='number_of_pages'), + Mapping('physicalFormat', remote_field='physical_format'), Mapping('publishers'), ] self.author_mappings = [ + Mapping('id', remote_field='key', formatter=get_remote_id), Mapping('name'), - Mapping('born', remote_field='birth_date', formatter=get_date), - Mapping('died', remote_field='death_date', formatter=get_date), + Mapping( + 'openlibraryKey', remote_field='key', + formatter=get_openlibrary_key + ), + Mapping('born', remote_field='birth_date'), + Mapping('died', remote_field='death_date'), Mapping('bio', formatter=get_description), ] def get_remote_id_from_data(self, data): + ''' format a url from an openlibrary id field ''' try: key = data['key'] except KeyError: @@ -107,24 +98,17 @@ class Connector(AbstractConnector): ''' parse author json and load or create authors ''' for author_blob in data.get('authors', []): author_blob = author_blob.get('author', author_blob) - # this id is "/authors/OL1234567A" and we want just "OL1234567A" - author_id = author_blob['key'].split('/')[-1] - yield self.get_or_create_author(author_id) + # this id is "/authors/OL1234567A" + author_id = author_blob['key'] + url = '%s/%s.json' % (self.base_url, author_id) + yield self.get_or_create_author(url) - def get_cover_from_data(self, data): + def get_cover_url(self, cover_blob): ''' ask openlibrary for the cover ''' - if not data.get('covers'): - return None - - cover_id = data.get('covers')[0] + cover_id = cover_blob[0] image_name = '%s-M.jpg' % cover_id - url = '%s/b/id/%s' % (self.covers_url, image_name) - response = requests.get(url) - if not response.ok: - response.raise_for_status() - image_content = ContentFile(response.content) - return [image_name, image_content] + return '%s/b/id/%s' % (self.covers_url, image_name) def parse_search_data(self, data): @@ -158,37 +142,7 @@ class Connector(AbstractConnector): # we can mass download edition data from OL to avoid repeatedly querying edition_options = self.load_edition_data(work.openlibrary_key) for edition_data in edition_options.get('entries'): - olkey = edition_data.get('key').split('/')[-1] - # make sure the edition isn't already in the database - if models.Edition.objects.filter(openlibrary_key=olkey).count(): - continue - - # creates and populates the book from the data - edition = self.create_book(olkey, edition_data, models.Edition) - # ensures that the edition is associated with the work - edition.parent_work = work - edition.save() - # get author data from the work if it's missing from the edition - if not edition.authors and work.authors: - edition.authors.set(work.authors.all()) - - - def get_or_create_author(self, olkey): - ''' load that author ''' - if not re.match(r'^OL\d+A$', olkey): - raise ValueError('Invalid OpenLibrary author ID') - author = models.Author.objects.filter(openlibrary_key=olkey).first() - if author: - return author - - url = '%s/authors/%s.json' % (self.base_url, olkey) - data = get_data(url) - - author = models.Author(openlibrary_key=olkey) - author = update_from_mappings(author, data, self.author_mappings) - author.save() - - return author + self.create_edition_from_data(work, edition_data) def get_description(description_blob): diff --git a/bookwyrm/tests/connectors/test_abstract_connector.py b/bookwyrm/tests/connectors/test_abstract_connector.py index f05645ab1..d6e02960e 100644 --- a/bookwyrm/tests/connectors/test_abstract_connector.py +++ b/bookwyrm/tests/connectors/test_abstract_connector.py @@ -30,10 +30,10 @@ class AbstractConnector(TestCase): 'series': ['one', 'two'], } self.connector.key_mappings = [ - Mapping('isbn_10', model=models.Edition), + Mapping('isbn_10'), Mapping('isbn_13'), - Mapping('lccn', model=models.Work), - Mapping('asin', remote_field='ASIN'), + Mapping('lccn'), + Mapping('asin'), ] @@ -41,7 +41,6 @@ class AbstractConnector(TestCase): mapping = Mapping('isbn') self.assertEqual(mapping.local_field, 'isbn') self.assertEqual(mapping.remote_field, 'isbn') - self.assertEqual(mapping.model, None) self.assertEqual(mapping.formatter('bb'), 'bb') @@ -49,7 +48,6 @@ class AbstractConnector(TestCase): mapping = Mapping('isbn', remote_field='isbn13') self.assertEqual(mapping.local_field, 'isbn') self.assertEqual(mapping.remote_field, 'isbn13') - self.assertEqual(mapping.model, None) self.assertEqual(mapping.formatter('bb'), 'bb') @@ -59,40 +57,4 @@ class AbstractConnector(TestCase): self.assertEqual(mapping.local_field, 'isbn') self.assertEqual(mapping.remote_field, 'isbn') self.assertEqual(mapping.formatter, formatter) - self.assertEqual(mapping.model, None) self.assertEqual(mapping.formatter('bb'), 'aabb') - - - def test_match_from_mappings(self): - edition = models.Edition.objects.create( - title='Blah', - isbn_13='blahhh', - ) - match = self.connector.match_from_mappings(self.data, models.Edition) - self.assertEqual(match, edition) - - - def test_match_from_mappings_with_model(self): - edition = models.Edition.objects.create( - title='Blah', - isbn_10='1234567890', - ) - match = self.connector.match_from_mappings(self.data, models.Edition) - self.assertEqual(match, edition) - - - def test_match_from_mappings_with_remote(self): - edition = models.Edition.objects.create( - title='Blah', - asin='A00BLAH', - ) - match = self.connector.match_from_mappings(self.data, models.Edition) - self.assertEqual(match, edition) - - - def test_match_from_mappings_no_match(self): - edition = models.Edition.objects.create( - title='Blah', - ) - match = self.connector.match_from_mappings(self.data, models.Edition) - self.assertEqual(match, None) diff --git a/bookwyrm/tests/connectors/test_openlibrary_connector.py b/bookwyrm/tests/connectors/test_openlibrary_connector.py index 4a2bc6ea8..b3d97ba31 100644 --- a/bookwyrm/tests/connectors/test_openlibrary_connector.py +++ b/bookwyrm/tests/connectors/test_openlibrary_connector.py @@ -1,15 +1,16 @@ ''' testing book data connectors ''' -from dateutil import parser -from django.test import TestCase import json import pathlib +from dateutil import parser +from django.test import TestCase import pytz from bookwyrm import models from bookwyrm.connectors.openlibrary import Connector from bookwyrm.connectors.openlibrary import get_languages, get_description -from bookwyrm.connectors.openlibrary import pick_default_edition, get_openlibrary_key -from bookwyrm.connectors.abstract_connector import SearchResult, get_date +from bookwyrm.connectors.openlibrary import pick_default_edition, \ + get_openlibrary_key +from bookwyrm.connectors.abstract_connector import SearchResult class Openlibrary(TestCase): @@ -67,12 +68,6 @@ class Openlibrary(TestCase): self.assertEqual(description, expected) - def test_get_date(self): - date = get_date(self.work_data['first_publish_date']) - expected = pytz.utc.localize(parser.parse('1995')) - self.assertEqual(date, expected) - - def test_get_languages(self): languages = get_languages(self.edition_data['languages']) self.assertEqual(languages, ['English']) @@ -81,4 +76,3 @@ class Openlibrary(TestCase): def test_get_ol_key(self): key = get_openlibrary_key('/books/OL27320736M') self.assertEqual(key, 'OL27320736M') - diff --git a/bookwyrm/view_actions.py b/bookwyrm/view_actions.py index fcb684764..1df1dcbaf 100644 --- a/bookwyrm/view_actions.py +++ b/bookwyrm/view_actions.py @@ -223,6 +223,8 @@ def resolve_book(request): remote_id = request.POST.get('remote_id') connector = books_manager.get_or_create_connector(remote_id) book = connector.get_or_create_book(remote_id) + if book.connector: + books_manager.load_more_data.delay(book.id) return redirect('/book/%d' % book.id)