Merge pull request #408 from mouse-reeve/openlibrary-connector

Update openlibrary connector to use activitypub serializers
This commit is contained in:
Mouse Reeve 2020-12-19 17:00:02 -08:00 committed by GitHub
commit 27ec80515e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 120 additions and 312 deletions

View file

@ -50,7 +50,7 @@ class Work(Book):
''' work instance of a book object ''' ''' work instance of a book object '''
lccn: str = '' lccn: str = ''
defaultEdition: str = '' defaultEdition: str = ''
editions: List[str] editions: List[str] = field(default_factory=lambda: [])
type: str = 'Work' type: str = 'Work'
@ -58,9 +58,9 @@ class Work(Book):
class Author(ActivityObject): class Author(ActivityObject):
''' author of a book ''' ''' author of a book '''
name: str name: str
born: str = '' born: str = None
died: str = '' died: str = None
aliases: str = '' aliases: List[str] = field(default_factory=lambda: [])
bio: str = '' bio: str = ''
openlibraryKey: str = '' openlibraryKey: str = ''
wikipediaLink: str = '' wikipediaLink: str = ''

View file

@ -1,16 +1,14 @@
''' functionality outline for a book data connector ''' ''' functionality outline for a book data connector '''
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from dataclasses import dataclass from dataclasses import dataclass
import pytz
from urllib3.exceptions import RequestError from urllib3.exceptions import RequestError
from django.db import transaction from django.db import transaction
from dateutil import parser
import requests import requests
from requests import HTTPError from requests import HTTPError
from requests.exceptions import SSLError from requests.exceptions import SSLError
from bookwyrm import models from bookwyrm import activitypub, models
class ConnectorException(HTTPError): class ConnectorException(HTTPError):
@ -38,7 +36,7 @@ class AbstractMinimalConnector(ABC):
for field in self_fields: for field in self_fields:
setattr(self, field, getattr(info, field)) setattr(self, field, getattr(info, field))
def search(self, query, min_confidence=None): def search(self, query, min_confidence=None):# pylint: disable=unused-argument
''' free text search ''' ''' free text search '''
resp = requests.get( resp = requests.get(
'%s%s' % (self.search_url, query), '%s%s' % (self.search_url, query),
@ -72,9 +70,6 @@ class AbstractConnector(AbstractMinimalConnector):
''' generic book data connector ''' ''' generic book data connector '''
def __init__(self, identifier): def __init__(self, identifier):
super().__init__(identifier) super().__init__(identifier)
self.key_mappings = []
# fields we want to look for in book data to copy over # fields we want to look for in book data to copy over
# title we handle separately. # title we handle separately.
self.book_mappings = [] self.book_mappings = []
@ -88,217 +83,110 @@ class AbstractConnector(AbstractMinimalConnector):
return True return True
@transaction.atomic
def get_or_create_book(self, remote_id): def get_or_create_book(self, remote_id):
# try to load the book ''' translate arbitrary json into an Activitypub dataclass '''
book = models.Book.objects.select_subclasses().filter( # first, check if we have the origin_id saved
origin_id=remote_id existing = models.Edition.find_existing_by_remote_id(remote_id) or \
).first() models.Work.find_existing_by_remote_id(remote_id)
if book: if existing:
if isinstance(book, models.Work): if hasattr(existing, 'get_default_editon'):
return book.default_edition return existing.get_default_editon()
return book return existing
# no book was found, so we start creating a new one # load the json
data = get_data(remote_id) data = get_data(remote_id)
mapped_data = dict_from_mappings(data, self.book_mappings)
work = None
edition = None
if self.is_work_data(data): if self.is_work_data(data):
work_data = data
# if we requested a work and there's already an edition, we're set
work = self.match_from_mappings(work_data, models.Work)
if work and work.default_edition:
return work.default_edition
# no such luck, we need more information.
try: try:
edition_data = self.get_edition_from_work_data(work_data) edition_data = self.get_edition_from_work_data(data)
except KeyError: except KeyError:
# hack: re-use the work data as the edition data # hack: re-use the work data as the edition data
# this is why remote ids aren't necessarily unique # this is why remote ids aren't necessarily unique
edition_data = data edition_data = data
work_data = mapped_data
else: else:
edition_data = data
edition = self.match_from_mappings(edition_data, models.Edition)
# no need to figure out about the work if we already know about it
if edition and edition.parent_work:
return edition
# no such luck, we need more information.
try: try:
work_data = self.get_work_from_edition_date(edition_data) work_data = self.get_work_from_edition_data(data)
work_data = dict_from_mappings(work_data, self.book_mappings)
except KeyError: except KeyError:
# remember this hack: re-use the work data as the edition data work_data = mapped_data
work_data = data edition_data = data
if not work_data or not edition_data: if not work_data or not edition_data:
raise ConnectorException('Unable to load book data: %s' % remote_id) raise ConnectorException('Unable to load book data: %s' % remote_id)
# at this point, we need to figure out the work, edition, or both # create activitypub object
# atomic so that we don't save a work with no edition or vice versa work_activity = activitypub.Work(**work_data)
with transaction.atomic(): # this will dedupe automatically
if not work: work = work_activity.to_model(models.Work)
work_key = self.get_remote_id_from_data(work_data) for author in self.get_authors_from_data(data):
work = self.create_book(work_key, work_data, models.Work) work.authors.add(author)
return self.create_edition_from_data(work, edition_data)
if not edition:
ed_key = self.get_remote_id_from_data(edition_data) def create_edition_from_data(self, work, edition_data):
edition = self.create_book(ed_key, edition_data, models.Edition) ''' if we already have the work, we're ready '''
edition.parent_work = work mapped_data = dict_from_mappings(edition_data, self.book_mappings)
mapped_data['work'] = work.remote_id
edition_activity = activitypub.Edition(**mapped_data)
edition = edition_activity.to_model(models.Edition)
edition.connector = self.connector
edition.save() edition.save()
work.default_edition = edition work.default_edition = edition
work.save() work.save()
# now's our chance to fill in author gaps for author in self.get_authors_from_data(edition_data):
edition.authors.add(author)
if not edition.authors.exists() and work.authors.exists(): if not edition.authors.exists() and work.authors.exists():
edition.authors.set(work.authors.all()) edition.authors.add(work.authors.all())
edition.author_text = work.author_text
edition.save()
if not edition:
raise ConnectorException('Unable to create book: %s' % remote_id)
return edition return edition
def create_book(self, remote_id, data, model): def get_or_create_author(self, remote_id):
''' create a work or edition from data ''' ''' load that author '''
book = model.objects.create( existing = models.Author.find_existing_by_remote_id(remote_id)
origin_id=remote_id, if existing:
title=data['title'], return existing
connector=self.connector,
)
return self.update_book_from_data(book, data)
data = get_data(remote_id)
def update_book_from_data(self, book, data, update_cover=True): mapped_data = dict_from_mappings(data, self.author_mappings)
''' for creating a new book or syncing with data ''' activity = activitypub.Author(**mapped_data)
book = update_from_mappings(book, data, self.book_mappings) # this will dedupe
return activity.to_model(models.Author)
author_text = []
for author in self.get_authors_from_data(data):
book.authors.add(author)
author_text.append(author.name)
book.author_text = ', '.join(author_text)
book.save()
if not update_cover:
return book
cover = self.get_cover_from_data(data)
if cover:
book.cover.save(*cover, save=True)
return book
def update_book(self, book, data=None):
''' load new data '''
if not book.sync and not book.sync_cover:
return book
if not data:
key = getattr(book, self.key_name)
data = self.load_book_data(key)
if book.sync:
book = self.update_book_from_data(
book, data, update_cover=book.sync_cover)
else:
cover = self.get_cover_from_data(data)
if cover:
book.cover.save(*cover, save=True)
return book
def match_from_mappings(self, data, model):
''' try to find existing copies of this book using various keys '''
relevent_mappings = [m for m in self.key_mappings if \
not m.model or model == m.model]
for mapping in relevent_mappings:
# check if this field is present in the data
value = data.get(mapping.remote_field)
if not value:
continue
# extract the value in the right format
value = mapping.formatter(value)
# search our database for a matching book
kwargs = {mapping.local_field: value}
match = model.objects.filter(**kwargs).first()
if match:
return match
return None
@abstractmethod
def get_remote_id_from_data(self, data):
''' otherwise we won't properly set the remote_id in the db '''
@abstractmethod @abstractmethod
def is_work_data(self, data): def is_work_data(self, data):
''' differentiate works and editions ''' ''' differentiate works and editions '''
@abstractmethod @abstractmethod
def get_edition_from_work_data(self, data): def get_edition_from_work_data(self, data):
''' every work needs at least one edition ''' ''' every work needs at least one edition '''
@abstractmethod @abstractmethod
def get_work_from_edition_date(self, data): def get_work_from_edition_date(self, data):
''' every edition needs a work ''' ''' every edition needs a work '''
@abstractmethod @abstractmethod
def get_authors_from_data(self, data): def get_authors_from_data(self, data):
''' load author data ''' ''' load author data '''
@abstractmethod
def get_cover_from_data(self, data):
''' load cover '''
@abstractmethod @abstractmethod
def expand_book_data(self, book): def expand_book_data(self, book):
''' get more info on a book ''' ''' get more info on a book '''
def update_from_mappings(obj, data, mappings): def dict_from_mappings(data, mappings):
''' assign data to model with mappings ''' ''' create a dict in Activitypub format, using mappings supplied by
the subclass '''
result = {}
for mapping in mappings: for mapping in mappings:
# check if this field is present in the data result[mapping.local_field] = mapping.get_value(data)
value = data.get(mapping.remote_field) return result
if not value:
continue
# extract the value in the right format
try:
value = mapping.formatter(value)
except:
continue
# assign the formatted value to the model
obj.__setattr__(mapping.local_field, value)
return obj
def get_date(date_string):
''' helper function to try to interpret dates '''
if not date_string:
return None
try:
return pytz.utc.localize(parser.parse(date_string))
except ValueError:
pass
try:
return parser.parse(date_string)
except ValueError:
return None
def get_data(url): def get_data(url):
@ -349,11 +237,19 @@ class SearchResult:
class Mapping: class Mapping:
''' associate a local database field with a field in an external dataset ''' ''' associate a local database field with a field in an external dataset '''
def __init__( def __init__(self, local_field, remote_field=None, formatter=None):
self, local_field, remote_field=None, formatter=None, model=None):
noop = lambda x: x noop = lambda x: x
self.local_field = local_field self.local_field = local_field
self.remote_field = remote_field or local_field self.remote_field = remote_field or local_field
self.formatter = formatter or noop self.formatter = formatter or noop
self.model = model
def get_value(self, data):
''' pull a field from incoming json and return the formatted version '''
value = data.get(self.remote_field)
if not value:
return None
try:
return self.formatter(value)
except:# pylint: disable=bare-except
return None

View file

@ -1,13 +1,9 @@
''' openlibrary data connector ''' ''' openlibrary data connector '''
import re import re
import requests
from django.core.files.base import ContentFile
from bookwyrm import models from bookwyrm import models
from .abstract_connector import AbstractConnector, SearchResult, Mapping from .abstract_connector import AbstractConnector, SearchResult, Mapping
from .abstract_connector import ConnectorException from .abstract_connector import ConnectorException, get_data
from .abstract_connector import get_date, get_data, update_from_mappings
from .openlibrary_languages import languages from .openlibrary_languages import languages
@ -17,62 +13,57 @@ class Connector(AbstractConnector):
super().__init__(identifier) super().__init__(identifier)
get_first = lambda a: a[0] get_first = lambda a: a[0]
self.key_mappings = [ get_remote_id = lambda a: self.base_url + a
Mapping('isbn_13', model=models.Edition, formatter=get_first), self.book_mappings = [
Mapping('isbn_10', model=models.Edition, formatter=get_first), Mapping('title'),
Mapping('lccn', model=models.Work, formatter=get_first), Mapping('id', remote_field='key', formatter=get_remote_id),
Mapping( Mapping(
'oclc_number', 'cover', remote_field='covers', formatter=self.get_cover_url),
remote_field='oclc_numbers', Mapping('sortTitle', remote_field='sort_title'),
model=models.Edition,
formatter=get_first
),
Mapping(
'openlibrary_key',
remote_field='key',
formatter=get_openlibrary_key
),
Mapping('goodreads_key'),
Mapping('asin'),
]
self.book_mappings = self.key_mappings + [
Mapping('sort_title'),
Mapping('subtitle'), Mapping('subtitle'),
Mapping('description', formatter=get_description), Mapping('description', formatter=get_description),
Mapping('languages', formatter=get_languages), Mapping('languages', formatter=get_languages),
Mapping('series', formatter=get_first), Mapping('series', formatter=get_first),
Mapping('series_number'), Mapping('seriesNumber', remote_field='series_number'),
Mapping('subjects'), Mapping('subjects'),
Mapping('subject_places'), Mapping('subjectPlaces'),
Mapping('isbn13', formatter=get_first),
Mapping('isbn10', formatter=get_first),
Mapping('lccn', formatter=get_first),
Mapping( Mapping(
'first_published_date', 'oclcNumber', remote_field='oclc_numbers',
remote_field='first_publish_date', formatter=get_first
formatter=get_date
), ),
Mapping( Mapping(
'published_date', 'openlibraryKey', remote_field='key',
remote_field='publish_date', formatter=get_openlibrary_key
formatter=get_date
), ),
Mapping('goodreadsKey', remote_field='goodreads_key'),
Mapping('asin'),
Mapping( Mapping(
'pages', 'firstPublishedDate', remote_field='first_publish_date',
model=models.Edition,
remote_field='number_of_pages'
), ),
Mapping('physical_format', model=models.Edition), Mapping('publishedDate', remote_field='publish_date'),
Mapping('pages', remote_field='number_of_pages'),
Mapping('physicalFormat', remote_field='physical_format'),
Mapping('publishers'), Mapping('publishers'),
] ]
self.author_mappings = [ self.author_mappings = [
Mapping('id', remote_field='key', formatter=get_remote_id),
Mapping('name'), Mapping('name'),
Mapping('born', remote_field='birth_date', formatter=get_date), Mapping(
Mapping('died', remote_field='death_date', formatter=get_date), 'openlibraryKey', remote_field='key',
formatter=get_openlibrary_key
),
Mapping('born', remote_field='birth_date'),
Mapping('died', remote_field='death_date'),
Mapping('bio', formatter=get_description), Mapping('bio', formatter=get_description),
] ]
def get_remote_id_from_data(self, data): def get_remote_id_from_data(self, data):
''' format a url from an openlibrary id field '''
try: try:
key = data['key'] key = data['key']
except KeyError: except KeyError:
@ -107,24 +98,17 @@ class Connector(AbstractConnector):
''' parse author json and load or create authors ''' ''' parse author json and load or create authors '''
for author_blob in data.get('authors', []): for author_blob in data.get('authors', []):
author_blob = author_blob.get('author', author_blob) author_blob = author_blob.get('author', author_blob)
# this id is "/authors/OL1234567A" and we want just "OL1234567A" # this id is "/authors/OL1234567A"
author_id = author_blob['key'].split('/')[-1] author_id = author_blob['key']
yield self.get_or_create_author(author_id) url = '%s/%s.json' % (self.base_url, author_id)
yield self.get_or_create_author(url)
def get_cover_from_data(self, data): def get_cover_url(self, cover_blob):
''' ask openlibrary for the cover ''' ''' ask openlibrary for the cover '''
if not data.get('covers'): cover_id = cover_blob[0]
return None
cover_id = data.get('covers')[0]
image_name = '%s-M.jpg' % cover_id image_name = '%s-M.jpg' % cover_id
url = '%s/b/id/%s' % (self.covers_url, image_name) return '%s/b/id/%s' % (self.covers_url, image_name)
response = requests.get(url)
if not response.ok:
response.raise_for_status()
image_content = ContentFile(response.content)
return [image_name, image_content]
def parse_search_data(self, data): def parse_search_data(self, data):
@ -158,37 +142,7 @@ class Connector(AbstractConnector):
# we can mass download edition data from OL to avoid repeatedly querying # we can mass download edition data from OL to avoid repeatedly querying
edition_options = self.load_edition_data(work.openlibrary_key) edition_options = self.load_edition_data(work.openlibrary_key)
for edition_data in edition_options.get('entries'): for edition_data in edition_options.get('entries'):
olkey = edition_data.get('key').split('/')[-1] self.create_edition_from_data(work, edition_data)
# make sure the edition isn't already in the database
if models.Edition.objects.filter(openlibrary_key=olkey).count():
continue
# creates and populates the book from the data
edition = self.create_book(olkey, edition_data, models.Edition)
# ensures that the edition is associated with the work
edition.parent_work = work
edition.save()
# get author data from the work if it's missing from the edition
if not edition.authors and work.authors:
edition.authors.set(work.authors.all())
def get_or_create_author(self, olkey):
''' load that author '''
if not re.match(r'^OL\d+A$', olkey):
raise ValueError('Invalid OpenLibrary author ID')
author = models.Author.objects.filter(openlibrary_key=olkey).first()
if author:
return author
url = '%s/authors/%s.json' % (self.base_url, olkey)
data = get_data(url)
author = models.Author(openlibrary_key=olkey)
author = update_from_mappings(author, data, self.author_mappings)
author.save()
return author
def get_description(description_blob): def get_description(description_blob):

View file

@ -30,10 +30,10 @@ class AbstractConnector(TestCase):
'series': ['one', 'two'], 'series': ['one', 'two'],
} }
self.connector.key_mappings = [ self.connector.key_mappings = [
Mapping('isbn_10', model=models.Edition), Mapping('isbn_10'),
Mapping('isbn_13'), Mapping('isbn_13'),
Mapping('lccn', model=models.Work), Mapping('lccn'),
Mapping('asin', remote_field='ASIN'), Mapping('asin'),
] ]
@ -41,7 +41,6 @@ class AbstractConnector(TestCase):
mapping = Mapping('isbn') mapping = Mapping('isbn')
self.assertEqual(mapping.local_field, 'isbn') self.assertEqual(mapping.local_field, 'isbn')
self.assertEqual(mapping.remote_field, 'isbn') self.assertEqual(mapping.remote_field, 'isbn')
self.assertEqual(mapping.model, None)
self.assertEqual(mapping.formatter('bb'), 'bb') self.assertEqual(mapping.formatter('bb'), 'bb')
@ -49,7 +48,6 @@ class AbstractConnector(TestCase):
mapping = Mapping('isbn', remote_field='isbn13') mapping = Mapping('isbn', remote_field='isbn13')
self.assertEqual(mapping.local_field, 'isbn') self.assertEqual(mapping.local_field, 'isbn')
self.assertEqual(mapping.remote_field, 'isbn13') self.assertEqual(mapping.remote_field, 'isbn13')
self.assertEqual(mapping.model, None)
self.assertEqual(mapping.formatter('bb'), 'bb') self.assertEqual(mapping.formatter('bb'), 'bb')
@ -59,40 +57,4 @@ class AbstractConnector(TestCase):
self.assertEqual(mapping.local_field, 'isbn') self.assertEqual(mapping.local_field, 'isbn')
self.assertEqual(mapping.remote_field, 'isbn') self.assertEqual(mapping.remote_field, 'isbn')
self.assertEqual(mapping.formatter, formatter) self.assertEqual(mapping.formatter, formatter)
self.assertEqual(mapping.model, None)
self.assertEqual(mapping.formatter('bb'), 'aabb') self.assertEqual(mapping.formatter('bb'), 'aabb')
def test_match_from_mappings(self):
edition = models.Edition.objects.create(
title='Blah',
isbn_13='blahhh',
)
match = self.connector.match_from_mappings(self.data, models.Edition)
self.assertEqual(match, edition)
def test_match_from_mappings_with_model(self):
edition = models.Edition.objects.create(
title='Blah',
isbn_10='1234567890',
)
match = self.connector.match_from_mappings(self.data, models.Edition)
self.assertEqual(match, edition)
def test_match_from_mappings_with_remote(self):
edition = models.Edition.objects.create(
title='Blah',
asin='A00BLAH',
)
match = self.connector.match_from_mappings(self.data, models.Edition)
self.assertEqual(match, edition)
def test_match_from_mappings_no_match(self):
edition = models.Edition.objects.create(
title='Blah',
)
match = self.connector.match_from_mappings(self.data, models.Edition)
self.assertEqual(match, None)

View file

@ -1,15 +1,16 @@
''' testing book data connectors ''' ''' testing book data connectors '''
from dateutil import parser
from django.test import TestCase
import json import json
import pathlib import pathlib
from dateutil import parser
from django.test import TestCase
import pytz import pytz
from bookwyrm import models from bookwyrm import models
from bookwyrm.connectors.openlibrary import Connector from bookwyrm.connectors.openlibrary import Connector
from bookwyrm.connectors.openlibrary import get_languages, get_description from bookwyrm.connectors.openlibrary import get_languages, get_description
from bookwyrm.connectors.openlibrary import pick_default_edition, get_openlibrary_key from bookwyrm.connectors.openlibrary import pick_default_edition, \
from bookwyrm.connectors.abstract_connector import SearchResult, get_date get_openlibrary_key
from bookwyrm.connectors.abstract_connector import SearchResult
class Openlibrary(TestCase): class Openlibrary(TestCase):
@ -67,12 +68,6 @@ class Openlibrary(TestCase):
self.assertEqual(description, expected) self.assertEqual(description, expected)
def test_get_date(self):
date = get_date(self.work_data['first_publish_date'])
expected = pytz.utc.localize(parser.parse('1995'))
self.assertEqual(date, expected)
def test_get_languages(self): def test_get_languages(self):
languages = get_languages(self.edition_data['languages']) languages = get_languages(self.edition_data['languages'])
self.assertEqual(languages, ['English']) self.assertEqual(languages, ['English'])
@ -81,4 +76,3 @@ class Openlibrary(TestCase):
def test_get_ol_key(self): def test_get_ol_key(self):
key = get_openlibrary_key('/books/OL27320736M') key = get_openlibrary_key('/books/OL27320736M')
self.assertEqual(key, 'OL27320736M') self.assertEqual(key, 'OL27320736M')

View file

@ -223,6 +223,8 @@ def resolve_book(request):
remote_id = request.POST.get('remote_id') remote_id = request.POST.get('remote_id')
connector = books_manager.get_or_create_connector(remote_id) connector = books_manager.get_or_create_connector(remote_id)
book = connector.get_or_create_book(remote_id) book = connector.get_or_create_book(remote_id)
if book.connector:
books_manager.load_more_data.delay(book.id)
return redirect('/book/%d' % book.id) return redirect('/book/%d' % book.id)