Merge pull request #408 from mouse-reeve/openlibrary-connector

Update openlibrary connector to use activitypub serializers
This commit is contained in:
Mouse Reeve 2020-12-19 17:00:02 -08:00 committed by GitHub
commit 27ec80515e
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 120 additions and 312 deletions

View file

@ -50,7 +50,7 @@ class Work(Book):
''' work instance of a book object '''
lccn: str = ''
defaultEdition: str = ''
editions: List[str]
editions: List[str] = field(default_factory=lambda: [])
type: str = 'Work'
@ -58,9 +58,9 @@ class Work(Book):
class Author(ActivityObject):
''' author of a book '''
name: str
born: str = ''
died: str = ''
aliases: str = ''
born: str = None
died: str = None
aliases: List[str] = field(default_factory=lambda: [])
bio: str = ''
openlibraryKey: str = ''
wikipediaLink: str = ''

View file

@ -1,16 +1,14 @@
''' functionality outline for a book data connector '''
from abc import ABC, abstractmethod
from dataclasses import dataclass
import pytz
from urllib3.exceptions import RequestError
from django.db import transaction
from dateutil import parser
import requests
from requests import HTTPError
from requests.exceptions import SSLError
from bookwyrm import models
from bookwyrm import activitypub, models
class ConnectorException(HTTPError):
@ -38,7 +36,7 @@ class AbstractMinimalConnector(ABC):
for field in self_fields:
setattr(self, field, getattr(info, field))
def search(self, query, min_confidence=None):
def search(self, query, min_confidence=None):# pylint: disable=unused-argument
''' free text search '''
resp = requests.get(
'%s%s' % (self.search_url, query),
@ -72,9 +70,6 @@ class AbstractConnector(AbstractMinimalConnector):
''' generic book data connector '''
def __init__(self, identifier):
super().__init__(identifier)
self.key_mappings = []
# fields we want to look for in book data to copy over
# title we handle separately.
self.book_mappings = []
@ -88,217 +83,110 @@ class AbstractConnector(AbstractMinimalConnector):
return True
@transaction.atomic
def get_or_create_book(self, remote_id):
# try to load the book
book = models.Book.objects.select_subclasses().filter(
origin_id=remote_id
).first()
if book:
if isinstance(book, models.Work):
return book.default_edition
return book
''' translate arbitrary json into an Activitypub dataclass '''
# first, check if we have the origin_id saved
existing = models.Edition.find_existing_by_remote_id(remote_id) or \
models.Work.find_existing_by_remote_id(remote_id)
if existing:
if hasattr(existing, 'get_default_editon'):
return existing.get_default_editon()
return existing
# no book was found, so we start creating a new one
# load the json
data = get_data(remote_id)
work = None
edition = None
mapped_data = dict_from_mappings(data, self.book_mappings)
if self.is_work_data(data):
work_data = data
# if we requested a work and there's already an edition, we're set
work = self.match_from_mappings(work_data, models.Work)
if work and work.default_edition:
return work.default_edition
# no such luck, we need more information.
try:
edition_data = self.get_edition_from_work_data(work_data)
edition_data = self.get_edition_from_work_data(data)
except KeyError:
# hack: re-use the work data as the edition data
# this is why remote ids aren't necessarily unique
edition_data = data
work_data = mapped_data
else:
edition_data = data
edition = self.match_from_mappings(edition_data, models.Edition)
# no need to figure out about the work if we already know about it
if edition and edition.parent_work:
return edition
# no such luck, we need more information.
try:
work_data = self.get_work_from_edition_date(edition_data)
work_data = self.get_work_from_edition_data(data)
work_data = dict_from_mappings(work_data, self.book_mappings)
except KeyError:
# remember this hack: re-use the work data as the edition data
work_data = data
work_data = mapped_data
edition_data = data
if not work_data or not edition_data:
raise ConnectorException('Unable to load book data: %s' % remote_id)
# at this point, we need to figure out the work, edition, or both
# atomic so that we don't save a work with no edition for vice versa
with transaction.atomic():
if not work:
work_key = self.get_remote_id_from_data(work_data)
work = self.create_book(work_key, work_data, models.Work)
# create activitypub object
work_activity = activitypub.Work(**work_data)
# this will dedupe automatically
work = work_activity.to_model(models.Work)
for author in self.get_authors_from_data(data):
work.authors.add(author)
return self.create_edition_from_data(work, edition_data)
if not edition:
ed_key = self.get_remote_id_from_data(edition_data)
edition = self.create_book(ed_key, edition_data, models.Edition)
edition.parent_work = work
edition.save()
work.default_edition = edition
work.save()
# now's our change to fill in author gaps
def create_edition_from_data(self, work, edition_data):
''' if we already have the work, we're ready '''
mapped_data = dict_from_mappings(edition_data, self.book_mappings)
mapped_data['work'] = work.remote_id
edition_activity = activitypub.Edition(**mapped_data)
edition = edition_activity.to_model(models.Edition)
edition.connector = self.connector
edition.save()
work.default_edition = edition
work.save()
for author in self.get_authors_from_data(edition_data):
edition.authors.add(author)
if not edition.authors.exists() and work.authors.exists():
edition.authors.set(work.authors.all())
edition.author_text = work.author_text
edition.save()
if not edition:
raise ConnectorException('Unable to create book: %s' % remote_id)
edition.authors.add(work.authors.all())
return edition
def create_book(self, remote_id, data, model):
''' create a work or edition from data '''
book = model.objects.create(
origin_id=remote_id,
title=data['title'],
connector=self.connector,
)
return self.update_book_from_data(book, data)
def get_or_create_author(self, remote_id):
''' load that author '''
existing = models.Author.find_existing_by_remote_id(remote_id)
if existing:
return existing
data = get_data(remote_id)
def update_book_from_data(self, book, data, update_cover=True):
''' for creating a new book or syncing with data '''
book = update_from_mappings(book, data, self.book_mappings)
author_text = []
for author in self.get_authors_from_data(data):
book.authors.add(author)
author_text.append(author.name)
book.author_text = ', '.join(author_text)
book.save()
if not update_cover:
return book
cover = self.get_cover_from_data(data)
if cover:
book.cover.save(*cover, save=True)
return book
def update_book(self, book, data=None):
''' load new data '''
if not book.sync and not book.sync_cover:
return book
if not data:
key = getattr(book, self.key_name)
data = self.load_book_data(key)
if book.sync:
book = self.update_book_from_data(
book, data, update_cover=book.sync_cover)
else:
cover = self.get_cover_from_data(data)
if cover:
book.cover.save(*cover, save=True)
return book
def match_from_mappings(self, data, model):
''' try to find existing copies of this book using various keys '''
relevent_mappings = [m for m in self.key_mappings if \
not m.model or model == m.model]
for mapping in relevent_mappings:
# check if this field is present in the data
value = data.get(mapping.remote_field)
if not value:
continue
# extract the value in the right format
value = mapping.formatter(value)
# search our database for a matching book
kwargs = {mapping.local_field: value}
match = model.objects.filter(**kwargs).first()
if match:
return match
return None
@abstractmethod
def get_remote_id_from_data(self, data):
''' otherwise we won't properly set the remote_id in the db '''
mapped_data = dict_from_mappings(data, self.author_mappings)
activity = activitypub.Author(**mapped_data)
# this will dedupe
return activity.to_model(models.Author)
@abstractmethod
def is_work_data(self, data):
''' differentiate works and editions '''
@abstractmethod
def get_edition_from_work_data(self, data):
''' every work needs at least one edition '''
@abstractmethod
def get_work_from_edition_date(self, data):
''' every edition needs a work '''
@abstractmethod
def get_authors_from_data(self, data):
''' load author data '''
@abstractmethod
def get_cover_from_data(self, data):
''' load cover '''
@abstractmethod
def expand_book_data(self, book):
''' get more info on a book '''
def update_from_mappings(obj, data, mappings):
''' assign data to model with mappings '''
def dict_from_mappings(data, mappings):
''' create a dict in Activitypub format, using mappings supplies by
the subclass '''
result = {}
for mapping in mappings:
# check if this field is present in the data
value = data.get(mapping.remote_field)
if not value:
continue
# extract the value in the right format
try:
value = mapping.formatter(value)
except:
continue
# assign the formatted value to the model
obj.__setattr__(mapping.local_field, value)
return obj
def get_date(date_string):
''' helper function to try to interpret dates '''
if not date_string:
return None
try:
return pytz.utc.localize(parser.parse(date_string))
except ValueError:
pass
try:
return parser.parse(date_string)
except ValueError:
return None
result[mapping.local_field] = mapping.get_value(data)
return result
def get_data(url):
@ -349,11 +237,19 @@ class SearchResult:
class Mapping:
''' associate a local database field with a field in an external dataset '''
def __init__(
self, local_field, remote_field=None, formatter=None, model=None):
def __init__(self, local_field, remote_field=None, formatter=None):
noop = lambda x: x
self.local_field = local_field
self.remote_field = remote_field or local_field
self.formatter = formatter or noop
self.model = model
def get_value(self, data):
''' pull a field from incoming json and return the formatted version '''
value = data.get(self.remote_field)
if not value:
return None
try:
return self.formatter(value)
except:# pylint: disable=bare-except
return None

View file

@ -1,13 +1,9 @@
''' openlibrary data connector '''
import re
import requests
from django.core.files.base import ContentFile
from bookwyrm import models
from .abstract_connector import AbstractConnector, SearchResult, Mapping
from .abstract_connector import ConnectorException
from .abstract_connector import get_date, get_data, update_from_mappings
from .abstract_connector import ConnectorException, get_data
from .openlibrary_languages import languages
@ -17,62 +13,57 @@ class Connector(AbstractConnector):
super().__init__(identifier)
get_first = lambda a: a[0]
self.key_mappings = [
Mapping('isbn_13', model=models.Edition, formatter=get_first),
Mapping('isbn_10', model=models.Edition, formatter=get_first),
Mapping('lccn', model=models.Work, formatter=get_first),
get_remote_id = lambda a: self.base_url + a
self.book_mappings = [
Mapping('title'),
Mapping('id', remote_field='key', formatter=get_remote_id),
Mapping(
'oclc_number',
remote_field='oclc_numbers',
model=models.Edition,
formatter=get_first
),
Mapping(
'openlibrary_key',
remote_field='key',
formatter=get_openlibrary_key
),
Mapping('goodreads_key'),
Mapping('asin'),
]
self.book_mappings = self.key_mappings + [
Mapping('sort_title'),
'cover', remote_field='covers', formatter=self.get_cover_url),
Mapping('sortTitle', remote_field='sort_title'),
Mapping('subtitle'),
Mapping('description', formatter=get_description),
Mapping('languages', formatter=get_languages),
Mapping('series', formatter=get_first),
Mapping('series_number'),
Mapping('seriesNumber', remote_field='series_number'),
Mapping('subjects'),
Mapping('subject_places'),
Mapping('subjectPlaces'),
Mapping('isbn13', formatter=get_first),
Mapping('isbn10', formatter=get_first),
Mapping('lccn', formatter=get_first),
Mapping(
'first_published_date',
remote_field='first_publish_date',
formatter=get_date
'oclcNumber', remote_field='oclc_numbers',
formatter=get_first
),
Mapping(
'published_date',
remote_field='publish_date',
formatter=get_date
'openlibraryKey', remote_field='key',
formatter=get_openlibrary_key
),
Mapping('goodreadsKey', remote_field='goodreads_key'),
Mapping('asin'),
Mapping(
'pages',
model=models.Edition,
remote_field='number_of_pages'
'firstPublishedDate', remote_field='first_publish_date',
),
Mapping('physical_format', model=models.Edition),
Mapping('publishedDate', remote_field='publish_date'),
Mapping('pages', remote_field='number_of_pages'),
Mapping('physicalFormat', remote_field='physical_format'),
Mapping('publishers'),
]
self.author_mappings = [
Mapping('id', remote_field='key', formatter=get_remote_id),
Mapping('name'),
Mapping('born', remote_field='birth_date', formatter=get_date),
Mapping('died', remote_field='death_date', formatter=get_date),
Mapping(
'openlibraryKey', remote_field='key',
formatter=get_openlibrary_key
),
Mapping('born', remote_field='birth_date'),
Mapping('died', remote_field='death_date'),
Mapping('bio', formatter=get_description),
]
def get_remote_id_from_data(self, data):
''' format a url from an openlibrary id field '''
try:
key = data['key']
except KeyError:
@ -107,24 +98,17 @@ class Connector(AbstractConnector):
''' parse author json and load or create authors '''
for author_blob in data.get('authors', []):
author_blob = author_blob.get('author', author_blob)
# this id is "/authors/OL1234567A" and we want just "OL1234567A"
author_id = author_blob['key'].split('/')[-1]
yield self.get_or_create_author(author_id)
# this id is "/authors/OL1234567A"
author_id = author_blob['key']
url = '%s/%s.json' % (self.base_url, author_id)
yield self.get_or_create_author(url)
def get_cover_from_data(self, data):
def get_cover_url(self, cover_blob):
''' ask openlibrary for the cover '''
if not data.get('covers'):
return None
cover_id = data.get('covers')[0]
cover_id = cover_blob[0]
image_name = '%s-M.jpg' % cover_id
url = '%s/b/id/%s' % (self.covers_url, image_name)
response = requests.get(url)
if not response.ok:
response.raise_for_status()
image_content = ContentFile(response.content)
return [image_name, image_content]
return '%s/b/id/%s' % (self.covers_url, image_name)
def parse_search_data(self, data):
@ -158,37 +142,7 @@ class Connector(AbstractConnector):
# we can mass download edition data from OL to avoid repeatedly querying
edition_options = self.load_edition_data(work.openlibrary_key)
for edition_data in edition_options.get('entries'):
olkey = edition_data.get('key').split('/')[-1]
# make sure the edition isn't already in the database
if models.Edition.objects.filter(openlibrary_key=olkey).count():
continue
# creates and populates the book from the data
edition = self.create_book(olkey, edition_data, models.Edition)
# ensures that the edition is associated with the work
edition.parent_work = work
edition.save()
# get author data from the work if it's missing from the edition
if not edition.authors and work.authors:
edition.authors.set(work.authors.all())
def get_or_create_author(self, olkey):
''' load that author '''
if not re.match(r'^OL\d+A$', olkey):
raise ValueError('Invalid OpenLibrary author ID')
author = models.Author.objects.filter(openlibrary_key=olkey).first()
if author:
return author
url = '%s/authors/%s.json' % (self.base_url, olkey)
data = get_data(url)
author = models.Author(openlibrary_key=olkey)
author = update_from_mappings(author, data, self.author_mappings)
author.save()
return author
self.create_edition_from_data(work, edition_data)
def get_description(description_blob):

View file

@ -30,10 +30,10 @@ class AbstractConnector(TestCase):
'series': ['one', 'two'],
}
self.connector.key_mappings = [
Mapping('isbn_10', model=models.Edition),
Mapping('isbn_10'),
Mapping('isbn_13'),
Mapping('lccn', model=models.Work),
Mapping('asin', remote_field='ASIN'),
Mapping('lccn'),
Mapping('asin'),
]
@ -41,7 +41,6 @@ class AbstractConnector(TestCase):
mapping = Mapping('isbn')
self.assertEqual(mapping.local_field, 'isbn')
self.assertEqual(mapping.remote_field, 'isbn')
self.assertEqual(mapping.model, None)
self.assertEqual(mapping.formatter('bb'), 'bb')
@ -49,7 +48,6 @@ class AbstractConnector(TestCase):
mapping = Mapping('isbn', remote_field='isbn13')
self.assertEqual(mapping.local_field, 'isbn')
self.assertEqual(mapping.remote_field, 'isbn13')
self.assertEqual(mapping.model, None)
self.assertEqual(mapping.formatter('bb'), 'bb')
@ -59,40 +57,4 @@ class AbstractConnector(TestCase):
self.assertEqual(mapping.local_field, 'isbn')
self.assertEqual(mapping.remote_field, 'isbn')
self.assertEqual(mapping.formatter, formatter)
self.assertEqual(mapping.model, None)
self.assertEqual(mapping.formatter('bb'), 'aabb')
def test_match_from_mappings(self):
edition = models.Edition.objects.create(
title='Blah',
isbn_13='blahhh',
)
match = self.connector.match_from_mappings(self.data, models.Edition)
self.assertEqual(match, edition)
def test_match_from_mappings_with_model(self):
edition = models.Edition.objects.create(
title='Blah',
isbn_10='1234567890',
)
match = self.connector.match_from_mappings(self.data, models.Edition)
self.assertEqual(match, edition)
def test_match_from_mappings_with_remote(self):
edition = models.Edition.objects.create(
title='Blah',
asin='A00BLAH',
)
match = self.connector.match_from_mappings(self.data, models.Edition)
self.assertEqual(match, edition)
def test_match_from_mappings_no_match(self):
edition = models.Edition.objects.create(
title='Blah',
)
match = self.connector.match_from_mappings(self.data, models.Edition)
self.assertEqual(match, None)

View file

@ -1,15 +1,16 @@
''' testing book data connectors '''
from dateutil import parser
from django.test import TestCase
import json
import pathlib
from dateutil import parser
from django.test import TestCase
import pytz
from bookwyrm import models
from bookwyrm.connectors.openlibrary import Connector
from bookwyrm.connectors.openlibrary import get_languages, get_description
from bookwyrm.connectors.openlibrary import pick_default_edition, get_openlibrary_key
from bookwyrm.connectors.abstract_connector import SearchResult, get_date
from bookwyrm.connectors.openlibrary import pick_default_edition, \
get_openlibrary_key
from bookwyrm.connectors.abstract_connector import SearchResult
class Openlibrary(TestCase):
@ -67,12 +68,6 @@ class Openlibrary(TestCase):
self.assertEqual(description, expected)
def test_get_date(self):
date = get_date(self.work_data['first_publish_date'])
expected = pytz.utc.localize(parser.parse('1995'))
self.assertEqual(date, expected)
def test_get_languages(self):
languages = get_languages(self.edition_data['languages'])
self.assertEqual(languages, ['English'])
@ -81,4 +76,3 @@ class Openlibrary(TestCase):
def test_get_ol_key(self):
key = get_openlibrary_key('/books/OL27320736M')
self.assertEqual(key, 'OL27320736M')

View file

@ -223,6 +223,8 @@ def resolve_book(request):
remote_id = request.POST.get('remote_id')
connector = books_manager.get_or_create_connector(remote_id)
book = connector.get_or_create_book(remote_id)
if book.connector:
books_manager.load_more_data.delay(book.id)
return redirect('/book/%d' % book.id)