2020-03-07 20:22:28 +00:00
|
|
|
''' openlibrary data connector '''
|
|
|
|
import re
|
2020-04-22 13:53:22 +00:00
|
|
|
|
2020-09-21 15:10:37 +00:00
|
|
|
from bookwyrm import models
|
2020-05-10 23:41:24 +00:00
|
|
|
from .abstract_connector import AbstractConnector, SearchResult, Mapping
|
2021-01-02 16:38:27 +00:00
|
|
|
from .abstract_connector import get_data
|
|
|
|
from .connector_manager import ConnectorException
|
2020-03-30 20:15:49 +00:00
|
|
|
from .openlibrary_languages import languages
|
2020-03-07 20:22:28 +00:00
|
|
|
|
|
|
|
|
2020-03-27 22:25:08 +00:00
|
|
|
class Connector(AbstractConnector):
    ''' instantiate a connector for OL '''

    def __init__(self, identifier):
        ''' declare how openlibrary json fields map onto book/author fields

        NOTE(review): base_url, books_url and covers_url are read by the
        methods of this class but never assigned here -- presumably the
        parent constructor provides them; confirm in AbstractConnector. '''
        super().__init__(identifier)

        def get_first(values):
            ''' take the first item of a list-valued field '''
            return values[0]

        def get_remote_id(key):
            ''' prefix an openlibrary key with this connector's base url '''
            return self.base_url + key

        # each Mapping names a field, optionally the openlibrary field it is
        # read from (remote_field), and a formatter applied to the raw value
        self.book_mappings = [
            Mapping('title'),
            Mapping('id', remote_field='key', formatter=get_remote_id),
            Mapping(
                'cover', remote_field='covers', formatter=self.get_cover_url),
            Mapping('sortTitle', remote_field='sort_title'),
            Mapping('subtitle'),
            Mapping('description', formatter=get_description),
            Mapping('languages', formatter=get_languages),
            Mapping('series', formatter=get_first),
            Mapping('seriesNumber', remote_field='series_number'),
            Mapping('subjects'),
            Mapping('subjectPlaces', remote_field='subject_places'),
            Mapping('isbn13', remote_field='isbn_13', formatter=get_first),
            Mapping('isbn10', remote_field='isbn_10', formatter=get_first),
            Mapping('lccn', formatter=get_first),
            Mapping(
                'oclcNumber', remote_field='oclc_numbers',
                formatter=get_first
            ),
            Mapping(
                'openlibraryKey', remote_field='key',
                formatter=get_openlibrary_key
            ),
            Mapping('goodreadsKey', remote_field='goodreads_key'),
            Mapping('asin'),
            Mapping(
                'firstPublishedDate', remote_field='first_publish_date',
            ),
            Mapping('publishedDate', remote_field='publish_date'),
            Mapping('pages', remote_field='number_of_pages'),
            Mapping('physicalFormat', remote_field='physical_format'),
            Mapping('publishers'),
        ]

        self.author_mappings = [
            Mapping('id', remote_field='key', formatter=get_remote_id),
            Mapping('name'),
            Mapping(
                'openlibraryKey', remote_field='key',
                formatter=get_openlibrary_key
            ),
            Mapping('born', remote_field='birth_date'),
            Mapping('died', remote_field='death_date'),
            Mapping('bio', formatter=get_description),
        ]

    def get_remote_id_from_data(self, data):
        ''' format a url from an openlibrary id field

        raises ConnectorException when data has no 'key' entry '''
        try:
            key = data['key']
        except KeyError as err:
            # keep the original KeyError attached as the cause
            raise ConnectorException('Invalid book data') from err
        return '%s%s' % (self.books_url, key)

    def is_work_data(self, data):
        ''' does this json describe a work (rather than an edition)?

        true when data['key'] is made of slashes/word characters and ends
        in OL<digits>W, e.g. "/works/OL1234W"; a missing 'key' raises
        KeyError '''
        return bool(re.match(r'^[\/\w]+OL\d+W$', data['key']))

    def get_edition_from_work_data(self, data):
        ''' fetch the editions of a work and pick the default one

        raises ConnectorException when data has no 'key' entry; returns
        whatever pick_default_edition selects from the response's
        'entries' (None when that list is empty) '''
        try:
            key = data['key']
        except KeyError as err:
            raise ConnectorException('Invalid book data') from err
        url = '%s%s/editions' % (self.books_url, key)
        data = get_data(url)
        return pick_default_edition(data['entries'])

    def get_work_from_edition_data(self, data):
        ''' fetch the json for the first work listed on an edition

        raises ConnectorException when the edition lists no work '''
        try:
            key = data['works'][0]['key']
        except (IndexError, KeyError) as err:
            raise ConnectorException('No work found for edition') from err
        url = '%s%s' % (self.books_url, key)
        return get_data(url)

    def get_authors_from_data(self, data):
        ''' parse author json and load or create authors

        generator: yields the result of get_or_create_author for each
        entry under data['authors'] '''
        for author_blob in data.get('authors', []):
            # entries may nest the author under an 'author' key; otherwise
            # the entry itself is used
            author_blob = author_blob.get('author', author_blob)
            # this id is "/authors/OL1234567A"
            author_id = author_blob['key']
            url = '%s%s' % (self.base_url, author_id)
            yield self.get_or_create_author(url)

    def get_cover_url(self, cover_blob):
        ''' ask openlibrary for the cover

        builds "<covers_url>/b/id/<id>-L.jpg" from the first cover id;
        returns None when there is no cover id to use '''
        if not cover_blob:
            # nothing to index into -- no cover rather than an IndexError
            return None
        cover_id = cover_blob[0]
        image_name = '%s-L.jpg' % cover_id
        return '%s/b/id/%s' % (self.covers_url, image_name)

    def parse_search_data(self, data):
        ''' pull the list of results out of a search response

        a response without 'docs' yields an empty list so callers can
        always iterate or slice the result '''
        return data.get('docs') or []

    def format_search_result(self, search_result):
        ''' turn one search result entry into a SearchResult '''
        # build the remote id from the openlibrary key
        key = self.books_url + search_result['key']
        # missing or empty author lists are shown as a single 'Unknown'
        author = search_result.get('author_name') or ['Unknown']
        return SearchResult(
            title=search_result.get('title'),
            key=key,
            author=', '.join(author),
            connector=self,
            year=search_result.get('first_publish_year'),
        )

    def load_edition_data(self, olkey):
        ''' query openlibrary for editions of a work '''
        url = '%s/works/%s/editions' % (self.books_url, olkey)
        return get_data(url)

    def expand_book_data(self, book):
        ''' create the other editions of a book's work

        accepts a work or an edition; failures to load the edition list
        are swallowed on purpose (best effort), in which case nothing is
        created '''
        work = book
        # go from the edition to the work, if necessary
        if isinstance(book, models.Edition):
            work = book.parent_work

        # we can mass download edition data from OL to avoid repeatedly querying
        try:
            edition_options = self.load_edition_data(work.openlibrary_key)
        except ConnectorException:
            # who knows, man
            return

        # a response without 'entries' means there is nothing to add
        for edition_data in edition_options.get('entries') or []:
            # does this edition have ANY interesting data?
            if ignore_edition(edition_data):
                continue
            self.create_edition_from_data(work, edition_data)
|
2020-03-30 19:21:04 +00:00
|
|
|
|
|
|
|
|
2021-01-31 01:19:01 +00:00
|
|
|
def ignore_edition(edition_data):
    ''' don't load a million editions that have no metadata

    returns False (keep the edition) as soon as one interesting field is
    found, True (skip it) when none is '''
    # an isbn, we love to see it
    if edition_data.get('isbn_13') or edition_data.get('isbn_10'):
        return False
    # grudgingly, oclc can stay
    if edition_data.get('oclc_numbers'):
        return False
    # if it has a cover it can stay
    if edition_data.get('covers'):
        return False
    # keep non-english editions: a language is listed and the stringified
    # blob does not mention 'languages/eng'
    language_blob = edition_data.get('languages')
    if language_blob and 'languages/eng' not in str(language_blob):
        return False
    return True
|
|
|
|
|
|
|
|
|
2020-03-28 04:28:52 +00:00
|
|
|
def get_description(description_blob):
    ''' unwrap a description that may be plain text or a dict

    dicts give up their 'value' entry (None when absent); anything else
    is handed back untouched '''
    is_wrapped = isinstance(description_blob, dict)
    return description_blob.get('value') if is_wrapped else description_blob
|
2020-03-28 04:28:52 +00:00
|
|
|
|
2020-03-30 20:15:49 +00:00
|
|
|
|
2020-05-10 23:41:24 +00:00
|
|
|
def get_openlibrary_key(key):
    ''' keep only what follows the last slash:
    /books/OL27320736M becomes OL27320736M '''
    _, _, identifier = key.rpartition('/')
    return identifier
|
|
|
|
|
|
|
|
|
2020-03-30 20:15:49 +00:00
|
|
|
def get_languages(language_blob):
    ''' look up each entry's 'key' in the languages table

    the result has one item per entry, in order; an entry whose key is
    missing from the table contributes None '''
    entry_keys = (entry.get('key', '') for entry in language_blob)
    return [languages.get(entry_key, None) for entry_key in entry_keys]
|
|
|
|
|
|
|
|
|
2020-04-06 00:00:01 +00:00
|
|
|
def pick_default_edition(options):
    ''' choose one edition, favoring physical english copies with covers

    returns None when there is nothing to choose from. each preference
    below narrows the candidates in turn, but a preference that no
    candidate satisfies is skipped rather than emptying the list '''
    if not options:
        return None
    if len(options) == 1:
        return options[0]

    formats = ['paperback', 'hardcover', 'mass market paperback']
    # ordered from most to least important
    preferences = (
        lambda e: e.get('covers'),
        lambda e: '/languages/eng' in str(e.get('languages')),
        lambda e: str(e.get('physical_format')).lower() in formats,
        lambda e: e.get('isbn_13'),
        lambda e: e.get('ocaid'),
    )

    candidates = options
    for is_preferred in preferences:
        narrowed = [e for e in candidates if is_preferred(e)]
        if narrowed:
            candidates = narrowed
    return candidates[0]
|