bookwyrm/bookwyrm/connectors/openlibrary.py

''' openlibrary data connector '''
import re

from bookwyrm import models
from .abstract_connector import AbstractConnector, SearchResult, Mapping
from .abstract_connector import get_data
from .connector_manager import ConnectorException
from .openlibrary_languages import languages


class Connector(AbstractConnector):
    ''' instantiate a connector for OL '''
    def __init__(self, identifier):
        ''' define how openlibrary json fields map onto book/author fields '''
        super().__init__(identifier)

        def get_first(values):
            ''' use the first item of a sequence value '''
            return values[0]

        def get_remote_id(key):
            ''' prefix an openlibrary key with this connector's base url '''
            return self.base_url + key

        self.book_mappings = [
            Mapping('title'),
            Mapping('id', remote_field='key', formatter=get_remote_id),
            Mapping(
                'cover', remote_field='covers', formatter=self.get_cover_url),
            Mapping('sortTitle', remote_field='sort_title'),
            Mapping('subtitle'),
            Mapping('description', formatter=get_description),
            Mapping('languages', formatter=get_languages),
            Mapping('series', formatter=get_first),
            Mapping('seriesNumber', remote_field='series_number'),
            Mapping('subjects'),
            Mapping('subjectPlaces', remote_field='subject_places'),
            Mapping('isbn13', remote_field='isbn_13', formatter=get_first),
            Mapping('isbn10', remote_field='isbn_10', formatter=get_first),
            Mapping('lccn', formatter=get_first),
            Mapping(
                'oclcNumber', remote_field='oclc_numbers',
                formatter=get_first
            ),
            Mapping(
                'openlibraryKey', remote_field='key',
                formatter=get_openlibrary_key
            ),
            Mapping('goodreadsKey', remote_field='goodreads_key'),
            Mapping('asin'),
            Mapping(
                'firstPublishedDate', remote_field='first_publish_date',
            ),
            Mapping('publishedDate', remote_field='publish_date'),
            Mapping('pages', remote_field='number_of_pages'),
            Mapping('physicalFormat', remote_field='physical_format'),
            Mapping('publishers'),
        ]

        self.author_mappings = [
            Mapping('id', remote_field='key', formatter=get_remote_id),
            Mapping('name'),
            Mapping(
                'openlibraryKey', remote_field='key',
                formatter=get_openlibrary_key
            ),
            Mapping('born', remote_field='birth_date'),
            Mapping('died', remote_field='death_date'),
            Mapping('bio', formatter=get_description),
        ]


    def get_remote_id_from_data(self, data):
        ''' format a url from an openlibrary id field

        raises ConnectorException when the data has no "key" '''
        try:
            key = data['key']
        except KeyError as err:
            # keep the original KeyError attached as the cause
            raise ConnectorException('Invalid book data') from err
        return '%s%s' % (self.books_url, key)


    def is_work_data(self, data):
        ''' is this a work? true when the key ends in OL<digits>W '''
        return bool(re.match(r'^[\/\w]+OL\d+W$', data['key']))


    def get_edition_from_work_data(self, data):
        ''' load the editions of a work and pick the default one

        raises ConnectorException when the work data has no "key" '''
        try:
            key = data['key']
        except KeyError as err:
            raise ConnectorException('Invalid book data') from err
        url = '%s%s/editions' % (self.books_url, key)
        data = get_data(url)
        return pick_default_edition(data['entries'])


    def get_work_from_edition_data(self, data):
        ''' load the data for the first work listed on an edition

        raises ConnectorException when the edition lists no work '''
        try:
            key = data['works'][0]['key']
        except (IndexError, KeyError) as err:
            raise ConnectorException('No work found for edition') from err
        url = '%s%s' % (self.books_url, key)
        return get_data(url)


    def get_authors_from_data(self, data):
        ''' parse author json and load or create authors '''
        for author_blob in data.get('authors', []):
            # the key is either nested under "author" or directly on the blob
            author_blob = author_blob.get('author', author_blob)
            # this id is "/authors/OL1234567A"
            author_id = author_blob['key']
            url = '%s%s' % (self.base_url, author_id)
            yield self.get_or_create_author(url)


    def get_cover_url(self, cover_blob):
        ''' ask openlibrary for the cover '''
        # only the first cover id is used, at the "L" size
        cover_id = cover_blob[0]
        image_name = '%s-L.jpg' % cover_id
        return '%s/b/id/%s' % (self.covers_url, image_name)


    def parse_search_data(self, data):
        ''' the list of results in a search response, always a list '''
        # a response without "docs" is treated as having no results
        return data.get('docs') or []


    def format_search_result(self, search_result):
        ''' turn one openlibrary search doc into a SearchResult '''
        # build the remote id from the openlibrary key
        key = self.books_url + search_result['key']
        author = search_result.get('author_name') or ['Unknown']
        return SearchResult(
            title=search_result.get('title'),
            key=key,
            author=', '.join(author),
            connector=self,
            year=search_result.get('first_publish_year'),
        )


    def load_edition_data(self, olkey):
        ''' query openlibrary for editions of a work '''
        url = '%s/works/%s/editions' % (self.books_url, olkey)
        return get_data(url)


    def expand_book_data(self, book):
        ''' create editions for the other openlibrary entries of a work '''
        work = book
        # go from the edition to the work, if necessary
        if isinstance(book, models.Edition):
            work = book.parent_work

        # we can mass download edition data from OL to avoid repeatedly querying
        try:
            edition_options = self.load_edition_data(work.openlibrary_key)
        except ConnectorException:
            # best effort: if the lookup fails there is nothing to add
            return

        # a response without "entries" means there are no editions to load
        for edition_data in edition_options.get('entries') or []:
            # does this edition have ANY interesting data?
            if ignore_edition(edition_data):
                continue
            self.create_edition_from_data(work, edition_data)


def ignore_edition(edition_data):
    ''' don't load a million editions that have no metadata

    returns False (keep the edition) when it has an isbn, an oclc number,
    a cover, or a languages value that doesn't mention english;
    returns True (skip it) otherwise '''
    # an isbn, we love to see it
    if edition_data.get('isbn_13') or edition_data.get('isbn_10'):
        return False
    # grudgingly, oclc can stay
    if edition_data.get('oclc_numbers'):
        return False
    # if it has a cover it can stay
    if edition_data.get('covers'):
        return False
    # keep non-english editions
    # (the languages value is searched as text for the english key)
    if edition_data.get('languages') and \
            'languages/eng' not in str(edition_data.get('languages')):
        return False
    return True


def get_description(description_blob):
    ''' unwrap a description: dicts hold the text under "value",
    anything else is passed through untouched '''
    if not isinstance(description_blob, dict):
        return description_blob
    return description_blob.get('value')


def get_openlibrary_key(key):
    ''' keep only what follows the last slash:
    /books/OL27320736M becomes OL27320736M '''
    _, _, olkey = key.rpartition('/')
    return olkey


def get_languages(language_blob):
    ''' swap each language's openlibrary key for its entry in the
    languages table (None when the key isn't in the table) '''
    return [
        languages.get(entry.get('key', ''), None)
        for entry in language_blob
    ]


def pick_default_edition(options):
    ''' choose one edition out of many, preferring (in order) ones with a
    cover, in english, in a physical format, with an isbn 13, with an ocaid

    each preference only narrows the field when at least one edition
    satisfies it; returns None when there are no options '''
    if not options:
        return None
    if len(options) == 1:
        return options[0]

    physical_formats = ['paperback', 'hardcover', 'mass market paperback']
    # ordered from most to least important
    preferences = (
        lambda ed: ed.get('covers'),
        lambda ed: '/languages/eng' in str(ed.get('languages')),
        lambda ed: str(ed.get('physical_format')).lower() in physical_formats,
        lambda ed: ed.get('isbn_13'),
        lambda ed: ed.get('ocaid'),
    )

    candidates = options
    for is_preferred in preferences:
        narrowed = [ed for ed in candidates if is_preferred(ed)]
        # never narrow down to nothing: skip a preference nobody meets
        if narrowed:
            candidates = narrowed
    return candidates[0]
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00			`''' openlibrary data connector '''`
			`import re`
Style fixes suggested by pylint. 2020-04-22 13:53:22 +00:00
Updates migrations To get the app working again I ran resetdb, let it crash in initdb, then ran the migration, then re-ran initdb 2020-09-21 15:10:37 +00:00			`from bookwyrm import models`
Re-thinks connector mappings 2020-05-10 23:41:24 +00:00			`from .abstract_connector import AbstractConnector, SearchResult, Mapping`
Updates tests 2021-01-02 16:38:27 +00:00			`from .abstract_connector import get_data`
			`from .connector_manager import ConnectorException`
Mark default edition 2020-03-30 20:15:49 +00:00			`from .openlibrary_languages import languages`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00

Use database as source for initializing connector 2020-03-27 22:25:08 +00:00			`class Connector(AbstractConnector):`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00			`''' instantiate a connector for OL '''`
Rename local key and suggest fedireads connectors 2020-03-27 23:36:52 +00:00			`def __init__(self, identifier):`
Refactors get_or_create_book 2020-05-10 19:56:59 +00:00			`super().__init__(identifier)`
Re-thinks connector mappings 2020-05-10 23:41:24 +00:00
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`get_first = lambda a: a[0]`
Fixes loading more data 2020-12-20 00:14:05 +00:00			`get_remote_id = lambda a: self.base_url + a`
Use activitypub to_model de-serializer in openlibrary connector 2020-12-19 22:56:03 +00:00			`self.book_mappings = [`
			`Mapping('title'),`
Fixes loading more data 2020-12-20 00:14:05 +00:00			`Mapping('id', remote_field='key', formatter=get_remote_id),`
			`Mapping(`
			`'cover', remote_field='covers', formatter=self.get_cover_url),`
Use activitypub to_model de-serializer in openlibrary connector 2020-12-19 22:56:03 +00:00			`Mapping('sortTitle', remote_field='sort_title'),`
Re-thinks connector mappings 2020-05-10 23:41:24 +00:00			`Mapping('subtitle'),`
			`Mapping('description', formatter=get_description),`
			`Mapping('languages', formatter=get_languages),`
			`Mapping('series', formatter=get_first),`
Use activitypub to_model de-serializer in openlibrary connector 2020-12-19 22:56:03 +00:00			`Mapping('seriesNumber', remote_field='series_number'),`
Re-thinks connector mappings 2020-05-10 23:41:24 +00:00			`Mapping('subjects'),`
Ignore openlibrary editions with little to no metadata Also fixes the isbn problem 2021-01-31 01:19:01 +00:00			`Mapping('subjectPlaces', remote_field='subject_places'),`
			`Mapping('isbn13', remote_field='isbn_13', formatter=get_first),`
			`Mapping('isbn10', remote_field='isbn_10', formatter=get_first),`
Use activitypub to_model de-serializer in openlibrary connector 2020-12-19 22:56:03 +00:00			`Mapping('lccn', formatter=get_first),`
Re-thinks connector mappings 2020-05-10 23:41:24 +00:00			`Mapping(`
Use activitypub to_model de-serializer in openlibrary connector 2020-12-19 22:56:03 +00:00			`'oclcNumber', remote_field='oclc_numbers',`
			`formatter=get_first`
Re-thinks connector mappings 2020-05-10 23:41:24 +00:00			`),`
			`Mapping(`
Use activitypub to_model de-serializer in openlibrary connector 2020-12-19 22:56:03 +00:00			`'openlibraryKey', remote_field='key',`
			`formatter=get_openlibrary_key`
Re-thinks connector mappings 2020-05-10 23:41:24 +00:00			`),`
Use activitypub to_model de-serializer in openlibrary connector 2020-12-19 22:56:03 +00:00			`Mapping('goodreadsKey', remote_field='goodreads_key'),`
			`Mapping('asin'),`
Re-thinks connector mappings 2020-05-10 23:41:24 +00:00			`Mapping(`
Use activitypub to_model de-serializer in openlibrary connector 2020-12-19 22:56:03 +00:00			`'firstPublishedDate', remote_field='first_publish_date',`
Re-thinks connector mappings 2020-05-10 23:41:24 +00:00			`),`
Use activitypub to_model de-serializer in openlibrary connector 2020-12-19 22:56:03 +00:00			`Mapping('publishedDate', remote_field='publish_date'),`
			`Mapping('pages', remote_field='number_of_pages'),`
			`Mapping('physicalFormat', remote_field='physical_format'),`
Re-thinks connector mappings 2020-05-10 23:41:24 +00:00			`Mapping('publishers'),`
			`]`

			`self.author_mappings = [`
Fixes loading more data 2020-12-20 00:14:05 +00:00			`Mapping('id', remote_field='key', formatter=get_remote_id),`
Fixes name import in openlibrary 2020-12-12 17:43:07 +00:00			`Mapping('name'),`
Fixes loading more data 2020-12-20 00:14:05 +00:00			`Mapping(`
			`'openlibraryKey', remote_field='key',`
			`formatter=get_openlibrary_key`
			`),`
Use author activitypub in OL connector 2020-12-19 23:20:31 +00:00			`Mapping('born', remote_field='birth_date'),`
			`Mapping('died', remote_field='death_date'),`
Re-thinks connector mappings 2020-05-10 23:41:24 +00:00			`Mapping('bio', formatter=get_description),`
			`]`

Adds generalized book data connectors 2020-03-07 20:22:28 +00:00
Fixes loading remote books - saves remote_id correctly - loads remote books for incoming statuses 2020-10-31 00:04:10 +00:00			`def get_remote_id_from_data(self, data):`
Fixes loading more data 2020-12-20 00:14:05 +00:00			`''' format a url from an openlibrary id field '''`
Fixes loading remote books - saves remote_id correctly - loads remote books for incoming statuses 2020-10-31 00:04:10 +00:00			`try:`
			`key = data['key']`
			`except KeyError:`
			`raise ConnectorException('Invalid book data')`
Fixes url formatting in openlibrary connector 2020-12-31 17:19:39 +00:00			`return '%s%s' % (self.books_url, key)`
Fixes loading remote books - saves remote_id correctly - loads remote books for incoming statuses 2020-10-31 00:04:10 +00:00

Refactors get_or_create_book 2020-05-10 19:56:59 +00:00			`def is_work_data(self, data):`
more openlibrary connector tests 2020-05-10 21:12:03 +00:00			`return bool(re.match(r'^[\/\w]+OL\d+W$', data['key']))`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00

Refactors get_or_create_book 2020-05-10 19:56:59 +00:00			`def get_edition_from_work_data(self, data):`
			`try:`
			`key = data['key']`
			`except KeyError:`
Raise errors when connectors fail 2020-09-30 17:27:40 +00:00			`raise ConnectorException('Invalid book data')`
Fixes url formatting in openlibrary connector 2020-12-31 17:19:39 +00:00			`url = '%s%s/editions' % (self.books_url, key)`
Refactors get_or_create_book 2020-05-10 19:56:59 +00:00			`data = get_data(url)`
			`return pick_default_edition(data['entries'])`
Parser for search results in connectors 2020-05-09 00:56:24 +00:00

Send connector with search result also fix typo in get_work_from_edition_data function 2020-12-27 22:27:18 +00:00			`def get_work_from_edition_data(self, data):`
Refactors get_or_create_book 2020-05-10 19:56:59 +00:00			`try:`
			`key = data['works'][0]['key']`
			`except (IndexError, KeyError):`
small style fixes and typo 2020-10-01 02:57:25 +00:00			`raise ConnectorException('No work found for edition')`
Fixes url formatting in openlibrary connector 2020-12-31 17:19:39 +00:00			`url = '%s%s' % (self.books_url, key)`
Refactors get_or_create_book 2020-05-10 19:56:59 +00:00			`return get_data(url)`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00

			`def get_authors_from_data(self, data):`
			`''' parse author json and load or create authors '''`
Fixes bugs in openlibrary connector 2020-03-14 04:10:53 +00:00			`for author_blob in data.get('authors', []):`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00			`author_blob = author_blob.get('author', author_blob)`
Use author activitypub in OL connector 2020-12-19 23:20:31 +00:00			`# this id is "/authors/OL1234567A"`
Fixes loading more data 2020-12-20 00:14:05 +00:00			`author_id = author_blob['key']`
Fixes author path in openlibrary connector 2020-12-31 17:32:40 +00:00			`url = '%s%s' % (self.base_url, author_id)`
Use author activitypub in OL connector 2020-12-19 23:20:31 +00:00			`yield self.get_or_create_author(url)`
Set work author for editions with no author 2020-04-02 16:11:42 +00:00
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00
Fixes loading more data 2020-12-20 00:14:05 +00:00			`def get_cover_url(self, cover_blob):`
Refactors get_or_create_book 2020-05-10 19:56:59 +00:00			`''' ask openlibrary for the cover '''`
Fixes loading more data 2020-12-20 00:14:05 +00:00			`cover_id = cover_blob[0]`
Discovery landing page 2021-01-03 19:10:04 +00:00			`image_name = '%s-L.jpg' % cover_id`
Fixes loading more data 2020-12-20 00:14:05 +00:00			`return '%s/b/id/%s' % (self.covers_url, image_name)`
Refactors get_or_create_book 2020-05-10 19:56:59 +00:00

			`def parse_search_data(self, data):`
			`return data.get('docs')`


Fixes linter issues 2020-09-21 17:25:26 +00:00			`def format_search_result(self, search_result):`
Unify concept of absolute_id and remote_id 2020-05-13 01:56:28 +00:00			`# build the remote id from the openlibrary key`
Fixes linter issues 2020-09-21 17:25:26 +00:00			`key = self.books_url + search_result['key']`
			`author = search_result.get('author_name') or ['Unknown']`
Refactors get_or_create_book 2020-05-10 19:56:59 +00:00			`return SearchResult(`
Stop assuming every book is Hamlet 2020-10-29 22:29:23 +00:00			`title=search_result.get('title'),`
			`key=key,`
			`author=', '.join(author),`
Send connector with search result also fix typo in get_work_from_edition_data function 2020-12-27 22:27:18 +00:00			`connector=self,`
Stop assuming every book is Hamlet 2020-10-29 22:29:23 +00:00			`year=search_result.get('first_publish_year'),`
Refactors get_or_create_book 2020-05-10 19:56:59 +00:00			`)`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00

			`def load_edition_data(self, olkey):`
			`''' query openlibrary for editions of a work '''`
Small syntax changes in openlibrary connector 2020-12-31 19:03:13 +00:00			`url = '%s/works/%s/editions' % (self.books_url, olkey)`
Generalizes http request for json data in connectors 2020-05-09 19:39:58 +00:00			`return get_data(url)`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00

move loading editions to task 2020-04-02 05:11:31 +00:00			`def expand_book_data(self, book):`
			`work = book`
Fixes celery media path 2020-10-01 02:43:42 +00:00			`# go from the edition to the work, if necessary`
move loading editions to task 2020-04-02 05:11:31 +00:00			`if isinstance(book, models.Edition):`
			`work = book.parent_work`

Fixes celery media path 2020-10-01 02:43:42 +00:00			`# we can mass download edition data from OL to avoid repeatedly querying`
Catch expand book data load error 2021-02-11 01:54:49 +00:00			`try:`
			`edition_options = self.load_edition_data(work.openlibrary_key)`
			`except ConnectorException:`
			`# who knows, man`
			`return`

Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`for edition_data in edition_options.get('entries'):`
Ignore openlibrary editions with little to no metadata Also fixes the isbn problem 2021-01-31 01:19:01 +00:00			`# does this edition have ANY interesting data?`
			`if ignore_edition(edition_data):`
			`continue`
Fixes loading more data 2020-12-20 00:14:05 +00:00			`self.create_edition_from_data(work, edition_data)`
Get editions of openlibrary works 2020-03-30 19:21:04 +00:00

Ignore openlibrary editions with little to no metadata Also fixes the isbn problem 2021-01-31 01:19:01 +00:00			`def ignore_edition(edition_data):`
			`''' don't load a million editions that have no metadata '''`
			`# an isbn, we love to see it`
			`if edition_data.get('isbn_13') or edition_data.get('isbn_10'):`
			`print(edition_data.get('isbn_10'))`
			`return False`
			`# grudgingly, oclc can stay`
			`if edition_data.get('oclc_numbers'):`
			`print(edition_data.get('oclc_numbers'))`
			`return False`
			`# if it has a cover it can stay`
			`if edition_data.get('covers'):`
			`print(edition_data.get('covers'))`
			`return False`
			`# keep non-english editions`
			`if edition_data.get('languages') and \`
			`'languages/eng' not in str(edition_data.get('languages')):`
			`print(edition_data.get('languages'))`
			`return False`
			`return True`


Get more data out of openlibrary 2020-03-28 04:28:52 +00:00			`def get_description(description_blob):`
			`''' descriptions can be a string or a dict '''`
			`if isinstance(description_blob, dict):`
			`return description_blob.get('value')`
Small syntax changes in openlibrary connector 2020-12-31 19:03:13 +00:00			`return description_blob`
Get more data out of openlibrary 2020-03-28 04:28:52 +00:00
Mark default edition 2020-03-30 20:15:49 +00:00
Re-thinks connector mappings 2020-05-10 23:41:24 +00:00			`def get_openlibrary_key(key):`
			`''' convert /books/OL27320736M into OL27320736M '''`
			`return key.split('/')[-1]`


Mark default edition 2020-03-30 20:15:49 +00:00			`def get_languages(language_blob):`
			`''' /language/eng -> English '''`
			`langs = []`
			`for lang in language_blob:`
			`langs.append(`
			`languages.get(lang.get('key', ''), None)`
			`)`
			`return langs`


Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`def pick_default_edition(options):`
			`''' favor physical copies with covers in english '''`
Style fixes suggested by pylint. 2020-04-22 13:53:22 +00:00			`if not options:`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`return None`
			`if len(options) == 1:`
			`return options[0]`

Fixes openlibrary import to prefer editions with covers 2020-12-21 19:47:47 +00:00			`options = [e for e in options if e.get('covers')] or options`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`options = [e for e in options if \`
			`'/languages/eng' in str(e.get('languages'))] or options`
			`formats = ['paperback', 'hardcover', 'mass market paperback']`
			`options = [e for e in options if \`
			`str(e.get('physical_format')).lower() in formats] or options`
			`options = [e for e in options if e.get('isbn_13')] or options`
			`options = [e for e in options if e.get('ocaid')] or options`
			`return options[0]`