moviewyrm/fedireads/connectors/openlibrary.py

''' openlibrary data connector '''
import re
import requests

from django.core.files.base import ContentFile
from django.db import transaction

from fedireads import models
from .abstract_connector import AbstractConnector, SearchResult
from .abstract_connector import update_from_mappings
from .abstract_connector import get_date
from .openlibrary_languages import languages


class Connector(AbstractConnector):
    ''' instantiate a connector for OL '''
    def __init__(self, identifier):
        ''' define how openlibrary json fields map onto model fields.
        each entry is keyed by the openlibrary field name and holds a
        (model field name, formatter) pair; a formatter of None is given
        for values that need no conversion.
        NOTE(review): how the pairs are applied is decided by
        update_from_mappings / the base class in abstract_connector. '''
        def get_first(items):
            ''' these openlibrary fields hold lists; keep the first entry '''
            return items[0]

        # identifiers that can be used to match a book across sources
        self.key_mappings = {
            'isbn_13': ('isbn_13', get_first),
            'isbn_10': ('isbn_10', get_first),
            'oclc_numbers': ('oclc_number', get_first),
            'lccn': ('lccn', get_first),
        }

        # the full set of book fields: the identifiers plus descriptive data
        self.book_mappings = self.key_mappings.copy()
        self.book_mappings.update({
            'publish_date': ('published_date', get_date),
            'first_publish_date': ('first_published_date', get_date),
            'description': ('description', get_description),
            'languages': ('languages', get_languages),
            'number_of_pages': ('pages', None),
            'series': ('series', get_first),
        })
        # the mappings are assigned before the base class is initialized,
        # same order as before
        super().__init__(identifier)


    def format_search_result(self, doc):
        ''' turn one openlibrary search doc into a SearchResult holding
        the title, bare key, first author name and first publish year '''
        # keys look like "/works/OL1234W", only the last segment is kept
        key = doc['key'].split('/')[-1]
        author = doc.get('author_name') or ['Unknown']
        return SearchResult(
            doc.get('title'),
            key,
            author[0],
            doc.get('first_publish_year'),
        )


    def get_or_create_book(self, olkey):
        ''' pull up a book record by whatever means possible.
        if you give a work key, it should give you the default edition,
        annotated with work data.
        returns the stored book when the key is already known, otherwise
        loads the work and edition from openlibrary, creates both and
        returns the edition. http errors from openlibrary propagate. '''

        book = models.Book.objects.select_subclasses().filter(
            openlibrary_key=olkey
        ).first()
        if book:
            if isinstance(book, models.Work):
                return book.default_edition
            return book

        # no book was found, so we start creating a new one
        if re.match(r'^OL\d+W$', olkey):
            with transaction.atomic():
                # create both work and a default edition
                work_data = self.load_book_data(olkey)
                work = self.create_book(olkey, work_data, models.Work)

                edition_options = self.load_edition_data(olkey).get('entries')
                edition_data = pick_default_edition(edition_options)
                if not edition_data:
                    # hack: re-use the work data as the edition data
                    edition_data = work_data
                key = edition_data.get('key').split('/')[-1]
                edition = self.create_book(key, edition_data, models.Edition)
                edition.default = True
                edition.parent_work = work
                edition.save()
        else:
            with transaction.atomic():
                edition_data = self.load_book_data(olkey)
                edition = self.create_book(olkey, edition_data, models.Edition)

                work_data = edition_data.get('works')
                if not work_data:
                    # hack: we're re-using the edition data as the work data
                    work_key = olkey
                else:
                    work_key = work_data[0]['key'].split('/')[-1]

                work = models.Work.objects.filter(
                    openlibrary_key=work_key
                ).first()
                if not work:
                    work_data = self.load_book_data(work_key)
                    work = self.create_book(work_key, work_data, models.Work)
                edition.parent_work = work
                edition.save()

        self._inherit_work_authors(edition, work)
        return edition


    @staticmethod
    def _inherit_work_authors(edition, work):
        ''' give an edition without authors the authors of its work, and
        store the matching author_text '''
        # a related manager is always truthy, so the database has to be
        # asked whether any authors are actually attached
        if edition.authors.exists():
            return
        work_authors = list(work.authors.all())
        if not work_authors:
            return
        edition.authors.set(work_authors)
        edition.author_text = ', '.join(a.name for a in work_authors)
        # author_text is a plain field, it is lost unless the edition is saved
        edition.save()


    def update_book_from_data(self, book, data):
        ''' update a book model instance from ol data: the mapped fields
        (via the base class), the authors and the cover. returns the book '''
        # populate the simple data fields
        super().update_book_from_data(book, data)

        authors = self.get_authors_from_data(data)
        if authors:
            book.authors.add(*authors)
            book.author_text = ', '.join(a.name for a in authors)

        if data.get('covers'):
            # only the first listed cover is used
            book.cover.save(*self.get_cover(data['covers'][0]), save=True)
        return book


    def update_book(self, book, data=None):
        ''' load new data for a book that is flagged for syncing.
        data is fetched from openlibrary unless it is passed in.
        returns the book, or None when neither sync flag is set '''
        if not book.sync and not book.sync_cover:
            return None

        if not data:
            data = self.load_book_data(book.openlibrary_key)

        if book.sync:
            # the full update stores the cover as well, so the cover is
            # not downloaded separately in this case
            return self.update_book_from_data(book, data)

        # only the cover is synced for this book
        if data.get('covers'):
            book.cover.save(*self.get_cover(data['covers'][0]), save=True)
        return book


    def get_authors_from_data(self, data):
        ''' parse author json and load or create authors.
        returns a list of author models, empty if no authors are listed '''
        authors = []
        for author_blob in data.get('authors', []):
            # the key is either directly on the entry or nested one level
            # down under "author"
            author_blob = author_blob.get('author', author_blob)
            # this id is "/authors/OL1234567A" and we want just "OL1234567A"
            author_id = author_blob['key'].split('/')[-1]
            authors.append(self.get_or_create_author(author_id))
        return authors


    @staticmethod
    def _get_json(url):
        ''' GET a url and return the decoded json body; raises the
        requests http error for 4xx/5xx responses '''
        response = requests.get(url)
        response.raise_for_status()
        return response.json()


    def load_book_data(self, olkey):
        ''' query openlibrary for data on a book '''
        return self._get_json('%s/works/%s.json' % (self.books_url, olkey))


    def load_edition_data(self, olkey):
        ''' query openlibrary for editions of a work '''
        return self._get_json(
            '%s/works/%s/editions.json' % (self.books_url, olkey))


    def expand_book_data(self, book):
        ''' create every edition of a book's work that openlibrary lists
        and that is not in the database yet. accepts a work or an edition
        (whose parent work is used) '''
        work = book
        if isinstance(book, models.Edition):
            work = book.parent_work

        edition_options = self.load_edition_data(work.openlibrary_key)
        # a response without "entries" means there is nothing to add
        for edition_data in edition_options.get('entries') or []:
            olkey = edition_data.get('key').split('/')[-1]
            if models.Edition.objects.filter(openlibrary_key=olkey).exists():
                continue
            edition = self.create_book(olkey, edition_data, models.Edition)
            edition.parent_work = work
            edition.save()
            self._inherit_work_authors(edition, work)


    def get_or_create_author(self, olkey):
        ''' load that author from the database, or create them from
        openlibrary data if they are not stored yet.
        raises ValueError for keys that do not look like "OL<digits>A" '''
        if not re.match(r'^OL\d+A$', olkey):
            raise ValueError('Invalid OpenLibrary author ID')
        try:
            return models.Author.objects.get(openlibrary_key=olkey)
        except models.Author.DoesNotExist:
            pass

        data = self._get_json('%s/authors/%s.json' % (self.base_url, olkey))
        author = models.Author(openlibrary_key=olkey)
        mappings = {
            'birth_date': ('born', get_date),
            'death_date': ('died', get_date),
            'bio': ('bio', get_description),
        }
        author = update_from_mappings(author, data, mappings)
        # TODO this is making some BOLD assumption
        # (the last space-separated word is treated as the last name)
        name = data.get('name')
        if name:
            *first_names, last_name = name.split(' ')
            author.last_name = last_name
            author.first_name = ' '.join(first_names)
        author.save()

        return author


    def get_cover(self, cover_id):
        ''' ask openlibrary for the cover.
        returns [file name, ContentFile] for the "-M" sized image, ready
        to be unpacked into an image field's save() '''
        # TODO: get medium and small versions
        image_name = '%s-M.jpg' % cover_id
        url = '%s/b/id/%s' % (self.covers_url, image_name)
        response = requests.get(url)
        response.raise_for_status()
        image_content = ContentFile(response.content)
        return [image_name, image_content]


def get_description(description_blob):
    ''' a description arrives either as the text itself or as a dict
    carrying the text under "value"; return the text in both cases '''
    if not isinstance(description_blob, dict):
        return description_blob
    return description_blob.get('value')


def get_languages(language_blob):
    ''' look up each entry's "key" (e.g. /language/eng -> English) in the
    languages table; an entry whose key is not in the table gives None '''
    return [languages.get(entry.get('key', '')) for entry in language_blob]


def pick_default_edition(options):
    ''' favor physical copies with covers in english.
    options is a list of openlibrary edition dicts. each preference
    narrows the candidates in turn, and is skipped when no candidate
    satisfies it. returns the first remaining edition, or None when
    there are no options at all '''
    if not options:
        return None
    if len(options) == 1:
        return options[0]

    formats = ['paperback', 'hardcover', 'mass market paperback']
    # ordered from most to least important
    preferences = [
        # edition data lists cover ids under "covers"; the singular key
        # that used to be checked here is still accepted
        lambda e: e.get('covers') or e.get('cover'),
        lambda e: '/languages/eng' in str(e.get('languages')),
        lambda e: str(e.get('physical_format')).lower() in formats,
        lambda e: e.get('isbn_13'),
        lambda e: e.get('ocaid'),
    ]
    for is_preferred in preferences:
        # fall back to the unfiltered candidates if nothing matches
        options = [e for e in options if is_preferred(e)] or options
    return options[0]
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00			`''' openlibrary data connector '''`
			`import re`
			`import requests`

Style fixes suggested by pylint. 2020-04-22 13:53:22 +00:00			`from django.core.files.base import ContentFile`
			`from django.db import transaction`

Adds generalized book data connectors 2020-03-07 20:22:28 +00:00			`from fedireads import models`
Use timezone dates Fixes #114 2020-03-30 00:40:51 +00:00			`from .abstract_connector import AbstractConnector, SearchResult`
Adds create_book functionality for fedireads conn 2020-05-08 23:56:49 +00:00			`from .abstract_connector import update_from_mappings`
Expand matching books on keys like isbn 2020-05-04 04:00:25 +00:00			`from .abstract_connector import get_date`
Mark default edition 2020-03-30 20:15:49 +00:00			`from .openlibrary_languages import languages`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00

Use database as source for initializing connector 2020-03-27 22:25:08 +00:00			`class Connector(AbstractConnector):`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00			`''' instantiate a connector for OL '''`
Rename local key and suggest fedireads connectors 2020-03-27 23:36:52 +00:00			`def __init__(self, identifier):`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`get_first = lambda a: a[0]`
Expand matching books on keys like isbn 2020-05-04 04:00:25 +00:00			`self.key_mappings = {`
Adds more fields to book data 2020-04-29 17:09:14 +00:00			`'isbn_13': ('isbn_13', get_first),`
fixes mapping for openlibrary isbn10 field 2020-05-04 17:42:48 +00:00			`'isbn_10': ('isbn_10', get_first),`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`'oclc_numbers': ('oclc_number', get_first),`
			`'lccn': ('lccn', get_first),`
Expand matching books on keys like isbn 2020-05-04 04:00:25 +00:00			`}`

			`self.book_mappings = self.key_mappings.copy()`
			`self.book_mappings.update({`
			`'publish_date': ('published_date', get_date),`
			`'first_publish_date': ('first_published_date', get_date),`
			`'description': ('description', get_description),`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`'languages': ('languages', get_languages),`
			`'number_of_pages': ('pages', None),`
			`'series': ('series', get_first),`
Expand matching books on keys like isbn 2020-05-04 04:00:25 +00:00			`})`
Rename local key and suggest fedireads connectors 2020-03-27 23:36:52 +00:00			`super().__init__(identifier)`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00

Expand matching books on keys like isbn 2020-05-04 04:00:25 +00:00			`def format_search_result(self, doc):`
			`key = doc['key']`
			`key = key.split('/')[-1]`
			`author = doc.get('author_name') or ['Unknown']`
			`return SearchResult(`
			`doc.get('title'),`
			`key,`
			`author[0],`
			`doc.get('first_publish_year'),`
More connectors more problems 2020-03-28 19:55:53 +00:00			`)`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00

			`def get_or_create_book(self, olkey):`
Get editions of openlibrary works 2020-03-30 19:21:04 +00:00			`''' pull up a book record by whatever means possible.`
			`if you give a work key, it should give you the default edition,`
			`annotated with work data. '''`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00
SMall fixes for update form and ol connector 2020-04-04 20:46:10 +00:00			`book = models.Book.objects.select_subclasses().filter(`
			`openlibrary_key=olkey`
			`).first()`
			`if book:`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`if isinstance(book, models.Work):`
			`return book.default_edition`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00			`return book`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00
Get editions of openlibrary works 2020-03-30 19:21:04 +00:00			`# no book was found, so we start creating a new one`
			`if re.match(r'^OL\d+W$', olkey):`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`with transaction.atomic():`
			`# create both work and a default edition`
			`work_data = self.load_book_data(olkey)`
			`work = self.create_book(olkey, work_data, models.Work)`

			`edition_options = self.load_edition_data(olkey).get('entries')`
			`edition_data = pick_default_edition(edition_options)`
Re-use work data for editions 2020-05-03 21:09:55 +00:00			`if not edition_data:`
			`# hack: re-use the work data as the edition data`
			`edition_data = work_data`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`key = edition_data.get('key').split('/')[-1]`
			`edition = self.create_book(key, edition_data, models.Edition)`
Mark default editions 2020-04-29 18:21:36 +00:00			`edition.default = True`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`edition.parent_work = work`
			`edition.save()`
			`else:`
			`with transaction.atomic():`
			`edition_data = self.load_book_data(olkey)`
			`edition = self.create_book(olkey, edition_data, models.Edition)`

Fixes exception for openlibrary editions without works 2020-05-03 20:53:56 +00:00			`work_data = edition_data.get('works')`
			`if not work_data:`
			`# hack: we're re-using the edition data as the work data`
			`work_key = olkey`
			`else:`
			`work_key = work_data[0]['key'].split('/')[-1]`

Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`work = models.Work.objects.filter(`
			`openlibrary_key=work_key`
			`).first()`
			`if not work:`
			`work_data = self.load_book_data(work_key)`
			`work = self.create_book(work_key, work_data, models.Work)`
			`edition.parent_work = work`
			`edition.save()`
			`if not edition.authors and work.authors:`
			`edition.authors.set(work.authors.all())`
Set author text when data is inherited from works 2020-04-29 18:08:51 +00:00			`edition.author_text = ', '.join(a.name for a in edition.authors)`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00
			`return edition`


			`def update_book_from_data(self, book, data):`
			`''' updaet a book model instance from ol data '''`
			`# populate the simple data fields`
Adds create_book functionality for fedireads conn 2020-05-08 23:56:49 +00:00			`super().update_book_from_data(book, data)`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00
Adds more fields to book data 2020-04-29 17:09:14 +00:00			`authors = self.get_authors_from_data(data)`
			`for author in authors:`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`book.authors.add(author)`
Set author text when data is inherited from works 2020-04-29 18:08:51 +00:00			`if authors:`
			`book.author_text = ', '.join(a.name for a in authors)`
Get editions of openlibrary works 2020-03-30 19:21:04 +00:00
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`if data.get('covers'):`
			`book.cover.save(*self.get_cover(data['covers'][0]), save=True)`
			`return book`
Get more data out of openlibrary 2020-03-28 04:28:52 +00:00
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00
Path to update books 2020-05-04 01:56:29 +00:00			`def update_book(self, book, data=None):`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`''' load new data '''`
			`if not book.sync and not book.sync_cover:`
			`return`
Fixes bugs in openlibrary connector 2020-03-14 04:10:53 +00:00
Expand matching books on keys like isbn 2020-05-04 04:00:25 +00:00			`if not data:`
			`data = self.load_book_data(book.openlibrary_key)`

Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`if book.sync_cover and data.get('covers'):`
			`book.cover.save(*self.get_cover(data['covers'][0]), save=True)`
			`if book.sync:`
			`book = self.update_book_from_data(book, data)`
			`return book`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00
Get editions of openlibrary works 2020-03-30 19:21:04 +00:00
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`def get_authors_from_data(self, data):`
			`''' parse author json and load or create authors '''`
			`authors = []`
Fixes bugs in openlibrary connector 2020-03-14 04:10:53 +00:00			`for author_blob in data.get('authors', []):`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00			`# this id is "/authors/OL1234567A" and we want just "OL1234567A"`
			`author_blob = author_blob.get('author', author_blob)`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`author_id = author_blob['key'].split('/')[-1]`
			`authors.append(self.get_or_create_author(author_id))`
			`return authors`
Set work author for editions with no author 2020-04-02 16:11:42 +00:00
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`def load_book_data(self, olkey):`
			`''' query openlibrary for data on a book '''`
Fixes url fields in openlibrary connector 2020-05-03 20:03:15 +00:00			`response = requests.get('%s/works/%s.json' % (self.books_url, olkey))`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`if not response.ok:`
			`response.raise_for_status()`
			`data = response.json()`
			`return data`


			`def load_edition_data(self, olkey):`
			`''' query openlibrary for editions of a work '''`
			`response = requests.get(`
Fixes url fields in openlibrary connector 2020-05-03 20:03:15 +00:00			`'%s/works/%s/editions.json' % (self.books_url, olkey))`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`if not response.ok:`
			`response.raise_for_status()`
			`data = response.json()`
			`return data`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00

move loading editions to task 2020-04-02 05:11:31 +00:00			`def expand_book_data(self, book):`
			`work = book`
			`if isinstance(book, models.Edition):`
			`work = book.parent_work`

Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`edition_options = self.load_edition_data(work.openlibrary_key)`
			`for edition_data in edition_options.get('entries'):`
			`olkey = edition_data.get('key').split('/')[-1]`
			`if models.Edition.objects.filter(openlibrary_key=olkey).count():`
			`continue`
			`edition = self.create_book(olkey, edition_data, models.Edition)`
			`edition.parent_work = work`
			`edition.save()`
			`if not edition.authors and work.authors:`
			`edition.authors.set(work.authors.all())`
Get editions of openlibrary works 2020-03-30 19:21:04 +00:00

Adds generalized book data connectors 2020-03-07 20:22:28 +00:00			`def get_or_create_author(self, olkey):`
			`''' load that author '''`
			`if not re.match(r'^OL\d+A$', olkey):`
			`raise ValueError('Invalid OpenLibrary author ID')`
			`try:`
Fixes bugs in openlibrary connector 2020-03-14 04:10:53 +00:00			`return models.Author.objects.get(openlibrary_key=olkey)`
Re-add tags to book page 2020-04-04 20:12:15 +00:00			`except models.Author.DoesNotExist:`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00			`pass`

Fixes url fields in openlibrary connector 2020-05-03 20:03:15 +00:00			`response = requests.get('%s/authors/%s.json' % (self.base_url, olkey))`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00			`if not response.ok:`
			`response.raise_for_status()`

			`data = response.json()`
			`author = models.Author(openlibrary_key=olkey)`
More connectors more problems 2020-03-28 19:55:53 +00:00			`mappings = {`
			`'birth_date': ('born', get_date),`
			`'death_date': ('died', get_date),`
			`'bio': ('bio', get_description),`
			`}`
			`author = update_from_mappings(author, data, mappings)`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00			`# TODO this is making some BOLD assumption`
Fixes for goodreads import + ol changes 2020-04-01 01:58:13 +00:00			`name = data.get('name')`
Don't break on absent author name 2020-04-01 21:18:46 +00:00			`if name:`
			`author.last_name = name.split(' ')[-1]`
			`author.first_name = ' '.join(name.split(' ')[:-1])`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00			`author.save()`

			`return author`


			`def get_cover(self, cover_id):`
			`''' ask openlibrary for the cover '''`
			`# TODO: get medium and small versions`
			`image_name = '%s-M.jpg' % cover_id`
			`url = '%s/b/id/%s' % (self.covers_url, image_name)`
			`response = requests.get(url)`
			`if not response.ok:`
			`response.raise_for_status()`
Fixes bugs in openlibrary connector 2020-03-14 04:10:53 +00:00			`image_content = ContentFile(response.content)`
Adds generalized book data connectors 2020-03-07 20:22:28 +00:00			`return [image_name, image_content]`


Get more data out of openlibrary 2020-03-28 04:28:52 +00:00			`def get_description(description_blob):`
			`''' descriptions can be a string or a dict '''`
			`if isinstance(description_blob, dict):`
			`return description_blob.get('value')`
			`return description_blob`

Mark default edition 2020-03-30 20:15:49 +00:00
			`def get_languages(language_blob):`
			`''' /language/eng -> English '''`
			`langs = []`
			`for lang in language_blob:`
			`langs.append(`
			`languages.get(lang.get('key', ''), None)`
			`)`
			`return langs`


Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`def pick_default_edition(options):`
			`''' favor physical copies with covers in english '''`
Style fixes suggested by pylint. 2020-04-22 13:53:22 +00:00			`if not options:`
Refactors book ingest from openlibrary it's very repetetive now but also works way better so who can say 2020-04-06 00:00:01 +00:00			`return None`
			`if len(options) == 1:`
			`return options[0]`

			`options = [e for e in options if e.get('cover')] or options`
			`options = [e for e in options if \`
			`'/languages/eng' in str(e.get('languages'))] or options`
			`formats = ['paperback', 'hardcover', 'mass market paperback']`
			`options = [e for e in options if \`
			`str(e.get('physical_format')).lower() in formats] or options`
			`options = [e for e in options if e.get('isbn_13')] or options`
			`options = [e for e in options if e.get('ocaid')] or options`
			`return options[0]`