moviewyrm/bookwyrm/connectors/openlibrary.py
2021-03-08 08:49:10 -08:00

220 lines
7.9 KiB
Python

""" openlibrary data connector """
import re
from bookwyrm import models
from .abstract_connector import AbstractConnector, SearchResult, Mapping
from .abstract_connector import get_data
from .connector_manager import ConnectorException
from .openlibrary_languages import languages
class Connector(AbstractConnector):
    """ instantiate a connector for OL

    Maps OpenLibrary JSON onto local book and author field names and builds
    the OpenLibrary urls (from ``base_url``, ``books_url`` and
    ``covers_url``) for works, editions, authors and covers.
    """

    def __init__(self, identifier):
        """ set up the connector and its book/author field mappings

        :param identifier: passed through unchanged to the parent initializer
        """
        super().__init__(identifier)

        def get_first(values):
            """ first item of a list-valued field """
            return values[0]

        def get_remote_id(key):
            """ prefix an openlibrary key with this connector's base url """
            return self.base_url + key

        self.book_mappings = [
            Mapping("title"),
            Mapping("id", remote_field="key", formatter=get_remote_id),
            Mapping("cover", remote_field="covers", formatter=self.get_cover_url),
            Mapping("sortTitle", remote_field="sort_title"),
            Mapping("subtitle"),
            Mapping("description", formatter=get_description),
            Mapping("languages", formatter=get_languages),
            Mapping("series", formatter=get_first),
            Mapping("seriesNumber", remote_field="series_number"),
            Mapping("subjects"),
            Mapping("subjectPlaces", remote_field="subject_places"),
            Mapping("isbn13", remote_field="isbn_13", formatter=get_first),
            Mapping("isbn10", remote_field="isbn_10", formatter=get_first),
            Mapping("lccn", formatter=get_first),
            Mapping("oclcNumber", remote_field="oclc_numbers", formatter=get_first),
            Mapping(
                "openlibraryKey", remote_field="key", formatter=get_openlibrary_key
            ),
            Mapping("goodreadsKey", remote_field="goodreads_key"),
            Mapping("asin"),
            Mapping(
                "firstPublishedDate",
                remote_field="first_publish_date",
            ),
            Mapping("publishedDate", remote_field="publish_date"),
            Mapping("pages", remote_field="number_of_pages"),
            Mapping("physicalFormat", remote_field="physical_format"),
            Mapping("publishers"),
        ]

        self.author_mappings = [
            Mapping("id", remote_field="key", formatter=get_remote_id),
            Mapping("name"),
            Mapping(
                "openlibraryKey", remote_field="key", formatter=get_openlibrary_key
            ),
            Mapping("born", remote_field="birth_date"),
            Mapping("died", remote_field="death_date"),
            Mapping("bio", formatter=get_description),
        ]

    def get_remote_id_from_data(self, data):
        """ format a url from an openlibrary id field

        :raises ConnectorException: when ``data`` has no "key" entry
        """
        try:
            key = data["key"]
        except KeyError as err:
            # chain the cause so the original lookup failure stays visible
            raise ConnectorException("Invalid book data") from err
        return "%s%s" % (self.books_url, key)

    def is_work_data(self, data):
        """ True when the "key" in data ends in a work id like OL1234W """
        return bool(re.match(r"^[\/\w]+OL\d+W$", data["key"]))

    def get_edition_from_work_data(self, data):
        """ fetch a work's editions and pick the default one

        A KeyError from a response without "entries" is deliberately not
        handled here and propagates to the caller.

        :raises ConnectorException: when ``data`` has no "key" entry
        """
        try:
            key = data["key"]
        except KeyError as err:
            raise ConnectorException("Invalid book data") from err
        url = "%s%s/editions" % (self.books_url, key)
        data = get_data(url)
        return pick_default_edition(data["entries"])

    def get_work_from_edition_data(self, data):
        """ fetch the first work listed on an edition

        :raises ConnectorException: when the edition lists no work
        """
        try:
            key = data["works"][0]["key"]
        except (IndexError, KeyError) as err:
            raise ConnectorException("No work found for edition") from err
        url = "%s%s" % (self.books_url, key)
        return get_data(url)

    def get_authors_from_data(self, data):
        """ parse author json and load or create authors """
        for author_blob in data.get("authors", []):
            # the key may sit on the blob itself or on a nested "author" dict
            author_blob = author_blob.get("author", author_blob)
            # this id is "/authors/OL1234567A"
            author_id = author_blob["key"]
            url = "%s%s" % (self.base_url, author_id)
            yield self.get_or_create_author(url)

    def get_cover_url(self, cover_blob):
        """ ask openlibrary for the cover """
        # only the first cover id is used, requested in the "L" size
        cover_id = cover_blob[0]
        image_name = "%s-L.jpg" % cover_id
        return "%s/b/id/%s" % (self.covers_url, image_name)

    def parse_search_data(self, data):
        """ pull the list of result docs out of a search response """
        # a response without "docs" yields an empty list rather than None
        return data.get("docs") or []

    def format_search_result(self, search_result):
        """ turn one search doc into a SearchResult """
        # build the remote id from the openlibrary key
        key = self.books_url + search_result["key"]
        author = search_result.get("author_name") or ["Unknown"]
        return SearchResult(
            title=search_result.get("title"),
            key=key,
            author=", ".join(author),
            connector=self,
            year=search_result.get("first_publish_year"),
        )

    def parse_isbn_search_data(self, data):
        """ an isbn response is a dict; its values are the results """
        return list(data.values())

    def format_isbn_search_result(self, search_result):
        """ turn one isbn lookup result into a SearchResult """
        # build the remote id from the openlibrary key
        key = self.books_url + search_result["key"]
        authors = search_result.get("authors") or [{"name": "Unknown"}]
        # an author entry without a name must not break the join below
        author_names = [author.get("name") or "Unknown" for author in authors]
        return SearchResult(
            title=search_result.get("title"),
            key=key,
            author=", ".join(author_names),
            connector=self,
            year=search_result.get("publish_date"),
        )

    def load_edition_data(self, olkey):
        """ query openlibrary for editions of a work """
        url = "%s/works/%s/editions" % (self.books_url, olkey)
        return get_data(url)

    def expand_book_data(self, book):
        """ create local editions for the interesting editions of a work

        :param book: a work, or an edition whose parent work is used
        """
        work = book
        # go from the edition to the work, if necessary
        if isinstance(book, models.Edition):
            work = book.parent_work

        # we can mass download edition data from OL to avoid repeatedly querying
        try:
            edition_options = self.load_edition_data(work.openlibrary_key)
        except ConnectorException:
            # who knows, man
            return

        # a response without "entries" simply has nothing to load
        for edition_data in edition_options.get("entries") or []:
            # does this edition have ANY interesting data?
            if ignore_edition(edition_data):
                continue
            self.create_edition_from_data(work, edition_data)
def ignore_edition(edition_data):
    """ don't load a million editions that have no metadata

    :param edition_data: dict of edition fields
    :return: False when the edition has an isbn, an oclc number, a cover or
        a language list that does not mention "languages/eng"; True otherwise
    """
    # an isbn we love to see, oclc can grudgingly stay, and so can a cover
    identifying_fields = ("isbn_13", "isbn_10", "oclc_numbers", "covers")
    if any(edition_data.get(field) for field in identifying_fields):
        return False

    # keep non-english editions
    edition_languages = edition_data.get("languages")
    if edition_languages and "languages/eng" not in str(edition_languages):
        return False

    return True
def get_description(description_blob):
    """ unwrap a description given as {"value": ...}; pass anything else through """
    is_wrapped = isinstance(description_blob, dict)
    return description_blob.get("value") if is_wrapped else description_blob
def get_openlibrary_key(key):
    """ keep only what follows the last slash: /books/OL27320736M -> OL27320736M """
    _, _, identifier = key.rpartition("/")
    return identifier
def get_languages(language_blob):
    """ look up each entry's "key" in the languages table, e.g. /language/eng -> English

    Entries whose key is missing from the table come back as None, so the
    result always has one item per input entry.
    """
    return [languages.get(entry.get("key", ""), None) for entry in language_blob]
def pick_default_edition(options):
    """ favor physical copies with covers in english

    Each preference narrows the candidates in turn, but a preference that
    no remaining candidate satisfies is skipped instead of emptying the list.
    Returns None when there are no options at all.
    """
    if not options:
        return None
    if len(options) == 1:
        return options[0]

    formats = ["paperback", "hardcover", "mass market paperback"]
    # ordered from the first filter applied to the last
    preferences = (
        lambda e: e.get("covers"),
        lambda e: "/languages/eng" in str(e.get("languages")),
        lambda e: str(e.get("physical_format")).lower() in formats,
        lambda e: e.get("isbn_13"),
        lambda e: e.get("ocaid"),
    )

    candidates = options
    for is_preferred in preferences:
        narrowed = [e for e in candidates if is_preferred(e)]
        if narrowed:
            candidates = narrowed
    return candidates[0]