bookwyrm/bookwyrm/connectors/openlibrary.py
Commit 87fe984462 by Mouse Reeve: Combines search formatter and parser function
The parser was extracting the list of search results from the json
object returned by the search endpoint, and the formatter was converting
an individual json entry into a SearchResult object. This just merged
them into one function, because they are never used separately.
2022-05-30 12:52:31 -07:00
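
For context, a minimal caller-side sketch of the merged function (the identifier and response_json are placeholders; the real call sites live in bookwyrm's search code):

    connector = Connector("openlibrary.org")
    # response_json: parsed JSON from the search endpoint, e.g. {"docs": [...]}
    results = list(connector.parse_search_data(response_json))
    # each item is a SearchResult carrying title, key, author, year, and cover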


""" openlibrary data connector """
import re

from bookwyrm import models
from bookwyrm.book_search import SearchResult
from .abstract_connector import AbstractConnector, Mapping
from .abstract_connector import get_data, infer_physical_format, unique_physical_format
from .connector_manager import ConnectorException
from .openlibrary_languages import languages


class Connector(AbstractConnector):
"""instantiate a connector for OL"""
generated_remote_link_field = "openlibrary_link"
def __init__(self, identifier):
super().__init__(identifier)
get_first = lambda a, *args: a[0]
get_remote_id = lambda a, *args: self.base_url + a
self.book_mappings = [
Mapping("title"),
Mapping("id", remote_field="key", formatter=get_remote_id),
Mapping("cover", remote_field="covers", formatter=self.get_cover_url),
Mapping("sortTitle", remote_field="sort_title"),
Mapping("subtitle"),
Mapping("description", formatter=get_description),
Mapping("languages", formatter=get_languages),
Mapping("series", formatter=get_first),
Mapping("seriesNumber", remote_field="series_number"),
Mapping("subjects"),
Mapping("subjectPlaces", remote_field="subject_places"),
Mapping("isbn13", remote_field="isbn_13", formatter=get_first),
Mapping("isbn10", remote_field="isbn_10", formatter=get_first),
Mapping("lccn", formatter=get_first),
Mapping("oclcNumber", remote_field="oclc_numbers", formatter=get_first),
Mapping(
"openlibraryKey", remote_field="key", formatter=get_openlibrary_key
),
Mapping("goodreadsKey", remote_field="goodreads_key"),
Mapping("asin"),
Mapping(
"firstPublishedDate",
remote_field="first_publish_date",
),
Mapping("publishedDate", remote_field="publish_date"),
Mapping("pages", remote_field="number_of_pages"),
Mapping(
"physicalFormat",
remote_field="physical_format",
formatter=infer_physical_format,
),
Mapping(
"physicalFormatDetail",
remote_field="physical_format",
formatter=unique_physical_format,
),
Mapping("publishers"),
]
        self.author_mappings = [
            Mapping("id", remote_field="key", formatter=get_remote_id),
            Mapping("name"),
            Mapping(
                "openlibraryKey", remote_field="key", formatter=get_openlibrary_key
            ),
            Mapping("born", remote_field="birth_date"),
            Mapping("died", remote_field="death_date"),
            Mapping("bio", formatter=get_description),
            Mapping(
                "isni",
                remote_field="remote_ids",
                formatter=lambda b: get_dict_field(b, "isni"),
            ),
            Mapping(
                "asin",
                remote_field="remote_ids",
                formatter=lambda b: get_dict_field(b, "amazon"),
            ),
            Mapping(
                "viaf",
                remote_field="remote_ids",
                formatter=lambda b: get_dict_field(b, "viaf"),
            ),
            Mapping(
                "wikidata",
                remote_field="remote_ids",
                formatter=lambda b: get_dict_field(b, "wikidata"),
            ),
            Mapping(
                "wikipedia_link", remote_field="links", formatter=get_wikipedia_link
            ),
            Mapping("inventaire_id", remote_field="links", formatter=get_inventaire_id),
        ]
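
    # How these mappings are consumed (a sketch based on abstract_connector.Mapping):
    # each Mapping ties a local model field to a remote json field, with remote_field
    # defaulting to the local name and formatter, when given, applied to the raw value.
    # e.g. Mapping("isbn13", remote_field="isbn_13", formatter=get_first) would read
    # data["isbn_13"] (a list) and keep only its first entry.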

    def get_book_data(self, remote_id):
        """fetch remote data, following OL's /type/redirect pointers when present"""
        data = get_data(remote_id)
        if data.get("type", {}).get("key") == "/type/redirect":
            remote_id = self.base_url + data.get("location")
            return get_data(remote_id)
        return data

    def get_remote_id_from_data(self, data):
        """format a url from an openlibrary id field"""
        try:
            key = data["key"]
        except KeyError:
            raise ConnectorException("Invalid book data")
        return f"{self.books_url}{key}"

    def is_work_data(self, data):
        # work keys look like "/works/OL1234W"; edition keys end in "M" instead
        return bool(re.match(r"^[\/\w]+OL\d+W$", data["key"]))

    def get_edition_from_work_data(self, data):
        try:
            key = data["key"]
        except KeyError:
            raise ConnectorException("Invalid book data")
        url = f"{self.books_url}{key}/editions"
        data = self.get_book_data(url)
        edition = pick_default_edition(data["entries"])
        if not edition:
            raise ConnectorException("No editions for work")
        return edition

    def get_work_from_edition_data(self, data):
        try:
            key = data["works"][0]["key"]
        except (IndexError, KeyError):
            raise ConnectorException("No work found for edition")
        url = f"{self.books_url}{key}"
        return self.get_book_data(url)

    def get_authors_from_data(self, data):
        """parse author json and load or create authors"""
        for author_blob in data.get("authors", []):
            author_blob = author_blob.get("author", author_blob)
            # this id is "/authors/OL1234567A"
            author_id = author_blob["key"]
            url = f"{self.base_url}{author_id}"
            author = self.get_or_create_author(url)
            if not author:
                continue
            yield author

    def get_cover_url(self, cover_blob, size="L"):
        """ask openlibrary for the cover"""
        if not cover_blob:
            return None
        cover_id = cover_blob[0]
        image_name = f"{cover_id}-{size}.jpg"
        return f"{self.covers_url}/b/id/{image_name}"

    def parse_search_data(self, data):
        """turn the search endpoint json into SearchResult objects"""
        for search_result in data.get("docs"):
            # build the remote id from the openlibrary key
            key = self.books_url + search_result["key"]
            author = search_result.get("author_name") or ["Unknown"]
            cover_blob = search_result.get("cover_i")
            cover = self.get_cover_url([cover_blob], size="M") if cover_blob else None
            yield SearchResult(
                title=search_result.get("title"),
                key=key,
                author=", ".join(author),
                connector=self,
                year=search_result.get("first_publish_year"),
                cover=cover,
            )
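
    # for reference, a minimal search "doc" this consumes (field names taken from
    # the lookups above, values are placeholders):
    # {"key": "/works/OL12345W", "title": "Example", "author_name": ["Author"],
    #  "first_publish_year": 1970, "cover_i": 67890}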

    def parse_isbn_search_data(self, data):
        """the isbn endpoint responds with a dict keyed by isbn, one edition per value"""
        for search_result in list(data.values()):
            # build the remote id from the openlibrary key
            key = self.books_url + search_result["key"]
            authors = search_result.get("authors") or [{"name": "Unknown"}]
            author_names = [author.get("name") for author in authors]
            yield SearchResult(
                title=search_result.get("title"),
                key=key,
                author=", ".join(author_names),
                connector=self,
                year=search_result.get("publish_date"),
            )

    def load_edition_data(self, olkey):
        """query openlibrary for editions of a work"""
        url = f"{self.books_url}/works/{olkey}/editions"
        return self.get_book_data(url)

    def expand_book_data(self, book):
        work = book
        # go from the edition to the work, if necessary
        if isinstance(book, models.Edition):
            work = book.parent_work

        # we can mass download edition data from OL to avoid repeatedly querying
        try:
            edition_options = self.load_edition_data(work.openlibrary_key)
        except ConnectorException:
            # who knows, man
            return

        for edition_data in edition_options.get("entries"):
            # does this edition have ANY interesting data?
            if ignore_edition(edition_data):
                continue
            self.create_edition_from_data(work, edition_data)


def ignore_edition(edition_data):
    """don't load a million editions that have no metadata"""
    # an isbn, we love to see it
    if edition_data.get("isbn_13") or edition_data.get("isbn_10"):
        return False
    # grudgingly, oclc can stay
    if edition_data.get("oclc_numbers"):
        return False
    # if it has a cover it can stay
    if edition_data.get("covers"):
        return False
    # keep non-english editions
    if edition_data.get("languages") and "languages/eng" not in str(
        edition_data.get("languages")
    ):
        return False
    return True


def get_description(description_blob):
    """descriptions can be a string or a dict"""
    if isinstance(description_blob, dict):
        return description_blob.get("value")
    return description_blob


def get_openlibrary_key(key):
    """convert /books/OL27320736M into OL27320736M"""
    return key.split("/")[-1]


def get_languages(language_blob):
    """/language/eng -> English"""
    langs = []
    for lang in language_blob:
        langs.append(languages.get(lang.get("key", ""), None))
    return langs


def get_dict_field(blob, field_name):
    """extract a named id (isni, viaf, etc) from an author's remote_ids data"""
    if not blob or not isinstance(blob, dict):
        return None
    return blob.get(field_name)


def get_wikipedia_link(links):
    """extract wikipedia links"""
    if not isinstance(links, list):
        return None
    for link in links:
        if not isinstance(link, dict):
            continue
        if link.get("title") == "wikipedia":
            return link.get("url")
    return None


def get_inventaire_id(links):
    """extract and format inventaire ids"""
    if not isinstance(links, list):
        return None
    for link in links:
        if not isinstance(link, dict):
            continue
        if link.get("title") == "inventaire.io":
            iv_link = link.get("url")
            return iv_link.split("/")[-1]
    return None


def pick_default_edition(options):
    """favor physical copies with covers in english"""
    if not options:
        return None
    if len(options) == 1:
        return options[0]

    options = [e for e in options if e.get("covers")] or options
    options = [
        e for e in options if "/languages/eng" in str(e.get("languages"))
    ] or options
    formats = ["paperback", "hardcover", "mass market paperback"]
    options = [
        e for e in options if str(e.get("physical_format")).lower() in formats
    ] or options
    options = [e for e in options if e.get("isbn_13")] or options
    options = [e for e in options if e.get("ocaid")] or options
    return options[0]
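
# A sketch of the cascade above: given one english-language paperback with covers and
# an isbn_13 alongside an otherwise bare edition, every filter keeps the richer record,
# so it wins; whenever a filter would empty the list, the previous candidates are kept
# instead (the trailing "or options").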