2021-03-08 16:49:10 +00:00
|
|
|
""" openlibrary data connector """
|
2020-03-07 20:22:28 +00:00
|
|
|
import re
|
2020-04-22 13:53:22 +00:00
|
|
|
|
2020-09-21 15:10:37 +00:00
|
|
|
from bookwyrm import models
|
2021-09-16 18:30:04 +00:00
|
|
|
from bookwyrm.book_search import SearchResult
|
|
|
|
from .abstract_connector import AbstractConnector, Mapping
|
2021-09-29 19:29:17 +00:00
|
|
|
from .abstract_connector import get_data, infer_physical_format, unique_physical_format
|
2021-01-02 16:38:27 +00:00
|
|
|
from .connector_manager import ConnectorException
|
2020-03-30 20:15:49 +00:00
|
|
|
from .openlibrary_languages import languages
|
2020-03-07 20:22:28 +00:00
|
|
|
|
|
|
|
|
2020-03-27 22:25:08 +00:00
|
|
|
class Connector(AbstractConnector):
    """instantiate a connector for OL

    Translates OpenLibrary JSON (works, editions, authors and search results)
    into the field names used locally, via the ``Mapping`` lists built in
    ``__init__``.  ``self.base_url``, ``self.books_url`` and
    ``self.covers_url`` are read here but not assigned in this class;
    presumably they are set by ``AbstractConnector.__init__`` -- TODO confirm.
    """

    generated_remote_link_field = "openlibrary_link"

    def __init__(self, identifier):
        """build the book and author field mappings for this connector"""
        super().__init__(identifier)

        # formatters take the remote value first; any extra positional
        # arguments are accepted and ignored
        get_first = lambda a, *args: a[0]
        get_remote_id = lambda a, *args: self.base_url + a
        self.book_mappings = [
            Mapping("title"),
            Mapping("id", remote_field="key", formatter=get_remote_id),
            Mapping("cover", remote_field="covers", formatter=self.get_cover_url),
            Mapping("sortTitle", remote_field="sort_title"),
            Mapping("subtitle"),
            Mapping("description", formatter=get_description),
            Mapping("languages", formatter=get_languages),
            Mapping("series", formatter=get_first),
            Mapping("seriesNumber", remote_field="series_number"),
            Mapping("subjects"),
            Mapping("subjectPlaces", remote_field="subject_places"),
            Mapping("isbn13", remote_field="isbn_13", formatter=get_first),
            Mapping("isbn10", remote_field="isbn_10", formatter=get_first),
            Mapping("lccn", formatter=get_first),
            Mapping("oclcNumber", remote_field="oclc_numbers", formatter=get_first),
            Mapping(
                "openlibraryKey", remote_field="key", formatter=get_openlibrary_key
            ),
            Mapping("goodreadsKey", remote_field="goodreads_key"),
            Mapping("asin"),
            Mapping(
                "firstPublishedDate",
                remote_field="first_publish_date",
            ),
            Mapping("publishedDate", remote_field="publish_date"),
            Mapping("pages", remote_field="number_of_pages"),
            # the same remote field feeds both the inferred format and the
            # free-text detail
            Mapping(
                "physicalFormat",
                remote_field="physical_format",
                formatter=infer_physical_format,
            ),
            Mapping(
                "physicalFormatDetail",
                remote_field="physical_format",
                formatter=unique_physical_format,
            ),
            Mapping("publishers"),
        ]

        self.author_mappings = [
            Mapping("id", remote_field="key", formatter=get_remote_id),
            Mapping("name"),
            Mapping(
                "openlibraryKey", remote_field="key", formatter=get_openlibrary_key
            ),
            Mapping("born", remote_field="birth_date"),
            Mapping("died", remote_field="death_date"),
            Mapping("bio", formatter=get_description),
            # several identifiers are pulled out of the one "remote_ids" dict
            Mapping(
                "isni",
                remote_field="remote_ids",
                formatter=lambda b: get_dict_field(b, "isni"),
            ),
            Mapping(
                "asin",
                remote_field="remote_ids",
                formatter=lambda b: get_dict_field(b, "amazon"),
            ),
            Mapping(
                "viaf",
                remote_field="remote_ids",
                formatter=lambda b: get_dict_field(b, "viaf"),
            ),
            Mapping(
                "wikidata",
                remote_field="remote_ids",
                formatter=lambda b: get_dict_field(b, "wikidata"),
            ),
            Mapping(
                "wikipedia_link", remote_field="links", formatter=get_wikipedia_link
            ),
            Mapping("inventaire_id", remote_field="links", formatter=get_inventaire_id),
        ]

    def get_book_data(self, remote_id):
        """load json for a remote id, following a single redirect record

        Raises ConnectorException when a redirect record carries no
        ``location`` to follow.
        """
        data = get_data(remote_id)
        if data.get("type", {}).get("key") == "/type/redirect":
            location = data.get("location")
            if not location:
                # previously this surfaced as a TypeError from str + None
                raise ConnectorException("Invalid redirect: no location")
            return get_data(self.base_url + location)
        return data

    def get_remote_id_from_data(self, data):
        """format a url from an openlibrary id field

        Raises ConnectorException when ``data`` has no "key" entry.
        """
        try:
            key = data["key"]
        except KeyError as err:
            raise ConnectorException("Invalid book data") from err
        return f"{self.books_url}{key}"

    def is_work_data(self, data):
        """check whether the record's key ends in a work id (OL<digits>W)"""
        return bool(re.match(r"^[\/\w]+OL\d+W$", data["key"]))

    def get_edition_from_work_data(self, data):
        """load a work's editions and pick the default one

        Raises ConnectorException when ``data`` has no "key" entry or when
        no edition can be picked from the editions response.
        """
        try:
            key = data["key"]
        except KeyError as err:
            raise ConnectorException("Invalid book data") from err
        url = f"{self.books_url}{key}/editions"
        data = self.get_book_data(url)
        # a response without "entries" is treated the same as an empty list
        edition = pick_default_edition(data.get("entries"))
        if not edition:
            raise ConnectorException("No editions for work")
        return edition

    def get_work_from_edition_data(self, data):
        """load the first work listed on an edition record

        Raises ConnectorException when the edition lists no work.
        """
        try:
            key = data["works"][0]["key"]
        except (IndexError, KeyError) as err:
            raise ConnectorException("No work found for edition") from err
        url = f"{self.books_url}{key}"
        return self.get_book_data(url)

    def get_authors_from_data(self, data):
        """parse author json and load or create authors"""
        for author_blob in data.get("authors", []):
            # the author may be nested under an "author" key or given directly
            author_blob = author_blob.get("author", author_blob)
            # this id is "/authors/OL1234567A"
            author_id = author_blob["key"]
            url = f"{self.base_url}{author_id}"
            author = self.get_or_create_author(url)
            if not author:
                continue
            yield author

    def get_cover_url(self, cover_blob, size="L"):
        """ask openlibrary for the cover

        Builds the image url from the first id in ``cover_blob``; returns
        None when ``cover_blob`` is empty.
        """
        if not cover_blob:
            return None
        cover_id = cover_blob[0]
        image_name = f"{cover_id}-{size}.jpg"
        return f"{self.covers_url}/b/id/{image_name}"

    def parse_search_data(self, data, min_confidence):
        """yield a SearchResult for each entry under "docs"

        ``min_confidence`` is accepted but not used by this method.
        """
        # a response without "docs" yields nothing instead of failing
        for search_result in data.get("docs") or []:
            # build the remote id from the openlibrary key
            key = self.books_url + search_result["key"]
            author = search_result.get("author_name") or ["Unknown"]
            cover_blob = search_result.get("cover_i")
            cover = self.get_cover_url([cover_blob], size="M") if cover_blob else None
            yield SearchResult(
                title=search_result.get("title"),
                key=key,
                author=", ".join(author),
                connector=self,
                year=search_result.get("first_publish_year"),
                cover=cover,
            )

    def parse_isbn_search_data(self, data):
        """yield a SearchResult for each value of an isbn lookup response"""
        for search_result in data.values():
            # build the remote id from the openlibrary key
            key = self.books_url + search_result["key"]
            authors = search_result.get("authors") or [{"name": "Unknown"}]
            author_names = [author.get("name") for author in authors]
            yield SearchResult(
                title=search_result.get("title"),
                key=key,
                author=", ".join(author_names),
                connector=self,
                year=search_result.get("publish_date"),
            )

    def load_edition_data(self, olkey):
        """query openlibrary for editions of a work"""
        url = f"{self.books_url}/works/{olkey}/editions"
        return self.get_book_data(url)

    def expand_book_data(self, book):
        """create editions for every worthwhile edition of the book's work"""
        work = book
        # go from the edition to the work, if necessary
        if isinstance(book, models.Edition):
            work = book.parent_work

        # we can mass download edition data from OL to avoid repeatedly querying
        try:
            edition_options = self.load_edition_data(work.openlibrary_key)
        except ConnectorException:
            # who knows, man
            return

        # a response without "entries" means there is nothing to add
        for edition_data in edition_options.get("entries") or []:
            # does this edition have ANY interesting data?
            if ignore_edition(edition_data):
                continue
            self.create_edition_from_data(work, edition_data)
|
2020-03-30 19:21:04 +00:00
|
|
|
|
|
|
|
|
2021-01-31 01:19:01 +00:00
|
|
|
def ignore_edition(edition_data):
    """decide whether an edition is too bare to be worth loading

    Returns False (keep it) when the edition has an isbn, an oclc number or
    a cover, or when it lists languages and none of them is english.
    Returns True (skip it) otherwise.
    """
    # any one of these identifying fields is enough to keep the edition
    identifying_fields = ("isbn_13", "isbn_10", "oclc_numbers", "covers")
    if any(edition_data.get(field) for field in identifying_fields):
        return False

    # non-english editions are kept even without the fields above
    language_blob = edition_data.get("languages")
    is_non_english = bool(language_blob) and "languages/eng" not in str(language_blob)
    return not is_non_english
|
|
|
|
|
|
|
|
|
2021-04-07 00:46:06 +00:00
|
|
|
def get_description(description_blob):
    """unwrap a description given either as plain text or as a dict

    A dict yields its "value" entry (None when absent); anything else is
    handed back unchanged.
    """
    if not isinstance(description_blob, dict):
        return description_blob
    return description_blob.get("value")
|
2020-03-28 04:28:52 +00:00
|
|
|
|
2020-03-30 20:15:49 +00:00
|
|
|
|
2021-04-07 00:46:06 +00:00
|
|
|
def get_openlibrary_key(key):
    """strip the path from a key: /books/OL27320736M becomes OL27320736M"""
    # keep only the segment after the final slash
    *_, last_segment = key.split("/")
    return last_segment
|
2020-05-10 23:41:24 +00:00
|
|
|
|
|
|
|
|
2021-04-07 00:46:06 +00:00
|
|
|
def get_languages(language_blob):
    """translate language records into names, e.g. /language/eng -> English

    Each record's "key" is looked up in ``languages``; a record whose key is
    missing or unknown contributes None to the result.
    """
    return [languages.get(entry.get("key", ""), None) for entry in language_blob]
|
|
|
|
|
|
|
|
|
2022-01-30 19:41:33 +00:00
|
|
|
def get_dict_field(blob, field_name):
    """read one named entry out of a dict-shaped remote field

    Returns None when ``blob`` is empty, is not a dict, or lacks
    ``field_name``.
    """
    if blob and isinstance(blob, dict):
        return blob.get(field_name)
    return None
|
2021-12-05 21:24:40 +00:00
|
|
|
|
|
|
|
|
2022-01-30 20:02:18 +00:00
|
|
|
def get_wikipedia_link(links):
    """find the url of the first link titled "wikipedia"

    Returns None when ``links`` is not a list or holds no such link;
    entries that are not dicts are skipped.
    """
    if isinstance(links, list):
        for entry in links:
            if isinstance(entry, dict) and entry.get("title") == "wikipedia":
                return entry.get("url")
    return None
|
|
|
|
|
|
|
|
|
|
|
|
def get_inventaire_id(links):
    """extract and format inventaire ids

    Looks through ``links`` for an entry titled "inventaire.io" and returns
    the last path segment of its url.  Returns None when ``links`` is not a
    list or no such entry has a usable url.
    """
    if not isinstance(links, list):
        return None

    for link in links:
        if not isinstance(link, dict):
            continue
        if link.get("title") != "inventaire.io":
            continue
        iv_link = link.get("url")
        # an entry without a string url cannot be split; skip it rather
        # than raising AttributeError on None
        if not isinstance(iv_link, str) or not iv_link:
            continue
        return iv_link.split("/")[-1]
    return None
|
|
|
|
|
|
|
|
|
2021-04-07 00:46:06 +00:00
|
|
|
def pick_default_edition(options):
    """choose one edition, favoring physical copies with covers in english

    Returns None for an empty list and the sole entry for a single-item
    list.  Otherwise each preference in turn narrows the candidates, unless
    it would leave none, and the first survivor is returned.
    """
    if not options:
        return None
    if len(options) == 1:
        return options[0]

    preferred_formats = ["paperback", "hardcover", "mass market paperback"]
    # applied in order; an earlier preference outranks a later one
    preferences = (
        lambda e: e.get("covers"),
        lambda e: "/languages/eng" in str(e.get("languages")),
        lambda e: str(e.get("physical_format")).lower() in preferred_formats,
        lambda e: e.get("isbn_13"),
        lambda e: e.get("ocaid"),
    )

    candidates = options
    for is_preferred in preferences:
        narrowed = [e for e in candidates if is_preferred(e)]
        # a preference nobody satisfies is ignored
        if narrowed:
            candidates = narrowed
    return candidates[0]
|