""" openlibrary data connector """ import re from bookwyrm import models from bookwyrm.book_search import SearchResult from .abstract_connector import AbstractConnector, Mapping from .abstract_connector import get_data, infer_physical_format, unique_physical_format from .connector_manager import ConnectorException from .openlibrary_languages import languages class Connector(AbstractConnector): """instantiate a connector for OL""" generated_remote_link_field = "openlibrary_link" def __init__(self, identifier): super().__init__(identifier) get_first = lambda a, *args: a[0] get_remote_id = lambda a, *args: self.base_url + a self.book_mappings = [ Mapping("title"), Mapping("id", remote_field="key", formatter=get_remote_id), Mapping("cover", remote_field="covers", formatter=self.get_cover_url), Mapping("sortTitle", remote_field="sort_title"), Mapping("subtitle"), Mapping("description", formatter=get_description), Mapping("languages", formatter=get_languages), Mapping("series", formatter=get_first), Mapping("seriesNumber", remote_field="series_number"), Mapping("subjects"), Mapping("subjectPlaces", remote_field="subject_places"), Mapping("isbn13", remote_field="isbn_13", formatter=get_first), Mapping("isbn10", remote_field="isbn_10", formatter=get_first), Mapping("lccn", formatter=get_first), Mapping("oclcNumber", remote_field="oclc_numbers", formatter=get_first), Mapping( "openlibraryKey", remote_field="key", formatter=get_openlibrary_key ), Mapping("goodreadsKey", remote_field="goodreads_key"), Mapping("asin"), Mapping( "firstPublishedDate", remote_field="first_publish_date", ), Mapping("publishedDate", remote_field="publish_date"), Mapping("pages", remote_field="number_of_pages"), Mapping( "physicalFormat", remote_field="physical_format", formatter=infer_physical_format, ), Mapping( "physicalFormatDetail", remote_field="physical_format", formatter=unique_physical_format, ), Mapping("publishers"), ] self.author_mappings = [ Mapping("id", remote_field="key", formatter=get_remote_id), Mapping("name"), Mapping( "openlibraryKey", remote_field="key", formatter=get_openlibrary_key ), Mapping("born", remote_field="birth_date"), Mapping("died", remote_field="death_date"), Mapping("bio", formatter=get_description), Mapping( "isni", remote_field="remote_ids", formatter=lambda b: get_dict_field(b, "isni"), ), Mapping( "asin", remote_field="remote_ids", formatter=lambda b: get_dict_field(b, "amazon"), ), Mapping( "viaf", remote_field="remote_ids", formatter=lambda b: get_dict_field(b, "viaf"), ), Mapping( "wikidata", remote_field="remote_ids", formatter=lambda b: get_dict_field(b, "wikidata"), ), Mapping( "wikipedia_link", remote_field="links", formatter=get_wikipedia_link ), Mapping("inventaire_id", remote_field="links", formatter=get_inventaire_id), ] def get_book_data(self, remote_id): data = get_data(remote_id) if data.get("type", {}).get("key") == "/type/redirect": remote_id = self.base_url + data.get("location") return get_data(remote_id) return data def get_remote_id_from_data(self, data): """format a url from an openlibrary id field""" try: key = data["key"] except KeyError: raise ConnectorException("Invalid book data") return f"{self.books_url}{key}" def is_work_data(self, data): return bool(re.match(r"^[\/\w]+OL\d+W$", data["key"])) def get_edition_from_work_data(self, data): try: key = data["key"] except KeyError: raise ConnectorException("Invalid book data") url = f"{self.books_url}{key}/editions" data = self.get_book_data(url) edition = pick_default_edition(data["entries"]) if not edition: 
            raise ConnectorException("No editions for work")
        return edition

    def get_work_from_edition_data(self, data):
        """load the work record an edition belongs to"""
        try:
            key = data["works"][0]["key"]
        except (IndexError, KeyError):
            raise ConnectorException("No work found for edition")
        url = f"{self.books_url}{key}"
        return self.get_book_data(url)

    def get_authors_from_data(self, data):
        """parse author json and load or create authors"""
        for author_blob in data.get("authors", []):
            author_blob = author_blob.get("author", author_blob)
            # this id is "/authors/OL1234567A"
            author_id = author_blob["key"]
            url = f"{self.base_url}{author_id}"
            author = self.get_or_create_author(url)
            if not author:
                continue
            yield author

    def get_cover_url(self, cover_blob, size="L"):
        """ask openlibrary for the cover"""
        if not cover_blob:
            return None
        cover_id = cover_blob[0]
        image_name = f"{cover_id}-{size}.jpg"
        return f"{self.covers_url}/b/id/{image_name}"

    def parse_search_data(self, data, min_confidence):
        """turn openlibrary search results into SearchResult objects"""
        for idx, search_result in enumerate(data.get("docs")):
            # build the remote id from the openlibrary key
            key = self.books_url + search_result["key"]
            author = search_result.get("author_name") or ["Unknown"]
            cover_blob = search_result.get("cover_i")
            cover = self.get_cover_url([cover_blob], size="M") if cover_blob else None

            # OL doesn't provide confidence, but it does sort by an internal ranking,
            # so this confidence value is relative to the list position
            confidence = 1 / (idx + 1)

            yield SearchResult(
                title=search_result.get("title"),
                key=key,
                author=", ".join(author),
                connector=self,
                year=search_result.get("first_publish_year"),
                cover=cover,
                confidence=confidence,
            )

    def parse_isbn_search_data(self, data):
        """turn openlibrary isbn search results into SearchResult objects"""
        for search_result in list(data.values()):
            # build the remote id from the openlibrary key
            key = self.books_url + search_result["key"]
            authors = search_result.get("authors") or [{"name": "Unknown"}]
            author_names = [author.get("name") for author in authors]
            yield SearchResult(
                title=search_result.get("title"),
                key=key,
                author=", ".join(author_names),
                connector=self,
                year=search_result.get("publish_date"),
            )

    def load_edition_data(self, olkey):
        """query openlibrary for editions of a work"""
        url = f"{self.books_url}/works/{olkey}/editions"
        return self.get_book_data(url)

    def expand_book_data(self, book):
        """load the other editions of a book's work"""
        work = book
        # go from the edition to the work, if necessary
        if isinstance(book, models.Edition):
            work = book.parent_work

        # we can mass download edition data from OL to avoid repeatedly querying
        try:
            edition_options = self.load_edition_data(work.openlibrary_key)
        except ConnectorException:
            # who knows, man
            return

        for edition_data in edition_options.get("entries"):
            # does this edition have ANY interesting data?
            if ignore_edition(edition_data):
                continue
            self.create_edition_from_data(work, edition_data)


def ignore_edition(edition_data):
    """don't load a million editions that have no metadata"""
    # an isbn, we love to see it
    if edition_data.get("isbn_13") or edition_data.get("isbn_10"):
        return False
    # grudgingly, oclc can stay
    if edition_data.get("oclc_numbers"):
        return False
    # if it has a cover it can stay
    if edition_data.get("covers"):
        return False
    # keep non-english editions
    if edition_data.get("languages") and "languages/eng" not in str(
        edition_data.get("languages")
    ):
        return False
    return True


def get_description(description_blob):
    """descriptions can be a string or a dict"""
    if isinstance(description_blob, dict):
        return description_blob.get("value")
    return description_blob


def get_openlibrary_key(key):
    """convert /books/OL27320736M into OL27320736M"""
    return key.split("/")[-1]


def get_languages(language_blob):
    """/languages/eng -> English"""
    langs = []
    for lang in language_blob:
        langs.append(languages.get(lang.get("key", ""), None))
    return langs


def get_dict_field(blob, field_name):
    """extract the named field (isni, viaf, etc) from an author's remote id data"""
    if not blob or not isinstance(blob, dict):
        return None
    return blob.get(field_name)


def get_wikipedia_link(links):
    """extract wikipedia links"""
    if not isinstance(links, list):
        return None

    for link in links:
        if not isinstance(link, dict):
            continue
        if link.get("title") == "wikipedia":
            return link.get("url")
    return None


def get_inventaire_id(links):
    """extract and format inventaire ids"""
    if not isinstance(links, list):
        return None

    for link in links:
        if not isinstance(link, dict):
            continue
        if link.get("title") == "inventaire.io":
            iv_link = link.get("url")
            return iv_link.split("/")[-1]
    return None


def pick_default_edition(options):
    """favor physical copies with covers in english"""
    if not options:
        return None
    if len(options) == 1:
        return options[0]

    options = [e for e in options if e.get("covers")] or options
    options = [
        e for e in options if "/languages/eng" in str(e.get("languages"))
    ] or options
    formats = ["paperback", "hardcover", "mass market paperback"]
    options = [
        e for e in options if str(e.get("physical_format")).lower() in formats
    ] or options
    options = [e for e in options if e.get("isbn_13")] or options
    options = [e for e in options if e.get("ocaid")] or options
    return options[0]
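

# Illustrative sketch only (hypothetical edition blobs, not part of the runtime
# path): each pick_default_edition filter keeps matching editions and falls back
# to the previous list whenever it would otherwise empty it, so an edition with
# a cover and an isbn_13 wins over one with no metadata at all:
#
#   pick_default_edition([
#       {"key": "/books/OL1M"},
#       {"key": "/books/OL2M", "covers": [1], "isbn_13": ["9780000000000"]},
#   ])
#   # -> returns the second entry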