""" openlibrary data connector """
import re

from bookwyrm import models
from bookwyrm.book_search import SearchResult
from .abstract_connector import AbstractConnector, Mapping
from .abstract_connector import get_data, infer_physical_format, unique_physical_format
from .connector_manager import ConnectorException
from .openlibrary_languages import languages


class Connector(AbstractConnector):
    """instantiate a connector for OL"""

    generated_remote_link_field = "openlibrary_link"

    def __init__(self, identifier):
        super().__init__(identifier)

        get_first = lambda a, *args: a[0]
        get_remote_id = lambda a, *args: self.base_url + a
        self.book_mappings = [
            Mapping("title"),
            Mapping("id", remote_field="key", formatter=get_remote_id),
            Mapping("cover", remote_field="covers", formatter=self.get_cover_url),
            Mapping("sortTitle", remote_field="sort_title"),
            Mapping("subtitle"),
            Mapping("description", formatter=get_description),
            Mapping("languages", formatter=get_languages),
            Mapping("series", formatter=get_first),
            Mapping("seriesNumber", remote_field="series_number"),
            Mapping("subjects"),
            Mapping("subjectPlaces", remote_field="subject_places"),
            Mapping("isbn13", remote_field="isbn_13", formatter=get_first),
            Mapping("isbn10", remote_field="isbn_10", formatter=get_first),
            Mapping("lccn", formatter=get_first),
            Mapping("oclcNumber", remote_field="oclc_numbers", formatter=get_first),
            Mapping(
                "openlibraryKey", remote_field="key", formatter=get_openlibrary_key
            ),
            Mapping("goodreadsKey", remote_field="goodreads_key"),
            Mapping("asin"),
            Mapping(
                "firstPublishedDate",
                remote_field="first_publish_date",
            ),
            Mapping("publishedDate", remote_field="publish_date"),
            Mapping("pages", remote_field="number_of_pages"),
            Mapping(
                "physicalFormat",
                remote_field="physical_format",
                formatter=infer_physical_format,
            ),
            Mapping(
                "physicalFormatDetail",
                remote_field="physical_format",
                formatter=unique_physical_format,
            ),
            Mapping("publishers"),
        ]
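
        # Each Mapping appears to pair a local field with a remote field
        # (defaulting to the local name) and an optional formatter; e.g.
        # Mapping("isbn13", remote_field="isbn_13", formatter=get_first)
        # stores get_first(data["isbn_13"]) on the local isbn13 field.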

        self.author_mappings = [
            Mapping("id", remote_field="key", formatter=get_remote_id),
            Mapping("name"),
            Mapping(
                "openlibraryKey", remote_field="key", formatter=get_openlibrary_key
            ),
            Mapping("born", remote_field="birth_date"),
            Mapping("died", remote_field="death_date"),
            Mapping("bio", formatter=get_description),
            Mapping(
                "isni",
                remote_field="remote_ids",
                formatter=lambda b: get_dict_field(b, "isni"),
            ),
            Mapping(
                "asin",
                remote_field="remote_ids",
                formatter=lambda b: get_dict_field(b, "amazon"),
            ),
            Mapping(
                "viaf",
                remote_field="remote_ids",
                formatter=lambda b: get_dict_field(b, "viaf"),
            ),
            Mapping(
                "wikidata",
                remote_field="remote_ids",
                formatter=lambda b: get_dict_field(b, "wikidata"),
            ),
            Mapping(
                "wikipedia_link", remote_field="links", formatter=get_wikipedia_link
            ),
            Mapping("inventaire_id", remote_field="links", formatter=get_inventaire_id),
        ]

    def get_book_data(self, remote_id):
        """fetch book data and follow any OL redirect to the canonical record"""
        data = get_data(remote_id)
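        # a redirect record carries its destination in "location", e.g.
        # {"type": {"key": "/type/redirect"}, "location": "/books/OL1234M"}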
        if data.get("type", {}).get("key") == "/type/redirect":
            remote_id = self.base_url + data.get("location")
            return get_data(remote_id)
        return data

    def get_remote_id_from_data(self, data):
        """format a url from an openlibrary id field"""
        try:
            key = data["key"]
        except KeyError as err:
            raise ConnectorException("Invalid book data") from err
        return f"{self.books_url}{key}"

    def is_work_data(self, data):
        """work keys look like /works/OL1234W; edition keys end in M instead"""
        return bool(re.match(r"^[\/\w]+OL\d+W$", data["key"]))
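
    # For example (following OpenLibrary's key conventions):
    #   is_work_data({"key": "/works/OL27448W"})    -> True
    #   is_work_data({"key": "/books/OL31820332M"}) -> False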

    def get_edition_from_work_data(self, data):
        """load the editions of a work and pick a default one"""
        try:
            key = data["key"]
        except KeyError as err:
            raise ConnectorException("Invalid book data") from err
        url = f"{self.books_url}{key}/editions"
        data = self.get_book_data(url)
        edition = pick_default_edition(data["entries"])
        if not edition:
            raise ConnectorException("No editions for work")
        return edition

    def get_work_from_edition_data(self, data):
        """given an edition, load its parent work"""
        try:
            key = data["works"][0]["key"]
        except (IndexError, KeyError) as err:
            raise ConnectorException("No work found for edition") from err
        url = f"{self.books_url}{key}"
        return self.get_book_data(url)

    def get_authors_from_data(self, data):
        """parse author json and load or create authors"""
        for author_blob in data.get("authors", []):
            author_blob = author_blob.get("author", author_blob)
            # this id is "/authors/OL1234567A"
            author_id = author_blob["key"]
            url = f"{self.base_url}{author_id}"
            author = self.get_or_create_author(url)
            if not author:
                continue
            yield author

    def get_cover_url(self, cover_blob, size="L"):
        """ask openlibrary for the cover"""
        if not cover_blob:
            return None
        cover_id = cover_blob[0]
        image_name = f"{cover_id}-{size}.jpg"
        return f"{self.covers_url}/b/id/{image_name}"

    def parse_search_data(self, data, min_confidence):
        """turn OL search results into SearchResult objects"""
        for idx, search_result in enumerate(data.get("docs") or []):
            # build the remote id from the openlibrary key
            key = self.books_url + search_result["key"]
            author = search_result.get("author_name") or ["Unknown"]
            cover_blob = search_result.get("cover_i")
            cover = self.get_cover_url([cover_blob], size="M") if cover_blob else None

            # OL doesn't provide confidence, but it does sort by an internal ranking, so
            # this confidence value is relative to the list position
            confidence = 1 / (idx + 1)
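            # i.e. the first result gets 1.0, the second 0.5, the third 0.33,
            # and so on down the list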

            yield SearchResult(
                title=search_result.get("title"),
                key=key,
                author=", ".join(author),
                connector=self,
                year=search_result.get("first_publish_year"),
                cover=cover,
                confidence=confidence,
            )

    def parse_isbn_search_data(self, data):
        """turn OL isbn search results into SearchResult objects"""
        for search_result in list(data.values()):
            # build the remote id from the openlibrary key
            key = self.books_url + search_result["key"]
            authors = search_result.get("authors") or [{"name": "Unknown"}]
            author_names = [author.get("name") for author in authors]
            yield SearchResult(
                title=search_result.get("title"),
                key=key,
                author=", ".join(author_names),
                connector=self,
                year=search_result.get("publish_date"),
            )

    def load_edition_data(self, olkey):
        """query openlibrary for editions of a work"""
        url = f"{self.books_url}/works/{olkey}/editions"
        return self.get_book_data(url)

    def expand_book_data(self, book):
        """load and create all the other editions of a book's work"""
        work = book
        # go from the edition to the work, if necessary
        if isinstance(book, models.Edition):
            work = book.parent_work

        # we can mass download edition data from OL to avoid repeatedly querying
        try:
            edition_options = self.load_edition_data(work.openlibrary_key)
        except ConnectorException:
            # the edition data wasn't available or couldn't be loaded
            return

        for edition_data in edition_options.get("entries", []):
            # does this edition have ANY interesting data?
            if ignore_edition(edition_data):
                continue
            self.create_edition_from_data(work, edition_data)


def ignore_edition(edition_data):
    """don't load a million editions that have no metadata"""
    # an isbn, we love to see it
    if edition_data.get("isbn_13") or edition_data.get("isbn_10"):
        return False
    # grudgingly, oclc can stay
    if edition_data.get("oclc_numbers"):
        return False
    # if it has a cover it can stay
    if edition_data.get("covers"):
        return False
    # keep non-english editions
    if edition_data.get("languages") and "languages/eng" not in str(
        edition_data.get("languages")
    ):
        return False
    return True


def get_description(description_blob):
    """descriptions can be a string or a dict"""
    if isinstance(description_blob, dict):
        return description_blob.get("value")
    return description_blob


def get_openlibrary_key(key):
    """convert /books/OL27320736M into OL27320736M"""
    return key.split("/")[-1]


def get_languages(language_blob):
    """/languages/eng -> English"""
    langs = []
    for lang in language_blob:
        langs.append(languages.get(lang.get("key", ""), None))
    return langs
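    # e.g. [{"key": "/languages/eng"}] -> ["English"]; keys missing from the
    # languages table come back as None entries in the list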


def get_dict_field(blob, field_name):
    """extract a named identifier (isni, viaf, etc) from an author's remote_ids"""
    if not blob or not isinstance(blob, dict):
        return None
    return blob.get(field_name)


def get_wikipedia_link(links):
    """extract wikipedia links"""
    if not isinstance(links, list):
        return None

    for link in links:
        if not isinstance(link, dict):
            continue
        if link.get("title") == "wikipedia":
            return link.get("url")
    return None


def get_inventaire_id(links):
    """extract and format inventaire ids"""
    if not isinstance(links, list):
        return None

    for link in links:
        if not isinstance(link, dict):
            continue
        if link.get("title") == "inventaire.io":
            iv_link = link.get("url")
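            # e.g. a link like https://inventaire.io/entity/wd:Q3203603
            # (assumed URL shape) would yield the id "wd:Q3203603"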
            return iv_link.split("/")[-1]
    return None


def pick_default_edition(options):
    """favor physical copies with covers in english"""
    if not options:
        return None
    if len(options) == 1:
        return options[0]

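    # each filter below falls back to the unfiltered list if it would
    # eliminate every candidate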
    options = [e for e in options if e.get("covers")] or options
    options = [
        e for e in options if "/languages/eng" in str(e.get("languages"))
    ] or options
    formats = ["paperback", "hardcover", "mass market paperback"]
    options = [
        e for e in options if str(e.get("physical_format")).lower() in formats
    ] or options
    options = [e for e in options if e.get("isbn_13")] or options
    options = [e for e in options if e.get("ocaid")] or options
    return options[0]