2021-04-06 18:58:07 +00:00
|
|
|
""" inventaire data connector """
|
2021-04-29 00:18:14 +00:00
|
|
|
import re
|
|
|
|
|
2021-04-07 01:34:55 +00:00
|
|
|
from bookwyrm import models
|
2021-09-16 18:30:04 +00:00
|
|
|
from bookwyrm.book_search import SearchResult
|
|
|
|
from .abstract_connector import AbstractConnector, Mapping
|
2021-04-06 18:58:07 +00:00
|
|
|
from .abstract_connector import get_data
|
2021-04-06 20:39:10 +00:00
|
|
|
from .connector_manager import ConnectorException
|
2021-04-06 18:58:07 +00:00
|
|
|
|
|
|
|
|
|
|
|
class Connector(AbstractConnector):
    """instantiate a connector for inventaire"""

    generated_remote_link_field = "inventaire_id"

    def __init__(self, identifier):
        """set up the field mappings between inventaire data and local models"""
        super().__init__(identifier)

        def get_first(values):
            """pick the first entry of a list-valued remote field"""
            return values[0]

        # identifiers that are mapped the same way for books and authors
        shared_mappings = [
            Mapping("id", remote_field="uri", formatter=self.get_remote_id),
            Mapping("bnfId", remote_field="wdt:P268", formatter=get_first),
            Mapping("openlibraryKey", remote_field="wdt:P648", formatter=get_first),
        ]
        self.book_mappings = [
            Mapping("title", remote_field="wdt:P1476", formatter=get_first),
            Mapping("title", remote_field="labels", formatter=get_language_code),
            Mapping("subtitle", remote_field="wdt:P1680", formatter=get_first),
            Mapping("inventaireId", remote_field="uri"),
            Mapping(
                "description", remote_field="sitelinks", formatter=self.get_description
            ),
            Mapping("cover", remote_field="image", formatter=self.get_cover_url),
            Mapping("isbn13", remote_field="wdt:P212", formatter=get_first),
            Mapping("isbn10", remote_field="wdt:P957", formatter=get_first),
            Mapping("oclcNumber", remote_field="wdt:P5331", formatter=get_first),
            Mapping("goodreadsKey", remote_field="wdt:P2969", formatter=get_first),
            Mapping("librarythingKey", remote_field="wdt:P1085", formatter=get_first),
            Mapping("languages", remote_field="wdt:P407", formatter=self.resolve_keys),
            Mapping("publishers", remote_field="wdt:P123", formatter=self.resolve_keys),
            Mapping("publishedDate", remote_field="wdt:P577", formatter=get_first),
            Mapping("pages", remote_field="wdt:P1104", formatter=get_first),
            Mapping(
                "subjectPlaces", remote_field="wdt:P840", formatter=self.resolve_keys
            ),
            Mapping("subjects", remote_field="wdt:P921", formatter=self.resolve_keys),
            Mapping("asin", remote_field="wdt:P5749", formatter=get_first),
        ] + shared_mappings
        # TODO: P136: genre, P674 characters, P950 bne

        self.author_mappings = [
            Mapping("id", remote_field="uri", formatter=self.get_remote_id),
            Mapping("name", remote_field="labels", formatter=get_language_code),
            Mapping("bio", remote_field="sitelinks", formatter=self.get_description),
            Mapping("goodreadsKey", remote_field="wdt:P2963", formatter=get_first),
            Mapping("isni", remote_field="wdt:P213", formatter=get_first),
            Mapping("viafId", remote_field="wdt:P214", formatter=get_first),
            Mapping("gutenberg_id", remote_field="wdt:P1938", formatter=get_first),
            Mapping("born", remote_field="wdt:P569", formatter=get_first),
            Mapping("died", remote_field="wdt:P570", formatter=get_first),
        ] + shared_mappings

    def get_remote_id(self, value):
        """convert an id/uri into a url"""
        return f"{self.books_url}?action=by-uris&uris={value}"

    def get_book_data(self, remote_id):
        """load one entity from a by-uris url and flatten it into a single dict

        raises ConnectorException when the response holds no entities
        """
        data = get_data(remote_id)
        # a missing or null "entities" key is treated the same as an empty one
        extracted = list((data.get("entities") or {}).values())
        if not extracted:
            raise ConnectorException("Invalid book data")
        entity = extracted[0]

        # flatten the data so that images, uri, and claims are on the same level
        top_level_keys = ["uri", "image", "labels", "sitelinks", "type"]
        return {
            **(entity.get("claims") or {}),
            **{k: entity.get(k) for k in top_level_keys},
        }

    def search(self, query, min_confidence=None):  # pylint: disable=arguments-differ
        """overrides default search function with confidence ranking"""
        results = super().search(query)
        if min_confidence:
            # filter the search results after the fact
            return [r for r in results if r.confidence >= min_confidence]
        return results

    def parse_search_data(self, data):
        """pull the list of results out of a search response"""
        return data.get("results")

    def format_search_result(self, search_result):
        """turn one raw search result into a SearchResult"""
        images = search_result.get("image")
        cover = f"{self.covers_url}/img/entities/{images[0]}" if images else None
        # a deeply messy translation of inventaire's scores
        confidence = float(search_result.get("_score", 0.1))
        confidence = 0.1 if confidence < 150 else 0.999
        return SearchResult(
            title=search_result.get("label"),
            key=self.get_remote_id(search_result.get("uri")),
            author=search_result.get("description"),
            view_link=f"{self.base_url}/entity/{search_result.get('uri')}",
            cover=cover,
            confidence=confidence,
            connector=self,
        )

    def parse_isbn_search_data(self, data):
        """got some daaaata"""
        results = data.get("entities")
        if not results:
            return []
        return list(results.values())

    def format_isbn_search_result(self, search_result):
        """totally different format than a regular search result

        returns None when the entity has no title claim
        """
        title = search_result.get("claims", {}).get("wdt:P1476", [])
        if not title:
            return None
        return SearchResult(
            title=title[0],
            key=self.get_remote_id(search_result.get("uri")),
            author=search_result.get("description"),
            view_link=f"{self.base_url}/entity/{search_result.get('uri')}",
            # get_cover_url copes with entities that have no image
            cover=self.get_cover_url(search_result.get("image")),
            connector=self,
        )

    def is_work_data(self, data):
        """check the entity type to tell works apart from editions"""
        return data.get("type") == "work"

    def load_edition_data(self, work_uri):
        """get a list of editions for a work"""
        # pylint: disable=line-too-long
        url = f"{self.books_url}?action=reverse-claims&property=wdt:P629&value={work_uri}&sort=true"
        return get_data(url)

    def get_edition_from_work_data(self, data):
        """load the first edition listed for a work

        raises ConnectorException when the work has no editions
        """
        data = self.load_edition_data(data.get("uri"))
        try:
            # "uris" may be absent or null; both count as "no editions"
            uri = (data.get("uris") or [])[0]
        except IndexError as err:
            raise ConnectorException("Invalid book data") from err
        return self.get_book_data(self.get_remote_id(uri))

    def get_work_from_edition_data(self, data):
        """load the work that an edition points to via wdt:P629

        raises ConnectorException when the edition has no work uri
        """
        # an empty or null claim list must not blow up with an IndexError
        work_uris = data.get("wdt:P629") or [None]
        uri = work_uris[0]
        if not uri:
            raise ConnectorException("Invalid book data")
        return self.get_book_data(self.get_remote_id(uri))

    def get_authors_from_data(self, data):
        """yield an author for every uri in the wdt:P50 claim"""
        authors = data.get("wdt:P50", [])
        for author in authors:
            yield self.get_or_create_author(self.get_remote_id(author))

    def expand_book_data(self, book):
        """create every edition inventaire lists for this book's work"""
        work = book
        # go from the edition to the work, if necessary
        if isinstance(book, models.Edition):
            work = book.parent_work

        try:
            edition_options = self.load_edition_data(work.inventaire_id)
        except ConnectorException:
            # who knows, man
            return

        # a response without "uris" simply means there is nothing to add
        for edition_uri in edition_options.get("uris") or []:
            remote_id = self.get_remote_id(edition_uri)
            try:
                data = self.get_book_data(remote_id)
            except ConnectorException:
                # who, indeed, knows
                continue
            self.create_edition_from_data(work, data)

    def get_cover_url(self, cover_blob, *_):
        """format the relative cover url into an absolute one:
        {"url": "/img/entities/e794783f01b9d4f897a1ea9820b96e00d346994f"}

        returns None when there is no usable cover in the blob
        """
        # covers may or may not be a list
        if isinstance(cover_blob, list):
            if not cover_blob:
                return None
            cover_blob = cover_blob[0]
        # missing images come through as None; anything without a "url" is unusable
        if not isinstance(cover_blob, dict):
            return None
        cover_id = cover_blob.get("url")
        if not cover_id or not isinstance(cover_id, str):
            return None
        # cover may or may not be an absolute url already
        if re.match(r"^http", cover_id):
            return cover_id
        return f"{self.covers_url}{cover_id}"

    def resolve_keys(self, keys):
        """cool, it's "wd:Q3156592" now what the heck does that mean"""
        results = []
        for uri in keys:
            try:
                data = self.get_book_data(self.get_remote_id(uri))
            except ConnectorException:
                continue
            # entities without labels are handled like ones with empty labels
            results.append(get_language_code(data.get("labels") or {}))
        return results

    def get_description(self, links):
        """grab an extracted excerpt from wikipedia"""
        link = (links or {}).get("enwiki")
        if not link:
            return ""
        url = f"{self.base_url}/api/data?action=wp-extract&lang=en&title={link}"
        try:
            data = get_data(url)
        except ConnectorException:
            return ""
        return data.get("extract")

    def get_remote_id_from_model(self, obj):
        """use get_remote_id to figure out the link from a model obj"""
        remote_id_value = obj.inventaire_id
        return self.get_remote_id(remote_id_value)
|
|
|
|
|
2021-04-07 00:46:06 +00:00
|
|
|
|
|
|
|
def get_language_code(options, code="en"):
    """when there are a bunch of translation but we need a single field

    options maps language codes to values; the value for `code` is preferred,
    otherwise the first available value is used. returns None when there is
    nothing to choose from (including when options is None).
    """
    if not options:
        return None
    result = options.get(code)
    if result:
        return result
    # fall back to whichever translation comes first
    return next(iter(options.values()), None)
|