bookwyrm/bookwyrm/connectors/abstract_connector.py

280 lines
8.5 KiB
Python
Raw Normal View History

2020-03-07 20:22:28 +00:00
''' functionality outline for a book data connector '''
from abc import ABC, abstractmethod
2020-12-30 19:37:26 +00:00
from dataclasses import asdict, dataclass
2020-12-30 17:14:07 +00:00
import logging
from urllib3.exceptions import RequestError
2020-03-07 20:22:28 +00:00
2020-05-10 19:56:59 +00:00
from django.db import transaction
import requests
2020-12-03 20:35:57 +00:00
from requests.exceptions import SSLError
2020-05-10 19:56:59 +00:00
from bookwyrm import activitypub, models, settings
2021-01-02 16:14:28 +00:00
from .connector_manager import load_more_data, ConnectorException
2020-03-07 20:22:28 +00:00
2020-12-30 17:14:07 +00:00
logger = logging.getLogger(__name__)
2020-11-29 02:56:28 +00:00
class AbstractMinimalConnector(ABC):
    ''' just the bare bones, for other bookwyrm instances

    Mirrors the settings stored on a ``models.Connector`` row onto the
    instance and implements a generic free text search on top of the
    ``parse_search_data`` / ``format_search_result`` hooks that subclasses
    have to provide. '''

    def __init__(self, identifier):
        ''' look up the connector row for ``identifier`` and copy its
        settings onto this object '''
        # load connector settings
        info = models.Connector.objects.get(identifier=identifier)
        self.connector = info

        # the things in the connector model to copy over
        self_fields = [
            'base_url',
            'books_url',
            'covers_url',
            'search_url',
            'max_query_count',
            'name',
            'identifier',
            'local',
        ]
        for field in self_fields:
            setattr(self, field, getattr(info, field))

    def search(self, query, min_confidence=None):
        ''' free text search

        Requests ``search_url`` with ``query`` appended, optionally passing
        ``min_confidence`` as a query parameter, and returns a list with the
        formatted versions of (at most) the first ten parsed results.

        Raises whatever ``raise_for_status`` raises for a non-ok response,
        and ``ConnectorException`` when the body is not valid json. '''
        params = {}
        if min_confidence:
            params['min_confidence'] = min_confidence

        resp = requests.get(
            '%s%s' % (self.search_url, query),
            params=params,
            headers={
                'Accept': 'application/json; charset=utf-8',
                'User-Agent': settings.USER_AGENT,
            },
        )
        if not resp.ok:
            resp.raise_for_status()

        try:
            data = resp.json()
        except ValueError as e:
            logger.exception(e)
            # chain explicitly so the json error is recorded as the cause
            raise ConnectorException('Unable to parse json response', e) from e

        # only the first ten parsed results are formatted and returned
        return [
            self.format_search_result(doc)
            for doc in self.parse_search_data(data)[:10]
        ]

    @abstractmethod
    def get_or_create_book(self, remote_id):
        ''' pull up a book record by whatever means possible '''

    @abstractmethod
    def parse_search_data(self, data):
        ''' turn the result json from a search into a list '''

    @abstractmethod
    def format_search_result(self, search_result):
        ''' create a SearchResult obj from json '''
class AbstractConnector(AbstractMinimalConnector):
    ''' generic book data connector

    Builds works, editions and authors out of remote json using the
    ``Mapping`` lists a subclass supplies in ``book_mappings`` and
    ``author_mappings``. '''

    def __init__(self, identifier):
        super().__init__(identifier)
        # fields we want to look for in book data to copy over
        # title we handle separately.
        self.book_mappings = []
        # read by get_or_create_author; initialised here so the attribute
        # always exists, the same way book_mappings does
        self.author_mappings = []

    def is_available(self):
        ''' check if you're allowed to use this connector

        Returns False once the connector's ``query_count`` has reached
        ``max_query_count``; a ``max_query_count`` of None means no limit. '''
        if self.max_query_count is not None:
            if self.connector.query_count >= self.max_query_count:
                return False
        return True

    def get_or_create_book(self, remote_id):
        ''' find the book saved for ``remote_id``, or load its json and
        create the work and edition from it

        Raises ``ConnectorException`` when no work or edition data could be
        put together for ``remote_id``. '''
        # first, check if we have the origin_id saved
        existing = models.Edition.find_existing_by_remote_id(remote_id) or \
            models.Work.find_existing_by_remote_id(remote_id)
        if existing:
            # NOTE(review): this used to look only for 'get_default_editon',
            # which reads like a misspelling of 'get_default_edition'. Both
            # spellings are accepted so the lookup works whichever one the
            # model defines -- confirm against the model and drop the other.
            for getter_name in ('get_default_edition', 'get_default_editon'):
                getter = getattr(existing, getter_name, None)
                if getter is not None:
                    return getter()
            return existing

        # load the json
        data = get_data(remote_id)
        mapped_data = dict_from_mappings(data, self.book_mappings)
        if self.is_work_data(data):
            try:
                edition_data = self.get_edition_from_work_data(data)
            except KeyError:
                # hack: re-use the work data as the edition data
                # this is why remote ids aren't necessarily unique
                edition_data = data
            work_data = mapped_data
        else:
            try:
                work_data = self.get_work_from_edition_data(data)
                work_data = dict_from_mappings(work_data, self.book_mappings)
            except KeyError:
                work_data = mapped_data
            edition_data = data

        if not work_data or not edition_data:
            raise ConnectorException('Unable to load book data: %s' % remote_id)

        with transaction.atomic():
            # create activitypub object
            work_activity = activitypub.Work(**work_data)
            # this will dedupe automatically
            work = work_activity.to_model(models.Work)
            for author in self.get_authors_from_data(data):
                work.authors.add(author)

            edition = self.create_edition_from_data(work, edition_data)

        # queued after the transaction block has finished
        load_more_data.delay(self.connector.id, work.id)
        return edition

    def create_edition_from_data(self, work, edition_data):
        ''' if we already have the work, we're ready

        Maps ``edition_data`` through ``book_mappings``, saves the resulting
        edition with this connector attached, makes it the default edition
        of ``work`` and returns it. '''
        mapped_data = dict_from_mappings(edition_data, self.book_mappings)
        mapped_data['work'] = work.remote_id
        edition_activity = activitypub.Edition(**mapped_data)
        edition = edition_activity.to_model(models.Edition)
        edition.connector = self.connector
        edition.save()

        work.default_edition = edition
        work.save()

        for author in self.get_authors_from_data(edition_data):
            edition.authors.add(author)
        # an edition without authors of its own takes over the work's
        if not edition.authors.exists() and work.authors.exists():
            edition.authors.set(work.authors.all())

        return edition

    def get_or_create_author(self, remote_id):
        ''' load that author

        Returns the saved author for ``remote_id`` when there is one,
        otherwise loads the remote json and maps it through
        ``author_mappings``. '''
        existing = models.Author.find_existing_by_remote_id(remote_id)
        if existing:
            return existing

        data = get_data(remote_id)

        mapped_data = dict_from_mappings(data, self.author_mappings)
        activity = activitypub.Author(**mapped_data)
        # this will dedupe
        return activity.to_model(models.Author)

    @abstractmethod
    def is_work_data(self, data):
        ''' differentiate works and editions '''

    @abstractmethod
    def get_edition_from_work_data(self, data):
        ''' every work needs at least one edition '''

    @abstractmethod
    def get_work_from_edition_data(self, data):
        ''' every edition needs a work '''

    @abstractmethod
    def get_authors_from_data(self, data):
        ''' load author data '''

    @abstractmethod
    def expand_book_data(self, book):
        ''' get more info on a book '''
2020-12-20 00:14:05 +00:00
def dict_from_mappings(data, mappings):
    ''' create a dict in Activitypub format, using mappings supplied by
    the subclass

    Each mapping contributes one entry: its ``local_field`` is the key and
    the result of its ``get_value(data)`` is the value. '''
    return {
        field_map.local_field: field_map.get_value(data)
        for field_map in mappings
    }
2020-03-30 00:40:51 +00:00
def get_data(url):
    ''' wrapper for requests.get that returns the decoded json body

    Raises ``ConnectorException`` when the request fails with one of the
    connection errors caught below or when the body is not valid json; a
    non-ok response raises whatever ``raise_for_status`` raises. '''
    try:
        resp = requests.get(
            url,
            headers={
                'Accept': 'application/json; charset=utf-8',
                'User-Agent': settings.USER_AGENT,
            },
        )
    except (RequestError, SSLError) as err:
        # keep the original error as the cause so the failure can be traced
        raise ConnectorException('Unable to connect to %s' % url) from err

    if not resp.ok:
        resp.raise_for_status()

    try:
        data = resp.json()
    except ValueError as err:
        raise ConnectorException(
            'Unable to parse json response from %s' % url) from err

    return data
2020-11-29 17:40:15 +00:00
def get_image(url):
    ''' wrapper for requesting an image

    Returns the response object for an ok response, and None when the
    request raises one of the connection errors caught below or the
    response is not ok. '''
    request_headers = {
        'User-Agent': settings.USER_AGENT,
    }
    try:
        image_response = requests.get(url, headers=request_headers)
    except (RequestError, SSLError):
        return None
    return image_response if image_response.ok else None
2020-10-29 22:29:23 +00:00
@dataclass
class SearchResult:
    ''' standardized search result object '''
    title: str
    key: str
    author: str
    year: str
    connector: object
    confidence: int = 1

    def __repr__(self):
        return "<SearchResult key={!r} title={!r} author={!r}>".format(
            self.key, self.title, self.author)

    def json(self):
        ''' serialize a search result for json response

        Returns a dict of every field except ``connector``. '''
        from dataclasses import replace

        # asdict() deep-copies every field value, so blank the connector out
        # first: it is dropped from the output anyway, and copying it is
        # wasted work that fails outright for objects that can't be copied
        serialized = asdict(replace(self, connector=None))
        del serialized['connector']
        return serialized
2020-05-10 23:41:24 +00:00
2020-09-21 17:25:26 +00:00
class Mapping:
    ''' associate a local database field with a field in an external dataset '''

    def __init__(self, local_field, remote_field=None, formatter=None):
        ''' ``remote_field`` defaults to ``local_field``; ``formatter``
        defaults to returning the value unchanged '''
        self.local_field = local_field
        self.remote_field = remote_field or local_field
        self.formatter = formatter or (lambda value: value)

    def get_value(self, data):
        ''' pull a field from incoming json and return the formatted version

        Returns None when the field is missing or falsy, or when the
        formatter raises an exception. '''
        value = data.get(self.remote_field)
        if not value:
            return None
        try:
            return self.formatter(value)
        except Exception:  # pylint: disable=broad-except
            # best effort: a value that can't be formatted is treated as
            # missing, but KeyboardInterrupt/SystemExit still propagate
            return None