""" functionality outline for a book data connector """ from abc import ABC, abstractmethod import logging from django.db import transaction import requests from requests.exceptions import RequestException from bookwyrm import activitypub, models, settings from .connector_manager import load_more_data, ConnectorException logger = logging.getLogger(__name__) class AbstractMinimalConnector(ABC): """just the bare bones, for other bookwyrm instances""" def __init__(self, identifier): # load connector settings info = models.Connector.objects.get(identifier=identifier) self.connector = info # the things in the connector model to copy over self_fields = [ "base_url", "books_url", "covers_url", "search_url", "isbn_search_url", "name", "identifier", ] for field in self_fields: setattr(self, field, getattr(info, field)) def search(self, query, min_confidence=None, timeout=5): """free text search""" params = {} if min_confidence: params["min_confidence"] = min_confidence data = self.get_search_data( "%s%s" % (self.search_url, query), params=params, timeout=timeout, ) results = [] for doc in self.parse_search_data(data)[:10]: results.append(self.format_search_result(doc)) return results def isbn_search(self, query): """isbn search""" params = {} data = self.get_search_data( "%s%s" % (self.isbn_search_url, query), params=params, ) results = [] # this shouldn't be returning mutliple results, but just in case for doc in self.parse_isbn_search_data(data)[:10]: results.append(self.format_isbn_search_result(doc)) return results def get_search_data(self, remote_id, **kwargs): # pylint: disable=no-self-use """this allows connectors to override the default behavior""" return get_data(remote_id, **kwargs) @abstractmethod def get_or_create_book(self, remote_id): """pull up a book record by whatever means possible""" @abstractmethod def parse_search_data(self, data): """turn the result json from a search into a list""" @abstractmethod def format_search_result(self, search_result): """create a SearchResult obj from json""" @abstractmethod def parse_isbn_search_data(self, data): """turn the result json from a search into a list""" @abstractmethod def format_isbn_search_result(self, search_result): """create a SearchResult obj from json""" class AbstractConnector(AbstractMinimalConnector): """generic book data connector""" def __init__(self, identifier): super().__init__(identifier) # fields we want to look for in book data to copy over # title we handle separately. self.book_mappings = [] def get_or_create_book(self, remote_id): """translate arbitrary json into an Activitypub dataclass""" # first, check if we have the origin_id saved existing = models.Edition.find_existing_by_remote_id( remote_id ) or models.Work.find_existing_by_remote_id(remote_id) if existing: if hasattr(existing, "default_edition"): return existing.default_edition return existing # load the json data = self.get_book_data(remote_id) if self.is_work_data(data): try: edition_data = self.get_edition_from_work_data(data) except (KeyError, ConnectorException): # hack: re-use the work data as the edition data # this is why remote ids aren't necessarily unique edition_data = data work_data = data else: edition_data = data try: work_data = self.get_work_from_edition_data(data) except (KeyError, ConnectorException) as err: logger.exception(err) work_data = data if not work_data or not edition_data: raise ConnectorException("Unable to load book data: %s" % remote_id) with transaction.atomic(): # create activitypub object work_activity = activitypub.Work( **dict_from_mappings(work_data, self.book_mappings) ) # this will dedupe automatically work = work_activity.to_model(model=models.Work, overwrite=False) for author in self.get_authors_from_data(work_data): work.authors.add(author) edition = self.create_edition_from_data(work, edition_data) load_more_data.delay(self.connector.id, work.id) return edition def get_book_data(self, remote_id): # pylint: disable=no-self-use """this allows connectors to override the default behavior""" return get_data(remote_id) def create_edition_from_data(self, work, edition_data): """if we already have the work, we're ready""" mapped_data = dict_from_mappings(edition_data, self.book_mappings) mapped_data["work"] = work.remote_id edition_activity = activitypub.Edition(**mapped_data) edition = edition_activity.to_model(model=models.Edition, overwrite=False) edition.connector = self.connector edition.save() for author in self.get_authors_from_data(edition_data): edition.authors.add(author) if not edition.authors.exists() and work.authors.exists(): edition.authors.set(work.authors.all()) return edition def get_or_create_author(self, remote_id): """load that author""" existing = models.Author.find_existing_by_remote_id(remote_id) if existing: return existing data = self.get_book_data(remote_id) mapped_data = dict_from_mappings(data, self.author_mappings) try: activity = activitypub.Author(**mapped_data) except activitypub.ActivitySerializerError: return None # this will dedupe return activity.to_model(model=models.Author, overwrite=False) @abstractmethod def is_work_data(self, data): """differentiate works and editions""" @abstractmethod def get_edition_from_work_data(self, data): """every work needs at least one edition""" @abstractmethod def get_work_from_edition_data(self, data): """every edition needs a work""" @abstractmethod def get_authors_from_data(self, data): """load author data""" @abstractmethod def expand_book_data(self, book): """get more info on a book""" def dict_from_mappings(data, mappings): """create a dict in Activitypub format, using mappings supplies by the subclass""" result = {} for mapping in mappings: # sometimes there are multiple mappings for one field, don't # overwrite earlier writes in that case if mapping.local_field in result and result[mapping.local_field]: continue result[mapping.local_field] = mapping.get_value(data) return result def get_data(url, params=None, timeout=10): """wrapper for request.get""" # check if the url is blocked if models.FederatedServer.is_blocked(url): raise ConnectorException( "Attempting to load data from blocked url: {:s}".format(url) ) try: resp = requests.get( url, params=params, headers={ "Accept": "application/json; charset=utf-8", "User-Agent": settings.USER_AGENT, }, timeout=timeout, ) except RequestException as err: logger.exception(err) raise ConnectorException() if not resp.ok: raise ConnectorException() try: data = resp.json() except ValueError as err: logger.exception(err) raise ConnectorException() return data def get_image(url, timeout=10): """wrapper for requesting an image""" try: resp = requests.get( url, headers={ "User-Agent": settings.USER_AGENT, }, timeout=timeout, ) except RequestException as err: logger.exception(err) return None if not resp.ok: return None return resp class Mapping: """associate a local database field with a field in an external dataset""" def __init__(self, local_field, remote_field=None, formatter=None): noop = lambda x: x self.local_field = local_field self.remote_field = remote_field or local_field self.formatter = formatter or noop def get_value(self, data): """pull a field from incoming json and return the formatted version""" value = data.get(self.remote_field) if not value: return None try: return self.formatter(value) except: # pylint: disable=bare-except return None