""" functionality outline for a book data connector """ from abc import ABC, abstractmethod from dataclasses import asdict, dataclass import logging from django.db import transaction import requests from requests.exceptions import RequestException from bookwyrm import activitypub, models, settings from .connector_manager import load_more_data, ConnectorException from .format_mappings import format_mappings logger = logging.getLogger(__name__) class AbstractMinimalConnector(ABC): """just the bare bones, for other bookwyrm instances""" def __init__(self, identifier): # load connector settings info = models.Connector.objects.get(identifier=identifier) self.connector = info # the things in the connector model to copy over self_fields = [ "base_url", "books_url", "covers_url", "search_url", "isbn_search_url", "name", "identifier", "local", ] for field in self_fields: setattr(self, field, getattr(info, field)) def search(self, query, min_confidence=None, timeout=5): """free text search""" params = {} if min_confidence: params["min_confidence"] = min_confidence data = self.get_search_data( f"{self.search_url}{query}", params=params, timeout=timeout, ) results = [] for doc in self.parse_search_data(data)[:10]: results.append(self.format_search_result(doc)) return results def isbn_search(self, query): """isbn search""" params = {} data = self.get_search_data( f"{self.isbn_search_url}{query}", params=params, ) results = [] # this shouldn't be returning mutliple results, but just in case for doc in self.parse_isbn_search_data(data)[:10]: results.append(self.format_isbn_search_result(doc)) return results def get_search_data(self, remote_id, **kwargs): # pylint: disable=no-self-use """this allows connectors to override the default behavior""" return get_data(remote_id, **kwargs) @abstractmethod def get_or_create_book(self, remote_id): """pull up a book record by whatever means possible""" @abstractmethod def parse_search_data(self, data): """turn the result json from a search into a list""" @abstractmethod def format_search_result(self, search_result): """create a SearchResult obj from json""" @abstractmethod def parse_isbn_search_data(self, data): """turn the result json from a search into a list""" @abstractmethod def format_isbn_search_result(self, search_result): """create a SearchResult obj from json""" class AbstractConnector(AbstractMinimalConnector): """generic book data connector""" def __init__(self, identifier): super().__init__(identifier) # fields we want to look for in book data to copy over # title we handle separately. self.book_mappings = [] def get_or_create_book(self, remote_id): """translate arbitrary json into an Activitypub dataclass""" # first, check if we have the origin_id saved existing = models.Edition.find_existing_by_remote_id( remote_id ) or models.Work.find_existing_by_remote_id(remote_id) if existing: if hasattr(existing, "default_edition"): return existing.default_edition return existing # load the json data = self.get_book_data(remote_id) if self.is_work_data(data): try: edition_data = self.get_edition_from_work_data(data) except (KeyError, ConnectorException): # hack: re-use the work data as the edition data # this is why remote ids aren't necessarily unique edition_data = data work_data = data else: edition_data = data try: work_data = self.get_work_from_edition_data(data) except (KeyError, ConnectorException) as err: logger.exception(err) work_data = data if not work_data or not edition_data: raise ConnectorException(f"Unable to load book data: {remote_id}") with transaction.atomic(): # create activitypub object work_activity = activitypub.Work( **dict_from_mappings(work_data, self.book_mappings) ) # this will dedupe automatically work = work_activity.to_model(model=models.Work, overwrite=False) for author in self.get_authors_from_data(work_data): work.authors.add(author) edition = self.create_edition_from_data(work, edition_data) load_more_data.delay(self.connector.id, work.id) return edition def get_book_data(self, remote_id): # pylint: disable=no-self-use """this allows connectors to override the default behavior""" return get_data(remote_id) def create_edition_from_data(self, work, edition_data): """if we already have the work, we're ready""" mapped_data = dict_from_mappings(edition_data, self.book_mappings) mapped_data["work"] = work.remote_id edition_activity = activitypub.Edition(**mapped_data) edition = edition_activity.to_model(model=models.Edition, overwrite=False) edition.connector = self.connector edition.save() for author in self.get_authors_from_data(edition_data): edition.authors.add(author) if not edition.authors.exists() and work.authors.exists(): edition.authors.set(work.authors.all()) return edition def get_or_create_author(self, remote_id): """load that author""" existing = models.Author.find_existing_by_remote_id(remote_id) if existing: return existing data = self.get_book_data(remote_id) mapped_data = dict_from_mappings(data, self.author_mappings) try: activity = activitypub.Author(**mapped_data) except activitypub.ActivitySerializerError: return None # this will dedupe return activity.to_model(model=models.Author, overwrite=False) @abstractmethod def is_work_data(self, data): """differentiate works and editions""" @abstractmethod def get_edition_from_work_data(self, data): """every work needs at least one edition""" @abstractmethod def get_work_from_edition_data(self, data): """every edition needs a work""" @abstractmethod def get_authors_from_data(self, data): """load author data""" @abstractmethod def expand_book_data(self, book): """get more info on a book""" def dict_from_mappings(data, mappings): """create a dict in Activitypub format, using mappings supplies by the subclass""" result = {} for mapping in mappings: # sometimes there are multiple mappings for one field, don't # overwrite earlier writes in that case if mapping.local_field in result and result[mapping.local_field]: continue result[mapping.local_field] = mapping.get_value(data) return result def get_data(url, params=None, timeout=10): """wrapper for request.get""" # check if the url is blocked if models.FederatedServer.is_blocked(url): raise ConnectorException(f"Attempting to load data from blocked url: {url}") try: resp = requests.get( url, params=params, headers={ "Accept": "application/json; charset=utf-8", "User-Agent": settings.USER_AGENT, }, timeout=timeout, ) except RequestException as err: logger.exception(err) raise ConnectorException() if not resp.ok: raise ConnectorException() try: data = resp.json() except ValueError as err: logger.exception(err) raise ConnectorException() return data def get_image(url, timeout=10): """wrapper for requesting an image""" try: resp = requests.get( url, headers={ "User-Agent": settings.USER_AGENT, }, timeout=timeout, ) except RequestException as err: logger.exception(err) return None if not resp.ok: return None return resp @dataclass class SearchResult: """standardized search result object""" title: str key: str connector: object view_link: str = None author: str = None year: str = None cover: str = None confidence: int = 1 def __repr__(self): # pylint: disable=consider-using-f-string return "".format( self.key, self.title, self.author ) def json(self): """serialize a connector for json response""" serialized = asdict(self) del serialized["connector"] return serialized class Mapping: """associate a local database field with a field in an external dataset""" def __init__(self, local_field, remote_field=None, formatter=None): noop = lambda x: x self.local_field = local_field self.remote_field = remote_field or local_field self.formatter = formatter or noop def get_value(self, data): """pull a field from incoming json and return the formatted version""" value = data.get(self.remote_field) if not value: return None try: return self.formatter(value) except: # pylint: disable=bare-except return None def infer_physical_format(format_text): """try to figure out what the standardized format is from the free value""" format_text = format_text.lower() if format_text in format_mappings: # try a direct match return format_mappings[format_text] # failing that, try substring matches = [v for k, v in format_mappings.items() if k in format_text] if not matches: return None return matches[0] def unique_physical_format(format_text): """only store the format if it isn't diretly in the format mappings""" format_text = format_text.lower() if format_text in format_mappings: # try a direct match, so saving this would be redundant return None return format_text