Merge pull request #2925 from jderuiter/mypy-connectors

Type annotations and related changes for connectors
This commit is contained in:
Mouse Reeve 2023-08-01 20:46:56 -07:00 committed by GitHub
commit acafa0b417
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
13 changed files with 412 additions and 185 deletions

View file

@ -2,6 +2,8 @@
from dataclasses import dataclass, fields, MISSING
from json import JSONEncoder
import logging
from typing import Optional, Union, TypeVar, overload, Any
import requests
from django.apps import apps
@ -10,12 +12,15 @@ from django.utils.http import http_date
from bookwyrm import models
from bookwyrm.connectors import ConnectorException, get_data
from bookwyrm.models import base_model
from bookwyrm.signatures import make_signature
from bookwyrm.settings import DOMAIN, INSTANCE_ACTOR_USERNAME
from bookwyrm.tasks import app, MISC
logger = logging.getLogger(__name__)
TBookWyrmModel = TypeVar("TBookWyrmModel", bound=base_model.BookWyrmModel)
class ActivitySerializerError(ValueError):
"""routine problems serializing activitypub json"""
@ -65,7 +70,11 @@ class ActivityObject:
id: str
type: str
def __init__(self, activity_objects=None, **kwargs):
def __init__(
self,
activity_objects: Optional[list[str, base_model.BookWyrmModel]] = None,
**kwargs: dict[str, Any],
):
"""this lets you pass in an object with fields that aren't in the
dataclass, which it ignores. Any field in the dataclass is required or
has a default value"""
@ -101,13 +110,13 @@ class ActivityObject:
# pylint: disable=too-many-locals,too-many-branches,too-many-arguments
def to_model(
self,
model=None,
instance=None,
allow_create=True,
save=True,
overwrite=True,
allow_external_connections=True,
):
model: Optional[type[TBookWyrmModel]] = None,
instance: Optional[TBookWyrmModel] = None,
allow_create: bool = True,
save: bool = True,
overwrite: bool = True,
allow_external_connections: bool = True,
) -> Optional[TBookWyrmModel]:
"""convert from an activity to a model instance. Args:
model: the django model that this object is being converted to
(will guess if not known)
@ -296,14 +305,40 @@ def get_model_from_type(activity_type):
# pylint: disable=too-many-arguments
@overload
def resolve_remote_id(
remote_id,
model=None,
refresh=False,
save=True,
get_activity=False,
allow_external_connections=True,
):
remote_id: str,
model: type[TBookWyrmModel],
refresh: bool = False,
save: bool = True,
get_activity: bool = False,
allow_external_connections: bool = True,
) -> TBookWyrmModel:
...
# pylint: disable=too-many-arguments
@overload
def resolve_remote_id(
remote_id: str,
model: Optional[str] = None,
refresh: bool = False,
save: bool = True,
get_activity: bool = False,
allow_external_connections: bool = True,
) -> base_model.BookWyrmModel:
...
# pylint: disable=too-many-arguments
def resolve_remote_id(
remote_id: str,
model: Optional[Union[str, type[base_model.BookWyrmModel]]] = None,
refresh: bool = False,
save: bool = True,
get_activity: bool = False,
allow_external_connections: bool = True,
) -> base_model.BookWyrmModel:
"""take a remote_id and return an instance, creating if necessary. Args:
remote_id: the unique url for looking up the object in the db or by http
model: a string or object representing the model that corresponds to the object

View file

@ -1,6 +1,6 @@
""" book and author data """
from dataclasses import dataclass, field
from typing import List
from typing import Optional
from .base_activity import ActivityObject
from .image import Document
@ -11,19 +11,19 @@ from .image import Document
class BookData(ActivityObject):
"""shared fields for all book data and authors"""
openlibraryKey: str = None
inventaireId: str = None
librarythingKey: str = None
goodreadsKey: str = None
bnfId: str = None
viaf: str = None
wikidata: str = None
asin: str = None
aasin: str = None
isfdb: str = None
lastEditedBy: str = None
links: List[str] = field(default_factory=lambda: [])
fileLinks: List[str] = field(default_factory=lambda: [])
openlibraryKey: Optional[str] = None
inventaireId: Optional[str] = None
librarythingKey: Optional[str] = None
goodreadsKey: Optional[str] = None
bnfId: Optional[str] = None
viaf: Optional[str] = None
wikidata: Optional[str] = None
asin: Optional[str] = None
aasin: Optional[str] = None
isfdb: Optional[str] = None
lastEditedBy: Optional[str] = None
links: list[str] = field(default_factory=list)
fileLinks: list[str] = field(default_factory=list)
# pylint: disable=invalid-name
@ -35,17 +35,17 @@ class Book(BookData):
sortTitle: str = None
subtitle: str = None
description: str = ""
languages: List[str] = field(default_factory=lambda: [])
languages: list[str] = field(default_factory=list)
series: str = ""
seriesNumber: str = ""
subjects: List[str] = field(default_factory=lambda: [])
subjectPlaces: List[str] = field(default_factory=lambda: [])
subjects: list[str] = field(default_factory=list)
subjectPlaces: list[str] = field(default_factory=list)
authors: List[str] = field(default_factory=lambda: [])
authors: list[str] = field(default_factory=list)
firstPublishedDate: str = ""
publishedDate: str = ""
cover: Document = None
cover: Optional[Document] = None
type: str = "Book"
@ -58,10 +58,10 @@ class Edition(Book):
isbn10: str = ""
isbn13: str = ""
oclcNumber: str = ""
pages: int = None
pages: Optional[int] = None
physicalFormat: str = ""
physicalFormatDetail: str = ""
publishers: List[str] = field(default_factory=lambda: [])
publishers: list[str] = field(default_factory=list)
editionRank: int = 0
type: str = "Edition"
@ -73,7 +73,7 @@ class Work(Book):
"""work instance of a book object"""
lccn: str = ""
editions: List[str] = field(default_factory=lambda: [])
editions: list[str] = field(default_factory=list)
type: str = "Work"
@ -83,12 +83,12 @@ class Author(BookData):
"""author of a book"""
name: str
isni: str = None
viafId: str = None
gutenbergId: str = None
born: str = None
died: str = None
aliases: List[str] = field(default_factory=lambda: [])
isni: Optional[str] = None
viafId: Optional[str] = None
gutenbergId: Optional[str] = None
born: Optional[str] = None
died: Optional[str] = None
aliases: list[str] = field(default_factory=list)
bio: str = ""
wikipediaLink: str = ""
type: str = "Author"

View file

@ -1,22 +1,53 @@
""" using a bookwyrm instance as a source of book data """
from __future__ import annotations
from dataclasses import asdict, dataclass
from functools import reduce
import operator
from typing import Optional, Union, Any, Literal, overload
from django.contrib.postgres.search import SearchRank, SearchQuery
from django.db.models import F, Q
from django.db.models.query import QuerySet
from bookwyrm import models
from bookwyrm import connectors
from bookwyrm.settings import MEDIA_FULL_URL
@overload
def search(
query: str,
*,
min_confidence: float = 0,
filters: Optional[list[Any]] = None,
return_first: Literal[False],
) -> QuerySet[models.Edition]:
...
@overload
def search(
query: str,
*,
min_confidence: float = 0,
filters: Optional[list[Any]] = None,
return_first: Literal[True],
) -> Optional[models.Edition]:
...
# pylint: disable=arguments-differ
def search(query, min_confidence=0, filters=None, return_first=False):
def search(
query: str,
*,
min_confidence: float = 0,
filters: Optional[list[Any]] = None,
return_first: bool = False,
) -> Union[Optional[models.Edition], QuerySet[models.Edition]]:
"""search your local database"""
filters = filters or []
if not query:
return []
return None if return_first else []
query = query.strip()
results = None
@ -66,7 +97,9 @@ def format_search_result(search_result):
).json()
def search_identifiers(query, *filters, return_first=False):
def search_identifiers(
query, *filters, return_first=False
) -> Union[Optional[models.Edition], QuerySet[models.Edition]]:
"""tries remote_id, isbn; defined as dedupe fields on the model"""
if connectors.maybe_isbn(query):
# Oh did you think the 'S' in ISBN stood for 'standard'?
@ -87,7 +120,9 @@ def search_identifiers(query, *filters, return_first=False):
return results
def search_title_author(query, min_confidence, *filters, return_first=False):
def search_title_author(
query, min_confidence, *filters, return_first=False
) -> QuerySet[models.Edition]:
"""searches for title and author"""
query = SearchQuery(query, config="simple") | SearchQuery(query, config="english")
results = (
@ -122,11 +157,11 @@ class SearchResult:
title: str
key: str
connector: object
view_link: str = None
author: str = None
year: str = None
cover: str = None
confidence: int = 1
view_link: Optional[str] = None
author: Optional[str] = None
year: Optional[str] = None
cover: Optional[str] = None
confidence: float = 1.0
def __repr__(self):
# pylint: disable=consider-using-f-string

View file

@ -1,5 +1,7 @@
""" functionality outline for a book data connector """
from __future__ import annotations
from abc import ABC, abstractmethod
from typing import Optional, TypedDict, Any, Callable, Union, Iterator
from urllib.parse import quote_plus
import imghdr
import logging
@ -16,33 +18,38 @@ from bookwyrm import activitypub, models, settings
from bookwyrm.settings import USER_AGENT
from .connector_manager import load_more_data, ConnectorException, raise_not_valid_url
from .format_mappings import format_mappings
from ..book_search import SearchResult
logger = logging.getLogger(__name__)
JsonDict = dict[str, Any]
class ConnectorResults(TypedDict):
"""TypedDict for results returned by connector"""
connector: AbstractMinimalConnector
results: list[SearchResult]
class AbstractMinimalConnector(ABC):
"""just the bare bones, for other bookwyrm instances"""
def __init__(self, identifier):
def __init__(self, identifier: str):
# load connector settings
info = models.Connector.objects.get(identifier=identifier)
self.connector = info
# the things in the connector model to copy over
self_fields = [
"base_url",
"books_url",
"covers_url",
"search_url",
"isbn_search_url",
"name",
"identifier",
]
for field in self_fields:
setattr(self, field, getattr(info, field))
self.base_url = info.base_url
self.books_url = info.books_url
self.covers_url = info.covers_url
self.search_url = info.search_url
self.isbn_search_url = info.isbn_search_url
self.name = info.name
self.identifier = info.identifier
def get_search_url(self, query):
def get_search_url(self, query: str) -> str:
"""format the query url"""
# Check if the query resembles an ISBN
if maybe_isbn(query) and self.isbn_search_url and self.isbn_search_url != "":
@ -54,13 +61,21 @@ class AbstractMinimalConnector(ABC):
# searched as free text. This, instead, only searches isbn if it's isbn-y
return f"{self.search_url}{quote_plus(query)}"
def process_search_response(self, query, data, min_confidence):
def process_search_response(
self, query: str, data: Any, min_confidence: float
) -> list[SearchResult]:
"""Format the search results based on the format of the query"""
if maybe_isbn(query):
return list(self.parse_isbn_search_data(data))[:10]
return list(self.parse_search_data(data, min_confidence))[:10]
async def get_results(self, session, url, min_confidence, query):
async def get_results(
self,
session: aiohttp.ClientSession,
url: str,
min_confidence: float,
query: str,
) -> Optional[ConnectorResults]:
"""try this specific connector"""
# pylint: disable=line-too-long
headers = {
@ -74,55 +89,63 @@ class AbstractMinimalConnector(ABC):
async with session.get(url, headers=headers, params=params) as response:
if not response.ok:
logger.info("Unable to connect to %s: %s", url, response.reason)
return
return None
try:
raw_data = await response.json()
except aiohttp.client_exceptions.ContentTypeError as err:
logger.exception(err)
return
return None
return {
"connector": self,
"results": self.process_search_response(
return ConnectorResults(
connector=self,
results=self.process_search_response(
query, raw_data, min_confidence
),
}
)
except asyncio.TimeoutError:
logger.info("Connection timed out for url: %s", url)
except aiohttp.ClientError as err:
logger.info(err)
return None
@abstractmethod
def get_or_create_book(self, remote_id):
def get_or_create_book(self, remote_id: str) -> Optional[models.Book]:
"""pull up a book record by whatever means possible"""
@abstractmethod
def parse_search_data(self, data, min_confidence):
def parse_search_data(
self, data: Any, min_confidence: float
) -> Iterator[SearchResult]:
"""turn the result json from a search into a list"""
@abstractmethod
def parse_isbn_search_data(self, data):
def parse_isbn_search_data(self, data: Any) -> Iterator[SearchResult]:
"""turn the result json from a search into a list"""
class AbstractConnector(AbstractMinimalConnector):
"""generic book data connector"""
def __init__(self, identifier):
generated_remote_link_field = ""
def __init__(self, identifier: str):
super().__init__(identifier)
# fields we want to look for in book data to copy over
# title we handle separately.
self.book_mappings = []
self.book_mappings: list[Mapping] = []
self.author_mappings: list[Mapping] = []
def get_or_create_book(self, remote_id):
def get_or_create_book(self, remote_id: str) -> Optional[models.Book]:
"""translate arbitrary json into an Activitypub dataclass"""
# first, check if we have the origin_id saved
existing = models.Edition.find_existing_by_remote_id(
remote_id
) or models.Work.find_existing_by_remote_id(remote_id)
if existing:
if hasattr(existing, "default_edition"):
if hasattr(existing, "default_edition") and isinstance(
existing.default_edition, models.Edition
):
return existing.default_edition
return existing
@ -154,6 +177,9 @@ class AbstractConnector(AbstractMinimalConnector):
)
# this will dedupe automatically
work = work_activity.to_model(model=models.Work, overwrite=False)
if not work:
return None
for author in self.get_authors_from_data(work_data):
work.authors.add(author)
@ -161,12 +187,21 @@ class AbstractConnector(AbstractMinimalConnector):
load_more_data.delay(self.connector.id, work.id)
return edition
def get_book_data(self, remote_id): # pylint: disable=no-self-use
def get_book_data(self, remote_id: str) -> JsonDict: # pylint: disable=no-self-use
"""this allows connectors to override the default behavior"""
return get_data(remote_id)
def create_edition_from_data(self, work, edition_data, instance=None):
def create_edition_from_data(
self,
work: models.Work,
edition_data: Union[str, JsonDict],
instance: Optional[models.Edition] = None,
) -> Optional[models.Edition]:
"""if we already have the work, we're ready"""
if isinstance(edition_data, str):
# We don't expect a string here
return None
mapped_data = dict_from_mappings(edition_data, self.book_mappings)
mapped_data["work"] = work.remote_id
edition_activity = activitypub.Edition(**mapped_data)
@ -174,6 +209,9 @@ class AbstractConnector(AbstractMinimalConnector):
model=models.Edition, overwrite=False, instance=instance
)
if not edition:
return None
# if we're updating an existing instance, we don't need to load authors
if instance:
return edition
@ -190,7 +228,9 @@ class AbstractConnector(AbstractMinimalConnector):
return edition
def get_or_create_author(self, remote_id, instance=None):
def get_or_create_author(
self, remote_id: str, instance: Optional[models.Author] = None
) -> Optional[models.Author]:
"""load that author"""
if not instance:
existing = models.Author.find_existing_by_remote_id(remote_id)
@ -210,46 +250,51 @@ class AbstractConnector(AbstractMinimalConnector):
model=models.Author, overwrite=False, instance=instance
)
def get_remote_id_from_model(self, obj):
def get_remote_id_from_model(self, obj: models.BookDataModel) -> Optional[str]:
"""given the data stored, how can we look this up"""
return getattr(obj, getattr(self, "generated_remote_link_field"))
remote_id: Optional[str] = getattr(obj, self.generated_remote_link_field)
return remote_id
def update_author_from_remote(self, obj):
def update_author_from_remote(self, obj: models.Author) -> Optional[models.Author]:
"""load the remote data from this connector and add it to an existing author"""
remote_id = self.get_remote_id_from_model(obj)
if not remote_id:
return None
return self.get_or_create_author(remote_id, instance=obj)
def update_book_from_remote(self, obj):
def update_book_from_remote(self, obj: models.Edition) -> Optional[models.Edition]:
"""load the remote data from this connector and add it to an existing book"""
remote_id = self.get_remote_id_from_model(obj)
if not remote_id:
return None
data = self.get_book_data(remote_id)
return self.create_edition_from_data(obj.parent_work, data, instance=obj)
@abstractmethod
def is_work_data(self, data):
def is_work_data(self, data: JsonDict) -> bool:
"""differentiate works and editions"""
@abstractmethod
def get_edition_from_work_data(self, data):
def get_edition_from_work_data(self, data: JsonDict) -> JsonDict:
"""every work needs at least one edition"""
@abstractmethod
def get_work_from_edition_data(self, data):
def get_work_from_edition_data(self, data: JsonDict) -> JsonDict:
"""every edition needs a work"""
@abstractmethod
def get_authors_from_data(self, data):
def get_authors_from_data(self, data: JsonDict) -> Iterator[models.Author]:
"""load author data"""
@abstractmethod
def expand_book_data(self, book):
def expand_book_data(self, book: models.Book) -> None:
"""get more info on a book"""
def dict_from_mappings(data, mappings):
def dict_from_mappings(data: JsonDict, mappings: list[Mapping]) -> JsonDict:
"""create a dict in Activitypub format, using mappings supplied by
the subclass"""
result = {}
result: JsonDict = {}
for mapping in mappings:
# sometimes there are multiple mappings for one field, don't
# overwrite earlier writes in that case
@ -259,7 +304,11 @@ def dict_from_mappings(data, mappings):
return result
def get_data(url, params=None, timeout=settings.QUERY_TIMEOUT):
def get_data(
url: str,
params: Optional[dict[str, str]] = None,
timeout: int = settings.QUERY_TIMEOUT,
) -> JsonDict:
"""wrapper for request.get"""
# check if the url is blocked
raise_not_valid_url(url)
@ -292,10 +341,15 @@ def get_data(url, params=None, timeout=settings.QUERY_TIMEOUT):
logger.info(err)
raise ConnectorException(err)
if not isinstance(data, dict):
raise ConnectorException("Unexpected data format")
return data
def get_image(url, timeout=10):
def get_image(
url: str, timeout: int = 10
) -> Union[tuple[ContentFile[bytes], str], tuple[None, None]]:
"""wrapper for requesting an image"""
raise_not_valid_url(url)
try:
@ -325,14 +379,19 @@ def get_image(url, timeout=10):
class Mapping:
"""associate a local database field with a field in an external dataset"""
def __init__(self, local_field, remote_field=None, formatter=None):
def __init__(
self,
local_field: str,
remote_field: Optional[str] = None,
formatter: Optional[Callable[[Any], Any]] = None,
):
noop = lambda x: x
self.local_field = local_field
self.remote_field = remote_field or local_field
self.formatter = formatter or noop
def get_value(self, data):
def get_value(self, data: JsonDict) -> Optional[Any]:
"""pull a field from incoming json and return the formatted version"""
value = data.get(self.remote_field)
if not value:
@ -343,7 +402,7 @@ class Mapping:
return None
def infer_physical_format(format_text):
def infer_physical_format(format_text: str) -> Optional[str]:
"""try to figure out what the standardized format is from the free value"""
format_text = format_text.lower()
if format_text in format_mappings:
@ -356,7 +415,7 @@ def infer_physical_format(format_text):
return matches[0]
def unique_physical_format(format_text):
def unique_physical_format(format_text: str) -> Optional[str]:
"""only store the format if it isn't directly in the format mappings"""
format_text = format_text.lower()
if format_text in format_mappings:
@ -365,7 +424,7 @@ def unique_physical_format(format_text):
return format_text
def maybe_isbn(query):
def maybe_isbn(query: str) -> bool:
"""check if a query looks like an isbn"""
isbn = re.sub(r"[\W_]", "", query) # removes filler characters
# ISBNs must be numeric except an ISBN10 checkdigit can be 'X'

View file

@ -1,4 +1,7 @@
""" using another bookwyrm instance as a source of book data """
from __future__ import annotations
from typing import Any, Iterator
from bookwyrm import activitypub, models
from bookwyrm.book_search import SearchResult
from .abstract_connector import AbstractMinimalConnector
@ -7,15 +10,19 @@ from .abstract_connector import AbstractMinimalConnector
class Connector(AbstractMinimalConnector):
"""this is basically just for search"""
def get_or_create_book(self, remote_id):
def get_or_create_book(self, remote_id: str) -> models.Edition:
return activitypub.resolve_remote_id(remote_id, model=models.Edition)
def parse_search_data(self, data, min_confidence):
def parse_search_data(
self, data: list[dict[str, Any]], min_confidence: float
) -> Iterator[SearchResult]:
for search_result in data:
search_result["connector"] = self
yield SearchResult(**search_result)
def parse_isbn_search_data(self, data):
def parse_isbn_search_data(
self, data: list[dict[str, Any]]
) -> Iterator[SearchResult]:
for search_result in data:
search_result["connector"] = self
yield SearchResult(**search_result)

View file

@ -1,8 +1,11 @@
""" interface with whatever connectors the app has """
from __future__ import annotations
import asyncio
import importlib
import ipaddress
import logging
from asyncio import Future
from typing import Iterator, Any, Optional, Union, overload, Literal
from urllib.parse import urlparse
import aiohttp
@ -12,6 +15,8 @@ from django.db.models import signals
from requests import HTTPError
from bookwyrm import book_search, models
from bookwyrm.book_search import SearchResult
from bookwyrm.connectors import abstract_connector
from bookwyrm.settings import SEARCH_TIMEOUT
from bookwyrm.tasks import app, CONNECTORS
@ -22,11 +27,15 @@ class ConnectorException(HTTPError):
"""when the connector can't do what was asked"""
async def async_connector_search(query, items, min_confidence):
async def async_connector_search(
query: str,
items: list[tuple[str, abstract_connector.AbstractConnector]],
min_confidence: float,
) -> list[Optional[abstract_connector.ConnectorResults]]:
"""Try a number of requests simultaneously"""
timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)
async with aiohttp.ClientSession(timeout=timeout) as session:
tasks = []
tasks: list[Future[Optional[abstract_connector.ConnectorResults]]] = []
for url, connector in items:
tasks.append(
asyncio.ensure_future(
@ -35,14 +44,29 @@ async def async_connector_search(query, items, min_confidence):
)
results = await asyncio.gather(*tasks)
return results
return list(results)
def search(query, min_confidence=0.1, return_first=False):
@overload
def search(
query: str, *, min_confidence: float = 0.1, return_first: Literal[False]
) -> list[abstract_connector.ConnectorResults]:
...
@overload
def search(
query: str, *, min_confidence: float = 0.1, return_first: Literal[True]
) -> Optional[SearchResult]:
...
def search(
query: str, *, min_confidence: float = 0.1, return_first: bool = False
) -> Union[list[abstract_connector.ConnectorResults], Optional[SearchResult]]:
"""find books based on arbitrary keywords"""
if not query:
return []
results = []
return None if return_first else []
items = []
for connector in get_connectors():
@ -57,8 +81,12 @@ def search(query, min_confidence=0.1, return_first=False):
items.append((url, connector))
# load as many results as we can
results = asyncio.run(async_connector_search(query, items, min_confidence))
results = [r for r in results if r]
# failed requests will return None, so filter those out
results = [
r
for r in asyncio.run(async_connector_search(query, items, min_confidence))
if r
]
if return_first:
# find the best result from all the responses and return that
@ -66,11 +94,12 @@ def search(query, min_confidence=0.1, return_first=False):
all_results = sorted(all_results, key=lambda r: r.confidence, reverse=True)
return all_results[0] if all_results else None
# failed requests will return None, so filter those out
return results
def first_search_result(query, min_confidence=0.1):
def first_search_result(
query: str, min_confidence: float = 0.1
) -> Union[models.Edition, SearchResult, None]:
"""search until you find a result that fits"""
# try local search first
result = book_search.search(query, min_confidence=min_confidence, return_first=True)
@ -80,13 +109,13 @@ def first_search_result(query, min_confidence=0.1):
return search(query, min_confidence=min_confidence, return_first=True) or None
def get_connectors():
def get_connectors() -> Iterator[abstract_connector.AbstractConnector]:
"""load all connectors"""
for info in models.Connector.objects.filter(active=True).order_by("priority").all():
yield load_connector(info)
def get_or_create_connector(remote_id):
def get_or_create_connector(remote_id: str) -> abstract_connector.AbstractConnector:
"""get the connector related to the object's server"""
url = urlparse(remote_id)
identifier = url.netloc
@ -110,7 +139,7 @@ def get_or_create_connector(remote_id):
@app.task(queue=CONNECTORS)
def load_more_data(connector_id, book_id):
def load_more_data(connector_id: str, book_id: str) -> None:
"""background the work of getting all 10,000 editions of LoTR"""
connector_info = models.Connector.objects.get(id=connector_id)
connector = load_connector(connector_info)
@ -119,7 +148,9 @@ def load_more_data(connector_id, book_id):
@app.task(queue=CONNECTORS)
def create_edition_task(connector_id, work_id, data):
def create_edition_task(
connector_id: int, work_id: int, data: Union[str, abstract_connector.JsonDict]
) -> None:
"""separate task for each of the 10,000 editions of LoTR"""
connector_info = models.Connector.objects.get(id=connector_id)
connector = load_connector(connector_info)
@ -127,23 +158,31 @@ def create_edition_task(connector_id, work_id, data):
connector.create_edition_from_data(work, data)
def load_connector(connector_info):
def load_connector(
connector_info: models.Connector,
) -> abstract_connector.AbstractConnector:
"""instantiate the connector class"""
connector = importlib.import_module(
f"bookwyrm.connectors.{connector_info.connector_file}"
)
return connector.Connector(connector_info.identifier)
return connector.Connector(connector_info.identifier) # type: ignore[no-any-return]
@receiver(signals.post_save, sender="bookwyrm.FederatedServer")
# pylint: disable=unused-argument
def create_connector(sender, instance, created, *args, **kwargs):
def create_connector(
sender: Any,
instance: models.FederatedServer,
created: Any,
*args: Any,
**kwargs: Any,
) -> None:
"""create a connector to an external bookwyrm server"""
if instance.application_type == "bookwyrm":
get_or_create_connector(f"https://{instance.server_name}")
def raise_not_valid_url(url):
def raise_not_valid_url(url: str) -> None:
"""do some basic reality checks on the url"""
parsed = urlparse(url)
if not parsed.scheme in ["http", "https"]:

View file

@ -1,9 +1,10 @@
""" inventaire data connector """
import re
from typing import Any, Union, Optional, Iterator, Iterable
from bookwyrm import models
from bookwyrm.book_search import SearchResult
from .abstract_connector import AbstractConnector, Mapping
from .abstract_connector import AbstractConnector, Mapping, JsonDict
from .abstract_connector import get_data
from .connector_manager import ConnectorException, create_edition_task
@ -13,7 +14,7 @@ class Connector(AbstractConnector):
generated_remote_link_field = "inventaire_id"
def __init__(self, identifier):
def __init__(self, identifier: str):
super().__init__(identifier)
get_first = lambda a: a[0]
@ -60,13 +61,13 @@ class Connector(AbstractConnector):
Mapping("died", remote_field="wdt:P570", formatter=get_first),
] + shared_mappings
def get_remote_id(self, value):
def get_remote_id(self, value: str) -> str:
"""convert an id/uri into a url"""
return f"{self.books_url}?action=by-uris&uris={value}"
def get_book_data(self, remote_id):
def get_book_data(self, remote_id: str) -> JsonDict:
data = get_data(remote_id)
extracted = list(data.get("entities").values())
extracted = list(data.get("entities", {}).values())
try:
data = extracted[0]
except (KeyError, IndexError):
@ -74,10 +75,16 @@ class Connector(AbstractConnector):
# flatten the data so that images, uri, and claims are on the same level
return {
**data.get("claims", {}),
**{k: data.get(k) for k in ["uri", "image", "labels", "sitelinks", "type"]},
**{
k: data.get(k)
for k in ["uri", "image", "labels", "sitelinks", "type"]
if k in data
},
}
def parse_search_data(self, data, min_confidence):
def parse_search_data(
self, data: JsonDict, min_confidence: float
) -> Iterator[SearchResult]:
for search_result in data.get("results", []):
images = search_result.get("image")
cover = f"{self.covers_url}/img/entities/{images[0]}" if images else None
@ -96,7 +103,7 @@ class Connector(AbstractConnector):
connector=self,
)
def parse_isbn_search_data(self, data):
def parse_isbn_search_data(self, data: JsonDict) -> Iterator[SearchResult]:
"""got some data"""
results = data.get("entities")
if not results:
@ -114,35 +121,44 @@ class Connector(AbstractConnector):
connector=self,
)
def is_work_data(self, data):
def is_work_data(self, data: JsonDict) -> bool:
return data.get("type") == "work"
def load_edition_data(self, work_uri):
def load_edition_data(self, work_uri: str) -> JsonDict:
"""get a list of editions for a work"""
# pylint: disable=line-too-long
url = f"{self.books_url}?action=reverse-claims&property=wdt:P629&value={work_uri}&sort=true"
return get_data(url)
def get_edition_from_work_data(self, data):
data = self.load_edition_data(data.get("uri"))
def get_edition_from_work_data(self, data: JsonDict) -> JsonDict:
work_uri = data.get("uri")
if not work_uri:
raise ConnectorException("Invalid URI")
data = self.load_edition_data(work_uri)
try:
uri = data.get("uris", [])[0]
except IndexError:
raise ConnectorException("Invalid book data")
return self.get_book_data(self.get_remote_id(uri))
def get_work_from_edition_data(self, data):
uri = data.get("wdt:P629", [None])[0]
def get_work_from_edition_data(self, data: JsonDict) -> JsonDict:
try:
uri = data.get("wdt:P629", [])[0]
except IndexError:
raise ConnectorException("Invalid book data")
if not uri:
raise ConnectorException("Invalid book data")
return self.get_book_data(self.get_remote_id(uri))
def get_authors_from_data(self, data):
def get_authors_from_data(self, data: JsonDict) -> Iterator[models.Author]:
authors = data.get("wdt:P50", [])
for author in authors:
yield self.get_or_create_author(self.get_remote_id(author))
model = self.get_or_create_author(self.get_remote_id(author))
if model:
yield model
def expand_book_data(self, book):
def expand_book_data(self, book: models.Book) -> None:
work = book
# go from the edition to the work, if necessary
if isinstance(book, models.Edition):
@ -154,11 +170,16 @@ class Connector(AbstractConnector):
# who knows, man
return
for edition_uri in edition_options.get("uris"):
for edition_uri in edition_options.get("uris", []):
remote_id = self.get_remote_id(edition_uri)
create_edition_task.delay(self.connector.id, work.id, remote_id)
def create_edition_from_data(self, work, edition_data, instance=None):
def create_edition_from_data(
self,
work: models.Work,
edition_data: Union[str, JsonDict],
instance: Optional[models.Edition] = None,
) -> Optional[models.Edition]:
"""pass in the url as data and then call the version in abstract connector"""
if isinstance(edition_data, str):
try:
@ -168,22 +189,26 @@ class Connector(AbstractConnector):
return None
return super().create_edition_from_data(work, edition_data, instance=instance)
def get_cover_url(self, cover_blob, *_):
def get_cover_url(
self, cover_blob: Union[list[JsonDict], JsonDict], *_: Any
) -> Optional[str]:
"""format the relative cover url into an absolute one:
{"url": "/img/entities/e794783f01b9d4f897a1ea9820b96e00d346994f"}
"""
# covers may or may not be a list
if isinstance(cover_blob, list) and len(cover_blob) > 0:
if isinstance(cover_blob, list):
if len(cover_blob) == 0:
return None
cover_blob = cover_blob[0]
cover_id = cover_blob.get("url")
if not cover_id:
if not isinstance(cover_id, str):
return None
# cover may or may not be an absolute url already
if re.match(r"^http", cover_id):
return cover_id
return f"{self.covers_url}{cover_id}"
def resolve_keys(self, keys):
def resolve_keys(self, keys: Iterable[str]) -> list[str]:
"""cool, it's "wd:Q3156592" now what the heck does that mean"""
results = []
for uri in keys:
@ -191,10 +216,10 @@ class Connector(AbstractConnector):
data = self.get_book_data(self.get_remote_id(uri))
except ConnectorException:
continue
results.append(get_language_code(data.get("labels")))
results.append(get_language_code(data.get("labels", {})))
return results
def get_description(self, links):
def get_description(self, links: JsonDict) -> str:
"""grab an extracted excerpt from wikipedia"""
link = links.get("enwiki")
if not link:
@ -204,15 +229,15 @@ class Connector(AbstractConnector):
data = get_data(url)
except ConnectorException:
return ""
return data.get("extract")
return data.get("extract", "")
def get_remote_id_from_model(self, obj: models.BookDataModel) -> str:
    """use get_remote_id to figure out the link from a model obj

    The model's `inventaire_id` is the value handed to `get_remote_id`.
    """
    return self.get_remote_id(obj.inventaire_id)
def get_language_code(options, code="en"):
def get_language_code(options: JsonDict, code: str = "en") -> Any:
"""when there are a bunch of translation but we need a single field"""
result = options.get(code)
if result:

View file

@ -1,9 +1,10 @@
""" openlibrary data connector """
import re
from typing import Any, Optional, Union, Iterator, Iterable
from bookwyrm import models
from bookwyrm.book_search import SearchResult
from .abstract_connector import AbstractConnector, Mapping
from .abstract_connector import AbstractConnector, Mapping, JsonDict
from .abstract_connector import get_data, infer_physical_format, unique_physical_format
from .connector_manager import ConnectorException, create_edition_task
from .openlibrary_languages import languages
@ -14,7 +15,7 @@ class Connector(AbstractConnector):
generated_remote_link_field = "openlibrary_link"
def __init__(self, identifier):
def __init__(self, identifier: str):
super().__init__(identifier)
get_first = lambda a, *args: a[0]
@ -94,14 +95,14 @@ class Connector(AbstractConnector):
Mapping("inventaire_id", remote_field="links", formatter=get_inventaire_id),
]
def get_book_data(self, remote_id: str) -> JsonDict:
    """load the json for a remote id, following openlibrary redirects

    When the response's type key is "/type/redirect", the data found at
    `base_url` + the record's "location" is fetched and returned instead.
    """
    data = get_data(remote_id)
    if data.get("type", {}).get("key") == "/type/redirect":
        # a redirect record without a location falls back to the bare base url
        remote_id = self.base_url + data.get("location", "")
        return get_data(remote_id)
    return data
def get_remote_id_from_data(self, data):
def get_remote_id_from_data(self, data: JsonDict) -> str:
"""format a url from an openlibrary id field"""
try:
key = data["key"]
@ -109,10 +110,10 @@ class Connector(AbstractConnector):
raise ConnectorException("Invalid book data")
return f"{self.books_url}{key}"
def is_work_data(self, data: JsonDict) -> bool:
    """check whether the data's "key" ends in a work id (OL<digits>W)

    Raises KeyError when the data has no "key" entry.
    """
    return bool(re.match(r"^[\/\w]+OL\d+W$", data["key"]))
def get_edition_from_work_data(self, data):
def get_edition_from_work_data(self, data: JsonDict) -> JsonDict:
try:
key = data["key"]
except KeyError:
@ -124,7 +125,7 @@ class Connector(AbstractConnector):
raise ConnectorException("No editions for work")
return edition
def get_work_from_edition_data(self, data):
def get_work_from_edition_data(self, data: JsonDict) -> JsonDict:
try:
key = data["works"][0]["key"]
except (IndexError, KeyError):
@ -132,7 +133,7 @@ class Connector(AbstractConnector):
url = f"{self.books_url}{key}"
return self.get_book_data(url)
def get_authors_from_data(self, data):
def get_authors_from_data(self, data: JsonDict) -> Iterator[models.Author]:
"""parse author json and load or create authors"""
for author_blob in data.get("authors", []):
author_blob = author_blob.get("author", author_blob)
@ -144,7 +145,7 @@ class Connector(AbstractConnector):
continue
yield author
def get_cover_url(self, cover_blob, size="L"):
def get_cover_url(self, cover_blob: list[str], size: str = "L") -> Optional[str]:
"""ask openlibrary for the cover"""
if not cover_blob:
return None
@ -152,8 +153,10 @@ class Connector(AbstractConnector):
image_name = f"{cover_id}-{size}.jpg"
return f"{self.covers_url}/b/id/{image_name}"
def parse_search_data(self, data, min_confidence):
for idx, search_result in enumerate(data.get("docs")):
def parse_search_data(
self, data: JsonDict, min_confidence: float
) -> Iterator[SearchResult]:
for idx, search_result in enumerate(data.get("docs", [])):
# build the remote id from the openlibrary key
key = self.books_url + search_result["key"]
author = search_result.get("author_name") or ["Unknown"]
@ -174,7 +177,7 @@ class Connector(AbstractConnector):
confidence=confidence,
)
def parse_isbn_search_data(self, data):
def parse_isbn_search_data(self, data: JsonDict) -> Iterator[SearchResult]:
for search_result in list(data.values()):
# build the remote id from the openlibrary key
key = self.books_url + search_result["key"]
@ -188,12 +191,12 @@ class Connector(AbstractConnector):
year=search_result.get("publish_date"),
)
def load_edition_data(self, olkey: str) -> JsonDict:
    """query openlibrary for editions of a work

    The editions url is built from `books_url` and the work's key, then
    loaded through `get_book_data`.
    """
    return self.get_book_data(f"{self.books_url}/works/{olkey}/editions")
def expand_book_data(self, book):
def expand_book_data(self, book: models.Book) -> None:
work = book
# go from the edition to the work, if necessary
if isinstance(book, models.Edition):
@ -206,14 +209,14 @@ class Connector(AbstractConnector):
# who knows, man
return
for edition_data in edition_options.get("entries"):
for edition_data in edition_options.get("entries", []):
# does this edition have ANY interesting data?
if ignore_edition(edition_data):
continue
create_edition_task.delay(self.connector.id, work.id, edition_data)
def ignore_edition(edition_data):
def ignore_edition(edition_data: JsonDict) -> bool:
"""don't load a million editions that have no metadata"""
# an isbn, we love to see it
if edition_data.get("isbn_13") or edition_data.get("isbn_10"):
@ -232,19 +235,19 @@ def ignore_edition(edition_data):
return True
def get_description(description_blob: Union[JsonDict, str]) -> Optional[str]:
    """descriptions can be a string or a dict

    A dict yields its "value" entry (None when absent); anything else is
    returned unchanged.
    """
    if isinstance(description_blob, dict):
        return description_blob.get("value")
    return description_blob
def get_openlibrary_key(key: str) -> str:
    """convert /books/OL27320736M into OL27320736M

    Only the part after the last "/" is kept; a key without any slash is
    returned unchanged.
    """
    return key.split("/")[-1]
def get_languages(language_blob):
def get_languages(language_blob: Iterable[JsonDict]) -> list[Optional[str]]:
"""/language/eng -> English"""
langs = []
for lang in language_blob:
@ -252,14 +255,14 @@ def get_languages(language_blob):
return langs
def get_dict_field(blob: Optional[JsonDict], field_name: str) -> Optional[Any]:
    """extract a single field (e.g. the isni) from a remote id data blob

    Returns None when the blob is empty, is not a dict, or lacks the field.
    """
    if not blob or not isinstance(blob, dict):
        return None
    return blob.get(field_name)
def get_wikipedia_link(links):
def get_wikipedia_link(links: list[Any]) -> Optional[str]:
"""extract wikipedia links"""
if not isinstance(links, list):
return None
@ -272,7 +275,7 @@ def get_wikipedia_link(links):
return None
def get_inventaire_id(links):
def get_inventaire_id(links: list[Any]) -> Optional[str]:
"""extract and format inventaire ids"""
if not isinstance(links, list):
return None
@ -282,11 +285,13 @@ def get_inventaire_id(links):
continue
if link.get("title") == "inventaire.io":
iv_link = link.get("url")
if not isinstance(iv_link, str):
return None
return iv_link.split("/")[-1]
return None
def pick_default_edition(options):
def pick_default_edition(options: list[JsonDict]) -> Optional[JsonDict]:
"""favor physical copies with covers in english"""
if not options:
return None

View file

@ -6,8 +6,9 @@ from functools import reduce
import json
import operator
import logging
from typing import List
from typing import Any, Optional
from uuid import uuid4
from typing_extensions import Self
import aiohttp
from Crypto.PublicKey import RSA
@ -85,7 +86,7 @@ class ActivitypubMixin:
super().__init__(*args, **kwargs)
@classmethod
def find_existing_by_remote_id(cls, remote_id: str) -> Self:
    """look up a remote id in the db

    Delegates to `find_existing` with the remote id as the "id" field.
    """
    return cls.find_existing({"id": remote_id})
@ -137,7 +138,7 @@ class ActivitypubMixin:
queue=queue,
)
def get_recipients(self, software=None) -> List[str]:
def get_recipients(self, software=None) -> list[str]:
"""figure out which inbox urls to post to"""
# first we have to figure out who should receive this activity
privacy = self.privacy if hasattr(self, "privacy") else "public"
@ -198,7 +199,14 @@ class ActivitypubMixin:
class ObjectMixin(ActivitypubMixin):
"""add this mixin for object models that are AP serializable"""
def save(self, *args, created=None, software=None, priority=BROADCAST, **kwargs):
def save(
self,
*args: Any,
created: Optional[bool] = None,
software: Any = None,
priority: str = BROADCAST,
**kwargs: Any,
) -> None:
"""broadcast created/updated/deleted objects as appropriate"""
broadcast = kwargs.get("broadcast", True)
# this bonus kwarg would cause an error in the base save method
@ -507,14 +515,14 @@ def unfurl_related_field(related_field, sort_field=None):
@app.task(queue=BROADCAST)
def broadcast_task(sender_id: int, activity: str, recipients: list[str]) -> None:
    """the celery task for broadcast

    Loads the sending user (together with its key pair) by id and runs
    `async_broadcast` to completion for the given recipients.
    """
    user_model = apps.get_model("bookwyrm.User", require_ready=True)
    sender = user_model.objects.select_related("key_pair").get(id=sender_id)
    asyncio.run(async_broadcast(recipients, sender, activity))
async def async_broadcast(recipients: List[str], sender, data: str):
async def async_broadcast(recipients: list[str], sender, data: str):
"""Send all the broadcasts simultaneously"""
timeout = aiohttp.ClientTimeout(total=10)
async with aiohttp.ClientSession(timeout=timeout) as session:

View file

@ -1,6 +1,7 @@
""" database schema for books and shelves """
from itertools import chain
import re
from typing import Any
from django.contrib.postgres.search import SearchVectorField
from django.contrib.postgres.indexes import GinIndex
@ -90,7 +91,7 @@ class BookDataModel(ObjectMixin, BookWyrmModel):
abstract = True
def save(self, *args, **kwargs):
def save(self, *args: Any, **kwargs: Any) -> None:
"""ensure that the remote_id is within this instance"""
if self.id:
self.remote_id = self.get_remote_id()
@ -204,7 +205,7 @@ class Book(BookDataModel):
text += f" ({self.edition_info})"
return text
def save(self, *args, **kwargs):
def save(self, *args: Any, **kwargs: Any) -> None:
"""can't be abstract for query reasons, but you shouldn't USE it"""
if not isinstance(self, Edition) and not isinstance(self, Work):
raise ValueError("Books should be added as Editions or Works")
@ -343,7 +344,7 @@ class Edition(Book):
# max rank is 9
return rank
def save(self, *args, **kwargs):
def save(self, *args: Any, **kwargs: Any) -> None:
"""set some fields on the edition object"""
# calculate isbn 10/13
if self.isbn_13 and self.isbn_13[:3] == "978" and not self.isbn_10:

View file

@ -61,7 +61,7 @@ class FederatedServer(BookWyrmModel):
).update(active=True, deactivation_reason=None)
@classmethod
def is_blocked(cls, url):
def is_blocked(cls, url: str) -> bool:
"""look up if a domain is blocked"""
url = urlparse(url)
domain = url.netloc

View file

@ -233,3 +233,13 @@ class Openlibrary(TestCase):
self.assertFalse(ignore_edition({"languages": "languages/fr"}))
self.assertTrue(ignore_edition({"languages": "languages/eng"}))
self.assertTrue(ignore_edition({"format": "paperback"}))
def test_remote_id_from_model(self):
    """figure out a url from an id"""
    # the connector should build the author url from the openlibrary key
    obj = models.Author.objects.create(
        name="George Elliott", openlibrary_key="OL453734A"
    )
    self.assertEqual(
        self.connector.get_remote_id_from_model(obj),
        "https://openlibrary.org/authors/OL453734A",
    )

View file

@ -10,6 +10,9 @@ django_settings_module = "bookwyrm.settings"
ignore_errors = True
implicit_reexport = True
[mypy-bookwyrm.connectors.*]
ignore_errors = False
[mypy-celerywyrm.*]
ignore_errors = False