Some small improvements to annotations

Joeri de Ruiter 2023-07-28 20:54:03 +02:00
parent f07d7b02f1
commit 2920973961
5 changed files with 111 additions and 65 deletions

View file

@@ -3,7 +3,7 @@ from __future__ import annotations
from dataclasses import asdict, dataclass
from functools import reduce
import operator
from typing import Optional, Union, Any
from typing import Optional, Union, Any, Literal, overload
from django.contrib.postgres.search import SearchRank, SearchQuery
from django.db.models import F, Q
@@ -14,17 +14,40 @@ from bookwyrm import connectors
from bookwyrm.settings import MEDIA_FULL_URL
@overload
def search(
query: str,
*,
min_confidence: float = 0,
filters: Optional[list[Any]] = None,
return_first: Literal[False],
) -> QuerySet[models.Edition]:
...
@overload
def search(
query: str,
*,
min_confidence: float = 0,
filters: Optional[list[Any]] = None,
return_first: Literal[True],
) -> Optional[models.Edition]:
...
# pylint: disable=arguments-differ
def search(
query: str,
*,
min_confidence: float = 0,
filters: Optional[list[Any]] = None,
return_first: bool = False,
):
) -> Union[Optional[models.Edition], QuerySet[models.Edition]]:
"""search your local database"""
filters = filters or []
if not query:
return []
return None if return_first else []
query = query.strip()
results = None
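
The two @overload stubs above let mypy narrow the return type of search() from the literal value passed for return_first: callers asking for return_first=True get Optional[models.Edition], everyone else gets a QuerySet. A minimal, self-contained sketch of the same pattern, with illustrative names (Item, find) that are not part of the BookWyrm code:

# Sketch only -- Item and find are illustrative names, not BookWyrm code.
from typing import Literal, Optional, Union, overload

class Item:
    """stand-in for models.Edition in this sketch"""

@overload
def find(query: str, *, return_first: Literal[False] = False) -> list[Item]:
    ...

@overload
def find(query: str, *, return_first: Literal[True]) -> Optional[Item]:
    ...

def find(
    query: str, *, return_first: bool = False
) -> Union[list[Item], Optional[Item]]:
    """the single real implementation behind both overload stubs"""
    if not query:
        return None if return_first else []
    results = [Item()]
    return results[0] if return_first else results

editions = find("dune")                  # mypy infers list[Item]
first = find("dune", return_first=True)  # mypy infers Optional[Item]
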
@@ -97,7 +120,9 @@ def search_identifiers(
return results
def search_title_author(query, min_confidence, *filters, return_first=False):
def search_title_author(
query, min_confidence, *filters, return_first=False
) -> QuerySet[models.Edition]:
"""searches for title and author"""
query = SearchQuery(query, config="simple") | SearchQuery(query, config="english")
results = (

View file

@@ -22,15 +22,14 @@ from ..book_search import SearchResult
logger = logging.getLogger(__name__)
JsonDict = dict[str, Any]
class ConnectorResults(TypedDict):
"""TypedDict for results returned by connector"""
connector: AbstractMinimalConnector
results: list[Any]
Formatter = Callable[[Any], Optional[Union[str, list[str], list[Optional[str]]]]]
results: list[SearchResult]
class AbstractMinimalConnector(ABC):
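
Replacing the ad-hoc results dict with a TypedDict lets mypy check the keys and value types wherever a ConnectorResults is built or read. A small self-contained sketch of the pattern; SearchHit and run_connector are made-up names for illustration:

# Sketch only -- SearchHit and run_connector are illustrative, not BookWyrm names.
from typing import TypedDict

class SearchHit:
    """stand-in for bookwyrm.book_search.SearchResult in this sketch"""

class ConnectorResults(TypedDict):
    """typed shape of one connector's search response"""
    connector: object          # the real code uses AbstractMinimalConnector here
    results: list[SearchHit]

def run_connector(connector: object) -> ConnectorResults:
    # constructing through the class name lets mypy flag missing or misspelled keys
    return ConnectorResults(connector=connector, results=[SearchHit()])

hits = run_connector(object())
print(len(hits["results"]))    # key access is type-checked, too
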
@@ -98,12 +97,12 @@ class AbstractMinimalConnector(ABC):
logger.exception(err)
return None
return {
"connector": self,
"results": self.process_search_response(
return ConnectorResults(
connector=self,
results=self.process_search_response(
query, raw_data, min_confidence
),
}
)
except asyncio.TimeoutError:
logger.info("Connection timed out for url: %s", url)
except aiohttp.ClientError as err:
@@ -188,14 +187,14 @@ class AbstractConnector(AbstractMinimalConnector):
load_more_data.delay(self.connector.id, work.id)
return edition
def get_book_data(self, remote_id: str) -> BookData: # pylint: disable=no-self-use
def get_book_data(self, remote_id: str) -> JsonDict: # pylint: disable=no-self-use
"""this allows connectors to override the default behavior"""
return get_data(remote_id)
def create_edition_from_data(
self,
work: models.Work,
edition_data: Union[str, BookData],
edition_data: Union[str, JsonDict],
instance: Optional[models.Edition] = None,
) -> Optional[models.Edition]:
"""if we already have the work, we're ready"""
@@ -272,19 +271,19 @@ class AbstractConnector(AbstractMinimalConnector):
return self.create_edition_from_data(obj.parent_work, data, instance=obj)
@abstractmethod
def is_work_data(self, data: BookData) -> bool:
def is_work_data(self, data: JsonDict) -> bool:
"""differentiate works and editions"""
@abstractmethod
def get_edition_from_work_data(self, data: BookData) -> Optional[BookData]:
def get_edition_from_work_data(self, data: JsonDict) -> JsonDict:
"""every work needs at least one edition"""
@abstractmethod
def get_work_from_edition_data(self, data: BookData) -> BookData:
def get_work_from_edition_data(self, data: JsonDict) -> JsonDict:
"""every edition needs a work"""
@abstractmethod
def get_authors_from_data(self, data: BookData) -> Iterator[models.Author]:
def get_authors_from_data(self, data: JsonDict) -> Iterator[models.Author]:
"""load author data"""
@abstractmethod
@@ -292,10 +291,10 @@ class AbstractConnector(AbstractMinimalConnector):
"""get more info on a book"""
def dict_from_mappings(data: BookData, mappings: list[Mapping]) -> dict[str, Any]:
def dict_from_mappings(data: JsonDict, mappings: list[Mapping]) -> JsonDict:
"""create a dict in Activitypub format, using mappings supplies by
the subclass"""
result: dict[str, Any] = {}
result: JsonDict = {}
for mapping in mappings:
# sometimes there are multiple mappings for one field, don't
# overwrite earlier writes in that case
@@ -309,7 +308,7 @@ def get_data(
url: str,
params: Optional[dict[str, str]] = None,
timeout: int = settings.QUERY_TIMEOUT,
) -> BookData:
) -> JsonDict:
"""wrapper for request.get"""
# check if the url is blocked
raise_not_valid_url(url)
@@ -342,8 +341,10 @@ def get_data(
logger.info(err)
raise ConnectorException(err)
# For now assume the returned data is a correctly formatted dict
return data # type: ignore[no-any-return]
if not isinstance(data, dict):
raise ConnectorException("Unexpected data format")
return data
def get_image(
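
The isinstance check above replaces the old type: ignore[no-any-return]: it validates the response shape at runtime and also narrows the Any returned by the JSON parser, so mypy accepts the JsonDict return type without suppressions. A self-contained sketch of the idea, with illustrative names (fetch_json, FetchError):

# Sketch only -- fetch_json and FetchError are illustrative, not BookWyrm names.
import json
from typing import Any

JsonDict = dict[str, Any]

class FetchError(Exception):
    """raised when the payload is not the expected shape"""

def fetch_json(raw: str) -> JsonDict:
    data = json.loads(raw)          # typed as Any by the json stubs
    if not isinstance(data, dict):  # runtime guard doubles as a narrowing hint for mypy
        raise FetchError("Unexpected data format")
    return data                     # no `type: ignore` needed any more

print(fetch_json('{"title": "Ancillary Justice"}'))
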
@@ -382,7 +383,7 @@ class Mapping:
self,
local_field: str,
remote_field: Optional[str] = None,
formatter: Optional[Formatter] = None,
formatter: Optional[Callable[[Any], Any]] = None,
):
noop = lambda x: x
@@ -390,7 +391,7 @@ class Mapping:
self.remote_field = remote_field or local_field
self.formatter = formatter or noop
def get_value(self, data: BookData) -> Optional[Any]:
def get_value(self, data: JsonDict) -> Optional[Any]:
"""pull a field from incoming json and return the formatted version"""
value = data.get(self.remote_field)
if not value:
@@ -434,6 +435,3 @@ def maybe_isbn(query: str) -> bool:
10,
13,
] # ISBN10 or ISBN13, or maybe ISBN10 missing a leading zero
BookData = dict[str, Any]

View file

@@ -5,7 +5,7 @@ import importlib
import ipaddress
import logging
from asyncio import Future
from typing import Iterator, Any, Optional, Union
from typing import Iterator, Any, Optional, Union, overload, Literal
from urllib.parse import urlparse
import aiohttp
@@ -15,6 +15,7 @@ from django.db.models import signals
from requests import HTTPError
from bookwyrm import book_search, models
from bookwyrm.book_search import SearchResult
from bookwyrm.connectors import abstract_connector
from bookwyrm.settings import SEARCH_TIMEOUT
from bookwyrm.tasks import app, CONNECTORS
@@ -46,10 +47,26 @@ async def async_connector_search(
return list(results)
def search(query: str, min_confidence: float = 0.1, return_first: bool = False) -> Any:
@overload
def search(
query: str, *, min_confidence: float = 0.1, return_first: Literal[False]
) -> list[abstract_connector.ConnectorResults]:
...
@overload
def search(
query: str, *, min_confidence: float = 0.1, return_first: Literal[True]
) -> Optional[SearchResult]:
...
def search(
query: str, *, min_confidence: float = 0.1, return_first: bool = False
) -> Union[list[abstract_connector.ConnectorResults], Optional[SearchResult]]:
"""find books based on arbitrary keywords"""
if not query:
return []
return None if return_first else []
items = []
for connector in get_connectors():
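
The same Literal overload pattern used in book_search.search applies here. The practical effect at call sites, shown as an illustrative sketch that assumes the definitions above and mypy's strict optional checking:

results = search("the dispossessed", return_first=False)  # mypy: list[abstract_connector.ConnectorResults]
best = search("the dispossessed", return_first=True)  # mypy: Optional[SearchResult]

Both calls type-check without casts, and mypy insists on a None check before best is used.
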
@@ -80,7 +97,9 @@ def search(query: str, min_confidence: float = 0.1, return_first: bool = False)
return results
def first_search_result(query: str, min_confidence: float = 0.1) -> Any:
def first_search_result(
query: str, min_confidence: float = 0.1
) -> Union[models.Edition, SearchResult, None]:
"""search until you find a result that fits"""
# try local search first
result = book_search.search(query, min_confidence=min_confidence, return_first=True)
@@ -130,7 +149,7 @@ def load_more_data(connector_id: str, book_id: str) -> None:
@app.task(queue=CONNECTORS)
def create_edition_task(
connector_id: int, work_id: int, data: Union[str, abstract_connector.BookData]
connector_id: int, work_id: int, data: Union[str, abstract_connector.JsonDict]
) -> None:
"""separate task for each of the 10,000 editions of LoTR"""
connector_info = models.Connector.objects.get(id=connector_id)

View file

@@ -4,7 +4,7 @@ from typing import Any, Union, Optional, Iterator, Iterable
from bookwyrm import models
from bookwyrm.book_search import SearchResult
from .abstract_connector import AbstractConnector, Mapping, BookData
from .abstract_connector import AbstractConnector, Mapping, JsonDict
from .abstract_connector import get_data
from .connector_manager import ConnectorException, create_edition_task
@@ -65,9 +65,9 @@ class Connector(AbstractConnector):
"""convert an id/uri into a url"""
return f"{self.books_url}?action=by-uris&uris={value}"
def get_book_data(self, remote_id: str) -> BookData:
def get_book_data(self, remote_id: str) -> JsonDict:
data = get_data(remote_id)
extracted = list(data.get("entities", []).values())
extracted = list(data.get("entities", {}).values())
try:
data = extracted[0]
except (KeyError, IndexError):
@@ -75,11 +75,15 @@ class Connector(AbstractConnector):
# flatten the data so that images, uri, and claims are on the same level
return {
**data.get("claims", {}),
**{k: data.get(k) for k in ["uri", "image", "labels", "sitelinks", "type"]},
**{
k: data.get(k)
for k in ["uri", "image", "labels", "sitelinks", "type"]
if k in data
},
}
def parse_search_data(
self, data: dict[str, Any], min_confidence: float
self, data: JsonDict, min_confidence: float
) -> Iterator[SearchResult]:
for search_result in data.get("results", []):
images = search_result.get("image")
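
Besides the annotation rename, two defensive fixes land here: the default for entities becomes {} (a list has no .values()), and the flattening comprehension copies only keys that are actually present instead of writing explicit None values. A self-contained sketch with made-up sample data:

# Sketch only -- the sample data below is invented for illustration.
from typing import Any

JsonDict = dict[str, Any]

raw: JsonDict = {
    "entities": {
        "wd:Q180736": {"uri": "wd:Q180736", "claims": {"wdt:P50": ["wd:Q66016"]}},
    }
}

# a default of {} (not []) keeps .values() safe when "entities" is missing
entity = list(raw.get("entities", {}).values())[0]

# copy only the keys that exist, so absent fields don't show up as None
flattened = {
    **entity.get("claims", {}),
    **{k: entity[k] for k in ["uri", "image", "labels", "sitelinks", "type"] if k in entity},
}
print(flattened)  # {'wdt:P50': ['wd:Q66016'], 'uri': 'wd:Q180736'}
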
@@ -99,7 +103,7 @@ class Connector(AbstractConnector):
connector=self,
)
def parse_isbn_search_data(self, data: dict[str, Any]) -> Iterator[SearchResult]:
def parse_isbn_search_data(self, data: JsonDict) -> Iterator[SearchResult]:
"""got some data"""
results = data.get("entities")
if not results:
@@ -117,16 +121,16 @@ class Connector(AbstractConnector):
connector=self,
)
def is_work_data(self, data: BookData) -> bool:
def is_work_data(self, data: JsonDict) -> bool:
return data.get("type") == "work"
def load_edition_data(self, work_uri: str) -> BookData:
def load_edition_data(self, work_uri: str) -> JsonDict:
"""get a list of editions for a work"""
# pylint: disable=line-too-long
url = f"{self.books_url}?action=reverse-claims&property=wdt:P629&value={work_uri}&sort=true"
return get_data(url)
def get_edition_from_work_data(self, data: BookData) -> Optional[BookData]:
def get_edition_from_work_data(self, data: JsonDict) -> JsonDict:
work_uri = data.get("uri")
if not work_uri:
raise ConnectorException("Invalid URI")
@@ -137,7 +141,7 @@ class Connector(AbstractConnector):
raise ConnectorException("Invalid book data")
return self.get_book_data(self.get_remote_id(uri))
def get_work_from_edition_data(self, data: BookData) -> BookData:
def get_work_from_edition_data(self, data: JsonDict) -> JsonDict:
try:
uri = data.get("wdt:P629", [])[0]
except IndexError:
@@ -147,7 +151,7 @@ class Connector(AbstractConnector):
raise ConnectorException("Invalid book data")
return self.get_book_data(self.get_remote_id(uri))
def get_authors_from_data(self, data: BookData) -> Iterator[models.Author]:
def get_authors_from_data(self, data: JsonDict) -> Iterator[models.Author]:
authors = data.get("wdt:P50", [])
for author in authors:
model = self.get_or_create_author(self.get_remote_id(author))
@@ -173,7 +177,7 @@ class Connector(AbstractConnector):
def create_edition_from_data(
self,
work: models.Work,
edition_data: Union[str, BookData],
edition_data: Union[str, JsonDict],
instance: Optional[models.Edition] = None,
) -> Optional[models.Edition]:
"""pass in the url as data and then call the version in abstract connector"""
@@ -186,7 +190,7 @@ class Connector(AbstractConnector):
return super().create_edition_from_data(work, edition_data, instance=instance)
def get_cover_url(
self, cover_blob: Union[list[dict[str, str]], dict[str, str]], *_: Any
self, cover_blob: Union[list[JsonDict], JsonDict], *_: Any
) -> Optional[str]:
"""format the relative cover url into an absolute one:
{"url": "/img/entities/e794783f01b9d4f897a1ea9820b96e00d346994f"}
@@ -197,7 +201,7 @@ class Connector(AbstractConnector):
return None
cover_blob = cover_blob[0]
cover_id = cover_blob.get("url")
if not cover_id:
if not isinstance(cover_id, str):
return None
# cover may or may not be an absolute url already
if re.match(r"^http", cover_id):
@@ -215,7 +219,7 @@ class Connector(AbstractConnector):
results.append(get_language_code(data.get("labels", {})))
return results
def get_description(self, links: dict[str, str]) -> Optional[str]:
def get_description(self, links: JsonDict) -> str:
"""grab an extracted excerpt from wikipedia"""
link = links.get("enwiki")
if not link:
@@ -225,7 +229,7 @@ class Connector(AbstractConnector):
data = get_data(url)
except ConnectorException:
return ""
return data.get("extract")
return data.get("extract", "")
def get_remote_id_from_model(self, obj: models.BookDataModel) -> str:
"""use get_remote_id to figure out the link from a model obj"""
@@ -233,7 +237,7 @@ class Connector(AbstractConnector):
return self.get_remote_id(remote_id_value)
def get_language_code(options: dict[str, Any], code: str = "en") -> Any:
def get_language_code(options: JsonDict, code: str = "en") -> Any:
"""when there are a bunch of translation but we need a single field"""
result = options.get(code)
if result:

View file

@@ -4,7 +4,7 @@ from typing import Any, Optional, Union, Iterator, Iterable
from bookwyrm import models
from bookwyrm.book_search import SearchResult
from .abstract_connector import AbstractConnector, Mapping, BookData
from .abstract_connector import AbstractConnector, Mapping, JsonDict
from .abstract_connector import get_data, infer_physical_format, unique_physical_format
from .connector_manager import ConnectorException, create_edition_task
from .openlibrary_languages import languages
@@ -95,14 +95,14 @@ class Connector(AbstractConnector):
Mapping("inventaire_id", remote_field="links", formatter=get_inventaire_id),
]
def get_book_data(self, remote_id: str) -> dict[str, Any]:
def get_book_data(self, remote_id: str) -> JsonDict:
data = get_data(remote_id)
if data.get("type", {}).get("key") == "/type/redirect":
remote_id = self.base_url + data.get("location", "")
return get_data(remote_id)
return data
def get_remote_id_from_data(self, data: dict[str, Any]) -> str:
def get_remote_id_from_data(self, data: JsonDict) -> str:
"""format a url from an openlibrary id field"""
try:
key = data["key"]
@@ -110,10 +110,10 @@ class Connector(AbstractConnector):
raise ConnectorException("Invalid book data")
return f"{self.books_url}{key}"
def is_work_data(self, data: dict[str, Any]) -> bool:
def is_work_data(self, data: JsonDict) -> bool:
return bool(re.match(r"^[\/\w]+OL\d+W$", data["key"]))
def get_edition_from_work_data(self, data: dict[str, Any]) -> Optional[BookData]:
def get_edition_from_work_data(self, data: JsonDict) -> JsonDict:
try:
key = data["key"]
except KeyError:
@@ -125,7 +125,7 @@ class Connector(AbstractConnector):
raise ConnectorException("No editions for work")
return edition
def get_work_from_edition_data(self, data: BookData) -> BookData:
def get_work_from_edition_data(self, data: JsonDict) -> JsonDict:
try:
key = data["works"][0]["key"]
except (IndexError, KeyError):
@@ -133,7 +133,7 @@ class Connector(AbstractConnector):
url = f"{self.books_url}{key}"
return self.get_book_data(url)
def get_authors_from_data(self, data: BookData) -> Iterator[models.Author]:
def get_authors_from_data(self, data: JsonDict) -> Iterator[models.Author]:
"""parse author json and load or create authors"""
for author_blob in data.get("authors", []):
author_blob = author_blob.get("author", author_blob)
@@ -154,7 +154,7 @@ class Connector(AbstractConnector):
return f"{self.covers_url}/b/id/{image_name}"
def parse_search_data(
self, data: dict[str, Any], min_confidence: float
self, data: JsonDict, min_confidence: float
) -> Iterator[SearchResult]:
for idx, search_result in enumerate(data.get("docs", [])):
# build the remote id from the openlibrary key
@@ -177,7 +177,7 @@ class Connector(AbstractConnector):
confidence=confidence,
)
def parse_isbn_search_data(self, data: dict[str, Any]) -> Iterator[SearchResult]:
def parse_isbn_search_data(self, data: JsonDict) -> Iterator[SearchResult]:
for search_result in list(data.values()):
# build the remote id from the openlibrary key
key = self.books_url + search_result["key"]
@@ -191,7 +191,7 @@ class Connector(AbstractConnector):
year=search_result.get("publish_date"),
)
def load_edition_data(self, olkey: str) -> BookData:
def load_edition_data(self, olkey: str) -> JsonDict:
"""query openlibrary for editions of a work"""
url = f"{self.books_url}/works/{olkey}/editions"
return self.get_book_data(url)
@@ -216,7 +216,7 @@ class Connector(AbstractConnector):
create_edition_task.delay(self.connector.id, work.id, edition_data)
def ignore_edition(edition_data: BookData) -> bool:
def ignore_edition(edition_data: JsonDict) -> bool:
"""don't load a million editions that have no metadata"""
# an isbn, we love to see it
if edition_data.get("isbn_13") or edition_data.get("isbn_10"):
@@ -235,7 +235,7 @@ def ignore_edition(edition_data: BookData) -> bool:
return True
def get_description(description_blob: Union[dict[str, Any], str]) -> Optional[str]:
def get_description(description_blob: Union[JsonDict, str]) -> Optional[str]:
"""descriptions can be a string or a dict"""
if isinstance(description_blob, dict):
return description_blob.get("value")
@@ -247,7 +247,7 @@ def get_openlibrary_key(key: str) -> str:
return key.split("/")[-1]
def get_languages(language_blob: Iterable[dict[str, str]]) -> list[Optional[str]]:
def get_languages(language_blob: Iterable[JsonDict]) -> list[Optional[str]]:
"""/language/eng -> English"""
langs = []
for lang in language_blob:
@@ -255,7 +255,7 @@ def get_languages(language_blob: Iterable[dict[str, str]]) -> list[Optional[str]
return langs
def get_dict_field(blob: Optional[dict[str, Any]], field_name: str) -> Optional[Any]:
def get_dict_field(blob: Optional[JsonDict], field_name: str) -> Optional[Any]:
"""extract the isni from the remote id data for the author"""
if not blob or not isinstance(blob, dict):
return None
@@ -291,7 +291,7 @@ def get_inventaire_id(links: list[Any]) -> Optional[str]:
return None
def pick_default_edition(options: list[BookData]) -> Optional[BookData]:
def pick_default_edition(options: list[JsonDict]) -> Optional[JsonDict]:
"""favor physical copies with covers in english"""
if not options:
return None