2021-03-08 16:49:10 +00:00
|
|
|
""" interface with whatever connectors the app has """
|
2023-07-28 15:43:32 +00:00
|
|
|
from __future__ import annotations
|
2022-05-30 17:15:22 +00:00
|
|
|
import asyncio
|
2021-01-02 16:14:28 +00:00
|
|
|
import importlib
|
2022-05-30 17:15:22 +00:00
|
|
|
import ipaddress
|
2021-04-07 15:09:47 +00:00
|
|
|
import logging
|
2023-07-28 15:43:32 +00:00
|
|
|
from asyncio import Future
|
2023-07-28 18:54:03 +00:00
|
|
|
from typing import Iterator, Any, Optional, Union, overload, Literal
|
2020-05-04 19:36:55 +00:00
|
|
|
from urllib.parse import urlparse
|
2020-03-27 22:25:08 +00:00
|
|
|
|
2022-05-30 17:15:22 +00:00
|
|
|
import aiohttp
|
2021-04-02 00:02:45 +00:00
|
|
|
from django.dispatch import receiver
|
|
|
|
from django.db.models import signals
|
|
|
|
|
2020-09-17 20:02:52 +00:00
|
|
|
from requests import HTTPError
|
|
|
|
|
2021-09-16 18:07:36 +00:00
|
|
|
from bookwyrm import book_search, models
|
2023-07-28 18:54:03 +00:00
|
|
|
from bookwyrm.book_search import SearchResult
|
2023-07-28 15:43:32 +00:00
|
|
|
from bookwyrm.connectors import abstract_connector
|
2023-04-04 16:46:32 +00:00
|
|
|
from bookwyrm.settings import SEARCH_TIMEOUT
|
2023-07-20 04:16:38 +00:00
|
|
|
from bookwyrm.tasks import app, CONNECTORS
|
2020-03-27 22:25:08 +00:00
|
|
|
|
2021-04-07 15:09:47 +00:00
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
2020-03-07 20:22:28 +00:00
|
|
|
|
2021-01-02 16:14:28 +00:00
|
|
|
class ConnectorException(HTTPError):
    """when the connector can't do what was asked"""
|
2020-05-04 19:36:55 +00:00
|
|
|
|
|
|
|
|
2023-07-28 15:43:32 +00:00
|
|
|
async def async_connector_search(
    query: str,
    items: list[tuple[str, abstract_connector.AbstractConnector]],
    min_confidence: float,
) -> list[Optional[abstract_connector.ConnectorResults]]:
    """Fire off every connector request at once and wait for all of them.

    Each (url, connector) pair becomes one request; a connector that fails
    yields None in the returned list, in the same order as ``items``.
    """
    # one shared session with a hard deadline so a slow remote can't hang us
    timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)
    async with aiohttp.ClientSession(timeout=timeout) as session:
        pending: list[Future[Optional[abstract_connector.ConnectorResults]]] = [
            asyncio.ensure_future(
                connector.get_results(session, url, min_confidence, query)
            )
            for url, connector in items
        ]
        gathered = await asyncio.gather(*pending)
        return list(gathered)
|
2022-05-30 17:15:22 +00:00
|
|
|
|
|
|
|
|
2023-07-28 18:54:03 +00:00
|
|
|
@overload
def search(
    query: str, *, min_confidence: float = 0.1, return_first: Literal[False]
) -> list[abstract_connector.ConnectorResults]:
    ...


@overload
def search(
    query: str, *, min_confidence: float = 0.1, return_first: Literal[True]
) -> Optional[SearchResult]:
    ...


def search(
    query: str, *, min_confidence: float = 0.1, return_first: bool = False
) -> Union[list[abstract_connector.ConnectorResults], Optional[SearchResult]]:
    """find books based on arbitrary keywords"""
    # nothing to search for
    if not query:
        return None if return_first else []

    # pair every connector with its search url, dropping urls that fail
    # basic validity/blocklist checks
    targets = []
    for connector in get_connectors():
        search_url = connector.get_search_url(query)
        try:
            raise_not_valid_url(search_url)
        except ConnectorException:
            # skip this connector and carry on with the rest
            logger.info("Request denied to blocked domain: %s", search_url)
            continue
        targets.append((search_url, connector))

    # run all the remote requests concurrently; a failed request comes
    # back as None, so keep only the truthy responses
    responses = asyncio.run(async_connector_search(query, targets, min_confidence))
    results = [response for response in responses if response]

    if not return_first:
        return results

    # flatten every connector's hits and hand back the single best one
    all_hits = [hit for response in results for hit in response["results"]]
    all_hits.sort(key=lambda hit: hit.confidence, reverse=True)
    return all_hits[0] if all_hits else None
|
2020-03-27 22:25:08 +00:00
|
|
|
|
2020-05-03 19:59:06 +00:00
|
|
|
|
2023-07-28 18:54:03 +00:00
|
|
|
def first_search_result(
    query: str, min_confidence: float = 0.1
) -> Union[models.Edition, SearchResult, None]:
    """search until you find a result that fits"""
    # local catalogue first; only hit remote endpoints when it comes up empty
    local = book_search.search(query, min_confidence=min_confidence, return_first=True)
    return (
        local
        or search(query, min_confidence=min_confidence, return_first=True)
        or None
    )
|
2020-05-03 22:26:47 +00:00
|
|
|
|
|
|
|
|
2023-07-28 15:43:32 +00:00
|
|
|
def get_connectors() -> Iterator[abstract_connector.AbstractConnector]:
    """load all connectors"""
    # active connectors only, in priority order
    active = models.Connector.objects.filter(active=True).order_by("priority")
    for connector_info in active.all():
        yield load_connector(connector_info)
|
2021-01-02 16:14:28 +00:00
|
|
|
|
|
|
|
|
2023-07-28 15:43:32 +00:00
|
|
|
def get_or_create_connector(remote_id: str) -> abstract_connector.AbstractConnector:
    """get the connector related to the object's server"""
    # the server's domain doubles as the connector identifier
    identifier = urlparse(remote_id).netloc
    if not identifier:
        raise ValueError("Invalid remote id")

    try:
        connector_info = models.Connector.objects.get(identifier=identifier)
    except models.Connector.DoesNotExist:
        # first contact with this server: assume it's a bookwyrm instance
        # and register a connector with the standard url layout
        connector_info = models.Connector.objects.create(
            identifier=identifier,
            connector_file="bookwyrm_connector",
            base_url=f"https://{identifier}",
            books_url=f"https://{identifier}/book",
            covers_url=f"https://{identifier}/images/covers",
            search_url=f"https://{identifier}/search?q=",
            priority=2,
        )

    return load_connector(connector_info)
|
|
|
|
|
|
|
|
|
2023-07-20 04:16:38 +00:00
|
|
|
@app.task(queue=CONNECTORS)
def load_more_data(connector_id: str, book_id: str) -> None:
    """background the work of getting all 10,000 editions of LoTR"""
    connector_info = models.Connector.objects.get(id=connector_id)
    edition = models.Book.objects.select_subclasses().get(id=book_id)
    load_connector(connector_info).expand_book_data(edition)
|
|
|
|
|
|
|
|
|
2023-07-20 04:16:38 +00:00
|
|
|
@app.task(queue=CONNECTORS)
def create_edition_task(
    connector_id: int, work_id: int, data: Union[str, abstract_connector.JsonDict]
) -> None:
    """separate task for each of the 10,000 editions of LoTR"""
    connector_info = models.Connector.objects.get(id=connector_id)
    parent_work = models.Work.objects.select_subclasses().get(id=work_id)
    load_connector(connector_info).create_edition_from_data(parent_work, data)
|
|
|
|
|
|
|
|
|
2023-07-28 15:43:32 +00:00
|
|
|
def load_connector(
    connector_info: models.Connector,
) -> abstract_connector.AbstractConnector:
    """instantiate the connector class"""
    # each connector lives in its own module under bookwyrm.connectors
    module_path = f"bookwyrm.connectors.{connector_info.connector_file}"
    module = importlib.import_module(module_path)
    return module.Connector(connector_info.identifier)  # type: ignore[no-any-return]
|
2021-04-02 00:02:45 +00:00
|
|
|
|
|
|
|
|
|
|
|
@receiver(signals.post_save, sender="bookwyrm.FederatedServer")
# pylint: disable=unused-argument
def create_connector(
    sender: Any,
    instance: models.FederatedServer,
    created: Any,
    *args: Any,
    **kwargs: Any,
) -> None:
    """create a connector to an external bookwyrm server"""
    # only bookwyrm peers get a connector; other fediverse software doesn't
    # expose the book api
    if instance.application_type != "bookwyrm":
        return
    get_or_create_connector(f"https://{instance.server_name}")
|
2022-05-30 17:15:22 +00:00
|
|
|
|
|
|
|
|
2023-07-28 15:43:32 +00:00
|
|
|
def raise_not_valid_url(url: str) -> None:
    """Do some basic reality checks on the url.

    Raises:
        ConnectorException: if the scheme isn't http(s), the host is a bare
            IP address, or the url's domain is blocked by a FederatedServer
            entry.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ["http", "https"]:
        raise ConnectorException("Invalid scheme: ", url)

    # check hostname rather than netloc: netloc keeps the port (and IPv6
    # brackets), so e.g. "http://127.0.0.1:8000/" would previously slip
    # past the IP-address check
    try:
        ipaddress.ip_address(parsed.hostname or "")
        raise ConnectorException("Provided url is an IP address: ", url)
    except ValueError:
        # it's not an IP address, which is good
        pass

    if models.FederatedServer.is_blocked(url):
        raise ConnectorException(f"Attempting to load data from blocked url: {url}")
|