Make an async request to all search connectors

This is the untest first pass at re-arranging remote search to work in
parallel rather than sequence. It moves a couple functions around
(raise_not_valid_url, for example, needs to be in connector_manager.py
now to avoid circular imports). It adds a function to Connector objects
that generates a search result (either to the isbn endpoint or the free
text endpoint) based on the query, which was previously done as part of
the search.

I also lowered the timeout to 8 seconds by default.
This commit is contained in:
Mouse Reeve 2022-05-30 10:15:22 -07:00
parent 4e3c346780
commit 9c03bf782e
4 changed files with 62 additions and 59 deletions

View file

@ -1,9 +1,8 @@
""" functionality outline for a book data connector """
from abc import ABC, abstractmethod
import imghdr
import ipaddress
import logging
from urllib.parse import urlparse
import re
from django.core.files.base import ContentFile
from django.db import transaction
@ -11,7 +10,7 @@ import requests
from requests.exceptions import RequestException
from bookwyrm import activitypub, models, settings
from .connector_manager import load_more_data, ConnectorException
from .connector_manager import load_more_data, ConnectorException, raise_not_valid_url
from .format_mappings import format_mappings
@ -39,6 +38,18 @@ class AbstractMinimalConnector(ABC):
for field in self_fields:
setattr(self, field, getattr(info, field))
def get_search_url(self, query):
""" format the query url """
# Check if the query resembles an ISBN
isbn = re.sub(r"[\W_]", "", query) # removes filler characters
maybe_isbn = len(isbn) in [10, 13] # ISBN10 or ISBN13
if maybe_isbn and self.isbn_search_url and self.isbn_search_url != "":
return f"{self.isbn_search_url}{query}"
# NOTE: previously, we tried searching isbn and if that produces no results,
# searched as free text. This, instead, only searches isbn if it's isbn-y
return f"{self.search_url}{query}"
def search(self, query, min_confidence=None, timeout=settings.QUERY_TIMEOUT):
"""free text search"""
params = {}
@ -254,9 +265,6 @@ def get_data(url, params=None, timeout=10):
# check if the url is blocked
raise_not_valid_url(url)
if models.FederatedServer.is_blocked(url):
raise ConnectorException(f"Attempting to load data from blocked url: {url}")
try:
resp = requests.get(
url,
@ -311,20 +319,6 @@ def get_image(url, timeout=10):
return image_content, extension
def raise_not_valid_url(url):
"""do some basic reality checks on the url"""
parsed = urlparse(url)
if not parsed.scheme in ["http", "https"]:
raise ConnectorException("Invalid scheme: ", url)
try:
ipaddress.ip_address(parsed.netloc)
raise ConnectorException("Provided url is an IP address: ", url)
except ValueError:
# it's not an IP address, which is good
pass
class Mapping:
"""associate a local database field with a field in an external dataset"""

View file

@ -1,10 +1,11 @@
""" interface with whatever connectors the app has """
from datetime import datetime
import asyncio
import importlib
import ipaddress
import logging
import re
from urllib.parse import urlparse
import aiohttp
from django.dispatch import receiver
from django.db.models import signals
@ -21,52 +22,42 @@ class ConnectorException(HTTPError):
"""when the connector can't do what was asked"""
async def async_connector_search(query, connectors, params):
"""Try a number of requests simultaneously"""
timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)
async with aiohttp.ClientSession(timeout=timeout) as session:
for connector in connectors:
url = connector.get_search_url(query)
raise_not_valid_url(url)
async with session.get(url, params=params) as response:
print("Status:", response.status)
print(response.ok)
print("Content-type:", response.headers['content-type'])
raw_response = await response.json()
yield {
"connector": connector,
"results": connector.parse_search_data(raw_response)
}
def search(query, min_confidence=0.1, return_first=False):
"""find books based on arbitary keywords"""
if not query:
return []
results = []
# Have we got a ISBN ?
isbn = re.sub(r"[\W_]", "", query)
maybe_isbn = len(isbn) in [10, 13] # ISBN10 or ISBN13
start_time = datetime.now()
for connector in get_connectors():
result_set = None
if maybe_isbn and connector.isbn_search_url and connector.isbn_search_url != "":
# Search on ISBN
try:
result_set = connector.isbn_search(isbn)
except Exception as err: # pylint: disable=broad-except
logger.info(err)
# if this fails, we can still try regular search
connectors = list(get_connectors())
# if no isbn search results, we fallback to generic search
if not result_set:
try:
result_set = connector.search(query, min_confidence=min_confidence)
except Exception as err: # pylint: disable=broad-except
# we don't want *any* error to crash the whole search page
logger.info(err)
continue
if return_first and result_set:
# if we found anything, return it
return result_set[0]
if result_set:
results.append(
{
"connector": connector,
"results": result_set,
}
)
if (datetime.now() - start_time).seconds >= SEARCH_TIMEOUT:
break
# load as many results as we can
params = {"min_confidence": min_confidence}
results = asyncio.run(async_connector_search(query, connectors, params))
if return_first:
return None
# find the best result from all the responses and return that
raise Exception("Not implemented yet")
return results
@ -133,3 +124,20 @@ def create_connector(sender, instance, created, *args, **kwargs):
"""create a connector to an external bookwyrm server"""
if instance.application_type == "bookwyrm":
get_or_create_connector(f"https://{instance.server_name}")
def raise_not_valid_url(url):
"""do some basic reality checks on the url"""
parsed = urlparse(url)
if not parsed.scheme in ["http", "https"]:
raise ConnectorException("Invalid scheme: ", url)
try:
ipaddress.ip_address(parsed.netloc)
raise ConnectorException("Provided url is an IP address: ", url)
except ValueError:
# it's not an IP address, which is good
pass
if models.FederatedServer.is_blocked(url):
raise ConnectorException(f"Attempting to load data from blocked url: {url}")

View file

@ -212,7 +212,7 @@ STREAMS = [
# Search configuration
# total time in seconds that the instance will spend searching connectors
SEARCH_TIMEOUT = int(env("SEARCH_TIMEOUT", 15))
SEARCH_TIMEOUT = int(env("SEARCH_TIMEOUT", 8))
# timeout for a query to an individual connector
QUERY_TIMEOUT = int(env("QUERY_TIMEOUT", 5))

View file

@ -1,3 +1,4 @@
aiohttp==3.8.1
celery==5.2.2
colorthief==0.2.1
Django==3.2.13