From 9c03bf782e14a991cf317055d4f91eeb44718fc2 Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Mon, 30 May 2022 10:15:22 -0700 Subject: [PATCH] Make an async request to all search connectors This is the untest first pass at re-arranging remote search to work in parallel rather than sequence. It moves a couple functions around (raise_not_valid_url, for example, needs to be in connector_manager.py now to avoid circular imports). It adds a function to Connector objects that generates a search result (either to the isbn endpoint or the free text endpoint) based on the query, which was previously done as part of the search. I also lowered the timeout to 8 seconds by default. --- bookwyrm/connectors/abstract_connector.py | 34 ++++----- bookwyrm/connectors/connector_manager.py | 84 +++++++++++++---------- bookwyrm/settings.py | 2 +- requirements.txt | 1 + 4 files changed, 62 insertions(+), 59 deletions(-) diff --git a/bookwyrm/connectors/abstract_connector.py b/bookwyrm/connectors/abstract_connector.py index 56e273886..6685d5a09 100644 --- a/bookwyrm/connectors/abstract_connector.py +++ b/bookwyrm/connectors/abstract_connector.py @@ -1,9 +1,8 @@ """ functionality outline for a book data connector """ from abc import ABC, abstractmethod import imghdr -import ipaddress import logging -from urllib.parse import urlparse +import re from django.core.files.base import ContentFile from django.db import transaction @@ -11,7 +10,7 @@ import requests from requests.exceptions import RequestException from bookwyrm import activitypub, models, settings -from .connector_manager import load_more_data, ConnectorException +from .connector_manager import load_more_data, ConnectorException, raise_not_valid_url from .format_mappings import format_mappings @@ -39,6 +38,18 @@ class AbstractMinimalConnector(ABC): for field in self_fields: setattr(self, field, getattr(info, field)) + def get_search_url(self, query): + """ format the query url """ + # Check if the query resembles an ISBN + isbn = re.sub(r"[\W_]", "", query) # removes filler characters + maybe_isbn = len(isbn) in [10, 13] # ISBN10 or ISBN13 + if maybe_isbn and self.isbn_search_url and self.isbn_search_url != "": + return f"{self.isbn_search_url}{query}" + + # NOTE: previously, we tried searching isbn and if that produces no results, + # searched as free text. This, instead, only searches isbn if it's isbn-y + return f"{self.search_url}{query}" + def search(self, query, min_confidence=None, timeout=settings.QUERY_TIMEOUT): """free text search""" params = {} @@ -254,9 +265,6 @@ def get_data(url, params=None, timeout=10): # check if the url is blocked raise_not_valid_url(url) - if models.FederatedServer.is_blocked(url): - raise ConnectorException(f"Attempting to load data from blocked url: {url}") - try: resp = requests.get( url, @@ -311,20 +319,6 @@ def get_image(url, timeout=10): return image_content, extension -def raise_not_valid_url(url): - """do some basic reality checks on the url""" - parsed = urlparse(url) - if not parsed.scheme in ["http", "https"]: - raise ConnectorException("Invalid scheme: ", url) - - try: - ipaddress.ip_address(parsed.netloc) - raise ConnectorException("Provided url is an IP address: ", url) - except ValueError: - # it's not an IP address, which is good - pass - - class Mapping: """associate a local database field with a field in an external dataset""" diff --git a/bookwyrm/connectors/connector_manager.py b/bookwyrm/connectors/connector_manager.py index 14bb702cb..2b5ab1c9d 100644 --- a/bookwyrm/connectors/connector_manager.py +++ b/bookwyrm/connectors/connector_manager.py @@ -1,10 +1,11 @@ """ interface with whatever connectors the app has """ -from datetime import datetime +import asyncio import importlib +import ipaddress import logging -import re from urllib.parse import urlparse +import aiohttp from django.dispatch import receiver from django.db.models import signals @@ -21,52 +22,42 @@ class ConnectorException(HTTPError): """when the connector can't do what was asked""" +async def async_connector_search(query, connectors, params): + """Try a number of requests simultaneously""" + timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT) + async with aiohttp.ClientSession(timeout=timeout) as session: + for connector in connectors: + url = connector.get_search_url(query) + raise_not_valid_url(url) + + async with session.get(url, params=params) as response: + print("Status:", response.status) + print(response.ok) + print("Content-type:", response.headers['content-type']) + + raw_response = await response.json() + yield { + "connector": connector, + "results": connector.parse_search_data(raw_response) + } + + def search(query, min_confidence=0.1, return_first=False): """find books based on arbitary keywords""" if not query: return [] results = [] - # Have we got a ISBN ? - isbn = re.sub(r"[\W_]", "", query) - maybe_isbn = len(isbn) in [10, 13] # ISBN10 or ISBN13 - start_time = datetime.now() - for connector in get_connectors(): - result_set = None - if maybe_isbn and connector.isbn_search_url and connector.isbn_search_url != "": - # Search on ISBN - try: - result_set = connector.isbn_search(isbn) - except Exception as err: # pylint: disable=broad-except - logger.info(err) - # if this fails, we can still try regular search + connectors = list(get_connectors()) - # if no isbn search results, we fallback to generic search - if not result_set: - try: - result_set = connector.search(query, min_confidence=min_confidence) - except Exception as err: # pylint: disable=broad-except - # we don't want *any* error to crash the whole search page - logger.info(err) - continue - - if return_first and result_set: - # if we found anything, return it - return result_set[0] - - if result_set: - results.append( - { - "connector": connector, - "results": result_set, - } - ) - if (datetime.now() - start_time).seconds >= SEARCH_TIMEOUT: - break + # load as many results as we can + params = {"min_confidence": min_confidence} + results = asyncio.run(async_connector_search(query, connectors, params)) if return_first: - return None + # find the best result from all the responses and return that + raise Exception("Not implemented yet") return results @@ -133,3 +124,20 @@ def create_connector(sender, instance, created, *args, **kwargs): """create a connector to an external bookwyrm server""" if instance.application_type == "bookwyrm": get_or_create_connector(f"https://{instance.server_name}") + + +def raise_not_valid_url(url): + """do some basic reality checks on the url""" + parsed = urlparse(url) + if not parsed.scheme in ["http", "https"]: + raise ConnectorException("Invalid scheme: ", url) + + try: + ipaddress.ip_address(parsed.netloc) + raise ConnectorException("Provided url is an IP address: ", url) + except ValueError: + # it's not an IP address, which is good + pass + + if models.FederatedServer.is_blocked(url): + raise ConnectorException(f"Attempting to load data from blocked url: {url}") diff --git a/bookwyrm/settings.py b/bookwyrm/settings.py index e16c576e1..dc0d71f30 100644 --- a/bookwyrm/settings.py +++ b/bookwyrm/settings.py @@ -212,7 +212,7 @@ STREAMS = [ # Search configuration # total time in seconds that the instance will spend searching connectors -SEARCH_TIMEOUT = int(env("SEARCH_TIMEOUT", 15)) +SEARCH_TIMEOUT = int(env("SEARCH_TIMEOUT", 8)) # timeout for a query to an individual connector QUERY_TIMEOUT = int(env("QUERY_TIMEOUT", 5)) diff --git a/requirements.txt b/requirements.txt index 7614dc421..c3bdaf757 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ +aiohttp==3.8.1 celery==5.2.2 colorthief==0.2.1 Django==3.2.13