Make an async request to all search connectors

This is the untest first pass at re-arranging remote search to work in parallel rather than sequence. It moves a couple functions around (raise_not_valid_url, for example, needs to be in connector_manager.py now to avoid circular imports). It adds a function to Connector objects that generates a search result (either to the isbn endpoint or the free text endpoint) based on the query, which was previously done as part of the search. I also lowered the timeout to 8 seconds by default.
2025-04-24 03:04:10 +00:00 · 2022-05-30 10:15:22 -07:00 · 2022-05-30 10:15:22 -07:00 · 9c03bf782e
commit 9c03bf782e
parent 4e3c346780
4 changed files with 62 additions and 59 deletions
--- a/bookwyrm/connectors/abstract_connector.py
+++ b/bookwyrm/connectors/abstract_connector.py
@ -1,9 +1,8 @@
 """ functionality outline for a book data connector """
 from abc import ABC, abstractmethod
 import imghdr
-import ipaddress
 import logging
-from urllib.parse import urlparse
+import re

 from django.core.files.base import ContentFile
 from django.db import transaction
@ -11,7 +10,7 @@ import requests
 from requests.exceptions import RequestException

 from bookwyrm import activitypub, models, settings
-from .connector_manager import load_more_data, ConnectorException
+from .connector_manager import load_more_data, ConnectorException, raise_not_valid_url
 from .format_mappings import format_mappings


@ -39,6 +38,18 @@ class AbstractMinimalConnector(ABC):
        for field in self_fields:
            setattr(self, field, getattr(info, field))

+    def get_search_url(self, query):
+        """ format the query url """
+        # Check if the query resembles an ISBN
+        isbn = re.sub(r"[\W_]", "", query) # removes filler characters
+        maybe_isbn = len(isbn) in [10, 13]  # ISBN10 or ISBN13
+        if maybe_isbn and self.isbn_search_url and self.isbn_search_url != "":
+            return f"{self.isbn_search_url}{query}"
+
+        # NOTE: previously, we tried searching isbn and if that produces no results,
+        # searched as free text. This, instead, only searches isbn if it's isbn-y
+        return f"{self.search_url}{query}"
+
    def search(self, query, min_confidence=None, timeout=settings.QUERY_TIMEOUT):
        """free text search"""
        params = {}
@ -254,9 +265,6 @@ def get_data(url, params=None, timeout=10):
    # check if the url is blocked
    raise_not_valid_url(url)

-    if models.FederatedServer.is_blocked(url):
-        raise ConnectorException(f"Attempting to load data from blocked url: {url}")
-
    try:
        resp = requests.get(
            url,
@ -311,20 +319,6 @@ def get_image(url, timeout=10):
    return image_content, extension


-def raise_not_valid_url(url):
-    """do some basic reality checks on the url"""
-    parsed = urlparse(url)
-    if not parsed.scheme in ["http", "https"]:
-        raise ConnectorException("Invalid scheme: ", url)
-
-    try:
-        ipaddress.ip_address(parsed.netloc)
-        raise ConnectorException("Provided url is an IP address: ", url)
-    except ValueError:
-        # it's not an IP address, which is good
-        pass
-
-
 class Mapping:
    """associate a local database field with a field in an external dataset"""

--- a/bookwyrm/connectors/connector_manager.py
+++ b/bookwyrm/connectors/connector_manager.py
@ -1,10 +1,11 @@
 """ interface with whatever connectors the app has """
-from datetime import datetime
+import asyncio
 import importlib
+import ipaddress
 import logging
-import re
 from urllib.parse import urlparse

+import aiohttp
 from django.dispatch import receiver
 from django.db.models import signals

@ -21,52 +22,42 @@ class ConnectorException(HTTPError):
    """when the connector can't do what was asked"""


+async def async_connector_search(query, connectors, params):
+    """Try a number of requests simultaneously"""
+    timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)
+    async with aiohttp.ClientSession(timeout=timeout) as session:
+        for connector in connectors:
+            url = connector.get_search_url(query)
+            raise_not_valid_url(url)
+
+            async with session.get(url, params=params) as response:
+                print("Status:", response.status)
+                print(response.ok)
+                print("Content-type:", response.headers['content-type'])
+
+                raw_response = await response.json()
+                yield {
+                    "connector": connector,
+                    "results": connector.parse_search_data(raw_response)
+                }
+
+
 def search(query, min_confidence=0.1, return_first=False):
    """find books based on arbitary keywords"""
    if not query:
        return []
    results = []

-    # Have we got a ISBN ?
-    isbn = re.sub(r"[\W_]", "", query)
-    maybe_isbn = len(isbn) in [10, 13]  # ISBN10 or ISBN13

-    start_time = datetime.now()
-    for connector in get_connectors():
-        result_set = None
-        if maybe_isbn and connector.isbn_search_url and connector.isbn_search_url != "":
-            # Search on ISBN
-            try:
-                result_set = connector.isbn_search(isbn)
-            except Exception as err:  # pylint: disable=broad-except
-                logger.info(err)
-                # if this fails, we can still try regular search
+    connectors = list(get_connectors())

-        # if no isbn search results, we fallback to generic search
-        if not result_set:
-            try:
-                result_set = connector.search(query, min_confidence=min_confidence)
-            except Exception as err:  # pylint: disable=broad-except
-                # we don't want *any* error to crash the whole search page
-                logger.info(err)
-                continue
-
-        if return_first and result_set:
-            # if we found anything, return it
-            return result_set[0]
-
-        if result_set:
-            results.append(
-                {
-                    "connector": connector,
-                    "results": result_set,
-                }
-            )
-        if (datetime.now() - start_time).seconds >= SEARCH_TIMEOUT:
-            break
+    # load as many results as we can
+    params = {"min_confidence": min_confidence}
+    results = asyncio.run(async_connector_search(query, connectors, params))

    if return_first:
-        return None
+        # find the best result from all the responses and return that
+        raise Exception("Not implemented yet")

    return results

@ -133,3 +124,20 @@ def create_connector(sender, instance, created, *args, **kwargs):
    """create a connector to an external bookwyrm server"""
    if instance.application_type == "bookwyrm":
        get_or_create_connector(f"https://{instance.server_name}")
+
+
+def raise_not_valid_url(url):
+    """do some basic reality checks on the url"""
+    parsed = urlparse(url)
+    if not parsed.scheme in ["http", "https"]:
+        raise ConnectorException("Invalid scheme: ", url)
+
+    try:
+        ipaddress.ip_address(parsed.netloc)
+        raise ConnectorException("Provided url is an IP address: ", url)
+    except ValueError:
+        # it's not an IP address, which is good
+        pass
+
+    if models.FederatedServer.is_blocked(url):
+        raise ConnectorException(f"Attempting to load data from blocked url: {url}")
--- a/bookwyrm/settings.py
+++ b/bookwyrm/settings.py
@ -212,7 +212,7 @@ STREAMS = [

 # Search configuration
 # total time in seconds that the instance will spend searching connectors
-SEARCH_TIMEOUT = int(env("SEARCH_TIMEOUT", 15))
+SEARCH_TIMEOUT = int(env("SEARCH_TIMEOUT", 8))
 # timeout for a query to an individual connector
 QUERY_TIMEOUT = int(env("QUERY_TIMEOUT", 5))

--- a/requirements.txt
+++ b/requirements.txt
@ -1,3 +1,4 @@
+aiohttp==3.8.1
 celery==5.2.2
 colorthief==0.2.1
 Django==3.2.13