From 9c03bf782e14a991cf317055d4f91eeb44718fc2 Mon Sep 17 00:00:00 2001
From: Mouse Reeve <mousereeve@riseup.net>
Date: Mon, 30 May 2022 10:15:22 -0700
Subject: [PATCH] Make an async request to all search connectors

This is the untest first pass at re-arranging remote search to work in
parallel rather than sequence. It moves a couple functions around
(raise_not_valid_url, for example, needs to be in connector_manager.py
now to avoid circular imports). It adds a function to Connector objects
that generates a search result (either to the isbn endpoint or the free
text endpoint) based on the query, which was previously done as part of
the search.

I also lowered the timeout to 8 seconds by default.
---
 bookwyrm/connectors/abstract_connector.py | 34 ++++-----
 bookwyrm/connectors/connector_manager.py  | 84 +++++++++++++----------
 bookwyrm/settings.py                      |  2 +-
 requirements.txt                          |  1 +
 4 files changed, 62 insertions(+), 59 deletions(-)

diff --git a/bookwyrm/connectors/abstract_connector.py b/bookwyrm/connectors/abstract_connector.py
index 56e273886..6685d5a09 100644
--- a/bookwyrm/connectors/abstract_connector.py
+++ b/bookwyrm/connectors/abstract_connector.py
@@ -1,9 +1,8 @@
 """ functionality outline for a book data connector """
 from abc import ABC, abstractmethod
 import imghdr
-import ipaddress
 import logging
-from urllib.parse import urlparse
+import re
 
 from django.core.files.base import ContentFile
 from django.db import transaction
@@ -11,7 +10,7 @@ import requests
 from requests.exceptions import RequestException
 
 from bookwyrm import activitypub, models, settings
-from .connector_manager import load_more_data, ConnectorException
+from .connector_manager import load_more_data, ConnectorException, raise_not_valid_url
 from .format_mappings import format_mappings
 
 
@@ -39,6 +38,18 @@ class AbstractMinimalConnector(ABC):
         for field in self_fields:
             setattr(self, field, getattr(info, field))
 
+    def get_search_url(self, query):
+        """ format the query url """
+        # Check if the query resembles an ISBN
+        isbn = re.sub(r"[\W_]", "", query) # removes filler characters
+        maybe_isbn = len(isbn) in [10, 13]  # ISBN10 or ISBN13
+        if maybe_isbn and self.isbn_search_url and self.isbn_search_url != "":
+            return f"{self.isbn_search_url}{query}"
+
+        # NOTE: previously, we tried searching isbn and if that produces no results,
+        # searched as free text. This, instead, only searches isbn if it's isbn-y
+        return f"{self.search_url}{query}"
+
     def search(self, query, min_confidence=None, timeout=settings.QUERY_TIMEOUT):
         """free text search"""
         params = {}
@@ -254,9 +265,6 @@ def get_data(url, params=None, timeout=10):
     # check if the url is blocked
     raise_not_valid_url(url)
 
-    if models.FederatedServer.is_blocked(url):
-        raise ConnectorException(f"Attempting to load data from blocked url: {url}")
-
     try:
         resp = requests.get(
             url,
@@ -311,20 +319,6 @@ def get_image(url, timeout=10):
     return image_content, extension
 
 
-def raise_not_valid_url(url):
-    """do some basic reality checks on the url"""
-    parsed = urlparse(url)
-    if not parsed.scheme in ["http", "https"]:
-        raise ConnectorException("Invalid scheme: ", url)
-
-    try:
-        ipaddress.ip_address(parsed.netloc)
-        raise ConnectorException("Provided url is an IP address: ", url)
-    except ValueError:
-        # it's not an IP address, which is good
-        pass
-
-
 class Mapping:
     """associate a local database field with a field in an external dataset"""
 
diff --git a/bookwyrm/connectors/connector_manager.py b/bookwyrm/connectors/connector_manager.py
index 14bb702cb..2b5ab1c9d 100644
--- a/bookwyrm/connectors/connector_manager.py
+++ b/bookwyrm/connectors/connector_manager.py
@@ -1,10 +1,11 @@
 """ interface with whatever connectors the app has """
-from datetime import datetime
+import asyncio
 import importlib
+import ipaddress
 import logging
-import re
 from urllib.parse import urlparse
 
+import aiohttp
 from django.dispatch import receiver
 from django.db.models import signals
 
@@ -21,52 +22,42 @@ class ConnectorException(HTTPError):
     """when the connector can't do what was asked"""
 
 
+async def async_connector_search(query, connectors, params):
+    """Try a number of requests simultaneously"""
+    timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)
+    async with aiohttp.ClientSession(timeout=timeout) as session:
+        for connector in connectors:
+            url = connector.get_search_url(query)
+            raise_not_valid_url(url)
+
+            async with session.get(url, params=params) as response:
+                print("Status:", response.status)
+                print(response.ok)
+                print("Content-type:", response.headers['content-type'])
+
+                raw_response = await response.json()
+                yield {
+                    "connector": connector,
+                    "results": connector.parse_search_data(raw_response)
+                }
+
+
 def search(query, min_confidence=0.1, return_first=False):
     """find books based on arbitary keywords"""
     if not query:
         return []
     results = []
 
-    # Have we got a ISBN ?
-    isbn = re.sub(r"[\W_]", "", query)
-    maybe_isbn = len(isbn) in [10, 13]  # ISBN10 or ISBN13
 
-    start_time = datetime.now()
-    for connector in get_connectors():
-        result_set = None
-        if maybe_isbn and connector.isbn_search_url and connector.isbn_search_url != "":
-            # Search on ISBN
-            try:
-                result_set = connector.isbn_search(isbn)
-            except Exception as err:  # pylint: disable=broad-except
-                logger.info(err)
-                # if this fails, we can still try regular search
+    connectors = list(get_connectors())
 
-        # if no isbn search results, we fallback to generic search
-        if not result_set:
-            try:
-                result_set = connector.search(query, min_confidence=min_confidence)
-            except Exception as err:  # pylint: disable=broad-except
-                # we don't want *any* error to crash the whole search page
-                logger.info(err)
-                continue
-
-        if return_first and result_set:
-            # if we found anything, return it
-            return result_set[0]
-
-        if result_set:
-            results.append(
-                {
-                    "connector": connector,
-                    "results": result_set,
-                }
-            )
-        if (datetime.now() - start_time).seconds >= SEARCH_TIMEOUT:
-            break
+    # load as many results as we can
+    params = {"min_confidence": min_confidence}
+    results = asyncio.run(async_connector_search(query, connectors, params))
 
     if return_first:
-        return None
+        # find the best result from all the responses and return that
+        raise Exception("Not implemented yet")
 
     return results
 
@@ -133,3 +124,20 @@ def create_connector(sender, instance, created, *args, **kwargs):
     """create a connector to an external bookwyrm server"""
     if instance.application_type == "bookwyrm":
         get_or_create_connector(f"https://{instance.server_name}")
+
+
+def raise_not_valid_url(url):
+    """do some basic reality checks on the url"""
+    parsed = urlparse(url)
+    if not parsed.scheme in ["http", "https"]:
+        raise ConnectorException("Invalid scheme: ", url)
+
+    try:
+        ipaddress.ip_address(parsed.netloc)
+        raise ConnectorException("Provided url is an IP address: ", url)
+    except ValueError:
+        # it's not an IP address, which is good
+        pass
+
+    if models.FederatedServer.is_blocked(url):
+        raise ConnectorException(f"Attempting to load data from blocked url: {url}")
diff --git a/bookwyrm/settings.py b/bookwyrm/settings.py
index e16c576e1..dc0d71f30 100644
--- a/bookwyrm/settings.py
+++ b/bookwyrm/settings.py
@@ -212,7 +212,7 @@ STREAMS = [
 
 # Search configuration
 # total time in seconds that the instance will spend searching connectors
-SEARCH_TIMEOUT = int(env("SEARCH_TIMEOUT", 15))
+SEARCH_TIMEOUT = int(env("SEARCH_TIMEOUT", 8))
 # timeout for a query to an individual connector
 QUERY_TIMEOUT = int(env("QUERY_TIMEOUT", 5))
 
diff --git a/requirements.txt b/requirements.txt
index 7614dc421..c3bdaf757 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,4 @@
+aiohttp==3.8.1
 celery==5.2.2
 colorthief==0.2.1
 Django==3.2.13