mirror of
https://github.com/bookwyrm-social/bookwyrm.git
synced 2025-01-25 16:38:09 +00:00
Make an async request to all search connectors
This is the untest first pass at re-arranging remote search to work in parallel rather than sequence. It moves a couple functions around (raise_not_valid_url, for example, needs to be in connector_manager.py now to avoid circular imports). It adds a function to Connector objects that generates a search result (either to the isbn endpoint or the free text endpoint) based on the query, which was previously done as part of the search. I also lowered the timeout to 8 seconds by default.
This commit is contained in:
parent
4e3c346780
commit
9c03bf782e
4 changed files with 62 additions and 59 deletions
|
@ -1,9 +1,8 @@
|
|||
""" functionality outline for a book data connector """
|
||||
from abc import ABC, abstractmethod
|
||||
import imghdr
|
||||
import ipaddress
|
||||
import logging
|
||||
from urllib.parse import urlparse
|
||||
import re
|
||||
|
||||
from django.core.files.base import ContentFile
|
||||
from django.db import transaction
|
||||
|
@ -11,7 +10,7 @@ import requests
|
|||
from requests.exceptions import RequestException
|
||||
|
||||
from bookwyrm import activitypub, models, settings
|
||||
from .connector_manager import load_more_data, ConnectorException
|
||||
from .connector_manager import load_more_data, ConnectorException, raise_not_valid_url
|
||||
from .format_mappings import format_mappings
|
||||
|
||||
|
||||
|
@ -39,6 +38,18 @@ class AbstractMinimalConnector(ABC):
|
|||
for field in self_fields:
|
||||
setattr(self, field, getattr(info, field))
|
||||
|
||||
def get_search_url(self, query):
|
||||
""" format the query url """
|
||||
# Check if the query resembles an ISBN
|
||||
isbn = re.sub(r"[\W_]", "", query) # removes filler characters
|
||||
maybe_isbn = len(isbn) in [10, 13] # ISBN10 or ISBN13
|
||||
if maybe_isbn and self.isbn_search_url and self.isbn_search_url != "":
|
||||
return f"{self.isbn_search_url}{query}"
|
||||
|
||||
# NOTE: previously, we tried searching isbn and if that produces no results,
|
||||
# searched as free text. This, instead, only searches isbn if it's isbn-y
|
||||
return f"{self.search_url}{query}"
|
||||
|
||||
def search(self, query, min_confidence=None, timeout=settings.QUERY_TIMEOUT):
|
||||
"""free text search"""
|
||||
params = {}
|
||||
|
@ -254,9 +265,6 @@ def get_data(url, params=None, timeout=10):
|
|||
# check if the url is blocked
|
||||
raise_not_valid_url(url)
|
||||
|
||||
if models.FederatedServer.is_blocked(url):
|
||||
raise ConnectorException(f"Attempting to load data from blocked url: {url}")
|
||||
|
||||
try:
|
||||
resp = requests.get(
|
||||
url,
|
||||
|
@ -311,20 +319,6 @@ def get_image(url, timeout=10):
|
|||
return image_content, extension
|
||||
|
||||
|
||||
def raise_not_valid_url(url):
|
||||
"""do some basic reality checks on the url"""
|
||||
parsed = urlparse(url)
|
||||
if not parsed.scheme in ["http", "https"]:
|
||||
raise ConnectorException("Invalid scheme: ", url)
|
||||
|
||||
try:
|
||||
ipaddress.ip_address(parsed.netloc)
|
||||
raise ConnectorException("Provided url is an IP address: ", url)
|
||||
except ValueError:
|
||||
# it's not an IP address, which is good
|
||||
pass
|
||||
|
||||
|
||||
class Mapping:
|
||||
"""associate a local database field with a field in an external dataset"""
|
||||
|
||||
|
|
|
@ -1,10 +1,11 @@
|
|||
""" interface with whatever connectors the app has """
|
||||
from datetime import datetime
|
||||
import asyncio
|
||||
import importlib
|
||||
import ipaddress
|
||||
import logging
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import aiohttp
|
||||
from django.dispatch import receiver
|
||||
from django.db.models import signals
|
||||
|
||||
|
@ -21,52 +22,42 @@ class ConnectorException(HTTPError):
|
|||
"""when the connector can't do what was asked"""
|
||||
|
||||
|
||||
async def async_connector_search(query, connectors, params):
|
||||
"""Try a number of requests simultaneously"""
|
||||
timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)
|
||||
async with aiohttp.ClientSession(timeout=timeout) as session:
|
||||
for connector in connectors:
|
||||
url = connector.get_search_url(query)
|
||||
raise_not_valid_url(url)
|
||||
|
||||
async with session.get(url, params=params) as response:
|
||||
print("Status:", response.status)
|
||||
print(response.ok)
|
||||
print("Content-type:", response.headers['content-type'])
|
||||
|
||||
raw_response = await response.json()
|
||||
yield {
|
||||
"connector": connector,
|
||||
"results": connector.parse_search_data(raw_response)
|
||||
}
|
||||
|
||||
|
||||
def search(query, min_confidence=0.1, return_first=False):
|
||||
"""find books based on arbitary keywords"""
|
||||
if not query:
|
||||
return []
|
||||
results = []
|
||||
|
||||
# Have we got a ISBN ?
|
||||
isbn = re.sub(r"[\W_]", "", query)
|
||||
maybe_isbn = len(isbn) in [10, 13] # ISBN10 or ISBN13
|
||||
|
||||
start_time = datetime.now()
|
||||
for connector in get_connectors():
|
||||
result_set = None
|
||||
if maybe_isbn and connector.isbn_search_url and connector.isbn_search_url != "":
|
||||
# Search on ISBN
|
||||
try:
|
||||
result_set = connector.isbn_search(isbn)
|
||||
except Exception as err: # pylint: disable=broad-except
|
||||
logger.info(err)
|
||||
# if this fails, we can still try regular search
|
||||
connectors = list(get_connectors())
|
||||
|
||||
# if no isbn search results, we fallback to generic search
|
||||
if not result_set:
|
||||
try:
|
||||
result_set = connector.search(query, min_confidence=min_confidence)
|
||||
except Exception as err: # pylint: disable=broad-except
|
||||
# we don't want *any* error to crash the whole search page
|
||||
logger.info(err)
|
||||
continue
|
||||
|
||||
if return_first and result_set:
|
||||
# if we found anything, return it
|
||||
return result_set[0]
|
||||
|
||||
if result_set:
|
||||
results.append(
|
||||
{
|
||||
"connector": connector,
|
||||
"results": result_set,
|
||||
}
|
||||
)
|
||||
if (datetime.now() - start_time).seconds >= SEARCH_TIMEOUT:
|
||||
break
|
||||
# load as many results as we can
|
||||
params = {"min_confidence": min_confidence}
|
||||
results = asyncio.run(async_connector_search(query, connectors, params))
|
||||
|
||||
if return_first:
|
||||
return None
|
||||
# find the best result from all the responses and return that
|
||||
raise Exception("Not implemented yet")
|
||||
|
||||
return results
|
||||
|
||||
|
@ -133,3 +124,20 @@ def create_connector(sender, instance, created, *args, **kwargs):
|
|||
"""create a connector to an external bookwyrm server"""
|
||||
if instance.application_type == "bookwyrm":
|
||||
get_or_create_connector(f"https://{instance.server_name}")
|
||||
|
||||
|
||||
def raise_not_valid_url(url):
|
||||
"""do some basic reality checks on the url"""
|
||||
parsed = urlparse(url)
|
||||
if not parsed.scheme in ["http", "https"]:
|
||||
raise ConnectorException("Invalid scheme: ", url)
|
||||
|
||||
try:
|
||||
ipaddress.ip_address(parsed.netloc)
|
||||
raise ConnectorException("Provided url is an IP address: ", url)
|
||||
except ValueError:
|
||||
# it's not an IP address, which is good
|
||||
pass
|
||||
|
||||
if models.FederatedServer.is_blocked(url):
|
||||
raise ConnectorException(f"Attempting to load data from blocked url: {url}")
|
||||
|
|
|
@ -212,7 +212,7 @@ STREAMS = [
|
|||
|
||||
# Search configuration
|
||||
# total time in seconds that the instance will spend searching connectors
|
||||
SEARCH_TIMEOUT = int(env("SEARCH_TIMEOUT", 15))
|
||||
SEARCH_TIMEOUT = int(env("SEARCH_TIMEOUT", 8))
|
||||
# timeout for a query to an individual connector
|
||||
QUERY_TIMEOUT = int(env("QUERY_TIMEOUT", 5))
|
||||
|
||||
|
|
|
@ -1,3 +1,4 @@
|
|||
aiohttp==3.8.1
|
||||
celery==5.2.2
|
||||
colorthief==0.2.1
|
||||
Django==3.2.13
|
||||
|
|
Loading…
Reference in a new issue