Filter inventaire results by confidence

This commit is contained in:
Mouse Reeve 2022-05-30 16:42:37 -07:00
parent af19d728d2
commit 83ee5a756f
5 changed files with 15 additions and 15 deletions

View file

@ -48,20 +48,18 @@ class AbstractMinimalConnector(ABC):
# searched as free text. This, instead, only searches isbn if it's isbn-y # searched as free text. This, instead, only searches isbn if it's isbn-y
return f"{self.search_url}{query}" return f"{self.search_url}{query}"
def process_search_response(self, query, data): def process_search_response(self, query, data, min_confidence):
"""Format the search results based on the formt of the query""" """Format the search results based on the formt of the query"""
# TODO: inventaire min confidence
parser = self.parse_search_data
if maybe_isbn(query): if maybe_isbn(query):
parser = self.parse_isbn_search_data return list(self.parse_isbn_search_data(data))[:10]
return list(parser(data))[:10] return list(self.parse_search_data(data, min_confidence))[:10]
@abstractmethod @abstractmethod
def get_or_create_book(self, remote_id): def get_or_create_book(self, remote_id):
"""pull up a book record by whatever means possible""" """pull up a book record by whatever means possible"""
@abstractmethod @abstractmethod
def parse_search_data(self, data): def parse_search_data(self, data, min_confidence):
"""turn the result json from a search into a list""" """turn the result json from a search into a list"""
@abstractmethod @abstractmethod

View file

@ -10,7 +10,7 @@ class Connector(AbstractMinimalConnector):
def get_or_create_book(self, remote_id): def get_or_create_book(self, remote_id):
return activitypub.resolve_remote_id(remote_id, model=models.Edition) return activitypub.resolve_remote_id(remote_id, model=models.Edition)
def parse_search_data(self, data): def parse_search_data(self, data, min_confidence):
for search_result in data: for search_result in data:
search_result["connector"] = self search_result["connector"] = self
yield SearchResult(**search_result) yield SearchResult(**search_result)

View file

@ -22,7 +22,7 @@ class ConnectorException(HTTPError):
"""when the connector can't do what was asked""" """when the connector can't do what was asked"""
async def get_results(session, url, params, query, connector): async def get_results(session, url, min_confidence, query, connector):
"""try this specific connector""" """try this specific connector"""
# pylint: disable=line-too-long # pylint: disable=line-too-long
headers = { headers = {
@ -31,6 +31,7 @@ async def get_results(session, url, params, query, connector):
), ),
"User-Agent": USER_AGENT, "User-Agent": USER_AGENT,
} }
params = {"min_confidence": min_confidence}
try: try:
async with session.get(url, headers=headers, params=params) as response: async with session.get(url, headers=headers, params=params) as response:
if not response.ok: if not response.ok:
@ -45,7 +46,7 @@ async def get_results(session, url, params, query, connector):
return { return {
"connector": connector, "connector": connector,
"results": connector.process_search_response(query, raw_data), "results": connector.process_search_response(query, raw_data, min_confidence),
} }
except asyncio.TimeoutError: except asyncio.TimeoutError:
logger.info("Connection timed out for url: %s", url) logger.info("Connection timed out for url: %s", url)
@ -53,7 +54,7 @@ async def get_results(session, url, params, query, connector):
logger.exception(err) logger.exception(err)
async def async_connector_search(query, items, params): async def async_connector_search(query, items, min_confidence):
"""Try a number of requests simultaneously""" """Try a number of requests simultaneously"""
timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT) timeout = aiohttp.ClientTimeout(total=SEARCH_TIMEOUT)
async with aiohttp.ClientSession(timeout=timeout) as session: async with aiohttp.ClientSession(timeout=timeout) as session:
@ -61,7 +62,7 @@ async def async_connector_search(query, items, params):
for url, connector in items: for url, connector in items:
tasks.append( tasks.append(
asyncio.ensure_future( asyncio.ensure_future(
get_results(session, url, params, query, connector) get_results(session, url, min_confidence, query, connector)
) )
) )
@ -87,8 +88,7 @@ def search(query, min_confidence=0.1, return_first=False):
items.append((url, connector)) items.append((url, connector))
# load as many results as we can # load as many results as we can
params = {"min_confidence": min_confidence} results = asyncio.run(async_connector_search(query, items, min_confidence))
results = asyncio.run(async_connector_search(query, items, params))
if return_first: if return_first:
# find the best result from all the responses and return that # find the best result from all the responses and return that

View file

@ -77,13 +77,15 @@ class Connector(AbstractConnector):
**{k: data.get(k) for k in ["uri", "image", "labels", "sitelinks", "type"]}, **{k: data.get(k) for k in ["uri", "image", "labels", "sitelinks", "type"]},
} }
def parse_search_data(self, data): def parse_search_data(self, data, min_confidence):
for search_result in data.get("results", []): for search_result in data.get("results", []):
images = search_result.get("image") images = search_result.get("image")
cover = f"{self.covers_url}/img/entities/{images[0]}" if images else None cover = f"{self.covers_url}/img/entities/{images[0]}" if images else None
# a deeply messy translation of inventaire's scores # a deeply messy translation of inventaire's scores
confidence = float(search_result.get("_score", 0.1)) confidence = float(search_result.get("_score", 0.1))
confidence = 0.1 if confidence < 150 else 0.999 confidence = 0.1 if confidence < 150 else 0.999
if confidence < min_confidence:
continue
yield SearchResult( yield SearchResult(
title=search_result.get("label"), title=search_result.get("label"),
key=self.get_remote_id(search_result.get("uri")), key=self.get_remote_id(search_result.get("uri")),

View file

@ -152,7 +152,7 @@ class Connector(AbstractConnector):
image_name = f"{cover_id}-{size}.jpg" image_name = f"{cover_id}-{size}.jpg"
return f"{self.covers_url}/b/id/{image_name}" return f"{self.covers_url}/b/id/{image_name}"
def parse_search_data(self, data): def parse_search_data(self, data, min_confidence):
for search_result in data.get("docs"): for search_result in data.get("docs"):
# build the remote id from the openlibrary key # build the remote id from the openlibrary key
key = self.books_url + search_result["key"] key = self.books_url + search_result["key"]