moviewyrm/bookwyrm/book_search.py
Mouse Reeve a053f20961 Re-implements return first option
Since we get all the results quickly now, this aggregates all the
results that came back and sorts them by confidence, and returns the
highest confidence result. The confidences aren't great on free text
search, but conceptually that's how it should work at least.

It may make sense to aggregate the search results in all contexts, but
I'll propose that in a separate PR.
2022-05-31 08:20:59 -07:00

160 lines
5 KiB
Python

""" using a bookwyrm instance as a source of book data """
from dataclasses import asdict, dataclass
from functools import reduce
import operator
from django.contrib.postgres.search import SearchRank, SearchQuery
from django.db.models import OuterRef, Subquery, F, Q
from bookwyrm import models
from bookwyrm.settings import MEDIA_FULL_URL
# pylint: disable=arguments-differ
def search(query, min_confidence=0, filters=None, return_first=False):
"""search your local database"""
filters = filters or []
if not query:
return []
# first, try searching unqiue identifiers
results = search_identifiers(query, *filters, return_first=return_first)
if not results:
# then try searching title/author
results = search_title_author(
query, min_confidence, *filters, return_first=return_first
)
return results
def isbn_search(query):
"""search your local database"""
if not query:
return []
filters = [{f: query} for f in ["isbn_10", "isbn_13"]]
results = models.Edition.objects.filter(
reduce(operator.or_, (Q(**f) for f in filters))
).distinct()
# when there are multiple editions of the same work, pick the default.
# it would be odd for this to happen.
default_editions = models.Edition.objects.filter(
parent_work=OuterRef("parent_work")
).order_by("-edition_rank")
results = (
results.annotate(default_id=Subquery(default_editions.values("id")[:1])).filter(
default_id=F("id")
)
or results
)
return results
def format_search_result(search_result):
"""convert a book object into a search result object"""
cover = None
if search_result.cover:
cover = f"{MEDIA_FULL_URL}{search_result.cover}"
return SearchResult(
title=search_result.title,
key=search_result.remote_id,
author=search_result.author_text,
year=search_result.published_date.year
if search_result.published_date
else None,
cover=cover,
confidence=search_result.rank if hasattr(search_result, "rank") else 1,
connector="",
).json()
def search_identifiers(query, *filters, return_first=False):
"""tries remote_id, isbn; defined as dedupe fields on the model"""
# pylint: disable=W0212
or_filters = [
{f.name: query}
for f in models.Edition._meta.get_fields()
if hasattr(f, "deduplication_field") and f.deduplication_field
]
results = models.Edition.objects.filter(
*filters, reduce(operator.or_, (Q(**f) for f in or_filters))
).distinct()
if results.count() <= 1:
if return_first:
return results.first()
return results
# when there are multiple editions of the same work, pick the default.
# it would be odd for this to happen.
default_editions = models.Edition.objects.filter(
parent_work=OuterRef("parent_work")
).order_by("-edition_rank")
results = (
results.annotate(default_id=Subquery(default_editions.values("id")[:1])).filter(
default_id=F("id")
)
or results
)
if return_first:
return results.first()
return results
def search_title_author(query, min_confidence, *filters, return_first=False):
"""searches for title and author"""
query = SearchQuery(query, config="simple") | SearchQuery(query, config="english")
results = (
models.Edition.objects.filter(*filters, search_vector=query)
.annotate(rank=SearchRank(F("search_vector"), query))
.filter(rank__gt=min_confidence)
.order_by("-rank")
)
# when there are multiple editions of the same work, pick the closest
editions_of_work = results.values("parent_work__id").values_list("parent_work__id")
# filter out multiple editions of the same work
list_results = []
for work_id in set(editions_of_work):
editions = results.filter(parent_work=work_id)
default = editions.order_by("-edition_rank").first()
default_rank = default.rank if default else 0
# if mutliple books have the top rank, pick the default edition
if default_rank == editions.first().rank:
result = default
else:
result = editions.first()
if return_first:
return result
list_results.append(result)
return list_results
@dataclass
class SearchResult:
"""standardized search result object"""
title: str
key: str
connector: object
view_link: str = None
author: str = None
year: str = None
cover: str = None
confidence: int = 1
def __repr__(self):
# pylint: disable=consider-using-f-string
return "<SearchResult key={!r} title={!r} author={!r} confidence={!r}>".format(
self.key, self.title, self.author, self.confidence
)
def json(self):
"""serialize a connector for json response"""
serialized = asdict(self)
del serialized["connector"]
return serialized