bookwyrm/bookwyrm/book_search.py

193 lines
5.5 KiB
Python
Raw Normal View History

""" using a bookwyrm instance as a source of book data """
from __future__ import annotations
2021-09-16 18:30:04 +00:00
from dataclasses import asdict, dataclass
from functools import reduce
import operator
2023-07-28 18:54:03 +00:00
from typing import Optional, Union, Any, Literal, overload
from django.contrib.postgres.search import SearchRank, SearchQuery
from django.db.models import F, Q
from django.db.models.query import QuerySet
from bookwyrm import models
from bookwyrm import connectors
from bookwyrm.settings import MEDIA_FULL_URL
2023-07-28 18:54:03 +00:00
@overload
def search(
    query: str,
    *,
    min_confidence: float = 0,
    filters: Optional[list[Any]] = None,
    return_first: Literal[False] = False,
    books: Optional[QuerySet[models.Edition]] = None,
) -> QuerySet[models.Edition]:
    ...


@overload
def search(
    query: str,
    *,
    min_confidence: float = 0,
    filters: Optional[list[Any]] = None,
    return_first: Literal[True],
    books: Optional[QuerySet[models.Edition]] = None,
) -> Optional[models.Edition]:
    ...


# pylint: disable=arguments-differ
def search(
    query: str,
    *,
    min_confidence: float = 0,
    filters: Optional[list[Any]] = None,
    return_first: bool = False,
    books: Optional[QuerySet[models.Edition]] = None,
) -> Union[Optional[models.Edition], QuerySet[models.Edition]]:
    """search your local database

    Tries an identifier lookup first (only when the query has no spaces) and
    falls back to a title/author search when that finds nothing.

    query: the text to look for; empty or whitespace-only means "no results"
    min_confidence: lower bound handed to the title/author search
    filters: extra positional filter arguments forwarded to both searches
    return_first: return a single result (or None) instead of a collection
    books: optional set of editions to search within, forwarded as-is
    """
    filters = filters or []
    # strip before the emptiness check so that a query made only of whitespace
    # is treated as empty instead of being searched for as ""
    query = query.strip() if query else ""
    if not query:
        return None if return_first else []

    results = None
    # first, try searching unique identifiers
    # unique identifiers never have spaces, title/author usually do
    if " " not in query:
        results = search_identifiers(
            query, *filters, return_first=return_first, books=books
        )

    # if there were no identifier results...
    if not results:
        # then try searching title/author
        results = search_title_author(
            query, min_confidence, *filters, return_first=return_first, books=books
        )
    return results
2023-12-06 19:36:15 +00:00
def isbn_search(query):
    """find local editions whose isbn_10 or isbn_13 equals the given ISBN"""
    if not query:
        return []

    # an 'X' check digit only matches in upper case, and an ISBN shorter than
    # ten characters is left-padded with zeros to restore a dropped leading 0
    isbn = query.strip().upper().rjust(10, "0")
    either_isbn = Q(isbn_10=isbn) | Q(isbn_13=isbn)
    return models.Edition.objects.filter(either_isbn).distinct()
def format_search_result(search_result):
    """build the serialized SearchResult dict for a book object"""
    # covers are stored relative to the media root; leave None when absent
    cover_url = (
        f"{MEDIA_FULL_URL}{search_result.cover}" if search_result.cover else None
    )
    published = search_result.published_date
    formatted = SearchResult(
        title=search_result.title,
        key=search_result.remote_id,
        author=search_result.author_text,
        year=published.year if published else None,
        cover=cover_url,
        # objects that carry no "rank" attribute get a confidence of 1
        confidence=getattr(search_result, "rank", 1),
        connector="",
    )
    return formatted.json()
def search_identifiers(
    query,
    *filters,
    return_first=False,
    books=None,
) -> Union[Optional[models.Edition], QuerySet[models.Edition]]:
    """search Editions by deduplication fields

    Best for cases when we can assume someone is searching for an exact match on
    commonly unique data identifiers like isbn or specific library ids.

    query: the identifier to match exactly against every deduplication field
    *filters: extra positional arguments passed through to ``.filter()``
    return_first: return only the first match (or None) instead of a queryset
    books: editions to search within; all editions when left as None
    """
    # compare against None rather than truth-testing: bool() on a QuerySet
    # executes it, and an empty candidate set must stay empty instead of
    # silently widening the search to every edition
    if books is None:
        books = models.Edition.objects

    if connectors.maybe_isbn(query):
        # Oh did you think the 'S' in ISBN stood for 'standard'?
        query = query.strip().upper().rjust(10, "0")

    # one exact-match condition per field flagged as a deduplication field
    # pylint: disable=W0212
    identifier_matches = [
        Q(**{field.name: query})
        for field in models.Edition._meta.get_fields()
        if getattr(field, "deduplication_field", False)
    ]
    results = books.filter(
        *filters, reduce(operator.or_, identifier_matches)
    ).distinct()

    if return_first:
        return results.first()
    return results
2023-07-28 18:54:03 +00:00
def search_title_author(
    query,
    min_confidence,
    *filters,
    return_first=False,
    books=None,
) -> Union[Optional[models.Edition], list[models.Edition]]:
    """searches for title and author

    Runs a full-text search against ``search_vector`` and keeps, for each
    parent work among the top-ranked hits, only its best edition.

    query: free text to search for
    min_confidence: results must have a rank strictly greater than this
    *filters: extra positional arguments passed through to ``.filter()``
    return_first: return the single best edition (or None when nothing
        matched) instead of a list
    books: editions to search within; all editions when left as None
    """
    # compare against None rather than truth-testing: bool() on a QuerySet
    # executes it, and an empty candidate set must stay empty instead of
    # silently widening the search to every edition
    if books is None:
        books = models.Edition.objects

    query = SearchQuery(query, config="simple") | SearchQuery(query, config="english")
    results = (
        books.filter(*filters, search_vector=query)
        .annotate(rank=SearchRank(F("search_vector"), query))
        .filter(rank__gt=min_confidence)
        .order_by("-rank")
    )

    # when there are multiple editions of the same work, pick the closest
    editions_of_work = results.values_list("parent_work__id", flat=True).distinct()
    # the ordering column (rank) takes part in the DISTINCT, so a work can
    # still show up once per distinct rank; drop repeats, keeping rank order
    work_ids = dict.fromkeys(editions_of_work[:30])

    # filter out multiple editions of the same work
    list_results = []
    for work_id in work_ids:
        result = (
            results.filter(parent_work=work_id)
            .order_by("-rank", "-edition_rank")
            .first()
        )
        if return_first:
            return result
        list_results.append(result)

    # nothing matched: a single-result request gets None, not an empty list
    return None if return_first else list_results
2021-09-16 18:30:04 +00:00
@dataclass
class SearchResult:
    """uniform shape for a single search hit"""

    title: str
    key: str
    connector: object
    view_link: Optional[str] = None
    author: Optional[str] = None
    year: Optional[str] = None
    cover: Optional[str] = None
    confidence: float = 1.0

    def __repr__(self):
        return (
            f"<SearchResult key={self.key!r} title={self.title!r} "
            f"author={self.author!r} confidence={self.confidence!r}>"
        )

    def json(self):
        """serialize every field except the connector for a json response"""
        return {
            name: value for name, value in asdict(self).items() if name != "connector"
        }