bookwyrm/bookwyrm/book_search.py

""" using a bookwyrm instance as a source of book data """
from __future__ import annotations
from dataclasses import asdict, dataclass
from functools import reduce
import operator
from typing import Optional, Union, Any, Literal, overload

from django.contrib.postgres.search import SearchRank, SearchQuery
from django.db.models import F, Q
from django.db.models.query import QuerySet

from bookwyrm import models
from bookwyrm import connectors
from bookwyrm.settings import MEDIA_FULL_URL


@overload
def search(
    query: str,
    *,
    min_confidence: float = 0,
    filters: Optional[list[Any]] = None,
    return_first: Literal[False] = False,
    books: Optional[QuerySet[models.Edition]] = None,
) -> QuerySet[models.Edition]:
    ...


@overload
def search(
    query: str,
    *,
    min_confidence: float = 0,
    filters: Optional[list[Any]] = None,
    return_first: Literal[True],
    books: Optional[QuerySet[models.Edition]] = None,
) -> Optional[models.Edition]:
    ...


# pylint: disable=arguments-differ
def search(
    query: str,
    *,
    min_confidence: float = 0,
    filters: Optional[list[Any]] = None,
    return_first: bool = False,
    books: Optional[QuerySet[models.Edition]] = None,
) -> Union[Optional[models.Edition], QuerySet[models.Edition]]:
    """search your local database

    Tries an exact identifier lookup first (only when the query contains no
    space), and falls back to a title/author search when that lookup was
    skipped or came back empty.

    Args:
        query: the raw search string; surrounding whitespace is ignored.
        min_confidence: lower bound passed on to the title/author search.
        filters: extra positional filter arguments forwarded to both searches.
        return_first: when true, return a single result (or ``None``) instead
            of a collection.
        books: optional starting set of editions, forwarded to both searches.

    Returns:
        ``None`` (``return_first``) or ``[]`` for an empty or blank query,
        otherwise whatever the identifier or title/author search returned.
    """
    filters = filters or []
    # strip *before* the emptiness check so that a whitespace-only query is
    # treated exactly like an empty one instead of hitting the database
    query = query.strip() if query else ""
    if not query:
        return None if return_first else []

    results = None
    # first, try searching unique identifiers
    # unique identifiers never have spaces, title/author usually do
    if " " not in query:
        results = search_identifiers(
            query, *filters, return_first=return_first, books=books
        )

    # if there were no identifier results...
    if not results:
        # then try searching title/author
        results = search_title_author(
            query, min_confidence, *filters, return_first=return_first, books=books
        )
    return results


def isbn_search(query):
    """search your local database for editions by ISBN

    Returns an empty list for an empty or whitespace-only query; otherwise a
    distinct queryset of editions whose ``isbn_10`` or ``isbn_13`` equals the
    normalised query.
    """
    # Strip before the emptiness check: otherwise a blank query would be
    # zero-padded below and searched for as "0000000000"
    query = query.strip() if query else ""
    if not query:
        return []
    # Up-case the ISBN string to ensure any 'X' check-digit is correct
    # If the ISBN has only 9 characters, prepend missing zero
    # (rjust left-pads any value shorter than 10 characters with zeros)
    query = query.upper().rjust(10, "0")
    return models.Edition.objects.filter(
        Q(isbn_10=query) | Q(isbn_13=query)
    ).distinct()


def format_search_result(search_result):
    """build the serialized SearchResult dict for a book object

    The cover is prefixed with MEDIA_FULL_URL when the book has one, the year
    is taken from ``published_date`` when it is set, and the confidence is the
    object's ``rank`` attribute if it has one (1 otherwise).
    """
    cover_url = (
        f"{MEDIA_FULL_URL}{search_result.cover}" if search_result.cover else None
    )

    published = search_result.published_date
    publication_year = published.year if published else None

    formatted = SearchResult(
        title=search_result.title,
        key=search_result.remote_id,
        author=search_result.author_text,
        year=publication_year,
        cover=cover_url,
        # objects that were not annotated with a rank get full confidence
        confidence=getattr(search_result, "rank", 1),
        connector="",
    )
    return formatted.json()


def search_identifiers(
    query,
    *filters,
    return_first=False,
    books=None,
) -> Union[Optional[models.Edition], QuerySet[models.Edition]]:
    """search Editions by deduplication fields

    Best for cases when we can assume someone is searching for an exact match on
    commonly unique data identifiers like isbn or specific library ids.

    Args:
        query: the identifier to look for; normalised first if it looks like
            an ISBN according to ``connectors.maybe_isbn``.
        *filters: extra positional arguments passed to ``.filter()``.
        return_first: return only ``results.first()`` instead of the queryset.
        books: optional queryset/manager to search in; defaults to all
            editions when ``None``.
    """
    # Compare against None explicitly: `books or ...` would take the
    # truthiness of the queryset, which both runs the query and silently
    # widens an *empty* caller-supplied queryset to every Edition.
    if books is None:
        books = models.Edition.objects
    if connectors.maybe_isbn(query):
        # Oh did you think the 'S' in ISBN stood for 'standard'?
        # upper-case any 'x' check digit and left-pad short values with zeros
        query = query.strip().upper().rjust(10, "0")
    # pylint: disable=W0212
    or_filters = [
        Q(**{field.name: query})
        for field in models.Edition._meta.get_fields()
        if getattr(field, "deduplication_field", False)
    ]
    # match an edition when any one of its deduplication fields equals query
    results = books.filter(*filters, reduce(operator.or_, or_filters)).distinct()

    if return_first:
        return results.first()
    return results


def search_title_author(
    query,
    min_confidence,
    *filters,
    return_first=False,
    books=None,
) -> QuerySet[models.Edition]:
    """searches for title and author

    Runs a full-text search against ``search_vector`` (using both the
    "simple" and "english" configurations), keeps matches ranked above
    ``min_confidence``, and then picks one edition per parent work.

    Args:
        query: free-text search string.
        min_confidence: results must have a rank strictly greater than this.
        *filters: extra positional arguments passed to ``.filter()``.
        return_first: return only the best edition of the first work, or
            ``None`` when nothing matched.
        books: optional queryset/manager to search in; defaults to all
            editions when ``None``.

    Returns:
        With ``return_first`` a single edition or ``None``; otherwise a list
        holding one edition for each of at most 30 parent works.
        NOTE(review): the return annotation says QuerySet, but a plain list
        is what is actually built here.
    """
    # Compare against None explicitly: `books or ...` would take the
    # truthiness of the queryset, which both runs the query and silently
    # widens an *empty* caller-supplied queryset to every Edition.
    if books is None:
        books = models.Edition.objects
    query = SearchQuery(query, config="simple") | SearchQuery(query, config="english")
    results = (
        books.filter(*filters, search_vector=query)
        .annotate(rank=SearchRank(F("search_vector"), query))
        .filter(rank__gt=min_confidence)
        .order_by("-rank")
    )

    # when there are multiple editions of the same work, pick the closest
    editions_of_work = results.values_list("parent_work__id", flat=True).distinct()

    # filter out multiple editions of the same work
    # (hard cap of 30 works; each iteration issues one more query)
    list_results = []
    for work_id in editions_of_work[:30]:
        result = (
            results.filter(parent_work=work_id)
            .order_by("-rank", "-edition_rank")
            .first()
        )

        if return_first:
            return result
        list_results.append(result)

    # nothing matched: a "first result" request gets None rather than an
    # empty list, in line with the identifier search
    if return_first:
        return None
    return list_results


@dataclass
class SearchResult:
    """standardized search result object"""

    # required fields
    title: str
    key: str
    connector: object
    # optional fields
    view_link: Optional[str] = None
    author: Optional[str] = None
    year: Optional[str] = None
    cover: Optional[str] = None
    confidence: float = 1.0

    def __repr__(self):
        """short debug representation: key, title, author and confidence"""
        template = "<SearchResult key={!r} title={!r} author={!r} confidence={!r}>"
        # pylint: disable=consider-using-f-string
        return template.format(self.key, self.title, self.author, self.confidence)

    def json(self):
        """return the fields as a dict, leaving out the connector"""
        payload = asdict(self)
        payload.pop("connector")
        return payload
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`""" using a bookwyrm instance as a source of book data """`
Type annotations and related changes for bookwyrm.connectors 2023-07-28 15:43:32 +00:00			`from __future__ import annotations`
Fixes circular import 2021-09-16 18:30:04 +00:00			`from dataclasses import asdict, dataclass`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`from functools import reduce`
			`import operator`
Some small improvements to annotations 2023-07-28 18:54:03 +00:00			`from typing import Optional, Union, Any, Literal, overload`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00
			`from django.contrib.postgres.search import SearchRank, SearchQuery`
Simplify identifier searches This commit removes code that deduplicated search results for identifier searches. If it was the case that multiple books have the same identifier, in theory this would produce better search results, but in practice this doesn't happen very much, is probably worth seeing when it does, and worsens the performance of identifier search overall. 2022-11-16 03:39:57 +00:00			`from django.db.models import F, Q`
Type annotations and related changes for bookwyrm.connectors 2023-07-28 15:43:32 +00:00			`from django.db.models.query import QuerySet`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00
			`from bookwyrm import models`
normalise isbn on local book search - uppercase ISBN before checking it's a number to account for trailing 'x' - check maybe_isbn for search_identifiers search. Without this we are only searching external connectors, not locally! 2022-08-30 10:00:09 +00:00			`from bookwyrm import connectors`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`from bookwyrm.settings import MEDIA_FULL_URL`


Some small improvements to annotations 2023-07-28 18:54:03 +00:00			`@overload`
			`def search(`
			`query: str,`
			`*,`
			`min_confidence: float = 0,`
			`filters: Optional[list[Any]] = None,`
			`return_first: Literal[False],`
			`) -> QuerySet[models.Edition]:`
			`...`


			`@overload`
			`def search(`
			`query: str,`
			`*,`
			`min_confidence: float = 0,`
			`filters: Optional[list[Any]] = None,`
			`return_first: Literal[True],`
			`) -> Optional[models.Edition]:`
			`...`


Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`# pylint: disable=arguments-differ`
Type annotations and related changes for bookwyrm.connectors 2023-07-28 15:43:32 +00:00			`def search(`
			`query: str,`
Some small improvements to annotations 2023-07-28 18:54:03 +00:00			`*,`
Type annotations and related changes for bookwyrm.connectors 2023-07-28 15:43:32 +00:00			`min_confidence: float = 0,`
			`filters: Optional[list[Any]] = None,`
			`return_first: bool = False,`
Fixes formatting 2023-12-06 19:36:15 +00:00			`books: Optional[QuerySet[models.Edition]] = None,`
Some small improvements to annotations 2023-07-28 18:54:03 +00:00			`) -> Union[Optional[models.Edition], QuerySet[models.Edition]]:`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`"""search your local database"""`
			`filters = filters or []`
			`if not query:`
Some small improvements to annotations 2023-07-28 18:54:03 +00:00			`return None if return_first else []`
Remove trailing whitespace from queries 2022-11-16 04:09:00 +00:00			`query = query.strip()`

Simplify identifier searches This commit removes code that deduplicated search results for identifier searches. If it was the case that multiple books have the same identifier, in theory this would produce better search results, but in practice this doesn't happen very much, is probably worth seeing when it does, and worsens the performance of identifier search overall. 2022-11-16 03:39:57 +00:00			`results = None`
{list} Fix duplicate suggestions in Add Books section Fixes #2584 (Also fix a spelling mistake in a comment in book_search.py) 2023-01-10 02:40:40 +00:00			`# first, try searching unique identifiers`
Simplify identifier searches This commit removes code that deduplicated search results for identifier searches. If it was the case that multiple books have the same identifier, in theory this would produce better search results, but in practice this doesn't happen very much, is probably worth seeing when it does, and worsens the performance of identifier search overall. 2022-11-16 03:39:57 +00:00			`# unique identifiers never have spaces, title/author usually do`
			`if not " " in query:`
Fixes formatting 2023-12-06 19:36:15 +00:00			`results = search_identifiers(`
			`query, *filters, return_first=return_first, books=books`
			`)`
Simplify identifier searches This commit removes code that deduplicated search results for identifier searches. If it was the case that multiple books have the same identifier, in theory this would produce better search results, but in practice this doesn't happen very much, is probably worth seeing when it does, and worsens the performance of identifier search overall. 2022-11-16 03:39:57 +00:00
			`# if there were no identifier results...`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`if not results:`
			`# then try searching title/author`
Updates first_search_result functionality 2021-09-16 18:07:36 +00:00			`results = search_title_author(`
Checkpoint 2023-11-27 23:03:59 +00:00			`query, min_confidence, *filters, return_first=return_first, books=books`
Updates first_search_result functionality 2021-09-16 18:07:36 +00:00			`)`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`return results`

Fixes formatting 2023-12-06 19:36:15 +00:00
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`def isbn_search(query):`
			`"""search your local database"""`
			`if not query:`
			`return []`
normalise isbn searching ISBNs are always numeric except for when the check digit in ISBN-10s is a ten, indicated with a capital X. These changes ensure that ISBNs are always upper-case so that a lower-case 'x' is not used when searching. Additionally some ancient ISBNs have been printed without a leading zero (i.e. they only have 9 characters on the physical book). This change prepends a zero if something looks like an ISBN but only has 9 chars. 2022-08-28 01:05:40 +00:00			`# Up-case the ISBN string to ensure any 'X' check-digit is correct`
			`# If the ISBN has only 9 characters, prepend missing zero`
linting 2022-08-28 07:30:46 +00:00			`query = query.strip().upper().rjust(10, "0")`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`filters = [{f: query} for f in ["isbn_10", "isbn_13"]]`
Simplify identifier searches This commit removes code that deduplicated search results for identifier searches. If it was the case that multiple books have the same identifier, in theory this would produce better search results, but in practice this doesn't happen very much, is probably worth seeing when it does, and worsens the performance of identifier search overall. 2022-11-16 03:39:57 +00:00			`return models.Edition.objects.filter(`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`reduce(operator.or_, (Q(**f) for f in filters))`
			`).distinct()`


			`def format_search_result(search_result):`
			`"""convert a book object into a search result object"""`
			`cover = None`
			`if search_result.cover:`
			`cover = f"{MEDIA_FULL_URL}{search_result.cover}"`

			`return SearchResult(`
			`title=search_result.title,`
			`key=search_result.remote_id,`
			`author=search_result.author_text,`
			`year=search_result.published_date.year`
			`if search_result.published_date`
			`else None,`
			`cover=cover,`
			`confidence=search_result.rank if hasattr(search_result, "rank") else 1,`
			`connector="",`
			`).json()`


Type annotations and related changes for bookwyrm.connectors 2023-07-28 15:43:32 +00:00			`def search_identifiers(`
Fixes formatting 2023-12-06 19:36:15 +00:00			`query,`
			`*filters,`
			`return_first=False,`
			`books=None,`
Type annotations and related changes for bookwyrm.connectors 2023-07-28 15:43:32 +00:00			`) -> Union[Optional[models.Edition], QuerySet[models.Edition]]:`
Fixes formatting 2023-12-06 19:36:15 +00:00			`"""search Editions by deduplication fields`

			`Best for cases when we can assume someone is searching for an exact match on`
			`commonly unique data identifiers like isbn or specific library ids.`
			`"""`
Fixes result set passed to template 2023-12-06 00:36:58 +00:00			`books = books or models.Edition.objects`
normalise isbn on local book search - uppercase ISBN before checking it's a number to account for trailing 'x' - check maybe_isbn for search_identifiers search. Without this we are only searching external connectors, not locally! 2022-08-30 10:00:09 +00:00			`if connectors.maybe_isbn(query):`
			`# Oh did you think the 'S' in ISBN stood for 'standard'?`
			`normalized_isbn = query.strip().upper().rjust(10, "0")`
			`query = normalized_isbn`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`# pylint: disable=W0212`
			`or_filters = [`
			`{f.name: query}`
			`for f in models.Edition._meta.get_fields()`
			`if hasattr(f, "deduplication_field") and f.deduplication_field`
			`]`
Checkpoint 2023-11-27 23:03:59 +00:00			`results = books.filter(`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`*filters, reduce(operator.or_, (Q(**f) for f in or_filters))`
			`).distinct()`
Simplify identifier searches This commit removes code that deduplicated search results for identifier searches. If it was the case that multiple books have the same identifier, in theory this would produce better search results, but in practice this doesn't happen very much, is probably worth seeing when it does, and worsens the performance of identifier search overall. 2022-11-16 03:39:57 +00:00
Updates first_search_result functionality 2021-09-16 18:07:36 +00:00			`if return_first:`
			`return results.first()`
			`return results`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00

Some small improvements to annotations 2023-07-28 18:54:03 +00:00			`def search_title_author(`
Checkpoint 2023-11-27 23:03:59 +00:00			`query,`
			`min_confidence,`
			`*filters,`
			`return_first=False,`
			`books=None,`
Some small improvements to annotations 2023-07-28 18:54:03 +00:00			`) -> QuerySet[models.Edition]:`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`"""searches for title and author"""`
Checkpoint 2023-11-27 23:03:59 +00:00			`books = books or models.Edition.objects`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`query = SearchQuery(query, config="simple") | SearchQuery(query, config="english")`
			`results = (`
Checkpoint 2023-11-27 23:03:59 +00:00			`books.filter(*filters, search_vector=query)`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00			`.annotate(rank=SearchRank(F("search_vector"), query))`
			`.filter(rank__gt=min_confidence)`
			`.order_by("-rank")`
			`)`

			`# when there are multiple editions of the same work, pick the closest`
Simplify how default edition is checked This logic is still totally bonkers, but this change puts a hard limit on how many iterations the loop can go through and makes the query that selects which edition to display a little simpler. 2022-11-16 04:05:41 +00:00			`editions_of_work = results.values_list("parent_work__id", flat=True).distinct()`
Display search results in api mode and regular 2021-09-16 17:44:33 +00:00
			`# filter out multiple editions of the same work`
Fixes formatting isbn endpoint results 2021-09-16 19:52:10 +00:00			`list_results = []`
Do not create a set for already-distinct query result 2023-11-23 20:05:18 +00:00			`for work_id in editions_of_work[:30]:`
Python formatting 2022-11-16 04:08:17 +00:00			`result = (`
			`results.filter(parent_work=work_id)`
			`.order_by("-rank", "-edition_rank")`
			`.first()`
			`)`
Fixes first_search_result behavior 2021-11-12 21:48:31 +00:00
Updates first_search_result functionality 2021-09-16 18:07:36 +00:00			`if return_first:`
title author search already working correctly with return first 2021-11-12 21:59:54 +00:00			`return result`
Fixes formatting isbn endpoint results 2021-09-16 19:52:10 +00:00			`list_results.append(result)`
			`return list_results`
Fixes circular import 2021-09-16 18:30:04 +00:00

			`@dataclass`
			`class SearchResult:`
			`"""standardized search result object"""`

			`title: str`
			`key: str`
			`connector: object`
Type annotations and related changes for bookwyrm.connectors 2023-07-28 15:43:32 +00:00			`view_link: Optional[str] = None`
			`author: Optional[str] = None`
			`year: Optional[str] = None`
			`cover: Optional[str] = None`
			`confidence: float = 1.0`
Fixes circular import 2021-09-16 18:30:04 +00:00
			`def __repr__(self):`
Updates migration 2021-09-30 17:47:53 +00:00			`# pylint: disable=consider-using-f-string`
Re-implements return first option Since we get all the results quickly now, this aggregates all the results that came back and sorts them by confidence, and returns the highest confidence result. The confidences aren't great on free text search, but conceptually that's how it should work at least. It may make sense to aggregate the search results in all contexts, but I'll propose that in a separate PR. 2022-05-31 15:20:59 +00:00			`return "<SearchResult key={!r} title={!r} author={!r} confidence={!r}>".format(`
			`self.key, self.title, self.author, self.confidence`
Fixes circular import 2021-09-16 18:30:04 +00:00			`)`

			`def json(self):`
			`"""serialize a connector for json response"""`
			`serialized = asdict(self)`
			`del serialized["connector"]`
			`return serialized`