moviewyrm/bookwyrm/connectors/self_connector.py
Mouse Reeve 7c15fbbb0b More efficient search index
Co-authored-by: asmr-hex <0.0@asmr.software>
2021-06-23 16:54:59 -07:00

162 lines
5.3 KiB
Python

""" using a bookwyrm instance as a source of book data """
from functools import reduce
import operator
from django.contrib.postgres.search import SearchRank
from django.db.models import OuterRef, Subquery, F, Q
from bookwyrm import models
from .abstract_connector import AbstractConnector, SearchResult
class Connector(AbstractConnector):
"""instantiate a connector"""
# pylint: disable=arguments-differ
def search(self, query, min_confidence=0.1, raw=False, filters=None):
"""search your local database"""
filters = filters or []
if not query:
return []
# first, try searching unqiue identifiers
results = search_identifiers(query, *filters)
if not results:
# then try searching title/author
results = search_title_author(query, min_confidence, *filters)
search_results = []
for result in results:
if raw:
search_results.append(result)
else:
search_results.append(self.format_search_result(result))
if len(search_results) >= 10:
break
if not raw:
search_results.sort(key=lambda r: r.confidence, reverse=True)
return search_results
def isbn_search(self, query, raw=False):
"""search your local database"""
if not query:
return []
filters = [{f: query} for f in ["isbn_10", "isbn_13"]]
results = models.Edition.objects.filter(
reduce(operator.or_, (Q(**f) for f in filters))
).distinct()
# when there are multiple editions of the same work, pick the default.
# it would be odd for this to happen.
default_editions = models.Edition.objects.filter(
parent_work=OuterRef("parent_work")
).order_by("-edition_rank")
results = (
results.annotate(
default_id=Subquery(default_editions.values("id")[:1])
).filter(default_id=F("id"))
or results
)
search_results = []
for result in results:
if raw:
search_results.append(result)
else:
search_results.append(self.format_search_result(result))
if len(search_results) >= 10:
break
return search_results
def format_search_result(self, search_result):
cover = None
if search_result.cover:
cover = "%s%s" % (self.covers_url, search_result.cover)
return SearchResult(
title=search_result.title,
key=search_result.remote_id,
author=search_result.author_text,
year=search_result.published_date.year
if search_result.published_date
else None,
connector=self,
cover=cover,
confidence=search_result.rank if hasattr(search_result, "rank") else 1,
)
def format_isbn_search_result(self, search_result):
return self.format_search_result(search_result)
def is_work_data(self, data):
pass
def get_edition_from_work_data(self, data):
pass
def get_work_from_edition_data(self, data):
pass
def get_authors_from_data(self, data):
return None
def parse_isbn_search_data(self, data):
"""it's already in the right format, don't even worry about it"""
return data
def parse_search_data(self, data):
"""it's already in the right format, don't even worry about it"""
return data
def expand_book_data(self, book):
pass
def search_identifiers(query, *filters):
"""tries remote_id, isbn; defined as dedupe fields on the model"""
# pylint: disable=W0212
or_filters = [
{f.name: query}
for f in models.Edition._meta.get_fields()
if hasattr(f, "deduplication_field") and f.deduplication_field
]
results = models.Edition.objects.filter(
*filters, reduce(operator.or_, (Q(**f) for f in or_filters))
).distinct()
if results.count() <= 1:
return results
# when there are multiple editions of the same work, pick the default.
# it would be odd for this to happen.
default_editions = models.Edition.objects.filter(
parent_work=OuterRef("parent_work")
).order_by("-edition_rank")
return (
results.annotate(default_id=Subquery(default_editions.values("id")[:1])).filter(
default_id=F("id")
)
or results
)
def search_title_author(query, min_confidence, *filters):
"""searches for title and author"""
results = (
models.Edition.objects.annotate(rank=SearchRank("search_vector", query))
.filter(*filters, search_vector=query, rank__gt=min_confidence)
.order_by("-rank")
)
# when there are multiple editions of the same work, pick the closest
editions_of_work = results.values("parent_work__id").values_list("parent_work__id")
# filter out multiple editions of the same work
for work_id in set(editions_of_work):
editions = results.filter(parent_work=work_id)
default = editions.order_by("-edition_rank").first()
default_rank = default.rank if default else 0
# if mutliple books have the top rank, pick the default edition
if default_rank == editions.first().rank:
yield default
else:
yield editions.first()