moviewyrm/bookwyrm/connectors/self_connector.py

157 lines
5.1 KiB
Python
Raw Normal View History

2021-03-08 16:49:10 +00:00
""" using a bookwyrm instance as a source of book data """
from functools import reduce
import operator
from django.contrib.postgres.search import SearchRank, SearchVector
from django.db.models import Count, F, Q
2020-03-28 19:55:53 +00:00
from bookwyrm import models
2020-04-29 17:57:20 +00:00
from .abstract_connector import AbstractConnector, SearchResult
2020-03-28 19:55:53 +00:00
class Connector(AbstractConnector):
    """ instantiate a connector """

    # pylint: disable=arguments-differ
    def search(self, query, min_confidence=0.1, raw=False):
        """ search your local database

        Returns a list of at most ten results. With `raw` set the matched
        objects are returned as-is; otherwise they are formatted with
        `format_search_result` and sorted by descending confidence.
        An empty query returns an empty list.
        """
        if not query:
            return []
        # first, try searching unique identifiers
        results = search_identifiers(query)
        if not results:
            # then try searching title/author
            results = search_title_author(query, min_confidence)

        # NOTE(review): the cutoff is applied before the sort below, so which
        # results survive depends on the order `results` is iterated in
        search_results = self._collect_results(results, raw)
        if not raw:
            search_results.sort(key=lambda r: r.confidence, reverse=True)
        return search_results

    def isbn_search(self, query, raw=False):
        """ search your local database by isbn

        Matches editions whose isbn_10 or isbn_13 equals `query` and returns
        at most ten of them, formatted unless `raw` is set.
        An empty query returns an empty list.
        """
        if not query:
            return []
        results = models.Edition.objects.filter(
            Q(isbn_10=query) | Q(isbn_13=query)
        ).distinct()

        # when there are multiple editions of the same work, pick the default.
        # it would be odd for this to happen.
        results = results.filter(parent_work__default_edition__id=F("id")) or results
        return self._collect_results(results, raw)

    def _collect_results(self, results, raw, limit=10):
        """ gather up to `limit` items, formatting them unless `raw` is set """
        collected = []
        for result in results:
            collected.append(result if raw else self.format_search_result(result))
            # stop pulling from `results` as soon as the cap is reached
            if len(collected) >= limit:
                break
        return collected

    def format_search_result(self, search_result):
        """ build a SearchResult from a matched book record """
        published = search_result.published_date
        return SearchResult(
            title=search_result.title,
            key=search_result.remote_id,
            author=search_result.author_text,
            year=published.year if published else None,
            connector=self,
            # only title/author matches carry a rank annotation; anything
            # without one gets a confidence of 1
            confidence=getattr(search_result, "rank", 1),
        )

    def format_isbn_search_result(self, search_result):
        """ isbn matches are formatted exactly like any other match """
        return self.format_search_result(search_result)

    def is_work_data(self, data):
        """ no-op in this connector, always returns None """

    def get_edition_from_work_data(self, data):
        """ no-op in this connector, always returns None """

    def get_work_from_edition_data(self, data):
        """ no-op in this connector, always returns None """

    def get_authors_from_data(self, data):
        """ no-op in this connector, always returns None """
        return None

    def parse_isbn_search_data(self, data):
        """ it's already in the right format, don't even worry about it """
        return data

    def parse_search_data(self, data):
        """ it's already in the right format, don't even worry about it """
        return data

    def expand_book_data(self, book):
        """ no-op in this connector, always returns None """
def search_identifiers(query):
    """ match the query against every Edition field flagged for deduplication

    Returns the matching editions, narrowed to the ones that are their
    work's default edition whenever that narrower set is not empty.
    """
    dedupe_field_names = [
        field.name
        for field in models.Edition._meta.get_fields()
        if getattr(field, "deduplication_field", False)
    ]
    # a hit on any single identifier field is enough
    any_identifier = reduce(
        operator.or_, (Q(**{name: query}) for name in dedupe_field_names)
    )
    matches = models.Edition.objects.filter(any_identifier).distinct()

    # when there are multiple editions of the same work, pick the default.
    # it would be odd for this to happen.
    default_editions = matches.filter(parent_work__default_edition__id=F("id"))
    return default_editions or matches
def search_title_author(query, min_confidence):
    """ searches for title and author

    Generator yielding one edition per matching work. Editions are ranked
    against a weighted search vector (title > subtitle > author name >
    series) and only those ranked above `min_confidence` are considered.
    Works are visited in the order the ranked query returns them (it is
    ordered by descending rank), so a caller that stops early keeps the
    strongest matches.
    """
    vector = (
        SearchVector("title", weight="A")
        + SearchVector("subtitle", weight="B")
        + SearchVector("authors__name", weight="C")
        + SearchVector("series", weight="D")
    )

    results = (
        models.Edition.objects.annotate(search=vector)
        .annotate(rank=SearchRank(vector, query))
        .filter(rank__gt=min_confidence)
        .order_by("-rank")
    )

    # when there are multiple editions of the same work, pick the closest
    editions_of_work = (
        results.values("parent_work")
        .annotate(Count("parent_work"))
        .values_list("parent_work")
    )

    # dict.fromkeys drops repeated works but, unlike set(), keeps the order
    # in which they were first seen
    for work_id in dict.fromkeys(editions_of_work):
        editions = results.filter(parent_work=work_id)
        # fetch each candidate once instead of re-running first()/exists()
        best_match = editions.first()
        if best_match is None:
            # first() gives None on an empty queryset; nothing to yield
            continue
        default = editions.filter(parent_work__default_edition=F("id")).first()

        # if multiple books have the top rank, pick the default edition
        if default is not None and default.rank == best_match.rank:
            yield default
        else:
            yield best_match