From a413c879637e0210c12bbb98d6d41d580441dbaa Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Sat, 2 Jan 2021 15:48:59 -0800 Subject: [PATCH] Separate search for unique identifiers out from text search --- bookwyrm/connectors/self_connector.py | 91 ++++++++++++++++++--------- 1 file changed, 60 insertions(+), 31 deletions(-) diff --git a/bookwyrm/connectors/self_connector.py b/bookwyrm/connectors/self_connector.py index 957adafad..04fea7350 100644 --- a/bookwyrm/connectors/self_connector.py +++ b/bookwyrm/connectors/self_connector.py @@ -1,6 +1,9 @@ ''' using a bookwyrm instance as a source of book data ''' +from functools import reduce +import operator + from django.contrib.postgres.search import SearchRank, SearchVector -from django.db.models import Count, F +from django.db.models import Count, F, Q from bookwyrm import models from .abstract_connector import AbstractConnector, SearchResult @@ -10,36 +13,14 @@ class Connector(AbstractConnector): ''' instantiate a connector ''' def search(self, query, min_confidence=0.1): ''' search your local database ''' - vector = SearchVector('title', weight='A') +\ - SearchVector('subtitle', weight='B') +\ - SearchVector('authors__name', weight='C') - - results = models.Edition.objects.annotate( - search=vector - ).annotate( - rank=SearchRank(vector, query) - ).filter( - rank__gt=min_confidence - ).order_by('-rank') - - # when there are multiple editions of the same work, pick the closest - editions_of_work = results.values( - 'parent_work' - ).annotate( - Count('parent_work') - ).values_list('parent_work') - + # first, try searching unqiue identifiers + results = search_identifiers(query) + if not results: + # then try searching title/author + results = search_title_author(query, min_confidence) search_results = [] - for work_id in set(editions_of_work): - editions = results.filter(parent_work=work_id) - default = editions.filter(parent_work__default_edition=F('id')) - default_rank = default.first().rank if default.exists() else 0 - # if mutliple books have the top rank, pick the default edition - if default_rank == editions.first().rank: - selected = default.first() - else: - selected = editions.first() - search_results.append(self.format_search_result(selected)) + for result in results: + search_results.append(self.format_search_result(result)) if len(search_results) >= 10: break return search_results @@ -53,7 +34,8 @@ class Connector(AbstractConnector): year=search_result.published_date.year if \ search_result.published_date else None, connector=self, - confidence=search_result.rank, + confidence=search_result.rank if \ + hasattr(search_result, 'rank') else 1, ) @@ -75,3 +57,50 @@ class Connector(AbstractConnector): def expand_book_data(self, book): pass + + +def search_identifiers(query): + ''' tries remote_id, isbn; defined as dedupe fields on the model ''' + filters = [{f.name: query} for f in models.Edition._meta.get_fields() \ + if hasattr(f, 'deduplication_field') and f.deduplication_field] + results = models.Edition.objects.filter( + reduce(operator.or_, (Q(**f) for f in filters)) + ).distinct() + + # when there are multiple editions of the same work, pick the default. + # it would be odd for this to happen. + return results.filter(parent_work__default_edition__id=F('id')) \ + or results + + +def search_title_author(query, min_confidence): + ''' searches for title and author ''' + print('DON"T BOTHER') + vector = SearchVector('title', weight='A') +\ + SearchVector('subtitle', weight='B') +\ + SearchVector('authors__name', weight='C') + + results = models.Edition.objects.annotate( + search=vector + ).annotate( + rank=SearchRank(vector, query) + ).filter( + rank__gt=min_confidence + ).order_by('-rank') + + # when there are multiple editions of the same work, pick the closest + editions_of_work = results.values( + 'parent_work' + ).annotate( + Count('parent_work') + ).values_list('parent_work') + + for work_id in set(editions_of_work): + editions = results.filter(parent_work=work_id) + default = editions.filter(parent_work__default_edition=F('id')) + default_rank = default.first().rank if default.exists() else 0 + # if mutliple books have the top rank, pick the default edition + if default_rank == editions.first().rank: + yield default.first() + else: + yield editions.first()