Separate search for unique identifiers out from text search

2021-01-02 15:48:59 -08:00 · 2021-01-02 15:48:59 -08:00 · a413c87963
commit a413c87963
parent a2e8cf1993
1 changed files with 60 additions and 31 deletions
--- a/bookwyrm/connectors/self_connector.py
+++ b/bookwyrm/connectors/self_connector.py
@ -1,6 +1,9 @@
 ''' using a bookwyrm instance as a source of book data '''
 from functools import reduce
 import operator
 from django.contrib.postgres.search import SearchRank, SearchVector
-from django.db.models import Count, F
+from django.db.models import Count, F, Q
 from bookwyrm import models
 from .abstract_connector import AbstractConnector, SearchResult
@ -10,36 +13,14 @@ class Connector(AbstractConnector):
    ''' instantiate a connector  '''
    def search(self, query, min_confidence=0.1):
        ''' search your local database '''
-        vector = SearchVector('title', weight='A') +\
+        # first, try searching unqiue identifiers
-            SearchVector('subtitle', weight='B') +\
+        results = search_identifiers(query)
-            SearchVector('authors__name', weight='C')
+        if not results:
-
+            # then try searching title/author
-        results = models.Edition.objects.annotate(
+            results = search_title_author(query, min_confidence)
            search=vector
        ).annotate(
            rank=SearchRank(vector, query)
        ).filter(
            rank__gt=min_confidence
        ).order_by('-rank')
        # when there are multiple editions of the same work, pick the closest
        editions_of_work = results.values(
            'parent_work'
        ).annotate(
            Count('parent_work')
        ).values_list('parent_work')
        search_results = []
-        for work_id in set(editions_of_work):
+        for result in results:
-            editions = results.filter(parent_work=work_id)
+            search_results.append(self.format_search_result(result))
            default = editions.filter(parent_work__default_edition=F('id'))
            default_rank = default.first().rank if default.exists() else 0
            # if mutliple books have the top rank, pick the default edition
            if default_rank == editions.first().rank:
                selected = default.first()
            else:
                selected = editions.first()
            search_results.append(self.format_search_result(selected))
            if len(search_results) >= 10:
                break
        return search_results
@ -53,7 +34,8 @@ class Connector(AbstractConnector):
            year=search_result.published_date.year if \
                    search_result.published_date else None,
            connector=self,
-            confidence=search_result.rank,
+            confidence=search_result.rank if \
                    hasattr(search_result, 'rank') else 1,
        )
@ -75,3 +57,50 @@ class Connector(AbstractConnector):
    def expand_book_data(self, book):
        pass
 def search_identifiers(query):
    ''' tries remote_id, isbn; defined as dedupe fields on the model '''
    filters = [{f.name: query} for f in models.Edition._meta.get_fields() \
        if hasattr(f, 'deduplication_field') and f.deduplication_field]
    results = models.Edition.objects.filter(
        reduce(operator.or_, (Q(**f) for f in filters))
    ).distinct()
    # when there are multiple editions of the same work, pick the default.
    # it would be odd for this to happen.
    return results.filter(parent_work__default_edition__id=F('id')) \
            or results
 def search_title_author(query, min_confidence):
    ''' searches for title and author '''
    print('DON"T BOTHER')
    vector = SearchVector('title', weight='A') +\
        SearchVector('subtitle', weight='B') +\
        SearchVector('authors__name', weight='C')
    results = models.Edition.objects.annotate(
        search=vector
    ).annotate(
        rank=SearchRank(vector, query)
    ).filter(
        rank__gt=min_confidence
    ).order_by('-rank')
    # when there are multiple editions of the same work, pick the closest
    editions_of_work = results.values(
        'parent_work'
    ).annotate(
        Count('parent_work')
    ).values_list('parent_work')
    for work_id in set(editions_of_work):
        editions = results.filter(parent_work=work_id)
        default = editions.filter(parent_work__default_edition=F('id'))
        default_rank = default.first().rank if default.exists() else 0
        # if mutliple books have the top rank, pick the default edition
        if default_rank == editions.first().rank:
            yield default.first()
        else:
            yield editions.first()