mirror of
https://github.com/bookwyrm-social/bookwyrm.git
synced 2025-01-20 14:08:09 +00:00
Separate search for unique identifiers out from text search
This commit is contained in:
parent
a2e8cf1993
commit
a413c87963
1 changed files with 60 additions and 31 deletions
|
@ -1,6 +1,9 @@
|
|||
''' using a bookwyrm instance as a source of book data '''
|
||||
from functools import reduce
|
||||
import operator
|
||||
|
||||
from django.contrib.postgres.search import SearchRank, SearchVector
|
||||
from django.db.models import Count, F
|
||||
from django.db.models import Count, F, Q
|
||||
|
||||
from bookwyrm import models
|
||||
from .abstract_connector import AbstractConnector, SearchResult
|
||||
|
@ -10,36 +13,14 @@ class Connector(AbstractConnector):
|
|||
''' instantiate a connector '''
|
||||
def search(self, query, min_confidence=0.1):
|
||||
''' search your local database '''
|
||||
vector = SearchVector('title', weight='A') +\
|
||||
SearchVector('subtitle', weight='B') +\
|
||||
SearchVector('authors__name', weight='C')
|
||||
|
||||
results = models.Edition.objects.annotate(
|
||||
search=vector
|
||||
).annotate(
|
||||
rank=SearchRank(vector, query)
|
||||
).filter(
|
||||
rank__gt=min_confidence
|
||||
).order_by('-rank')
|
||||
|
||||
# when there are multiple editions of the same work, pick the closest
|
||||
editions_of_work = results.values(
|
||||
'parent_work'
|
||||
).annotate(
|
||||
Count('parent_work')
|
||||
).values_list('parent_work')
|
||||
|
||||
# first, try searching unqiue identifiers
|
||||
results = search_identifiers(query)
|
||||
if not results:
|
||||
# then try searching title/author
|
||||
results = search_title_author(query, min_confidence)
|
||||
search_results = []
|
||||
for work_id in set(editions_of_work):
|
||||
editions = results.filter(parent_work=work_id)
|
||||
default = editions.filter(parent_work__default_edition=F('id'))
|
||||
default_rank = default.first().rank if default.exists() else 0
|
||||
# if mutliple books have the top rank, pick the default edition
|
||||
if default_rank == editions.first().rank:
|
||||
selected = default.first()
|
||||
else:
|
||||
selected = editions.first()
|
||||
search_results.append(self.format_search_result(selected))
|
||||
for result in results:
|
||||
search_results.append(self.format_search_result(result))
|
||||
if len(search_results) >= 10:
|
||||
break
|
||||
return search_results
|
||||
|
@ -53,7 +34,8 @@ class Connector(AbstractConnector):
|
|||
year=search_result.published_date.year if \
|
||||
search_result.published_date else None,
|
||||
connector=self,
|
||||
confidence=search_result.rank,
|
||||
confidence=search_result.rank if \
|
||||
hasattr(search_result, 'rank') else 1,
|
||||
)
|
||||
|
||||
|
||||
|
@ -75,3 +57,50 @@ class Connector(AbstractConnector):
|
|||
|
||||
def expand_book_data(self, book):
|
||||
pass
|
||||
|
||||
|
||||
def search_identifiers(query):
|
||||
''' tries remote_id, isbn; defined as dedupe fields on the model '''
|
||||
filters = [{f.name: query} for f in models.Edition._meta.get_fields() \
|
||||
if hasattr(f, 'deduplication_field') and f.deduplication_field]
|
||||
results = models.Edition.objects.filter(
|
||||
reduce(operator.or_, (Q(**f) for f in filters))
|
||||
).distinct()
|
||||
|
||||
# when there are multiple editions of the same work, pick the default.
|
||||
# it would be odd for this to happen.
|
||||
return results.filter(parent_work__default_edition__id=F('id')) \
|
||||
or results
|
||||
|
||||
|
||||
def search_title_author(query, min_confidence):
|
||||
''' searches for title and author '''
|
||||
print('DON"T BOTHER')
|
||||
vector = SearchVector('title', weight='A') +\
|
||||
SearchVector('subtitle', weight='B') +\
|
||||
SearchVector('authors__name', weight='C')
|
||||
|
||||
results = models.Edition.objects.annotate(
|
||||
search=vector
|
||||
).annotate(
|
||||
rank=SearchRank(vector, query)
|
||||
).filter(
|
||||
rank__gt=min_confidence
|
||||
).order_by('-rank')
|
||||
|
||||
# when there are multiple editions of the same work, pick the closest
|
||||
editions_of_work = results.values(
|
||||
'parent_work'
|
||||
).annotate(
|
||||
Count('parent_work')
|
||||
).values_list('parent_work')
|
||||
|
||||
for work_id in set(editions_of_work):
|
||||
editions = results.filter(parent_work=work_id)
|
||||
default = editions.filter(parent_work__default_edition=F('id'))
|
||||
default_rank = default.first().rank if default.exists() else 0
|
||||
# if mutliple books have the top rank, pick the default edition
|
||||
if default_rank == editions.first().rank:
|
||||
yield default.first()
|
||||
else:
|
||||
yield editions.first()
|
||||
|
|
Loading…
Reference in a new issue