Merge pull request #466 from mouse-reeve/search-tweaks

Search tweaks
This commit is contained in:
Mouse Reeve 2021-01-02 16:30:28 -08:00 committed by GitHub
commit 8afd111ff4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 128 additions and 67 deletions

View file

@ -1,6 +1,9 @@
''' using a bookwyrm instance as a source of book data ''' ''' using a bookwyrm instance as a source of book data '''
from functools import reduce
import operator
from django.contrib.postgres.search import SearchRank, SearchVector from django.contrib.postgres.search import SearchRank, SearchVector
from django.db.models import F from django.db.models import Count, F, Q
from bookwyrm import models from bookwyrm import models
from .abstract_connector import AbstractConnector, SearchResult from .abstract_connector import AbstractConnector, SearchResult
@ -10,36 +13,17 @@ class Connector(AbstractConnector):
''' instantiate a connector ''' ''' instantiate a connector '''
def search(self, query, min_confidence=0.1): def search(self, query, min_confidence=0.1):
''' search your local database ''' ''' search your local database '''
vector = SearchVector('title', weight='A') +\ # first, try searching unique identifiers
SearchVector('subtitle', weight='B') +\ results = search_identifiers(query)
SearchVector('authors__name', weight='C') +\ if not results:
SearchVector('isbn_13', weight='A') +\ # then try searching title/author
SearchVector('isbn_10', weight='A') +\ results = search_title_author(query, min_confidence)
SearchVector('openlibrary_key', weight='C') +\
SearchVector('goodreads_key', weight='C') +\
SearchVector('asin', weight='C') +\
SearchVector('oclc_number', weight='C') +\
SearchVector('remote_id', weight='C') +\
SearchVector('description', weight='D') +\
SearchVector('series', weight='D')
results = models.Edition.objects.annotate(
search=vector
).annotate(
rank=SearchRank(vector, query)
).filter(
rank__gt=min_confidence
).order_by('-rank')
# remove non-default editions, if possible
results = results.filter(parent_work__default_edition__id=F('id')) \
or results
search_results = [] search_results = []
for book in results[:10]: for result in results:
search_results.append( search_results.append(self.format_search_result(result))
self.format_search_result(book) if len(search_results) >= 10:
) break
search_results.sort(key=lambda r: r.confidence, reverse=True)
return search_results return search_results
@ -51,7 +35,8 @@ class Connector(AbstractConnector):
year=search_result.published_date.year if \ year=search_result.published_date.year if \
search_result.published_date else None, search_result.published_date else None,
connector=self, connector=self,
confidence=search_result.rank, confidence=search_result.rank if \
hasattr(search_result, 'rank') else 1,
) )
@ -73,3 +58,50 @@ class Connector(AbstractConnector):
def expand_book_data(self, book): def expand_book_data(self, book):
pass pass
def search_identifiers(query):
    ''' tries remote_id, isbn; defined as dedupe fields on the model

    Builds one exact-match condition per Edition field that carries a truthy
    ``deduplication_field`` attribute and ORs the conditions together.

    Returns a queryset of the matching editions. When at least one match is
    the default edition of its parent work, only those defaults are returned;
    otherwise every match is returned. '''
    filters = [
        Q(**{field.name: query})
        for field in models.Edition._meta.get_fields()
        if getattr(field, 'deduplication_field', False)
    ]
    if not filters:
        # reduce() raises TypeError on an empty sequence; with no identifier
        # fields to compare against, nothing can match
        return models.Edition.objects.none()

    results = models.Edition.objects.filter(
        reduce(operator.or_, filters)
    ).distinct()

    # when there are multiple editions of the same work, pick the default.
    # it would be odd for this to happen.
    return results.filter(parent_work__default_edition__id=F('id')) \
        or results
def search_title_author(query, min_confidence):
    ''' searches for title and author

    Ranks editions against ``query`` using a weighted search vector (title A,
    subtitle B, author name C, series D) and keeps those whose rank is greater
    than ``min_confidence``.

    This is a generator: it yields one edition per parent work found in the
    ranked results. The yielded edition is the work's default edition when
    that edition matched and ties the best rank for the work, otherwise the
    best-ranked edition. '''
    vector = SearchVector('title', weight='A') +\
        SearchVector('subtitle', weight='B') +\
        SearchVector('authors__name', weight='C') +\
        SearchVector('series', weight='D')

    results = models.Edition.objects.annotate(
        search=vector
    ).annotate(
        rank=SearchRank(vector, query)
    ).filter(
        rank__gt=min_confidence
    ).order_by('-rank')

    # when there are multiple editions of the same work, pick the closest
    editions_of_work = results.values(
        'parent_work'
    ).annotate(
        Count('parent_work')
    ).values_list('parent_work', flat=True)

    # set() collapses work ids that appear more than once in the query output
    for work_id in set(editions_of_work):
        editions = results.filter(parent_work=work_id)
        # results are ordered by descending rank, so first() is the best match
        best_match = editions.first()
        default_edition = editions.filter(
            parent_work__default_edition=F('id')
        ).first()
        # if multiple books have the top rank, pick the default edition;
        # never yield a missing default, even when the top rank is 0
        if default_edition is not None \
                and default_edition.rank == best_match.rank:
            yield default_edition
        else:
            yield best_match

View file

@ -9,7 +9,9 @@ from bookwyrm.settings import DOMAIN
class SelfConnector(TestCase): class SelfConnector(TestCase):
''' just uses local data '''
def setUp(self): def setUp(self):
''' creating the connector '''
models.Connector.objects.create( models.Connector.objects.create(
identifier=DOMAIN, identifier=DOMAIN,
name='Local', name='Local',
@ -22,58 +24,85 @@ class SelfConnector(TestCase):
priority=1, priority=1,
) )
self.connector = Connector(DOMAIN) self.connector = Connector(DOMAIN)
self.work = models.Work.objects.create(
title='Example Work',
)
author = models.Author.objects.create(name='Anonymous')
self.edition = models.Edition.objects.create(
title='Edition of Example Work',
published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc),
parent_work=self.work,
)
self.edition.authors.add(author)
models.Edition.objects.create(
title='Another Edition',
parent_work=self.work,
series='Anonymous'
)
models.Edition.objects.create(
title='More Editions',
subtitle='The Anonymous Edition',
parent_work=self.work,
)
edition = models.Edition.objects.create(
title='An Edition',
parent_work=self.work
)
edition.authors.add(models.Author.objects.create(name='Fish'))
def test_format_search_result(self): def test_format_search_result(self):
''' create a SearchResult '''
author = models.Author.objects.create(name='Anonymous')
edition = models.Edition.objects.create(
title='Edition of Example Work',
published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc),
)
edition.authors.add(author)
result = self.connector.search('Edition of Example')[0] result = self.connector.search('Edition of Example')[0]
self.assertEqual(result.title, 'Edition of Example Work') self.assertEqual(result.title, 'Edition of Example Work')
self.assertEqual(result.key, self.edition.remote_id) self.assertEqual(result.key, edition.remote_id)
self.assertEqual(result.author, 'Anonymous') self.assertEqual(result.author, 'Anonymous')
self.assertEqual(result.year, 1980) self.assertEqual(result.year, 1980)
self.assertEqual(result.connector, self.connector)
def test_search_rank(self): def test_search_rank(self):
''' prioritize certain results '''
author = models.Author.objects.create(name='Anonymous')
edition = models.Edition.objects.create(
title='Edition of Example Work',
published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc),
parent_work=models.Work.objects.create(title='')
)
# author text is rank C
edition.authors.add(author)
# series is rank D
models.Edition.objects.create(
title='Another Edition',
series='Anonymous',
parent_work=models.Work.objects.create(title='')
)
# subtitle is rank B
models.Edition.objects.create(
title='More Editions',
subtitle='The Anonymous Edition',
parent_work=models.Work.objects.create(title='')
)
# title is rank A
models.Edition.objects.create(title='Anonymous')
# doesn't rank in this search
edition = models.Edition.objects.create(
title='An Edition',
parent_work=models.Work.objects.create(title='')
)
results = self.connector.search('Anonymous') results = self.connector.search('Anonymous')
self.assertEqual(len(results), 2) self.assertEqual(len(results), 3)
self.assertEqual(results[0].title, 'More Editions') self.assertEqual(results[0].title, 'Anonymous')
self.assertEqual(results[1].title, 'Edition of Example Work') self.assertEqual(results[1].title, 'More Editions')
self.assertEqual(results[2].title, 'Edition of Example Work')
def test_search_default_filter(self): def test_search_multiple_editions(self):
''' it should get rid of duplicate editions for the same work ''' ''' it should get rid of duplicate editions for the same work '''
self.work.default_edition = self.edition work = models.Work.objects.create(title='Work Title')
self.work.save() edition_1 = models.Edition.objects.create(
title='Edition 1 Title', parent_work=work)
edition_2 = models.Edition.objects.create(
title='Edition 2 Title', parent_work=work)
edition_3 = models.Edition.objects.create(
title='Fish', parent_work=work)
work.default_edition = edition_2
work.save()
results = self.connector.search('Anonymous') # pick the best edition
results = self.connector.search('Edition 1 Title')
self.assertEqual(len(results), 1) self.assertEqual(len(results), 1)
self.assertEqual(results[0].title, 'Edition of Example Work') self.assertEqual(results[0].key, edition_1.remote_id)
# pick the default edition when no match is best
results = self.connector.search('Edition Title')
self.assertEqual(len(results), 1)
self.assertEqual(results[0].key, edition_2.remote_id)
# only matches one edition, so no deduplication takes place
results = self.connector.search('Fish') results = self.connector.search('Fish')
self.assertEqual(len(results), 1) self.assertEqual(len(results), 1)
self.assertEqual(results[0].title, 'An Edition') self.assertEqual(results[0].key, edition_3.remote_id)