diff --git a/bookwyrm/connectors/self_connector.py b/bookwyrm/connectors/self_connector.py
index 7d0298be7..0c21e7bcc 100644
--- a/bookwyrm/connectors/self_connector.py
+++ b/bookwyrm/connectors/self_connector.py
@@ -1,6 +1,9 @@
 ''' using a bookwyrm instance as a source of book data '''
+from functools import reduce
+import operator
+
 from django.contrib.postgres.search import SearchRank, SearchVector
-from django.db.models import F
+from django.db.models import F, Q
 
 from bookwyrm import models
 from .abstract_connector import AbstractConnector, SearchResult
@@ -10,36 +13,17 @@ class Connector(AbstractConnector):
     ''' instantiate a connector '''
     def search(self, query, min_confidence=0.1):
         ''' search your local database '''
-        vector = SearchVector('title', weight='A') +\
-            SearchVector('subtitle', weight='B') +\
-            SearchVector('authors__name', weight='C') +\
-            SearchVector('isbn_13', weight='A') +\
-            SearchVector('isbn_10', weight='A') +\
-            SearchVector('openlibrary_key', weight='C') +\
-            SearchVector('goodreads_key', weight='C') +\
-            SearchVector('asin', weight='C') +\
-            SearchVector('oclc_number', weight='C') +\
-            SearchVector('remote_id', weight='C') +\
-            SearchVector('description', weight='D') +\
-            SearchVector('series', weight='D')
-
-        results = models.Edition.objects.annotate(
-            search=vector
-        ).annotate(
-            rank=SearchRank(vector, query)
-        ).filter(
-            rank__gt=min_confidence
-        ).order_by('-rank')
-
-        # remove non-default editions, if possible
-        results = results.filter(parent_work__default_edition__id=F('id')) \
-            or results
-
+        # first, try searching unique identifiers
+        results = search_identifiers(query)
+        if not results:
+            # then try searching title/author
+            results = search_title_author(query, min_confidence)
         search_results = []
-        for book in results[:10]:
-            search_results.append(
-                self.format_search_result(book)
-            )
+        for result in results:
+            search_results.append(self.format_search_result(result))
+            if len(search_results) >= 10:
+                break
+        search_results.sort(key=lambda r: r.confidence, reverse=True)
         return search_results
 
 
@@ -51,7 +35,8 @@ class Connector(AbstractConnector):
             year=search_result.published_date.year if \
                 search_result.published_date else None,
             connector=self,
-            confidence=search_result.rank,
+            # identifier matches carry no rank annotation; score them as 1
+            confidence=getattr(search_result, 'rank', 1),
         )
 
 
@@ -73,3 +58,52 @@ class Connector(AbstractConnector):
 
     def expand_book_data(self, book):
         pass
+
+
+def search_identifiers(query):
+    ''' tries remote_id, isbn; defined as dedupe fields on the model '''
+    filters = [{f.name: query} for f in models.Edition._meta.get_fields() \
+            if hasattr(f, 'deduplication_field') and f.deduplication_field]
+    results = models.Edition.objects.filter(
+        reduce(operator.or_, (Q(**f) for f in filters))
+    ).distinct()
+
+    # when there are multiple editions of the same work, pick the default.
+    # it would be odd for this to happen.
+    return results.filter(parent_work__default_edition__id=F('id')) \
+        or results
+
+
+def search_title_author(query, min_confidence):
+    ''' searches for title and author '''
+    vector = SearchVector('title', weight='A') +\
+        SearchVector('subtitle', weight='B') +\
+        SearchVector('authors__name', weight='C') +\
+        SearchVector('series', weight='D')
+
+    results = models.Edition.objects.annotate(
+        search=vector
+    ).annotate(
+        rank=SearchRank(vector, query)
+    ).filter(
+        rank__gt=min_confidence
+    ).order_by('-rank')
+
+    # when there are multiple editions of the same work, pick the closest.
+    # works are visited best match first, so a caller that stops early
+    # still receives the top-ranked results
+    seen_works = set()
+    for work_id in results.values_list('parent_work', flat=True):
+        if work_id in seen_works:
+            continue
+        seen_works.add(work_id)
+
+        editions = results.filter(parent_work=work_id)
+        best = editions.first()
+        default = editions.filter(
+            parent_work__default_edition=F('id')).first()
+        # if multiple books have the top rank, pick the default edition
+        if default is not None and default.rank == best.rank:
+            yield default
+        else:
+            yield best
diff --git a/bookwyrm/tests/connectors/test_self_connector.py b/bookwyrm/tests/connectors/test_self_connector.py
index 91857def0..0fc789556 100644
--- a/bookwyrm/tests/connectors/test_self_connector.py
+++ b/bookwyrm/tests/connectors/test_self_connector.py
@@ -9,7 +9,9 @@ from bookwyrm.settings import DOMAIN
 
 
 class SelfConnector(TestCase):
+    ''' just uses local data '''
     def setUp(self):
+        ''' creating the connector '''
         models.Connector.objects.create(
             identifier=DOMAIN,
             name='Local',
@@ -22,58 +24,85 @@ class SelfConnector(TestCase):
             priority=1,
         )
         self.connector = Connector(DOMAIN)
-        self.work = models.Work.objects.create(
-            title='Example Work',
-        )
-        author = models.Author.objects.create(name='Anonymous')
-        self.edition = models.Edition.objects.create(
-            title='Edition of Example Work',
-            published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc),
-            parent_work=self.work,
-        )
-        self.edition.authors.add(author)
-        models.Edition.objects.create(
-            title='Another Edition',
-            parent_work=self.work,
-            series='Anonymous'
-        )
-        models.Edition.objects.create(
-            title='More Editions',
-            subtitle='The Anonymous Edition',
-            parent_work=self.work,
-        )
-
-        edition = models.Edition.objects.create(
-            title='An Edition',
-            parent_work=self.work
-        )
-        edition.authors.add(models.Author.objects.create(name='Fish'))
 
 
     def test_format_search_result(self):
+        ''' create a SearchResult '''
+        author = models.Author.objects.create(name='Anonymous')
+        edition = models.Edition.objects.create(
+            title='Edition of Example Work',
+            published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc),
+        )
+        edition.authors.add(author)
         result = self.connector.search('Edition of Example')[0]
         self.assertEqual(result.title, 'Edition of Example Work')
-        self.assertEqual(result.key, self.edition.remote_id)
+        self.assertEqual(result.key, edition.remote_id)
         self.assertEqual(result.author, 'Anonymous')
         self.assertEqual(result.year, 1980)
+        self.assertEqual(result.connector, self.connector)
 
 
     def test_search_rank(self):
+        ''' prioritize certain results '''
+        author = models.Author.objects.create(name='Anonymous')
+        edition = models.Edition.objects.create(
+            title='Edition of Example Work',
+            published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc),
+            parent_work=models.Work.objects.create(title='')
+        )
+        # author text is rank C
+        edition.authors.add(author)
+
+        # series is rank D
+        models.Edition.objects.create(
+            title='Another Edition',
+            series='Anonymous',
+            parent_work=models.Work.objects.create(title='')
+        )
+        # subtitle is rank B
+        models.Edition.objects.create(
+            title='More Editions',
+            subtitle='The Anonymous Edition',
+            parent_work=models.Work.objects.create(title='')
+        )
+        # title is rank A
+        models.Edition.objects.create(title='Anonymous')
+        # doesn't rank in this search
+        models.Edition.objects.create(
+            title='An Edition',
+            parent_work=models.Work.objects.create(title='')
+        )
+
         results = self.connector.search('Anonymous')
-        self.assertEqual(len(results), 2)
-        self.assertEqual(results[0].title, 'More Editions')
-        self.assertEqual(results[1].title, 'Edition of Example Work')
+        self.assertEqual(len(results), 3)
+        self.assertEqual(results[0].title, 'Anonymous')
+        self.assertEqual(results[1].title, 'More Editions')
+        self.assertEqual(results[2].title, 'Edition of Example Work')
 
 
-    def test_search_default_filter(self):
+    def test_search_multiple_editions(self):
         ''' it should get rid of duplicate editions for the same work '''
-        self.work.default_edition = self.edition
-        self.work.save()
+        work = models.Work.objects.create(title='Work Title')
+        edition_1 = models.Edition.objects.create(
+            title='Edition 1 Title', parent_work=work)
+        edition_2 = models.Edition.objects.create(
+            title='Edition 2 Title', parent_work=work)
+        edition_3 = models.Edition.objects.create(
+            title='Fish', parent_work=work)
+        work.default_edition = edition_2
+        work.save()
 
-        results = self.connector.search('Anonymous')
+        # pick the best edition
+        results = self.connector.search('Edition 1 Title')
         self.assertEqual(len(results), 1)
-        self.assertEqual(results[0].title, 'Edition of Example Work')
+        self.assertEqual(results[0].key, edition_1.remote_id)
 
+        # pick the default edition when no match is best
+        results = self.connector.search('Edition Title')
+        self.assertEqual(len(results), 1)
+        self.assertEqual(results[0].key, edition_2.remote_id)
+
+        # only matches one edition, so no deduplication takes place
         results = self.connector.search('Fish')
         self.assertEqual(len(results), 1)
-        self.assertEqual(results[0].title, 'An Edition')
+        self.assertEqual(results[0].key, edition_3.remote_id)