mirror of
https://github.com/bookwyrm-social/bookwyrm.git
synced 2025-01-21 14:38:08 +00:00
commit
8afd111ff4
2 changed files with 128 additions and 67 deletions
|
@ -1,6 +1,9 @@
|
||||||
''' using a bookwyrm instance as a source of book data '''
|
''' using a bookwyrm instance as a source of book data '''
|
||||||
|
from functools import reduce
|
||||||
|
import operator
|
||||||
|
|
||||||
from django.contrib.postgres.search import SearchRank, SearchVector
|
from django.contrib.postgres.search import SearchRank, SearchVector
|
||||||
from django.db.models import F
|
from django.db.models import Count, F, Q
|
||||||
|
|
||||||
from bookwyrm import models
|
from bookwyrm import models
|
||||||
from .abstract_connector import AbstractConnector, SearchResult
|
from .abstract_connector import AbstractConnector, SearchResult
|
||||||
|
@ -10,36 +13,17 @@ class Connector(AbstractConnector):
|
||||||
''' instantiate a connector '''
|
''' instantiate a connector '''
|
||||||
def search(self, query, min_confidence=0.1):
|
def search(self, query, min_confidence=0.1):
|
||||||
''' search your local database '''
|
''' search your local database '''
|
||||||
vector = SearchVector('title', weight='A') +\
|
# first, try searching unique identifiers
|
||||||
SearchVector('subtitle', weight='B') +\
|
results = search_identifiers(query)
|
||||||
SearchVector('authors__name', weight='C') +\
|
if not results:
|
||||||
SearchVector('isbn_13', weight='A') +\
|
# then try searching title/author
|
||||||
SearchVector('isbn_10', weight='A') +\
|
results = search_title_author(query, min_confidence)
|
||||||
SearchVector('openlibrary_key', weight='C') +\
|
|
||||||
SearchVector('goodreads_key', weight='C') +\
|
|
||||||
SearchVector('asin', weight='C') +\
|
|
||||||
SearchVector('oclc_number', weight='C') +\
|
|
||||||
SearchVector('remote_id', weight='C') +\
|
|
||||||
SearchVector('description', weight='D') +\
|
|
||||||
SearchVector('series', weight='D')
|
|
||||||
|
|
||||||
results = models.Edition.objects.annotate(
|
|
||||||
search=vector
|
|
||||||
).annotate(
|
|
||||||
rank=SearchRank(vector, query)
|
|
||||||
).filter(
|
|
||||||
rank__gt=min_confidence
|
|
||||||
).order_by('-rank')
|
|
||||||
|
|
||||||
# remove non-default editions, if possible
|
|
||||||
results = results.filter(parent_work__default_edition__id=F('id')) \
|
|
||||||
or results
|
|
||||||
|
|
||||||
search_results = []
|
search_results = []
|
||||||
for book in results[:10]:
|
for result in results:
|
||||||
search_results.append(
|
search_results.append(self.format_search_result(result))
|
||||||
self.format_search_result(book)
|
if len(search_results) >= 10:
|
||||||
)
|
break
|
||||||
|
search_results.sort(key=lambda r: r.confidence, reverse=True)
|
||||||
return search_results
|
return search_results
|
||||||
|
|
||||||
|
|
||||||
|
@ -51,7 +35,8 @@ class Connector(AbstractConnector):
|
||||||
year=search_result.published_date.year if \
|
year=search_result.published_date.year if \
|
||||||
search_result.published_date else None,
|
search_result.published_date else None,
|
||||||
connector=self,
|
connector=self,
|
||||||
confidence=search_result.rank,
|
confidence=search_result.rank if \
|
||||||
|
hasattr(search_result, 'rank') else 1,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@ -73,3 +58,50 @@ class Connector(AbstractConnector):
|
||||||
|
|
||||||
def expand_book_data(self, book):
|
def expand_book_data(self, book):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
|
def search_identifiers(query):
    ''' tries remote_id, isbn; defined as dedupe fields on the model

    Builds one exact-match filter per Edition field whose
    `deduplication_field` attribute is truthy and ORs them together.
    Returns a queryset of matching editions; when any match is its parent
    work's default edition, only those default editions are returned. '''
    filters = [
        {field.name: query}
        for field in models.Edition._meta.get_fields()
        if getattr(field, 'deduplication_field', False)
    ]
    if not filters:
        # reduce() with no initial value raises TypeError on an empty
        # sequence; with no identifier fields to compare, nothing can match
        return models.Edition.objects.none()

    results = models.Edition.objects.filter(
        reduce(operator.or_, (Q(**f) for f in filters))
    ).distinct()

    # when there are multiple editions of the same work, pick the default.
    # it would be odd for this to happen.
    # (an empty queryset is falsy, so this falls back to every match when
    # none of them is a default edition)
    return results.filter(parent_work__default_edition__id=F('id')) \
        or results
|
||||||
|
|
||||||
|
|
||||||
|
def search_title_author(query, min_confidence):
    ''' searches for title and author

    Ranks editions against `query` using a weighted search vector
    (title A, subtitle B, author name C, series D), keeps those whose rank
    is greater than `min_confidence`, and yields one edition per parent
    work. Works are visited by iterating a set, so the order in which
    editions are yielded is not defined. '''
    vector = SearchVector('title', weight='A') +\
        SearchVector('subtitle', weight='B') +\
        SearchVector('authors__name', weight='C') +\
        SearchVector('series', weight='D')

    results = models.Edition.objects.annotate(
        search=vector
    ).annotate(
        rank=SearchRank(vector, query)
    ).filter(
        rank__gt=min_confidence
    ).order_by('-rank')

    # when there are multiple editions of the same work, pick the closest
    editions_of_work = results.values(
        'parent_work'
    ).annotate(
        Count('parent_work')
    ).values_list('parent_work')

    for work_id in set(editions_of_work):
        # ordered by -rank, so the first row is the best match for the work
        editions = results.filter(parent_work=work_id)
        best = editions.first()
        if best is None:
            # nothing left to offer for this work (not expected, since the
            # id was taken from the same result set)
            continue

        # fetch the default edition once instead of exists() + first()
        default = editions.filter(
            parent_work__default_edition=F('id')
        ).first()

        # if multiple books have the top rank, pick the default edition;
        # without a matching default, always fall back to the best match
        if default is not None and default.rank == best.rank:
            yield default
        else:
            yield best
|
||||||
|
|
|
@ -9,7 +9,9 @@ from bookwyrm.settings import DOMAIN
|
||||||
|
|
||||||
|
|
||||||
class SelfConnector(TestCase):
|
class SelfConnector(TestCase):
|
||||||
|
''' just uses local data '''
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
|
''' creating the connector '''
|
||||||
models.Connector.objects.create(
|
models.Connector.objects.create(
|
||||||
identifier=DOMAIN,
|
identifier=DOMAIN,
|
||||||
name='Local',
|
name='Local',
|
||||||
|
@ -22,58 +24,85 @@ class SelfConnector(TestCase):
|
||||||
priority=1,
|
priority=1,
|
||||||
)
|
)
|
||||||
self.connector = Connector(DOMAIN)
|
self.connector = Connector(DOMAIN)
|
||||||
self.work = models.Work.objects.create(
|
|
||||||
title='Example Work',
|
|
||||||
)
|
|
||||||
author = models.Author.objects.create(name='Anonymous')
|
|
||||||
self.edition = models.Edition.objects.create(
|
|
||||||
title='Edition of Example Work',
|
|
||||||
published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc),
|
|
||||||
parent_work=self.work,
|
|
||||||
)
|
|
||||||
self.edition.authors.add(author)
|
|
||||||
models.Edition.objects.create(
|
|
||||||
title='Another Edition',
|
|
||||||
parent_work=self.work,
|
|
||||||
series='Anonymous'
|
|
||||||
)
|
|
||||||
models.Edition.objects.create(
|
|
||||||
title='More Editions',
|
|
||||||
subtitle='The Anonymous Edition',
|
|
||||||
parent_work=self.work,
|
|
||||||
)
|
|
||||||
|
|
||||||
edition = models.Edition.objects.create(
|
|
||||||
title='An Edition',
|
|
||||||
parent_work=self.work
|
|
||||||
)
|
|
||||||
edition.authors.add(models.Author.objects.create(name='Fish'))
|
|
||||||
|
|
||||||
|
|
||||||
def test_format_search_result(self):
|
def test_format_search_result(self):
|
||||||
|
''' create a SearchResult '''
|
||||||
|
author = models.Author.objects.create(name='Anonymous')
|
||||||
|
edition = models.Edition.objects.create(
|
||||||
|
title='Edition of Example Work',
|
||||||
|
published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc),
|
||||||
|
)
|
||||||
|
edition.authors.add(author)
|
||||||
result = self.connector.search('Edition of Example')[0]
|
result = self.connector.search('Edition of Example')[0]
|
||||||
self.assertEqual(result.title, 'Edition of Example Work')
|
self.assertEqual(result.title, 'Edition of Example Work')
|
||||||
self.assertEqual(result.key, self.edition.remote_id)
|
self.assertEqual(result.key, edition.remote_id)
|
||||||
self.assertEqual(result.author, 'Anonymous')
|
self.assertEqual(result.author, 'Anonymous')
|
||||||
self.assertEqual(result.year, 1980)
|
self.assertEqual(result.year, 1980)
|
||||||
|
self.assertEqual(result.connector, self.connector)
|
||||||
|
|
||||||
|
|
||||||
def test_search_rank(self):
|
def test_search_rank(self):
|
||||||
|
''' prioritize certain results '''
|
||||||
|
author = models.Author.objects.create(name='Anonymous')
|
||||||
|
edition = models.Edition.objects.create(
|
||||||
|
title='Edition of Example Work',
|
||||||
|
published_date=datetime.datetime(1980, 5, 10, tzinfo=timezone.utc),
|
||||||
|
parent_work=models.Work.objects.create(title='')
|
||||||
|
)
|
||||||
|
# author text is rank C
|
||||||
|
edition.authors.add(author)
|
||||||
|
|
||||||
|
# series is rank D
|
||||||
|
models.Edition.objects.create(
|
||||||
|
title='Another Edition',
|
||||||
|
series='Anonymous',
|
||||||
|
parent_work=models.Work.objects.create(title='')
|
||||||
|
)
|
||||||
|
# subtitle is rank B
|
||||||
|
models.Edition.objects.create(
|
||||||
|
title='More Editions',
|
||||||
|
subtitle='The Anonymous Edition',
|
||||||
|
parent_work=models.Work.objects.create(title='')
|
||||||
|
)
|
||||||
|
# title is rank A
|
||||||
|
models.Edition.objects.create(title='Anonymous')
|
||||||
|
# doesn't rank in this search
|
||||||
|
edition = models.Edition.objects.create(
|
||||||
|
title='An Edition',
|
||||||
|
parent_work=models.Work.objects.create(title='')
|
||||||
|
)
|
||||||
|
|
||||||
results = self.connector.search('Anonymous')
|
results = self.connector.search('Anonymous')
|
||||||
self.assertEqual(len(results), 2)
|
self.assertEqual(len(results), 3)
|
||||||
self.assertEqual(results[0].title, 'More Editions')
|
self.assertEqual(results[0].title, 'Anonymous')
|
||||||
self.assertEqual(results[1].title, 'Edition of Example Work')
|
self.assertEqual(results[1].title, 'More Editions')
|
||||||
|
self.assertEqual(results[2].title, 'Edition of Example Work')
|
||||||
|
|
||||||
|
|
||||||
def test_search_default_filter(self):
|
def test_search_multiple_editions(self):
|
||||||
''' it should get rid of duplicate editions for the same work '''
|
''' it should get rid of duplicate editions for the same work '''
|
||||||
self.work.default_edition = self.edition
|
work = models.Work.objects.create(title='Work Title')
|
||||||
self.work.save()
|
edition_1 = models.Edition.objects.create(
|
||||||
|
title='Edition 1 Title', parent_work=work)
|
||||||
|
edition_2 = models.Edition.objects.create(
|
||||||
|
title='Edition 2 Title', parent_work=work)
|
||||||
|
edition_3 = models.Edition.objects.create(
|
||||||
|
title='Fish', parent_work=work)
|
||||||
|
work.default_edition = edition_2
|
||||||
|
work.save()
|
||||||
|
|
||||||
results = self.connector.search('Anonymous')
|
# pick the best edition
|
||||||
|
results = self.connector.search('Edition 1 Title')
|
||||||
self.assertEqual(len(results), 1)
|
self.assertEqual(len(results), 1)
|
||||||
self.assertEqual(results[0].title, 'Edition of Example Work')
|
self.assertEqual(results[0].key, edition_1.remote_id)
|
||||||
|
|
||||||
|
# pick the default edition when no match is best
|
||||||
|
results = self.connector.search('Edition Title')
|
||||||
|
self.assertEqual(len(results), 1)
|
||||||
|
self.assertEqual(results[0].key, edition_2.remote_id)
|
||||||
|
|
||||||
|
# only matches one edition, so no deduplication takes place
|
||||||
results = self.connector.search('Fish')
|
results = self.connector.search('Fish')
|
||||||
self.assertEqual(len(results), 1)
|
self.assertEqual(len(results), 1)
|
||||||
self.assertEqual(results[0].title, 'An Edition')
|
self.assertEqual(results[0].key, edition_3.remote_id)
|
||||||
|
|
Loading…
Reference in a new issue