Stop assuming every book is Hamlet

This commit is contained in:
Mouse Reeve 2020-10-29 15:29:23 -07:00
parent a46d7f5dc7
commit 7ce0890a41
7 changed files with 37 additions and 32 deletions

View file

@ -64,14 +64,14 @@ def load_more_data(book_id):
connector.expand_book_data(book) connector.expand_book_data(book)
def search(query): def search(query, min_confidence=0.1):
''' find books based on arbitrary keywords ''' ''' find books based on arbitrary keywords '''
results = [] results = []
dedup_slug = lambda r: '%s/%s/%s' % (r.title, r.author, r.year) dedup_slug = lambda r: '%s/%s/%s' % (r.title, r.author, r.year)
result_index = set() result_index = set()
for connector in get_connectors(): for connector in get_connectors():
try: try:
result_set = connector.search(query) result_set = connector.search(query, min_confidence=min_confidence)
except HTTPError: except HTTPError:
continue continue
@ -87,16 +87,16 @@ def search(query):
return results return results
def local_search(query): def local_search(query, min_confidence=0.1):
''' only look at local search results ''' ''' only look at local search results '''
connector = load_connector(models.Connector.objects.get(local=True)) connector = load_connector(models.Connector.objects.get(local=True))
return connector.search(query) return connector.search(query, min_confidence=min_confidence)
def first_search_result(query): def first_search_result(query, min_confidence=0.1):
''' search until you find a result that fits ''' ''' search until you find a result that fits '''
for connector in get_connectors(): for connector in get_connectors():
result = connector.search(query) result = connector.search(query, min_confidence=min_confidence)
if result: if result:
return result[0] return result[0]
return None return None

View file

@ -1,8 +1,8 @@
''' functionality outline for a book data connector ''' ''' functionality outline for a book data connector '''
from abc import ABC, abstractmethod from abc import ABC, abstractmethod
from dataclasses import dataclass
from dateutil import parser from dateutil import parser
import pytz import pytz
from urllib3.exceptions import ProtocolError
import requests import requests
from requests import HTTPError from requests import HTTPError
@ -52,7 +52,7 @@ class AbstractConnector(ABC):
return True return True
def search(self, query): def search(self, query, min_confidence=None):
''' free text search ''' ''' free text search '''
resp = requests.get( resp = requests.get(
'%s%s' % (self.search_url, query), '%s%s' % (self.search_url, query),
@ -160,7 +160,7 @@ class AbstractConnector(ABC):
author_text = [] author_text = []
for author in self.get_authors_from_data(data): for author in self.get_authors_from_data(data):
book.authors.add(author) book.authors.add(author)
author_text += author.display_name author_text.append(author.display_name)
book.author_text = ', '.join(author_text) book.author_text = ', '.join(author_text)
book.save() book.save()
@ -298,7 +298,7 @@ def get_data(url):
'Accept': 'application/json; charset=utf-8', 'Accept': 'application/json; charset=utf-8',
}, },
) )
except ProtocolError: except ConnectionError:
raise ConnectorException() raise ConnectorException()
if not resp.ok: if not resp.ok:
resp.raise_for_status() resp.raise_for_status()
@ -306,13 +306,14 @@ def get_data(url):
return data return data
@dataclass
class SearchResult: class SearchResult:
''' standardized search result object ''' ''' standardized search result object '''
def __init__(self, title, key, author, year): title: str
self.title = title key: str
self.key = key author: str
self.author = author year: str
self.year = year confidence: int = 1
def __repr__(self): def __repr__(self):
return "<SearchResult key={!r} title={!r} author={!r}>".format( return "<SearchResult key={!r} title={!r} author={!r}>".format(

View file

@ -129,10 +129,10 @@ class Connector(AbstractConnector):
key = self.books_url + search_result['key'] key = self.books_url + search_result['key']
author = search_result.get('author_name') or ['Unknown'] author = search_result.get('author_name') or ['Unknown']
return SearchResult( return SearchResult(
search_result.get('title'), title=search_result.get('title'),
key, key=key,
', '.join(author), author=', '.join(author),
search_result.get('first_publish_year'), year=search_result.get('first_publish_year'),
) )

View file

@ -7,7 +7,7 @@ from .abstract_connector import AbstractConnector, SearchResult
class Connector(AbstractConnector): class Connector(AbstractConnector):
''' instantiate a connector ''' ''' instantiate a connector '''
def search(self, query): def search(self, query, min_confidence=0.1):
''' right now you can't search bookwyrm sorry, but when ''' right now you can't search bookwyrm sorry, but when
that gets implemented it will totally rule ''' that gets implemented it will totally rule '''
vector = SearchVector('title', weight='A') +\ vector = SearchVector('title', weight='A') +\
@ -28,7 +28,7 @@ class Connector(AbstractConnector):
).annotate( ).annotate(
rank=SearchRank(vector, query) rank=SearchRank(vector, query)
).filter( ).filter(
rank__gt=0 rank__gt=min_confidence
).order_by('-rank') ).order_by('-rank')
results = results.filter(default=True) or results results = results.filter(default=True) or results
@ -42,11 +42,12 @@ class Connector(AbstractConnector):
def format_search_result(self, search_result): def format_search_result(self, search_result):
return SearchResult( return SearchResult(
search_result.title, title=search_result.title,
search_result.local_id, key=search_result.local_id,
search_result.author_text, author=search_result.author_text,
search_result.published_date.year if \ year=search_result.published_date.year if \
search_result.published_date else None, search_result.published_date else None,
confidence=search_result.rank,
) )

View file

@ -63,7 +63,9 @@ class ImportItem(models.Model):
def get_book_from_isbn(self): def get_book_from_isbn(self):
''' search by isbn ''' ''' search by isbn '''
search_result = books_manager.first_search_result(self.isbn) search_result = books_manager.first_search_result(
self.isbn, min_confidence=0.5
)
if search_result: if search_result:
try: try:
# don't crash the import when the connector fails # don't crash the import when the connector fails
@ -79,7 +81,9 @@ class ImportItem(models.Model):
self.data['Title'], self.data['Title'],
self.data['Author'] self.data['Author']
) )
search_result = books_manager.first_search_result(search_term) search_result = books_manager.first_search_result(
search_term, min_confidence=0.5
)
if search_result: if search_result:
try: try:
return books_manager.get_or_create_book(search_result.key) return books_manager.get_or_create_book(search_result.key)

View file

@ -14,6 +14,7 @@
{% for result in result_set.results %} {% for result in result_set.results %}
<div> <div>
{{ result.confidence }}
<form action="/resolve_book" method="POST"> <form action="/resolve_book" method="POST">
{% csrf_token %} {% csrf_token %}
<input type="hidden" name="remote_id" value="{{ result.key }}"> <input type="hidden" name="remote_id" value="{{ result.key }}">

View file

@ -24,7 +24,7 @@ class ImportJob(TestCase):
'Number of Pages': 416, 'Number of Pages': 416,
'Year Published': 2019, 'Year Published': 2019,
'Original Publication Year': 2019, 'Original Publication Year': 2019,
'Date Read': '2019/04/09', 'Date Read': '2019/04/12',
'Date Added': '2019/04/09', 'Date Added': '2019/04/09',
'Bookshelves': '', 'Bookshelves': '',
'Bookshelves with positions': '', 'Bookshelves with positions': '',
@ -97,11 +97,9 @@ class ImportJob(TestCase):
self.assertEqual(actual.reads[0].finish_date, expected[0].finish_date) self.assertEqual(actual.reads[0].finish_date, expected[0].finish_date)
def test_read_reads(self): def test_read_reads(self):
expected = [models.ReadThrough(
finish_date=datetime.datetime(2019, 4, 9, 0, 0))]
actual = models.ImportItem.objects.get(index=2) actual = models.ImportItem.objects.get(index=2)
self.assertEqual(actual.reads[0].start_date, expected[0].start_date) self.assertEqual(actual.reads[0].start_date, datetime.datetime(2019, 4, 9, 0, 0))
self.assertEqual(actual.reads[0].finish_date, expected[0].finish_date) self.assertEqual(actual.reads[0].finish_date, datetime.datetime(2019, 4, 12, 0, 0))
def test_unread_reads(self): def test_unread_reads(self):
expected = [] expected = []