Stop assuming every book is Hamlet

This commit is contained in:
Mouse Reeve 2020-10-29 15:29:23 -07:00
parent a46d7f5dc7
commit 7ce0890a41
7 changed files with 37 additions and 32 deletions

View file

@ -64,14 +64,14 @@ def load_more_data(book_id):
connector.expand_book_data(book)
def search(query):
def search(query, min_confidence=0.1):
''' find books based on arbitrary keywords '''
results = []
dedup_slug = lambda r: '%s/%s/%s' % (r.title, r.author, r.year)
result_index = set()
for connector in get_connectors():
try:
result_set = connector.search(query)
result_set = connector.search(query, min_confidence=min_confidence)
except HTTPError:
continue
@ -87,16 +87,16 @@ def search(query):
return results
def local_search(query):
def local_search(query, min_confidence=0.1):
''' only look at local search results '''
connector = load_connector(models.Connector.objects.get(local=True))
return connector.search(query)
return connector.search(query, min_confidence=min_confidence)
def first_search_result(query):
def first_search_result(query, min_confidence=0.1):
''' search until you find a result that fits '''
for connector in get_connectors():
result = connector.search(query)
result = connector.search(query, min_confidence=min_confidence)
if result:
return result[0]
return None

View file

@ -1,8 +1,8 @@
''' functionality outline for a book data connector '''
from abc import ABC, abstractmethod
from dataclasses import dataclass
from dateutil import parser
import pytz
from urllib3.exceptions import ProtocolError
import requests
from requests import HTTPError
@ -52,7 +52,7 @@ class AbstractConnector(ABC):
return True
def search(self, query):
def search(self, query, min_confidence=None):
''' free text search '''
resp = requests.get(
'%s%s' % (self.search_url, query),
@ -160,7 +160,7 @@ class AbstractConnector(ABC):
author_text = []
for author in self.get_authors_from_data(data):
book.authors.add(author)
author_text += author.display_name
author_text.append(author.display_name)
book.author_text = ', '.join(author_text)
book.save()
@ -298,7 +298,7 @@ def get_data(url):
'Accept': 'application/json; charset=utf-8',
},
)
except ProtocolError:
except ConnectionError:
raise ConnectorException()
if not resp.ok:
resp.raise_for_status()
@ -306,13 +306,14 @@ def get_data(url):
return data
@dataclass
class SearchResult:
''' standardized search result object '''
def __init__(self, title, key, author, year):
self.title = title
self.key = key
self.author = author
self.year = year
title: str
key: str
author: str
year: str
confidence: int = 1
def __repr__(self):
return "<SearchResult key={!r} title={!r} author={!r}>".format(

View file

@ -129,10 +129,10 @@ class Connector(AbstractConnector):
key = self.books_url + search_result['key']
author = search_result.get('author_name') or ['Unknown']
return SearchResult(
search_result.get('title'),
key,
', '.join(author),
search_result.get('first_publish_year'),
title=search_result.get('title'),
key=key,
author=', '.join(author),
year=search_result.get('first_publish_year'),
)

View file

@ -7,7 +7,7 @@ from .abstract_connector import AbstractConnector, SearchResult
class Connector(AbstractConnector):
''' instantiate a connector '''
def search(self, query):
def search(self, query, min_confidence=0.1):
''' right now you can't search bookwyrm sorry, but when
that gets implemented it will totally rule '''
vector = SearchVector('title', weight='A') +\
@ -28,7 +28,7 @@ class Connector(AbstractConnector):
).annotate(
rank=SearchRank(vector, query)
).filter(
rank__gt=0
rank__gt=min_confidence
).order_by('-rank')
results = results.filter(default=True) or results
@ -42,11 +42,12 @@ class Connector(AbstractConnector):
def format_search_result(self, search_result):
return SearchResult(
search_result.title,
search_result.local_id,
search_result.author_text,
search_result.published_date.year if \
title=search_result.title,
key=search_result.local_id,
author=search_result.author_text,
year=search_result.published_date.year if \
search_result.published_date else None,
confidence=search_result.rank,
)

View file

@ -63,7 +63,9 @@ class ImportItem(models.Model):
def get_book_from_isbn(self):
''' search by isbn '''
search_result = books_manager.first_search_result(self.isbn)
search_result = books_manager.first_search_result(
self.isbn, min_confidence=0.5
)
if search_result:
try:
# don't crash the import when the connector fails
@ -79,7 +81,9 @@ class ImportItem(models.Model):
self.data['Title'],
self.data['Author']
)
search_result = books_manager.first_search_result(search_term)
search_result = books_manager.first_search_result(
search_term, min_confidence=0.5
)
if search_result:
try:
return books_manager.get_or_create_book(search_result.key)

View file

@ -14,6 +14,7 @@
{% for result in result_set.results %}
<div>
{{ result.confidence }}
<form action="/resolve_book" method="POST">
{% csrf_token %}
<input type="hidden" name="remote_id" value="{{ result.key }}">

View file

@ -24,7 +24,7 @@ class ImportJob(TestCase):
'Number of Pages': 416,
'Year Published': 2019,
'Original Publication Year': 2019,
'Date Read': '2019/04/09',
'Date Read': '2019/04/12',
'Date Added': '2019/04/09',
'Bookshelves': '',
'Bookshelves with positions': '',
@ -97,11 +97,9 @@ class ImportJob(TestCase):
self.assertEqual(actual.reads[0].finish_date, expected[0].finish_date)
def test_read_reads(self):
expected = [models.ReadThrough(
finish_date=datetime.datetime(2019, 4, 9, 0, 0))]
actual = models.ImportItem.objects.get(index=2)
self.assertEqual(actual.reads[0].start_date, expected[0].start_date)
self.assertEqual(actual.reads[0].finish_date, expected[0].finish_date)
self.assertEqual(actual.reads[0].start_date, datetime.datetime(2019, 4, 9, 0, 0))
self.assertEqual(actual.reads[0].finish_date, datetime.datetime(2019, 4, 12, 0, 0))
def test_unread_reads(self):
expected = []