Merge pull request #249 from mouse-reeve/import

Change how goodreads import writes reviews
Authored by Mouse Reeve on 2020-10-29 16:50:14 -07:00; committed by GitHub
commit f35ed0e555
10 changed files with 117 additions and 86 deletions


@@ -64,14 +64,14 @@ def load_more_data(book_id):
     connector.expand_book_data(book)

-def search(query):
+def search(query, min_confidence=0.1):
     ''' find books based on arbitary keywords '''
     results = []
     dedup_slug = lambda r: '%s/%s/%s' % (r.title, r.author, r.year)
     result_index = set()
     for connector in get_connectors():
         try:
-            result_set = connector.search(query)
+            result_set = connector.search(query, min_confidence=min_confidence)
         except HTTPError:
             continue
@@ -87,16 +87,16 @@ def search(query):
     return results

-def local_search(query):
+def local_search(query, min_confidence=0.1):
     ''' only look at local search results '''
     connector = load_connector(models.Connector.objects.get(local=True))
-    return connector.search(query)
+    return connector.search(query, min_confidence=min_confidence)

-def first_search_result(query):
+def first_search_result(query, min_confidence=0.1):
     ''' search until you find a result that fits '''
     for connector in get_connectors():
-        result = connector.search(query)
+        result = connector.search(query, min_confidence=min_confidence)
         if result:
             return result[0]
     return None
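As context (not part of the commit): a rough standalone sketch of the pattern the new min_confidence argument enables. Casual searches keep the loose default of 0.1, while the importer can ask for only high-confidence matches. The FakeConnector class and sample tuples below are invented for illustration.

# Standalone sketch of the min_confidence pattern; FakeConnector is a
# stand-in for the real connector classes and is not part of the commit.
class FakeConnector:
    def __init__(self, results):
        self.results = results

    def search(self, query, min_confidence=0.1):
        # keep only results whose confidence clears the threshold
        return [r for r in self.results if r[1] > min_confidence]


def first_search_result(query, connectors, min_confidence=0.1):
    ''' search until you find a result that fits '''
    for connector in connectors:
        result = connector.search(query, min_confidence=min_confidence)
        if result:
            return result[0]
    return None


connectors = [FakeConnector([('Loose match', 0.2), ('Strong match', 0.9)])]
# a casual search keeps loose matches; the import flow asks for 0.5 or better
print(first_search_result('example', connectors))                      # ('Loose match', 0.2)
print(first_search_result('example', connectors, min_confidence=0.5))  # ('Strong match', 0.9)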


@@ -1,15 +1,17 @@
 ''' functionality outline for a book data connector '''
 from abc import ABC, abstractmethod
+from dataclasses import dataclass

 from dateutil import parser
 import pytz
 import requests
+from requests import HTTPError

 from django.db import transaction

 from bookwyrm import models


-class ConnectorException(Exception):
+class ConnectorException(HTTPError):
     ''' when the connector can't do what was asked '''
@@ -50,7 +52,7 @@ class AbstractConnector(ABC):
         return True

-    def search(self, query):
+    def search(self, query, min_confidence=None):
         ''' free text search '''
         resp = requests.get(
             '%s%s' % (self.search_url, query),
@@ -155,9 +157,11 @@ class AbstractConnector(ABC):
         ''' for creating a new book or syncing with data '''
         book = update_from_mappings(book, data, self.book_mappings)

+        author_text = []
         for author in self.get_authors_from_data(data):
             book.authors.add(author)
-        book.author_text = ', '.join(a.display_name for a in book.authors.all())
+            author_text.append(author.display_name)
+        book.author_text = ', '.join(author_text)
         book.save()

         if not update_cover:
@@ -287,25 +291,29 @@ def get_date(date_string):

 def get_data(url):
     ''' wrapper for request.get '''
-    resp = requests.get(
-        url,
-        headers={
-            'Accept': 'application/json; charset=utf-8',
-        },
-    )
+    try:
+        resp = requests.get(
+            url,
+            headers={
+                'Accept': 'application/json; charset=utf-8',
+            },
+        )
+    except ConnectionError:
+        raise ConnectorException()
     if not resp.ok:
         resp.raise_for_status()
     data = resp.json()
     return data


+@dataclass
 class SearchResult:
     ''' standardized search result object '''
-    def __init__(self, title, key, author, year):
-        self.title = title
-        self.key = key
-        self.author = author
-        self.year = year
+    title: str
+    key: str
+    author: str
+    year: str
+    confidence: int = 1

     def __repr__(self):
         return "<SearchResult key={!r} title={!r} author={!r}>".format(


@@ -129,10 +129,10 @@ class Connector(AbstractConnector):
         key = self.books_url + search_result['key']
         author = search_result.get('author_name') or ['Unknown']
         return SearchResult(
-            search_result.get('title'),
-            key,
-            ', '.join(author),
-            search_result.get('first_publish_year'),
+            title=search_result.get('title'),
+            key=key,
+            author=', '.join(author),
+            year=search_result.get('first_publish_year'),
         )


@@ -7,7 +7,7 @@ from .abstract_connector import AbstractConnector, SearchResult
 class Connector(AbstractConnector):
     ''' instantiate a connector '''

-    def search(self, query):
+    def search(self, query, min_confidence=0.1):
         ''' right now you can't search bookwyrm sorry, but when
         that gets implemented it will totally rule '''
         vector = SearchVector('title', weight='A') +\
@@ -28,7 +28,7 @@ class Connector(AbstractConnector):
         ).annotate(
             rank=SearchRank(vector, query)
         ).filter(
-            rank__gt=0
+            rank__gt=min_confidence
         ).order_by('-rank')

         results = results.filter(default=True) or results
@@ -42,11 +42,12 @@ class Connector(AbstractConnector):
     def format_search_result(self, search_result):
         return SearchResult(
-            search_result.title,
-            search_result.local_id,
-            search_result.author_text,
-            search_result.published_date.year if \
-                    search_result.published_date else None,
+            title=search_result.title,
+            key=search_result.local_id,
+            author=search_result.author_text,
+            year=search_result.published_date.year if \
+                    search_result.published_date else None,
+            confidence=search_result.rank,
         )


@@ -42,13 +42,10 @@ def import_data(job_id):
             if item.book:
                 item.save()
                 results.append(item)
+                # shelves book and handles reviews
+                outgoing.handle_imported_book(job.user, item)
             else:
-                item.fail_reason = "Could not match book on OpenLibrary"
+                item.fail_reason = "Could not find a match for book"
                 item.save()
-        status = outgoing.handle_import_books(job.user, results)
-        if status:
-            job.import_status = status
-            job.save()
     finally:
         create_notification(job.user, 'IMPORT', related_import=job)


@@ -0,0 +1,17 @@
+# Generated by Django 3.0.7 on 2020-10-29 23:48
+
+from django.db import migrations
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('bookwyrm', '0057_auto_20201026_2131'),
+    ]
+
+    operations = [
+        migrations.RemoveField(
+            model_name='importjob',
+            name='import_status',
+        ),
+    ]


@@ -40,8 +40,7 @@ class ImportJob(models.Model):
     user = models.ForeignKey(User, on_delete=models.CASCADE)
     created_date = models.DateTimeField(default=timezone.now)
     task_id = models.CharField(max_length=100, null=True)
-    import_status = models.ForeignKey(
-        'Status', null=True, on_delete=models.PROTECT)


 class ImportItem(models.Model):
     ''' a single line of a csv being imported '''
@@ -64,13 +63,17 @@ class ImportItem(models.Model):
     def get_book_from_isbn(self):
         ''' search by isbn '''
-        search_result = books_manager.first_search_result(self.isbn)
+        search_result = books_manager.first_search_result(
+            self.isbn, min_confidence=0.5
+        )
         if search_result:
             try:
                 # don't crash the import when the connector fails
                 return books_manager.get_or_create_book(search_result.key)
             except ConnectorException:
                 pass
+        return None

     def get_book_from_title_author(self):
         ''' search by title and author '''
@@ -78,12 +81,16 @@ class ImportItem(models.Model):
             self.data['Title'],
             self.data['Author']
         )
-        search_result = books_manager.first_search_result(search_term)
+        search_result = books_manager.first_search_result(
+            search_term, min_confidence=0.5
+        )
         if search_result:
             try:
                 return books_manager.get_or_create_book(search_result.key)
             except ConnectorException:
                 pass
+        return None

     @property
     def isbn(self):
@@ -95,6 +102,7 @@ class ImportItem(models.Model):
         ''' the goodreads shelf field '''
         if self.data['Exclusive Shelf']:
             return GOODREADS_SHELVES.get(self.data['Exclusive Shelf'])
+        return None

     @property
     def review(self):
@@ -111,12 +119,14 @@ class ImportItem(models.Model):
         ''' when the book was added to this dataset '''
         if self.data['Date Added']:
             return dateutil.parser.parse(self.data['Date Added'])
+        return None

     @property
     def date_read(self):
         ''' the date a book was completed '''
         if self.data['Date Read']:
             return dateutil.parser.parse(self.data['Date Read'])
+        return None

     @property
     def reads(self):
@@ -126,6 +136,7 @@ class ImportItem(models.Model):
             return [ReadThrough(start_date=self.date_added)]
         if self.date_read:
             return [ReadThrough(
+                start_date=self.date_added,
                 finish_date=self.date_read,
             )]
         return []
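As a side note (illustration only, not part of the commit): a finished book's read-through now carries the Goodreads "Date Added" as its start date alongside the "Date Read" finish date. A rough standalone sketch of that date handling, with ReadThrough replaced by a throwaway dataclass and a made-up CSV row:

# Rough standalone sketch of the date handling behind ImportItem.reads;
# ReadThrough here is a throwaway stand-in and the CSV row is made up.
from dataclasses import dataclass
from datetime import datetime
import dateutil.parser

@dataclass
class ReadThrough:
    start_date: datetime = None
    finish_date: datetime = None

# a Goodreads export stores dates as 'YYYY/MM/DD' strings
row = {'Date Added': '2019/04/09', 'Date Read': '2019/04/12'}
date_added = dateutil.parser.parse(row['Date Added'])
date_read = dateutil.parser.parse(row['Date Read'])

# after this change, the read-through records when the book was started too
read = ReadThrough(start_date=date_added, finish_date=date_read)
print(read.start_date, read.finish_date)
# 2019-04-09 00:00:00 2019-04-12 00:00:00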


@@ -155,51 +155,49 @@ def handle_unshelve(user, book, shelf):
     broadcast(user, activity)


-def handle_import_books(user, items):
-    ''' process a goodreads csv and then post about it '''
-    new_books = []
-    for item in items:
-        if item.shelf:
-            desired_shelf = models.Shelf.objects.get(
-                identifier=item.shelf,
-                user=user
-            )
-            if isinstance(item.book, models.Work):
-                item.book = item.book.default_edition
-            if not item.book:
-                continue
-            shelf_book, created = models.ShelfBook.objects.get_or_create(
-                book=item.book, shelf=desired_shelf, added_by=user)
-            if created:
-                new_books.append(item.book)
-                activity = shelf_book.to_add_activity(user)
-                broadcast(user, activity)
-
-        if item.rating or item.review:
-            review_title = 'Review of {!r} on Goodreads'.format(
-                item.book.title,
-            ) if item.review else ''
-
-            models.Review.objects.create(
-                user=user,
-                book=item.book,
-                name=review_title,
-                content=item.review,
-                rating=item.rating,
-            )
-        for read in item.reads:
-            read.book = item.book
-            read.user = user
-            read.save()
-
-    if new_books:
-        message = 'imported {} books'.format(len(new_books))
-        status = create_generated_note(user, message, mention_books=new_books)
-        status.save()
-        broadcast(user, status.to_create_activity(user))
-        return status
-    return None
+def handle_imported_book(user, item):
+    ''' process a goodreads csv and then post about it '''
+    if isinstance(item.book, models.Work):
+        item.book = item.book.default_edition
+    if not item.book:
+        return
+
+    if item.shelf:
+        desired_shelf = models.Shelf.objects.get(
+            identifier=item.shelf,
+            user=user
+        )
+        # shelve the book if it hasn't been shelved already
+        shelf_book, created = models.ShelfBook.objects.get_or_create(
+            book=item.book, shelf=desired_shelf, added_by=user)
+        if created:
+            broadcast(user, shelf_book.to_add_activity(user))
+
+            # only add new read-throughs if the item isn't already shelved
+            for read in item.reads:
+                read.book = item.book
+                read.user = user
+                read.save()
+
+    if item.rating or item.review:
+        review_title = 'Review of {!r} on Goodreads'.format(
+            item.book.title,
+        ) if item.review else ''
+        # we don't know the publication date of the review,
+        # but "now" is a bad guess
+        published_date_guess = item.date_read or item.date_added
+        review = models.Review.objects.create(
+            user=user,
+            book=item.book,
+            name=review_title,
+            content=item.review,
+            rating=item.rating,
+            published_date=published_date_guess,
+        )
+        # we don't need to send out pure activities because non-bookwyrm
+        # instances don't need this data
+        broadcast(user, review.to_create_activity(user))


 def handle_delete_status(user, status):
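A quick aside (not part of the diff): the published_date_guess = item.date_read or item.date_added line relies on Python's or returning the first truthy operand, so an imported review is backdated to the finish date when one exists, otherwise to the date the book was added. A tiny standalone sketch with invented dates:

# Standalone sketch of the published-date fallback; the dates are invented.
from datetime import datetime

def published_date_guess(date_read, date_added):
    # 'or' picks the first non-None date
    return date_read or date_added

print(published_date_guess(datetime(2019, 4, 12), datetime(2019, 4, 9)))  # 2019-04-12 00:00:00
print(published_date_guess(None, datetime(2019, 4, 9)))                   # 2019-04-09 00:00:00
print(published_date_guess(None, None))                                   # None when neither date is present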


@@ -24,7 +24,7 @@ class ImportJob(TestCase):
             'Number of Pages': 416,
             'Year Published': 2019,
             'Original Publication Year': 2019,
-            'Date Read': '2019/04/09',
+            'Date Read': '2019/04/12',
             'Date Added': '2019/04/09',
             'Bookshelves': '',
             'Bookshelves with positions': '',
@@ -97,11 +97,9 @@ class ImportJob(TestCase):
         self.assertEqual(actual.reads[0].finish_date, expected[0].finish_date)

     def test_read_reads(self):
-        expected = [models.ReadThrough(
-            finish_date=datetime.datetime(2019, 4, 9, 0, 0))]
         actual = models.ImportItem.objects.get(index=2)
-        self.assertEqual(actual.reads[0].start_date, expected[0].start_date)
-        self.assertEqual(actual.reads[0].finish_date, expected[0].finish_date)
+        self.assertEqual(actual.reads[0].start_date, datetime.datetime(2019, 4, 9, 0, 0))
+        self.assertEqual(actual.reads[0].finish_date, datetime.datetime(2019, 4, 12, 0, 0))

     def test_unread_reads(self):
         expected = []


@@ -489,7 +489,8 @@ def book_page(request, book_id):
     ).values_list('identifier', flat=True)

     readthroughs = models.ReadThrough.objects.filter(
-        user=request.user
+        user=request.user,
+        book=book,
     ).order_by('start_date')