Merge pull request #249 from mouse-reeve/import

Change how goodreads import writes reviews
This commit is contained in:
Mouse Reeve 2020-10-29 16:50:14 -07:00 committed by GitHub
commit f35ed0e555
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
10 changed files with 117 additions and 86 deletions

View file

@ -64,14 +64,14 @@ def load_more_data(book_id):
connector.expand_book_data(book)
def search(query):
def search(query, min_confidence=0.1):
''' find books based on arbitrary keywords '''
results = []
dedup_slug = lambda r: '%s/%s/%s' % (r.title, r.author, r.year)
result_index = set()
for connector in get_connectors():
try:
result_set = connector.search(query)
result_set = connector.search(query, min_confidence=min_confidence)
except HTTPError:
continue
@ -87,16 +87,16 @@ def search(query):
return results
def local_search(query):
def local_search(query, min_confidence=0.1):
''' only look at local search results '''
connector = load_connector(models.Connector.objects.get(local=True))
return connector.search(query)
return connector.search(query, min_confidence=min_confidence)
def first_search_result(query):
def first_search_result(query, min_confidence=0.1):
''' search until you find a result that fits '''
for connector in get_connectors():
result = connector.search(query)
result = connector.search(query, min_confidence=min_confidence)
if result:
return result[0]
return None

View file

@ -1,15 +1,17 @@
''' functionality outline for a book data connector '''
from abc import ABC, abstractmethod
from dataclasses import dataclass
from dateutil import parser
import pytz
import requests
from requests import HTTPError
from django.db import transaction
from bookwyrm import models
class ConnectorException(Exception):
class ConnectorException(HTTPError):
''' when the connector can't do what was asked '''
@ -50,7 +52,7 @@ class AbstractConnector(ABC):
return True
def search(self, query):
def search(self, query, min_confidence=None):
''' free text search '''
resp = requests.get(
'%s%s' % (self.search_url, query),
@ -155,9 +157,11 @@ class AbstractConnector(ABC):
''' for creating a new book or syncing with data '''
book = update_from_mappings(book, data, self.book_mappings)
author_text = []
for author in self.get_authors_from_data(data):
book.authors.add(author)
book.author_text = ', '.join(a.display_name for a in book.authors.all())
author_text.append(author.display_name)
book.author_text = ', '.join(author_text)
book.save()
if not update_cover:
@ -287,25 +291,29 @@ def get_date(date_string):
def get_data(url):
''' wrapper for request.get '''
resp = requests.get(
url,
headers={
'Accept': 'application/json; charset=utf-8',
},
)
try:
resp = requests.get(
url,
headers={
'Accept': 'application/json; charset=utf-8',
},
)
except ConnectionError:
raise ConnectorException()
if not resp.ok:
resp.raise_for_status()
data = resp.json()
return data
@dataclass
class SearchResult:
''' standardized search result object '''
def __init__(self, title, key, author, year):
self.title = title
self.key = key
self.author = author
self.year = year
title: str
key: str
author: str
year: str
confidence: int = 1
def __repr__(self):
return "<SearchResult key={!r} title={!r} author={!r}>".format(

View file

@ -129,10 +129,10 @@ class Connector(AbstractConnector):
key = self.books_url + search_result['key']
author = search_result.get('author_name') or ['Unknown']
return SearchResult(
search_result.get('title'),
key,
', '.join(author),
search_result.get('first_publish_year'),
title=search_result.get('title'),
key=key,
author=', '.join(author),
year=search_result.get('first_publish_year'),
)

View file

@ -7,7 +7,7 @@ from .abstract_connector import AbstractConnector, SearchResult
class Connector(AbstractConnector):
''' instantiate a connector '''
def search(self, query):
def search(self, query, min_confidence=0.1):
''' right now you can't search bookwyrm sorry, but when
that gets implemented it will totally rule '''
vector = SearchVector('title', weight='A') +\
@ -28,7 +28,7 @@ class Connector(AbstractConnector):
).annotate(
rank=SearchRank(vector, query)
).filter(
rank__gt=0
rank__gt=min_confidence
).order_by('-rank')
results = results.filter(default=True) or results
@ -42,11 +42,12 @@ class Connector(AbstractConnector):
def format_search_result(self, search_result):
return SearchResult(
search_result.title,
search_result.local_id,
search_result.author_text,
search_result.published_date.year if \
title=search_result.title,
key=search_result.local_id,
author=search_result.author_text,
year=search_result.published_date.year if \
search_result.published_date else None,
confidence=search_result.rank,
)

View file

@ -42,13 +42,10 @@ def import_data(job_id):
if item.book:
item.save()
results.append(item)
# shelves book and handles reviews
outgoing.handle_imported_book(job.user, item)
else:
item.fail_reason = "Could not match book on OpenLibrary"
item.fail_reason = "Could not find a match for book"
item.save()
status = outgoing.handle_import_books(job.user, results)
if status:
job.import_status = status
job.save()
finally:
create_notification(job.user, 'IMPORT', related_import=job)

View file

@ -0,0 +1,17 @@
# Generated by Django 3.0.7 on 2020-10-29 23:48
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('bookwyrm', '0057_auto_20201026_2131'),
]
operations = [
migrations.RemoveField(
model_name='importjob',
name='import_status',
),
]

View file

@ -40,8 +40,7 @@ class ImportJob(models.Model):
user = models.ForeignKey(User, on_delete=models.CASCADE)
created_date = models.DateTimeField(default=timezone.now)
task_id = models.CharField(max_length=100, null=True)
import_status = models.ForeignKey(
'Status', null=True, on_delete=models.PROTECT)
class ImportItem(models.Model):
''' a single line of a csv being imported '''
@ -64,13 +63,17 @@ class ImportItem(models.Model):
def get_book_from_isbn(self):
''' search by isbn '''
search_result = books_manager.first_search_result(self.isbn)
search_result = books_manager.first_search_result(
self.isbn, min_confidence=0.5
)
if search_result:
try:
# don't crash the import when the connector fails
return books_manager.get_or_create_book(search_result.key)
except ConnectorException:
pass
return None
def get_book_from_title_author(self):
''' search by title and author '''
@ -78,12 +81,16 @@ class ImportItem(models.Model):
self.data['Title'],
self.data['Author']
)
search_result = books_manager.first_search_result(search_term)
search_result = books_manager.first_search_result(
search_term, min_confidence=0.5
)
if search_result:
try:
return books_manager.get_or_create_book(search_result.key)
except ConnectorException:
pass
return None
@property
def isbn(self):
@ -95,6 +102,7 @@ class ImportItem(models.Model):
''' the goodreads shelf field '''
if self.data['Exclusive Shelf']:
return GOODREADS_SHELVES.get(self.data['Exclusive Shelf'])
return None
@property
def review(self):
@ -111,12 +119,14 @@ class ImportItem(models.Model):
''' when the book was added to this dataset '''
if self.data['Date Added']:
return dateutil.parser.parse(self.data['Date Added'])
return None
@property
def date_read(self):
''' the date a book was completed '''
if self.data['Date Read']:
return dateutil.parser.parse(self.data['Date Read'])
return None
@property
def reads(self):
@ -126,6 +136,7 @@ class ImportItem(models.Model):
return [ReadThrough(start_date=self.date_added)]
if self.date_read:
return [ReadThrough(
start_date=self.date_added,
finish_date=self.date_read,
)]
return []

View file

@ -155,51 +155,49 @@ def handle_unshelve(user, book, shelf):
broadcast(user, activity)
def handle_import_books(user, items):
def handle_imported_book(user, item):
''' process a goodreads csv and then post about it '''
new_books = []
for item in items:
if item.shelf:
desired_shelf = models.Shelf.objects.get(
identifier=item.shelf,
user=user
)
if isinstance(item.book, models.Work):
item.book = item.book.default_edition
if not item.book:
continue
shelf_book, created = models.ShelfBook.objects.get_or_create(
book=item.book, shelf=desired_shelf, added_by=user)
if created:
new_books.append(item.book)
activity = shelf_book.to_add_activity(user)
broadcast(user, activity)
if isinstance(item.book, models.Work):
item.book = item.book.default_edition
if not item.book:
return
if item.rating or item.review:
review_title = 'Review of {!r} on Goodreads'.format(
item.book.title,
) if item.review else ''
if item.shelf:
desired_shelf = models.Shelf.objects.get(
identifier=item.shelf,
user=user
)
# shelve the book if it hasn't been shelved already
shelf_book, created = models.ShelfBook.objects.get_or_create(
book=item.book, shelf=desired_shelf, added_by=user)
if created:
broadcast(user, shelf_book.to_add_activity(user))
models.Review.objects.create(
user=user,
book=item.book,
name=review_title,
content=item.review,
rating=item.rating,
)
for read in item.reads:
read.book = item.book
read.user = user
read.save()
# only add new read-throughs if the item isn't already shelved
for read in item.reads:
read.book = item.book
read.user = user
read.save()
if new_books:
message = 'imported {} books'.format(len(new_books))
status = create_generated_note(user, message, mention_books=new_books)
status.save()
if item.rating or item.review:
review_title = 'Review of {!r} on Goodreads'.format(
item.book.title,
) if item.review else ''
broadcast(user, status.to_create_activity(user))
return status
return None
# we don't know the publication date of the review,
# but "now" is a bad guess
published_date_guess = item.date_read or item.date_added
review = models.Review.objects.create(
user=user,
book=item.book,
name=review_title,
content=item.review,
rating=item.rating,
published_date=published_date_guess,
)
# we don't need to send out pure activities because non-bookwyrm
# instances don't need this data
broadcast(user, review.to_create_activity(user))
def handle_delete_status(user, status):

View file

@ -24,7 +24,7 @@ class ImportJob(TestCase):
'Number of Pages': 416,
'Year Published': 2019,
'Original Publication Year': 2019,
'Date Read': '2019/04/09',
'Date Read': '2019/04/12',
'Date Added': '2019/04/09',
'Bookshelves': '',
'Bookshelves with positions': '',
@ -97,11 +97,9 @@ class ImportJob(TestCase):
self.assertEqual(actual.reads[0].finish_date, expected[0].finish_date)
def test_read_reads(self):
expected = [models.ReadThrough(
finish_date=datetime.datetime(2019, 4, 9, 0, 0))]
actual = models.ImportItem.objects.get(index=2)
self.assertEqual(actual.reads[0].start_date, expected[0].start_date)
self.assertEqual(actual.reads[0].finish_date, expected[0].finish_date)
self.assertEqual(actual.reads[0].start_date, datetime.datetime(2019, 4, 9, 0, 0))
self.assertEqual(actual.reads[0].finish_date, datetime.datetime(2019, 4, 12, 0, 0))
def test_unread_reads(self):
expected = []

View file

@ -489,7 +489,8 @@ def book_page(request, book_id):
).values_list('identifier', flat=True)
readthroughs = models.ReadThrough.objects.filter(
user=request.user
user=request.user,
book=book,
).order_by('start_date')