Merge pull request #150 from cthulahoops/async_import

Async import
This commit is contained in:
Mouse Reeve 2020-04-27 10:15:43 -07:00 committed by GitHub
commit 4f9edae05a
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
19 changed files with 423 additions and 174 deletions

View file

@ -1,126 +1,61 @@
''' handle reading a csv from goodreads '''
import re
import csv
import itertools
import dateutil.parser
from requests import HTTPError
from fedireads import books_manager
from fedireads.models import Edition, ReadThrough
from fedireads import outgoing
from fedireads.tasks import app
from fedireads.models import ImportJob, ImportItem
from fedireads.status import create_notification
# Mapping goodreads -> fedireads shelf titles.
GOODREADS_SHELVES = {
'read': 'read',
'currently-reading': 'reading',
'to-read': 'to-read',
}
# TODO: remove or notify about this in the UI
MAX_ENTRIES = 20
def unquote_string(text):
    ''' strip the ="..." wrapper goodreads puts around some csv values '''
    wrapped = re.match(r'="([^"]*)"', text)
    return wrapped.group(1) if wrapped else text
def create_job(user, csv_file):
    ''' create an ImportJob for a user with one ImportItem per csv row

    user: the user the new job belongs to
    csv_file: file-like object yielding lines of csv text, header row first
    returns the newly created ImportJob

    Only the first MAX_ENTRIES rows are stored; later rows are never read.
    '''
    # function-scope import so this block does not rely on the module header
    from itertools import islice

    job = ImportJob.objects.create(user=user)
    # islice stops after MAX_ENTRIES rows instead of parsing the whole
    # upload into a list just to throw most of it away
    rows = islice(csv.DictReader(csv_file), MAX_ENTRIES)
    for index, entry in enumerate(rows):
        ImportItem(job=job, index=index, data=entry).save()
    return job
def start_import(job):
    ''' queue the import task for a job and remember the task's id on it '''
    async_result = import_data.delay(job.id)
    job.task_id = async_result.id
    job.save()
def construct_search_term(title, author):
''' formulate a query for the data connector '''
# Strip brackets (usually a series title) from the search term.
title = re.sub(r'\s*\([^)]*\)\s*', '', title)
# Open library doesn't like including author initials in search term.
author = re.sub(r'(\w\.)+\s*', '', author)
# Task registered with @app.task: loads the ImportJob with the given id and
# tries to match each of its items to a book.
@app.task
def import_data(job_id):
job = ImportJob.objects.get(id=job_id)
try:
# items that were matched to a book, and the subset of those that also
# carry a rating or a review
results = []
reviews = []
for item in job.items.all():
try:
item.resolve()
except HTTPError:
# an HTTP error during lookup is ignored; the item is then judged
# purely on whether item.book is set
pass
if item.book:
item.save()
results.append(item)
if item.rating or item.review:
reviews.append(item)
else:
# stored on the item so the reason for the failure is kept with it
item.fail_reason = "Could not match book on OpenLibrary"
item.save()
return ' '.join([title, author])
class GoodreadsCsv:
    ''' iterable wrapper around a goodreads csv export '''

    def __init__(self, csv_file):
        self.reader = csv.DictReader(csv_file)

    def __iter__(self):
        # only the first MAX_ENTRIES rows are ever handed out
        capped_rows = itertools.islice(self.reader, MAX_ENTRIES)
        for row in capped_rows:
            yield GoodreadsItem(row)
class GoodreadsItem:
    ''' a processed line in a goodreads csv

    Wraps one csv row (a dict keyed by goodreads column name), exposes the
    columns the importer uses as properties, and can try to match the row
    to a book via `resolve`.
    '''

    def __init__(self, line):
        self.line = line
        # filled in by resolve(); stays None when no book could be matched
        self.book = None

    def resolve(self):
        ''' try various ways to lookup a book, stopping at the first hit '''
        self.book = (
            self.get_book_from_db_isbn() or
            self.get_book_from_isbn() or
            self.get_book_from_title_author()
        )

    def get_book_from_db_isbn(self):
        ''' see if we already know about the book '''
        isbn = self.isbn
        if not isbn:
            # a blank isbn cannot identify a book, so don't query with it
            return None
        # filter().first() returns None when nothing matches and does not
        # raise when several editions share the isbn, unlike get()
        return Edition.objects.filter(isbn=isbn).first()

    def get_book_from_isbn(self):
        ''' search by isbn '''
        isbn = self.isbn
        if not isbn:
            # nothing to search for
            return None
        search_results = books_manager.search(isbn)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    def get_book_from_title_author(self):
        ''' search by title and author '''
        search_term = construct_search_term(
            self.line['Title'],
            self.line['Author']
        )
        search_results = books_manager.search(search_term)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    @property
    def isbn(self):
        ''' the ISBN13 column with any ="..." wrapper removed '''
        return unquote_string(self.line['ISBN13'])

    @property
    def shelf(self):
        ''' the goodreads shelf field, mapped to a fedireads shelf name

        Returns None for a blank shelf and for shelves that have no entry
        in GOODREADS_SHELVES (instead of raising KeyError).
        '''
        return GOODREADS_SHELVES.get(self.line['Exclusive Shelf'])

    @property
    def review(self):
        ''' the text of the "My Review" column '''
        return self.line['My Review']

    @property
    def rating(self):
        ''' the "My Rating" column as an int; 0 when blank or not a number '''
        try:
            return int(self.line.get('My Rating') or 0)
        except ValueError:
            # one unparseable rating should not abort the whole import
            return 0

    @property
    def date_added(self):
        ''' parsed "Date Added" column, or None when it is blank '''
        if self.line['Date Added']:
            return dateutil.parser.parse(self.line['Date Added'])
        return None

    @property
    def date_read(self):
        ''' parsed "Date Read" column, or None when it is blank '''
        if self.line['Date Read']:
            return dateutil.parser.parse(self.line['Date Read'])
        return None

    @property
    def reads(self):
        ''' a single-element list holding a ReadThrough for this row '''
        return [ReadThrough(
            # Date added isn't the start date, but it's (perhaps) better
            # than nothing.
            start_date=self.date_added,
            finish_date=self.date_read,
            pages_read=None,
        )]

    def __repr__(self):
        return "<GoodreadsItem {!r}>".format(self.line['Title'])

    def __str__(self):
        return "{} by {}".format(self.line['Title'], self.line['Author'])
# hand every matched item to outgoing.handle_import_books; whatever it
# returns is stored on the job further down
status = outgoing.handle_import_books(job.user, results)
for item in reviews:
# items with a rating but no review text get an empty title
review_title = "Review of {!r} on Goodreads".format(
item.book.title,
) if item.review else ""
outgoing.handle_review(
job.user,
item.book,
review_title,
item.review,
item.rating,
)
if status:
job.import_status = status
job.save()
finally:
# runs whether or not the code above raised, so the notification is
# created for failed imports too
create_notification(job.user, 'IMPORT', related_import=job)

View file

@ -14,6 +14,7 @@ from fedireads import models, outgoing
from fedireads import status as status_builder
from fedireads.remote_user import get_or_create_remote_user
from fedireads.tasks import app
from fedireads.status import create_notification
@csrf_exempt

View file

@ -0,0 +1,60 @@
# Generated by Django 3.0.3 on 2020-04-21 13:47
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
import fedireads.utils.fields
# Creates the ImportItem and ImportJob models and adds 'IMPORT_RESULT' to
# the allowed notification types.
class Migration(migrations.Migration):
dependencies = [
('fedireads', '0031_readthrough'),
]
operations = [
# Both models are created without their foreign keys; the AddField
# operations at the end of this list attach them.
migrations.CreateModel(
name='ImportItem',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('data', fedireads.utils.fields.JSONField()),
],
),
migrations.CreateModel(
name='ImportJob',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('created_date', models.DateTimeField(default=django.utils.timezone.now)),
('task_id', models.CharField(max_length=100, null=True)),
],
),
# The check constraint lists the allowed notification types, so it is
# removed, the field's choices are changed, and the constraint is added
# back with the new list.
migrations.RemoveConstraint(
model_name='notification',
name='notification_type_valid',
),
migrations.AlterField(
model_name='notification',
name='notification_type',
field=models.CharField(choices=[('FAVORITE', 'Favorite'), ('REPLY', 'Reply'), ('TAG', 'Tag'), ('FOLLOW', 'Follow'), ('FOLLOW_REQUEST', 'Follow Request'), ('BOOST', 'Boost'), ('IMPORT_RESULT', 'Import Result')], max_length=255),
),
migrations.AddConstraint(
model_name='notification',
constraint=models.CheckConstraint(check=models.Q(notification_type__in=['FAVORITE', 'REPLY', 'TAG', 'FOLLOW', 'FOLLOW_REQUEST', 'BOOST', 'IMPORT_RESULT']), name='notification_type_valid'),
),
# Foreign keys for the two models created above.
migrations.AddField(
model_name='importjob',
name='user',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
migrations.AddField(
model_name='importitem',
name='book',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='fedireads.Book'),
),
# related_name='items' is what makes job.items available on ImportJob.
migrations.AddField(
model_name='importitem',
name='job',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='items', to='fedireads.ImportJob'),
),
]

View file

@ -0,0 +1,43 @@
# Generated by Django 3.0.3 on 2020-04-22 12:49
from django.db import migrations, models
import django.db.models.deletion
# Adds fail_reason and index to ImportItem, links notifications to an
# ImportJob, and replaces the 'IMPORT_RESULT' notification type with 'IMPORT'.
class Migration(migrations.Migration):
dependencies = [
('fedireads', '0032_auto_20200421_1347'),
]
operations = [
# The constraint is removed here and added back as the last operation
# with the updated list of notification types.
migrations.RemoveConstraint(
model_name='notification',
name='notification_type_valid',
),
migrations.AddField(
model_name='importitem',
name='fail_reason',
field=models.TextField(null=True),
),
# default=1 with preserve_default=False: the default is only used to
# fill existing rows while the column is added; it is not kept on the
# field afterwards.
migrations.AddField(
model_name='importitem',
name='index',
field=models.IntegerField(default=1),
preserve_default=False,
),
migrations.AddField(
model_name='notification',
name='related_import',
field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.PROTECT, to='fedireads.ImportJob'),
),
# 'IMPORT' takes the place of the previous migration's 'IMPORT_RESULT'.
migrations.AlterField(
model_name='notification',
name='notification_type',
field=models.CharField(choices=[('FAVORITE', 'Favorite'), ('REPLY', 'Reply'), ('TAG', 'Tag'), ('FOLLOW', 'Follow'), ('FOLLOW_REQUEST', 'Follow Request'), ('BOOST', 'Boost'), ('IMPORT', 'Import')], max_length=255),
),
migrations.AddConstraint(
model_name='notification',
constraint=models.CheckConstraint(check=models.Q(notification_type__in=['FAVORITE', 'REPLY', 'TAG', 'FOLLOW', 'FOLLOW_REQUEST', 'BOOST', 'IMPORT']), name='notification_type_valid'),
),
]

View file

@ -0,0 +1,19 @@
# Generated by Django 3.0.3 on 2020-04-22 13:12
from django.db import migrations, models
import django.db.models.deletion
# Adds the nullable import_status foreign key from ImportJob to Status.
class Migration(migrations.Migration):
dependencies = [
('fedireads', '0033_auto_20200422_1249'),
]
operations = [
# PROTECT: a Status that is referenced by an import job cannot be deleted.
migrations.AddField(
model_name='importjob',
name='import_status',
field=models.ForeignKey(null=True, on_delete=django.db.models.deletion.PROTECT, to='fedireads.Status'),
),
]

View file

@ -5,3 +5,4 @@ from .status import Status, Review, Comment, Quotation
from .status import Favorite, Boost, Tag, Notification, ReadThrough
from .user import User, UserFollows, UserFollowRequest, UserBlocks
from .user import FederatedServer
from .import_job import ImportJob, ImportItem

View file

@ -0,0 +1,127 @@
import re
import dateutil.parser
from django.db import models
from django.utils import timezone
from fedireads import books_manager
from fedireads.models import Edition, ReadThrough, User, Book
from fedireads.utils.fields import JSONField
# Mapping goodreads -> fedireads shelf titles.
GOODREADS_SHELVES = {
'read': 'read',
'currently-reading': 'reading',
'to-read': 'to-read',
}
def unquote_string(text):
    ''' resolve csv quote weirdness: turn ="value" into value '''
    found = re.match(r'="([^"]*)"', text)
    if found is None:
        # not wrapped, hand the text back untouched
        return text
    return found.group(1)
def construct_search_term(title, author):
    ''' formulate a query for the data connector

    title: the book title as it appears in the goodreads csv
    author: the author name as it appears in the goodreads csv
    returns "<cleaned title> <cleaned author>" as a single search string
    '''
    # Strip brackets (usually a series title) from the search term. A space
    # is substituted so words on either side of the bracket stay separate.
    title = re.sub(r'\s*\([^)]*\)\s*', ' ', title).strip()
    # Open library doesn't like including author initials in search term.
    # The \b restricts the match to standalone initials ("J.R.R.", "K.") so
    # the last letter of an abbreviation such as "Dr." or "Jr." is kept.
    author = re.sub(r'\b(\w\.)+\s*', '', author).strip()
    return ' '.join([title, author])
class ImportJob(models.Model):
    ''' one csv import requested by a user '''
    # owner of the import; the job is deleted together with the user
    user = models.ForeignKey(User, on_delete=models.CASCADE)
    created_date = models.DateTimeField(default=timezone.now)
    # id of the task started for this job; null until one is recorded
    task_id = models.CharField(null=True, max_length=100)
    # optional link to a Status; PROTECT blocks deleting a linked status
    import_status = models.ForeignKey(
        'Status',
        on_delete=models.PROTECT,
        null=True,
    )
class ImportItem(models.Model):
    ''' a single row of an imported goodreads csv

    The raw row is kept in `data` (a dict keyed by goodreads column name);
    the properties below read the columns the importer uses, and `resolve`
    tries to match the row to a book.
    '''
    job = models.ForeignKey(
        ImportJob,
        on_delete=models.CASCADE,
        related_name='items')
    # position of the row within the job
    index = models.IntegerField()
    data = JSONField()
    # the matched book; stays null when no match was found
    book = models.ForeignKey(
        Book, on_delete=models.SET_NULL, null=True, blank=True)
    fail_reason = models.TextField(null=True)

    def resolve(self):
        ''' try various ways to lookup a book, stopping at the first hit '''
        self.book = (
            self.get_book_from_db_isbn() or
            self.get_book_from_isbn() or
            self.get_book_from_title_author()
        )

    def get_book_from_db_isbn(self):
        ''' see if we already know about the book '''
        isbn = self.isbn
        if not isbn:
            # a blank isbn cannot identify a book, so don't query with it
            return None
        # filter().first() returns None when nothing matches and does not
        # raise when several editions share the isbn, unlike get()
        return Edition.objects.filter(isbn=isbn).first()

    def get_book_from_isbn(self):
        ''' search by isbn '''
        isbn = self.isbn
        if not isbn:
            # nothing to search for
            return None
        search_results = books_manager.search(isbn)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    def get_book_from_title_author(self):
        ''' search by title and author '''
        search_term = construct_search_term(
            self.data['Title'],
            self.data['Author']
        )
        search_results = books_manager.search(search_term)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    @property
    def isbn(self):
        ''' the ISBN13 column with any ="..." wrapper removed '''
        return unquote_string(self.data['ISBN13'])

    @property
    def shelf(self):
        ''' the goodreads shelf field, mapped to a fedireads shelf name

        Returns None for a blank shelf and for shelves that have no entry
        in GOODREADS_SHELVES (instead of raising KeyError).
        '''
        return GOODREADS_SHELVES.get(self.data['Exclusive Shelf'])

    @property
    def review(self):
        ''' the text of the "My Review" column '''
        return self.data['My Review']

    @property
    def rating(self):
        ''' the "My Rating" column as an int; 0 when blank or not a number '''
        try:
            return int(self.data.get('My Rating') or 0)
        except ValueError:
            # one unparseable rating should not abort the whole import
            return 0

    @property
    def date_added(self):
        ''' parsed "Date Added" column, or None when it is blank '''
        if self.data['Date Added']:
            return dateutil.parser.parse(self.data['Date Added'])
        return None

    @property
    def date_read(self):
        ''' parsed "Date Read" column, or None when it is blank '''
        if self.data['Date Read']:
            return dateutil.parser.parse(self.data['Date Read'])
        return None

    @property
    def reads(self):
        ''' unsaved ReadThrough objects implied by this row (maybe none) '''
        # on the 'reading' shelf and not finished: only a start date is known
        if (self.shelf == 'reading'
                and self.date_added and not self.date_read):
            return [ReadThrough(start_date=self.date_added)]
        if self.date_read:
            return [ReadThrough(
                finish_date=self.date_read,
            )]
        return []

    def __repr__(self):
        # name the actual class; the old text still said GoodreadsItem
        return "<ImportItem {!r}>".format(self.data['Title'])

    def __str__(self):
        return "{} by {}".format(self.data['Title'], self.data['Author'])

View file

@ -152,7 +152,8 @@ class ReadThrough(FedireadsModel):
NotificationType = models.TextChoices(
'NotificationType', 'FAVORITE REPLY TAG FOLLOW FOLLOW_REQUEST BOOST')
'NotificationType',
'FAVORITE REPLY TAG FOLLOW FOLLOW_REQUEST BOOST IMPORT')
class Notification(FedireadsModel):
''' you've been tagged, liked, followed, etc '''
@ -164,6 +165,8 @@ class Notification(FedireadsModel):
on_delete=models.PROTECT, null=True, related_name='related_user')
related_status = models.ForeignKey(
'Status', on_delete=models.PROTECT, null=True)
related_import = models.ForeignKey(
'ImportJob', on_delete=models.PROTECT, null=True)
read = models.BooleanField(default=False)
notification_type = models.CharField(
max_length=255, choices=NotificationType.choices)

View file

@ -158,7 +158,7 @@ def handle_shelve(user, book, shelf):
activity = activitypub.get_status(status)
create_activity = activitypub.get_create(user, activity)
broadcast(user, create_activity, recipients)
broadcast(user, create_activity)
def handle_unshelve(user, book, shelf):
@ -206,6 +206,7 @@ def handle_import_books(user, items):
create_activity = activitypub.get_create(
user, activitypub.get_status(status))
broadcast(user, create_activity)
return status
def handle_rate(user, book, rating):

View file

@ -234,7 +234,7 @@ def create_tag(user, possible_book, name):
def create_notification(user, notification_type, related_user=None, \
related_book=None, related_status=None):
related_book=None, related_status=None, related_import=None):
''' let a user know when someone interacts with their content '''
if user == related_user:
# don't create notification when you interact with your own stuff
@ -244,6 +244,7 @@ def create_notification(user, notification_type, related_user=None, \
related_book=related_book,
related_user=related_user,
related_status=related_status,
related_import=related_import,
notification_type=notification_type,
)

View file

@ -1,4 +1,5 @@
{% extends 'layout.html' %}
{% load humanize %}
{% block content %}
<div class="content-container">
<h2>Import Books from GoodReads</h2>
@ -6,7 +7,13 @@
{% csrf_token %}
{{ import_form.as_p }}
<button type="submit">Import</button>
<small>Hang tight, this may take a minute!</small>
</form>
<h2>Recent Imports</h2>
<ul>
{% for job in jobs %}
<li><a href="/import_status/{{ job.id }}">{{ job.created_date | naturaltime }}</a></li>
{% endfor %}
</ul>
</div>
{% endblock %}

View file

@ -1,18 +0,0 @@
{% extends 'layout.html' %}
{% block content %}
<div id="content">
<div>
<h1>The following books could not be imported: </h1>
<ul>
{% for item in failures %}
<li>
{{ item }}
</li>
{% endfor %}
</ul>
<p>{{ success_count }} books imported successfully</p>
</div>
</div>
{% endblock %}

View file

@ -0,0 +1,69 @@
{% extends 'layout.html' %}
{% load fr_display %}
{% load humanize %}
{% block content %}
<div id="content">
<div>
<h1>Import Status</h1>
<p>
Import started: {{ job.created_date | naturaltime }}
<p>
{% if task.ready %}
Import completed: {{ task.date_done | naturaltime }}
{% if task.failed %}
<h3><span style="background-color: #ffaaaa;">TASK FAILED</span></h3>
<p>
{{ task.info }}
{% endif %}
{% if job.import_status %}
{% include 'snippets/status.html' with status=job.import_status %}
{% endif %}
{% else %}
Import still in progress.
<p>
(Hit reload to update!)
{% endif %}
<table>
<tr>
<th>
Book
</th>
<th>
Title
</th>
<th>
Author
</th>
<th>
</th>
</tr>
{% for item in items %}
<tr>
<td>
{% if item.book %}
<a href="{{ item.book.absolute_id }}">
{% include 'snippets/book_cover.html' with book=item.book size='small' %}
</a>
{% endif %}
</td>
<td>
{{ item.data|dict_key:'Title' }}
</td>
<td>
{{ item.data|dict_key:'Author' }}
</td>
<td>
{% if item.book %}✓
{% elif item.fail_reason %}
{{ item.fail_reason }}
{% endif %}
</td>
</tr>
{% endfor %}
</table>
</div>
</div>
{% endblock %}

View file

@ -14,6 +14,7 @@
{% for notification in notifications %}
<div class="notification{% if notification.id in unread %} unread{% endif %}">
<small class="time-ago">{{ notification.created_date | naturaltime }}</small>
{% if notification.related_user %}
{% include 'snippets/username.html' with user=notification.related_user %}
{% if notification.notification_type == 'FAVORITE' %}
favorited your
@ -36,6 +37,10 @@
{% elif notification.notification_type == 'BOOST' %}
boosted your <a href="{{ notification.related_status.absolute_id}}">status</a>
{% endif %}
{% else %}
your <a href="/import_status/{{ notification.related_import.id }}">import</a> completed.
{% endif %}
</div>
{% endfor %}
{% if not notifications %}

View file

@ -40,6 +40,7 @@ urlpatterns = [
re_path(r'^notifications/?', views.notifications_page),
re_path(r'books/?$', views.books_page),
re_path(r'import/?$', views.import_page),
re_path(r'import_status/(\d+)/?$', views.import_status),
re_path(r'user-edit/?$', views.edit_profile_page),
# should return a ui view or activitypub json blob as requested

View file

@ -2,7 +2,6 @@
from io import BytesIO, TextIOWrapper
import re
from PIL import Image
from requests import HTTPError
from django.contrib.auth import authenticate, login, logout
from django.contrib.auth.decorators import login_required
@ -12,7 +11,7 @@ from django.shortcuts import redirect
from django.template.response import TemplateResponse
from fedireads import forms, models, books_manager, outgoing
from fedireads.goodreads_import import GoodreadsCsv
from fedireads import goodreads_import
from fedireads.settings import DOMAIN
from fedireads.views import get_user_from_username
from fedireads.books_manager import get_or_create_book
@ -419,37 +418,10 @@ def import_data(request):
''' ingest a goodreads csv '''
form = forms.ImportForm(request.POST, request.FILES)
if form.is_valid():
results = []
reviews = []
failures = []
for item in GoodreadsCsv(TextIOWrapper(
request.FILES['csv_file'],
encoding=request.encoding)):
try:
item.resolve()
except HTTPError:
pass
if item.book:
results.append(item)
if item.rating or item.review:
reviews.append(item)
else:
failures.append(item)
outgoing.handle_import_books(request.user, results)
for item in reviews:
review_title = "Review of {!r} on Goodreads".format(
item.book.title,
) if item.review else ""
outgoing.handle_review(
request.user,
item.book,
review_title,
item.review,
item.rating,
)
return TemplateResponse(request, 'import_results.html', {
'success_count': len(results),
'failures': failures,
})
job = goodreads_import.create_job(
request.user,
TextIOWrapper(request.FILES['csv_file'], encoding=request.encoding)
)
goodreads_import.start_import(job)
return redirect('/import_status/%d' % (job.id,))
return HttpResponseBadRequest()

View file

@ -1,13 +1,15 @@
''' views for pages you can go to in the application '''
from django.contrib.auth.decorators import login_required
from django.db.models import Avg, Q
from django.http import HttpResponseBadRequest, HttpResponseNotFound, \
from django.http import HttpResponseBadRequest, HttpResponseNotFound,\
JsonResponse
from django.core.exceptions import PermissionDenied
from django.template.response import TemplateResponse
from django.views.decorators.csrf import csrf_exempt
from fedireads import activitypub
from fedireads import forms, models, books_manager
from fedireads.tasks import app
def get_user_from_username(username):
@ -158,9 +160,24 @@ def import_page(request):
''' import history from goodreads '''
return TemplateResponse(request, 'import.html', {
'import_form': forms.ImportForm(),
'jobs': models.ImportJob.
objects.filter(user=request.user).order_by('-created_date'),
})
@login_required
def import_status(request, job_id):
    ''' status of an import job

    request: the incoming request; its user must own the job
    job_id: id of the ImportJob, as captured from the url
    returns the rendered status page, or a 404 response for an unknown id
    raises PermissionDenied when the job belongs to a different user
    '''
    try:
        job = models.ImportJob.objects.get(id=job_id)
    except models.ImportJob.DoesNotExist:
        # an unknown id is the client's mistake, not a server error
        return HttpResponseNotFound()
    if job.user != request.user:
        raise PermissionDenied
    task = app.AsyncResult(job.task_id)
    return TemplateResponse(request, 'import_status.html', {
        'job': job,
        'items': job.items.order_by('index'),
        'task': task,
    })
def login_page(request):
''' authentication '''

6
fr-dev
View file

@ -34,7 +34,11 @@ case "$1" in
dbshell)
docker-compose exec db psql -U fedireads fedireads
;;
restart_celery)
docker-compose restart celery_worker
;;
*)
echo "Unrecognised command. Try: up, initdb, resetdb,makemigrations, migrate, shell, dbshell "
echo "Unrecognised command. Try: up, initdb, resetdb,makemigrations, migrate, shell, dbshell,restart_celery"
;;
esac

View file

@ -21,4 +21,5 @@ app.autodiscover_tasks()
app.autodiscover_tasks(['fedireads'], related_name='incoming')
app.autodiscover_tasks(['fedireads'], related_name='broadcast')
app.autodiscover_tasks(['fedireads'], related_name='books_manager')
app.autodiscover_tasks(['fedireads'], related_name='goodreads_import')