Store csv in the database and then import via celery.

This commit is contained in:
Adam Kelly 2020-04-21 15:09:21 +01:00
parent 881cc4d64b
commit 0bf73fef24
10 changed files with 293 additions and 133 deletions

View file

@ -1,65 +1,46 @@
''' handle reading a csv from goodreads ''' ''' handle reading a csv from goodreads '''
import re
import csv import csv
import dateutil.parser
from requests import HTTPError from requests import HTTPError
from fedireads import books_manager
from fedireads import outgoing from fedireads import outgoing
from fedireads.models import Edition, ReadThrough, User
from fedireads.tasks import app from fedireads.tasks import app
from fedireads.models import ImportJob, ImportItem
# Mapping goodreads -> fedireads shelf titles.
# Keys are the values found in the csv's 'Exclusive Shelf' column; each value
# is the fedireads shelf that goodreads shelf is translated to.
GOODREADS_SHELVES = {
'read': 'read',
'currently-reading': 'reading',
'to-read': 'to-read',
}
# TODO: remove or notify about this in the UI # TODO: remove or notify about this in the UI
MAX_ENTRIES = 20 MAX_ENTRIES = 20
def unquote_string(text): def create_job(user, csv_file):
''' resolve csv quote weirdness ''' job = ImportJob.objects.create(user=user)
match = re.match(r'="([^"]*)"', text) for index, entry in enumerate(list(csv.DictReader(csv_file))[:MAX_ENTRIES]):
if match: ImportItem(job=job, index=index, data=entry).save()
return match.group(1) return job
return text
def start_import(job):
def construct_search_term(title, author): result = import_data.delay(job.id)
''' formulate a query for the data connector ''' job.task_id = result.id
# Strip brackets (usually series title from search term) job.save()
title = re.sub(r'\s*\([^)]*\)\s*', '', title)
# Open library doesn't like including author initials in search term.
author = re.sub(r'(\w\.)+\s*', '', author)
return ' '.join([title, author])
def async_import(user, csv_file):
    ''' queue a background import of the first MAX_ENTRIES rows of a csv

    user: the user the import belongs to (only its id is sent to the task)
    csv_file: a text-mode file object holding the csv, header row included

    Returns whatever import_data.delay returns for the queued task.
    '''
    from itertools import islice
    # Stop reading after MAX_ENTRIES rows rather than parsing the whole
    # upload into memory just to throw most of it away.
    entries = list(islice(csv.DictReader(csv_file), MAX_ENTRIES))
    return import_data.delay(user.id, entries)
@app.task @app.task
def import_data(user_id, entries): def import_data(job_id):
user = User.objects.get(pk=user_id) job = ImportJob.objects.get(id=job_id)
user = job.user
results = [] results = []
reviews = [] reviews = []
failures = [] for item in job.items.all():
for item in entries:
item = GoodreadsItem(item)
try: try:
item.resolve() item.resolve()
except HTTPError: except HTTPError:
pass pass
if item.book: if item.book:
item.save()
results.append(item) results.append(item)
if item.rating or item.review: if item.rating or item.review:
reviews.append(item) reviews.append(item)
else: else:
failures.append(item) item.fail_reason = "Could not match book on OpenLibrary"
item.save()
outgoing.handle_import_books(user, results) outgoing.handle_import_books(user, results)
for item in reviews: for item in reviews:
@ -73,84 +54,3 @@ def import_data(user_id, entries):
item.review, item.review,
item.rating, item.rating,
) )
class GoodreadsItem:
    ''' a processed line in a goodreads csv '''

    def __init__(self, line):
        ''' wrap one csv row, a dict keyed by goodreads column name '''
        self.line = line
        # filled in by resolve(); stays None when no lookup succeeds
        self.book = None

    def resolve(self):
        ''' try various ways to lookup a book '''
        # cheapest lookup first: local database, then remote search by isbn,
        # then remote search by title and author
        self.book = (
            self.get_book_from_db_isbn() or
            self.get_book_from_isbn() or
            self.get_book_from_title_author()
        )

    def get_book_from_db_isbn(self):
        ''' see if we already know about the book '''
        isbn = self.isbn
        if not isbn:
            # a blank isbn identifies nothing, so don't match on it
            return None
        try:
            return Edition.objects.get(isbn=isbn)
        except Edition.DoesNotExist:
            return None

    def get_book_from_isbn(self):
        ''' search by isbn '''
        isbn = self.isbn
        if not isbn:
            # don't send an empty query to the search backend
            return None
        search_results = books_manager.search(isbn)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    def get_book_from_title_author(self):
        ''' search by title and author '''
        search_term = construct_search_term(
            self.line['Title'],
            self.line['Author']
        )
        search_results = books_manager.search(search_term)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    @property
    def isbn(self):
        ''' the ISBN13 column with its ="..." wrapper removed '''
        return unquote_string(self.line['ISBN13'] or '')

    @property
    def shelf(self):
        ''' the goodreads shelf field '''
        # .get() so that a blank shelf, or an exclusive shelf we have no
        # mapping for, yields None instead of raising KeyError
        return GOODREADS_SHELVES.get(self.line['Exclusive Shelf'])

    @property
    def review(self):
        ''' the text of the 'My Review' column '''
        return self.line['My Review']

    @property
    def rating(self):
        ''' the 'My Rating' column as an int; a blank cell counts as 0 '''
        return int(self.line['My Rating'] or 0)

    @property
    def date_added(self):
        ''' parsed 'Date Added' column, or None when it is blank '''
        if self.line['Date Added']:
            return dateutil.parser.parse(self.line['Date Added'])
        return None

    @property
    def date_read(self):
        ''' parsed 'Date Read' column, or None when it is blank '''
        if self.line['Date Read']:
            return dateutil.parser.parse(self.line['Date Read'])
        return None

    @property
    def reads(self):
        ''' a one-element list holding an unsaved ReadThrough for this row '''
        return [ReadThrough(
            # Date added isn't the start date, but it's (perhaps) better than nothing.
            start_date=self.date_added,
            finish_date=self.date_read,
            pages_read=None,
        )]

    def __repr__(self):
        return "<GoodreadsItem {!r}>".format(self.line['Title'])

    def __str__(self):
        return "{} by {}".format(self.line['Title'], self.line['Author'])

View file

@ -0,0 +1,60 @@
# Generated by Django 3.0.3 on 2020-04-21 13:47
from django.conf import settings
from django.db import migrations, models
import django.db.models.deletion
import django.utils.timezone
import fedireads.utils.fields
# Schema migration that creates the ImportItem and ImportJob models, adds
# 'IMPORT_RESULT' to the allowed notification types, and wires up the foreign
# keys between import items, import jobs, users and books.
class Migration(migrations.Migration):
# Must run after the migration that introduced ReadThrough.
dependencies = [
('fedireads', '0031_readthrough'),
]
operations = [
# NOTE(review): ImportItem is created with only id/data here (book and job
# are added further down). The ImportItem model in this commit also declares
# `index` and `fail_reason`; confirm another migration adds those columns.
migrations.CreateModel(
name='ImportItem',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('data', fedireads.utils.fields.JSONField()),
],
),
migrations.CreateModel(
name='ImportJob',
fields=[
('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('created_date', models.DateTimeField(default=django.utils.timezone.now)),
('task_id', models.CharField(max_length=100, null=True)),
],
),
# The notification_type check constraint is removed, the field's choices are
# altered, and the constraint is re-added below with 'IMPORT_RESULT' included.
migrations.RemoveConstraint(
model_name='notification',
name='notification_type_valid',
),
migrations.AlterField(
model_name='notification',
name='notification_type',
field=models.CharField(choices=[('FAVORITE', 'Favorite'), ('REPLY', 'Reply'), ('TAG', 'Tag'), ('FOLLOW', 'Follow'), ('FOLLOW_REQUEST', 'Follow Request'), ('BOOST', 'Boost'), ('IMPORT_RESULT', 'Import Result')], max_length=255),
),
migrations.AddConstraint(
model_name='notification',
constraint=models.CheckConstraint(check=models.Q(notification_type__in=['FAVORITE', 'REPLY', 'TAG', 'FOLLOW', 'FOLLOW_REQUEST', 'BOOST', 'IMPORT_RESULT']), name='notification_type_valid'),
),
# Each import job belongs to one user; deleting the user deletes the job.
migrations.AddField(
model_name='importjob',
name='user',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
),
# Optional link to a book; set to NULL if that book is deleted.
migrations.AddField(
model_name='importitem',
name='book',
field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='fedireads.Book'),
),
# Items are reachable from their job as `job.items` and are deleted with it.
migrations.AddField(
model_name='importitem',
name='job',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='items', to='fedireads.ImportJob'),
),
]

View file

@ -5,3 +5,4 @@ from .status import Status, Review, Comment, Quotation
from .status import Favorite, Boost, Tag, Notification, ReadThrough from .status import Favorite, Boost, Tag, Notification, ReadThrough
from .user import User, UserFollows, UserFollowRequest, UserBlocks from .user import User, UserFollows, UserFollowRequest, UserBlocks
from .user import FederatedServer from .user import FederatedServer
from .import_job import ImportJob, ImportItem

View file

@ -0,0 +1,123 @@
import re
import dateutil.parser
from django.db import models
from django.utils import timezone
from fedireads import books_manager
from fedireads.models import Edition, ReadThrough, User, Book
from fedireads.utils.fields import JSONField
# Mapping goodreads -> fedireads shelf titles.
# Keys are the values found in the csv's 'Exclusive Shelf' column; each value
# is the fedireads shelf that goodreads shelf is translated to.
GOODREADS_SHELVES = {
'read': 'read',
'currently-reading': 'reading',
'to-read': 'to-read',
}
def unquote_string(text):
    ''' unwrap a csv value written as ="..."; other text is returned as is '''
    wrapped = re.match(r'="([^"]*)"', text)
    return wrapped.group(1) if wrapped else text
def construct_search_term(title, author):
    ''' formulate a query for the data connector

    title: the book title, possibly with a bracketed series note
    author: the author name, possibly with initials such as "J.R.R."

    Returns the cleaned title and author joined by a single space; a part
    that ends up empty is left out so the query has no stray spaces.
    '''
    # Strip brackets (usually series title from search term). Substitute a
    # space so the words either side of a mid-title bracket are not glued
    # together, then trim what is left at the ends.
    title = re.sub(r'\s*\([^)]*\)\s*', ' ', title).strip()
    # Open library doesn't like including author initials in search term.
    author = re.sub(r'(\w\.)+\s*', '', author).strip()
    return ' '.join(part for part in (title, author) if part)
class ImportJob(models.Model):
    ''' a single csv import requested by a user '''
    # the user who owns the import; the job is deleted along with the user
    user = models.ForeignKey(
        User,
        on_delete=models.CASCADE,
    )
    # defaults to the moment the row is created
    created_date = models.DateTimeField(
        default=timezone.now,
    )
    # id of the background task for this job; may be null
    task_id = models.CharField(
        max_length=100,
        null=True,
    )
class ImportItem(models.Model):
    ''' one row of an imported csv, plus the book it was matched to '''
    # the job this row belongs to; reachable from the job as `job.items`
    job = models.ForeignKey(
        ImportJob,
        on_delete=models.CASCADE,
        related_name='items')
    index = models.IntegerField()
    # the csv row, read below as a mapping of column name -> value
    data = JSONField()
    # set by resolve(); stays empty when no lookup succeeds
    book = models.ForeignKey(
        Book, on_delete=models.SET_NULL, null=True, blank=True)
    fail_reason = models.TextField(null=True)

    def resolve(self):
        ''' try various ways to lookup a book '''
        # cheapest lookup first: local database, then remote search by isbn,
        # then remote search by title and author
        self.book = (
            self.get_book_from_db_isbn() or
            self.get_book_from_isbn() or
            self.get_book_from_title_author()
        )

    def get_book_from_db_isbn(self):
        ''' see if we already know about the book '''
        isbn = self.isbn
        if not isbn:
            # a blank isbn identifies nothing, so don't match on it
            return None
        try:
            return Edition.objects.get(isbn=isbn)
        except Edition.DoesNotExist:
            return None

    def get_book_from_isbn(self):
        ''' search by isbn '''
        isbn = self.isbn
        if not isbn:
            # don't send an empty query to the search backend
            return None
        search_results = books_manager.search(isbn)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    def get_book_from_title_author(self):
        ''' search by title and author '''
        search_term = construct_search_term(
            self.data['Title'],
            self.data['Author']
        )
        search_results = books_manager.search(search_term)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    @property
    def isbn(self):
        ''' the ISBN13 column with its ="..." wrapper removed '''
        return unquote_string(self.data['ISBN13'] or '')

    @property
    def shelf(self):
        ''' the goodreads shelf field '''
        # .get() so that a blank shelf, or an exclusive shelf we have no
        # mapping for, yields None instead of raising KeyError
        return GOODREADS_SHELVES.get(self.data['Exclusive Shelf'])

    @property
    def review(self):
        ''' the text of the 'My Review' column '''
        return self.data['My Review']

    @property
    def rating(self):
        ''' the 'My Rating' column as an int; a blank cell counts as 0 '''
        return int(self.data['My Rating'] or 0)

    @property
    def date_added(self):
        ''' parsed 'Date Added' column, or None when it is blank '''
        if self.data['Date Added']:
            return dateutil.parser.parse(self.data['Date Added'])
        return None

    @property
    def date_read(self):
        ''' parsed 'Date Read' column, or None when it is blank '''
        if self.data['Date Read']:
            return dateutil.parser.parse(self.data['Date Read'])
        return None

    @property
    def reads(self):
        ''' a one-element list holding an unsaved ReadThrough for this row '''
        return [ReadThrough(
            # date_added isn't the start date, but maybe better than nothing.
            start_date=self.date_added,
            finish_date=self.date_read,
            pages_read=None,
        )]

    def __repr__(self):
        # name this class, not the GoodreadsItem class it replaced
        return "<ImportItem {!r}>".format(self.data['Title'])

    def __str__(self):
        return "{} by {}".format(self.data['Title'], self.data['Author'])

View file

@ -1,4 +1,5 @@
{% extends 'layout.html' %} {% extends 'layout.html' %}
{% load humanize %}
{% block content %} {% block content %}
<div class="content-container"> <div class="content-container">
<h2>Import Books from GoodReads</h2> <h2>Import Books from GoodReads</h2>
@ -6,7 +7,13 @@
{% csrf_token %} {% csrf_token %}
{{ import_form.as_p }} {{ import_form.as_p }}
<button type="submit">Import</button> <button type="submit">Import</button>
<small>Hang tight, this may take a minute!</small>
</form> </form>
<h2>Recent Imports</h2>
<ul>
{% for job in jobs %}
<li><a href="/import_status/{{ job.id }}">{{ job.created_date | naturaltime }}</a></li>
{% endfor %}
</ul>
</div> </div>
{% endblock %} {% endblock %}

View file

@ -1,10 +0,0 @@
{% extends 'layout.html' %}
{% block content %}
<div id="content">
<div>
<h1>Import</h1>
Import uploaded successfully. The import is being processed.
</div>
</div>
{% endblock %}

View file

@ -0,0 +1,62 @@
{% extends 'layout.html' %}
{% load fr_display %}
{% load humanize %}
{% block content %}
<div id="content">
<div>
<h1>Import Status</h1>
<p>
Import started: {{ job.created_date | naturaltime }}
<p>
{% if task.ready %}
Import completed: {{ task.date_done | naturaltime }}
{% if task.failed %}
<h3><span style="background-color: #ffaaaa;">TASK FAILED</span></h3>
<p>
{{ task.info }}
{% endif %}
{% else %}
Import still in progress.
<p>
(Hit reload to update!)
{% endif %}
<table>
<tr>
<th>
</th>
<th>
Title
</th>
<th>
Author
</th>
<th>
Book
</th>
</tr>
{% for item in items %}
<tr>
<td>
{% if item.book %}✓{% endif %}
</td>
<td>
{{ item.data|dict_key:'Title' }}
</td>
<td>
{{ item.data|dict_key:'Author' }}
</td>
<td>
{% if item.book %}
<a href="{{ item.book.absolute_id }}">
{% include 'snippets/book_cover.html' with book=item.book size='small' %}
</a>
{% endif %}
</td>
</tr>
{% endfor %}
</table>
</div>
</div>
{% endblock %}

View file

@ -40,6 +40,7 @@ urlpatterns = [
re_path(r'^notifications/?', views.notifications_page), re_path(r'^notifications/?', views.notifications_page),
re_path(r'books/?$', views.books_page), re_path(r'books/?$', views.books_page),
re_path(r'import/?$', views.import_page), re_path(r'import/?$', views.import_page),
re_path(r'import_status/(\d+)/?$', views.import_status),
re_path(r'user-edit/?$', views.edit_profile_page), re_path(r'user-edit/?$', views.edit_profile_page),
# should return a ui view or activitypub json blob as requested # should return a ui view or activitypub json blob as requested

View file

@ -418,10 +418,10 @@ def import_data(request):
''' ingest a goodreads csv ''' ''' ingest a goodreads csv '''
form = forms.ImportForm(request.POST, request.FILES) form = forms.ImportForm(request.POST, request.FILES)
if form.is_valid(): if form.is_valid():
goodreads_import.async_import( job = goodreads_import.create_job(
request.user, request.user,
TextIOWrapper(request.FILES['csv_file'], encoding=request.encoding) TextIOWrapper(request.FILES['csv_file'], encoding=request.encoding)
) )
return TemplateResponse(request, 'import_results.html', {}) goodreads_import.start_import(job)
return redirect('/import_status/%d' % (job.id,))
return HttpResponseBadRequest() return HttpResponseBadRequest()

View file

@ -1,13 +1,15 @@
''' views for pages you can go to in the application ''' ''' views for pages you can go to in the application '''
from django.contrib.auth.decorators import login_required from django.contrib.auth.decorators import login_required
from django.db.models import Avg, Q from django.db.models import Avg, Q
from django.http import HttpResponseBadRequest, HttpResponseNotFound, \ from django.http import HttpResponseBadRequest, HttpResponseNotFound,\
JsonResponse JsonResponse
from django.core.exceptions import PermissionDenied
from django.template.response import TemplateResponse from django.template.response import TemplateResponse
from django.views.decorators.csrf import csrf_exempt from django.views.decorators.csrf import csrf_exempt
from fedireads import activitypub from fedireads import activitypub
from fedireads import forms, models, books_manager from fedireads import forms, models, books_manager
from fedireads.tasks import app
def get_user_from_username(username): def get_user_from_username(username):
@ -158,9 +160,24 @@ def import_page(request):
''' import history from goodreads ''' ''' import history from goodreads '''
return TemplateResponse(request, 'import.html', { return TemplateResponse(request, 'import.html', {
'import_form': forms.ImportForm(), 'import_form': forms.ImportForm(),
'jobs': models.ImportJob.
objects.filter(user=request.user).order_by('-created_date'),
}) })
@login_required
def import_status(request, job_id):
    ''' status of an import job

    Responds 404 when no job has this id and raises PermissionDenied when
    the job belongs to a different user; otherwise renders the job, its
    items in index order, and the result handle of its background task.
    '''
    try:
        job = models.ImportJob.objects.get(id=job_id)
    except models.ImportJob.DoesNotExist:
        # an unknown id is a client error, not an unhandled server exception
        return HttpResponseNotFound()
    if job.user != request.user:
        raise PermissionDenied
    task = app.AsyncResult(job.task_id)
    return TemplateResponse(request, 'import_status.html', {
        'job': job,
        'items': job.items.order_by('index').all(),
        'task': task
    })
def login_page(request): def login_page(request):
''' authentication ''' ''' authentication '''
@ -531,4 +548,3 @@ def get_user_shelf_preview(user, shelf_proportions=None):
'size': shelf.books.count(), 'size': shelf.books.count(),
}) })
return shelves return shelves