mirror of
https://github.com/bookwyrm-social/bookwyrm.git
synced 2024-11-24 02:21:04 +00:00
Store csv in the database and then import via celery.
This commit is contained in:
parent
881cc4d64b
commit
0bf73fef24
10 changed files with 293 additions and 133 deletions
|
@ -1,65 +1,46 @@
|
|||
''' handle reading a csv from goodreads '''
|
||||
import re
|
||||
import csv
|
||||
import dateutil.parser
|
||||
from requests import HTTPError
|
||||
|
||||
from fedireads import books_manager
|
||||
from fedireads import outgoing
|
||||
from fedireads.models import Edition, ReadThrough, User
|
||||
from fedireads.tasks import app
|
||||
from fedireads.models import ImportJob, ImportItem
|
||||
|
||||
|
||||
# Mapping goodreads -> fedireads shelf titles.
# Keys are the values looked up from the csv's 'Exclusive Shelf' column.
GOODREADS_SHELVES = {
    'read': 'read',
    'currently-reading': 'reading',
    'to-read': 'to-read',
}
# TODO: remove or notify about this in the UI
# Only the first MAX_ENTRIES rows of an uploaded csv are taken for import;
# any further rows are dropped.
MAX_ENTRIES = 20
|
||||
|
||||
|
||||
def unquote_string(text):
    ''' resolve csv quote weirdness

    Cells written as ``="value"`` (the ISBN13 column is passed through
    here) are reduced to ``value``. Anything else is returned unchanged.
    '''
    if not isinstance(text, str):
        # csv.DictReader fills the missing cells of a short row with None;
        # hand that back instead of letting re.match raise TypeError.
        return text
    match = re.match(r'="([^"]*)"', text)
    if match:
        return match.group(1)
    return text
|
||||
def create_job(user, csv_file):
    ''' store the first MAX_ENTRIES rows of a csv as an import job

    user: owner of the new ImportJob
    csv_file: text-mode file object with a header row

    Returns the ImportJob. One ImportItem is saved per row, with
    ``index`` recording the row's position in the file.
    '''
    job = ImportJob.objects.create(user=user)
    for index, entry in enumerate(csv.DictReader(csv_file)):
        if index >= MAX_ENTRIES:
            # Stop reading here rather than parsing the whole upload into a
            # list only to throw most of it away.
            break
        ImportItem(job=job, index=index, data=entry).save()
    return job
|
||||
|
||||
|
||||
def construct_search_term(title, author):
    ''' formulate a query for the data connector

    title: book title, possibly with a bracketed suffix
    author: author name, possibly with initials

    Returns the cleaned title and author joined by a single space.
    An empty part is left out, so there is no stray leading or
    trailing space.
    '''
    # Strip brackets (usually series title from search term). Replace with a
    # space so that a bracket in the middle of a title does not glue the
    # neighbouring words together.
    title = re.sub(r'\s*\([^)]*\)\s*', ' ', title).strip()
    # Open library doesn't like including author initials in search term.
    # The \b keeps the pattern from eating the tail of an abbreviation such
    # as "Jr." (which would otherwise be left as "J").
    author = re.sub(r'\b(\w\.)+\s*', '', author).strip()

    return ' '.join(part for part in (title, author) if part)
|
||||
|
||||
|
||||
def async_import(user, csv_file):
    ''' queue a background import of the first MAX_ENTRIES csv rows

    user: the importing user; only ``user.id`` is handed to the task
    csv_file: text-mode file object with a header row

    Returns whatever ``import_data.delay`` returns.
    '''
    reader = csv.DictReader(csv_file)
    # zip() stops as soon as range() runs out, so at most MAX_ENTRIES rows
    # are parsed instead of the whole upload.
    entries = [row for _, row in zip(range(MAX_ENTRIES), reader)]
    return import_data.delay(user.id, entries)
|
||||
def start_import(job):
    ''' queue the import task for a job and record the task id on it '''
    job.task_id = import_data.delay(job.id).id
    job.save()
|
||||
|
||||
@app.task
|
||||
def import_data(user_id, entries):
|
||||
user = User.objects.get(pk=user_id)
|
||||
def import_data(job_id):
|
||||
job = ImportJob.objects.get(id=job_id)
|
||||
user = job.user
|
||||
results = []
|
||||
reviews = []
|
||||
failures = []
|
||||
for item in entries:
|
||||
item = GoodreadsItem(item)
|
||||
for item in job.items.all():
|
||||
try:
|
||||
item.resolve()
|
||||
except HTTPError:
|
||||
pass
|
||||
if item.book:
|
||||
item.save()
|
||||
results.append(item)
|
||||
if item.rating or item.review:
|
||||
reviews.append(item)
|
||||
else:
|
||||
failures.append(item)
|
||||
item.fail_reason = "Could not match book on OpenLibrary"
|
||||
item.save()
|
||||
|
||||
outgoing.handle_import_books(user, results)
|
||||
for item in reviews:
|
||||
|
@ -73,84 +54,3 @@ def import_data(user_id, entries):
|
|||
item.review,
|
||||
item.rating,
|
||||
)
|
||||
|
||||
|
||||
class GoodreadsItem:
    ''' a processed line in a goodreads csv

    ``line`` is the dict for one csv row. ``book`` starts as None and is
    filled in by resolve().
    '''
    def __init__(self, line):
        self.line = line
        self.book = None

    def resolve(self):
        ''' try various ways to lookup a book '''
        self.book = (
            self.get_book_from_db_isbn() or
            self.get_book_from_isbn() or
            self.get_book_from_title_author()
        )

    def get_book_from_db_isbn(self):
        ''' see if we already know about the book '''
        if not self.isbn:
            # A blank ISBN must not match whichever edition also has none.
            return None
        # filter().first() instead of get(): returns None when nothing
        # matches and cannot raise MultipleObjectsReturned when several
        # editions share the ISBN.
        return Edition.objects.filter(isbn=self.isbn).first()

    def get_book_from_isbn(self):
        ''' search by isbn '''
        if not self.isbn:
            # Nothing to search for; let the title/author lookup run.
            return None
        search_results = books_manager.search(self.isbn)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    def get_book_from_title_author(self):
        ''' search by title and author '''
        search_term = construct_search_term(
            self.line['Title'],
            self.line['Author']
        )
        search_results = books_manager.search(search_term)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    @property
    def isbn(self):
        ''' the ISBN13 cell without its ="..." wrapper, '' when absent '''
        return unquote_string(self.line.get('ISBN13') or '')

    @property
    def shelf(self):
        ''' the goodreads shelf field

        None when the cell is empty or names a shelf that has no entry in
        GOODREADS_SHELVES.
        '''
        return GOODREADS_SHELVES.get(self.line.get('Exclusive Shelf'))

    @property
    def review(self):
        ''' the text of the 'My Review' cell '''
        return self.line['My Review']

    @property
    def rating(self):
        ''' the 'My Rating' cell as an int, 0 when blank or not a number '''
        try:
            return int(self.line.get('My Rating') or 0)
        except ValueError:
            return 0

    @property
    def date_added(self):
        ''' parsed 'Date Added' cell, or None when it is empty '''
        if self.line['Date Added']:
            return dateutil.parser.parse(self.line['Date Added'])
        return None

    @property
    def date_read(self):
        ''' parsed 'Date Read' cell, or None when it is empty '''
        if self.line['Date Read']:
            return dateutil.parser.parse(self.line['Date Read'])
        return None

    @property
    def reads(self):
        ''' a one-element list with an unsaved ReadThrough for this row '''
        return [ReadThrough(
            # Date added isn't the start date, but it's (perhaps) better than nothing.
            start_date=self.date_added,
            finish_date=self.date_read,
            pages_read=None,
        )]

    def __repr__(self):
        return "<GoodreadsItem {!r}>".format(self.line['Title'])

    def __str__(self):
        return "{} by {}".format(self.line['Title'], self.line['Author'])
|
||||
|
|
60
fedireads/migrations/0032_auto_20200421_1347.py
Normal file
60
fedireads/migrations/0032_auto_20200421_1347.py
Normal file
|
@ -0,0 +1,60 @@
|
|||
# Generated by Django 3.0.3 on 2020-04-21 13:47
|
||||
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
import django.db.models.deletion
|
||||
import django.utils.timezone
|
||||
import fedireads.utils.fields
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    ''' add the ImportJob and ImportItem tables and allow the
    IMPORT_RESULT notification type '''

    dependencies = [
        ('fedireads', '0031_readthrough'),
    ]

    operations = [
        migrations.CreateModel(
            name='ImportItem',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                # index and fail_reason are declared on the ImportItem model
                # and written by the import code, so the table needs their
                # columns too.
                ('index', models.IntegerField()),
                ('data', fedireads.utils.fields.JSONField()),
                ('fail_reason', models.TextField(null=True)),
            ],
        ),
        migrations.CreateModel(
            name='ImportJob',
            fields=[
                ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
                ('created_date', models.DateTimeField(default=django.utils.timezone.now)),
                ('task_id', models.CharField(max_length=100, null=True)),
            ],
        ),
        # The check constraint is dropped and re-added around the field
        # change so that it can include IMPORT_RESULT.
        migrations.RemoveConstraint(
            model_name='notification',
            name='notification_type_valid',
        ),
        migrations.AlterField(
            model_name='notification',
            name='notification_type',
            field=models.CharField(choices=[('FAVORITE', 'Favorite'), ('REPLY', 'Reply'), ('TAG', 'Tag'), ('FOLLOW', 'Follow'), ('FOLLOW_REQUEST', 'Follow Request'), ('BOOST', 'Boost'), ('IMPORT_RESULT', 'Import Result')], max_length=255),
        ),
        migrations.AddConstraint(
            model_name='notification',
            constraint=models.CheckConstraint(check=models.Q(notification_type__in=['FAVORITE', 'REPLY', 'TAG', 'FOLLOW', 'FOLLOW_REQUEST', 'BOOST', 'IMPORT_RESULT']), name='notification_type_valid'),
        ),
        migrations.AddField(
            model_name='importjob',
            name='user',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL),
        ),
        migrations.AddField(
            model_name='importitem',
            name='book',
            field=models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, to='fedireads.Book'),
        ),
        migrations.AddField(
            model_name='importitem',
            name='job',
            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='items', to='fedireads.ImportJob'),
        ),
    ]
|
|
@ -5,3 +5,4 @@ from .status import Status, Review, Comment, Quotation
|
|||
from .status import Favorite, Boost, Tag, Notification, ReadThrough
|
||||
from .user import User, UserFollows, UserFollowRequest, UserBlocks
|
||||
from .user import FederatedServer
|
||||
from .import_job import ImportJob, ImportItem
|
||||
|
|
123
fedireads/models/import_job.py
Normal file
123
fedireads/models/import_job.py
Normal file
|
@ -0,0 +1,123 @@
|
|||
import re
|
||||
import dateutil.parser
|
||||
|
||||
from django.db import models
|
||||
from django.utils import timezone
|
||||
|
||||
from fedireads import books_manager
|
||||
from fedireads.models import Edition, ReadThrough, User, Book
|
||||
from fedireads.utils.fields import JSONField
|
||||
|
||||
# Mapping goodreads -> fedireads shelf titles.
# Keys are the values looked up from the 'Exclusive Shelf' entry of an
# item's data.
GOODREADS_SHELVES = {
    'read': 'read',
    'currently-reading': 'reading',
    'to-read': 'to-read',
}
|
||||
|
||||
def unquote_string(text):
    ''' resolve csv quote weirdness

    Values written as ``="value"`` (the ISBN13 entry is passed through
    here) are reduced to ``value``. Anything else is returned unchanged.
    '''
    if not isinstance(text, str):
        # A row with missing cells carries None for them; hand that back
        # instead of letting re.match raise TypeError.
        return text
    match = re.match(r'="([^"]*)"', text)
    if match:
        return match.group(1)
    return text
|
||||
|
||||
|
||||
def construct_search_term(title, author):
    ''' formulate a query for the data connector

    title: book title, possibly with a bracketed suffix
    author: author name, possibly with initials

    Returns the cleaned title and author joined by a single space.
    An empty part is left out, so there is no stray leading or
    trailing space.
    '''
    # Strip brackets (usually series title from search term). Replace with a
    # space so that a bracket in the middle of a title does not glue the
    # neighbouring words together.
    title = re.sub(r'\s*\([^)]*\)\s*', ' ', title).strip()
    # Open library doesn't like including author initials in search term.
    # The \b keeps the pattern from eating the tail of an abbreviation such
    # as "Jr." (which would otherwise be left as "J").
    author = re.sub(r'\b(\w\.)+\s*', '', author).strip()

    return ' '.join(part for part in (title, author) if part)
|
||||
|
||||
class ImportJob(models.Model):
    ''' one import run started by a user; its rows are the related
    ImportItem objects (reverse name ``items``) '''
    # Owner of the job; deleting the user deletes the job.
    user = models.ForeignKey(User, on_delete=models.CASCADE)
    # Defaults to the time the row is created.
    created_date = models.DateTimeField(default=timezone.now)
    # Id of the background task processing this job; nullable, so a job can
    # exist before a task id has been stored on it.
    task_id = models.CharField(max_length=100, null=True)
|
||||
|
||||
class ImportItem(models.Model):
    ''' one row of an import job, together with the result of trying to
    match it to a book '''
    # Parent job; items are reachable from it as ``job.items``.
    job = models.ForeignKey(
        ImportJob,
        on_delete=models.CASCADE,
        related_name='items')
    # Position of the row within the job.
    index = models.IntegerField()
    # The row itself, keyed by column name ('Title', 'Author', ...).
    data = JSONField()
    # The matched book; stays empty until resolve() finds one.
    book = models.ForeignKey(
        Book, on_delete=models.SET_NULL, null=True, blank=True)
    # Human-readable explanation when no book could be matched.
    fail_reason = models.TextField(null=True)

    def resolve(self):
        ''' try various ways to lookup a book '''
        self.book = (
            self.get_book_from_db_isbn() or
            self.get_book_from_isbn() or
            self.get_book_from_title_author()
        )

    def get_book_from_db_isbn(self):
        ''' see if we already know about the book '''
        if not self.isbn:
            # A blank ISBN must not match whichever edition also has none.
            return None
        # filter().first() instead of get(): returns None when nothing
        # matches and cannot raise MultipleObjectsReturned when several
        # editions share the ISBN.
        return Edition.objects.filter(isbn=self.isbn).first()

    def get_book_from_isbn(self):
        ''' search by isbn '''
        if not self.isbn:
            # Nothing to search for; let the title/author lookup run.
            return None
        search_results = books_manager.search(self.isbn)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    def get_book_from_title_author(self):
        ''' search by title and author '''
        search_term = construct_search_term(
            self.data['Title'],
            self.data['Author']
        )
        search_results = books_manager.search(search_term)
        if search_results:
            return books_manager.get_or_create_book(search_results[0].key)
        return None

    @property
    def isbn(self):
        ''' the ISBN13 entry without its ="..." wrapper, '' when absent '''
        return unquote_string(self.data.get('ISBN13') or '')

    @property
    def shelf(self):
        ''' the goodreads shelf field

        None when the entry is empty or names a shelf that has no entry in
        GOODREADS_SHELVES.
        '''
        return GOODREADS_SHELVES.get(self.data.get('Exclusive Shelf'))

    @property
    def review(self):
        ''' the text of the 'My Review' entry '''
        return self.data['My Review']

    @property
    def rating(self):
        ''' the 'My Rating' entry as an int, 0 when blank or not a number '''
        try:
            return int(self.data.get('My Rating') or 0)
        except ValueError:
            return 0

    @property
    def date_added(self):
        ''' parsed 'Date Added' entry, or None when it is empty '''
        if self.data['Date Added']:
            return dateutil.parser.parse(self.data['Date Added'])
        return None

    @property
    def date_read(self):
        ''' parsed 'Date Read' entry, or None when it is empty '''
        if self.data['Date Read']:
            return dateutil.parser.parse(self.data['Date Read'])
        return None

    @property
    def reads(self):
        ''' a one-element list with an unsaved ReadThrough for this row '''
        return [ReadThrough(
            # date_added isn't the start date, but maybe better than nothing.
            start_date=self.date_added,
            finish_date=self.date_read,
            pages_read=None,
        )]

    def __repr__(self):
        # Name the actual class; the old text still said "GoodreadsItem".
        return "<ImportItem {!r}>".format(self.data['Title'])

    def __str__(self):
        return "{} by {}".format(self.data['Title'], self.data['Author'])
|
|
@ -1,4 +1,5 @@
|
|||
{% extends 'layout.html' %}
|
||||
{% load humanize %}
|
||||
{% block content %}
|
||||
<div class="content-container">
|
||||
<h2>Import Books from GoodReads</h2>
|
||||
|
@ -6,7 +7,13 @@
|
|||
{% csrf_token %}
|
||||
{{ import_form.as_p }}
|
||||
<button type="submit">Import</button>
|
||||
<small>Hang tight, this may take a minute!</small>
|
||||
</form>
|
||||
|
||||
<h2>Recent Imports</h2>
|
||||
<ul>
|
||||
{% for job in jobs %}
|
||||
<li><a href="/import_status/{{ job.id }}">{{ job.created_date | naturaltime }}</a></li>
|
||||
{% endfor %}
|
||||
</ul>
|
||||
</div>
|
||||
{% endblock %}
|
||||
|
|
|
@ -1,10 +0,0 @@
|
|||
{% extends 'layout.html' %}
{# Static confirmation page shown after a csv upload; it uses no context variables. #}
{% block content %}
<div id="content">
    <div>
        <h1>Import</h1>

        Import uploaded successfully. The import is being processed.
    </div>
</div>
{% endblock %}
|
62
fedireads/templates/import_status.html
Normal file
62
fedireads/templates/import_status.html
Normal file
|
@ -0,0 +1,62 @@
|
|||
{% extends 'layout.html' %}
{# Status page for a single import job. Context: job (the ImportJob), items (its ImportItems ordered by index) and task (the result handle looked up from job.task_id). #}
{% load fr_display %}
{% load humanize %}
{% block content %}
<div id="content">
    <div>
        <h1>Import Status</h1>

        <p>
        Import started: {{ job.created_date | naturaltime }}
        <p>
        {# task.ready is true once the background task has finished, whether it succeeded or failed #}
        {% if task.ready %}
        Import completed: {{ task.date_done | naturaltime }}
        {% if task.failed %}
        <h3><span style="background-color: #ffaaaa;">TASK FAILED</span></h3>
        <p>
        {{ task.info }}
        {% endif %}
        {% else %}
        Import still in progress.
        <p>
        (Hit reload to update!)
        {% endif %}

        {# One row per imported csv line; the first column is ticked when a book was matched. #}
        <table>
            <tr>
                <th>
                </th>
                <th>
                    Title
                </th>
                <th>
                    Author
                </th>
                <th>
                    Book
                </th>
            </tr>
            {% for item in items %}
            <tr>
                <td>
                    {% if item.book %}✓{% endif %}
                </td>
                <td>
                    {# dict_key is presumably provided by the fr_display tag library - confirm #}
                    {{ item.data|dict_key:'Title' }}
                </td>
                <td>
                    {{ item.data|dict_key:'Author' }}
                </td>
                <td>
                    {% if item.book %}
                    <a href="{{ item.book.absolute_id }}">
                        {% include 'snippets/book_cover.html' with book=item.book size='small' %}
                    </a>
                    {% endif %}
                </td>
            </tr>
            {% endfor %}
        </table>
    </div>
</div>
{% endblock %}
|
|
@ -40,6 +40,7 @@ urlpatterns = [
|
|||
re_path(r'^notifications/?', views.notifications_page),
|
||||
re_path(r'books/?$', views.books_page),
|
||||
re_path(r'import/?$', views.import_page),
|
||||
re_path(r'import_status/(\d+)/?$', views.import_status),
|
||||
re_path(r'user-edit/?$', views.edit_profile_page),
|
||||
|
||||
# should return a ui view or activitypub json blob as requested
|
||||
|
|
|
@ -418,10 +418,10 @@ def import_data(request):
|
|||
''' ingest a goodreads csv '''
|
||||
form = forms.ImportForm(request.POST, request.FILES)
|
||||
if form.is_valid():
|
||||
goodreads_import.async_import(
|
||||
job = goodreads_import.create_job(
|
||||
request.user,
|
||||
TextIOWrapper(request.FILES['csv_file'], encoding=request.encoding)
|
||||
)
|
||||
return TemplateResponse(request, 'import_results.html', {})
|
||||
goodreads_import.start_import(job)
|
||||
return redirect('/import_status/%d' % (job.id,))
|
||||
return HttpResponseBadRequest()
|
||||
|
||||
|
|
|
@ -1,13 +1,15 @@
|
|||
''' views for pages you can go to in the application '''
|
||||
from django.contrib.auth.decorators import login_required
|
||||
from django.db.models import Avg, Q
|
||||
from django.http import HttpResponseBadRequest, HttpResponseNotFound, \
|
||||
from django.http import HttpResponseBadRequest, HttpResponseNotFound,\
|
||||
JsonResponse
|
||||
from django.core.exceptions import PermissionDenied
|
||||
from django.template.response import TemplateResponse
|
||||
from django.views.decorators.csrf import csrf_exempt
|
||||
|
||||
from fedireads import activitypub
|
||||
from fedireads import forms, models, books_manager
|
||||
from fedireads.tasks import app
|
||||
|
||||
|
||||
def get_user_from_username(username):
|
||||
|
@ -158,9 +160,24 @@ def import_page(request):
|
|||
''' import history from goodreads '''
|
||||
return TemplateResponse(request, 'import.html', {
|
||||
'import_form': forms.ImportForm(),
|
||||
'jobs': models.ImportJob.
|
||||
objects.filter(user=request.user).order_by('-created_date'),
|
||||
})
|
||||
|
||||
|
||||
@login_required
def import_status(request, job_id):
    ''' status of an import job

    job_id: primary key of the ImportJob, captured from the url

    Responds with 404 when no such job exists and raises PermissionDenied
    when the job belongs to a different user.
    '''
    try:
        job = models.ImportJob.objects.get(id=job_id)
    except models.ImportJob.DoesNotExist:
        # An unknown id is a client error, not a server error.
        return HttpResponseNotFound()
    if job.user != request.user:
        raise PermissionDenied
    task = app.AsyncResult(job.task_id)
    return TemplateResponse(request, 'import_status.html', {
        'job': job,
        'items': job.items.order_by('index'),
        'task': task
    })
|
||||
|
||||
|
||||
def login_page(request):
|
||||
''' authentication '''
|
||||
|
@ -531,4 +548,3 @@ def get_user_shelf_preview(user, shelf_proportions=None):
|
|||
'size': shelf.books.count(),
|
||||
})
|
||||
return shelves
|
||||
|
||||
|
|
Loading…
Reference in a new issue