Merge pull request #3135 from hughrun/csv

csv import and export fixes
This commit is contained in:
Mouse Reeve 2024-08-23 16:29:04 -07:00 committed by GitHub
commit 413c26bc5e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
11 changed files with 342 additions and 24 deletions

View file

@ -1,7 +1,7 @@
""" import classes """ """ import classes """
from .importer import Importer from .importer import Importer
from .bookwyrm_import import BookwyrmImporter from .bookwyrm_import import BookwyrmImporter, BookwyrmBooksImporter
from .calibre_import import CalibreImporter from .calibre_import import CalibreImporter
from .goodreads_import import GoodreadsImporter from .goodreads_import import GoodreadsImporter
from .librarything_import import LibrarythingImporter from .librarything_import import LibrarythingImporter

View file

@ -3,6 +3,7 @@ from django.http import QueryDict
from bookwyrm.models import User from bookwyrm.models import User
from bookwyrm.models.bookwyrm_import_job import BookwyrmImportJob from bookwyrm.models.bookwyrm_import_job import BookwyrmImportJob
from . import Importer
class BookwyrmImporter: class BookwyrmImporter:
@ -22,3 +23,17 @@ class BookwyrmImporter:
user=user, archive_file=archive_file, required=required user=user, archive_file=archive_file, required=required
) )
return job return job
class BookwyrmBooksImporter(Importer):
    """
    Read a book list csv that was exported from BookWyrm.
    The column guesses of the base importer are reused as-is; this class
    only appends the two extra columns that a BookWyrm csv carries.
    """

    service = "BookWyrm"
    row_mappings_guesses = [
        *Importer.row_mappings_guesses,
        ("shelf_name", ["shelf_name"]),
        ("review_published", ["review_published"]),
    ]

View file

@ -18,17 +18,26 @@ class Importer:
row_mappings_guesses = [ row_mappings_guesses = [
("id", ["id", "book id"]), ("id", ["id", "book id"]),
("title", ["title"]), ("title", ["title"]),
("authors", ["author", "authors", "primary author"]), ("authors", ["author_text", "author", "authors", "primary author"]),
("isbn_10", ["isbn10", "isbn", "isbn/uid"]), ("isbn_10", ["isbn_10", "isbn10", "isbn", "isbn/uid"]),
("isbn_13", ["isbn13", "isbn", "isbns", "isbn/uid"]), ("isbn_13", ["isbn_13", "isbn13", "isbn", "isbns", "isbn/uid"]),
("shelf", ["shelf", "exclusive shelf", "read status", "bookshelf"]), ("shelf", ["shelf", "exclusive shelf", "read status", "bookshelf"]),
("review_name", ["review name"]), ("review_name", ["review_name", "review name"]),
("review_body", ["my review", "review"]), ("review_body", ["review_content", "my review", "review"]),
("rating", ["my rating", "rating", "star rating"]), ("rating", ["my rating", "rating", "star rating"]),
("date_added", ["date added", "entry date", "added"]), (
("date_started", ["date started", "started"]), "date_added",
("date_finished", ["date finished", "last date read", "date read", "finished"]), ["shelf_date", "date_added", "date added", "entry date", "added"],
),
("date_started", ["start_date", "date started", "started"]),
(
"date_finished",
["finish_date", "date finished", "last date read", "date read", "finished"],
),
] ]
# TODO: stopped
date_fields = ["date_added", "date_started", "date_finished"] date_fields = ["date_added", "date_started", "date_finished"]
shelf_mapping_guesses = { shelf_mapping_guesses = {
"to-read": ["to-read", "want to read"], "to-read": ["to-read", "want to read"],
@ -36,8 +45,14 @@ class Importer:
"reading": ["currently-reading", "reading", "currently reading"], "reading": ["currently-reading", "reading", "currently reading"],
} }
# pylint: disable=too-many-arguments
def create_job( def create_job(
self, user: User, csv_file: Iterable[str], include_reviews: bool, privacy: str self,
user: User,
csv_file: Iterable[str],
include_reviews: bool,
privacy: str,
create_shelves: bool = True,
) -> ImportJob: ) -> ImportJob:
"""check over a csv and creates a database entry for the job""" """check over a csv and creates a database entry for the job"""
csv_reader = csv.DictReader(csv_file, delimiter=self.delimiter) csv_reader = csv.DictReader(csv_file, delimiter=self.delimiter)
@ -54,6 +69,7 @@ class Importer:
job = ImportJob.objects.create( job = ImportJob.objects.create(
user=user, user=user,
include_reviews=include_reviews, include_reviews=include_reviews,
create_shelves=create_shelves,
privacy=privacy, privacy=privacy,
mappings=mappings, mappings=mappings,
source=self.service, source=self.service,
@ -113,7 +129,7 @@ class Importer:
shelf = [ shelf = [
s for (s, gs) in self.shelf_mapping_guesses.items() if shelf_name in gs s for (s, gs) in self.shelf_mapping_guesses.items() if shelf_name in gs
] ]
return shelf[0] if shelf else None return shelf[0] if shelf else normalized_row.get("shelf") or None
# pylint: disable=no-self-use # pylint: disable=no-self-use
def normalize_row( def normalize_row(
@ -148,6 +164,7 @@ class Importer:
job = ImportJob.objects.create( job = ImportJob.objects.create(
user=user, user=user,
include_reviews=original_job.include_reviews, include_reviews=original_job.include_reviews,
create_shelves=original_job.create_shelves,
privacy=original_job.privacy, privacy=original_job.privacy,
source=original_job.source, source=original_job.source,
# TODO: allow users to adjust mappings # TODO: allow users to adjust mappings

View file

@ -0,0 +1,18 @@
# Generated by Django 3.2.23 on 2023-11-25 05:49
from django.db import migrations, models
class Migration(migrations.Migration):
    """add the boolean `create_shelves` field (default True) to the importjob model"""

    # must be applied after the 0188_theme_loads migration of the bookwyrm app
    dependencies = [
        ("bookwyrm", "0188_theme_loads"),
    ]

    operations = [
        migrations.AddField(
            model_name="importjob",
            name="create_shelves",
            field=models.BooleanField(default=True),
        ),
    ]

View file

@ -0,0 +1,13 @@
# Generated by Django 4.2.11 on 2024-06-29 06:26
from django.db import migrations
class Migration(migrations.Migration):
    """join the 0189_importjob_create_shelves and 0206_merge_20240415_1537 branches"""

    dependencies = [
        ("bookwyrm", "0189_importjob_create_shelves"),
        ("bookwyrm", "0206_merge_20240415_1537"),
    ]

    # no schema changes of its own; it only declares the two dependencies above
    operations = []

View file

@ -0,0 +1,13 @@
# Generated by Django 4.2.11 on 2024-07-28 11:07
from django.db import migrations
class Migration(migrations.Migration):
    """join the 0207_merge_20240629_0626 and 0207_sqlparse_update branches"""

    dependencies = [
        ("bookwyrm", "0207_merge_20240629_0626"),
        ("bookwyrm", "0207_sqlparse_update"),
    ]

    # no schema changes of its own; it only declares the two dependencies above
    operations = []

View file

@ -4,6 +4,7 @@ import math
import re import re
import dateutil.parser import dateutil.parser
from django.core.exceptions import ObjectDoesNotExist
from django.db import models from django.db import models
from django.utils import timezone from django.utils import timezone
from django.utils.translation import gettext_lazy as _ from django.utils.translation import gettext_lazy as _
@ -59,6 +60,7 @@ class ImportJob(models.Model):
created_date = models.DateTimeField(default=timezone.now) created_date = models.DateTimeField(default=timezone.now)
updated_date = models.DateTimeField(default=timezone.now) updated_date = models.DateTimeField(default=timezone.now)
include_reviews: bool = models.BooleanField(default=True) include_reviews: bool = models.BooleanField(default=True)
create_shelves: bool = models.BooleanField(default=True)
mappings = models.JSONField() mappings = models.JSONField()
source = models.CharField(max_length=100) source = models.CharField(max_length=100)
privacy = models.CharField(max_length=255, default="public", choices=PrivacyLevels) privacy = models.CharField(max_length=255, default="public", choices=PrivacyLevels)
@ -245,11 +247,26 @@ class ImportItem(models.Model):
"""the goodreads shelf field""" """the goodreads shelf field"""
return self.normalized_data.get("shelf") return self.normalized_data.get("shelf")
@property
def shelf_name(self):
    """the value stored under "shelf_name" in the normalized row, or None if absent"""
    return self.normalized_data.get("shelf_name")
@property @property
def review(self): def review(self):
"""a user-written review, to be imported with the book data""" """a user-written review, to be imported with the book data"""
return self.normalized_data.get("review_body") return self.normalized_data.get("review_body")
@property
def review_name(self):
    """the title the user gave their review, or None when the row has none"""
    row = self.normalized_data
    return row.get("review_name")
@property
def review_published(self):
    """publication date of the review as found in the row, or None when missing"""
    row = self.normalized_data
    # dict.get already falls back to None, so no explicit default is needed
    return row.get("review_published")
@property @property
def rating(self): def rating(self):
"""x/5 star rating for a book""" """x/5 star rating for a book"""
@ -368,7 +385,7 @@ def import_item_task(item_id):
item.update_job() item.update_job()
def handle_imported_book(item): def handle_imported_book(item): # pylint: disable=too-many-branches
"""process a csv and then post about it""" """process a csv and then post about it"""
job = item.job job = item.job
if job.complete: if job.complete:
@ -385,13 +402,31 @@ def handle_imported_book(item):
item.book = item.book.edition item.book = item.book.edition
existing_shelf = ShelfBook.objects.filter(book=item.book, user=user).exists() existing_shelf = ShelfBook.objects.filter(book=item.book, user=user).exists()
if job.create_shelves and item.shelf and not existing_shelf:
# shelve the book if it hasn't been shelved already # shelve the book if it hasn't been shelved already
if item.shelf and not existing_shelf:
desired_shelf = Shelf.objects.get(identifier=item.shelf, user=user)
shelved_date = item.date_added or timezone.now() shelved_date = item.date_added or timezone.now()
shelfname = getattr(item, "shelf_name", item.shelf)
try:
shelf = Shelf.objects.get(name=shelfname, user=user)
except ObjectDoesNotExist:
try:
shelf = Shelf.objects.get(identifier=item.shelf, user=user)
except ObjectDoesNotExist:
shelf = Shelf.objects.create(
user=user,
identifier=item.shelf,
name=shelfname,
privacy=job.privacy,
)
ShelfBook( ShelfBook(
book=item.book, shelf=desired_shelf, user=user, shelved_date=shelved_date book=item.book,
shelf=shelf,
user=user,
shelved_date=shelved_date,
).save(priority=IMPORT_TRIGGERED) ).save(priority=IMPORT_TRIGGERED)
for read in item.reads: for read in item.reads:
@ -408,19 +443,25 @@ def handle_imported_book(item):
read.save() read.save()
if job.include_reviews and (item.rating or item.review) and not item.linked_review: if job.include_reviews and (item.rating or item.review) and not item.linked_review:
# we don't know the publication date of the review, # we don't necessarily know the publication date of the review,
# but "now" is a bad guess # but "now" is a bad guess unless we have no choice
published_date_guess = item.date_read or item.date_added
published_date_guess = (
item.review_published or item.date_read or item.date_added or timezone.now()
)
if item.review: if item.review:
# pylint: disable=consider-using-f-string # pylint: disable=consider-using-f-string
review_title = "Review of {!r} on {!r}".format( review_title = "Review of {!r} on {!r}".format(
item.book.title, item.book.title,
job.source, job.source,
) )
review_name = getattr(item, "review_name", review_title)
review = Review.objects.filter( review = Review.objects.filter(
user=user, user=user,
book=item.book, book=item.book,
name=review_title, name=review_name,
rating=item.rating, rating=item.rating,
published_date=published_date_guess, published_date=published_date_guess,
).first() ).first()
@ -428,7 +469,7 @@ def handle_imported_book(item):
review = Review( review = Review(
user=user, user=user,
book=item.book, book=item.book,
name=review_title, name=review_name,
content=item.review, content=item.review,
rating=item.rating, rating=item.rating,
published_date=published_date_guess, published_date=published_date_guess,

View file

@ -69,6 +69,9 @@
<option value="Calibre" {% if current == 'Calibre' %}selected{% endif %}> <option value="Calibre" {% if current == 'Calibre' %}selected{% endif %}>
{% trans "Calibre (CSV)" %} {% trans "Calibre (CSV)" %}
</option> </option>
<option value="BookWyrm" {% if current == 'BookWyrm' %}selected{% endif %}>
{% trans "BookWyrm (CSV)" %}
</option>
</select> </select>
</div> </div>
@ -93,9 +96,14 @@
<input type="checkbox" name="include_reviews" checked> {% trans "Include reviews" %} <input type="checkbox" name="include_reviews" checked> {% trans "Include reviews" %}
</label> </label>
</div> </div>
<div class="field">
<label class="label">
<input type="checkbox" name="create_shelves" checked> {% trans "Create new shelves if they do not exist" %}
</label>
</div>
<div class="field"> <div class="field">
<label class="label" for="privacy_import"> <label class="label" for="privacy_import">
{% trans "Privacy setting for imported reviews:" %} {% trans "Privacy setting for imported reviews and shelves:" %}
</label> </label>
{% include 'snippets/privacy_select.html' with no_label=True privacy_uuid="import" %} {% include 'snippets/privacy_select.html' with no_label=True privacy_uuid="import" %}
</div> </div>

View file

@ -0,0 +1,4 @@
title,author_text,remote_id,openlibrary_key,inventaire_id,librarything_key,goodreads_key,bnf_id,viaf,wikidata,asin,aasin,isfdb,isbn_10,isbn_13,oclc_number,start_date,finish_date,stopped_date,rating,review_name,review_cw,review_content,review_published,shelf,shelf_name,shelf_date
我穿我自己,琅俨,https://example.com/book/2010,,,,,,,,,,,,,,,,,,,,,,to-read,To Read,2024-08-10
Ottolenghi Simple,Yotam Ottolenghi,https://example.com/book/2,OL43065148M,,,,,,,,,,0449017036,9780449017036,,2022-08-10,2022-10-10,,4,Too much tahini,,...in his hummus,2022-11-10,cooking-9,Cooking,2024-08-10
The Blue Bedspread,Raj Kamal Jha,https://example.com/book/270,OL7425890M,,,,,,,,,,0375503129,9780375503122,41754476,2001-06-01,2001-07-10,,5,,,,,read,Read,2024-08-10
1 title author_text remote_id openlibrary_key inventaire_id librarything_key goodreads_key bnf_id viaf wikidata asin aasin isfdb isbn_10 isbn_13 oclc_number start_date finish_date stopped_date rating review_name review_cw review_content review_published shelf shelf_name shelf_date
2 我穿我自己 琅俨 https://example.com/book/2010 to-read To Read 2024-08-10
3 Ottolenghi Simple Yotam Ottolenghi https://example.com/book/2 OL43065148M 0449017036 9780449017036 2022-08-10 2022-10-10 4 Too much tahini ...in his hummus 2022-11-10 cooking-9 Cooking 2024-08-10
4 The Blue Bedspread Raj Kamal Jha https://example.com/book/270 OL7425890M 0375503129 9780375503122 41754476 2001-06-01 2001-07-10 5 read Read 2024-08-10

View file

@ -0,0 +1,182 @@
""" testing bookwyrm csv import """
import pathlib
from unittest.mock import patch
import datetime
from django.test import TestCase
from bookwyrm import models
from bookwyrm.importers import BookwyrmBooksImporter
from bookwyrm.models.import_job import handle_imported_book
def make_date(*args):
    """build a timezone-aware (UTC) datetime from positional datetime arguments"""
    utc = datetime.timezone.utc
    return datetime.datetime(*args, tzinfo=utc)
@patch("bookwyrm.suggested_users.rerank_suggestions_task.delay")
@patch("bookwyrm.activitystreams.populate_stream_task.delay")
@patch("bookwyrm.activitystreams.add_book_statuses_task.delay")
class BookwyrmBooksImport(TestCase):
    """importing from BookWyrm csv"""

    def setUp(self):
        """use a test csv"""
        self.importer = BookwyrmBooksImporter()
        datafile = pathlib.Path(__file__).parent.joinpath("../data/bookwyrm.csv")
        # the handle stays open for the duration of a test; tearDown closes it
        # pylint: disable-next=consider-using-with
        self.csv = open(datafile, "r", encoding=self.importer.encoding)

    def tearDown(self):
        """close test csv"""
        self.csv.close()

    @classmethod
    def setUpTestData(cls):
        """populate database"""
        with (
            patch("bookwyrm.suggested_users.rerank_suggestions_task.delay"),
            patch("bookwyrm.activitystreams.populate_stream_task.delay"),
            patch("bookwyrm.lists_stream.populate_lists_task.delay"),
        ):
            cls.local_user = models.User.objects.create_user(
                "mouse", "mouse@mouse.mouse", "password", local=True
            )
        models.SiteSettings.objects.create()
        work = models.Work.objects.create(title="Test Work")
        cls.book = models.Edition.objects.create(
            title="Example Edition",
            remote_id="https://example.com/book/1",
            parent_work=work,
        )

    def test_create_job(self, *_):
        """creates the import job entry and checks csv"""
        import_job = self.importer.create_job(
            self.local_user, self.csv, False, "public"
        )

        # the fixture csv has three data rows
        import_items = models.ImportItem.objects.filter(job=import_job).all()
        self.assertEqual(len(import_items), 3)
        self.assertEqual(import_items[0].index, 0)
        self.assertEqual(import_items[0].normalized_data["isbn_13"], "")
        self.assertEqual(import_items[0].normalized_data["isbn_10"], "")
        self.assertEqual(import_items[0].shelf_name, "To Read")

        self.assertEqual(import_items[1].index, 1)
        self.assertEqual(import_items[1].normalized_data["isbn_13"], "9780449017036")
        self.assertEqual(import_items[1].normalized_data["isbn_10"], "0449017036")
        self.assertEqual(import_items[1].shelf_name, "Cooking")

        self.assertEqual(import_items[2].index, 2)
        self.assertEqual(import_items[2].normalized_data["isbn_13"], "9780375503122")
        self.assertEqual(import_items[2].normalized_data["isbn_10"], "0375503129")
        self.assertEqual(import_items[2].shelf_name, "Read")

    def test_create_retry_job(self, *_):
        """trying again with items that didn't import"""
        import_job = self.importer.create_job(
            self.local_user, self.csv, False, "unlisted"
        )
        # retry only the first two items of the original job
        import_items = models.ImportItem.objects.filter(job=import_job).all()[:2]

        retry = self.importer.create_retry_job(
            self.local_user, import_job, import_items
        )
        self.assertNotEqual(import_job, retry)
        self.assertEqual(retry.user, self.local_user)
        self.assertEqual(retry.include_reviews, False)
        self.assertEqual(retry.privacy, "unlisted")

        retry_items = models.ImportItem.objects.filter(job=retry).all()
        self.assertEqual(len(retry_items), 2)
        self.assertEqual(retry_items[0].index, 0)
        self.assertEqual(retry_items[0].data["title"], "我穿我自己")
        self.assertEqual(retry_items[1].index, 1)
        self.assertEqual(retry_items[1].data["author_text"], "Yotam Ottolenghi")

    def test_handle_imported_book(self, *_):
        """import added a book, this adds related connections"""
        shelf = self.local_user.shelf_set.filter(
            identifier=models.Shelf.READ_FINISHED
        ).first()
        self.assertIsNone(shelf.books.first())

        import_job = self.importer.create_job(
            self.local_user, self.csv, False, "public"
        )
        # the assertions below match the third fixture row (dates 2001-06-01 /
        # 2001-07-10, shelved 2024-08-10)
        import_item = import_job.items.last()
        import_item.book = self.book
        import_item.save()

        with patch("bookwyrm.models.activitypub_mixin.broadcast_task.apply_async"):
            handle_imported_book(import_item)

        shelf.refresh_from_db()
        self.assertEqual(shelf.books.first(), self.book)
        self.assertEqual(
            shelf.shelfbook_set.first().shelved_date, make_date(2024, 8, 10)
        )

        readthrough = models.ReadThrough.objects.get(user=self.local_user)
        self.assertEqual(readthrough.book, self.book)
        self.assertEqual(readthrough.start_date, make_date(2001, 6, 1))
        self.assertEqual(readthrough.finish_date, make_date(2001, 7, 10))

    def test_create_new_shelf(self, *_):
        """import added a book, was a new shelf created?"""
        # the precondition must query the identifier the fixture row actually
        # uses ("cooking-9"); checking a different identifier would pass even
        # if the shelf already existed
        shelf = self.local_user.shelf_set.filter(identifier="cooking-9").first()
        self.assertIsNone(shelf)

        import_job = self.importer.create_job(
            self.local_user, self.csv, False, "public"
        )
        # second fixture row: shelf "cooking-9", shelf_name "Cooking"
        import_item = models.ImportItem.objects.filter(job=import_job).all()[1]
        import_item.book = self.book
        import_item.save()

        with patch("bookwyrm.models.activitypub_mixin.broadcast_task.apply_async"):
            handle_imported_book(import_item)

        shelf_after = self.local_user.shelf_set.filter(identifier="cooking-9").first()
        # fail with a clear assertion instead of an AttributeError on None
        self.assertIsNotNone(shelf_after)
        self.assertEqual(shelf_after.books.first(), self.book)

    @patch("bookwyrm.activitystreams.add_status_task.delay")
    def test_handle_imported_book_review(self, *_):
        """review import"""
        # include_reviews=True so the review columns are imported
        import_job = self.importer.create_job(
            self.local_user, self.csv, True, "unlisted"
        )
        import_item = import_job.items.get(index=1)
        import_item.book = self.book
        import_item.save()

        with patch("bookwyrm.models.activitypub_mixin.broadcast_task.apply_async"):
            handle_imported_book(import_item)

        review = models.Review.objects.get(book=self.book, user=self.local_user)
        self.assertEqual(review.name, "Too much tahini")
        self.assertEqual(review.content, "...in his hummus")
        self.assertEqual(review.rating, 4)
        # review_published from the csv is used as the publication date
        self.assertEqual(review.published_date, make_date(2022, 11, 10))
        self.assertEqual(review.privacy, "unlisted")

    @patch("bookwyrm.activitystreams.add_status_task.delay")
    def test_handle_imported_book_rating(self, *_):
        """rating import"""
        import_job = self.importer.create_job(
            self.local_user, self.csv, True, "followers"
        )
        # third fixture row has a rating but no review text
        import_item = import_job.items.filter(index=2).first()
        import_item.book = self.book
        import_item.save()

        with patch("bookwyrm.models.activitypub_mixin.broadcast_task.apply_async"):
            handle_imported_book(import_item)

        review = models.ReviewRating.objects.get(book=self.book, user=self.local_user)
        self.assertIsInstance(review, models.ReviewRating)
        self.assertEqual(review.rating, 5)
        # no review_published in this row, so the finish date is expected
        self.assertEqual(review.published_date, make_date(2001, 7, 10))
        self.assertEqual(review.privacy, "followers")

View file

@ -16,6 +16,7 @@ from django.views import View
from bookwyrm import forms, models from bookwyrm import forms, models
from bookwyrm.importers import ( from bookwyrm.importers import (
BookwyrmImporter, BookwyrmImporter,
BookwyrmBooksImporter,
CalibreImporter, CalibreImporter,
LibrarythingImporter, LibrarythingImporter,
GoodreadsImporter, GoodreadsImporter,
@ -69,7 +70,7 @@ class Import(View):
return TemplateResponse(request, "import/import.html", data) return TemplateResponse(request, "import/import.html", data)
def post(self, request): def post(self, request):
"""ingest a goodreads csv""" """ingest a book data csv"""
site = models.SiteSettings.objects.get() site = models.SiteSettings.objects.get()
if not site.imports_enabled: if not site.imports_enabled:
raise PermissionDenied() raise PermissionDenied()
@ -79,11 +80,16 @@ class Import(View):
return HttpResponseBadRequest() return HttpResponseBadRequest()
include_reviews = request.POST.get("include_reviews") == "on" include_reviews = request.POST.get("include_reviews") == "on"
create_shelves = request.POST.get("create_shelves") == "on"
privacy = request.POST.get("privacy") privacy = request.POST.get("privacy")
source = request.POST.get("source") source = request.POST.get("source")
importer = None importer = None
if source == "LibraryThing":
if source == "BookWyrm":
importer = BookwyrmBooksImporter()
print("BookwyrmBooksImporter")
elif source == "LibraryThing":
importer = LibrarythingImporter() importer = LibrarythingImporter()
elif source == "Storygraph": elif source == "Storygraph":
importer = StorygraphImporter() importer = StorygraphImporter()
@ -101,6 +107,7 @@ class Import(View):
TextIOWrapper(request.FILES["csv_file"], encoding=importer.encoding), TextIOWrapper(request.FILES["csv_file"], encoding=importer.encoding),
include_reviews, include_reviews,
privacy, privacy,
create_shelves,
) )
except (UnicodeDecodeError, ValueError, KeyError): except (UnicodeDecodeError, ValueError, KeyError):
return self.get(request, invalid=True) return self.get(request, invalid=True)