From 4ccd9fc633321970d2ecaf64631bf19282852e69 Mon Sep 17 00:00:00 2001 From: Mouse Reeve Date: Wed, 10 Nov 2021 16:49:54 -0800 Subject: [PATCH] Use generalized mappings to handle import --- bookwyrm/importers/goodreads_import.py | 7 -- bookwyrm/importers/importer.py | 87 ++++++++++++--- bookwyrm/importers/librarything_import.py | 34 +----- bookwyrm/importers/storygraph_import.py | 27 +---- bookwyrm/models/import_job.py | 40 ++++--- bookwyrm/tests/data/generic.csv | 8 +- bookwyrm/tests/importers/test_importer.py | 127 +++++++++------------- 7 files changed, 152 insertions(+), 178 deletions(-) diff --git a/bookwyrm/importers/goodreads_import.py b/bookwyrm/importers/goodreads_import.py index c62e6582..c0dc0ea2 100644 --- a/bookwyrm/importers/goodreads_import.py +++ b/bookwyrm/importers/goodreads_import.py @@ -7,10 +7,3 @@ class GoodreadsImporter(Importer): For a more complete example of overriding see librarything_import.py""" service = "Goodreads" - - def parse_fields(self, entry): - """handle the specific fields in goodreads csvs""" - entry.update({"import_source": self.service}) - # add missing 'Date Started' field - entry.update({"Date Started": None}) - return entry diff --git a/bookwyrm/importers/importer.py b/bookwyrm/importers/importer.py index a5243cd3..b0458bab 100644 --- a/bookwyrm/importers/importer.py +++ b/bookwyrm/importers/importer.py @@ -1,5 +1,6 @@ """ handle reading a csv from an external service, defaults are from Goodreads """ import csv +from dataclasses import dataclass import logging from django.utils import timezone @@ -18,30 +19,59 @@ class Importer: service = "Unknown" delimiter = "," encoding = "UTF-8" - mandatory_fields = ["Title", "Author"] + + # these are from Goodreads + row_mappings_guesses = { + "id": ["id", "book id"], + "title": ["title"], + "authors": ["author", "authors", "primary author"], + "isbn_13": ["isbn13", "isbn"], + "isbn_10": ["isbn10", "isbn"], + "shelf": ["shelf", "exclusive shelf", "read status"], + "review_name": [], 
+ "review_body": ["my review"], + "rating": ["my rating", "rating", "star rating"], + "date_added": ["date added", "entry date", "added"], + "date_started": ["date started", "started"], + "date_finished": ["date finished", "last date read", "date read", "finished"], + } def create_job(self, user, csv_file, include_reviews, privacy): """check over a csv and creates a database entry for the job""" + csv_reader = csv.DictReader(csv_file, delimiter=self.delimiter) + rows = enumerate(list(csv_reader)) job = ImportJob.objects.create( - user=user, include_reviews=include_reviews, privacy=privacy + user=user, + include_reviews=include_reviews, + privacy=privacy, + mappings=self.create_row_mappings(csv_reader.fieldnames), ) - for index, entry in enumerate( - list(csv.DictReader(csv_file, delimiter=self.delimiter)) - ): - if not all(x in entry for x in self.mandatory_fields): - raise ValueError("Author and title must be in data.") - entry = self.parse_fields(entry) - self.save_item(job, index, entry) + + for index, entry in rows: + print(index, entry) + self.create_item(job, index, entry) return job - def save_item(self, job, index, data): # pylint: disable=no-self-use - """creates and saves an import item""" - ImportItem(job=job, index=index, data=data).save() + def create_row_mappings(self, headers): + """guess what the headers mean""" + mappings = {} + for (key, guesses) in self.row_mappings_guesses.items(): + value = [h for h in headers if h.lower() in guesses] + value = value[0] if len(value) else None + if value: + headers.remove(value) + mappings[key] = value + return mappings - def parse_fields(self, entry): - """updates csv data with additional info""" - entry.update({"import_source": self.service}) - return entry + def create_item(self, job, index, data): + """creates and saves an import item""" + print(data) + normalized = self.normalize_row(data, job.mappings) + ImportItem(job=job, index=index, data=data, normalized_data=normalized).save() + + def 
normalize_row(self, entry, mappings): # pylint: disable=no-self-use + """build a dict of each mapping key to the row's value under its mapped header""" + return {k: entry.get(v) for k, v in mappings.items()} def create_retry_job(self, user, original_job, items): """retry items that didn't import""" @@ -49,10 +79,13 @@ class Importer: user=user, include_reviews=original_job.include_reviews, privacy=original_job.privacy, + # TODO: allow users to adjust mappings + mappings=original_job.mappings, retry=True, ) for item in items: - self.save_item(job, item.index, item.data) + # this will re-normalize the raw data + self.create_item(job, item.index, item.data) return job def start_import(self, job): @@ -156,3 +189,23 @@ def handle_imported_book(source, user, item, include_reviews, privacy): ) # only broadcast this review to other bookwyrm instances review.save(software="bookwyrm") + + +@dataclass +class ImportEntry: + """data extracted from a line in a csv""" + + title: str + authors: str = None + isbn_13: str = None + isbn_10: str = None + shelf: str = None + review_name: str = None + review_rating: float = None + review_body: str = None + review_cw: str = None + rating: float = None + date_added: str = None + date_started: str = None + date_finished: str = None + import_source: str = "Unknown" diff --git a/bookwyrm/importers/librarything_import.py b/bookwyrm/importers/librarything_import.py index b3175a82..3d42e539 100644 --- a/bookwyrm/importers/librarything_import.py +++ b/bookwyrm/importers/librarything_import.py @@ -1,7 +1,4 @@ -""" handle reading a csv from librarything """ -import re -import math - +""" handle reading a tsv from librarything """ from . 
import Importer @@ -11,32 +8,3 @@ class LibrarythingImporter(Importer): service = "LibraryThing" delimiter = "\t" encoding = "ISO-8859-1" - # mandatory_fields : fields matching the book title and author - mandatory_fields = ["Title", "Primary Author"] - - def parse_fields(self, entry): - """custom parsing for librarything""" - data = {} - data["import_source"] = self.service - data["Book Id"] = entry["Book Id"] - data["Title"] = entry["Title"] - data["Author"] = entry["Primary Author"] - data["ISBN13"] = entry["ISBN"] - data["My Review"] = entry["Review"] - if entry["Rating"]: - data["My Rating"] = math.ceil(float(entry["Rating"])) - else: - data["My Rating"] = "" - data["Date Added"] = re.sub(r"\[|\]", "", entry["Entry Date"]) - data["Date Started"] = re.sub(r"\[|\]", "", entry["Date Started"]) - data["Date Read"] = re.sub(r"\[|\]", "", entry["Date Read"]) - - data["Exclusive Shelf"] = None - if data["Date Read"]: - data["Exclusive Shelf"] = "read" - elif data["Date Started"]: - data["Exclusive Shelf"] = "reading" - else: - data["Exclusive Shelf"] = "to-read" - - return data diff --git a/bookwyrm/importers/storygraph_import.py b/bookwyrm/importers/storygraph_import.py index 1333b8b9..9368115d 100644 --- a/bookwyrm/importers/storygraph_import.py +++ b/bookwyrm/importers/storygraph_import.py @@ -1,6 +1,4 @@ -""" handle reading a csv from librarything """ -import re - +""" handle reading a csv from storygraph""" from . 
import Importer @@ -8,26 +6,3 @@ class StorygraphImporter(Importer): """csv downloads from librarything""" service = "Storygraph" - # mandatory_fields : fields matching the book title and author - mandatory_fields = ["Title"] - - def parse_fields(self, entry): - """custom parsing for storygraph""" - data = {} - data["import_source"] = self.service - data["Title"] = entry["Title"] - data["Author"] = entry["Authors"] if "Authors" in entry else entry["Author"] - data["ISBN13"] = entry["ISBN"] - data["My Review"] = entry["Review"] - if entry["Star Rating"]: - data["My Rating"] = float(entry["Star Rating"]) - else: - data["My Rating"] = "" - - data["Date Added"] = re.sub(r"[/]", "-", entry["Date Added"]) - data["Date Read"] = re.sub(r"[/]", "-", entry["Last Date Read"]) - - data["Exclusive Shelf"] = ( - {"read": "read", "currently-reading": "reading", "to-read": "to-read"} - ).get(entry["Read Status"], None) - return data diff --git a/bookwyrm/models/import_job.py b/bookwyrm/models/import_job.py index 949e3edb..6bca57f8 100644 --- a/bookwyrm/models/import_job.py +++ b/bookwyrm/models/import_job.py @@ -35,6 +35,7 @@ class ImportJob(models.Model): created_date = models.DateTimeField(default=timezone.now) task_id = models.CharField(max_length=100, null=True) include_reviews = models.BooleanField(default=True) + mappings = models.JSONField() complete = models.BooleanField(default=False) privacy = models.CharField( max_length=255, default="public", choices=PrivacyLevels.choices @@ -48,6 +49,7 @@ class ImportItem(models.Model): job = models.ForeignKey(ImportJob, on_delete=models.CASCADE, related_name="items") index = models.IntegerField() data = models.JSONField() + normalized_data = models.JSONField() book = models.ForeignKey(Book, on_delete=models.SET_NULL, null=True, blank=True) book_guess = models.ForeignKey( Book, @@ -98,55 +100,59 @@ class ImportItem(models.Model): @property def title(self): """get the book title""" - return self.data["title"] + return 
self.normalized_data["title"] @property def author(self): """get the book's authors""" - return self.data["authors"] + return self.normalized_data["authors"] @property def isbn(self): """pulls out the isbn13 field from the csv line data""" - return unquote_string(self.data["isbn_13"]) + return unquote_string(self.normalized_data["isbn_13"]) @property def shelf(self): """the goodreads shelf field""" - return self.data.get("shelf") + return self.normalized_data.get("shelf") @property def review(self): """a user-written review, to be imported with the book data""" - return self.data["review_body"] + return self.normalized_data["review_body"] @property def rating(self): """x/5 star rating for a book""" - if self.data.get("rating"): - return float(self.data["rating"]) + if self.normalized_data.get("rating"): + return float(self.normalized_data["rating"]) return None @property def date_added(self): """when the book was added to this dataset""" - if self.data.get("date_added"): - return timezone.make_aware(dateutil.parser.parse(self.data["date_added"])) + if self.normalized_data.get("date_added"): + return timezone.make_aware( + dateutil.parser.parse(self.normalized_data["date_added"]) + ) return None @property def date_started(self): """when the book was started""" - if self.data.get("date_started"): - return timezone.make_aware(dateutil.parser.parse(self.data["date_started"])) + if self.normalized_data.get("date_started"): + return timezone.make_aware( + dateutil.parser.parse(self.normalized_data["date_started"]) + ) return None @property def date_read(self): """the date a book was completed""" - if self.data.get("date_finished"): + if self.normalized_data.get("date_finished"): return timezone.make_aware( - dateutil.parser.parse(self.data["date_finished"]) + dateutil.parser.parse(self.normalized_data["date_finished"]) ) return None @@ -177,8 +183,12 @@ class ImportItem(models.Model): def __repr__(self): # pylint: disable=consider-using-f-string - return "<{!r}Item 
{!r}>".format(self.data["import_source"], self.data["title"]) + return "<{!r}Item {!r}>".format( + self.normalized_data["import_source"], self.normalized_data["title"] + ) def __str__(self): # pylint: disable=consider-using-f-string - return "{} by {}".format(self.data["title"], self.data["authors"]) + return "{} by {}".format( + self.normalized_data["title"], self.normalized_data["authors"] + ) diff --git a/bookwyrm/tests/data/generic.csv b/bookwyrm/tests/data/generic.csv index a081a642..9c5b6f02 100644 --- a/bookwyrm/tests/data/generic.csv +++ b/bookwyrm/tests/data/generic.csv @@ -1,5 +1,5 @@ -id,title,author,ISBN,rating,shelf,review,added -38,Gideon the Ninth (The Locked Tomb #1),Tamsyn Muir,"9781250313195",,read,,2021-11-10 -48,Harrow the Ninth (The Locked Tomb #2),Tamsyn Muir,,3,read,,2021-11-10 +id,title,author,ISBN,rating,shelf,review,added,finished +38,Gideon the Ninth,Tamsyn Muir,"9781250313195",,read,,2021-11-10,2021-11-11 +48,Harrow the Ninth,Tamsyn Muir,,3,read,,2021-11-10 23,Subcutanean,Aaron A. 
Reed,,,read,,2021-11-10 -10,Patisserie at Home,Mélanie Dupuis,"9780062445315",2,read,"mixed feelings",2021-11-10 +10,Patisserie at Home,Mélanie Dupuis,"9780062445315",2,read,"mixed feelings",2021-11-10,2021-11-11 diff --git a/bookwyrm/tests/importers/test_importer.py b/bookwyrm/tests/importers/test_importer.py index f5d9af30..b2f0284d 100644 --- a/bookwyrm/tests/importers/test_importer.py +++ b/bookwyrm/tests/importers/test_importer.py @@ -1,6 +1,5 @@ """ testing import """ from collections import namedtuple -import csv import pathlib from unittest.mock import patch import datetime @@ -29,26 +28,7 @@ class GenericImporter(TestCase): def setUp(self): """use a test csv""" - class TestImporter(Importer): - """basic importer""" - - mandatory_fields = ["title", "author"] - - def parse_fields(self, entry): - return { - "id": entry["id"], - "Title": entry["title"], - "Author": entry["author"], - "ISBN13": entry["ISBN"], - "Star Rating": entry["rating"], - "My Rating": entry["rating"], - "My Review": entry["review"], - "Exclusive Shelf": entry["shelf"], - "Date Added": entry["added"], - "Date Read": None, - } - - self.importer = TestImporter() + self.importer = Importer() datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv") self.csv = open(datafile, "r", encoding=self.importer.encoding) with patch("bookwyrm.suggested_users.rerank_suggestions_task.delay"), patch( @@ -77,13 +57,24 @@ class GenericImporter(TestCase): import_items = models.ImportItem.objects.filter(job=import_job).all() self.assertEqual(len(import_items), 4) self.assertEqual(import_items[0].index, 0) - self.assertEqual(import_items[0].data["id"], "38") + self.assertEqual(import_items[0].normalized_data["id"], "38") + self.assertEqual(import_items[0].normalized_data["title"], "Gideon the Ninth") + self.assertEqual(import_items[0].normalized_data["authors"], "Tamsyn Muir") + self.assertEqual(import_items[0].normalized_data["isbn_13"], "9781250313195") + 
self.assertIsNone(import_items[0].normalized_data["isbn_10"]) + self.assertEqual(import_items[0].normalized_data["shelf"], "read") + self.assertEqual(import_items[1].index, 1) - self.assertEqual(import_items[1].data["id"], "48") + self.assertEqual(import_items[1].normalized_data["id"], "48") + self.assertEqual(import_items[1].normalized_data["title"], "Harrow the Ninth") + self.assertEqual(import_items[2].index, 2) - self.assertEqual(import_items[2].data["id"], "23") + self.assertEqual(import_items[2].normalized_data["id"], "23") + self.assertEqual(import_items[2].normalized_data["title"], "Subcutanean") + self.assertEqual(import_items[3].index, 3) - self.assertEqual(import_items[3].data["id"], "10") + self.assertEqual(import_items[3].normalized_data["id"], "10") + self.assertEqual(import_items[3].normalized_data["title"], "Patisserie at Home") def test_create_retry_job(self, *_): """trying again with items that didn't import""" @@ -103,9 +94,9 @@ class GenericImporter(TestCase): retry_items = models.ImportItem.objects.filter(job=retry).all() self.assertEqual(len(retry_items), 2) self.assertEqual(retry_items[0].index, 0) - self.assertEqual(retry_items[0].data["id"], "38") + self.assertEqual(retry_items[0].normalized_data["id"], "38") self.assertEqual(retry_items[1].index, 1) - self.assertEqual(retry_items[1].data["id"], "48") + self.assertEqual(retry_items[1].normalized_data["id"], "48") def test_start_import(self, *_): """check that a task was created""" @@ -143,15 +134,12 @@ class GenericImporter(TestCase): shelf = self.local_user.shelf_set.filter(identifier="read").first() self.assertIsNone(shelf.books.first()) - import_job = models.ImportJob.objects.create(user=self.local_user) - datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv") - csv_file = open(datafile, "r") # pylint: disable=unspecified-encoding - for index, entry in enumerate(list(csv.DictReader(csv_file))): - entry = self.importer.parse_fields(entry) - import_item = 
models.ImportItem.objects.create( - job_id=import_job.id, index=index, data=entry, book=self.book - ) - break + import_job = self.importer.create_job( + self.local_user, self.csv, False, "public" + ) + import_item = import_job.items.first() + import_item.book = self.book + import_item.save() with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"): handle_imported_book( @@ -172,15 +160,12 @@ class GenericImporter(TestCase): shelved_date=make_date(2020, 2, 2), ) - import_job = models.ImportJob.objects.create(user=self.local_user) - datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv") - csv_file = open(datafile, "r") # pylint: disable=unspecified-encoding - for index, entry in enumerate(list(csv.DictReader(csv_file))): - entry = self.importer.parse_fields(entry) - import_item = models.ImportItem.objects.create( - job_id=import_job.id, index=index, data=entry, book=self.book - ) - break + import_job = self.importer.create_job( + self.local_user, self.csv, False, "unlisted" + ) + import_item = import_job.items.first() + import_item.book = self.book + import_item.save() with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"): handle_imported_book( @@ -199,15 +184,12 @@ class GenericImporter(TestCase): def test_handle_import_twice(self, *_): """re-importing books""" shelf = self.local_user.shelf_set.filter(identifier="read").first() - import_job = models.ImportJob.objects.create(user=self.local_user) - datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv") - csv_file = open(datafile, "r") # pylint: disable=unspecified-encoding - for index, entry in enumerate(list(csv.DictReader(csv_file))): - entry = self.importer.parse_fields(entry) - import_item = models.ImportItem.objects.create( - job_id=import_job.id, index=index, data=entry, book=self.book - ) - break + import_job = self.importer.create_job( + self.local_user, self.csv, False, "public" + ) + import_item = import_job.items.first() + import_item.book = 
self.book + import_item.save() with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"): handle_imported_book( @@ -219,18 +201,15 @@ class GenericImporter(TestCase): shelf.refresh_from_db() self.assertEqual(shelf.books.first(), self.book) + self.assertEqual(models.ReadThrough.objects.count(), 1) @patch("bookwyrm.activitystreams.add_status_task.delay") def test_handle_imported_book_review(self, *_): """review import""" - import_job = models.ImportJob.objects.create(user=self.local_user) - datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv") - csv_file = open(datafile, "r") # pylint: disable=unspecified-encoding - entry = list(csv.DictReader(csv_file))[3] - entry = self.importer.parse_fields(entry) - import_item = models.ImportItem.objects.create( - job_id=import_job.id, index=0, data=entry, book=self.book - ) + import_job = self.importer.create_job(self.local_user, self.csv, True, "public") + import_item = import_job.items.filter(index=3).first() + import_item.book = self.book + import_item.save() with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"): with patch("bookwyrm.models.Status.broadcast") as broadcast_mock: @@ -251,14 +230,12 @@ class GenericImporter(TestCase): @patch("bookwyrm.activitystreams.add_status_task.delay") def test_handle_imported_book_rating(self, *_): """rating import""" - import_job = models.ImportJob.objects.create(user=self.local_user) - datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv") - csv_file = open(datafile, "r") # pylint: disable=unspecified-encoding - entry = list(csv.DictReader(csv_file))[1] - entry = self.importer.parse_fields(entry) - import_item = models.ImportItem.objects.create( - job_id=import_job.id, index=0, data=entry, book=self.book + import_job = self.importer.create_job( + self.local_user, self.csv, False, "public" ) + import_item = import_job.items.filter(index=1).first() + import_item.book = self.book + import_item.save() with 
patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"): handle_imported_book( @@ -271,14 +248,12 @@ class GenericImporter(TestCase): def test_handle_imported_book_reviews_disabled(self, *_): """review import""" - import_job = models.ImportJob.objects.create(user=self.local_user) - datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv") - csv_file = open(datafile, "r") # pylint: disable=unspecified-encoding - entry = list(csv.DictReader(csv_file))[2] - entry = self.importer.parse_fields(entry) - import_item = models.ImportItem.objects.create( - job_id=import_job.id, index=0, data=entry, book=self.book + import_job = self.importer.create_job( + self.local_user, self.csv, False, "unlisted" ) + import_item = import_job.items.filter(index=3).first() + import_item.book = self.book + import_item.save() with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"): handle_imported_book(