Use generalized mappings to handle import

Mouse Reeve 2021-11-10 16:49:54 -08:00
parent 0736c7e160
commit 4ccd9fc633
7 changed files with 152 additions and 178 deletions

bookwyrm/importers/goodreads_import.py

@@ -7,10 +7,3 @@ class GoodreadsImporter(Importer):
     For a more complete example of overriding see librarything_import.py"""

     service = "Goodreads"
-
-    def parse_fields(self, entry):
-        """handle the specific fields in goodreads csvs"""
-        entry.update({"import_source": self.service})
-        # add missing 'Date Started' field
-        entry.update({"Date Started": None})
-        return entry

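With parse_fields gone, a service importer is (in the simplest case) pure configuration. As an illustration of the pattern this commit sets up — not code from the commit, the class name and extra header guesses are invented — a new importer could look like this:

# Hypothetical example of a post-commit service importer: declarative only.
from . import Importer


class ExampleImporter(Importer):
    """csv downloads from a made-up service, for illustration only"""

    service = "Example"
    delimiter = ","
    # extend the Goodreads-based defaults with this service's header names
    row_mappings_guesses = dict(
        Importer.row_mappings_guesses,
        shelf=["shelf", "bookshelf", "read status"],
    )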
bookwyrm/importers/importer.py

@@ -1,5 +1,6 @@
 """ handle reading a csv from an external service, defaults are from Goodreads """
 import csv
+from dataclasses import dataclass
 import logging

 from django.utils import timezone
@@ -18,30 +19,59 @@ class Importer:
     service = "Unknown"
     delimiter = ","
     encoding = "UTF-8"
-    mandatory_fields = ["Title", "Author"]
+
+    # these are from Goodreads
+    row_mappings_guesses = {
+        "id": ["id", "book id"],
+        "title": ["title"],
+        "authors": ["author", "authors", "primary author"],
+        "isbn_13": ["isbn13", "isbn"],
+        "isbn_10": ["isbn10", "isbn"],
+        "shelf": ["shelf", "exclusive shelf", "read status"],
+        "review_name": [],
+        "review_body": ["my review"],
+        "rating": ["my rating", "rating", "star rating"],
+        "date_added": ["date added", "entry date", "added"],
+        "date_started": ["date started", "started"],
+        "date_finished": ["date finished", "last date read", "date read", "finished"],
+    }

     def create_job(self, user, csv_file, include_reviews, privacy):
         """check over a csv and creates a database entry for the job"""
+        csv_reader = csv.DictReader(csv_file, delimiter=self.delimiter)
+        rows = enumerate(list(csv_reader))
         job = ImportJob.objects.create(
-            user=user, include_reviews=include_reviews, privacy=privacy
+            user=user,
+            include_reviews=include_reviews,
+            privacy=privacy,
+            mappings=self.create_row_mappings(csv_reader.fieldnames),
         )

-        for index, entry in enumerate(
-            list(csv.DictReader(csv_file, delimiter=self.delimiter))
-        ):
-            if not all(x in entry for x in self.mandatory_fields):
-                raise ValueError("Author and title must be in data.")
-            entry = self.parse_fields(entry)
-            self.save_item(job, index, entry)
+        for index, entry in rows:
+            print(index, entry)
+            self.create_item(job, index, entry)
         return job

-    def save_item(self, job, index, data):  # pylint: disable=no-self-use
-        """creates and saves an import item"""
-        ImportItem(job=job, index=index, data=data).save()
+    def create_row_mappings(self, headers):
+        """guess what the headers mean"""
+        mappings = {}
+        for (key, guesses) in self.row_mappings_guesses.items():
+            value = [h for h in headers if h.lower() in guesses]
+            value = value[0] if len(value) else None
+            if value:
+                headers.remove(value)
+            mappings[key] = value
+        return mappings

-    def parse_fields(self, entry):
-        """updates csv data with additional info"""
-        entry.update({"import_source": self.service})
-        return entry
+    def create_item(self, job, index, data):
+        """creates and saves an import item"""
+        print(data)
+        normalized = self.normalize_row(data, job.mappings)
+        ImportItem(job=job, index=index, data=data, normalized_data=normalized).save()
+
+    def normalize_row(self, entry, mappings):  # pylint: disable=no-self-use
+        """use the dataclass to create the formatted row of data"""
+        return {k: entry.get(v) for k, v in mappings.items()}

     def create_retry_job(self, user, original_job, items):
         """retry items that didn't import"""
@@ -49,10 +79,13 @@ class Importer:
             user=user,
             include_reviews=original_job.include_reviews,
             privacy=original_job.privacy,
+            # TODO: allow users to adjust mappings
+            mappings=original_job.mappings,
             retry=True,
         )
         for item in items:
-            self.save_item(job, item.index, item.data)
+            # this will re-normalize the raw data
+            self.create_item(job, item.index, item.data)
         return job

     def start_import(self, job):
@@ -156,3 +189,23 @@ def handle_imported_book(source, user, item, include_reviews, privacy):
         )
         # only broadcast this review to other bookwyrm instances
         review.save(software="bookwyrm")
+
+
+@dataclass
+class ImportEntry:
+    """data extracted from a line in a csv"""
+
+    title: str
+    authors: str = None
+    isbn_13: str = None
+    isbn_10: str = None
+    shelf: str = None
+    review_name: str = None
+    review_rating: float = None
+    review_body: str = None
+    review_cw: str = None
+    rating: float = None
+    date_added: str = None
+    date_started: str = None
+    date_finished: str = None
+    import_source: str = "Unknown"

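To make the new flow concrete, here is a small usage sketch of the two pieces added above, create_row_mappings and normalize_row. It assumes Importer is importable from bookwyrm.importers (as the relative imports in the service modules suggest) and that Django settings are already configured; neither call itself touches the database.

from bookwyrm.importers import Importer

importer = Importer()
headers = ["Book Id", "Title", "Author", "ISBN13", "My Rating", "Date Added"]
mappings = importer.create_row_mappings(headers)
# each mapping key claims the first case-insensitive header match, and a
# claimed header is removed from the list so it can't match twice:
# {"id": "Book Id", "title": "Title", "authors": "Author",
#  "isbn_13": "ISBN13", "isbn_10": None, "rating": "My Rating", ...}

row = {
    "Book Id": "38",
    "Title": "Gideon the Ninth",
    "Author": "Tamsyn Muir",
    "ISBN13": "9781250313195",
    "My Rating": "",
    "Date Added": "2021-11-10",
}
normalized = importer.normalize_row(row, mappings)
# {"id": "38", "title": "Gideon the Ninth", "authors": "Tamsyn Muir",
#  "isbn_13": "9781250313195", "isbn_10": None, ..., "date_added": "2021-11-10"}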
bookwyrm/importers/librarything_import.py

@@ -1,7 +1,4 @@
-""" handle reading a csv from librarything """
-import re
-import math
+""" handle reading a tsv from librarything """

 from . import Importer
@@ -11,32 +8,3 @@ class LibrarythingImporter(Importer):
     service = "LibraryThing"
     delimiter = "\t"
     encoding = "ISO-8859-1"
-    # mandatory_fields : fields matching the book title and author
-    mandatory_fields = ["Title", "Primary Author"]
-
-    def parse_fields(self, entry):
-        """custom parsing for librarything"""
-        data = {}
-        data["import_source"] = self.service
-        data["Book Id"] = entry["Book Id"]
-        data["Title"] = entry["Title"]
-        data["Author"] = entry["Primary Author"]
-        data["ISBN13"] = entry["ISBN"]
-        data["My Review"] = entry["Review"]
-        if entry["Rating"]:
-            data["My Rating"] = math.ceil(float(entry["Rating"]))
-        else:
-            data["My Rating"] = ""
-        data["Date Added"] = re.sub(r"\[|\]", "", entry["Entry Date"])
-        data["Date Started"] = re.sub(r"\[|\]", "", entry["Date Started"])
-        data["Date Read"] = re.sub(r"\[|\]", "", entry["Date Read"])
-        data["Exclusive Shelf"] = None
-        if data["Date Read"]:
-            data["Exclusive Shelf"] = "read"
-        elif data["Date Started"]:
-            data["Exclusive Shelf"] = "reading"
-        else:
-            data["Exclusive Shelf"] = "to-read"
-        return data

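Note what the deleted parse_fields did beyond renaming columns: it stripped the square brackets LibraryThing puts around dates and inferred a shelf from the date fields. The generic normalize_row doesn't cover that, so a subclass hook along these lines (a sketch, not code from this commit) would be one way to restore it:

import re

from . import Importer


class LibrarythingImporter(Importer):
    """tsv downloads from librarything"""

    service = "LibraryThing"
    delimiter = "\t"
    encoding = "ISO-8859-1"

    def normalize_row(self, entry, mappings):
        """sketch: strip [brackets] from dates and infer a shelf"""
        normalized = super().normalize_row(entry, mappings)
        for field in ("date_added", "date_started", "date_finished"):
            if normalized.get(field):
                normalized[field] = re.sub(r"\[|\]", "", normalized[field])
        if not normalized.get("shelf"):
            if normalized.get("date_finished"):
                normalized["shelf"] = "read"
            elif normalized.get("date_started"):
                normalized["shelf"] = "reading"
            else:
                normalized["shelf"] = "to-read"
        return normalized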
bookwyrm/importers/storygraph_import.py

@@ -1,6 +1,4 @@
-""" handle reading a csv from librarything """
-import re
+""" handle reading a csv from storygraph"""

 from . import Importer
@@ -8,26 +6,3 @@ class StorygraphImporter(Importer):
     """csv downloads from librarything"""

     service = "Storygraph"
-    # mandatory_fields : fields matching the book title and author
-    mandatory_fields = ["Title"]
-
-    def parse_fields(self, entry):
-        """custom parsing for storygraph"""
-        data = {}
-        data["import_source"] = self.service
-        data["Title"] = entry["Title"]
-        data["Author"] = entry["Authors"] if "Authors" in entry else entry["Author"]
-        data["ISBN13"] = entry["ISBN"]
-        data["My Review"] = entry["Review"]
-        if entry["Star Rating"]:
-            data["My Rating"] = float(entry["Star Rating"])
-        else:
-            data["My Rating"] = ""
-        data["Date Added"] = re.sub(r"[/]", "-", entry["Date Added"])
-        data["Date Read"] = re.sub(r"[/]", "-", entry["Last Date Read"])
-        data["Exclusive Shelf"] = (
-            {"read": "read", "currently-reading": "reading", "to-read": "to-read"}
-        ).get(entry["Read Status"], None)
-        return data

bookwyrm/models/import_job.py

@@ -35,6 +35,7 @@ class ImportJob(models.Model):
     created_date = models.DateTimeField(default=timezone.now)
     task_id = models.CharField(max_length=100, null=True)
     include_reviews = models.BooleanField(default=True)
+    mappings = models.JSONField()
     complete = models.BooleanField(default=False)
     privacy = models.CharField(
         max_length=255, default="public", choices=PrivacyLevels.choices
@@ -48,6 +49,7 @@ class ImportItem(models.Model):
     job = models.ForeignKey(ImportJob, on_delete=models.CASCADE, related_name="items")
     index = models.IntegerField()
     data = models.JSONField()
+    normalized_data = models.JSONField()
     book = models.ForeignKey(Book, on_delete=models.SET_NULL, null=True, blank=True)
     book_guess = models.ForeignKey(
         Book,
@@ -98,55 +100,59 @@ class ImportItem(models.Model):
     @property
     def title(self):
         """get the book title"""
-        return self.data["title"]
+        return self.normalized_data["title"]

     @property
     def author(self):
         """get the book's authors"""
-        return self.data["authors"]
+        return self.normalized_data["authors"]

     @property
     def isbn(self):
         """pulls out the isbn13 field from the csv line data"""
-        return unquote_string(self.data["isbn_13"])
+        return unquote_string(self.normalized_data["isbn_13"])

     @property
     def shelf(self):
         """the goodreads shelf field"""
-        return self.data.get("shelf")
+        return self.normalized_data.get("shelf")

     @property
     def review(self):
         """a user-written review, to be imported with the book data"""
-        return self.data["review_body"]
+        return self.normalized_data["review_body"]

     @property
     def rating(self):
         """x/5 star rating for a book"""
-        if self.data.get("rating"):
-            return float(self.data["rating"])
+        if self.normalized_data.get("rating"):
+            return float(self.normalized_data["rating"])
         return None

     @property
     def date_added(self):
         """when the book was added to this dataset"""
-        if self.data.get("date_added"):
-            return timezone.make_aware(dateutil.parser.parse(self.data["date_added"]))
+        if self.normalized_data.get("date_added"):
+            return timezone.make_aware(
+                dateutil.parser.parse(self.normalized_data["date_added"])
+            )
         return None

     @property
     def date_started(self):
         """when the book was started"""
-        if self.data.get("date_started"):
-            return timezone.make_aware(dateutil.parser.parse(self.data["date_started"]))
+        if self.normalized_data.get("date_started"):
+            return timezone.make_aware(
+                dateutil.parser.parse(self.normalized_data["date_started"])
+            )
         return None

     @property
     def date_read(self):
         """the date a book was completed"""
-        if self.data.get("date_finished"):
+        if self.normalized_data.get("date_finished"):
             return timezone.make_aware(
-                dateutil.parser.parse(self.data["date_finished"])
+                dateutil.parser.parse(self.normalized_data["date_finished"])
             )
         return None
@@ -177,8 +183,12 @@ class ImportItem(models.Model):
     def __repr__(self):
         # pylint: disable=consider-using-f-string
-        return "<{!r}Item {!r}>".format(self.data["import_source"], self.data["title"])
+        return "<{!r}Item {!r}>".format(
+            self.normalized_data["import_source"], self.normalized_data["title"]
+        )

     def __str__(self):
         # pylint: disable=consider-using-f-string
-        return "{} by {}".format(self.data["title"], self.data["authors"])
+        return "{} by {}".format(
+            self.normalized_data["title"], self.normalized_data["authors"]
+        )

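Neither new JSONField is nullable or has a default, and no migration file is among the seven changed files, so existing databases will need one separately. A sketch of what it might look like — the dependency name is a placeholder and the dict defaults are an assumption, not taken from the commit:

from django.db import migrations, models


class Migration(migrations.Migration):
    dependencies = [("bookwyrm", "XXXX_previous_migration")]  # placeholder

    operations = [
        # default=dict backfills pre-existing rows with an empty mapping
        migrations.AddField(
            "importjob", "mappings", models.JSONField(default=dict)
        ),
        migrations.AddField(
            "importitem", "normalized_data", models.JSONField(default=dict)
        ),
    ]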
bookwyrm/tests/data/generic.csv

@@ -1,5 +1,5 @@
-id,title,author,ISBN,rating,shelf,review,added
-38,Gideon the Ninth (The Locked Tomb #1),Tamsyn Muir,"9781250313195",,read,,2021-11-10
-48,Harrow the Ninth (The Locked Tomb #2),Tamsyn Muir,,3,read,,2021-11-10
+id,title,author,ISBN,rating,shelf,review,added,finished
+38,Gideon the Ninth,Tamsyn Muir,"9781250313195",,read,,2021-11-10,2021-11-11
+48,Harrow the Ninth,Tamsyn Muir,,3,read,,2021-11-10
 23,Subcutanean,Aaron A. Reed,,,read,,2021-11-10
-10,Patisserie at Home,Mélanie Dupuis,"9780062445315",2,read,"mixed feelings",2021-11-10
+10,Patisserie at Home,Mélanie Dupuis,"9780062445315",2,read,"mixed feelings",2021-11-10,2021-11-11


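Tracing this fixture's header row through row_mappings_guesses by hand shows why the tests no longer need a per-test parse_fields stub. The comments below are hand-derived from the guesses shown above, not program output:

headers = ["id", "title", "author", "ISBN", "rating", "shelf", "review", "added", "finished"]
# id            -> "id"
# title         -> "title"
# authors       -> "author"
# isbn_13       -> "ISBN"      ("isbn" is among the isbn_13 guesses)
# isbn_10       -> None        ("ISBN" was already claimed by isbn_13)
# shelf         -> "shelf"
# rating        -> "rating"
# date_added    -> "added"
# date_finished -> "finished"
# review_body   -> None        ("review" is not among the "my review" guesses shown)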
bookwyrm/tests/importers/test_importer.py

@@ -1,6 +1,5 @@
 """ testing import """
 from collections import namedtuple
-import csv
 import pathlib
 from unittest.mock import patch
 import datetime
@@ -29,26 +28,7 @@ class GenericImporter(TestCase):
     def setUp(self):
         """use a test csv"""
-
-        class TestImporter(Importer):
-            """basic importer"""
-
-            mandatory_fields = ["title", "author"]
-
-            def parse_fields(self, entry):
-                return {
-                    "id": entry["id"],
-                    "Title": entry["title"],
-                    "Author": entry["author"],
-                    "ISBN13": entry["ISBN"],
-                    "Star Rating": entry["rating"],
-                    "My Rating": entry["rating"],
-                    "My Review": entry["review"],
-                    "Exclusive Shelf": entry["shelf"],
-                    "Date Added": entry["added"],
-                    "Date Read": None,
-                }
-
-        self.importer = TestImporter()
+        self.importer = Importer()
         datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv")
         self.csv = open(datafile, "r", encoding=self.importer.encoding)
         with patch("bookwyrm.suggested_users.rerank_suggestions_task.delay"), patch(
@@ -77,13 +57,24 @@ class GenericImporter(TestCase):
         import_items = models.ImportItem.objects.filter(job=import_job).all()
         self.assertEqual(len(import_items), 4)
         self.assertEqual(import_items[0].index, 0)
-        self.assertEqual(import_items[0].data["id"], "38")
+        self.assertEqual(import_items[0].normalized_data["id"], "38")
+        self.assertEqual(import_items[0].normalized_data["title"], "Gideon the Ninth")
+        self.assertEqual(import_items[0].normalized_data["authors"], "Tamsyn Muir")
+        self.assertEqual(import_items[0].normalized_data["isbn_13"], "9781250313195")
+        self.assertIsNone(import_items[0].normalized_data["isbn_10"])
+        self.assertEqual(import_items[0].normalized_data["shelf"], "read")
+
         self.assertEqual(import_items[1].index, 1)
-        self.assertEqual(import_items[1].data["id"], "48")
+        self.assertEqual(import_items[1].normalized_data["id"], "48")
+        self.assertEqual(import_items[1].normalized_data["title"], "Harrow the Ninth")
+
         self.assertEqual(import_items[2].index, 2)
-        self.assertEqual(import_items[2].data["id"], "23")
+        self.assertEqual(import_items[2].normalized_data["id"], "23")
+        self.assertEqual(import_items[2].normalized_data["title"], "Subcutanean")
+
         self.assertEqual(import_items[3].index, 3)
-        self.assertEqual(import_items[3].data["id"], "10")
+        self.assertEqual(import_items[3].normalized_data["id"], "10")
+        self.assertEqual(import_items[3].normalized_data["title"], "Patisserie at Home")

     def test_create_retry_job(self, *_):
         """trying again with items that didn't import"""
@@ -103,9 +94,9 @@ class GenericImporter(TestCase):
         retry_items = models.ImportItem.objects.filter(job=retry).all()
         self.assertEqual(len(retry_items), 2)
         self.assertEqual(retry_items[0].index, 0)
-        self.assertEqual(retry_items[0].data["id"], "38")
+        self.assertEqual(retry_items[0].normalized_data["id"], "38")
         self.assertEqual(retry_items[1].index, 1)
-        self.assertEqual(retry_items[1].data["id"], "48")
+        self.assertEqual(retry_items[1].normalized_data["id"], "48")

     def test_start_import(self, *_):
         """check that a task was created"""
@@ -143,15 +134,12 @@ class GenericImporter(TestCase):
         shelf = self.local_user.shelf_set.filter(identifier="read").first()
         self.assertIsNone(shelf.books.first())

-        import_job = models.ImportJob.objects.create(user=self.local_user)
-        datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv")
-        csv_file = open(datafile, "r")  # pylint: disable=unspecified-encoding
-        for index, entry in enumerate(list(csv.DictReader(csv_file))):
-            entry = self.importer.parse_fields(entry)
-            import_item = models.ImportItem.objects.create(
-                job_id=import_job.id, index=index, data=entry, book=self.book
-            )
-            break
+        import_job = self.importer.create_job(
+            self.local_user, self.csv, False, "public"
+        )
+        import_item = import_job.items.first()
+        import_item.book = self.book
+        import_item.save()

         with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"):
             handle_imported_book(
@@ -172,15 +160,12 @@ class GenericImporter(TestCase):
             shelved_date=make_date(2020, 2, 2),
         )

-        import_job = models.ImportJob.objects.create(user=self.local_user)
-        datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv")
-        csv_file = open(datafile, "r")  # pylint: disable=unspecified-encoding
-        for index, entry in enumerate(list(csv.DictReader(csv_file))):
-            entry = self.importer.parse_fields(entry)
-            import_item = models.ImportItem.objects.create(
-                job_id=import_job.id, index=index, data=entry, book=self.book
-            )
-            break
+        import_job = self.importer.create_job(
+            self.local_user, self.csv, False, "unlisted"
+        )
+        import_item = import_job.items.first()
+        import_item.book = self.book
+        import_item.save()

         with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"):
             handle_imported_book(
@@ -199,15 +184,12 @@ class GenericImporter(TestCase):
     def test_handle_import_twice(self, *_):
         """re-importing books"""
         shelf = self.local_user.shelf_set.filter(identifier="read").first()
-        import_job = models.ImportJob.objects.create(user=self.local_user)
-        datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv")
-        csv_file = open(datafile, "r")  # pylint: disable=unspecified-encoding
-        for index, entry in enumerate(list(csv.DictReader(csv_file))):
-            entry = self.importer.parse_fields(entry)
-            import_item = models.ImportItem.objects.create(
-                job_id=import_job.id, index=index, data=entry, book=self.book
-            )
-            break
+        import_job = self.importer.create_job(
+            self.local_user, self.csv, False, "public"
+        )
+        import_item = import_job.items.first()
+        import_item.book = self.book
+        import_item.save()

         with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"):
             handle_imported_book(
@@ -219,18 +201,15 @@ class GenericImporter(TestCase):
         shelf.refresh_from_db()
         self.assertEqual(shelf.books.first(), self.book)
+        self.assertEqual(models.ReadThrough.objects.count(), 1)

     @patch("bookwyrm.activitystreams.add_status_task.delay")
     def test_handle_imported_book_review(self, *_):
         """review import"""
-        import_job = models.ImportJob.objects.create(user=self.local_user)
-        datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv")
-        csv_file = open(datafile, "r")  # pylint: disable=unspecified-encoding
-        entry = list(csv.DictReader(csv_file))[3]
-        entry = self.importer.parse_fields(entry)
-        import_item = models.ImportItem.objects.create(
-            job_id=import_job.id, index=0, data=entry, book=self.book
-        )
+        import_job = self.importer.create_job(self.local_user, self.csv, True, "public")
+        import_item = import_job.items.filter(index=3).first()
+        import_item.book = self.book
+        import_item.save()

         with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"):
             with patch("bookwyrm.models.Status.broadcast") as broadcast_mock:
@@ -251,14 +230,12 @@ class GenericImporter(TestCase):
     @patch("bookwyrm.activitystreams.add_status_task.delay")
     def test_handle_imported_book_rating(self, *_):
         """rating import"""
-        import_job = models.ImportJob.objects.create(user=self.local_user)
-        datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv")
-        csv_file = open(datafile, "r")  # pylint: disable=unspecified-encoding
-        entry = list(csv.DictReader(csv_file))[1]
-        entry = self.importer.parse_fields(entry)
-        import_item = models.ImportItem.objects.create(
-            job_id=import_job.id, index=0, data=entry, book=self.book
-        )
+        import_job = self.importer.create_job(
+            self.local_user, self.csv, False, "public"
+        )
+        import_item = import_job.items.filter(index=1).first()
+        import_item.book = self.book
+        import_item.save()

         with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"):
             handle_imported_book(
@@ -271,14 +248,12 @@ class GenericImporter(TestCase):
     def test_handle_imported_book_reviews_disabled(self, *_):
         """review import"""
-        import_job = models.ImportJob.objects.create(user=self.local_user)
-        datafile = pathlib.Path(__file__).parent.joinpath("../data/generic.csv")
-        csv_file = open(datafile, "r")  # pylint: disable=unspecified-encoding
-        entry = list(csv.DictReader(csv_file))[2]
-        entry = self.importer.parse_fields(entry)
-        import_item = models.ImportItem.objects.create(
-            job_id=import_job.id, index=0, data=entry, book=self.book
-        )
+        import_job = self.importer.create_job(
+            self.local_user, self.csv, False, "unlisted"
+        )
+        import_item = import_job.items.filter(index=3).first()
+        import_item.book = self.book
+        import_item.save()

         with patch("bookwyrm.models.activitypub_mixin.broadcast_task.delay"):
             handle_imported_book(