bookwyrm/bookwyrm/importers/importer.py

208 lines
7.1 KiB
Python
Raw Normal View History

""" handle reading a csv from an external service, defaults are from Goodreads """
2021-02-20 16:02:36 +00:00
import csv
import logging
from django.utils import timezone
2021-08-10 20:48:09 +00:00
from django.utils.translation import gettext_lazy as _
2021-02-20 16:02:36 +00:00
from bookwyrm import models
from bookwyrm.models import ImportJob, ImportItem
from bookwyrm.tasks import app, LOW
2021-02-20 16:02:36 +00:00
logger = logging.getLogger(__name__)
2021-03-08 16:49:10 +00:00
2021-02-20 16:02:36 +00:00
class Importer:
    """Generic class for csv data import from an outside service"""

    # name of the service, stored on each job as its `source`
    service = "Import"
    delimiter = ","
    encoding = "UTF-8"
    # lower-cased csv headers that may hold each field; these are from Goodreads
    row_mappings_guesses = {
        "id": ["id", "book id"],
        "title": ["title"],
        "authors": ["author", "authors", "primary author"],
        "isbn_13": ["isbn13", "isbn"],
        "isbn_10": ["isbn10", "isbn"],
        "shelf": ["shelf", "exclusive shelf", "read status"],
        "review_name": ["review name"],
        "review_body": ["my review", "review"],
        "rating": ["my rating", "rating", "star rating"],
        "date_added": ["date added", "entry date", "added"],
        "date_started": ["date started", "started"],
        "date_finished": ["date finished", "last date read", "date read", "finished"],
    }
    date_fields = ["date_added", "date_started", "date_finished"]
    # shelf name -> values in the csv's shelf column that select it
    shelf_mapping_guesses = {
        "to-read": ["to-read"],
        "read": ["read"],
        "reading": ["currently-reading", "reading"],
    }

    def create_job(self, user, csv_file, include_reviews, privacy):
        """check over a csv and creates a database entry for the job

        Every row of `csv_file` becomes one import item on the new job.
        Returns the created job.
        """
        csv_reader = csv.DictReader(csv_file, delimiter=self.delimiter)
        # read all the rows before the job is created
        rows = list(csv_reader)

        job = ImportJob.objects.create(
            user=user,
            include_reviews=include_reviews,
            privacy=privacy,
            mappings=self.create_row_mappings(csv_reader.fieldnames),
            source=self.service,
        )

        for index, entry in enumerate(rows):
            self.create_item(job, index, entry)
        return job

    def create_row_mappings(self, headers):
        """guess what the headers mean

        Returns a dict with one entry per key of `row_mappings_guesses`; the
        value is the first matching header (compared case-insensitively), or
        None when no header matches. A header is only ever assigned to one key.
        """
        # work on a copy so the caller's list (the csv reader's fieldnames) is
        # left alone; a csv without a header row has no fieldnames at all
        available = list(headers or [])
        mappings = {}
        for key, guesses in self.row_mappings_guesses.items():
            header = next((h for h in available if h.lower() in guesses), None)
            if header:
                # claimed: a later key with an overlapping guess ("isbn") can't reuse it
                available.remove(header)
            mappings[key] = header
        return mappings

    def create_item(self, job, index, data):
        """creates and saves an import item"""
        normalized = self.normalize_row(data, job.mappings)
        normalized["shelf"] = self.get_shelf(normalized)
        ImportItem(job=job, index=index, data=data, normalized_data=normalized).save()

    def get_shelf(self, normalized_row):
        """determine which shelf to use

        Returns the first key of `shelf_mapping_guesses` whose guesses contain
        the row's shelf value, or None if there is no match.
        """
        shelf_name = normalized_row.get("shelf")
        matches = (
            shelf
            for shelf, guesses in self.shelf_mapping_guesses.items()
            if shelf_name in guesses
        )
        return next(matches, None)

    def normalize_row(self, entry, mappings):  # pylint: disable=no-self-use
        """pick the mapped columns out of a raw csv row

        Returns a dict keyed by the mapping keys; fields with no mapped header
        are None.
        """
        # an unmapped field has a header of None, which is also the key that
        # csv.DictReader stores surplus cells under, so it must not be looked up
        return {
            key: entry.get(header) if header is not None else None
            for key, header in mappings.items()
        }

    def create_retry_job(self, user, original_job, items):
        """retry items that didn't import

        Creates a new job flagged as a retry that copies the settings of
        `original_job`, with a fresh import item for each of `items`.
        Returns the created job.
        """
        job = ImportJob.objects.create(
            user=user,
            include_reviews=original_job.include_reviews,
            privacy=original_job.privacy,
            # TODO: allow users to adjust mappings
            mappings=original_job.mappings,
            # keep the service name, as create_job does
            source=original_job.source,
            retry=True,
        )
        for item in items:
            # this will re-normalize the raw data
            self.create_item(job, item.index, item.data)
        return job

    def start_import(self, job):  # pylint: disable=no-self-use
        """initializes a csv import job"""
        result = start_import_task.delay(job.id)
        # remember which task is running this job
        job.task_id = result.id
        job.save()
2021-09-08 00:04:10 +00:00
@app.task(queue="low_priority")
def start_import_task(job_id):
    """queue a separate child task for every item in the import job"""
    import_job = ImportJob.objects.get(id=job_id)
    item_ids = import_job.items.values_list("id", flat=True).all()
    # these are sub-tasks so that one big task doesn't use up all the memory in celery
    for item_id in item_ids:
        import_item_task.delay(item_id)
@app.task(queue="low_priority")
def import_item_task(item_id):
    """resolve one import item and, if a book was found, process it"""
    import_item = models.ImportItem.objects.get(id=item_id)

    try:
        import_item.resolve()
    except Exception as err:  # pylint: disable=broad-except
        # record the failure on the item before the error propagates
        import_item.fail_reason = _("Error loading book")
        import_item.save()
        raise err

    if not import_item.book:
        import_item.fail_reason = _("Could not find a match for book")
        import_item.save()
        return

    # shelves book and handles reviews
    handle_imported_book(import_item)
2021-02-20 16:02:36 +00:00
2021-11-13 01:10:47 +00:00
def handle_imported_book(item):
    """shelve an imported book and save its readthroughs and review

    `item` is an import item whose book has already been found. Its job
    supplies the user, the privacy setting, the source name used in the
    review title, and whether reviews should be imported at all.
    """
    job = item.job
    user = job.user

    # everything below is attached to an edition, not a work
    if isinstance(item.book, models.Work):
        item.book = item.book.default_edition
    if not item.book:
        return
    if not isinstance(item.book, models.Edition):
        item.book = item.book.edition

    # shelve the book if it hasn't been shelved already; the lookup is skipped
    # when the item has no shelf to put the book on
    if (
        item.shelf
        and not models.ShelfBook.objects.filter(book=item.book, user=user).exists()
    ):
        desired_shelf = models.Shelf.objects.get(identifier=item.shelf, user=user)
        shelved_date = item.date_added or timezone.now()
        models.ShelfBook(
            book=item.book, shelf=desired_shelf, user=user, shelved_date=shelved_date
        ).save(priority=LOW)

    for read in item.reads:
        # check for an existing readthrough with the same dates
        if models.ReadThrough.objects.filter(
            user=user,
            book=item.book,
            start_date=read.start_date,
            finish_date=read.finish_date,
        ).exists():
            continue
        read.book = item.book
        read.user = user
        read.save()

    if job.include_reviews and (item.rating or item.review):
        # we don't know the publication date of the review,
        # but "now" is a bad guess
        published_date_guess = item.date_read or item.date_added
        if item.review:
            # pylint: disable=consider-using-f-string
            review_title = "Review of {!r} on {!r}".format(
                item.book.title,
                job.source,
            )
            review = models.Review(
                user=user,
                book=item.book,
                name=review_title,
                content=item.review,
                rating=item.rating,
                published_date=published_date_guess,
                privacy=job.privacy,
            )
        else:
            # just a rating
            review = models.ReviewRating(
                user=user,
                book=item.book,
                rating=item.rating,
                published_date=published_date_guess,
                privacy=job.privacy,
            )
        # only broadcast this review to other bookwyrm instances
        review.save(software="bookwyrm", priority=LOW)