moviewyrm/bookwyrm/models/import_job.py

243 lines
8 KiB
Python
Raw Normal View History

2021-03-08 16:49:10 +00:00
""" track progress of goodreads imports """
import re
import dateutil.parser
from django.db import models
from django.utils import timezone
2021-01-02 16:14:28 +00:00
from bookwyrm.connectors import connector_manager
2021-11-12 21:48:31 +00:00
from bookwyrm.models import ReadThrough, User, Book, Edition
from .fields import PrivacyLevels
def unquote_string(text):
    """Undo the ``="..."`` wrapping that some csv exports put around values.

    Returns ``None`` for empty or missing input, the text between the quotes
    when the value starts with ``="..."``, and the value unchanged otherwise.
    """
    if text:
        wrapped = re.match(r'="([^"]*)"', text)
        return wrapped.group(1) if wrapped else text
    return None
def construct_search_term(title, author):
    """Formulate a query for the data connector.

    Bracketed segments are removed from ``title`` and runs of initials
    (``J.K.``) are removed from ``author``. The two parts are joined with a
    single space; an empty part is skipped, so the result never has leading
    or trailing whitespace.

    ``title`` must be a string; ``author`` may be ``None`` or empty.
    """
    # Strip brackets (usually series title from search term). They are
    # replaced with a space and whitespace is then collapsed, so the words on
    # either side of a mid-title bracket are not glued together.
    title = " ".join(re.sub(r"\s*\([^)]*\)\s*", " ", title).split())
    # Open library doesn't like including author initials in search term.
    author = re.sub(r"(\w\.)+\s*", "", author).strip() if author else ""
    # Only join the parts that are actually present, so a missing author
    # does not leave a dangling separator.
    return " ".join(part for part in (title, author) if part)
2020-05-09 21:26:27 +00:00
class ImportJob(models.Model):
    """One request, made by one user, to import book data."""

    user = models.ForeignKey(User, on_delete=models.CASCADE)
    created_date = models.DateTimeField(default=timezone.now)
    # refreshed by ImportItem.update_job whenever an item reports in
    updated_date = models.DateTimeField(default=timezone.now)
    include_reviews = models.BooleanField(default=True)
    mappings = models.JSONField()
    # set once no pending items remain (see ImportItem.update_job)
    complete = models.BooleanField(default=False)
    source = models.CharField(max_length=100)
    privacy = models.CharField(max_length=255, default="public", choices=PrivacyLevels)
    retry = models.BooleanField(default=False)

    @property
    def pending_items(self):
        """Items with neither a matched book nor a recorded failure reason."""
        unprocessed = {"fail_reason__isnull": True, "book__isnull": True}
        return self.items.filter(**unprocessed)
class ImportItem(models.Model):
    """A single line of a csv being imported.

    The raw row is kept in ``data``; every accessor below reads from
    ``normalized_data``.
    """

    job = models.ForeignKey(ImportJob, on_delete=models.CASCADE, related_name="items")
    index = models.IntegerField()
    data = models.JSONField()
    normalized_data = models.JSONField()
    book = models.ForeignKey(Book, on_delete=models.SET_NULL, null=True, blank=True)
    book_guess = models.ForeignKey(
        Book,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name="book_guess",
    )
    fail_reason = models.TextField(null=True)
    linked_review = models.ForeignKey(
        "Review", on_delete=models.SET_NULL, null=True, blank=True
    )

    def update_job(self):
        """Let the job know when the items get work done.

        Bumps the job's ``updated_date`` and marks the job complete once it
        has no pending items left.
        """
        job = self.job
        job.updated_date = timezone.now()
        job.save()
        if not job.pending_items.exists() and not job.complete:
            job.complete = True
            job.save(update_fields=["complete"])

    def resolve(self):
        """Try various ways to look up a book.

        Sets ``self.book`` on a confident match; a low-confidence
        title/author match is stored in ``self.book_guess`` instead.
        Nothing is saved here.
        """
        # we might be calling this after manually adding the book,
        # so no need to do searches
        if self.book:
            return

        if self.isbn:
            self.book = self.get_book_from_identifier()
        elif self.openlibrary_key:
            self.book = self.get_book_from_identifier(field="openlibrary_key")
        else:
            # don't fall back on title/author search if isbn is present.
            # you're too likely to mismatch
            book, confidence = self.get_book_from_title_author()
            if confidence > 0.999:
                self.book = book
            else:
                self.book_guess = book

    def get_book_from_identifier(self, field="isbn"):
        """Search by isbn or other unique identifier.

        ``field`` names the attribute of this item whose value is used as the
        query. Returns the matched book, or ``None`` when nothing is found.
        """
        search_result = connector_manager.first_search_result(
            getattr(self, field), min_confidence=0.999
        )
        if not search_result:
            return None
        # it's already in the right format
        if isinstance(search_result, Edition):
            return search_result
        # it's just a search result, book needs to be created
        # raises ConnectorException
        return search_result.connector.get_or_create_book(search_result.key)

    def get_book_from_title_author(self):
        """Search by title and author.

        Returns a ``(book, confidence)`` pair; ``(None, 0)`` when the item
        has no title or the search finds nothing.
        """
        if not self.title:
            return None, 0
        search_term = construct_search_term(self.title, self.author)
        search_result = connector_manager.first_search_result(
            search_term, min_confidence=0.1
        )
        if not search_result:
            return None, 0
        if isinstance(search_result, Edition):
            return (search_result, 1)
        # raises ConnectorException
        return (
            search_result.connector.get_or_create_book(search_result.key),
            search_result.confidence,
        )

    def _parse_date(self, key):
        """Parse ``normalized_data[key]`` into a timezone-aware datetime.

        Returns ``None`` when the value is missing or empty. A timezone that
        came with the imported value is kept; only naive values are passed to
        ``timezone.make_aware``, which rejects datetimes that are already
        aware.
        """
        raw_value = self.normalized_data.get(key)
        if not raw_value:
            return None
        parsed = dateutil.parser.parse(raw_value)
        if timezone.is_aware(parsed):
            # Keep timezone if import already had one
            return parsed
        return timezone.make_aware(parsed)

    @property
    def title(self):
        """Get the book title."""
        return self.normalized_data.get("title")

    @property
    def author(self):
        """Get the book's authors."""
        return self.normalized_data.get("authors")

    @property
    def isbn(self):
        """The ``isbn_13`` value, or ``isbn_10`` if that is empty, unquoted."""
        return unquote_string(self.normalized_data.get("isbn_13")) or unquote_string(
            self.normalized_data.get("isbn_10")
        )

    @property
    def openlibrary_key(self):
        """The edition identifier is preferable to the work key."""
        return self.normalized_data.get("openlibrary_key") or self.normalized_data.get(
            "openlibrary_work_key"
        )

    @property
    def shelf(self):
        """The shelf field of the normalized data."""
        return self.normalized_data.get("shelf")

    @property
    def review(self):
        """A user-written review, to be imported with the book data."""
        return self.normalized_data.get("review_body")

    @property
    def rating(self):
        """Rating for a book as a float, or ``None`` when absent or empty."""
        raw_rating = self.normalized_data.get("rating")
        if raw_rating:
            return float(raw_rating)
        return None

    @property
    def date_added(self):
        """When the book was added to this dataset."""
        return self._parse_date("date_added")

    @property
    def date_started(self):
        """When the book was started."""
        return self._parse_date("date_started")

    @property
    def date_read(self):
        """The date a book was completed."""
        return self._parse_date("date_finished")

    @property
    def reads(self):
        """Format a read-through dataset for the book in this line.

        Returns a list holding at most one unsaved ``ReadThrough``.
        """
        start_date = self.date_started
        # parse once; the value is needed in several places below
        finish_date = self.date_read

        # Goodreads special case (no 'date started' field)
        if (
            (self.shelf == "reading" or (self.shelf == "read" and finish_date))
            and not start_date
            and self.date_added
        ):
            start_date = self.date_added

        if finish_date:
            # a start date that isn't before the finish date is discarded
            if not (start_date and start_date < finish_date):
                start_date = None
            return [
                ReadThrough(
                    start_date=start_date,
                    finish_date=finish_date,
                )
            ]
        if start_date:
            return [ReadThrough(start_date=start_date)]
        return []

    def __repr__(self):
        return f"<{self.index!r} Item {self.normalized_data.get('title')!r}>"

    def __str__(self):
        title = self.normalized_data.get("title")
        authors = self.normalized_data.get("authors")
        return f"{title} by {authors}"