2021-03-08 16:49:10 +00:00
|
|
|
""" track progress of goodreads imports """
|
2020-04-21 14:09:21 +00:00
|
|
|
import re
|
|
|
|
import dateutil.parser
|
|
|
|
|
|
|
|
from django.db import models
|
|
|
|
from django.utils import timezone
|
|
|
|
|
2021-01-02 16:14:28 +00:00
|
|
|
from bookwyrm.connectors import connector_manager
|
2021-11-12 21:48:31 +00:00
|
|
|
from bookwyrm.models import ReadThrough, User, Book, Edition
|
2020-12-13 21:03:17 +00:00
|
|
|
from .fields import PrivacyLevels
|
2020-10-30 18:21:02 +00:00
|
|
|
|
2020-04-21 14:09:21 +00:00
|
|
|
|
|
|
|
def unquote_string(text):
    """Resolve csv quote weirdness by unwrapping values written as ="...".

    Args:
        text: the raw cell value; may be None or empty.

    Returns:
        The characters between the quotes when ``text`` starts with
        ``="..."``; otherwise ``text`` unchanged (None and "" included).
    """
    # A missing cell arrives as None (or ""); re.match would raise TypeError
    # on None, so hand falsy values straight back to the caller.
    if not text:
        return text
    match = re.match(r'="([^"]*)"', text)
    if match:
        return match.group(1)
    return text
|
|
|
|
|
|
|
|
|
|
|
|
def construct_search_term(title, author):
    """Formulate a query for the data connector.

    Args:
        title: the book title; any parenthesised section is removed.
        author: the author string, or None/"" when the row has no author.

    Returns:
        ``title`` and ``author`` joined by a single space, after cleanup.
    """
    # Strip brackets (usually series title from search term)
    title = re.sub(r"\s*\([^)]*\)\s*", "", title)
    # Open library doesn't like including author initials in search term.
    # The leading \b restricts the match to standalone initials ("J.R.R.",
    # "K."); without it the tail of an abbreviation is eaten as well, e.g.
    # "Dr. Seuss" -> "DSeuss". A missing author is treated as empty rather
    # than crashing re.sub.
    author = re.sub(r"\b(\w\.)+\s*", "", author) if author else ""
    return " ".join([title, author])
|
2020-04-21 14:09:21 +00:00
|
|
|
|
2020-05-09 21:26:27 +00:00
|
|
|
|
2020-04-21 14:09:21 +00:00
|
|
|
class ImportJob(models.Model):
    """entry for a specific request for book data import"""

    # owner of the job; the job row is deleted together with the user
    user = models.ForeignKey(User, on_delete=models.CASCADE)
    # defaults to the time the row is created
    created_date = models.DateTimeField(default=timezone.now)
    task_id = models.CharField(max_length=100, null=True)  # TODO: deprecated
    # NOTE(review): presumably controls whether review text is imported
    # along with the books -- confirm against the importer
    include_reviews = models.BooleanField(default=True)
    # NOTE(review): presumably maps source csv columns to normalized field
    # names -- confirm; no default, so it must be supplied on creation
    mappings = models.JSONField()
    complete = models.BooleanField(default=False)
    # NOTE(review): presumably the name of the service the csv came from
    source = models.CharField(max_length=100)
    # one of PrivacyLevels.choices; "public" unless specified
    privacy = models.CharField(
        max_length=255, default="public", choices=PrivacyLevels.choices
    )
    # NOTE(review): presumably marks a job that re-runs failed items -- confirm
    retry = models.BooleanField(default=False)
|
2020-10-29 21:29:31 +00:00
|
|
|
|
2021-02-10 22:18:55 +00:00
|
|
|
|
2020-04-21 14:09:21 +00:00
|
|
|
class ImportItem(models.Model):
    """a single line of a csv being imported"""

    # parent job; reachable from the job as `job.items`
    job = models.ForeignKey(ImportJob, on_delete=models.CASCADE, related_name="items")
    index = models.IntegerField()
    data = models.JSONField()
    # every accessor below reads from this dict, not from `data`
    normalized_data = models.JSONField()
    # the confidently matched book, if any
    book = models.ForeignKey(Book, on_delete=models.SET_NULL, null=True, blank=True)
    # a low-confidence title/author match (see `resolve`)
    book_guess = models.ForeignKey(
        Book,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name="book_guess",
    )
    fail_reason = models.TextField(null=True)

    def resolve(self):
        """try various ways to lookup a book

        Sets `self.book` on a confident match, or `self.book_guess` when a
        title/author search only produced a low-confidence candidate.
        Does not save the model.
        """
        # we might be calling this after manually adding the book,
        # so no need to do searches
        if self.book:
            return

        if self.isbn:
            # don't fall back on title/author search if isbn is present.
            # you're too likely to mismatch
            self.book = self.get_book_from_isbn()
        else:
            book, confidence = self.get_book_from_title_author()
            if confidence > 0.999:
                self.book = book
            else:
                self.book_guess = book

    def get_book_from_isbn(self):
        """search by isbn

        Returns the matching book, or None when the search finds nothing.
        """
        search_result = connector_manager.first_search_result(
            self.isbn, min_confidence=0.999
        )
        if search_result:
            # it's already in the right format
            if isinstance(search_result, Edition):
                return search_result
            # it's just a search result, book needs to be created
            # raises ConnectorException
            return search_result.connector.get_or_create_book(search_result.key)
        return None

    def get_book_from_title_author(self):
        """search by title and author

        Returns a `(book, confidence)` tuple; `(None, 0)` when nothing is
        found. An `Edition` result is reported with confidence 1.
        """
        search_term = construct_search_term(self.title, self.author)
        search_result = connector_manager.first_search_result(
            search_term, min_confidence=0.1
        )
        if search_result:
            if isinstance(search_result, Edition):
                return (search_result, 1)
            # raises ConnectorException
            return (
                search_result.connector.get_or_create_book(search_result.key),
                search_result.confidence,
            )
        return None, 0

    def _parse_date(self, key):
        """parse the date stored under `key` in normalized_data

        Returns a timezone-aware datetime, or None when the field is missing
        or empty. Unparseable values still raise from dateutil.
        """
        raw_date = self.normalized_data.get(key)
        if not raw_date:
            return None
        parsed = dateutil.parser.parse(raw_date)
        # make_aware raises ValueError on a datetime that already carries
        # tzinfo, so keep the timezone when the import supplied one
        if timezone.is_aware(parsed):
            return parsed
        return timezone.make_aware(parsed)

    @property
    def title(self):
        """get the book title (KeyError if the row has no "title" key)"""
        return self.normalized_data["title"]

    @property
    def author(self):
        """get the book's authors (KeyError if the row has no "authors" key)"""
        return self.normalized_data["authors"]

    @property
    def isbn(self):
        """pulls out the isbn13 field from the csv line data"""
        isbn = self.normalized_data.get("isbn_13")
        # a missing or empty isbn is returned as-is (falsy) instead of being
        # pushed through the regex in unquote_string, which rejects None
        return unquote_string(isbn) if isbn else isbn

    @property
    def shelf(self):
        """the goodreads shelf field"""
        return self.normalized_data.get("shelf")

    @property
    def review(self):
        """a user-written review, to be imported with the book data"""
        # optional like shelf/rating: None when absent, not a KeyError
        return self.normalized_data.get("review_body")

    @property
    def rating(self):
        """x/5 star rating for a book, as a float, or None when unset"""
        if self.normalized_data.get("rating"):
            return float(self.normalized_data["rating"])
        return None

    @property
    def date_added(self):
        """when the book was added to this dataset"""
        return self._parse_date("date_added")

    @property
    def date_started(self):
        """when the book was started"""
        return self._parse_date("date_started")

    @property
    def date_read(self):
        """the date a book was completed"""
        return self._parse_date("date_finished")

    @property
    def reads(self):
        """formats a read through dataset for the book in this line

        Returns a list holding at most one unsaved ReadThrough.
        """
        start_date = self.date_started
        # the date properties re-parse on every access, so read them once
        finish_date = self.date_read
        date_added = self.date_added

        # Goodreads special case (no 'date started' field)
        if (
            not start_date
            and date_added
            and (self.shelf == "reading" or (self.shelf == "read" and finish_date))
        ):
            start_date = date_added

        if finish_date:
            # only keep a start date that precedes the finish date; it may
            # also be missing entirely, in which case there is nothing to
            # compare (None < datetime raises TypeError)
            if not start_date or start_date >= finish_date:
                start_date = None
            return [
                ReadThrough(
                    start_date=start_date,
                    finish_date=finish_date,
                )
            ]
        if start_date:
            return [ReadThrough(start_date=start_date)]
        return []

    def __repr__(self):
        # pylint: disable=consider-using-f-string
        # .get keeps repr usable for debugging rows that lack a title
        return "<{!r}Item {!r}>".format(self.index, self.normalized_data.get("title"))

    def __str__(self):
        # pylint: disable=consider-using-f-string
        return "{} by {}".format(
            self.normalized_data.get("title"), self.normalized_data.get("authors")
        )
|