moviewyrm/bookwyrm/models/import_job.py

""" track progress of goodreads imports """
import re
import dateutil.parser

from django.db import models
from django.utils import timezone

from bookwyrm.connectors import connector_manager
from bookwyrm.models import ReadThrough, User, Book, Edition
from .fields import PrivacyLevels


def unquote_string(text):
    """strip the ="..." wrapper some csv exports put around a value

    Returns None for an empty/missing value, the text between the quotes
    when the value starts with ="...", and the value unchanged otherwise.
    """
    if text:
        wrapped = re.match(r'="([^"]*)"', text)
        return wrapped.group(1) if wrapped else text
    return None


def construct_search_term(title, author):
    """formulate a query for the data connector

    title: the book title; text in round brackets is dropped from the query
    author: the author name(s), may be None or empty

    Returns the cleaned title and author joined by single spaces, with no
    leading or trailing whitespace.
    """
    # Strip brackets (usually series title from search term). They are
    # replaced with a space so the words on either side aren't fused together
    title = re.sub(r"\s*\([^)]*\)\s*", " ", title)
    # Open library doesn't like including author initials in search term.
    # The \b keeps this to stand-alone initials ("J.R.R.", "K.") and leaves
    # abbreviations such as "Jr." or "St." intact
    author = re.sub(r"\b(\w\.)+\s*", "", author) if author else ""

    # split/join collapses the whitespace left behind by the substitutions
    # and avoids a dangling space when there is no author
    return " ".join(" ".join([title, author]).split())


class ImportJob(models.Model):
    """entry for a specific request for book data import

    The rows of the import are ImportItem objects, reachable through the
    ``items`` reverse relation (see ``ImportItem.job``).
    """

    # the job is deleted together with the user it belongs to
    user = models.ForeignKey(User, on_delete=models.CASCADE)
    # both timestamps default to the moment the row is created;
    # updated_date is refreshed by ImportItem.update_job
    created_date = models.DateTimeField(default=timezone.now)
    updated_date = models.DateTimeField(default=timezone.now)
    # NOTE(review): presumably controls whether reviews are imported along
    # with the books -- not read in this file, confirm against the importer
    include_reviews = models.BooleanField(default=True)
    # NOTE(review): presumably the column mappings for the source format --
    # not read in this file, confirm against the importer
    mappings = models.JSONField()
    # set to True by ImportItem.update_job once no pending items are left
    complete = models.BooleanField(default=False)
    # NOTE(review): presumably the name of the service the csv came from
    source = models.CharField(max_length=100)
    # one of PrivacyLevels, "public" unless specified
    privacy = models.CharField(max_length=255, default="public", choices=PrivacyLevels)
    # NOTE(review): presumably marks a job that re-runs an earlier import --
    # not read in this file, confirm
    retry = models.BooleanField(default=False)

    @property
    def pending_items(self):
        """items that haven't been processed yet

        An item counts as pending while it has neither a fail_reason nor a
        book assigned. Returns a queryset.
        """
        return self.items.filter(fail_reason__isnull=True, book__isnull=True)


class ImportItem(models.Model):
    """a single line of a csv being imported

    Book lookup and all of the convenience properties below read their
    values from ``normalized_data``.
    """

    job = models.ForeignKey(ImportJob, on_delete=models.CASCADE, related_name="items")
    index = models.IntegerField()
    data = models.JSONField()
    normalized_data = models.JSONField()
    # the confirmed match for this line, if any
    book = models.ForeignKey(Book, on_delete=models.SET_NULL, null=True, blank=True)
    # a low-confidence title/author match (see resolve)
    book_guess = models.ForeignKey(
        Book,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name="book_guess",
    )
    fail_reason = models.TextField(null=True)
    linked_review = models.ForeignKey(
        "Review", on_delete=models.SET_NULL, null=True, blank=True
    )

    def update_job(self):
        """let the job know when the items get work done

        Refreshes the job's updated_date and marks the job complete once it
        has no pending items left.
        """
        job = self.job
        job.updated_date = timezone.now()
        job.save()
        if not job.pending_items.exists() and not job.complete:
            job.complete = True
            # NOTE(review): saved separately with update_fields, presumably so
            # a post_save listener can tell that "complete" just changed --
            # confirm before merging this with the save above
            job.save(update_fields=["complete"])

    def resolve(self):
        """try various ways to lookup a book

        Sets ``book`` on a confident match and ``book_guess`` on a
        low-confidence title/author match. Nothing is saved here.
        """
        # we might be calling this after manually adding the book,
        # so no need to do searches
        if self.book:
            return

        if self.isbn:
            self.book = self.get_book_from_identifier()
        elif self.openlibrary_key:
            self.book = self.get_book_from_identifier(field="openlibrary_key")
        else:
            # title/author search is only attempted when there is no
            # identifier at all: falling back on it after an identifier
            # lookup makes a mismatch too likely
            book, confidence = self.get_book_from_title_author()
            if confidence > 0.999:
                self.book = book
            else:
                self.book_guess = book

    def get_book_from_identifier(self, field="isbn"):
        """search by isbn or other unique identifier

        field: name of the attribute on this item that holds the identifier

        Returns the matching book, or None when the search finds nothing.
        """
        search_result = connector_manager.first_search_result(
            getattr(self, field), min_confidence=0.999
        )
        if search_result:
            # it's already in the right format
            if isinstance(search_result, Edition):
                return search_result
            # it's just a search result, book needs to be created
            # raises ConnectorException
            return search_result.connector.get_or_create_book(search_result.key)
        return None

    def get_book_from_title_author(self):
        """search by title and author

        Returns a (book, confidence) tuple; (None, 0) when the item has no
        title or the search finds nothing.
        """
        if not self.title:
            return None, 0
        search_term = construct_search_term(self.title, self.author)
        search_result = connector_manager.first_search_result(
            search_term, min_confidence=0.1
        )
        if search_result:
            # an Edition instance is reported with full confidence
            if isinstance(search_result, Edition):
                return (search_result, 1)
            # raises ConnectorException
            return (
                search_result.connector.get_or_create_book(search_result.key),
                search_result.confidence,
            )
        return None, 0

    def _get_aware_date(self, key):
        """parse the date stored under ``key`` into an aware datetime

        Returns None when the value is missing or empty. Raises ValueError
        when the value can't be parsed as a date.
        """
        raw_date = self.normalized_data.get(key)
        if not raw_date:
            return None
        parsed_date = dateutil.parser.parse(raw_date)
        # make_aware raises ValueError for a datetime that already has a
        # timezone, so keep the offset the import supplied and only
        # localize naive values
        if timezone.is_aware(parsed_date):
            return parsed_date
        return timezone.make_aware(parsed_date)

    @property
    def title(self):
        """get the book title"""
        return self.normalized_data.get("title")

    @property
    def author(self):
        """get the book's authors"""
        return self.normalized_data.get("authors")

    @property
    def isbn(self):
        """pulls out the isbn13 field from the csv line data,
        falling back on the isbn10 field when there is no isbn13"""
        return unquote_string(self.normalized_data.get("isbn_13")) or unquote_string(
            self.normalized_data.get("isbn_10")
        )

    @property
    def openlibrary_key(self):
        """the edition identifier is preferable to the work key"""
        return self.normalized_data.get("openlibrary_key") or self.normalized_data.get(
            "openlibrary_work_key"
        )

    @property
    def shelf(self):
        """the goodreads shelf field"""
        return self.normalized_data.get("shelf")

    @property
    def review(self):
        """a user-written review, to be imported with the book data"""
        return self.normalized_data.get("review_body")

    @property
    def rating(self):
        """x/5 star rating for a book, as a float, or None if there isn't one"""
        raw_rating = self.normalized_data.get("rating")
        if raw_rating:
            return float(raw_rating)
        return None

    @property
    def date_added(self):
        """when the book was added to this dataset"""
        return self._get_aware_date("date_added")

    @property
    def date_started(self):
        """when the book was started"""
        return self._get_aware_date("date_started")

    @property
    def date_read(self):
        """the date a book was completed"""
        return self._get_aware_date("date_finished")

    @property
    def reads(self):
        """formats a read through dataset for the book in this line

        Returns a list with at most one unsaved ReadThrough.
        """
        start_date = self.date_started
        # parse the finish date once rather than on every check below
        finish_date = self.date_read

        # Goodreads special case (no 'date started' field)
        if (
            (self.shelf == "reading" or (self.shelf == "read" and finish_date))
            and self.date_added
            and not start_date
        ):
            start_date = self.date_added

        if finish_date:
            # a start date that isn't before the finish date is discarded
            if start_date and not start_date < finish_date:
                start_date = None
            return [
                ReadThrough(
                    start_date=start_date,
                    finish_date=finish_date,
                )
            ]
        if start_date:
            return [ReadThrough(start_date=start_date)]
        return []

    def __repr__(self):
        # pylint: disable=consider-using-f-string
        return "<{!r} Item {!r}>".format(self.index, self.normalized_data.get("title"))

    def __str__(self):
        # pylint: disable=consider-using-f-string
        return "{} by {}".format(
            self.normalized_data.get("title"), self.normalized_data.get("authors")
        )
Runs black 2021-03-08 16:49:10 +00:00			`""" track progress of goodreads imports """`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00			`import re`
			`import dateutil.parser`

			`from django.db import models`
			`from django.utils import timezone`

Remove books manager at long last 2021-01-02 16:14:28 +00:00			`from bookwyrm.connectors import connector_manager`
Fixes first_search_result behavior 2021-11-12 21:48:31 +00:00			`from bookwyrm.models import ReadThrough, User, Book, Edition`
Creates Privacy field that handles setting to/cc 2020-12-13 21:03:17 +00:00			`from .fields import PrivacyLevels`
Allow users to set privacy on imported reviews or not import them at all. Fixes #252 2020-10-30 18:21:02 +00:00
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00
			`def unquote_string(text):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""resolve csv quote weirdness"""`
Fixes isbn assignment for goodreads 2021-11-13 20:24:16 +00:00			`if not text:`
			`return None`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00			`match = re.match(r'="([^"]*)"', text)`
			`if match:`
			`return match.group(1)`
			`return text`


			`def construct_search_term(title, author):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""formulate a query for the data connector"""`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00			`# Strip brackets (usually series title from search term)`
Runs black 2021-03-08 16:49:10 +00:00			`title = re.sub(r"\s*\([^)]*\)\s*", "", title)`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00			`# Open library doesn't like including author initials in search term.`
Don't produce error is author is unset in import 2021-12-14 19:27:13 +00:00			`author = re.sub(r"(\w\.)+\s*", "", author) if author else ""`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00
Runs black 2021-03-08 16:49:10 +00:00			`return " ".join([title, author])`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00
code style cleanup 2020-05-09 21:26:27 +00:00
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00			`class ImportJob(models.Model):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""entry for a specific request for book data import"""`
Runs black 2021-03-08 16:49:10 +00:00
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00			`user = models.ForeignKey(User, on_delete=models.CASCADE)`
			`created_date = models.DateTimeField(default=timezone.now)`
Retry hanging items 2021-11-14 18:20:14 +00:00			`updated_date = models.DateTimeField(default=timezone.now)`
Allow users to set privacy on imported reviews or not import them at all. Fixes #252 2020-10-30 18:21:02 +00:00			`include_reviews = models.BooleanField(default=True)`
Use generalized mappings to handle import 2021-11-11 00:49:54 +00:00			`mappings = models.JSONField()`
Notify when import completes 2021-11-14 17:56:23 +00:00			`complete = models.BooleanField(default=False)`
Approve or delete import guesses 2021-11-13 01:10:47 +00:00			`source = models.CharField(max_length=100)`
Build-in translations to privacy choices dropdwon 2022-01-12 23:25:49 +00:00			`privacy = models.CharField(max_length=255, default="public", choices=PrivacyLevels)`
Allow import retry 2020-11-13 17:02:41 +00:00			`retry = models.BooleanField(default=False)`
Change how goodread import writes reviews - adds published date - broadcasts review imports - completes review and shelve actions as it goes - some small connector fixes fixes #247 2020-10-29 21:29:31 +00:00
Notify when import completes 2021-11-14 17:56:23 +00:00			`@property`
			`def pending_items(self):`
			`"""items that haven't been processed yet"""`
			`return self.items.filter(fail_reason__isnull=True, book__isnull=True)`

Moves import complete notification to model 2021-02-10 22:18:55 +00:00
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00			`class ImportItem(models.Model):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""a single line of a csv being imported"""`
Runs black 2021-03-08 16:49:10 +00:00
			`job = models.ForeignKey(ImportJob, on_delete=models.CASCADE, related_name="items")`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00			`index = models.IntegerField()`
Migrations for django 3.1 upgrade 2021-03-19 19:43:36 +00:00			`data = models.JSONField()`
Use generalized mappings to handle import 2021-11-11 00:49:54 +00:00			`normalized_data = models.JSONField()`
Runs black 2021-03-08 16:49:10 +00:00			`book = models.ForeignKey(Book, on_delete=models.SET_NULL, null=True, blank=True)`
Save best-guess search results on import 2021-08-10 20:54:52 +00:00			`book_guess = models.ForeignKey(`
			`Book,`
			`on_delete=models.SET_NULL,`
			`null=True,`
			`blank=True,`
			`related_name="book_guess",`
			`)`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00			`fail_reason = models.TextField(null=True)`
Associate imported review with import item 2021-11-13 19:44:05 +00:00			`linked_review = models.ForeignKey(`
			`"Review", on_delete=models.SET_NULL, null=True, blank=True`
			`)`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00
Track completed items on job 2021-11-14 17:04:12 +00:00			`def update_job(self):`
Notify when import completes 2021-11-14 17:56:23 +00:00			`"""let the job know when the items get work done"""`
			`job = self.job`
			`job.updated_date = timezone.now()`
			`job.save()`
			`if not job.pending_items.exists() and not job.complete:`
			`job.complete = True`
			`job.save(update_fields=["complete"])`
Track completed items on job 2021-11-14 17:04:12 +00:00
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00			`def resolve(self):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""try various ways to lookup a book"""`
Approve or delete import guesses 2021-11-13 01:10:47 +00:00			`# we might be calling this after manually adding the book,`
			`# so no need to do searches`
			`if self.book:`
			`return`

Don't try title/author search when isbn search fails 2021-06-14 19:30:43 +00:00			`if self.isbn:`
Lookup by openlibrary key 2021-12-14 20:49:00 +00:00			`self.book = self.get_book_from_identifier()`
			`elif self.openlibrary_key:`
			`self.book = self.get_book_from_identifier(field="openlibrary_key")`
Don't try title/author search when isbn search fails 2021-06-14 19:30:43 +00:00			`else:`
Return confidence rating 2021-08-10 21:02:22 +00:00			`# don't fall back on title/author search if isbn is present.`
Don't try title/author search when isbn search fails 2021-06-14 19:30:43 +00:00			`# you're too likely to mismatch`
Save best-guess search results on import 2021-08-10 20:54:52 +00:00			`book, confidence = self.get_book_from_title_author()`
			`if confidence > 0.999:`
			`self.book = book`
			`else:`
			`self.book_guess = book`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00
Lookup by openlibrary key 2021-12-14 20:49:00 +00:00			`def get_book_from_identifier(self, field="isbn"):`
			`"""search by isbn or other unique identifier"""`
Remove books manager at long last 2021-01-02 16:14:28 +00:00			`search_result = connector_manager.first_search_result(`
Lookup by openlibrary key 2021-12-14 20:49:00 +00:00			`getattr(self, field), min_confidence=0.999`
Stop assuming every book is Hamlet 2020-10-29 22:29:23 +00:00			`)`
Separate out local and remote search results 2020-05-03 19:59:06 +00:00			`if search_result:`
Fixes first_search_result behavior 2021-11-12 21:48:31 +00:00			`# it's already in the right format`
			`if isinstance(search_result, Edition):`
			`return search_result`
			`# it's just a search result, book needs to be created`
tweaks search rankings for better results 2020-11-13 19:03:39 +00:00			`# raises ConnectorException`
Send connector with search result also fix typo in get_work_from_edition_data function 2020-12-27 22:27:18 +00:00			`return search_result.connector.get_or_create_book(search_result.key)`
Change how goodread import writes reviews - adds published date - broadcasts review imports - completes review and shelve actions as it goes - some small connector fixes fixes #247 2020-10-29 21:29:31 +00:00			`return None`

Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00			`def get_book_from_title_author(self):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""search by title and author"""`
Don't produce error is author is unset in import 2021-12-14 19:27:13 +00:00			`if not self.title:`
			`return None, 0`
Runs black 2021-03-08 16:49:10 +00:00			`search_term = construct_search_term(self.title, self.author)`
Remove books manager at long last 2021-01-02 16:14:28 +00:00			`search_result = connector_manager.first_search_result(`
Save best-guess search results on import 2021-08-10 20:54:52 +00:00			`search_term, min_confidence=0.1`
Stop assuming every book is Hamlet 2020-10-29 22:29:23 +00:00			`)`
Separate out local and remote search results 2020-05-03 19:59:06 +00:00			`if search_result:`
Fixes title/author search handling 2021-11-12 22:46:39 +00:00			`if isinstance(search_result, Edition):`
			`return (search_result, 1)`
fixes import matching with local books 2020-11-13 17:47:35 +00:00			`# raises ConnectorException`
Return confidence rating 2021-08-10 21:02:22 +00:00			`return (`
			`search_result.connector.get_or_create_book(search_result.key),`
			`search_result.confidence,`
			`)`
Save best-guess search results on import 2021-08-10 20:54:52 +00:00			`return None, 0`
Change how goodread import writes reviews - adds published date - broadcasts review imports - completes review and shelve actions as it goes - some small connector fixes fixes #247 2020-10-29 21:29:31 +00:00
Allow import retry 2020-11-13 17:02:41 +00:00			`@property`
			`def title(self):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""get the book title"""`
Safer request for normalized data 2021-11-14 18:29:12 +00:00			`return self.normalized_data.get("title")`
Allow import retry 2020-11-13 17:02:41 +00:00
			`@property`
			`def author(self):`
Uses general names for fields in parsed csvs 2021-11-10 19:10:09 +00:00			`"""get the book's authors"""`
Safer request for normalized data 2021-11-14 18:29:12 +00:00			`return self.normalized_data.get("authors")`
Allow import retry 2020-11-13 17:02:41 +00:00
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00			`@property`
			`def isbn(self):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""pulls out the isbn13 field from the csv line data"""`
Safer request for normalized data 2021-11-14 18:29:12 +00:00			`return unquote_string(self.normalized_data.get("isbn_13")) or unquote_string(`
			`self.normalized_data.get("isbn_10")`
Python formatting 2021-11-14 15:11:48 +00:00			`)`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00
Lookup by openlibrary key 2021-12-14 20:49:00 +00:00			`@property`
			`def openlibrary_key(self):`
			`"""the edition identifier is preferable to the work key"""`
			`return self.normalized_data.get("openlibrary_key") or self.normalized_data.get(`
			`"openlibrary_work_key"`
			`)`

Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00			`@property`
			`def shelf(self):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""the goodreads shelf field"""`
Use generalized mappings to handle import 2021-11-11 00:49:54 +00:00			`return self.normalized_data.get("shelf")`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00
			`@property`
			`def review(self):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""a user-written review, to be imported with the book data"""`
Safer request for normalized data 2021-11-14 18:29:12 +00:00			`return self.normalized_data.get("review_body")`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00
			`@property`
			`def rating(self):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""x/5 star rating for a book"""`
Use generalized mappings to handle import 2021-11-11 00:49:54 +00:00			`if self.normalized_data.get("rating"):`
Safer request for normalized data 2021-11-14 18:29:12 +00:00			`return float(self.normalized_data.get("rating"))`
fix rating property on ImportItem 2021-05-10 21:11:28 +00:00			`return None`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00
			`@property`
			`def date_added(self):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""when the book was added to this dataset"""`
Use generalized mappings to handle import 2021-11-11 00:49:54 +00:00			`if self.normalized_data.get("date_added"):`
			`return timezone.make_aware(`
Safer request for normalized data 2021-11-14 18:29:12 +00:00			`dateutil.parser.parse(self.normalized_data.get("date_added"))`
Use generalized mappings to handle import 2021-11-11 00:49:54 +00:00			`)`
Change how goodread import writes reviews - adds published date - broadcasts review imports - completes review and shelve actions as it goes - some small connector fixes fixes #247 2020-10-29 21:29:31 +00:00			`return None`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00
librarything import 2021-02-20 16:02:36 +00:00			`@property`
			`def date_started(self):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""when the book was started"""`
Use generalized mappings to handle import 2021-11-11 00:49:54 +00:00			`if self.normalized_data.get("date_started"):`
			`return timezone.make_aware(`
Safer request for normalized data 2021-11-14 18:29:12 +00:00			`dateutil.parser.parse(self.normalized_data.get("date_started"))`
Use generalized mappings to handle import 2021-11-11 00:49:54 +00:00			`)`
librarything import 2021-02-20 16:02:36 +00:00			`return None`

Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00			`@property`
			`def date_read(self):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""the date a book was completed"""`
Use generalized mappings to handle import 2021-11-11 00:49:54 +00:00			`if self.normalized_data.get("date_finished"):`
Uses general names for fields in parsed csvs 2021-11-10 19:10:09 +00:00			`return timezone.make_aware(`
Safer request for normalized data 2021-11-14 18:29:12 +00:00			`dateutil.parser.parse(self.normalized_data.get("date_finished"))`
Uses general names for fields in parsed csvs 2021-11-10 19:10:09 +00:00			`)`
Change how goodread import writes reviews - adds published date - broadcasts review imports - completes review and shelve actions as it goes - some small connector fixes fixes #247 2020-10-29 21:29:31 +00:00			`return None`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00
			`@property`
			`def reads(self):`
New version of black, new whitespace 2021-04-26 16:15:42 +00:00			`"""formats a read through dataset for the book in this line"""`
librarything import 2021-02-20 16:02:36 +00:00			`start_date = self.date_started`

			`# Goodreads special case (no 'date started' field)`
Runs black 2021-03-08 16:49:10 +00:00			`if (`
			`(self.shelf == "reading" or (self.shelf == "read" and self.date_read))`
			`and self.date_added`
			`and not start_date`
			`):`
librarything import 2021-02-20 16:02:36 +00:00			`start_date = self.date_added`

Runs black 2021-03-08 16:49:10 +00:00			`if start_date and start_date is not None and not self.date_read:`
librarything import 2021-02-20 16:02:36 +00:00			`return [ReadThrough(start_date=start_date)]`
Only use added_date as start_date for books being read right now. 2020-04-25 10:29:30 +00:00			`if self.date_read:`
Fixes bug comparing dates to nonetype 2021-11-14 18:22:26 +00:00			`start_date = (`
			`start_date if start_date and start_date < self.date_read else None`
			`)`
Runs black 2021-03-08 16:49:10 +00:00			`return [`
			`ReadThrough(`
			`start_date=start_date,`
			`finish_date=self.date_read,`
			`)`
			`]`
Only use added_date as start_date for books being read right now. 2020-04-25 10:29:30 +00:00			`return []`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00
			`def __repr__(self):`
Updating string format synatx part 2 2021-09-18 18:32:00 +00:00			`# pylint: disable=consider-using-f-string`
Safer request for normalized data 2021-11-14 18:29:12 +00:00			`return "<{!r} Item {!r}>".format(self.index, self.normalized_data.get("title"))`
Store csv in the database and then import via celery. 2020-04-21 14:09:21 +00:00
			`def __str__(self):`
Updating string format synatx part 2 2021-09-18 18:32:00 +00:00			`# pylint: disable=consider-using-f-string`
Use generalized mappings to handle import 2021-11-11 00:49:54 +00:00			`return "{} by {}".format(`
Safer request for normalized data 2021-11-14 18:29:12 +00:00			`self.normalized_data.get("title"), self.normalized_data.get("authors")`
Use generalized mappings to handle import 2021-11-11 00:49:54 +00:00			`)`