""" track progress of goodreads imports """
import math
import re
import dateutil.parser

from django.db import models
from django.utils import timezone
from django.utils.translation import gettext_lazy as _

from bookwyrm.connectors import connector_manager
from bookwyrm.models import (
    User,
    Book,
    Edition,
    Work,
    ShelfBook,
    Shelf,
    ReadThrough,
    Review,
    ReviewRating,
)
from bookwyrm.tasks import app, IMPORT_TRIGGERED, IMPORTS
from .fields import PrivacyLevels


def unquote_string(text):
    """resolve csv quote weirdness

    Values written as ``="..."`` are unwrapped to the quoted content; any
    other non-empty value is returned unchanged and empty values become None.
    """
    if not text:
        return None
    match = re.match(r'="([^"]*)"', text)
    if match:
        return match.group(1)
    return text


def construct_search_term(title, author):
    """formulate a query for the data connector"""
    # Strip brackets (usually series title from search term)
    title = re.sub(r"\s*\([^)]*\)\s*", "", title)
    # Open library doesn't like including author initials in search term.
    author = re.sub(r"(\w\.)+\s*", "", author) if author else ""

    return " ".join([title, author])


def _parse_aware_datetime(value):
    """parse a date string into a timezone-aware datetime

    Returns None for empty values. A timezone found in the input is kept;
    naive results are made aware with ``timezone.make_aware``. Whatever
    ``dateutil.parser.parse`` raises for unparseable input is propagated.
    """
    if not value:
        return None
    parsed = dateutil.parser.parse(value)
    if timezone.is_aware(parsed):
        # Keep timezone if import already had one; make_aware would raise
        # ValueError on a datetime that is not naive
        return parsed
    return timezone.make_aware(parsed)


ImportStatuses = [
    ("pending", _("Pending")),
    ("active", _("Active")),
    ("complete", _("Complete")),
    ("stopped", _("Stopped")),
]


class ImportJob(models.Model):
    """entry for a specific request for book data import"""

    user: User = models.ForeignKey(User, on_delete=models.CASCADE)
    created_date = models.DateTimeField(default=timezone.now)
    updated_date = models.DateTimeField(default=timezone.now)
    include_reviews: bool = models.BooleanField(default=True)
    mappings = models.JSONField()
    source = models.CharField(max_length=100)
    privacy = models.CharField(max_length=255, default="public", choices=PrivacyLevels)
    retry = models.BooleanField(default=False)
    task_id = models.CharField(max_length=200, null=True, blank=True)

    complete = models.BooleanField(default=False)
    status = models.CharField(
        max_length=50, choices=ImportStatuses, default="pending", null=True
    )

    def start_job(self):
        """Report that the job has started"""
        # queue the parent task and remember its id so stop_job can revoke it
        task = start_import_task.delay(self.id)
        self.task_id = task.id

        self.save(update_fields=["task_id"])

    def complete_job(self) -> None:
        """Report that the job has completed"""
        self.status = "complete"
        self.complete = True
        # any item still matching pending_items at this point gets a fail reason
        self.pending_items.update(fail_reason=_("Import stopped"))
        self.save(update_fields=["status", "complete"])

    def stop_job(self):
        """Stop the job"""
        # flag the job first so item tasks that check job.complete bail out
        self.status = "stopped"
        self.complete = True
        self.save(update_fields=["status", "complete"])

        # Collect the child task ids *before* setting fail_reason: the
        # pending_items filter requires fail_reason to be null, so querying
        # after the update would always return nothing to revoke. list()
        # forces the lazy queryset to be evaluated now.
        item_task_ids = list(
            self.pending_items.filter(task_id__isnull=False).values_list(
                "task_id", flat=True
            )
        )
        self.pending_items.update(fail_reason=_("Import stopped"))

        # stop starting
        app.control.revoke(self.task_id, terminate=True)
        app.control.revoke(item_task_ids)

    @property
    def pending_items(self):
        """items that haven't been processed yet"""
        return self.items.filter(fail_reason__isnull=True, book__isnull=True)

    @property
    def item_count(self):
        """How many books do you want to import???"""
        return self.items.count()

    @property
    def percent_complete(self):
        """How far along?"""
        item_count = self.item_count
        if not item_count:
            return 0
        return math.floor((item_count - self.pending_item_count) / item_count * 100)

    @property
    def pending_item_count(self):
        """And how many pending items??"""
        return self.pending_items.count()

    @property
    def successful_item_count(self):
        """How many found a book?"""
        return self.items.filter(book__isnull=False).count()

    @property
    def failed_item_count(self):
        """How many have a fail reason?"""
        return self.items.filter(fail_reason__isnull=False).count()


class ImportItem(models.Model):
    """a single line of a csv being imported"""

    job = models.ForeignKey(ImportJob, on_delete=models.CASCADE, related_name="items")
    index = models.IntegerField()
    data = models.JSONField()
    normalized_data = models.JSONField()
    book = models.ForeignKey(Book, on_delete=models.SET_NULL, null=True, blank=True)
    book_guess = models.ForeignKey(
        Book,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name="book_guess",
    )
    fail_reason = models.TextField(null=True)
    linked_review = models.ForeignKey(
        "Review", on_delete=models.SET_NULL, null=True, blank=True
    )
    task_id = models.CharField(max_length=200, null=True, blank=True)

    def update_job(self):
        """let the job know when the items get work done"""
        job = self.job
        if job.complete:
            return

        job.updated_date = timezone.now()
        # Only write the timestamp: this in-memory job may be stale, and a
        # full save could revert status/complete written elsewhere (e.g. by
        # stop_job) while this item was being processed.
        job.save(update_fields=["updated_date"])
        if not job.pending_items.exists():
            job.complete_job()

    def resolve(self):
        """try various ways to lookup a book"""
        # we might be calling this after manually adding the book,
        # so no need to do searches
        if self.book:
            return

        if self.isbn:
            self.book = self.get_book_from_identifier()
        elif self.openlibrary_key:
            self.book = self.get_book_from_identifier(field="openlibrary_key")
        else:
            # don't fall back on title/author search if isbn is present.
            # you're too likely to mismatch
            book, confidence = self.get_book_from_title_author()
            if confidence > 0.999:
                self.book = book
            else:
                self.book_guess = book

    def get_book_from_identifier(self, field="isbn"):
        """search by isbn or other unique identifier"""
        search_result = connector_manager.first_search_result(
            getattr(self, field), min_confidence=0.999
        )
        if search_result:
            # it's already in the right format
            if isinstance(search_result, Edition):
                return search_result
            # it's just a search result, book needs to be created
            # raises ConnectorException
            return search_result.connector.get_or_create_book(search_result.key)
        return None

    def get_book_from_title_author(self):
        """search by title and author

        Returns a ``(book, confidence)`` pair; ``(None, 0)`` when there is no
        title to search for or nothing was found.
        """
        if not self.title:
            return None, 0
        search_term = construct_search_term(self.title, self.author)
        search_result = connector_manager.first_search_result(
            search_term, min_confidence=0.1
        )
        if search_result:
            if isinstance(search_result, Edition):
                return (search_result, 1)

            # raises ConnectorException
            return (
                search_result.connector.get_or_create_book(search_result.key),
                search_result.confidence,
            )
        return None, 0

    @property
    def title(self):
        """get the book title"""
        return self.normalized_data.get("title")

    @property
    def author(self):
        """get the book's authors"""
        return self.normalized_data.get("authors")

    @property
    def isbn(self):
        """pulls out the isbn13 field from the csv line data"""
        # falls back on the isbn10 value when there is no isbn13
        return unquote_string(self.normalized_data.get("isbn_13")) or unquote_string(
            self.normalized_data.get("isbn_10")
        )

    @property
    def openlibrary_key(self):
        """the edition identifier is preferable to the work key"""
        return self.normalized_data.get("openlibrary_key") or self.normalized_data.get(
            "openlibrary_work_key"
        )

    @property
    def shelf(self):
        """the goodreads shelf field"""
        return self.normalized_data.get("shelf")

    @property
    def review(self):
        """a user-written review, to be imported with the book data"""
        return self.normalized_data.get("review_body")

    @property
    def rating(self):
        """x/5 star rating for a book"""
        raw_rating = self.normalized_data.get("rating")
        if not raw_rating:
            return None
        try:
            return float(raw_rating)
        except (TypeError, ValueError):
            # not a number (or not even a string/number): treat as unrated
            return None

    @property
    def date_added(self):
        """when the book was added to this dataset"""
        return _parse_aware_datetime(self.normalized_data.get("date_added"))

    @property
    def date_started(self):
        """when the book was started"""
        return _parse_aware_datetime(self.normalized_data.get("date_started"))

    @property
    def date_read(self):
        """the date a book was completed"""
        return _parse_aware_datetime(self.normalized_data.get("date_finished"))

    @property
    def reads(self):
        """formats a read through dataset for the book in this line"""
        start_date = self.date_started
        # parse the finish date once instead of on every access
        finish_date = self.date_read

        # Goodreads special case (no 'date started' field)
        if (
            (self.shelf == "reading" or (self.shelf == "read" and finish_date))
            and self.date_added
            and not start_date
        ):
            start_date = self.date_added

        if start_date and not finish_date:
            return [ReadThrough(start_date=start_date)]

        if finish_date:
            # a start date that isn't before the finish date is discarded
            if not (start_date and start_date < finish_date):
                start_date = None
            return [
                ReadThrough(
                    start_date=start_date,
                    finish_date=finish_date,
                )
            ]
        return []

    def __repr__(self):
        # pylint: disable=consider-using-f-string
        return "<{!r} Item {!r}>".format(self.index, self.normalized_data.get("title"))

    def __str__(self):
        # pylint: disable=consider-using-f-string
        return "{} by {}".format(
            self.normalized_data.get("title"), self.normalized_data.get("authors")
        )


@app.task(queue=IMPORTS)
def start_import_task(job_id):
    """trigger the child tasks for each row"""
    job = ImportJob.objects.get(id=job_id)

    # don't start the job if it was stopped from the UI; this check comes
    # before the status write so a "stopped" status is not overwritten
    if job.complete:
        return

    job.status = "active"
    job.save(update_fields=["status"])

    # these are sub-tasks so that one big task doesn't use up all the memory in celery
    for item in job.items.all():
        task = import_item_task.delay(item.id)
        item.task_id = task.id
        # the item task may already be running and writing other fields,
        # so only persist the column that changed here
        item.save(update_fields=["task_id"])


@app.task(queue=IMPORTS)
def import_item_task(item_id):
    """resolve a row into a book"""
    item = ImportItem.objects.get(id=item_id)
    # make sure the job has not been stopped
    if item.job.complete:
        return

    try:
        item.resolve()
    except Exception:  # pylint: disable=broad-except
        # record the failure on the item and the job, then let the error
        # propagate to the task runner
        item.fail_reason = _("Error loading book")
        item.save()
        item.update_job()
        raise

    if item.book:
        # shelves book and handles reviews
        handle_imported_book(item)
    else:
        item.fail_reason = _("Could not find a match for book")

    item.save()
    item.update_job()


def handle_imported_book(item):
    """process a csv and then post about it"""
    job = item.job
    if job.complete:
        return

    user = job.user
    if isinstance(item.book, Work):
        item.book = item.book.default_edition
    if not item.book:
        item.fail_reason = _("Error loading book")
        item.save()
        return
    if not isinstance(item.book, Edition):
        item.book = item.book.edition

    existing_shelf = ShelfBook.objects.filter(book=item.book, user=user).exists()

    # shelve the book if it hasn't been shelved already
    if item.shelf and not existing_shelf:
        desired_shelf = Shelf.objects.get(identifier=item.shelf, user=user)
        shelved_date = item.date_added or timezone.now()
        ShelfBook(
            book=item.book, shelf=desired_shelf, user=user, shelved_date=shelved_date
        ).save(priority=IMPORT_TRIGGERED)

    for read in item.reads:
        # check for an existing readthrough with the same dates
        if ReadThrough.objects.filter(
            user=user,
            book=item.book,
            start_date=read.start_date,
            finish_date=read.finish_date,
        ).exists():
            continue
        read.book = item.book
        read.user = user
        read.save()

    if job.include_reviews and (item.rating or item.review) and not item.linked_review:
        # we don't know the publication date of the review,
        # but "now" is a bad guess
        published_date_guess = item.date_read or item.date_added
        if item.review:
            # pylint: disable=consider-using-f-string
            review_title = "Review of {!r} on {!r}".format(
                item.book.title,
                job.source,
            )
            # reuse a matching review from an earlier import if there is one
            review = Review.objects.filter(
                user=user,
                book=item.book,
                name=review_title,
                rating=item.rating,
                published_date=published_date_guess,
            ).first()
            if not review:
                review = Review(
                    user=user,
                    book=item.book,
                    name=review_title,
                    content=item.review,
                    rating=item.rating,
                    published_date=published_date_guess,
                    privacy=job.privacy,
                )
                # only broadcast this review to other bookwyrm instances
                review.save(software="bookwyrm", priority=IMPORT_TRIGGERED)
        else:
            # just a rating
            review = ReviewRating.objects.filter(
                user=user,
                book=item.book,
                published_date=published_date_guess,
                rating=item.rating,
            ).first()
            if not review:
                review = ReviewRating(
                    user=user,
                    book=item.book,
                    rating=item.rating,
                    published_date=published_date_guess,
                    privacy=job.privacy,
                )
                # only broadcast this review to other bookwyrm instances
                review.save(software="bookwyrm", priority=IMPORT_TRIGGERED)
        item.linked_review = review
    item.save()