diff --git a/bookwyrm/migrations/0192_auto_20240114_0055.py b/bookwyrm/migrations/0192_auto_20240114_0055.py new file mode 100644 index 000000000..f4d324f7f --- /dev/null +++ b/bookwyrm/migrations/0192_auto_20240114_0055.py @@ -0,0 +1,53 @@ +# Generated by Django 3.2.23 on 2024-01-14 00:55 + +import bookwyrm.storage_backends +import django.core.serializers.json +from django.db import migrations, models +import django.db.models.deletion + + +class Migration(migrations.Migration): + + dependencies = [ + ('bookwyrm', '0191_merge_20240102_0326'), + ] + + operations = [ + migrations.AddField( + model_name='bookwyrmexportjob', + name='export_json', + field=models.JSONField(encoder=django.core.serializers.json.DjangoJSONEncoder, null=True), + ), + migrations.AddField( + model_name='bookwyrmexportjob', + name='json_completed', + field=models.BooleanField(default=False), + ), + migrations.AlterField( + model_name='bookwyrmexportjob', + name='export_data', + field=models.FileField(null=True, storage=bookwyrm.storage_backends.ExportsFileStorage, upload_to=''), + ), + migrations.CreateModel( + name='AddFileToTar', + fields=[ + ('childjob_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to='bookwyrm.childjob')), + ('parent_export_job', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='child_edition_export_jobs', to='bookwyrm.bookwyrmexportjob')), + ], + options={ + 'abstract': False, + }, + bases=('bookwyrm.childjob',), + ), + migrations.CreateModel( + name='AddBookToUserExportJob', + fields=[ + ('childjob_ptr', models.OneToOneField(auto_created=True, on_delete=django.db.models.deletion.CASCADE, parent_link=True, primary_key=True, serialize=False, to='bookwyrm.childjob')), + ('edition', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='bookwyrm.edition')), + ], + options={ + 'abstract': False, + }, + bases=('bookwyrm.childjob',), + ), + ] diff --git a/bookwyrm/models/bookwyrm_export_job.py b/bookwyrm/models/bookwyrm_export_job.py index 1f6085e0c..12a9792e2 100644 --- a/bookwyrm/models/bookwyrm_export_job.py +++ b/bookwyrm/models/bookwyrm_export_job.py @@ -2,94 +2,347 @@ import dataclasses import logging +import boto3 +from s3_tar import S3Tar from uuid import uuid4 -from django.db.models import FileField +from django.db.models import CASCADE, BooleanField, FileField, ForeignKey, JSONField from django.db.models import Q from django.core.serializers.json import DjangoJSONEncoder -from django.core.files.base import ContentFile +from django.core.files.base import ContentFile, File +from django.utils import timezone + +from bookwyrm import settings, storage_backends from bookwyrm.models import AnnualGoal, ReadThrough, ShelfBook, List, ListItem from bookwyrm.models import Review, Comment, Quotation from bookwyrm.models import Edition from bookwyrm.models import UserFollows, User, UserBlocks -from bookwyrm.models.job import ParentJob, ParentTask +from bookwyrm.models.job import ParentJob, ChildJob, ParentTask, SubTask from bookwyrm.tasks import app, IMPORTS from bookwyrm.utils.tar import BookwyrmTarFile logger = logging.getLogger(__name__) - class BookwyrmExportJob(ParentJob): """entry for a specific request to export a bookwyrm user""" - export_data = FileField(null=True) + if settings.USE_S3: + storage = storage_backends.ExportsS3Storage + else: + storage = storage_backends.ExportsFileStorage + + export_data = FileField(null=True, storage=storage) # use custom storage backend here + export_json = JSONField(null=True, encoder=DjangoJSONEncoder) + json_completed = BooleanField(default=False) + def start_job(self): """Start the job""" - start_export_task.delay(job_id=self.id, no_children=True) - return self + task = start_export_task.delay(job_id=self.id, no_children=False) + self.task_id = task.id + self.save(update_fields=["task_id"]) + + + def notify_child_job_complete(self): + """let the job know when the items get work done""" + + if self.complete: + return + + self.updated_date = timezone.now() + self.save(update_fields=["updated_date"]) + + if not self.complete and self.has_completed: + if not self.json_completed: + + try: + self.json_completed = True + self.save(update_fields=["json_completed"]) + + # add json file to tarfile + tar_job = AddFileToTar.objects.create( + parent_job=self, + parent_export_job=self + ) + tar_job.start_job() + + except Exception as err: # pylint: disable=broad-except + logger.exception("job %s failed with error: %s", self.id, err) + tar_job.set_status("failed") + self.stop_job(reason="failed") + + else: + self.complete_job() + + +class AddBookToUserExportJob(ChildJob): + """append book metadata for each book in an export""" + + edition = ForeignKey(Edition, on_delete=CASCADE) + + def start_job(self): + """Start the job""" + try: + + book = {} + book["work"] = self.edition.parent_work.to_activity() + book["edition"] = self.edition.to_activity() + + if book["edition"].get("cover"): + # change the URL to be relative to the JSON file + filename = book["edition"]["cover"]["url"].rsplit("/", maxsplit=1)[-1] + book["edition"]["cover"]["url"] = f"covers/{filename}" + + # authors + book["authors"] = [] + for author in self.edition.authors.all(): + book["authors"].append(author.to_activity()) + + # Shelves this book is on + # Every ShelfItem is this book so we don't other serializing + book["shelves"] = [] + shelf_books = ( + ShelfBook.objects.select_related("shelf") + .filter(user=self.parent_job.user, book=self.edition) + .distinct() + ) + + for shelfbook in shelf_books: + book["shelves"].append(shelfbook.shelf.to_activity()) + + # Lists and ListItems + # ListItems include "notes" and "approved" so we need them + # even though we know it's this book + book["lists"] = [] + list_items = ListItem.objects.filter(book=self.edition, user=self.parent_job.user).distinct() + + for item in list_items: + list_info = item.book_list.to_activity() + list_info[ + "privacy" + ] = item.book_list.privacy # this isn't serialized so we add it + list_info["list_item"] = item.to_activity() + book["lists"].append(list_info) + + # Statuses + # Can't use select_subclasses here because + # we need to filter on the "book" value, + # which is not available on an ordinary Status + for status in ["comments", "quotations", "reviews"]: + book[status] = [] + + + comments = Comment.objects.filter(user=self.parent_job.user, book=self.edition).all() + for status in comments: + obj = status.to_activity() + obj["progress"] = status.progress + obj["progress_mode"] = status.progress_mode + book["comments"].append(obj) + + + quotes = Quotation.objects.filter(user=self.parent_job.user, book=self.edition).all() + for status in quotes: + obj = status.to_activity() + obj["position"] = status.position + obj["endposition"] = status.endposition + obj["position_mode"] = status.position_mode + book["quotations"].append(obj) + + + reviews = Review.objects.filter(user=self.parent_job.user, book=self.edition).all() + for status in reviews: + obj = status.to_activity() + book["reviews"].append(obj) + + # readthroughs can't be serialized to activity + book_readthroughs = ( + ReadThrough.objects.filter(user=self.parent_job.user, book=self.edition).distinct().values() + ) + book["readthroughs"] = list(book_readthroughs) + + self.parent_job.export_json["books"].append(book) + self.parent_job.save(update_fields=["export_json"]) + self.complete_job() + + except Exception as err: # pylint: disable=broad-except + logger.exception("AddBookToUserExportJob %s Failed with error: %s", self.id, err) + self.set_status("failed") + + +class AddFileToTar(ChildJob): + """add files to export""" + + parent_export_job = ForeignKey( + BookwyrmExportJob, on_delete=CASCADE, related_name="child_edition_export_jobs" + ) # TODO: do we actually need this? Does self.parent_job.export_data work? + + + def start_job(self): + """Start the job""" + + # NOTE we are doing this all in one big job, which has the potential to block a thread + # This is because we need to refer to the same s3_job or BookwyrmTarFile whilst writing + # Alternatives using a series of jobs in a loop would be beter + # but Hugh couldn't make that work + + try: + task_id=self.parent_export_job.task_id + export_data = self.parent_export_job.export_data + export_json = self.parent_export_job.export_json + json_data = DjangoJSONEncoder().encode(export_json) + user = self.parent_export_job.user + editions = get_books_for_user(user) + + if settings.USE_S3: + s3_job = S3Tar( + settings.AWS_STORAGE_BUCKET_NAME, + f"exports/{str(self.parent_export_job.task_id)}.tar.gz" + ) + + # TODO: either encrypt the file or we will need to get it to the user + # from this secure part of the bucket + export_data.save("archive.json", ContentFile(json_data.encode("utf-8"))) + + s3_job.add_file( + f"exports/{export_data.name}" + ) + s3_job.add_file( + f"images/{user.avatar.name}", + folder="avatar" + ) + for book in editions: + if getattr(book, "cover", False): + cover_name = f"images/{book.cover.name}" + s3_job.add_file( + cover_name, + folder="covers" + ) + + s3_job.tar() + # delete export json as soon as it's tarred + # there is probably a better way to do this + # Currently this merely makes the file empty + export_data.delete(save=False) + + else: + # TODO: is the export_data file open to the world? + logger.info( "export file URL: %s",export_data.url) + + export_data.open("wb") + with BookwyrmTarFile.open(mode="w:gz", fileobj=export_data) as tar: + + tar.write_bytes(json_data.encode("utf-8")) + + # Add avatar image if present + if getattr(user, "avatar", False): + tar.add_image(user.avatar, filename="avatar", directory=f"avatar/") # TODO: does this work? + + for book in editions: + if getattr(book, "cover", False): + tar.add_image(book.cover) + + export_data.close() + + + self.complete_job() + + except Exception as err: # pylint: disable=broad-except + logger.exception("AddFileToTar %s Failed with error: %s", self.id, err) + self.stop_job(reason="failed") + self.parent_job.stop_job(reason="failed") @app.task(queue=IMPORTS, base=ParentTask) def start_export_task(**kwargs): - """trigger the child tasks for each row""" + """trigger the child tasks for user export""" + job = BookwyrmExportJob.objects.get(id=kwargs["job_id"]) # don't start the job if it was stopped from the UI if job.complete: return try: - # This is where ChildJobs get made + + # prepare the initial file and base json job.export_data = ContentFile(b"", str(uuid4())) - json_data = json_export(job.user) - tar_export(json_data, job.user, job.export_data) - job.save(update_fields=["export_data"]) + job.export_json = job.user.to_activity() + job.save(update_fields=["export_data", "export_json"]) + + # let's go + json_export.delay(job_id=job.id, job_user=job.user.id, no_children=False) + except Exception as err: # pylint: disable=broad-except logger.exception("User Export Job %s Failed with error: %s", job.id, err) job.set_status("failed") - job.set_status("complete") +@app.task(queue=IMPORTS, base=ParentTask) +def export_saved_lists_task(**kwargs): + """add user saved lists to export JSON""" + + job = BookwyrmExportJob.objects.get(id=kwargs["job_id"]) + saved_lists = List.objects.filter(id__in=job.user.saved_lists.all()).distinct() + job.export_json["saved_lists"] = [l.remote_id for l in saved_lists] + job.save(update_fields=["export_json"]) -def tar_export(json_data: str, user, file): - """wrap the export information in a tar file""" - file.open("wb") - with BookwyrmTarFile.open(mode="w:gz", fileobj=file) as tar: - tar.write_bytes(json_data.encode("utf-8")) +@app.task(queue=IMPORTS, base=ParentTask) +def export_follows_task(**kwargs): + """add user follows to export JSON""" - # Add avatar image if present - if getattr(user, "avatar", False): - tar.add_image(user.avatar, filename="avatar") - - editions = get_books_for_user(user) - for book in editions: - if getattr(book, "cover", False): - tar.add_image(book.cover) - - file.close() + job = BookwyrmExportJob.objects.get(id=kwargs["job_id"]) + follows = UserFollows.objects.filter(user_subject=job.user).distinct() + following = User.objects.filter(userfollows_user_object__in=follows).distinct() + job.export_json["follows"] = [f.remote_id for f in following] + job.save(update_fields=["export_json"]) -def json_export( - user, -): # pylint: disable=too-many-locals, too-many-statements, too-many-branches +@app.task(queue=IMPORTS, base=ParentTask) +def export_blocks_task(**kwargs): + """add user blocks to export JSON""" + + job = BookwyrmExportJob.objects.get(id=kwargs["job_id"]) + blocks = UserBlocks.objects.filter(user_subject=job.user).distinct() + blocking = User.objects.filter(userblocks_user_object__in=blocks).distinct() + job.export_json["blocks"] = [b.remote_id for b in blocking] + job.save(update_fields=["export_json"]) + + +@app.task(queue=IMPORTS, base=ParentTask) +def export_reading_goals_task(**kwargs): + """add user reading goals to export JSON""" + + job = BookwyrmExportJob.objects.get(id=kwargs["job_id"]) + reading_goals = AnnualGoal.objects.filter(user=job.user).distinct() + job.export_json["goals"] = [] + for goal in reading_goals: + exported_user["goals"].append( + {"goal": goal.goal, "year": goal.year, "privacy": goal.privacy} + ) + job.save(update_fields=["export_json"]) + + +@app.task(queue=IMPORTS, base=ParentTask) +def json_export(**kwargs): """Generate an export for a user""" - # User as AP object - exported_user = user.to_activity() + job = BookwyrmExportJob.objects.get(id=kwargs["job_id"]) + job.set_status("active") + job_id = kwargs["job_id"] + # I don't love this but it prevents a JSON encoding error # when there is no user image if isinstance( - exported_user["icon"], + job.export_json["icon"], dataclasses._MISSING_TYPE, # pylint: disable=protected-access ): - exported_user["icon"] = {} + job.export_json["icon"] = {} else: # change the URL to be relative to the JSON file - file_type = exported_user["icon"]["url"].rsplit(".", maxsplit=1)[-1] + file_type = job.export_json["icon"]["url"].rsplit(".", maxsplit=1)[-1] filename = f"avatar.{file_type}" - exported_user["icon"]["url"] = filename + job.export_json["icon"]["url"] = filename # Additional settings - can't be serialized as AP vals = [ @@ -98,120 +351,45 @@ def json_export( "default_post_privacy", "show_suggested_users", ] - exported_user["settings"] = {} + job.export_json["settings"] = {} for k in vals: - exported_user["settings"][k] = getattr(user, k) + job.export_json["settings"][k] = getattr(job.user, k) - # Reading goals - can't be serialized as AP - reading_goals = AnnualGoal.objects.filter(user=user).distinct() - exported_user["goals"] = [] - for goal in reading_goals: - exported_user["goals"].append( - {"goal": goal.goal, "year": goal.year, "privacy": goal.privacy} - ) + job.export_json["books"] = [] - # Reading history - can't be serialized as AP - readthroughs = ReadThrough.objects.filter(user=user).distinct().values() - readthroughs = list(readthroughs) + # save settings we just updated + job.save(update_fields=["export_json"]) - # Books - editions = get_books_for_user(user) - exported_user["books"] = [] + # trigger subtasks + export_saved_lists_task.delay(job_id=job_id, no_children=False) + export_follows_task.delay(job_id=job_id, no_children=False) + export_blocks_task.delay(job_id=job_id, no_children=False) + trigger_books_jobs.delay(job_id=job_id, no_children=False) - for edition in editions: - book = {} - book["work"] = edition.parent_work.to_activity() - book["edition"] = edition.to_activity() - if book["edition"].get("cover"): - # change the URL to be relative to the JSON file - filename = book["edition"]["cover"]["url"].rsplit("/", maxsplit=1)[-1] - book["edition"]["cover"]["url"] = f"covers/{filename}" +@app.task(queue=IMPORTS, base=ParentTask) +def trigger_books_jobs(**kwargs): + """trigger tasks to get data for each book""" - # authors - book["authors"] = [] - for author in edition.authors.all(): - book["authors"].append(author.to_activity()) + try: + job = BookwyrmExportJob.objects.get(id=kwargs["job_id"]) + editions = get_books_for_user(job.user) - # Shelves this book is on - # Every ShelfItem is this book so we don't other serializing - book["shelves"] = [] - shelf_books = ( - ShelfBook.objects.select_related("shelf") - .filter(user=user, book=edition) - .distinct() - ) + if len(editions) == 0: + job.notify_child_job_complete() + return - for shelfbook in shelf_books: - book["shelves"].append(shelfbook.shelf.to_activity()) - - # Lists and ListItems - # ListItems include "notes" and "approved" so we need them - # even though we know it's this book - book["lists"] = [] - list_items = ListItem.objects.filter(book=edition, user=user).distinct() - - for item in list_items: - list_info = item.book_list.to_activity() - list_info[ - "privacy" - ] = item.book_list.privacy # this isn't serialized so we add it - list_info["list_item"] = item.to_activity() - book["lists"].append(list_info) - - # Statuses - # Can't use select_subclasses here because - # we need to filter on the "book" value, - # which is not available on an ordinary Status - for status in ["comments", "quotations", "reviews"]: - book[status] = [] - - comments = Comment.objects.filter(user=user, book=edition).all() - for status in comments: - obj = status.to_activity() - obj["progress"] = status.progress - obj["progress_mode"] = status.progress_mode - book["comments"].append(obj) - - quotes = Quotation.objects.filter(user=user, book=edition).all() - for status in quotes: - obj = status.to_activity() - obj["position"] = status.position - obj["endposition"] = status.endposition - obj["position_mode"] = status.position_mode - book["quotations"].append(obj) - - reviews = Review.objects.filter(user=user, book=edition).all() - for status in reviews: - obj = status.to_activity() - book["reviews"].append(obj) - - # readthroughs can't be serialized to activity - book_readthroughs = ( - ReadThrough.objects.filter(user=user, book=edition).distinct().values() - ) - book["readthroughs"] = list(book_readthroughs) - - # append everything - exported_user["books"].append(book) - - # saved book lists - just the remote id - saved_lists = List.objects.filter(id__in=user.saved_lists.all()).distinct() - exported_user["saved_lists"] = [l.remote_id for l in saved_lists] - - # follows - just the remote id - follows = UserFollows.objects.filter(user_subject=user).distinct() - following = User.objects.filter(userfollows_user_object__in=follows).distinct() - exported_user["follows"] = [f.remote_id for f in following] - - # blocks - just the remote id - blocks = UserBlocks.objects.filter(user_subject=user).distinct() - blocking = User.objects.filter(userblocks_user_object__in=blocks).distinct() - - exported_user["blocks"] = [b.remote_id for b in blocking] - - return DjangoJSONEncoder().encode(exported_user) + for edition in editions: + try: + edition_job = AddBookToUserExportJob.objects.create(edition=edition, parent_job=job) + edition_job.start_job() + except Exception as err: # pylint: disable=broad-except + logger.exception("AddBookToUserExportJob %s Failed with error: %s", edition_job.id, err) + edition_job.set_status("failed") + except Exception as err: # pylint: disable=broad-except + logger.exception("trigger_books_jobs %s Failed with error: %s", job.id, err) + job.set_status("failed") def get_books_for_user(user): """Get all the books and editions related to a user""" diff --git a/bookwyrm/models/job.py b/bookwyrm/models/job.py index 4f5cb2093..5a2653571 100644 --- a/bookwyrm/models/job.py +++ b/bookwyrm/models/job.py @@ -135,8 +135,7 @@ class ParentJob(Job): ) app.control.revoke(list(tasks)) - for task in self.pending_child_jobs: - task.update(status=self.Status.STOPPED) + self.pending_child_jobs.update(status=self.Status.STOPPED) @property def has_completed(self): @@ -248,7 +247,7 @@ class SubTask(app.Task): """ def before_start( - self, task_id, args, kwargs + self, task_id, *args, **kwargs ): # pylint: disable=no-self-use, unused-argument """Handler called before the task starts. Override. @@ -272,7 +271,7 @@ class SubTask(app.Task): child_job.set_status(ChildJob.Status.ACTIVE) def on_success( - self, retval, task_id, args, kwargs + self, retval, task_id, *args, **kwargs ): # pylint: disable=no-self-use, unused-argument """Run by the worker if the task executes successfully. Override. diff --git a/bookwyrm/settings.py b/bookwyrm/settings.py index fcc91857a..7896850e3 100644 --- a/bookwyrm/settings.py +++ b/bookwyrm/settings.py @@ -442,3 +442,4 @@ if HTTP_X_FORWARDED_PROTO: # Do not change this setting unless you already have an existing # user with the same username - in which case you should change it! INSTANCE_ACTOR_USERNAME = "bookwyrm.instance.actor" +DATA_UPLOAD_MAX_MEMORY_SIZE = (1024**2 * 20) # 20MB TEMPORARY FIX WHILST WORKING ON THIS \ No newline at end of file diff --git a/bookwyrm/storage_backends.py b/bookwyrm/storage_backends.py index 6dd9f522c..c97b4e848 100644 --- a/bookwyrm/storage_backends.py +++ b/bookwyrm/storage_backends.py @@ -1,6 +1,7 @@ """Handles backends for storages""" import os from tempfile import SpooledTemporaryFile +from django.core.files.storage import FileSystemStorage from storages.backends.s3boto3 import S3Boto3Storage from storages.backends.azure_storage import AzureStorage @@ -61,3 +62,16 @@ class AzureImagesStorage(AzureStorage): # pylint: disable=abstract-method location = "images" overwrite_files = False + +class ExportsFileStorage(FileSystemStorage): # pylint: disable=abstract-method + """Storage class for exports contents with local files""" + + location = "exports" + overwrite_files = False + +class ExportsS3Storage(S3Boto3Storage): # pylint: disable=abstract-method + """Storage class for exports contents with S3""" + + location = "exports" + default_acl = None + overwrite_files = False \ No newline at end of file diff --git a/bookwyrm/templatetags/utilities.py b/bookwyrm/templatetags/utilities.py index fca66688a..754db41dd 100644 --- a/bookwyrm/templatetags/utilities.py +++ b/bookwyrm/templatetags/utilities.py @@ -9,7 +9,7 @@ from django.utils.translation import gettext_lazy as _ from django.templatetags.static import static from bookwyrm.models import User -from bookwyrm.settings import INSTANCE_ACTOR_USERNAME +from bookwyrm.settings import INSTANCE_ACTOR_USERNAME, USE_S3 register = template.Library() @@ -133,15 +133,22 @@ def get_file_size(file): """display the size of a file in human readable terms""" try: - raw_size = os.stat(file.path).st_size - if raw_size < 1024: - return f"{raw_size} bytes" - if raw_size < 1024**2: - return f"{raw_size/1024:.2f} KB" - if raw_size < 1024**3: - return f"{raw_size/1024**2:.2f} MB" - return f"{raw_size/1024**3:.2f} GB" - except Exception: # pylint: disable=broad-except + # TODO: this obviously isn't a proper solution + # boto storages do not implement 'path' + if not USE_S3: + raw_size = os.stat(file.path).st_size + if raw_size < 1024: + return f"{raw_size} bytes" + if raw_size < 1024**2: + return f"{raw_size/1024:.2f} KB" + if raw_size < 1024**3: + return f"{raw_size/1024**2:.2f} MB" + return f"{raw_size/1024**3:.2f} GB" + + return "" + + except Exception as e: # pylint: disable=broad-except + print(e) return "" diff --git a/exports/6ee95f7f-58cd-4bff-9d41-1ac2b3db6187 b/exports/6ee95f7f-58cd-4bff-9d41-1ac2b3db6187 new file mode 100644 index 000000000..d7166b703 Binary files /dev/null and b/exports/6ee95f7f-58cd-4bff-9d41-1ac2b3db6187 differ diff --git a/exports/ba15a57f-e29e-4a29-aaf4-306b66960273 b/exports/ba15a57f-e29e-4a29-aaf4-306b66960273 new file mode 100644 index 000000000..318069303 Binary files /dev/null and b/exports/ba15a57f-e29e-4a29-aaf4-306b66960273 differ diff --git a/requirements.txt b/requirements.txt index 6509effc7..31e59e8ef 100644 --- a/requirements.txt +++ b/requirements.txt @@ -35,6 +35,7 @@ opentelemetry-sdk==1.16.0 protobuf==3.20.* pyotp==2.8.0 qrcode==7.3.1 +s3-tar==0.1.13 # Dev pytest-django==4.1.0