# bookwyrm/bookwyrm/models/bookwyrm_export_job.py
"""Export user account to tar.gz file for import into another Bookwyrm instance"""
import logging
from uuid import uuid4
from boto3.session import Session as BotoSession
from s3_tar import S3Tar
from storages.backends.s3boto3 import S3Boto3Storage
from django.db.models import CASCADE, BooleanField, FileField, ForeignKey, JSONField
from django.db.models import Q
from django.core.serializers.json import DjangoJSONEncoder
from django.core.files.base import ContentFile
from django.utils import timezone
from bookwyrm import settings, storage_backends
from bookwyrm.models import AnnualGoal, ReadThrough, ShelfBook, List, ListItem
from bookwyrm.models import Review, Comment, Quotation
from bookwyrm.models import Edition
from bookwyrm.models import UserFollows, User, UserBlocks
from bookwyrm.models.job import ParentJob, ChildJob, ParentTask
from bookwyrm.tasks import app, IMPORTS
from bookwyrm.utils.tar import BookwyrmTarFile

logger = logging.getLogger(__name__)
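

# s3_tar builds its S3 client from the session it is handed, so this subclass
# pins every client to settings.AWS_S3_ENDPOINT_URL; that keeps exports working
# on S3-compatible storage that isn't AWS itself (MinIO, for example).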
class BookwyrmAwsSession(BotoSession):
"""a boto session that always uses settings.AWS_S3_ENDPOINT_URL"""
def client(self, *args, **kwargs): # pylint: disable=arguments-differ
kwargs["endpoint_url"] = settings.AWS_S3_ENDPOINT_URL
        return super().client("s3", *args, **kwargs)


class BookwyrmExportJob(ParentJob):
"""entry for a specific request to export a bookwyrm user"""
    # Only one of these fields is used, depending on the configuration.
    # Having a column per storage backend keeps the database schema
    # independent of the runtime USE_S3 setting.
export_data_file = FileField(null=True, storage=storage_backends.ExportsFileStorage)
export_data_s3 = FileField(null=True, storage=storage_backends.ExportsS3Storage)
export_json = JSONField(null=True, encoder=DjangoJSONEncoder)
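
    # json_completed flips to True once every child job has reported back;
    # it guards against starting the tar step more than once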
    json_completed = BooleanField(default=False)

@property
def export_data(self):
"""returns the file field of the configured storage backend"""
# TODO: We could check whether a field for a different backend is
# filled, to support migrating to a different backend.
if settings.USE_S3:
return self.export_data_s3
return self.export_data_file
@export_data.setter
def export_data(self, value):
"""sets the file field of the configured storage backend"""
if settings.USE_S3:
self.export_data_s3 = value
else:
            self.export_data_file = value

def start_job(self):
"""Start the job"""
task = start_export_task.delay(job_id=self.id, no_children=False)
self.task_id = task.id
        self.save(update_fields=["task_id"])

    def notify_child_job_complete(self):
        """update the parent job when a child job finishes its work"""
        if self.complete:
            return

        self.updated_date = timezone.now()
        self.save(update_fields=["updated_date"])

        if self.has_completed:
            if not self.json_completed:
                # JSON phase done: every book job has reported back, so
                # package the data and files into the tar archive
                tar_job = None
                try:
                    self.json_completed = True
                    self.save(update_fields=["json_completed"])

                    tar_job = AddFileToTar.objects.create(
                        parent_job=self, parent_export_job=self
                    )
                    tar_job.start_job()
                except Exception as err:  # pylint: disable=broad-except
                    logger.exception("job %s failed with error: %s", self.id, err)
                    if tar_job:
                        # the exception may have been raised before the
                        # tar job was created
                        tar_job.set_status("failed")
                    self.stop_job(reason="failed")
            else:
                # tar phase done: the export is complete
                self.complete_job()


class AddBookToUserExportJob(ChildJob):
"""append book metadata for each book in an export"""
    edition = ForeignKey(Edition, on_delete=CASCADE)

# pylint: disable=too-many-locals
def start_job(self):
"""Start the job"""
try:
book = {}
book["work"] = self.edition.parent_work.to_activity()
book["edition"] = self.edition.to_activity()
if book["edition"].get("cover"):
# change the URL to be relative to the JSON file
filename = book["edition"]["cover"]["url"].rsplit("/", maxsplit=1)[-1]
book["edition"]["cover"]["url"] = f"covers/{filename}"
# authors
book["authors"] = []
for author in self.edition.authors.all():
book["authors"].append(author.to_activity())
            # Shelves this book is on
            # every ShelfBook here is for this edition, so we don't bother
            # serializing the items, just the shelves
            book["shelves"] = []
shelf_books = (
ShelfBook.objects.select_related("shelf")
.filter(user=self.parent_job.user, book=self.edition)
.distinct()
)
for shelfbook in shelf_books:
book["shelves"].append(shelfbook.shelf.to_activity())
# Lists and ListItems
# ListItems include "notes" and "approved" so we need them
# even though we know it's this book
book["lists"] = []
list_items = ListItem.objects.filter(
book=self.edition, user=self.parent_job.user
).distinct()
for item in list_items:
                list_info = item.book_list.to_activity()
                # privacy isn't serialized by to_activity, so add it here
                list_info["privacy"] = item.book_list.privacy
                list_info["list_item"] = item.to_activity()
book["lists"].append(list_info)
# Statuses
# Can't use select_subclasses here because
# we need to filter on the "book" value,
# which is not available on an ordinary Status
            for status_type in ["comments", "quotations", "reviews"]:
                book[status_type] = []

            comments = Comment.objects.filter(
                user=self.parent_job.user, book=self.edition
            ).all()
            for comment in comments:
                obj = comment.to_activity()
                # reading progress isn't serialized by to_activity
                obj["progress"] = comment.progress
                obj["progress_mode"] = comment.progress_mode
                book["comments"].append(obj)

            quotes = Quotation.objects.filter(
                user=self.parent_job.user, book=self.edition
            ).all()
            for quote in quotes:
                obj = quote.to_activity()
                # quote positions aren't serialized by to_activity
                obj["position"] = quote.position
                obj["endposition"] = quote.endposition
                obj["position_mode"] = quote.position_mode
                book["quotations"].append(obj)

            reviews = Review.objects.filter(
                user=self.parent_job.user, book=self.edition
            ).all()
            for review in reviews:
                book["reviews"].append(review.to_activity())
# readthroughs can't be serialized to activity
book_readthroughs = (
ReadThrough.objects.filter(user=self.parent_job.user, book=self.edition)
.distinct()
.values()
)
book["readthroughs"] = list(book_readthroughs)
self.parent_job.export_json["books"].append(book)
self.parent_job.save(update_fields=["export_json"])
self.complete_job()
except Exception as err: # pylint: disable=broad-except
logger.exception(
"AddBookToUserExportJob %s Failed with error: %s", self.id, err
)
            self.set_status("failed")


class AddFileToTar(ChildJob):
"""add files to export"""
parent_export_job = ForeignKey(
BookwyrmExportJob, on_delete=CASCADE, related_name="child_edition_export_jobs"
    )

def start_job(self):
"""Start the job"""
# NOTE we are doing this all in one big job,
# which has the potential to block a thread
# This is because we need to refer to the same s3_job
# or BookwyrmTarFile whilst writing
# Using a series of jobs in a loop would be better
try:
export_data = self.parent_export_job.export_data
export_json = self.parent_export_job.export_json
json_data = DjangoJSONEncoder().encode(export_json)
user = self.parent_export_job.user
editions = get_books_for_user(user)
# filenames for later
export_data_original = str(export_data)
filename = str(self.parent_export_job.task_id)
if settings.USE_S3:
s3_job = S3Tar(
settings.AWS_STORAGE_BUCKET_NAME,
f"exports/{filename}.tar.gz",
session=BookwyrmAwsSession(),
)
                # save the JSON file and add it to the tar
                export_data.save(
                    f"archive_{filename}.json", ContentFile(json_data.encode("utf-8"))
                )
                s3_job.add_file(f"exports/{export_data.name}")

                # save the avatar image, if there is one, and add it too
                if user.avatar:
                    file_type = user.avatar.name.rsplit(".", maxsplit=1)[-1]
                    export_data.save(f"avatar_{filename}.{file_type}", user.avatar)
                    s3_job.add_file(f"exports/{export_data.name}")

                # covers are already in the bucket, so add them in place
                for book in editions:
                    if getattr(book, "cover", False):
                        s3_job.add_file(f"images/{book.cover.name}", folder="covers")

                s3_job.tar()

                # delete the interim files - we don't need them any more
                s3_storage = S3Boto3Storage(querystring_auth=True, custom_domain=None)
                s3_storage.delete(f"exports/{export_data_original}")
                s3_storage.delete(f"exports/archive_{filename}.json")
                if user.avatar:
                    s3_storage.delete(f"exports/avatar_{filename}.{file_type}")
else:
export_data.open("wb")
with BookwyrmTarFile.open(mode="w:gz", fileobj=export_data) as tar:
tar.write_bytes(json_data.encode("utf-8"))
# Add avatar image if present
if getattr(user, "avatar", False):
tar.add_image(user.avatar, filename="avatar")
for book in editions:
if getattr(book, "cover", False):
tar.add_image(book.cover)
export_data.close()
self.complete_job()
except Exception as err: # pylint: disable=broad-except
logger.exception("AddFileToTar %s Failed with error: %s", self.id, err)
self.stop_job(reason="failed")
self.parent_job.stop_job(reason="failed")
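

# The Celery tasks below drive the export: start_export_task seeds the job,
# json_export fills in user-level data and fans out to the per-section tasks
# and to trigger_books_jobs, which creates one AddBookToUserExportJob per
# edition; AddFileToTar packages everything once the JSON phase is done.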
@app.task(queue=IMPORTS, base=ParentTask)
def start_export_task(**kwargs):
"""trigger the child tasks for user export"""
job = BookwyrmExportJob.objects.get(id=kwargs["job_id"])
# don't start the job if it was stopped from the UI
if job.complete:
return
try:
# prepare the initial file and base json
job.export_data = ContentFile(b"", str(uuid4()))
job.export_json = job.user.to_activity()
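        # update_fields names both backend columns, but only the one that
        # matches USE_S3 was actually assigned via the export_data setter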
job.save(update_fields=["export_data_file", "export_data_s3", "export_json"])
# let's go
json_export.delay(job_id=job.id, job_user=job.user.id, no_children=False)
except Exception as err: # pylint: disable=broad-except
logger.exception("User Export Job %s Failed with error: %s", job.id, err)
        job.set_status("failed")


@app.task(queue=IMPORTS, base=ParentTask)
def export_saved_lists_task(**kwargs):
"""add user saved lists to export JSON"""
job = BookwyrmExportJob.objects.get(id=kwargs["job_id"])
saved_lists = List.objects.filter(id__in=job.user.saved_lists.all()).distinct()
job.export_json["saved_lists"] = [l.remote_id for l in saved_lists]
job.save(update_fields=["export_json"])
@app.task(queue=IMPORTS, base=ParentTask)
def export_follows_task(**kwargs):
"""add user follows to export JSON"""
job = BookwyrmExportJob.objects.get(id=kwargs["job_id"])
follows = UserFollows.objects.filter(user_subject=job.user).distinct()
following = User.objects.filter(userfollows_user_object__in=follows).distinct()
job.export_json["follows"] = [f.remote_id for f in following]
    job.save(update_fields=["export_json"])


@app.task(queue=IMPORTS, base=ParentTask)
def export_blocks_task(**kwargs):
"""add user blocks to export JSON"""
job = BookwyrmExportJob.objects.get(id=kwargs["job_id"])
blocks = UserBlocks.objects.filter(user_subject=job.user).distinct()
blocking = User.objects.filter(userblocks_user_object__in=blocks).distinct()
job.export_json["blocks"] = [b.remote_id for b in blocking]
    job.save(update_fields=["export_json"])


@app.task(queue=IMPORTS, base=ParentTask)
def export_reading_goals_task(**kwargs):
"""add user reading goals to export JSON"""
job = BookwyrmExportJob.objects.get(id=kwargs["job_id"])
reading_goals = AnnualGoal.objects.filter(user=job.user).distinct()
job.export_json["goals"] = []
for goal in reading_goals:
job.export_json["goals"].append(
{"goal": goal.goal, "year": goal.year, "privacy": goal.privacy}
)
    job.save(update_fields=["export_json"])


@app.task(queue=IMPORTS, base=ParentTask)
def json_export(**kwargs):
"""Generate an export for a user"""
    # look the job up outside the try block so the except handler
    # can safely reference it
    job = BookwyrmExportJob.objects.get(id=kwargs["job_id"])
    job_id = kwargs["job_id"]
    try:
        job.set_status("active")
if not job.export_json.get("icon"):
job.export_json["icon"] = {}
else:
# change the URL to be relative to the JSON file
file_type = job.export_json["icon"]["url"].rsplit(".", maxsplit=1)[-1]
filename = f"avatar.{file_type}"
job.export_json["icon"]["url"] = filename
# Additional settings - can't be serialized as AP
vals = [
"show_goal",
"preferred_timezone",
"default_post_privacy",
"show_suggested_users",
]
job.export_json["settings"] = {}
for k in vals:
job.export_json["settings"][k] = getattr(job.user, k)
job.export_json["books"] = []
# save settings we just updated
job.save(update_fields=["export_json"])
        # trigger subtasks
        export_saved_lists_task.delay(job_id=job_id, no_children=False)
        export_follows_task.delay(job_id=job_id, no_children=False)
        export_blocks_task.delay(job_id=job_id, no_children=False)
        export_reading_goals_task.delay(job_id=job_id, no_children=False)
        trigger_books_jobs.delay(job_id=job_id, no_children=False)
except Exception as err: # pylint: disable=broad-except
logger.exception(
"json_export task in job %s Failed with error: %s",
job.id,
err,
)
        job.set_status("failed")


@app.task(queue=IMPORTS, base=ParentTask)
def trigger_books_jobs(**kwargs):
"""trigger tasks to get data for each book"""
    # look the job up outside the try block so the except handler
    # can safely reference it
    job = BookwyrmExportJob.objects.get(id=kwargs["job_id"])
    try:
        editions = get_books_for_user(job.user)
if len(editions) == 0:
job.notify_child_job_complete()
return
        for edition in editions:
            edition_job = None
            try:
                edition_job = AddBookToUserExportJob.objects.create(
                    edition=edition, parent_job=job
                )
                edition_job.start_job()
            except Exception as err:  # pylint: disable=broad-except
                logger.exception(
                    "AddBookToUserExportJob for edition %s failed with error: %s",
                    edition.id,
                    err,
                )
                # the child job may have failed before it was created
                if edition_job:
                    edition_job.set_status("failed")
except Exception as err: # pylint: disable=broad-except
logger.exception("trigger_books_jobs %s Failed with error: %s", job.id, err)
    job.set_status("failed")


def get_books_for_user(user):
"""Get all the books and editions related to a user"""
editions = (
Edition.objects.select_related("parent_work")
.filter(
Q(shelves__user=user)
| Q(readthrough__user=user)
| Q(review__user=user)
| Q(list__user=user)
| Q(comment__user=user)
| Q(quotation__user=user)
)
.distinct()
)
return editions