Merge pull request #3228 from hughrun/user-export

Fix user exports to deal with s3 storage
This commit is contained in:
Bart Schuurmans 2024-04-13 22:53:58 +02:00 committed by GitHub
commit 21a39f8170
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
25 changed files with 730 additions and 303 deletions

View file

@ -71,6 +71,9 @@ ENABLE_THUMBNAIL_GENERATION=true
USE_S3=false
AWS_ACCESS_KEY_ID=
AWS_SECRET_ACCESS_KEY=
# seconds for signed S3 urls to expire
# this is currently only used for user export files
S3_SIGNED_URL_EXPIRY=900
# Commented are example values if you use a non-AWS, S3-compatible service
# AWS S3 should work with only AWS_STORAGE_BUCKET_NAME and AWS_S3_REGION_NAME

1
.gitignore vendored
View file

@ -16,6 +16,7 @@
# BookWyrm
.env
/images/
/exports/
/static/
bookwyrm/static/css/bookwyrm.css
bookwyrm/static/css/themes/

View file

@ -0,0 +1,92 @@
# Generated by Django 3.2.23 on 2024-01-28 02:49
import bookwyrm.storage_backends
import django.core.serializers.json
from django.db import migrations, models
import django.db.models.deletion
class Migration(migrations.Migration):
    """Add JSON export fields to BookwyrmExportJob and create two child-job models."""

    dependencies = [
        ("bookwyrm", "0192_sitesettings_user_exports_enabled"),
    ]

    operations = [
        # Nullable JSON column on the export job, encoded with DjangoJSONEncoder.
        migrations.AddField(
            model_name="bookwyrmexportjob",
            name="export_json",
            field=models.JSONField(
                encoder=django.core.serializers.json.DjangoJSONEncoder, null=True
            ),
        ),
        # Boolean flag on the export job; existing rows get False.
        migrations.AddField(
            model_name="bookwyrmexportjob",
            name="json_completed",
            field=models.BooleanField(default=False),
        ),
        # export_data now uses ExportsFileStorage as its storage.
        migrations.AlterField(
            model_name="bookwyrmexportjob",
            name="export_data",
            field=models.FileField(
                null=True,
                storage=bookwyrm.storage_backends.ExportsFileStorage,
                upload_to="",
            ),
        ),
        # Multi-table child of ChildJob, linked to the export job it belongs to.
        migrations.CreateModel(
            name="AddFileToTar",
            fields=[
                (
                    "childjob_ptr",
                    models.OneToOneField(
                        auto_created=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        parent_link=True,
                        primary_key=True,
                        serialize=False,
                        to="bookwyrm.childjob",
                    ),
                ),
                (
                    "parent_export_job",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="child_edition_export_jobs",
                        to="bookwyrm.bookwyrmexportjob",
                    ),
                ),
            ],
            options={
                "abstract": False,
            },
            bases=("bookwyrm.childjob",),
        ),
        # Multi-table child of ChildJob, linked to a single edition.
        migrations.CreateModel(
            name="AddBookToUserExportJob",
            fields=[
                (
                    "childjob_ptr",
                    models.OneToOneField(
                        auto_created=True,
                        on_delete=django.db.models.deletion.CASCADE,
                        parent_link=True,
                        primary_key=True,
                        serialize=False,
                        to="bookwyrm.childjob",
                    ),
                ),
                (
                    "edition",
                    models.ForeignKey(
                        on_delete=django.db.models.deletion.CASCADE,
                        to="bookwyrm.edition",
                    ),
                ),
            ],
            options={
                "abstract": False,
            },
            bases=("bookwyrm.childjob",),
        ),
    ]

View file

@ -0,0 +1,13 @@
# Generated by Django 3.2.23 on 2024-03-18 17:37
from django.db import migrations
class Migration(migrations.Migration):
    """Join two migration branches; applies no schema changes."""

    # Both listed migrations must be applied before this one.
    dependencies = [
        ("bookwyrm", "0193_auto_20240128_0249"),
        ("bookwyrm", "0195_alter_user_preferred_language"),
    ]

    # Intentionally empty: this migration only ties the two dependencies together.
    operations = []

View file

@ -0,0 +1,13 @@
# Generated by Django 3.2.25 on 2024-03-24 02:35
from django.db import migrations
class Migration(migrations.Migration):
    """Join two migration branches; applies no schema changes."""

    # Both listed migrations must be applied before this one.
    dependencies = [
        ("bookwyrm", "0196_merge_20240318_1737"),
        ("bookwyrm", "0196_merge_pr3134_into_main"),
    ]

    # Intentionally empty: this migration only ties the two dependencies together.
    operations = []

View file

@ -0,0 +1,23 @@
# Generated by Django 3.2.25 on 2024-03-26 11:37
import bookwyrm.models.bookwyrm_export_job
from django.db import migrations, models
class Migration(migrations.Migration):
    """Point BookwyrmExportJob.export_data at the select_exports_storage callable."""

    dependencies = [
        ("bookwyrm", "0197_merge_20240324_0235"),
    ]

    operations = [
        # The storage is given as a callable (not a storage class), so the
        # backend is chosen by calling select_exports_storage.
        migrations.AlterField(
            model_name="bookwyrmexportjob",
            name="export_data",
            field=models.FileField(
                null=True,
                storage=bookwyrm.models.bookwyrm_export_job.select_exports_storage,
                upload_to="",
            ),
        ),
    ]

View file

@ -0,0 +1,13 @@
# Generated by Django 3.2.25 on 2024-03-26 12:17
from django.db import migrations
class Migration(migrations.Migration):
    """Join two migration branches; applies no schema changes."""

    # Both listed migrations must be applied before this one.
    dependencies = [
        ("bookwyrm", "0198_alter_bookwyrmexportjob_export_data"),
        ("bookwyrm", "0198_book_search_vector_author_aliases"),
    ]

    # Intentionally empty: this migration only ties the two dependencies together.
    operations = []

View file

@ -0,0 +1,27 @@
# Generated by Django 3.2.25 on 2024-03-27 19:14
from django.db import migrations
class Migration(migrations.Migration):
    """Drop the AddFileToTar and AddBookToUserExportJob models."""

    dependencies = [
        ("bookwyrm", "0199_merge_20240326_1217"),
    ]

    operations = [
        # AddFileToTar's two fields are removed first, then both models are deleted.
        migrations.RemoveField(
            model_name="addfiletotar",
            name="childjob_ptr",
        ),
        migrations.RemoveField(
            model_name="addfiletotar",
            name="parent_export_job",
        ),
        migrations.DeleteModel(
            name="AddBookToUserExportJob",
        ),
        migrations.DeleteModel(
            name="AddFileToTar",
        ),
    ]

View file

@ -0,0 +1,13 @@
# Generated by Django 3.2.25 on 2024-04-13 02:32
from django.db import migrations
class Migration(migrations.Migration):
    """Join two migration branches; applies no schema changes."""

    # Both listed migrations must be applied before this one.
    dependencies = [
        ("bookwyrm", "0200_auto_20240327_1914"),
        ("bookwyrm", "0204_merge_20240409_1042"),
    ]

    # Intentionally empty: this migration only ties the two dependencies together.
    operations = []

View file

@ -1,174 +1,300 @@
"""Export user account to tar.gz file for import into another Bookwyrm instance"""
import dataclasses
import logging
from uuid import uuid4
import os
from django.db.models import FileField
from boto3.session import Session as BotoSession
from s3_tar import S3Tar
from django.db.models import BooleanField, FileField, JSONField
from django.db.models import Q
from django.core.serializers.json import DjangoJSONEncoder
from django.core.files.base import ContentFile
from django.utils.module_loading import import_string
from bookwyrm.models import AnnualGoal, ReadThrough, ShelfBook, List, ListItem
from bookwyrm import settings, storage_backends
from bookwyrm.models import AnnualGoal, ReadThrough, ShelfBook, ListItem
from bookwyrm.models import Review, Comment, Quotation
from bookwyrm.models import Edition
from bookwyrm.models import UserFollows, User, UserBlocks
from bookwyrm.models.job import ParentJob, ParentTask
from bookwyrm.models.job import ParentJob
from bookwyrm.tasks import app, IMPORTS
from bookwyrm.utils.tar import BookwyrmTarFile
logger = logging.getLogger(__name__)
class BookwyrmAwsSession(BotoSession):
    """boto session whose clients always target settings.AWS_S3_ENDPOINT_URL"""

    def client(self, *args, **kwargs):  # pylint: disable=arguments-differ
        """Create an "s3" client, forcing the configured endpoint URL.

        Any ``endpoint_url`` supplied by the caller is replaced. "s3" is always
        passed as the first positional argument, ahead of the caller's own.
        """
        # NOTE(review): a caller that passes the service name positionally
        # would have it forwarded as the second positional argument - confirm
        # that callers of this session never do so.
        forced_kwargs = {**kwargs, "endpoint_url": settings.AWS_S3_ENDPOINT_URL}
        return super().client("s3", *args, **forced_kwargs)
def select_exports_storage():
    """Return the storage backend instance used for user export archives.

    Passed as a callable to ``FileField(storage=...)`` so the backend follows
    ``settings.EXPORTS_STORAGE`` at runtime instead of being fixed in the
    field definition.

    Returns:
        An instance of the class named by ``settings.EXPORTS_STORAGE``, or
        Django's default storage when that setting is empty.
    """
    storage_path = settings.EXPORTS_STORAGE
    if not storage_path:
        # The setting may be None when no exports backend is configured.
        # Django calls this function while the model class is being loaded,
        # and import_string(None) would raise there, so fall back to the
        # default storage rather than breaking model import.
        # pylint: disable=import-outside-toplevel
        from django.core.files.storage import default_storage

        return default_storage
    return import_string(storage_path)()
class BookwyrmExportJob(ParentJob):
"""entry for a specific request to export a bookwyrm user"""
export_data = FileField(null=True)
export_data = FileField(null=True, storage=select_exports_storage)
export_json = JSONField(null=True, encoder=DjangoJSONEncoder)
json_completed = BooleanField(default=False)
def start_job(self):
"""Start the job"""
start_export_task.delay(job_id=self.id, no_children=True)
"""schedule the first task"""
return self
task = create_export_json_task.delay(job_id=self.id)
self.task_id = task.id
self.save(update_fields=["task_id"])
@app.task(queue=IMPORTS, base=ParentTask)
def start_export_task(**kwargs):
"""trigger the child tasks for each row"""
job = BookwyrmExportJob.objects.get(id=kwargs["job_id"])
@app.task(queue=IMPORTS)
def create_export_json_task(job_id):
"""create the JSON data for the export"""
job = BookwyrmExportJob.objects.get(id=job_id)
# don't start the job if it was stopped from the UI
if job.complete:
return
try:
# This is where ChildJobs get made
job.export_data = ContentFile(b"", str(uuid4()))
json_data = json_export(job.user)
tar_export(json_data, job.user, job.export_data)
job.save(update_fields=["export_data"])
job.set_status("active")
# generate JSON structure
job.export_json = export_json(job.user)
job.save(update_fields=["export_json"])
# create archive in separate task
create_archive_task.delay(job_id=job.id)
except Exception as err: # pylint: disable=broad-except
logger.exception("User Export Job %s Failed with error: %s", job.id, err)
logger.exception(
"create_export_json_task for %s failed with error: %s", job, err
)
job.set_status("failed")
job.set_status("complete")
def archive_file_location(file, directory="") -> str:
    """build the path of a file inside the archive, relative to the archive root

    The file's own ``name`` is joined onto ``directory``; with the default
    empty directory the result is just the file's name.
    """
    name_in_storage = file.name
    return os.path.join(directory, name_in_storage)
def tar_export(json_data: str, user, file):
"""wrap the export information in a tar file"""
file.open("wb")
with BookwyrmTarFile.open(mode="w:gz", fileobj=file) as tar:
tar.write_bytes(json_data.encode("utf-8"))
def add_file_to_s3_tar(s3_tar: S3Tar, storage, file, directory=""):
    """
    queue a stored file for inclusion in an S3Tar archive

    The source key is the file's name under the storage's location. The
    folder inside the archive is the directory part of the file's archive
    location, so sub-directories in the file's name are kept below `directory`.
    """
    source_key = os.path.join(storage.location, file.name)
    target_folder = os.path.dirname(
        archive_file_location(file, directory=directory)
    )
    s3_tar.add_file(source_key, folder=target_folder)
@app.task(queue=IMPORTS)
def create_archive_task(job_id):
"""create the archive containing the JSON file and additional files"""
job = BookwyrmExportJob.objects.get(id=job_id)
# don't start the job if it was stopped from the UI
if job.complete:
return
try:
export_task_id = str(job.task_id)
archive_filename = f"{export_task_id}.tar.gz"
export_json_bytes = DjangoJSONEncoder().encode(job.export_json).encode("utf-8")
user = job.user
editions = get_books_for_user(user)
if settings.USE_S3:
# Storage for writing temporary files
exports_storage = storage_backends.ExportsS3Storage()
# Handle for creating the final archive
s3_tar = S3Tar(
exports_storage.bucket_name,
os.path.join(exports_storage.location, archive_filename),
session=BookwyrmAwsSession(),
)
# Save JSON file to a temporary location
export_json_tmp_file = os.path.join(export_task_id, "archive.json")
exports_storage.save(
export_json_tmp_file,
ContentFile(export_json_bytes),
)
s3_tar.add_file(
os.path.join(exports_storage.location, export_json_tmp_file)
)
# Add images to TAR
images_storage = storage_backends.ImagesStorage()
if user.avatar:
add_file_to_s3_tar(s3_tar, images_storage, user.avatar)
for edition in editions:
if edition.cover:
add_file_to_s3_tar(
s3_tar, images_storage, edition.cover, directory="images"
)
# Create archive and store file name
s3_tar.tar()
job.export_data = archive_filename
job.save(update_fields=["export_data"])
# Delete temporary files
exports_storage.delete(export_json_tmp_file)
else:
job.export_data = archive_filename
with job.export_data.open("wb") as tar_file:
with BookwyrmTarFile.open(mode="w:gz", fileobj=tar_file) as tar:
# save json file
tar.write_bytes(export_json_bytes)
# Add avatar image if present
if getattr(user, "avatar", False):
tar.add_image(user.avatar, filename="avatar")
if user.avatar:
tar.add_image(user.avatar)
editions = get_books_for_user(user)
for book in editions:
if getattr(book, "cover", False):
tar.add_image(book.cover)
for edition in editions:
if edition.cover:
tar.add_image(edition.cover, directory="images")
job.save(update_fields=["export_data"])
file.close()
job.set_status("completed")
except Exception as err: # pylint: disable=broad-except
logger.exception("create_archive_task for %s failed with error: %s", job, err)
job.set_status("failed")
def json_export(
user,
): # pylint: disable=too-many-locals, too-many-statements, too-many-branches
"""Generate an export for a user"""
def export_json(user: User):
    """assemble the complete export structure for a user

    The serialized user is the root of the structure; every other section
    is stored under its own key on that root.
    """
    export = export_user(user)
    # (key, exporter) pairs, applied in this order
    sections = (
        ("settings", export_settings),
        ("goals", export_goals),
        ("books", export_books),
        ("saved_lists", export_saved_lists),
        ("follows", export_follows),
        ("blocks", export_blocks),
    )
    for key, exporter in sections:
        export[key] = exporter(user)
    return export
# User as AP object
exported_user = user.to_activity()
# I don't love this but it prevents a JSON encoding error
# when there is no user image
if exported_user.get("icon") in (None, dataclasses.MISSING):
exported_user["icon"] = {}
def export_user(user: User):
"""export user data"""
data = user.to_activity()
if user.avatar:
data["icon"]["url"] = archive_file_location(user.avatar)
else:
# change the URL to be relative to the JSON file
file_type = exported_user["icon"]["url"].rsplit(".", maxsplit=1)[-1]
filename = f"avatar.{file_type}"
exported_user["icon"]["url"] = filename
data["icon"] = {}
return data
# Additional settings - can't be serialized as AP
def export_settings(user: User):
"""Additional settings - can't be serialized as AP"""
vals = [
"show_goal",
"preferred_timezone",
"default_post_privacy",
"show_suggested_users",
]
exported_user["settings"] = {}
for k in vals:
exported_user["settings"][k] = getattr(user, k)
return {k: getattr(user, k) for k in vals}
# Reading goals - can't be serialized as AP
def export_saved_lists(user: User):
    """collect the remote id of every list the user has saved"""
    saved = user.saved_lists.all()
    return [book_list.remote_id for book_list in saved]
def export_follows(user: User):
    """collect the remote id of every user this user follows"""
    # follow relations in which this user is the subject
    relations = UserFollows.objects.filter(user_subject=user).distinct()
    # the users on the object side of those relations
    followed_users = User.objects.filter(
        userfollows_user_object__in=relations
    ).distinct()
    return [followed.remote_id for followed in followed_users]
def export_blocks(user: User):
    """collect the remote id of every user this user blocks"""
    # block relations in which this user is the subject
    relations = UserBlocks.objects.filter(user_subject=user).distinct()
    # the users on the object side of those relations
    blocked_users = User.objects.filter(
        userblocks_user_object__in=relations
    ).distinct()
    return [blocked.remote_id for blocked in blocked_users]
def export_goals(user: User):
"""add user reading goals to export JSON"""
reading_goals = AnnualGoal.objects.filter(user=user).distinct()
exported_user["goals"] = []
for goal in reading_goals:
exported_user["goals"].append(
return [
{"goal": goal.goal, "year": goal.year, "privacy": goal.privacy}
for goal in reading_goals
]
def export_books(user: User):
    """build one export entry per edition returned by get_books_for_user"""
    return [export_book(user, edition) for edition in get_books_for_user(user)]
def export_book(user: User, edition: Edition):
"""add book to export JSON"""
data = {}
data["work"] = edition.parent_work.to_activity()
data["edition"] = edition.to_activity()
if edition.cover:
data["edition"]["cover"]["url"] = archive_file_location(
edition.cover, directory="images"
)
# Reading history - can't be serialized as AP
readthroughs = ReadThrough.objects.filter(user=user).distinct().values()
readthroughs = list(readthroughs)
# Books
editions = get_books_for_user(user)
exported_user["books"] = []
for edition in editions:
book = {}
book["work"] = edition.parent_work.to_activity()
book["edition"] = edition.to_activity()
if book["edition"].get("cover"):
# change the URL to be relative to the JSON file
filename = book["edition"]["cover"]["url"].rsplit("/", maxsplit=1)[-1]
book["edition"]["cover"]["url"] = f"covers/{filename}"
# authors
book["authors"] = []
for author in edition.authors.all():
book["authors"].append(author.to_activity())
data["authors"] = [author.to_activity() for author in edition.authors.all()]
# Shelves this book is on
# Every ShelfItem is this book so we don't bother serializing
book["shelves"] = []
shelf_books = (
ShelfBook.objects.select_related("shelf")
.filter(user=user, book=edition)
.distinct()
)
for shelfbook in shelf_books:
book["shelves"].append(shelfbook.shelf.to_activity())
data["shelves"] = [shelfbook.shelf.to_activity() for shelfbook in shelf_books]
# Lists and ListItems
# ListItems include "notes" and "approved" so we need them
# even though we know it's this book
book["lists"] = []
list_items = ListItem.objects.filter(book=edition, user=user).distinct()
data["lists"] = []
for item in list_items:
list_info = item.book_list.to_activity()
list_info[
"privacy"
] = item.book_list.privacy # this isn't serialized so we add it
list_info["list_item"] = item.to_activity()
book["lists"].append(list_info)
data["lists"].append(list_info)
# Statuses
# Can't use select_subclasses here because
# we need to filter on the "book" value,
# which is not available on an ordinary Status
for status in ["comments", "quotations", "reviews"]:
book[status] = []
data[status] = []
comments = Comment.objects.filter(user=user, book=edition).all()
for status in comments:
obj = status.to_activity()
obj["progress"] = status.progress
obj["progress_mode"] = status.progress_mode
book["comments"].append(obj)
data["comments"].append(obj)
quotes = Quotation.objects.filter(user=user, book=edition).all()
for status in quotes:
@ -176,38 +302,17 @@ def json_export(
obj["position"] = status.position
obj["endposition"] = status.endposition
obj["position_mode"] = status.position_mode
book["quotations"].append(obj)
data["quotations"].append(obj)
reviews = Review.objects.filter(user=user, book=edition).all()
for status in reviews:
obj = status.to_activity()
book["reviews"].append(obj)
data["reviews"] = [status.to_activity() for status in reviews]
# readthroughs can't be serialized to activity
book_readthroughs = (
ReadThrough.objects.filter(user=user, book=edition).distinct().values()
)
book["readthroughs"] = list(book_readthroughs)
# append everything
exported_user["books"].append(book)
# saved book lists - just the remote id
saved_lists = List.objects.filter(id__in=user.saved_lists.all()).distinct()
exported_user["saved_lists"] = [l.remote_id for l in saved_lists]
# follows - just the remote id
follows = UserFollows.objects.filter(user_subject=user).distinct()
following = User.objects.filter(userfollows_user_object__in=follows).distinct()
exported_user["follows"] = [f.remote_id for f in following]
# blocks - just the remote id
blocks = UserBlocks.objects.filter(user_subject=user).distinct()
blocking = User.objects.filter(userblocks_user_object__in=blocks).distinct()
exported_user["blocks"] = [b.remote_id for b in blocking]
return DjangoJSONEncoder().encode(exported_user)
data["readthroughs"] = list(book_readthroughs)
return data
def get_books_for_user(user):

View file

@ -42,20 +42,23 @@ def start_import_task(**kwargs):
try:
archive_file.open("rb")
with BookwyrmTarFile.open(mode="r:gz", fileobj=archive_file) as tar:
job.import_data = json.loads(tar.read("archive.json").decode("utf-8"))
json_filename = next(
filter(lambda n: n.startswith("archive"), tar.getnames())
)
job.import_data = json.loads(tar.read(json_filename).decode("utf-8"))
if "include_user_profile" in job.required:
update_user_profile(job.user, tar, job.import_data)
if "include_user_settings" in job.required:
update_user_settings(job.user, job.import_data)
if "include_goals" in job.required:
update_goals(job.user, job.import_data.get("goals"))
update_goals(job.user, job.import_data.get("goals", []))
if "include_saved_lists" in job.required:
upsert_saved_lists(job.user, job.import_data.get("saved_lists"))
upsert_saved_lists(job.user, job.import_data.get("saved_lists", []))
if "include_follows" in job.required:
upsert_follows(job.user, job.import_data.get("follows"))
upsert_follows(job.user, job.import_data.get("follows", []))
if "include_blocks" in job.required:
upsert_user_blocks(job.user, job.import_data.get("blocks"))
upsert_user_blocks(job.user, job.import_data.get("blocks", []))
process_books(job, tar)
@ -212,7 +215,7 @@ def upsert_statuses(user, cls, data, book_remote_id):
instance.save() # save and broadcast
else:
logger.info("User does not have permission to import statuses")
logger.warning("User does not have permission to import statuses")
def upsert_lists(user, lists, book_id):

View file

@ -135,8 +135,7 @@ class ParentJob(Job):
)
app.control.revoke(list(tasks))
for task in self.pending_child_jobs:
task.update(status=self.Status.STOPPED)
self.pending_child_jobs.update(status=self.Status.STOPPED)
@property
def has_completed(self):
@ -248,7 +247,7 @@ class SubTask(app.Task):
"""
def before_start(
self, task_id, args, kwargs
self, task_id, *args, **kwargs
): # pylint: disable=no-self-use, unused-argument
"""Handler called before the task starts. Override.
@ -272,7 +271,7 @@ class SubTask(app.Task):
child_job.set_status(ChildJob.Status.ACTIVE)
def on_success(
self, retval, task_id, args, kwargs
self, retval, task_id, *args, **kwargs
): # pylint: disable=no-self-use, unused-argument
"""Run by the worker if the task executes successfully. Override.

View file

@ -374,6 +374,7 @@ if USE_HTTPS:
USE_S3 = env.bool("USE_S3", False)
USE_AZURE = env.bool("USE_AZURE", False)
S3_SIGNED_URL_EXPIRY = env.int("S3_SIGNED_URL_EXPIRY", 900)
if USE_S3:
# AWS settings
@ -388,16 +389,20 @@ if USE_S3:
# S3 Static settings
STATIC_LOCATION = "static"
STATIC_URL = f"{PROTOCOL}://{AWS_S3_CUSTOM_DOMAIN}/{STATIC_LOCATION}/"
STATIC_FULL_URL = STATIC_URL
STATICFILES_STORAGE = "bookwyrm.storage_backends.StaticStorage"
# S3 Media settings
MEDIA_LOCATION = "images"
MEDIA_URL = f"{PROTOCOL}://{AWS_S3_CUSTOM_DOMAIN}/{MEDIA_LOCATION}/"
MEDIA_FULL_URL = MEDIA_URL
STATIC_FULL_URL = STATIC_URL
DEFAULT_FILE_STORAGE = "bookwyrm.storage_backends.ImagesStorage"
# S3 Exports settings
EXPORTS_STORAGE = "bookwyrm.storage_backends.ExportsS3Storage"
# Content Security Policy
CSP_DEFAULT_SRC = ["'self'", AWS_S3_CUSTOM_DOMAIN] + CSP_ADDITIONAL_HOSTS
CSP_SCRIPT_SRC = ["'self'", AWS_S3_CUSTOM_DOMAIN] + CSP_ADDITIONAL_HOSTS
elif USE_AZURE:
# Azure settings
AZURE_ACCOUNT_NAME = env("AZURE_ACCOUNT_NAME")
AZURE_ACCOUNT_KEY = env("AZURE_ACCOUNT_KEY")
AZURE_CONTAINER = env("AZURE_CONTAINER")
@ -407,6 +412,7 @@ elif USE_AZURE:
STATIC_URL = (
f"{PROTOCOL}://{AZURE_CUSTOM_DOMAIN}/{AZURE_CONTAINER}/{STATIC_LOCATION}/"
)
STATIC_FULL_URL = STATIC_URL
STATICFILES_STORAGE = "bookwyrm.storage_backends.AzureStaticStorage"
# Azure Media settings
MEDIA_LOCATION = "images"
@ -414,15 +420,24 @@ elif USE_AZURE:
f"{PROTOCOL}://{AZURE_CUSTOM_DOMAIN}/{AZURE_CONTAINER}/{MEDIA_LOCATION}/"
)
MEDIA_FULL_URL = MEDIA_URL
STATIC_FULL_URL = STATIC_URL
DEFAULT_FILE_STORAGE = "bookwyrm.storage_backends.AzureImagesStorage"
# Azure Exports settings
EXPORTS_STORAGE = None # not implemented yet
# Content Security Policy
CSP_DEFAULT_SRC = ["'self'", AZURE_CUSTOM_DOMAIN] + CSP_ADDITIONAL_HOSTS
CSP_SCRIPT_SRC = ["'self'", AZURE_CUSTOM_DOMAIN] + CSP_ADDITIONAL_HOSTS
else:
# Static settings
STATIC_URL = "/static/"
STATIC_FULL_URL = f"{PROTOCOL}://{DOMAIN}{STATIC_URL}"
STATICFILES_STORAGE = "django.contrib.staticfiles.storage.StaticFilesStorage"
# Media settings
MEDIA_URL = "/images/"
MEDIA_FULL_URL = f"{PROTOCOL}://{DOMAIN}{MEDIA_URL}"
STATIC_FULL_URL = f"{PROTOCOL}://{DOMAIN}{STATIC_URL}"
DEFAULT_FILE_STORAGE = "django.core.files.storage.FileSystemStorage"
# Exports settings
EXPORTS_STORAGE = "bookwyrm.storage_backends.ExportsFileStorage"
# Content Security Policy
CSP_DEFAULT_SRC = ["'self'"] + CSP_ADDITIONAL_HOSTS
CSP_SCRIPT_SRC = ["'self'"] + CSP_ADDITIONAL_HOSTS

View file

@ -1,6 +1,7 @@
"""Handles backends for storages"""
import os
from tempfile import SpooledTemporaryFile
from django.core.files.storage import FileSystemStorage
from storages.backends.s3boto3 import S3Boto3Storage
from storages.backends.azure_storage import AzureStorage
@ -61,3 +62,18 @@ class AzureImagesStorage(AzureStorage): # pylint: disable=abstract-method
location = "images"
overwrite_files = False
class ExportsFileStorage(FileSystemStorage):  # pylint: disable=abstract-method
    """Storage class for exports contents with local files"""

    # Directory name used as this storage's location.
    location = "exports"
    # NOTE(review): confirm that FileSystemStorage actually reads an
    # `overwrite_files` attribute; if it does not, this line has no effect.
    overwrite_files = False
class ExportsS3Storage(S3Boto3Storage):  # pylint: disable=abstract-method
    """Storage class for exports contents with S3"""

    # Key prefix under which export files are stored.
    location = "exports"
    # No ACL is set explicitly on export objects.
    default_acl = None
    # NOTE(review): django-storages documents `file_overwrite` for the S3
    # backend; confirm that `overwrite_files` is honoured here. Switching to
    # `file_overwrite = False` would let save() return a renamed key, so any
    # caller that ignores save()'s return value must be checked first.
    overwrite_files = False

View file

@ -97,25 +97,25 @@
</td>
</tr>
{% endif %}
{% for job in jobs %}
{% for export in jobs %}
<tr>
<td>{{ job.updated_date }}</td>
<td>{{ export.job.updated_date }}</td>
<td>
<span
{% if job.status == "stopped" or job.status == "failed" %}
{% if export.job.status == "stopped" or export.job.status == "failed" %}
class="tag is-danger"
{% elif job.status == "pending" %}
{% elif export.job.status == "pending" %}
class="tag is-warning"
{% elif job.complete %}
{% elif export.job.complete %}
class="tag"
{% else %}
class="tag is-success"
{% endif %}
>
{% if job.status %}
{{ job.status }}
{{ job.status_display }}
{% elif job.complete %}
{% if export.job.status %}
{{ export.job.status }}
{{ export.job.status_display }}
{% elif export.job.complete %}
{% trans "Complete" %}
{% else %}
{% trans "Active" %}
@ -123,18 +123,20 @@
</span>
</td>
<td>
<span>{{ job.export_data|get_file_size }}</span>
{% if export.size %}
<span>{{ export.size|get_file_size }}</span>
{% endif %}
</td>
<td>
{% if job.complete and not job.status == "stopped" and not job.status == "failed" %}
<p>
<a download="" href="/preferences/user-export/{{ job.task_id }}">
{% if export.url %}
<a href="{{ export.url }}">
<span class="icon icon-download" aria-hidden="true"></span>
<span class="is-hidden-mobile">
{% trans "Download your export" %}
</span>
</a>
</p>
{% elif export.unavailable %}
{% trans "Archive is no longer available" %}
{% endif %}
</td>
</tr>

View file

@ -157,13 +157,13 @@
>
<div class="notification is-danger is-light">
<p class="my-2">{% trans "Users are currently unable to start new user exports. This is the default setting." %}</p>
{% if use_s3 %}
<p>{% trans "It is not currently possible to provide user exports when using s3 storage. The BookWyrm development team are working on a fix for this." %}</p>
{% if use_azure %}
<p>{% trans "It is not currently possible to provide user exports when using Azure storage." %}</p>
{% endif %}
</div>
{% csrf_token %}
<div class="control">
<button type="submit" class="button is-success" {% if use_s3 %}disabled{% endif %}>
<button type="submit" class="button is-success" {% if use_azure %}disabled{% endif %}>
{% trans "Enable user exports" %}
</button>
</div>

View file

@ -130,11 +130,14 @@ def id_to_username(user_id):
@register.filter(name="get_file_size")
def get_file_size(file):
def get_file_size(nbytes):
"""display the size of a file in human readable terms"""
try:
raw_size = os.stat(file.path).st_size
raw_size = float(nbytes)
except (ValueError, TypeError):
return repr(nbytes)
else:
if raw_size < 1024:
return f"{raw_size} bytes"
if raw_size < 1024**2:
@ -142,8 +145,6 @@ def get_file_size(file):
if raw_size < 1024**3:
return f"{raw_size/1024**2:.2f} MB"
return f"{raw_size/1024**3:.2f} GB"
except Exception: # pylint: disable=broad-except
return ""
@register.filter(name="get_user_permission")

View file

@ -1,17 +1,18 @@
"""test bookwyrm user export functions"""
import datetime
import json
import pathlib
from unittest.mock import patch
from django.core.serializers.json import DjangoJSONEncoder
from django.test import TestCase
from django.utils import timezone
from django.test import TestCase
from bookwyrm import models
import bookwyrm.models.bookwyrm_export_job as export_job
from bookwyrm.utils.tar import BookwyrmTarFile
class BookwyrmExport(TestCase):
class BookwyrmExportJob(TestCase):
"""testing user export functions"""
def setUp(self):
@ -42,6 +43,11 @@ class BookwyrmExport(TestCase):
preferred_timezone="America/Los Angeles",
default_post_privacy="followers",
)
avatar_path = pathlib.Path(__file__).parent.joinpath(
"../../static/images/default_avi.jpg"
)
with open(avatar_path, "rb") as avatar_file:
self.local_user.avatar.save("mouse-avatar.jpg", avatar_file)
self.rat_user = models.User.objects.create_user(
"rat", "rat@rat.rat", "ratword", local=True, localname="rat"
@ -87,6 +93,13 @@ class BookwyrmExport(TestCase):
title="Example Edition", parent_work=self.work
)
# edition cover
cover_path = pathlib.Path(__file__).parent.joinpath(
"../../static/images/default_avi.jpg"
)
with open(cover_path, "rb") as cover_file:
self.edition.cover.save("tèst.jpg", cover_file)
self.edition.authors.add(self.author)
# readthrough
@ -139,91 +152,105 @@ class BookwyrmExport(TestCase):
book=self.edition,
)
def test_json_export_user_settings(self):
"""Test the json export function for basic user info"""
data = export_job.json_export(self.local_user)
user_data = json.loads(data)
self.assertEqual(user_data["preferredUsername"], "mouse")
self.assertEqual(user_data["name"], "Mouse")
self.assertEqual(user_data["summary"], "<p>I'm a real bookmouse</p>")
self.assertEqual(user_data["manuallyApprovesFollowers"], False)
self.assertEqual(user_data["hideFollows"], False)
self.assertEqual(user_data["discoverable"], True)
self.assertEqual(user_data["settings"]["show_goal"], False)
self.assertEqual(user_data["settings"]["show_suggested_users"], False)
self.job = models.BookwyrmExportJob.objects.create(user=self.local_user)
# run the first stage of the export
with patch("bookwyrm.models.bookwyrm_export_job.create_archive_task.delay"):
models.bookwyrm_export_job.create_export_json_task(job_id=self.job.id)
self.job.refresh_from_db()
def test_add_book_to_user_export_job(self):
"""does AddBookToUserExportJob ...add the book to the export?"""
self.assertIsNotNone(self.job.export_json["books"])
self.assertEqual(len(self.job.export_json["books"]), 1)
book = self.job.export_json["books"][0]
self.assertEqual(book["work"]["id"], self.work.remote_id)
self.assertEqual(len(book["authors"]), 1)
self.assertEqual(len(book["shelves"]), 1)
self.assertEqual(len(book["lists"]), 1)
self.assertEqual(len(book["comments"]), 1)
self.assertEqual(len(book["reviews"]), 1)
self.assertEqual(len(book["quotations"]), 1)
self.assertEqual(len(book["readthroughs"]), 1)
self.assertEqual(book["edition"]["id"], self.edition.remote_id)
self.assertEqual(
user_data["settings"]["preferred_timezone"], "America/Los Angeles"
)
self.assertEqual(user_data["settings"]["default_post_privacy"], "followers")
def test_json_export_extended_user_data(self):
"""Test the json export function for other non-book user info"""
data = export_job.json_export(self.local_user)
json_data = json.loads(data)
# goal
self.assertEqual(len(json_data["goals"]), 1)
self.assertEqual(json_data["goals"][0]["goal"], 128937123)
self.assertEqual(json_data["goals"][0]["year"], timezone.now().year)
self.assertEqual(json_data["goals"][0]["privacy"], "followers")
# saved lists
self.assertEqual(len(json_data["saved_lists"]), 1)
self.assertEqual(json_data["saved_lists"][0], "https://local.lists/9999")
# follows
self.assertEqual(len(json_data["follows"]), 1)
self.assertEqual(json_data["follows"][0], "https://your.domain.here/user/rat")
# blocked users
self.assertEqual(len(json_data["blocks"]), 1)
self.assertEqual(json_data["blocks"][0], "https://your.domain.here/user/badger")
def test_json_export_books(self):
"""Test the json export function for extended user info"""
data = export_job.json_export(self.local_user)
json_data = json.loads(data)
start_date = json_data["books"][0]["readthroughs"][0]["start_date"]
self.assertEqual(len(json_data["books"]), 1)
self.assertEqual(json_data["books"][0]["edition"]["title"], "Example Edition")
self.assertEqual(len(json_data["books"][0]["authors"]), 1)
self.assertEqual(json_data["books"][0]["authors"][0]["name"], "Sam Zhu")
self.assertEqual(
f'"{start_date}"', DjangoJSONEncoder().encode(self.readthrough_start)
book["edition"]["cover"]["url"], f"images/{self.edition.cover.name}"
)
self.assertEqual(json_data["books"][0]["shelves"][0]["name"], "Read")
def test_start_export_task(self):
"""test saved list task saves initial json and data"""
self.assertIsNotNone(self.job.export_data)
self.assertIsNotNone(self.job.export_json)
self.assertEqual(self.job.export_json["name"], self.local_user.name)
self.assertEqual(len(json_data["books"][0]["lists"]), 1)
self.assertEqual(json_data["books"][0]["lists"][0]["name"], "My excellent list")
def test_export_saved_lists_task(self):
"""test export_saved_lists_task adds the saved lists"""
self.assertIsNotNone(self.job.export_json["saved_lists"])
self.assertEqual(
json_data["books"][0]["lists"][0]["list_item"]["book"],
self.edition.remote_id,
self.edition.id,
self.job.export_json["saved_lists"][0], self.saved_list.remote_id
)
self.assertEqual(len(json_data["books"][0]["reviews"]), 1)
self.assertEqual(len(json_data["books"][0]["comments"]), 1)
self.assertEqual(len(json_data["books"][0]["quotations"]), 1)
def test_export_follows_task(self):
"""test export_follows_task adds the follows"""
self.assertIsNotNone(self.job.export_json["follows"])
self.assertEqual(self.job.export_json["follows"][0], self.rat_user.remote_id)
self.assertEqual(json_data["books"][0]["reviews"][0]["name"], "my review")
self.assertEqual(
json_data["books"][0]["reviews"][0]["content"], "<p>awesome</p>"
)
self.assertEqual(json_data["books"][0]["reviews"][0]["rating"], 5.0)
def test_export_blocks_task(self):
    """the exported blocks should start with badger_user's remote id"""
    exported = self.job.export_json
    self.assertIsNotNone(exported["blocks"])
    first_block = exported["blocks"][0]
    self.assertEqual(first_block, self.badger_user.remote_id)
self.assertEqual(
json_data["books"][0]["comments"][0]["content"], "<p>ok so far</p>"
)
self.assertEqual(json_data["books"][0]["comments"][0]["progress"], 15)
self.assertEqual(json_data["books"][0]["comments"][0]["progress_mode"], "PG")
def test_export_reading_goals_task(self):
    """the exported goals should contain the expected reading goal value"""
    exported = self.job.export_json
    self.assertIsNotNone(exported["goals"])
    first_goal = exported["goals"][0]
    self.assertEqual(first_goal["goal"], 128937123)
def test_json_export(self):
    """test json_export job adds settings"""
    self.assertIsNotNone(self.job.export_json["settings"])
    # every remaining check reads from the exported settings
    exported_settings = self.job.export_json["settings"]
    self.assertFalse(exported_settings["show_goal"])
    self.assertEqual(
        exported_settings["preferred_timezone"],
        "America/Los Angeles",
    )
    self.assertEqual(exported_settings["default_post_privacy"], "followers")
    self.assertFalse(exported_settings["show_suggested_users"])
def test_get_books_for_user(self):
    """get_books_for_user should return the single expected edition"""
    books = models.bookwyrm_export_job.get_books_for_user(self.local_user)
    self.assertEqual(len(books), 1)
    first_book = books[0]
    self.assertEqual(first_book.title, "Example Edition")
def test_archive(self):
    """create the TAR file and check the json, avatar and cover inside it"""
    models.bookwyrm_export_job.create_archive_task(job_id=self.job.id)
    self.job.refresh_from_db()

    with self.job.export_data.open("rb") as tar_file:
        with BookwyrmTarFile.open(mode="r", fileobj=tar_file) as tar:
            json_member = tar.extractfile("archive.json")
            archived = json.load(json_member)

            # the archived JSON must match what the job stored
            self.assertEqual(archived, self.job.export_json)

            # the avatar is looked up at the path the archived JSON gives
            with self.local_user.avatar.open() as expected_avatar:
                avatar_member = tar.extractfile(archived["icon"]["url"])
                self.assertEqual(expected_avatar.read(), avatar_member.read())

            # the edition cover is looked up the same way
            with self.edition.cover.open() as expected_cover:
                cover_path = archived["books"][0]["edition"]["cover"]["url"]
                cover_member = tar.extractfile(cover_path)
                self.assertEqual(expected_cover.read(), cover_member.read())

View file

@ -42,7 +42,7 @@ class ExportUserViews(TestCase):
request = self.factory.post("")
request.user = self.local_user
with patch("bookwyrm.models.bookwyrm_export_job.start_export_task.delay"):
with patch("bookwyrm.models.bookwyrm_export_job.BookwyrmExportJob.start_job"):
export = views.ExportUser.as_view()(request)
self.assertIsInstance(export, HttpResponse)
self.assertEqual(export.status_code, 302)

View file

@ -1,5 +1,6 @@
"""manage tar files for user exports"""
import io
import os
import tarfile
from typing import Any, Optional
from uuid import uuid4
@ -17,20 +18,20 @@ class BookwyrmTarFile(tarfile.TarFile):
self.addfile(info, fileobj=buffer)
def add_image(
    self, image: Any, filename: Optional[str] = None, directory: str = ""
) -> None:
    """
    Add an image to the tar archive

    :param image: file-like object with ``name`` and ``size`` attributes
    :param str filename: overrides the file name set by image; the extension
        of ``image.name`` is appended to it
    :param str directory: the directory in the archive to put the image
    """
    if filename is None:
        dst_filename = image.name
    else:
        # keep the image's own extension (splitext includes the leading dot)
        dst_filename = filename + os.path.splitext(image.name)[1]
    # joining with an empty directory leaves the file name unchanged
    dst_path = os.path.join(directory, dst_filename)

    info = tarfile.TarInfo(name=dst_path)
    info.size = image.size

    self.addfile(info, fileobj=image)
@ -43,7 +44,7 @@ class BookwyrmTarFile(tarfile.TarFile):
def write_image_to_file(self, filename: str, file_field: Any) -> None:
    """
    Read an image out of the tar archive and save it to a file field

    :param str filename: name of the archive member to extract
    :param file_field: object whose ``save(name, content)`` stores the image
    """
    # splitext keeps the leading dot and yields "" when there is no extension
    extension = os.path.splitext(filename)[1]
    if buf := self.extractfile(filename):
        # save under a fresh random name, keeping only the original extension
        filename = str(uuid4()) + extension
        file_field.save(filename, File(buf))

View file

@ -9,7 +9,7 @@ from django.views.decorators.http import require_POST
from bookwyrm import models
from bookwyrm.views.helpers import redirect_to_referer
from bookwyrm.settings import PAGE_LENGTH, USE_S3
from bookwyrm.settings import PAGE_LENGTH, USE_AZURE
# pylint: disable=no-self-use
@ -59,7 +59,7 @@ class ImportList(View):
"import_size_limit": site_settings.import_size_limit,
"import_limit_reset": site_settings.import_limit_reset,
"user_import_time_limit": site_settings.user_import_time_limit,
"use_s3": USE_S3,
"use_azure": USE_AZURE,
}
return TemplateResponse(request, "settings/imports/imports.html", data)

View file

@ -6,16 +6,19 @@ import io
from django.contrib.auth.decorators import login_required
from django.core.paginator import Paginator
from django.db.models import Q
from django.http import HttpResponse
from django.http import HttpResponse, HttpResponseServerError, Http404
from django.template.response import TemplateResponse
from django.utils import timezone
from django.views import View
from django.urls import reverse
from django.utils.decorators import method_decorator
from django.shortcuts import redirect
from bookwyrm import models
from storages.backends.s3boto3 import S3Boto3Storage
from bookwyrm import models, storage_backends
from bookwyrm.models.bookwyrm_export_job import BookwyrmExportJob
from bookwyrm.settings import PAGE_LENGTH
from bookwyrm import settings
# pylint: disable=no-self-use,too-many-locals
@ -144,25 +147,53 @@ class Export(View):
# pylint: disable=no-self-use
@method_decorator(login_required, name="dispatch")
class ExportUser(View):
"""Let users export user data to import into another Bookwyrm instance"""
"""
Let users request and download an archive of user data to import into
another Bookwyrm instance.
"""
user_jobs = None
def setup(self, request, *args, **kwargs):
    """store the requesting user's export jobs, newest first, on the view"""
    super().setup(request, *args, **kwargs)
    own_jobs = BookwyrmExportJob.objects.filter(user=request.user)
    self.user_jobs = own_jobs.order_by("-created_date")
def new_export_blocked_until(self):
    """
    when the user may next request an export

    Returns the time until which new exports are blocked, or None when a
    new export may be requested now.
    """
    most_recent = self.user_jobs.first()
    if not most_recent:
        return None

    site = models.SiteSettings.objects.get()
    cooldown = timedelta(hours=site.user_import_time_limit)
    unblocked_at = most_recent.created_date + cooldown
    if unblocked_at > timezone.now():
        return unblocked_at
    return None
def get(self, request):
"""Request tar file"""
jobs = BookwyrmExportJob.objects.filter(user=request.user).order_by(
"-created_date"
)
site = models.SiteSettings.objects.get()
hours = site.user_import_time_limit
allowed = (
jobs.first().created_date < timezone.now() - timedelta(hours=hours)
if jobs.first()
else True
)
next_available = (
jobs.first().created_date + timedelta(hours=hours) if not allowed else False
)
paginated = Paginator(jobs, PAGE_LENGTH)
exports = []
for job in self.user_jobs:
export = {"job": job}
if job.export_data:
try:
export["size"] = job.export_data.size
export["url"] = reverse("prefs-export-file", args=[job.task_id])
except FileNotFoundError:
# file no longer exists locally
export["unavailable"] = True
except Exception: # pylint: disable=broad-except
# file no longer exists on storage backend
export["unavailable"] = True
exports.append(export)
next_available = self.new_export_blocked_until()
paginated = Paginator(exports, settings.PAGE_LENGTH)
page = paginated.get_page(request.GET.get("page"))
data = {
"jobs": page,
@ -175,7 +206,9 @@ class ExportUser(View):
return TemplateResponse(request, "preferences/export-user.html", data)
def post(self, request):
"""Download the json file of a user's data"""
"""Trigger processing of a new user export file"""
if self.new_export_blocked_until() is not None:
return HttpResponse(status=429) # Too Many Requests
job = BookwyrmExportJob.objects.create(user=request.user)
job.start_job()
@ -187,9 +220,27 @@ class ExportUser(View):
class ExportArchive(View):
    """Serve the archive file"""

    def get(self, request, archive_id):
        """
        Download a user export file.

        Redirects to a signed url when the archive is kept in S3 export
        storage, and returns the file in the response body when it is kept
        in local export storage. Raises Http404 when the job or its file
        cannot be found, and returns a server error response when the
        storage backend is neither of the two.
        """
        try:
            export = BookwyrmExportJob.objects.get(
                task_id=archive_id, user=request.user
            )
        except BookwyrmExportJob.DoesNotExist as err:
            # unknown id, or a job that belongs to a different user
            raise Http404() from err

        export_storage = export.export_data.storage

        if isinstance(export_storage, storage_backends.ExportsS3Storage):
            # make custom_domain None so we can sign the url
            # see https://github.com/jschneier/django-storages/issues/944
            storage = S3Boto3Storage(querystring_auth=True, custom_domain=None)
            try:
                url = storage.url(
                    f"/exports/{export.task_id}.tar.gz",
                    expire=settings.S3_SIGNED_URL_EXPIRY,
                )
            except Exception as err:  # pylint: disable=broad-except
                # any failure to build the signed url is reported as not found
                raise Http404() from err
            return redirect(url)

        if isinstance(export_storage, storage_backends.ExportsFileStorage):
            try:
                return HttpResponse(
                    export.export_data,
                    content_type="application/gzip",
                    headers={
                        "Content-Disposition": 'attachment; filename="bookwyrm-account-export.tar.gz"'  # pylint: disable=line-too-long
                    },
                )
            except FileNotFoundError as err:
                raise Http404() from err

        # the file is stored on a backend this view does not know how to serve
        return HttpResponseServerError()

View file

@ -29,6 +29,7 @@ services:
- .:/app
- static_volume:/app/static
- media_volume:/app/images
- exports_volume:/app/exports
depends_on:
- db
- celery_worker
@ -67,6 +68,7 @@ services:
- .:/app
- static_volume:/app/static
- media_volume:/app/images
- exports_volume:/app/exports
depends_on:
- db
- redis_broker
@ -81,6 +83,7 @@ services:
- .:/app
- static_volume:/app/static
- media_volume:/app/images
- exports_volume:/app/exports
depends_on:
- celery_worker
restart: on-failure
@ -109,6 +112,7 @@ volumes:
pgdata:
static_volume:
media_volume:
exports_volume:
redis_broker_data:
redis_activity_data:
networks:

0
exports/.gitkeep Normal file
View file

View file

@ -38,6 +38,7 @@ qrcode==7.3.1
redis==4.5.4
requests==2.31.0
responses==0.22.0
s3-tar==0.1.13
setuptools>=65.5.1 # Not a direct dependency, pinned to get a security fix
tornado==6.3.3 # Not a direct dependency, pinned to get a security fix