Mirror of https://github.com/bookwyrm-social/bookwyrm.git, synced 2025-01-11 17:55:37 +00:00
various fixes

- use signed url for s3 downloads
- re-arrange tar.gz file to match original
- delete all working files after tarring
- import from s3 export

TODO:
- check local export and import
- fix error when avatar missing
- deal with multiple s3 storage options (e.g. Azure)
parent 26c37de2d4 · commit 2bb9a85591
7 changed files with 114 additions and 56 deletions
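The headline change is download delivery: completed S3 exports are now fetched straight from the bucket via a time-limited signed URL instead of being streamed through the app. A minimal sketch of the signing pattern the diffs below rely on, using django-storages' S3 backend (the key name is illustrative, not from this commit):

```python
from storages.backends.s3boto3 import S3Boto3Storage

# custom_domain must be None, otherwise django-storages skips query-string
# signing (see https://github.com/jschneier/django-storages/issues/944)
storage = S3Boto3Storage(querystring_auth=True, custom_domain=None)

# pre-signed URL, valid for `expire` seconds (900s = 15 minutes)
url = storage.url("exports/<task_id>.tar.gz", expire=900)
```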
.env.example

```diff
@@ -81,6 +81,7 @@ AWS_SECRET_ACCESS_KEY=
 # AWS_S3_CUSTOM_DOMAIN=None # "example-bucket-name.s3.fr-par.scw.cloud"
 # AWS_S3_REGION_NAME=None # "fr-par"
 # AWS_S3_ENDPOINT_URL=None # "https://s3.fr-par.scw.cloud"
+# S3_ENDPOINT_URL=None # same as AWS_S3_ENDPOINT_URL - needed for non-AWS for user exports
 
 # Commented are example values if you use Azure Blob Storage
 # USE_AZURE=true
```
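`S3_ENDPOINT_URL` mirrors `AWS_S3_ENDPOINT_URL` because the user-export path talks to the bucket through boto3/s3-tar directly, and S3-compatible providers other than AWS need the endpoint spelled out. A sketch of what that looks like at the boto3 level (env var names as above; this is not code from the commit):

```python
import os
import boto3

# non-AWS S3-compatible stores (Scaleway, MinIO, ...) need an explicit
# endpoint; for AWS itself endpoint_url can simply be left as None
session = boto3.session.Session()
s3 = session.client("s3", endpoint_url=os.environ.get("S3_ENDPOINT_URL"))
s3.head_bucket(Bucket=os.environ["AWS_STORAGE_BUCKET_NAME"])
```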
bookwyrm/migrations/…

```diff
@@ -1,4 +1,4 @@
-# Generated by Django 3.2.23 on 2024-01-14 00:55
+# Generated by Django 3.2.23 on 2024-01-28 02:49
 
 import bookwyrm.storage_backends
 import django.core.serializers.json
@@ -9,7 +9,7 @@ import django.db.models.deletion
 class Migration(migrations.Migration):
 
     dependencies = [
-        ("bookwyrm", "0191_merge_20240102_0326"),
+        ("bookwyrm", "0192_sitesettings_user_exports_enabled"),
     ]
 
     operations = [
```
bookwyrm/models/bookwyrm_export_job.py

```diff
@@ -5,6 +5,7 @@ import logging
 from uuid import uuid4
 
 from s3_tar import S3Tar
+from storages.backends.s3boto3 import S3Boto3Storage
 
 from django.db.models import CASCADE, BooleanField, FileField, ForeignKey, JSONField
 from django.db.models import Q
@@ -57,7 +58,6 @@ class BookwyrmExportJob(ParentJob):
 
         if not self.complete and self.has_completed:
             if not self.json_completed:
-
                 try:
                     self.json_completed = True
                     self.save(update_fields=["json_completed"])
@@ -193,8 +193,7 @@ class AddFileToTar(ChildJob):
 
         # NOTE we are doing this all in one big job, which has the potential to block a thread
         # This is because we need to refer to the same s3_job or BookwyrmTarFile whilst writing
-        # Alternatives using a series of jobs in a loop would be beter
-        # but Hugh couldn't make that work
+        # Using a series of jobs in a loop would be better if possible
 
         try:
             export_data = self.parent_export_job.export_data
@@ -203,29 +202,41 @@ class AddFileToTar(ChildJob):
             user = self.parent_export_job.user
             editions = get_books_for_user(user)
 
+            # filenames for later
+            export_data_original = str(export_data)
+            filename = str(self.parent_export_job.task_id)
+
             if settings.USE_S3:
                 s3_job = S3Tar(
                     settings.AWS_STORAGE_BUCKET_NAME,
-                    f"exports/{str(self.parent_export_job.task_id)}.tar.gz",
+                    f"exports/{filename}.tar.gz",
                 )
 
-                # TODO: will need to get it to the user
-                # from this secure part of the bucket
-                export_data.save("archive.json", ContentFile(json_data.encode("utf-8")))
+                # save json file
+                export_data.save(
+                    f"archive_{filename}.json", ContentFile(json_data.encode("utf-8"))
+                )
                 s3_job.add_file(f"exports/{export_data.name}")
-                s3_job.add_file(f"images/{user.avatar.name}", folder="avatar")
+
+                # save image file
+                file_type = user.avatar.name.rsplit(".", maxsplit=1)[-1]
+                export_data.save(f"avatar_{filename}.{file_type}", user.avatar)
+                s3_job.add_file(f"exports/{export_data.name}")
 
                 for book in editions:
                     if getattr(book, "cover", False):
                         cover_name = f"images/{book.cover.name}"
                         s3_job.add_file(cover_name, folder="covers")
 
                 s3_job.tar()
-                # delete export json as soon as it's tarred
-                # TODO: there is probably a better way to do this
-                # Currently this merely makes the file empty even though
-                # we're using save=False
-                export_data.delete(save=False)
+
+                # delete child files - we don't need them any more
+                s3_storage = S3Boto3Storage(querystring_auth=True, custom_domain=None)
+                S3Boto3Storage.delete(s3_storage, f"exports/{export_data_original}")
+                S3Boto3Storage.delete(s3_storage, f"exports/archive_{filename}.json")
+                S3Boto3Storage.delete(
+                    s3_storage, f"exports/avatar_{filename}.{file_type}"
+                )
 
             else:
                 export_data.open("wb")
```
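The tar assembly above leans on the s3-tar library: objects already uploaded to the bucket are registered by key and combined into a single `.tar.gz`, largely without pulling the files down to the worker. A condensed sketch of the same calls (bucket and key names illustrative):

```python
from s3_tar import S3Tar

# the target archive is written back into the same bucket
job = S3Tar("bookwyrm-bucket", "exports/<task_id>.tar.gz")

job.add_file("exports/archive_<task_id>.json")            # stored at the tar root
job.add_file("images/covers/cover.jpg", folder="covers")  # stored under covers/
job.tar()  # assemble exports/<task_id>.tar.gz in S3
```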
bookwyrm/models/bookwyrm_export_job.py (continued)

```diff
@@ -266,7 +277,14 @@ def start_export_task(**kwargs):
 
         # prepare the initial file and base json
         job.export_data = ContentFile(b"", str(uuid4()))
+        # BUG: this throws a MISSING class error if there is no avatar
+        # #3096 may fix it
+        if not job.user.avatar:
+            job.user.avatar = ""
+            job.user.save()
+
         job.export_json = job.user.to_activity()
+        logger.info(job.export_json)
         job.save(update_fields=["export_data", "export_json"])
 
         # let's go
```
bookwyrm/models/bookwyrm_import_job.py

```diff
@@ -42,20 +42,23 @@ def start_import_task(**kwargs):
     try:
         archive_file.open("rb")
         with BookwyrmTarFile.open(mode="r:gz", fileobj=archive_file) as tar:
-            job.import_data = json.loads(tar.read("archive.json").decode("utf-8"))
+            json_filename = next(
+                filter(lambda n: n.startswith("archive"), tar.getnames())
+            )
+            job.import_data = json.loads(tar.read(json_filename).decode("utf-8"))
 
             if "include_user_profile" in job.required:
                 update_user_profile(job.user, tar, job.import_data)
             if "include_user_settings" in job.required:
                 update_user_settings(job.user, job.import_data)
             if "include_goals" in job.required:
-                update_goals(job.user, job.import_data.get("goals"))
+                update_goals(job.user, job.import_data.get("goals", []))
             if "include_saved_lists" in job.required:
-                upsert_saved_lists(job.user, job.import_data.get("saved_lists"))
+                upsert_saved_lists(job.user, job.import_data.get("saved_lists", []))
             if "include_follows" in job.required:
-                upsert_follows(job.user, job.import_data.get("follows"))
+                upsert_follows(job.user, job.import_data.get("follows", []))
             if "include_blocks" in job.required:
-                upsert_user_blocks(job.user, job.import_data.get("blocks"))
+                upsert_user_blocks(job.user, job.import_data.get("blocks", []))
 
             process_books(job, tar)
 
```
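Since the export JSON is now named `archive_<task_id>.json` rather than a fixed `archive.json`, the importer looks the member up by prefix. Note that `next()` on an exhausted `filter` raises `StopIteration` when no member matches; a slightly more defensive version of the same lookup, written against the standard `tarfile` API (`tar.read(...)` above appears to be a helper on bookwyrm's own `BookwyrmTarFile`):

```python
import json
import tarfile

def read_export_json(tar: tarfile.TarFile) -> dict:
    """locate and parse the archive_*.json member of an export tarball"""
    name = next((n for n in tar.getnames() if n.startswith("archive")), None)
    if name is None:
        raise ValueError("export tarball contains no archive*.json member")
    member = tar.extractfile(name)
    return json.loads(member.read().decode("utf-8"))
```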
bookwyrm/templates/preferences/export-user.html

```diff
@@ -92,25 +92,25 @@
         </td>
       </tr>
     {% endif %}
-    {% for job in jobs %}
+    {% for export in jobs %}
       <tr>
-        <td>{{ job.updated_date }}</td>
+        <td>{{ export.job.updated_date }}</td>
         <td>
           <span
-            {% if job.status == "stopped" or job.status == "failed" %}
+            {% if export.job.status == "stopped" or export.job.status == "failed" %}
               class="tag is-danger"
-            {% elif job.status == "pending" %}
+            {% elif export.job.status == "pending" %}
               class="tag is-warning"
-            {% elif job.complete %}
+            {% elif export.job.complete %}
               class="tag"
             {% else %}
               class="tag is-success"
             {% endif %}
           >
-            {% if job.status %}
-              {{ job.status }}
-              {{ job.status_display }}
-            {% elif job.complete %}
+            {% if export.job.status %}
+              {{ export.job.status }}
+              {{ export.job.status_display }}
+            {% elif export.job.complete %}
               {% trans "Complete" %}
             {% else %}
               {% trans "Active" %}
@@ -118,18 +118,30 @@
           </span>
         </td>
         <td>
-          <span>{{ job.export_data|get_file_size }}</span>
+          <span>{{ export.size|get_file_size }}</span>
         </td>
         <td>
-          {% if job.complete and not job.status == "stopped" and not job.status == "failed" %}
-            <p>
-              <a download="" href="/preferences/user-export/{{ job.task_id }}">
-                <span class="icon icon-download" aria-hidden="true"></span>
-                <span class="is-hidden-mobile">
-                  {% trans "Download your export" %}
-                </span>
-              </a>
-            </p>
+          {% if export.job.complete and not export.job.status == "stopped" and not export.job.status == "failed" %}
+            {% if export.url %}
+              <p>
+                <a href="{{ export.url }}">
+                  <span class="icon icon-download" aria-hidden="true"></span>
+                  <span class="is-hidden-mobile">
+                    {% trans "Download your export" %}
+                  </span>
+                </a>
+              </p>
+            {% else %}
+              <p>
+                <a download="" href="/preferences/user-export/{{ export.job.task_id }}">
+                  <span class="icon icon-download" aria-hidden="true"></span>
+                  <span class="is-hidden-mobile">
+                    {% trans "Download your export" %}
+                  </span>
+                </a>
+              </p>
+            {% endif %}
+
           {% endif %}
         </td>
       </tr>
```
bookwyrm/templatetags/utilities.py

```diff
@@ -130,23 +130,17 @@ def id_to_username(user_id):
 
 
 @register.filter(name="get_file_size")
-def get_file_size(file):
+def get_file_size(raw_size):
     """display the size of a file in human readable terms"""
 
     try:
-        # TODO: this obviously isn't a proper solution
-        # boto storages do not implement 'path'
-        if not USE_S3:
-            raw_size = os.stat(file.path).st_size
-            if raw_size < 1024:
-                return f"{raw_size} bytes"
-            if raw_size < 1024**2:
-                return f"{raw_size/1024:.2f} KB"
-            if raw_size < 1024**3:
-                return f"{raw_size/1024**2:.2f} MB"
-            return f"{raw_size/1024**3:.2f} GB"
-
-        return ""
+        if raw_size < 1024:
+            return f"{raw_size} bytes"
+        if raw_size < 1024**2:
+            return f"{raw_size/1024:.2f} KB"
+        if raw_size < 1024**3:
+            return f"{raw_size/1024**2:.2f} MB"
+        return f"{raw_size/1024**3:.2f} GB"
 
     except Exception as error:  # pylint: disable=broad-except
         print(error)
```
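The rewritten filter takes a raw byte count instead of a file object, so the same template code works for local files and S3 objects alike (the view now supplies the size either way). The thresholds are powers of 1024; a quick usage check of the conversion logic:

```python
def get_file_size(raw_size):
    """display the size of a file in human readable terms"""
    if raw_size < 1024:
        return f"{raw_size} bytes"
    if raw_size < 1024**2:
        return f"{raw_size/1024:.2f} KB"
    if raw_size < 1024**3:
        return f"{raw_size/1024**2:.2f} MB"
    return f"{raw_size/1024**3:.2f} GB"

assert get_file_size(512) == "512 bytes"
assert get_file_size(2048) == "2.00 KB"
assert get_file_size(3 * 1024**2) == "3.00 MB"
```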
bookwyrm/views/preferences/export.py

```diff
@@ -13,9 +13,11 @@ from django.views import View
 from django.utils.decorators import method_decorator
 from django.shortcuts import redirect
 
+from storages.backends.s3boto3 import S3Boto3Storage
+
 from bookwyrm import models
 from bookwyrm.models.bookwyrm_export_job import BookwyrmExportJob
-from bookwyrm.settings import PAGE_LENGTH
+from bookwyrm import settings
 
 
 # pylint: disable=no-self-use,too-many-locals
@@ -152,6 +154,34 @@ class ExportUser(View):
         jobs = BookwyrmExportJob.objects.filter(user=request.user).order_by(
             "-created_date"
         )
+
+        exports = []
+        for job in jobs:
+            export = {"job": job}
+
+            if settings.USE_S3:
+                # make custom_domain None so we can sign the url (https://github.com/jschneier/django-storages/issues/944)
+                storage = S3Boto3Storage(querystring_auth=True, custom_domain=None)
+
+                # for s3 we download directly from s3, so we need a signed url
+                export["url"] = S3Boto3Storage.url(
+                    storage, f"/exports/{job.task_id}.tar.gz", expire=900
+                )  # temporarily downloadable file, expires after 15 minutes
+
+                # for s3 we create a new tar file in s3, so we need to check the size of _that_ file
+                try:
+                    export["size"] = S3Boto3Storage.size(
+                        storage, f"exports/{job.task_id}.tar.gz"
+                    )
+                except Exception:
+                    export["size"] = 0
+
+            else:
+                # for local storage export_data is the tar file
+                export["size"] = job.export_data.size if job.export_data else 0
+
+            exports.append(export)
+
         site = models.SiteSettings.objects.get()
         hours = site.user_import_time_limit
         allowed = (
```
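One stylistic note: `S3Boto3Storage.url(storage, ...)` and `S3Boto3Storage.size(storage, ...)` call the methods unbound, passing the instance explicitly. The equivalent instance calls read more naturally; a sketch (key name illustrative):

```python
from storages.backends.s3boto3 import S3Boto3Storage

storage = S3Boto3Storage(querystring_auth=True, custom_domain=None)
key = "exports/<task_id>.tar.gz"

url = storage.url(key, expire=900)  # signed, valid 15 minutes
try:
    size = storage.size(key)        # size in bytes, raises if the key is absent
except Exception:
    size = 0
```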
bookwyrm/views/preferences/export.py (continued)

```diff
@@ -162,7 +192,7 @@ class ExportUser(View):
         next_available = (
             jobs.first().created_date + timedelta(hours=hours) if not allowed else False
         )
-        paginated = Paginator(jobs, PAGE_LENGTH)
+        paginated = Paginator(exports, settings.PAGE_LENGTH)
         page = paginated.get_page(request.GET.get("page"))
         data = {
             "jobs": page,
```