From 50a81bdfdd52858683eb8fa31620e97e1a02c5de Mon Sep 17 00:00:00 2001 From: Wesley Aptekar-Cassels Date: Tue, 7 Mar 2023 17:30:19 -0500 Subject: [PATCH] Change CSV export to buffer instead of streaming The idea behind a streaming CSV export was to reduce the amount of memory used, by avoiding building the entire CSV file in memory before sending it to the client. However, it didn't work out this way in practice: the query objects that were created to represent each line caused Postgres to generate a very large (~200MB on bookwyrm.social) temp file, not to mention the memory being used by the Query object likely being similar to, if not larger than that used by the finalized CSV row. While we should in the long term run our CSV exports as a Celery task, this change should allow CSV exports to work on large servers without causing disk-space problems. Fixes: #2157 --- .../tests/views/preferences/test_export.py | 13 ++- bookwyrm/views/preferences/export.py | 102 ++++++++---------- 2 files changed, 51 insertions(+), 64 deletions(-) diff --git a/bookwyrm/tests/views/preferences/test_export.py b/bookwyrm/tests/views/preferences/test_export.py index a3d930f6c..fbc55a9e3 100644 --- a/bookwyrm/tests/views/preferences/test_export.py +++ b/bookwyrm/tests/views/preferences/test_export.py @@ -1,7 +1,7 @@ """ test for app action functionality """ from unittest.mock import patch -from django.http import StreamingHttpResponse +from django.http import HttpResponse from django.test import TestCase from django.test.client import RequestFactory @@ -57,13 +57,12 @@ class ExportViews(TestCase): request = self.factory.post("") request.user = self.local_user export = views.Export.as_view()(request) - self.assertIsInstance(export, StreamingHttpResponse) + self.assertIsInstance(export, HttpResponse) self.assertEqual(export.status_code, 200) - result = list(export.streaming_content) # pylint: disable=line-too-long self.assertEqual( - result[0], - b"title,author_text,remote_id,openlibrary_key,inventaire_id,librarything_key,goodreads_key,bnf_id,viaf,wikidata,asin,aasin,isfdb,isbn_10,isbn_13,oclc_number,rating,review_name,review_cw,review_content\r\n", + export.content, + b"title,author_text,remote_id,openlibrary_key,inventaire_id,librarything_key,goodreads_key,bnf_id,viaf,wikidata,asin,aasin,isfdb,isbn_10,isbn_13,oclc_number,rating,review_name,review_cw,review_content\r\nTest Book,," + + self.book.remote_id.encode("utf-8") + + b",,,,,beep,,,,,,123456789X,9781234567890,,,,,\r\n", ) - expected = f"Test Book,,{self.book.remote_id},,,,,beep,,,,,,123456789X,9781234567890,,,,,\r\n" - self.assertEqual(result[1].decode("utf-8"), expected) diff --git a/bookwyrm/views/preferences/export.py b/bookwyrm/views/preferences/export.py index 2582dda3c..c4540ba78 100644 --- a/bookwyrm/views/preferences/export.py +++ b/bookwyrm/views/preferences/export.py @@ -1,9 +1,10 @@ """ Let users export their book data """ import csv +import io from django.contrib.auth.decorators import login_required from django.db.models import Q -from django.http import StreamingHttpResponse +from django.http import HttpResponse from django.template.response import TemplateResponse from django.views import View from django.utils.decorators import method_decorator @@ -20,8 +21,8 @@ class Export(View): return TemplateResponse(request, "preferences/export.html") def post(self, request): - """Streaming the csv file of a user's book data""" - data = ( + """Download the csv file of a user's book data""" + books = ( models.Edition.viewer_aware_objects(request.user) .filter( Q(shelves__user=request.user) @@ -33,63 +34,50 @@ class Export(View): .distinct() ) - generator = csv_row_generator(data, request.user) + csv_string = io.StringIO() + writer = csv.writer(csv_string) - pseudo_buffer = Echo() - writer = csv.writer(pseudo_buffer) - # for testing, if you want to see the results in the browser: - # from django.http import JsonResponse - # return JsonResponse(list(generator), safe=False) - return StreamingHttpResponse( - (writer.writerow(row) for row in generator), + deduplication_fields = [ + f.name + for f in models.Edition._meta.get_fields() # pylint: disable=protected-access + if getattr(f, "deduplication_field", False) + ] + fields = ( + ["title", "author_text"] + + deduplication_fields + + ["rating", "review_name", "review_cw", "review_content"] + ) + writer.writerow(fields) + + for book in books: + # I think this is more efficient than doing a subquery in the view? but idk + review_rating = ( + models.Review.objects.filter( + user=request.user, book=book, rating__isnull=False + ) + .order_by("-published_date") + .first() + ) + + book.rating = review_rating.rating if review_rating else None + + review = ( + models.Review.objects.filter( + user=request.user, book=book, content__isnull=False + ) + .order_by("-published_date") + .first() + ) + if review: + book.review_name = review.name + book.review_cw = review.content_warning + book.review_content = review.raw_content + writer.writerow([getattr(book, field, "") or "" for field in fields]) + + return HttpResponse( + csv_string.getvalue(), content_type="text/csv", headers={ "Content-Disposition": 'attachment; filename="bookwyrm-export.csv"' }, ) - - -def csv_row_generator(books, user): - """generate a csv entry for the user's book""" - deduplication_fields = [ - f.name - for f in models.Edition._meta.get_fields() # pylint: disable=protected-access - if getattr(f, "deduplication_field", False) - ] - fields = ( - ["title", "author_text"] - + deduplication_fields - + ["rating", "review_name", "review_cw", "review_content"] - ) - yield fields - for book in books: - # I think this is more efficient than doing a subquery in the view? but idk - review_rating = ( - models.Review.objects.filter(user=user, book=book, rating__isnull=False) - .order_by("-published_date") - .first() - ) - - book.rating = review_rating.rating if review_rating else None - - review = ( - models.Review.objects.filter(user=user, book=book, content__isnull=False) - .order_by("-published_date") - .first() - ) - if review: - book.review_name = review.name - book.review_cw = review.content_warning - book.review_content = review.raw_content - yield [getattr(book, field, "") or "" for field in fields] - - -class Echo: - """An object that implements just the write method of the file-like - interface. (https://docs.djangoproject.com/en/3.2/howto/outputting-csv/) - """ - - # pylint: disable=no-self-use - def write(self, value): - """Write the value by returning it, instead of storing in a buffer.""" - return value