Media proxy, caching and tuning docs

Fixes #67
2024-11-21 23:01:00 +00:00 · 2022-12-10 12:16:08 -07:00 · 2022-12-10 12:16:08 -07:00 · 3595af7bd2
commit 3595af7bd2
parent 9a978786d4
12 changed files with 285 additions and 20 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,6 +8,7 @@
 .vscode
 /*.env
 /build
+/cache/
 /docs/_build
 /media/
 /static-collected
--- a/activities/models/post_attachment.py
+++ b/activities/models/post_attachment.py
@ -77,13 +77,13 @@ class PostAttachment(StatorModel):
        elif self.file:
            return self.file.url
        else:
-            return self.remote_url
+            return f"/proxy/post_attachment/{self.pk}/"

    def full_url(self):
        if self.file:
            return self.file.url
        else:
-            return self.remote_url
+            return f"/proxy/post_attachment/{self.pk}/"

    ### ActivityPub ###

--- a/activities/views/posts.py
+++ b/activities/views/posts.py
@ -3,6 +3,7 @@ from django.db import models
 from django.http import JsonResponse
 from django.shortcuts import get_object_or_404, redirect, render
 from django.utils.decorators import method_decorator
+from django.views.decorators.vary import vary_on_headers
 from django.views.generic import TemplateView, View

 from activities.models import Post, PostInteraction, PostInteractionStates, PostStates
@ -15,6 +16,7 @@ from users.shortcuts import by_handle_or_404
@method_decorator(
    cache_page_by_ap_json("cache_timeout_page_post", public_only=True), name="dispatch"
 )
+@method_decorator(vary_on_headers("Accept"), name="dispatch")
 class Individual(TemplateView):

    template_name = "activities/post.html"
--- a/docs/installation.rst
+++ b/docs/installation.rst
@ -252,9 +252,8 @@ You should select the "Domains" link in the sidebar and create one, and then
 you will be able to make your first identity.


-Scaling
-------
+Tuning and Scaling
+------------------

-You can run as many copies of the webserver and workers as you like; the main
-limitation will be your database server's processing power and number of
-allowed connections.
+See :doc:`/tuning` for all the things you should tweak as your server gains
+users. We recommend setting up caches early on!
--- a/docs/tuning.rst
+++ b/docs/tuning.rst
@ -5,6 +5,39 @@ This page contains a collection of tips and settings that can be used to
 tune your server based upon its users and the other servers it federates
 with.

+Scaling
+-------
+
+The only bottleneck and single point of failure in a Takahē installation is
+its database; no permanent state is stored elsewhere.
+
+Provided your database is happy (and PostgreSQL does a very good job of just
+using more resources if you give them to it), you can:
+
+* Run more webserver containers to handle a higher request load (requests
+  come from both users and other ActivityPub servers trying to forward you
+  messages). Consider setting up the DEFAULT cache under high request load, too.
+
+* Run more Stator worker containers to handle a higher processing load (Stator
+  handles pulling profiles, fanning out messages to followers, and processing
+  stats, among others). You'll generally see Stator load climb roughly in
+  relation to the sum of the number of followers each user in your instance has;
+  a "celebrity" or other popular account will give Stator a lot of work as it
+  has to send a copy of each of their posts to every follower, separately.
+
+As you scale up the number of containers, keep the PostgreSQL connection limit
+in mind; this is generally the first thing that will fail, as Stator workers in
+particular are quite connection-hungry (the parallel nature of their internal
+processing means they might be working on 50 different objects at once). It's
+generally a good idea to set it as high as your PostgreSQL server will take
+(consult PostgreSQL tuning guides for the effect changing that setting has
+on memory usage, specifically).
+
+If you end up having a large server that is running into database performance
+problems, please get in touch with us and discuss it; Takahē is young enough
+that we need data and insight from those installations to help optimise it more.
+
+
 Federating
 ----------

@ -17,22 +50,115 @@ Environment Variable:


 Caching
--------
+-------

 By default Takahē has caching disabled. The caching needs of a server can
 vary drastically based upon the number of users and how interconnected
 they are with other servers.

-Caching is configured by specifying a cache DSN in the environment variable
-``TAKAHE_CACHES_DEFAULT``. The DSN format can be any supported by
+There are multiple ways Takahē uses caches:
+
+* For caching rendered pages and responses, like user profile information.
+  These caches reduce database load on your server and improve performance.
+
+* For proxying and caching remote user images and post images. These must be
+  proxied to protect your users' privacy; also caching these reduces
+  your server's consumed bandwidth and improves users' loading times.
+
+The exact caches you can configure are:
+
+* ``TAKAHE_CACHES_DEFAULT``: Rendered page and response caching
+
+* ``TAKAHE_CACHES_MEDIA``: Remote post images and user profile header pictures
+
+* ``TAKAHE_CACHES_AVATARS``: Remote user avatars ("icons") only
+
+We recommend you set up ``TAKAHE_CACHES_MEDIA`` and ``TAKAHE_CACHES_AVATARS``
+at a bare minimum - proxying these all the time without caching will eat into
+your server's bandwidth.
+
+All caches are configured the same way - with a custom cache URI/URL. We
+support anything that is available as part of
 `django-cache-url <https://github.com/epicserve/django-cache-url>`_, but
 some cache backends will require additional Python packages not installed
-by default with Takahē.
+by default with Takahē. Each backend is discussed in more detail below.

-**Examples**
+All items in the cache come with an expiry set - usually one week - but you
+can also configure a maximum cache size on dedicated cache datastores like
+Memcache. The key names used by the caches do not overlap, so there is
+no need to configure different key prefixes for each of Takahē's caches.

-* LocMem cache for a small server: ``locmem://default``
-* Memcache cache for a service named ``memcache``  in a docker compose file:
-  ``memcached://memcache:11211?key_prefix=takahe``
-* Multiple memcache cache servers:
-  ``memcached://server1:11211,server2:11211``
+
+Backends
+~~~~~~~~
+
+Redis
+#####
+
+Examples::
+
+  redis://redis:6379/0
+  redis://user:password@redis:6379/0
+  rediss://user:password@redis:6379/0
+
+A Redis-protocol server. Use ``redis://`` for unencrypted communication and
+``rediss://`` for TLS.
+
+Redis has a large item size limit and is suitable for all caches. We recommend
+that you keep the DEFAULT cache separate from the MEDIA and AVATARS caches, and
+set the ``maxmemory`` on both to appropriate values (the proxying caches will
+need more memory than the DEFAULT cache).
+
+
+
+Memcache
+########
+
+Examples::
+
+  memcached://memcache:11211?key_prefix=takahe
+  memcached://server1:11211,server2:11211
+
+A remote Memcache-protocol server (or set of servers).
+
+Memcached has a 1MB limit per item by default, so this is only suitable for the
+DEFAULT cache and not the AVATARS or MEDIA cache.
+
+
+Filesystem
+##########
+
+Examples::
+
+  file:///var/cache/takahe/
+
+A cache on the local disk.
+
+This *will* work with any of the caches, but is probably more suitable
+for MEDIA and AVATARS.
+
+Note that if you are running Takahē in a cluster, this cache will not be shared
+across different machines. This is not quite as bad as it first seems; it just
+means you will have more potential uncached requests until all machines have
+a cached copy.
+
+
+Local Memory
+############
+
+Examples::
+
+  locmem://default
+
+A local memory cache, inside the Python process. This will consume additional
+memory for the process, and should not be used with the MEDIA or AVATARS caches.
+
+
+CDNs
+----
+
+You can use Takahē with a "read through" CDN that takes over your site's main
+domain serving and passes some requests through to Takahē as a backend.
+
+Takahē sets the appropriate ``Vary`` headers to ensure that cache leakage does
+not happen, and ``Last-Modified`` and ``ETag`` headers to allow the CDN to
+correctly expire cache items.
+
+Takahē does not yet support offloading local media URLs (such as profile images
+and post images) to a *separate* CDN URL; this will be coming in the future.
--- a/mediaproxy/__init__.py
+++ b/mediaproxy/__init__.py
--- a/mediaproxy/apps.py
+++ b/mediaproxy/apps.py
@ -0,0 +1,6 @@
+from django.apps import AppConfig
+
+
+class MediaproxyConfig(AppConfig):
+    default_auto_field = "django.db.models.BigAutoField"
+    name = "mediaproxy"
--- a/mediaproxy/views.py
+++ b/mediaproxy/views.py
@ -0,0 +1,101 @@
+import httpx
+from django.conf import settings
+from django.core.cache import caches
+from django.http import Http404, HttpResponse
+from django.shortcuts import get_object_or_404
+from django.views.generic import View
+
+from activities.models import PostAttachment
+from users.models import Identity
+
+
+class BaseCacheView(View):
+    """
+    Base class for caching remote content.
+    """
+
+    cache_name = "media"
+    item_timeout: int | None = None
+
+    def get(self, request, **kwargs):
+        self.kwargs = kwargs
+        remote_url = self.get_remote_url()
+        cache = caches[self.cache_name]
+        cache_key = "proxy_" + remote_url
+        # See if it's already cached
+        cached_content = cache.get(cache_key)
+        if not cached_content:
+            # OK, fetch and cache it
+            try:
+                remote_response = httpx.get(
+                    remote_url,
+                    headers={"User-Agent": settings.TAKAHE_USER_AGENT},
+                    follow_redirects=True,
+                    timeout=settings.SETUP.REMOTE_TIMEOUT,
+                )
+            except (httpx.ConnectError, httpx.RequestError):
+                return HttpResponse(status=502)
+            if remote_response.status_code >= 400:
+                return HttpResponse(status=502)
+            # We got it - shove it into the cache
+            cached_content = {
+                "content": remote_response.content,
+                "mimetype": remote_response.headers.get(
+                    "Content-Type", "application/octet-stream"
+                ),
+            }
+            cache.set(cache_key, cached_content, timeout=self.item_timeout)
+        return HttpResponse(
+            cached_content["content"],
+            headers={
+                "Content-Type": cached_content["mimetype"],
+            },
+        )
+
+    def get_remote_url(self):
+        raise NotImplementedError()
+
+
+class IdentityIconCacheView(BaseCacheView):
+    """
+    Caches identity icons (avatars).
+
+    Content is stored in the dedicated "avatars" cache rather than the
+    base view's "media" cache, with a one-week expiry per item.
+    """
+
+    cache_name = "avatars"
+    item_timeout = 86400 * 7  # One week
+
+    def get_remote_url(self):
+        """
+        Returns the remote icon URI of the identity named in the URL.
+
+        Raises Http404 if the identity does not exist, is local, or has
+        no remote icon URI.
+        """
+        self.identity = get_object_or_404(Identity, pk=self.kwargs["identity_id"])
+        # Check icon_uri (the value returned below), not image_uri: the header
+        # image and the icon are independent fields, and either may be unset.
+        if self.identity.local or not self.identity.icon_uri:
+            raise Http404()
+        return self.identity.icon_uri
+
+
+class IdentityImageCacheView(BaseCacheView):
+    """
+    Caches identity profile header images
+    """
+
+    item_timeout = 86400 * 7  # One week
+
+    def get_remote_url(self):
+        self.identity = get_object_or_404(Identity, pk=self.kwargs["identity_id"])
+        if self.identity.local or not self.identity.image_uri:
+            raise Http404()
+        return self.identity.image_uri
+
+
+class PostAttachmentCacheView(BaseCacheView):
+    """
+    Caches post media (images only, videos should always be offloaded to remote)
+    """
+
+    item_timeout = 86400 * 7  # One week
+
+    def get_remote_url(self):
+        self.post_attachment = get_object_or_404(
+            PostAttachment, pk=self.kwargs["attachment_id"]
+        )
+        if not self.post_attachment.is_image():
+            raise Http404()
+        return self.post_attachment.remote_url
--- a/takahe/settings.py
+++ b/takahe/settings.py
@ -118,6 +118,12 @@ class Settings(BaseSettings):
    #: Default cache backend
    CACHES_DEFAULT: CacheBackendUrl | None = None

+    #: User icon (avatar) caching backend
+    CACHES_AVATARS: CacheBackendUrl | None = None
+
+    #: Media caching backend
+    CACHES_MEDIA: CacheBackendUrl | None = None
+
    PGHOST: str | None = None
    PGPORT: int | None = 5432
    PGNAME: str = "takahe"
@ -167,6 +173,7 @@ INSTALLED_APPS = [
    "activities",
    "users",
    "stator",
+    "mediaproxy",
 ]

 MIDDLEWARE = [
@ -351,7 +358,11 @@ if SETUP.MEDIA_BACKEND:
    else:
        raise ValueError(f"Unsupported media backend {parsed.scheme}")

-CACHES = {"default": django_cache_url.parse(SETUP.CACHES_DEFAULT or "dummy://")}
+CACHES = {
+    "default": django_cache_url.parse(SETUP.CACHES_DEFAULT or "dummy://"),
+    "avatars": django_cache_url.parse(SETUP.CACHES_AVATARS or "dummy://"),
+    "media": django_cache_url.parse(SETUP.CACHES_MEDIA or "dummy://"),
+}

 if SETUP.ERROR_EMAILS:
    ADMINS = [("Admin", e) for e in SETUP.ERROR_EMAILS]
--- a/takahe/urls.py
+++ b/takahe/urls.py
@ -5,6 +5,7 @@ from django.views.static import serve

 from activities.views import compose, explore, follows, posts, search, timelines
 from core import views as core
+from mediaproxy import views as mediaproxy
 from stator import views as stator
 from users.views import activitypub, admin, auth, identity, settings

@ -176,6 +177,22 @@ urlpatterns = [
        core.FlatPage.as_view(title="Server Rules", config_option="policy_rules"),
        name="rules",
    ),
+    # Media/image proxy
+    path(
+        "proxy/identity_icon/<identity_id>/",
+        mediaproxy.IdentityIconCacheView.as_view(),
+        name="proxy_identity_icon",
+    ),
+    path(
+        "proxy/identity_image/<identity_id>/",
+        mediaproxy.IdentityImageCacheView.as_view(),
+        name="proxy_identity_image",
+    ),
+    path(
+        "proxy/post_attachment/<attachment_id>/",
+        mediaproxy.PostAttachmentCacheView.as_view(),
+        name="proxy_post_attachment",
+    ),
    # Well-known endpoints and system actor
    path(".well-known/webfinger", activitypub.Webfinger.as_view()),
    path(".well-known/host-meta", activitypub.HostMeta.as_view()),
--- a/users/models/identity.py
+++ b/users/models/identity.py
@ -153,7 +153,7 @@ class Identity(StatorModel):
        if self.icon:
            return self.icon.url
        elif self.icon_uri:
-            return self.icon_uri
+            return f"/proxy/identity_icon/{self.pk}/"
        else:
            return static("img/unknown-icon-128.png")

@ -164,7 +164,7 @@ class Identity(StatorModel):
        if self.image:
            return self.image.url
        elif self.image_uri:
-            return self.image_uri
+            return f"/proxy/identity_image/{self.pk}/"

    @property
    def safe_summary(self):
--- a/users/views/identity.py
+++ b/users/views/identity.py
@ -7,6 +7,7 @@ from django.core import validators
 from django.http import Http404, JsonResponse
 from django.shortcuts import redirect
 from django.utils.decorators import method_decorator
+from django.views.decorators.vary import vary_on_headers
 from django.views.generic import FormView, ListView, TemplateView, View

 from activities.models import Post, PostInteraction
@ -18,6 +19,7 @@ from users.models import Domain, Follow, FollowStates, Identity, IdentityStates
 from users.shortcuts import by_handle_or_404


+@method_decorator(vary_on_headers("Accept"), name="dispatch")
@method_decorator(cache_page_by_ap_json(public_only=True), name="dispatch")
 class ViewIdentity(ListView):
    """