Fix inbox processing errors from pinned posts and non-Mastodon servers (#596)

If a post (interaction) comes in from AP inbox but no local author profile exists,
fetch_actor will pull in both the identity AND its pinned posts, one of which the
incoming post might be. This would cause a database integrity violation. We now
check again whether the post exists after syncing the actor.

Post processing also barfed on posts where content didn't follow Mastodon specs.
For example, Kbin sets tag names in the 'tag' attribute instead of the 'name' attribute.
This commit is contained in:
Osma Ahvenlampi 2023-07-12 18:49:30 +03:00 committed by GitHub
parent 5f49f9b2bb
commit 5549d21528
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 61 additions and 17 deletions

View file

@ -12,9 +12,11 @@ from asgiref.sync import async_to_sync
from django.contrib.postgres.indexes import GinIndex from django.contrib.postgres.indexes import GinIndex
from django.contrib.postgres.search import SearchVector from django.contrib.postgres.search import SearchVector
from django.db import models, transaction from django.db import models, transaction
from django.db.utils import IntegrityError
from django.template import loader from django.template import loader
from django.template.defaultfilters import linebreaks_filter from django.template.defaultfilters import linebreaks_filter
from django.utils import timezone from django.utils import timezone
from pyld.jsonld import JsonLdError
from activities.models.emoji import Emoji from activities.models.emoji import Emoji
from activities.models.fan_out import FanOut from activities.models.fan_out import FanOut
@ -842,6 +844,9 @@ class Post(StatorModel):
if author.domain is None: if author.domain is None:
if fetch_author: if fetch_author:
async_to_sync(author.fetch_actor)() async_to_sync(author.fetch_actor)()
# perhaps the entire "try again" logic below
# could be replaced with TryAgainLater for
# _all_ fetches, to let it handle pinned posts?
if author.domain is None: if author.domain is None:
raise TryAgainLater() raise TryAgainLater()
else: else:
@ -849,6 +854,14 @@ class Post(StatorModel):
# If the post is from a blocked domain, stop and drop # If the post is from a blocked domain, stop and drop
if author.domain.blocked: if author.domain.blocked:
raise cls.DoesNotExist("Post is from a blocked domain") raise cls.DoesNotExist("Post is from a blocked domain")
try:
# try again, because fetch_actor() also fetches pinned posts
post = cls.objects.select_related("author__domain").get(
object_uri=data["id"]
)
except cls.DoesNotExist:
# finally, create a stub
try:
post = cls.objects.create( post = cls.objects.create(
object_uri=data["id"], object_uri=data["id"],
author=author, author=author,
@ -857,14 +870,30 @@ class Post(StatorModel):
type=data["type"], type=data["type"],
) )
created = True created = True
except IntegrityError as dupe:
# there's still some kind of race condition here
# it's far more rare, but sometimes we fire an
# IntegrityError on activities_post_object_uri_key
# this transaction is now aborted and anything following
# in the caller function will fail in the database.
raise TryAgainLater() from dupe
else: else:
raise cls.DoesNotExist(f"No post with ID {data['id']}", data) raise cls.DoesNotExist(f"No post with ID {data['id']}", data)
if update or created: if update or created:
post.type = data["type"] post.type = data["type"]
if post.type in (cls.Types.article, cls.Types.question): if post.type in (cls.Types.article, cls.Types.question):
post.type_data = PostTypeData(__root__=data).__root__ post.type_data = PostTypeData(__root__=data).__root__
try:
# apparently sometimes posts (Pages?) in the fediverse
# don't have content?!
post.content = get_value_or_map(data, "content", "contentMap") post.content = get_value_or_map(data, "content", "contentMap")
post.summary = data.get("summary") except KeyError:
post.content = None
# Document types have names, not summaries
post.summary = data.get("summary") or data.get("name")
if not post.content and post.summary:
post.content = post.summary
post.summary = None
post.sensitive = data.get("sensitive", False) post.sensitive = data.get("sensitive", False)
post.url = data.get("url", data["id"]) post.url = data.get("url", data["id"])
post.published = parse_ld_date(data.get("published")) post.published = parse_ld_date(data.get("published"))
@ -878,10 +907,13 @@ class Post(StatorModel):
mention_identity = Identity.by_actor_uri(tag["href"], create=True) mention_identity = Identity.by_actor_uri(tag["href"], create=True)
post.mentions.add(mention_identity) post.mentions.add(mention_identity)
elif tag_type in ["_:hashtag", "hashtag"]: elif tag_type in ["_:hashtag", "hashtag"]:
# kbin produces tags with 'tag' instead of 'name'
if "tag" in tag and "name" not in tag:
name = get_value_or_map(tag, "tag", "tagMap")
else:
name = get_value_or_map(tag, "name", "nameMap")
post.hashtags.append( post.hashtags.append(
get_value_or_map(tag, "name", "nameMap") name.lower().lstrip("#")[: Hashtag.MAXIMUM_LENGTH]
.lower()
.lstrip("#")[: Hashtag.MAXIMUM_LENGTH]
) )
elif tag_type in ["toot:emoji", "emoji"]: elif tag_type in ["toot:emoji", "emoji"]:
emoji = Emoji.by_ap_tag(post.author.domain, tag, create=True) emoji = Emoji.by_ap_tag(post.author.domain, tag, create=True)
@ -907,6 +939,10 @@ class Post(StatorModel):
# These have no IDs, so we have to wipe them each time # These have no IDs, so we have to wipe them each time
post.attachments.all().delete() post.attachments.all().delete()
for attachment in get_list(data, "attachment"): for attachment in get_list(data, "attachment"):
if "url" not in attachment.keys():
# sometimes attachments don't have URLs. Skip them.
print(f"no URL for {attachment} in {post}")
continue
if "focalPoint" in attachment: if "focalPoint" in attachment:
try: try:
focal_x, focal_y = attachment["focalPoint"] focal_x, focal_y = attachment["focalPoint"]
@ -982,8 +1018,10 @@ class Post(StatorModel):
update=True, update=True,
fetch_author=True, fetch_author=True,
) )
except (json.JSONDecodeError, ValueError): except (json.JSONDecodeError, ValueError, JsonLdError) as err:
raise cls.DoesNotExist(f"Invalid ld+json response for {object_uri}") raise cls.DoesNotExist(
f"Invalid ld+json response for {object_uri}"
) from err
# We may need to fetch the author too # We may need to fetch the author too
if post.author.state == IdentityStates.outdated: if post.author.state == IdentityStates.outdated:
async_to_sync(post.author.fetch_actor)() async_to_sync(post.author.fetch_actor)()

View file

@ -471,6 +471,7 @@ class PostInteraction(StatorModel):
# TODO: Limited retry state? # TODO: Limited retry state?
return return
if interaction and interaction.post:
interaction.post.calculate_stats() interaction.post.calculate_stats()
interaction.post.calculate_type_data() interaction.post.calculate_type_data()

View file

@ -4,6 +4,7 @@ from django.template.defaultfilters import linebreaks_filter
from activities.models import FanOut, Post, PostInteraction, PostInteractionStates from activities.models import FanOut, Post, PostInteraction, PostInteractionStates
from core.files import resize_image from core.files import resize_image
from core.html import FediverseHtmlParser from core.html import FediverseHtmlParser
from stator.exceptions import TryAgainLater
from users.models import ( from users.models import (
Block, Block,
BlockStates, BlockStates,
@ -201,6 +202,10 @@ class IdentityService:
except Post.DoesNotExist: except Post.DoesNotExist:
# ignore 404s... # ignore 404s...
pass pass
except TryAgainLater:
# when fetching a post -> author -> post we can
# get into a state. Ignore this round.
pass
for removed in PostInteraction.objects.filter( for removed in PostInteraction.objects.filter(
type=PostInteraction.Types.pin, type=PostInteraction.Types.pin,
identity=self.identity, identity=self.identity,