bookwyrm/bookwyrm/activitypub/base_activity.py

473 lines
16 KiB
Python
Raw Normal View History

2021-03-08 16:49:10 +00:00
""" basics for an activitypub serializer """
2023-09-13 07:22:53 +00:00
from __future__ import annotations
from dataclasses import dataclass, fields, MISSING
from json import JSONEncoder
import logging
from typing import Optional, Union, TypeVar, overload, Any
2023-01-20 07:20:18 +00:00
import requests
2020-12-08 17:43:12 +00:00
from django.apps import apps
from django.db import IntegrityError, transaction
from django.utils.http import http_date
from bookwyrm import models
2020-12-03 20:35:57 +00:00
from bookwyrm.connectors import ConnectorException, get_data
from bookwyrm.models import base_model
from bookwyrm.signatures import make_signature
from bookwyrm.settings import DOMAIN, INSTANCE_ACTOR_USERNAME
from bookwyrm.tasks import app, MISC
# module-level logger, named after this module
logger = logging.getLogger(__name__)
# pylint: disable=invalid-name
# type variable bound to BookWyrmModel; it ties the model class passed to
# ActivityObject.to_model / resolve_remote_id to the instance type they return
TBookWyrmModel = TypeVar("TBookWyrmModel", bound=base_model.BookWyrmModel)
2021-03-08 16:49:10 +00:00
class ActivitySerializerError(ValueError):
    """a routine, expected failure while (de)serializing activitypub json"""
class ActivityEncoder(JSONEncoder):
    """used to convert an Activity object into json"""

    def default(self, o):
        """encode an object json can't handle natively as its attribute dict

        Objects that have no ``__dict__`` are handed to ``JSONEncoder.default``,
        which raises the ``TypeError`` that the json module expects for values
        it cannot serialize (instead of leaking an ``AttributeError``).
        """
        try:
            return o.__dict__
        except AttributeError:
            return super().default(o)
@dataclass
# pylint: disable=invalid-name
class Signature:
    """signature block: who created it, when, and the signature value"""

    # the three values below are required; none has a default
    creator: str
    created: str
    # camelCase attribute name, hence the pylint disable above
    signatureValue: str
    # the only field with a default
    type: str = "RsaSignature2017"
def naive_parse(activity_objects, activity_json, serializer=None):
    """this navigates circular import issues by looking up models' serializers

    Args:
        activity_objects: mapping of activity type name to the callable that
            builds the matching activity object
        activity_json: the activity as a dict. Note that its "type" entry is
            overwritten in place with "PublicKey" when it has a "publicKeyPem"
        serializer: use this callable directly and skip the type lookup

    Returns:
        whatever the serializer returns, or None for the "Question" and
        "Article" types, which are skipped

    Raises:
        ActivitySerializerError: no serializer is registered for the type
    """
    if not serializer:
        if activity_json.get("publicKeyPem"):
            # ugh
            activity_json["type"] = "PublicKey"

        activity_type = activity_json.get("type")
        if activity_type in ["Question", "Article"]:
            return None
        try:
            serializer = activity_objects[activity_type]
        except KeyError as err:
            # we know this exists and that we can't handle it
            # (chained explicitly so the failed lookup stays in the traceback)
            raise ActivitySerializerError(err) from err

    return serializer(activity_objects=activity_objects, **activity_json)
2021-02-16 04:49:23 +00:00
@dataclass(init=False)
class ActivityObject:
    """actor activitypub json"""

    id: str
    type: str

    def __init__(
        self,
        activity_objects: Optional[
            dict[str, Union[str, list[str], ActivityObject, base_model.BookWyrmModel]]
        ] = None,
        **kwargs: Any,
    ):
        """this lets you pass in an object with fields that aren't in the
        dataclass, which it ignores. Any field in the dataclass is required or
        has a default value

        Args:
            activity_objects: mapping of type name to serializer, handed to
                naive_parse when a nested dict has to be parsed
            **kwargs: the activity json; keys that aren't dataclass fields are
                dropped

        Raises:
            ActivitySerializerError: a field without a default is absent, or
                its value is None, MISSING or an empty dict
        """
        for field in fields(self):
            try:
                value = kwargs[field.name]
                if value in (None, MISSING, {}):
                    # empty values are handled like absent ones
                    raise KeyError("Missing required field", field.name)
                try:
                    is_subclass = issubclass(field.type, ActivityObject)
                except TypeError:
                    # field.type isn't a class (e.g. a string annotation or a
                    # typing construct)
                    is_subclass = False
                # serialize a model obj
                if hasattr(value, "to_activity"):
                    value = value.to_activity()
                # parse a dict into the appropriate activity
                elif is_subclass and isinstance(value, dict):
                    if activity_objects:
                        value = naive_parse(activity_objects, value)
                    else:
                        value = naive_parse(
                            activity_objects, value, serializer=field.type
                        )

            except KeyError as err:
                if field.default is MISSING and field.default_factory is MISSING:
                    raise ActivitySerializerError(
                        f"Missing required field: {field.name}"
                    ) from err
                # a field that only declares a default_factory is left as the
                # MISSING sentinel here: the factory is not called, and
                # to_model skips values that are MISSING
                value = field.default
            setattr(self, field.name, value)

    # pylint: disable=too-many-locals,too-many-branches,too-many-arguments
    def to_model(
        self,
        model: Optional[type[TBookWyrmModel]] = None,
        instance: Optional[TBookWyrmModel] = None,
        allow_create: bool = True,
        save: bool = True,
        overwrite: bool = True,
        allow_external_connections: bool = True,
    ) -> Optional[TBookWyrmModel]:
        """convert from an activity to a model instance. Args:
        model: the django model that this object is being converted to
            (will guess if not known)
        instance: an existing database entry that is going to be updated by
            this activity
        allow_create: whether a new object should be created if no existing
            object is provided or found matching the remote_id
        save: store in the database if true, return an unsaved model obj if false
        overwrite: replace fields in the database with this activity if true,
            only update blank fields if false
        allow_external_connections: look up missing data if true,
            throw an exception if false and an external connection is needed

        Returns the model instance, or None when the model ignores this
        activity or when nothing exists and allow_create is false.
        Raises ActivitySerializerError when setting a simple field raises
        AttributeError or saving raises IntegrityError.
        """
        model = model or get_model_from_type(self.type)

        # only reject statuses if we're potentially creating them
        if (
            allow_create
            and hasattr(model, "ignore_activity")
            and model.ignore_activity(self, allow_external_connections)
        ):
            return None

        # check for an existing instance
        instance = instance or model.find_existing(self.serialize())

        if not instance and not allow_create:
            # so that we don't create when we want to delete or update
            return None
        instance = instance or model()

        # keep track of what we've changed
        update_fields = []
        # sets field on the model using the activity value
        for field in instance.simple_fields:
            try:
                changed = field.set_field_from_activity(
                    instance,
                    self,
                    overwrite=overwrite,
                    allow_external_connections=allow_external_connections,
                )
                if changed:
                    update_fields.append(field.name)
            except AttributeError as err:
                raise ActivitySerializerError(err) from err

        # image fields have to be set after other fields because they can save
        # too early and jank up users
        for field in instance.image_fields:
            changed = field.set_field_from_activity(
                instance,
                self,
                save=save,
                overwrite=overwrite,
                allow_external_connections=allow_external_connections,
            )
            if changed:
                update_fields.append(field.name)

        if not save:
            return instance

        with transaction.atomic():
            # can't force an update on fields unless the object already exists in the db
            if not instance.id:
                update_fields = None
            # we can't set many to many and reverse fields on an unsaved object
            try:
                try:
                    instance.save(broadcast=False, update_fields=update_fields)
                except TypeError:
                    # this save() doesn't accept the broadcast keyword
                    instance.save(update_fields=update_fields)
            except IntegrityError as err:
                raise ActivitySerializerError(err) from err

            # add many to many fields, which have to be set post-save
            for field in instance.many_to_many_fields:
                # mention books/users/hashtags, for example
                field.set_field_from_activity(
                    instance,
                    self,
                    allow_external_connections=allow_external_connections,
                )

        # reversed relationships in the models
        for (
            model_field_name,
            activity_field_name,
        ) in instance.deserialize_reverse_fields:
            # attachments on Status, for example
            values = getattr(self, activity_field_name)
            if values is None or values is MISSING:
                continue

            model_field = getattr(model, model_field_name)
            # creating a Work, model_field is 'editions'
            # creating a User, model field is 'key_pair'
            related_model = model_field.field.model
            related_field_name = model_field.field.name

            for item in values:
                # each related object is loaded by a queued task
                set_related_field.delay(
                    related_model.__name__,
                    instance.__class__.__name__,
                    related_field_name,
                    instance.remote_id,
                    item,
                )
        return instance

    def serialize(self, **kwargs):
        """convert to dictionary with context attr

        Keyword Args:
            omit: iterable of keys to leave out of the result; "@context" is
                only added when it is not listed here

        Attributes whose value is None are left out. Nested ActivityObjects
        (directly or inside a list) are serialized with their own defaults:
        omit is not passed down to them.
        """
        omit = kwargs.get("omit", ())
        data = self.__dict__.copy()
        # recursively serialize
        for k, v in data.items():
            try:
                if isinstance(v, ActivityObject):
                    data[k] = v.serialize()
                elif isinstance(v, list):
                    data[k] = [
                        e.serialize() if isinstance(e, ActivityObject) else e
                        for e in v
                    ]
            except TypeError:
                # best effort: keep the raw value if a nested serialize fails
                pass
        data = {k: v for (k, v) in data.items() if v is not None and k not in omit}
        if "@context" not in omit:
            data["@context"] = "https://www.w3.org/ns/activitystreams"
        return data
@app.task(queue=MISC)
@transaction.atomic
def set_related_field(
    model_name, origin_model_name, related_field_name, related_remote_id, data
):
    """load reverse related fields (editions, attachments) without blocking

    Args:
        model_name: name of the bookwyrm model of the related object
        origin_model_name: name of the bookwyrm model that triggered this task
        related_field_name: field on the related model that points back at
            the origin object
        related_remote_id: remote id of the origin object, which must exist
        data: the related object, either as a remote id string or as a dict
            of activity json

    Raises:
        ValueError: no origin object is found for related_remote_id
    """
    model = apps.get_model(f"bookwyrm.{model_name}", require_ready=True)
    origin_model = apps.get_model(f"bookwyrm.{origin_model_name}", require_ready=True)

    if isinstance(data, str):
        # only a remote id was given: use the database copy, else fetch it
        existing = model.find_existing_by_remote_id(data)
        if existing:
            data = existing.to_activity()
        else:
            data = get_data(data)
    activity = model.activity_serializer(**data)

    # this must exist because it's the object that triggered this function
    instance = origin_model.find_existing_by_remote_id(related_remote_id)
    if not instance:
        raise ValueError(f"Invalid related remote id: {related_remote_id}")

    # set the origin's remote id on the activity so it will be there when
    # the model instance is created
    # edition.parentWork = instance, for example
    model_field = getattr(model, related_field_name)
    if hasattr(model_field, "activitypub_field"):
        setattr(activity, getattr(model_field, "activitypub_field"), instance.remote_id)
    item = activity.to_model(model=model)

    if item is None:
        # to_model can decline to produce an object (its return is Optional),
        # in which case there is nothing to link
        return

    # if the related field isn't serialized (attachments on Status), then
    # we have to set it post-creation
    if not hasattr(model_field, "activitypub_field"):
        setattr(item, related_field_name, instance)
        item.save()
2020-12-08 17:43:12 +00:00
2021-02-16 19:04:13 +00:00
def get_model_from_type(activity_type):
    """find the first registered model whose activity_serializer has this type

    Raises ActivitySerializerError when no model matches.
    """
    for candidate in apps.get_models():
        if not hasattr(candidate, "activity_serializer"):
            continue
        serializer = candidate.activity_serializer
        if hasattr(serializer, "type") and serializer.type == activity_type:
            return candidate

    raise ActivitySerializerError(
        f'No model found for activity type "{activity_type}"'
    )
# pylint: disable=too-many-arguments
@overload
def resolve_remote_id(
    remote_id: str,
    model: type[TBookWyrmModel],
    refresh: bool = False,
    save: bool = True,
    get_activity: bool = False,
    allow_external_connections: bool = True,
) -> TBookWyrmModel:
    ...


# pylint: disable=too-many-arguments
@overload
def resolve_remote_id(
    remote_id: str,
    model: Optional[str] = None,
    refresh: bool = False,
    save: bool = True,
    get_activity: bool = False,
    allow_external_connections: bool = True,
) -> base_model.BookWyrmModel:
    ...


# pylint: disable=too-many-arguments
def resolve_remote_id(
    remote_id: str,
    model: Optional[Union[str, type[base_model.BookWyrmModel]]] = None,
    refresh: bool = False,
    save: bool = True,
    get_activity: bool = False,
    allow_external_connections: bool = True,
) -> base_model.BookWyrmModel:
    """take a remote_id and return an instance, creating if necessary. Args:
    remote_id: the unique url for looking up the object in the db or by http
    model: a string or object representing the model that corresponds to the object
    refresh: when true, an object already in the database is not returned
        as-is; the data is loaded again and the object is updated from it
    save: whether to return an unsaved database entry or a saved one
    get_activity: whether to return the activitypub object or the model object
    allow_external_connections: whether to make http connections

    Returns None when the remote data could not be loaded.
    Raises ActivitySerializerError when an http request would be needed but
    allow_external_connections is false.
    """
    if model:  # a bonus check we can do if we already know the model
        if isinstance(model, str):
            model = apps.get_model(f"bookwyrm.{model}", require_ready=True)
        known = model.find_existing_by_remote_id(remote_id)
        if known and not refresh:
            return known.to_activity_dataclass() if get_activity else known

    # Anything already in the database has been returned by now, so going
    # further requires an external connection: check that it's allowed
    if not allow_external_connections:
        raise ActivitySerializerError(
            "Unable to serialize object without making external HTTP requests"
        )

    # load the data and create the object
    try:
        data = get_data(remote_id)
    except ConnectionError:
        logger.info("Could not connect to host for remote_id: %s", remote_id)
        return None
    except requests.HTTPError as http_err:
        response = http_err.response
        if response is None or response.status_code != 401:
            logger.info("Could not connect to host for remote_id: %s", remote_id)
            return None
        # This most likely means it's a mastodon with secure fetch enabled.
        data = get_activitypub_data(remote_id)

    # determine the model implicitly, if not provided
    # or if it's a model with subclasses like Status, check again
    if not model or hasattr(model.objects, "select_subclasses"):
        model = get_model_from_type(data.get("type"))

    # check for existing items with shared unique identifiers
    existing = model.find_existing(data)
    if existing and not refresh:
        return existing.to_activity_dataclass() if get_activity else existing

    activity = model.activity_serializer(**data)
    if get_activity:
        return activity

    # if we're refreshing, "existing" will be set and we'll update it
    return activity.to_model(model=model, instance=existing, save=save)
2021-12-16 01:10:59 +00:00
2022-01-05 14:42:54 +00:00
def get_representative():
    """Get or create an actor representing the instance
    to sign outgoing HTTP GET requests"""
    # get_or_create returns an (object, created) pair; only the user is needed
    actor, _created = models.User.objects.get_or_create(
        username=f"{INSTANCE_ACTOR_USERNAME}@{DOMAIN}",
        defaults={
            "email": "bookwyrm@localhost",
            "local": True,
            "localname": INSTANCE_ACTOR_USERNAME,
        },
    )
    return actor
2022-01-05 14:42:54 +00:00
def get_activitypub_data(url):
    """GET activitypub json from url, with a request signed by the instance actor

    Args:
        url: the address to fetch

    Returns:
        the decoded json body of the response

    Raises:
        ValueError: the instance actor has no private key
        ConnectorException: the request failed or the body isn't valid json
        requests.HTTPError: the response status is not ok
    """
    now = http_date()
    sender = get_representative()
    if not sender.key_pair.private_key:
        # this shouldn't happen. it would be bad if it happened.
        raise ValueError("No private key found for sender")
    try:
        resp = requests.get(
            url,
            headers={
                # pylint: disable=line-too-long
                "Accept": 'application/ld+json; profile="https://www.w3.org/ns/activitystreams"',
                "Date": now,
                "Signature": make_signature("get", sender, url, now),
            },
            timeout=15,
        )
    except requests.RequestException as err:
        # chained so the underlying network error stays in the traceback
        raise ConnectorException() from err
    if not resp.ok:
        resp.raise_for_status()
    try:
        data = resp.json()
    except ValueError as err:
        raise ConnectorException() from err

    return data
2021-12-16 01:10:59 +00:00
@dataclass(init=False)
class Link(ActivityObject):
    """for tagging a book in a status"""

    # href is the only field without a default
    href: str
    name: str = None
    mediaType: str = None
    id: str = None
    attributedTo: str = None
    availability: str = None
    type: str = "Link"

    def serialize(self, **kwargs):
        """serialize without "id" and "@context", and without "type" for a plain Link

        Keyword arguments passed by the caller are not forwarded.
        """
        if self.type == "Link":
            hidden = ("id", "@context", "type")
        else:
            hidden = ("id", "@context")
        return super().serialize(omit=hidden)
@dataclass(init=False)
class Mention(Link):
    """a Link whose type is "Mention", used for mentioning an actor"""

    type: str = "Mention"
@dataclass(init=False)
class Hashtag(Link):
    """a Link whose type is "Hashtag", used for mentioning a hashtag"""

    type: str = "Hashtag"