From cfefbc08d822cd85787d95dc2ee253e3368826d8 Mon Sep 17 00:00:00 2001 From: tobi <31960611+tsmethurst@users.noreply.github.com> Date: Tue, 21 Nov 2023 15:13:30 +0100 Subject: [PATCH] [feature] Federate status language in and out (#2366) * [feature] Federate status language in + out * go fmt * tests, little fix * improve comments * unnest a bit * avoid unnecessary nil check * use more descriptive variable for contentMap * prefer instance languages when selecting from contentMap * update docs to reflect lang selection * rename rdfLangString -> rdfLangs * update comments to mention Pollable * iter through slice instead of map --- docs/federation/federating_with_gotosocial.md | 61 ++++ internal/ap/ap_test.go | 6 + internal/ap/extract.go | 37 ++- internal/ap/extractcontent_test.go | 5 +- internal/ap/normalize.go | 285 ++++++++++++++++-- internal/ap/normalize_test.go | 82 +++-- internal/ap/serialize.go | 143 ++++----- internal/gtsmodel/status.go | 11 + internal/typeutils/astointernal.go | 15 +- internal/typeutils/astointernal_test.go | 9 +- internal/typeutils/internaltoas.go | 10 +- internal/typeutils/internaltoas_test.go | 44 ++- internal/typeutils/util.go | 101 +++++++ internal/typeutils/util_test.go | 114 +++++++ internal/typeutils/wrap_test.go | 3 + 15 files changed, 758 insertions(+), 168 deletions(-) diff --git a/docs/federation/federating_with_gotosocial.md b/docs/federation/federating_with_gotosocial.md index a977ee0e9..fc02d1977 100644 --- a/docs/federation/federating_with_gotosocial.md +++ b/docs/federation/federating_with_gotosocial.md @@ -482,3 +482,64 @@ For the convenience of remote servers, GoToSocial will always provide both the ` GoToSocial tries to parse incoming Mentions in the same way it sends them out: as a `Mention` type entry in the `tag` property. However, when parsing incoming Mentions it's a bit more relaxed with regards to which properties must be set. 
GoToSocial will prefer the `href` property, which can be either the ActivityPub ID/URI or the web URL of the target; if `href` is not present, it will fall back to using the `name` property. If neither property is present, the mention will be considered invalid and discarded. + +## Content, ContentMap, and Language + +In line with other ActivityPub implementations, GoToSocial uses `content` and `contentMap` fields on `Objects` to infer content and language of incoming posts, and to set content and language on outgoing posts. + +### Outgoing + +If an outgoing `Object` (usually a `Note`) has content, it will be set as stringified HTML on the `content` field. + +If the `content` is in a specific user-selected language, then the `Object` will also have the `contentMap` property set to a single-entry key/value map, where the key is a BCP47 language tag, and the value is the same content from the `content` field. + +For example, a post written in English (`en`) will look something like this: + +```json +{ + "@context": "https://www.w3.org/ns/activitystreams", + "type": "Note", + "attributedTo": "http://example.org/users/i_p_freely", + "to": "https://www.w3.org/ns/activitystreams#Public", + "cc": "http://example.org/users/i_p_freely/followers", + "id": "http://example.org/users/i_p_freely/statuses/01FF25D5Q0DH7CHD57CTRS6WK0", + "url": "http://example.org/@i_p_freely/statuses/01FF25D5Q0DH7CHD57CTRS6WK0", + "published": "2021-11-20T13:32:16Z", + "content": "

This is an example note.

", + "contentMap": { + "en": "

This is an example note.

" + }, + "attachment": [], + "replies": {...}, + "sensitive": false, + "summary": "", + "tag": {...} +} +``` + +GoToSocial will always set the `content` field if the post has content, but it may not set the `contentMap` field if an old version of GoToSocial is in use, or if the language used by a user is not set or is not a recognized BCP47 language tag. + +### Incoming + +GoToSocial uses both the `content` and the `contentMap` properties on incoming `Object`s to determine the content and infer the intended "primary" language for that content. It uses the following algorithm: + +#### Only `content` is set + +Take that content only and mark language as unknown. + +#### Both `content` and `contentMap` are set + +Look for a language tag as key in the `contentMap`, with a value that matches the stringified HTML set in `content`. + +If a match is found, use this as the post's language. + +If a match is not found, keep content from `content` and mark language as unknown. + +#### Only `contentMap` is set + +If `contentMap` has only one entry, take the language tag and content value as the "primary" language and content. + +If `contentMap` has multiple entries, we have no way of determining the intended preferred content and language of the post, since map order is not deterministic. In this case, try to pick a language and content entry that matches one of the GoToSocial instance's [configured languages](../configuration/instance.md). If no language can be matched this way, pick a language and content entry from the `contentMap` at random as the "primary" language and content. + +!!! Note + In all of the above cases, if the inferred language cannot be parsed as a valid BCP47 language tag, language will fall back to unknown. 
diff --git a/internal/ap/ap_test.go b/internal/ap/ap_test.go index 6a5073c63..583a37c53 100644 --- a/internal/ap/ap_test.go +++ b/internal/ap/ap_test.go @@ -93,6 +93,12 @@ func noteWithMentions1() vocab.ActivityStreamsNote { content := streams.NewActivityStreamsContentProperty() content.AppendXMLSchemaString("hey @f0x and @dumpsterqueer") + + rdfLangString := make(map[string]string) + rdfLangString["en"] = "hey @f0x and @dumpsterqueer" + rdfLangString["fr"] = "bonjour @f0x et @dumpsterqueer" + content.AppendRDFLangString(rdfLangString) + note.SetActivityStreamsContent(content) return note diff --git a/internal/ap/extract.go b/internal/ap/extract.go index 424f77409..3d92fa2ba 100644 --- a/internal/ap/extract.go +++ b/internal/ap/extract.go @@ -631,27 +631,34 @@ func ExtractPublicKey(i WithPublicKey) ( return nil, nil, nil, gtserror.New("couldn't find public key") } -// ExtractContent returns a string representation of the -// given interface's Content property, or an empty string -// if no Content is found. -func ExtractContent(i WithContent) string { - contentProperty := i.GetActivityStreamsContent() - if contentProperty == nil { - return "" +// ExtractContent returns an intermediary representation of +// the given interface's Content and/or ContentMap property. +func ExtractContent(i WithContent) gtsmodel.Content { + content := gtsmodel.Content{} + + contentProp := i.GetActivityStreamsContent() + if contentProp == nil { + // No content at all. + return content } - for iter := contentProperty.Begin(); iter != contentProperty.End(); iter = iter.Next() { + for iter := contentProp.Begin(); iter != contentProp.End(); iter = iter.Next() { switch { - // Content may be parsed as IRI, depending on - // how it's formatted, so account for this. 
- case iter.IsXMLSchemaString(): - return iter.GetXMLSchemaString() - case iter.IsIRI(): - return iter.GetIRI().String() + case iter.IsRDFLangString() && + len(content.ContentMap) == 0: + content.ContentMap = iter.GetRDFLangString() + + case iter.IsXMLSchemaString() && + content.Content == "": + content.Content = iter.GetXMLSchemaString() + + case iter.IsIRI() && + content.Content == "": + content.Content = iter.GetIRI().String() } } - return "" + return content } // ExtractAttachments attempts to extract barebones MediaAttachment objects from given AS interface type. diff --git a/internal/ap/extractcontent_test.go b/internal/ap/extractcontent_test.go index 590d1b931..c899a10e1 100644 --- a/internal/ap/extractcontent_test.go +++ b/internal/ap/extractcontent_test.go @@ -30,10 +30,11 @@ type ExtractContentTestSuite struct { func (suite *ExtractContentTestSuite) TestExtractContent1() { note := suite.noteWithMentions1 - content := ap.ExtractContent(note) - suite.Equal("hey @f0x and @dumpsterqueer", content) + suite.Equal("hey @f0x and @dumpsterqueer", content.Content) + suite.Equal("bonjour @f0x et @dumpsterqueer", content.ContentMap["fr"]) + suite.Equal("hey @f0x and @dumpsterqueer", content.ContentMap["en"]) } func TestExtractContentTestSuite(t *testing.T) { diff --git a/internal/ap/normalize.go b/internal/ap/normalize.go index 192a2d740..a27527b84 100644 --- a/internal/ap/normalize.go +++ b/internal/ap/normalize.go @@ -20,11 +20,12 @@ package ap import ( "github.com/superseriousbusiness/activity/pub" "github.com/superseriousbusiness/activity/streams" + "github.com/superseriousbusiness/gotosocial/internal/gtserror" "github.com/superseriousbusiness/gotosocial/internal/text" ) /* - NORMALIZE INCOMING + INCOMING NORMALIZATION The below functions should be called to normalize the content of messages *COMING INTO* GoToSocial via the federation API, either as the result of delivery from a remote instance to this @@ -84,39 +85,84 @@ func NormalizeIncomingActivity(activity 
pub.Activity, rawJSON map[string]interfa } } -// NormalizeIncomingContent replaces the Content of the given item -// with the sanitized version of the raw 'content' value from the -// raw json object map. +// normalizeContent normalizes the given content +// string by sanitizing its HTML and minimizing it. // -// noop if there was no content in the json object map or the -// content was not a plain string. -func NormalizeIncomingContent(item WithContent, rawJSON map[string]interface{}) { - rawContent, ok := rawJSON["content"] - if !ok { - // No content in rawJSON. - // TODO: In future we might also - // look for "contentMap" property. - return +// Noop for non-string content. +func normalizeContent(rawContent interface{}) string { + if rawContent == nil { + // Nothing to fix. + return "" } content, ok := rawContent.(string) if !ok { - // Not interested in content arrays. - return + // Not interested in + // content slices etc. + return "" } - // Content should be HTML encoded by default: + if content == "" { + // Nothing to fix. + return "" + } + + // Content entries should be HTML encoded by default: // https://www.w3.org/TR/activitystreams-vocabulary/#dfn-content // // TODO: sanitize differently based on mediaType. // https://www.w3.org/TR/activitystreams-vocabulary/#dfn-mediatype content = text.SanitizeToHTML(content) content = text.MinifyHTML(content) + return content +} - // Set normalized content property from the raw string; - // this replaces any existing content property on the item. +// NormalizeIncomingContent replaces the Content property of the given +// item with the normalized versions of the raw 'content' and 'contentMap' +// values from the raw json object map. +// +// noop if there was no 'content' or 'contentMap' in the json object map. 
+func NormalizeIncomingContent(item WithContent, rawJSON map[string]interface{}) { + var ( + rawContent = rawJSON["content"] + rawContentMap = rawJSON["contentMap"] + ) + + if rawContent == nil && + rawContentMap == nil { + // Nothing to normalize, + // leave no content on item. + return + } + + // Create wrapper for normalized content. contentProp := streams.NewActivityStreamsContentProperty() - contentProp.AppendXMLSchemaString(content) + + // Fix 'content' if applicable. + content := normalizeContent(rawContent) + if content != "" { + contentProp.AppendXMLSchemaString(content) + } + + // Fix 'contentMap' if applicable. + contentMap, ok := rawContentMap.(map[string]interface{}) + if ok { + rdfLangs := make(map[string]string, len(contentMap)) + + for lang, rawContent := range contentMap { + content := normalizeContent(rawContent) + if content != "" { + rdfLangs[lang] = content + } + } + + if len(rdfLangs) != 0 { + contentProp.AppendRDFLangString(rdfLangs) + } + } + + // Replace any existing content property + // on the item with normalized version. item.SetActivityStreamsContent(contentProp) } @@ -299,3 +345,204 @@ func NormalizeIncomingPollOptions(item WithOneOf, rawJSON map[string]interface{} NormalizeIncomingName(choiceable, rawChoice) } } + +/* + OUTGOING NORMALIZATION + The below functions should be called to normalize the content + of messages *GOING OUT OF* GoToSocial via the federation API, + either as the result of delivery to a remote instance from this + instance, or as a result of a remote instance doing an http call + to us to dereference something. +*/ + +// NormalizeOutgoingAttachmentProp replaces single-entry Attachment objects with +// single-entry arrays, for better compatibility with other AP implementations. +// +// Ie: +// +// "attachment": { +// ... +// } +// +// becomes: +// +// "attachment": [ +// { +// ... +// } +// ] +// +// Noop for items with no attachments, or with attachments that are already a slice. 
+func NormalizeOutgoingAttachmentProp(item WithAttachment, rawJSON map[string]interface{}) { + attachment, ok := rawJSON["attachment"] + if !ok { + // No 'attachment', + // nothing to change. + return + } + + if _, ok := attachment.([]interface{}); ok { + // Already slice, + // nothing to change. + return + } + + // Coerce single-object to slice. + rawJSON["attachment"] = []interface{}{attachment} +} + +// NormalizeOutgoingContentProp normalizes go-fed's funky formatting of content and +// contentMap properties to a format better understood by other AP implementations. +// +// Ie., incoming "content" property like this: +// +// "content": [ +// "hello world!", +// { +// "en": "hello world!" +// } +// ] +// +// Is unpacked to: +// +// "content": "hello world!", +// "contentMap": { +// "en": "hello world!" +// } +// +// Noop if neither content nor contentMap are set. +func NormalizeOutgoingContentProp(item WithContent, rawJSON map[string]interface{}) { + contentProp := item.GetActivityStreamsContent() + if contentProp == nil { + // Nothing to do, + // bail early. + return + } + + contentPropLen := contentProp.Len() + if contentPropLen == 0 { + // Nothing to do, + // bail early. + return + } + + var ( + content string + contentMap map[string]string + ) + + for iter := contentProp.Begin(); iter != contentProp.End(); iter = iter.Next() { + switch { + case iter.IsRDFLangString() && + contentMap == nil: + contentMap = iter.GetRDFLangString() + + case content == "" && + iter.IsXMLSchemaString(): + content = iter.GetXMLSchemaString() + } + } + + if content != "" { + rawJSON["content"] = content + } else { + delete(rawJSON, "content") + } + + if contentMap != nil { + rawJSON["contentMap"] = contentMap + } else { + delete(rawJSON, "contentMap") + } +} + +// NormalizeOutgoingObjectProp normalizes each Object entry in the rawJSON of the given +// item by calling custom serialization / normalization functions on them in turn. 
+// +// This function also unnests single-entry arrays, so that: +// +// "object": [ +// { +// ... +// } +// ] +// +// Becomes: +// +// "object": { +// ... +// } +// +// Noop for each Object entry that isn't an Accountable or Statusable. +func NormalizeOutgoingObjectProp(item WithObject, rawJSON map[string]interface{}) error { + objectProp := item.GetActivityStreamsObject() + if objectProp == nil { + // Nothing to do, + // bail early. + return nil + } + + objectPropLen := objectProp.Len() + if objectPropLen == 0 { + // Nothing to do, + // bail early. + return nil + } + + // The thing we already serialized has objects + // on it, so we should see if we need to custom + // serialize any of those objects, and replace + // them on the data map as necessary. + objects := make([]interface{}, 0, objectPropLen) + for iter := objectProp.Begin(); iter != objectProp.End(); iter = iter.Next() { + if iter.IsIRI() { + // Plain IRIs don't need custom serialization. + objects = append(objects, iter.GetIRI().String()) + continue + } + + var ( + objectType = iter.GetType() + objectSer map[string]interface{} + ) + + if objectType == nil { + // This is awkward. + return gtserror.Newf("could not resolve object iter %T to vocab.Type", iter) + } + + var err error + + // In the below accountable and statusable serialization, + // `@context` will be included in the wrapping type already, + // so we shouldn't also include it in the object itself. + switch tn := objectType.GetTypeName(); { + case IsAccountable(tn): + objectSer, err = serializeAccountable(objectType, false) + + case IsStatusable(tn): + // IsStatusable includes Pollable as well. + objectSer, err = serializeStatusable(objectType, false) + + default: + // No custom serializer for this type; serialize as normal. + objectSer, err = objectType.Serialize() + } + + if err != nil { + return err + } + + objects = append(objects, objectSer) + } + + if objectPropLen == 1 { + // Unnest single object. 
+ rawJSON["object"] = objects[0] + } else { + // Array of objects. + rawJSON["object"] = objects + } + + return nil +} diff --git a/internal/ap/normalize_test.go b/internal/ap/normalize_test.go index cd1affe60..33b1f6ea6 100644 --- a/internal/ap/normalize_test.go +++ b/internal/ap/normalize_test.go @@ -46,6 +46,9 @@ func (suite *NormalizeTestSuite) getStatusable() (vocab.ActivityStreamsNote, map "https://example.org/users/someone/followers" ], "content": "UPDATE: As of this morning there are now more than 7 million Mastodon users, most from the #TwitterMigration.

In fact, 100,000 new accounts have been created since last night.

Since last night's spike 8,000-12,000 new accounts are being created every hour.

Yesterday, I estimated that Mastodon would have 8 million users by the end of the week. That might happen a lot sooner if this trend continues.", + "contentMap": { + "en": "UPDATE: As of this morning there are now more than 7 million Mastodon users, most from the #TwitterMigration.

In fact, 100,000 new accounts have been created since last night.

Since last night's spike 8,000-12,000 new accounts are being created every hour.

Yesterday, I estimated that Mastodon would have 8 million users by the end of the week. That might happen a lot sooner if this trend continues." + }, "context": "https://example.org/contexts/01GX0MSHPER1E0FT022Q209EJZ", "conversation": "https://example.org/contexts/01GX0MSHPER1E0FT022Q209EJZ", "id": "https://example.org/objects/01GX0MT2PA58JNSMK11MCS65YD", @@ -182,7 +185,15 @@ func (suite *NormalizeTestSuite) getAccountable() (vocab.ActivityStreamsPerson, func (suite *NormalizeTestSuite) TestNormalizeActivityObject() { note, rawNote := suite.getStatusable() - suite.Equal(`update: As of this morning there are now more than 7 million Mastodon users, most from the #TwitterMigration%3C/a%3E.%3Cbr%3E%3Cbr%3EIn%20fact,%20100,000%20new%20accounts%20have%20been%20created%20since%20last%20night.%3Cbr%3E%3Cbr%3ESince%20last%20night&%2339;s%20spike%208,000-12,000%20new%20accounts%20are%20being%20created%20every%20hour.%3Cbr%3E%3Cbr%3EYesterday,%20I%20estimated%20that%20Mastodon%20would%20have%208%20million%20users%20by%20the%20end%20of%20the%20week.%20That%20might%20happen%20a%20lot%20sooner%20if%20this%20trend%20continues.`, ap.ExtractContent(note)) + content := ap.ExtractContent(note) + suite.Equal( + `update: As of this morning there are now more than 7 million Mastodon users, most from the #TwitterMigration%3C/a%3E.%3Cbr%3E%3Cbr%3EIn%20fact,%20100,000%20new%20accounts%20have%20been%20created%20since%20last%20night.%3Cbr%3E%3Cbr%3ESince%20last%20night&%2339;s%20spike%208,000-12,000%20new%20accounts%20are%20being%20created%20every%20hour.%3Cbr%3E%3Cbr%3EYesterday,%20I%20estimated%20that%20Mastodon%20would%20have%208%20million%20users%20by%20the%20end%20of%20the%20week.%20That%20might%20happen%20a%20lot%20sooner%20if%20this%20trend%20continues.`, + content.Content, + ) + + // Malformed contentMap entry + // will not be extractable yet. 
+ suite.Empty(content.ContentMap["en"]) create := testrig.WrapAPNoteInCreate( testrig.URLMustParse("https://example.org/create_something"), @@ -192,7 +203,18 @@ func (suite *NormalizeTestSuite) TestNormalizeActivityObject() { ) ap.NormalizeIncomingActivity(create, map[string]interface{}{"object": rawNote}) - suite.Equal(`UPDATE: As of this morning there are now more than 7 million Mastodon users, most from the #TwitterMigration.

In fact, 100,000 new accounts have been created since last night.

Since last night's spike 8,000-12,000 new accounts are being created every hour.

Yesterday, I estimated that Mastodon would have 8 million users by the end of the week. That might happen a lot sooner if this trend continues.`, ap.ExtractContent(note)) + content = ap.ExtractContent(note) + + suite.Equal( + `UPDATE: As of this morning there are now more than 7 million Mastodon users, most from the #TwitterMigration.

In fact, 100,000 new accounts have been created since last night.

Since last night's spike 8,000-12,000 new accounts are being created every hour.

Yesterday, I estimated that Mastodon would have 8 million users by the end of the week. That might happen a lot sooner if this trend continues.`, + content.Content, + ) + + // Content map entry should now be extractable. + suite.Equal( + `UPDATE: As of this morning there are now more than 7 million Mastodon users, most from the #TwitterMigration.

In fact, 100,000 new accounts have been created since last night.

Since last night's spike 8,000-12,000 new accounts are being created every hour.

Yesterday, I estimated that Mastodon would have 8 million users by the end of the week. That might happen a lot sooner if this trend continues.`, + content.ContentMap["en"], + ) } func (suite *NormalizeTestSuite) TestNormalizeStatusableAttachmentsOneAttachment() { @@ -202,12 +224,14 @@ func (suite *NormalizeTestSuite) TestNormalizeStatusableAttachmentsOneAttachment // the attachment(s) should be all jacked up. suite.Equal(`{ "@context": "https://www.w3.org/ns/activitystreams", - "attachment": { - "mediaType": "image/jpeg", - "name": "description: here's \u003c\u003ca\u003e\u003e picture of a #cat,%20it%27s%20cute!%20here%27s%20some%20special%20characters:%20%22%22%20%5C%20weeee%27%27%27%27", - "type": "Document", - "url": "https://files.example.org/media_attachments/files/110/258/459/579/509/026/original/b65392ebe0fb04ef.jpeg" - }, + "attachment": [ + { + "mediaType": "image/jpeg", + "name": "description: here's \u003c\u003ca\u003e\u003e picture of a #cat,%20it%27s%20cute!%20here%27s%20some%20special%20characters:%20%22%22%20%5C%20weeee%27%27%27%27", + "type": "Document", + "url": "https://files.example.org/media_attachments/files/110/258/459/579/509/026/original/b65392ebe0fb04ef.jpeg" + } + ], "attributedTo": "https://example.org/users/hourlycatbot", "id": "https://example.org/users/hourlycatbot/statuses/01GYW48H311PZ78C5G856MGJJJ", "to": "https://www.w3.org/ns/activitystreams#Public", @@ -222,12 +246,14 @@ func (suite *NormalizeTestSuite) TestNormalizeStatusableAttachmentsOneAttachment // attachment should no longer be all jacked up. suite.Equal(`{ "@context": "https://www.w3.org/ns/activitystreams", - "attachment": { - "mediaType": "image/jpeg", - "name": "DESCRIPTION: here's \u003c\u003e picture of a #cat, it's cute! 
here's some special characters: \"\" \\ weeee''''", - "type": "Document", - "url": "https://files.example.org/media_attachments/files/110/258/459/579/509/026/original/b65392ebe0fb04ef.jpeg" - }, + "attachment": [ + { + "mediaType": "image/jpeg", + "name": "DESCRIPTION: here's \u003c\u003e picture of a #cat, it's cute! here's some special characters: \"\" \\ weeee''''", + "type": "Document", + "url": "https://files.example.org/media_attachments/files/110/258/459/579/509/026/original/b65392ebe0fb04ef.jpeg" + } + ], "attributedTo": "https://example.org/users/hourlycatbot", "id": "https://example.org/users/hourlycatbot/statuses/01GYW48H311PZ78C5G856MGJJJ", "to": "https://www.w3.org/ns/activitystreams#Public", @@ -243,12 +269,14 @@ func (suite *NormalizeTestSuite) TestNormalizeStatusableAttachmentsOneAttachment // the attachment(s) should be all jacked up. suite.Equal(`{ "@context": "https://www.w3.org/ns/activitystreams", - "attachment": { - "mediaType": "image/jpeg", - "name": "description: here's \u003c\u003ca\u003e\u003e picture of a #cat,%20it%27s%20cute!%20here%27s%20some%20special%20characters:%20%22%22%20%5C%20weeee%27%27%27%27", - "type": "Document", - "url": "https://files.example.org/media_attachments/files/110/258/459/579/509/026/original/b65392ebe0fb04ef.jpeg" - }, + "attachment": [ + { + "mediaType": "image/jpeg", + "name": "description: here's \u003c\u003ca\u003e\u003e picture of a #cat,%20it%27s%20cute!%20here%27s%20some%20special%20characters:%20%22%22%20%5C%20weeee%27%27%27%27", + "type": "Document", + "url": "https://files.example.org/media_attachments/files/110/258/459/579/509/026/original/b65392ebe0fb04ef.jpeg" + } + ], "attributedTo": "https://example.org/users/hourlycatbot", "id": "https://example.org/users/hourlycatbot/statuses/01GYW48H311PZ78C5G856MGJJJ", "to": "https://www.w3.org/ns/activitystreams#Public", @@ -263,12 +291,14 @@ func (suite *NormalizeTestSuite) TestNormalizeStatusableAttachmentsOneAttachment // attachment should no longer be 
all jacked up. suite.Equal(`{ "@context": "https://www.w3.org/ns/activitystreams", - "attachment": { - "mediaType": "image/jpeg", - "name": "DESCRIPTION: here's \u003c\u003e picture of a #cat, it's cute! here's some special characters: \"\" \\ weeee''''", - "type": "Document", - "url": "https://files.example.org/media_attachments/files/110/258/459/579/509/026/original/b65392ebe0fb04ef.jpeg" - }, + "attachment": [ + { + "mediaType": "image/jpeg", + "name": "DESCRIPTION: here's \u003c\u003e picture of a #cat, it's cute! here's some special characters: \"\" \\ weeee''''", + "type": "Document", + "url": "https://files.example.org/media_attachments/files/110/258/459/579/509/026/original/b65392ebe0fb04ef.jpeg" + } + ], "attributedTo": "https://example.org/users/hourlycatbot", "id": "https://example.org/users/hourlycatbot/statuses/01GYW48H311PZ78C5G856MGJJJ", "to": "https://www.w3.org/ns/activitystreams#Public", diff --git a/internal/ap/serialize.go b/internal/ap/serialize.go index 368d7f9a2..944e67407 100644 --- a/internal/ap/serialize.go +++ b/internal/ap/serialize.go @@ -18,10 +18,9 @@ package ap import ( - "fmt" - "github.com/superseriousbusiness/activity/streams" "github.com/superseriousbusiness/activity/streams/vocab" + "github.com/superseriousbusiness/gotosocial/internal/gtserror" ) // Serialize is a custom serializer for ActivityStreams types. @@ -35,17 +34,20 @@ import ( // // Currently, the following things will be custom serialized: // -// - OrderedCollection: 'orderedItems' property will always be made into an array. -// - Any Accountable type: 'attachment' property will always be made into an array. -// - Update: any Accountable 'object's set on an update will be custom serialized as above. +// - OrderedCollection: 'orderedItems' property will always be made into an array. +// - Any Accountable type: 'attachment' property will always be made into an array. 
+// - Any Statusable type: 'attachment' property will always be made into an array; 'content' and 'contentMap' will be normalized. +// - Any Activityable type: any 'object's set on an activity will be custom serialized as above. func Serialize(t vocab.Type) (m map[string]interface{}, e error) { - switch t.GetTypeName() { - case ObjectOrderedCollection: + switch tn := t.GetTypeName(); { + case tn == ObjectOrderedCollection: return serializeOrderedCollection(t) - case ActorApplication, ActorGroup, ActorOrganization, ActorPerson, ActorService: + case IsAccountable(tn): return serializeAccountable(t, true) - case ActivityUpdate: - return serializeWithObject(t) + case IsStatusable(tn): + return serializeStatusable(t, true) + case IsActivityable(tn): + return serializeActivityable(t, true) default: // No custom serializer necessary. return streams.Serialize(t) @@ -61,8 +63,8 @@ func Serialize(t vocab.Type) (m map[string]interface{}, e error) { // See: // - https://github.com/go-fed/activity/issues/139 // - https://github.com/mastodon/mastodon/issues/24225 -func serializeOrderedCollection(orderedCollection vocab.Type) (map[string]interface{}, error) { - data, err := streams.Serialize(orderedCollection) +func serializeOrderedCollection(t vocab.Type) (map[string]interface{}, error) { + data, err := streams.Serialize(t) if err != nil { return nil, err } @@ -99,7 +101,12 @@ func serializeOrderedCollection(orderedCollection vocab.Type) (map[string]interf // If the accountable is being serialized as part of another object (eg., as the // object of an activity), then includeContext should be set to false, as the // @context entry should be included on the top-level/wrapping activity/object. 
-func serializeAccountable(accountable vocab.Type, includeContext bool) (map[string]interface{}, error) { +func serializeAccountable(t vocab.Type, includeContext bool) (map[string]interface{}, error) { + accountable, ok := t.(Accountable) + if !ok { + return nil, gtserror.Newf("vocab.Type %T not accountable", t) + } + var ( data map[string]interface{} err error @@ -115,91 +122,61 @@ func serializeAccountable(accountable vocab.Type, includeContext bool) (map[stri return nil, err } - attachment, ok := data["attachment"] - if !ok { - // No 'attachment', nothing to change. - return data, nil - } - - if _, ok := attachment.([]interface{}); ok { - // Already slice. - return data, nil - } - - // Coerce single-object to slice. - data["attachment"] = []interface{}{attachment} + NormalizeOutgoingAttachmentProp(accountable, data) return data, nil } -func serializeWithObject(t vocab.Type) (map[string]interface{}, error) { - withObject, ok := t.(WithObject) +func serializeStatusable(t vocab.Type, includeContext bool) (map[string]interface{}, error) { + statusable, ok := t.(Statusable) if !ok { - return nil, fmt.Errorf("serializeWithObject: could not resolve %T to WithObject", t) + return nil, gtserror.Newf("vocab.Type %T not statusable", t) + } + + var ( + data map[string]interface{} + err error + ) + + if includeContext { + data, err = streams.Serialize(statusable) + } else { + data, err = statusable.Serialize() } - data, err := streams.Serialize(t) if err != nil { return nil, err } - object := withObject.GetActivityStreamsObject() - if object == nil { - // Nothing to do, bail early. 
- return data, nil + NormalizeOutgoingAttachmentProp(statusable, data) + NormalizeOutgoingContentProp(statusable, data) + + return data, nil +} + +func serializeActivityable(t vocab.Type, includeContext bool) (map[string]interface{}, error) { + activityable, ok := t.(Activityable) + if !ok { + return nil, gtserror.Newf("vocab.Type %T not activityable", t) } - objectLen := object.Len() - if objectLen == 0 { - // Nothing to do, bail early. - return data, nil - } + var ( + data map[string]interface{} + err error + ) - // The thing we already serialized has objects - // on it, so we should see if we need to custom - // serialize any of those objects, and replace - // them on the data map as necessary. - objects := make([]interface{}, 0, objectLen) - for iter := object.Begin(); iter != object.End(); iter = iter.Next() { - if iter.IsIRI() { - // Plain IRIs don't need custom serialization. - objects = append(objects, iter.GetIRI().String()) - continue - } - - var ( - objectType = iter.GetType() - objectSer map[string]interface{} - ) - - if objectType == nil { - // This is awkward. - return nil, fmt.Errorf("serializeWithObject: could not resolve object iter %T to vocab.Type", iter) - } - - switch objectType.GetTypeName() { - case ActorApplication, ActorGroup, ActorOrganization, ActorPerson, ActorService: - // @context will be included in wrapping type already, - // we don't need to include it in the object itself. - objectSer, err = serializeAccountable(objectType, false) - default: - // No custom serializer for this type; serialize as normal. - objectSer, err = objectType.Serialize() - } - - if err != nil { - return nil, err - } - - objects = append(objects, objectSer) - } - - if objectLen == 1 { - // Unnest single object. - data["object"] = objects[0] + if includeContext { + data, err = streams.Serialize(activityable) } else { - // Array of objects. 
- data["object"] = objects + data, err = activityable.Serialize() + } + + if err != nil { + return nil, err + } + + if err := NormalizeOutgoingObjectProp(activityable, data); err != nil { + return nil, err } return data, nil diff --git a/internal/gtsmodel/status.go b/internal/gtsmodel/status.go index a009a726d..9b93e34a1 100644 --- a/internal/gtsmodel/status.go +++ b/internal/gtsmodel/status.go @@ -237,3 +237,14 @@ const ( // VisibilityDefault is used when no other setting can be found. VisibilityDefault Visibility = VisibilityUnlocked ) + +// Content models the simple string content +// of a status along with its ContentMap, +// which contains content entries keyed by +// BCP47 language tag. +// +// Content and/or ContentMap may be zero/nil. +type Content struct { + Content string + ContentMap map[string]string +} diff --git a/internal/typeutils/astointernal.go b/internal/typeutils/astointernal.go index 707f51629..c7908ad24 100644 --- a/internal/typeutils/astointernal.go +++ b/internal/typeutils/astointernal.go @@ -244,9 +244,15 @@ func (c *Converter) ASStatusToStatus(ctx context.Context, statusable ap.Statusab } // status.Content + // status.Language // - // The (html-formatted) content of this status. - status.Content = ap.ExtractContent(statusable) + // Many implementations set both content + // and contentMap; we can use these to + // infer the language of the status. + status.Content, status.Language = ContentToContentLanguage( + ctx, + ap.ExtractContent(statusable), + ) // status.Attachments // @@ -396,9 +402,6 @@ func (c *Converter) ASStatusToStatus(ctx context.Context, statusable ap.Statusab return &s }() - // language - // TODO: we might be able to extract this from the contentMap field - // ActivityStreamsType status.ActivityStreamsType = statusable.GetTypeName() @@ -707,7 +710,7 @@ func (c *Converter) ASFlagToReport(ctx context.Context, flaggable ap.Flaggable) // For Mastodon, this will just be a string, or nothing. 
// In Misskey's case, it may also contain the URLs of // one or more reported statuses, so extract these too. - content := ap.ExtractContent(flaggable) + content := ap.ExtractContent(flaggable).Content statusURIs := []*url.URL{} inlineURLs := misskeyReportInlineURLs(content) statusURIs = append(statusURIs, inlineURLs...) diff --git a/internal/typeutils/astointernal_test.go b/internal/typeutils/astointernal_test.go index 10ea422fa..851d57efc 100644 --- a/internal/typeutils/astointernal_test.go +++ b/internal/typeutils/astointernal_test.go @@ -45,6 +45,10 @@ func (suite *ASToInternalTestSuite) jsonToType(in string) vocab.Type { suite.FailNow(err.Error()) } + if statusable, ok := t.(ap.Statusable); ok { + ap.NormalizeIncomingContent(statusable, m) + } + return t } @@ -103,7 +107,8 @@ func (suite *ASToInternalTestSuite) TestParsePublicStatus() { suite.NoError(err) suite.Equal("reading: Punishment and Reward in the Corporate University", status.ContentWarning) - suite.Equal(`

> So we have to examine critical thinking as a signifier, dynamic and ambiguous. It has a normative definition, a tacit definition, and an ideal definition. One of the hallmarks of graduate training is learning to comprehend those definitions and applying the correct one as needed for professional success.

`, status.Content) + suite.Equal(`

> So we have to examine critical thinking as a signifier, dynamic and ambiguous. It has a normative definition, a tacit definition, and an ideal definition. One of the hallmarks of graduate training is learning to comprehend those definitions and applying the correct one as needed for professional success.

`, status.Content) + suite.Equal("en", status.Language) } func (suite *ASToInternalTestSuite) TestParsePublicStatusNoURL() { @@ -117,7 +122,7 @@ func (suite *ASToInternalTestSuite) TestParsePublicStatusNoURL() { suite.NoError(err) suite.Equal("reading: Punishment and Reward in the Corporate University", status.ContentWarning) - suite.Equal(`

> So we have to examine critical thinking as a signifier, dynamic and ambiguous. It has a normative definition, a tacit definition, and an ideal definition. One of the hallmarks of graduate training is learning to comprehend those definitions and applying the correct one as needed for professional success.

`, status.Content) + suite.Equal(`

> So we have to examine critical thinking as a signifier, dynamic and ambiguous. It has a normative definition, a tacit definition, and an ideal definition. One of the hallmarks of graduate training is learning to comprehend those definitions and applying the correct one as needed for professional success.

`, status.Content) // on statuses with no URL in them (like ones we get from pleroma sometimes) we should use the AP URI of the status as URL suite.Equal("http://fossbros-anonymous.io/users/foss_satan/statuses/108138763199405167", status.URL) diff --git a/internal/typeutils/internaltoas.go b/internal/typeutils/internaltoas.go index 16467be40..ff502296b 100644 --- a/internal/typeutils/internaltoas.go +++ b/internal/typeutils/internaltoas.go @@ -607,9 +607,17 @@ func (c *Converter) StatusToAS(ctx context.Context, s *gtsmodel.Status) (ap.Stat // conversation // TODO - // content -- the actual post itself + // content -- the actual post + // itself, plus the language contentProp := streams.NewActivityStreamsContentProperty() contentProp.AppendXMLSchemaString(s.Content) + + if s.Language != "" { + contentProp.AppendRDFLangString(map[string]string{ + s.Language: s.Content, + }) + } + status.SetActivityStreamsContent(contentProp) // attachments diff --git a/internal/typeutils/internaltoas_test.go b/internal/typeutils/internaltoas_test.go index 01dde66fb..878040dcc 100644 --- a/internal/typeutils/internaltoas_test.go +++ b/internal/typeutils/internaltoas_test.go @@ -340,6 +340,9 @@ func (suite *InternalToASTestSuite) TestStatusToAS() { "attributedTo": "http://localhost:8080/users/the_mighty_zork", "cc": "http://localhost:8080/users/the_mighty_zork/followers", "content": "hello everyone!", + "contentMap": { + "en": "hello everyone!" 
+ }, "id": "http://localhost:8080/users/the_mighty_zork/statuses/01F8MHAMCHF6Y650WCRSCP4WMY", "published": "2021-10-20T12:40:37+02:00", "replies": { @@ -379,16 +382,21 @@ func (suite *InternalToASTestSuite) TestStatusWithTagsToASWithIDs() { // http://joinmastodon.org/ns, https://www.w3.org/ns/activitystreams -- // will appear, so trim them out of the string for consistency trimmed := strings.SplitAfter(string(bytes), `"attachment":`)[1] - suite.Equal(` { - "blurhash": "LNJRdVM{00Rj%Mayt7j[4nWBofRj", - "mediaType": "image/jpeg", - "name": "Black and white image of some 50's style text saying: Welcome On Board", - "type": "Document", - "url": "http://localhost:8080/fileserver/01F8MH17FWEB39HZJ76B6VXSKF/attachment/original/01F8MH6NEM8D7527KZAECTCR76.jpg" - }, + suite.Equal(` [ + { + "blurhash": "LNJRdVM{00Rj%Mayt7j[4nWBofRj", + "mediaType": "image/jpeg", + "name": "Black and white image of some 50's style text saying: Welcome On Board", + "type": "Document", + "url": "http://localhost:8080/fileserver/01F8MH17FWEB39HZJ76B6VXSKF/attachment/original/01F8MH6NEM8D7527KZAECTCR76.jpg" + } + ], "attributedTo": "http://localhost:8080/users/admin", "cc": "http://localhost:8080/users/admin/followers", "content": "hello world! #welcome ! first post on the instance :rainbow: !", + "contentMap": { + "en": "hello world! #welcome ! first post on the instance :rainbow: !" 
+ }, "id": "http://localhost:8080/users/admin/statuses/01F8MH75CBF9JFX4ZAD54N0W0R", "published": "2021-10-20T11:36:45Z", "replies": { @@ -446,16 +454,21 @@ func (suite *InternalToASTestSuite) TestStatusWithTagsToASFromDB() { // http://joinmastodon.org/ns, https://www.w3.org/ns/activitystreams -- // will appear, so trim them out of the string for consistency trimmed := strings.SplitAfter(string(bytes), `"attachment":`)[1] - suite.Equal(` { - "blurhash": "LNJRdVM{00Rj%Mayt7j[4nWBofRj", - "mediaType": "image/jpeg", - "name": "Black and white image of some 50's style text saying: Welcome On Board", - "type": "Document", - "url": "http://localhost:8080/fileserver/01F8MH17FWEB39HZJ76B6VXSKF/attachment/original/01F8MH6NEM8D7527KZAECTCR76.jpg" - }, + suite.Equal(` [ + { + "blurhash": "LNJRdVM{00Rj%Mayt7j[4nWBofRj", + "mediaType": "image/jpeg", + "name": "Black and white image of some 50's style text saying: Welcome On Board", + "type": "Document", + "url": "http://localhost:8080/fileserver/01F8MH17FWEB39HZJ76B6VXSKF/attachment/original/01F8MH6NEM8D7527KZAECTCR76.jpg" + } + ], "attributedTo": "http://localhost:8080/users/admin", "cc": "http://localhost:8080/users/admin/followers", "content": "hello world! #welcome ! first post on the instance :rainbow: !", + "contentMap": { + "en": "hello world! #welcome ! first post on the instance :rainbow: !" + }, "id": "http://localhost:8080/users/admin/statuses/01F8MH75CBF9JFX4ZAD54N0W0R", "published": "2021-10-20T11:36:45Z", "replies": { @@ -519,6 +532,9 @@ func (suite *InternalToASTestSuite) TestStatusToASWithMentions() { "http://localhost:8080/users/the_mighty_zork" ], "content": "hi @the_mighty_zork welcome to the instance!", + "contentMap": { + "en": "hi @the_mighty_zork welcome to the instance!" 
+ }, "id": "http://localhost:8080/users/admin/statuses/01FF25D5Q0DH7CHD57CTRS6WK0", "inReplyTo": "http://localhost:8080/users/the_mighty_zork/statuses/01F8MHAMCHF6Y650WCRSCP4WMY", "published": "2021-11-20T13:32:16Z", diff --git a/internal/typeutils/util.go b/internal/typeutils/util.go index a19588221..8a8d4123b 100644 --- a/internal/typeutils/util.go +++ b/internal/typeutils/util.go @@ -31,6 +31,8 @@ import ( apimodel "github.com/superseriousbusiness/gotosocial/internal/api/model" "github.com/superseriousbusiness/gotosocial/internal/config" "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" + "github.com/superseriousbusiness/gotosocial/internal/language" + "github.com/superseriousbusiness/gotosocial/internal/log" "github.com/superseriousbusiness/gotosocial/internal/regexes" "github.com/superseriousbusiness/gotosocial/internal/text" ) @@ -184,3 +186,102 @@ func placeholdUnknownAttachments(arr []apimodel.Attachment) (string, []apimodel. return text.SanitizeToHTML(aside.String()), arr } + +// ContentToContentLanguage tries to +// extract a content string and language +// tag string from the given intermediary +// content. +// +// Either/both of the returned strings may +// be empty, depending on how things go. +func ContentToContentLanguage( + ctx context.Context, + content gtsmodel.Content, +) ( + string, // content + string, // language +) { + var ( + contentStr string + langTagStr string + ) + + switch contentMap := content.ContentMap; { + // Simplest case: no `contentMap`. + // Return `content`, even if empty. + case contentMap == nil: + return content.Content, "" + + // `content` and `contentMap` set. + // Try to infer "primary" language. + case content.Content != "": + // Assume `content` is intended + // primary content, and look for + // corresponding language tag. 
+ contentStr = content.Content + + for t, c := range contentMap { + if contentStr == c { + langTagStr = t + break + } + } + + // `content` not set; `contentMap` + // is set with only one value. + // This must be the "primary" lang. + case len(contentMap) == 1: + // Use an empty loop to + // get the values we want. + // nolint:revive + for langTagStr, contentStr = range contentMap { + } + + // Only `contentMap` is set, with more + // than one value. Map order is not + // guaranteed so we can't know the + // "primary" language. + // + // Try to select content using our + // instance's configured languages. + // + // In case of no hits, just take the + // first tag and content in the map. + default: + instanceLangs := config.GetInstanceLanguages() + for _, langTagStr = range instanceLangs.TagStrs() { + if contentStr = contentMap[langTagStr]; contentStr != "" { + // Hit! + break + } + } + + // If nothing found, just take + // the first entry we can get by + // breaking after the first iter. + if contentStr == "" { + for langTagStr, contentStr = range contentMap { + break + } + } + } + + if langTagStr != "" { + // Found a lang tag for this content, + // make sure it's valid / parseable. + lang, err := language.Parse(langTagStr) + if err != nil { + log.Warnf( + ctx, + "could not parse %s as BCP47 language tag in status contentMap: %v", + langTagStr, err, + ) + } else { + // Inferred the language! + // Use normalized version. 
+ langTagStr = lang.TagStr + } + } + + return contentStr, langTagStr +} diff --git a/internal/typeutils/util_test.go b/internal/typeutils/util_test.go index e6610574b..0f852d399 100644 --- a/internal/typeutils/util_test.go +++ b/internal/typeutils/util_test.go @@ -18,7 +18,12 @@ package typeutils import ( + "context" "testing" + + "github.com/superseriousbusiness/gotosocial/internal/config" + "github.com/superseriousbusiness/gotosocial/internal/gtsmodel" + "github.com/superseriousbusiness/gotosocial/internal/language" ) func TestMisskeyReportContentURLs1(t *testing.T) { @@ -44,3 +49,112 @@ misskey-formatted` t.Fatalf("wanted 0 urls, got %d", l) } } + +func TestContentToContentLanguage(t *testing.T) { + type testcase struct { + content gtsmodel.Content + instanceLanguages language.Languages + expectedContent string + expectedLang string + } + + ctx, cncl := context.WithCancel(context.Background()) + defer cncl() + + for i, testcase := range []testcase{ + { + content: gtsmodel.Content{ + Content: "hello world", + ContentMap: nil, + }, + expectedContent: "hello world", + expectedLang: "", + }, + { + content: gtsmodel.Content{ + Content: "", + ContentMap: map[string]string{ + "en": "hello world", + }, + }, + expectedContent: "hello world", + expectedLang: "en", + }, + { + content: gtsmodel.Content{ + Content: "bonjour le monde", + ContentMap: map[string]string{ + "en": "hello world", + "fr": "bonjour le monde", + }, + }, + expectedContent: "bonjour le monde", + expectedLang: "fr", + }, + { + content: gtsmodel.Content{ + Content: "bonjour le monde", + ContentMap: map[string]string{ + "en": "hello world", + }, + }, + expectedContent: "bonjour le monde", + expectedLang: "", + }, + { + content: gtsmodel.Content{ + Content: "", + ContentMap: map[string]string{ + "en": "hello world", + "ru": "Привет, мир!", + "nl": "hallo wereld!", + "ca": "Hola món!", + }, + }, + instanceLanguages: language.Languages{ + {TagStr: "en"}, + {TagStr: "ca"}, + }, + expectedContent: "hello 
world", + expectedLang: "en", + }, + { + content: gtsmodel.Content{ + Content: "", + ContentMap: map[string]string{ + "en": "hello world", + "ru": "Привет, мир!", + "nl": "hallo wereld!", + "ca": "Hola món!", + }, + }, + instanceLanguages: language.Languages{ + {TagStr: "ca"}, + {TagStr: "en"}, + }, + expectedContent: "Hola món!", + expectedLang: "ca", + }, + } { + langs, err := language.InitLangs(testcase.instanceLanguages.TagStrs()) + if err != nil { + t.Fatal(err) + } + config.SetInstanceLanguages(langs) + + content, language := ContentToContentLanguage(ctx, testcase.content) + if content != testcase.expectedContent { + t.Errorf( + "test %d expected content '%s' got '%s'", + i, testcase.expectedContent, content, + ) + } + + if language != testcase.expectedLang { + t.Errorf( + "test %d expected language '%s' got '%s'", + i, testcase.expectedLang, language, + ) + } + } +} diff --git a/internal/typeutils/wrap_test.go b/internal/typeutils/wrap_test.go index 9d6d95983..453073ed6 100644 --- a/internal/typeutils/wrap_test.go +++ b/internal/typeutils/wrap_test.go @@ -85,6 +85,9 @@ func (suite *WrapTestSuite) TestWrapNoteInCreate() { "attributedTo": "http://localhost:8080/users/the_mighty_zork", "cc": "http://localhost:8080/users/the_mighty_zork/followers", "content": "hello everyone!", + "contentMap": { + "en": "hello everyone!" + }, "id": "http://localhost:8080/users/the_mighty_zork/statuses/01F8MHAMCHF6Y650WCRSCP4WMY", "published": "2021-10-20T12:40:37+02:00", "replies": {