gotosocial/internal/filter/spam/statusable.go
tobi 9cadc764b3
[feature] Add experimental instance-federation-spam-filter option (#2685)
* [chore] Move `visibility` to `filter/visibility`

* [feature] Add experimental instance-federation-spam-filter option
2024-02-27 12:22:05 +00:00

473 lines
13 KiB
Go

// GoToSocial
// Copyright (C) GoToSocial Authors admin@gotosocial.org
// SPDX-License-Identifier: AGPL-3.0-or-later
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <http://www.gnu.org/licenses/>.
package spam
import (
"context"
"errors"
"net/url"
"slices"
"strings"
"github.com/miekg/dns"
"github.com/superseriousbusiness/gotosocial/internal/ap"
"github.com/superseriousbusiness/gotosocial/internal/config"
"github.com/superseriousbusiness/gotosocial/internal/db"
"github.com/superseriousbusiness/gotosocial/internal/gtscontext"
"github.com/superseriousbusiness/gotosocial/internal/gtserror"
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
"github.com/superseriousbusiness/gotosocial/internal/log"
"github.com/superseriousbusiness/gotosocial/internal/regexes"
"github.com/superseriousbusiness/gotosocial/internal/util"
)
// preppedMention bundles a mention together with
// the pieces of it that have already been parsed
// out for the benefit of the spam checks.
type preppedMention struct {
	// The original mention, embedded untouched.
	*gtsmodel.Mention

	// Parsed form of the mention's TargetAccountURI.
	// Stays nil if the mention has no such URI.
	uri *url.URL

	// Domain and username parts of the mention, as
	// extracted from its namestring. If the namestring
	// gave no domain, the host of uri is used instead.
	domain, user string

	// True if domain equals this instance's
	// configured host or account-domain.
	local bool
}
// StatusableOK decides whether the given statusable should be
// accepted on behalf of the receiver. A nil return means it looks
// OK: relevant to the receiver, and not spam.
//
// Only use this for Creates of statusables, NOT for Announces!
//
// A statusable that fails the relevancy or spam checks produces an
// error flagged as NotRelevant or Spam respectively; callers should
// use gtserror.IsNotRelevant() and gtserror.IsSpam() to tell them
// apart. Any other non-nil error is an actual database error.
//
// The checks run in the following order, and the first one
// that reaches a verdict ends the function:
//
//  1. Receiver follows requester. Return nil.
//  2. Statusable doesn't mention receiver. Return NotRelevant.
//
// If instance-federation-spam-filter = false, return nil at
// this point. Otherwise continue with:
//
//  3. Receiver is locked and is followed by requester. Return nil.
//  4. Five or more people are mentioned. Return Spam.
//  5. Receiver follows (or has requested to follow) a mentioned account. Return nil.
//  6. Statusable has a media attachment. Return Spam.
//  7. Statusable contains non-mention, non-hashtag links. Return Spam.
func (f *Filter) StatusableOK(
	ctx context.Context,
	receiver *gtsmodel.Account,
	requester *gtsmodel.Account,
	statusable ap.Statusable,
) error {
	// HEURISTIC 1: a message from somebody the receiver
	// follows is always wanted; nothing else to check.
	if receiverFollows, err := f.state.DB.IsFollowing(ctx, receiver.ID, requester.ID); err != nil {
		return gtserror.Newf("db error checking follow status: %w", err)
	} else if receiverFollows {
		return nil
	}

	// HEURISTIC 2: the requester isn't followed, so the
	// message is only of interest if it mentions the receiver.
	// Whatever mentions could be extracted are used; the
	// second return value of the extractor is discarded.
	extracted, _ := ap.ExtractMentions(statusable)
	targets := prepMentions(ctx, extracted)
	if !f.isMentioned(ctx, receiver, targets) {
		// Just some random message that
		// landed in our inbox, drop it.
		return gtserror.SetNotRelevant(
			errors.New("receiver does not follow requester, and is not mentioned"),
		)
	}

	// The receiver is mentioned by someone they don't
	// follow. Unless the more granular spam filter is
	// switched on, that's enough to let the message in.
	if !config.GetInstanceFederationSpamFilter() {
		return nil
	}

	// HEURISTIC 3: a locked receiver who is followed by
	// the requester has probably approved that follow,
	// so they're likely interested in the message.
	lockedAndFollowed, err := f.lockedFollowedBy(ctx, receiver, requester)
	if err != nil {
		return gtserror.Newf("db error checking follow status: %w", err)
	}
	if lockedAndFollowed {
		return nil
	}

	// HEURISTIC 4: count the mentions that survived
	// prepping (this includes the receiver themself).
	// Five or more is assumed to be spam.
	if len(targets) >= 5 {
		return gtserror.SetSpam(
			errors.New("status mentions 5 or more people"),
		)
	}

	// HEURISTIC 5: with four or fewer mentions, the
	// message is probably wanted if the receiver follows
	// (or has requested to follow) one of those mentioned.
	if f.knowsOneMentioned(ctx, receiver, targets) {
		return nil
	}

	// HEURISTIC 6: any media attachment
	// at this point is treated as spam.
	if media, _ := ap.ExtractAttachments(statusable); len(media) != 0 {
		return gtserror.SetSpam(
			errors.New("status has attachment(s)"),
		)
	}

	// HEURISTIC 7: look through content and summary /
	// content warning for links that are neither
	// mention links nor hashtag links.
	tags, _ := ap.ExtractHashtags(statusable)
	if f.errantLinks(ctx, statusable, targets, tags) {
		return gtserror.SetSpam(
			errors.New("status has one or more non-mention, non-hashtag links"),
		)
	}

	// Nothing tripped, looks OK.
	return nil
}
// prepMentions converts raw mentions into preppedMentions
// by parsing each one's namestring and target account URI
// (where set), and working out whether it targets a local
// account.
//
// A mention whose namestring or URI fails to parse is logged
// at debug level and left out of the result, so the returned
// slice can be shorter than the input.
func prepMentions(
	ctx context.Context,
	mentions []*gtsmodel.Mention,
) []preppedMention {
	// A mention counts as local if its domain
	// matches either of these configured values.
	ourHost := config.GetHost()
	ourAccountDomain := config.GetAccountDomain()

	prepped := make([]preppedMention, 0, len(mentions))

	for i := range mentions {
		// Embed the original mention; parsed
		// fields get filled in as we go.
		pm := preppedMention{Mention: mentions[i]}

		// Namestring first, if there is one.
		if namestring := pm.NameString; namestring != "" {
			user, domain, err := util.ExtractNamestringParts(namestring)
			if err != nil {
				// Malformed, log and
				// skip this mention.
				log.Debugf(ctx,
					"malformed mention namestring: %v",
					err,
				)
				continue
			}

			pm.user, pm.domain = user, domain
		}

		// Then the target URI, if there is one.
		if rawURI := pm.TargetAccountURI; rawURI != "" {
			parsed, err := url.Parse(rawURI)
			if err != nil {
				// Malformed, log and
				// skip this mention.
				log.Debugf(ctx,
					"malformed mention uri: %v",
					err,
				)
				continue
			}

			pm.uri = parsed

			// The namestring takes precedence for
			// the domain; the URI host is only used
			// when the namestring didn't supply one.
			if pm.domain == "" {
				pm.domain = parsed.Host
			}
		}

		// Local if the domain we
		// ended up with is ours.
		switch pm.domain {
		case ourHost, ourAccountDomain:
			pm.local = true
		}

		prepped = append(prepped, pm)
	}

	return prepped
}
// isMentioned reports whether at least one
// of the given mentions targets the receiver.
//
// A mention targets the receiver if its target
// account URI is exactly the receiver's URI or
// URL, or if it is a local mention whose username
// matches the receiver's (ignoring case).
func (f *Filter) isMentioned(
	ctx context.Context,
	receiver *gtsmodel.Account,
	mentions []preppedMention,
) bool {
	for _, m := range mentions {
		switch target := m.TargetAccountURI; {
		// Mentioned by URI.
		case target != "" && (target == receiver.URI || target == receiver.URL):
			return true

		// Mentioned by namestring.
		case m.local && strings.EqualFold(m.user, receiver.Username):
			return true
		}
	}

	// No mention targets receiver.
	return false
}
// lockedFollowedBy reports whether the receiver
// account is locked AND the requester follows it.
//
// If the receiver is not locked, or its Locked flag
// is unset (nil), the result is (false, nil) and no
// database call is made. Otherwise the result and
// error of the follow lookup are returned as-is.
func (f *Filter) lockedFollowedBy(
	ctx context.Context,
	receiver *gtsmodel.Account,
	requester *gtsmodel.Account,
) (bool, error) {
	// Locked is a pointer, so check it's set before
	// dereferencing; a nil flag is treated the same
	// as "not locked" rather than causing a panic.
	// Returning early here also avoids a db call.
	if receiver.Locked == nil || !*receiver.Locked {
		return false, nil
	}

	// Note the argument order: does
	// the REQUESTER follow the RECEIVER?
	return f.state.DB.IsFollowing(ctx, requester.ID, receiver.ID)
}
// knowsOneMentioned reports whether the receiver follows,
// or has sent a follow request to, at least one of the
// mentioned accounts (the receiver's own account excluded).
//
// Mentioned accounts are only looked up in the database.
// Database errors are logged, and the mention in question
// is then treated as not known.
func (f *Filter) knowsOneMentioned(
	ctx context.Context,
	receiver *gtsmodel.Account,
	mentions []preppedMention,
) bool {
	// knows is the per-mention check; the
	// first mention it accepts settles it.
	knows := func(m preppedMention) bool {
		var (
			target *gtsmodel.Account
			err    error
		)

		// Look for the target in our db only, no
		// dereferencing: an account we haven't stored
		// can't be one the receiver follows. Prefer
		// lookup by URI, fall back to username + domain.
		switch {
		case m.TargetAccountURI != "":
			target, err = f.state.DB.GetAccountByURI(
				gtscontext.SetBarebones(ctx),
				m.TargetAccountURI,
			)

		case m.user != "":
			target, err = f.state.DB.GetAccountByUsernameDomain(
				gtscontext.SetBarebones(ctx),
				m.user,
				m.domain,
			)
		}

		switch {
		case err != nil && !errors.Is(err, db.ErrNoEntries):
			// Real error, as opposed
			// to just "not found".
			log.Errorf(ctx, "db error getting mentioned account: %v", err)
			return false

		case target == nil:
			// Not in our db, so
			// not known to receiver.
			return false

		case target.ID == receiver.ID:
			// Receiver themself
			// doesn't count.
			return false
		}

		// Does receiver follow the target?
		if following, err := f.state.DB.IsFollowing(ctx, receiver.ID, target.ID); err != nil {
			log.Errorf(ctx, "db error checking follow status: %v", err)
			return false
		} else if following {
			return true
		}

		// No follow; is there a pending follow
		// request from receiver to target instead?
		requested, err := f.state.DB.IsFollowRequested(ctx, receiver.ID, target.ID)
		if err != nil {
			log.Errorf(ctx, "db error checking follow req status: %v", err)
			return false
		}

		return requested
	}

	return slices.ContainsFunc(mentions, knows)
}
// errantLinks reports whether the statusable's content or
// summary (content warning) contains at least one link that
// can't be explained as a hashtag link or a mention link.
//
// A link counts as a hashtag link if it equals the href of
// one of the given hashtags (ignoring case). It counts as a
// mention link if it equals a mention's target account URI
// (ignoring case), or if its host matches, or shares at least
// two right-most domain labels with, a mention's domain.
//
// Links that fail to parse are logged at debug
// level and otherwise ignored.
func (f *Filter) errantLinks(
	ctx context.Context,
	statusable ap.Statusable,
	mentions []preppedMention,
	hashtags []*gtsmodel.Tag,
) bool {
	// Search summary and content in one
	// go by gluing them together first.
	summary := ap.ExtractSummary(statusable)
	body := ap.ExtractContent(statusable)
	text := summary + " " + body.Content

	// Keep the raw matched string next to the
	// parsed URI, so that there's no need to
	// stringify the URI again for comparisons.
	type parsedLink struct {
		raw string
		uri *url.URL
	}

	// Phase one: find every link in
	// the text and parse each of them.
	matches := regexes.LinkScheme.FindAllString(text, -1)
	parsed := make([]parsedLink, 0, len(matches))
	for _, raw := range matches {
		uri, err := url.Parse(raw)
		if err != nil {
			log.Debugf(ctx,
				"malformed link in status: %v",
				err,
			)

			// Bad links play no
			// part in spam checking.
			continue
		}

		parsed = append(parsed, parsedLink{raw: raw, uri: uri})
	}

	// Phase two: every parsed link must be accounted for
	// by a hashtag or a mention. The first one that isn't
	// accounted for makes the whole status errant.
	for _, link := range parsed {
		// True if link goes straight
		// to the href of the hashtag.
		isHashtagLink := func(tag *gtsmodel.Tag) bool {
			return strings.EqualFold(link.raw, tag.Href)
		}

		// True if link plausibly
		// belongs to the mention.
		isMentionLink := func(m preppedMention) bool {
			switch {
			// Link is exactly the URI
			// of the mentioned account.
			case strings.EqualFold(link.raw, m.TargetAccountURI):
				return true

			// Link might be to the account URL rather than
			// its URI. The format of such URLs can't be
			// predicted, and rebuilding them while allowing
			// for different host + account-domain values is
			// hard, so settle for comparing hosts instead.
			case link.uri.Host == m.domain:
				return true

			// Not the same host, but it still passes if
			// at least two labels, counted from the right,
			// are shared with the mention's domain.
			//
			// NOTE(review): this presumably also matches
			// unrelated hosts under a two-label suffix
			// such as "co.uk" -- confirm that's acceptable.
			default:
				return dns.CompareDomainName(link.uri.Host, m.domain) >= 2
			}
		}

		// Hashtags are checked first; mentions are
		// only consulted if no hashtag matched.
		if slices.ContainsFunc(hashtags, isHashtagLink) ||
			slices.ContainsFunc(mentions, isMentionLink) {
			// Accounted for, next link.
			continue
		}

		// Neither a hashtag link
		// nor a mention link.
		return true
	}

	// Either every link was accounted
	// for, or there were no links at all.
	return false
}