mirror of
https://github.com/superseriousbusiness/gotosocial.git
synced 2025-01-26 16:08:08 +00:00
[bugfix] Fix unicode-unaware word boundary check in hashtags (#1049)
* [bugfix] Fix unicode-unaware word boundary check in hashtag regex

  Go's `\b` is not Unicode-aware, and without lookahead the workarounds got very ugly, so I replaced the regex with a parser. The parser runs in O(n) time, so performance should not be affected.

* [bugfix] Add back hashtag max length and add tests for it
This commit is contained in:
parent
fece7fa706
commit
52109776f6
4 changed files with 146 additions and 45 deletions
|
@ -47,7 +47,6 @@ const (
|
||||||
const (
|
const (
|
||||||
maximumUsernameLength = 64
|
maximumUsernameLength = 64
|
||||||
maximumEmojiShortcodeLength = 30
|
maximumEmojiShortcodeLength = 30
|
||||||
maximumHashtagLength = 30
|
|
||||||
)
|
)
|
||||||
|
|
||||||
var (
|
var (
|
||||||
|
@ -66,17 +65,11 @@ var (
|
||||||
// such as @whatever_user@example.org, returning whatever_user and example.org (without the @ symbols)
|
// such as @whatever_user@example.org, returning whatever_user and example.org (without the @ symbols)
|
||||||
MentionName = regexp.MustCompile(mentionName)
|
MentionName = regexp.MustCompile(mentionName)
|
||||||
|
|
||||||
// mention regex can be played around with here: https://regex101.com/r/G1oGR0/1
|
// mention regex can be played around with here: https://regex101.com/r/P0vpYG/1
|
||||||
mentionFinder = `(?:^|\s)(@\w+(?:@[a-zA-Z0-9_\-\.]+)?)`
|
mentionFinder = `(?:^|\s)(@\w+(?:@[a-zA-Z0-9_\-\.]+)?)`
|
||||||
// MentionFinder extracts mentions from a piece of text.
|
// MentionFinder extracts mentions from a piece of text.
|
||||||
MentionFinder = regexp.MustCompile(mentionFinder)
|
MentionFinder = regexp.MustCompile(mentionFinder)
|
||||||
|
|
||||||
// hashtag regex can be played with here: https://regex101.com/r/bpyGlj/1
|
|
||||||
hashtagFinder = fmt.Sprintf(`(?:^|\s)(?:#*)(#[\p{L}\p{N}]{1,%d})(?:#|\b)`, maximumHashtagLength)
|
|
||||||
// HashtagFinder finds possible hashtags in a string.
|
|
||||||
// It returns just the string part of the hashtag, not the # symbol.
|
|
||||||
HashtagFinder = regexp.MustCompile(hashtagFinder)
|
|
||||||
|
|
||||||
emojiShortcode = fmt.Sprintf(`\w{2,%d}`, maximumEmojiShortcodeLength)
|
emojiShortcode = fmt.Sprintf(`\w{2,%d}`, maximumEmojiShortcodeLength)
|
||||||
// EmojiShortcode validates an emoji name.
|
// EmojiShortcode validates an emoji name.
|
||||||
EmojiShortcode = regexp.MustCompile(fmt.Sprintf("^%s$", emojiShortcode))
|
EmojiShortcode = regexp.MustCompile(fmt.Sprintf("^%s$", emojiShortcode))
|
||||||
|
|
|
@ -27,36 +27,46 @@ import (
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
|
"github.com/superseriousbusiness/gotosocial/internal/gtsmodel"
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/log"
|
"github.com/superseriousbusiness/gotosocial/internal/log"
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/regexes"
|
"github.com/superseriousbusiness/gotosocial/internal/regexes"
|
||||||
|
"github.com/superseriousbusiness/gotosocial/internal/util"
|
||||||
)
|
)
|
||||||
|
|
||||||
func (f *formatter) ReplaceTags(ctx context.Context, in string, tags []*gtsmodel.Tag) string {
|
func (f *formatter) ReplaceTags(ctx context.Context, in string, tags []*gtsmodel.Tag) string {
|
||||||
return regexes.ReplaceAllStringFunc(regexes.HashtagFinder, in, func(match string, buf *bytes.Buffer) string {
|
spans := util.FindHashtagSpansInText(in)
|
||||||
// we have a match
|
|
||||||
matchTrimmed := strings.TrimSpace(match)
|
if len(spans) == 0 {
|
||||||
tagAsEntered := matchTrimmed[1:]
|
return in
|
||||||
|
}
|
||||||
|
|
||||||
|
var b strings.Builder
|
||||||
|
i := 0
|
||||||
|
|
||||||
|
spans:
|
||||||
|
for _, t := range spans {
|
||||||
|
b.WriteString(in[i:t.First])
|
||||||
|
i = t.Second
|
||||||
|
tagAsEntered := in[t.First+1 : t.Second]
|
||||||
|
|
||||||
// check through the tags to find what we're matching
|
|
||||||
for _, tag := range tags {
|
for _, tag := range tags {
|
||||||
if strings.EqualFold(tagAsEntered, tag.Name) {
|
if strings.EqualFold(tagAsEntered, tag.Name) {
|
||||||
// Add any dropped space from match
|
|
||||||
if unicode.IsSpace(rune(match[0])) {
|
|
||||||
buf.WriteByte(match[0])
|
|
||||||
}
|
|
||||||
|
|
||||||
// replace the #tag with the formatted tag content
|
// replace the #tag with the formatted tag content
|
||||||
// `<a href="tag.URL" class="mention hashtag" rel="tag">#<span>tagAsEntered</span></a>
|
// `<a href="tag.URL" class="mention hashtag" rel="tag">#<span>tagAsEntered</span></a>
|
||||||
buf.WriteString(`<a href="`)
|
b.WriteString(`<a href="`)
|
||||||
buf.WriteString(tag.URL)
|
b.WriteString(tag.URL)
|
||||||
buf.WriteString(`" class="mention hashtag" rel="tag">#<span>`)
|
b.WriteString(`" class="mention hashtag" rel="tag">#<span>`)
|
||||||
buf.WriteString(tagAsEntered)
|
b.WriteString(tagAsEntered)
|
||||||
buf.WriteString(`</span></a>`)
|
b.WriteString(`</span></a>`)
|
||||||
return buf.String()
|
continue spans
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// the match wasn't in the list of tags for whatever reason, so just return the match as we found it so nothing changes
|
b.WriteString(in[t.First:t.Second])
|
||||||
return match
|
}
|
||||||
})
|
|
||||||
|
// Get the last bits.
|
||||||
|
i = spans[len(spans)-1].Second
|
||||||
|
b.WriteString(in[i:])
|
||||||
|
|
||||||
|
return b.String()
|
||||||
}
|
}
|
||||||
|
|
||||||
func (f *formatter) ReplaceMentions(ctx context.Context, in string, mentions []*gtsmodel.Mention) string {
|
func (f *formatter) ReplaceMentions(ctx context.Context, in string, mentions []*gtsmodel.Mention) string {
|
||||||
|
|
|
@ -19,11 +19,16 @@
|
||||||
package util
|
package util
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"strings"
|
"unicode"
|
||||||
|
"unicode/utf8"
|
||||||
|
|
||||||
"github.com/superseriousbusiness/gotosocial/internal/regexes"
|
"github.com/superseriousbusiness/gotosocial/internal/regexes"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
const (
|
||||||
|
maximumHashtagLength = 30
|
||||||
|
)
|
||||||
|
|
||||||
// DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text,
|
// DeriveMentionNamesFromText takes a plaintext (ie., not html-formatted) text,
|
||||||
// and applies a regex to it to return a deduplicated list of account names
|
// and applies a regex to it to return a deduplicated list of account names
|
||||||
// mentioned in that text, in the format "@user@example.org" or "@username" for
|
// mentioned in that text, in the format "@user@example.org" or "@username" for
|
||||||
|
@ -36,16 +41,71 @@ func DeriveMentionNamesFromText(text string) []string {
|
||||||
return UniqueStrings(mentionedAccounts)
|
return UniqueStrings(mentionedAccounts)
|
||||||
}
|
}
|
||||||
|
|
||||||
// DeriveHashtagsFromText takes a plaintext (ie., not html-formatted) text,
|
type Pair[A, B any] struct {
|
||||||
// and applies a regex to it to return a deduplicated list of hashtags
|
First A
|
||||||
// used in that text, without the leading #. The case of the returned
|
Second B
|
||||||
// tags will be lowered, for consistency.
|
}
|
||||||
|
|
||||||
|
// Byte index in original string
|
||||||
|
// `First` includes `#`.
|
||||||
|
type Span = Pair[int, int]
|
||||||
|
|
||||||
|
// Takes a plaintext (ie., not HTML-formatted) text,
|
||||||
|
// and returns a slice of unique hashtags.
|
||||||
func DeriveHashtagsFromText(text string) []string {
|
func DeriveHashtagsFromText(text string) []string {
|
||||||
|
tagsMap := make(map[string]bool)
|
||||||
tags := []string{}
|
tags := []string{}
|
||||||
for _, m := range regexes.HashtagFinder.FindAllStringSubmatch(text, -1) {
|
|
||||||
tags = append(tags, strings.TrimPrefix(m[1], "#"))
|
for _, v := range FindHashtagSpansInText(text) {
|
||||||
|
t := text[v.First+1 : v.Second]
|
||||||
|
if _, value := tagsMap[t]; !value {
|
||||||
|
tagsMap[t] = true
|
||||||
|
tags = append(tags, t)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return tags
|
||||||
|
}
|
||||||
|
|
||||||
|
// Takes a plaintext (ie., not HTML-formatted) text,
|
||||||
|
// and returns a list of pairs of indices into the original string, where
|
||||||
|
// hashtags are located.
|
||||||
|
func FindHashtagSpansInText(text string) []Span {
|
||||||
|
tags := []Span{}
|
||||||
|
start := 0
|
||||||
|
// Keep one rune of lookbehind.
|
||||||
|
prev := ' '
|
||||||
|
inTag := false
|
||||||
|
|
||||||
|
for i, r := range text {
|
||||||
|
if r == '#' && isHashtagBoundary(prev) {
|
||||||
|
// Start of hashtag.
|
||||||
|
inTag = true
|
||||||
|
start = i
|
||||||
|
} else if inTag && !isPermittedInHashtag(r) && !isHashtagBoundary(r) {
|
||||||
|
// Inside the hashtag, but it was a phoney, gottem.
|
||||||
|
inTag = false
|
||||||
|
} else if inTag && isHashtagBoundary(r) {
|
||||||
|
// End of hashtag.
|
||||||
|
inTag = false
|
||||||
|
appendTag(&tags, text, start, i)
|
||||||
|
} else if irl := i + utf8.RuneLen(r); inTag && irl == len(text) {
|
||||||
|
// End of text.
|
||||||
|
appendTag(&tags, text, start, irl)
|
||||||
|
}
|
||||||
|
|
||||||
|
prev = r
|
||||||
|
}
|
||||||
|
|
||||||
|
return tags
|
||||||
|
}
|
||||||
|
|
||||||
|
func appendTag(tags *[]Span, text string, start int, end int) {
|
||||||
|
l := end - start - 1
|
||||||
|
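// This check could be moved out into the parsing loop if necessary! NOTE(review): l is a byte length (start/end are byte offsets), not a rune count, so tags with multi-byte characters reach the limit sooner; confirm this is intended.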
|
||||||
|
if 0 < l && l <= maximumHashtagLength {
|
||||||
|
*tags = append(*tags, Span{First: start, Second: end})
|
||||||
}
|
}
|
||||||
return UniqueStrings(tags)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text,
|
// DeriveEmojisFromText takes a plaintext (ie., not html-formatted) text,
|
||||||
|
@ -58,3 +118,17 @@ func DeriveEmojisFromText(text string) []string {
|
||||||
}
|
}
|
||||||
return UniqueStrings(emojis)
|
return UniqueStrings(emojis)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func isPermittedInHashtag(r rune) bool {
|
||||||
|
return unicode.IsLetter(r) || unicode.IsNumber(r)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Decides where to break before or after a hashtag.
|
||||||
|
func isHashtagBoundary(r rune) bool {
|
||||||
|
return r == '#' || // `###lol` should work
|
||||||
|
unicode.IsSpace(r) || // All kinds of Unicode whitespace.
|
||||||
|
unicode.IsControl(r) || // All kinds of control characters, like tab.
|
||||||
|
// Most kinds of punctuation except "Pc" ("Punctuation, connecting", like `_`).
|
||||||
|
// But `someurl/#fragment` should not match, neither should HTML entities like `&#35;`.
|
||||||
|
('/' != r && '&' != r && !unicode.Is(unicode.Categories["Pc"], r) && unicode.IsPunct(r))
|
||||||
|
}
|
||||||
|
|
|
@ -77,26 +77,50 @@ func (suite *StatusTestSuite) TestDeriveHashtagsOK() {
|
||||||
|
|
||||||
# testing this one shouldn't work
|
# testing this one shouldn't work
|
||||||
|
|
||||||
#thisshouldwork
|
#thisshouldwork #dupe #dupe!! #dupe
|
||||||
|
|
||||||
here's a link with a fragment: https://example.org/whatever#ahhh
|
here's a link with a fragment: https://example.org/whatever#ahhh
|
||||||
|
here's another link with a fragment: https://example.org/whatever/#ahhh
|
||||||
|
|
||||||
#ThisShouldAlsoWork #not_this_though
|
(#ThisShouldAlsoWork) #not_this_though
|
||||||
|
|
||||||
#111111 thisalsoshouldn'twork#### ##
|
#111111 thisalsoshouldn'twork#### ##
|
||||||
|
|
||||||
#alimentación, #saúde
|
#alimentación, #saúde, #lävistää, #ö, #네
|
||||||
|
#ThisOneIsThirtyOneCharactersLon... ...ng
|
||||||
|
#ThisOneIsThirteyCharactersLong
|
||||||
`
|
`
|
||||||
|
|
||||||
tags := util.DeriveHashtagsFromText(statusText)
|
tags := util.DeriveHashtagsFromText(statusText)
|
||||||
assert.Len(suite.T(), tags, 7)
|
assert.Len(suite.T(), tags, 12)
|
||||||
assert.Equal(suite.T(), "testing123", tags[0])
|
assert.Equal(suite.T(), "testing123", tags[0])
|
||||||
assert.Equal(suite.T(), "also", tags[1])
|
assert.Equal(suite.T(), "also", tags[1])
|
||||||
assert.Equal(suite.T(), "thisshouldwork", tags[2])
|
assert.Equal(suite.T(), "thisshouldwork", tags[2])
|
||||||
assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[3])
|
assert.Equal(suite.T(), "dupe", tags[3])
|
||||||
assert.Equal(suite.T(), "111111", tags[4])
|
assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[4])
|
||||||
assert.Equal(suite.T(), "alimentación", tags[5])
|
assert.Equal(suite.T(), "111111", tags[5])
|
||||||
assert.Equal(suite.T(), "saúde", tags[6])
|
assert.Equal(suite.T(), "alimentación", tags[6])
|
||||||
|
assert.Equal(suite.T(), "saúde", tags[7])
|
||||||
|
assert.Equal(suite.T(), "lävistää", tags[8])
|
||||||
|
assert.Equal(suite.T(), "ö", tags[9])
|
||||||
|
assert.Equal(suite.T(), "네", tags[10])
|
||||||
|
assert.Equal(suite.T(), "ThisOneIsThirteyCharactersLong", tags[11])
|
||||||
|
|
||||||
|
statusText = `#올빼미 hej`
|
||||||
|
tags = util.DeriveHashtagsFromText(statusText)
|
||||||
|
assert.Equal(suite.T(), "올빼미", tags[0])
|
||||||
|
}
|
||||||
|
|
||||||
|
func (suite *StatusTestSuite) TestHashtagSpansOK() {
|
||||||
|
statusText := `#0 #3 #8aa`
|
||||||
|
|
||||||
|
spans := util.FindHashtagSpansInText(statusText)
|
||||||
|
assert.Equal(suite.T(), 0, spans[0].First)
|
||||||
|
assert.Equal(suite.T(), 2, spans[0].Second)
|
||||||
|
assert.Equal(suite.T(), 3, spans[1].First)
|
||||||
|
assert.Equal(suite.T(), 5, spans[1].Second)
|
||||||
|
assert.Equal(suite.T(), 8, spans[2].First)
|
||||||
|
assert.Equal(suite.T(), 12, spans[2].Second)
|
||||||
}
|
}
|
||||||
|
|
||||||
func (suite *StatusTestSuite) TestDeriveEmojiOK() {
|
func (suite *StatusTestSuite) TestDeriveEmojiOK() {
|
||||||
|
@ -127,7 +151,7 @@ Here's some normal text with an :emoji: at the end
|
||||||
func (suite *StatusTestSuite) TestDeriveMultiple() {
|
func (suite *StatusTestSuite) TestDeriveMultiple() {
|
||||||
statusText := `Another test @foss_satan@fossbros-anonymous.io
|
statusText := `Another test @foss_satan@fossbros-anonymous.io
|
||||||
|
|
||||||
#Hashtag
|
#HashTag
|
||||||
|
|
||||||
Text`
|
Text`
|
||||||
|
|
||||||
|
@ -139,7 +163,7 @@ func (suite *StatusTestSuite) TestDeriveMultiple() {
|
||||||
assert.Equal(suite.T(), "@foss_satan@fossbros-anonymous.io", ms[0])
|
assert.Equal(suite.T(), "@foss_satan@fossbros-anonymous.io", ms[0])
|
||||||
|
|
||||||
assert.Len(suite.T(), hs, 1)
|
assert.Len(suite.T(), hs, 1)
|
||||||
assert.Equal(suite.T(), "Hashtag", hs[0])
|
assert.Contains(suite.T(), hs, "HashTag")
|
||||||
|
|
||||||
assert.Len(suite.T(), es, 0)
|
assert.Len(suite.T(), es, 0)
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue