[bugfix] Make hashtag regex work with non-ASCII characters (#682)

This commit is contained in:
tobi 2022-07-03 11:03:03 +02:00 committed by GitHub
parent 9e7d022a06
commit 664713ddd4
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
2 changed files with 9 additions and 4 deletions

View file

@ -71,8 +71,8 @@ var (
// MentionFinder extracts mentions from a piece of text.
MentionFinder = regexp.MustCompile(mentionFinder)
// hashtag regex can be played with here: https://regex101.com/r/bPxeca/1
hashtagFinder = fmt.Sprintf(`(?:^|\s)(?:#*)(#[a-zA-Z0-9]{1,%d})(?:#|\b)`, maximumHashtagLength)
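// hashtag regex can be played with here: https://regex101.com/r/bpyGlj/1
// NOTE(review): Go's regexp \b is an ASCII-only word boundary, so a tag that ends in a
// non-ASCII letter (e.g. #café) is truncated or missed unless it is followed by another #.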
hashtagFinder = fmt.Sprintf(`(?:^|\s)(?:#*)(#[\p{L}\p{N}]{1,%d})(?:#|\b)`, maximumHashtagLength)
// HashtagFinder finds possible hashtags in a string.
// Its capture group includes the leading # symbol; the regex itself does not strip it.
HashtagFinder = regexp.MustCompile(hashtagFinder)

View file

@ -83,15 +83,20 @@ func (suite *StatusTestSuite) TestDeriveHashtagsOK() {
#ThisShouldAlsoWork #not_this_though
#111111 thisalsoshouldn'twork#### ##`
#111111 thisalsoshouldn'twork#### ##
#alimentación, #saúde
`
tags := util.DeriveHashtagsFromText(statusText)
assert.Len(suite.T(), tags, 5)
assert.Len(suite.T(), tags, 7)
assert.Equal(suite.T(), "testing123", tags[0])
assert.Equal(suite.T(), "also", tags[1])
assert.Equal(suite.T(), "thisshouldwork", tags[2])
assert.Equal(suite.T(), "ThisShouldAlsoWork", tags[3])
assert.Equal(suite.T(), "111111", tags[4])
assert.Equal(suite.T(), "alimentación", tags[5])
assert.Equal(suite.T(), "saúde", tags[6])
}
func (suite *StatusTestSuite) TestDeriveEmojiOK() {