forgejo/modules/emoji/emoji.go
mrsdizzie 4c1ff57f1a
Update emoji regex (#11584)
When matching emoji, use a regex built from the data we have instead of something generic using unicode ranges. A generic regex can't tell the difference between two separate emoji next to each other or one emoji that is built out of two separate emoji next to each other.

This means that emoji that are next to each other without space in between will be now accurately spanned individually with proper title etc...
2020-05-29 17:08:36 +01:00

147 lines
3.4 KiB
Go

// Copyright 2020 The Gitea Authors. All rights reserved.
// Copyright 2015 Kenneth Shaw
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package emoji
import (
"sort"
"strings"
"sync"
"unicode/utf8"
)
// Gemoji is a set of emoji data.
type Gemoji []Emoji
// Emoji represents a single emoji and associated data.
type Emoji struct {
Emoji string
Description string
Aliases []string
UnicodeVersion string
}
var (
// codeMap provides a map of the emoji unicode code to its emoji data.
codeMap map[string]int
// aliasMap provides a map of the alias to its emoji data.
aliasMap map[string]int
// codeReplacer is the string replacer for emoji codes.
codeReplacer *strings.Replacer
// aliasReplacer is the string replacer for emoji aliases.
aliasReplacer *strings.Replacer
once sync.Once
)
func loadMap() {
once.Do(func() {
// initialize
codeMap = make(map[string]int, len(GemojiData))
aliasMap = make(map[string]int, len(GemojiData))
// process emoji codes and aliases
codePairs := make([]string, 0)
aliasPairs := make([]string, 0)
// sort from largest to small so we match combined emoji first
sort.Slice(GemojiData, func(i, j int) bool {
return len(GemojiData[i].Emoji) > len(GemojiData[j].Emoji)
})
for i, e := range GemojiData {
if e.Emoji == "" || len(e.Aliases) == 0 {
continue
}
// setup codes
codeMap[e.Emoji] = i
codePairs = append(codePairs, e.Emoji, ":"+e.Aliases[0]+":")
// setup aliases
for _, a := range e.Aliases {
if a == "" {
continue
}
aliasMap[a] = i
aliasPairs = append(aliasPairs, ":"+a+":", e.Emoji)
}
}
// create replacers
codeReplacer = strings.NewReplacer(codePairs...)
aliasReplacer = strings.NewReplacer(aliasPairs...)
})
}
// FromCode retrieves the emoji data based on the provided unicode code (ie,
// "\u2618" will return the Gemoji data for "shamrock").
func FromCode(code string) *Emoji {
loadMap()
i, ok := codeMap[code]
if !ok {
return nil
}
return &GemojiData[i]
}
// FromAlias retrieves the emoji data based on the provided alias in the form
// "alias" or ":alias:" (ie, "shamrock" or ":shamrock:" will return the Gemoji
// data for "shamrock").
func FromAlias(alias string) *Emoji {
loadMap()
if strings.HasPrefix(alias, ":") && strings.HasSuffix(alias, ":") {
alias = alias[1 : len(alias)-1]
}
i, ok := aliasMap[alias]
if !ok {
return nil
}
return &GemojiData[i]
}
// ReplaceCodes replaces all emoji codes with the first corresponding emoji
// alias (in the form of ":alias:") (ie, "\u2618" will be converted to
// ":shamrock:").
func ReplaceCodes(s string) string {
loadMap()
return codeReplacer.Replace(s)
}
// ReplaceAliases replaces all aliases of the form ":alias:" with its
// corresponding unicode value.
func ReplaceAliases(s string) string {
loadMap()
return aliasReplacer.Replace(s)
}
// FindEmojiSubmatchIndex returns index pair of longest emoji in a string
func FindEmojiSubmatchIndex(s string) []int {
loadMap()
// if rune and string length are the same then no emoji will be present
// similar performance when there is unicode present but almost 200% faster when not
if utf8.RuneCountInString(s) == len(s) {
return nil
}
for j := range GemojiData {
i := strings.Index(s, GemojiData[j].Emoji)
if i != -1 {
return []int{i, i + len(GemojiData[j].Emoji)}
}
}
return nil
}