forgejo/vendor/github.com/src-d/enry/v2/classifier.go
Lauris BH ad2642a8aa
Language statistics bar for repositories (#8037)
* Implementation for calculating language statistics

Implement saving code language statistics to database

Implement rendering language stats

Add primary language to show in repository list

Implement repository stats indexer queue

Add indexer test

Refactor to use queue module

* Do not timeout for queues
2020-02-11 11:34:17 +02:00

108 lines
2.7 KiB
Go

package enry
import (
"math"
"sort"
"github.com/src-d/enry/v2/internal/tokenizer"
)
// Classifier is the interface in charge of detecting the possible languages of the given content based on a set of
// candidates. Candidates is a map which can be used to assign weights to languages dynamically.
type Classifier interface {
// Classify takes the raw content and a map from candidate language name to weight, and returns a slice of
// language names.
Classify(content []byte, candidates map[string]float64) (languages []string)
}
// classifier holds the lookup tables read by the Classify method defined on it in this file.
type classifier struct {
// languagesLogProbabilities maps a language name to the value Classify uses as that language's starting score.
// Its keys are also the set of languages scored when Classify receives no candidates.
languagesLogProbabilities map[string]float64
// tokensLogProbabilities maps a language name to a per-token table; tokenProbability returns the stored value
// for a (language, token) pair when one exists.
tokensLogProbabilities map[string]map[string]float64
// tokensTotal is the divisor in the fallback value math.Log(1/tokensTotal) that tokenProbability returns for a
// token with no stored entry.
tokensTotal float64
}
// scoredLanguage pairs a language name with the score Classify computed for it.
type scoredLanguage struct {
// language is the language name.
language string
// score is the value compared by the byScore ordering; entries with a higher score sort first.
score float64
}
// Classify returns language names ordered by decreasing score.
//
// When candidates is empty, every language known to the classifier is scored.
// Otherwise only the candidate keys are scored, after each key recognized by
// GetLanguageByAlias has been replaced with the name that function returns.
// The candidate weights are copied next to the names, but this implementation
// does not read them when computing scores.
//
// A language's score starts at its entry in languagesLogProbabilities (zero
// when there is none); for non-empty content the summed per-token values from
// tokensLogProbability are added on top.
func (c *classifier) Classify(content []byte, candidates map[string]float64) []string {
	var pool map[string]float64
	if len(candidates) > 0 {
		pool = make(map[string]float64, len(candidates))
		for name, weight := range candidates {
			// Normalize aliases so scoring looks up the canonical name.
			if canonical, found := GetLanguageByAlias(name); found {
				name = canonical
			}
			pool[name] = weight
		}
	} else {
		pool = c.knownLangs()
	}

	// Tokenize once up front; empty content is scored on the base value alone.
	hasContent := len(content) != 0
	var tokens []string
	if hasContent {
		tokens = tokenizer.Tokenize(content)
	}

	ranked := make([]*scoredLanguage, 0, len(pool))
	for name := range pool {
		entry := &scoredLanguage{
			language: name,
			score:    c.languagesLogProbabilities[name],
		}
		if hasContent {
			entry.score += c.tokensLogProbability(tokens, name)
		}
		ranked = append(ranked, entry)
	}

	return sortLanguagesByScore(ranked)
}
// sortLanguagesByScore stably sorts scoredLangs in place using the byScore
// ordering and returns the language names in the resulting order. The
// returned slice is always non-nil, even for empty input.
func sortLanguagesByScore(scoredLangs []*scoredLanguage) []string {
	sort.Stable(byScore(scoredLangs))

	names := make([]string, len(scoredLangs))
	for i := range scoredLangs {
		names[i] = scoredLangs[i].language
	}
	return names
}
// knownLangs returns a fresh map containing every language name present in
// languagesLogProbabilities, each mapped to the value 1.
func (c *classifier) knownLangs() map[string]float64 {
	known := make(map[string]float64, len(c.languagesLogProbabilities))
	for name := range c.languagesLogProbabilities {
		known[name] = 1
	}
	return known
}
// tokensLogProbability adds up, in order, the value tokenProbability returns
// for each token under the given language. It returns 0 when tokens is empty.
func (c *classifier) tokensLogProbability(tokens []string, language string) float64 {
	total := 0.0
	for i := range tokens {
		total += c.tokenProbability(tokens[i], language)
	}
	return total
}
// tokenProbability returns the value stored for token under language in
// tokensLogProbabilities. When there is no such entry (unknown language or
// unknown token) it returns math.Log(1 / tokensTotal) instead.
func (c *classifier) tokenProbability(token, language string) float64 {
	if stored, found := c.tokensLogProbabilities[language][token]; found {
		return stored
	}
	return math.Log(1.000000 / c.tokensTotal)
}
// byScore implements sort.Interface over scored languages, ordering them by
// decreasing score and, for equal scores, by increasing language name.
type byScore []*scoredLanguage

// Len reports the number of entries.
func (b byScore) Len() int { return len(b) }

// Swap exchanges the entries at positions i and j.
func (b byScore) Swap(i, j int) { b[i], b[j] = b[j], b[i] }

// Less reports whether entry i must sort before entry j: a higher score wins,
// and equal scores fall back to the lexically smaller language name.
//
// The tie-break matters because Classify fills the slice while ranging over a
// map, so without it languages with equal scores would keep a random relative
// order and the result would differ between runs on identical input.
func (b byScore) Less(i, j int) bool {
	if b[i].score != b[j].score {
		return b[j].score < b[i].score
	}
	return b[i].language < b[j].language
}