woodpecker/vendor/github.com/quasilyte/regex/syntax/lexer.go
Lukas c28f7cb29f
Add golangci-lint (#502)
Initial part of #435
2021-11-14 21:01:54 +01:00

455 lines
9.5 KiB
Go

package syntax
import (
"strings"
"unicode"
"unicode/utf8"
)
type token struct {
kind tokenKind
pos Position
}
func (tok token) String() string {
return tok.kind.String()
}
type tokenKind byte
//go:generate stringer -type=tokenKind -trimprefix=tok -linecomment=true
const (
tokNone tokenKind = iota
tokChar
tokGroupFlags
tokPosixClass
tokConcat
tokRepeat
tokEscapeChar
tokEscapeMeta
tokEscapeOctal
tokEscapeUni
tokEscapeUniFull
tokEscapeHex
tokEscapeHexFull
tokComment
tokQ // \Q
tokMinus // -
tokLbracket // [
tokLbracketCaret // [^
tokRbracket // ]
tokDollar // $
tokCaret // ^
tokQuestion // ?
tokDot // .
tokPlus // +
tokStar // *
tokPipe // |
tokLparen // (
tokLparenName // (?P<name>
tokLparenNameAngle // (?<name>
tokLparenNameQuote // (?'name'
tokLparenFlags // (?flags
tokLparenAtomic // (?>
tokLparenPositiveLookahead // (?=
tokLparenPositiveLookbehind // (?<=
tokLparenNegativeLookahead // (?!
tokLparenNegativeLookbehind // (?<!
tokRparen // )
)
// reMetachar is a table of meta chars outside of a char class.
var reMetachar = [256]bool{
'\\': true,
'|': true,
'*': true,
'+': true,
'?': true,
'.': true,
'[': true,
']': true,
'^': true,
'$': true,
'(': true,
')': true,
}
// charClassMetachar is a table of meta chars inside char class.
var charClassMetachar = [256]bool{
'-': true,
']': true,
}
type lexer struct {
tokens []token
pos int
input string
}
func (l *lexer) HasMoreTokens() bool {
return l.pos < len(l.tokens)
}
func (l *lexer) NextToken() token {
if l.pos < len(l.tokens) {
tok := l.tokens[l.pos]
l.pos++
return tok
}
return token{}
}
func (l *lexer) Peek() token {
if l.pos < len(l.tokens) {
return l.tokens[l.pos]
}
return token{}
}
func (l *lexer) scan() {
for l.pos < len(l.input) {
ch := l.input[l.pos]
if ch > unicode.MaxASCII {
_, size := utf8.DecodeRuneInString(l.input[l.pos:])
l.pushTok(tokChar, size)
l.maybeInsertConcat()
continue
}
switch ch {
case '\\':
l.scanEscape(false)
case '.':
l.pushTok(tokDot, 1)
case '+':
l.pushTok(tokPlus, 1)
case '*':
l.pushTok(tokStar, 1)
case '^':
l.pushTok(tokCaret, 1)
case '$':
l.pushTok(tokDollar, 1)
case '?':
l.pushTok(tokQuestion, 1)
case ')':
l.pushTok(tokRparen, 1)
case '|':
l.pushTok(tokPipe, 1)
case '[':
if l.byteAt(l.pos+1) == '^' {
l.pushTok(tokLbracketCaret, 2)
} else {
l.pushTok(tokLbracket, 1)
}
l.scanCharClass()
case '(':
if l.byteAt(l.pos+1) == '?' {
switch {
case l.byteAt(l.pos+2) == '>':
l.pushTok(tokLparenAtomic, len("(?>"))
case l.byteAt(l.pos+2) == '=':
l.pushTok(tokLparenPositiveLookahead, len("(?="))
case l.byteAt(l.pos+2) == '!':
l.pushTok(tokLparenNegativeLookahead, len("(?!"))
case l.byteAt(l.pos+2) == '<' && l.byteAt(l.pos+3) == '=':
l.pushTok(tokLparenPositiveLookbehind, len("(?<="))
case l.byteAt(l.pos+2) == '<' && l.byteAt(l.pos+3) == '!':
l.pushTok(tokLparenNegativeLookbehind, len("(?<!"))
default:
if l.tryScanComment(l.pos + 2) {
} else if l.tryScanGroupName(l.pos + 2) {
} else if l.tryScanGroupFlags(l.pos + 2) {
} else {
throwErrorf(l.pos, l.pos+1, "group token is incomplete")
}
}
} else {
l.pushTok(tokLparen, 1)
}
case '{':
if j := l.repeatWidth(l.pos + 1); j >= 0 {
l.pushTok(tokRepeat, len("{")+j)
} else {
l.pushTok(tokChar, 1)
}
default:
l.pushTok(tokChar, 1)
}
l.maybeInsertConcat()
}
}
func (l *lexer) scanCharClass() {
l.maybeInsertConcat()
// We need to handle first `]` in a special way. See #3.
if l.byteAt(l.pos) == ']' {
l.pushTok(tokChar, 1)
}
for l.pos < len(l.input) {
ch := l.input[l.pos]
if ch > unicode.MaxASCII {
_, size := utf8.DecodeRuneInString(l.input[l.pos:])
l.pushTok(tokChar, size)
continue
}
switch ch {
case '\\':
l.scanEscape(true)
case '[':
isPosixClass := false
if l.byteAt(l.pos+1) == ':' {
j := l.stringIndex(l.pos+2, ":]")
if j >= 0 {
isPosixClass = true
l.pushTok(tokPosixClass, j+len("[::]"))
}
}
if !isPosixClass {
l.pushTok(tokChar, 1)
}
case '-':
l.pushTok(tokMinus, 1)
case ']':
l.pushTok(tokRbracket, 1)
return // Stop scanning in the char context
default:
l.pushTok(tokChar, 1)
}
}
}
func (l *lexer) scanEscape(insideCharClass bool) {
s := l.input
if l.pos+1 >= len(s) {
throwErrorf(l.pos, l.pos+1, `unexpected end of pattern: trailing '\'`)
}
switch {
case s[l.pos+1] == 'p' || s[l.pos+1] == 'P':
if l.pos+2 >= len(s) {
throwErrorf(l.pos, l.pos+2, "unexpected end of pattern: expected uni-class-short or '{'")
}
if s[l.pos+2] == '{' {
j := strings.IndexByte(s[l.pos+2:], '}')
if j < 0 {
throwErrorf(l.pos, l.pos+2, "can't find closing '}'")
}
l.pushTok(tokEscapeUniFull, len(`\p{`)+j)
} else {
l.pushTok(tokEscapeUni, len(`\pL`))
}
case s[l.pos+1] == 'x':
if l.pos+2 >= len(s) {
throwErrorf(l.pos, l.pos+2, "unexpected end of pattern: expected hex-digit or '{'")
}
if s[l.pos+2] == '{' {
j := strings.IndexByte(s[l.pos+2:], '}')
if j < 0 {
throwErrorf(l.pos, l.pos+2, "can't find closing '}'")
}
l.pushTok(tokEscapeHexFull, len(`\x{`)+j)
} else {
if isHexDigit(l.byteAt(l.pos + 3)) {
l.pushTok(tokEscapeHex, len(`\xFF`))
} else {
l.pushTok(tokEscapeHex, len(`\xF`))
}
}
case isOctalDigit(s[l.pos+1]):
digits := 1
if isOctalDigit(l.byteAt(l.pos + 2)) {
if isOctalDigit(l.byteAt(l.pos + 3)) {
digits = 3
} else {
digits = 2
}
}
l.pushTok(tokEscapeOctal, len(`\`)+digits)
case s[l.pos+1] == 'Q':
size := len(s) - l.pos // Until the pattern ends
j := l.stringIndex(l.pos+2, `\E`)
if j >= 0 {
size = j + len(`\Q\E`)
}
l.pushTok(tokQ, size)
default:
ch := l.byteAt(l.pos + 1)
if ch > unicode.MaxASCII {
_, size := utf8.DecodeRuneInString(l.input[l.pos+1:])
l.pushTok(tokEscapeChar, len(`\`)+size)
return
}
kind := tokEscapeChar
if insideCharClass {
if charClassMetachar[ch] {
kind = tokEscapeMeta
}
} else {
if reMetachar[ch] {
kind = tokEscapeMeta
}
}
l.pushTok(kind, 2)
}
}
func (l *lexer) maybeInsertConcat() {
if l.isConcatPos() {
last := len(l.tokens) - 1
tok := l.tokens[last]
l.tokens[last].kind = tokConcat
l.tokens = append(l.tokens, tok)
}
}
func (l *lexer) Init(s string) {
l.pos = 0
l.tokens = l.tokens[:0]
l.input = s
l.scan()
l.pos = 0
}
func (l *lexer) tryScanGroupName(pos int) bool {
tok := tokLparenName
endCh := byte('>')
offset := 1
switch l.byteAt(pos) {
case '\'':
endCh = '\''
tok = tokLparenNameQuote
case '<':
tok = tokLparenNameAngle
case 'P':
offset = 2
default:
return false
}
if pos+offset >= len(l.input) {
return false
}
end := strings.IndexByte(l.input[pos+offset:], endCh)
if end < 0 {
return false
}
l.pushTok(tok, len("(?")+offset+end+1)
return true
}
func (l *lexer) tryScanGroupFlags(pos int) bool {
colonPos := strings.IndexByte(l.input[pos:], ':')
parenPos := strings.IndexByte(l.input[pos:], ')')
if parenPos < 0 {
return false
}
end := parenPos
if colonPos >= 0 && colonPos < parenPos {
end = colonPos + len(":")
}
l.pushTok(tokLparenFlags, len("(?")+end)
return true
}
func (l *lexer) tryScanComment(pos int) bool {
if l.byteAt(pos) != '#' {
return false
}
parenPos := strings.IndexByte(l.input[pos:], ')')
if parenPos < 0 {
return false
}
l.pushTok(tokComment, len("(?")+parenPos+len(")"))
return true
}
func (l *lexer) repeatWidth(pos int) int {
j := pos
for isDigit(l.byteAt(j)) {
j++
}
if j == pos {
return -1
}
if l.byteAt(j) == '}' {
return (j + len("}")) - pos // {min}
}
if l.byteAt(j) != ',' {
return -1
}
j += len(",")
for isDigit(l.byteAt(j)) {
j++
}
if l.byteAt(j) == '}' {
return (j + len("}")) - pos // {min,} or {min,max}
}
return -1
}
func (l *lexer) stringIndex(offset int, s string) int {
if offset < len(l.input) {
return strings.Index(l.input[offset:], s)
}
return -1
}
func (l *lexer) byteAt(pos int) byte {
if pos >= 0 && pos < len(l.input) {
return l.input[pos]
}
return 0
}
func (l *lexer) pushTok(kind tokenKind, size int) {
l.tokens = append(l.tokens, token{
kind: kind,
pos: Position{Begin: uint16(l.pos), End: uint16(l.pos + size)},
})
l.pos += size
}
func (l *lexer) isConcatPos() bool {
if len(l.tokens) < 2 {
return false
}
x := l.tokens[len(l.tokens)-2].kind
if concatTable[x]&concatX != 0 {
return false
}
y := l.tokens[len(l.tokens)-1].kind
return concatTable[y]&concatY == 0
}
const (
concatX byte = 1 << iota
concatY
)
var concatTable = [256]byte{
tokPipe: concatX | concatY,
tokLparen: concatX,
tokLparenFlags: concatX,
tokLparenName: concatX,
tokLparenNameAngle: concatX,
tokLparenNameQuote: concatX,
tokLparenAtomic: concatX,
tokLbracket: concatX,
tokLbracketCaret: concatX,
tokLparenPositiveLookahead: concatX,
tokLparenPositiveLookbehind: concatX,
tokLparenNegativeLookahead: concatX,
tokLparenNegativeLookbehind: concatX,
tokRparen: concatY,
tokRbracket: concatY,
tokPlus: concatY,
tokStar: concatY,
tokQuestion: concatY,
tokRepeat: concatY,
}