woodpecker/vendor/github.com/quasilyte/regex/syntax/parser.go

package syntax

import (
	"errors"
	"fmt"
	"strings"
)

// ParserOptions holds the settings that tune how a Parser builds its output.
// The zero value selects the default behavior.
type ParserOptions struct {
	// NoLiterals disables OpChar merging into OpLiteral.
	NoLiterals bool
}

// NewParser returns a ready-to-use Parser configured with opts.
// The construction itself is delegated to newParser.
func NewParser(opts *ParserOptions) *Parser {
	p := newParser(opts)
	return p
}

// Parser turns a regexp pattern string into an Expr tree.
//
// A Parser keeps its result and scratch buffers between Parse calls,
// so the value returned by Parse is overwritten by the next call.
type Parser struct {
	// out is the result storage; Parse returns a pointer to it.
	out Regexp
	// lexer produces the tokens of the pattern being parsed.
	lexer lexer
	// exprPool is a preallocated set of Expr slots handed out by allocExpr.
	exprPool []Expr

	// prefixParselets and infixParselets are indexed by token kind.
	// A nil prefix entry means the token can't start an expression.
	prefixParselets [256]prefixParselet
	infixParselets  [256]infixParselet

	// charClass is a scratch buffer that collects the elements of the
	// char class being parsed; parseCharClass and parseMinus append to it.
	charClass []Expr
	// allocated counts the exprPool slots used so far; Parse resets it to 0.
	allocated uint

	// opts is a copy of the options given to the constructor.
	opts ParserOptions
}

// ParsePCRE parses PHP-style pattern with delimiters.
// An example of such pattern is `/foo/i`.
//
// Patterns that carry the 'x' modifier are rejected with an error.
// When the delimiters are valid but the pattern body fails to parse,
// the RegexpPCRE is still returned (without Expr) along with the error.
func (p *Parser) ParsePCRE(pattern string) (*RegexpPCRE, error) {
	pcre, err := p.newPCRE(pattern)
	switch {
	case err != nil:
		return nil, err
	case pcre.HasModifier('x'):
		return nil, errors.New("'x' modifier is not supported")
	}

	parsed, parseErr := p.Parse(pcre.Pattern)
	if parsed != nil {
		pcre.Expr = parsed.Expr
	}
	return pcre, parseErr
}

// Parse parses pattern and returns the resulting Regexp.
//
// Parsing code reports failures by panicking with a ParseError; such
// panics are turned into the returned error here, while any other panic
// value is re-raised unchanged.
//
// The returned Regexp points at storage owned by the parser, so it is
// overwritten by the next Parse call on the same Parser.
func (p *Parser) Parse(pattern string) (result *Regexp, err error) {
	defer func() {
		switch v := recover().(type) {
		case nil:
			// No panic happened: keep whatever the normal return set.
		case ParseError:
			err = v
		default:
			panic(v)
		}
	}()

	p.lexer.Init(pattern)
	p.allocated = 0
	p.out.Pattern = pattern

	// An empty pattern has no tokens to parse; it is an empty concatenation.
	var root *Expr
	if pattern != "" {
		root = p.parseExpr(0)
	} else {
		root = p.newExpr(OpConcat, Position{})
	}
	p.out.Expr = *root

	if mergeLiterals := !p.opts.NoLiterals; mergeLiterals {
		p.mergeChars(&p.out.Expr)
	}
	p.setValues(&p.out.Expr)

	return &p.out, nil
}

// prefixParselet parses an expression that begins with the given token.
// The token has already been consumed from the lexer when it is called.
type prefixParselet func(token) *Expr

// infixParselet extends an already parsed left-hand expression using the
// given (already consumed) operator token and returns the new expression.
type infixParselet func(*Expr, token) *Expr

// newParser allocates a Parser, copies opts into it (when non-nil) and
// fills the prefix/infix parselet tables that drive parseExpr.
func newParser(opts *ParserOptions) *Parser {
	p := &Parser{exprPool: make([]Expr, 256)}
	if opts != nil {
		p.opts = *opts
	}

	// Every token kind that maps directly to an operation is parsed as a
	// single elementary expression.
	for kind := range tok2op {
		if tok2op[kind] == 0 {
			continue
		}
		p.prefixParselets[tokenKind(kind)] = p.parsePrefixElementary
	}

	p.prefixParselets[tokEscapeHexFull] = func(tok token) *Expr {
		return p.newExprForm(OpEscapeHex, FormEscapeHexFull, tok.pos)
	}
	p.prefixParselets[tokEscapeUniFull] = func(tok token) *Expr {
		return p.newExprForm(OpEscapeUni, FormEscapeUniFull, tok.pos)
	}

	// Plain groups differ only in the operation they produce.
	groups := [...]struct {
		kind tokenKind
		op   Operation
	}{
		{tokLparen, OpCapture},
		{tokLparenAtomic, OpAtomicGroup},
		{tokLparenPositiveLookahead, OpPositiveLookahead},
		{tokLparenNegativeLookahead, OpNegativeLookahead},
		{tokLparenPositiveLookbehind, OpPositiveLookbehind},
		{tokLparenNegativeLookbehind, OpNegativeLookbehind},
	}
	for i := range groups {
		g := groups[i] // Per-iteration copy for the closure below.
		p.prefixParselets[g.kind] = func(tok token) *Expr {
			return p.parseGroup(g.op, tok)
		}
	}

	// Named captures differ only in the syntax form they record.
	namedCaptures := [...]struct {
		kind tokenKind
		form Form
	}{
		{tokLparenName, FormDefault},
		{tokLparenNameAngle, FormNamedCaptureAngle},
		{tokLparenNameQuote, FormNamedCaptureQuote},
	}
	for i := range namedCaptures {
		nc := namedCaptures[i] // Per-iteration copy for the closure below.
		p.prefixParselets[nc.kind] = func(tok token) *Expr {
			return p.parseNamedCapture(nc.form, tok)
		}
	}

	p.prefixParselets[tokLparenFlags] = p.parseGroupWithFlags

	p.prefixParselets[tokPipe] = func(tok token) *Expr {
		// We need prefix pipe parselet to handle `(|x)` syntax.
		rhs := p.parseExpr(1)
		return p.newExpr(OpAlt, tok.pos, p.newEmpty(tok.pos), rhs)
	}
	p.prefixParselets[tokLbracket] = func(tok token) *Expr {
		return p.parseCharClass(OpCharClass, tok)
	}
	p.prefixParselets[tokLbracketCaret] = func(tok token) *Expr {
		return p.parseCharClass(OpNegCharClass, tok)
	}

	p.infixParselets[tokRepeat] = func(left *Expr, tok token) *Expr {
		// The repeat token itself becomes the second (string) argument.
		repeatArg := p.newExpr(OpString, tok.pos)
		return p.newExpr(OpRepeat, combinePos(left.Pos, tok.pos), left, repeatArg)
	}
	p.infixParselets[tokStar] = func(left *Expr, tok token) *Expr {
		return p.newExpr(OpStar, combinePos(left.Pos, tok.pos), left)
	}
	p.infixParselets[tokConcat] = func(left *Expr, tok token) *Expr {
		rhs := p.parseExpr(2)
		if left.Op != OpConcat {
			return p.newExpr(OpConcat, combinePos(left.Pos, rhs.Pos), left, rhs)
		}
		// Keep concatenations flat: extend the existing node in place.
		left.Args = append(left.Args, *rhs)
		left.Pos.End = rhs.End()
		return left
	}
	p.infixParselets[tokPipe] = p.parseAlt
	p.infixParselets[tokMinus] = p.parseMinus
	p.infixParselets[tokPlus] = p.parsePlus
	p.infixParselets[tokQuestion] = p.parseQuestion

	return p
}

// setValues fills the Value field of e and of every expression below it
// with the slice of the source pattern that the expression spans.
// Arguments are processed before the expression that owns them.
func (p *Parser) setValues(e *Expr) {
	for i := 0; i < len(e.Args); i++ {
		p.setValues(&e.Args[i])
	}
	e.Value = p.out.Pattern[e.Begin():e.End()]
}

// exprValue returns the part of the current pattern covered by e.
func (p *Parser) exprValue(e *Expr) string {
	src := p.out.Pattern
	return src[e.Begin():e.End()]
}

// mergeChars walks e bottom-up and, inside every OpConcat, replaces each
// run of two or more adjacent OpChar arguments with a single OpLiteral
// that holds those chars as its own arguments.
//
// A concatenation that collapses into one argument is replaced by that
// argument itself.
func (p *Parser) mergeChars(e *Expr) {
	for i := range e.Args {
		p.mergeChars(&e.Args[i])
	}
	if e.Op != OpConcat || len(e.Args) < 2 {
		return
	}

	// Rewrite e.Args in place: the write index never gets ahead of the
	// read index, so unread arguments are not clobbered.
	merged := e.Args[:0]
	for start := 0; start < len(e.Args); {
		end := start
		for end < len(e.Args) && e.Args[end].Op == OpChar {
			end++
		}
		if end-start < 2 {
			// Not a mergeable run: keep the argument as is.
			merged = append(merged, e.Args[start])
			start++
			continue
		}
		run := e.Args[start:end]
		lit := p.newExpr(OpLiteral, combinePos(run[0].Pos, run[len(run)-1].Pos))
		for k := range run {
			lit.Args = append(lit.Args, run[k])
		}
		merged = append(merged, *lit)
		start = end
	}

	if len(merged) != 1 {
		e.Args = merged
		return
	}
	*e = merged[0] // Turn OpConcat into OpLiteral
}

// newEmpty returns an expression that stands for "nothing":
// an OpConcat without arguments located at pos.
func (p *Parser) newEmpty(pos Position) *Expr {
	empty := p.newExpr(OpConcat, pos)
	return empty
}

// newExprForm is like newExpr, but it also sets the Form of the
// created expression.
func (p *Parser) newExprForm(op Operation, form Form, pos Position, args ...*Expr) *Expr {
	result := p.newExpr(op, pos, args...)
	result.Form = form
	return result
}

// newExpr returns an expression with the given operation and position.
// Every argument is copied by value into the Args of the new expression.
//
// The expression comes from allocExpr; the Args storage that slot already
// had is truncated and reused instead of being dropped.
func (p *Parser) newExpr(op Operation, pos Position, args ...*Expr) *Expr {
	e := p.allocExpr()
	reusedArgs := e.Args[:0]
	*e = Expr{} // Clear every field left over from the previous use.
	e.Op, e.Pos, e.Args = op, pos, reusedArgs
	for i := range args {
		e.Args = append(e.Args, *args[i])
	}
	return e
}

// allocExpr returns the next unused slot of the expression pool.
// Once the pool is exhausted, a fresh Expr is allocated instead.
func (p *Parser) allocExpr() *Expr {
	if p.allocated >= uint(len(p.exprPool)) {
		return &Expr{}
	}
	slot := &p.exprPool[p.allocated]
	p.allocated++
	return slot
}

// expect consumes the next token and returns its position.
// A token of any kind other than the requested one is reported
// through throwErrorf.
func (p *Parser) expect(kind tokenKind) Position {
	tok := p.lexer.NextToken()
	if tok.kind == kind {
		return tok.pos
	}
	throwErrorf(int(tok.pos.Begin), int(tok.pos.End), "expected '%s', found '%s'", kind, tok.kind)
	return tok.pos
}

// parseExpr parses one expression using precedence climbing.
//
// The first token is handled by its prefix parselet. After that, infix
// parselets keep extending the result for as long as the upcoming token
// binds tighter than the given precedence.
func (p *Parser) parseExpr(precedence int) *Expr {
	first := p.lexer.NextToken()
	parsePrefix := p.prefixParselets[first.kind]
	if parsePrefix == nil {
		throwfPos(first.pos, "unexpected token: %v", first)
	}
	left := parsePrefix(first)

	for {
		if p.precedenceOf(p.lexer.Peek()) <= precedence {
			return left
		}
		op := p.lexer.NextToken()
		left = p.infixParselets[op.kind](left, op)
	}
}

// parsePrefixElementary turns a single token into an argument-less
// expression whose operation is looked up in tok2op.
func (p *Parser) parsePrefixElementary(tok token) *Expr {
	op := tok2op[tok.kind]
	return p.newExpr(op, tok.pos)
}

// parseCharClass parses the body of a char class whose opening bracket
// token is tok and returns an expression with the given op (OpCharClass
// or OpNegCharClass in the visible callers). The elements of the class
// become the arguments of the result, whose position spans from the
// opening token to the closing bracket.
//
// Elements are collected in the p.charClass scratch buffer. An input that
// ends before the closing bracket is reported as "unterminated '['".
func (p *Parser) parseCharClass(op Operation, tok token) *Expr {
	var endPos Position
	p.charClass = p.charClass[:0]
	for {
		// parseExpr may append to p.charClass on its own (parseMinus does),
		// so it has to finish before p.charClass is read for the append
		// below. Go leaves the order between a variable read and a call
		// inside one expression unspecified, hence the explicit temporary.
		item := p.parseExpr(0)
		p.charClass = append(p.charClass, *item)

		next := p.lexer.Peek()
		if next.kind == tokRbracket {
			endPos = next.pos
			p.lexer.NextToken()
			break
		}
		if next.kind == tokNone {
			throwfPos(tok.pos, "unterminated '['")
		}
	}

	result := p.newExpr(op, combinePos(tok.pos, endPos))
	result.Args = append(result.Args, p.charClass...)
	return result
}

// parseMinus handles '-' that follows left.
//
// When left can start a range and the '-' is not directly followed by the
// closing bracket, the result is an OpCharRange of left and the operand
// parsed after the '-'. Otherwise the '-' is a literal char: left is stored
// into the char class buffer and the '-' itself is returned as an OpChar.
func (p *Parser) parseMinus(left *Expr, tok token) *Expr {
	if p.isValidCharRangeOperand(left) && p.lexer.Peek().kind != tokRbracket {
		hi := p.parseExpr(2)
		return p.newExpr(OpCharRange, combinePos(left.Pos, hi.Pos), left, hi)
	}
	p.charClass = append(p.charClass, *left)
	return p.newExpr(OpChar, tok.pos)
}

// isValidCharRangeOperand reports whether e can be used as a bound of a
// char range. Hex, octal and meta escapes and plain chars always qualify;
// an OpEscapeChar qualifies only if it escapes one of the characters
// listed below.
func (p *Parser) isValidCharRangeOperand(e *Expr) bool {
	switch e.Op {
	case OpEscapeHex, OpEscapeOctal, OpEscapeMeta, OpChar:
		return true
	case OpEscapeChar:
		// Checked below.
	default:
		return false
	}

	// The accepted form is a backslash followed by exactly one of the
	// listed bytes.
	v := p.exprValue(e)
	if len(v) != 2 || v[0] != '\\' {
		return false
	}
	switch v[1] {
	case '\\', '|', '*', '+', '?', '.', '[', '^', '$', '(', ')':
		return true
	}
	return false
}

// parsePlus handles '+' that follows left. After an expression that is
// already quantified (OpPlus, OpStar, OpQuestion, OpRepeat) it produces
// OpPossessive; after anything else it produces OpPlus.
func (p *Parser) parsePlus(left *Expr, tok token) *Expr {
	pos := combinePos(left.Pos, tok.pos)
	switch left.Op {
	case OpPlus, OpStar, OpQuestion, OpRepeat:
		return p.newExpr(OpPossessive, pos, left)
	default:
		return p.newExpr(OpPlus, pos, left)
	}
}

// parseQuestion handles '?' that follows left. After an expression that is
// already quantified (OpPlus, OpStar, OpQuestion, OpRepeat) it produces
// OpNonGreedy; after anything else it produces OpQuestion.
func (p *Parser) parseQuestion(left *Expr, tok token) *Expr {
	quantified := left.Op == OpPlus ||
		left.Op == OpStar ||
		left.Op == OpQuestion ||
		left.Op == OpRepeat

	op := OpQuestion
	if quantified {
		op = OpNonGreedy
	}
	return p.newExpr(op, combinePos(left.Pos, tok.pos), left)
}

// parseAlt handles '|' that follows left.
//
// The right-hand side is an empty expression when the '|' is directly
// followed by ')' or by the end of input. When left is already an OpAlt,
// the new alternative is appended to it so that alternations stay flat.
func (p *Parser) parseAlt(left *Expr, tok token) *Expr {
	var rhs *Expr
	if next := p.lexer.Peek().kind; next == tokRparen || next == tokNone {
		// This is needed to handle `(x|)` syntax.
		rhs = p.newEmpty(tok.pos)
	} else {
		rhs = p.parseExpr(1)
	}

	if left.Op != OpAlt {
		return p.newExpr(OpAlt, combinePos(left.Pos, rhs.Pos), left, rhs)
	}
	left.Args = append(left.Args, *rhs)
	left.Pos.End = rhs.End()
	return left
}

// parseGroupItem parses the contents of a group opened by tok.
// The closing paren is left for the caller to consume.
func (p *Parser) parseGroupItem(tok token) *Expr {
	if p.lexer.Peek().kind != tokRparen {
		return p.parseExpr(0)
	}
	// The group is closed right away, as in `()`: its body is empty.
	return p.newEmpty(tok.pos)
}

// parseGroup parses a group opened by tok and returns an expression with
// the given op whose only argument is the group body. The position of the
// result runs from the opening token to the closing paren.
func (p *Parser) parseGroup(op Operation, tok token) *Expr {
	body := p.parseGroupItem(tok)
	group := p.newExpr(op, tok.pos, body)
	closing := p.expect(tokRparen)
	group.Pos.End = closing.End
	return group
}

// parseNamedCapture parses a named capture group opened by tok.
//
// The opening token includes the group name. The name position is derived
// from the token span by skipping the opening prefix ("(?P<" for
// FormDefault, three bytes for the other forms) and the one-byte name
// terminator. The result has the group body as its first argument and the
// name (an OpString) as its second.
func (p *Parser) parseNamedCapture(form Form, tok token) *Expr {
	prefix := "(?<"
	if form == FormDefault {
		prefix = "(?P<"
	}
	namePos := Position{
		Begin: tok.pos.Begin + uint16(len(prefix)),
		End:   tok.pos.End - uint16(len(">")),
	}
	name := p.newExpr(OpString, namePos)

	body := p.parseGroupItem(tok)
	capture := p.newExprForm(OpNamedCapture, form, tok.pos, body, name)
	capture.Pos.End = p.expect(tokRparen).End
	return capture
}

// parseGroupWithFlags parses a group whose opening token starts with "(?".
// The token text after the "(" selects one of three shapes:
//
//   - it does not end with ":" - a flags-only group (OpFlagOnlyGroup)
//     with the flags as its single OpString argument;
//   - it is exactly "?:" - a plain non-capturing group (OpGroup);
//   - otherwise - OpGroupWithFlags with the body first and the flags
//     (without the trailing ":") second.
//
// In every case the closing paren is consumed and extends the position.
func (p *Parser) parseGroupWithFlags(tok token) *Expr {
	val := p.out.Pattern[tok.pos.Begin+1 : tok.pos.End]
	flagsBegin := tok.pos.Begin + uint16(len("(?"))

	var group *Expr
	if !strings.HasSuffix(val, ":") {
		flags := p.newExpr(OpString, Position{Begin: flagsBegin, End: tok.pos.End})
		group = p.newExpr(OpFlagOnlyGroup, tok.pos, flags)
	} else if val == "?:" {
		body := p.parseGroupItem(tok)
		group = p.newExpr(OpGroup, tok.pos, body)
	} else {
		flags := p.newExpr(OpString, Position{
			Begin: flagsBegin,
			End:   tok.pos.End - uint16(len(":")),
		})
		body := p.parseGroupItem(tok)
		group = p.newExpr(OpGroupWithFlags, tok.pos, body, flags)
	}

	group.Pos.End = p.expect(tokRparen).End
	return group
}

// precedenceOf returns the binding power of tok when it is used as an
// infix operator: 1 for alternation, 2 for concatenation and '-', 3 for
// the quantifiers. Any other token gets 0 and so never continues an
// expression.
func (p *Parser) precedenceOf(tok token) int {
	kind := tok.kind
	if kind == tokPipe {
		return 1
	}
	if kind == tokConcat || kind == tokMinus {
		return 2
	}
	if kind == tokPlus || kind == tokStar || kind == tokQuestion || kind == tokRepeat {
		return 3
	}
	return 0
}

// newPCRE splits a delimited PCRE source such as `/foo/i` into its parts:
// the pattern between the delimiters, the delimiter pair and the modifiers
// that follow the ending delimiter. The pattern itself is not parsed here.
//
// The first byte of source is the opening delimiter. Brackets are paired
// with their closing counterpart ('(' with ')', '{' with '}', '[' with ']',
// '<' with '>'); any other delimiter closes itself. The ending delimiter is
// the last occurrence of the closing byte after the opening one.
//
// An error is returned when source is empty, when the delimiter is a
// backslash, whitespace or alphanumeric, or when no ending delimiter
// is found.
func (p *Parser) newPCRE(source string) (*RegexpPCRE, error) {
	if source == "" {
		return nil, errors.New("empty pattern: can't find delimiters")
	}

	delim := source[0]
	endDelim := delim
	switch delim {
	case '(':
		endDelim = ')'
	case '{':
		endDelim = '}'
	case '[':
		endDelim = ']'
	case '<':
		endDelim = '>'
	case '\\':
		return nil, errors.New("'\\' is not a valid delimiter")
	default:
		if isSpace(delim) {
			return nil, errors.New("whitespace is not a valid delimiter")
		}
		if isAlphanumeric(delim) {
			return nil, fmt.Errorf("'%c' is not a valid delimiter", delim)
		}
	}

	// Search only past the opening delimiter. When both delimiters are the
	// same byte, searching the whole source would match the opening one for
	// inputs like "/" and make the pattern slice below invalid.
	const delimLen = 1
	j := strings.LastIndexByte(source[delimLen:], endDelim)
	if j == -1 {
		return nil, fmt.Errorf("can't find '%c' ending delimiter", endDelim)
	}
	j += delimLen // Back to an index into source.

	pcre := &RegexpPCRE{
		Pattern:   source[delimLen:j],
		Source:    source,
		Delim:     [2]byte{delim, endDelim},
		Modifiers: source[j+1:],
	}
	return pcre, nil
}

// tok2op maps a token kind to the operation of the expression that the
// token produces on its own. Kinds without an entry keep the zero
// Operation; newParser registers parsePrefixElementary only for the kinds
// that have a non-zero entry here.
var tok2op = [256]Operation{
	tokDollar:      OpDollar,
	tokCaret:       OpCaret,
	tokDot:         OpDot,
	tokChar:        OpChar,
	tokMinus:       OpChar, // A '-' in prefix position is a plain char.
	tokEscapeChar:  OpEscapeChar,
	tokEscapeMeta:  OpEscapeMeta,
	tokEscapeHex:   OpEscapeHex,
	tokEscapeOctal: OpEscapeOctal,
	tokEscapeUni:   OpEscapeUni,
	tokPosixClass:  OpPosixClass,
	tokQ:           OpQuote,
	tokComment:     OpComment,
}