package syntax

import (
	"bytes"
	"fmt"
	"math"
	"strconv"
)

// RegexTree is the result of parsing a regular expression: the root of the
// node tree plus the options and capture bookkeeping that accompany it.
//
// NOTE(review): the capture-related fields are populated outside this file;
// the per-field notes below are inferred from names and types only and should
// be confirmed against the parser.
type RegexTree struct {
	root       *regexNode     // top of the parsed node tree; Dump walks from here
	caps       map[int]int    // presumably capture number -> slot index (TODO confirm)
	capnumlist []int          // presumably the list of capture numbers (TODO confirm)
	captop     int            // presumably one past the highest capture slot (TODO confirm)
	Capnames   map[string]int // presumably capture name -> capture number (TODO confirm)
	Caplist    []string       // presumably capture names in slot order (TODO confirm)
	options    RegexOptions   // options associated with the tree
}

// regexNode is one node of the parsed tree for a regular expression.

// Implementation notes:
//
// Since the node tree is a temporary data structure only used
// during compilation of the regexp to integer codes, it's
// designed for clarity and convenience rather than
// space efficiency.
//
// RegexNodes are built into a tree, linked by the n.children list.
// Each node also has a n.parent and n.ichild member indicating
// its parent and which child # it is in its parent's list.
//
// RegexNodes come in as many types as there are constructs in
// a regular expression, for example, "concatenate", "alternate",
// "one", "rept", "group". There are also node types for basic
// peephole optimizations, e.g., "onerep", "notsetrep", etc.
//
// Because perl 5 allows "lookback" groups that scan backwards,
// each node also gets a "direction". Normally the value of
// boolean n.backward = false.
//
// During parsing, top-level nodes are also stacked onto a parse
// stack (a stack of trees). For this purpose we have a n.next
// pointer. [Note that to save a few bytes, we could overload the
// n.parent pointer instead.]
//
// On the parse stack, each tree has a "role" - basically, the
// nonterminal in the grammar that the parser has currently
// assigned to the tree. That code is stored in n.role.
//
// Finally, some of the different kinds of nodes have data.
// Two integers (for the looping constructs) are stored in // n.operands, an an object (either a string or a set) // is stored in n.data type regexNode struct { t nodeType children []*regexNode str []rune set *CharSet ch rune m int n int options RegexOptions next *regexNode } type nodeType int32 const ( // The following are leaves, and correspond to primitive operations ntOnerep nodeType = 0 // lef,back char,min,max a {n} ntNotonerep = 1 // lef,back char,min,max .{n} ntSetrep = 2 // lef,back set,min,max [\d]{n} ntOneloop = 3 // lef,back char,min,max a {,n} ntNotoneloop = 4 // lef,back char,min,max .{,n} ntSetloop = 5 // lef,back set,min,max [\d]{,n} ntOnelazy = 6 // lef,back char,min,max a {,n}? ntNotonelazy = 7 // lef,back char,min,max .{,n}? ntSetlazy = 8 // lef,back set,min,max [\d]{,n}? ntOne = 9 // lef char a ntNotone = 10 // lef char [^a] ntSet = 11 // lef set [a-z\s] \w \s \d ntMulti = 12 // lef string abcd ntRef = 13 // lef group \# ntBol = 14 // ^ ntEol = 15 // $ ntBoundary = 16 // \b ntNonboundary = 17 // \B ntBeginning = 18 // \A ntStart = 19 // \G ntEndZ = 20 // \Z ntEnd = 21 // \Z // Interior nodes do not correspond to primitive operations, but // control structures compositing other operations // Concat and alternate take n children, and can run forward or backwards ntNothing = 22 // [] ntEmpty = 23 // () ntAlternate = 24 // a|b ntConcatenate = 25 // ab ntLoop = 26 // m,x * + ? {,} ntLazyloop = 27 // m,x *? +? ?? {,}? ntCapture = 28 // n () ntGroup = 29 // (?:) ntRequire = 30 // (?=) (?<=) ntPrevent = 31 // (?!) (?<!) ntGreedy = 32 // (?>) (?<) ntTestref = 33 // (?(n) | ) ntTestgroup = 34 // (?(...) 
| ) ntECMABoundary = 41 // \b ntNonECMABoundary = 42 // \B ) func newRegexNode(t nodeType, opt RegexOptions) *regexNode { return ®exNode{ t: t, options: opt, } } func newRegexNodeCh(t nodeType, opt RegexOptions, ch rune) *regexNode { return ®exNode{ t: t, options: opt, ch: ch, } } func newRegexNodeStr(t nodeType, opt RegexOptions, str []rune) *regexNode { return ®exNode{ t: t, options: opt, str: str, } } func newRegexNodeSet(t nodeType, opt RegexOptions, set *CharSet) *regexNode { return ®exNode{ t: t, options: opt, set: set, } } func newRegexNodeM(t nodeType, opt RegexOptions, m int) *regexNode { return ®exNode{ t: t, options: opt, m: m, } } func newRegexNodeMN(t nodeType, opt RegexOptions, m, n int) *regexNode { return ®exNode{ t: t, options: opt, m: m, n: n, } } func (n *regexNode) writeStrToBuf(buf *bytes.Buffer) { for i := 0; i < len(n.str); i++ { buf.WriteRune(n.str[i]) } } func (n *regexNode) addChild(child *regexNode) { reduced := child.reduce() n.children = append(n.children, reduced) reduced.next = n } func (n *regexNode) insertChildren(afterIndex int, nodes []*regexNode) { newChildren := make([]*regexNode, 0, len(n.children)+len(nodes)) n.children = append(append(append(newChildren, n.children[:afterIndex]...), nodes...), n.children[afterIndex:]...) } // removes children including the start but not the end index func (n *regexNode) removeChildren(startIndex, endIndex int) { n.children = append(n.children[:startIndex], n.children[endIndex:]...) } // Pass type as OneLazy or OneLoop func (n *regexNode) makeRep(t nodeType, min, max int) { n.t += (t - ntOne) n.m = min n.n = max } func (n *regexNode) reduce() *regexNode { switch n.t { case ntAlternate: return n.reduceAlternation() case ntConcatenate: return n.reduceConcatenation() case ntLoop, ntLazyloop: return n.reduceRep() case ntGroup: return n.reduceGroup() case ntSet, ntSetloop: return n.reduceSet() default: return n } } // Basic optimization. 
// Single-letter alternations can be replaced
// by faster set specifications, and nested alternations with no
// intervening operators can be flattened:
//
//	a|b|c|def|g|h -> [a-c]|def|[gh]
//	apple|(?:orange|pear)|grape -> apple|orange|pear|grape
//
// The children are compacted in place: i is the read position and j the
// write position, so j falls behind i each time a child is dropped or merged
// into its predecessor. The surviving prefix is kept and the tail removed
// once the scan finishes. A node with zero or one surviving child is
// collapsed by stripEnation.
func (n *regexNode) reduceAlternation() *regexNode {
	if len(n.children) == 0 {
		return newRegexNode(ntNothing, n.options)
	}

	// State describing the previously kept child, used to decide whether the
	// current Set/One child may be merged into it.
	wasLastSet := false
	lastNodeCannotMerge := false
	var optionsLast RegexOptions
	var i, j int

	for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 {
		at := n.children[i]

		// Slide the child down over any slots freed earlier.
		if j < i {
			n.children[j] = at
		}

		// Single-pass block: the loop body always ends in break, so the inner
		// break statements act as "keep this child as-is, skip the merge".
		for {
			if at.t == ntAlternate {
				// Flatten a nested alternation: re-parent its children and
				// splice them in right after the current position so the
				// outer loop visits them next. The nested node itself is
				// dropped (j--).
				for k := 0; k < len(at.children); k++ {
					at.children[k].next = n
				}
				n.insertChildren(i+1, at.children)

				j--
			} else if at.t == ntSet || at.t == ntOne {
				// Cannot merge sets if L or I options differ, or if either are negated.
				optionsAt := at.options & (RightToLeft | IgnoreCase)

				if at.t == ntSet {
					if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge || !at.set.IsMergeable() {
						// Start a new run with this set as its head.
						wasLastSet = true
						lastNodeCannotMerge = !at.set.IsMergeable()
						optionsLast = optionsAt
						break
					}
				} else if !wasLastSet || optionsLast != optionsAt || lastNodeCannotMerge {
					// Start a new run with this single character as its head.
					wasLastSet = true
					lastNodeCannotMerge = false
					optionsLast = optionsAt
					break
				}

				// The last node was a Set or a One, we're a Set or One and our options are the same.
				// Merge the two nodes.
				j--
				prev := n.children[j]

				// A One predecessor gets a fresh set seeded with its
				// character; a Set predecessor's existing set is extended in
				// place.
				var prevCharClass *CharSet
				if prev.t == ntOne {
					prevCharClass = &CharSet{}
					prevCharClass.addChar(prev.ch)
				} else {
					prevCharClass = prev.set
				}

				if at.t == ntOne {
					prevCharClass.addChar(at.ch)
				} else {
					prevCharClass.addSet(*at.set)
				}

				prev.t = ntSet
				prev.set = prevCharClass
			} else if at.t == ntNothing {
				// Drop this branch from the alternation.
				j--
			} else {
				// Any other node ends the current mergeable run.
				wasLastSet = false
				lastNodeCannotMerge = false
			}
			break
		}
	}

	// Trim the slots vacated by dropped and merged children.
	if j < i {
		n.removeChildren(j, i)
	}

	return n.stripEnation(ntNothing)
}

// Basic optimization. Adjacent strings can be concatenated.
// // (?:abc)(?:def) -> abcdef func (n *regexNode) reduceConcatenation() *regexNode { // Eliminate empties and concat adjacent strings/chars var optionsLast RegexOptions var optionsAt RegexOptions var i, j int if len(n.children) == 0 { return newRegexNode(ntEmpty, n.options) } wasLastString := false for i, j = 0, 0; i < len(n.children); i, j = i+1, j+1 { var at, prev *regexNode at = n.children[i] if j < i { n.children[j] = at } if at.t == ntConcatenate && ((at.options & RightToLeft) == (n.options & RightToLeft)) { for k := 0; k < len(at.children); k++ { at.children[k].next = n } //insert at.children at i+1 index in n.children n.insertChildren(i+1, at.children) j-- } else if at.t == ntMulti || at.t == ntOne { // Cannot merge strings if L or I options differ optionsAt = at.options & (RightToLeft | IgnoreCase) if !wasLastString || optionsLast != optionsAt { wasLastString = true optionsLast = optionsAt continue } j-- prev = n.children[j] if prev.t == ntOne { prev.t = ntMulti prev.str = []rune{prev.ch} } if (optionsAt & RightToLeft) == 0 { if at.t == ntOne { prev.str = append(prev.str, at.ch) } else { prev.str = append(prev.str, at.str...) 
} } else { if at.t == ntOne { // insert at the front by expanding our slice, copying the data over, and then setting the value prev.str = append(prev.str, 0) copy(prev.str[1:], prev.str) prev.str[0] = at.ch } else { //insert at the front...this one we'll make a new slice and copy both into it merge := make([]rune, len(prev.str)+len(at.str)) copy(merge, at.str) copy(merge[len(at.str):], prev.str) prev.str = merge } } } else if at.t == ntEmpty { j-- } else { wasLastString = false } } if j < i { // remove indices j through i from the children n.removeChildren(j, i) } return n.stripEnation(ntEmpty) } // Nested repeaters just get multiplied with each other if they're not // too lumpy func (n *regexNode) reduceRep() *regexNode { u := n t := n.t min := n.m max := n.n for { if len(u.children) == 0 { break } child := u.children[0] // multiply reps of the same type only if child.t != t { childType := child.t if !(childType >= ntOneloop && childType <= ntSetloop && t == ntLoop || childType >= ntOnelazy && childType <= ntSetlazy && t == ntLazyloop) { break } } // child can be too lumpy to blur, e.g., (a {100,105}) {3} or (a {2,})? // [but things like (a {2,})+ are not too lumpy...] if u.m == 0 && child.m > 1 || child.n < child.m*2 { break } u = child if u.m > 0 { if (math.MaxInt32-1)/u.m < min { u.m = math.MaxInt32 } else { u.m = u.m * min } } if u.n > 0 { if (math.MaxInt32-1)/u.n < max { u.n = math.MaxInt32 } else { u.n = u.n * max } } } if math.MaxInt32 == min { return newRegexNode(ntNothing, n.options) } return u } // Simple optimization. If a concatenation or alternation has only // one child strip out the intermediate node. If it has zero children, // turn it into an empty. 
func (n *regexNode) stripEnation(emptyType nodeType) *regexNode { switch len(n.children) { case 0: return newRegexNode(emptyType, n.options) case 1: return n.children[0] default: return n } } func (n *regexNode) reduceGroup() *regexNode { u := n for u.t == ntGroup { u = u.children[0] } return u } // Simple optimization. If a set is a singleton, an inverse singleton, // or empty, it's transformed accordingly. func (n *regexNode) reduceSet() *regexNode { // Extract empty-set, one and not-one case as special if n.set == nil { n.t = ntNothing } else if n.set.IsSingleton() { n.ch = n.set.SingletonChar() n.set = nil n.t += (ntOne - ntSet) } else if n.set.IsSingletonInverse() { n.ch = n.set.SingletonChar() n.set = nil n.t += (ntNotone - ntSet) } return n } func (n *regexNode) reverseLeft() *regexNode { if n.options&RightToLeft != 0 && n.t == ntConcatenate && len(n.children) > 0 { //reverse children order for left, right := 0, len(n.children)-1; left < right; left, right = left+1, right-1 { n.children[left], n.children[right] = n.children[right], n.children[left] } } return n } func (n *regexNode) makeQuantifier(lazy bool, min, max int) *regexNode { if min == 0 && max == 0 { return newRegexNode(ntEmpty, n.options) } if min == 1 && max == 1 { return n } switch n.t { case ntOne, ntNotone, ntSet: if lazy { n.makeRep(Onelazy, min, max) } else { n.makeRep(Oneloop, min, max) } return n default: var t nodeType if lazy { t = ntLazyloop } else { t = ntLoop } result := newRegexNodeMN(t, n.options, min, max) result.addChild(n) return result } } // debug functions var typeStr = []string{ "Onerep", "Notonerep", "Setrep", "Oneloop", "Notoneloop", "Setloop", "Onelazy", "Notonelazy", "Setlazy", "One", "Notone", "Set", "Multi", "Ref", "Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End", "Nothing", "Empty", "Alternate", "Concatenate", "Loop", "Lazyloop", "Capture", "Group", "Require", "Prevent", "Greedy", "Testref", "Testgroup", "Unknown", "Unknown", "Unknown", 
"Unknown", "Unknown", "Unknown", "ECMABoundary", "NonECMABoundary", } func (n *regexNode) description() string { buf := &bytes.Buffer{} buf.WriteString(typeStr[n.t]) if (n.options & ExplicitCapture) != 0 { buf.WriteString("-C") } if (n.options & IgnoreCase) != 0 { buf.WriteString("-I") } if (n.options & RightToLeft) != 0 { buf.WriteString("-L") } if (n.options & Multiline) != 0 { buf.WriteString("-M") } if (n.options & Singleline) != 0 { buf.WriteString("-S") } if (n.options & IgnorePatternWhitespace) != 0 { buf.WriteString("-X") } if (n.options & ECMAScript) != 0 { buf.WriteString("-E") } switch n.t { case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntOne, ntNotone: buf.WriteString("(Ch = " + CharDescription(n.ch) + ")") break case ntCapture: buf.WriteString("(index = " + strconv.Itoa(n.m) + ", unindex = " + strconv.Itoa(n.n) + ")") break case ntRef, ntTestref: buf.WriteString("(index = " + strconv.Itoa(n.m) + ")") break case ntMulti: fmt.Fprintf(buf, "(String = %s)", string(n.str)) break case ntSet, ntSetloop, ntSetlazy: buf.WriteString("(Set = " + n.set.String() + ")") break } switch n.t { case ntOneloop, ntNotoneloop, ntOnelazy, ntNotonelazy, ntSetloop, ntSetlazy, ntLoop, ntLazyloop: buf.WriteString("(Min = ") buf.WriteString(strconv.Itoa(n.m)) buf.WriteString(", Max = ") if n.n == math.MaxInt32 { buf.WriteString("inf") } else { buf.WriteString(strconv.Itoa(n.n)) } buf.WriteString(")") break } return buf.String() } var padSpace = []byte(" ") func (t *RegexTree) Dump() string { return t.root.dump() } func (n *regexNode) dump() string { var stack []int CurNode := n CurChild := 0 buf := bytes.NewBufferString(CurNode.description()) buf.WriteRune('\n') for { if CurNode.children != nil && CurChild < len(CurNode.children) { stack = append(stack, CurChild+1) CurNode = CurNode.children[CurChild] CurChild = 0 Depth := len(stack) if Depth > 32 { Depth = 32 } buf.Write(padSpace[:Depth]) buf.WriteString(CurNode.description()) buf.WriteRune('\n') } else { if len(stack) 
== 0 { break } CurChild = stack[len(stack)-1] stack = stack[:len(stack)-1] CurNode = CurNode.next } } return buf.String() }