kuvia2/vendor/github.com/alecthomas/participle/v2/lexer/stateful.go
2022-01-15 00:09:03 +01:00

407 lines
10 KiB
Go

package lexer
import (
"errors"
"fmt"
"io"
"regexp"
"sort"
"strconv"
"strings"
"sync"
"unicode"
"unicode/utf8"
)
var (
backrefReplace = regexp.MustCompile(`(\\+)(\d)`)
)
// Option for modifying how the Lexer works.
type Option func(d *StatefulDefinition)
// A Rule matching input and possibly changing state.
type Rule struct {
Name string
Pattern string
Action Action
}
// Rules grouped by name.
type Rules map[string][]Rule
// compiledRule is a Rule with its pattern compiled.
type compiledRule struct {
Rule
ignore bool
RE *regexp.Regexp
}
// compiledRules grouped by name.
type compiledRules map[string][]compiledRule
// A Action is applied when a rule matches.
type Action interface {
// Actions are responsible for validating the match. ie. if they consumed any input.
applyAction(lexer *StatefulLexer, groups []string) error
}
// RulesAction is an optional interface that Actions can implement.
//
// It is applied during rule construction to mutate the rule map.
type RulesAction interface {
applyRules(state string, rule int, rules compiledRules) error
}
// InitialState overrides the default initial state of "Root".
func InitialState(state string) Option {
return func(d *StatefulDefinition) {
d.initialState = state
}
}
// MatchLongest causes the Lexer to continue checking rules past the first match.
// If any subsequent rule has a longer match, it will be used instead.
func MatchLongest() Option {
return func(d *StatefulDefinition) {
d.matchLongest = true
}
}
// ActionPop pops to the previous state when the Rule matches.
type ActionPop struct{}
func (p ActionPop) applyAction(lexer *StatefulLexer, groups []string) error {
if groups[0] == "" {
return errors.New("did not consume any input")
}
lexer.stack = lexer.stack[:len(lexer.stack)-1]
return nil
}
// Pop to the previous state.
func Pop() Action {
return ActionPop{}
}
// ReturnRule signals the lexer to return immediately.
var ReturnRule = Rule{"returnToParent", "", nil}
// Return to the parent state.
//
// Useful as the last rule in a sub-state.
func Return() Rule { return ReturnRule }
// ActionPush pushes the current state and switches to "State" when the Rule matches.
type ActionPush struct{ State string }
func (p ActionPush) applyAction(lexer *StatefulLexer, groups []string) error {
if groups[0] == "" {
return errors.New("did not consume any input")
}
lexer.stack = append(lexer.stack, lexerState{name: p.State, groups: groups})
return nil
}
// Push to the given state.
//
// The target state will then be the set of rules used for matching
// until another Push or Pop is encountered.
func Push(state string) Action {
return ActionPush{state}
}
type include struct{ state string }
func (i include) applyAction(lexer *StatefulLexer, groups []string) error {
panic("should not be called")
}
func (i include) applyRules(state string, rule int, rules compiledRules) error {
includedRules, ok := rules[i.state]
if !ok {
return fmt.Errorf("invalid include state %q", i.state)
}
clone := make([]compiledRule, len(includedRules))
copy(clone, includedRules)
rules[state] = append(rules[state][:rule], append(clone, rules[state][rule+1:]...)...) // nolint: makezero
return nil
}
// Include rules from another state in this one.
func Include(state string) Rule {
return Rule{Action: include{state}}
}
// StatefulDefinition is the lexer.Definition.
type StatefulDefinition struct {
rules compiledRules
symbols map[string]TokenType
// Map of key->*regexp.Regexp
backrefCache sync.Map
initialState string
matchLongest bool
}
// MustSimple creates a new lexer definition based on a single state described by `rules`.
// panics if the rules trigger an error
func MustSimple(rules []Rule, options ...Option) *StatefulDefinition {
def, err := NewSimple(rules, options...)
if err != nil {
panic(err)
}
return def
}
// MustStateful creates a new stateful lexer and panics if it is incorrect.
func MustStateful(rules Rules, options ...Option) *StatefulDefinition {
def, err := New(rules, options...)
if err != nil {
panic(err)
}
return def
}
// NewSimple creates a new stateful lexer with a single "Root" state.
func NewSimple(rules []Rule, options ...Option) (*StatefulDefinition, error) {
return New(Rules{"Root": rules}, options...)
}
// New constructs a new stateful lexer from rules.
func New(rules Rules, options ...Option) (*StatefulDefinition, error) {
compiled := compiledRules{}
for key, set := range rules {
for i, rule := range set {
pattern := "^(?:" + rule.Pattern + ")"
var (
re *regexp.Regexp
err error
)
var match = backrefReplace.FindStringSubmatch(rule.Pattern)
if match == nil || len(match[1])%2 == 0 {
re, err = regexp.Compile(pattern)
if err != nil {
return nil, fmt.Errorf("%s.%d: %s", key, i, err)
}
}
compiled[key] = append(compiled[key], compiledRule{
Rule: rule,
ignore: len(rule.Name) > 0 && unicode.IsLower(rune(rule.Name[0])),
RE: re,
})
}
}
restart:
for state, rules := range compiled {
for i, rule := range rules {
if action, ok := rule.Action.(RulesAction); ok {
if err := action.applyRules(state, i, compiled); err != nil {
return nil, fmt.Errorf("%s.%d: %s", state, i, err)
}
goto restart
}
}
}
keys := make([]string, 0, len(compiled))
for key := range compiled {
keys = append(keys, key)
}
symbols := map[string]TokenType{
"EOF": EOF,
}
sort.Strings(keys)
duplicates := map[string]compiledRule{}
rn := EOF - 1
for _, key := range keys {
for i, rule := range compiled[key] {
if dup, ok := duplicates[rule.Name]; ok && rule.Pattern != dup.Pattern {
panic(fmt.Sprintf("duplicate key %q with different patterns %q != %q", rule.Name, rule.Pattern, dup.Pattern))
}
duplicates[rule.Name] = rule
compiled[key][i] = rule
symbols[rule.Name] = rn
rn--
}
}
d := &StatefulDefinition{
initialState: "Root",
rules: compiled,
symbols: symbols,
}
for _, option := range options {
option(d)
}
return d, nil
}
// Rules returns the user-provided Rules used to construct the lexer.
func (d *StatefulDefinition) Rules() Rules {
out := Rules{}
for state, rules := range d.rules {
for _, rule := range rules {
out[state] = append(out[state], rule.Rule)
}
}
return out
}
func (d *StatefulDefinition) LexString(filename string, s string) (Lexer, error) { // nolint: golint
return &StatefulLexer{
def: d,
data: s,
stack: []lexerState{{name: d.initialState}},
pos: Position{
Filename: filename,
Line: 1,
Column: 1,
},
}, nil
}
func (d *StatefulDefinition) Lex(filename string, r io.Reader) (Lexer, error) { // nolint: golint
w := &strings.Builder{}
_, err := io.Copy(w, r)
if err != nil {
return nil, err
}
return d.LexString(filename, w.String())
}
func (d *StatefulDefinition) Symbols() map[string]TokenType { // nolint: golint
return d.symbols
}
type lexerState struct {
name string
groups []string
}
// StatefulLexer implementation.
type StatefulLexer struct {
stack []lexerState
def *StatefulDefinition
data string
pos Position
}
func (l *StatefulLexer) Next() (Token, error) { // nolint: golint
parent := l.stack[len(l.stack)-1]
rules := l.def.rules[parent.name]
next:
for len(l.data) > 0 {
var (
rule *compiledRule
m []int
match []int
)
for i, candidate := range rules {
// Special case "Return()".
if candidate.Rule == ReturnRule {
l.stack = l.stack[:len(l.stack)-1]
parent = l.stack[len(l.stack)-1]
rules = l.def.rules[parent.name]
continue next
}
re, err := l.getPattern(candidate)
if err != nil {
return Token{}, wrapf(l.pos, err, "rule %q", candidate.Name)
}
m = re.FindStringSubmatchIndex(l.data)
if m != nil && (match == nil || m[1] > match[1]) {
match = m
rule = &rules[i]
if !l.def.matchLongest {
break
}
}
}
if match == nil || rule == nil {
sample := []rune(l.data)
if len(sample) > 16 {
sample = append(sample[:16], []rune("...")...)
}
return Token{}, errorf(l.pos, "invalid input text %q", string(sample))
}
if rule.Action != nil {
groups := make([]string, 0, len(match)/2)
for i := 0; i < len(match); i += 2 {
groups = append(groups, l.data[match[i]:match[i+1]])
}
if err := rule.Action.applyAction(l, groups); err != nil {
return Token{}, errorf(l.pos, "rule %q: %s", rule.Name, err)
}
} else if match[0] == match[1] {
return Token{}, errorf(l.pos, "rule %q did not match any input", rule.Name)
}
span := l.data[match[0]:match[1]]
l.data = l.data[match[1]:]
// l.groups = groups
// Update position.
pos := l.pos
l.pos.Offset += match[1]
lines := strings.Count(span, "\n")
l.pos.Line += lines
// Update column.
if lines == 0 {
l.pos.Column += utf8.RuneCountInString(span)
} else {
l.pos.Column = utf8.RuneCountInString(span[strings.LastIndex(span, "\n"):])
}
if rule.ignore {
parent = l.stack[len(l.stack)-1]
rules = l.def.rules[parent.name]
continue
}
return Token{
Type: l.def.symbols[rule.Name],
Value: span,
Pos: pos,
}, nil
}
return EOFToken(l.pos), nil
}
func (l *StatefulLexer) getPattern(candidate compiledRule) (*regexp.Regexp, error) {
if candidate.RE != nil {
return candidate.RE, nil
}
// We don't have a compiled RE. This means there are back-references
// that need to be substituted first.
parent := l.stack[len(l.stack)-1]
key := candidate.Pattern + "\000" + strings.Join(parent.groups, "\000")
cached, ok := l.def.backrefCache.Load(key)
if ok {
return cached.(*regexp.Regexp), nil
}
var (
re *regexp.Regexp
err error
)
pattern := backrefReplace.ReplaceAllStringFunc(candidate.Pattern, func(s string) string {
var rematch = backrefReplace.FindStringSubmatch(s)
n, nerr := strconv.ParseInt(rematch[2], 10, 64)
if nerr != nil {
err = nerr
return s
}
if len(parent.groups) == 0 || int(n) >= len(parent.groups) {
err = fmt.Errorf("invalid group %d from parent with %d groups", n, len(parent.groups))
return s
}
// concatenate the leading \\\\ which are already escaped to the quoted match.
return rematch[1][:len(rematch[1])-1] + regexp.QuoteMeta(parent.groups[n])
})
if err == nil {
re, err = regexp.Compile("^(?:" + pattern + ")")
}
if err != nil {
return nil, fmt.Errorf("invalid backref expansion: %q: %s", pattern, err)
}
l.def.backrefCache.Store(key, re)
return re, nil
}