407 lines
10 KiB
Go
407 lines
10 KiB
Go
package lexer
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"sync"
|
|
"unicode"
|
|
"unicode/utf8"
|
|
)
|
|
|
|
var (
	// backrefReplace matches a run of backslashes followed by a single digit,
	// e.g. `\1` or `\\\2`. It is used both to detect back-references in rule
	// patterns (an odd number of leading backslashes) and to expand them
	// against the parent state's capture groups at lexing time.
	backrefReplace = regexp.MustCompile(`(\\+)(\d)`)
)
|
|
|
|
// Option for modifying how the Lexer works.
//
// Options are applied by New() after the definition has been constructed.
type Option func(d *StatefulDefinition)
|
|
|
|
// A Rule matching input and possibly changing state.
type Rule struct {
	// Name of the rule; becomes the token type name. A name starting with a
	// lowercase letter marks a token that is matched but elided from output.
	Name string
	// Pattern is a regular expression, anchored at the current input position
	// (New prepends "^(?:" ... ")").
	Pattern string
	// Action to apply when the rule matches (e.g. Push/Pop), or nil.
	Action Action
}
|
|
|
|
// Rules grouped by name.
//
// Each key is a lexer state name mapping to the ordered list of rules active
// in that state.
type Rules map[string][]Rule
|
|
|
|
// compiledRule is a Rule with its pattern compiled.
type compiledRule struct {
	Rule
	// ignore is true for rules whose name starts with a lowercase letter;
	// their matches consume input but emit no token.
	ignore bool
	// RE is nil when the pattern contains a back-reference; such patterns are
	// expanded and compiled lazily against the parent state's groups.
	RE *regexp.Regexp
}
|
|
|
|
// compiledRules grouped by name.
//
// Mirrors Rules, keyed by state name, with patterns compiled.
type compiledRules map[string][]compiledRule
|
|
|
|
// An Action is applied when a rule matches.
type Action interface {
	// Actions are responsible for validating the match, ie. that they consumed any input.
	applyAction(lexer *StatefulLexer, groups []string) error
}
|
|
|
|
// RulesAction is an optional interface that Actions can implement.
//
// It is applied during rule construction to mutate the rule map (e.g.
// Include() splices one state's rules into another).
type RulesAction interface {
	// applyRules may mutate "rules" in place; construction restarts after
	// every mutation so spliced-in rules are also processed.
	applyRules(state string, rule int, rules compiledRules) error
}
|
|
|
|
// InitialState overrides the default initial state of "Root".
|
|
func InitialState(state string) Option {
|
|
return func(d *StatefulDefinition) {
|
|
d.initialState = state
|
|
}
|
|
}
|
|
|
|
// MatchLongest causes the Lexer to continue checking rules past the first match.
|
|
// If any subsequent rule has a longer match, it will be used instead.
|
|
func MatchLongest() Option {
|
|
return func(d *StatefulDefinition) {
|
|
d.matchLongest = true
|
|
}
|
|
}
|
|
|
|
// ActionPop pops to the previous state when the Rule matches.
type ActionPop struct{}
|
|
|
|
func (p ActionPop) applyAction(lexer *StatefulLexer, groups []string) error {
|
|
if groups[0] == "" {
|
|
return errors.New("did not consume any input")
|
|
}
|
|
lexer.stack = lexer.stack[:len(lexer.stack)-1]
|
|
return nil
|
|
}
|
|
|
|
// Pop to the previous state.
//
// Returns an Action that pops the state stack when its rule matches.
func Pop() Action {
	return ActionPop{}
}
|
|
|
|
// ReturnRule signals the lexer to return immediately.
//
// Next() special-cases this exact rule value: on encountering it the lexer
// pops the state stack and resumes matching in the parent state.
var ReturnRule = Rule{"returnToParent", "", nil}
|
|
|
|
// Return to the parent state.
//
// Useful as the last rule in a sub-state.
func Return() Rule { return ReturnRule }
|
|
|
|
// ActionPush pushes the current state and switches to "State" when the Rule matches.
type ActionPush struct{ State string }
|
|
|
|
func (p ActionPush) applyAction(lexer *StatefulLexer, groups []string) error {
|
|
if groups[0] == "" {
|
|
return errors.New("did not consume any input")
|
|
}
|
|
lexer.stack = append(lexer.stack, lexerState{name: p.State, groups: groups})
|
|
return nil
|
|
}
|
|
|
|
// Push to the given state.
|
|
//
|
|
// The target state will then be the set of rules used for matching
|
|
// until another Push or Pop is encountered.
|
|
func Push(state string) Action {
|
|
return ActionPush{state}
|
|
}
|
|
|
|
// include is the Action backing Include(). It is resolved entirely at
// construction time by applyRules and must never be applied during lexing.
type include struct{ state string }
|
|
|
|
// applyAction should never be invoked: include rules are spliced out of the
// rule set by applyRules during construction.
func (i include) applyAction(lexer *StatefulLexer, groups []string) error {
	panic("should not be called")
}
|
|
|
|
func (i include) applyRules(state string, rule int, rules compiledRules) error {
|
|
includedRules, ok := rules[i.state]
|
|
if !ok {
|
|
return fmt.Errorf("invalid include state %q", i.state)
|
|
}
|
|
clone := make([]compiledRule, len(includedRules))
|
|
copy(clone, includedRules)
|
|
rules[state] = append(rules[state][:rule], append(clone, rules[state][rule+1:]...)...) // nolint: makezero
|
|
return nil
|
|
}
|
|
|
|
// Include rules from another state in this one.
|
|
func Include(state string) Rule {
|
|
return Rule{Action: include{state}}
|
|
}
|
|
|
|
// StatefulDefinition is the lexer.Definition.
type StatefulDefinition struct {
	// rules is the compiled rule set, keyed by state name.
	rules compiledRules
	// symbols maps rule names (plus "EOF") to their TokenType.
	symbols map[string]TokenType
	// Map of key->*regexp.Regexp. Caches lazily-compiled back-reference
	// patterns, keyed by pattern plus the parent state's group values.
	backrefCache sync.Map
	// initialState is "Root" unless overridden via InitialState().
	initialState string
	// matchLongest, set via MatchLongest(), makes the lexer try all rules
	// and pick the longest match instead of stopping at the first.
	matchLongest bool
}
|
|
|
|
// MustSimple creates a new lexer definition based on a single state described by `rules`.
|
|
// panics if the rules trigger an error
|
|
func MustSimple(rules []Rule, options ...Option) *StatefulDefinition {
|
|
def, err := NewSimple(rules, options...)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
return def
|
|
}
|
|
|
|
// MustStateful creates a new stateful lexer and panics if it is incorrect.
|
|
func MustStateful(rules Rules, options ...Option) *StatefulDefinition {
|
|
def, err := New(rules, options...)
|
|
if err != nil {
|
|
panic(err)
|
|
}
|
|
return def
|
|
}
|
|
|
|
// NewSimple creates a new stateful lexer with a single "Root" state.
func NewSimple(rules []Rule, options ...Option) (*StatefulDefinition, error) {
	return New(Rules{"Root": rules}, options...)
}
|
|
|
|
// New constructs a new stateful lexer from rules.
|
|
func New(rules Rules, options ...Option) (*StatefulDefinition, error) {
|
|
compiled := compiledRules{}
|
|
for key, set := range rules {
|
|
for i, rule := range set {
|
|
pattern := "^(?:" + rule.Pattern + ")"
|
|
var (
|
|
re *regexp.Regexp
|
|
err error
|
|
)
|
|
var match = backrefReplace.FindStringSubmatch(rule.Pattern)
|
|
if match == nil || len(match[1])%2 == 0 {
|
|
re, err = regexp.Compile(pattern)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("%s.%d: %s", key, i, err)
|
|
}
|
|
}
|
|
compiled[key] = append(compiled[key], compiledRule{
|
|
Rule: rule,
|
|
ignore: len(rule.Name) > 0 && unicode.IsLower(rune(rule.Name[0])),
|
|
RE: re,
|
|
})
|
|
}
|
|
}
|
|
restart:
|
|
for state, rules := range compiled {
|
|
for i, rule := range rules {
|
|
if action, ok := rule.Action.(RulesAction); ok {
|
|
if err := action.applyRules(state, i, compiled); err != nil {
|
|
return nil, fmt.Errorf("%s.%d: %s", state, i, err)
|
|
}
|
|
goto restart
|
|
}
|
|
}
|
|
}
|
|
keys := make([]string, 0, len(compiled))
|
|
for key := range compiled {
|
|
keys = append(keys, key)
|
|
}
|
|
symbols := map[string]TokenType{
|
|
"EOF": EOF,
|
|
}
|
|
sort.Strings(keys)
|
|
duplicates := map[string]compiledRule{}
|
|
rn := EOF - 1
|
|
for _, key := range keys {
|
|
for i, rule := range compiled[key] {
|
|
if dup, ok := duplicates[rule.Name]; ok && rule.Pattern != dup.Pattern {
|
|
panic(fmt.Sprintf("duplicate key %q with different patterns %q != %q", rule.Name, rule.Pattern, dup.Pattern))
|
|
}
|
|
duplicates[rule.Name] = rule
|
|
compiled[key][i] = rule
|
|
symbols[rule.Name] = rn
|
|
rn--
|
|
}
|
|
}
|
|
d := &StatefulDefinition{
|
|
initialState: "Root",
|
|
rules: compiled,
|
|
symbols: symbols,
|
|
}
|
|
for _, option := range options {
|
|
option(d)
|
|
}
|
|
return d, nil
|
|
}
|
|
|
|
// Rules returns the user-provided Rules used to construct the lexer.
|
|
func (d *StatefulDefinition) Rules() Rules {
|
|
out := Rules{}
|
|
for state, rules := range d.rules {
|
|
for _, rule := range rules {
|
|
out[state] = append(out[state], rule.Rule)
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
func (d *StatefulDefinition) LexString(filename string, s string) (Lexer, error) { // nolint: golint
|
|
return &StatefulLexer{
|
|
def: d,
|
|
data: s,
|
|
stack: []lexerState{{name: d.initialState}},
|
|
pos: Position{
|
|
Filename: filename,
|
|
Line: 1,
|
|
Column: 1,
|
|
},
|
|
}, nil
|
|
}
|
|
|
|
func (d *StatefulDefinition) Lex(filename string, r io.Reader) (Lexer, error) { // nolint: golint
|
|
w := &strings.Builder{}
|
|
_, err := io.Copy(w, r)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return d.LexString(filename, w.String())
|
|
}
|
|
|
|
// Symbols returns the map of rule names (plus "EOF") to their token types.
func (d *StatefulDefinition) Symbols() map[string]TokenType { // nolint: golint
	return d.symbols
}
|
|
|
|
// lexerState is one entry in the lexer's state stack: the state name plus the
// capture groups of the rule that pushed it (used for back-reference
// expansion in getPattern).
type lexerState struct {
	name   string
	groups []string
}
|
|
|
|
// StatefulLexer implementation.
type StatefulLexer struct {
	// stack of lexer states; the top entry's rules are active.
	stack []lexerState
	def   *StatefulDefinition
	// data is the remaining unconsumed input.
	data string
	// pos is the position of the next token to be produced.
	pos Position
}
|
|
|
|
// Next returns the next token from the input, applying rule actions and
// skipping "ignore" rules (lowercase-named) until a reportable token or end
// of input is reached.
func (l *StatefulLexer) Next() (Token, error) { // nolint: golint
	// The active rule set is that of the state on top of the stack.
	parent := l.stack[len(l.stack)-1]
	rules := l.def.rules[parent.name]
next:
	for len(l.data) > 0 {
		var (
			rule  *compiledRule // Winning rule, if any.
			m     []int         // Match indices for the current candidate.
			match []int         // Best match indices seen so far.
		)
		for i, candidate := range rules {
			// Special case "Return()": pop the state stack and restart
			// matching against the parent state's rules.
			if candidate.Rule == ReturnRule {
				l.stack = l.stack[:len(l.stack)-1]
				parent = l.stack[len(l.stack)-1]
				rules = l.def.rules[parent.name]
				continue next
			}
			// getPattern expands back-references against the parent state's
			// groups when the rule was not precompiled.
			re, err := l.getPattern(candidate)
			if err != nil {
				return Token{}, wrapf(l.pos, err, "rule %q", candidate.Name)
			}
			m = re.FindStringSubmatchIndex(l.data)
			if m != nil && (match == nil || m[1] > match[1]) {
				match = m
				rule = &rules[i]
				// Unless MatchLongest was requested, first match wins.
				if !l.def.matchLongest {
					break
				}
			}
		}
		if match == nil || rule == nil {
			// No rule matched; report up to 16 runes of the offending input.
			sample := []rune(l.data)
			if len(sample) > 16 {
				sample = append(sample[:16], []rune("...")...)
			}
			return Token{}, errorf(l.pos, "invalid input text %q", string(sample))
		}

		if rule.Action != nil {
			// Collect the capture groups and hand them to the action (e.g.
			// Push/Pop). Actions validate that input was consumed.
			groups := make([]string, 0, len(match)/2)
			for i := 0; i < len(match); i += 2 {
				groups = append(groups, l.data[match[i]:match[i+1]])
			}
			if err := rule.Action.applyAction(l, groups); err != nil {
				return Token{}, errorf(l.pos, "rule %q: %s", rule.Name, err)
			}
		} else if match[0] == match[1] {
			// An action-less rule that consumed nothing would loop forever.
			return Token{}, errorf(l.pos, "rule %q did not match any input", rule.Name)
		}

		span := l.data[match[0]:match[1]]
		l.data = l.data[match[1]:]
		// l.groups = groups

		// Update position. The returned token carries the position at the
		// start of the match.
		pos := l.pos
		l.pos.Offset += match[1]
		lines := strings.Count(span, "\n")
		l.pos.Line += lines
		// Update column, counting runes rather than bytes.
		if lines == 0 {
			l.pos.Column += utf8.RuneCountInString(span)
		} else {
			l.pos.Column = utf8.RuneCountInString(span[strings.LastIndex(span, "\n"):])
		}
		if rule.ignore {
			// Elide the token. The action may have changed state, so refresh
			// the active rule set before continuing.
			parent = l.stack[len(l.stack)-1]
			rules = l.def.rules[parent.name]
			continue
		}
		return Token{
			Type:  l.def.symbols[rule.Name],
			Value: span,
			Pos:   pos,
		}, nil
	}
	return EOFToken(l.pos), nil
}
|
|
|
|
// getPattern returns the regexp for a rule. Rules without back-references
// were compiled by New(); the rest are expanded here against the parent
// state's capture groups and cached in the definition's backrefCache, keyed
// by pattern plus the group values.
func (l *StatefulLexer) getPattern(candidate compiledRule) (*regexp.Regexp, error) {
	// Precompiled at construction time: no back-references involved.
	if candidate.RE != nil {
		return candidate.RE, nil
	}

	// We don't have a compiled RE. This means there are back-references
	// that need to be substituted first.
	parent := l.stack[len(l.stack)-1]
	key := candidate.Pattern + "\000" + strings.Join(parent.groups, "\000")
	cached, ok := l.def.backrefCache.Load(key)
	if ok {
		return cached.(*regexp.Regexp), nil
	}

	var (
		re  *regexp.Regexp
		err error
	)
	// Replace each `\N` (odd number of backslashes) with the quoted text of
	// parent group N. Errors inside the callback are captured in "err".
	pattern := backrefReplace.ReplaceAllStringFunc(candidate.Pattern, func(s string) string {
		var rematch = backrefReplace.FindStringSubmatch(s)
		n, nerr := strconv.ParseInt(rematch[2], 10, 64)
		if nerr != nil {
			err = nerr
			return s
		}
		if len(parent.groups) == 0 || int(n) >= len(parent.groups) {
			err = fmt.Errorf("invalid group %d from parent with %d groups", n, len(parent.groups))
			return s
		}
		// concatenate the leading \\\\ which are already escaped to the quoted match.
		return rematch[1][:len(rematch[1])-1] + regexp.QuoteMeta(parent.groups[n])
	})
	if err == nil {
		// Anchor the expanded pattern, mirroring New().
		re, err = regexp.Compile("^(?:" + pattern + ")")
	}
	if err != nil {
		return nil, fmt.Errorf("invalid backref expansion: %q: %s", pattern, err)
	}
	l.def.backrefCache.Store(key, re)
	return re, nil
}
|