-
-
Notifications
You must be signed in to change notification settings - Fork 401
/
lexer.go
162 lines (132 loc) · 4.43 KB
/
lexer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
package chroma
import (
"fmt"
"strings"
)
var (
defaultOptions = &TokeniseOptions{
State: "root",
EnsureLF: true,
}
)
// Config for a lexer.
type Config struct {
// Name of the lexer.
Name string `xml:"name,omitempty"`
// Shortcuts for the lexer
Aliases []string `xml:"alias,omitempty"`
// File name globs
Filenames []string `xml:"filename,omitempty"`
// Secondary file name globs
AliasFilenames []string `xml:"alias_filename,omitempty"`
// MIME types
MimeTypes []string `xml:"mime_type,omitempty"`
// Regex matching is case-insensitive.
CaseInsensitive bool `xml:"case_insensitive,omitempty"`
// Regex matches all characters.
DotAll bool `xml:"dot_all,omitempty"`
// Regex does not match across lines ($ matches EOL).
//
// Defaults to multiline.
NotMultiline bool `xml:"not_multiline,omitempty"`
// Don't strip leading and trailing newlines from the input.
// DontStripNL bool
// Strip all leading and trailing whitespace from the input
// StripAll bool
// Make sure that the input ends with a newline. This
// is required for some lexers that consume input linewise.
EnsureNL bool `xml:"ensure_nl,omitempty"`
// If given and greater than 0, expand tabs in the input.
// TabSize int
// Priority of lexer.
//
// If this is 0 it will be treated as a default of 1.
Priority float32 `xml:"priority,omitempty"`
// Analyse is a list of regexes to match against the input.
//
// If a match is found, the score is returned if single attribute is set to true,
// otherwise the sum of all the score of matching patterns will be
// used as the final score.
Analyse *AnalyseConfig `xml:"analyse,omitempty"`
}
// AnalyseConfig defines the list of regexes analysers.
type AnalyseConfig struct {
Regexes []RegexConfig `xml:"regex,omitempty"`
// If true, the first matching score is returned.
First bool `xml:"first,attr"`
}
// RegexConfig defines a single regex pattern and its score in case of match.
type RegexConfig struct {
Pattern string `xml:"pattern,attr"`
Score float32 `xml:"score,attr"`
}
// Token output to formatter.
type Token struct {
Type TokenType `json:"type"`
Value string `json:"value"`
}
func (t *Token) String() string { return t.Value }
func (t *Token) GoString() string { return fmt.Sprintf("&Token{%s, %q}", t.Type, t.Value) }
// Clone returns a clone of the Token.
func (t *Token) Clone() Token {
return *t
}
// EOF is returned by lexers at the end of input.
var EOF Token
// TokeniseOptions contains options for tokenisers.
type TokeniseOptions struct {
// State to start tokenisation in. Defaults to "root".
State string
// Nested tokenisation.
Nested bool
// If true, all EOLs are converted into LF
// by replacing CRLF and CR
EnsureLF bool
}
// A Lexer for tokenising source code.
type Lexer interface {
// Config describing the features of the Lexer.
Config() *Config
// Tokenise returns an Iterator over tokens in text.
Tokenise(options *TokeniseOptions, text string) (Iterator, error)
// SetRegistry sets the registry this Lexer is associated with.
//
// The registry should be used by the Lexer if it needs to look up other
// lexers.
SetRegistry(registry *LexerRegistry) Lexer
// SetAnalyser sets a function the Lexer should use for scoring how
// likely a fragment of text is to match this lexer, between 0.0 and 1.0.
// A value of 1 indicates high confidence.
//
// Lexers may ignore this if they implement their own analysers.
SetAnalyser(analyser func(text string) float32) Lexer
// AnalyseText scores how likely a fragment of text is to match
// this lexer, between 0.0 and 1.0. A value of 1 indicates high confidence.
AnalyseText(text string) float32
}
// Lexers is a slice of lexers sortable by name.
type Lexers []Lexer
func (l Lexers) Len() int { return len(l) }
func (l Lexers) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
func (l Lexers) Less(i, j int) bool {
return strings.ToLower(l[i].Config().Name) < strings.ToLower(l[j].Config().Name)
}
// PrioritisedLexers is a slice of lexers sortable by priority.
type PrioritisedLexers []Lexer
func (l PrioritisedLexers) Len() int { return len(l) }
func (l PrioritisedLexers) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
func (l PrioritisedLexers) Less(i, j int) bool {
ip := l[i].Config().Priority
if ip == 0 {
ip = 1
}
jp := l[j].Config().Priority
if jp == 0 {
jp = 1
}
return ip > jp
}
// Analyser determines how appropriate this lexer is for the given text.
type Analyser interface {
AnalyseText(text string) float32
}