// Package tokenizer provides a high-performance generic tokenizer (lexer)
// that can parse any string, slice, or infinite buffer into tokens.
// It is highly customizable and can be used, for example, by higher-level
// parsers for writing DSLs.
package tokenizer

import (
	"io"
	"sort"
	"sync"
)

const newLine = '\n'

// TokenKey is a token type identifier.
type TokenKey int

const (
	// TokenUnknown means that this token is neither an embedded token nor a user-defined one.
	TokenUnknown TokenKey = -6
	// TokenStringFragment means that this token is a fragment of a quoted string with injections.
	// For example, in "one {{ two }} three", the pieces "one " and " three" are TokenStringFragment.
	TokenStringFragment TokenKey = -5
	// TokenString means that this token is a quoted string.
	// For example, "one two"
	TokenString TokenKey = -4
	// TokenFloat means that this token is a float number with a decimal point and/or an exponent.
	// For example, 1.2, 1e6, 1E-6
	TokenFloat TokenKey = -3
	// TokenInteger means that this token is an integer number.
	// For example, 3, 49983
	TokenInteger TokenKey = -2
	// TokenKeyword means that this token is a word.
	// For example, one, two, три
	TokenKeyword TokenKey = -1
	// TokenUndef means that the token doesn't exist.
	// When the stream is out of the range of the token list, any getter or checker will return a TokenUndef token.
	TokenUndef TokenKey = 0
)
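
// For example, a sketch of the built-in classification (quoted strings are
// recognized as TokenString only after a quote is defined via DefineStringToken):
//
//	stream := tokenizer.New().ParseString(`sum 42 9.5`)
//	// token keys in order: TokenKeyword, TokenInteger, TokenFloat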

// BackSlash is just the backslash byte.
const BackSlash = '\\'

// DefaultWhiteSpaces is the default set of whitespace symbols between tokens.
var DefaultWhiteSpaces = []byte{' ', '\t', '\n', '\r'}

// DefaultStringEscapes is the default set of escaped symbols. These symbols are widely used.
//
// Deprecated: use DefaultSpecialString and AddSpecialStrings.
var DefaultStringEscapes = map[byte]byte{
	'n':  '\n',
	'r':  '\r',
	't':  '\t',
	'\\': '\\',
}

// DefaultSpecialString is the default set of escapable strings.
var DefaultSpecialString = []string{
	"\\",
	"n",
	"r",
	"t",
}

// Numbers is the set of decimal digit runes.
var Numbers = []rune{'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}

// Underscore contains the underscore rune.
var Underscore = []rune{'_'}

// tokenRef describes one token.
type tokenRef struct {
	// Key is the token type. Not unique.
	Key TokenKey
	// Token is the token value as is. Should be unique.
	Token []byte
}

// QuoteInjectSettings describes the open and close tokens of an injection inside a quoted string.
type QuoteInjectSettings struct {
	// StartKey is the token type which opens the injection.
	StartKey TokenKey
	// EndKey is the token type which closes the injection.
	EndKey TokenKey
}

// StringSettings describes framed (quoted) string tokens, such as quoted strings or line comments.
type StringSettings struct {
	Key          TokenKey
	StartToken   []byte
	EndToken     []byte
	EscapeSymbol byte
	SpecSymbols  [][]byte
	Injects      []QuoteInjectSettings
}

// AddInjection configures an injection into the string.
// An injection is a parsable fragment of a framed (quoted) string,
// often used to parse placeholders or template expressions in the framed string.
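//
// For example, a sketch (TokenOpenInject, TokenCloseInject, and TokenQuoted are
// hypothetical user-defined keys):
//
//	t.DefineTokens(TokenOpenInject, []string{"{{"})
//	t.DefineTokens(TokenCloseInject, []string{"}}"})
//	t.DefineStringToken(TokenQuoted, `"`, `"`).
//		AddInjection(TokenOpenInject, TokenCloseInject)
//	// `"one {{ two }} three"` now parses as TokenStringFragment("one "),
//	// the injected keyword `two`, and TokenStringFragment(" three")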
func (q *StringSettings) AddInjection(startTokenKey, endTokenKey TokenKey) *StringSettings {
	q.Injects = append(q.Injects, QuoteInjectSettings{StartKey: startTokenKey, EndKey: endTokenKey})
	return q
}

// SetEscapeSymbol sets the escape symbol for a framed (quoted) string.
// The escape symbol allows the close token of a framed string to be ignored.
// It also allows the use of special symbols in framed strings, like \n and \t.
func (q *StringSettings) SetEscapeSymbol(symbol byte) *StringSettings {
	q.EscapeSymbol = symbol
	return q
}

// SetSpecialSymbols sets the mapping of all escapable symbols for the escape symbol, like \n, \t, \r.
//
// Deprecated: use AddSpecialStrings.
func (q *StringSettings) SetSpecialSymbols(special map[byte]byte) *StringSettings {
	// Register the symbols as they appear after the escape character (the map
	// keys), matching the format AddSpecialStrings expects; iterating the map
	// values would register the decoded bytes (e.g. '\n') instead of 'n'.
	for symbol := range special {
		q.SpecSymbols = append(q.SpecSymbols, []byte{symbol})
	}
	return q
}

// AddSpecialStrings sets the mapping of all escapable strings for the escape symbol, like \n, \t, \r.
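//
// For example, a sketch (TokenQuoted is a hypothetical user-defined key):
//
//	t.DefineStringToken(TokenQuoted, `"`, `"`).
//		SetEscapeSymbol(tokenizer.BackSlash).
//		AddSpecialStrings(tokenizer.DefaultSpecialString)
//	// the escape symbol now lets `\"` and `\n` appear inside the quoted string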
func (q *StringSettings) AddSpecialStrings(special []string) *StringSettings {
	for _, s := range special {
		q.SpecSymbols = append(q.SpecSymbols, []byte(s))
	}
	return q
}

// Tokenizer stores all token configuration and behaviors.
type Tokenizer struct {
	stopOnUnknown         bool
	allowNumberUnderscore bool
	// all defined custom tokens: {key: [token1, token2, ...], ...}
	tokens map[TokenKey][]*tokenRef
	// index of tokens by their first byte, sorted longest-first
	index   map[byte][]*tokenRef
	quotes  []*StringSettings
	wSpaces []byte
	kwMajorSymbols []rune
	kwMinorSymbols []rune
	pool sync.Pool
}

// New creates a new tokenizer.
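//
// For example, a minimal sketch of configuring and using a tokenizer
// (TokenEquals is a hypothetical user-defined key):
//
//	t := tokenizer.New().AllowNumberUnderscore()
//	t.DefineTokens(TokenEquals, []string{"="})
//	stream := t.ParseString(`price = 1_000`)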
func New() *Tokenizer {
	t := Tokenizer{
		tokens:  map[TokenKey][]*tokenRef{},
		index:   map[byte][]*tokenRef{},
		quotes:  []*StringSettings{},
		wSpaces: DefaultWhiteSpaces,
	}
	t.pool.New = func() interface{} {
		return new(Token)
	}
	return &t
}

// SetWhiteSpaces sets custom whitespace symbols between tokens.
// By default: `{' ', '\t', '\n', '\r'}`.
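//
// For example, a sketch that keeps only space and tab as token separators:
//
//	t.SetWhiteSpaces([]byte{' ', '\t'})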
func (t *Tokenizer) SetWhiteSpaces(ws []byte) *Tokenizer {
	t.wSpaces = ws
	return t
}

// AllowKeywordSymbols sets the major and minor symbols for keywords.
// Major symbols (in any quantity) may appear at the beginning, in the middle, and at the end of a keyword.
// Minor symbols (in any quantity) may appear in the middle and at the end of a keyword.
//
//	parser.AllowKeywordSymbols(tokenizer.Underscore, tokenizer.Numbers)
//	// allows: "_one23", "__one2__two3"
//	parser.AllowKeywordSymbols([]rune{'_', '@'}, tokenizer.Numbers)
//	// allows: "one@23", "@_one_two23", "_one23", "_one2_two3", "@@one___two@_9"
//
// Beware: the tokenizer does not control consecutive duplicates of these runes.
func (t *Tokenizer) AllowKeywordSymbols(majorSymbols []rune, minorSymbols []rune) *Tokenizer {
	t.kwMajorSymbols = majorSymbols
	t.kwMinorSymbols = minorSymbols
	return t
}

// AllowKeywordUnderscore allows the underscore symbol in keywords, like `one_two` or `_three`.
//
// Deprecated: use AllowKeywordSymbols.
func (t *Tokenizer) AllowKeywordUnderscore() *Tokenizer {
	t.kwMajorSymbols = append(t.kwMajorSymbols, '_')
	return t
}

// AllowNumbersInKeyword allows numbers in keywords, like `one1` or `r2d2`.
// The keyword itself must not start with a number, and there must be
// no spaces between letters and numbers.
//
// Deprecated: use AllowKeywordSymbols.
func (t *Tokenizer) AllowNumbersInKeyword() *Tokenizer {
	t.kwMinorSymbols = append(t.kwMinorSymbols, Numbers...)
	return t
}

// StopOnUndefinedToken stops parsing when an unknown token is detected.
func (t *Tokenizer) StopOnUndefinedToken() *Tokenizer {
	t.stopOnUnknown = true
	return t
}

// AllowNumberUnderscore allows the underscore symbol in numbers, like `1_000`.
func (t *Tokenizer) AllowNumberUnderscore() *Tokenizer {
	t.allowNumberUnderscore = true
	return t
}

// DefineTokens adds custom tokens.
// `key` is the identifier of `tokens`; `tokens` is a slice of tokens as strings.
// If the key already exists, its tokens are rewritten.
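//
// For example, a sketch (TokenOperator is a hypothetical user-defined key):
//
//	t.DefineTokens(TokenOperator, []string{">", ">=", "=", "=="})
//	// tokens sharing a first byte are indexed longest-first,
//	// so ">=" is matched before ">"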
func (t *Tokenizer) DefineTokens(key TokenKey, tokens []string) *Tokenizer {
	var tks []*tokenRef
	if key < 1 {
		return t
	}
	for _, token := range tokens {
		ref := tokenRef{
			Key:   key,
			Token: s2b(token),
		}
		head := ref.Token[0]
		tks = append(tks, &ref)
		if t.index[head] == nil {
			t.index[head] = []*tokenRef{}
		}
		t.index[head] = append(t.index[head], &ref)
		// Keep tokens that share a first byte sorted longest-first so the
		// parser always prefers the longest match (e.g. ">=" over ">").
		sort.Slice(t.index[head], func(i, j int) bool {
			return len(t.index[head][i].Token) > len(t.index[head][j].Token)
		})
	}
	t.tokens[key] = tks
	return t
}

// DefineStringToken defines a framed string token.
// For example, a piece of data surrounded by quotes: "string in quotes" or 'string in single quotes'.
// The startToken and endToken arguments define the open and close "quotes".
//
// - `t.DefineStringToken(10, "`", "`")`: the string "one `two three`" will be parsed as
//   [{key: TokenKeyword, value: "one"}, {key: TokenString, value: "`two three`"}]
//
// - `t.DefineStringToken(11, "//", "\n")`: the string "parse // like comment\n" will be parsed as
//   [{key: TokenKeyword, value: "parse"}, {key: TokenString, value: "// like comment"}]
func (t *Tokenizer) DefineStringToken(key TokenKey, startToken, endToken string) *StringSettings {
	q := &StringSettings{
		Key:        key,
		StartToken: s2b(startToken),
		EndToken:   s2b(endToken),
	}
	if q.StartToken == nil {
		return q
	}
	t.quotes = append(t.quotes, q)
	return q
}

// allocToken returns a Token from the pool, allocating a new one if the pool is empty.
func (t *Tokenizer) allocToken() *Token {
	return t.pool.Get().(*Token)
}

// freeToken resets the token and returns it to the pool for reuse.
func (t *Tokenizer) freeToken(token *Token) {
	token.next = nil
	token.prev = nil
	token.value = nil
	token.indent = nil
	token.offset = 0
	token.line = 0
	token.id = 0
	token.key = 0
	token.string = nil
	t.pool.Put(token)
}

// ParseString parses a string into a stream of tokens.
func (t *Tokenizer) ParseString(str string) *Stream {
	return t.ParseBytes(s2b(str))
}

// ParseBytes parses a slice of bytes into a stream of tokens.
func (t *Tokenizer) ParseBytes(str []byte) *Stream {
	p := newParser(t, str)
	p.parse()
	return NewStream(p)
}

// ParseStream parses an infinite stream of bytes into an infinite stream of tokens.
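//
// For example, a sketch (r may be any io.Reader, such as a network connection;
// bufferSize sets the preload buffer size):
//
//	stream := t.ParseStream(r, 4096)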
func (t *Tokenizer) ParseStream(r io.Reader, bufferSize uint) *Stream {
	p := newInfParser(t, r, bufferSize)
	p.preload()
	p.parse()
	return NewInfStream(p)
}