package gopapageno

import (
	"context"
	"errors"
	"fmt"
	"unsafe"
)

var (
	ErrInvalid = errors.New("invalid character")
)

// PreambleFunc receives the length of the source and the lexing concurrency.
type PreambleFunc func(sourceLen, concurrency int)

// LexerFunc is the semantic action executed for every matched rule. It fills
// in the provided Token and reports whether the match should be kept, skipped
// or treated as an error.
type LexerFunc func(rule int, text string, start int, end int, thread int, token *Token) LexResult

// Lexer holds the lexing automata and the function implementing the semantic actions.
type Lexer struct {
	Automaton          LexerDFA
	CutPointsAutomaton LexerDFA
	Func               LexerFunc
	PreambleFunc       PreambleFunc
}

// LexerDFAState is a single automaton state: Transitions maps every input byte
// to the index of the next state (-1 if no transition exists), IsFinal marks
// accepting states, and AssociatedRules lists the lexer rules accepting here.
type LexerDFAState struct {
	Transitions     [256]int
	IsFinal         bool
	AssociatedRules []int
}

// LexerDFA is a deterministic finite automaton encoded as a slice of states;
// state 0 is the initial state.
type LexerDFA []LexerDFAState
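
// A minimal hand-built LexerDFA, for illustration only (real automata are
// produced from the lexer description; the rule index 0 is a placeholder):
// a two-state automaton whose single final state is reached on a newline byte.
//
//	noTrans := [256]int{}
//	for i := range noTrans {
//		noTrans[i] = -1
//	}
//	start := LexerDFAState{Transitions: noTrans}
//	start.Transitions['\n'] = 1
//	final := LexerDFAState{Transitions: noTrans, IsFinal: true, AssociatedRules: []int{0}}
//	dfa := LexerDFA{start, final}
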
// Scanner implements reading and tokenization.
type Scanner struct {
	Lexer *Lexer

	source    []byte
	cutPoints []int

	concurrency int

	pools []*Pool[stack[Token]]
}

// Scanner returns a Scanner that tokenizes src, splitting the work across at
// most opts.Concurrency goroutines.
func (l *Lexer) Scanner(src []byte, opts *RunOptions) *Scanner {
	if opts.Concurrency < 1 {
		opts.Concurrency = 1
	}

	s := &Scanner{
		Lexer:       l,
		source:      src,
		cutPoints:   []int{0},
		concurrency: 1,
	}

	s.cutPoints, s.concurrency = s.findCutPoints(opts.Concurrency)
	s.pools = make([]*Pool[stack[Token]], s.concurrency)

	if opts.AvgTokenLength < 1 {
		opts.AvgTokenLength = 1
	}

	stacksNum := stacksCount[Token](s.source, s.concurrency, opts.AvgTokenLength)
	for thread := 0; thread < s.concurrency; thread++ {
		s.pools[thread] = NewPool(stacksNum, WithConstructor(newStack[Token]))
	}

	return s
}
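
// A usage sketch (illustrative, assuming a *Lexer obtained elsewhere, e.g. from
// generated code, and that RunOptions is constructed directly with the
// Concurrency and AvgTokenLength fields used above):
//
//	scanner := lexer.Scanner(source, &RunOptions{Concurrency: 4, AvgTokenLength: 4})
//	tokenLists, err := scanner.Lex(context.Background())
//	if err != nil {
//		// handle the invalid-character error
//	}
//	// tokenLists contains one *LOS[Token] per goroutine actually used.
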
// findCutPoints cuts the source string at specific points determined by the
// cut-points automaton of the lexer description file.
// It returns a slice containing the cut point indices in the source string and
// the number of goroutines to spawn to handle the resulting chunks.
func (s *Scanner) findCutPoints(maxConcurrency int) ([]int, int) {
	sourceLen := len(s.source)
	avgBytesPerThread := sourceLen / maxConcurrency

	cutPoints := make([]int, maxConcurrency+1)
	cutPoints[0] = 0
	cutPoints[maxConcurrency] = len(s.source)

	for i := 1; i < maxConcurrency; i++ {
		startPos := cutPoints[i-1] + avgBytesPerThread
		pos := startPos
		state := s.Lexer.CutPointsAutomaton[0]

		for !state.IsFinal {
			// If the end of the source is reached before a valid cut point is found,
			// keep only the cut points found so far and lower the concurrency accordingly.
			if pos >= sourceLen {
				return append(cutPoints[0:i], cutPoints[maxConcurrency]), i
			}

			stateIdx := state.Transitions[s.source[pos]]

			// No more transitions are possible: reset the automaton to its initial state.
			if stateIdx == -1 {
				startPos = pos + 1
				state = s.Lexer.CutPointsAutomaton[0]
			} else {
				state = s.Lexer.CutPointsAutomaton[stateIdx]
			}
			pos++
		}

		cutPoints[i] = startPos
	}

	return cutPoints, maxConcurrency
}
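
// Worked example (illustrative numbers): for a 1,000,000-byte source and
// maxConcurrency = 4, avgBytesPerThread is 250,000, so the search for the
// interior cut points starts at offsets 250,000, 500,000 and 750,000; each one
// is then moved forward to the start of the first match accepted by the
// cut-points automaton, and the chunk boundaries become
// [0, cut1, cut2, cut3, len(source)].
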
// Lex tokenizes the source in parallel, one goroutine per chunk, and returns
// one list of tokens per goroutine, in chunk order.
func (s *Scanner) Lex(ctx context.Context) ([]*LOS[Token], error) {
	resultCh := make(chan lexResult, s.concurrency)
	errCh := make(chan error, 1)

	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	for thread := 0; thread < s.concurrency; thread++ {
		w := &scannerWorker{
			lexer:       s.Lexer,
			id:          thread,
			stackPool:   s.pools[thread],
			data:        s.source[s.cutPoints[thread]:s.cutPoints[thread+1]],
			pos:         0,
			startingPos: s.cutPoints[thread],
		}

		go w.lex(ctx, resultCh, errCh)
	}

	lexResults := make([]*LOS[Token], s.concurrency)

	completed := 0
	for completed < s.concurrency {
		select {
		case result := <-resultCh:
			lexResults[result.threadID] = result.tokens
			completed++
		case err := <-errCh:
			cancel()
			return nil, err
		}
	}

	return lexResults, nil
}

// scannerWorker implements the tokenizing logic on a subset of the source string.
type scannerWorker struct {
	lexer *Lexer
	id    int

	stackPool *Pool[stack[Token]]

	data        []byte
	pos         int
	startingPos int
}

// lexResult pairs the tokens produced by a worker with the id of the thread that produced them.
type lexResult struct {
	threadID int
	tokens   *LOS[Token]
}

// lex is the lexing function executed in parallel by each thread.
func (w *scannerWorker) lex(ctx context.Context, resultCh chan<- lexResult, errCh chan<- error) {
	los := NewLOS[Token](w.stackPool)

	var token Token
	for {
		token.Value = nil

		result := w.next(&token)
		if result != LexOK {
			if result == LexEOF {
				resultCh <- lexResult{
					threadID: w.id,
					tokens:   los,
				}
				return
			}

			errCh <- ErrInvalid
			return
		}

		los.Push(token)
	}
}

// LexResult is the outcome of matching a single token.
type LexResult uint8

const (
	// LexOK means a token was recognized and should be kept.
	LexOK LexResult = iota
	// LexSkip means a token was recognized but should be discarded (e.g. whitespace).
	LexSkip
	// LexErr means the input could not be matched by any rule.
	LexErr
	// LexEOF means the end of the input chunk was reached.
	LexEOF
)
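
// An illustrative LexerFunc (not generated code: the rule numbers and the use
// of Token.Value here are assumptions made for the example):
//
//	func exampleLexerFunc(rule int, text string, start int, end int, thread int, token *Token) LexResult {
//		switch rule {
//		case 0: // e.g. a whitespace rule: matched but discarded
//			return LexSkip
//		case 1: // e.g. an identifier rule: keep the matched text
//			token.Value = text
//			return LexOK
//		default:
//			return LexErr
//		}
//	}
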
// next scans the input text, fills in the next Token and reports the outcome.
func (w *scannerWorker) next(token *Token) LexResult {
	for {
		var lastFinalStateReached *LexerDFAState = nil
		var lastFinalStatePos int

		startPos := w.pos
		state := &w.lexer.Automaton[0]

		for {
			// If we reach the end of the source data, return EOF.
			if w.pos == len(w.data) {
				return LexEOF
			}

			stateIdx := state.Transitions[w.data[w.pos]]

			// If we are in an invalid state:
			if stateIdx == -1 {
				// If we haven't reached any final state so far, report an error.
				if lastFinalStateReached == nil {
					// Clamp the surrounding context to the chunk bounds to avoid slicing out of range.
					ctxStart := startPos - 5
					if ctxStart < 0 {
						ctxStart = 0
					}
					ctxEnd := w.pos + 5
					if ctxEnd > len(w.data) {
						ctxEnd = len(w.data)
					}
					fmt.Printf("could not parse token '%s' in surrounding %s at position %d\n",
						string(w.data[startPos:w.pos+1]), string(w.data[ctxStart:ctxEnd]), startPos)
					return LexErr
				}

				// Otherwise emit the longest match found so far.
				result := w.advance(token, lastFinalStatePos, lastFinalStateReached, startPos)
				if result == LexSkip {
					break
				}
				return result
			}

			state = &w.lexer.Automaton[stateIdx]

			// If the state is not final, keep lexing.
			if !state.IsFinal {
				w.pos++
				continue
			}

			// Remember the longest match reached so far (maximal munch).
			lastFinalStateReached = state
			lastFinalStatePos = w.pos

			// If this is the last byte of the chunk, emit the match immediately.
			if w.pos == len(w.data)-1 {
				result := w.advance(token, lastFinalStatePos, lastFinalStateReached, startPos)
				if result == LexSkip {
					break
				}
				return result
			}

			w.pos++
		}
	}
}
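
// For instance, with separate rules for "=" and "==" (names illustrative; the
// actual rules come from the lexer description), after matching "=" the loop
// above keeps consuming input, and on a second '=' it records the longer final
// state, so "==" is emitted as a single token: the longest match always wins.
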
// advance sets the position to one byte past the last final state reached,
// builds the matched text and invokes the lexer function for the matched rule.
func (w *scannerWorker) advance(token *Token, lastFinalStatePos int, lastFinalStateReached *LexerDFAState, startPos int) LexResult {
	w.pos = lastFinalStatePos + 1

	ruleNum := lastFinalStateReached.AssociatedRules[0]

	// TODO: should be changed to safe code when Run supports no-op []byte to string conversion
	// text := unsafe.String(unsafe.SliceData(w.data[startPos:w.pos]), w.pos-startPos)
	textBytes := w.data[startPos:w.pos]
	text := *(*string)(unsafe.Pointer(&textBytes))

	// Compute the absolute start and end positions of the current token in the source file.
	tokenStart := w.startingPos + startPos
	tokenEnd := tokenStart + w.pos - startPos - 1

	return w.lexer.Func(ruleNum, text, tokenStart, tokenEnd, w.id, token)
}
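
// Note: the unsafe conversion above avoids the copy that the straightforward
//
//	text := string(w.data[startPos:w.pos])
//
// would perform for every token; this is only safe as long as the source
// buffer is not mutated while the resulting strings are still in use.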