-
Notifications
You must be signed in to change notification settings - Fork 193
/
Copy pathfilter6.awk
executable file
·263 lines (232 loc) · 6.54 KB
/
filter6.awk
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
#!/usr/bin/awk -f
# Copyright (c) 2021 International Business Machines Corporation
# Prepared by: Geert Janssen <[email protected]>
# Expects a C/C++ tokenizer generated CSV file as input with explicit
# whitespace and separate newline (and continuation) tokens.
# (tokenize -W -n [-N] -mcsv)
# Outputs one possibly modified token (class or literal) per line.
# Tries to use some context to better discriminate the meaning of some
# otherwise ambiguous tokens.
# Should use yacc/bison or lemon?
# Ambiguous tokens in C/C++:
# < > delimiters of filename in preprocessor include directive
# Resolved by using preceding #include context
# < > delimiters of template parameters
# < less than operator
# Resolve: preceding context keyword template, template <+
# > greater than operator
# Resolve: preceding context keyword template <
# " " delimiters of filename in preprocessor include directive
# " " delimiters of string literal
# Resolved by using preceding #include context
# ( ) expression grouping
# ( ) argument list
# { } block
# { } initializer
# [ ] indexing
# [ ] lambda capture
# ~ destructor
# ~ unary operator
# - unary operator
# - binary operator
# Resolve: no white-space after - then unary?
# * unary operator (dereference pointer)
# * binary operator (multiplication)
# * pointer declarator
# & bitwise and operator
# & address of operator
# Can of worms: overloaded operator symbols
# Simplistic CPP line syntax:
# "#" directive-name (token)* newline
# #include <sys.h>
# #include "local"
# #define identifier-macro-def
# #define identifier-macro-const val
# #define identifier-macro-func( ... )
# A stack remembers the CSV token lines whose output is temporarily
# suppressed. That way we can have unbounded lookahead.
# A function empties the stack, printing it from bottom to top.
# push(record): store one CSV token line behind those already held back.
# Uses the globals stack[] (the held lines) and sp (number of held lines).
function push(record) {
  sp += 1
  stack[sp - 1] = record
}
# empty_out(): print every held-back token line, oldest first, and reset
# the stack to empty (sp = 0).
# The extra parameter i is the usual awk way to declare a local variable;
# callers pass no arguments. Without it the loop index would be a global
# and would silently overwrite any other use of i in the program.
function empty_out(    i) {
  for (i = 0; i < sp; i++)
    print stack[i]
  sp = 0
}
# Initialisation: directive table, empty token stack, start state, CSV
# field separators; then copy the CSV header line to the output.
BEGIN {
  # CPP directive-names:
  directive["include"]=1
  directive["define"]=1
  directive["undef"]=1
  directive["if"]=1
  directive["ifdef"]=1
  directive["ifndef"]=1
  directive["else"]=1
  directive["elif"]=1
  directive["endif"]=1
  directive["line"]=1
  directive["pragma"]=1
  directive["error"]=1
  # Empty stack of tokens:
  sp=0
  # Start (current) state:
  state=0
  # Next state:
  next_state=-1 # indicates no specific rule matches
  # Field separator of input record (line):
  FS=","
  # Same separator on output, so that a record rebuilt after assigning to
  # one of its fields is still CSV:
  OFS=","
  # Read the CSV header line and echo it to the output.
  # getline yields 1 on success, 0 at end of input and -1 on error; echo
  # only on success so that empty input does not produce a blank line.
  if ((getline) > 0)
    print
}
# Note: only gawk has switch statement.
# Dispatch on current state and input.
# Make sure all conditions are mutually exclusive, except last one.
# Last one is made exclusive by next_state==-1.
# Must use next_state to avoid immediate action on current line.
# All rules when matched must set next_state to something other than -1.
# Instead of composing new CSV record could also modify $0 via
# assignments to its fields (like $3="identifier").
# State 0, token "#": this starts a preprocessor directive or, in a macro
# body, stringizes the identifier that follows. Hold the token back until
# the next token has been seen.
state == 0 && $4 == "#" {
  next_state = 1
  push($0)
}
# State 0, keyword template: meant to provide context for < and >
# disambiguation. That handling is switched off for now: the token is
# echoed unchanged and the machine stays in state 0.
state == 0 && $4 == "template" {
  next_state = 0
  print $0
}
# State 1 ("#" is held back) and an identifier arrives.
#   #include  -> state 2, keep holding tokens until the file name is seen
#   #define   -> state 7, keep holding tokens until the macro name is seen
#   otherwise -> another directive, or #ident (stringize): nothing more
#                to track, so release the held tokens and go to state 0.
state == 1 && $3 == "identifier" {
  push($0)
  next_state = 0
  if ($4 in directive) {
    if ($4 == "include")
      next_state = 2
    else if ($4 == "define")
      next_state = 7
  }
  if (next_state == 0)
    empty_out()
}
# State 2 (#include seen), token "<": start of a <...> file name.
# The "<" itself is not output and not stored.
state == 2 && $4 == "<" {
  next_state = 3
}
# State 2 (#include seen), string token: #include "...".
# Release the held "#" and "include" tokens, then output the string
# re-labelled as string-local-filename at its original coordinates.
state == 2 && $3 == "string" {
  empty_out()
  # In the CSV the enclosing " of the literal is doubled; dropping two
  # characters at each end of $4 removes that CSV quoting.
  filename = substr($4, 3, length($4) - 4)
  print $1 "," $2 ",string-local-filename," filename
  next_state = 0
}
# State 3 ("#include <" seen): the first token of the file name.
# It is treated specially because its line/column become the coordinates
# of the combined string-sys-filename token that is output later.
# Any token may start the name (e.g. "." in <../x.h> or a number in
# <2d.h>), not just an identifier or keyword. Otherwise such includes
# would fall through to the default rule and lose the suppressed "<".
# Whitespace, newline, continuation and ">" are left to the other rules.
(state == 3 && $3 != "whitespace" && $3 != "newline" && $3 != "continuation" && $4 != ">") {
  id_lin=$1
  id_col=$2
  filename=$4
  # Note: this token is not output here; it is merged into the file name.
  next_state=4
}
# State 4: inside <...>. Every token that is neither a newline nor ">"
# is appended to the file name and not output on its own.
state == 4 && !($3 == "newline" || $4 == ">") {
  next_state = 4
  filename = filename $4
}
# State 4: the file name ends at ">" or, erroneously, at a newline.
# Release the held "#" and "include", then output the collected name as
# one quoted string-sys-filename token at the coordinates of its first
# part. A terminating newline is echoed; a terminating ">" is dropped.
state == 4 && ($4 == ">" || $3 == "newline") {
  next_state = 0
  empty_out()
  print id_lin "," id_col ",string-sys-filename,\"" filename "\""
  if ($3 == "newline")
    print $0
}
# State 5 (template seen), token "<": re-label it as the start of a
# template parameter list.
# The record is composed explicitly instead of assigning to $3: a field
# assignment rebuilds $0 with OFS, which is a space unless set elsewhere,
# and the output would then no longer be CSV.
(state == 5 && $4 == "<") {
  print $1 "," $2 ",start-template-paramlist," $4
  next_state=6
}
# State 6 (template < seen), token ">": explicit specialization,
# template < >. Re-label the ">" as the end of the parameter list.
# The record is composed explicitly instead of assigning to $3: a field
# assignment rebuilds $0 with OFS, which is a space unless set elsewhere,
# and the output would then no longer be CSV.
(state == 6 && $4 == ">") {
  print $1 "," $2 ",end-template-paramlist," $4
  next_state=0
}
# State 7 (#define seen): the macro name, an identifier or a keyword.
# Only its text and coordinates are remembered; the token is output later
# with a class that depends on what follows it.
state == 7 && ($3 == "keyword" || $3 == "identifier") {
  macro_name = $4
  id_col = $2
  id_lin = $1
  next_state = 8
}
# State 8 (macro name seen), "(" follows immediately: function-like macro.
# Output the held tokens, the name as identifier-macro-func, then the "(".
state == 8 && $4 == "(" {
  next_state = 0
  empty_out()
  print id_lin "," id_col ",identifier-macro-func," macro_name
  print $0
}
# State 8 (macro name seen), whitespace follows: the next token decides
# between a bare definition and a macro with a value.
# The whitespace token itself is not output.
state == 8 && $3 == "whitespace" {
  next_state = 9
}
# State 8 or 9, newline: "#define name" with nothing after the name
# (apart from optional whitespace). Output the held tokens, the name as
# identifier-macro-def, then the newline.
$3 == "newline" && (state == 8 || state == 9) {
  next_state = 0
  empty_out()
  print id_lin "," id_col ",identifier-macro-def," macro_name
  print $0
}
# Handle #define name followed by a replacement token: the name is
# output as identifier-macro-const, followed by the current token.
# Two situations lead here:
#  - state 9: name, whitespace, then anything but a newline;
#  - state 8: the name is directly followed by a token that is not "(",
#    whitespace, newline or continuation. Without this case such a token
#    would reach the default rule and the remembered macro name would
#    never be output.
((state == 9 && $3 != "newline") || (state == 8 && $4 != "(" && $3 != "whitespace" && $3 != "newline" && $3 != "continuation")) {
  empty_out()
  print id_lin "," id_col ",identifier-macro-const," macro_name
  print $0
  next_state=0
}
# Default rule; always executed:
# 1. no prior rule matched (next_state is still -1):
#    - whitespace: dropped, state unchanged;
#    - continuation: state unchanged (the line goes on); the token is
#      queued behind any held-back tokens so the output keeps the input
#      order, or printed at once when nothing is held back;
#    - newline: ends an unfinished preprocessor directive; held-back
#      tokens are output first, then the newline, and the state returns
#      to 0 (a null directive "#" would otherwise be printed after its own
#      newline). The template states 5 and 6 are kept across newlines;
#    - any other token: output held-back tokens (to not lose them), print
#      the token, and return to state 0 to recover quickly from
#      unexpected input.
# 2. some rule matched:
#    - simply move on to the next state as stated in that rule
#    - reset next_state to -1
{
  if (next_state == -1) {
    if ($3 == "continuation") {
      if (sp > 0)
        push($0)
      else
        print $0
    }
    else if ($3 != "whitespace") {
      empty_out()
      if ($3 != "newline" || (state != 5 && state != 6))
        state=0
      print $0
    }
    # otherwise (whitespace): no output, do not change state!
  }
  else {
    state=next_state
    next_state=-1
  }
}
# End of input. If the last line had no terminating newline token, a
# directive may still be pending; output what is held so it is not lost.
END {
  # Held-back tokens ("#", directive name, ...):
  empty_out()
  # A partly collected <...> file name; act as if ">" was present:
  if (state == 4)
    print id_lin "," id_col ",string-sys-filename,\"" filename "\""
  # A macro name with nothing after it: a plain definition.
  else if (state == 8 || state == 9)
    print id_lin "," id_col ",identifier-macro-def," macro_name
}