-
Notifications
You must be signed in to change notification settings - Fork 193
/
Copy pathtokml.c
223 lines (185 loc) · 6.11 KB
/
tokml.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
/* Copyright (c) 2021, 2022 International Business Machines Corporation
Prepared by: Geert Janssen <[email protected]>
Tokenizer for C, C++ and Java with output as annotated XML,
much like srcML annotates a parse tree. Any white-space (including
newlines) is output as is, without any special XML element.
All other tokens (even comments) are output as a stream of XML
elements with tag names indicating the type/kind/class of
token provided as the enclosed text node.
<?xml version='1.0' encoding='UTF-8'?>
<source language='' filename=''>
<@kind@ line='' col='' len=''>...</@kind@>
</source>
Note that end-of-line characters (\r, \n) and sequences (\r \n) are
normalized and will always be output as a LINEFEED (LF, 0x0A).
The characters <, >, and & will be replaced by the special XML entities
&lt;, &gt; and &amp; respectively.
To undo the XML annotation in <file>.xml use either:
(this will also correctly revert the XML entities)
xmlstarlet sel -T -t -v 'source' <file>.xml, or
xidel -s -e 'source' <file>.xml
Useful xpath queries:
(the results show all occurrences and these are not necessarily unique)
- all identifiers: //identifier
- the length of the last identifier: //identifier[last()]/@len
- the value of the first integer: //integer[1]
- all comments starting at the beginning of a line:
//line_comment[@col=0]|//block_comment[@col=0]
- all while keywords: //keyword[text()="while"]
- identifiers of length greater than 10: //identifier[@len>10]
- tokens immediately following a long identifier:
//identifier[@len>15]/following-sibling::*[1]
- tokens immediately following the keyword static:
//keyword[text()="static"]/following-sibling::*[1]
*/
#include <unistd.h> /* getopt() */
#include <libgen.h> /* basename() */
#include "libtoken.h"
/* Program entry point: tokenizes each input file (or stdin when no file
   arguments are given) and writes the tokens to stdout as annotated XML.

   Command line options are parsed with getopt(); the -h text below lists
   them. Whitespace tokens are copied through verbatim; every other token
   is wrapped in an element named after its token class and carrying
   line, col and len attributes.

   Returns 0 after -h or a normal run, 1 on an unknown option or when the
   `illegals` or `unexpect_eof` globals are set after processing. Exits
   with status 3 when the -o output file cannot be opened. */
int main(int argc, char *argv[])
{
  extern char *optarg;
  extern int optind;

  /* NOTE(review): 'r' is accepted by getopt() here but has no case in the
     switch below, so -r ends up being reported as an unknown option. */
  char const *opt_str = "1acdhl:o:rvw";
  int option;

  const char *token;
  enum TokenClass type;
  unsigned line;
  unsigned col;
  unsigned pos;
  unsigned token_len;

  unsigned num_files = 0;    // number of inputs actually processed
  int continuous_files = 0;  // when 1 do not reset after each file
  char *outfile = 0;
  Language source;           // assigned by -l or detected per input below
  int explicit_source = 0;
  int append = 0;
  int use_stdin;

  /* This tool wants comments and whitespace delivered as tokens. */
  comment_token = 1;
  whitespace_token = 1;

  /* Process arguments (POSIX getopt() returns -1 when done): */
  while ((option = getopt(argc, argv, opt_str)) != -1) {
    switch (option) {

    case '1':
      continuous_files = 1;
      break;

    case 'a':
      append = 1;
      break;

    case 'c':
      hash_as_comment = 1;
      break;

    case 'd':
      debug = verbose = 1;
      break;

    case 'h':
      fputs(
"A tokenizer for C/C++ (and Java) source code with output in XML.\n"
"Recognizes the following token classes: keyword, identifier, integer,\n"
"floating, string, character, operator, preprocessor, line_comment,\n"
"and block_comment.\n\n", stderr);
      fprintf(stderr, "usage: %s [ -%s ] [ FILES ]\n",
              basename(argv[0]), opt_str);
      fputs(
"\nCommand line options are:\n"
"-a : append to output file instead of create or overwrite.\n"
"-c : treat a # character as the start of a line comment.\n"
"-d : print debug info to stderr; implies -v.\n"
"-h : print just this text to stderr and stop.\n"
"-l<lang> : specify language explicitly (C, C++, Java).\n"
"-o<file> : write output to this file (instead of stdout).\n"
"-1 : treat all filename arguments as a continuous single input.\n"
"-v : print action summary to stderr.\n"
"-w : suppress all warning messages.\n",
            stderr);
      return 0;

    case 'l':
      source = set_or_detect_lang(optarg);
      explicit_source = 1;
      break;

    case 'o':
      outfile = optarg;
      break;

    case 'v':
      verbose = 1;
      break;

    case 'w':
      nowarn = 1;
      break;

    case '?':
    default:
      fputs("(F): unknown option. Stop.\n", stderr);
      fprintf(stderr, "usage: %s [ -%s ] [ FILES ]\n", argv[0], opt_str);
      return 1;
    }
  }

  /* Redirect stdout to the requested output file, if any. */
  if (outfile && outfile[0]) {
    if (!freopen(outfile, append ? "a" : "w", stdout)) {
      fprintf(stderr, "(F): cannot open %s for writing.\n", outfile);
      exit(3);
    }
  }

  /* Without file arguments the loop body runs exactly once on stdin:
     optind == argc, so the loop condition fails after the first pass. */
  use_stdin = (optind == argc);

  do {
    if (!use_stdin) {
      filename = argv[optind];
      if (!freopen(filename, "r", stdin)) {
        if (!nowarn)
          fprintf(stderr, "(W): Cannot read file %s.\n", filename);
        continue;  /* proceeds to the loop condition, i.e. the next file */
      }
    }

    /* Also done for stdin, so `source` is never read uninitialized.
       NOTE(review): presumably detection is based on the global
       `filename` -- confirm it copes with the stdin default. */
    if (!explicit_source)
      source = set_or_detect_lang(0);

    if (verbose) fprintf(stderr, "(I): Processing file %s...\n", filename);
    num_files++;

    // Header (only once when all inputs form one continuous document):
    if (!continuous_files || num_files == 1) {
      fputs("<?xml version='1.0' encoding='UTF-8'?>\n", stdout);
      // standalone="yes"
      fprintf(stdout, "<source language='%s' filename='", lang_name(source));
      /* Escape the name so that <, > and & cannot break the attribute.
         NOTE(review): assumes XML_escape handles exactly those characters
         as the file header describes; an apostrophe in the file name
         would still terminate the attribute early -- confirm. */
      XML_escape(stdout, filename);
      fputs("'>", stdout);
    }

    while ((token_len = C_tokenize_int(&token, &type, &line, &col, &pos))) {
      /* Whitespace is passed through without an enclosing element. */
      if (type == WHITESPACE) {
        fputs(token, stdout);
        continue;
      }

      fprintf(stdout, "<%s line='%u' col='%u' len='%u'>",
              token_class[type], line, col, token_len);

      /* Only these classes are run through XML_escape; all others are
         written as is. */
      if (type == STRING ||
          type == CHARACTER ||
          type == OPERATOR ||
          type == LINE_COMMENT ||
          type == BLOCK_COMMENT)
        XML_escape(stdout, token);
      else
        fputs(token, stdout);

      fprintf(stdout, "</%s>", token_class[type]);
    }

    if (!continuous_files) {
      // Trailer:
      fputs("</source>\n", stdout);

      if (verbose)
        fprintf (stderr, "(I): %u bytes, %u UTF-8 encoded chars.\n",
                 char_count, utf8_count);

      // Reset globals so the next file starts from a clean state:
      char_count = 0;
      utf8_count = 0;
      linenr = 1;
      column = 0;
      buffered = 0;
      saved_col = 0;
    }
  } while (++optind < argc);

  if (continuous_files) {
    // Trailer; skipped when no input could be opened, because then no
    // header was written either.
    if (num_files > 0)
      fputs("</source>\n", stdout);

    if (verbose)
      fprintf(stderr, "(I): %u bytes, %u (UTF-8 encoded) unicode characters.\n",
              char_count, utf8_count);
  }

  if (num_files > 1 && verbose)
    fprintf(stderr, "(I): Total number of files processed: %u\n", num_files);

  return (illegals || unexpect_eof) ? 1 : 0;
}