-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathsub3.c
395 lines (369 loc) · 10.7 KB
/
sub3.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright 2005 Sun Microsystems, Inc.
* All rights reserved.
* Use is subject to license terms.
*/
/* from OpenSolaris "sub3.c 1.8 05/06/08 SMI" */
/*
* Portions Copyright (c) 2005 Gunnar Ritter, Freiburg i. Br., Germany
*
* Sccsid @(#)sub3.c 1.4 (gritter) 11/26/05
*/
/*
* sub3.c ... ALE enhancement.
* Since a typical Asian language has a huge character set, it is not
* ideal to index an array by a character code itself, which requires
* as large as 2**16 entries per array.
* To get arround this problem, we identify a set of characters that
* causes the same transition on all states and call it character group.
* Every character in a same character group has a unique number called
* character group id. A function yycgid(c) maps the character c (in process
* code) to the id. This mapping is determined by analyzing all regular
* expressions in the lex program.
*
*/
#include <stdlib.h>
#ifdef __sun
#include <widec.h>
#endif
#include "search.h"
#include "ldefs.h"
#include <ctype.h>
/*
* "lchar" stands for linearized character. It is a variant of
* process code. AT&T's 16-bit process code has a drawback in which
* for three three process code C, D and E where C <= D <= E,
* codeset(C)==codeset(E) does not mean codeset(D)==codeset(C).
* In other words, four codesets alternates as the magnitude
* of character increases.
* The lchar representation holds this property:
* If three lchar C', D' and E' have the relationship C' < D' < E' and
* codeset(C') == codeset(E') then D' is guaranteed to belong to
* the same codeset as C' and E'.
* lchar is implemented as 32 bit entities and the function linearize()
* that maps a wchar_t to lchar is defined below. There is no
* reverse function for it though.
* The 32-bit process code by AT&T, used only for Taiwanese version at the
* time of wrting, has no such problem and we use it as it is.
*/
lchar yycgidtbl[MAXNCG] = {
0, /* For ease of computation of the id. */
'\n', /* Newline is always special because '.' exclude it. */
0x000000ff, /* The upper limit of codeset 0. */
0x20ffffff, /* The upper limit of codeset 2. */
0x40ffffff /* The upper limit of codeset 3. */
/* 0x60ffffff The upper limit of codeset 1. */
/* Above assumes the number of significant bits of wchar_t is <= 24. */
};
int ncgidtbl = 5; /* # elements in yycgidtbl. */
int ncg; /* Should set to ncgidtbl*2; this is the largest value yycgid() */
/* returns plus 1. */
static void setsymbol(int i);
/*
* For given 16-bit wchar_t (See NOTE), lchar is computed as illustrated below:
*
* wc: axxxxxxbyyyyyyy
*
* returns: 0ab0000000000000axxxxxxxbyyyyyyy
*
* linearize() doesn't do any if compiled with 32-bit wchar_t, use of
* which is flagged with LONG_WCHAR_T macro.
* NOTE:
* The implementation is highly depends on the process code representation.
* This function should be modified when 32-bit process code is used.
* There is no need to keep 'a' and 'b' bits in the lower half of lchar.
* You can actually omit these and squeeze the xxxxxx part one bit right.
* We don't do that here just in sake of speed.
*/
lchar
linearize(wchar_t wc)
{
#ifdef LONG_WCHAR_T
return ((lchar)wc); /* Don't do anything. */
#else
lchar prefix;
switch (wc&0x8080) {
case 0x0000: prefix = 0x00000000; break;
case 0x0080: prefix = 0x20000000; break;
case 0x8000: prefix = 0x40000000; break;
case 0x8080: prefix = 0x60000000; break;
}
return (prefix|wc);
#endif
}
/* compare liniear characters pointed to by pc1 and pc2 */
int
cmplc(const void *arg1, const void *arg2)
{
lchar *pc1 = (lchar *)arg1;
lchar *pc2 = (lchar *)arg2;
if (*pc1 > *pc2)
return (1);
else if (*pc1 == *pc2)
return (0);
else
return (-1);
}
void
remch(wchar_t c)
{
lchar lc = linearize(c);
/*
* User-friendliness consideration:
* Make sure no EUC chars are used in reg. exp.
*/
if (!handleeuc) {
if (!isascii(c)) {
if (iswprint(c))
warning(
"Non-ASCII character '%lc' in pattern; use -w or -e lex option.", c);
else warning(
"Non-ASCII character of value %#x in pattern; use -w or -e lex option.", c);
}
/* In any case, we don't need to construct ncgidtbl[]. */
return;
}
xlsearch(&lc, yycgidtbl,
(unsigned *)&ncgidtbl, sizeof (lchar), cmplc);
}
void
sortcgidtbl(void)
{
if (!handleeuc)
return;
qsort(yycgidtbl, ncgidtbl, sizeof (lchar), cmplc);
}
/*
* int yycgid(wchar_t c)
* Takes c and returns its character group id, determind by the
* following algorithm. The program also uses the binary search
* algorithm, generalized from Knuth (6.2.1) Algorithm B.
*
* This function computes the "character group id" based on
* a table yycgidtbl of which each lchar entry is pre-sorted
* in ascending sequence The number of valid entries is given
* by YYNCGIDTBL. There is no duplicate entries in yycgidtbl.
* const int YYNCGIDTBL;
* lchar yycgidtbl[YYNCGIDTBL];
*
* yycgidtbl[0] is guaranteed to have zero.
*
* For given c, yycgid(c) returns:
* 2*i iff yycgidtbl[i] == lc
* 2*i+1 iff yycgidtbl[i] < lc < yycgidtbl[i+1]
* YYNCGIDTBL*2-1
* iff yycgidtbl[YYNCGIDTBL-1] < lc
* where lc=linearize(c).
*
* Some interesting properties.:
* 1. For any c, 0 <= yycgid(c) <= 2*YYNCGIDTBL-1
* 2. yycgid(c) == 0 iff c == 0.
* 3. For any wchar_t c and d, if linearize(c) < linearize(d) then
* yycgid(c) <= yycgid(d).
* 4. For any wchar_t c and d, if yycgid(c) < yycgid(d) then
* linearize(c) < linearize(d).
*/
#define YYNCGIDTBL ncgidtbl
int
yycgid(wchar_t c)
{
int first = 0;
int last = YYNCGIDTBL - 1;
lchar lc;
/*
* In ASCII compat. mode, each character forms a "group" and the
* group-id is itself...
*/
if (!handleeuc)
return (c);
lc = linearize(c);
/* An exceptional case: yycgidtbl[YYNCGIDTBL-1] < lc */
if (yycgidtbl[YYNCGIDTBL - 1] < lc)
return (YYNCGIDTBL*2 - 1);
while (last >= 0) {
int i = (first+last)/2;
if (lc == yycgidtbl[i])
return (2*i); /* lc exactly matches an element. */
else if (yycgidtbl[i] < lc) {
if (lc < yycgidtbl[i+1])
return (2*i+1); /* lc is in between two elements. */
else
first = i + 1;
} else
last = i - 1;
}
error(
"system error in yycgid():binary search failed for c=0x%04x\n", c);
return (0);
}
/*
* repbycgid --- replaces each character in the parsing tree by its
* character group id. This, however, should be called even in
* the ASCII compat. mode to process DOT nodes and to call cclinter()
* for the DOT and CCL nodes.
*/
void
repbycgid(void)
{
int i, c;
for (i = 0; i < tptr; ++i) {
c = name[i];
if (!ISOPERATOR(c)) {
/* If not an operator, it must be a char. */
name[i] = yycgid((wchar_t)c); /* So replace it. */
#ifdef DEBUG
if (debug) {
printf("name[%d]:'%c'->%d;\n", i, c, name[i]);
}
#endif
} else if (c == RSTR) {
c = right[i];
right[i] = yycgid((wchar_t)c);
#ifdef DEBUG
if (debug) {
printf(
"name[%d].right:'%c'->%d;\n", i, c, right[i]);
}
#endif
} else if ((c == RCCL) || (c == RNCCL)) {
CHR cc, *s;
int j;
CHR ccltoken[CCLSIZE];
CHR *ccp;
int m;
/*
* This node represetns a character class RE [ccccc]
* s points to the string of characters that forms
* the class and/or a special prefix notation
* <RANGE>XY which corresponds to the RE X-Y,
* characters in the range of X and Y. Here,
* X <= Y is guranteed.
* We transform these characters into a string
* of sorted character group ids.
*
* There is another mechanism of packing tables
* that is inherited from the ASCII lex. Call of
* cclinter() is required for this packing.
* This used to be done as yylex() reads the lex
* rules but we have to do this here because the
* transition table is made to work on the char-group
* ids and the mapping cannot be determined until
* the entire file is read.
*/
#ifdef DEBUG
if (debug) {
printf("name[%d]:R[N]CCL of \"", i);
strpt((CHR *)left[i]);
printf(" -> {");
}
#endif
/* Prepare symbol[] for cclinter(). */
for (j = 0; j < ncg; ++j)
symbol[j] = FALSE;
s = (CHR *) left[i];
while ((cc = *s++)) {
if (cc == RANGE) {
int low, high, i;
/*
* Special form: <RANGE>XY
* This means the range X-Y.
* We mark all symbols[]
* elements for yycgid(X) thru
* yycgid(Y), inclusively.
*/
low = yycgid(*s++);
high = yycgid(*s++);
for (i = low; i <= high; ++i)
setsymbol(i);
} else {
setsymbol(yycgid(cc));
}
}
/* Now make a transformed string of cgids. */
s = ccptr;
m = 0;
for (j = 0; j < ncg; ++j)
if (symbol[j]) {
ccltoken[m++] = (CHR)j;
#ifdef DEBUG
if (debug) printf("%d, ", j);
#endif
}
#ifdef DEBUG
if (debug) printf("}\n");
#endif
ccltoken[m] = 0;
ccp = ccl;
while (ccp < ccptr && scomp(ccltoken, ccp) != 0)
ccp++;
if (ccp < ccptr) { /* character class found in ccl */
left[i] = (intptr_t)ccp;
} else { /* not in ccl, add it */
left[i] = (intptr_t)ccptr;
scopy(ccltoken, ccptr);
ccptr += slength(ccltoken) + 1;
if (ccptr > ccl + CCLSIZE)
error("Too many large character classes");
}
cclinter(c == RCCL);
} else if (c == DOT) {
if (psave == 0) { /* First DOT node. */
int j, nlid;
/*
* Make symbol[k]=TRUE for all k
* except k == yycgid('\n').
*/
nlid = yycgid('\n');
psave = ccptr;
for (j = 1; j < ncg; ++j) {
if (j == nlid) {
symbol[j] = FALSE;
} else {
symbol[j] = TRUE;
*ccptr++ = (CHR) j;
}
}
*ccptr++ = 0;
if (ccptr > ccl + CCLSIZE)
error("Too many large character classes");
}
/* Mimic mn1(RCCL,psave)... */
name[i] = RCCL;
left[i] = (intptr_t)psave;
cclinter(1);
}
}
#ifdef DEBUG
if (debug) {
printf("treedump after repbycgid().\n");
treedump();
}
#endif
}
static void
setsymbol(int i)
{
if (i > sizeof (symbol))
error("setsymbol: (SYSERR) %d out of range", i);
symbol[i] = TRUE;
}