-
Notifications
You must be signed in to change notification settings - Fork 393
/
Copy pathLexer.java
150 lines (127 loc) · 5.19 KB
/
Lexer.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
package play.parser;
import java.io.CharArrayReader;
import java.io.IOException;
import java.util.*;
/**
* 一个词法分析器。可以根据正则表达式做词法分析。
* 原理:
* 1.把正则表达式转换成NFA,再转成DFA。
* 2.要在NFA的State上标记关联的GrammarNode,以便区分被DFA识别出来的是哪种Token。这在Regex的regexToNFA中已经做了。
*/
public class Lexer extends Regex{
public static void main(String args[]) {
GrammarNode lexerGrammar = SampleGrammar.commonLexerGrammar();
State[] nfaStates = regexToNFA(lexerGrammar);
List<DFAState> dfaStates = NFA2DFA(nfaStates[0], CharSet.ascii);
System.out.println("\ndump NFA:");
nfaStates[0].dump();
System.out.println("\ndump DFA:");
dfaStates.get(0).dump();
String code = "int i = 0; i + 100; if (a == 10) println(a); a <= b;";
List<Token> tokens = tokenize(code, dfaStates.get(0), lexerGrammar);
System.out.println("\nTokens:");
for (Token token : tokens){
System.out.println(token);
}
}
/**
* 把字符串解析成Token列表
* @param str
* @return
*/
public static List<Token> tokenize(String str){
GrammarNode lexerGrammar = SampleGrammar.commonLexerGrammar();
State[] nfaStates = regexToNFA(lexerGrammar);
List<DFAState> dfaStates = NFA2DFA(nfaStates[0],CharSet.ascii);
List<Token> tokens = tokenize(str,dfaStates.get(0),lexerGrammar);
//加上结束符号
tokens.add(Token.EOF);
return tokens;
}
/**
* 逻辑:
* 1.找到能消化接下来的字符的DFA;
* 2.针对这个DFA一直给它发字符,直到不能接受;
* 3.查看是否是处于结束状态。
* @param str
* @param startState
* @return
*/
private static List<Token> tokenize(String str, DFAState startState, GrammarNode root){
List<Token> tokens = new LinkedList<Token>();
tokens = new ArrayList<Token>();
CharArrayReader reader = new CharArrayReader(str.toCharArray());
DFAState currentState = startState;
DFAState nextState = null;
int ich = 0;
char ch = 0;
StringBuffer tokenText = new StringBuffer();
try {
while ((ich = reader.read()) != -1 || tokenText.length() > 0) { //第二个条件,是为了生成最后一个Token
ch = (char) ich;
boolean consumed = false;
while (!consumed) {
nextState = currentState.getNextState(ch);
if (nextState == null) {
if (currentState == startState){
//不认识的字符, //TODO 忽略
consumed = true;
}
else if (currentState.isAcceptable()) {
//查找对应的词法规则
GrammarNode grammar = getGrammar(currentState, root);
assert grammar != null;
//创建Token
if (!grammar.isNeglect()) { //空白字符会被忽略
Token token = new Token(grammar.getName(), tokenText.toString());
tokens.add(token);
}
tokenText = new StringBuffer();
//会到初始状态,重新匹配
currentState = startState;
} else {
//遇到了不认识的字符,没有到达结束态,但也无法迁移 //TODO 暂时忽略
consumed = true;
}
} else {
//做状态迁移
currentState = nextState;
//累积token的文本值
tokenText.append(ch);
consumed = true;
}
}
}
} catch (IOException e) {
e.printStackTrace();
}
return tokens;
}
/**
* 检查DFAState中的各个NFAstate,看是否是某个词法规则的结束节点。
* 注意:如果有符合两个词法规则,那么以宣布顺序的先后算。比如关键字和标识符就会重叠。
* @param state
* @param root
* @return
*/
private static GrammarNode getGrammar(DFAState state, GrammarNode root){
//找出state符合的所有词法
Set<GrammarNode> validGrammars = new HashSet<GrammarNode>();
for (State child: state.states()){
if (child.getGrammarNode() != null){
validGrammars.add(child.getGrammarNode());
}
}
//按顺序遍历词法规则,声明在前的优先级更高
GrammarNode rtn = null;
for (GrammarNode grammar: root.children()){
if (grammar.getName()!= null){
if(validGrammars.contains(grammar)){
rtn = grammar;
break;
}
}
}
return rtn;
}
}