forked from jamiebuilds/the-super-tiny-compiler
-
Notifications
You must be signed in to change notification settings - Fork 12
/
Copy pathsuper-tiny-compiler.js
1168 lines (1054 loc) · 49.9 KB
/
super-tiny-compiler.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
/**
* TTTTTTTTTTTTTTTTTTTTTTTHHHHHHHHH HHHHHHHHHEEEEEEEEEEEEEEEEEEEEEE
* T:::::::::::::::::::::TH:::::::H H:::::::HE::::::::::::::::::::E
* T:::::::::::::::::::::TH:::::::H H:::::::HE::::::::::::::::::::E
* T:::::TT:::::::TT:::::THH::::::H H::::::HHEE::::::EEEEEEEEE::::E
* TTTTTT T:::::T TTTTTT H:::::H H:::::H E:::::E EEEEEE
* T:::::T H:::::H H:::::H E:::::E
* T:::::T H::::::HHHHH::::::H E::::::EEEEEEEEEE
* T:::::T H:::::::::::::::::H E:::::::::::::::E
* T:::::T H:::::::::::::::::H E:::::::::::::::E
* T:::::T H::::::HHHHH::::::H E::::::EEEEEEEEEE
* T:::::T H:::::H H:::::H E:::::E
* T:::::T H:::::H H:::::H E:::::E EEEEEE
* TT:::::::TT HH::::::H H::::::HHEE::::::EEEEEEEE:::::E
* T:::::::::T H:::::::H H:::::::HE::::::::::::::::::::E
* T:::::::::T H:::::::H H:::::::HE::::::::::::::::::::E
* TTTTTTTTTTT HHHHHHHHH HHHHHHHHHEEEEEEEEEEEEEEEEEEEEEE
*
* SSSSSSSSSSSSSSS UUUUUUUU UUUUUUUUPPPPPPPPPPPPPPPPP EEEEEEEEEEEEEEEEEEEEEERRRRRRRRRRRRRRRRR
* SS:::::::::::::::SU::::::U U::::::UP::::::::::::::::P E::::::::::::::::::::ER::::::::::::::::R
* S:::::SSSSSS::::::SU::::::U U::::::UP::::::PPPPPP:::::P E::::::::::::::::::::ER::::::RRRRRR:::::R
* S:::::S SSSSSSSUU:::::U U:::::UUPP:::::P P:::::PEE::::::EEEEEEEEE::::ERR:::::R R:::::R
* S:::::S U:::::U U:::::U P::::P P:::::P E:::::E EEEEEE R::::R R:::::R
* S:::::S U:::::U U:::::U P::::P P:::::P E:::::E R::::R R:::::R
* S::::SSSS U:::::U U:::::U P::::PPPPPP:::::P E::::::EEEEEEEEEE R::::RRRRRR:::::R
* SS::::::SSSSS U:::::U U:::::U P:::::::::::::PP E:::::::::::::::E R:::::::::::::RR
* SSS::::::::SS U:::::U U:::::U P::::PPPPPPPPP E:::::::::::::::E R::::RRRRRR:::::R
* SSSSSS::::S U:::::U U:::::U P::::P E::::::EEEEEEEEEE R::::R R:::::R
* S:::::S U:::::U U:::::U P::::P E:::::E R::::R R:::::R
* S:::::S U::::::U U::::::U P::::P E:::::E EEEEEE R::::R R:::::R
* SSSSSSS S:::::S U:::::::UUU:::::::U PP::::::PP EE::::::EEEEEEEE:::::ERR:::::R R:::::R
* S::::::SSSSSS:::::S UU:::::::::::::UU P::::::::P E::::::::::::::::::::ER::::::R R:::::R
* S:::::::::::::::SS UU:::::::::UU P::::::::P E::::::::::::::::::::ER::::::R R:::::R
* SSSSSSSSSSSSSSS UUUUUUUUU PPPPPPPPPP EEEEEEEEEEEEEEEEEEEEEERRRRRRRR RRRRRRR
*
* TTTTTTTTTTTTTTTTTTTTTTTIIIIIIIIIINNNNNNNN NNNNNNNNYYYYYYY YYYYYYY
* T:::::::::::::::::::::TI::::::::IN:::::::N N::::::NY:::::Y Y:::::Y
* T:::::::::::::::::::::TI::::::::IN::::::::N N::::::NY:::::Y Y:::::Y
* T:::::TT:::::::TT:::::TII::::::IIN:::::::::N N::::::NY::::::Y Y::::::Y
* TTTTTT T:::::T TTTTTT I::::I N::::::::::N N::::::NYYY:::::Y Y:::::YYY
* T:::::T I::::I N:::::::::::N N::::::N Y:::::Y Y:::::Y
* T:::::T I::::I N:::::::N::::N N::::::N Y:::::Y:::::Y
* T:::::T I::::I N::::::N N::::N N::::::N Y:::::::::Y
* T:::::T I::::I N::::::N N::::N:::::::N Y:::::::Y
* T:::::T I::::I N::::::N N:::::::::::N Y:::::Y
* T:::::T I::::I N::::::N N::::::::::N Y:::::Y
* T:::::T I::::I N::::::N N:::::::::N Y:::::Y
* TT:::::::TT II::::::IIN::::::N N::::::::N Y:::::Y
* T:::::::::T I::::::::IN::::::N N:::::::N YYYY:::::YYYY
* T:::::::::T I::::::::IN::::::N N::::::N Y:::::::::::Y
* TTTTTTTTTTT IIIIIIIIIINNNNNNNN NNNNNNN YYYYYYYYYYYYY
*
* CCCCCCCCCCCCC OOOOOOOOO MMMMMMMM MMMMMMMMPPPPPPPPPPPPPPPPP IIIIIIIIIILLLLLLLLLLL EEEEEEEEEEEEEEEEEEEEEERRRRRRRRRRRRRRRRR
* CCC::::::::::::C OO:::::::::OO M:::::::M M:::::::MP::::::::::::::::P I::::::::IL:::::::::L E::::::::::::::::::::ER::::::::::::::::R
* CC:::::::::::::::C OO:::::::::::::OO M::::::::M M::::::::MP::::::PPPPPP:::::P I::::::::IL:::::::::L E::::::::::::::::::::ER::::::RRRRRR:::::R
* C:::::CCCCCCCC::::CO:::::::OOO:::::::OM:::::::::M M:::::::::MPP:::::P P:::::PII::::::IILL:::::::LL EE::::::EEEEEEEEE::::ERR:::::R R:::::R
* C:::::C CCCCCCO::::::O O::::::OM::::::::::M M::::::::::M P::::P P:::::P I::::I L:::::L E:::::E EEEEEE R::::R R:::::R
* C:::::C O:::::O O:::::OM:::::::::::M M:::::::::::M P::::P P:::::P I::::I L:::::L E:::::E R::::R R:::::R
* C:::::C O:::::O O:::::OM:::::::M::::M M::::M:::::::M P::::PPPPPP:::::P I::::I L:::::L E::::::EEEEEEEEEE R::::RRRRRR:::::R
* C:::::C O:::::O O:::::OM::::::M M::::M M::::M M::::::M P:::::::::::::PP I::::I L:::::L E:::::::::::::::E R:::::::::::::RR
* C:::::C O:::::O O:::::OM::::::M M::::M::::M M::::::M P::::PPPPPPPPP I::::I L:::::L E:::::::::::::::E R::::RRRRRR:::::R
* C:::::C O:::::O O:::::OM::::::M M:::::::M M::::::M P::::P I::::I L:::::L E::::::EEEEEEEEEE R::::R R:::::R
* C:::::C O:::::O O:::::OM::::::M M:::::M M::::::M P::::P I::::I L:::::L E:::::E R::::R R:::::R
* C:::::C CCCCCCO::::::O O::::::OM::::::M MMMMM M::::::M P::::P I::::I L:::::L LLLLLL E:::::E EEEEEE R::::R R:::::R
* C:::::CCCCCCCC::::CO:::::::OOO:::::::OM::::::M M::::::MPP::::::PP II::::::IILL:::::::LLLLLLLLL:::::LEE::::::EEEEEEEE:::::ERR:::::R R:::::R
* CC:::::::::::::::C OO:::::::::::::OO M::::::M M::::::MP::::::::P I::::::::IL::::::::::::::::::::::LE::::::::::::::::::::ER::::::R R:::::R
* CCC::::::::::::C OO:::::::::OO M::::::M M::::::MP::::::::P I::::::::IL::::::::::::::::::::::LE::::::::::::::::::::ER::::::R R:::::R
* CCCCCCCCCCCCC OOOOOOOOO MMMMMMMM MMMMMMMMPPPPPPPPPP IIIIIIIIIILLLLLLLLLLLLLLLLLLLLLLLLEEEEEEEEEEEEEEEEEEEEEERRRRRRRR RRRRRRR
*
* =======================================================================================================================================================================
* =======================================================================================================================================================================
* =======================================================================================================================================================================
* =======================================================================================================================================================================
*/
/**
* Today we're going to write a compiler together. But not just any compiler... A
今天我们要写一个编译器. 不过不是普通的编译器...
* super duper teeny tiny compiler! A compiler that is so small that if you
是个超小的编译器! 这个编译器的代码很短,
* remove all the comments this file would only be ~200 lines of actual code.
如果你把所有注释删掉, 代码才 200 行左右
* We're going to compile some lisp-like function calls into some C-like
我们要把 Lisp 语法的函数 编译成 C 语法的函数
* function calls.
* If you are not familiar with one or the other. I'll just give you a quick intro.
如果你不熟悉这两种语言. 我快速介绍下
* If we had two functions `add` and `subtract` they would be written like this:
如果有两个函数 `加` 和 `减` 那么语法如下
*
* LISP C
*
* 2 + 2 (add 2 2) add(2, 2)
* 4 - 2 (subtract 4 2) subtract(4, 2)
* 2 + (4 - 2) (add 2 (subtract 4 2)) add(2, subtract(4, 2))
*
* Easy peezy right?
很简单对吧?
*
* Well good, because this is exactly what we are going to compile. While this
* is neither a complete LISP or C syntax, it will be enough of the syntax to
* demonstrate many of the major pieces of a modern compiler.
这就是今天我们要编译的东西. 虽然不是完整的 LIST 或 C 语法.
但足以展示现代编译器中的主要模块了
*/
/**
* Most compilers break down into three primary stages: Parsing, Transformation,
* and Code Generation
大部分编译器可以分成以下几个阶段: 分析(Parsing), 变形(Transformation), 和代码生成(Code Generation)
*
* 1. *Parsing* is taking raw code and turning it into a more abstract
* representation of the code.
*分析* 是把要编译的代码 变成更抽象的表示
*
* 2. *Transformation* takes this abstract representation and manipulates to do
* whatever the compiler wants it to.
*变形* 把上一步的抽象表示, 变换成想要的格式
*
* 3. *Code Generation* takes the transformed representation of the code and
* turns it into new code.
*代码生成* 是把变形后的结果 变成新的代码
*/
/**
* Parsing 分析
* -------
*
* Parsing typically gets broken down into two phases: Lexical Analysis and
* Syntactic Analysis.
分析这个阶段一般分成2个大块:
1. 语法分析(Lexical Analysis)
2. 语义分析(Syntactic Analysis)
*
* 1. *Lexical Analysis* takes the raw code and splits it apart into these things
* called tokens by a thing called a tokenizer (or lexer).
1. *语法分析* 把源代码切成一块一块叫 token 的东西, 完成这件事情的程序叫做 tokenizer(或者lexer)
*
* Tokens are an array of tiny little objects that describe an isolated piece
* of the syntax. They could be numbers, labels, punctuation, operators,
* whatever.
Tokens 是一块块不可分割的基本单元. token 可以是数字, 标签, 标点符号, 操作符等等.
*
* 2. *Syntactic Analysis* takes the tokens and reformats them into a
* representation that describes each part of the syntax and their relation
* to one another. This is known as an intermediate representation or
* Abstract Syntax Tree.
2. *语义分析* 把前面折腾出来的一堆 token 变成一种抽象表示, 描述出这些 token 各自的关系.
这种抽象表示叫做 中间表示(intermediate representation)
或者 抽象语法树(Abstract Syntax Tree)
*
* An Abstract Syntax Tree, or AST for short, is a deeply nested object that
* represents code in a way that is both easy to work with and tells us a lot
* of information.
抽象语法树 或者简称 AST, 是一个嵌套N层的对象,
这种表示代码的方式既容易处理, 也告诉了我们很多信息
*
* For the following syntax:
对于下面的代码
* (add 2 (subtract 4 2))
*
* Tokens might look something like this:
变成 token 数组后: 可以看到根据先后顺序把 类型 和 值 都判断出来了.
*
* [
* { type: 'paren', value: '(' },
* { type: 'name', value: 'add' },
* { type: 'number', value: '2' },
* { type: 'paren', value: '(' },
* { type: 'name', value: 'subtract' },
* { type: 'number', value: '4' },
* { type: 'number', value: '2' },
* { type: 'paren', value: ')' },
* { type: 'paren', value: ')' }
* ]
*
* And an Abstract Syntax Tree (AST) might look like this:
token 数组变成 AST 后:
*
* {
* type: 'Program',
* body: [{
* type: 'CallExpression',
* name: 'add',
* params: [{
* type: 'NumberLiteral',
* value: '2'
* }, {
* type: 'CallExpression',
* name: 'subtract',
* params: [{
* type: 'NumberLiteral',
* value: '4'
* }, {
* type: 'NumberLiteral',
* value: '2'
* }]
* }]
* }]
* }
*/
/**
* Transformation 变形
* --------------
*
* The next type of stage for a compiler is transformation. Again, this just
* takes the AST from the last step and makes changes to it. It can manipulate
* the AST in the same language or it can translate it into an entirely new
* language.
生成 AST 后, 下一步是变形
*
* Let’s look at how we would transform an AST.
接下来看看具体怎么变形 AST
*
* You might notice that our AST has elements within it that look very similar.
* There are these objects with a type property. Each of these are known as an
* AST Node. These nodes have defined properties on them that describe one
* isolated part of the tree.
你可能注意到了 AST 看起来蛮眼熟的,
它和 token 一样有 type 属性.
然后的话, AST 里每一个都叫做 Node(节点).
*
* We can have a node for a "NumberLiteral":
我们用类型为 NumberLiteral (数字字面量) 的节点来表示数字
*
* {
* type: 'NumberLiteral',
* value: '2'
* }
*
* Or maybe a node for a "CallExpression":
用类型为 CallExpression 的节点表示调用表达式(比如add和subtract)
*
* {
* type: 'CallExpression',
* name: 'subtract',
* params: [...nested nodes go here...]
* }
*
* When transforming the AST we can manipulate nodes by
* adding/removing/replacing properties, we can add new nodes, remove nodes, or
* we could leave the existing AST alone and create an entirely new one based
* on it.
变形 AST 时我们可以给节点 添加/删除/替换 属性.
或者干脆就添加节点, 删除节点。
或是基于 AST 的信息,弄个全新的 AST,而不是去修改原来的 AST
*
* Since we’re targeting a new language, we’re going to focus on creating an
* entirely new AST that is specific to the target language.
因为我们要编译成其他语言, 所以这里我们就新建一个用于目标语言的新 AST
*
* Traversal 遍历
* ---------
*
* In order to navigate through all of these nodes, we need to be able to
* traverse through them. This traversal process goes to each node in the AST
* depth-first.
我们用深度优先(depth-first)方式遍历 AST 的各个节点来变形
*
* {
* type: 'Program',
* body: [{
* type: 'CallExpression',
* name: 'add',
* params: [{
* type: 'NumberLiteral',
* value: '2'
* }, {
* type: 'CallExpression',
* name: 'subtract',
* params: [{
* type: 'NumberLiteral',
* value: '4'
* }, {
* type: 'NumberLiteral',
* value: '2'
* }]
* }]
* }]
* }
*
* So for the above AST we would go:
对于以上的 AST, 深度优先这种遍历方式的访问顺序是:
*
* 1. Program - Starting at the top level of the AST
* 2. CallExpression (add) - Moving to the first element of the Program's body
* 3. NumberLiteral (2) - Moving to the first element of CallExpression's params
* 4. CallExpression (subtract) - Moving to the second element of CallExpression's params
* 5. NumberLiteral (4) - Moving to the first element of CallExpression's params
* 6. NumberLiteral (2) - Moving to the second element of CallExpression's params
*
上面这段没翻, 想必你也看得懂.
简单说就是 add 2 subtract 4 2 这么按顺序访问
* If we were manipulating this AST directly, instead of creating a separate AST,
* we would likely introduce all sorts of abstractions here. But just visiting
* each node in the tree is enough.
如果我们要直接改这个 AST, 而不是创建一个新 AST,
逻辑可能需要写的更复杂一些, 可能要多写些代码, 所以这里访问各个节点就好.
* The reason I use the word “visiting” is because there is this pattern of how
* to represent operations on elements of an object structure.
*
* Visitors 访问者
* --------
*
* The basic idea here is that we are going to create a “visitor” object that
* has methods that will accept different node types.
我们会创建一个"访问者"(visitor)对象
它有方法去接收不同节点类型
*
* var visitor = {
* NumberLiteral() {},
* CallExpression() {}
* };
*
* When we traverse our AST we will call the methods on this visitor whenever we
* encounter a node of a matching type.
当遍历 AST 时, Vistior 碰到不同节点会调用不同方法
*
* In order to make this useful we will also pass the node and a reference to
* the parent node.
为了方便创建新 AST, 我们把节点和父节点都传递过去
*
* var visitor = {
* NumberLiteral(node, parent) {},
* CallExpression(node, parent) {}
* };
*/
/**
* Code Generation 代码生成
* ---------------
*
* The final phase of a compiler is code generation. Sometimes compilers will do
* things that overlap with transformation, but for the most part code
* generation just means take our AST and string-ify code back out.
最后一个阶段是代码生成. 有时编译器会把"代码生成"这一步和"变形"混合着一起做.
代码生成 就是把 AST 变成目标语言字符串
*
* Code generators work several different ways, some compilers will reuse the
* tokens from earlier, others will have created a separate representation of
* the code so that they can print node linearly, but from what I can tell most
* will use the same AST we just created, which is what we’re going to focus on.
代码生成有几种不同方式实现, 有些编译器会重用前面分析出来的 tokens,
有些会创造代码的另一种表示方式, 然后就可以线性输出节点,
但据我所知大部分编译器会用上一步的 AST.
*
* Effectively our code generator will know how to “print” all of the different
* node types of the AST, and it will recursively call itself to print nested
* nodes until everything is printed into one long string of code.
我们的代码生成器知道怎么正确输出 AST 里的不同节点,并且它会递归调用自己.
直到所有节点都输出完.
*/
/**
* And that's it! That's all the different pieces of a compiler.
现在各个部分都介绍完了.
*
* Now that isn’t to say every compiler looks exactly like I described here.
* Compilers serve many different purposes, and they might need more steps than
* I have detailed.
注意不是每个编译器都和这里说的长得完全一样.
编译器可以用来做很多事情, 有些事需要额外的步骤去实现.
*
* But now you should have a general high-level idea of what most compilers look
* like.
不过现在你大概了解编译器是怎么做的了.
*
* Now that I’ve explained all of this, you’re all good to go write your own
* compilers right?
现在我解释了各个部分, 你就自己写个编译器吧?
*
* Just kidding, that's what I'm here to help with :P
开个玩笑, 下面我会演示怎么写.
*
* So let's begin...
开始吧
*/
/**
* ============================================================================
* (/^▽^)/
* THE TOKENIZER!
* ============================================================================
*/
/**
* We're gonna start off with our first phase of parsing, lexical analysis, with
* the tokenizer.
我们要开始分析(parsing)的第一个阶段, 语法分析, 做这个事情的函数我们叫它 tokenizer
*
* We're just going to take our string of code and break it down into an array
* of tokens.
我们会把如下代码, 拆成一个 token 数组
*
* (add 2 (subtract 4 2)) => [{ type: 'paren', value: '(' }, ...]
*/
// We start by accepting an input string of code, and we're gonna set up two
// things...
// tokenizer 接收字符串形式的源代码
function tokenizer(input) {
// A `current` variable for tracking our position in the code like a cursor.
// `current` 变量用于存当前处理到哪里了, 就像手指指着纸上我们处理到哪个位置了
var current = 0;
// And a `tokens` array for pushing our tokens to.
// `tokens` 数组用于存所有的 token,待会我们会把 token 一个个 push 进去
var tokens = [];
// We start by creating a `while` loop where we are setting up our `current`
// variable to be incremented as much as we want `inside` the loop.
//
// We do this because we may want to increment `current` many times within a
// single loop because our tokens can be any length.
// 我们用 while 来遍历源代码里的每个字符
while (current < input.length) {
// We're also going to store the `current` character in the `input`.
// 把当前的单个字符拿到
var char = input[current];
// The first thing we want to check for is an open parenthesis. This will
// later be used for `CallExpressions` but for now we only care about the
// character.
// 我们首先想处理的是左括号, 这个后面会用于 调用表达式(CallExpressions)
//
// We check to see if we have an open parenthesis:
// 看看是不是左括号
if (char === '(') {
// If we do, we push a new token with the type `paren` and set the value
// to an open parenthesis.
// 如果是左括号, 就用个对象标上, type(类型): 'paren(括号)' (值)value: '('
// 然后 push 扔数组里
tokens.push({
type: 'paren',
value: '('
});
// Then we increment `current`
// 手指往后挪一位
current++;
// And we `continue` onto the next cycle of the loop.
// 直接下一次循环
continue;
}
// Next we're going to check for a closing parenthesis. We do the same exact
// thing as before: Check for a closing parenthesis, add a new token,
// increment `current`, and `continue`.
// 和上面的左括号处理流程完全一样. 只不过这次是右括号
if (char === ')') {
tokens.push({
type: 'paren',
value: ')'
});
current++;
continue;
}
// Moving on, we're now going to check for whitespace. This is interesting
// because we care that whitespace exists to separate characters, but it
// isn't actually important for us to store as a token. We would only throw
// it out later.
//
// So here we're just going to test for existence and if it does exist we're
// going to just `continue` on.
// 继续, 我们现在来处理空格, 空格用于隔开字符, 但不用像括号一样当做 token 存起来.
// 这里跳过不处理
// 这里 /\s/ 这种写法叫正则表达式, 如果你第一次看到这种写法
// 那么我解释下: 正则表达式是用来匹配字符串模式, 用途是判断一个字符串是不是电子邮件, 是不是电话号码,
// 或者是否达到指定位数(比如6位数字). 诸如此类的模式匹配.
// 有时候我们需要用正则表达式判断 yes 或 no. 比如是不是电子邮件
// 有时候我们需要用正则表达式返回字符串里面的字符, 比如从 "av1355589" 里提取出数字 13555589
var WHITESPACE = /\s/;
if (WHITESPACE.test(char)) {
current++;
continue;
}
// The next type of token is a number. This is different than what we have
// seen before because a number could be any number of characters and we
// want to capture the entire sequence of characters as one token.
//
// (add 123 456)
// ^^^ ^^^
// Only two separate tokens
//
// So we start this off when we encounter the first number in a sequence.
// 下一种要处理的 token 是数字, 数字可以是任意长度, 比如 123(3位) 88990(5位)
// 所以一旦碰到数字, 就看看下一个是不是数字, 如果是就把手指也移过去, 直到碰到不是数字的才停
// 这样就拿到了一整串数字
var NUMBERS = /[0-9]/;
if (NUMBERS.test(char)) {
// We're going to create a `value` string that we are going to push
// characters to.
// 用于存一整串数字, 如 123 或者 88990
var value = '';
// Then we're going to loop through each character in the sequence until
// we encounter a character that is not a number, pushing each character
// that is a number to our `value` and incrementing `current` as we go.
// 一直循环, 直到碰到不是数字
while (NUMBERS.test(char)) {
value += char;
char = input[++current];
}
// After that we push our `number` token to the `tokens` array.
// 整个数字拿到手了就把这个 token 放起来
tokens.push({
type: 'number',
value: value
});
// And we continue on.
// 下一次循环
continue;
}
// The last type of token will be a `name` token. This is a sequence of
// letters instead of numbers, that are the names of functions in our lisp
// syntax.
//
// (add 2 4)
// ^^^
// Name token
//
// 最后这种 token 类型是 name(名字)
// 逻辑和数字一样, 也是正则匹配
var LETTERS = /[a-z]/i;
if (LETTERS.test(char)) {
var value = '';
// Again we're just going to loop through all the letters pushing them to
// a value.
while (LETTERS.test(char)) {
value += char;
char = input[++current];
}
// And pushing that value as a token with the type `name` and continuing.
tokens.push({
type: 'name',
value: value
});
continue;
}
// Finally if we have not matched a character by now, we're going to throw
// an error and completely exit.
// 如果一个字符在上面这几种类型里全匹配不到, 就会运行到这里(因为上面的 continue 一个都没碰到)
// 那么我们就抛出错误
throw new TypeError('I dont know what this character is: ' + char);
}
// Then at the end of our `tokenizer` we simply return the tokens array.
// 最后把 token 数组返回
return tokens;
}
/**
* ============================================================================
* ヽ/❀o ل͜ o\ノ
* THE PARSER!!!
* ============================================================================
*/
/**
* For our parser we're going to take our array of tokens and turn it into an
* AST.
*
* [{ type: 'paren', value: '(' }, ...] => { type: 'Program', body: [...] }
*
* 这一步, 我们要把 token 数组转换成 AST (Abstract Syntax Tree) 抽象语法树
*/
// Okay, so we define a `parser` function that accepts our array of `tokens`.
// 接收 token 数组参数
function parser(tokens) {
// Again we keep a `current` variable that we will use as a cursor.
// 我们这里又用 current 变量, 就像手指一样指着处理到哪里了
var current = 0;
// But this time we're going to use recursion instead of a `while` loop. So we
// define a `walk` function.
// 这一次不用 while 循环了. 用递归. 所以这里定义个 walk 函数, 待会它会调用自己
function walk() {
// Inside the walk function we start by grabbing the `current` token.
// 拿到要处理的 token
var token = tokens[current];
// We're going to split each type of token off into a different code path,
// starting off with `number` tokens.
// 根据 token 的不同类型来处理 token
//
// We test to see if we have a `number` token.
// 如果是数字
if (token.type === 'number') {
// If we have one, we'll increment `current`.
// 先指向下一个要处理的
current++;
// And we'll return a new AST node called `NumberLiteral` and setting its
// value to the value of our token.
// 把这个 token 变成 node (对于数字, 这个 node 和原先的 token 其实没啥区别, 就是 type 不一样了)
return {
type: 'NumberLiteral',
value: token.value
};
}
// Next we're going to look for CallExpressions. We start this off when we
// encounter an open parenthesis.
// 接下来找 调用表达式(CallExpressions). 我们会先碰到左括号
// 如果是括号 而且是 左括号
if (
token.type === 'paren' &&
token.value === '('
) {
// We'll increment `current` to skip the parenthesis since we don't care
// about it in our AST.
// current + 1 因为在 AST 里我们不关心括号
token = tokens[++current];
// We create a base node with the type `CallExpression`, and we're going
// to set the name as the current token's value since the next token after
// the open parenthesis is the name of the function.
// 我们创建一个类型是 CallExpression 的节点
// 值是函数名, 也就是紧跟左括号后面的 add 或 subtract 这样的名字
var node = {
type: 'CallExpression',
name: token.value, // add or subtract, you get the idea
params: []
};
// We increment `current` *again* to skip the name token.
// 函数名处理了, 这里就 +1 跳过函数名.
token = tokens[++current];
// And now we want to loop through each token that will be the `params` of
// our `CallExpression` until we encounter a closing parenthesis.
// 现在我们想遍历 CallExpression 的参数, 直到碰到右括号
//
// Now this is where recursion comes in. Instead of trying to parse a
// potentially infinitely nested set of nodes we're going to rely on
// recursion to resolve things.
// 现在就是递归登场的地方了. 因为不管有多少层嵌套都可以用递归解决.
//
// To explain this, let's take our Lisp code. You can see that the
// parameters of the `add` are a number and a nested `CallExpression` that
// includes its own numbers.
// 举个例子, 如下是嵌套了一层, 先算里面那层, 再算外面, 但 subtract 里面还可以继续一直嵌套别的.
// 所以递归是个好方案
//
// (add 2 (subtract 4 2))
//
// You'll also notice that in our tokens array we have multiple closing
// parenthesis.
// 你应该注意到了 token 数组里有多个右括号
//
// [
// { type: 'paren', value: '(' },
// { type: 'name', value: 'add' },
// { type: 'number', value: '2' },
// { type: 'paren', value: '(' },
// { type: 'name', value: 'subtract' },
// { type: 'number', value: '4' },
// { type: 'number', value: '2' },
// { type: 'paren', value: ')' }, <<< Closing parenthesis (右括号)
// { type: 'paren', value: ')' } <<< Closing parenthesis (右括号)
// ]
//
// We're going to rely on the nested `walk` function to increment our
// `current` variable past any nested `CallExpressions`.
// 我们需要 walk 函数来移动我们的手指 (current变量) 来处理嵌套的 CallExpressions
// So we create a `while` loop that will continue until it encounters a
// token with a `type` of `'paren'` and a `value` of a closing
// parenthesis.
// 这里用 while 循环, 直到碰到右括号才退出.
// 如果不是括号 token.type !== 'paren'
// 或者是括号而且不是右括号 token.type === 'paren' && token.value !== ')'
while (
(token.type !== 'paren') ||
(token.type === 'paren' && token.value !== ')')
) {
// we'll call the `walk` function which will return a `node` and we'll
// push it into our `node.params`.
// 可以看到这里递归调用了 walk()
// 当前节点的 params 数组属性 node.params 推入(push) 递归的结果( 也是个 node )
node.params.push(walk());
token = tokens[current];
// 注意 current 是在 walk 函数外面的 不会每次都变0
// 在 walk() 里面 current+1 之后
// 这里就可以继续处理下一个 token
}
// Finally we will increment `current` one last time to skip the closing
// parenthesis.
// 跳过右括号 (跳出上面 while 循环的唯一可能就是碰到右括号)
current++;
// And return the node.
// 并且返回节点(node)
return node;
}
// Again, if we haven't recognized the token type by now we're going to
// throw an error.
// 同样, 认不出 token 类型就报错
throw new TypeError(token.type);
}// walk 函数结束
// Now, we're going to create our AST which will have a root which is a
// `Program` node.
// 现在根据 token 数组创建 AST, AST 根节点类型是 "Program" 代表我们的整个程序
var ast = {
type: 'Program',
body: []
};
// And we're going to kickstart our `walk` function, pushing nodes to our
// `ast.body` array.
//
// 现在我们要调用前面定义的 walk 函数, 把节点 push 到
// ast.body 这个数组里
//
// The reason we are doing this inside a loop is because our program can have
// `CallExpressions` after one another instead of being nested.
//
// 这里用 while 循环, 是因为我们的程序可以多个 CallExpressions 并排, 不一定要嵌套
//
// (add 2 2)(subtract 4 2)
while (current < tokens.length) {
ast.body.push(walk());
}
// At the end of our parser we'll return the AST.
return ast;
// 最后返回 AST
}
/**
* ============================================================================
* ⌒(❀>◞౪◟<❀)⌒
* THE TRAVERSER!!!
* ============================================================================
*/
/**
译者注:
注意这里有点不一样了,
前面 tokenizer 和 parser 都是单个函数就搞定
这里是 traverser + transformer 搭配用, 2 个函数
traverser 里又有2个函数, 分别是 traverseArray 和 traverseNode
def traverser()
def traverseArray() # 定义函数
def traverseNode() # 定义函数
traverseNode(a,b) # 调函数
def transformer()
traverser(ast, visitor) # 调函数
总体结构如上, 因为这个部分我看了半天没懂(主要是_context那里没懂, 后来弄懂了)
这里整理下清晰些
* So now we have our AST, and we want to be able to visit different nodes with
* a visitor. We need to be able to call the methods on the visitor whenever we
* encounter a node with a matching type.
*
* 现在我们有 AST 了, 我们需要用 visitor 访问 AST 里的各个节点
* 然后根据节点的不同类型, 调用 visitor 的不同方法(从而创建新 AST)
*
* traverse(ast, {
* Program(node, parent) {
* // ...
* },
*
* CallExpression(node, parent) {
* // ...
* },
*
* NumberLiteral(node, parent) {
* // ...
* }
* });
*/
// So we define a traverser function which accepts an AST and a
// visitor. Inside we're going to define two functions...
// 我们定义个 traverser, 参数是 ast 和 visitor
function traverser(ast, visitor) {
// A `traverseArray` function that will allow us to iterate over an array and
// call the next function that we will define: `traverseNode`.
// `traverseArray` 遍历数组, 并且对数组每个元素调用 traverseNode 这个函数
// 访问顺序是 add, 2, subtract, 4, 2
function traverseArray(array, parent) {
array.forEach(function(child) {
traverseNode(child, parent);
});
}
// `traverseNode` will accept a `node` and its `parent` node. So that it can
// pass both to our visitor methods.
// `traverseNode` 接收一个节点 以及它的上级节点(parent node)
// 这样它就可以把这俩参数传给 visitor 方法
function traverseNode(node, parent) {
// We start by testing for the existence of a method on the visitor with a
// matching `type`.
// 这部分根据节点类型, 执行 visitor 里对应方法
var method = visitor[node.type];
if (method) {
method(node, parent);
}
// Next we are going to split things up by the current node type.
// 这部分取属性继续遍历
switch (node.type) {
// We'll start with our top level `Program`. Since Program nodes have a
// property named body that has an array of nodes, we will call
// `traverseArray` to traverse down into them.
//
// (Remember that `traverseArray` will in turn call `traverseNode` so we
// are causing the tree to be traversed recursively)
//
// 我们从顶层节点, `Program` 开始.
// 这个节点的 body 属性里是节点数组
// 我们用 traverseArray 来遍历这个节点数组
//
// (记住 traverseArray 又会调用 traverseNode, 使得我们可以递归访问一整棵树)
case 'Program':
traverseArray(node.body, node);
break;
// Next we do the same with `CallExpressions` and traverse their `params`.
// 如果节点是 CallExpressions 就 traverseArray 这个节点的 params
case 'CallExpression':
traverseArray(node.params, node);
break;
// In the case of `NumberLiterals` we don't have any child nodes to visit,
// so we'll just break.
// 如果是数字, 那就没有子节点要访问, 所以直接 break;
case 'NumberLiteral':
break;
// And again, if we haven't recognized the node type then we'll throw an
// error.
// 如果认不出节点类型就报错
default:
throw new TypeError(node.type);
}
}
// Finally we kickstart the traverser by calling `traverseNode` with our ast
// with no `parent` because the top level of the AST doesn't have a parent.
// 我们在这个 traverser 里面定义了 traverseArray 函数 和 traverseNode 函数
// 现在要调用它了, 第二个参数是 null 是因为 ast 本身没有上级节点了
traverseNode(ast, null);
}
/**
* ============================================================================
* ⁽(◍˃̵͈̑ᴗ˂̵͈̑)⁽
* THE TRANSFORMER!!!
* ============================================================================
*/
/**
* Next up, the transformer. Our transformer is going to take the AST that we
* have built and pass it to our traverser function with a visitor and will
* create a new ast.
接下来, transformer.
transformer 的用途就是变换 AST 结构
我们会在 transformer 里定义个 visitor, visitor 碰到不同节点会调不同的方法
transformer 把 AST 和 visitor 传给 traverser
通过遍历所有节点(traverser负责遍历和看节点类型),并且根据不同节点调用不同方法(visitor负责提供方法)
最后我们会得到新的 AST
*
* ----------------------------------------------------------------------------
* Original AST | Transformed AST
* ----------------------------------------------------------------------------
* { | {
* type: 'Program', | type: 'Program',
* body: [{ | body: [{
* type: 'CallExpression', | type: 'ExpressionStatement',
* name: 'add', | expression: {
* params: [{ | type: 'CallExpression',
* type: 'NumberLiteral', | callee: {
* value: '2' | type: 'Identifier',
* }, { | name: 'add'
* type: 'CallExpression', | },
* name: 'subtract', | arguments: [{
* params: [{ | type: 'NumberLiteral',
* type: 'NumberLiteral', | value: '2'
* value: '4' | }, {
* }, { | type: 'CallExpression',
* type: 'NumberLiteral', | callee: {
* value: '2' | type: 'Identifier',
* }] | name: 'subtract'
* }] | },
* }] | arguments: [{
* } | type: 'NumberLiteral',
* | value: '4'
* ---------------------------------- | }, {
* | type: 'NumberLiteral',
* | value: '2'
* | }]
* (sorry the other one is longer.) | }
* | }
* | }]
* | }
* ----------------------------------------------------------------------------
*/
// So we have our transformer function which will accept the lisp ast.
// 我们的 transformer 会接收 lisp 的 AST
function transformer(ast) {
// We'll create a `newAst` which like our previous AST will have a program
// node.
// 新的 AST 和之前的 老 AST 一样也有个根节点叫 Program
var newAst = {
type: 'Program',
body: []
};
// Next I'm going to cheat a little and create a bit of a hack. We're going to
// use a property named `context` on our parent nodes that we're going to push
// nodes to their parent's `context`. Normally you would have a better
// abstraction than this, but for our purposes this keeps things simple.
//
//
// Just take note that the context is a reference *from* the old ast *to* the
// new ast.
// 这行代码使得 旧 AST 的 _context 指向了 新 AST 的 body
// 就是说你往 旧 AST 的 _context 里面 push
// 和 push 到 新 AST 的 body 是一回事
// 这么写的话就方便些
ast._context = newAst.body;
// We'll start by calling the traverser function with our ast and a visitor.
// 调 traverser, 第一个参数 ast 没啥说的, 第二个参数是 visitor
traverser(ast, {
// The first visitor method accepts `NumberLiterals`
// 处理数字
NumberLiteral: function(node, parent) {
// We'll create a new node also named `NumberLiteral` that we will push to
// the parent context.
// 首先对于数字节点, parent 总是 CallExpression 这点先明确了.
// 然后
// 这里创建了个新节点, 然后 parent._haha.push 就会进上级 CallExpression 的 arguments 里了
// 之所以管用请看下面 CallExpression 写了 node._haha = expression.arguments;
parent._haha.push({
type: 'NumberLiteral',
value: node.value
});
},
// Next up, `CallExpressions`.
// 处理调用表达式 (CallExpressions)
CallExpression: function(node, parent) {
// We start creating a new node `CallExpression` with a nested
// `Identifier`.
// 创建个新节点, 把 name 先包一下.
// 之前的 CallExpressions 就是 type,name,params 三个属性
// 现在先把 name 包一层同时给个类型(Identifier), 然后 arguments 其实就是 params (含义一样,我不是说指向)
var expression = {