From a296249af5e0175c36cd2d5cf14dd8aaada8e4c3 Mon Sep 17 00:00:00 2001
From: Ken Domino
Date: Mon, 1 Apr 2024 12:03:49 -0400
Subject: [PATCH] [php] Fix for #4037 and #4038 (#4039)

* Port php grammar to Antlr4ng.

* Fix for #4037 and #4038
---
 php/Antlr4ng/PhpLexerBase.ts     | 162 +++++++++++++++++++++++++++++++
 php/Antlr4ng/transformGrammar.py |  31 +++++++
 php/PhpLexer.g4                  |   4 +-
 php/desc.xml                     |   2 +-
 php/examples/numericScale.php    |   2 +-
 php/examples/strings.php         |   2 +-
 6 files changed, 198 insertions(+), 5 deletions(-)
 create mode 100644 php/Antlr4ng/PhpLexerBase.ts
 create mode 100644 php/Antlr4ng/transformGrammar.py

diff --git a/php/Antlr4ng/PhpLexerBase.ts b/php/Antlr4ng/PhpLexerBase.ts
new file mode 100644
index 0000000000..9b5b4b0fb7
--- /dev/null
+++ b/php/Antlr4ng/PhpLexerBase.ts
@@ -0,0 +1,162 @@
+import {CommonToken, Lexer, Token, CharStream} from "antlr4ng";
+import { PhpParser } from "./PhpParser.js";
+import { PhpLexer } from "./PhpLexer.js";
+
+/** Base class for the PHP lexer: keeps tag/heredoc state and post-processes tokens in nextToken(). */
+export default abstract class PhpLexerBase extends Lexer {
+    private AspTags: boolean;
+    protected _scriptTag: boolean;
+    protected _styleTag: boolean;
+    private _heredocIdentifier: string | undefined;
+    private _prevTokenType: number;
+    private _htmlNameText: string | undefined;
+    private _phpScript: boolean;
+    private _insideString: boolean;
+
+    protected static MIN_CHAR_VALUE = 0x0000;
+    protected static MAX_CHAR_VALUE = 0x10FFFF;
+
+    constructor(input: CharStream) {
+        super(input);
+        this.AspTags = true;
+        this._scriptTag = false;
+        this._styleTag = false;
+        this._heredocIdentifier = undefined;
+        this._prevTokenType = 0;
+        this._htmlNameText = undefined;
+        this._phpScript = false;
+        this._insideString = false;
+    }
+
+    /** Returns the next token, adjusted for PHP end tags, the script "language" attribute and heredoc/nowdoc terminators. */
+    nextToken(): Token {
+        let token = super.nextToken()
+
+        if (token.type === PhpParser.PHPEnd || token.type === PhpLexer.PHPEndSingleLineComment) {
+            if (this.mode === PhpLexer.SingleLineCommentMode) {
+                // SingleLineCommentMode for such allowed syntax:
+                // <?php echo "Hello world"; // comment ?>
+                this.popMode();
+            }
+            this.popMode();
+
+            if (token.text === "</script>") {
+                this._phpScript = false;
+                token.type = PhpLexer.HtmlScriptClose;
+            } else {
+                // Add semicolon to the end of statement if it is absent.
+                // For example: <?php echo "Hello world" ?>
+                if (this._prevTokenType === PhpLexer.SemiColon || this._prevTokenType === PhpLexer.Colon || this._prevTokenType === PhpLexer.OpenCurlyBracket || this._prevTokenType === PhpLexer.CloseCurlyBracket) {
+                    token.channel = 4; // The tool does not generate constants for declared channels.
+                } else {
+                    token.type = PhpLexer.SemiColon;
+                }
+            }
+        }
+
+        else if (token.type === PhpLexer.HtmlName) {
+            this._htmlNameText = token.text
+        }
+
+        else if (token.type === PhpLexer.HtmlDoubleQuoteString) {
+            if (token.text === "php" && this._htmlNameText === "language") {
+                this._phpScript = true;
+            }
+        }
+
+        else if (this.mode === PhpLexer.HereDoc) {
+            // Heredoc and Nowdoc syntax support: http://php.net/manual/en/language.types.string.php#language.types.string.syntax.heredoc
+            if (token.type === PhpLexer.StartHereDoc || token.type === PhpLexer.StartNowDoc) {
+                this._heredocIdentifier = token.text.slice(3).trim().replace(/\'/, '').replace(/\'/, '');
+            }
+
+            else if (token.type === PhpLexer.HereDocText) {
+                if (this.CheckHeredocEnd(token.text)) {
+                    this.popMode()
+                    const heredocIdentifier = this.GetHeredocIdentifier(token.text)
+                    if (token.text.trim().endsWith(';')) {
+                        token.text = `${heredocIdentifier};\n`;
+                        token.type = PhpLexer.SemiColon;
+                    } else {
+                        token = super.nextToken()
+                        token.text = `${heredocIdentifier}\n;`;
+                    }
+                }
+            }
+        }
+
+        else if (this.mode === PhpLexer.PHP) {
+            if (!(this.channel === PhpLexer.HIDDEN)) {
+                this._prevTokenType = token.type;
+            }
+        }
+
+        return token;
+    }
+
+    /** Returns `text` trimmed, with a single trailing semicolon removed. */
+    GetHeredocIdentifier(text: string): string {
+        return text.trim().replace(/\;$/, "");
+    }
+
+    /** True when `text`, reduced by GetHeredocIdentifier, equals the identifier recorded at heredoc/nowdoc start. */
+    CheckHeredocEnd(text: string): boolean {
+        return this.GetHeredocIdentifier(text) === this._heredocIdentifier;
+    }
+
+    /** True when the lookahead at `pos` is non-positive (LA(pos) <= 0), a carriage return or a line feed. */
+    IsNewLineOrStart(pos: number): boolean {
+        return this.inputStream.LA(pos) <= 0 || this.inputStream.LA(pos) === '\r'.charCodeAt(0) ||
+            this.inputStream.LA(pos) === '\n'.charCodeAt(0)
+    }
+
+    /** Pops the current mode, then pushes SCRIPT, PHP or STYLE mode if a script/style tag flag is set, clearing that flag. */
+    PushModeOnHtmlClose() {
+        this.popMode();
+        if (this._scriptTag) {
+            if (!this._phpScript) {
+                this.pushMode(PhpLexer.SCRIPT);
+            } else {
+                this.pushMode(PhpLexer.PHP);
+            }
+            this._scriptTag = false;
+        } else if (this._styleTag) {
+            this.pushMode(PhpLexer.STYLE);
+            this._styleTag = false;
+        }
+    }
+
+    /** Returns the AspTags flag (set to true by the constructor). */
+    HasAspTags(): boolean {
+        return this.AspTags;
+    }
+
+    /** Returns the php-script flag maintained by nextToken(). */
+    HasPhpScriptTag(): boolean {
+        return this._phpScript;
+    }
+
+    /** If the inside-string flag is set: clears it, sets the lexer channel to 4 and pops the current mode. */
+    PopModeOnCurlyBracketClose() {
+        if (this._insideString) {
+            this._insideString = false;
+            this.channel = 4; // Tool does not generate a constant for declared channels.
+            this.popMode();
+        }
+    }
+
+    /** True when the lookahead at `pos` is a carriage return or a line feed. */
+    ShouldPushHereDocMode(pos: number): boolean {
+        return this.inputStream.LA(pos) === '\r'.charCodeAt(0) || this.inputStream.LA(pos) === '\n'.charCodeAt(0);
+    }
+
+    /** True when the lookahead at `pos` is a dollar sign. */
+    IsCurlyDollar(pos: number): boolean {
+        return this.inputStream.LA(pos) === '$'.charCodeAt(0);
+    }
+
+    /** Sets the inside-string flag that PopModeOnCurlyBracketClose() consumes. */
+    SetInsideString() {
+        this._insideString = true
+    }
+}
diff --git a/php/Antlr4ng/transformGrammar.py b/php/Antlr4ng/transformGrammar.py
new file mode 100644
index 0000000000..556d8cac27
--- /dev/null
+++ b/php/Antlr4ng/transformGrammar.py
@@ -0,0 +1,31 @@
+"""Transforms the .g4 grammars in the current directory for the Antlr4ng (TypeScript) target."""
+import sys
+import re
+import shutil
+from glob import glob
+from pathlib import Path
+
+def main():
+    """Runs transform_grammar on every ./*.g4 file."""
+    for file in glob("./*.g4"):
+        transform_grammar(file)
+
+def transform_grammar(file_path):
+    """Moves file_path to file_path + '.bak' and rewrites it with the lexer @header marker replaced by the PhpLexerBase import."""
+    print("Altering " + file_path)
+    if not Path(file_path).is_file():
+        print(f"Could not find file: {file_path}")
+        sys.exit(1)
+
+    shutil.move(file_path, file_path + ".bak")
+    with open(file_path + ".bak",'r', encoding="utf-8") as input_file:
+        with open(file_path, 'w', encoding="utf-8") as output_file:
+            for line in input_file:
+                line = re.sub(r"(\/\/ Insert here @header for C\+\+ lexer\.)",
+                        '@header {import PhpLexerBase from "./PhpLexerBase.js"}', line)
+                output_file.write(line)
+
+    print("Writing ...")
+
+if __name__ == '__main__':
+    main()
diff --git a/php/PhpLexer.g4 b/php/PhpLexer.g4
index a1cec7f379..af40b0a600 100644
--- a/php/PhpLexer.g4
+++ b/php/PhpLexer.g4
@@ -48,8 +48,8 @@ HtmlText : ~[<#]+;
 XmlStart : ' pushMode(XML);
 PHPStartEcho : PhpStartEchoFragment -> type(Echo), pushMode(PHP);
 PHPStart : PhpStartFragment -> channel(SkipChannel), pushMode(PHP);
-HtmlScriptOpen : ' pushMode(INSIDE);
-HtmlStyleOpen : ' pushMode(INSIDE);
+HtmlScriptOpen : ' pushMode(INSIDE);
+HtmlStyleOpen : ' pushMode(INSIDE);
 HtmlComment : '' -> channel(HIDDEN);
 HtmlDtd : '';
 HtmlOpen : '<' -> pushMode(INSIDE);
diff --git a/php/desc.xml b/php/desc.xml
index 67714256f8..ad4cdbfa37 100644
--- a/php/desc.xml
+++ b/php/desc.xml
@@ -1,5 +1,5 @@
 ^4.10
- CSharp;Java;Python3
+ CSharp;Java;Python3;Antlr4ng
diff --git a/php/examples/numericScale.php b/php/examples/numericScale.php
index ac2878b69e..9e8d8cbeb9 100644
--- a/php/examples/numericScale.php
+++ b/php/examples/numericScale.php
@@ -1,4 +1,4 @@
-aspell_path} list`;