diff --git a/packages/langium/src/parser/indentation-aware.ts b/packages/langium/src/parser/indentation-aware.ts
index 6db1ca2fb..f337f8717 100644
--- a/packages/langium/src/parser/indentation-aware.ts
+++ b/packages/langium/src/parser/indentation-aware.ts
@@ -6,7 +6,7 @@
 import type { CustomPatternMatcherFunc, TokenType, IToken, IMultiModeLexerDefinition } from 'chevrotain';
 import type { Grammar, TerminalRule } from '../languages/generated/ast.js';
-import type { TokenBuilderOptions } from './token-builder.js';
+import type { ILexingReport, TokenBuilderOptions } from './token-builder.js';
 import type { LexerResult } from './lexer.js';
 import type { LangiumCoreServices } from '../services.js';
 import { createToken, createTokenInstance, Lexer } from 'chevrotain';
@@ -69,22 +69,28 @@ export enum LexingMode {
     IGNORE_INDENTATION = 'ignore-indentation',
 }
 
+export interface IndentationLexingReport extends ILexingReport {
+    /** Dedent tokens that are necessary to close the remaining indents. */
+    remainingDedents: IToken[];
+}
+
 /**
  * A token builder that is sensitive to indentation in the input text.
  * It will generate tokens for indentation and dedentation based on the indentation level.
  *
  * The first generic parameter corresponds to the names of terminal tokens,
- * while the second one corresonds to the names of keyword tokens.
+ * while the second one corresponds to the names of keyword tokens.
  * Both parameters are optional and can be imported from `./generated/ast.js`.
  *
  * Inspired by https://github.com/chevrotain/chevrotain/blob/master/examples/lexer/python_indentation/python_indentation.js
  */
 export class IndentationAwareTokenBuilder<Terminals extends string = string, KeywordTokens extends string = string> extends DefaultTokenBuilder {
     /**
-     * The stack in which all the previous matched indentation levels are stored
-     * to understand how deep a the next tokens are nested.
+     * The stack stores all the previously matched indentation levels to understand how deeply the next tokens are nested.
+     * The stack is only valid during the lexing process.
      */
     protected indentationStack: number[] = [0];
+
     readonly options: IndentationTokenBuilderOptions<Terminals, KeywordTokens>;
 
     /**
@@ -123,7 +129,7 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, KeywordTokens extends string = string> extends DefaultTokenBuilder {
-    protected matchWhitespace(text: string, offset: number): { currIndentLevel: number, prevIndentLevel: number, match: RegExpExecArray | null } {
+    protected matchWhitespace(text: string, offset: number, _tokens: IToken[], _groups: Record<string, IToken[]>): { currIndentLevel: number, prevIndentLevel: number, match: RegExpExecArray | null } {
         this.whitespaceRegExp.lastIndex = offset;
         const match = this.whitespaceRegExp.exec(text);
         return {
@@ -210,8 +224,8 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, KeywordTokens extends string = string> extends DefaultTokenBuilder {
-    protected indentMatcher(text: string, offset: number): ReturnType<CustomPatternMatcherFunc> {
+    protected indentMatcher(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc> {
         const { indentTokenName } = this.options;
 
         if (!this.isStartOfLine(text, offset)) {
             return null;
         }
 
-        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset);
+        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset, tokens, groups);
 
         if (currIndentLevel <= prevIndentLevel) {
             // shallower indentation (should be matched by dedent)
@@ -266,14 +291,14 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, KeywordTokens extends string = string> extends DefaultTokenBuilder {
-    protected dedentMatcher(text: string, offset: number): ReturnType<CustomPatternMatcherFunc> {
+    protected dedentMatcher(text: string, offset: number, tokens: IToken[], groups: Record<string, IToken[]>): ReturnType<CustomPatternMatcherFunc> {
         const { dedentTokenName } = this.options;
 
         if (!this.isStartOfLine(text, offset)) {
             return null;
         }
 
-        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset);
+        const { currIndentLevel, prevIndentLevel, match } = this.matchWhitespace(text, offset, tokens, groups);
 
         if (currIndentLevel >= prevIndentLevel) {
             // bigger indentation (should be matched by indent)
@@ -285,9 +310,14 @@ export class IndentationAwareTokenBuilder<Terminals extends string = string, KeywordTokens extends string = string> extends DefaultTokenBuilder {
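// Sketch, not part of the patch: one way the new report could be consumed by an
// adopter. With this change a non-matching dedent no longer aborts lexing; it is
// recorded as an 'Invalid dedent level' diagnostic (see the new test below), and
// the indentation-aware lexer can additionally expose the dedents still open at
// the end of the input via `remainingDedents`. The wiring mirrors the setup in
// indentation-aware.test.ts; it assumes IndentationLexingReport is re-exported
// from the 'langium' entry point like the rest of the indentation-aware API.
import { IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
import type { IndentationLexingReport, LangiumCoreServices } from 'langium';
import { createServicesForGrammar } from 'langium/grammar';

async function lexWithIndentationReport(grammar: string, text: string) {
    const services = await createServicesForGrammar({
        grammar,
        module: {
            parser: {
                TokenBuilder: () => new IndentationAwareTokenBuilder(),
                Lexer: (srv: LangiumCoreServices) => new IndentationAwareLexer(srv)
            }
        }
    });
    const { tokens, errors, report } = services.parser.Lexer.tokenize(text);
    const indentationReport = report as IndentationLexingReport | undefined;
    return {
        tokens,
        errors,
        diagnostics: indentationReport?.diagnostics ?? [],
        remainingDedents: indentationReport?.remainingDedents ?? []
    };
}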
diff --git a/packages/langium/src/parser/langium-parser.ts b/packages/langium/src/parser/langium-parser.ts
--- a/packages/langium/src/parser/langium-parser.ts
+++ b/packages/langium/src/parser/langium-parser.ts
@@ ... @@
 export type ParseResult<T = AstNode> = {
     value: T,
     parserErrors: IRecognitionException[],
-    lexerErrors: ILexingError[]
+    lexerErrors: ILexingError[],
+    lexerReport?: ILexingReport
 }
 
 export const DatatypeSymbol = Symbol('Datatype');
@@ -240,6 +242,7 @@ export class LangiumParser extends AbstractLangiumParser {
         return {
             value: result,
             lexerErrors: lexerResult.errors,
+            lexerReport: lexerResult.report,
             parserErrors: this.wrapper.errors
         };
     }
diff --git a/packages/langium/src/parser/lexer.ts b/packages/langium/src/parser/lexer.ts
index eec23ea12..849f809e3 100644
--- a/packages/langium/src/parser/lexer.ts
+++ b/packages/langium/src/parser/lexer.ts
@@ -7,6 +7,11 @@
 import type { ILexingError, IMultiModeLexerDefinition, IToken, TokenType, TokenTypeDictionary, TokenVocabulary } from 'chevrotain';
 import type { LangiumCoreServices } from '../services.js';
 import { Lexer as ChevrotainLexer } from 'chevrotain';
+import type { ILexingReport, TokenBuilder } from './token-builder.js';
+
+export interface ILexingDiagnostic extends ILexingError {
+    severity?: 'error' | 'warning' | 'info' | 'hint';
+}
 
 export interface LexerResult {
     /**
@@ -21,6 +26,7 @@ export interface LexerResult {
      */
     hidden: IToken[];
     errors: ILexingError[];
+    report?: ILexingReport;
 }
 
 export interface Lexer {
@@ -31,10 +37,12 @@
 export class DefaultLexer implements Lexer {
     protected chevrotainLexer: ChevrotainLexer;
+    protected tokenBuilder: TokenBuilder;
     protected tokenTypes: TokenTypeDictionary;
 
     constructor(services: LangiumCoreServices) {
-        const tokens = services.parser.TokenBuilder.buildTokens(services.Grammar, {
+        this.tokenBuilder = services.parser.TokenBuilder;
+        const tokens = this.tokenBuilder.buildTokens(services.Grammar, {
             caseInsensitive: services.LanguageMetaData.caseInsensitive
         });
         this.tokenTypes = this.toTokenTypeDictionary(tokens);
@@ -53,7 +61,8 @@ export class DefaultLexer implements Lexer {
         return {
             tokens: chevrotainResult.tokens,
             errors: chevrotainResult.errors,
-            hidden: chevrotainResult.groups.hidden ?? []
+            hidden: chevrotainResult.groups.hidden ?? [],
+            report: this.tokenBuilder.popLexingReport?.(text)
         };
     }
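// Sketch, not part of the patch: the shape of a severity-carrying diagnostic under
// the new ILexingDiagnostic interface (assuming it is re-exported from the 'langium'
// entry point). The position fields come from Chevrotain's ILexingError; the
// concrete values here are made up for illustration.
import type { ILexingDiagnostic } from 'langium';

const tabWarning: ILexingDiagnostic = {
    message: 'Tab character used where spaces are expected',
    offset: 12,          // 0-based offset into the input text
    length: 1,
    line: 2,             // 1-based, as in Chevrotain lexing errors
    column: 1,
    severity: 'warning'  // new optional field; consumers fall back to 'error'
};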
diff --git a/packages/langium/src/parser/token-builder.ts b/packages/langium/src/parser/token-builder.ts
index eb25155de..e543f04de 100644
--- a/packages/langium/src/parser/token-builder.ts
+++ b/packages/langium/src/parser/token-builder.ts
@@ -13,6 +13,7 @@
 import { streamAllContents } from '../utils/ast-utils.js';
 import { getAllReachableRules, terminalRegex } from '../utils/grammar-utils.js';
 import { getCaseInsensitivePattern, isWhitespace, partialMatches } from '../utils/regexp-utils.js';
 import { stream } from '../utils/stream.js';
+import type { ILexingDiagnostic } from './lexer.js';
 
 export interface TokenBuilderOptions {
     caseInsensitive?: boolean
@@ -20,9 +21,27 @@
 export interface TokenBuilder {
     buildTokens(grammar: Grammar, options?: TokenBuilderOptions): TokenVocabulary;
+    /**
+     * Produces a lexing report for the given text that was just tokenized using the tokens provided by this builder.
+     *
+     * @param text The text that was tokenized.
+     */
+    popLexingReport?(text: string): ILexingReport;
+}
+
+/**
+ * A custom lexing report that can be produced by the token builder during the lexing process.
+ * Adopters need to ensure that any custom fields are serializable so they can be sent across worker threads.
+ */
+export interface ILexingReport {
+    diagnostics: ILexingDiagnostic[];
 }
 
 export class DefaultTokenBuilder implements TokenBuilder {
+    /**
+     * The list of diagnostics stored during the lexing process of a single text.
+     */
+    protected diagnostics: ILexingDiagnostic[] = [];
+
     buildTokens(grammar: Grammar, options?: TokenBuilderOptions): TokenVocabulary {
         const reachableRules = stream(getAllReachableRules(grammar, false));
@@ -42,6 +61,16 @@ export class DefaultTokenBuilder implements TokenBuilder {
         return tokens;
     }
 
+    popLexingReport(_text: string): ILexingReport {
+        return { diagnostics: this.popDiagnostics() };
+    }
+
+    protected popDiagnostics(): ILexingDiagnostic[] {
+        const diagnostics = [...this.diagnostics];
+        this.diagnostics = [];
+        return diagnostics;
+    }
+
     protected buildTerminalTokens(rules: Stream<TerminalRule>): TokenType[] {
         return rules.filter(isTerminalRule).filter(e => !e.fragment)
             .map(terminal => this.buildTerminalToken(terminal)).toArray();
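// Sketch, not part of the patch: how an adopter can feed the new report mechanism.
// DefaultTokenBuilder.popLexingReport() drains the protected `diagnostics` list once
// per lexing run, so a subclass only has to push entries. The trailing-whitespace
// check and the class name are hypothetical; ILexingReport is assumed to be
// re-exported from the 'langium' entry point.
import { DefaultTokenBuilder } from 'langium';
import type { ILexingReport } from 'langium';

class TrailingWhitespaceTokenBuilder extends DefaultTokenBuilder {
    override popLexingReport(text: string): ILexingReport {
        for (const match of text.matchAll(/[ \t]+$/gm)) {
            const offset = match.index!;
            const lineStart = text.lastIndexOf('\n', offset - 1) + 1;
            this.diagnostics.push({
                severity: 'hint',
                message: 'Trailing whitespace',
                offset,
                length: match[0].length,
                line: text.slice(0, offset).split('\n').length, // 1-based line
                column: offset - lineStart + 1                  // 1-based column
            });
        }
        // The base implementation wraps and clears the collected diagnostics.
        return super.popLexingReport(text);
    }
}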
diff --git a/packages/langium/src/serializer/hydrator.ts b/packages/langium/src/serializer/hydrator.ts
index 1c0fdaded..7a955e167 100644
--- a/packages/langium/src/serializer/hydrator.ts
+++ b/packages/langium/src/serializer/hydrator.ts
@@ -18,6 +18,7 @@
 import { isRootCstNode, isCompositeCstNode, isLeafCstNode, isAstNode, isReference } from '../syntax-tree.js';
 import { streamAst } from '../utils/ast-utils.js';
 import { BiMap } from '../utils/collections.js';
 import { streamCst } from '../utils/cst-utils.js';
+import type { ILexingReport } from '../parser/token-builder.js';
 
 /**
  * The hydrator service is responsible for allowing AST parse results to be sent across worker threads.
@@ -64,11 +65,19 @@ export class DefaultHydrator implements Hydrator {
             // We need to create shallow copies of the errors
             // The original errors inherit from the `Error` class, which is not transferable across worker threads
             lexerErrors: result.lexerErrors.map(e => ({ ...e, message: e.message })),
+            lexerReport: result.lexerReport ? this.dehydrateLexerReport(result.lexerReport) : undefined,
             parserErrors: result.parserErrors.map(e => ({ ...e, message: e.message })),
             value: this.dehydrateAstNode(result.value, this.createDehyrationContext(result.value))
         };
     }
 
+    protected dehydrateLexerReport(lexerReport: ILexingReport): ILexingReport {
+        return {
+            ...lexerReport,
+            diagnostics: lexerReport.diagnostics.map(d => ({ ...d, message: d.message }))
+        };
+    }
+
     protected createDehyrationContext(node: AstNode): DehydrateContext {
         const astNodes = new Map<AstNode, AstNode>();
         const cstNodes = new Map<CstNode, CstNode>();
@@ -162,6 +171,7 @@ export class DefaultHydrator implements Hydrator {
         }
         return {
             lexerErrors: result.lexerErrors,
+            lexerReport: result.lexerReport,
             parserErrors: result.parserErrors,
             value: this.hydrateAstNode(node, context) as T
         };
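// Illustration, not part of the patch: why dehydrateLexerReport re-attaches `message`
// by hand. The diagnostics may wrap errors that inherit from Error, whose `message`
// is an own but non-enumerable property, so a bare object spread silently drops it
// before the report is sent across a worker thread.
const error = new Error('unexpected character');
console.log({ ...error });                          // -> {}
console.log({ ...error, message: error.message });  // -> { message: 'unexpected character' }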
diff --git a/packages/langium/src/validation/document-validator.ts b/packages/langium/src/validation/document-validator.ts
index e5e1d56d5..fb5462415 100644
--- a/packages/langium/src/validation/document-validator.ts
+++ b/packages/langium/src/validation/document-validator.ts
@@ -18,6 +18,7 @@
 import { streamAst } from '../utils/ast-utils.js';
 import { tokenToRange } from '../utils/cst-utils.js';
 import { interruptAndCheck, isOperationCancelled } from '../utils/promise-utils.js';
 import { diagnosticData } from './validation-registry.js';
+import type { ILexingDiagnostic } from '../parser/lexer.js';
 
 export interface ValidationOptions {
     /**
@@ -97,21 +98,23 @@ export class DefaultDocumentValidator implements DocumentValidator {
     }
 
     protected processLexingErrors(parseResult: ParseResult, diagnostics: Diagnostic[], _options: ValidationOptions): void {
-        for (const lexerError of parseResult.lexerErrors) {
+        const lexerDiagnostics = [...parseResult.lexerErrors, ...parseResult.lexerReport?.diagnostics ?? []] as ILexingDiagnostic[];
+        for (const lexerDiagnostic of lexerDiagnostics) {
+            const severity = lexerDiagnostic?.severity ?? 'error';
             const diagnostic: Diagnostic = {
-                severity: toDiagnosticSeverity('error'),
+                severity: toDiagnosticSeverity(severity),
                 range: {
                     start: {
-                        line: lexerError.line! - 1,
-                        character: lexerError.column! - 1
+                        line: lexerDiagnostic.line! - 1,
+                        character: lexerDiagnostic.column! - 1
                     },
                     end: {
-                        line: lexerError.line! - 1,
-                        character: lexerError.column! + lexerError.length - 1
+                        line: lexerDiagnostic.line! - 1,
+                        character: lexerDiagnostic.column! + lexerDiagnostic.length - 1
                     }
                 },
-                message: lexerError.message,
-                data: diagnosticData(DocumentValidator.LexingError),
+                message: lexerDiagnostic.message,
+                data: toDiagnosticData(severity),
                 source: this.getSource()
             };
             diagnostics.push(diagnostic);
@@ -245,8 +248,26 @@ export function toDiagnosticSeverity(severity: 'error' | 'warning' | 'info' | 'hint'): DiagnosticSeverity {
     }
 }
 
+export function toDiagnosticData(severity: 'error' | 'warning' | 'info' | 'hint'): DiagnosticData {
+    switch (severity) {
+        case 'error':
+            return diagnosticData(DocumentValidator.LexingError);
+        case 'warning':
+            return diagnosticData(DocumentValidator.LexingWarning);
+        case 'info':
+            return diagnosticData(DocumentValidator.LexingInfo);
+        case 'hint':
+            return diagnosticData(DocumentValidator.LexingHint);
+        default:
+            throw new Error('Invalid diagnostic severity: ' + severity);
+    }
+}
+
 export namespace DocumentValidator {
     export const LexingError = 'lexing-error';
+    export const LexingWarning = 'lexing-warning';
+    export const LexingInfo = 'lexing-info';
+    export const LexingHint = 'lexing-hint';
     export const ParsingError = 'parsing-error';
     export const LinkingError = 'linking-error';
 }
diff --git a/packages/langium/test/parser/indentation-aware.test.ts b/packages/langium/test/parser/indentation-aware.test.ts
index cb7026bc7..6fbdef599 100644
--- a/packages/langium/test/parser/indentation-aware.test.ts
+++ b/packages/langium/test/parser/indentation-aware.test.ts
@@ -6,7 +6,7 @@
 import type { TokenType } from '@chevrotain/types';
 import type { AstNode, Grammar, IndentationTokenBuilderOptions, LangiumParser, Lexer, Module } from 'langium';
-import { beforeEach, describe, expect, test } from 'vitest';
+import { describe, expect, test } from 'vitest';
 import { EmptyFileSystem, IndentationAwareLexer, IndentationAwareTokenBuilder } from 'langium';
 import { createLangiumGrammarServices, createServicesForGrammar } from 'langium/grammar';
 import type { LangiumServices, PartialLangiumServices } from 'langium/lsp';
@@ -51,10 +51,6 @@ async function createIndentationAwareServices(grammar: string, options?: Partial<IndentationTokenBuilderOptions>) {
     return services;
 }
 
-beforeEach(() => {
-    tokenBuilder.popRemainingDedents('');
-});
-
 describe('IndentationAwareTokenBuilder', () => {
 
     const sampleGrammar = `
@@ -333,10 +329,28 @@ describe('IndentationAware parsing', () => {
             else:
                 return true
         `);
-
         expect(parserErrors.length).toBeGreaterThan(0);
     });
 
+    test('should report error on non-matching dedent', async () => {
+        const parser = await getParser(sampleGrammar);
+        const { lexerReport } = parser.parse(expandToString`
+            if true:
+                return false
+              else:
+                return true
+        `);
+        expect(lexerReport?.diagnostics.length).toBe(1);
+        const diagnostic = lexerReport?.diagnostics[0];
+        expect(diagnostic).toBeDefined();
+        expect(diagnostic!.severity).toBe('error');
+        expect(diagnostic!.message).toContain('Invalid dedent level');
+        // offset should be 26 on Linux/macOS and 28 on Windows due to differences in line endings
+        expect([26, 28]).toContain(diagnostic!.offset);
+        expect(diagnostic!.length).toBe(2);
+        expect(diagnostic!.line).toBe(3);
+    });
+
     test('should throw an error on unexpected indent', async () => {
         const parser = await getParser(sampleGrammar);
         const { parserErrors } = parser.parse(expandToString`
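// End-to-end sketch, not part of the patch: a parse result now carries an optional
// lexerReport whose diagnostics join lexerErrors during validation, each mapped
// through toDiagnosticSeverity and the new toDiagnosticData. This helper is
// illustrative; `parser` is any LangiumParser instance.
import type { AstNode, LangiumParser } from 'langium';

function collectLexingMessages(parser: LangiumParser, text: string): string[] {
    const { lexerErrors, lexerReport } = parser.parse<AstNode>(text);
    const fromErrors = lexerErrors.map(e => `error: ${e.message}`);
    // Report diagnostics default to 'error' when no severity is set,
    // mirroring processLexingErrors in document-validator.ts.
    const fromReport = (lexerReport?.diagnostics ?? []).map(d => `${d.severity ?? 'error'}: ${d.message}`);
    return [...fromErrors, ...fromReport];
}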