diff --git a/README.md b/README.md index d3c0fa7..b0ba843 100644 --- a/README.md +++ b/README.md @@ -127,18 +127,21 @@ See [Quantifiers API doc](https://callstack.github.io/ts-regex-builder/api/quant ### Character classes -| Character class | Regex Syntax | Description | -| --------------------- | ------------ | ------------------------------------------------- | -| `any` | `.` | Any character | -| `word` | `\w` | Word character: letter, digit, underscore | -| `digit` | `\d` | Digit character: 0 to 9 | -| `whitespace` | `\s` | Whitespace character: space, tab, line break, ... | -| `anyOf('abc')` | `[abc]` | Any of provided characters | -| `charRange('a', 'z')` | `[a-z]` | Character in a range | -| `charClass(...)` | `[...]` | Union of multiple character classes | -| `negated(...)` | `[^...]` | Negation of a given character class | - -See [Character Classes API doc](https://callstack.github.io/ts-regex-builder/api/character-classes) for more info. +| Character class | Regex Syntax | Description | +| ---------------------- | ------------ | ------------------------------------------------- | +| `any` | `.` | Any character | +| `word` | `\w` | Word character: letter, digit, underscore | +| `digit` | `\d` | Digit character: 0 to 9 | +| `whitespace` | `\s` | Whitespace character: space, tab, line break, ... | +| `anyOf('abc')` | `[abc]` | Any of provided characters | +| `charRange('a', 'z')` | `[a-z]` | Character in a range | +| `charClass(...)` | `[...]` | Union of multiple character classes | +| `negated(...)` | `[^...]` | Negation of a given character class | +| `char(...)` | `\uXXXX` | Character specified by the given Unicode code point | +| `unicodeProperty(...)` | `\p{...}` | Characters with given Unicode property | + + +See [Character Classes API doc](https://callstack.github.io/ts-regex-builder/api/character-classes) and [Unicode API doc](https://callstack.github.io/ts-regex-builder/api/unicode) for more info. 
### Assertions @@ -177,9 +180,12 @@ TS Regex Builder is inspired by [Swift Regex Builder API](https://developer.appl ## Reference -- [ECMAScript Regular Expression BNF Grammar](https://262.ecma-international.org/7.0/#sec-regular-expressions) -- [Swift Regex Builder API docs](https://developer.apple.com/documentation/regexbuilder) +- [ECMAScript Regular Expression BNF Grammar](https://tc39.es/ecma262/#sec-regular-expressions) +- [Unicode Regular Expressions](https://www.unicode.org/reports/tr18/) - [Swift Evolution 351: Regex Builder DSL](https://github.com/apple/swift-evolution/blob/main/proposals/0351-regex-builder.md) +- [Swift Regex Builder API docs](https://developer.apple.com/documentation/regexbuilder) + + --- diff --git a/src/__tests__/builder.test.ts b/src/__tests__/builder.test.ts index 7cf2797..7bb2a6d 100644 --- a/src/__tests__/builder.test.ts +++ b/src/__tests__/builder.test.ts @@ -1,4 +1,4 @@ -import { buildRegExp } from '..'; +import { buildRegExp, char, unicodeProperty } from '..'; test('`regexBuilder` flags', () => { expect(buildRegExp('a').flags).toBe(''); @@ -32,3 +32,24 @@ test('`regexBuilder` flags', () => { }).flags, ).toBe('gisy'); }); + +test('`regexBuilder` throws when using unicode-aware features without `unicode` flag', () => { + expect(() => buildRegExp(char(0x1234))).not.toThrow(); + expect(() => buildRegExp(char(0x12345), { unicode: true })).not.toThrow(); + expect(() => buildRegExp(unicodeProperty('Emoji_Presentation'), { unicode: true })).not.toThrow(); + + expect(() => buildRegExp(char(0x123456))).toThrowErrorMatchingInlineSnapshot( + `"Expected a valid unicode code point but received 1193046"`, + ); + expect(() => buildRegExp(char(0x12345))).toThrowErrorMatchingInlineSnapshot( + `"The pattern "\\u{12345}" requires Unicode-aware mode. 
Please ensure the "unicode" flag is set."`, + ); + expect(() => + buildRegExp(unicodeProperty('Emoji_Presentation')), + ).toThrowErrorMatchingInlineSnapshot( + `"The pattern "\\p{Emoji_Presentation}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`, + ); + expect(() => buildRegExp(/\P{Letter}/u)).toThrowErrorMatchingInlineSnapshot( + `"The pattern "\\P{Letter}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`, + ); +}); diff --git a/src/builders.ts b/src/builders.ts index 5568761..482392f 100644 --- a/src/builders.ts +++ b/src/builders.ts @@ -11,6 +11,16 @@ import { encode } from './encoder'; export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp { const pattern = encode(sequence).pattern; const flagsString = encodeFlags(flags ?? {}); + + if (!flags?.unicode) { + const unicodeModePattern = getUnicodeModePattern(pattern); + if (unicodeModePattern) { + throw new Error( + `The pattern "${unicodeModePattern}" requires Unicode-aware mode. Please ensure the "unicode" flag is set.`, + ); + } + } + return new RegExp(pattern, flagsString); } @@ -32,6 +42,14 @@ function encodeFlags(flags: RegexFlags): string { if (flags.hasIndices) result += 'd'; if (flags.dotAll) result += 's'; if (flags.sticky) result += 'y'; + if (flags.unicode) result += 'u'; return result; } + +const unicodeModePatterns = /(?:\\u|\\p|\\P)\{.+?\}/; + +function getUnicodeModePattern(pattern: string): string | null { + const match = pattern.match(unicodeModePatterns); + return match?.[0] ?? 
null; +} diff --git a/src/constructs/__tests__/char-escape-unicode.test.tsx b/src/constructs/__tests__/char-escape-unicode.test.tsx new file mode 100644 index 0000000..e7c940e --- /dev/null +++ b/src/constructs/__tests__/char-escape-unicode.test.tsx @@ -0,0 +1,154 @@ +import { + buildRegExp, + char, + charClass, + endOfString, + type RegexSequence, + startOfString, + unicodeProperty, +} from '../..'; + +function u(sequence: RegexSequence) { + return buildRegExp(sequence, { unicode: true }); +} + +test('`char` pattern', () => { + // eslint-disable-next-line no-control-regex + expect(char(0)).toEqualRegex(/\u0000/); + // eslint-disable-next-line no-control-regex + expect(char(0x1)).toEqualRegex(/\u0001/); + // eslint-disable-next-line no-control-regex + expect(char(0x12)).toEqualRegex(/\u0012/); + expect(char(0x123)).toEqualRegex(/\u0123/); + expect(char(0x1234)).toEqualRegex(/\u1234/); + + // eslint-disable-next-line no-control-regex + expect(u(char(0))).toEqualRegex(new RegExp('\\u0000', 'u')); + // eslint-disable-next-line no-control-regex + expect(u(char(0x1))).toEqualRegex(new RegExp('\\u0001', 'u')); + expect(u(char(0x12))).toEqualRegex( + // eslint-disable-next-line no-control-regex + new RegExp('\\u0012', 'u'), + ); + expect(char(0x0123)).toEqualRegex(/\u0123/); + expect(char(0x1234)).toEqualRegex(/\u1234/); + + expect(u(char(0x0123))).toEqualRegex(/\u0123/u); + expect(u(char(0x1234))).toEqualRegex(/\u1234/u); + expect(u(char(0x12345))).toEqualRegex(new RegExp('\\u{12345}', 'u')); + expect(u(char(0x103456))).toEqualRegex(new RegExp('\\u{103456}', 'u')); +}); + +test('`char` matching', () => { + expect(char(0)).toMatchString('\u{0}'); + expect(char(0x1)).toMatchString('\u{1}'); + expect(char(0x12)).toMatchString('\u{12}}'); + expect(char(0x123)).toMatchString('\u{123}'); + expect(char(0x1234)).toMatchString('\u{1234}}'); + + expect(char('a'.codePointAt(0)!)).toMatchString('a'); + expect(char('ą'.codePointAt(0)!)).toMatchString('ą'); + 
expect(char('©'.codePointAt(0)!)).toMatchString('©'); + + expect(u(char(0))).toMatchString('\u{0}'); + expect(u(char(0))).not.toMatchString('a'); + expect(u(char(0x1))).toMatchString('\u{1}'); + expect(u(char(0x12))).toMatchString('\u{12}'); + expect(u(char(0x123))).toMatchString('\u{123}'); + expect(u(char(0x1234))).toMatchString('\u{1234}'); + expect(u(char(0x12345))).toMatchString('\u{12345}'); + expect(u(char(0x103456))).toMatchString('\u{103456}'); + + expect(u(char('a'.codePointAt(0)!))).toMatchString('a'); + expect(u(char('ą'.codePointAt(0)!))).toMatchString('ą'); + expect(u(char('©'.codePointAt(0)!))).toMatchString('©'); + expect(u(char('😎'.codePointAt(0)!))).toMatchString('😎'); + expect(u(char('😎'.codePointAt(0)!))).toMatchString('\u{1f60e}'); +}); + +test('`char` nesting matching', () => { + expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).toMatchString('a'); + expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).toMatchString('ą'); + expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).not.toMatchString('b'); +}); + +test('`char` edge cases handling', () => { + expect(() => u(char(NaN))).toThrowErrorMatchingInlineSnapshot( + `"Expected a valid unicode code point but received NaN"`, + ); + expect(() => u(char(1.5))).toThrowErrorMatchingInlineSnapshot( + `"Expected a valid unicode code point but received 1.5"`, + ); + expect(() => u(char(-1))).toThrowErrorMatchingInlineSnapshot( + `"Expected a valid unicode code point but received -1"`, + ); + expect(() => u(char(0x110000))).toThrowErrorMatchingInlineSnapshot( + `"Expected a valid unicode code point but received 1114112"`, + ); + + expect(u(char(0x10ffff))).toEqualRegex(/\u{10ffff}/u); +}); + +test('`unicodeProperty` pattern', () => { + expect(u(unicodeProperty('General_Category', 'Letter'))).toEqualRegex( + /\p{General_Category=Letter}/u, + ); + expect(u(unicodeProperty('Letter'))).toEqualRegex(/\p{Letter}/u); + 
expect(u(unicodeProperty('L'))).toEqualRegex(/\p{L}/u); + expect(u(unicodeProperty('Lu'))).toEqualRegex(/\p{Lu}/u); + expect(u(unicodeProperty('Ll'))).toEqualRegex(/\p{Ll}/u); + expect(u(unicodeProperty('Lt'))).toEqualRegex(/\p{Lt}/u); + expect(u(unicodeProperty('Lm'))).toEqualRegex(/\p{Lm}/u); + expect(u(unicodeProperty('Lo'))).toEqualRegex(/\p{Lo}/u); + + expect(u(unicodeProperty('Script', 'Latin'))).toEqualRegex('\\p{Script=Latin}'); + expect(u(unicodeProperty('Script', 'Grek'))).toEqualRegex('\\p{Script=Grek}'); + expect(u(unicodeProperty('sc', 'Cyrillic'))).toEqualRegex('\\p{sc=Cyrillic}'); + + expect(u(unicodeProperty('Script', 'Thaana'))).toEqualRegex('\\p{Script=Thaana}'); + expect(u(unicodeProperty('Script_Extensions', 'Thaana'))).toEqualRegex( + '\\p{Script_Extensions=Thaana}', + ); + expect(u(unicodeProperty('scx', 'Thaana'))).toEqualRegex('\\p{scx=Thaana}'); + + expect(u(unicodeProperty('Emoji'))).toEqualRegex('\\p{Emoji}'); +}); + +test('`unicodeProperty` matching', () => { + expect(u(unicodeProperty('General_Category', 'Letter'))).toMatchString('A'); + expect(u(unicodeProperty('Letter'))).toMatchString('A'); + expect(u(unicodeProperty('L'))).toMatchString('A'); + + expect(u(unicodeProperty('Uppercase'))).toMatchString('A'); + expect(u(unicodeProperty('Uppercase'))).not.toMatchString('a'); + expect(u(unicodeProperty('Lu'))).toMatchString('A'); + + expect(u(unicodeProperty('Lowercase'))).toMatchString('a'); + expect(u(unicodeProperty('Lowercase'))).not.toMatchString('A'); + expect(u(unicodeProperty('Ll'))).toMatchString('a'); + + expect(u(unicodeProperty('Script', 'Latin'))).toMatchString('A'); + expect(u(unicodeProperty('Script', 'Latin'))).not.toMatchString('α'); + expect(u(unicodeProperty('Script', 'Grek'))).toMatchString('α'); + expect(u(unicodeProperty('Script', 'Grek'))).not.toMatchString('A'); + + // Basic emoji + expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).toMatchString('😎'); + expect(u([startOfString, 
unicodeProperty('Emoji'), endOfString])).toMatchString('🐌'); + + // Complex emoji with skin tone modifier + expect(u(unicodeProperty('Emoji'))).toMatchString('☝🏼'); + expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).not.toMatchString('☝🏼'); +}); + +test('`unicodeProperty` nesting matching', () => { + expect(u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space')))).toMatchString( + 'a', + ); + expect(u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space')))).toMatchString( + ' ', + ); + expect( + u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space'))), + ).not.toMatchString('A'); +}); diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts index b2bc758..c480d9f 100644 --- a/src/constructs/char-class.ts +++ b/src/constructs/char-class.ts @@ -32,7 +32,7 @@ export function charRange(start: string, end: string): CharacterClass { } export function anyOf(characters: string): CharacterClass { - const chars = characters.split('').map((c) => escapeForCharacterClass(c)); + const chars = characters.split('').map((c) => escapeCharClass(c)); if (chars.length === 0) { throw new Error('`anyOf` should received at least one character'); @@ -52,6 +52,6 @@ export function negated(element: CharacterClass | CharacterEscape): EncodedRegex */ export const inverted = negated; -function escapeForCharacterClass(text: string): string { +function escapeCharClass(text: string): string { return text.replace(/[\]\\]/g, '\\$&'); // $& means the whole matched string } diff --git a/src/constructs/char-escape.ts b/src/constructs/char-escape.ts index 77aa2cb..fcf6be5 100644 --- a/src/constructs/char-escape.ts +++ b/src/constructs/char-escape.ts @@ -59,3 +59,54 @@ export const notWord = nonWord; * @deprecated Renamed to `nonWhitespace`. */ export const notWhitespace = nonWhitespace; + +/** + * Unicode character code point escape. 
+ * + * Regex pattern: + * - `\uXXXX`: 4-digit hex escape for code points below 0x10000. + * - `\u{X}`: Unicode code point escape for code points above 0xFFFF. + * + * Note: for code points above 0xFFFF, the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). + * + * @param codePoint The code point of the character to escape. + * @returns A character class representing the unicode escape. + */ +export function char(codePoint: number): CharacterEscape { + if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) { + throw new RangeError(`Expected a valid unicode code point but received ${codePoint}`); + } + + let escape = + codePoint < 0x10000 + ? `\\u${codePoint.toString(16).padStart(4, '0')}` // 4-digit hex (works in all modes) + : `\\u{${codePoint.toString(16)}}`; // 1-6 digit hex (requires unicode-aware mode) + + return { + precedence: 'atom', + pattern: escape, + chars: [escape], + }; +} + +/** + * Unicode property escape matching a set of characters specified by a Unicode property. + * + * Regex pattern: `\p{Property}` or `\p{Property=Value}` + * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape + * + * Note: the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). + * + * @param property Unicode property name. + * @param value Unicode property value (optional). + * @returns A character class representing the unicode property escape. + */ +export function unicodeProperty(property: string, value?: string): CharacterEscape { + const escape = `\\p{${property}${value ? 
`=${value}` : ''}}`; + + return { + precedence: 'atom', + pattern: escape, + chars: [escape], + }; +} diff --git a/src/index.ts b/src/index.ts index d2c5791..30d6677 100644 --- a/src/index.ts +++ b/src/index.ts @@ -5,35 +5,37 @@ export type { QuantifierOptions } from './constructs/quantifiers'; export type { RepeatOptions } from './constructs/repeat'; // Builders -export { buildPattern, buildRegExp } from './builders'; +export { buildRegExp, buildPattern } from './builders'; // Constructs export { + startOfString, endOfString, + wordBoundary, nonWordBoundary, notWordBoundary, - startOfString, - wordBoundary, } from './constructs/anchors'; export { capture, ref } from './constructs/capture'; -export { anyOf, charClass, charRange, negated, inverted } from './constructs/char-class'; +export { charClass, charRange, anyOf, negated, inverted } from './constructs/char-class'; export { any, digit, nonDigit, - nonWhitespace, + word, nonWord, + whitespace, + nonWhitespace, notDigit, notWhitespace, notWord, - whitespace, - word, + char, + unicodeProperty, } from './constructs/char-escape'; export { choiceOf } from './constructs/choice-of'; export { lookahead } from './constructs/lookahead'; export { lookbehind } from './constructs/lookbehind'; export { negativeLookahead } from './constructs/negative-lookahead'; export { negativeLookbehind } from './constructs/negative-lookbehind'; -export { oneOrMore, optional, zeroOrMore } from './constructs/quantifiers'; +export { zeroOrMore, oneOrMore, optional } from './constructs/quantifiers'; export { regex } from './constructs/regex'; export { repeat } from './constructs/repeat'; diff --git a/src/types.ts b/src/types.ts index 2b102d5..81e23a3 100644 --- a/src/types.ts +++ b/src/types.ts @@ -79,8 +79,19 @@ export interface RegexFlags { dotAll?: boolean; /** - * MDN: _Matches only from the index indicated by the lastIndex property of this regular expression in the target string. 
Does not attempt to match from any later indexes._ + * MDN: _Matches only from the index indicated by the `lastIndex` property of this regular expression in the target string. Does not attempt to match from any later indexes._ * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/sticky */ sticky?: boolean; + + /** + * Enables [Unicode-aware mode](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). + * + * This enables features like: + * - Unicode character escapes: `\u{xxxx}` + * - Unicode character property escapes: `\p{Property=Value}` + * + * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode + */ + unicode?: boolean; } diff --git a/website/docs/api/overview.md b/website/docs/api/overview.md index 6f5f71f..ab6031e 100644 --- a/website/docs/api/overview.md +++ b/website/docs/api/overview.md @@ -75,18 +75,20 @@ See [Quantifiers](./api/quantifiers) for more info. ### Character classes -| Character class | Regex Syntax | Description | -| --------------------- | ------------ | ------------------------------------------------- | -| `any` | `.` | Any character | -| `word` | `\w` | Word character: letter, digit, underscore | -| `digit` | `\d` | Digit character: 0 to 9 | -| `whitespace` | `\s` | Whitespace character: space, tab, line break, ... | -| `anyOf('abc')` | `[abc]` | Any of provided characters | -| `charRange('a', 'z')` | `[a-z]` | Character in a range | -| `charClass(...)` | `[...]` | Union of multiple character classes | -| `negated(...)` | `[^...]` | Negation of a given character class | - -See [Character Classes](./api/character-classes) for more info. 
+| Character class | Regex Syntax | Description | +| ---------------------- | ------------ | ------------------------------------------------- | +| `any` | `.` | Any character | +| `word` | `\w` | Word character: letter, digit, underscore | +| `digit` | `\d` | Digit character: 0 to 9 | +| `whitespace` | `\s` | Whitespace character: space, tab, line break, ... | +| `anyOf('abc')` | `[abc]` | Any of provided characters | +| `charRange('a', 'z')` | `[a-z]` | Character in a range | +| `charClass(...)` | `[...]` | Union of multiple character classes | +| `negated(...)` | `[^...]` | Negation of a given character class | +| `char(...)` | `\uXXXX` | Character specified by the given Unicode code point | +| `unicodeProperty(...)` | `\p{...}` | Characters with given Unicode property | + +See [Character Classes](./api/character-classes) and [Unicode](./api/unicode) for more info. ### Assertions diff --git a/website/docs/api/unicode.md b/website/docs/api/unicode.md new file mode 100644 index 0000000..fc1648b --- /dev/null +++ b/website/docs/api/unicode.md @@ -0,0 +1,35 @@ +--- +id: unicode +title: Unicode +--- + +### Unicode-aware mode + +JavaScript `RegExp` object offers a [Unicode-aware mode](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode), which is enabled by the `u` flag (the `unicode` flag of `buildRegExp`). + +### Character escapes + +```ts +function char(codePoint: number): CharacterEscape; +``` + +Regex syntax: + +- `\uXXXX`: 4-digit hex escape for code points below 0x10000. +- `\u{X}`: Unicode code point escape for code points above 0xFFFF. + +Note: for code points above 0xFFFF, the regex engine must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode). + +### Unicode character property escapes + +```ts +function unicodeProperty(property: string, value?: string): CharacterEscape; +``` + +Unicode character property escape matching a set of characters specified by a Unicode property. 
+ +Regex syntax: `\p{Property}` or `\p{Property=Value}` + +See: +- [MDN: Unicode character class escape](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape) +- [UTS#18: Unicode Regular Expressions](https://www.unicode.org/reports/tr18/) diff --git a/website/sidebars.js b/website/sidebars.js index ed97527..ebb4d7b 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -33,6 +33,7 @@ export default { 'api/quantifiers', 'api/character-classes', 'api/assertions', + 'api/unicode', ], }, {