Skip to content

Commit

Permalink
feat: unicode modes (#85)
Browse files Browse the repository at this point in the history
  • Loading branch information
mdjastrzebski authored Sep 6, 2024
1 parent 17641ad commit 0cf4af8
Show file tree
Hide file tree
Showing 11 changed files with 339 additions and 38 deletions.
34 changes: 20 additions & 14 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -127,18 +127,21 @@ See [Quantifiers API doc](https://callstack.github.io/ts-regex-builder/api/quant

### Character classes

| Character class | Regex Syntax | Description |
| --------------------- | ------------ | ------------------------------------------------- |
| `any` | `.` | Any character |
| `word` | `\w` | Word character: letter, digit, underscore |
| `digit` | `\d` | Digit character: 0 to 9 |
| `whitespace` | `\s` | Whitespace character: space, tab, line break, ... |
| `anyOf('abc')` | `[abc]` | Any of provided characters |
| `charRange('a', 'z')` | `[a-z]` | Character in a range |
| `charClass(...)` | `[...]` | Union of multiple character classes |
| `negated(...)` | `[^...]` | Negation of a given character class |

See [Character Classes API doc](https://callstack.github.io/ts-regex-builder/api/character-classes) for more info.
| Character class | Regex Syntax | Description |
| ---------------------- | ------------ | ------------------------------------------------- |
| `any` | `.` | Any character |
| `word` | `\w` | Word character: letter, digit, underscore |
| `digit` | `\d` | Digit character: 0 to 9 |
| `whitespace` | `\s` | Whitespace character: space, tab, line break, ... |
| `anyOf('abc')` | `[abc]` | Any of provided characters |
| `charRange('a', 'z')` | `[a-z]` | Character in a range |
| `charClass(...)` | `[...]` | Union of multiple character classes |
| `negated(...)` | `[^...]` | Negation of a given character class |
| `char(...)` | `\uXXXX` | Character specified by the given Unicode code point |
| `unicodeProperty(...)` | `\p{...}` | Characters with given Unicode property |


See [Character Classes API doc](https://callstack.github.io/ts-regex-builder/api/character-classes) and [Unicode API doc](https://callstack.github.io/ts-regex-builder/api/unicode) for more info.

### Assertions

Expand Down Expand Up @@ -177,9 +180,12 @@ TS Regex Builder is inspired by [Swift Regex Builder API](https://developer.appl

## Reference

- [ECMAScript Regular Expression BNF Grammar](https://262.ecma-international.org/7.0/#sec-regular-expressions)
- [Swift Regex Builder API docs](https://developer.apple.com/documentation/regexbuilder)
- [ECMAScript Regular Expression BNF Grammar](https://tc39.es/ecma262/#sec-regular-expressions)
- [Unicode Regular Expressions](https://www.unicode.org/reports/tr18/)
- [Swift Evolution 351: Regex Builder DSL](https://github.com/apple/swift-evolution/blob/main/proposals/0351-regex-builder.md)
- [Swift Regex Builder API docs](https://developer.apple.com/documentation/regexbuilder)



---

Expand Down
23 changes: 22 additions & 1 deletion src/__tests__/builder.test.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { buildRegExp } from '..';
import { buildRegExp, char, unicodeProperty } from '..';

test('`regexBuilder` flags', () => {
expect(buildRegExp('a').flags).toBe('');
Expand Down Expand Up @@ -32,3 +32,24 @@ test('`regexBuilder` flags', () => {
}).flags,
).toBe('gisy');
});

// Verifies that `buildRegExp` rejects patterns that need Unicode-aware mode unless the `unicode` flag is passed.
test('`regexBuilder` throws when using unicode-aware features without `unicode` flag', () => {
// Accepted: a 4-digit `\uXXXX` escape without any flag, and `\u{...}` / `\p{...}` with `unicode: true`.
expect(() => buildRegExp(char(0x1234))).not.toThrow();
expect(() => buildRegExp(char(0x12345), { unicode: true })).not.toThrow();
expect(() => buildRegExp(unicodeProperty('Emoji_Presentation'), { unicode: true })).not.toThrow();

// 0x123456 is above the 0x10ffff maximum, so `char` itself throws before anything is built.
expect(() => buildRegExp(char(0x123456))).toThrowErrorMatchingInlineSnapshot(
`"Expected a valid unicode code point but received 1193046"`,
);
// Rejected: `\u{...}` and `\p{...}` escapes when the `unicode` flag is missing.
expect(() => buildRegExp(char(0x12345))).toThrowErrorMatchingInlineSnapshot(
`"The pattern "\\u{12345}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`,
);
expect(() =>
buildRegExp(unicodeProperty('Emoji_Presentation')),
).toThrowErrorMatchingInlineSnapshot(
`"The pattern "\\p{Emoji_Presentation}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`,
);
// The `u` flag on the embedded RegExp literal does not count: only the flags passed to `buildRegExp` are checked.
expect(() => buildRegExp(/\P{Letter}/u)).toThrowErrorMatchingInlineSnapshot(
`"The pattern "\\P{Letter}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`,
);
});
18 changes: 18 additions & 0 deletions src/builders.ts
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,16 @@ import { encode } from './encoder';
/**
 * Builds a `RegExp` from a regex sequence and optional flags.
 *
 * @param sequence Sequence of regex elements to encode into the pattern.
 * @param flags Flags for the resulting `RegExp` (optional).
 * @returns A `RegExp` created from the encoded pattern and flags.
 * @throws Error when the `unicode` flag is not set but the encoded pattern
 * contains an escape reported by `getUnicodeModePattern`.
 */
export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp {
  const { pattern } = encode(sequence);
  const flagsString = encodeFlags(flags ?? {});

  // Only scan for Unicode-only escapes when the caller has not enabled Unicode-aware mode.
  const offendingEscape = flags?.unicode ? null : getUnicodeModePattern(pattern);
  if (offendingEscape) {
    throw new Error(
      `The pattern "${offendingEscape}" requires Unicode-aware mode. Please ensure the "unicode" flag is set.`,
    );
  }

  return new RegExp(pattern, flagsString);
}

Expand All @@ -32,6 +42,14 @@ function encodeFlags(flags: RegexFlags): string {
if (flags.hasIndices) result += 'd';
if (flags.dotAll) result += 's';
if (flags.sticky) result += 'y';
if (flags.unicode) result += 'u';

return result;
}

/**
 * Matches the first escape that is only valid in Unicode-aware mode:
 * `\u{...}`, `\p{...}` or `\P{...}` (captured in group 1).
 *
 * The anchored prefix walks the pattern one token at a time: either a
 * non-backslash character, or a complete `\X` escape pair that is not itself a
 * Unicode-mode escape. Consuming `\\` as a pair means an escaped backslash
 * followed by literal text such as `u{2}` is not mistaken for a `\u{2}` escape.
 */
const unicodeModePatterns = /^(?:[^\\]|\\(?![upP]\{.+?\})[\s\S])*?(\\[upP]\{.+?\})/;

/**
 * Finds the first escape sequence in `pattern` that requires Unicode-aware mode.
 *
 * @param pattern Regex pattern source to inspect.
 * @returns The offending escape (e.g. `\p{Letter}`), or `null` when there is none.
 */
function getUnicodeModePattern(pattern: string): string | null {
  const match = pattern.match(unicodeModePatterns);
  return match?.[1] ?? null;
}
154 changes: 154 additions & 0 deletions src/constructs/__tests__/char-escape-unicode.test.tsx
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
import {
buildRegExp,
char,
charClass,
endOfString,
type RegexSequence,
startOfString,
unicodeProperty,
} from '../..';

/** Test helper: builds a regex from `sequence` with the `unicode` flag enabled. */
function u(sequence: RegexSequence) {
  const unicodeFlags = { unicode: true };
  return buildRegExp(sequence, unicodeFlags);
}

// Checks the pattern that `char` produces, with and without the `unicode` flag.
test('`char` pattern', () => {
// Without the `unicode` flag: code points up to 0xFFFF become zero-padded 4-digit `\uXXXX` escapes.
// eslint-disable-next-line no-control-regex
expect(char(0)).toEqualRegex(/\u0000/);
// eslint-disable-next-line no-control-regex
expect(char(0x1)).toEqualRegex(/\u0001/);
// eslint-disable-next-line no-control-regex
expect(char(0x12)).toEqualRegex(/\u0012/);
expect(char(0x123)).toEqualRegex(/\u0123/);
expect(char(0x1234)).toEqualRegex(/\u1234/);

// The same code points built through `u(...)`, i.e. with the `unicode` flag set.
// eslint-disable-next-line no-control-regex
expect(u(char(0))).toEqualRegex(new RegExp('\\u0000', 'u'));
// eslint-disable-next-line no-control-regex
expect(u(char(0x1))).toEqualRegex(new RegExp('\\u0001', 'u'));
expect(u(char(0x12))).toEqualRegex(
// eslint-disable-next-line no-control-regex
new RegExp('\\u0012', 'u'),
);
// A leading zero in the numeric literal does not change the code point (0x0123 === 0x123).
expect(char(0x0123)).toEqualRegex(/\u0123/);
expect(char(0x1234)).toEqualRegex(/\u1234/);

// With the `unicode` flag: code points above 0xFFFF use the braced `\u{...}` form.
expect(u(char(0x0123))).toEqualRegex(/\u0123/u);
expect(u(char(0x1234))).toEqualRegex(/\u1234/u);
expect(u(char(0x12345))).toEqualRegex(new RegExp('\\u{12345}', 'u'));
expect(u(char(0x103456))).toEqualRegex(new RegExp('\\u{103456}', 'u'));
});

// Checks that regexes built from `char` match the character with the given code point.
test('`char` matching', () => {
  // Without the `unicode` flag (code points up to 0xFFFF only).
  expect(char(0)).toMatchString('\u{0}');
  expect(char(0x1)).toMatchString('\u{1}');
  // Fixed: the subject strings for 0x12 and 0x1234 previously carried a stray trailing `}`.
  expect(char(0x12)).toMatchString('\u{12}');
  expect(char(0x123)).toMatchString('\u{123}');
  expect(char(0x1234)).toMatchString('\u{1234}');

  expect(char('a'.codePointAt(0)!)).toMatchString('a');
  expect(char('ą'.codePointAt(0)!)).toMatchString('ą');
  expect(char('©'.codePointAt(0)!)).toMatchString('©');

  // With the `unicode` flag, including code points above 0xFFFF.
  expect(u(char(0))).toMatchString('\u{0}');
  expect(u(char(0))).not.toMatchString('a');
  expect(u(char(0x1))).toMatchString('\u{1}');
  expect(u(char(0x12))).toMatchString('\u{12}');
  expect(u(char(0x123))).toMatchString('\u{123}');
  expect(u(char(0x1234))).toMatchString('\u{1234}');
  expect(u(char(0x12345))).toMatchString('\u{12345}');
  expect(u(char(0x103456))).toMatchString('\u{103456}');

  expect(u(char('a'.codePointAt(0)!))).toMatchString('a');
  expect(u(char('ą'.codePointAt(0)!))).toMatchString('ą');
  expect(u(char('©'.codePointAt(0)!))).toMatchString('©');
  expect(u(char('😎'.codePointAt(0)!))).toMatchString('😎');
  expect(u(char('😎'.codePointAt(0)!))).toMatchString('\u{1f60e}');
});

// Checks that `char` escapes can be combined inside `charClass`.
test('`char` nesting matching', () => {
  const latinA = 'a'.codePointAt(0)!;
  const aOgonek = 'ą'.codePointAt(0)!;
  // A fresh regex is built for every assertion.
  const buildClass = () => u(charClass(char(latinA), char(aOgonek)));

  expect(buildClass()).toMatchString('a');
  expect(buildClass()).toMatchString('ą');
  expect(buildClass()).not.toMatchString('b');
});

// Checks that `char` rejects values that are not valid Unicode code points.
test('`char` edge cases handling', () => {
// Rejected: not an integer (NaN, fractional), negative, or above the 0x10ffff maximum.
expect(() => u(char(NaN))).toThrowErrorMatchingInlineSnapshot(
`"Expected a valid unicode code point but received NaN"`,
);
expect(() => u(char(1.5))).toThrowErrorMatchingInlineSnapshot(
`"Expected a valid unicode code point but received 1.5"`,
);
expect(() => u(char(-1))).toThrowErrorMatchingInlineSnapshot(
`"Expected a valid unicode code point but received -1"`,
);
expect(() => u(char(0x110000))).toThrowErrorMatchingInlineSnapshot(
`"Expected a valid unicode code point but received 1114112"`,
);

// 0x10ffff is the largest accepted code point.
expect(u(char(0x10ffff))).toEqualRegex(/\u{10ffff}/u);
});

// Checks the `\p{...}` pattern that `unicodeProperty` produces.
test('`unicodeProperty` pattern', () => {
  // Expectations given as RegExp literals: general category in long, value-only and short forms.
  const regexExpectations: Array<[RegExp, RegExp]> = [
    [u(unicodeProperty('General_Category', 'Letter')), /\p{General_Category=Letter}/u],
    [u(unicodeProperty('Letter')), /\p{Letter}/u],
    [u(unicodeProperty('L')), /\p{L}/u],
    [u(unicodeProperty('Lu')), /\p{Lu}/u],
    [u(unicodeProperty('Ll')), /\p{Ll}/u],
    [u(unicodeProperty('Lt')), /\p{Lt}/u],
    [u(unicodeProperty('Lm')), /\p{Lm}/u],
    [u(unicodeProperty('Lo')), /\p{Lo}/u],
  ];
  for (const [actual, expected] of regexExpectations) {
    expect(actual).toEqualRegex(expected);
  }

  // Expectations given as pattern strings: scripts, script extensions and binary properties.
  const sourceExpectations: Array<[RegExp, string]> = [
    [u(unicodeProperty('Script', 'Latin')), '\\p{Script=Latin}'],
    [u(unicodeProperty('Script', 'Grek')), '\\p{Script=Grek}'],
    [u(unicodeProperty('sc', 'Cyrillic')), '\\p{sc=Cyrillic}'],
    [u(unicodeProperty('Script', 'Thaana')), '\\p{Script=Thaana}'],
    [u(unicodeProperty('Script_Extensions', 'Thaana')), '\\p{Script_Extensions=Thaana}'],
    [u(unicodeProperty('scx', 'Thaana')), '\\p{scx=Thaana}'],
    [u(unicodeProperty('Emoji')), '\\p{Emoji}'],
  ];
  for (const [actual, expected] of sourceExpectations) {
    expect(actual).toEqualRegex(expected);
  }
});

// Checks which strings are matched by regexes built from `unicodeProperty`.
test('`unicodeProperty` matching', () => {
  // Unanchored matcher for a property (arguments are forwarded unchanged).
  const property = (...args: [string, string?]) => u(unicodeProperty(...args));
  // Matcher that requires the whole string to be a single character with the property.
  const wholeString = (name: string) => u([startOfString, unicodeProperty(name), endOfString]);

  expect(property('General_Category', 'Letter')).toMatchString('A');
  expect(property('Letter')).toMatchString('A');
  expect(property('L')).toMatchString('A');

  expect(property('Uppercase')).toMatchString('A');
  expect(property('Uppercase')).not.toMatchString('a');
  expect(property('Lu')).toMatchString('A');

  expect(property('Lowercase')).toMatchString('a');
  expect(property('Lowercase')).not.toMatchString('A');
  expect(property('Ll')).toMatchString('a');

  expect(property('Script', 'Latin')).toMatchString('A');
  expect(property('Script', 'Latin')).not.toMatchString('α');
  expect(property('Script', 'Grek')).toMatchString('α');
  expect(property('Script', 'Grek')).not.toMatchString('A');

  // Basic emoji
  expect(wholeString('Emoji')).toMatchString('😎');
  expect(wholeString('Emoji')).toMatchString('🐌');

  // Complex emoji with skin tone modifier: matched unanchored, but not as a single anchored character
  expect(property('Emoji')).toMatchString('☝🏼');
  expect(wholeString('Emoji')).not.toMatchString('☝🏼');
});

// Checks that `unicodeProperty` escapes can be combined inside `charClass`.
test('`unicodeProperty` nesting matching', () => {
  // A fresh regex is built for every assertion.
  const lowercaseOrWhitespace = () =>
    u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space')));

  expect(lowercaseOrWhitespace()).toMatchString('a');
  expect(lowercaseOrWhitespace()).toMatchString(' ');
  expect(lowercaseOrWhitespace()).not.toMatchString('A');
});
4 changes: 2 additions & 2 deletions src/constructs/char-class.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ export function charRange(start: string, end: string): CharacterClass {
}

export function anyOf(characters: string): CharacterClass {
const chars = characters.split('').map((c) => escapeForCharacterClass(c));
const chars = characters.split('').map((c) => escapeCharClass(c));

if (chars.length === 0) {
throw new Error('`anyOf` should received at least one character');
Expand All @@ -52,6 +52,6 @@ export function negated(element: CharacterClass | CharacterEscape): EncodedRegex
*/
export const inverted = negated;

function escapeForCharacterClass(text: string): string {
/** Prefixes every `]` and `\` in `text` with a backslash. */
function escapeCharClass(text: string): string {
  const needsEscaping = /[\]\\]/g;
  return text.replace(needsEscaping, (special) => `\\${special}`);
}
51 changes: 51 additions & 0 deletions src/constructs/char-escape.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,3 +59,54 @@ export const notWord = nonWord;
* @deprecated Renamed to `nonWhitespace`.
*/
export const notWhitespace = nonWhitespace;

/**
 * Character escape for a single Unicode code point.
 *
 * Regex pattern:
 * - `\uXXXX`: zero-padded 4-digit hex escape, used for code points up to 0xFFFF.
 * - `\u{X}`: braced code point escape, used for code points above 0xFFFF.
 *
 * Note: for code points above 0xFFFF, the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode).
 *
 * @param codePoint The code point of the character to escape.
 * @returns A character escape representing the code point.
 * @throws RangeError if `codePoint` is not an integer between 0 and 0x10FFFF (inclusive).
 */
export function char(codePoint: number): CharacterEscape {
  const isValidCodePoint =
    Number.isInteger(codePoint) && codePoint >= 0 && codePoint <= 0x10ffff;
  if (!isValidCodePoint) {
    throw new RangeError(`Expected a valid unicode code point but received ${codePoint}`);
  }

  const hex = codePoint.toString(16);
  // The braced form requires unicode-aware mode; the 4-digit form works in all modes.
  const pattern = codePoint > 0xffff ? `\\u{${hex}}` : `\\u${hex.padStart(4, '0')}`;

  return { precedence: 'atom', pattern, chars: [pattern] };
}

/**
 * Unicode property escape: matches the set of characters selected by a Unicode property.
 *
 * Regex pattern: `\p{Property}` or `\p{Property=Value}`
 * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape
 *
 * Note: the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode).
 *
 * @param property Unicode property name.
 * @param value Unicode property value (optional); an empty string is treated as omitted.
 * @returns A character escape representing the Unicode property escape.
 */
export function unicodeProperty(property: string, value?: string): CharacterEscape {
  const expression = value ? `${property}=${value}` : property;
  const pattern = `\\p{${expression}}`;

  return { precedence: 'atom', pattern, chars: [pattern] };
}
18 changes: 10 additions & 8 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,35 +5,37 @@ export type { QuantifierOptions } from './constructs/quantifiers';
export type { RepeatOptions } from './constructs/repeat';

// Builders
export { buildPattern, buildRegExp } from './builders';
export { buildRegExp, buildPattern } from './builders';

// Constructs
export {
startOfString,
endOfString,
wordBoundary,
nonWordBoundary,
notWordBoundary,
startOfString,
wordBoundary,
} from './constructs/anchors';
export { capture, ref } from './constructs/capture';
export { anyOf, charClass, charRange, negated, inverted } from './constructs/char-class';
export { charClass, charRange, anyOf, negated, inverted } from './constructs/char-class';
export {
any,
digit,
nonDigit,
nonWhitespace,
word,
nonWord,
whitespace,
nonWhitespace,
notDigit,
notWhitespace,
notWord,
whitespace,
word,
char,
unicodeProperty,
} from './constructs/char-escape';
export { choiceOf } from './constructs/choice-of';
export { lookahead } from './constructs/lookahead';
export { lookbehind } from './constructs/lookbehind';
export { negativeLookahead } from './constructs/negative-lookahead';
export { negativeLookbehind } from './constructs/negative-lookbehind';
export { oneOrMore, optional, zeroOrMore } from './constructs/quantifiers';
export { zeroOrMore, oneOrMore, optional } from './constructs/quantifiers';
export { regex } from './constructs/regex';
export { repeat } from './constructs/repeat';
Loading

0 comments on commit 0cf4af8

Please sign in to comment.