feat: unicode modes (#85)

callstack · Sep 6, 2024 · 0cf4af8 · 0cf4af8
1 parent 17641ad
commit 0cf4af8
Show file tree

Hide file tree

Showing 11 changed files with 339 additions and 38 deletions.
diff --git a/README.md b/README.md
@@ -127,18 +127,21 @@ See [Quantifiers API doc](https://callstack.github.io/ts-regex-builder/api/quant
 
 ### Character classes
 
-| Character class       | Regex Syntax | Description                                       |
-| --------------------- | ------------ | ------------------------------------------------- |
-| `any`                 | `.`          | Any character                                     |
-| `word`                | `\w`         | Word character: letter, digit, underscore         |
-| `digit`               | `\d`         | Digit character: 0 to 9                           |
-| `whitespace`          | `\s`         | Whitespace character: space, tab, line break, ... |
-| `anyOf('abc')`        | `[abc]`      | Any of provided characters                        |
-| `charRange('a', 'z')` | `[a-z]`      | Character in a range                              |
-| `charClass(...)`      | `[...]`      | Union of multiple character classes               |
-| `negated(...)`        | `[^...]`     | Negation of a given character class               |
-
-See [Character Classes API doc](https://callstack.github.io/ts-regex-builder/api/character-classes) for more info.
+| Character class        | Regex Syntax | Description                                       |
+| ---------------------- | ------------ | ------------------------------------------------- |
+| `any`                  | `.`          | Any character                                     |
+| `word`                 | `\w`         | Word character: letter, digit, underscore         |
+| `digit`                | `\d`         | Digit character: 0 to 9                           |
+| `whitespace`           | `\s`         | Whitespace character: space, tab, line break, ... |
+| `anyOf('abc')`         | `[abc]`      | Any of provided characters                        |
+| `charRange('a', 'z')`  | `[a-z]`      | Character in a range                              |
+| `charClass(...)`       | `[...]`      | Union of multiple character classes               |
+| `negated(...)`         | `[^...]`     | Negation of a given character class               |
+| `char(...)`            | `\uXXXX`     | Character with the given Unicode code point       |
+| `unicodeProperty(...)` | `\p{...}`    | Characters with given Unicode property            |
+
+
+See [Character Classes API doc](https://callstack.github.io/ts-regex-builder/api/character-classes) and [Unicode API doc](https://callstack.github.io/ts-regex-builder/api/unicode) for more info.
 
 ### Assertions
 
@@ -177,9 +180,12 @@ TS Regex Builder is inspired by [Swift Regex Builder API](https://developer.appl
 
 ## Reference
 
-- [ECMAScript Regular Expression BNF Grammar](https://262.ecma-international.org/7.0/#sec-regular-expressions)
-- [Swift Regex Builder API docs](https://developer.apple.com/documentation/regexbuilder)
+- [ECMAScript Regular Expression BNF Grammar](https://tc39.es/ecma262/#sec-regular-expressions)
+- [Unicode Regular Expressions](https://www.unicode.org/reports/tr18/)
 - [Swift Evolution 351: Regex Builder DSL](https://github.com/apple/swift-evolution/blob/main/proposals/0351-regex-builder.md)
+- [Swift Regex Builder API docs](https://developer.apple.com/documentation/regexbuilder)
+
+
 
 ---
 

diff --git a/src/__tests__/builder.test.ts b/src/__tests__/builder.test.ts
@@ -1,4 +1,4 @@
-import { buildRegExp } from '..';
+import { buildRegExp, char, unicodeProperty } from '..';
 
 test('`regexBuilder` flags', () => {
   expect(buildRegExp('a').flags).toBe('');
@@ -32,3 +32,24 @@ test('`regexBuilder` flags', () => {
     }).flags,
   ).toBe('gisy');
 });
+
+test('`regexBuilder` throws when using unicode-aware features without `unicode` flag', () => { // 4-digit `\uXXXX` escapes build in any mode; `\u{...}`, `\p{...}` and `\P{...}` need the `unicode` flag.
+  expect(() => buildRegExp(char(0x1234))).not.toThrow();
+  expect(() => buildRegExp(char(0x12345), { unicode: true })).not.toThrow();
+  expect(() => buildRegExp(unicodeProperty('Emoji_Presentation'), { unicode: true })).not.toThrow();
+
+  expect(() => buildRegExp(char(0x123456))).toThrowErrorMatchingInlineSnapshot( // above 0x10ffff, so `char` itself rejects the value
+    `"Expected a valid unicode code point but received 1193046"`,
+  );
+  expect(() => buildRegExp(char(0x12345))).toThrowErrorMatchingInlineSnapshot(
+    `"The pattern "\\u{12345}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`,
+  );
+  expect(() =>
+    buildRegExp(unicodeProperty('Emoji_Presentation')),
+  ).toThrowErrorMatchingInlineSnapshot(
+    `"The pattern "\\p{Emoji_Presentation}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`,
+  );
+  expect(() => buildRegExp(/\P{Letter}/u)).toThrowErrorMatchingInlineSnapshot( // NOTE(review): the literal's own `u` flag does not count; only the builder's `unicode` option is checked
+    `"The pattern "\\P{Letter}" requires Unicode-aware mode. Please ensure the "unicode" flag is set."`,
+  );
+});
diff --git a/src/builders.ts b/src/builders.ts
@@ -11,6 +11,16 @@ import { encode } from './encoder';
 export function buildRegExp(sequence: RegexSequence, flags?: RegexFlags): RegExp { // Encodes `sequence` into a pattern string and compiles it with the requested flags.
   const pattern = encode(sequence).pattern;
   const flagsString = encodeFlags(flags ?? {});
+
+  if (!flags?.unicode) { // the check runs only when the `unicode` flag is absent or falsy
+    const unicodeModePattern = getUnicodeModePattern(pattern); // first `\u{...}`, `\p{...}` or `\P{...}` escape found, or null
+    if (unicodeModePattern) {
+      throw new Error( // fail before constructing the RegExp, naming the offending escape
+        `The pattern "${unicodeModePattern}" requires Unicode-aware mode. Please ensure the "unicode" flag is set.`,
+      );
+    }
+  }
+
   return new RegExp(pattern, flagsString);
 }
 
@@ -32,6 +42,14 @@ function encodeFlags(flags: RegexFlags): string {
   if (flags.hasIndices) result += 'd';
   if (flags.dotAll) result += 's';
   if (flags.sticky) result += 'y';
+  if (flags.unicode) result += 'u';
 
   return result;
 }
+
+const unicodeModePatterns = /(?:\\u|\\p|\\P)\{.+?\}/; // Matches `\u{...}`, `\p{...}` or `\P{...}` (lazy up to the first `}`). NOTE(review): does not check whether the backslash is itself escaped.
+
+function getUnicodeModePattern(pattern: string): string | null { // Returns the first escape in `pattern` matched by `unicodeModePatterns`, or null when there is none.
+  const match = pattern.match(unicodeModePatterns);
+  return match?.[0] ?? null; // `match` is null when nothing matched; index 0 is the whole matched escape
+}
diff --git a/src/constructs/__tests__/char-escape-unicode.test.tsx b/src/constructs/__tests__/char-escape-unicode.test.tsx
@@ -0,0 +1,154 @@
+import {
+  buildRegExp,
+  char,
+  charClass,
+  endOfString,
+  type RegexSequence,
+  startOfString,
+  unicodeProperty,
+} from '../..';
+
+function u(sequence: RegexSequence) { // Test helper: builds `sequence` with the `unicode` flag enabled.
+  return buildRegExp(sequence, { unicode: true });
+}
+
+test('`char` pattern', () => { // Checks the emitted escape: zero-padded `\uXXXX` below 0x10000, `\u{...}` from 0x10000 up.
+  // eslint-disable-next-line no-control-regex
+  expect(char(0)).toEqualRegex(/\u0000/);
+  // eslint-disable-next-line no-control-regex
+  expect(char(0x1)).toEqualRegex(/\u0001/);
+  // eslint-disable-next-line no-control-regex
+  expect(char(0x12)).toEqualRegex(/\u0012/);
+  expect(char(0x123)).toEqualRegex(/\u0123/);
+  expect(char(0x1234)).toEqualRegex(/\u1234/);
+
+  // eslint-disable-next-line no-control-regex
+  expect(u(char(0))).toEqualRegex(new RegExp('\\u0000', 'u'));
+  // eslint-disable-next-line no-control-regex
+  expect(u(char(0x1))).toEqualRegex(new RegExp('\\u0001', 'u'));
+  expect(u(char(0x12))).toEqualRegex(
+    // eslint-disable-next-line no-control-regex
+    new RegExp('\\u0012', 'u'),
+  );
+  expect(char(0x0123)).toEqualRegex(/\u0123/); // NOTE(review): same code point as the 0x123 check above
+  expect(char(0x1234)).toEqualRegex(/\u1234/); // NOTE(review): repeats the 0x1234 check above
+
+  expect(u(char(0x0123))).toEqualRegex(/\u0123/u);
+  expect(u(char(0x1234))).toEqualRegex(/\u1234/u);
+  expect(u(char(0x12345))).toEqualRegex(new RegExp('\\u{12345}', 'u'));
+  expect(u(char(0x103456))).toEqualRegex(new RegExp('\\u{103456}', 'u'));
+});
+
+test('`char` matching', () => { // Checks that `char(codePoint)` matches the corresponding character, with and without the `unicode` flag.
+  expect(char(0)).toMatchString('\u{0}');
+  expect(char(0x1)).toMatchString('\u{1}');
+  expect(char(0x12)).toMatchString('\u{12}}'); // NOTE(review): the string ends with an extra `}`, which looks like a typo; presumably passes only if the matcher allows a partial match - confirm
+  expect(char(0x123)).toMatchString('\u{123}');
+  expect(char(0x1234)).toMatchString('\u{1234}}'); // NOTE(review): same stray `}` as in the 0x12 case above
+
+  expect(char('a'.codePointAt(0)!)).toMatchString('a');
+  expect(char('ą'.codePointAt(0)!)).toMatchString('ą');
+  expect(char('©'.codePointAt(0)!)).toMatchString('©');
+
+  expect(u(char(0))).toMatchString('\u{0}');
+  expect(u(char(0))).not.toMatchString('a');
+  expect(u(char(0x1))).toMatchString('\u{1}');
+  expect(u(char(0x12))).toMatchString('\u{12}');
+  expect(u(char(0x123))).toMatchString('\u{123}');
+  expect(u(char(0x1234))).toMatchString('\u{1234}');
+  expect(u(char(0x12345))).toMatchString('\u{12345}');
+  expect(u(char(0x103456))).toMatchString('\u{103456}');
+
+  expect(u(char('a'.codePointAt(0)!))).toMatchString('a');
+  expect(u(char('ą'.codePointAt(0)!))).toMatchString('ą');
+  expect(u(char('©'.codePointAt(0)!))).toMatchString('©');
+  expect(u(char('😎'.codePointAt(0)!))).toMatchString('😎');
+  expect(u(char('😎'.codePointAt(0)!))).toMatchString('\u{1f60e}');
+});
+
+test('`char` nesting matching', () => { // Checks that `char` escapes can be combined inside `charClass` and still match only the listed characters.
+  expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).toMatchString('a');
+  expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).toMatchString('ą');
+  expect(u(charClass(char('a'.codePointAt(0)!), char('ą'.codePointAt(0)!)))).not.toMatchString('b');
+});
+
+test('`char` edge cases handling', () => { // `char` rejects NaN, non-integers, negatives and values above 0x10ffff; 0x10ffff itself is accepted.
+  expect(() => u(char(NaN))).toThrowErrorMatchingInlineSnapshot(
+    `"Expected a valid unicode code point but received NaN"`,
+  );
+  expect(() => u(char(1.5))).toThrowErrorMatchingInlineSnapshot(
+    `"Expected a valid unicode code point but received 1.5"`,
+  );
+  expect(() => u(char(-1))).toThrowErrorMatchingInlineSnapshot(
+    `"Expected a valid unicode code point but received -1"`,
+  );
+  expect(() => u(char(0x110000))).toThrowErrorMatchingInlineSnapshot(
+    `"Expected a valid unicode code point but received 1114112"`,
+  );
+
+  expect(u(char(0x10ffff))).toEqualRegex(/\u{10ffff}/u); // upper bound of the accepted range
+});
+
+test('`unicodeProperty` pattern', () => { // Checks the emitted `\p{Property}` / `\p{Property=Value}` text for long names, short aliases, scripts and binary properties.
+  expect(u(unicodeProperty('General_Category', 'Letter'))).toEqualRegex(
+    /\p{General_Category=Letter}/u,
+  );
+  expect(u(unicodeProperty('Letter'))).toEqualRegex(/\p{Letter}/u);
+  expect(u(unicodeProperty('L'))).toEqualRegex(/\p{L}/u);
+  expect(u(unicodeProperty('Lu'))).toEqualRegex(/\p{Lu}/u);
+  expect(u(unicodeProperty('Ll'))).toEqualRegex(/\p{Ll}/u);
+  expect(u(unicodeProperty('Lt'))).toEqualRegex(/\p{Lt}/u);
+  expect(u(unicodeProperty('Lm'))).toEqualRegex(/\p{Lm}/u);
+  expect(u(unicodeProperty('Lo'))).toEqualRegex(/\p{Lo}/u);
+
+  expect(u(unicodeProperty('Script', 'Latin'))).toEqualRegex('\\p{Script=Latin}');
+  expect(u(unicodeProperty('Script', 'Grek'))).toEqualRegex('\\p{Script=Grek}');
+  expect(u(unicodeProperty('sc', 'Cyrillic'))).toEqualRegex('\\p{sc=Cyrillic}');
+
+  expect(u(unicodeProperty('Script', 'Thaana'))).toEqualRegex('\\p{Script=Thaana}');
+  expect(u(unicodeProperty('Script_Extensions', 'Thaana'))).toEqualRegex(
+    '\\p{Script_Extensions=Thaana}',
+  );
+  expect(u(unicodeProperty('scx', 'Thaana'))).toEqualRegex('\\p{scx=Thaana}');
+
+  expect(u(unicodeProperty('Emoji'))).toEqualRegex('\\p{Emoji}');
+});
+
+test('`unicodeProperty` matching', () => { // Checks which sample characters each property escape does and does not match in unicode mode.
+  expect(u(unicodeProperty('General_Category', 'Letter'))).toMatchString('A');
+  expect(u(unicodeProperty('Letter'))).toMatchString('A');
+  expect(u(unicodeProperty('L'))).toMatchString('A');
+
+  expect(u(unicodeProperty('Uppercase'))).toMatchString('A');
+  expect(u(unicodeProperty('Uppercase'))).not.toMatchString('a');
+  expect(u(unicodeProperty('Lu'))).toMatchString('A');
+
+  expect(u(unicodeProperty('Lowercase'))).toMatchString('a');
+  expect(u(unicodeProperty('Lowercase'))).not.toMatchString('A');
+  expect(u(unicodeProperty('Ll'))).toMatchString('a');
+
+  expect(u(unicodeProperty('Script', 'Latin'))).toMatchString('A');
+  expect(u(unicodeProperty('Script', 'Latin'))).not.toMatchString('α');
+  expect(u(unicodeProperty('Script', 'Grek'))).toMatchString('α');
+  expect(u(unicodeProperty('Script', 'Grek'))).not.toMatchString('A');
+
+  // Basic emoji
+  expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).toMatchString('😎');
+  expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).toMatchString('🐌');
+
+  // Complex emoji with skin tone modifier
+  expect(u(unicodeProperty('Emoji'))).toMatchString('☝🏼');
+  expect(u([startOfString, unicodeProperty('Emoji'), endOfString])).not.toMatchString('☝🏼'); // presumably fails because one `\p{Emoji}` covers a single code point and this emoji is a two-code-point sequence
+});
+
+test('`unicodeProperty` nesting matching', () => { // Checks that property escapes can be combined inside `charClass`: lowercase or whitespace matches, uppercase does not.
+  expect(u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space')))).toMatchString(
+    'a',
+  );
+  expect(u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space')))).toMatchString(
+    ' ',
+  );
+  expect(
+    u(charClass(unicodeProperty('Lowercase'), unicodeProperty('White_Space'))),
+  ).not.toMatchString('A');
+});
diff --git a/src/constructs/char-class.ts b/src/constructs/char-class.ts
@@ -32,7 +32,7 @@ export function charRange(start: string, end: string): CharacterClass {
 }
 
 export function anyOf(characters: string): CharacterClass {
-  const chars = characters.split('').map((c) => escapeForCharacterClass(c));
+  const chars = characters.split('').map((c) => escapeCharClass(c));
 
   if (chars.length === 0) {
     throw new Error('`anyOf` should received at least one character');
@@ -52,6 +52,6 @@ export function negated(element: CharacterClass | CharacterEscape): EncodedRegex
  */
 export const inverted = negated;
 
-function escapeForCharacterClass(text: string): string {
+function escapeCharClass(text: string): string {
   return text.replace(/[\]\\]/g, '\\$&'); // $& means the whole matched string
 }
diff --git a/src/constructs/char-escape.ts b/src/constructs/char-escape.ts
@@ -59,3 +59,54 @@ export const notWord = nonWord;
  * @deprecated Renamed to `nonWhitespace`.
  */
 export const notWhitespace = nonWhitespace;
+
+/**
+ * Unicode character code point escape.
+ *
+ * Regex pattern:
+ * - `\uXXXX`: 4-digit hex escape for code points below 0x10000.
+ * - `\u{X}`: Unicode code point escape for code points above 0xFFFF.
+ *
+ * Note: for code points above 0xFFFF, the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode).
+ *
+ * The hex digits are emitted in lowercase, zero-padded to four digits in the `\uXXXX` form.
+ *
+ * NOTE(review): `escape` is declared with `let` but never reassigned; `const` would be the idiomatic choice.
+ *
+ * @param codePoint The code point of the character to escape.
+ * @returns A character escape whose `pattern` and single `chars` entry are both the escape string.
+ * @throws {RangeError} If `codePoint` is not an integer between 0 and 0x10FFFF (inclusive).
+ */
+export function char(codePoint: number): CharacterEscape {
+  if (!Number.isInteger(codePoint) || codePoint < 0 || codePoint > 0x10ffff) {
+    throw new RangeError(`Expected a valid unicode code point but received ${codePoint}`);
+  }
+
+  let escape =
+    codePoint < 0x10000
+      ? `\\u${codePoint.toString(16).padStart(4, '0')}` // 4-digit hex (works in all modes)
+      : `\\u{${codePoint.toString(16)}}`; // 1-6 digit hex (requires unicode-aware mode)
+
+  return {
+    precedence: 'atom',
+    pattern: escape,
+    chars: [escape],
+  };
+}
+
+/**
+ * Unicode property escape matching a set of characters specified by a Unicode property.
+ *
+ * Regex pattern: `\p{Property}` or `\p{Property=Value}`
+ * @see https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Unicode_character_class_escape
+ *
+ * Note: the regex must be [unicode-aware](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/RegExp/unicode#unicode-aware_mode).
+ *
+ * The `=Value` part is emitted only when `value` is truthy, so an empty string behaves like an omitted value.
+ * Neither argument is validated here; both are interpolated into the escape as given.
+ *
+ * @param property Unicode property name.
+ * @param value Unicode property value (optional).
+ * @returns A character escape whose `pattern` and single `chars` entry are both the escape string.
+ */
+export function unicodeProperty(property: string, value?: string): CharacterEscape {
+  const escape = `\\p{${property}${value ? `=${value}` : ''}}`;
+
+  return {
+    precedence: 'atom',
+    pattern: escape,
+    chars: [escape],
+  };
+}
diff --git a/src/index.ts b/src/index.ts
@@ -5,35 +5,37 @@ export type { QuantifierOptions } from './constructs/quantifiers';
 export type { RepeatOptions } from './constructs/repeat';
 
 // Builders
-export { buildPattern, buildRegExp } from './builders';
+export { buildRegExp, buildPattern } from './builders';
 
 // Constructs
 export {
+  startOfString,
   endOfString,
+  wordBoundary,
   nonWordBoundary,
   notWordBoundary,
-  startOfString,
-  wordBoundary,
 } from './constructs/anchors';
 export { capture, ref } from './constructs/capture';
-export { anyOf, charClass, charRange, negated, inverted } from './constructs/char-class';
+export { charClass, charRange, anyOf, negated, inverted } from './constructs/char-class';
 export {
   any,
   digit,
   nonDigit,
-  nonWhitespace,
+  word,
   nonWord,
+  whitespace,
+  nonWhitespace,
   notDigit,
   notWhitespace,
   notWord,
-  whitespace,
-  word,
+  char,
+  unicodeProperty,
 } from './constructs/char-escape';
 export { choiceOf } from './constructs/choice-of';
 export { lookahead } from './constructs/lookahead';
 export { lookbehind } from './constructs/lookbehind';
 export { negativeLookahead } from './constructs/negative-lookahead';
 export { negativeLookbehind } from './constructs/negative-lookbehind';
-export { oneOrMore, optional, zeroOrMore } from './constructs/quantifiers';
+export { zeroOrMore, oneOrMore, optional } from './constructs/quantifiers';
 export { regex } from './constructs/regex';
 export { repeat } from './constructs/repeat';