Skip to content

Commit

Permalink
[markdown] Extend Markdown rules to support common CJK formatting grammar (#16)
Browse files Browse the repository at this point in the history

Chinese and Japanese content usually do _not_ include spaces between
formatted and unformatted segments of a single phrase, such as
`**{value}**件の投稿`. But this is technically not valid `strong` formatting
according to the CommonMark spec, since the right flank of the ending
delimiter is a non-space Unicode character.

See more information in the CommonMark discussion here:
https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
commonmark/cmark#208

Because this library is explicitly intended to support many languages
including most Asian languages, we are adding an extension to the
Markdown rules to accommodate these situations. The following tests
assert that the special cases for East Asian languages function in a
logically-similar way to Western languages.

The tests for this change are pretty small, as I'm not fluent in any CJK
language and have relied on suggestions and forum discussions to
enumerate these cases. Most importantly, `**{value}**件の投稿` is now treated as
a **bold** `value` followed by plain text, rather than being completely
ignored.
  • Loading branch information
faultyserver authored Dec 19, 2024
1 parent 7c87eb3 commit 1e4f0ba
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 15 deletions.
25 changes: 25 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/intl_markdown/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ doctest = false

[dependencies]
bitflags = "2"
cjk = "0.2.5"
intl_markdown_macros = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
Expand Down
22 changes: 19 additions & 3 deletions crates/intl_markdown/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub(super) struct LexerState {
pub last_was_whitespace: bool,
pub last_was_punctuation: bool,
pub last_was_newline: bool,
pub last_was_cjk_punctuation: bool,
/// True if the last token was entirely an escaped token, which has an
/// effect on whether the next token is considered punctuation or not when
/// computing delimiters.
Expand All @@ -45,6 +46,7 @@ impl LexerState {
last_was_newline: true,
last_was_whitespace: true,
last_was_punctuation: false,
last_was_cjk_punctuation: false,
last_token_was_escape: false,
is_after_newline: true,
}
Expand All @@ -56,6 +58,7 @@ impl LexerState {
self.last_was_whitespace = true;
self.last_was_newline = true;
self.last_was_punctuation = false;
self.last_was_cjk_punctuation = false;
self.last_token_was_escape = false;
self.is_after_newline = true;
}
Expand Down Expand Up @@ -215,9 +218,9 @@ impl<'source> Lexer<'source> {
c if self.state.last_was_newline
&& c.is_ascii_whitespace()
&& self.state.indent_depth > 0 =>
{
self.consume_leading_whitespace()
}
{
self.consume_leading_whitespace()
}
b'\0' => self.consume_byte(SyntaxKind::EOF),
_ => self.consume_verbatim_line(),
}
Expand Down Expand Up @@ -585,6 +588,9 @@ impl<'source> Lexer<'source> {
GeneralCategoryGroup::Punctuation | GeneralCategoryGroup::Symbol
)
});
// https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
// https://github.com/commonmark/cmark/pull/208
let next_is_cjk = next.map_or(false, |c| !c.is_ascii() && cjk::is_cjk_codepoint(c));
let next_is_escaped = matches!(next, Some('\\'));

let mut flags = TokenFlags::default();
Expand All @@ -594,12 +600,18 @@ impl<'source> Lexer<'source> {
if self.state.last_was_punctuation && !self.state.last_token_was_escape {
flags.insert(TokenFlags::HAS_PRECEDING_PUNCTUATION);
}
if self.state.last_was_cjk_punctuation {
flags.insert(TokenFlags::HAS_PRECEDING_CJK_PUNCTUATION)
}
if next_is_whitespace {
flags.insert(TokenFlags::HAS_FOLLOWING_WHITESPACE);
}
if next_is_punctuation && !next_is_escaped {
flags.insert(TokenFlags::HAS_FOLLOWING_PUNCTUATION);
}
if next_is_cjk {
flags.insert(TokenFlags::HAS_FOLLOWING_CJK);
}

self.advance();

Expand Down Expand Up @@ -1043,6 +1055,10 @@ impl<'source> Lexer<'source> {
} else {
last_char.is_ascii_punctuation()
};
// The [cjk] crate treats every ASCII character as CJK punctuation for some reason,
// which we specifically do not want to match here, so the check additionally
// requires that the character is not plain ASCII.
self.state.last_was_cjk_punctuation = !last_char.is_ascii() && cjk::is_cjk_punctuation_codepoint(last_char);

self.state.last_was_newline = last_char == '\n';
self.state.last_was_whitespace = last_char.is_whitespace();
Expand Down
26 changes: 14 additions & 12 deletions crates/intl_markdown/src/parser/delimiter.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use std::ops::Range;

use crate::{delimiter::EmphasisDelimiter, event::Event, SyntaxKind};
use crate::delimiter::Delimiter;
use crate::parser::emphasis::process_emphasis;
use crate::{delimiter::EmphasisDelimiter, event::Event, SyntaxKind};

use super::ICUMarkdownParser;

Expand Down Expand Up @@ -59,22 +59,24 @@ pub(super) fn parse_delimiter_run(p: &mut ICUMarkdownParser, kind: SyntaxKind) -
!first_flags.has_preceding_whitespace()
// 2. Either:
&& (
// - not preceded by a punctuation. OR
!first_flags.has_preceding_punctuation()
// - not preceded by a punctuation. OR
// (CJK extension: preceding CJK punctuation is allowed)
(!first_flags.has_preceding_punctuation() || first_flags.has_preceding_cjk_punctuation())
// - preceded by punctuation but followed by whitespace or punctuation
|| (last_flags.has_following_whitespace() || last_flags.has_following_punctuation())
);
// (CJK extension: following CJK characters are allowed)
|| (last_flags.has_following_whitespace() || last_flags.has_following_punctuation() || last_flags.has_following_cjk())
);

// Left-flanking definition
// 1. Not followed by whitespace AND
let is_left_flanking = !last_flags.has_following_whitespace()
// 2. Either:
&& (
// - not followed by a punctuation. OR
!last_flags.has_following_punctuation()
// - followed by punctuation but preceded by whitespace or punctuation.
|| (first_flags.has_preceding_whitespace() || first_flags.has_preceding_punctuation())
);
// 2. Either:
&& (
// - not followed by a punctuation. OR
!last_flags.has_following_punctuation()
// - followed by punctuation but preceded by whitespace or punctuation.
|| (first_flags.has_preceding_whitespace() || first_flags.has_preceding_punctuation())
);

// Using the determined flanking and context flags and the `kind` of the
// token, determine if it can be used to open and/or close emphasis.
Expand Down
13 changes: 13 additions & 0 deletions crates/intl_markdown/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ bitflags! {
// Only used for some delimiters currently. `ESCAPED` kinds will also
// always have this set.
const IS_ESCAPED = 1 << 6;

// Extension for supporting delimiters around CJK script characters.
// https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
// https://github.com/commonmark/cmark/pull/208
//
// NOTE: these flags previously reused bits 5 and 6, colliding with
// `IS_ESCAPED` (1 << 6) above — setting `HAS_FOLLOWING_CJK` would also
// make `is_escaped()` report true. They now occupy fresh bits past the
// highest previously-used one.
const HAS_PRECEDING_CJK_PUNCTUATION = 1 << 7;
const HAS_FOLLOWING_CJK = 1 << 8;
}
}

Expand All @@ -39,6 +45,13 @@ impl TokenFlags {
pub fn is_escaped(&self) -> bool {
self.contains(TokenFlags::IS_ESCAPED)
}

pub fn has_preceding_cjk_punctuation(&self) -> bool {
self.contains(TokenFlags::HAS_PRECEDING_CJK_PUNCTUATION)
}
pub fn has_following_cjk(&self) -> bool {
self.contains(TokenFlags::HAS_FOLLOWING_CJK)
}
}

#[derive(Clone, PartialEq, Eq)]
Expand Down
32 changes: 32 additions & 0 deletions crates/intl_markdown/tests/md_extensions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,38 @@
mod harness;

/// Chinese and Japanese content usually do _not_ include spaces between formatted and unformatted
/// segments of a single phrase, such as `**{value}**件の投稿`. But this is technically not valid
/// `strong` formatting according to the CommonMark spec, since the right flank of the ending
/// delimiter is a non-space Unicode character.
///
/// See more information in the CommonMark discussion here:
/// https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
/// https://github.com/commonmark/cmark/pull/208
///
/// Because this library is explicitly intended to support many languages including most Asian
/// languages, we are adding an extension to the Markdown rules to accommodate these situations.
/// The following tests assert that the special cases for East Asian languages function in a
/// logically-similar way to Western languages.
mod asian_punctuation {
use crate::harness::icu_string_test;
// The closing `**` is immediately followed by a CJK character with no
// intervening space; the CJK extension still treats it as a valid closing
// delimiter, so the `{value}` segment renders as bold.
icu_string_test!(
japanese_adjacent_formatting,
"**{value}**件の投稿",
r#"<b>{value}</b>件の投稿"#
);
// With a space after the closing delimiter, standard CommonMark flanking
// rules already accept this; behavior should match the unspaced case.
icu_string_test!(
japanese_spaced_formatting,
"**{value}** 件の投稿",
r#"<b>{value}</b> 件の投稿"#
);
// Western punctuation (parentheses) inside the emphasized span, with a
// Korean character immediately after the closing `*`.
icu_string_test!(
korean_western_punctuation,
"*스크립트(script)*라고",
r#"<i>스크립트(script)</i>라고"#
);
}

mod hooks {
use crate::harness::ast_test;
ast_test!(
Expand Down

0 comments on commit 1e4f0ba

Please sign in to comment.