Skip to content

Commit

Permalink
[markdown] Extend Markdown rules to support common CJK formatting grammar (#16)
Browse files Browse the repository at this point in the history

Chinese and Japanese content usually do _not_ include spaces between
formatted and unformatted segments of a single phrase, such as
`**{value}**件の投稿`. But this is technically not valid `strong` formatting
according to the CommonMark spec, since the right flank of the ending
delimiter is a non-space Unicode character.

See more information in the CommonMark discussion here:
https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
commonmark/cmark#208

Because this library is explicitly intended to support many languages
including most Asian languages, we are adding an extension to the
Markdown rules to accommodate these situations. The following tests
assert that the special cases for East Asian languages function in a
logically-similar way to Western languages.

The tests for this change are pretty small, as I'm not fluent in any CJK
language and have relied on suggestions and forum discussions to
enumerate these cases. Most importantly, `**{value}**件の投稿` is now treated as
a **bold** `value` followed by plain text, rather than being completely
ignored.
  • Loading branch information
faultyserver authored Dec 19, 2024
1 parent 7c87eb3 commit 1e4f0ba
Show file tree
Hide file tree
Showing 6 changed files with 104 additions and 15 deletions.
25 changes: 25 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions crates/intl_markdown/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ doctest = false

[dependencies]
bitflags = "2"
cjk = "0.2.5"
intl_markdown_macros = { workspace = true }
serde = { workspace = true }
serde_json = { workspace = true }
Expand Down
22 changes: 19 additions & 3 deletions crates/intl_markdown/src/lexer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ pub(super) struct LexerState {
pub last_was_whitespace: bool,
pub last_was_punctuation: bool,
pub last_was_newline: bool,
pub last_was_cjk_punctuation: bool,
/// True if the last token was entirely an escaped token, which has an
/// effect on whether the next token is considered punctuation or not when
/// computing delimiters.
Expand All @@ -45,6 +46,7 @@ impl LexerState {
last_was_newline: true,
last_was_whitespace: true,
last_was_punctuation: false,
last_was_cjk_punctuation: false,
last_token_was_escape: false,
is_after_newline: true,
}
Expand All @@ -56,6 +58,7 @@ impl LexerState {
self.last_was_whitespace = true;
self.last_was_newline = true;
self.last_was_punctuation = false;
self.last_was_cjk_punctuation = false;
self.last_token_was_escape = false;
self.is_after_newline = true;
}
Expand Down Expand Up @@ -215,9 +218,9 @@ impl<'source> Lexer<'source> {
c if self.state.last_was_newline
&& c.is_ascii_whitespace()
&& self.state.indent_depth > 0 =>
{
self.consume_leading_whitespace()
}
{
self.consume_leading_whitespace()
}
b'\0' => self.consume_byte(SyntaxKind::EOF),
_ => self.consume_verbatim_line(),
}
Expand Down Expand Up @@ -585,6 +588,9 @@ impl<'source> Lexer<'source> {
GeneralCategoryGroup::Punctuation | GeneralCategoryGroup::Symbol
)
});
// https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
// https://github.com/commonmark/cmark/pull/208
let next_is_cjk = next.map_or(false, |c| !c.is_ascii() && cjk::is_cjk_codepoint(c));
let next_is_escaped = matches!(next, Some('\\'));

let mut flags = TokenFlags::default();
Expand All @@ -594,12 +600,18 @@ impl<'source> Lexer<'source> {
if self.state.last_was_punctuation && !self.state.last_token_was_escape {
flags.insert(TokenFlags::HAS_PRECEDING_PUNCTUATION);
}
if self.state.last_was_cjk_punctuation {
flags.insert(TokenFlags::HAS_PRECEDING_CJK_PUNCTUATION)
}
if next_is_whitespace {
flags.insert(TokenFlags::HAS_FOLLOWING_WHITESPACE);
}
if next_is_punctuation && !next_is_escaped {
flags.insert(TokenFlags::HAS_FOLLOWING_PUNCTUATION);
}
if next_is_cjk {
flags.insert(TokenFlags::HAS_FOLLOWING_CJK);
}

self.advance();

Expand Down Expand Up @@ -1043,6 +1055,10 @@ impl<'source> Lexer<'source> {
} else {
last_char.is_ascii_punctuation()
};
// The [cjk] crate treats every ASCII character as CJK punctuation for some reason,
// which we specifically do not want to match here, so the check additionally
// requires that the character is not plain ASCII.
self.state.last_was_cjk_punctuation = !last_char.is_ascii() && cjk::is_cjk_punctuation_codepoint(last_char);

self.state.last_was_newline = last_char == '\n';
self.state.last_was_whitespace = last_char.is_whitespace();
Expand Down
26 changes: 14 additions & 12 deletions crates/intl_markdown/src/parser/delimiter.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use std::ops::Range;

use crate::{delimiter::EmphasisDelimiter, event::Event, SyntaxKind};
use crate::delimiter::Delimiter;
use crate::parser::emphasis::process_emphasis;
use crate::{delimiter::EmphasisDelimiter, event::Event, SyntaxKind};

use super::ICUMarkdownParser;

Expand Down Expand Up @@ -59,22 +59,24 @@ pub(super) fn parse_delimiter_run(p: &mut ICUMarkdownParser, kind: SyntaxKind) -
!first_flags.has_preceding_whitespace()
// 2. Either:
&& (
// - not preceded by a punctuation. OR
!first_flags.has_preceding_punctuation()
// - not preceded by a punctuation. OR
// (CJK extension: preceding CJK punctuation is allowed)
(!first_flags.has_preceding_punctuation() || first_flags.has_preceding_cjk_punctuation())
// - preceded by punctuation but followed by whitespace or punctuation
|| (last_flags.has_following_whitespace() || last_flags.has_following_punctuation())
);
// (CJK extension: following CJK characters are allowed)
|| (last_flags.has_following_whitespace() || last_flags.has_following_punctuation() || last_flags.has_following_cjk())
);

// Left-flanking definition
// 1. Not followed by whitespace AND
let is_left_flanking = !last_flags.has_following_whitespace()
// 2. Either:
&& (
// - not followed by a punctuation. OR
!last_flags.has_following_punctuation()
// - followed by punctuation but preceded by whitespace or punctuation.
|| (first_flags.has_preceding_whitespace() || first_flags.has_preceding_punctuation())
);
// 2. Either:
&& (
// - not followed by a punctuation. OR
!last_flags.has_following_punctuation()
// - followed by punctuation but preceded by whitespace or punctuation.
|| (first_flags.has_preceding_whitespace() || first_flags.has_preceding_punctuation())
);

// Using the determined flanking and context flags and the `kind` of the
// token, determine if it can be used to open and/or close emphasis.
Expand Down
13 changes: 13 additions & 0 deletions crates/intl_markdown/src/token.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,12 @@ bitflags! {
// Only used for some delimiters currently. `ESCAPED` kinds will also
// always have this set.
const IS_ESCAPED = 1 << 6;

// Extension for supporting delimiters around CJK script characters.
// https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
// https://github.com/commonmark/cmark/pull/208
//
// NOTE: these flags previously reused bits 5 and 6, colliding with
// `IS_ESCAPED` (1 << 6) above — setting `HAS_FOLLOWING_CJK` would also
// make `is_escaped()` report true. They now occupy fresh bits past the
// highest previously-used one.
const HAS_PRECEDING_CJK_PUNCTUATION = 1 << 7;
const HAS_FOLLOWING_CJK = 1 << 8;
}
}

Expand All @@ -39,6 +45,13 @@ impl TokenFlags {
pub fn is_escaped(&self) -> bool {
self.contains(TokenFlags::IS_ESCAPED)
}

pub fn has_preceding_cjk_punctuation(&self) -> bool {
self.contains(TokenFlags::HAS_PRECEDING_CJK_PUNCTUATION)
}
pub fn has_following_cjk(&self) -> bool {
self.contains(TokenFlags::HAS_FOLLOWING_CJK)
}
}

#[derive(Clone, PartialEq, Eq)]
Expand Down
32 changes: 32 additions & 0 deletions crates/intl_markdown/tests/md_extensions.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,38 @@
mod harness;

/// Chinese and Japanese content usually do _not_ include spaces between formatted and unformatted
/// segments of a single phrase, such as `**{value}**件の投稿`. But this is technically not valid
/// `strong` formatting according to the CommonMark spec, since the right flank of the ending
/// delimiter is a non-space Unicode character.
///
/// See more information in the CommonMark discussion here:
/// https://talk.commonmark.org/t/emphasis-and-east-asian-text/2491/5
/// https://github.com/commonmark/cmark/pull/208
///
/// Because this library is explicitly intended to support many languages including most Asian
/// languages, we are adding an extension to the Markdown rules to accommodate these situations.
/// The following tests assert that the special cases for East Asian languages function in a
/// logically-similar way to Western languages.
mod asian_punctuation {
use crate::harness::icu_string_test;
// The closing `**` is immediately followed by a CJK character with no
// intervening space; the CJK extension still treats it as a valid closing
// delimiter, so the `{value}` segment renders as bold.
icu_string_test!(
japanese_adjacent_formatting,
"**{value}**件の投稿",
r#"<b>{value}</b>件の投稿"#
);
// With a space after the closing delimiter, standard CommonMark flanking
// rules already accept this; behavior should match the unspaced case.
icu_string_test!(
japanese_spaced_formatting,
"**{value}** 件の投稿",
r#"<b>{value}</b> 件の投稿"#
);
// Western punctuation (parentheses) inside the emphasized span, with a
// Korean character immediately after the closing `*`.
icu_string_test!(
korean_western_punctuation,
"*스크립트(script)*라고",
r#"<i>스크립트(script)</i>라고"#
);
}

mod hooks {
use crate::harness::ast_test;
ast_test!(
Expand Down

0 comments on commit 1e4f0ba

Please sign in to comment.