diff --git a/README.md b/README.md index b1f432d..10120ce 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,13 @@ Sentence Boundary Detection (SBD) Split text into sentences with a `vanilla` rule based approach (i.e working ~95% of the time). -* Split a text based on period, question- and exclamation marks. +* Split a text based on period, question- and exclamation marks, and the Chinese full stop (。). * Skips (most) abbreviations (Mr., Mrs., PhD.) * Skips numbers/currency * Skips urls, websites, email addresses, phone nr. * Counts ellipsis and ?! as single punctuation - + * Supports Chinese sentence boundaries + ### Demo [http://tessmore.github.io/sbd/](http://tessmore.github.io/sbd/) diff --git a/dist/sbd.js b/dist/sbd.js index 65698f4..246bcb5 100644 --- a/dist/sbd.js +++ b/dist/sbd.js @@ -160,12 +160,13 @@ exports.isConcatenated = function(word) { if ((i = word.indexOf(".")) > -1 || (i = word.indexOf("!")) > -1 || - (i = word.indexOf("?")) > -1) + (i = word.indexOf("?")) > -1 || + (i = word.indexOf("。")) > -1) { var c = word.charAt(i + 1); - // Check if the next word starts with a letter - if (c.match(/[a-zA-Z].*/)) { + // Check if the next word starts with a letter or Chinese character + if (c.match(/[a-zA-Z\u4e00-\u9fa5].*/)) { return [word.slice(0, i), word.slice(i+1)]; } } @@ -176,7 +177,8 @@ exports.isConcatenated = function(word) { exports.isBoundaryChar = function(word) { return word === "." || word === "!" || - word === "?"; + word === "?" 
|| + word === "。"; }; },{}],2:[function(require,module,exports){ diff --git a/lib/Match.js b/lib/Match.js index 0172b95..2047ab6 100644 --- a/lib/Match.js +++ b/lib/Match.js @@ -159,12 +159,13 @@ exports.isConcatenated = function(word) { if ((i = word.indexOf(".")) > -1 || (i = word.indexOf("!")) > -1 || - (i = word.indexOf("?")) > -1) + (i = word.indexOf("?")) > -1 || + (i = word.indexOf("。")) > -1) { var c = word.charAt(i + 1); - // Check if the next word starts with a letter - if (c.match(/[a-zA-Z].*/)) { + // Check if the next word starts with a letter or Chinese character + if (c.match(/[a-zA-Z\u4e00-\u9fa5].*/)) { return [word.slice(0, i), word.slice(i+1)]; } } @@ -175,5 +176,6 @@ exports.isConcatenated = function(word) { exports.isBoundaryChar = function(word) { return word === "." || word === "!" || - word === "?"; + word === "?" || + word === "。"; }; diff --git a/test/multiple_sentences.js b/test/multiple_sentences.js index 24b49ed..24b161c 100644 --- a/test/multiple_sentences.js +++ b/test/multiple_sentences.js @@ -162,4 +162,31 @@ describe('Multiple sentences', function () { assert.equal(sentences.length, 2); }); }); + + describe('Chinese sentence boundary character', function () { + var entry = "这是第一句。这是第二句。"; + var sentences = tokenizer.sentences(entry); + + it('should get two sentences', function () { + assert.equal(sentences.length, 2); + }); + }); + + describe('Multiple Chinese sentences', function () { + var entry = "这是第一句。这是第二句。这是第三句。"; + var sentences = tokenizer.sentences(entry); + + it('should get three sentences', function () { + assert.equal(sentences.length, 3); + }); + }); + + describe('Mixed English and Chinese sentences', function () { + var entry = "This is an English sentence. 这是一个中文句子。This is another English sentence."; + var sentences = tokenizer.sentences(entry); + + it('should get three sentences', function () { + assert.equal(sentences.length, 3); + }); + }); }); \ No newline at end of file