Tessmore · jjroelofs · Aug 31, 2024
diff --git a/README.md b/README.md
@@ -3,12 +3,13 @@ Sentence Boundary Detection (SBD)
 
 Split text into sentences with a `vanilla` rule based approach (i.e working ~95% of the time).
 
-* Split a text based on period, question- and exclamation marks.
+* Split a text based on periods, question marks and exclamation marks, and the Chinese sentence boundary character (。).
     * Skips (most) abbreviations (Mr., Mrs., PhD.)
     * Skips numbers/currency
     * Skips urls, websites, email addresses, phone nr.
     * Counts ellipsis and ?! as single punctuation
-
+    * Supports the Chinese full stop (。) as a sentence boundary
+
 ### Demo
 
 [http://tessmore.github.io/sbd/](http://tessmore.github.io/sbd/)

diff --git a/dist/sbd.js b/dist/sbd.js
@@ -160,12 +160,13 @@ exports.isConcatenated = function(word) {
 
     if ((i = word.indexOf(".")) > -1 ||
         (i = word.indexOf("!")) > -1 ||
-        (i = word.indexOf("?")) > -1)
+        (i = word.indexOf("?")) > -1 ||
+        (i = word.indexOf("。")) > -1)
     {
         var c = word.charAt(i + 1);
 
-        // Check if the next word starts with a letter
-        if (c.match(/[a-zA-Z].*/)) {
+        // Check if the next word starts with a letter or Chinese character
+        if (c.match(/[a-zA-Z\u4e00-\u9fa5].*/)) {
             return [word.slice(0, i), word.slice(i+1)];
         }
     }
@@ -176,7 +177,8 @@ exports.isConcatenated = function(word) {
 exports.isBoundaryChar = function(word) {
     return word === "." ||
            word === "!" ||
-           word === "?";
+           word === "?" ||
+           word === "。";
 };
 
 },{}],2:[function(require,module,exports){

diff --git a/lib/Match.js b/lib/Match.js
@@ -159,12 +159,13 @@ exports.isConcatenated = function(word) {
 
     if ((i = word.indexOf(".")) > -1 ||
         (i = word.indexOf("!")) > -1 ||
-        (i = word.indexOf("?")) > -1)
+        (i = word.indexOf("?")) > -1 ||
+        (i = word.indexOf("。")) > -1)
     {
         var c = word.charAt(i + 1);
 
-        // Check if the next word starts with a letter
-        if (c.match(/[a-zA-Z].*/)) {
+        // Check if the next word starts with a letter or Chinese character
+        if (c.match(/[a-zA-Z\u4e00-\u9fa5].*/)) {
             return [word.slice(0, i), word.slice(i+1)];
         }
     }
@@ -175,5 +176,6 @@ exports.isConcatenated = function(word) {
 exports.isBoundaryChar = function(word) {
     return word === "." ||
            word === "!" ||
-           word === "?";
+           word === "?" ||
+           word === "。";
 };
diff --git a/test/multiple_sentences.js b/test/multiple_sentences.js
@@ -162,4 +162,31 @@ describe('Multiple sentences', function () {
             assert.equal(sentences.length, 2);
         });
     });
+
+    describe('Chinese sentence boundary character', function () {
+        var entry = "这是第一句。这是第二句。";
+        var sentences = tokenizer.sentences(entry);
+
+        it('should get two sentences', function () {
+            assert.equal(sentences.length, 2);
+        });
+    });
+
+    describe('Multiple Chinese sentences', function () {
+        var entry = "这是第一句。这是第二句。这是第三句。";
+        var sentences = tokenizer.sentences(entry);
+
+        it('should get three sentences', function () {
+            assert.equal(sentences.length, 3);
+        });
+    });
+
+    describe('Mixed English and Chinese sentences', function () {
+        var entry = "This is an English sentence. 这是一个中文句子。This is another English sentence.";
+        var sentences = tokenizer.sentences(entry);
+
+        it('should get three sentences', function () {
+            assert.equal(sentences.length, 3);
+        });
+    });
 });