Add support for Hindi sentence boundary character (।)

- Update tokenizer to recognize Devanagari danda (।) as a sentence boundary
- Modify word splitting logic to handle Hindi text
- Add test cases for Hindi sentences
- Update README to mention Hindi sentence boundary support
Tessmore · Aug 31, 2024 · fed5c3e
1 parent 6eb7254
commit fed5c3e
Show file tree

Hide file tree

Showing 5 changed files with 28 additions and 7 deletions.
diff --git a/README.md b/README.md
@@ -3,12 +3,13 @@ Sentence Boundary Detection (SBD)
 
 Split text into sentences with a `vanilla` rule based approach (i.e working ~95% of the time).
 
-* Split a text based on period, question- and exclamation marks.
+* Split a text based on period, question- and exclamation marks, and the Hindi sentence boundary character (।).
     * Skips (most) abbreviations (Mr., Mrs., PhD.)
     * Skips numbers/currency
     * Skips urls, websites, email addresses, phone nr.
     * Counts ellipsis and ?! as single punctuation
-
+    * Supports Hindi sentence boundaries
+
 ### Demo
 
 [http://tessmore.github.io/sbd/](http://tessmore.github.io/sbd/)

diff --git a/dist/sbd.js b/dist/sbd.js
@@ -160,12 +160,13 @@ exports.isConcatenated = function(word) {
 
     if ((i = word.indexOf(".")) > -1 ||
         (i = word.indexOf("!")) > -1 ||
-        (i = word.indexOf("?")) > -1)
+        (i = word.indexOf("?")) > -1 ||
+        (i = word.indexOf("।")) > -1)
     {
         var c = word.charAt(i + 1);
 
         // Check if the next word starts with a letter
-        if (c.match(/[a-zA-Z].*/)) {
+        if (c.match(/[a-zA-Z\u0900-\u097F].*/)) {
             return [word.slice(0, i), word.slice(i+1)];
         }
     }
@@ -176,7 +177,8 @@ exports.isConcatenated = function(word) {
 exports.isBoundaryChar = function(word) {
     return word === "." ||
            word === "!" ||
-           word === "?";
+           word === "?" ||
+           word === "।";
 };
 
 },{}],2:[function(require,module,exports){

diff --git a/lib/Match.js b/lib/Match.js
@@ -175,5 +175,6 @@ exports.isConcatenated = function(word) {
 exports.isBoundaryChar = function(word) {
     return word === "." ||
            word === "!" ||
-           word === "?";
+           word === "?" ||
+           word === "।";
 };
diff --git a/lib/tokenizer.js b/lib/tokenizer.js
@@ -13,7 +13,6 @@ var whiteSpaceCheck = new RegExp("\\S", "");
 var addNewLineBoundaries = new RegExp("\\n+|[-#=_+*]{4,}", "g");
 var splitIntoWords = new RegExp("\\S+|\\n", "g");
 
-
 // Split the entry into sentences.
 exports.sentences = function(text, user_options) {
     if (!text || typeof text !== "string" || !text.length) {

diff --git a/test/multiple_sentences.js b/test/multiple_sentences.js
@@ -162,4 +162,22 @@ describe('Multiple sentences', function () {
             assert.equal(sentences.length, 2);
         });
     });
+
+    describe('Hindi sentence boundary character', function () {
+        var entry = "यह पहला वाक्य है। यह दूसरा वाक्य है।";
+        var sentences = tokenizer.sentences(entry);
+
+        it('should get two sentences', function () {
+            assert.equal(sentences.length, 2);
+        });
+    });
+
+    describe('Multiple Hindi sentences', function () {
+        var entry = "यह पहला वाक्य है। यह दूसरा वाक्य है। और यह तीसरा वाक्य है।";
+        var sentences = tokenizer.sentences(entry);
+
+        it('should get three sentences', function () {
+            assert.equal(sentences.length, 3);
+        });
+    });
 });