Skip to content

Commit

Permalink
Add support for Hindi sentence boundary character (।)
Browse files Browse the repository at this point in the history
- Update tokenizer to recognize Devanagari danda (।) as a sentence boundary
- Modify word splitting logic to handle Hindi text
- Add test cases for Hindi sentences
- Update README to mention Hindi sentence boundary support
  • Loading branch information
Jurriaan Roelofs committed Aug 31, 2024
1 parent 6eb7254 commit fed5c3e
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 7 deletions.
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,13 @@ Sentence Boundary Detection (SBD)

Split text into sentences with a `vanilla` rule-based approach (i.e., it works ~95% of the time).

* Split a text based on period, question- and exclamation marks.
* Split a text based on period, question- and exclamation marks, and the Hindi sentence boundary character (।).
* Skips (most) abbreviations (Mr., Mrs., PhD.)
* Skips numbers/currency
* Skips URLs, websites, email addresses, and phone numbers.
* Counts ellipsis and ?! as single punctuation

* Supports Hindi sentence boundaries

### Demo

[http://tessmore.github.io/sbd/](http://tessmore.github.io/sbd/)
Expand Down
8 changes: 5 additions & 3 deletions dist/sbd.js
Original file line number Diff line number Diff line change
Expand Up @@ -160,12 +160,13 @@ exports.isConcatenated = function(word) {

if ((i = word.indexOf(".")) > -1 ||
(i = word.indexOf("!")) > -1 ||
(i = word.indexOf("?")) > -1)
(i = word.indexOf("?")) > -1 ||
(i = word.indexOf("।")) > -1)
{
var c = word.charAt(i + 1);

// Check if the next word starts with a letter
if (c.match(/[a-zA-Z].*/)) {
if (c.match(/[a-zA-Z\u0900-\u097F].*/)) {
return [word.slice(0, i), word.slice(i+1)];
}
}
Expand All @@ -176,7 +177,8 @@ exports.isConcatenated = function(word) {
exports.isBoundaryChar = function(word) {
return word === "." ||
word === "!" ||
word === "?";
word === "?" ||
word === "।";
};

},{}],2:[function(require,module,exports){
Expand Down
3 changes: 2 additions & 1 deletion lib/Match.js
Original file line number Diff line number Diff line change
Expand Up @@ -175,5 +175,6 @@ exports.isConcatenated = function(word) {
exports.isBoundaryChar = function(word) {
return word === "." ||
word === "!" ||
word === "?";
word === "?" ||
word === "।";
};
1 change: 0 additions & 1 deletion lib/tokenizer.js
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@ var whiteSpaceCheck = new RegExp("\\S", "");
// Global pattern: a run of one or more newlines, or four or more consecutive
// characters drawn from the set - # = _ + *
var addNewLineBoundaries = /\n+|[-#=_+*]{4,}/g;

// Global pattern: each run of non-whitespace characters, or a single newline.
var splitIntoWords = /\S+|\n/g;


// Split the entry into sentences.
exports.sentences = function(text, user_options) {
if (!text || typeof text !== "string" || !text.length) {
Expand Down
18 changes: 18 additions & 0 deletions test/multiple_sentences.js
Original file line number Diff line number Diff line change
Expand Up @@ -162,4 +162,22 @@ describe('Multiple sentences', function () {
assert.equal(sentences.length, 2);
});
});

// Two Hindi sentences, each terminated by the Devanagari danda (।).
describe('Hindi sentence boundary character', function () {
    // Tokenized once, when the suite is defined.
    var hindiSentences = tokenizer.sentences("यह पहला वाक्य है। यह दूसरा वाक्य है।");

    it('should get two sentences', function () {
        assert.equal(hindiSentences.length, 2);
    });
});

// Three Hindi sentences, each terminated by the Devanagari danda (।).
describe('Multiple Hindi sentences', function () {
    // Tokenized once, when the suite is defined.
    var hindiSentences = tokenizer.sentences("यह पहला वाक्य है। यह दूसरा वाक्य है। और यह तीसरा वाक्य है।");

    it('should get three sentences', function () {
        assert.equal(hindiSentences.length, 3);
    });
});
});

0 comments on commit fed5c3e

Please sign in to comment.