From fdbeaa19e2e8a9dd4f57bc69b1d19a17b88523ad Mon Sep 17 00:00:00 2001 From: Siarhei Yankouski Date: Tue, 26 Oct 2021 17:43:30 +0300 Subject: [PATCH] Add support for dotted abbreviations with multiple letters. --- lib/Match.js | 2 +- test/abbr.js | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/lib/Match.js b/lib/Match.js index 0172b95..46c25c4 100644 --- a/lib/Match.js +++ b/lib/Match.js @@ -100,7 +100,7 @@ exports.isTimeAbbreviation = function(word, next) { } exports.isDottedAbbreviation = function(word) { - var matches = word.replace(/[\(\)\[\]\{\}]/g, '').match(/(.\.)*/); + var matches = word.replace(/[\(\)\[\]\{\}]/g, '').match(/((.|[A-Z]+)\.)*/); return matches && matches[0].length > 0; } diff --git a/test/abbr.js b/test/abbr.js index b009827..ede22dd 100644 --- a/test/abbr.js +++ b/test/abbr.js @@ -33,6 +33,15 @@ describe('Abbreviations in sentences', function () { }); }); + describe('Skip dotted abbreviations with multiple capital letters', function () { + const entry = "State-owned lender Caixa Economica Federal (CEF.UL) is planning to list its asset management unit and the payments firm Elo by the beginning of 2022, Chief Executive Pedro Guimaraes said in an interview."; + const sentences = tokenizer.sentences(entry); + + it("should get 1 sentence", function () { + assert.equal(sentences.length, 1); + }); + }); + describe('Skip common abbreviations', function () { const entry = "Fig. 2. displays currency rates i.e. something libsum. Currencies widely available (i.e. euro, dollar, pound), or alternatively (e.g. €, $, etc.)"; const sentences = tokenizer.sentences(entry);