From 513fff52d51376f397794c4ad9b38ed4042cfafc Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Wed, 27 Nov 2024 15:17:49 +0530 Subject: [PATCH 1/6] Addition of whitelist and word classes Signed-off-by: Tarushi V --- .../hi/data/whitelist/whitelist.tsv | 13 +++++++++++++ .../hi/data/whitelist/whitelist_fraction.tsv | 3 --- .../hi/data/whitelist/whitelist_time.tsv | 2 -- .../hi/taggers/tokenize_and_classify.py | 6 +++--- .../hi/taggers/whitelist.py | 2 +- .../hi/verbalizers/verbalize.py | 4 +++- .../test_cases_whitelist.txt | 12 ++++++++++++ .../test_cases_word.txt | 15 +++++++++++++++ ...test_sparrowhawk_inverse_text_normalization.sh | 10 ++++++++++ tests/nemo_text_processing/hi/test_whitelist.py | 9 +++++++++ tests/nemo_text_processing/hi/test_word.py | 10 ++++++++++ tools/text_processing_deployment/pynini_export.py | 2 +- 12 files changed, 77 insertions(+), 11 deletions(-) create mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv delete mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_fraction.tsv delete mode 100644 nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_time.tsv create mode 100644 tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt create mode 100644 tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_word.txt diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv new file mode 100644 index 000000000..f9eb081b9 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv @@ -0,0 +1,13 @@ +१/४ पाव +१/२ आधा +३/४ पौन +१:३० डेढ़ बजे +२:३० ढाई बजे +१.५ डेढ़ +२.५ ढाई +कु. कुमारी +स्मि. श्रीमती +श्री. श्री +श्री. श्रीमान +मा. मास्टर +डॉ. डॉक्टर \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_fraction.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_fraction.tsv deleted file mode 100644 index d3596a955..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_fraction.tsv +++ /dev/null @@ -1,3 +0,0 @@ -१/४ पाव -१/२ आधा -३/४ पौन \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_time.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_time.tsv deleted file mode 100644 index aaf5baf8b..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_time.tsv +++ /dev/null @@ -1,2 +0,0 @@ -१:३० डेढ़ -२:३० ढाई \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index 5267da2bb..9c8168aa0 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -35,7 +35,7 @@ from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst -from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst class ClassifyFst(GraphFst): @@ -83,7 +83,7 @@ def __init__( money = MoneyFst(cardinal, decimal) money_graph = money.fst punct_graph = PunctuationFst().fst - # whitelist_graph = WhiteListFst(input_file=whitelist).fst + whitelist_graph = WhiteListFst().fst word_graph = WordFst().fst classify = ( @@ -96,7 +96,7 @@ def __init__( | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) | pynutil.add_weight(word_graph, 100) - # | pynutil.add_weight(whitelist_graph, 1.01) + | pynutil.add_weight(whitelist_graph, 1.01) ) punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py index 2d522c4ba..caeab03b1 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py @@ -47,7 +47,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED, input_file: str = None): super().__init__(name="whitelist", kind="classify") if input_file is None: - input_file = get_abs_path("data/whitelist.tsv") + input_file = get_abs_path("data/whitelist/whitelist.tsv") if not os.path.exists(input_file): raise ValueError(f"Whitelist file {input_file} not found") diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index b6f9bd70a..7aaef4fc3 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -22,6 +22,7 @@ from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst @@ -44,12 +45,13 @@ def __init__(self): time_graph = TimeFst().fst measure_graph = MeasureFst(cardinal, decimal).fst money_graph = MoneyFst(cardinal, decimal).fst - + word_graph = WordFst().fst whitelist_graph = WhiteListFst().fst graph = ( cardinal_graph | whitelist_graph + | word_graph | ordinal_graph | decimal_graph | fraction_graph diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..30824fced --- /dev/null +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,12 @@ +डेढ़ बजे~१:३० +ढाई बजे~२:३० +मास्टर निखिल तनिष~मा. निखिल तनिष +पाव~१/४ +श्रीमती ज्योत्सना~स्मि. ज्योत्सना +डॉक्टर~डॉ. +आधा कप चाय~१/२ कप चाय +श्रीमान भारत कुमार~श्री. भारत कुमार +डॉक्टर प्रशांत~डॉ. प्रशांत +डेढ़~१.५ +कुमारी~कु. +ढाई~२.५ \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..ce044e7cf --- /dev/null +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_word.txt @@ -0,0 +1,15 @@ +नींद~नींद +याहू!~याहू! +-~- +आआआ~आआआ +आकाशगंगा~आकाशगंगा +लटरपटर~लटरपटर +कच्चा-पक्का~कच्चा-पक्का +गुब्बारा~गुब्बारा +चिट्ठी~चिट्ठी +ढूंढना~ढूंढना +लोहे का!~लोहे का! +टाटा~टाटा +~ +झ~झ +संगीत~संगीत \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh index 61093c60d..aec7299d5 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh @@ -63,6 +63,16 @@ testITNMoney() { runtest $input } +testITNWord() { + input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_word.txt + runtest $input +} + +testITNWhiteList() { + input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_whitelist.txt + runtest $input +} + # Load shUnit2 . $PROJECT_DIR/../shunit2/shunit2 diff --git a/tests/nemo_text_processing/hi/test_whitelist.py b/tests/nemo_text_processing/hi/test_whitelist.py index 4a090d823..c6a228e6e 100644 --- a/tests/nemo_text_processing/hi/test_whitelist.py +++ b/tests/nemo_text_processing/hi/test_whitelist.py @@ -15,6 +15,7 @@ import pytest from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -24,6 +25,7 @@ class TestWhitelist: normalizer = Normalizer( input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False ) + inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_whitelist.txt')) @pytest.mark.run_only_on('CPU') @@ -31,3 +33,10 @@ class TestWhitelist: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi/test_word.py b/tests/nemo_text_processing/hi/test_word.py index 4d6bd2261..30d809356 100644 --- a/tests/nemo_text_processing/hi/test_word.py +++ b/tests/nemo_text_processing/hi/test_word.py @@ -16,6 +16,7 @@ from parameterized import parameterized from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -24,6 +25,8 @@ class TestWord: normalizer = Normalizer( input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True ) + inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) + @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_word.txt')) @pytest.mark.run_only_on('CPU') @@ -31,3 +34,10 @@ class TestWord: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True) assert pred == expected + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 23b1f7deb..6b82dfbec 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -21,7 +21,7 @@ import pynini -from nemo_text_processing.text_normalization.rw.graph_utils import generator_main +from nemo_text_processing.text_normalization.en.graph_utils import generator_main # This script exports compiled grammars inside nemo_text_processing into OpenFst finite state archive files # tokenize_and_classify.far and verbalize.far for production purposes From 535af69bb96d376cadcd9e8f03eebddc4afc3c06 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 27 Nov 2024 09:52:07 +0000 Subject: [PATCH 2/6] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- .../hi/taggers/tokenize_and_classify.py | 2 +- .../inverse_text_normalization/hi/verbalizers/verbalize.py | 2 +- tests/nemo_text_processing/hi/test_whitelist.py | 2 +- tests/nemo_text_processing/hi/test_word.py | 5 ++--- 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index 9c8168aa0..a5a371d90 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -34,8 +34,8 @@ from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst class ClassifyFst(GraphFst): diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index 7aaef4fc3..d88bd25d9 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -22,8 +22,8 @@ from nemo_text_processing.inverse_text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst -from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst class VerbalizeFst(GraphFst): diff --git a/tests/nemo_text_processing/hi/test_whitelist.py b/tests/nemo_text_processing/hi/test_whitelist.py index c6a228e6e..1e45e6a0e 100644 --- a/tests/nemo_text_processing/hi/test_whitelist.py +++ b/tests/nemo_text_processing/hi/test_whitelist.py @@ -33,7 +33,7 @@ class TestWhitelist: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_whitelist.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit diff --git a/tests/nemo_text_processing/hi/test_word.py b/tests/nemo_text_processing/hi/test_word.py index 30d809356..6fc5883cc 100644 --- a/tests/nemo_text_processing/hi/test_word.py +++ b/tests/nemo_text_processing/hi/test_word.py @@ -15,8 +15,8 @@ import pytest from parameterized import parameterized -from nemo_text_processing.text_normalization.normalize import Normalizer from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer +from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -27,14 +27,13 @@ class TestWord: ) inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) - @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_word.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True) assert pred == expected - + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_word.txt')) @pytest.mark.run_only_on('CPU') @pytest.mark.unit From d4e380fdc8a65a8c9847575b991cf80e6873a50c Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Wed, 27 Nov 2024 16:43:57 +0530 Subject: [PATCH 3/6] Updation of Jenkins date Signed-off-by: Tarushi V --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index e9cfcde12..fe6a75161 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-13-24-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-27-24-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { From 60f87577e4418f35a06d313166864b8575873d6d Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Thu, 28 Nov 2024 15:23:19 +0530 Subject: [PATCH 4/6] Cleanup Signed-off-by: Tarushi V --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index fe6a75161..63fb1a01b 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-27-24-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-28-24-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { From 9aa85c0cf927a7459d0e9ee00c91c109c1df7dc8 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Fri, 29 Nov 2024 15:49:43 +0530 Subject: [PATCH 5/6] Updation Signed-off-by: Tarushi V --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 63fb1a01b..40dd4d626 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-28-24-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { From bf6ebe3c5f5fd8841af8e0176abbbfc8b1116b23 Mon Sep 17 00:00:00 2001 From: Tarushi V Date: Fri, 29 Nov 2024 17:30:48 +0530 Subject: [PATCH 6/6] Updation Signed-off-by: Tarushi V --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index 40dd4d626..4883d7169 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-1' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages {