diff --git a/Jenkinsfile b/Jenkinsfile index e9cfcde12..4883d7169 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-13-24-0' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-1' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv new file mode 100644 index 000000000..f9eb081b9 --- /dev/null +++ b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist.tsv @@ -0,0 +1,13 @@ +१/४ पाव +१/२ आधा +३/४ पौन +१:३० डेढ़ बजे +२:३० ढाई बजे +१.५ डेढ़ +२.५ ढाई +कु. कुमारी +स्मि. श्रीमती +श्री. श्री +श्री. श्रीमान +मा. मास्टर +डॉ. डॉक्टर \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_fraction.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_fraction.tsv deleted file mode 100644 index d3596a955..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_fraction.tsv +++ /dev/null @@ -1,3 +0,0 @@ -१/४ पाव -१/२ आधा -३/४ पौन \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_time.tsv b/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_time.tsv deleted file mode 100644 index aaf5baf8b..000000000 --- a/nemo_text_processing/inverse_text_normalization/hi/data/whitelist/whitelist_time.tsv +++ /dev/null @@ -1,2 +0,0 @@ -१:३० डेढ़ -२:३० ढाई \ No newline at end of file diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py index 5267da2bb..a5a371d90 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/tokenize_and_classify.py @@ -34,8 +34,8 @@ from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst +from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst -from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst class ClassifyFst(GraphFst): @@ -83,7 +83,7 @@ def __init__( money = MoneyFst(cardinal, decimal) money_graph = money.fst punct_graph = PunctuationFst().fst - # whitelist_graph = WhiteListFst(input_file=whitelist).fst + whitelist_graph = WhiteListFst().fst word_graph = WordFst().fst classify = ( @@ -96,7 +96,7 @@ def __init__( | pynutil.add_weight(measure_graph, 1.1) | pynutil.add_weight(money_graph, 1.1) | pynutil.add_weight(word_graph, 100) - # | pynutil.add_weight(whitelist_graph, 1.01) + | pynutil.add_weight(whitelist_graph, 1.01) ) punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }") diff --git a/nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py b/nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py index 2d522c4ba..caeab03b1 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py +++ b/nemo_text_processing/inverse_text_normalization/hi/taggers/whitelist.py @@ -47,7 +47,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED, input_file: str = None): super().__init__(name="whitelist", kind="classify") if input_file is None: - input_file = get_abs_path("data/whitelist.tsv") + input_file = get_abs_path("data/whitelist/whitelist.tsv") if not os.path.exists(input_file): raise ValueError(f"Whitelist file {input_file} not found") diff --git a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py index b6f9bd70a..d88bd25d9 100644 --- a/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/inverse_text_normalization/hi/verbalizers/verbalize.py @@ -23,6 +23,7 @@ from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst class VerbalizeFst(GraphFst): @@ -44,12 +45,13 @@ def __init__(self): time_graph = TimeFst().fst measure_graph = MeasureFst(cardinal, decimal).fst money_graph = MoneyFst(cardinal, decimal).fst - + word_graph = WordFst().fst whitelist_graph = WhiteListFst().fst graph = ( cardinal_graph | whitelist_graph + | word_graph | ordinal_graph | decimal_graph | fraction_graph diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt new file mode 100644 index 000000000..30824fced --- /dev/null +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_whitelist.txt @@ -0,0 +1,12 @@ +डेढ़ बजे~१:३० +ढाई बजे~२:३० +मास्टर निखिल तनिष~मा. निखिल तनिष +पाव~१/४ +श्रीमती ज्योत्सना~स्मि. ज्योत्सना +डॉक्टर~डॉ. +आधा कप चाय~१/२ कप चाय +श्रीमान भारत कुमार~श्री. भारत कुमार +डॉक्टर प्रशांत~डॉ. प्रशांत +डेढ़~१.५ +कुमारी~कु. +ढाई~२.५ \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_word.txt b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_word.txt new file mode 100644 index 000000000..ce044e7cf --- /dev/null +++ b/tests/nemo_text_processing/hi/data_inverse_text_normalization/test_cases_word.txt @@ -0,0 +1,15 @@ +नींद~नींद +याहू!~याहू! +-~- +आआआ~आआआ +आकाशगंगा~आकाशगंगा +लटरपटर~लटरपटर +कच्चा-पक्का~कच्चा-पक्का +गुब्बारा~गुब्बारा +चिट्ठी~चिट्ठी +ढूंढना~ढूंढना +लोहे का!~लोहे का! +टाटा~टाटा +~ +झ~झ +संगीत~संगीत \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh index 61093c60d..aec7299d5 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_inverse_text_normalization.sh @@ -63,6 +63,16 @@ testITNMoney() { runtest $input } +testITNWord() { + input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_word.txt + runtest $input +} + +testITNWhiteList() { + input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_whitelist.txt + runtest $input +} + # Load shUnit2 . $PROJECT_DIR/../shunit2/shunit2 diff --git a/tests/nemo_text_processing/hi/test_whitelist.py b/tests/nemo_text_processing/hi/test_whitelist.py index 4a090d823..1e45e6a0e 100644 --- a/tests/nemo_text_processing/hi/test_whitelist.py +++ b/tests/nemo_text_processing/hi/test_whitelist.py @@ -15,6 +15,7 @@ import pytest from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -24,6 +25,7 @@ class TestWhitelist: normalizer = Normalizer( input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False ) + inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_whitelist.txt')) @pytest.mark.run_only_on('CPU') @@ -31,3 +33,10 @@ class TestWhitelist: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_whitelist.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tests/nemo_text_processing/hi/test_word.py b/tests/nemo_text_processing/hi/test_word.py index 4d6bd2261..6fc5883cc 100644 --- a/tests/nemo_text_processing/hi/test_word.py +++ b/tests/nemo_text_processing/hi/test_word.py @@ -15,6 +15,7 @@ import pytest from parameterized import parameterized +from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer from ..utils import CACHE_DIR, parse_test_case_file @@ -24,6 +25,7 @@ class TestWord: normalizer = Normalizer( input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True ) + inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False) @parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_word.txt')) @pytest.mark.run_only_on('CPU') @@ -31,3 +33,10 @@ class TestWord: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True) assert pred == expected + + @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_word.txt')) + @pytest.mark.run_only_on('CPU') + @pytest.mark.unit + def test_denorm(self, test_input, expected): + pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) + assert pred == expected diff --git a/tools/text_processing_deployment/pynini_export.py b/tools/text_processing_deployment/pynini_export.py index 23b1f7deb..6b82dfbec 100644 --- a/tools/text_processing_deployment/pynini_export.py +++ b/tools/text_processing_deployment/pynini_export.py @@ -21,7 +21,7 @@ import pynini -from nemo_text_processing.text_normalization.rw.graph_utils import generator_main +from nemo_text_processing.text_normalization.en.graph_utils import generator_main # This script exports compiled grammars inside nemo_text_processing into OpenFst finite state archive files # tokenize_and_classify.far and verbalize.far for production purposes