Addition of whitelist and word classes for Hindi ITN #247

Closed · wants to merge 5 commits
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -27,7 +27,7 @@ pipeline {
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-13-24-0'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-0'
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
}
stages {
@@ -0,0 +1,13 @@
१/४ पाव
१/२ आधा
३/४ पौन
१:३० डेढ़ बजे
२:३० ढाई बजे
१.५ डेढ़
२.५ ढाई
कु. कुमारी
स्मि. श्रीमती
श्री. श्री
श्री. श्रीमान
मा. मास्टर
डॉ. डॉक्टर

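Aside (not part of the diff): a whitelist TSV like the one above is normally compiled into a transducer with pynini.string_file. A minimal sketch, assuming a local copy named whitelist.tsv and that each row is written<TAB>spoken, as in the data file, so the graph must be inverted for ITN:

import pynini

# Assumption: rows are "written<TAB>spoken" (e.g. "१/४<TAB>पाव"); string_file builds
# a transducer from column 1 to column 2, which is inverted to map spoken -> written.
graph = pynini.string_file("whitelist.tsv")
itn_whitelist = pynini.invert(graph).optimize()

# Rewrite one spoken form to its written form (the pattern used in the NeMo WFST tutorials).
print(pynini.shortestpath("पाव" @ itn_whitelist).string())  # १/४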
This file was deleted.

This file was deleted.

@@ -34,8 +34,8 @@
from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst


class ClassifyFst(GraphFst):
@@ -83,7 +83,7 @@ def __init__(
money = MoneyFst(cardinal, decimal)
money_graph = money.fst
punct_graph = PunctuationFst().fst
# whitelist_graph = WhiteListFst(input_file=whitelist).fst
whitelist_graph = WhiteListFst().fst
word_graph = WordFst().fst

classify = (
@@ -96,7 +96,7 @@ def __init__(
| pynutil.add_weight(measure_graph, 1.1)
| pynutil.add_weight(money_graph, 1.1)
| pynutil.add_weight(word_graph, 100)
# | pynutil.add_weight(whitelist_graph, 1.01)
| pynutil.add_weight(whitelist_graph, 1.01)
)

punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
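Aside (not part of the diff): the weights passed to pynutil.add_weight are path costs, and tokenization keeps the cheapest analysis, so the whitelist branch at 1.01 now outranks the catch-all word branch at 100 for phrases such as "पाव". A small self-contained illustration of that behaviour:

import pynini
from pynini.lib import pynutil

# Two competing analyses of the same spoken form; the lower weight wins under
# shortest-path search, mirroring whitelist (1.01) vs. word (100) above.
whitelist_path = pynutil.add_weight(pynini.cross("पाव", "१/४"), 1.01)
word_path = pynutil.add_weight(pynini.accep("पाव"), 100)
union = (whitelist_path | word_path).optimize()

print(pynini.shortestpath("पाव" @ union).string())  # १/४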
@@ -47,7 +47,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED, input_file: str = None):
super().__init__(name="whitelist", kind="classify")

if input_file is None:
input_file = get_abs_path("data/whitelist.tsv")
input_file = get_abs_path("data/whitelist/whitelist.tsv")

if not os.path.exists(input_file):
raise ValueError(f"Whitelist file {input_file} not found")
@@ -23,6 +23,7 @@
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst


class VerbalizeFst(GraphFst):
@@ -44,12 +45,13 @@ def __init__(self):
time_graph = TimeFst().fst
measure_graph = MeasureFst(cardinal, decimal).fst
money_graph = MoneyFst(cardinal, decimal).fst

word_graph = WordFst().fst
whitelist_graph = WhiteListFst().fst

graph = (
cardinal_graph
| whitelist_graph
| word_graph
| ordinal_graph
| decimal_graph
| fraction_graph
@@ -0,0 +1,12 @@
डेढ़ बजे~१:३०
ढाई बजे~२:३०
मास्टर निखिल तनिष~मा. निखिल तनिष
पाव~१/४
श्रीमती ज्योत्सना~स्मि. ज्योत्सना
डॉक्टर~डॉ.
आधा कप चाय~१/२ कप चाय
श्रीमान भारत कुमार~श्री. भारत कुमार
डॉक्टर प्रशांत~डॉ. प्रशांत
डेढ़~१.५
कुमारी~कु.
ढाई~२.५
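Aside: each line above is a spoken~written pair. The same cases can be exercised interactively with the API the new tests use; a quick check, assuming the Hindi ITN grammars compile locally (or a cache_dir is supplied to skip recompilation):

from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer

inverse_normalizer = InverseNormalizer(lang='hi')
print(inverse_normalizer.inverse_normalize("डेढ़ बजे", verbose=False))            # १:३०
print(inverse_normalizer.inverse_normalize("श्रीमान भारत कुमार", verbose=False))  # श्री. भारत कुमार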
@@ -0,0 +1,15 @@
नींद~नींद
याहू!~याहू!
-~-
आआआ~आआआ
आकाशगंगा~आकाशगंगा
लटरपटर~लटरपटर
कच्चा-पक्का~कच्चा-पक्का
गुब्बारा~गुब्बारा
चिट्ठी~चिट्ठी
ढूंढना~ढूंढना
लोहे का!~लोहे का!
टाटा~टाटा
~
झ~झ
संगीत~संगीत
@@ -63,6 +63,16 @@ testITNMoney() {
runtest $input
}

testITNWord() {
input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_word.txt
runtest $input
}

testITNWhiteList() {
input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_whitelist.txt
runtest $input
}


# Load shUnit2
. $PROJECT_DIR/../shunit2/shunit2
9 changes: 9 additions & 0 deletions tests/nemo_text_processing/hi/test_whitelist.py
@@ -15,6 +15,7 @@
import pytest
from parameterized import parameterized

from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
from nemo_text_processing.text_normalization.normalize import Normalizer

from ..utils import CACHE_DIR, parse_test_case_file
@@ -24,10 +25,18 @@ class TestWhitelist:
normalizer = Normalizer(
input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False
)
inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)

@parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_whitelist.txt'))
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_norm(self, test_input, expected):
pred = self.normalizer.normalize(test_input, verbose=False)
assert pred.strip() == expected.strip()

@parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_whitelist.txt'))
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_denorm(self, test_input, expected):
pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
assert pred == expected
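Aside: the implementation of ..utils.parse_test_case_file is not shown in this diff; judging from the data files above it splits each line on "~" into an (input, expected) pair. A hypothetical stand-in for local experimentation, not the actual test utility:

# Hypothetical helper mirroring the assumed "input~expected" format of the test case files.
def parse_test_cases(path: str):
    cases = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if "~" in line:
                test_input, expected = line.split("~", 1)
                cases.append((test_input, expected))
    return cases

# e.g. parse_test_cases("test_cases_whitelist.txt") -> [("डेढ़ बजे", "१:३०"), ...]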
9 changes: 9 additions & 0 deletions tests/nemo_text_processing/hi/test_word.py
@@ -15,6 +15,7 @@
import pytest
from parameterized import parameterized

from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
from nemo_text_processing.text_normalization.normalize import Normalizer

from ..utils import CACHE_DIR, parse_test_case_file
@@ -24,10 +25,18 @@ class TestWord:
normalizer = Normalizer(
input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True
)
inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)

@parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_word.txt'))
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_norm(self, test_input, expected):
pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True)
assert pred == expected

@parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_word.txt'))
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_denorm(self, test_input, expected):
pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
assert pred == expected
2 changes: 1 addition & 1 deletion tools/text_processing_deployment/pynini_export.py
@@ -21,7 +21,7 @@

import pynini

from nemo_text_processing.text_normalization.rw.graph_utils import generator_main
from nemo_text_processing.text_normalization.en.graph_utils import generator_main

# This script exports compiled grammars inside nemo_text_processing into OpenFst finite state archive files
# tokenize_and_classify.far and verbalize.far for production purposes
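Aside: with the import corrected to the shared en graph_utils, generator_main is what writes compiled grammars into the .far archives mentioned above. A hedged sketch, assuming generator_main takes a target path plus a dict of named grammars (its usual shape in nemo_text_processing) and using a toy grammar in place of the real Hindi ClassifyFst:

import pynini
from pynini.lib import pynutil
from nemo_text_processing.text_normalization.en.graph_utils import generator_main

# Toy stand-in for the compiled Hindi tokenize-and-classify grammar.
toy_classify = pynutil.add_weight(pynini.cross("पाव", "१/४"), 1.0).optimize()

# Assumption: generator_main(far_path, {grammar_name: fst}) exports each named
# grammar into an OpenFst archive consumed by the deployment pipeline.
generator_main("tokenize_and_classify.far", {"tokenize_and_classify": toy_classify})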