Addition of whitelist and word classes for Hindi ITN #247

Closed · wants to merge 5 commits
2 changes: 1 addition & 1 deletion Jenkinsfile
@@ -27,7 +27,7 @@ pipeline {
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-13-24-0'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-0'
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
}
stages {
@@ -0,0 +1,13 @@
१/४ पाव
१/२ आधा
३/४ पौन
१:३० डेढ़ बजे
२:३० ढाई बजे
१.५ डेढ़
२.५ ढाई
कु. कुमारी
स्मि. श्रीमती
श्री. श्री
श्री. श्रीमान
मा. मास्टर
डॉ. डॉक्टर

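Aside (not part of the diff): a whitelist TSV like the one above is normally compiled into a transducer with pynini.string_file. A minimal sketch, assuming a local copy named whitelist.tsv and that each row is written<TAB>spoken, as in the data file, so the graph must be inverted for ITN:

import pynini

# Assumption: rows are "written<TAB>spoken" (e.g. "१/४<TAB>पाव"); string_file builds
# a transducer from column 1 to column 2, which is inverted to map spoken -> written.
graph = pynini.string_file("whitelist.tsv")
itn_whitelist = pynini.invert(graph).optimize()

# Rewrite one spoken form to its written form (the pattern used in the NeMo WFST tutorials).
print(pynini.shortestpath("पाव" @ itn_whitelist).string())  # १/४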
This file was deleted.

This file was deleted.

@@ -34,8 +34,8 @@
from nemo_text_processing.inverse_text_normalization.hi.taggers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.punctuation import PunctuationFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.whitelist import WhiteListFst
from nemo_text_processing.inverse_text_normalization.hi.taggers.word import WordFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst


class ClassifyFst(GraphFst):
@@ -83,7 +83,7 @@ def __init__(
money = MoneyFst(cardinal, decimal)
money_graph = money.fst
punct_graph = PunctuationFst().fst
# whitelist_graph = WhiteListFst(input_file=whitelist).fst
whitelist_graph = WhiteListFst().fst
word_graph = WordFst().fst

classify = (
@@ -96,7 +96,7 @@ def __init__(
| pynutil.add_weight(measure_graph, 1.1)
| pynutil.add_weight(money_graph, 1.1)
| pynutil.add_weight(word_graph, 100)
# | pynutil.add_weight(whitelist_graph, 1.01)
| pynutil.add_weight(whitelist_graph, 1.01)
)

punct = pynutil.insert("tokens { ") + pynutil.add_weight(punct_graph, weight=1.1) + pynutil.insert(" }")
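Aside (not part of the diff): the weights passed to pynutil.add_weight are path costs, and tokenization keeps the cheapest analysis, so the whitelist branch at 1.01 now outranks the catch-all word branch at 100 for phrases such as "पाव". A small self-contained illustration of that behaviour:

import pynini
from pynini.lib import pynutil

# Two competing analyses of the same spoken form; the lower weight wins under
# shortest-path search, mirroring whitelist (1.01) vs. word (100) above.
whitelist_path = pynutil.add_weight(pynini.cross("पाव", "१/४"), 1.01)
word_path = pynutil.add_weight(pynini.accep("पाव"), 100)
union = (whitelist_path | word_path).optimize()

print(pynini.shortestpath("पाव" @ union).string())  # १/४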
@@ -47,7 +47,7 @@ def __init__(self, input_case: str = INPUT_LOWER_CASED, input_file: str = None):
super().__init__(name="whitelist", kind="classify")

if input_file is None:
input_file = get_abs_path("data/whitelist.tsv")
input_file = get_abs_path("data/whitelist/whitelist.tsv")

if not os.path.exists(input_file):
raise ValueError(f"Whitelist file {input_file} not found")
@@ -23,6 +23,7 @@
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.ordinal import OrdinalFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.time import TimeFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.whitelist import WhiteListFst
from nemo_text_processing.inverse_text_normalization.hi.verbalizers.word import WordFst


class VerbalizeFst(GraphFst):
@@ -44,12 +45,13 @@ def __init__(self):
time_graph = TimeFst().fst
measure_graph = MeasureFst(cardinal, decimal).fst
money_graph = MoneyFst(cardinal, decimal).fst

word_graph = WordFst().fst
whitelist_graph = WhiteListFst().fst

graph = (
cardinal_graph
| whitelist_graph
| word_graph
| ordinal_graph
| decimal_graph
| fraction_graph
@@ -0,0 +1,12 @@
डेढ़ बजे~१:३०
ढाई बजे~२:३०
मास्टर निखिल तनिष~मा. निखिल तनिष
पाव~१/४
श्रीमती ज्योत्सना~स्मि. ज्योत्सना
डॉक्टर~डॉ.
आधा कप चाय~१/२ कप चाय
श्रीमान भारत कुमार~श्री. भारत कुमार
डॉक्टर प्रशांत~डॉ. प्रशांत
डेढ़~१.५
कुमारी~कु.
ढाई~२.५
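Aside: each line above is a spoken~written pair. The same cases can be exercised interactively with the API the new tests use; a quick check, assuming the Hindi ITN grammars compile locally (or a cache_dir is supplied to skip recompilation):

from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer

inverse_normalizer = InverseNormalizer(lang='hi')
print(inverse_normalizer.inverse_normalize("डेढ़ बजे", verbose=False))            # १:३०
print(inverse_normalizer.inverse_normalize("श्रीमान भारत कुमार", verbose=False))  # श्री. भारत कुमार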
@@ -0,0 +1,15 @@
नींद~नींद
याहू!~याहू!
-~-
आआआ~आआआ
आकाशगंगा~आकाशगंगा
लटरपटर~लटरपटर
कच्चा-पक्का~कच्चा-पक्का
गुब्बारा~गुब्बारा
चिट्ठी~चिट्ठी
ढूंढना~ढूंढना
लोहे का!~लोहे का!
टाटा~टाटा
~
झ~झ
संगीत~संगीत
@@ -63,6 +63,16 @@ testITNMoney() {
runtest $input
}

testITNWord() {
input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_word.txt
runtest $input
}

testITNWhiteList() {
input=$PROJECT_DIR/hi/data_inverse_text_normalization/test_cases_whitelist.txt
runtest $input
}


# Load shUnit2
. $PROJECT_DIR/../shunit2/shunit2
9 changes: 9 additions & 0 deletions tests/nemo_text_processing/hi/test_whitelist.py
@@ -15,6 +15,7 @@
import pytest
from parameterized import parameterized

from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
from nemo_text_processing.text_normalization.normalize import Normalizer

from ..utils import CACHE_DIR, parse_test_case_file
@@ -24,10 +25,18 @@ class TestWhitelist:
normalizer = Normalizer(
input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=False
)
inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)

@parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_whitelist.txt'))
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_norm(self, test_input, expected):
pred = self.normalizer.normalize(test_input, verbose=False)
assert pred.strip() == expected.strip()

@parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_whitelist.txt'))
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_denorm(self, test_input, expected):
pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
assert pred == expected
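Aside: the implementation of ..utils.parse_test_case_file is not shown in this diff; judging from the data files above it splits each line on "~" into an (input, expected) pair. A hypothetical stand-in for local experimentation, not the actual test utility:

# Hypothetical helper mirroring the assumed "input~expected" format of the test case files.
def parse_test_cases(path: str):
    cases = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.rstrip("\n")
            if "~" in line:
                test_input, expected = line.split("~", 1)
                cases.append((test_input, expected))
    return cases

# e.g. parse_test_cases("test_cases_whitelist.txt") -> [("डेढ़ बजे", "१:३०"), ...]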
9 changes: 9 additions & 0 deletions tests/nemo_text_processing/hi/test_word.py
@@ -15,6 +15,7 @@
import pytest
from parameterized import parameterized

from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
from nemo_text_processing.text_normalization.normalize import Normalizer

from ..utils import CACHE_DIR, parse_test_case_file
@@ -24,10 +25,18 @@ class TestWord:
normalizer = Normalizer(
input_case='cased', lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False, post_process=True
)
inverse_normalizer = InverseNormalizer(lang='hi', cache_dir=CACHE_DIR, overwrite_cache=False)

@parameterized.expand(parse_test_case_file('hi/data_text_normalization/test_cases_word.txt'))
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_norm(self, test_input, expected):
pred = self.normalizer.normalize(test_input, verbose=False, punct_post_process=True)
assert pred == expected

@parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_word.txt'))
@pytest.mark.run_only_on('CPU')
@pytest.mark.unit
def test_denorm(self, test_input, expected):
pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False)
assert pred == expected
2 changes: 1 addition & 1 deletion tools/text_processing_deployment/pynini_export.py
@@ -21,7 +21,7 @@

import pynini

from nemo_text_processing.text_normalization.rw.graph_utils import generator_main
from nemo_text_processing.text_normalization.en.graph_utils import generator_main

# This script exports compiled grammars inside nemo_text_processing into OpenFst finite state archive files
# tokenize_and_classify.far and verbalize.far for production purposes
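Aside: with the import corrected to the shared en graph_utils, generator_main is what writes compiled grammars into the .far archives mentioned above. A hedged sketch, assuming generator_main takes a target path plus a dict of named grammars (its usual shape in nemo_text_processing) and using a toy grammar in place of the real Hindi ClassifyFst:

import pynini
from pynini.lib import pynutil
from nemo_text_processing.text_normalization.en.graph_utils import generator_main

# Toy stand-in for the compiled Hindi tokenize-and-classify grammar.
toy_classify = pynutil.add_weight(pynini.cross("पाव", "१/४"), 1.0).optimize()

# Assumption: generator_main(far_path, {grammar_name: fst}) exports each named
# grammar into an OpenFst archive consumed by the deployment pipeline.
generator_main("tokenize_and_classify.far", {"tokenize_and_classify": toy_classify})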