diff --git a/Jenkinsfile b/Jenkinsfile index 6edad14a2..567a72827 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -27,7 +27,7 @@ pipeline { HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0' MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1' JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1' - HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-1' + HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/01-15-25-0' DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0' } stages { @@ -104,7 +104,7 @@ pipeline { parallel { stage('L0: Hi TN grammars') { steps { - sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --lang=hi --text="१" --cache_dir ${HI_TN_CACHE}' + sh 'CUDA_VISIBLE_DEVICES="" python nemo_text_processing/text_normalization/normalize.py --text="१" --cache_dir ${HI_TN_CACHE}' } } stage('L0: Hi ITN grammars') { diff --git a/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv b/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv new file mode 100644 index 000000000..7da791489 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/date/year_suffix.tsv @@ -0,0 +1,3 @@ +ई. पू. ईसा पूर्व +ई. ईसवी +तक तक \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv index 0bf561379..189512687 100644 --- a/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv +++ b/nemo_text_processing/text_normalization/hi/data/measure/unit.tsv @@ -141,14 +141,16 @@ month महीना months महीने ct कैरेट pH पीएच +km/h किलोमीटर प्रति घंटा km/hr किलोमीटर प्रति घंटा km/min किलोमीटर प्रति मिनट +m/h मीटर प्रति घंटा m/hr मीटर प्रति घंटा mi/s मील प्रति सेकंड +mi/h मील प्रति घंटा mi/hr मील प्रति घंटा mi/min मील प्रति मिनट ₹/ac रुपए प्रति एकड़ x बाई X बाई * बाई -- से diff --git a/nemo_text_processing/text_normalization/hi/data/money/currency.tsv b/nemo_text_processing/text_normalization/hi/data/money/currency.tsv index 88633ec7c..8f4a955cc 100644 --- a/nemo_text_processing/text_normalization/hi/data/money/currency.tsv +++ b/nemo_text_processing/text_normalization/hi/data/money/currency.tsv @@ -1,5 +1,4 @@ ₹ रुपए -P पैसे £ पाउंड ₩ वॉन $ डॉलर @@ -7,4 +6,4 @@ $ डॉलर ৳ टका ¥ येन ₦ नाइरा -€ यूरो +€ यूरो \ No newline at end of file diff --git a/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.py b/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.py new file mode 100644 index 000000000..0db876c08 --- /dev/null +++ b/nemo_text_processing/text_normalization/hi/data/money/major_minor_currencies.py @@ -0,0 +1,11 @@ +major_minor_currencies = { + "रुपए": "पैसे", + "पाउंड": "पेंस", + "वॉन": "जिओन", + "डॉलर": "सेंट", + "लीरा": "कुरस", + "टका": "पैसे", + "येन": "सेन", + "नाइरा": "कोबो", + "यूरो": "सेंट", +} diff --git a/nemo_text_processing/text_normalization/hi/data/time/hours.tsv b/nemo_text_processing/text_normalization/hi/data/time/hours.tsv index d5e85a784..dd8623284 100644 --- a/nemo_text_processing/text_normalization/hi/data/time/hours.tsv +++ b/nemo_text_processing/text_normalization/hi/data/time/hours.tsv @@ -1,3 +1,4 @@ +० शून्य १ एक २ दो ३ तीन diff --git a/nemo_text_processing/text_normalization/hi/graph_utils.py b/nemo_text_processing/text_normalization/hi/graph_utils.py index ced1b8949..37b145918 100644 --- 
a/nemo_text_processing/text_normalization/hi/graph_utils.py
+++ b/nemo_text_processing/text_normalization/hi/graph_utils.py
@@ -21,6 +21,7 @@
 import pynini
 from pynini import Far
+from pynini.examples import plurals
 from pynini.export import export
 from pynini.lib import byte, pynutil, utf8
@@ -99,6 +100,30 @@ def generator_main(file_name: str, graphs: Dict[str, 'pynini.FstLike']):
     logging.info(f'Created {file_name}')


+def get_plurals(fst):
+    """
+    Given an FST of singular forms, returns the corresponding plural forms.
+
+    Args:
+        fst: Fst
+
+    Returns plurals of the given singular forms
+    """
+    return SINGULAR_TO_PLURAL @ fst
+
+
+def get_singulars(fst):
+    """
+    Given an FST of plural forms, returns the corresponding singular forms.
+
+    Args:
+        fst: Fst
+
+    Returns singulars of the given plural forms
+    """
+    return PLURAL_TO_SINGULAR @ fst
+
+
 def convert_space(fst) -> 'pynini.FstLike':
     """
     Converts space to nonbreaking space.
@@ -113,6 +138,44 @@ def convert_space(fst) -> 'pynini.FstLike':
     return fst @ pynini.cdrewrite(pynini.cross(NEMO_SPACE, NEMO_NON_BREAKING_SPACE), "", "", NEMO_SIGMA)


+def string_map_cased(input_file: str, input_case: str = INPUT_LOWER_CASED):
+    labels = load_labels(input_file)
+
+    if input_case == INPUT_CASED:
+        additional_labels = []
+        for written, spoken, *weight in labels:
+            written_capitalized = written[0].upper() + written[1:]
+            additional_labels.extend(
+                [
+                    [written_capitalized, spoken.capitalize()],  # first letter capitalized
+                    [
+                        written_capitalized,
+                        spoken.upper().replace(" AND ", " and "),
+                    ],  # add pairs with all letters capitalized
+                ]
+            )
+
+            spoken_no_space = spoken.replace(" ", "")
+            # add abbreviations without spaces (both lower and upper case), i.e. "BMW" not "B M W"
+            if len(spoken) == (2 * len(spoken_no_space) - 1):
+                logging.debug(f"This is weight {weight}")
+                if len(weight) == 0:
+                    additional_labels.extend(
+                        [[written, spoken_no_space], [written_capitalized, spoken_no_space.upper()]]
+                    )
+                else:
+                    additional_labels.extend(
+                        [
+                            [written, spoken_no_space, weight[0]],
+                            [written_capitalized, spoken_no_space.upper(), weight[0]],
+                        ]
+                    )
+        labels += additional_labels
+
+    whitelist = pynini.string_map(labels).invert().optimize()
+    return whitelist
+
+
 class GraphFst:
     """
     Base class for all grammar fsts.
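The two helpers above assume SINGULAR_TO_PLURAL / PLURAL_TO_SINGULAR maps defined elsewhere in graph_utils.py. As a quick illustration of the composition pattern they rely on (not part of the patch; the Devanagari pairs below are made up for the example and stand in for the real map):

```python
import pynini
from pynini.lib import rewrite

# Toy stand-in for SINGULAR_TO_PLURAL; the real map must be built in graph_utils.py.
TOY_SINGULAR_TO_PLURAL = pynini.string_map([("रुपया", "रुपए"), ("घंटा", "घंटे")])


def toy_get_plurals(fst):
    # Same shape as the new helper: compose the singular-to-plural map over the given FST.
    return TOY_SINGULAR_TO_PLURAL @ fst


units = pynini.accep("रुपए") | pynini.accep("घंटे")
print(rewrite.top_rewrite("रुपया", toy_get_plurals(units)))  # रुपए
```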
diff --git a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py
index fe3ad9a1d..229a51b9a 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/cardinal.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/cardinal.py
@@ -13,10 +13,10 @@
 # limitations under the License.

 import pynini
-from pynini.lib import pynutil
+from pynini.lib import pynutil, rewrite

-from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst
-from nemo_text_processing.text_normalization.hi.utils import get_abs_path
+from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space
+from nemo_text_processing.text_normalization.hi.utils import apply_fst, get_abs_path


 class CardinalFst(GraphFst):
diff --git a/nemo_text_processing/text_normalization/hi/taggers/date.py b/nemo_text_processing/text_normalization/hi/taggers/date.py
index 19aaf3139..1a7b0f97e 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/date.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/date.py
@@ -14,7 +14,6 @@
 import pynini
 from pynini.lib import pynutil
-
 from nemo_text_processing.text_normalization.hi.graph_utils import (
     NEMO_HI_DIGIT,
     NEMO_HI_NON_ZERO,
@@ -26,6 +25,7 @@
 days = pynini.string_file(get_abs_path("data/date/days.tsv"))
 months = pynini.string_file(get_abs_path("data/date/months.tsv"))
+year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv"))


 class DateFst(GraphFst):
@@ -62,12 +62,17 @@ def __init__(self, cardinal: GraphFst):

         years_graph = pynutil.insert("year: \"") + graph_year + pynutil.insert("\"") + insert_space

-        graph_dd_mm = days_graph + delete_dash + months_graph
+        graph_dd_mm = days_graph + (delete_dash | pynini.accep("")) + months_graph

-        graph_mm_dd = months_graph + delete_dash + days_graph
+        graph_mm_dd = months_graph + (delete_dash | pynini.accep("")) + days_graph
         graph_mm_dd += pynutil.insert(" preserve_order: true ")

+        # Graph for era
+        era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space
+
+        range_graph = pynini.cross("-", "से")
+
         graph_dd_mm_yyyy = (
             days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph
         )
@@ -78,7 +83,22 @@
         graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ")

-        graph_mm_yyyy = months_graph + delete_dash + years_graph
+        graph_mm_yyyy = (
+            months_graph + (delete_dash | pynini.accep("")) + years_graph + pynutil.insert(" preserve_order: true ")
+        )
+
+        graph_year_suffix = era_graph
+
+        graph_range = (
+            pynutil.insert("text: \"")
+            + (cardinal.final_graph | graph_year)
+            + insert_space
+            + range_graph
+            + insert_space
+            + (cardinal.final_graph | graph_year)
+            + pynutil.insert("\"")
+            + pynutil.insert(" preserve_order: true ")
+        )

         # default assume dd_mm_yyyy

@@ -88,6 +108,8 @@
             | pynutil.add_weight(graph_dd_mm_yyyy, -0.001)
             | graph_mm_dd_yyyy
             | graph_mm_yyyy
+            | pynutil.add_weight(graph_year_suffix, -0.001)
+            | pynutil.add_weight(graph_range, -0.005)
         )

         self.final_graph = final_graph.optimize()
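For context, the new graph_range branch is what drives the "२९७-२७२ ई. पू." test case added further down: the dash between two numbers/years is read as "से". A self-contained sketch (not part of the patch; the toy map stands in for cardinal.final_graph / graph_year, and the spaces that insert_space supplies are folded into the cross here):

```python
import pynini
from pynini.lib import rewrite

# Toy number map; the real graph comes from the Hindi cardinal grammar.
toy_number = pynini.string_map([("२७२", "दो सौ बहत्तर"), ("२९७", "दो सौ सत्तानबे")])
toy_range = toy_number + pynini.cross("-", " से ") + toy_number

print(rewrite.top_rewrite("२९७-२७२", toy_range))  # दो सौ सत्तानबे से दो सौ बहत्तर
```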
diff --git a/nemo_text_processing/text_normalization/hi/taggers/fraction.py b/nemo_text_processing/text_normalization/hi/taggers/fraction.py
index a29a72666..72b8706fd 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/fraction.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/fraction.py
@@ -15,7 +15,9 @@
 import pynini
 from pynini.lib import pynutil

-from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst
+from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, insert_space
+from nemo_text_processing.text_normalization.hi.taggers.cardinal import CardinalFst
+from nemo_text_processing.text_normalization.hi.utils import apply_fst


 class FractionFst(GraphFst):
diff --git a/nemo_text_processing/text_normalization/hi/taggers/measure.py b/nemo_text_processing/text_normalization/hi/taggers/measure.py
index 7434fd70f..a8cc3fad3 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/measure.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/measure.py
@@ -44,7 +44,10 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
         )

         # Define the unit handling
-        self.unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ")
+        unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ")
+
+        # Handling symbols like x, X, *
+        symbol_graph = pynini.string_map([("x", "बाई"), ("X", "बाई"), ("*", "बाई"),])

         graph_measurements = (
             pynutil.insert("decimal { ")
@@ -52,8 +55,9 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
             + decimal_graph
             + pynutil.insert(" }")
             + delete_space
-            + self.unit
+            + unit
         )
+
         graph_measurements |= (
             pynutil.insert("cardinal { ")
             + optional_graph_negative
@@ -62,7 +66,27 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
             + pynutil.insert("\"")
             + pynutil.insert(" }")
             + delete_space
-            + self.unit
+            + unit
+        )
+
+        # Handling cardinal clubbed with symbol as single token
+        graph_measurements |= (
+            pynutil.insert("cardinal { ")
+            + optional_graph_negative
+            + pynutil.insert("integer: \"")
+            + cardinal_graph
+            + pynutil.insert("\"")
+            + pynutil.insert(" }")
+            + pynutil.insert(" units: \"")
+            + symbol_graph
+            + pynutil.insert("\" ")
+            + pynutil.insert("} }")
+            + insert_space
+            + pynutil.insert("tokens { cardinal { ")
+            + optional_graph_negative
+            + pynutil.insert("integer: \"")
+            + cardinal_graph
+            + pynutil.insert("\"")
         )

         graph = graph_measurements
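The symbol_graph and the clubbed-token branch above are what the new "२००x१० के गद्दे" style test cases exercise: the dimension symbol is read as "बाई", and each operand ends up in its own cardinal token. A simplified sketch of the reading, ignoring the token bookkeeping (not part of the patch; the toy map stands in for cardinal_graph):

```python
import pynini
from pynini.lib import pynutil, rewrite

# Toy cardinal map; the real one is the Hindi cardinal grammar.
toy_cardinal = pynini.string_map([("१०", "दस"), ("२००", "दो सौ")])
by = pynini.string_map([("x", "बाई"), ("X", "बाई"), ("*", "बाई")])
toy_dimension = toy_cardinal + pynutil.insert(" ") + by + pynutil.insert(" ") + toy_cardinal

print(rewrite.top_rewrite("२००x१०", toy_dimension))  # दो सौ बाई दस
```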
diff --git a/nemo_text_processing/text_normalization/hi/taggers/money.py b/nemo_text_processing/text_normalization/hi/taggers/money.py
index c44d6d346..3de3017ed 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/money.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/money.py
@@ -24,9 +24,11 @@ class MoneyFst(GraphFst):
     """
     Finite state transducer for classifying money, suppletive aware, e.g.
-        ₹1 -> money { currency: "रुपए" integer_part: "एक" }
-        ₹1.2 -> money { currency: "रुपए" integer_part: "एक" fractional_part: "दो" }
-
+        ₹५० -> money { currency_maj: "रुपए" integer_part: "पचास" }
+        ₹५०.५० -> money { currency_maj: "रुपए" integer_part: "पचास" fractional_part: "पचास" currency_min: "centiles" }
+        ₹०.५० -> money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" }
+    Note that the 'centiles' string is a placeholder that the verbalizer resolves to the corresponding minor currency denomination.
+
     Args:
         cardinal: CardinalFst
         decimal: DecimalFst
@@ -34,29 +36,23 @@
             for False multiple transduction are generated (used for audio-based normalization)
     """

-    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
+    def __init__(self, cardinal: GraphFst):
         super().__init__(name="money", kind="classify")

         cardinal_graph = cardinal.final_graph

-        optional_graph_negative = pynini.closure(
-            pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1,
-        )
-        self.currency = pynutil.insert("currency: \"") + currency_graph + pynutil.insert("\" ")
-        self.interger = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\" ")
-        self.fraction = pynutil.insert("fractional_part: \"") + cardinal_graph + pynutil.insert("\" ")
-
-        graph_currencies = optional_graph_negative + self.currency + insert_space + self.interger
-        graph_currencies |= (
-            optional_graph_negative
-            + self.currency
-            + insert_space
-            + self.interger
-            + pynutil.delete(".")
-            + insert_space
-            + self.fraction
+        currency_major = pynutil.insert('currency_maj: "') + currency_graph + pynutil.insert('"')
+        integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')
+        fraction = pynutil.insert('fractional_part: "') + cardinal_graph + pynutil.insert('"')
+        currency_minor = pynutil.insert('currency_min: "') + pynutil.insert("centiles") + pynutil.insert('"')
+
+        graph_major_only = currency_major + insert_space + integer
+        graph_major_and_minor = (
+            currency_major + insert_space + integer + pynini.cross(".", " ") + fraction + insert_space + currency_minor
         )
-        graph = graph_currencies
-        self.graph = graph.optimize()
+
+        graph_currencies = graph_major_only | graph_major_and_minor
+
+        graph = graph_currencies.optimize()

         final_graph = self.add_tokens(graph)
         self.fst = final_graph
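The tagger now emits the literal placeholder currency_min: "centiles"; the verbalizer further down in this patch resolves it to the minor unit paired with the major currency in the new data/money/major_minor_currencies.py (e.g. ₹१२३.५७ -> "एक सौ तेईस रुपए सत्तावन पैसे"). A minimal sketch of that resolution step (not part of the patch; it mirrors the verbalizer's per-currency loop on a two-entry subset of the mapping):

```python
import pynini
from pynini.lib import rewrite

major_minor_currencies = {"रुपए": "पैसे", "डॉलर": "सेंट"}  # subset of the new data module

# One small rule per major currency: keep the major name, rewrite the placeholder.
rules = [
    pynini.accep(major) + pynini.cross(" centiles", " " + minor)
    for major, minor in major_minor_currencies.items()
]
centiles_rule = pynini.union(*rules)

print(rewrite.top_rewrite("रुपए centiles", centiles_rule))  # रुपए पैसे
```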
diff --git a/nemo_text_processing/text_normalization/hi/taggers/time.py b/nemo_text_processing/text_normalization/hi/taggers/time.py
index 622d4d5cb..bdbcf8c4e 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/time.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/time.py
@@ -13,10 +13,10 @@
 # limitations under the License.

 import pynini
-from pynini.lib import pynutil
+from pynini.lib import pynutil, rewrite

 from nemo_text_processing.text_normalization.hi.graph_utils import GraphFst, insert_space
-from nemo_text_processing.text_normalization.hi.utils import get_abs_path
+from nemo_text_processing.text_normalization.hi.utils import apply_fst, get_abs_path

 hours_graph = pynini.string_file(get_abs_path("data/time/hours.tsv"))
 minutes_graph = pynini.string_file(get_abs_path("data/time/minutes.tsv"))
diff --git a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py
index 48ee97ef3..bdec90c06 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/tokenize_and_classify.py
@@ -43,7 +43,7 @@ class ClassifyFst(GraphFst):
     Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
     For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
     More details to deployment at NeMo/tools/text_processing_deployment.
-
+
     Args:
         input_case: accepting either "lower_cased" or "cased" input.
         deterministic: if True will provide a single transduction option,
@@ -68,11 +68,11 @@ def __init__(
             os.makedirs(cache_dir, exist_ok=True)
             whitelist_file = os.path.basename(whitelist) if whitelist else ""
             far_file = os.path.join(
-                cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far"
+                cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far",
             )
             if not overwrite_cache and far_file and os.path.exists(far_file):
                 self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
-                logging.info(f'ClassifyFst.fst was restored from {far_file}.')
+                logging.info(f"ClassifyFst.fst was restored from {far_file}.")
             else:
                 logging.info(f"Creating ClassifyFst grammars.")
@@ -107,7 +107,7 @@ def __init__(
             logging.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes")

             start_time = time.time()
-            money = MoneyFst(cardinal=cardinal, decimal=decimal)
+            money = MoneyFst(cardinal=cardinal)
             money_graph = money.fst
             logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes")
diff --git a/nemo_text_processing/text_normalization/hi/taggers/word.py b/nemo_text_processing/text_normalization/hi/taggers/word.py
index bc354232b..2577a4037 100644
--- a/nemo_text_processing/text_normalization/hi/taggers/word.py
+++ b/nemo_text_processing/text_normalization/hi/taggers/word.py
@@ -18,6 +18,7 @@
 from nemo_text_processing.text_normalization.hi.graph_utils import (
     MIN_NEG_WEIGHT,
     NEMO_NOT_SPACE,
+    NEMO_SIGMA,
     GraphFst,
     convert_space,
 )
diff --git a/nemo_text_processing/text_normalization/hi/utils.py b/nemo_text_processing/text_normalization/hi/utils.py
index 102212183..d21135e42 100644
--- a/nemo_text_processing/text_normalization/hi/utils.py
+++ b/nemo_text_processing/text_normalization/hi/utils.py
@@ -40,7 +40,6 @@ def load_labels(abs_path):
     """
     label_tsv = open(abs_path, encoding="utf-8")
     labels = list(csv.reader(label_tsv, delimiter="\t"))
-    label_tsv.close()

     return labels
diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/cardinal.py b/nemo_text_processing/text_normalization/hi/verbalizers/cardinal.py
index 9882aa4cf..9f80a9eae 100644
--- a/nemo_text_processing/text_normalization/hi/verbalizers/cardinal.py
+++
b/nemo_text_processing/text_normalization/hi/verbalizers/cardinal.py @@ -13,9 +13,11 @@ # limitations under the License. import pynini -from pynini.lib import pynutil +from pynini.lib import pynutil, rewrite from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space +from nemo_text_processing.text_normalization.hi.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.hi.utils import apply_fst class CardinalFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/date.py b/nemo_text_processing/text_normalization/hi/verbalizers/date.py index 1265fcec6..aa1032136 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/date.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/date.py @@ -16,6 +16,8 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst, delete_space +from nemo_text_processing.text_normalization.hi.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.hi.utils import apply_fst, get_abs_path class DateFst(GraphFst): @@ -39,6 +41,10 @@ def __init__(self): year = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + era = pynutil.delete("era: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + + range = pynutil.delete("text: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"") + graph_dd_mm = day + NEMO_SPACE + month graph_mm_dd = month + NEMO_SPACE + day @@ -49,6 +55,10 @@ def __init__(self): graph_mm_yyyy = month + NEMO_SPACE + year + graph_era = era + + graph_range = range + optional_preserve_order = pynini.closure( pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space | pynutil.delete("field_order:") @@ -60,7 +70,7 @@ def __init__(self): ) self.graph = ( - (graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy) + (graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy | graph_era | graph_range) + delete_space + optional_preserve_order ) diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py index e4cfae302..d7ca6a3fb 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/fraction.py @@ -16,6 +16,8 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.hi.graph_utils import MINUS, NEMO_NOT_QUOTE, GraphFst, insert_space +from nemo_text_processing.text_normalization.hi.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.hi.utils import apply_fst class FractionFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/measure.py b/nemo_text_processing/text_normalization/hi/verbalizers/measure.py index 6cc6f8879..da1af37d8 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/measure.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/measure.py @@ -16,6 +16,8 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.hi.taggers.cardinal import CardinalFst +from nemo_text_processing.text_normalization.hi.utils import apply_fst class MeasureFst(GraphFst): diff --git 
a/nemo_text_processing/text_normalization/hi/verbalizers/money.py b/nemo_text_processing/text_normalization/hi/verbalizers/money.py index d5cab33d8..ca40f2805 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/money.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/money.py @@ -15,14 +15,16 @@ import pynini from pynini.lib import pynutil -from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.hi.data.money.major_minor_currencies import major_minor_currencies +from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, NEMO_SPACE, GraphFst class MoneyFst(GraphFst): """ Finite state transducer for verbalizing money, e.g. - money { integer_part: "बारह" currency: "रुपए" } -> बारह रुपए - money { integer_part: "बारह" currency: "रुपए" fractional_part: "पचास" currency: "पैसे" } -> बारह रुपए पचास पैसे + money { integer_part: "बारह" currency_maj: "रुपए" } -> बारह रुपए + money { integer_part: "बारह" currency_maj: "रुपए" fractional_part: "पचास" currency_min: "centiles" } -> बारह रुपए पचास पैसे + money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" } -> पचास पैसे Args: cardinal: CardinalFst @@ -31,33 +33,58 @@ class MoneyFst(GraphFst): for False multiple transduction are generated (used for audio-based normalization) """ - def __init__(self, cardinal: GraphFst, decimal: GraphFst): + def __init__(self): super().__init__(name="money", kind="verbalize") - insert_paise = pynutil.insert("पैसे") + currency_major = pynutil.delete('currency_maj: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') - currency = ( - pynutil.delete('currency: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('" ') + insert_space - ) - - integer_part = ( - pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('" ') + insert_space - ) + integer_part = pynutil.delete('integer_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') fractional_part = ( - pynutil.delete('fractional_part: "') - + pynini.closure(NEMO_NOT_QUOTE, 1) - + pynutil.delete('" ') - + insert_space + pynutil.delete('fractional_part: "') + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete('"') ) - graph_integer = integer_part + delete_space + currency + # Handles major denominations only + graph_major_only = integer_part + pynini.accep(NEMO_SPACE) + currency_major - graph_interger_fraction = ( - integer_part + delete_space + currency + delete_space + fractional_part + delete_space + insert_paise - ) + # Handles both major and minor denominations + major_minor_graphs = [] + + # Handles minor denominations only + minor_graphs = [] + + # Logic for handling minor denominations + for major, minor in major_minor_currencies.items(): + graph_major = pynutil.delete('currency_maj: "') + pynini.accep(major) + pynutil.delete('"') + graph_minor = pynutil.delete('currency_min: "') + pynini.cross("centiles", minor) + pynutil.delete('"') + graph_major_minor_partial = ( + integer_part + + pynini.accep(NEMO_SPACE) + + graph_major + + pynini.accep(NEMO_SPACE) + + fractional_part + + pynini.accep(NEMO_SPACE) + + graph_minor + ) + major_minor_graphs.append(graph_major_minor_partial) + + graph_minor_partial = ( + pynutil.delete('integer_part: "शून्य"') + + pynutil.delete(NEMO_SPACE) + + pynutil.delete('currency_maj: "') + + pynutil.delete(major) + + pynutil.delete('"') + + pynutil.delete(NEMO_SPACE) + + 
fractional_part + + pynini.accep(NEMO_SPACE) + + graph_minor + ) + minor_graphs.append(graph_minor_partial) + + graph_major_minor = pynini.union(*major_minor_graphs) + graph_minor_only = pynini.union(*minor_graphs) - graph = graph_integer | graph_interger_fraction + graph = graph_major_only | graph_major_minor | pynutil.add_weight(graph_minor_only, -0.1) delete_tokens = self.delete_tokens(graph) self.fst = delete_tokens.optimize() diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/time.py b/nemo_text_processing/text_normalization/hi/verbalizers/time.py index da10df4a0..0e4e4c425 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/time.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/time.py @@ -16,6 +16,7 @@ from pynini.lib import pynutil from nemo_text_processing.text_normalization.hi.graph_utils import NEMO_NOT_QUOTE, GraphFst, delete_space, insert_space +from nemo_text_processing.text_normalization.hi.utils import apply_fst, get_abs_path class TimeFst(GraphFst): diff --git a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py index ca06fc9c3..e91f0d9f6 100644 --- a/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py +++ b/nemo_text_processing/text_normalization/hi/verbalizers/verbalize.py @@ -20,8 +20,7 @@ from nemo_text_processing.text_normalization.hi.verbalizers.measure import MeasureFst from nemo_text_processing.text_normalization.hi.verbalizers.money import MoneyFst from nemo_text_processing.text_normalization.hi.verbalizers.time import TimeFst - -# from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst +from nemo_text_processing.text_normalization.hi.verbalizers.whitelist import WhiteListFst class VerbalizeFst(GraphFst): @@ -56,11 +55,20 @@ def __init__(self, deterministic: bool = True): measure = MeasureFst(cardinal=cardinal, decimal=decimal) measure_graph = measure.fst - money = MoneyFst(cardinal=cardinal, decimal=decimal) + money = MoneyFst() money_graph = money.fst - # whitelist_graph = WhiteListFst(deterministic=deterministic).fst - - graph = cardinal_graph | decimal_graph | fraction_graph | date_graph | time_graph | measure_graph | money_graph + whitelist_graph = WhiteListFst(deterministic=deterministic).fst + + graph = ( + cardinal_graph + | decimal_graph + | fraction_graph + | date_graph + | time_graph + | measure_graph + | money_graph + | whitelist_graph + ) self.fst = graph diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt index d92a53852..b45b40a9e 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_date.txt @@ -14,6 +14,7 @@ १०-२९-२०००~अक्टूबर उनतीस दो हज़ार ११-१४-११००~नवंबर चौदह ग्यारह सौ ०३-२०१०~मार्च दो हज़ार दस -११-२०२४~नवंबर दो हज़ार चौबीस २०७०~दो हज़ार सत्तर २०२४~दो हज़ार चौबीस +१२० ई. पू.~एक सौ बीस ईसा पूर्व +२९७-२७२ ई. 
पू.~दो सौ सत्तानबे से दो सौ बहत्तर ईसा पूर्व diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt index 453369f82..86a824f72 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_measure.txt @@ -60,3 +60,7 @@ ९९.५ oz~निन्यानबे दशमलव पाँच आउन्स ८५ q~पचासी क्विंटल ८५.९९ q~पचासी दशमलव नौ नौ क्विंटल +२००x१० के गद्दे~दो सौ बाई दस के गद्दे +५x५ का सोफ़ा~पाँच बाई पाँच का सोफ़ा +२x२ रुबिक्स क्यूब~दो बाई दो रुबिक्स क्यूब +१३x१३ का घर~तेरह बाई तेरह का घर diff --git a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt index c7b32628b..cd1b9cba2 100644 --- a/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt +++ b/tests/nemo_text_processing/hi/data_text_normalization/test_cases_money.txt @@ -97,4 +97,22 @@ $२८२१~दो हज़ार आठ सौ इक्कीस डॉल ₹५४५~पाँच सौ पैंतालीस रुपए ₹१८४५~एक हज़ार आठ सौ पैंतालीस रुपए ₹३७२~तीन सौ बहत्तर रुपए -$९८~अट्ठानबे डॉलर \ No newline at end of file +$९८~अट्ठानबे डॉलर +₹१२३.५७~एक सौ तेईस रुपए सत्तावन पैसे +₹९९९.५०~नौ सौ निन्यानबे रुपए पचास पैसे +£१५०.२९~एक सौ पचास पाउंड उनतीस पेंस +£८०.३१~अस्सी पाउंड इकतीस पेंस +₩२३४५.१०~दो हज़ार तीन सौ पैंतालीस वॉन दस जिओन +₩१००.२५~एक सौ वॉन पच्चीस जिओन +$१२५.७०~एक सौ पच्चीस डॉलर सत्तर सेंट +$९.९९~नौ डॉलर निन्यानबे सेंट +₺८०.३६~अस्सी लीरा छत्तीस कुरस +₺१२३४.७८~एक हज़ार दो सौ चौंतीस लीरा अठहत्तर कुरस +৳१००.४२~एक सौ टका बयालीस पैसे +৳३०२५.८७~तीन हज़ार पच्चीस टका सत्तासी पैसे +¥१००.४८~एक सौ येन अड़तालीस सेन +¥७७७.२३~सात सौ सतहत्तर येन तेईस सेन +₦८७६.५३~आठ सौ छिहत्तर नाइरा तिरेपन कोबो +₦१०.२७~दस नाइरा सत्ताईस कोबो +€२००.९०~दो सौ यूरो नब्बे सेंट +€१२३४.७५~एक हज़ार दो सौ चौंतीस यूरो पचहत्तर सेंट \ No newline at end of file diff --git a/tests/nemo_text_processing/hi/test_cardinal.py b/tests/nemo_text_processing/hi/test_cardinal.py index 8298ec0e3..95ed798f1 100644 --- a/tests/nemo_text_processing/hi/test_cardinal.py +++ b/tests/nemo_text_processing/hi/test_cardinal.py @@ -17,8 +17,9 @@ from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from ..utils import CACHE_DIR, parse_test_case_file +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file class TestCardinal: @@ -33,10 +34,3 @@ class TestCardinal: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_cardinal.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred == expected diff --git a/tests/nemo_text_processing/hi/test_date.py b/tests/nemo_text_processing/hi/test_date.py index df12e9874..f1f268675 100644 --- a/tests/nemo_text_processing/hi/test_date.py +++ b/tests/nemo_text_processing/hi/test_date.py @@ -17,8 +17,9 @@ from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from 
nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from ..utils import CACHE_DIR, parse_test_case_file +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file class TestDate: @@ -33,10 +34,3 @@ class TestDate: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_date.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_decimal.py b/tests/nemo_text_processing/hi/test_decimal.py index 582b59422..0b6599437 100644 --- a/tests/nemo_text_processing/hi/test_decimal.py +++ b/tests/nemo_text_processing/hi/test_decimal.py @@ -17,8 +17,9 @@ from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from ..utils import CACHE_DIR, parse_test_case_file +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file class TestDecimal: @@ -33,10 +34,3 @@ class TestDecimal: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_decimal.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_fraction.py b/tests/nemo_text_processing/hi/test_fraction.py index bedf9d0f7..dd754d12a 100644 --- a/tests/nemo_text_processing/hi/test_fraction.py +++ b/tests/nemo_text_processing/hi/test_fraction.py @@ -17,8 +17,9 @@ from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from ..utils import CACHE_DIR, parse_test_case_file +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file class TestFraction: @@ -33,10 +34,3 @@ class TestFraction: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_fraction.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_measure.py b/tests/nemo_text_processing/hi/test_measure.py index 71352cdc8..bf07c71e6 100644 --- a/tests/nemo_text_processing/hi/test_measure.py +++ b/tests/nemo_text_processing/hi/test_measure.py @@ -17,8 +17,9 @@ from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from 
nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from ..utils import CACHE_DIR, parse_test_case_file +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file class TestMeasure: @@ -33,10 +34,3 @@ class TestMeasure: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_measure.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_money.py b/tests/nemo_text_processing/hi/test_money.py index 0665146a6..07f1d6224 100644 --- a/tests/nemo_text_processing/hi/test_money.py +++ b/tests/nemo_text_processing/hi/test_money.py @@ -17,8 +17,9 @@ from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio import NormalizerWithAudio -from ..utils import CACHE_DIR, parse_test_case_file +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file class TestMoney: @@ -33,10 +34,3 @@ class TestMoney: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_money.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred.strip() == expected.strip() diff --git a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh index 498443f71..ce823ec54 100644 --- a/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh +++ b/tests/nemo_text_processing/hi/test_sparrowhawk_normalization.sh @@ -14,12 +14,8 @@ runtest () { denorm_pred=$(echo $written | normalizer_main --config=sparrowhawk_configuration.ascii_proto 2>&1 | tail -n 1 | sed 's/\xC2\xA0/ /g') # trim white space - # spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" - # denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//')" - - # trim white space and remove space before punctuation - spoken="$(echo -e "${spoken}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e 's/ \([!?.]\)/\1/g')" - denorm_pred="$(echo -e "${denorm_pred}" | sed -e 's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e 's/ \([!?.]\)/\1/g')" + spoken="$(echo "${spoken}" | tr -d ' ')" + denorm_pred="$(echo "${denorm_pred}" | tr -d ' ')" # input expected actual assertEquals "$written" "$spoken" "$denorm_pred" diff --git a/tests/nemo_text_processing/hi/test_time.py b/tests/nemo_text_processing/hi/test_time.py index 402faf414..da030f4c4 100644 --- a/tests/nemo_text_processing/hi/test_time.py +++ b/tests/nemo_text_processing/hi/test_time.py @@ -17,8 +17,9 @@ from nemo_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer from nemo_text_processing.text_normalization.normalize import Normalizer +from nemo_text_processing.text_normalization.normalize_with_audio 
import NormalizerWithAudio -from ..utils import CACHE_DIR, parse_test_case_file +from ..utils import CACHE_DIR, RUN_AUDIO_BASED_TESTS, parse_test_case_file class TestTime: @@ -33,10 +34,3 @@ class TestTime: def test_norm(self, test_input, expected): pred = self.normalizer.normalize(test_input, verbose=False) assert pred.strip() == expected.strip() - - @parameterized.expand(parse_test_case_file('hi/data_inverse_text_normalization/test_cases_time.txt')) - @pytest.mark.run_only_on('CPU') - @pytest.mark.unit - def test_denorm(self, test_input, expected): - pred = self.inverse_normalizer.inverse_normalize(test_input, verbose=False) - assert pred.strip() == expected.strip()