Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Future Implementations for classes - Measure, Money, and Date #258

Open
wants to merge 8 commits into
base: main
Choose a base branch
from
2 changes: 1 addition & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ pipeline {
HY_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-0'
MR_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/03-12-24-1'
JA_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/10-17-24-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/11-29-24-1'
HI_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/01-31-25-0'
DEFAULT_TN_CACHE='/home/jenkinsci/TestData/text_norm/ci/grammars/06-08-23-0'
}
stages {
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
ई. पू. ईसा पूर्व
ई. ईसवी
तक तक
Original file line number Diff line number Diff line change
Expand Up @@ -141,14 +141,16 @@ month महीना
months महीने
ct कैरेट
pH पीएच
km/h किलोमीटर प्रति घंटा
km/hr किलोमीटर प्रति घंटा
km/min किलोमीटर प्रति मिनट
m/h मीटर प्रति घंटा
m/hr मीटर प्रति घंटा
mi/s मील प्रति सेकंड
mi/h मील प्रति घंटा
mi/hr मील प्रति घंटा
mi/min मील प्रति मिनट
₹/ac रुपए प्रति एकड़
x बाई
X बाई
* बाई
- से
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
₹ रुपए
P पैसे
£ पाउंड
₩ वॉन
$ डॉलर
₺ लीरा
৳ टका
¥ येन
₦ नाइरा
€ यूरो
€ यूरो
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
रुपए पैसे
पाउंड पेंस
वॉन जिओन
डॉलर सेंट
लीरा कुरस
टका पैसे
येन सेन
नाइरा कोबो
यूरो सेंट
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
० शून्य
१ एक
२ दो
३ तीन
Expand Down
25 changes: 23 additions & 2 deletions nemo_text_processing/text_normalization/hi/taggers/date.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

days = pynini.string_file(get_abs_path("data/date/days.tsv"))
months = pynini.string_file(get_abs_path("data/date/months.tsv"))
year_suffix = pynini.string_file(get_abs_path("data/date/year_suffix.tsv"))


class DateFst(GraphFst):
Expand Down Expand Up @@ -68,6 +69,11 @@ def __init__(self, cardinal: GraphFst):

graph_mm_dd += pynutil.insert(" preserve_order: true ")

# Graph for era
era_graph = pynutil.insert("era: \"") + year_suffix + pynutil.insert("\"") + insert_space

range_graph = pynini.cross("-", "से")

graph_dd_mm_yyyy = (
days_graph + (delete_dash | delete_slash) + months_graph + (delete_dash | delete_slash) + years_graph
)
Expand All @@ -78,7 +84,20 @@ def __init__(self, cardinal: GraphFst):

graph_mm_dd_yyyy += pynutil.insert(" preserve_order: true ")

graph_mm_yyyy = months_graph + delete_dash + years_graph
graph_mm_yyyy = months_graph + delete_dash + insert_space + years_graph

graph_year_suffix = era_graph

graph_range = (
pynutil.insert("text: \"")
+ (cardinal.final_graph | graph_year)
+ insert_space
+ range_graph
+ insert_space
+ (cardinal.final_graph | graph_year)
+ pynutil.insert("\"")
+ pynutil.insert(" preserve_order: true ")
)

# default assume dd_mm_yyyy

Expand All @@ -87,7 +106,9 @@ def __init__(self, cardinal: GraphFst):
| graph_mm_dd
| pynutil.add_weight(graph_dd_mm_yyyy, -0.001)
| graph_mm_dd_yyyy
| graph_mm_yyyy
| pynutil.add_weight(graph_mm_yyyy, -0.2)
| pynutil.add_weight(graph_year_suffix, -0.001)
| pynutil.add_weight(graph_range, -0.005)
)

self.final_graph = final_graph.optimize()
Expand Down
37 changes: 33 additions & 4 deletions nemo_text_processing/text_normalization/hi/taggers/measure.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,11 @@
from nemo_text_processing.text_normalization.hi.utils import get_abs_path


digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
teens_ties = pynini.string_file(get_abs_path("data/numbers/teens_and_ties.tsv"))
teens_and_ties = pynutil.add_weight(teens_ties, -0.1)


class MeasureFst(GraphFst):
"""
Finite state transducer for classifying measure, suppletive aware, e.g.
Expand All @@ -35,7 +40,7 @@ class MeasureFst(GraphFst):
def __init__(self, cardinal: GraphFst, decimal: GraphFst):
super().__init__(name="measure", kind="classify")

cardinal_graph = cardinal.final_graph
cardinal_graph = digit | teens_and_ties | cardinal.graph_hundreds | cardinal.graph_thousands
decimal_graph = decimal.final_graph_wo_negative
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

does it make sense to do the same thing for decimals as we did for cardinals?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Changing the decimal class based on the cardinal's final graph makes no sense.

unit_graph = pynini.string_file(get_abs_path("data/measure/unit.tsv"))

Expand All @@ -44,16 +49,20 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
)

# Define the unit handling
self.unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ")
unit = pynutil.insert("units: \"") + unit_graph + pynutil.insert("\" ")

# Handling symbols like x, X, *
symbol_graph = pynini.string_map([("x", "बाई"), ("X", "बाई"), ("*", "बाई"),])

graph_measurements = (
pynutil.insert("decimal { ")
+ optional_graph_negative
+ decimal_graph
+ pynutil.insert(" }")
+ delete_space
+ self.unit
+ unit
)

graph_measurements |= (
pynutil.insert("cardinal { ")
+ optional_graph_negative
Expand All @@ -62,7 +71,27 @@ def __init__(self, cardinal: GraphFst, decimal: GraphFst):
+ pynutil.insert("\"")
+ pynutil.insert(" }")
+ delete_space
+ self.unit
+ unit
)

# Handling cardinal clubbed with symbol as single token
graph_measurements |= (
pynutil.insert("cardinal { ")
ngachchi marked this conversation as resolved.
Show resolved Hide resolved
+ optional_graph_negative
+ pynutil.insert("integer: \"")
+ cardinal_graph
+ pynutil.insert("\"")
+ pynutil.insert(" }")
+ pynutil.insert(" units: \"")
+ symbol_graph
+ pynutil.insert("\" ")
+ pynutil.insert("} }")
+ insert_space
+ pynutil.insert("tokens { cardinal { ")
+ optional_graph_negative
+ pynutil.insert("integer: \"")
+ cardinal_graph
+ pynutil.insert("\"")
)

graph = graph_measurements
Expand Down
36 changes: 21 additions & 15 deletions nemo_text_processing/text_normalization/hi/taggers/money.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,39 +24,45 @@
class MoneyFst(GraphFst):
"""
Finite state transducer for classifying money, suppletive aware, e.g.
₹1 -> money { currency: "रुपए" integer_part: "एक" }
₹1.2 -> money { currency: "रुपए" integer_part: "एक" fractional_part: "दो" }

₹५० -> money { currency_maj: "रुपए" integer_part: "पचास" }
₹५०.५० -> money { currency_maj: "रुपए" integer_part: "पचास" fractional_part: "पचास" currency_min: "centiles" }
₹०.५० -> money { currency_maj: "रुपए" integer_part: "शून्य" fractional_part: "पचास" currency_min: "centiles" }
Note that the 'centiles' string is a placeholder to be handled by the verbalizer, which replaces it with the corresponding minor currency denomination.

Args:
cardinal: CardinalFst
decimal: DecimalFst
deterministic: if True will provide a single transduction option,
for False multiple transduction are generated (used for audio-based normalization)
"""

def __init__(self, cardinal: GraphFst, decimal: GraphFst):
def __init__(self, cardinal: GraphFst):
super().__init__(name="money", kind="classify")

cardinal_graph = cardinal.final_graph

optional_graph_negative = pynini.closure(
pynutil.insert("negative: ") + pynini.cross("-", "\"true\"") + insert_space, 0, 1,
)
self.currency = pynutil.insert("currency: \"") + currency_graph + pynutil.insert("\" ")
self.interger = pynutil.insert("integer_part: \"") + cardinal_graph + pynutil.insert("\" ")
self.fraction = pynutil.insert("fractional_part: \"") + cardinal_graph + pynutil.insert("\" ")
currency_major = pynutil.insert('currency_maj: "') + currency_graph + pynutil.insert('"')
integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')
fraction = pynutil.insert('fractional_part: "') + cardinal_graph + pynutil.insert('"')
currency_minor = pynutil.insert('currency_min: "') + pynutil.insert("centiles") + pynutil.insert('"')

graph_currencies = optional_graph_negative + self.currency + insert_space + self.interger
graph_currencies |= (
graph_major_only = optional_graph_negative + currency_major + insert_space + integer
graph_major_and_minor = (
optional_graph_negative
ngachchi marked this conversation as resolved.
Show resolved Hide resolved
+ self.currency
+ currency_major
+ insert_space
+ self.interger
+ pynutil.delete(".")
+ integer
+ pynini.cross(".", " ")
+ fraction
+ insert_space
+ self.fraction
+ currency_minor
)
graph = graph_currencies
self.graph = graph.optimize()

graph_currencies = graph_major_only | graph_major_and_minor

graph = graph_currencies.optimize()
final_graph = self.add_tokens(graph)
self.fst = final_graph
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class ClassifyFst(GraphFst):
Final class that composes all other classification grammars. This class can process an entire sentence including punctuation.
For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
More details on deployment can be found at NeMo/tools/text_processing_deployment.

Args:
input_case: accepting either "lower_cased" or "cased" input.
deterministic: if True will provide a single transduction option,
Expand All @@ -68,11 +68,11 @@ def __init__(
os.makedirs(cache_dir, exist_ok=True)
whitelist_file = os.path.basename(whitelist) if whitelist else ""
far_file = os.path.join(
cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far"
cache_dir, f"hi_tn_{deterministic}_deterministic_{input_case}_{whitelist_file}_tokenize.far",
)
if not overwrite_cache and far_file and os.path.exists(far_file):
self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
logging.info(f'ClassifyFst.fst was restored from {far_file}.')
logging.info(f"ClassifyFst.fst was restored from {far_file}.")
else:
logging.info(f"Creating ClassifyFst grammars.")

Expand Down Expand Up @@ -107,7 +107,7 @@ def __init__(
logging.debug(f"measure: {time.time() - start_time: .2f}s -- {measure_graph.num_states()} nodes")

start_time = time.time()
money = MoneyFst(cardinal=cardinal, decimal=decimal)
money = MoneyFst(cardinal=cardinal)
money_graph = money.fst
logging.debug(f"money: {time.time() - start_time: .2f}s -- {money_graph.num_states()} nodes")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ def __init__(self):

year = pynutil.delete("year: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

graph_era = pynutil.delete("era: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

graph_range = pynutil.delete("text: \"") + pynini.closure(NEMO_NOT_QUOTE, 1) + pynutil.delete("\"")

graph_dd_mm = day + NEMO_SPACE + month

graph_mm_dd = month + NEMO_SPACE + day
Expand All @@ -60,7 +64,7 @@ def __init__(self):
)

self.graph = (
(graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy)
(graph_dd_mm | graph_mm_dd | graph_dd_mm_yyyy | graph_mm_dd_yyyy | graph_mm_yyyy | graph_era | graph_range)
+ delete_space
+ optional_preserve_order
)
Expand Down
Loading
Loading