initial commit

70a8a9e0 · wangwei990215 · 70a8a9e0 · 70a8a9e0 · 70a8a9e0 · 70a8a9e0
Commit 70a8a9e0 authored Oct 03, 2024 by wangwei990215
20 changed files
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/data/time/time_zone.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/data/time/time_zone.tsv
+cst	c s t
+cet	c e t
+pst	p s t
+est	e s t
+pt	p t
+et	e t
+gmt	g m t
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/data/time/to_hour.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/data/time/to_hour.tsv
+one	12
+two	1
+three	2
+four	3
+five	4
+six	5
+seven	6
+eight	7
+nine	8
+ten	9
+eleven	10
+twelve	11
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/data/whitelist.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/data/whitelist.tsv
+e.g.	for example
+dr.	doctor
+mr.	mister
+mrs.	misses
+st.	saint
+7-eleven	seven eleven
+es3	e s three
+s&p	s and p
+ASAP	a s a p
+AT&T	a t and t
+LLP	l l p
+ATM	a t m
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/__init__.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/__init__.py
+
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/cardinal.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/cardinal.py
+import pynini
+from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path, num_to_word
+from fun_text_processing.text_normalization.en.graph_utils import (
+    DAMO_ALPHA,
+    DAMO_DIGIT,
+    DAMO_SIGMA,
+    DAMO_SPACE,
+    GraphFst,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+
+class CardinalFst(GraphFst):
+    """
+    Finite state transducer for classifying cardinals
+        e.g. minus twenty three -> cardinal { integer: "23" negative: "-" } }
+    Numbers below thirteen are not converted.
+    """
+
+    def __init__(self):
+        super().__init__(name="cardinal", kind="classify")
+        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
+        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
+        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
+        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
+
+        graph_hundred = pynini.cross("hundred", "")
+
+        graph_hundred_component = pynini.union(
+            graph_digit + delete_space + graph_hundred, pynutil.insert("0")
+        )
+        graph_hundred_component += delete_space
+        graph_hundred_component += pynini.union(
+            graph_teen | pynutil.insert("00"),
+            (graph_ties | pynutil.insert("0")) + delete_space + (graph_digit | pynutil.insert("0")),
+        )
+
+        graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
+            pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT)
+        )
+        self.graph_hundred_component_at_least_one_none_zero_digit = (
+            graph_hundred_component_at_least_one_none_zero_digit
+        )
+
+        graph_thousands = pynini.union(
+            graph_hundred_component_at_least_one_none_zero_digit
+            + delete_space
+            + pynutil.delete("thousand"),
+            pynutil.insert("000", weight=0.1),
+        )
+
+        graph_million = pynini.union(
+            graph_hundred_component_at_least_one_none_zero_digit
+            + delete_space
+            + pynutil.delete("million"),
+            pynutil.insert("000", weight=0.1),
+        )
+        graph_billion = pynini.union(
+            graph_hundred_component_at_least_one_none_zero_digit
+            + delete_space
+            + pynutil.delete("billion"),
+            pynutil.insert("000", weight=0.1),
+        )
+        graph_trillion = pynini.union(
+            graph_hundred_component_at_least_one_none_zero_digit
+            + delete_space
+            + pynutil.delete("trillion"),
+            pynutil.insert("000", weight=0.1),
+        )
+        graph_quadrillion = pynini.union(
+            graph_hundred_component_at_least_one_none_zero_digit
+            + delete_space
+            + pynutil.delete("quadrillion"),
+            pynutil.insert("000", weight=0.1),
+        )
+        graph_quintillion = pynini.union(
+            graph_hundred_component_at_least_one_none_zero_digit
+            + delete_space
+            + pynutil.delete("quintillion"),
+            pynutil.insert("000", weight=0.1),
+        )
+        graph_sextillion = pynini.union(
+            graph_hundred_component_at_least_one_none_zero_digit
+            + delete_space
+            + pynutil.delete("sextillion"),
+            pynutil.insert("000", weight=0.1),
+        )
+
+        graph = pynini.union(
+            graph_sextillion
+            + delete_space
+            + graph_quintillion
+            + delete_space
+            + graph_quadrillion
+            + delete_space
+            + graph_trillion
+            + delete_space
+            + graph_billion
+            + delete_space
+            + graph_million
+            + delete_space
+            + graph_thousands
+            + delete_space
+            + graph_hundred_component,
+            graph_zero,
+        )
+
+        graph = graph @ pynini.union(
+            pynutil.delete(pynini.closure("0"))
+            + pynini.difference(DAMO_DIGIT, "0")
+            + pynini.closure(DAMO_DIGIT),
+            "0",
+        )
+
+        labels_exception = [num_to_word(x) for x in range(0, 13)]
+        graph_exception = pynini.union(*labels_exception)
+
+        graph = (
+            pynini.cdrewrite(pynutil.delete("and"), DAMO_SPACE, DAMO_SPACE, DAMO_SIGMA)
+            @ (DAMO_ALPHA + DAMO_SIGMA)
+            @ graph
+        )
+
+        self.graph_no_exception = graph
+
+        self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph
+
+        optional_minus_graph = pynini.closure(
+            pynutil.insert("negative: ") + pynini.cross("minus", '"-"') + DAMO_SPACE, 0, 1
+        )
+
+        final_graph = (
+            optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
+        )
+
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/date.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/date.py
+import pynini
+from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
+from fun_text_processing.text_normalization.en.graph_utils import (
+    DAMO_ALPHA,
+    DAMO_DIGIT,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+# Number tables loaded once at import time and shared by the date helpers in this module.
+graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")).optimize()
+graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize()
+ties_graph = pynini.string_file(get_abs_path("data/numbers/ties.tsv")).optimize()
+
+
+def _get_month_graph():
+    """
+    Transducer for month, e.g. march -> march
+    """
+    month_graph = pynini.string_file(get_abs_path("data/months.tsv"))
+    return month_graph
+
+
+def _get_ties_graph():
+    """
+    Transducer for 20-99 e.g
+    twenty three -> 23
+    """
+    graph = ties_graph + (delete_space + graph_digit | pynutil.insert("0"))
+    return graph
+
+
+def _get_range_graph():
+    """
+    Transducer for decades (1**0s, 2**0s), centuries (2*00s, 1*00s), millennia (2000s)
+    """
+    graph_ties = _get_ties_graph()
+    graph = (graph_ties | graph_teen) + delete_space + pynini.cross("hundreds", "00s")
+    graph |= pynini.cross("two", "2") + delete_space + pynini.cross("thousands", "000s")
+    graph |= (
+        (graph_ties | graph_teen)
+        + delete_space
+        + (pynini.closure(DAMO_ALPHA, 1) + (pynini.cross("ies", "y") | pynutil.delete("s")))
+        @ (graph_ties | pynini.cross("ten", "10"))
+        + pynutil.insert("s")
+    )
+    graph @= pynini.union("1", "2") + DAMO_DIGIT + DAMO_DIGIT + DAMO_DIGIT + "s"
+    return graph
+
+
+def _get_year_graph():
+    """
+    Transducer for year, e.g. twenty twenty -> 2020
+    """
+
+    def _get_digits_graph():
+        zero = pynini.cross((pynini.accep("oh") | pynini.accep("o")), "0")
+        graph = zero + delete_space + graph_digit
+        graph.optimize()
+        return graph
+
+    def _get_thousands_graph():
+        graph_ties = _get_ties_graph()
+        graph_hundred_component = (
+            graph_digit + delete_space + pynutil.delete("hundred")
+        ) | pynutil.insert("0")
+        graph = (
+            graph_digit
+            + delete_space
+            + pynutil.delete("thousand")
+            + delete_space
+            + graph_hundred_component
+            + delete_space
+            + (graph_teen | graph_ties)
+        )
+        return graph
+
+    graph_ties = _get_ties_graph()
+    graph_digits = _get_digits_graph()
+    graph_thousands = _get_thousands_graph()
+    year_graph = (
+        # 20 19, 40 12, 2012 - assuming no limit on the year
+        (graph_teen + delete_space + (graph_ties | graph_digits | graph_teen))
+        | (graph_ties + delete_space + (graph_ties | graph_digits | graph_teen))
+        | graph_thousands
+    )
+    year_graph.optimize()
+    return year_graph
+
+
+class DateFst(GraphFst):
+    """
+    Finite state transducer for classifying date,
+        e.g. january fifth twenty twelve -> date { month: "january" day: "5" year: "2012" preserve_order: true }
+        e.g. the fifth of january twenty twelve -> date { day: "5" month: "january" year: "2012" preserve_order: true }
+        e.g. twenty twenty -> date { year: "2020" preserve_order: true }
+
+    Args:
+        ordinal: OrdinalFst
+    """
+
+    def __init__(self, ordinal: GraphFst):
+        super().__init__(name="date", kind="classify")
+
+        ordinal_graph = ordinal.graph
+        year_graph = _get_year_graph()
+        # Small penalty on every year reading; it is cancelled below when the year follows a month.
+        YEAR_WEIGHT = 0.001
+        year_graph = pynutil.add_weight(year_graph, YEAR_WEIGHT)
+        month_graph = _get_month_graph()
+
+        month_graph = pynutil.insert('month: "') + month_graph + pynutil.insert('"')
+
+        # The day is an ordinal; the negative weight favours this reading.
+        day_graph = (
+            pynutil.insert('day: "') + pynutil.add_weight(ordinal_graph, -0.7) + pynutil.insert('"')
+        )
+        # Year that follows a month/day: adding -YEAR_WEIGHT offsets the penalty applied above.
+        graph_year = (
+            delete_extra_space
+            + pynutil.insert('year: "')
+            + pynutil.add_weight(year_graph, -YEAR_WEIGHT)
+            + pynutil.insert('"')
+        )
+        optional_graph_year = pynini.closure(
+            graph_year,
+            0,
+            1,
+        )
+        # month followed by: day | year | day + year
+        graph_mdy = month_graph + (
+            (delete_extra_space + day_graph)
+            | graph_year
+            | (delete_extra_space + day_graph + graph_year)
+        )
+        # "the <day> of <month> [<year>]"
+        graph_dmy = (
+            pynutil.delete("the")
+            + delete_space
+            + day_graph
+            + delete_space
+            + pynutil.delete("of")
+            + delete_extra_space
+            + month_graph
+            + optional_graph_year
+        )
+        # graph_year is rebound here to the standalone form (year or decade/century range).
+        # graph_mdy and optional_graph_year above were built from the previous value and are
+        # unaffected; this form keeps the YEAR_WEIGHT penalty on year_graph.
+        graph_year = (
+            pynutil.insert('year: "') + (year_graph | _get_range_graph()) + pynutil.insert('"')
+        )
+
+        final_graph = graph_mdy | graph_dmy | graph_year
+        final_graph += pynutil.insert(" preserve_order: true")
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/decimal.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/decimal.py
+import pynini
+from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
+from fun_text_processing.text_normalization.en.graph_utils import (
+    DAMO_DIGIT,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+
+def get_quantity(
+    decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike"
+) -> "pynini.FstLike":
+    """
+    Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral,
+    e.g. one million -> integer_part: "1" quantity: "million"
+    e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million"
+
+    Args:
+        decimal: decimal FST
+        cardinal_up_to_hundred: cardinal FST
+    """
+    numbers = cardinal_up_to_hundred @ (
+        pynutil.delete(pynini.closure("0"))
+        + pynini.difference(DAMO_DIGIT, "0")
+        + pynini.closure(DAMO_DIGIT)
+    )
+    suffix = pynini.union(
+        "million", "billion", "trillion", "quadrillion", "quintillion", "sextillion"
+    )
+    res = (
+        pynutil.insert('integer_part: "')
+        + numbers
+        + pynutil.insert('"')
+        + delete_extra_space
+        + pynutil.insert('quantity: "')
+        + suffix
+        + pynutil.insert('"')
+    )
+    res |= (
+        decimal
+        + delete_extra_space
+        + pynutil.insert('quantity: "')
+        + (suffix | "thousand")
+        + pynutil.insert('"')
+    )
+    return res
+
+
+class DecimalFst(GraphFst):
+    """
+    Finite state transducer for classifying decimal
+        e.g. minus twelve point five o o six billion -> decimal { negative: "true" integer_part: "12"  fractional_part: "5006" quantity: "billion" }
+        e.g. one billion -> decimal { integer_part: "1" quantity: "billion" }
+    Args:
+        cardinal: CardinalFst
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="decimal", kind="classify")
+
+        cardinal_graph = cardinal.graph_no_exception
+
+        graph_decimal = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
+        graph_decimal |= pynini.string_file(get_abs_path("data/numbers/zero.tsv")) | pynini.cross(
+            "o", "0"
+        )
+
+        graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal
+        self.graph = graph_decimal
+
+        point = pynutil.delete("point")
+
+        optional_graph_negative = pynini.closure(
+            pynutil.insert("negative: ") + pynini.cross("minus", '"true"') + delete_extra_space,
+            0,
+            1,
+        )
+
+        graph_fractional = (
+            pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"')
+        )
+        graph_integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')
+        final_graph_wo_sign = (
+            pynini.closure(graph_integer + delete_extra_space, 0, 1)
+            + point
+            + delete_extra_space
+            + graph_fractional
+        )
+        final_graph = optional_graph_negative + final_graph_wo_sign
+
+        self.final_graph_wo_negative = final_graph_wo_sign | get_quantity(
+            final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit
+        )
+        final_graph |= optional_graph_negative + get_quantity(
+            final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit
+        )
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/electronic.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/electronic.py
+import pynini
+from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
+from fun_text_processing.text_normalization.en.graph_utils import DAMO_ALPHA, GraphFst, insert_space
+from pynini.lib import pynutil
+
+
+class ElectronicFst(GraphFst):
+    """
+    Finite state transducer for classifying electronic: as URLs, email addresses, etc.
+        e.g. c d f one at a b c dot e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }
+
+    Two shapes are accepted: "<username> at <server> dot <domain>" (email) and a
+    URL-like form emitted under a single `protocol:` field.
+    """
+
+    def __init__(self):
+        super().__init__(name="electronic", kind="classify")
+
+        # Local helper: deletes exactly one space character.
+        delete_extra_space = pynutil.delete(" ")
+        # One letter, or one spoken digit/zero converted by the number tables.
+        alpha_num = (
+            DAMO_ALPHA
+            | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
+            | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
+        )
+
+        # symbols.tsv is inverted before use.
+        # NOTE(review): presumably the file maps symbol -> spoken name, so the
+        # inverted graph reads the spoken name and writes the symbol - confirm.
+        symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).invert()
+
+        accepted_username = alpha_num | symbols
+        process_dot = pynini.cross("dot", ".")
+        # Username: space-separated characters/symbols (spaces deleted), or - at a tiny
+        # penalty - an unspaced run of letters.
+        username = (
+            alpha_num + pynini.closure(delete_extra_space + accepted_username)
+        ) | pynutil.add_weight(pynini.closure(DAMO_ALPHA, 1), weight=0.0001)
+        username = pynutil.insert('username: "') + username + pynutil.insert('"')
+        # One or more spelled-out characters with the separating spaces removed.
+        single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num
+        server = single_alphanum | pynini.string_file(
+            get_abs_path("data/electronic/server_name.tsv")
+        )
+        domain = single_alphanum | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
+        # "<server> dot <domain>" -> domain: "<server>.<domain>"
+        domain_graph = (
+            pynutil.insert('domain: "')
+            + server
+            + delete_extra_space
+            + process_dot
+            + delete_extra_space
+            + domain
+            + pynutil.insert('"')
+        )
+        # Email: username, the word "at" (deleted), then the domain field.
+        graph = (
+            username
+            + delete_extra_space
+            + pynutil.delete("at")
+            + insert_space
+            + delete_extra_space
+            + domain_graph
+        )
+
+        ############# url ###
+        # "w w w" or "www" -> "www"
+        protocol_end = pynini.cross(pynini.union("w w w", "www"), "www")
+        # "h t t p[ s] colon slash slash " -> "http[s]://"
+        protocol_start = (
+            pynini.cross("h t t p", "http") | pynini.cross("h t t p s", "https")
+        ) + pynini.cross(" colon slash slash ", "://")
+        # .com,
+        # One trailing segment: a symbol followed by a domain or by a run of
+        # characters/symbols, with the separating spaces deleted.
+        ending = (
+            delete_extra_space
+            + symbols
+            + delete_extra_space
+            + (
+                domain
+                | pynini.closure(
+                    accepted_username + delete_extra_space,
+                )
+                + accepted_username
+            )
+        )
+
+        # Host part (spelled characters, a server name, or - at a tiny penalty - an
+        # unspaced run of letters) followed by at least one `ending` segment.
+        protocol_default = (
+            (
+                (pynini.closure(delete_extra_space + accepted_username, 1) | server)
+                | pynutil.add_weight(pynini.closure(DAMO_ALPHA, 1), weight=0.0001)
+            )
+            + pynini.closure(ending, 1)
+        ).optimize()
+        # Optional "http(s)://", then mandatory "www" + ".", then the host and endings.
+        protocol = (
+            pynini.closure(protocol_start, 0, 1)
+            + protocol_end
+            + delete_extra_space
+            + process_dot
+            + protocol_default
+        ).optimize()
+
+        # Alternative without the scheme, where the "www." prefix is optional as well.
+        protocol |= (
+            pynini.closure(protocol_end + delete_extra_space + process_dot, 0, 1) + protocol_default
+        )
+
+        protocol = pynutil.insert('protocol: "') + protocol.optimize() + pynutil.insert('"')
+        graph |= protocol
+        ########
+
+        final_graph = self.add_tokens(graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/fraction.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/fraction.py
+from fun_text_processing.text_normalization.en.graph_utils import GraphFst
+
+
+class FractionFst(GraphFst):
+    """
+    Finite state transducer for classifying fraction
+
+    NOTE: placeholder - the constructor only registers the name and kind with
+    GraphFst; it builds no graph and does not assign ``self.fst``.
+    """
+
+    def __init__(self):
+        super().__init__(name="fraction", kind="classify")
+        # integer_part # numerator # denominator
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/measure.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/measure.py
+import pynini
+from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
+from fun_text_processing.text_normalization.en.graph_utils import (
+    DAMO_SIGMA,
+    GraphFst,
+    convert_space,
+    delete_extra_space,
+    delete_space,
+    get_singulars,
+)
+from pynini.lib import pynutil
+
+
+class MeasureFst(GraphFst):
+    """
+    Finite state transducer for classifying measure
+        e.g. minus twelve kilograms -> measure { negative: "true" cardinal { integer: "12" } units: "kg" }
+
+    Args:
+        cardinal: CardinalFst
+        decimal: DecimalFst
+    """
+
+    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
+        super().__init__(name="measure", kind="classify")
+
+        cardinal_graph = cardinal.graph_no_exception
+
+        graph_unit = pynini.string_file(get_abs_path("data/measurements.tsv"))
+        graph_unit_singular = pynini.invert(graph_unit)  # singular -> abbr
+        graph_unit_plural = get_singulars(graph_unit_singular)  # plural -> abbr
+
+        optional_graph_negative = pynini.closure(
+            pynutil.insert("negative: ") + pynini.cross("minus", '"true"') + delete_extra_space,
+            0,
+            1,
+        )
+
+        unit_singular = convert_space(graph_unit_singular)
+        unit_plural = convert_space(graph_unit_plural)
+        unit_misc = (
+            pynutil.insert("/")
+            + pynutil.delete("per")
+            + delete_space
+            + convert_space(graph_unit_singular)
+        )
+
+        unit_singular = (
+            pynutil.insert('units: "')
+            + (
+                unit_singular
+                | unit_misc
+                | pynutil.add_weight(unit_singular + delete_space + unit_misc, 0.01)
+            )
+            + pynutil.insert('"')
+        )
+        unit_plural = (
+            pynutil.insert('units: "')
+            + (
+                unit_plural
+                | unit_misc
+                | pynutil.add_weight(unit_plural + delete_space + unit_misc, 0.01)
+            )
+            + pynutil.insert('"')
+        )
+
+        subgraph_decimal = (
+            pynutil.insert("decimal { ")
+            + optional_graph_negative
+            + decimal.final_graph_wo_negative
+            + pynutil.insert(" }")
+            + delete_extra_space
+            + unit_plural
+        )
+        subgraph_cardinal = (
+            pynutil.insert("cardinal { ")
+            + optional_graph_negative
+            + pynutil.insert('integer: "')
+            + ((DAMO_SIGMA - "one") @ cardinal_graph)
+            + pynutil.insert('"')
+            + pynutil.insert(" }")
+            + delete_extra_space
+            + unit_plural
+        )
+        subgraph_cardinal |= (
+            pynutil.insert("cardinal { ")
+            + optional_graph_negative
+            + pynutil.insert('integer: "')
+            + pynini.cross("one", "1")
+            + pynutil.insert('"')
+            + pynutil.insert(" }")
+            + delete_extra_space
+            + unit_singular
+        )
+        final_graph = subgraph_decimal | subgraph_cardinal
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/money.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/money.py
+import pynini
+from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
+from fun_text_processing.text_normalization.en.graph_utils import (
+    DAMO_DIGIT,
+    DAMO_NOT_SPACE,
+    DAMO_SIGMA,
+    GraphFst,
+    convert_space,
+    delete_extra_space,
+    delete_space,
+    get_singulars,
+    insert_space,
+)
+from pynini.lib import pynutil
+
+
+class MoneyFst(GraphFst):
+    """
+    Finite state transducer for classifying money
+        e.g. twelve dollars and five cents -> money { integer_part: "12" currency: "$" fractional_part: "05" }
+
+    Args:
+        cardinal: CardinalFst
+        decimal: DecimalFst
+    """
+
+    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
+        super().__init__(name="money", kind="classify")
+        # quantity, integer_part, fractional_part, currency
+
+        cardinal_graph = cardinal.graph_no_exception
+        # add support for missing hundred (only for 3 digit numbers)
+        # "one fifty" -> "one hundred fifty"
+        # "hundred " is inserted after the first space, and the rewritten text must
+        # then read as a cardinal with exactly three output digits.
+        with_hundred = pynini.compose(
+            pynini.closure(DAMO_NOT_SPACE)
+            + pynini.accep(" ")
+            + pynutil.insert("hundred ")
+            + DAMO_SIGMA,
+            pynini.compose(cardinal_graph, DAMO_DIGIT**3),
+        )
+        cardinal_graph |= with_hundred
+        graph_decimal_final = decimal.final_graph_wo_negative
+
+        # currency.tsv is inverted before use.
+        # NOTE(review): presumably the file maps symbol -> spoken singular name - confirm.
+        unit = pynini.string_file(get_abs_path("data/currency.tsv"))
+        unit_singular = pynini.invert(unit)
+        unit_plural = get_singulars(unit_singular)
+
+        graph_unit_singular = (
+            pynutil.insert('currency: "') + convert_space(unit_singular) + pynutil.insert('"')
+        )
+        graph_unit_plural = (
+            pynutil.insert('currency: "') + convert_space(unit_plural) + pynutil.insert('"')
+        )
+
+        # Accepts exactly two digits unchanged, or one digit with a "0" put in front.
+        add_leading_zero_to_double_digit = (DAMO_DIGIT + DAMO_DIGIT) | (
+            pynutil.insert("0") + DAMO_DIGIT
+        )
+        # twelve dollars (and) fifty cents, zero cents
+        # Precedence note: `@` binds tighter than `+`, so in the first union member only the
+        # weighted cardinal is composed with add_leading_zero_to_double_digit; the
+        # "cents"/"cent" deletion is concatenated after that composition.
+        cents_standalone = (
+            pynutil.insert('fractional_part: "')
+            + pynini.union(
+                pynutil.add_weight(((DAMO_SIGMA - "one") @ cardinal_graph), -0.7)
+                @ add_leading_zero_to_double_digit
+                + delete_space
+                + (pynutil.delete("cents") | pynutil.delete("cent")),
+                pynini.cross("one", "01") + delete_space + pynutil.delete("cent"),
+            )
+            + pynutil.insert('"')
+        )
+
+        # Optional "[and] <n> cent(s)" after the currency word.
+        optional_cents_standalone = pynini.closure(
+            delete_space
+            + pynini.closure(pynutil.delete("and") + delete_space, 0, 1)
+            + insert_space
+            + cents_standalone,
+            0,
+            1,
+        )
+        # twelve dollars fifty, only after integer
+        optional_cents_suffix = pynini.closure(
+            delete_extra_space
+            + pynutil.insert('fractional_part: "')
+            + pynutil.add_weight(cardinal_graph @ add_leading_zero_to_double_digit, -0.7)
+            + pynutil.insert('"'),
+            0,
+            1,
+        )
+
+        # Any amount other than "one" takes the plural currency word ...
+        graph_integer = (
+            pynutil.insert('integer_part: "')
+            + ((DAMO_SIGMA - "one") @ cardinal_graph)
+            + pynutil.insert('"')
+            + delete_extra_space
+            + graph_unit_plural
+            + (optional_cents_standalone | optional_cents_suffix)
+        )
+        # ... while "one" takes the singular currency word.
+        graph_integer |= (
+            pynutil.insert('integer_part: "')
+            + pynini.cross("one", "1")
+            + pynutil.insert('"')
+            + delete_extra_space
+            + graph_unit_singular
+            + (optional_cents_standalone | optional_cents_suffix)
+        )
+        graph_decimal = graph_decimal_final + delete_extra_space + graph_unit_plural
+        # Cents on their own: the currency is hard-coded to "$" with integer part "0".
+        graph_decimal |= pynutil.insert('currency: "$" integer_part: "0" ') + cents_standalone
+        final_graph = graph_integer | graph_decimal
+
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/ordinal.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/ordinal.py
+import pynini
+from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
+from fun_text_processing.text_normalization.en.graph_utils import DAMO_CHAR, GraphFst
+from pynini.lib import pynutil
+
+
+class OrdinalFst(GraphFst):
+    """
+    Finite state transducer for classifying ordinal
+        e.g. thirteenth -> ordinal { integer: "13" }
+
+    Args:
+        cardinal: CardinalFst
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="ordinal", kind="classify")
+
+        cardinal_graph = cardinal.graph_no_exception
+        graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))
+        graph_teens = pynini.string_file(get_abs_path("data/ordinals/teen.tsv"))
+        graph = pynini.closure(DAMO_CHAR) + pynini.union(
+            graph_digit, graph_teens, pynini.cross("tieth", "ty"), pynini.cross("th", "")
+        )
+
+        self.graph = graph @ cardinal_graph
+        final_graph = pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/punctuation.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/punctuation.py
+import pynini
+from fun_text_processing.text_normalization.en.graph_utils import GraphFst
+from pynini.lib import pynutil
+
+
+class PunctuationFst(GraphFst):
+    """
+    Finite state transducer for classifying punctuation
+        e.g. a, -> tokens { name: "a" } tokens { name: "," }
+    """
+
+    def __init__(self):
+        super().__init__(name="punctuation", kind="classify")
+
+        s = "!#$%&'()*+,-./:;<=>?@^_`{|}~"
+        punct = pynini.union(*s)
+
+        graph = pynutil.insert('name: "') + punct + pynutil.insert('"')
+
+        self.fst = graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/telephone.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/telephone.py
+import pynini
+from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
+from fun_text_processing.text_normalization.en.graph_utils import (
+    DAMO_ALNUM,
+    DAMO_ALPHA,
+    DAMO_DIGIT,
+    GraphFst,
+    insert_space,
+)
+from pynini.lib import pynutil
+
+
+def get_serial_number(cardinal):
+    """
+    any alphanumerical character sequence with at least one number with length greater equal to 3
+    """
+    digit = pynini.compose(cardinal.graph_no_exception, DAMO_DIGIT)
+    character = digit | DAMO_ALPHA
+    sequence = character + pynini.closure(pynutil.delete(" ") + character, 2)
+    sequence = sequence @ (pynini.closure(DAMO_ALNUM) + DAMO_DIGIT + pynini.closure(DAMO_ALNUM))
+    return sequence.optimize()
+
+
+class TelephoneFst(GraphFst):
+    """
+    Finite state transducer for classifying telephone numbers, e.g.
+        one two three one two three five six seven eight -> { number_part: "123-123-5678" }
+
+    This class also supports credit card numbers, SSN-style numbers, IP addresses and
+    serial numbers; every variant is emitted in the `number_part` field.
+        "one two three dot one double three dot o dot four o" -> { number_part: "123.133.0.40"}
+
+        "three two double seven three two one four three two one four three double zero five" ->
+            { number_part: 3277 3214 3214 3005}
+
+    Args:
+        cardinal: CardinalFst
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="telephone", kind="classify")
+        # country code, number_part, extension
+        # Written digit -> spoken word. "0" additionally maps to "o", "oh" and "zero".
+        # NOTE(review): digit.tsv is not visible here; the inversion suggests it stores
+        # spoken word -> digit -- confirm.
+        digit_to_str = (
+            pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize())
+            | pynini.cross("0", pynini.union("o", "oh", "zero")).optimize()
+        )
+
+        # Spoken word -> written digit.
+        str_to_digit = pynini.invert(digit_to_str)
+
+        # For every digit 0-9: "<word> <word>" -> "double <word>" ...
+        double_digit = pynini.union(
+            *[
+                pynini.cross(
+                    pynini.project(str(i) @ digit_to_str, "output")
+                    + pynini.accep(" ")
+                    + pynini.project(str(i) @ digit_to_str, "output"),
+                    pynutil.insert("double ") + pynini.project(str(i) @ digit_to_str, "output"),
+                )
+                for i in range(10)
+            ]
+        )
+        # ... then flipped so that "double <word>" expands to "<word> <word>". The return value
+        # is discarded, so this relies on invert() mutating the FST in place.
+        double_digit.invert()
+
+        # to handle cases like "one twenty three"
+        # Cardinal readings are kept only when their output is exactly two digits.
+        two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, DAMO_DIGIT**2)
+        # Two written digits, read either from "double <word>" or from a two-digit cardinal.
+        double_digit_to_digit = (
+            pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit)
+            | two_digit_cardinal
+        )
+
+        # One element: a two-digit reading (given a small negative weight) or a single digit.
+        single_or_double_digit = (
+            pynutil.add_weight(double_digit_to_digit, -0.0001) | str_to_digit
+        ).optimize()
+        # Extend to a space-separated sequence of elements; the separating spaces are deleted
+        # and each further element adds a small positive weight.
+        single_or_double_digit |= (
+            single_or_double_digit
+            + pynini.closure(
+                pynutil.add_weight(pynutil.delete(" ") + single_or_double_digit, 0.0001)
+            )
+        ).optimize()
+
+        # Telephone number: restrict the digit sequence to the 3-3-4 shape and insert dashes.
+        number_part = pynini.compose(
+            single_or_double_digit,
+            DAMO_DIGIT**3
+            + pynutil.insert("-")
+            + DAMO_DIGIT**3
+            + pynutil.insert("-")
+            + DAMO_DIGIT**4,
+        ).optimize()
+        number_part = (
+            pynutil.insert('number_part: "') + number_part.optimize() + pynutil.insert('"')
+        )
+
+        # Digit sequences whose written form has two or three digits.
+        cardinal_option = pynini.compose(single_or_double_digit, DAMO_DIGIT ** (2, 3))
+
+        # Country code: optional "plus " -> "+", then one to three spoken digits or a
+        # two/three digit sequence from cardinal_option.
+        country_code = (
+            pynutil.insert('country_code: "')
+            + pynini.closure(pynini.cross("plus ", "+"), 0, 1)
+            + (
+                (pynini.closure(str_to_digit + pynutil.delete(" "), 0, 2) + str_to_digit)
+                | cardinal_option
+            )
+            + pynutil.insert('"')
+        )
+
+        # The country code is optional and is only combined with the 3-3-4 telephone pattern.
+        optional_country_code = pynini.closure(
+            country_code + pynutil.delete(" ") + insert_space, 0, 1
+        ).optimize()
+        graph = optional_country_code + number_part
+
+        # credit card number
+        # Sixteen digits written as four groups of four; insert_space is used as the
+        # group separator.
+        space_four_digits = insert_space + DAMO_DIGIT**4
+        credit_card_graph = pynini.compose(
+            single_or_double_digit, DAMO_DIGIT**4 + space_four_digits**3
+        ).optimize()
+        graph |= (
+            pynutil.insert('number_part: "') + credit_card_graph.optimize() + pynutil.insert('"')
+        )
+
+        # SSN
+        # Nine digits in the 3-2-4 shape with inserted dashes.
+        ssn_graph = pynini.compose(
+            single_or_double_digit,
+            DAMO_DIGIT**3
+            + pynutil.insert("-")
+            + DAMO_DIGIT**2
+            + pynutil.insert("-")
+            + DAMO_DIGIT**4,
+        ).optimize()
+        graph |= pynutil.insert('number_part: "') + ssn_graph.optimize() + pynutil.insert('"')
+
+        # ip
+        # One IP group: an optional single digit before a two-digit reading, ...
+        digit_or_double = (
+            pynini.closure(str_to_digit + pynutil.delete(" "), 0, 1) + double_digit_to_digit
+        )
+        # ... a two-digit reading with an optional single digit after it, ...
+        digit_or_double |= double_digit_to_digit + pynini.closure(
+            pynutil.delete(" ") + str_to_digit, 0, 1
+        )
+        # ... one to three individually spoken digits, ...
+        digit_or_double |= str_to_digit + (pynutil.delete(" ") + str_to_digit) ** (0, 2)
+        # ... or any two/three digit sequence from cardinal_option.
+        digit_or_double |= cardinal_option
+        digit_or_double = digit_or_double.optimize()
+
+        # Four groups joined by the spoken " dot ", which is rewritten to ".".
+        ip_graph = digit_or_double + (pynini.cross(" dot ", ".") + digit_or_double) ** 3
+
+        graph |= pynutil.insert('number_part: "') + ip_graph.optimize() + pynutil.insert('"')
+        # Serial numbers are added with a small positive weight on top of the other variants.
+        graph |= (
+            pynutil.insert('number_part: "')
+            + pynutil.add_weight(get_serial_number(cardinal=cardinal), weight=0.0001)
+            + pynutil.insert('"')
+        )
+
+        final_graph = self.add_tokens(graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/time.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/time.py
+import pynini
+from fun_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst
+from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path, num_to_word
+from fun_text_processing.text_normalization.en.graph_utils import (
+    GraphFst,
+    convert_space,
+    delete_extra_space,
+    delete_space,
+    insert_space,
+)
+from pynini.lib import pynutil
+
+
+class TimeFst(GraphFst):
+    """
+    Finite state transducer for classifying time
+        e.g. twelve thirty -> time { hours: "12" minutes: "30" }
+        e.g. twelve past one -> time { minutes: "12" hours: "1" }
+        e.g. two o clock a m -> time { hours: "2" minutes: "00" suffix: "a.m." }
+        e.g. quarter to two -> time { minutes: "45" hours: "1" }
+        e.g. quarter past two -> time { minutes: "15" hours: "2" }
+        e.g. half past two -> time { minutes: "30" hours: "2" }
+    """
+
+    def __init__(self):
+        super().__init__(name="time", kind="classify")
+        # hours, minutes, seconds, suffix, zone, style, speak_period
+
+        suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))
+        # time_zone.tsv stores "cst<TAB>c s t"; inverted so the spelled-out form maps to the code.
+        time_zone_graph = pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
+        # to_hour.tsv maps a spoken hour to the previous hour ("one" -> 12, "two" -> 1, ...),
+        # used for "<minutes> to <hour>" readings.
+        # NOTE(review): the committed to_hour.tsv lists "eigh" rather than "eight", so
+        # "... to eight" would presumably not match -- confirm and fix the data file.
+        to_hour_graph = pynini.string_file(get_abs_path("data/time/to_hour.tsv"))
+        # NOTE(review): minute_to.tsv is not visible here; presumably it maps m to 60 - m -- confirm.
+        minute_to_graph = pynini.string_file(get_abs_path("data/time/minute_to.tsv"))
+
+        # only used for < 1000 thousand -> 0 weight
+        cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)
+
+        # Spoken forms of the values accepted as hours (0-23) and minutes (1-9, 10-59).
+        labels_hour = [num_to_word(x) for x in range(0, 24)]
+        labels_minute_single = [num_to_word(x) for x in range(1, 10)]
+        labels_minute_double = [num_to_word(x) for x in range(10, 60)]
+
+        # Restrict the cardinal grammar to the spoken forms listed above.
+        graph_hour = pynini.union(*labels_hour) @ cardinal
+
+        graph_minute_single = pynini.union(*labels_minute_single) @ cardinal
+        graph_minute_double = pynini.union(*labels_minute_double) @ cardinal
+        graph_minute_verbose = pynini.cross("half", "30") | pynini.cross("quarter", "15")
+        # Every spelling of "o'clock" is rewritten to the empty string.
+        oclock = pynini.cross(pynini.union("o' clock", "o clock", "o'clock", "oclock"), "")
+
+        final_graph_hour = pynutil.insert('hours: "') + graph_hour + pynutil.insert('"')
+        # Minutes after an hour: "o'clock" -> "00", "o <1-9>" -> the single digit value,
+        # or a plain 10-59 reading.
+        graph_minute = (
+            oclock + pynutil.insert("00")
+            | pynutil.delete("o") + delete_space + graph_minute_single
+            | graph_minute_double
+        )
+        final_suffix = (
+            pynutil.insert('suffix: "') + convert_space(suffix_graph) + pynutil.insert('"')
+        )
+        # The suffix field is preceded by delete_space + insert_space so it is separated from
+        # the previous field.
+        final_suffix = delete_space + insert_space + final_suffix
+        final_suffix_optional = pynini.closure(final_suffix, 0, 1)
+        # Optional time zone field, built the same way as the suffix field.
+        final_time_zone_optional = pynini.closure(
+            delete_space
+            + insert_space
+            + pynutil.insert('zone: "')
+            + convert_space(time_zone_graph)
+            + pynutil.insert('"'),
+            0,
+            1,
+        )
+
+        # five o' clock
+        # two o eight, two thirty five (am/pm)
+        # two pm/am
+        graph_hm = (
+            final_graph_hour
+            + delete_extra_space
+            + pynutil.insert('minutes: "')
+            + graph_minute
+            + pynutil.insert('"')
+        )
+        # 10 past four, quarter past four, half past four
+        # Note that the minutes field is emitted before the hours field here.
+        graph_m_past_h = (
+            pynutil.insert('minutes: "')
+            + pynini.union(graph_minute_single, graph_minute_double, graph_minute_verbose)
+            + pynutil.insert('"')
+            + delete_space
+            + pynutil.delete("past")
+            + delete_extra_space
+            + final_graph_hour
+        )
+
+        # "quarter to/till <hour>": minutes are fixed to 45 and the hour comes from to_hour_graph.
+        graph_quarter_time = (
+            pynutil.insert('minutes: "')
+            + pynini.cross("quarter", "45")
+            + pynutil.insert('"')
+            + delete_space
+            + pynutil.delete(pynini.union("to", "till"))
+            + delete_extra_space
+            + pynutil.insert('hours: "')
+            + to_hour_graph
+            + pynutil.insert('"')
+        )
+
+        # "<minutes> [min|mins|minute|minutes] to/till <hour> <suffix>": the minute value is
+        # passed through minute_to_graph and the suffix is mandatory for this pattern.
+        graph_m_to_h_suffix_time = (
+            pynutil.insert('minutes: "')
+            + ((graph_minute_single | graph_minute_double).optimize() @ minute_to_graph)
+            + pynutil.insert('"')
+            + pynini.closure(
+                delete_space + pynutil.delete(pynini.union("min", "mins", "minute", "minutes")),
+                0,
+                1,
+            )
+            + delete_space
+            + pynutil.delete(pynini.union("to", "till"))
+            + delete_extra_space
+            + pynutil.insert('hours: "')
+            + to_hour_graph
+            + pynutil.insert('"')
+            + final_suffix
+        )
+
+        # Hour with a mandatory suffix; when no minutes are spoken, "00" is inserted.
+        graph_h = (
+            final_graph_hour
+            + delete_extra_space
+            + pynutil.insert('minutes: "')
+            + (pynutil.insert("00") | graph_minute)
+            + pynutil.insert('"')
+            + final_suffix
+            + final_time_zone_optional
+        )
+        # Patterns that are valid on their own take an optional suffix and time zone.
+        final_graph = (
+            (graph_hm | graph_m_past_h | graph_quarter_time)
+            + final_suffix_optional
+            + final_time_zone_optional
+        )
+        final_graph |= graph_h
+        final_graph |= graph_m_to_h_suffix_time
+
+        final_graph = self.add_tokens(final_graph.optimize())
+
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/tokenize_and_classify.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/tokenize_and_classify.py
+import os
+
+import pynini
+from fun_text_processing.inverse_text_normalization.en.taggers.cardinal import CardinalFst
+from fun_text_processing.inverse_text_normalization.en.taggers.date import DateFst
+from fun_text_processing.inverse_text_normalization.en.taggers.decimal import DecimalFst
+from fun_text_processing.inverse_text_normalization.en.taggers.electronic import ElectronicFst
+from fun_text_processing.inverse_text_normalization.en.taggers.measure import MeasureFst
+from fun_text_processing.inverse_text_normalization.en.taggers.money import MoneyFst
+from fun_text_processing.inverse_text_normalization.en.taggers.ordinal import OrdinalFst
+from fun_text_processing.inverse_text_normalization.en.taggers.punctuation import PunctuationFst
+from fun_text_processing.inverse_text_normalization.en.taggers.telephone import TelephoneFst
+from fun_text_processing.inverse_text_normalization.en.taggers.time import TimeFst
+from fun_text_processing.inverse_text_normalization.en.taggers.whitelist import WhiteListFst
+from fun_text_processing.inverse_text_normalization.en.taggers.word import WordFst
+from fun_text_processing.text_normalization.en.graph_utils import (
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+from pynini.lib import pynutil
+
+import logging
+
+
+class ClassifyFst(GraphFst):
+    """
+    Final class that composes all other classification grammars. This class can process an entire sentence that is lower cased.
+    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
+    More details to deployment at NeMo/tools/text_processing_deployment.
+
+    Args:
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+    """
+
+    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
+        super().__init__(name="tokenize_and_classify", kind="classify")
+
+        # The literal string "None" is treated the same as None: no cache is used.
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, "_en_itn.far")
+
+        # Reuse a previously exported grammar unless the caller asked for a rebuild.
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
+            logging.info("ClassifyFst.fst was restored from %s.", far_file)
+            return
+
+        logging.info("Creating ClassifyFst grammars.")
+        cardinal = CardinalFst()
+        cardinal_graph = cardinal.fst
+
+        ordinal = OrdinalFst(cardinal)
+        ordinal_graph = ordinal.fst
+
+        decimal = DecimalFst(cardinal)
+        decimal_graph = decimal.fst
+
+        measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
+        date_graph = DateFst(ordinal=ordinal).fst
+        word_graph = WordFst().fst
+        time_graph = TimeFst().fst
+        money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
+        whitelist_graph = WhiteListFst().fst
+        punct_graph = PunctuationFst().fst
+        electronic_graph = ElectronicFst().fst
+        telephone_graph = TelephoneFst(cardinal).fst
+
+        # The whitelist gets the smallest weight and the catch-all word graph the largest,
+        # so a plain word is only chosen when no specialised grammar matches.
+        classify = (
+            pynutil.add_weight(whitelist_graph, 1.01)
+            | pynutil.add_weight(time_graph, 1.1)
+            | pynutil.add_weight(date_graph, 1.09)
+            | pynutil.add_weight(decimal_graph, 1.1)
+            | pynutil.add_weight(measure_graph, 1.1)
+            | pynutil.add_weight(cardinal_graph, 1.1)
+            | pynutil.add_weight(ordinal_graph, 1.1)
+            | pynutil.add_weight(money_graph, 1.1)
+            | pynutil.add_weight(telephone_graph, 1.1)
+            | pynutil.add_weight(electronic_graph, 1.1)
+            | pynutil.add_weight(word_graph, 100)
+        )
+
+        # Each classified span or punctuation mark is wrapped as `tokens { ... }`.
+        punct = (
+            pynutil.insert("tokens { ")
+            + pynutil.add_weight(punct_graph, weight=1.1)
+            + pynutil.insert(" }")
+        )
+        token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
+        # A token may carry any number of punctuation tokens before and after it.
+        token_plus_punct = (
+            pynini.closure(punct + pynutil.insert(" "))
+            + token
+            + pynini.closure(pynutil.insert(" ") + punct)
+        )
+
+        # A sentence is one or more tokens; delete_space is applied at both ends.
+        graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
+        graph = delete_space + graph + delete_space
+
+        self.fst = graph.optimize()
+
+        # Export the freshly built grammar when a cache location was given.
+        if far_file:
+            generator_main(far_file, {"tokenize_and_classify": self.fst})
+            logging.info("ClassifyFst grammars are saved to %s.", far_file)
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/whitelist.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/whitelist.py
+import pynini
+from fun_text_processing.inverse_text_normalization.en.utils import get_abs_path
+from fun_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space
+from pynini.lib import pynutil
+
+
+class WhiteListFst(GraphFst):
+    """
+    Finite state transducer for classifying whitelisted tokens
+        e.g. misses -> tokens { name: "mrs." }
+    The entries come from "data/whitelist.tsv", which lists the written form followed by its spoken form.
+    This class has highest priority among all classifier grammars.
+    """
+
+    def __init__(self):
+        super().__init__(name="whitelist", kind="classify")
+
+        # The file stores written -> spoken; flip it so the spoken form is the input side.
+        written_to_spoken = pynini.string_file(get_abs_path("data/whitelist.tsv"))
+        spoken_to_written = pynini.invert(written_to_spoken)
+
+        opening = pynutil.insert('name: "')
+        closing = pynutil.insert('"')
+        self.fst = (opening + convert_space(spoken_to_written) + closing).optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/word.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/taggers/word.py
+import pynini
+from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_SPACE, GraphFst
+from pynini.lib import pynutil
+
+
+class WordFst(GraphFst):
+    """
+    Finite state transducer for plain tokens that belong to no special class; it acts as the default class.
+        e.g. sleep -> tokens { name: "sleep" }
+    """
+
+    def __init__(self):
+        super().__init__(name="word", kind="classify")
+        # A word is one or more DAMO_NOT_SPACE symbols, copied through unchanged.
+        non_space_run = pynini.closure(DAMO_NOT_SPACE, 1)
+        tagged_word = pynutil.insert('name: "') + non_space_run + pynutil.insert('"')
+        self.fst = tagged_word.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/utils.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/utils.py
+import os
+from typing import Union
+
+import inflect
+
+_inflect = inflect.engine()
+
+
+def num_to_word(x: Union[str, int]):
+    """
+    Spell out an integer as spoken words.
+
+    Args
+        x: value to convert; only ``int`` values are converted
+
+    Returns: the spoken representation with hyphens replaced by spaces and commas removed,
+        or ``x`` itself, unchanged, when it is not an ``int``
+    """
+    if not isinstance(x, int):
+        return x
+    spoken = _inflect.number_to_words(str(x))
+    return spoken.replace("-", " ").replace(",", "")
+
+
+def get_abs_path(rel_path):
+    """
+    Resolve a path given relative to the directory that contains this file.
+
+    Args:
+        rel_path: relative path to this file
+
+    Returns absolute path, built as "<directory of this file>/<rel_path>"
+    """
+    module_dir = os.path.dirname(os.path.abspath(__file__))
+    return "/".join((module_dir, rel_path))
--- a/FunASR/fun_text_processing/inverse_text_normalization/en/verbalizers/__init__.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/en/verbalizers/__init__.py
+