initial commit

70a8a9e0 · wangwei990215 · 70a8a9e0 · 70a8a9e0 · 70a8a9e0 · 70a8a9e0
Commit 70a8a9e0 authored Oct 03, 2024 by wangwei990215
20 changed files
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/fraction.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/fraction.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    DAMO_CHAR,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+)
+from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
+from pynini.lib import pynutil
+
+
+class FractionFst(GraphFst):
+    """
+    Finite state transducer for classifying fraction
+        e.g. demi -> tokens { fraction { numerator: "1" denominator: "2" } }
+        e.g. un et demi -> tokens { fraction { integer_part: "1" numerator: "1" denominator: "2" } }
+        e.g. trois et deux centième -> tokens { fraction { integer_part: "3" numerator: "2" denominator: "100" } }
+
+    An optional leading "moins" is tagged as negative: "true".
+
+    Args:
+        cardinal: CardinalFst (its `graph_no_exception` graph is reused for every numeric part)
+
+    Attributes:
+        fractional: denominator graph (word with its ordinal-style ending stripped, read through the cardinal graph)
+        graph_fraction_component: numerator + denominator, or a bare "demi"
+        final_graph_wo_negative: optional integer part + fraction component, without sign or token wrapper
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="fraction", kind="classify")
+        # integer_part # numerator # denominator
+
+        graph_cardinal = cardinal.graph_no_exception
+        # Ending rewrites loaded from data/fractions.tsv; presumably the endings whose
+        # root changes when the ordinal suffix is added -- verify against the data file.
+        graph_strip_undo_root_change = pynini.string_file(
+            get_abs_path("data/fractions.tsv")
+        )  # add in absolute path
+
+        graph_strip_no_root_change = pynutil.delete("ième")  # For no change to root
+        graph_strip_no_root_change += pynutil.delete("s").ques  # for plurals
+
+        graph_strip = graph_strip_no_root_change | graph_strip_undo_root_change
+
+        # Any run of characters followed by a stripped ending; composing with the cardinal
+        # graph keeps only the strings whose remaining stem is a valid cardinal.
+        self.fractional = ((pynini.closure(DAMO_CHAR) + graph_strip) @ graph_cardinal).optimize()
+
+        # Integer part: tagged cardinal followed by the (deleted) conjunction "et".
+        integer = pynutil.insert('integer_part: "') + graph_cardinal + pynutil.insert('" ')
+        integer += delete_space
+        integer += pynutil.delete("et")  # used to demarcate integer and fractional parts
+
+        numerator = pynutil.insert('numerator: "') + graph_cardinal + pynutil.insert('"')
+        denominator = pynutil.insert(' denominator: "') + self.fractional + pynutil.insert('"')
+
+        # Demi (half) can occur alone without explicit numerator.
+        # Accepts "demi", "demie", "demis", "demies"; the word itself is deleted.
+        graph_demi_component = (
+            pynutil.delete("demi") + pynutil.delete("e").ques + pynutil.delete("s").ques
+        )
+        graph_demi_component += pynutil.insert('numerator: "1" denominator: "2"')
+
+        graph_fraction_component = numerator + delete_space + denominator
+        graph_fraction_component |= graph_demi_component
+        self.graph_fraction_component = graph_fraction_component
+
+        # The integer part (with its trailing "et") is optional.
+        graph = pynini.closure(integer + delete_space, 0, 1) + graph_fraction_component
+        graph = graph.optimize()
+        self.final_graph_wo_negative = graph
+
+        # Optional sign: "moins" is rewritten to negative: "true".
+        optional_graph_negative = pynini.closure(
+            pynutil.insert("negative: ") + pynini.cross("moins", '"true"') + delete_extra_space,
+            0,
+            1,
+        )
+        graph = optional_graph_negative + graph
+        final_graph = self.add_tokens(graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/measure.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/measure.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    get_singulars,
+)
+from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
+from pynini.lib import pynutil
+
+
+class MeasureFst(GraphFst):
+    """
+    Finite state transducer for classifying measure. Allows for plural form for unit.
+        e.g. moins onze kilogramme -> measure { cardinal { negative: "true" integer: "11" } units: "kg" }
+        e.g. trois heures -> measure { cardinal { integer: "3" } units: "h" }
+        e.g. demi gramme -> measure { fraction { numerator: "1" denominator: "2" } units: "g" }
+
+    The negative tag is emitted inside the cardinal / decimal / fraction group.
+
+    Args:
+        cardinal: CardinalFst
+        decimal: DecimalFst
+        fraction: FractionFst
+    """
+
+    def __init__(self, cardinal: GraphFst, decimal: GraphFst, fraction: GraphFst):
+        super().__init__(name="measure", kind="classify")
+
+        cardinal_graph = cardinal.graph_no_exception
+
+        graph_prefix = pynini.string_file(get_abs_path("data/measurements/magnitudes.tsv"))
+        graph_unit_singular = pynini.string_file(get_abs_path("data/measurements/measurements.tsv"))
+
+        # Unit as listed in the data file or as produced by get_singulars, with an
+        # optional magnitude prefix in front.
+        unit = get_singulars(graph_unit_singular) | graph_unit_singular
+        unit = graph_prefix.ques + unit
+
+        # Optional sign: "moins" is rewritten to negative: "true".
+        optional_graph_negative = pynini.closure(
+            pynutil.insert("negative: ") + pynini.cross("moins", '"true"') + delete_extra_space,
+            0,
+            1,
+        )
+
+        # "par <unit>" / "à <unit>" becomes "/<unit>".
+        unit_misc = (
+            pynutil.insert("/")
+            + (pynutil.delete("par") | pynutil.delete("à"))
+            + delete_space
+            + unit
+        )
+
+        # A plain unit, a "/unit", or a compound "unit/unit" (the compound carries a small extra weight).
+        unit = (
+            pynutil.insert('units: "')
+            + (unit | unit_misc | pynutil.add_weight(unit + delete_space + unit_misc, 0.01))
+            + pynutil.insert('"')
+        )
+
+        subgraph_decimal = (
+            pynutil.insert("decimal { ")
+            + optional_graph_negative
+            + decimal.final_graph_wo_negative
+            + pynutil.insert(" }")
+            + delete_extra_space
+            + unit
+        )
+
+        subgraph_fraction = (
+            pynutil.insert("fraction { ")
+            + optional_graph_negative
+            + fraction.final_graph_wo_negative
+            + pynutil.insert(" }")
+            + delete_extra_space
+            + unit
+        )
+
+        subgraph_cardinal = (
+            pynutil.insert("cardinal { ")
+            + optional_graph_negative
+            + pynutil.insert('integer: "')
+            + cardinal_graph
+            + pynutil.insert('"')
+            + pynutil.insert(" }")
+            + delete_extra_space
+            + unit
+        )
+        final_graph = subgraph_decimal | subgraph_cardinal | subgraph_fraction
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/money.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/money.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    DAMO_DIGIT,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+)
+from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
+from pynini.lib import pynutil
+
+
+class MoneyFst(GraphFst):
+    """
+    Finite state transducer for classifying money
+        e.g. douze euro cinq -> money { integer_part: "12" currency: "€" fractional_part: 05}
+        e.g. zéro euro cinq -> money { integer_part: "0" currency: "€" fractional_part: 05}
+        e.g. cinq centimes -> money { integer_part: "0" currency: "€" fractional_part: 05}
+
+        Note, the currency symbol seems more common for exact amounts and quantities less than 'un million'
+        For 'round' quantities of >=million (milliard, billion), the symbol is dropped. This allows
+        use of the 'de' preposition.
+        e.g. cinq millions d'euros -> money { integer_part: "5" currency: "d'euros" fractional_part: 00}
+        e.g. un milliard d'euro -> money { integer_part: "1" currency: "d'euro" fractional_part: 00}
+        e.g. trois virgule trois millions d'euros -> money { integer_part: "3" currency: "d'euros" fractional_part: 3}
+
+        Currency is included for uniform tagging.
+
+        NOTE(review): the examples above are schematic. For large round amounts the numeric
+        tags come straight from the decimal graph, so the exact fields depend on DecimalFst.
+
+    Args:
+        cardinal: CardinalFst
+        decimal: DecimalFst
+    """
+
+    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
+        super().__init__(name="money", kind="classify")
+        # quantity, integer_part, fractional_part, currency
+
+        # quantities
+        cardinal_graph = cardinal.graph_no_exception
+        graph_decimal = decimal.final_graph_wo_negative
+
+        # Converts currency names to symbols
+        convert_currency_major = pynini.string_file(
+            get_abs_path("data/money/currency_major.tsv")
+        )  # major denominations
+        convert_currency_minor = pynini.string_file(
+            get_abs_path("data/money/currency_minor.tsv")
+        )  # minor denominations to major symbol. (e.g. 5 cents -> 0.05 $ )
+
+        # Input side only: the spoken currency names are kept verbatim, not converted.
+        accept_all_currency = (convert_currency_major | convert_currency_minor).project(
+            "input"
+        )  # recognizes all currencies
+
+        # Graphs for large round amounts ('deux billiards d'euros', 'un milliard de dollars')
+        graph_de = pynini.union(
+            "de ", "des ", "d'"
+        )  # the use of de/d'only occurs with round amounts
+        # The preposition is accepted (not deleted), so it ends up inside the currency field.
+        graph_currency_component_large_round_amounts = graph_de + accept_all_currency
+        graph_currency_component_large_round_amounts = (
+            pynutil.insert(' currency: "')
+            + graph_currency_component_large_round_amounts
+            + pynutil.insert('"')
+        )
+
+        graph_money_large_round_amounts = (
+            graph_decimal + delete_space
+        )  # graph_decimal includes tags and quantities already
+        graph_money_large_round_amounts += graph_currency_component_large_round_amounts
+
+        # For standard currency
+        # Two digits pass through unchanged; a single digit gets a "0" prepended.
+        add_leading_zero_to_double_digit = (DAMO_DIGIT + DAMO_DIGIT) | (
+            pynutil.insert("0") + DAMO_DIGIT
+        )
+
+        # Graphs integer denomination for large denominations (e.g. $)
+        graph_integer_component_major = (
+            pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')
+        )
+        graph_integer_component_major += delete_space
+
+        graph_currency_component_major = (
+            pynutil.insert(' currency: "') + convert_currency_major + pynutil.insert('"')
+        )
+
+        # Composition with the two-digit filter restricts the fractional part to one or two digits.
+        graph_decimal_component_major = (
+            delete_space
+            + pynutil.insert(' fractional_part: "')
+            + (cardinal_graph @ add_leading_zero_to_double_digit)
+            + pynutil.insert('"')
+        )
+
+        # Rare cases where 'et' will separate major and minor denominations.
+        delete_minor_currency = pynini.project(convert_currency_minor, "input")
+        delete_minor_currency = delete_extra_space + pynutil.delete(delete_minor_currency)
+
+        delete_et = delete_extra_space + pynutil.delete("et")
+
+        # integer + major currency, then optionally: "et", a minor amount, a minor currency name.
+        graph_money_major = (
+            graph_integer_component_major
+            + graph_currency_component_major
+            + delete_et.ques
+            + graph_decimal_component_major.ques
+            + delete_minor_currency.ques
+        )
+
+        # For cases when only small denominations are used.
+        graph_integer_component_minor = pynutil.insert('integer_part: "0"')
+
+        graph_decimal_component_minor = (
+            pynutil.insert(' fractional_part: "')
+            + (cardinal_graph @ add_leading_zero_to_double_digit)
+            + pynutil.insert('"')
+        )
+        graph_decimal_component_minor += delete_extra_space
+
+        # The minor currency name is converted via the minor-denomination table.
+        graph_currency_component_minor = (
+            pynutil.insert(' currency: "') + convert_currency_minor + pynutil.insert('"')
+        )
+
+        graph_money_minor = (
+            graph_integer_component_minor
+            + graph_decimal_component_minor
+            + graph_currency_component_minor
+        )
+
+        graph_money_standard_amounts = graph_money_major | graph_money_minor
+
+        final_graph = graph_money_large_round_amounts | graph_money_standard_amounts
+        final_graph = self.add_tokens(final_graph)
+
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/ordinal.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/ordinal.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    DAMO_SIGMA,
+    GraphFst,
+    delete_space,
+)
+from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
+from pynini.lib import pynutil
+
+
+class OrdinalFst(GraphFst):
+    """
+    Finite state transducer for classifying ordinal
+        vingt-deuxième -> ordinal { integer: "22" morphosyntactic_features: "e" }
+
+    Also notes specific nouns that have unique normalization conventions.
+    For instance, 'siècles' are rendered in roman numerals when given an ordinal adjective.
+    e.g. dix-neuvième siècle -> XIXe
+    In this tagger such a noun is only recorded after a "/" inside morphosyntactic_features;
+    the roman-numeral rendering is not done here.
+
+    Args:
+        cardinal: CardinalFst
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="ordinal", kind="classify")
+
+        graph_cardinal = cardinal.graph_no_exception
+        graph_undo_root_change = pynini.string_file(
+            get_abs_path("data/ordinals/digits_root_change.tsv")
+        )  # Returns base number to normal after root change.
+        graph_firsts = pynini.string_file(get_abs_path("data/ordinals/firsts.tsv"))
+        graph_second = pynini.string_file(get_abs_path("data/ordinals/second.tsv"))
+        graph_special_ordinals = pynini.string_file(get_abs_path("data/ordinals/key_nouns.tsv"))
+
+        # Removes morpheme
+        graph_no_root_change = pynutil.delete("ième")  # For no change to root
+
+        graph_strip_morpheme = pynini.union(graph_no_root_change, graph_undo_root_change)
+        graph_strip_morpheme = DAMO_SIGMA + graph_strip_morpheme
+
+        # Strip the ordinal ending, then read what is left as a cardinal.
+        graph_integer_component = graph_strip_morpheme @ graph_cardinal
+
+        graph_morpheme_component = pynutil.insert("e")  # Put the superscript in.
+        graph_morpheme_component += pynini.accep("s").ques  # In case of plurals.
+
+        # Concatenate with cardinal graph.
+        graph_ordinal = pynutil.insert('integer: "') + graph_integer_component + pynutil.insert('"')
+        graph_ordinal += (
+            pynutil.insert(' morphosyntactic_features: "') + graph_morpheme_component
+        )  # Leave open in case further morphemes occur
+
+        # 'Premier' takes a different superscript depending on gender, need to take note if
+        # 'premier' or 'première'
+        graph_firsts = pynutil.insert('integer: "1" morphosyntactic_features: "') + graph_firsts
+
+        # Second used 'd' as a superscript.
+        graph_second = pynutil.insert('integer: "2" morphosyntactic_features: "') + graph_second
+
+        graph = graph_firsts | graph_second | graph_ordinal
+
+        # For roman numerals. Carries over designation to verbalizer
+        graph_special_ordinals = pynutil.insert("/") + delete_space + graph_special_ordinals
+
+        # All three branches left the morphosyntactic_features quote open; close it here,
+        # after the optional "/<key noun>" part.
+        graph += graph_special_ordinals.ques + pynutil.insert('"')
+
+        final_graph = self.add_tokens(graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/punctuation.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/punctuation.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst
+from pynini.lib import pynutil
+
+
+class PunctuationFst(GraphFst):
+    """
+    Tagger that wraps a single punctuation character in a name field.
+        e.g. a, -> tokens { name: "a" } tokens { name: "," }
+
+    Recognised characters are a fixed set of ASCII punctuation marks plus the
+    French guillemets.
+    """
+
+    def __init__(self):
+        super().__init__(name="punctuation", kind="classify")
+
+        ascii_marks = "!#$%&'()*+,-./:;<=>?@^_`{|}~"
+        french_quotes = "\u00AB" + "\u00BB"  # opening and closing guillemets
+
+        # One union branch per character, ASCII marks first, guillemets last.
+        any_mark = pynini.union(*(ascii_marks + french_quotes))
+
+        opening = pynutil.insert('name: "')
+        closing = pynutil.insert('"')
+        tagged = opening + any_mark + closing
+
+        self.fst = tagged.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/telephone.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/telephone.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    GraphFst,
+    delete_hyphen,
+    delete_space,
+    insert_space,
+)
+from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
+from pynini.lib import pynutil
+
+
+class TelephoneFst(GraphFst):
+    """
+    Finite state transducer for classifying telephone numbers. Assumes conventional grouping for Metropolitan France (and overseas departments)
+    (two number sequences are grouped as individual cardinals) or digit by digit (chiffre-par-chiffre) e.g.
+    "zero un quarante-deux zero deux vingt-deux cinquante" -> { number_part: "01 42 02 22 50" }
+    "zero un quatre deux zero deux deux deux cinq zero" -> { number_part: "01 42 02 22 50" }
+
+    In cases where only one digit of the first pairing is admitted, assumes that the 0 was skipped.
+    "une vingt-trois quarante zero six dix-sept" -> { number_part: "01 23 40 06 17" }
+
+    A number is always five pairs: the first pair, three middle pairs and a final pair.
+    """
+
+    def __init__(self):
+        super().__init__(name="telephone", kind="classify")
+
+        # create `single_digits` and `double_digits` graphs as these will be
+        # the building blocks of possible telephone numbers
+        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
+        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
+        graph_ties_unique = pynini.string_file((get_abs_path("data/numbers/ties_unique.tsv")))
+        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
+        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
+
+        # A pair spoken as one number: a teen, a "unique" tie, a bare tie (a "0" is
+        # appended), or a tie joined to a digit by a hyphen.
+        double_digits = pynini.union(
+            graph_teen,
+            graph_ties_unique,
+            (graph_ties + pynutil.insert("0")),
+            (graph_ties + delete_hyphen + graph_digit),
+        )
+
+        # First pair: zero followed by a digit, or a lone digit with the "0" inserted.
+        graph_first_pair = graph_zero + delete_space + graph_digit
+        graph_first_pair |= pynutil.insert("0") + graph_digit  # if zero is omitted
+        graph_first_pair += (
+            delete_space + insert_space
+        )  # delete_space since closure allows possible gaps to be removed
+
+        # All digits
+        single_digits = graph_digit | graph_zero
+
+        graph_pair_all_digits = single_digits + delete_space
+        graph_pair_all_digits += single_digits
+
+        # Exactly three middle pairs, each followed by one inserted space; the final pair has no trailing space.
+        graph_all_digits = pynini.closure(graph_pair_all_digits + delete_space + insert_space, 3, 3)
+        graph_all_digits = graph_first_pair + graph_all_digits + graph_pair_all_digits
+
+        # Paired digits
+        # Each pair may be spoken either as one number or as two separate digits.
+        graph_pair_digits_and_ties = double_digits | graph_pair_all_digits
+
+        graph_digits_and_ties = pynini.closure(
+            graph_pair_digits_and_ties + delete_space + insert_space, 3, 3
+        )
+        graph_digits_and_ties = (
+            graph_first_pair + graph_digits_and_ties + graph_pair_digits_and_ties
+        )
+
+        number_part = pynini.union(graph_all_digits, graph_digits_and_ties)
+
+        number_part = pynutil.insert('number_part: "') + number_part + pynutil.insert('"')
+
+        graph = number_part
+        final_graph = self.add_tokens(graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/time.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/time.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst, delete_space
+from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
+from pynini.lib import pynutil
+
+
+class TimeFst(GraphFst):
+    """
+    Finite state transducer for classifying time
+        e.g. huit heures -> time { hours: "8" minutes: "00" }
+        e.g. treize heures -> time { hours: "13" minutes: "00" }
+        e.g. treize heures dix -> time { hours: "13" minutes: "10" }
+        e.g. huit heures du matin -> time { hours: "8" minutes: "00" suffix: "am" }
+        e.g. huit heures de l'après-midi -> time { hours: "8" minutes: "00" suffix: "pm" }
+        e.g. douze heures moins quart -> time { hours: "11" minutes: "45" }
+        e.g. douze heures et quart -> time { hours: "12" minutes: "15" }
+        e.g. midi et quart -> time { hours: "12" minutes: "15" }
+        e.g. minuit et demi -> time { hours: "0" minutes: "30" }
+        e.g. douze heures moins demi -> time { hours: "11" minutes: "30" }
+        e.g. douze heures moins trois -> time { hours: "11" minutes: "57" }
+
+    NOTE(review): this tagger emits a minutes field only when minutes are spoken; the
+    "00" shown above is presumably filled in later (verbalizer) -- confirm. The accepted
+    suffix phrases come from data/time/time_suffix_am.tsv and time_suffix_pm.tsv; the
+    suffix phrases used in the examples are assumed to be listed there.
+    """
+
+    def __init__(self):
+        super().__init__(name="time", kind="classify")
+        # hours, minutes, seconds, suffix, zone, style, speak_period
+
+        # time_zone = pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
+        graph_hours_to = pynini.string_file(get_abs_path("data/time/hours_to.tsv"))
+        graph_minutes_to = pynini.string_file(get_abs_path("data/time/minutes_to.tsv"))
+        graph_hours = pynini.string_file(get_abs_path("data/time/hours.tsv"))
+        graph_minutes = pynini.string_file(get_abs_path("data/time/minutes.tsv"))
+        graph_suffix_am = pynini.string_file(get_abs_path("data/time/time_suffix_am.tsv"))
+        graph_suffix_pm = pynini.string_file(get_abs_path("data/time/time_suffix_pm.tsv"))
+
+        # Every listed morning phrase becomes "am", every afternoon/evening phrase "pm".
+        graph_suffix = pynini.cross(graph_suffix_am, "am") | pynini.cross(graph_suffix_pm, "pm")
+
+        # Mapping 'heures'
+        graph_heures = pynini.accep("heure") + pynini.accep("s").ques
+        graph_heures = pynutil.delete(graph_heures)
+
+        # From here on graph_hours requires (and deletes) the word "heure(s)" after the number.
+        graph_hours += delete_space + graph_heures
+
+        # Midi and minuit
+        graph_midi = pynini.cross("midi", "12")
+        graph_minuit = pynini.cross("minuit", "0")
+
+        # Mapping 'et demi' and 'et quart'
+        graph_et = pynutil.delete("et") + delete_space
+
+        graph_demi = pynini.accep("demi")
+        graph_demi += pynini.accep("e").ques  # people vary on feminine or masculine form
+        graph_demi = pynini.cross(graph_demi, "30")
+
+        graph_quart = pynini.accep("quart")
+        graph_quart = pynini.accep("le ").ques + graph_quart  # sometimes used
+        graph_quart = pynini.cross(graph_quart, "15")
+        graph_trois_quart = pynini.cross("trois quarts", "45")
+
+        graph_fractions = pynini.union(graph_demi, graph_quart, graph_trois_quart)
+
+        graph_et_fractions = graph_et + graph_fractions
+
+        # Hours component is usually just a cardinal + 'heures' (ignored in case of 'midi/minuit').
+        graph_hours_component = pynini.union(graph_hours, graph_midi, graph_minuit)
+        graph_hours_component = (
+            pynutil.insert('hours: "') + graph_hours_component + pynutil.insert('"')
+        )
+        graph_hours_component += delete_space
+
+        # Minutes component
+        graph_minutes_component = (
+            pynutil.insert(' minutes: "')
+            + pynini.union(graph_minutes, graph_et_fractions)
+            + pynutil.insert('"')
+        )
+
+        # Hour and minutes together. For 'demi' and 'quart', 'et' is used as a conjunction.
+        # The minutes component is optional; nothing is inserted when it is absent.
+        graph_time_standard = graph_hours_component + graph_minutes_component.ques
+
+        # For time until hour. "quatre heures moins quart" -> 4 h 00 - 0 h 15 = 3 h 45
+        graph_moins = pynutil.delete("moins")
+        graph_moins += delete_space
+
+        # The spoken hour is remapped through hours_to.tsv (presumably to the previous hour -- verify).
+        graph_hours_to_component = graph_hours | graph_midi | graph_minuit
+        graph_hours_to_component @= graph_hours_to
+        graph_hours_to_component = (
+            pynutil.insert('hours: "') + graph_hours_to_component + pynutil.insert('"')
+        )
+        graph_hours_to_component += delete_space
+
+        # The spoken minutes are remapped through minutes_to.tsv (presumably to 60 minus the value -- verify).
+        # No "et" here: the fraction follows "moins" directly.
+        graph_minutes_to_component = pynini.union(graph_minutes, graph_fractions)
+        graph_minutes_to_component @= graph_minutes_to
+        graph_minutes_to_component = (
+            pynutil.insert(' minutes: "') + graph_minutes_to_component + pynutil.insert('"')
+        )
+
+        graph_time_to = graph_hours_to_component + graph_moins + graph_minutes_to_component
+
+        graph_time_no_suffix = graph_time_standard | graph_time_to
+
+        # Optional am/pm suffix after either form.
+        graph_suffix_component = pynutil.insert(' suffix: "') + graph_suffix + pynutil.insert('"')
+        graph_suffix_component = delete_space + graph_suffix_component
+        graph_suffix_component = graph_suffix_component.ques
+
+        final_graph = graph_time_no_suffix + graph_suffix_component
+
+        final_graph = self.add_tokens(final_graph)
+
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/tokenize_and_classify.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/tokenize_and_classify.py
+import os
+
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+)
+from fun_text_processing.inverse_text_normalization.fr.taggers.cardinal import CardinalFst
+from fun_text_processing.inverse_text_normalization.fr.taggers.date import DateFst
+from fun_text_processing.inverse_text_normalization.fr.taggers.decimal import DecimalFst
+from fun_text_processing.inverse_text_normalization.fr.taggers.electronic import ElectronicFst
+from fun_text_processing.inverse_text_normalization.fr.taggers.fraction import FractionFst
+from fun_text_processing.inverse_text_normalization.fr.taggers.measure import MeasureFst
+from fun_text_processing.inverse_text_normalization.fr.taggers.money import MoneyFst
+from fun_text_processing.inverse_text_normalization.fr.taggers.ordinal import OrdinalFst
+from fun_text_processing.inverse_text_normalization.fr.taggers.punctuation import PunctuationFst
+from fun_text_processing.inverse_text_normalization.fr.taggers.telephone import TelephoneFst
+from fun_text_processing.inverse_text_normalization.fr.taggers.time import TimeFst
+from fun_text_processing.inverse_text_normalization.fr.taggers.whitelist import WhiteListFst
+from fun_text_processing.inverse_text_normalization.fr.taggers.word import WordFst
+from pynini.lib import pynutil
+
+import logging
+
+
+class ClassifyFst(GraphFst):
+    """
+    Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased.
+    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
+    More details to deployment at NeMo/tools/text_processing_deployment.
+
+    When a cache directory is given and already holds "_fr_itn.far", the grammar is loaded
+    from it instead of being rebuilt (unless overwrite_cache is set). A freshly built
+    grammar is written back to that file.
+
+    Args:
+        cache_dir: path to a dir with .far grammar file. Set to None (or the string "None") to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+    """
+
+    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
+        super().__init__(name="tokenize_and_classify", kind="classify")
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, "_fr_itn.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
+            # Lazy %-style arguments: the message is only formatted if the record is emitted.
+            logging.info("ClassifyFst.fst was restored from %s.", far_file)
+        else:
+            logging.info("Creating ClassifyFst grammars.")
+
+            cardinal = CardinalFst()
+            cardinal_graph = cardinal.fst
+
+            fraction = FractionFst(cardinal)
+            fraction_graph = fraction.fst
+
+            ordinal = OrdinalFst(cardinal)
+            ordinal_graph = ordinal.fst
+
+            decimal = DecimalFst(cardinal)
+            decimal_graph = decimal.fst
+
+            measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal, fraction=fraction).fst
+            date_graph = DateFst(cardinal).fst
+            word_graph = WordFst().fst
+            time_graph = TimeFst().fst
+            money_graph = MoneyFst(cardinal, decimal).fst
+            whitelist_graph = WhiteListFst().fst
+            punct_graph = PunctuationFst().fst
+            electronic_graph = ElectronicFst().fst
+            telephone_graph = TelephoneFst().fst
+
+            # Lower weight wins: the whitelist is preferred, the plain-word fallback is
+            # heavily penalised so it is only used when nothing else matches.
+            classify = (
+                pynutil.add_weight(whitelist_graph, 1.01)
+                | pynutil.add_weight(time_graph, 1.05)
+                | pynutil.add_weight(date_graph, 1.09)
+                | pynutil.add_weight(decimal_graph, 1.08)
+                | pynutil.add_weight(measure_graph, 1.1)
+                | pynutil.add_weight(cardinal_graph, 1.1)
+                | pynutil.add_weight(ordinal_graph, 1.1)
+                | pynutil.add_weight(fraction_graph, 1.09)
+                | pynutil.add_weight(money_graph, 1.07)
+                | pynutil.add_weight(telephone_graph, 1.1)
+                | pynutil.add_weight(electronic_graph, 1.1)
+                | pynutil.add_weight(word_graph, 100)
+            )
+
+            punct = (
+                pynutil.insert("tokens { ")
+                + pynutil.add_weight(punct_graph, weight=1.1)
+                + pynutil.insert(" }")
+            )
+            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
+            # A token may be preceded and followed by any number of punctuation tokens.
+            token_plus_punct = (
+                pynini.closure(punct + pynutil.insert(" "))
+                + token
+                + pynini.closure(pynutil.insert(" ") + punct)
+            )
+
+            # A sentence is one or more such groups; surrounding whitespace is removed.
+            graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
+            graph = delete_space + graph + delete_space
+
+            self.fst = graph.optimize()
+
+            if far_file:
+                generator_main(far_file, {"tokenize_and_classify": self.fst})
+                logging.info("ClassifyFst grammars are saved to %s.", far_file)
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/whitelist.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/whitelist.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import GraphFst, convert_space
+from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
+from pynini.lib import pynutil
+
+
+class WhiteListFst(GraphFst):
+    """
+    Tagger for whitelisted tokens: each entry of "data/whitelist.tsv" is rewritten to its
+    replacement and wrapped in a name field.
+        e.g. misses -> tokens { name: "mrs." }
+    This class has highest priority among all classifier grammars.
+    """
+
+    def __init__(self):
+        super().__init__(name="whitelist", kind="classify")
+
+        entries = pynini.string_file(get_abs_path("data/whitelist.tsv"))
+
+        opening = pynutil.insert('name: "')
+        closing = pynutil.insert('"')
+        tagged = opening + convert_space(entries) + closing
+
+        self.fst = tagged.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/word.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/word.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import DAMO_NOT_SPACE, GraphFst
+from pynini.lib import pynutil
+
+
+class WordFst(GraphFst):
+    """
+    Default tagger for plain tokens that no other class claims: one or more
+    non-space characters are wrapped in a name field.
+        e.g. sleep -> tokens { name: "sleep" }
+    """
+
+    def __init__(self):
+        super().__init__(name="word", kind="classify")
+
+        # At least one non-space character, copied through unchanged.
+        token_chars = pynini.closure(DAMO_NOT_SPACE, 1)
+        tagged = pynutil.insert('name: "') + token_chars + pynutil.insert('"')
+
+        self.fst = tagged.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/utils.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/utils.py
+import os
+
+
+def get_abs_path(rel_path):
+    """
+    Resolve a path given relative to the directory containing this module.
+
+    Args:
+        rel_path: path relative to this file's directory
+
+    Returns the directory of this file joined to rel_path with a "/".
+    """
+    module_dir = os.path.dirname(os.path.abspath(__file__))
+    return "/".join((module_dir, rel_path))
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/__init__.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/__init__.py
+
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/cardinal.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/cardinal.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    DAMO_NOT_QUOTE,
+    GraphFst,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+
+class CardinalFst(GraphFst):
+    """
+    Finite state transducer for verbalizing cardinal
+        e.g. cardinal { negative: "-" integer: "23" } -> -23
+    """
+
+    def __init__(self):
+        super().__init__(name="cardinal", kind="verbalize")
+        optional_sign = pynini.closure(
+            pynutil.delete("negative:")
+            + delete_space
+            + pynutil.delete('"')
+            + DAMO_NOT_QUOTE
+            + pynutil.delete('"')
+            + delete_space,
+            0,
+            1,
+        )
+        graph = (
+            pynutil.delete("integer:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+        self.numbers = graph
+        graph = optional_sign + graph
+        delete_tokens = self.delete_tokens(graph)
+        self.fst = delete_tokens.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/date.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/date.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    DAMO_NOT_QUOTE,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+
+class DateFst(GraphFst):
+    """
+    Finite state transducer for verbalizing date, e.g.
+        date { day: "1" month: "janvier" preserve_order: true } -> 1 de enero
+    """
+
+    def __init__(self):
+        super().__init__(name="date", kind="verbalize")
+
+        convert_primer = pynini.cross("1", "1ᵉʳ")
+        day = (
+            pynutil.delete("day:")
+            + delete_space
+            + pynutil.delete('"')
+            + (
+                pynini.closure(DAMO_NOT_QUOTE, 1) | pynutil.add_weight(convert_primer, -1)
+            )  # first of the month is ordinal
+            + pynutil.delete('"')
+        )
+        month = (
+            pynutil.delete("month:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+        year = (
+            pynutil.delete("year:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+
+        # day month
+        graph_dm = day + delete_extra_space + month
+        graph_dmy = graph_dm + delete_extra_space + year
+
+        optional_preserve_order = pynini.closure(
+            pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
+            | pynutil.delete("field_order:")
+            + delete_space
+            + pynutil.delete('"')
+            + DAMO_NOT_QUOTE
+            + pynutil.delete('"')
+            + delete_space
+        )
+
+        final_graph = (graph_dm | graph_dmy) + delete_space + optional_preserve_order
+
+        delete_tokens = self.delete_tokens(final_graph)
+        self.fst = delete_tokens.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/decimal.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    DAMO_DIGIT,
+    DAMO_NON_BREAKING_SPACE,
+    DAMO_NOT_QUOTE,
+    GraphFst,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+
+class NumberParser(GraphFst):
+    """
+    Finite state transducer for parsing strings of digis. Breaks up digit strings into groups of three for
+        strings of digits of four or more (inclusive). Groupings are separated by non-breaking space.
+    e.g. '1000' -> '1 000'
+    e.g. '1000,33333' -> '1 000,333 33
+    """
+
+    def __init__(self):
+        super().__init__(name="parser", kind="verbalize")
+
+
+class DecimalFst(GraphFst):
+    """
+    Finite state transducer for verbalizing decimal, e.g.
+        decimal { negative: "true" integer_part: "12"  fractional_part: "5006" quantity: "billion" } -> -12.5006 billion
+    """
+
+    def __init__(self):
+        super().__init__(name="decimal", kind="verbalize")
+
+        # Need parser to group digits by threes
+        exactly_three_digits = DAMO_DIGIT**3
+        at_most_three_digits = pynini.closure(DAMO_DIGIT, 1, 3)
+
+        space_every_three_integer = (
+            at_most_three_digits
+            + (pynutil.insert(DAMO_NON_BREAKING_SPACE) + exactly_three_digits).closure()
+        )
+        space_every_three_decimal = (
+            pynini.accep(",")
+            + (exactly_three_digits + pynutil.insert(DAMO_NON_BREAKING_SPACE)).closure()
+            + at_most_three_digits
+        )
+        group_by_threes = space_every_three_integer | space_every_three_decimal
+        self.group_by_threes = group_by_threes
+
+        optional_sign = pynini.closure(pynini.cross('negative: "true"', "-") + delete_space, 0, 1)
+        integer = (
+            pynutil.delete("integer_part:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+        integer = integer @ group_by_threes
+        optional_integer = pynini.closure(integer + delete_space, 0, 1)
+        fractional = (
+            pynutil.insert(",")
+            + pynutil.delete("fractional_part:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+        fractional = fractional @ group_by_threes
+        optional_fractional = pynini.closure(fractional + delete_space, 0, 1)
+        quantity = (
+            pynutil.delete("quantity:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+        optional_quantity = pynini.closure(pynutil.insert(" ") + quantity + delete_space, 0, 1)
+        graph = (optional_integer + optional_fractional + optional_quantity).optimize()
+        self.numbers = graph
+        graph = optional_sign + graph
+        delete_tokens = self.delete_tokens(graph)
+        self.fst = delete_tokens.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/electronic.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/electronic.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    DAMO_NOT_QUOTE,
+    GraphFst,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+
+class ElectronicFst(GraphFst):
+    """
+    Finite state transducer for verbalizing electronic
+        e.g. tokens { electronic { username: "cdf1" domain: "abc.edu" } } -> cdf1@abc.edu
+    """
+
+    def __init__(self):
+        super().__init__(name="electronic", kind="verbalize")
+        user_name = (
+            pynutil.delete("username:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+        domain = (
+            pynutil.delete("domain:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+
+        graph = user_name + delete_space + pynutil.insert("@") + domain
+        delete_tokens = self.delete_tokens(graph)
+        self.fst = delete_tokens.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/fraction.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/fraction.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    DAMO_NOT_QUOTE,
+    GraphFst,
+    delete_space,
+    insert_space,
+)
+from pynini.lib import pynutil
+
+
+class FractionFst(GraphFst):
+    """
+    Finite state transducer for verbalizing fraction
+        e.g. fraction { integer_part: "1" numerator: "2" denominator: "3" } } -> 1 2/3
+
+    """
+
+    def __init__(self):
+        super().__init__(name="fraction", kind="verbalize")
+        optional_sign = pynini.closure(pynini.cross('negative: "true"', "-") + delete_space, 0, 1)
+        integer = (
+            pynutil.delete('integer_part: "')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+            + insert_space
+        )
+        numerator = (
+            pynutil.delete('numerator: "') + pynini.closure(DAMO_NOT_QUOTE, 1) + pynutil.delete('"')
+        )
+
+        denominator = (
+            pynutil.insert("/")
+            + pynutil.delete('denominator: "')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+
+        graph = (
+            pynini.closure(integer + delete_space, 0, 1) + numerator + delete_space + denominator
+        ).optimize()
+        self.numbers = graph
+        delete_tokens = self.delete_tokens(optional_sign + graph)
+        self.fst = delete_tokens.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/measure.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/measure.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    DAMO_CHAR,
+    GraphFst,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+
+class MeasureFst(GraphFst):
+    """
+    Finite state transducer for verbalizing measure, e.g.
+        measure { negative: "true" cardinal { integer: "12" } units: "kg" } -> -12 kg
+
+    Args:
+        decimal: DecimalFst
+        cardinal: CardinalFst
+        fraction: FractionFst
+    """
+
+    def __init__(self, decimal: GraphFst, cardinal: GraphFst, fraction: GraphFst):
+        super().__init__(name="measure", kind="verbalize")
+        optional_sign = pynini.closure(pynini.cross('negative: "true"', "-"), 0, 1)
+        unit = (
+            pynutil.delete("units:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_CHAR - " ", 1)
+            + pynutil.delete('"')
+            + delete_space
+        )
+        graph_decimal = (
+            pynutil.delete("decimal {")
+            + delete_space
+            + optional_sign
+            + delete_space
+            + decimal.numbers
+            + delete_space
+            + pynutil.delete("}")
+        )
+        graph_cardinal = (
+            pynutil.delete("cardinal {")
+            + delete_space
+            + optional_sign
+            + delete_space
+            + cardinal.numbers
+            @ decimal.group_by_threes  # measurements most obey three by three spacing
+            + delete_space
+            + pynutil.delete("}")
+        )
+        graph_fraction = (
+            pynutil.delete("fraction {")
+            + delete_space
+            + optional_sign
+            + delete_space
+            + fraction.numbers
+            + delete_space
+            + pynutil.delete("}")
+        )
+
+        graph = (
+            (graph_cardinal | graph_decimal | graph_fraction)
+            + delete_space
+            + pynutil.insert(" ")
+            + unit
+        )
+        delete_tokens = self.delete_tokens(graph)
+        self.fst = delete_tokens.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/money.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/money.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    DAMO_NOT_QUOTE,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+
+class MoneyFst(GraphFst):
+    """
+    Finite state transducer for verbalizing money, e.g.
+        money { integer_part: "12" fractional_part: "05" currency: "$" } -> 12.05 $
+
+    Args:
+        decimal: DecimalFst
+    """
+
+    def __init__(self, decimal: GraphFst):
+        super().__init__(name="money", kind="verbalize")
+        unit = (
+            pynutil.delete("currency:")
+            + delete_extra_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+        graph = decimal.numbers + delete_space + unit
+        delete_tokens = self.delete_tokens(graph)
+        self.fst = delete_tokens.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/verbalizers/ordinal.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    DAMO_DIGIT,
+    DAMO_NOT_QUOTE,
+    GraphFst,
+    delete_space,
+)
+from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
+from pynini.lib import pynutil
+
+
+class OrdinalFst(GraphFst):
+    """
+    Finite state transducer for verbalizing ordinal, e.g.
+        ordinal { integer: "13" morphosyntactic_features: "e" } -> 13ᵉ
+
+    Given 'special' terms for ordinals (e.g. siècle), renders
+        amount in conventional format. e.g.
+
+        ordinal { integer: "13" morphosyntactic_features: "e/siècle" } -> XIIIᵉ
+    """
+
+    def __init__(self):
+        super().__init__(name="ordinal", kind="verbalize")
+        graph_integer = (
+            pynutil.delete("integer:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+
+        replace_suffix = pynini.union(
+            pynini.cross("e", "ᵉ"),  # only delete first quote since there may be more features
+            pynini.cross("d", "ᵈ"),
+            pynini.cross("r", "ʳ"),
+            pynini.cross("s", "ˢ"),
+        )
+        replace_suffix = pynutil.delete(' morphosyntactic_features: "') + replace_suffix.plus
+
+        graph_arabic = graph_integer + replace_suffix.plus
+
+        # For roman.
+        graph_roman_digits = pynini.string_file(
+            get_abs_path("data/roman/digits_large.tsv")
+        ).invert()
+        graph_roman_ties = pynini.string_file(get_abs_path("data/roman/ties_large.tsv")).invert()
+        graph_roman_hundreds = pynini.string_file(
+            get_abs_path("data/roman/hundreds_large.tsv")
+        ).invert()
+        graph_roman_zero_digit = pynutil.delete("0")
+
+        graph_roman_hundreds = DAMO_DIGIT**3 @ (
+            graph_roman_hundreds
+            + pynini.union(graph_roman_ties, graph_roman_zero_digit)
+            + pynini.union(graph_roman_digits, graph_roman_zero_digit)
+        )
+        graph_roman_ties = DAMO_DIGIT**2 @ (
+            graph_roman_ties + pynini.union(graph_roman_digits, graph_roman_zero_digit)
+        )
+        graph_roman_digits = DAMO_DIGIT @ graph_roman_digits
+
+        graph_roman_integers = graph_roman_hundreds | graph_roman_ties | graph_roman_digits
+
+        graph_roman = (graph_integer @ graph_roman_integers) + replace_suffix
+        graph_roman += pynini.cross("/", " ") + "siècle"
+
+        graph = (graph_roman | graph_arabic) + pynutil.delete('"')
+
+        delete_tokens = self.delete_tokens(graph)
+        self.fst = delete_tokens.optimize()