initial commit

70a8a9e0 · wangwei990215 · 70a8a9e0 · 70a8a9e0 · 70a8a9e0 · 70a8a9e0
Commit 70a8a9e0 authored Oct 03, 2024 by wangwei990215
20 changed files
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/__init__.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/__init__.py
+
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/digits_large.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/digits_large.tsv
+I	1
+II	2
+III	3
+IV	4
+V	5
+VI	6
+VII	7
+VIII	8
+IX	9
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/hundreds_large.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/hundreds_large.tsv
+C	1
+CC	2
+CCC	3
+CD	4
+D	5
+DC	6
+DCC	7
+DCCC	8
+CM	9
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/ties_large.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/ties_large.tsv
+X	1
+XX	2
+XXX	3
+XL	4
+L	5
+LX	6
+LXX	7
+LXXX	8
+XC	9
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/data/suppletive.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/data/suppletive.tsv
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/__init__.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/__init__.py
+
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/hour_to_night.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/hour_to_night.tsv
+1	13
+2	14
+3	15
+4	16
+5	17
+6	18
+7	19
+8	20
+9	21
+10	22
+11	23
+12	0
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/hours.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/hours.tsv
+zéro	0
+une	1
+deux	2
+trois	3
+quatre	4
+cinq	5
+six	6
+sept	7
+huit	8
+neuf	9
+dix	10
+onze	11
+douze	12
+treize	13
+quatorze	14
+quinze	15
+seize	16
+dix-sept	17
+dix-huit	18
+dix-neuf	19
+vingt	20
+vingt-et-une	21
+vingt et une	21
+vingt-deux	22
+vingt-trois	23
+vingt-quatre	24
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/hours_to.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/hours_to.tsv
+1	0
+2	1
+3	2
+4	3
+5	4
+6	5
+7	6
+8	7
+9	8
+10	9
+11	10
+12	11
+13	12
+14	13
+15	14
+16	15
+17	16
+18	17
+19	18
+20	19
+21	20
+22	21
+23	22
+24	23
+0	23
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/minutes.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/minutes.tsv
+une	01
+deux	02
+trois	03
+quatre	04
+cinq	05
+six	06
+sept	07
+huit	08
+neuf	09
+dix	10
+onze	11
+douze	12
+treize	13
+quatorze	14
+quinze	15
+seize	16
+dix-sept	17
+dix-huit	18
+dix-neuf	19
+vingt	20
+vingt-et-une	21
+vingt et une	21
+vingt-deux	22
+vingt-trois	23
+vingt-quatre	24
+vingt-cinq	25
+vingt-six	26
+vingt-sept	27
+vingt-huit	28
+vingt-neuf	29
+trente	30
+trente-et-une	31
+trente et une	31
+trente-deux	32
+trente-trois	33
+trente-quatre	34
+trente-cinq	35
+trente-six	36
+trente-sept	37
+trente-huit	38
+trente-neuf	39
+quarante	40
+quarante-et-une	41
+quarante et une	41
+quarante-deux	42
+quarante-trois	43
+quarante-quatre	44
+quarante-cinq	45
+quarante-six	46
+quarante-sept	47
+quarante-huit	48
+quarante-neuf	49
+cinquante	50
+cinquante-et-une	51
+cinquante et une	51
+cinquante-deux	52
+cinquante-trois	53
+cinquante-quatre	54
+cinquante-cinq	55
+cinquante-six	56
+cinquante-sept	57
+cinquante-huit	58
+cinquante-neuf	59
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/minutes_to.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/minutes_to.tsv
+01	59
+02	58
+03	57
+04	56
+05	55
+06	54
+07	53
+08	52
+09	51
+10	50
+11	49
+12	48
+13	47
+14	46
+15	45
+16	44
+17	43
+18	42
+19	41
+20	40
+21	39
+22	38
+23	37
+24	36
+25	35
+26	34
+27	33
+28	32
+29	31
+30	30
+31	29
+32	28
+33	27
+34	26
+35	25
+36	24
+37	23
+38	22
+39	21
+40	20
+41	19
+42	18
+43	17
+44	16
+45	15
+46	14
+47	13
+48	12
+49	11
+50	10
+51	09
+52	08
+53	07
+54	06
+55	05
+56	04
+57	03
+58	02
+59	01
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/time_suffix_am.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/time_suffix_am.tsv
+du matin
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/time_suffix_pm.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/time_suffix_pm.tsv
+de l'après-midi
+du soir
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/data/whitelist.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/data/whitelist.tsv
+monsieur	M.
+messieurs	MM.
+madame	Mᵐᵉ
+mesdames	Mᵐᵉˢ
+mademoiselle	Mˡˡᵉ
+mademoiselles	Mˡˡᵉˢ
+docteur	Dʳ
+docteurs	Dʳˢ
+docteure	Dʳᵉ
+docteures	Dʳᵉˢ
+après jésus-christ	apr. J.-C.
+avant Jésus-Christ	av. J.-C.
+avant jésus-christ	av. J.-C.
+ca	v.
+vers	v.
+l’honorable	le hon.
+le très honorable	le très hon.
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/graph_utils.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/graph_utils.py
+import os
+import string
+from pathlib import Path
+from typing import Dict
+
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
+from pynini import Far
+from pynini.examples import plurals
+from pynini.export import export
+from pynini.lib import byte, pynutil, utf8
+
+# Any single valid UTF-8 character.
+DAMO_CHAR = utf8.VALID_UTF8_CHAR
+
+# ASCII character classes.
+DAMO_DIGIT = byte.DIGIT
+DAMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
+DAMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
+DAMO_ALPHA = pynini.union(DAMO_LOWER, DAMO_UPPER).optimize()
+DAMO_ALNUM = pynini.union(DAMO_DIGIT, DAMO_ALPHA).optimize()
+DAMO_HEX = pynini.union(*string.hexdigits).optimize()
+# Plain strings (not FSTs): the two space characters swapped by convert_space().
+DAMO_NON_BREAKING_SPACE = "\u00A0"
+DAMO_SPACE = " "
+# Space, tab, newline, carriage return and non-breaking space.
+DAMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize()
+DAMO_NOT_SPACE = pynini.difference(DAMO_CHAR, DAMO_WHITE_SPACE).optimize()
+DAMO_NOT_QUOTE = pynini.difference(DAMO_CHAR, r'"').optimize()
+
+# Every character of string.punctuation, escaped so pynini treats it literally.
+DAMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
+DAMO_GRAPH = pynini.union(DAMO_ALNUM, DAMO_PUNCT).optimize()
+
+# Sigma-star: any sequence (possibly empty) of valid UTF-8 characters.
+DAMO_SIGMA = pynini.closure(DAMO_CHAR)
+
+# Deletes zero or more whitespace characters.
+delete_space = pynutil.delete(pynini.closure(DAMO_WHITE_SPACE))
+insert_space = pynutil.insert(" ")
+# Collapses one or more whitespace characters into a single space.
+delete_extra_space = pynini.cross(pynini.closure(DAMO_WHITE_SPACE, 1), " ")
+
+# French frequently compounds numbers with hyphen.
+# Deletes at most one hyphen (the hyphen is optional).
+delete_hyphen = pynutil.delete(pynini.closure("-", 0, 1))
+insert_hyphen = pynutil.insert("-")
+# Irregular singular/plural pairs loaded from the data file.
+suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
+
+# Regular plural rules: append "s"; append "x" after -eau/-eu/-ou; rewrite final -al/-ail to -aux.
+_s = DAMO_SIGMA + pynutil.insert("s")
+_x = DAMO_SIGMA + pynini.string_map([("eau"), ("eu"), ("ou")]) + pynutil.insert("x")
+_aux = DAMO_SIGMA + pynini.string_map([("al", "aux"), ("ail", "aux")])
+
+# NOTE(review): _s accepts every input string. If plurals._priority_union gives precedence to
+# its first argument, the _x/_aux rules passed as the second argument can never apply —
+# confirm the intended order against the pynini plurals example.
+graph_plural = plurals._priority_union(
+    suppletive, plurals._priority_union(_s, pynini.union(_x, _aux), DAMO_SIGMA), DAMO_SIGMA
+).optimize()
+
+SINGULAR_TO_PLURAL = graph_plural
+PLURAL_TO_SINGULAR = pynini.invert(graph_plural)
+# ASCII-only case mapping (A-Z <-> a-z); accented letters are not covered.
+TO_LOWER = pynini.union(
+    *[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]
+)
+TO_UPPER = pynini.invert(TO_LOWER)
+
+
+def generator_main(file_name: str, graphs: Dict[str, pynini.FstLike]):
+    """
+    Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.
+
+    Args:
+        file_name: exported file name
+        graphs: Mapping of a rule name and Pynini WFST graph to be exported
+    """
+    exporter = export.Exporter(file_name)
+    for rule, graph in graphs.items():
+        exporter[rule] = graph.optimize()
+    exporter.close()
+    print(f"Created {file_name}")
+
+
+def get_plurals(fst):
+    """
+    Given singular returns plurals
+
+    Args:
+        fst: Fst
+
+    Returns plurals to given singular forms
+    """
+    return SINGULAR_TO_PLURAL @ fst
+
+
+def get_singulars(fst):
+    """
+    Given plural returns singulars
+
+    Args:
+        fst: Fst
+
+    Returns singulars to given plural forms
+    """
+    return PLURAL_TO_SINGULAR @ fst
+
+
+def convert_space(fst) -> "pynini.FstLike":
+    """
+    Converts space to nonbreaking space.
+    Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
+    This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it.
+
+    Args:
+        fst: input fst
+
+    Returns output fst where breaking spaces are converted to non breaking spaces
+    """
+    return fst @ pynini.cdrewrite(
+        pynini.cross(DAMO_SPACE, DAMO_NON_BREAKING_SPACE), "", "", DAMO_SIGMA
+    )
+
+
+class GraphFst:
+    """
+    Base class for all grammar fsts.
+
+    Args:
+        name: name of grammar class
+        kind: either 'classify' or 'verbalize'
+        deterministic: if True will provide a single transduction option,
+            for False multiple transduction are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, name: str, kind: str, deterministic: bool = True):
+        self.name = name
+        self.kind = kind
+        self._fst = None
+        self.deterministic = deterministic
+
+        self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
+        if self.far_exist():
+            self._fst = Far(
+                self.far_path, mode="r", arc_type="standard", far_type="default"
+            ).get_fst()
+
+    def far_exist(self) -> bool:
+        """
+        Returns true if FAR can be loaded
+        """
+        return self.far_path.exists()
+
+    @property
+    def fst(self) -> "pynini.FstLike":
+        return self._fst
+
+    @fst.setter
+    def fst(self, fst):
+        self._fst = fst
+
+    def add_tokens(self, fst) -> "pynini.FstLike":
+        """
+        Wraps class name around to given fst
+
+        Args:
+            fst: input fst
+
+        Returns:
+            Fst: fst
+        """
+        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")
+
+    def delete_tokens(self, fst) -> "pynini.FstLike":
+        """
+        Deletes class name wrap around output of given fst
+
+        Args:
+            fst: input fst
+
+        Returns:
+            Fst: fst
+        """
+        res = (
+            pynutil.delete(f"{self.name}")
+            + delete_space
+            + pynutil.delete("{")
+            + delete_space
+            + fst
+            + delete_space
+            + pynutil.delete("}")
+        )
+        return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/__init__.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/__init__.py
+
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/cardinal.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/cardinal.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    DAMO_CHAR,
+    DAMO_DIGIT,
+    DAMO_NOT_SPACE,
+    DAMO_SIGMA,
+    DAMO_SPACE,
+    GraphFst,
+    delete_hyphen,
+)
+from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
+from pynini.lib import pynutil
+
+
+def rewrite(cardinal: "pynini.FstLike") -> "pynini.FstLike":
+    """
+    Function to rewrite cardinals written in traditional orthograph (no '-' for numbers >100)
+    to current orthography ('-' between all words in number string)
+    e.g. deux mille cent vingt-trois -> deux-mille-cent-vingt-trois.
+    In cases where original orthography is current, or string is mixture of two orthographies,
+    will render invalid form that will not pass through CardinalFst
+    e.g. deux-mille cent-vingt-trois -> "deux##vingt-trois" ('#' is not accepted in cardinal FST and will fail to convert.)
+    e.g. deux
+
+    Args:
+        cardinal: cardinal FST
+    """
+
+    # Traditional orthography does not hyphenate numbers > 100, this will insert hyphens in
+    # those contexts.
+    targets = pynini.string_map(
+        [
+            "et",  # for 'et un/onze'
+            "cent",
+            "mille",
+            "million",
+            "milliard",
+            "billion",
+            "billiard",
+            "trillion",
+            "trilliard",
+        ]
+    )
+    targets += pynini.accep("s").ques
+
+    no_spaces = pynini.closure(DAMO_NOT_SPACE)
+
+    # Valid numbers in reformed orthography will have no spaces.
+    new_orthography_sigma = no_spaces
+
+    # Old orthography will not have these strings. Replacing with character to mark.
+    targets_for_filtering = ("-" + targets) | ("-" + targets + "-") | (targets + "-")
+
+    filter = pynini.cdrewrite(
+        pynini.cross(targets_for_filtering, "#"), "", "", DAMO_SIGMA
+    )  # Invalid for cardinal
+
+    old_orthography_sigma = pynini.difference(
+        DAMO_CHAR, "#"
+    )  # Marked character removed from sigma_star.
+    old_orthography_sigma.closure()
+
+    # Only accept strings that occur in old orthography. (This avoids tying two non-related numbers together.)
+    # e.g. mille cent-une -> mille-cent-une
+    filter @= old_orthography_sigma
+
+    # Now know replacements will only work around targets
+    replace_left = pynini.cdrewrite(pynini.cross(" ", "-"), "", targets, DAMO_SIGMA)
+
+    replace_right = pynini.cdrewrite(pynini.cross(" ", "-"), targets, "", DAMO_SIGMA)
+
+    replace = replace_left @ replace_right
+
+    graph = new_orthography_sigma | (filter @ replace)
+
+    return graph @ cardinal
+
+
+class CardinalFst(GraphFst):
+    """
+    Finite state transducer for classifying cardinals
+        e.g. moins vingt-trois -> cardinal { negative: "-" integer: "23"}
+    This class converts cardinals up to (but not including) "un-quatrillion",
+    i.e up to "one septillion" in English (10^{24}).
+    Cardinals from zero to nine are not converted (in order to avoid
+    "j'ai un pomme." --> "j'ai 1 pomme" and any other odd conversions.)
+    This transducer accommodates both traditional hyphenation of numbers ('-' for most numbers <100)
+    and current hyphenation (all elements of number are hyphenated), prioritizing the latter.
+    e.g cent cinquante et un -> cardinal { integer: "151"}
+        cent-cinquante-et-un -> cardinal { integer: "151"}
+    This is done through a context dependent rewrite that attempts to map old spelling to new.
+    e.g. cent cinquante et un -> cent-cinquante-et-un
+
+    Attributes set for use by other grammars:
+        graph_hundreds_component_at_least_one_none_zero_digit: three-digit group with at least
+            one non-zero digit, accepting both orthographies
+        graph_no_exception: full cardinal graph, including zero to nine
+        numbers_up_to_thousand: cardinal graph restricted to outputs of 1-3 digits
+        numbers_up_to_million: cardinal graph restricted to outputs of 1-6 digits
+        graph: cardinal graph without the inputs for zero to nine
+    """
+
+    def __init__(self):
+        super().__init__(name="cardinal", kind="classify")
+        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
+        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
+        graph_teens = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
+        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
+        graph_ties_unique = pynini.string_file(get_abs_path("data/numbers/ties_unique.tsv"))
+
+        # Tens components
+        # A tie followed by an (optionally hyphenated) digit, or by an inserted "0".
+        graph_tens_component = graph_ties + ((delete_hyphen + graph_digit) | pynutil.insert("0"))
+        graph_tens_component = pynini.union(graph_tens_component, graph_teens, graph_ties_unique)
+
+        # Always yields two digits: either a tens component, or "0" plus a digit
+        # (or a weighted, inserted "0" when nothing is spoken).
+        graph_tens_component_with_leading_zeros = pynini.union(
+            graph_tens_component,
+            (pynutil.insert("0") + (graph_digit | pynutil.insert("0", weight=0.01))),
+        )
+
+        # Hundreds components
+        graph_cent_singular = pynutil.delete("cent")  # Used in hundreds place
+        graph_cent_plural = pynini.cross(
+            "cents", "00"
+        )  # Only used as terminus of hundred sequence. deux cents -> 200, deux cent un -> 201
+
+        # Digit graph with the inputs "un"/"une" removed ('cent', not 'un cent').
+        graph_digit_no_one = pynini.project(pynini.union("un", "une"), "input")
+        graph_digit_no_one = (
+            pynini.project(graph_digit, "input") - graph_digit_no_one.arcsort()
+        ) @ graph_digit
+
+        graph_hundreds_component_singular = (
+            graph_digit_no_one + delete_hyphen + graph_cent_singular
+        )  # Regular way: [1-9] * 100
+
+        # Bare 'cent' stands for a leading "1".
+        graph_hundreds_component_singular = pynini.union(
+            graph_hundreds_component_singular, pynini.cross("cent", "1")
+        )
+        graph_hundreds_component_singular += delete_hyphen
+        graph_hundreds_component_singular += graph_tens_component_with_leading_zeros
+
+        graph_hundreds_component_plural = graph_digit_no_one + delete_hyphen + graph_cent_plural
+
+        # Three-digit group: singular/plural hundreds, or "0" plus a two-digit tens group.
+        graph_hundreds_component = pynini.union(
+            graph_hundreds_component_singular,
+            graph_hundreds_component_plural,
+            pynutil.insert("0") + graph_tens_component_with_leading_zeros,
+        )
+
+        # Keep only outputs that contain at least one non-zero digit.
+        graph_hundreds_component_at_least_one_none_zero_digit = graph_hundreds_component @ (
+            pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT)
+        )
+        self.graph_hundreds_component_at_least_one_none_zero_digit = rewrite(
+            graph_hundreds_component_at_least_one_none_zero_digit
+        ).optimize()
+
+        # Graph thousands (we'll need this for cases of mille millions, mille milliards...)
+        graph_tens_of_hundreds_component_singular = (
+            graph_tens_component + delete_hyphen + graph_cent_singular
+        )  # Tens of hundreds. e.g. 1900 = nineteen hundred / 'dix-neuf cents'
+        graph_tens_of_hundreds_component_singular += (
+            delete_hyphen + graph_tens_component_with_leading_zeros
+        )
+        graph_tens_of_hundreds_component_plural = (
+            graph_tens_component + delete_hyphen + graph_cent_plural
+        )
+        graph_tens_of_hundred_component = (
+            graph_tens_of_hundreds_component_plural | graph_tens_of_hundreds_component_singular
+        )
+
+        # Each magnitude group below yields three digits; when the magnitude word is absent,
+        # "000" is inserted with a small weight.
+        graph_thousands = pynini.union(
+            graph_hundreds_component_at_least_one_none_zero_digit
+            + delete_hyphen
+            + pynutil.delete("mille"),
+            pynutil.insert("001") + pynutil.delete("mille"),  # because 'mille', not 'un mille'
+            pynutil.insert("000", weight=0.1),
+        )
+
+        # All other large amounts
+        graph_millions = pynini.union(
+            graph_hundreds_component_at_least_one_none_zero_digit
+            + delete_hyphen
+            + (pynutil.delete("million") | pynutil.delete("millions")),
+            pynutil.insert("000", weight=0.1),
+        )
+
+        graph_milliards = pynini.union(  # French for English 'billion'
+            graph_hundreds_component_at_least_one_none_zero_digit
+            + delete_hyphen
+            + (pynutil.delete("milliard") | pynutil.delete("milliards")),
+            pynutil.insert("000", weight=0.1),
+        )
+
+        graph_billions = pynini.union(  # NOTE: this is English 'trillion.'
+            graph_hundreds_component_at_least_one_none_zero_digit
+            + delete_hyphen
+            + (pynutil.delete("billions") | pynutil.delete("billion")),
+            pynutil.insert("000", weight=0.1),
+        )
+
+        # NOTE(review): graph_mille_billion is built here but is not referenced by the
+        # final union below — confirm whether it should be part of the graph or removed.
+        graph_mille_billion = pynini.union(
+            graph_hundreds_component_at_least_one_none_zero_digit
+            + delete_hyphen
+            + pynutil.delete("mille"),
+            pynutil.insert("001")
+            + pynutil.delete("mille"),  # because we say 'mille', not 'un mille'
+        )
+        graph_mille_billion += delete_hyphen + (
+            graph_millions | pynutil.insert("000") + pynutil.delete("billions")
+        )  # allow for 'mille millions'
+        graph_mille_billion |= pynutil.insert("000000", weight=0.1)
+
+        graph_billiards = pynini.union(
+            graph_hundreds_component_at_least_one_none_zero_digit
+            + delete_hyphen
+            + (pynutil.delete("billiards") | pynutil.delete("billiard")),
+            pynutil.insert("000", weight=0.1),
+        )
+
+        graph_trillions = pynini.union(  # One thousand English trillions.
+            graph_hundreds_component_at_least_one_none_zero_digit
+            + delete_hyphen
+            + (pynutil.delete("trillions") | pynutil.delete("trillion")),
+            pynutil.insert("000", weight=0.1),
+        )
+
+        graph_trilliards = pynini.union(
+            graph_hundreds_component_at_least_one_none_zero_digit
+            + delete_hyphen
+            + (pynutil.delete("trilliards") | pynutil.delete("trilliard")),
+            pynutil.insert("000", weight=0.1),
+        )
+
+        # Full number: magnitude groups from largest to smallest, each optionally hyphenated
+        # to the next; alternatively a 'tens of hundreds' form or zero.
+        graph = pynini.union(
+            graph_trilliards
+            + delete_hyphen
+            + graph_trillions
+            + delete_hyphen
+            + graph_billiards
+            + delete_hyphen
+            + graph_billions
+            + delete_hyphen
+            + graph_milliards
+            + delete_hyphen
+            + graph_millions
+            + delete_hyphen
+            + graph_thousands
+            + delete_hyphen
+            + graph_hundreds_component,
+            graph_tens_of_hundred_component,
+            graph_zero,
+        )
+
+        # Strip leading zeros from the digit string; a lone "0" is kept as is.
+        graph = graph @ pynini.union(
+            pynutil.delete(pynini.closure("0"))
+            + pynini.difference(DAMO_DIGIT, "0")
+            + pynini.closure(DAMO_DIGIT),
+            "0",
+        )
+
+        # Accept both traditional and current orthography.
+        graph = rewrite(graph)
+
+        self.graph_no_exception = graph.optimize()
+
+        # save self.numbers_up_to_thousand for use in DecimalFst
+        digits_up_to_thousand = DAMO_DIGIT | (DAMO_DIGIT**2) | (DAMO_DIGIT**3)
+        numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize()
+        self.numbers_up_to_thousand = numbers_up_to_thousand
+
+        # save self.numbers_up_to_million for use in DecimalFst
+        digits_up_to_million = (
+            DAMO_DIGIT
+            | (DAMO_DIGIT**2)
+            | (DAMO_DIGIT**3)
+            | (DAMO_DIGIT**4)
+            | (DAMO_DIGIT**5)
+            | (DAMO_DIGIT**6)
+        )
+        numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize()
+        self.numbers_up_to_million = numbers_up_to_million
+
+        # don't convert cardinals from zero to nine inclusive
+        graph_exception = pynini.project(pynini.union(graph_digit, graph_zero), "input")
+
+        self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph
+
+        # Optional leading 'moins ' becomes the field negative: "-" (the space is kept).
+        optional_minus_graph = pynini.closure(
+            pynutil.insert("negative: ") + pynini.cross("moins", '"-"') + DAMO_SPACE, 0, 1
+        )
+
+        final_graph = (
+            optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
+        )
+
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/date.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/date.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    GraphFst,
+    delete_extra_space,
+)
+from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
+from pynini.lib import pynutil
+
+
+class DateFst(GraphFst):
+    """
+    Finite state transducer for classifying date, in the form of (day) month (year) or year
+        e.g. le vingt-quatre juillet deux-mille-treize -> date { day: "24" month: "juli" year: "2013" preserve_order: true }
+        e.g. le vingt-quatre juillet deux-mille-treize -> date { day: "24" month: "juli" year: "2013" preserve_order: true }
+        e.g. le premier janvier -> date { day: "1" month: "janvier"  preserve_order: true }
+
+    Also will convert colloquialism of spelling in which tens of hundreds are used to express date. (e.g. nineteen hundred and four)
+        e.g. le vingt mais dix-neuf-cent-quatre -> date { day: "20" month: "mais" year: "1904" preserve_order: true }
+
+    Args:
+        cardinal: CardinalFst
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="date", kind="classify")
+
+        self.cardinal = cardinal.graph_no_exception
+
+        year_graph = self.cardinal
+
+        month_graph = pynini.string_file(get_abs_path("data/months.tsv"))
+        month_graph = pynutil.insert('month: "') + month_graph + pynutil.insert('"')
+
+        day_graph = self.cardinal | pynini.cross(
+            "premier", "1"
+        )  # Premier is only ordinal used for dates
+        day_graph = pynutil.insert('day: "') + day_graph + pynutil.insert('"')
+        optional_graph_year = pynini.closure(
+            delete_extra_space + pynutil.insert('year: "') + year_graph + pynutil.insert('"'),
+            0,
+            1,
+        )
+        graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
+
+        final_graph = graph_dmy
+        final_graph += pynutil.insert(" preserve_order: true")
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/decimal.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/decimal.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    DAMO_DIGIT,
+    GraphFst,
+    delete_extra_space,
+    delete_hyphen,
+    delete_space,
+)
+from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
+from pynini.lib import pynutil
+
+
+def get_quantity(
+    decimal: "pynini.FstLike", cardinal_up_to_thousand: "pynini.FstLike"
+) -> "pynini.FstLike":
+    """
+    Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral,
+    e.g. one million -> integer_part: "1" quantity: "million"
+    e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million"
+
+    Will tag cases up to denominations of tens of hundreds of thousand. 'douze cent mille millions' -> 1 200 000 millions
+
+    Args:
+        decimal: decimal FST
+        cardinal_up_to_million: cardinal FST
+    """
+    numbers = cardinal_up_to_thousand @ (
+        pynutil.delete(pynini.closure("0"))
+        + pynini.difference(DAMO_DIGIT, "0")
+        + pynini.closure(DAMO_DIGIT)
+    )
+
+    suffix = pynini.union(
+        "million",
+        "millions",
+        "milliard",
+        "milliards",
+        "billion",
+        "billions",
+        "billiard",
+        "billiards",
+        "trillion",
+        "trillions",
+        "trilliard",
+        "trilliards",
+    )
+    res = (
+        pynutil.insert('integer_part: "')
+        + numbers
+        + pynutil.insert('"')
+        + (
+            pynini.union(delete_hyphen, delete_extra_space)
+        )  # Can be written either as 'deux-millions' or 'deux millions' depending on whether it registers as a noun or part of cardinal.
+        + pynutil.insert(' quantity: "')
+        + suffix
+        + pynutil.insert('"')
+    )
+    res |= (
+        decimal + delete_extra_space + pynutil.insert(' quantity: "') + suffix + pynutil.insert('"')
+    )
+    return res
+
+
+class DecimalFst(GraphFst):
+    """
+    Finite state transducer for classifying decimal
+        Decimal point is "," (virgule).
+            e.g. moins un virgule deux six -> decimal { negative: "true" integer_part: "1" fractional_part: "26" }
+
+        This decimal rule assumes that decimals can be pronounced as:
+        (a cardinal) + ('virgule') plus (any sequence of cardinals <1 million, including 'zero')
+
+        Also writes large numbers in shortened form, e.g.
+            e.g. un virgule deux-six-million -> decimal { negative: "false" integer_part: "1" fractional_part: "26" quantity: "million" }
+            e.g. deux-million -> decimal { negative: "false" integer_part: "2" quantity: "millions" }
+            e.g. moins cent-vingt-quatre-millions -> decimal { negative: "true" integer_part: "124" quantity: "millions" }
+    Args:
+        cardinal: CardinalFst
+
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="decimal", kind="classify")
+
+        # number after decimal point can be any series of cardinals <1 million, including 'zero'
+        graph_decimal = cardinal.numbers_up_to_million
+        graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal
+        self.graph = graph_decimal
+
+        # decimal point is denote by virgule
+        graph_fractional_separator = pynutil.delete("virgule")
+
+        # Possible negatives
+        optional_graph_negative = (
+            pynutil.insert("negative: ") + pynini.cross("moins", '"true"') + delete_extra_space
+        )
+        optional_graph_negative = optional_graph_negative.ques
+
+        # Fractional portion
+        graph_fractional = (
+            pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"')
+        )
+
+        # Integers
+        cardinal_graph = cardinal.graph_no_exception | pynini.string_file(
+            get_abs_path("data/numbers/zero.tsv")
+        )
+        graph_integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')
+
+        # Final graphs
+        final_graph_wo_sign = (
+            pynini.closure(graph_integer + delete_extra_space, 0, 1)
+            + graph_fractional_separator
+            + delete_extra_space
+            + graph_fractional
+        )
+        final_graph = optional_graph_negative + final_graph_wo_sign
+
+        self.final_graph_wo_negative = final_graph_wo_sign | get_quantity(
+            final_graph_wo_sign, cardinal.graph_hundreds_component_at_least_one_none_zero_digit
+        )
+        final_graph |= optional_graph_negative + get_quantity(
+            final_graph_wo_sign, cardinal.graph_hundreds_component_at_least_one_none_zero_digit
+        )
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/electronic.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/electronic.py
+import pynini
+from fun_text_processing.inverse_text_normalization.fr.graph_utils import (
+    DAMO_ALPHA,
+    GraphFst,
+    insert_space,
+)
+from fun_text_processing.inverse_text_normalization.fr.utils import get_abs_path
+from pynini.lib import pynutil
+
+
+class ElectronicFst(GraphFst):
+    """
+    Finite state transducer for classifying 'electronic' semiotic classes, i.e.
+    email address (which get converted to "username" and "domain" fields),
+    and URLS (which get converted to a "protocol" field).
+        e.g. c d f une arobase a b c point e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }
+        e.g. double vé double vé double vé a b c point e d u -> tokens { electronic { protocol: "www.abc.edu" } }
+    """
+
+    def __init__(self):
+        super().__init__(name="electronic", kind="classify")
+
+        # Local helper: deletes exactly one space. Unlike graph_utils.delete_extra_space,
+        # it does not collapse whitespace runs into a single space.
+        delete_extra_space = pynutil.delete(" ")
+        # A single ASCII letter, or a spoken digit/zero mapped through the number data files.
+        alpha_num = (
+            DAMO_ALPHA
+            | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
+            | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
+        )
+
+        symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv"))
+        # Despite the name, these are the spoken forms of the at-sign ('@').
+        ampersand = pynini.string_map([("arobase"), ("chez"), ("at"), ("à")])
+
+        accepted_username = alpha_num | symbols
+        process_dot = pynini.cross("point", ".")
+        # Username: space-separated items that start and end with an alphanumeric;
+        # symbols are only allowed in between. At least two items are required.
+        username = (
+            pynutil.insert('username: "')
+            + alpha_num
+            + delete_extra_space
+            + pynini.closure(accepted_username + delete_extra_space)
+            + alpha_num
+            + pynutil.insert('"')
+        )
+        # One or more space-separated alphanumerics, joined without spaces.
+        single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num
+        server = single_alphanum | pynini.string_file(
+            get_abs_path("data/electronic/server_name.tsv")
+        )
+        domain = single_alphanum | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
+        # Domain: <server> point <domain> -> server.domain
+        domain_graph = (
+            pynutil.insert('domain: "')
+            + server
+            + delete_extra_space
+            + process_dot
+            + delete_extra_space
+            + domain
+            + pynutil.insert('"')
+        )
+        # Email: username, spoken at-sign (deleted), then domain; a space is inserted
+        # between the two fields.
+        graph = (
+            username
+            + delete_extra_space
+            + pynutil.delete(ampersand)
+            + insert_space
+            + delete_extra_space
+            + domain_graph
+        )
+
+        ############# url ###
+        protocol_end = pynini.cross(
+            pynini.union("www", "w w w", "double vé double vé double vé"), "www"
+        )
+        protocol_start = pynini.cross(pynini.union("http", "h t t p", "ache té té pé"), "http")
+        protocol_start |= pynini.cross(
+            pynini.union("https", "h t t p s", "ache té té pé esse"), "https"
+        )
+        # Spoken forms of '://' (surrounding spaces are part of the match).
+        protocol_start += pynini.cross(
+            pynini.union(
+                " deux-points barre oblique barre oblique ",
+                " deux-points barre barre ",
+                " deux-points double barre ",
+                " deux-points slash slash ",
+            ),
+            "://",
+        )
+
+        # e.g. .com, .es
+        # A symbol followed by either a domain or a run of username items.
+        ending = (
+            delete_extra_space
+            + symbols
+            + delete_extra_space
+            + (domain | pynini.closure(accepted_username + delete_extra_space) + accepted_username)
+        )
+
+        # URL: optional scheme, 'www', spoken dot, server part, then one or more endings.
+        protocol = (
+            pynini.closure(protocol_start, 0, 1)
+            + protocol_end
+            + delete_extra_space
+            + process_dot
+            + delete_extra_space
+            + (pynini.closure(delete_extra_space + accepted_username, 1) | server)
+            + pynini.closure(ending, 1)
+        )
+        protocol = pynutil.insert('protocol: "') + protocol + pynutil.insert('"')
+        graph |= protocol
+        ########
+
+        final_graph = self.add_tokens(graph)
+        self.fst = final_graph.optimize()