initial commit

70a8a9e0 · wangwei990215 · 70a8a9e0 · 70a8a9e0 · 70a8a9e0 · 70a8a9e0
Commit 70a8a9e0 authored Oct 03, 2024 by wangwei990215
20 changed files
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/numbers/digit_ties2.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/numbers/digit_ties2.tsv
+apat na	4
+siyam na	9
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/numbers/hundred.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/numbers/hundred.tsv
+daan
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/numbers/teen.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/numbers/teen.tsv
+sampu	10
+labing-isa	11
+labindalawa	12
+labintatlo	13
+labing-apat	14
+labinlima	15
+labing-anim	16
+labimpito	17
+labingwalo	18
+labinsiyam	19
+dalawampu	20
+tatlumpu	30
+apatnapu	40
+limampu	50
+animnapu	60
+pitumpu	70
+walumpu	80
+siyamnapu	90
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/numbers/thousands.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/numbers/thousands.tsv
+libo
+milyon
+bilyon
+trilyon
+quadrillion
+quintillion
+sextillion
+septillion
+octillion
+nonillion
+decillion
+undecillion
+duodecillion
+tredecillion
+quattuordecillion
+quindecillion
+sexdecillion
+septendecillion
+octodecillion
+novemdecillion
+vigintillion
+centillion
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/numbers/ties.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/numbers/ties.tsv
+dalawampu't	2
+tatlumpu't	3
+apatnapu't	4
+limampu't	5
+animnapu't	6
+pitumpu't	7
+walumpu't	8
+siyamnapu't	9
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/numbers/zero.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/numbers/zero.tsv
+sero	0
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/ordinals/__init__.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/ordinals/__init__.py
+
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/ordinals/digit.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/ordinals/digit.tsv
+una	isa
+pangalawang	dalawa
+pangatlo	tatlo
+ikaapat	apat
+ikalimang	lima
+ikaanim	anim
+ikapitong	pito
+ikawalo	walo
+ikasiyam	siyam
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/ordinals/teen.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/ordinals/teen.tsv
+ikalabindalawa	labindalawa
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/suppletive.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/suppletive.tsv
+deer
+fish
+sheep
+foot	feet
+goose	geese
+man	men
+mouse	mice
+tooth	teeth
+woman	women
+won
+child	children
+ox	oxen
+wife	wives
+wolf	wolves
+analysis	analyses
+criterion	criteria
+lbs
+focus	foci
+percent
+hertz
+kroner	krone
+inch	inches
+calory	calories
+yen
+megahertz
+gigahertz
+kilohertz
+hertz
+CC
+c c
+horsepower
+hundredweight
+kilogram force	kilograms force
+mega siemens
+revolution per minute	revolutions per minute
+mile per hour	miles per hour
+megabit per second	megabits per second
+square foot	square feet
+kilobit per second	kilobits per second
+degree Celsius	degrees Celsius
+degree Fahrenheit	degrees Fahrenheit
+ATM
+AU
+BQ
+CC
+CD
+DA
+EB
+EV
+F
+GB
+G
+GL
+GPA
+GY
+HA
+H
+HL
+GP
+HS
+KB
+KL
+KN
+KT
+KV
+LM
+MA
+MA
+MB
+MC
+MF
+M
+MM
+MS
+MV
+MW
+PB
+PG
+PS
+S
+TB
+YB
+ZB
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/time/__init__.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/time/__init__.py
+
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/time/minute_to.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/time/minute_to.tsv
+1	59
+2	58
+3	57
+4	56
+5	55
+6	54
+7	53
+8	52
+9	51
+10	50
+11	49
+12	48
+13	47
+14	46
+15	45
+16	44
+17	43
+18	42
+19	41
+20	40
+21	39
+22	38
+23	37
+24	36
+25	35
+26	34
+27	33
+28	32
+29	31
+30	30
+31	29
+32	28
+33	27
+34	26
+35	25
+36	24
+37	23
+38	22
+39	21
+40	20
+41	19
+42	18
+43	17
+44	16
+45	15
+46	14
+47	13
+48	12
+49	11
+50	10
+51	9
+52	8
+53	7
+54	6
+55	5
+56	4
+57	3
+58	2
+59	1
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/time/time_suffix.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/time/time_suffix.tsv
+p m	p.m.
+pm	p.m.
+p.m.
+p.m	p.m.
+am	a.m.
+a.m.
+a.m	a.m.
+a m	a.m.
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/time/time_zone.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/time/time_zone.tsv
+cst	c s t
+cet	c e t
+pst	p s t
+est	e s t
+pt	p t
+et	e t
+gmt	g m t
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/time/to_hour.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/time/to_hour.tsv
+one	12
+two	1
+three	2
+four	3
+five	4
+six	5
+seven	6
+eight	7
+nine	8
+ten	9
+eleven	10
+twelve	11
\ No newline at end of file
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/data/whitelist.tsv
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/data/whitelist.tsv
+e.g.	for example
+dr.	doctor
+mr.	mister
+mrs.	misses
+st.	saint
+7-eleven	seven eleven
+es3	e s three
+s&p	s and p
+ASAP	a s a p
+AT&T	a t and t
+LLP	l l p
+ATM	a t m
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/graph_utils.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/graph_utils.py
+import os
+import string
+from pathlib import Path
+from typing import Dict
+
+import pynini
+from fun_text_processing.inverse_text_normalization.tl.utils import get_abs_path
+from pynini import Far
+from pynini.examples import plurals
+from pynini.export import export
+from pynini.lib import byte, pynutil, utf8
+
+# Any single valid UTF-8 encoded character (taken from pynini's utf8 library).
+DAMO_CHAR = utf8.VALID_UTF8_CHAR
+
+# Single-character classes built from pynini's byte library and Python's `string` constants.
+DAMO_DIGIT = byte.DIGIT
+DAMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
+DAMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
+DAMO_ALPHA = pynini.union(DAMO_LOWER, DAMO_UPPER).optimize()
+DAMO_ALNUM = pynini.union(DAMO_DIGIT, DAMO_ALPHA).optimize()
+DAMO_HEX = pynini.union(*string.hexdigits).optimize()
+DAMO_NON_BREAKING_SPACE = "\u00A0"
+DAMO_SPACE = " "
+# Space, tab, newline, carriage return and non-breaking space.
+DAMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize()
+# Any character that is not in DAMO_WHITE_SPACE / not a double quote, respectively.
+DAMO_NOT_SPACE = pynini.difference(DAMO_CHAR, DAMO_WHITE_SPACE).optimize()
+DAMO_NOT_QUOTE = pynini.difference(DAMO_CHAR, r'"').optimize()
+
+DAMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
+DAMO_GRAPH = pynini.union(DAMO_ALNUM, DAMO_PUNCT).optimize()
+
+# Zero or more characters, i.e. any string.
+DAMO_SIGMA = pynini.closure(DAMO_CHAR)
+
+# Whitespace helpers: delete any run (possibly empty) of whitespace, delete at most one
+# whitespace character, insert a single space, and squeeze one-or-more whitespace
+# characters into exactly one space.
+delete_space = pynutil.delete(pynini.closure(DAMO_WHITE_SPACE))
+delete_zero_or_one_space = pynutil.delete(pynini.closure(DAMO_WHITE_SPACE, 0, 1))
+insert_space = pynutil.insert(" ")
+delete_extra_space = pynini.cross(pynini.closure(DAMO_WHITE_SPACE, 1), " ")
+# Deletes any number of trailing ` preserve_order: true` / ` field_order: "..."` markers.
+# NOTE(review): DAMO_NOT_QUOTE is not wrapped in a closure here, so only a single
+# character is accepted between the quotes of field_order - confirm this is intended.
+delete_preserve_order = pynini.closure(
+    pynutil.delete(" preserve_order: true")
+    | (pynutil.delete(' field_order: "') + DAMO_NOT_QUOTE + pynutil.delete('"'))
+)
+
+# Irregular singular -> plural pairs loaded from data/suppletive.tsv.
+suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
+# _v = pynini.union("a", "e", "i", "o", "u")
+# Lowercase ASCII consonants.
+_c = pynini.union(
+    "b",
+    "c",
+    "d",
+    "f",
+    "g",
+    "h",
+    "j",
+    "k",
+    "l",
+    "m",
+    "n",
+    "p",
+    "q",
+    "r",
+    "s",
+    "t",
+    "v",
+    "w",
+    "x",
+    "y",
+    "z",
+)
+# Regular pluralization rules:
+#   consonant + "y"                     -> replace "y" with "ies"
+#   ending in "s", "sh", "ch", "x", "z" -> append "es"
+#   anything else                       -> append "s"
+_ies = DAMO_SIGMA + _c + pynini.cross("y", "ies")
+_es = DAMO_SIGMA + pynini.union("s", "sh", "ch", "x", "z") + pynutil.insert("es")
+_s = DAMO_SIGMA + pynutil.insert("s")
+
+# Nested priority unions: the rules are listed in the order
+# suppletive, _ies, _es, _s, with the earlier argument of each
+# priority union taking precedence over the later one.
+graph_plural = plurals._priority_union(
+    suppletive,
+    plurals._priority_union(_ies, plurals._priority_union(_es, _s, DAMO_SIGMA), DAMO_SIGMA),
+    DAMO_SIGMA,
+).optimize()
+
+SINGULAR_TO_PLURAL = graph_plural
+PLURAL_TO_SINGULAR = pynini.invert(graph_plural)
+# Per-letter ASCII case mappings (A->a ... Z->z) and the inverse.
+TO_LOWER = pynini.union(
+    *[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]
+)
+TO_UPPER = pynini.invert(TO_LOWER)
+# Small negative / positive weight constants.
+MIN_NEG_WEIGHT = -0.0001
+MIN_POS_WEIGHT = 0.0001
+
+
+def generator_main(file_name: str, graphs: Dict[str, "pynini.FstLike"]):
+    """
+    Exports graph as OpenFst finite state archive (FAR) file with given file name and rule name.
+
+    Args:
+        file_name: exported file name
+        graphs: Mapping of a rule name and Pynini WFST graph to be exported
+    """
+    exporter = export.Exporter(file_name)
+    for rule, graph in graphs.items():
+        exporter[rule] = graph.optimize()
+    exporter.close()
+    print(f"Created {file_name}")
+
+
+def get_plurals(fst):
+    """
+    Given singular returns plurals
+
+    Args:
+        fst: Fst
+
+    Returns plurals to given singular forms
+    """
+    return SINGULAR_TO_PLURAL @ fst
+
+
+def get_singulars(fst):
+    """
+    Given plural returns singulars
+
+    Args:
+        fst: Fst
+
+    Returns singulars to given plural forms
+    """
+    return PLURAL_TO_SINGULAR @ fst
+
+
+def convert_space(fst) -> "pynini.FstLike":
+    """
+    Converts space to nonbreaking space.
+    Used only in tagger grammars for transducing token values within quotes, e.g. name: "hello kitty"
+    This is making transducer significantly slower, so only use when there could be potential spaces within quotes, otherwise leave it.
+
+    Args:
+        fst: input fst
+
+    Returns output fst where breaking spaces are converted to non breaking spaces
+    """
+    return fst @ pynini.cdrewrite(
+        pynini.cross(DAMO_SPACE, DAMO_NON_BREAKING_SPACE), "", "", DAMO_SIGMA
+    )
+
+
+class GraphFst:
+    """
+    Base class for all grammar fsts.
+
+    Args:
+        name: name of grammar class
+        kind: either 'classify' or 'verbalize'
+        deterministic: if True will provide a single transduction option,
+            for False multiple transduction are generated (used for audio-based normalization)
+    """
+
+    def __init__(self, name: str, kind: str, deterministic: bool = True):
+        self.name = name
+        self.kind = kind
+        self._fst = None
+        self.deterministic = deterministic
+
+        self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
+        if self.far_exist():
+            self._fst = Far(
+                self.far_path, mode="r", arc_type="standard", far_type="default"
+            ).get_fst()
+
+    def far_exist(self) -> bool:
+        """
+        Returns true if FAR can be loaded
+        """
+        return self.far_path.exists()
+
+    @property
+    def fst(self) -> "pynini.FstLike":
+        return self._fst
+
+    @fst.setter
+    def fst(self, fst):
+        self._fst = fst
+
+    def add_tokens(self, fst) -> "pynini.FstLike":
+        """
+        Wraps class name around to given fst
+
+        Args:
+            fst: input fst
+
+        Returns:
+            Fst: fst
+        """
+        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")
+
+    def delete_tokens(self, fst) -> "pynini.FstLike":
+        """
+        Deletes class name wrap around output of given fst
+
+        Args:
+            fst: input fst
+
+        Returns:
+            Fst: fst
+        """
+        res = (
+            pynutil.delete(f"{self.name}")
+            + delete_space
+            + pynutil.delete("{")
+            + delete_space
+            + fst
+            + delete_space
+            + pynutil.delete("}")
+        )
+        return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/__init__.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/__init__.py
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/cardinal.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/cardinal.py
+import pynini
+from fun_text_processing.inverse_text_normalization.tl.utils import get_abs_path, num_to_word
+from fun_text_processing.inverse_text_normalization.tl.graph_utils import (
+    DAMO_ALPHA,
+    DAMO_DIGIT,
+    DAMO_SIGMA,
+    DAMO_SPACE,
+    GraphFst,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+
+class CardinalFst(GraphFst):
+    """
+    Finite state transducer for classifying cardinals
+        e.g. minus twenty three -> cardinal { negative: "-" integer: "23" }
+    (the example uses English wording; the number tables loaded below come from
+    the tl/data/numbers directory - TODO confirm an equivalent Tagalog example).
+    The spoken forms that num_to_word returns for 0 through 12 are excluded from
+    the accepted input, so those numbers are not converted.
+    """
+
+    def __init__(self):
+        super().__init__(name="cardinal", kind="classify")
+        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
+        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
+
+        # NOTE(review): graph_digit_ties_all is not referenced again in this constructor.
+        graph_digit_ties_all = pynini.string_file(get_abs_path("data/numbers/digit_ties.tsv"))
+
+        graph_digit_ties1 = pynini.string_file(get_abs_path("data/numbers/digit_ties1.tsv"))
+        graph_digit_ties2 = pynini.string_file(get_abs_path("data/numbers/digit_ties2.tsv"))
+
+        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
+        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
+
+        # Tens word followed by a space (removed) and a digit word.
+        graph_teens_using_ties = graph_ties + pynini.cross(" ", "") + graph_digit
+
+        addzero = pynutil.insert("0")
+        # NOTE(review): `zero` is not referenced again in this constructor.
+        zero = graph_zero
+
+        ## Single-digit numbers
+        graph_digits = graph_digit | graph_zero
+        digits = graph_digits
+        digit = graph_digit
+        digit_ties = graph_digit_ties1 | graph_digit_ties2
+
+        ## Two-digit numbers
+        graph_teens = graph_teen | graph_teens_using_ties
+        teens = graph_teens
+
+        ## Three-digit numbers: "daan" means hundred; "raan" also means hundred (only used with 4 and 9)
+        graph_hundred1 = pynutil.delete("daan")
+        graph_hundred2 = pynutil.delete("raan")
+        # Deletes the linking word "at" between number components.
+        graph_at = pynutil.delete("at")
+
+        delete_at = graph_at
+
+        graph_hundred_component1 = graph_digit_ties1 + delete_space + graph_hundred1
+        graph_hundred_component2 = graph_digit_ties2 + delete_space + graph_hundred2
+
+        graph_hundred_component = graph_hundred_component1 | graph_hundred_component2
+
+        # Either a bare hundreds word (padded with "00"), or hundreds + "at" + a
+        # two-digit remainder; a lone digit word is padded with a zero on either
+        # side, the two paddings carrying different weights.
+        hundred = (graph_hundred_component + pynutil.insert("00")) | (
+            graph_hundred_component
+            + delete_space
+            + delete_at
+            + delete_space
+            + (
+                teens
+                | pynutil.add_weight(addzero + digit, 0.1)
+                | pynutil.add_weight(digit + addzero, 0.5)
+            )
+        )
+        # Restrict the output of `hundred` to digit strings containing at least one non-zero digit.
+        graph_hundred_component_at_least_one_none_zero_digit = hundred @ (
+            pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT)
+        )
+        self.graph_hundred_component_at_least_one_none_zero_digit = (
+            graph_hundred_component_at_least_one_none_zero_digit
+        )
+
+        ## Thousands: "libo" means thousand
+        # The part after "libo" is padded with inserted zeros to three digits.
+        thousand = (
+            (hundred | teens | digit_ties)
+            + delete_space
+            + pynutil.delete("libo")
+            + delete_space
+            + (
+                hundred
+                | (delete_at + delete_space).ques + pynutil.add_weight(addzero + teens, 0.1)
+                | (delete_at + delete_space).ques + pynutil.add_weight(addzero**2 + digit, 0.5)
+                | pynutil.add_weight(digit + addzero**2, 0.8)
+                | pynutil.add_weight(addzero**3, 1.0)
+            )
+        )
+
+        ## Millions: "milyon" means million
+        # million = (((hundred | teens | digit_ties) + delete_space + pynutil.delete("milyon") | pynutil.insert("000", weight=0.1))+ delete_space + (
+        #            thousand
+        #            | pynutil.add_weight(addzero + hundred, 0.1)
+        #            | (delete_at + delete_space).ques + pynutil.add_weight(addzero**2 + teens, 0.5)
+        #            | (delete_at + delete_space).ques + pynutil.add_weight(addzero + addzero + addzero + digit, 0.5)
+        #            | pynutil.add_weight(digit + addzero**3, 0.8)
+        #            | pynutil.add_weight(addzero**4, 1.0)))
+
+        # Each magnitude group below is either "<non-zero hundred component> <magnitude word>"
+        # or, when the group is not spoken, an inserted "000" with weight 0.1.
+        graph_million = pynini.union(
+            graph_hundred_component_at_least_one_none_zero_digit
+            + delete_space
+            + pynutil.delete("milyon"),
+            pynutil.insert("000", weight=0.1),
+        )
+
+        ## Billions: "bilyon" means billion
+        graph_billion = pynini.union(
+            graph_hundred_component_at_least_one_none_zero_digit
+            + delete_space
+            + pynutil.delete("bilyon"),
+            pynutil.insert("000", weight=0.1),
+        )
+
+        ## Trillions: "trilyon" means trillion
+        graph_trillion = pynini.union(
+            graph_hundred_component_at_least_one_none_zero_digit
+            + delete_space
+            + pynutil.delete("trilyon"),
+            pynutil.insert("000", weight=0.1),
+        )
+
+        ## Quadrillions, quintillions and sextillions
+        # NOTE(review): these match "quadrilyon" / "quintilyon" / "sextilyon", whereas
+        # data/numbers/thousands.tsv spells them "quadrillion" / "quintillion" /
+        # "sextillion" - confirm which spelling the input actually uses.
+        graph_quadrillion = pynini.union(
+            graph_hundred_component_at_least_one_none_zero_digit
+            + delete_space
+            + pynutil.delete("quadrilyon"),
+            pynutil.insert("000", weight=0.1),
+        )
+        graph_quintillion = pynini.union(
+            graph_hundred_component_at_least_one_none_zero_digit
+            + delete_space
+            + pynutil.delete("quintilyon"),
+            pynutil.insert("000", weight=0.1),
+        )
+        graph_sextillion = pynini.union(
+            graph_hundred_component_at_least_one_none_zero_digit
+            + delete_space
+            + pynutil.delete("sextilyon"),
+            pynutil.insert("000", weight=0.1),
+        )
+
+        # Full number: the long chain of magnitude groups, or any of the smaller standalone forms.
+        # NOTE(review): the chain ends with `thousand` followed by `graph_hundred_component`,
+        # although `thousand` already consumes the sub-thousand part - confirm this is intended.
+        graph = pynini.union(
+            graph_sextillion
+            + delete_space
+            + graph_quintillion
+            + delete_space
+            + graph_quadrillion
+            + delete_space
+            + graph_trillion
+            + delete_space
+            + graph_billion
+            + delete_space
+            + graph_million
+            + delete_space
+            + thousand
+            + delete_space
+            + graph_hundred_component,
+            thousand,
+            hundred,
+            teens,
+            digits,
+            graph_zero,
+        )
+
+        # graph = zero | digits | teens | hundred | thousand | million
+
+        # Strip leading zeros from the digit string; a lone "0" is kept as is.
+        graph = graph @ pynini.union(
+            pynutil.delete(pynini.closure("0"))
+            + pynini.difference(DAMO_DIGIT, "0")
+            + pynini.closure(DAMO_DIGIT),
+            "0",
+        )
+
+        # Spoken forms of 0..12 as produced by num_to_word; removed from the accepted input below.
+        labels_exception = [num_to_word(x) for x in range(0, 13)]
+        graph_exception = pynini.union(*labels_exception)
+
+        # graph = (
+        #    pynini.cdrewrite(pynutil.delete("and"), DAMO_SPACE, DAMO_SPACE, DAMO_SIGMA)
+        #    @ (DAMO_ALPHA + DAMO_SIGMA)
+        #    @ graph
+        # )
+
+        # Graph without the 0..12 exclusion, exposed as an attribute.
+        self.graph_no_exception = graph
+
+        # Accept every input of `graph` except the exception strings, then map it through `graph`.
+        self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph
+
+        # Optional leading "minus " is emitted as `negative: "-" ` before the integer field.
+        optional_minus_graph = pynini.closure(
+            pynutil.insert("negative: ") + pynini.cross("minus", '"-"') + DAMO_SPACE, 0, 1
+        )
+
+        final_graph = (
+            optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
+        )
+
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/date.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/date.py
+import pynini
+from fun_text_processing.inverse_text_normalization.tl.utils import get_abs_path
+from fun_text_processing.inverse_text_normalization.tl.graph_utils import (
+    DAMO_ALPHA,
+    DAMO_DIGIT,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+# Number tables loaded once at import time and used by the helper graphs in this module.
+# teen.tsv holds the words for 10-19 and the round tens 20-90; ties.tsv maps the
+# linking tens forms (ending in "'t") to their tens digit.
+graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")).optimize()
+graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize()
+ties_graph = pynini.string_file(get_abs_path("data/numbers/ties.tsv")).optimize()
+
+
+def _get_month_graph():
+    """
+    Transducer for month, e.g. march -> march
+    """
+    month_graph = pynini.string_file(get_abs_path("data/months.tsv"))
+    return month_graph
+
+
+def _get_ties_graph():
+    """
+    Transducer for 20-99 e.g
+    twenty three -> 23
+    """
+    graph = ties_graph + (delete_space + graph_digit | pynutil.insert("0"))
+    return graph
+
+
+def _get_range_graph():
+    """
+    Transducer for decades (1**0s, 2**0s), centuries (2*00s, 1*00s), millennia (2000s)
+
+    Returns a graph whose output is four digits starting with 1 or 2, followed by "s".
+    """
+    graph_ties = _get_ties_graph()
+    # Centuries: <two-digit number> "hundreds" -> NN00s
+    graph = (graph_ties | graph_teen) + delete_space + pynini.cross("hundreds", "00s")
+    # Millennia: "two thousands" -> 2000s
+    graph |= pynini.cross("two", "2") + delete_space + pynini.cross("thousands", "000s")
+    # Decades: <two-digit number> followed by a pluralized tens word. Because `@` binds
+    # tighter than `+`, only the de-pluralizing piece (strip "s" or turn "ies" into "y")
+    # is composed with (graph_ties | "ten" -> "10"); the trailing "s" is then inserted.
+    graph |= (
+        (graph_ties | graph_teen)
+        + delete_space
+        + (pynini.closure(DAMO_ALPHA, 1) + (pynini.cross("ies", "y") | pynutil.delete("s")))
+        @ (graph_ties | pynini.cross("ten", "10"))
+        + pynutil.insert("s")
+    )
+    # Keep only outputs of the form [12]DDDs.
+    graph @= pynini.union("1", "2") + DAMO_DIGIT + DAMO_DIGIT + DAMO_DIGIT + "s"
+    return graph
+
+
+def _get_year_graph():
+    """
+    Transducer for year, e.g. twenty twenty -> 2020
+    """
+
+    def _get_digits_graph():
+        zero = pynini.cross((pynini.accep("oh") | pynini.accep("o")), "0")
+        graph = zero + delete_space + graph_digit
+        graph.optimize()
+        return graph
+
+    def _get_thousands_graph():
+        graph_ties = _get_ties_graph()
+        graph_hundred_component = (
+            graph_digit + delete_space + pynutil.delete("hundred")
+        ) | pynutil.insert("0")
+        graph = (
+            graph_digit
+            + delete_space
+            + pynutil.delete("thousand")
+            + delete_space
+            + graph_hundred_component
+            + delete_space
+            + (graph_teen | graph_ties)
+        )
+        return graph
+
+    graph_ties = _get_ties_graph()
+    graph_digits = _get_digits_graph()
+    graph_thousands = _get_thousands_graph()
+    year_graph = (
+        # 20 19, 40 12, 2012 - assuming no limit on the year
+        (graph_teen + delete_space + (graph_ties | graph_digits | graph_teen))
+        | (graph_ties + delete_space + (graph_ties | graph_digits | graph_teen))
+        | graph_thousands
+    )
+    year_graph.optimize()
+    return year_graph
+
+
+class DateFst(GraphFst):
+    """
+    Finite state transducer for classifying date,
+        e.g. january fifth twenty twelve -> date { month: "january" day: "5" year: "2012" preserve_order: true }
+        e.g. the fifth of january twenty twelve -> date { day: "5" month: "january" year: "2012" preserve_order: true }
+        e.g. twenty twenty -> date { year: "2012" preserve_order: true }
+
+    Args:
+        ordinal: OrdinalFst
+    """
+
+    def __init__(self, ordinal: GraphFst):
+        super().__init__(name="date", kind="classify")
+
+        ordinal_graph = ordinal.graph
+        year_graph = _get_year_graph()
+        YEAR_WEIGHT = 0.001
+        year_graph = pynutil.add_weight(year_graph, YEAR_WEIGHT)
+        month_graph = _get_month_graph()
+
+        month_graph = pynutil.insert('month: "') + month_graph + pynutil.insert('"')
+
+        day_graph = (
+            pynutil.insert('day: "') + pynutil.add_weight(ordinal_graph, -0.7) + pynutil.insert('"')
+        )
+        graph_year = (
+            delete_extra_space
+            + pynutil.insert('year: "')
+            + pynutil.add_weight(year_graph, -YEAR_WEIGHT)
+            + pynutil.insert('"')
+        )
+        optional_graph_year = pynini.closure(
+            graph_year,
+            0,
+            1,
+        )
+        graph_mdy = month_graph + (
+            (delete_extra_space + day_graph)
+            | graph_year
+            | (delete_extra_space + day_graph + graph_year)
+        )
+        graph_dmy = (
+            pynutil.delete("the")
+            + delete_space
+            + day_graph
+            + delete_space
+            + pynutil.delete("of")
+            + delete_extra_space
+            + month_graph
+            + optional_graph_year
+        )
+        graph_year = (
+            pynutil.insert('year: "') + (year_graph | _get_range_graph()) + pynutil.insert('"')
+        )
+
+        final_graph = graph_mdy | graph_dmy | graph_year
+        final_graph += pynutil.insert(" preserve_order: true")
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()