initial commit

70a8a9e0 · wangwei990215 · 70a8a9e0 · 70a8a9e0 · 70a8a9e0 · 70a8a9e0
Commit 70a8a9e0 authored Oct 03, 2024 by wangwei990215
20 changed files
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/__init__.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/__init__.py
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/cardinal.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/cardinal.py
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+import pynini
+from pynini import accep, cross, string_file, union
+from pynini.lib.pynutil import delete, insert, add_weight
+from fun_text_processing.inverse_text_normalization.ja.utils import get_abs_path
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
+    DAMO_ALPHA,
+    DAMO_DIGIT,
+    DAMO_CHAR,
+    DAMO_SIGMA,
+    DAMO_SPACE,
+    GraphFst,
+    delete_space,
+)
+from pynini.lib import pynutil
+import unicodedata
+
+
+class CardinalFst(GraphFst):
+    """
+    Finite state transducer for classifying Japanese cardinals,
+        e.g. (English gloss) minus twenty three -> cardinal { negative: "-" integer: "23" }
+
+    Args:
+        enable_standalone_number: when True, a standalone number is accepted as a cardinal in
+            addition to dotted strings, floats and the fixed-length digit strings.
+        enable_0_to_9: only consulted when enable_standalone_number is True. When True the full
+            number grammar is accepted; when False only digit strings of two or more digits and
+            the teen/tens/.../兆 magnitude graphs are accepted.
+    """
+
+    def __init__(self, enable_standalone_number: bool = True, enable_0_to_9: bool = True):
+        super().__init__(name="cardinal", kind="classify")
+        self.enable_standalone_number = enable_standalone_number
+        self.enable_0_to_9 = enable_0_to_9
+        # Lexicons loaded from the data directory (the .tsv contents are not visible here).
+        zero = string_file(get_abs_path("data/numbers/zero.tsv"))
+        digit = string_file(get_abs_path("data/numbers/digit.tsv"))
+        hundred_digit = string_file(get_abs_path("data/numbers/hundred_digit.tsv"))
+        sign = string_file(get_abs_path("data/numbers/sign.tsv"))
+        dot = string_file(get_abs_path("data/numbers/dot.tsv"))
+        ties = string_file(get_abs_path("data/numbers/ties.tsv"))
+        graph_teen = string_file(get_abs_path("data/numbers/teen.tsv"))
+
+        # addzero emits a "0" without consuming any input; it pads missing positions.
+        addzero = insert("0")
+        digits = zero | digit  # 0 ~ 9
+        teen = graph_teen
+        # "十" optionally followed by a digit -> "1" + digit, or "10" when nothing follows.
+        teen |= cross("十", "1") + (digit | addzero)
+        # NOTE(review): the first alternative (ties + addzero) is already covered by the second.
+        tens = ties + addzero | (ties + (digit | addzero))
+
+        # <digit>百<rest>: the two remaining positions are filled by tens/teen or zero-padded.
+        # The add_weight values rank the padded alternatives (presumably higher = less preferred).
+        hundred = (
+            digit
+            + delete("百")
+            + (
+                tens
+                | teen
+                | add_weight(zero + digit, 0.1)
+                | add_weight(digit + addzero, 0.5)
+                | add_weight(addzero**2, 1.0)
+            )
+        )
+        # Bare "百" with no leading digit counts as "1" in the hundreds position.
+        hundred |= cross("百", "1") + (
+            tens
+            | teen
+            | add_weight(zero + digit, 0.1)
+            | add_weight(digit + addzero, 0.5)
+            | add_weight(addzero**2, 1.0)
+        )
+        hundred |= hundred_digit
+
+        # Each magnitude below follows the same pattern: <prefix> + delete(<unit>) + <padded tail>.
+        thousand = (
+            (hundred | teen | tens | digits)
+            + delete("千")
+            + (
+                hundred
+                | add_weight(zero + tens, 0.1)
+                | add_weight(addzero + zero + digit, 0.5)
+                | add_weight(digit + addzero**2, 0.8)
+                | add_weight(addzero**3, 1.0)
+            )
+        )
+        ten_thousand = (
+            (thousand | hundred | teen | tens | digits)
+            + delete("万")
+            + (
+                thousand
+                | add_weight(zero + hundred, 0.1)
+                | add_weight(addzero + zero + tens, 0.5)
+                | add_weight(addzero + addzero + zero + digit, 0.5)
+                | add_weight(digit + addzero**3, 0.8)
+                | add_weight(addzero**4, 1.0)
+            )
+        )
+
+        hundred_thousand = (
+            (ten_thousand | thousand | hundred | teen | tens | digits)
+            + delete("十万")
+            + (
+                ten_thousand
+                | add_weight(zero + thousand, 0.1)
+                | add_weight(addzero + zero + hundred, 0.5)
+                | add_weight(addzero + addzero + zero + tens, 0.5)
+                | add_weight(addzero**3 + zero + digit, 0.5)
+                | add_weight(digit + addzero**4, 0.8)
+                | add_weight(addzero**5, 1.0)
+            )
+        )
+
+        million = (
+            (hundred_thousand | ten_thousand | thousand | hundred | teen | tens | digits)
+            + delete("百万")
+            + (
+                hundred_thousand
+                | add_weight(zero + ten_thousand, 0.1)
+                | add_weight(addzero + zero + thousand, 0.5)
+                | add_weight(addzero + addzero + zero + hundred, 0.5)
+                | add_weight(addzero**3 + zero + tens, 0.5)
+                | add_weight(addzero**4 + zero + digit, 0.5)
+                | add_weight(digit + addzero**5, 0.8)
+                | add_weight(addzero**6, 1.0)
+            )
+        )
+        # 億 (10^8, one hundred million)
+        hundred_million = (
+            (million | hundred_thousand | ten_thousand | thousand | hundred | teen | tens | digits)
+            + delete("億")
+            + (
+                add_weight(zero + million, 0.1)
+                | add_weight(addzero + zero + hundred_thousand, 0.5)
+                | add_weight(addzero**2 + zero + ten_thousand, 0.5)
+                | add_weight(addzero**3 + zero + thousand, 0.5)
+                | add_weight(addzero**4 + hundred, 0.5)
+                | add_weight(addzero**5 + tens, 0.5)
+                | add_weight(addzero**6 + digit, 0.5)
+                | add_weight(digit + addzero**7, 0.8)
+                | add_weight(addzero**8, 1.0)
+            )
+        )
+        # 兆 (10^12, one trillion); the variable name is historical.
+        hundred_billion = (
+            (
+                hundred_million
+                | million
+                | hundred_thousand
+                | ten_thousand
+                | thousand
+                | hundred
+                | teen
+                | tens
+                | digits
+            )
+            + delete("兆")
+            + (
+                add_weight(addzero**3 + zero + hundred_million, 0.1)
+                | add_weight(addzero**4 + zero + million, 0.5)
+                | add_weight(addzero**5 + zero + hundred_thousand, 0.5)
+                | add_weight(addzero**6 + zero + ten_thousand, 0.5)
+                | add_weight(addzero**7 + zero + thousand, 0.5)
+                | add_weight(addzero**8 + hundred, 0.5)
+                | add_weight(addzero**9 + tens, 0.5)
+                | add_weight(addzero**10 + digit, 0.5)
+                | add_weight(digit + addzero**11, 0.8)
+                | add_weight(addzero**12, 1.0)
+            )
+        )
+        # Base number grammar up to the 百万 graph; 億 and 兆 are handled separately below.
+        number = (
+            digits | teen | tens | hundred | thousand | ten_thousand | hundred_thousand | million
+        )
+        # number = digits | teen | tens | hundred | thousand | ten_thousand | hundred_thousand | million | hundred_million | hundred_billion
+        # 兆 / 億: an optional "<number>兆" group, then an optional "<number>億" group, then a
+        # number. accep() keeps the 兆/億 characters in the output. The two alternatives differ
+        # only in which zero character (零 or 〇) may optionally be deleted after the unit.
+        number = (number + accep("兆") + delete("零").ques).ques + (
+            number + accep("億") + delete("零").ques
+        ).ques + number | (number + accep("兆") + delete("〇").ques).ques + (
+            number + accep("億") + delete("〇").ques
+        ).ques + number
+
+        # Optional sign in front, optional "<dot><digits>" fractional tail behind.
+        number = sign.ques + number + (dot + digits.plus).ques
+        self.number = number.optimize()
+        self.digits = digits.optimize()
+
+        # cardinal string like 127.0.0.1, used in ID, IP, etc.
+        cardinal = digit.plus + (dot + digits.plus).plus
+        # float number like 1.11
+        cardinal |= number + dot + digits.plus
+        # cardinal string like 110 or 12306 or 13125617878, used in phone
+        cardinal |= digits**3 | digits**5 | digits**10 | digits**11 | digits**12
+        # cardinal string like 23
+        if self.enable_standalone_number:
+            if self.enable_0_to_9:
+                cardinal |= number
+            else:
+                # Single digits are excluded: only digit strings of length >= 2 and the
+                # magnitude graphs are accepted.
+                number_two_plus = (
+                    (digits + digits.plus)
+                    | teen
+                    | tens
+                    | hundred
+                    | thousand
+                    | ten_thousand
+                    | hundred_thousand
+                    | million
+                    | hundred_million
+                    | hundred_billion
+                )
+                cardinal |= number_two_plus
+        # The only exception label is the empty string, so graph_exception accepts just "".
+        labels_exception = [""]
+        graph_exception = pynini.union(*labels_exception)
+
+        self.graph_no_exception = cardinal
+        # Restrict cardinal to inputs that are not in the exception set.
+        self.graph = (pynini.project(cardinal, "input") - graph_exception.arcsort()) @ cardinal
+
+        # Optional leading "マイナス" followed by DAMO_SPACE is tagged as negative: "-".
+        optional_minus_graph = pynini.closure(
+            pynutil.insert("negative: ") + pynini.cross("マイナス", '"-"') + DAMO_SPACE, 0, 1
+        )
+
+        final_graph = (
+            optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
+        )
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
+
+        # ########
+        # Separate three-digit "hundred component" graph, exposed through the attribute
+        # graph_hundred_component_at_least_one_none_zero_digit assigned at the end.
+        graph_hundred = pynini.cross("百", "")
+        graph_a_hundred_digit_component = pynini.union(pynini.cross("百", "10") + digit)
+        graph_one_hundred_component = pynini.union(pynini.cross("百", "100"))
+        graph_hundred_ties_component = pynini.cross("百", "1") + pynini.union(
+            graph_teen | pynutil.insert("00"),
+            (ties | pynutil.insert("0")) + (digit | pynutil.insert("0")),
+        )
+        # Hundreds position: "<digit>百" or an inserted "0"; then the two lower positions.
+        graph_hundred_component = pynini.union(digit + graph_hundred, pynutil.insert("0"))
+        graph_hundred_component += pynini.union(
+            graph_teen | pynutil.insert("00"),
+            (ties | pynutil.insert("0")) + (digit | pynutil.insert("0")),
+        )
+        graph_hundred_component = (
+            graph_hundred_component
+            | graph_a_hundred_digit_component
+            | graph_one_hundred_component
+            | graph_hundred_ties_component
+        )
+        # Keep only outputs that contain at least one digit other than "0".
+        graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
+            pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT)
+        )
+        self.graph_hundred_component_at_least_one_none_zero_digit = (
+            graph_hundred_component_at_least_one_none_zero_digit
+        )
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/date.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/date.py
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.utils import get_abs_path
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
+    DAMO_ALPHA,
+    DAMO_DIGIT,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")).optimize()
+graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize()
+ties_graph = pynini.string_file(get_abs_path("data/numbers/ties.tsv")).optimize()
+
+
+def _get_month_graph():
+    """
+    Build the month transducer from the ``data/months.tsv`` mapping file.
+    """
+    return pynini.string_file(get_abs_path("data/months.tsv"))
+
+
+def _get_ties_graph():
+    """
+    Transducer for a tens word (from ties.tsv) followed by its units position,
+    e.g. (English gloss) twenty three -> 23.
+    When no units digit follows, a "0" is inserted instead.
+    """
+    units_position = (delete_space + graph_digit) | pynutil.insert("0")
+    return ties_graph + units_position
+
+
+def _get_range_graph():
+    """
+    Transducer for decades (1**0s, 2**0s), centuries (2*00s, 1*00s), millennia (2000s).
+    Every output is a four-character year starting with "1" or "2", followed by "s".
+    """
+    graph_ties = _get_ties_graph()
+    # Centuries: <tens/teen> + 百 -> "<two digits>00s".
+    graph = (graph_ties | graph_teen) + delete_space + pynini.cross("百", "00s")
+    # Millennium: 二千 -> "2000s".
+    graph |= pynini.cross("二", "2") + delete_space + pynini.cross("千", "000s")
+    # Decades. `@` binds tighter than `+`, so the composition applies only to the
+    # parenthesised alphabetic-word rewrite, not to the whole sum.
+    # NOTE(review): the "ies" -> "y" / trailing "s" rewrite looks like a leftover from the
+    # English grammar; confirm whether this branch can ever match Japanese input.
+    graph |= (
+        (graph_ties | graph_teen)
+        + delete_space
+        + (pynini.closure(DAMO_ALPHA, 1) + (pynini.cross("ies", "y") | pynutil.delete("s")))
+        @ (graph_ties | pynini.cross("十", "10"))
+        + pynutil.insert("s")
+    )
+    # Output filter: "1" or "2", three more digits, then "s".
+    graph @= pynini.union("1", "2") + DAMO_DIGIT + DAMO_DIGIT + DAMO_DIGIT + "s"
+    return graph
+
+
+def _get_year_graph():
+    """
+    Transducer for a four-digit year, e.g. (English gloss) twenty twenty -> 2020.
+
+    A year is either two two-digit halves read one after the other, or a
+    "<digit>千 [<digit>百] <two digits>" reading.
+    """
+    tens_with_units = _get_ties_graph()
+
+    def _zero_then_digit():
+        # "〇" or "零" followed by a digit, e.g. the "0x" half of a year.
+        zero_char = pynini.accep("〇") | pynini.accep("零")
+        return (pynini.cross(zero_char, "0") + delete_space + graph_digit).optimize()
+
+    def _thousand_reading():
+        # Hundreds position: "<digit>百", a bare "百" read as "1", or an inserted "0".
+        hundreds = (graph_digit + delete_space + pynutil.delete("百")) | pynutil.insert("0")
+        hundreds |= (pynini.cross("百", "1")) | pynutil.insert("0")
+        # Shared tail: hundreds position followed by the final two digits.
+        tail = hundreds + delete_space + (graph_teen | _get_ties_graph())
+        with_leading_digit = (
+            graph_digit + delete_space + pynutil.delete("千") + delete_space + tail
+        )
+        bare_thousand = pynini.cross("千", "1") + delete_space + tail
+        return with_leading_digit | bare_thousand
+
+    # 20 19, 40 12, 2012 - assuming no limit on the year
+    second_half = tens_with_units | _zero_then_digit() | graph_teen
+    year_graph = (
+        (graph_teen + delete_space + second_half)
+        | (tens_with_units + delete_space + second_half)
+        | _thousand_reading()
+    )
+    return year_graph.optimize()
+
+
+class DateFst(GraphFst):
+    """
+    Finite state transducer for classifying date (examples are English glosses),
+        e.g. january fifth twenty twelve -> date { month: "january" day: "5" year: "2012" preserve_order: true }
+        e.g. the fifth of january twenty twelve -> date { day: "5" month: "january" year: "2012" preserve_order: true }
+        e.g. twenty twenty -> date { year: "2020" preserve_order: true }
+
+    Args:
+        ordinal: OrdinalFst
+    """
+
+    def __init__(self, ordinal: GraphFst):
+        super().__init__(name="date", kind="classify")
+
+        ordinal_graph = ordinal.graph
+        year_graph = _get_year_graph()
+        # A small weight is put on the bare year reading.
+        YEAR_WEIGHT = 0.001
+        year_graph = pynutil.add_weight(year_graph, YEAR_WEIGHT)
+        month_graph = _get_month_graph()
+
+        month_graph = pynutil.insert('month: "') + month_graph + pynutil.insert('"')
+
+        day_graph = (
+            pynutil.insert('day: "') + pynutil.add_weight(ordinal_graph, -0.7) + pynutil.insert('"')
+        )
+        # Year following a month/day: the -YEAR_WEIGHT here cancels the weight added above.
+        graph_year = (
+            delete_extra_space
+            + pynutil.insert('year: "')
+            + pynutil.add_weight(year_graph, -YEAR_WEIGHT)
+            + pynutil.insert('"')
+        )
+        optional_graph_year = pynini.closure(
+            graph_year,
+            0,
+            1,
+        )
+        # month followed by day, by year, or by day and year.
+        graph_mdy = month_graph + (
+            (delete_extra_space + day_graph)
+            | graph_year
+            | (delete_extra_space + day_graph + graph_year)
+        )
+        # NOTE(review): "the" / "of" are English function words; this branch looks like a
+        # leftover from the English grammar - confirm it is reachable for Japanese input.
+        graph_dmy = (
+            pynutil.delete("the")
+            + delete_space
+            + day_graph
+            + delete_space
+            + pynutil.delete("of")
+            + delete_extra_space
+            + month_graph
+            + optional_graph_year
+        )
+        # graph_year is rebound here: standalone year (still carrying YEAR_WEIGHT) or a
+        # decade/century/millennium range, with no leading space handling.
+        graph_year = (
+            pynutil.insert('year: "') + (year_graph | _get_range_graph()) + pynutil.insert('"')
+        )
+
+        final_graph = graph_mdy | graph_dmy | graph_year
+        final_graph += pynutil.insert(" preserve_order: true")
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/decimal.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/decimal.py
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.utils import get_abs_path
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
+    DAMO_DIGIT,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+
+def get_quantity(
+    decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike"
+) -> "pynini.FstLike":
+    """
+    Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral,
+    e.g. (English gloss) one million -> integer_part: "1" quantity: "million"
+    e.g. (English gloss) one point five million -> integer_part: "1" fractional_part: "5" quantity: "million"
+
+    Args:
+        decimal: decimal FST
+        cardinal_up_to_hundred: cardinal FST
+
+    Returns:
+        The union of "<cardinal> <suffix>" and "<decimal> <suffix or 千>" taggers.
+    """
+    # Strip leading zeros from the cardinal output and require a non-zero leading digit.
+    numbers = cardinal_up_to_hundred @ (
+        pynutil.delete(pynini.closure("0"))
+        + pynini.difference(DAMO_DIGIT, "0")
+        + pynini.closure(DAMO_DIGIT)
+    )
+    # Each suffix must be a separate argument: adjacent string literals without a comma
+    # are concatenated by Python, which previously merged "億" and "十億" into one bogus
+    # suffix "億十億" and left both real suffixes unrecognised.
+    suffix = pynini.union(
+        "万", "百万", "千万", "億", "十億", "trillion", "quadrillion", "quintillion", "sextillion"
+    )
+    res = (
+        pynutil.insert('integer_part: "')
+        + numbers
+        + pynutil.insert('"')
+        + delete_extra_space
+        + pynutil.insert('quantity: "')
+        + suffix
+        + pynutil.insert('"')
+    )
+    # After a decimal, "千" is additionally allowed as a quantity.
+    res |= (
+        decimal
+        + delete_extra_space
+        + pynutil.insert('quantity: "')
+        + (suffix | "千")
+        + pynutil.insert('"')
+    )
+    return res
+
+
+class DecimalFst(GraphFst):
+    """
+    Finite state transducer for classifying decimal (examples are English glosses)
+        e.g. minus twelve point five o o six billion -> decimal { negative: "true" integer_part: "12"  fractional_part: "5006" quantity: "billion" }
+        e.g. one billion -> decimal { integer_part: "1" quantity: "billion" }
+    Args:
+        cardinal: CardinalFst
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="decimal", kind="classify")
+
+        cardinal_graph = cardinal.graph_no_exception
+
+        # Single fractional digit: a digit word, a zero word, or 零 / 〇.
+        # NOTE(review): the two explicit crosses emit a full-width "０" (U+FF10), not ASCII "0";
+        # confirm this is intended, since other graphs filter on DAMO_DIGIT.
+        graph_decimal = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
+        graph_decimal |= (
+            pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
+            | pynini.cross("零", "０")
+            | pynini.cross("〇", "０")
+        )
+
+        # One or more fractional digits, with spaces between them removed.
+        graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal
+        self.graph = graph_decimal
+
+        point = pynutil.delete("点")
+
+        optional_graph_negative = pynini.closure(
+            pynutil.insert("negative: ") + pynini.cross("マイナス", '"true"') + delete_extra_space,
+            0,
+            1,
+        )
+
+        graph_fractional = (
+            pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"')
+        )
+        graph_integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')
+        # [integer part] 点 fractional part; the integer part is optional.
+        final_graph_wo_sign = (
+            pynini.closure(graph_integer + delete_extra_space, 0, 1)
+            + point
+            + delete_extra_space
+            + graph_fractional
+        )
+        final_graph = optional_graph_negative + final_graph_wo_sign
+
+        # Build the quantity graph once and reuse it for both the unsigned and signed graphs
+        # (it was previously constructed twice with identical arguments).
+        quantity_graph = get_quantity(
+            final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit
+        )
+        self.final_graph_wo_negative = final_graph_wo_sign | quantity_graph
+        final_graph |= optional_graph_negative + quantity_graph
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/electronic.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/electronic.py
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.utils import get_abs_path
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
+    DAMO_ALPHA,
+    GraphFst,
+    insert_space,
+)
+from pynini.lib import pynutil
+
+
+class ElectronicFst(GraphFst):
+    """
+    Finite state transducer for classifying electronic: as URLs, email addresses, etc.
+        e.g. (English gloss) c d f one at a b c dot e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }
+    """
+
+    def __init__(self):
+        super().__init__(name="electronic", kind="classify")
+
+        # Local helper: deletes exactly one space character.
+        delete_extra_space = pynutil.delete(" ")
+        # A letter, or a spoken digit / zero word mapped through the number lexicons.
+        alpha_num = (
+            DAMO_ALPHA
+            | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
+            | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
+        )
+
+        # symbols.tsv is inverted so that its second column becomes the input side.
+        symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).invert()
+
+        accepted_username = alpha_num | symbols
+        process_dot = pynini.cross("ドット", ".")
+        # Username: space-separated characters joined together, or (with a small weight)
+        # an unspaced run of letters.
+        username = (
+            alpha_num + pynini.closure(delete_extra_space + accepted_username)
+        ) | pynutil.add_weight(pynini.closure(DAMO_ALPHA, 1), weight=0.0001)
+        username = pynutil.insert('username: "') + username + pynutil.insert('"')
+        single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num
+        server = single_alphanum | pynini.string_file(
+            get_abs_path("data/electronic/server_name.tsv")
+        )
+        domain = single_alphanum | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
+        # domain: "<server>.<domain>"
+        domain_graph = (
+            pynutil.insert('domain: "')
+            + server
+            + delete_extra_space
+            + process_dot
+            + delete_extra_space
+            + domain
+            + pynutil.insert('"')
+        )
+        # Email: <username> at <domain>.
+        # NOTE(review): the separator is the literal ASCII word "at"; confirm this matches
+        # how the upstream ASR output spells it for Japanese.
+        graph = (
+            username
+            + delete_extra_space
+            + pynutil.delete("at")
+            + insert_space
+            + delete_extra_space
+            + domain_graph
+        )
+
+        ############# url ###
+        protocol_end = pynini.cross(pynini.union("w w w", "www"), "www")
+        protocol_start = (
+            pynini.cross("h t t p", "http") | pynini.cross("h t t p s", "https")
+        ) + pynini.cross(
+            "コロンスラッシュスラッシュ", "://"  # colon slash slash
+        )
+        # .com,
+        # `+` binds tighter than `|`: the alternative to `domain` is the whole
+        # "closure(...) + accepted_username" sequence.
+        ending = (
+            delete_extra_space
+            + symbols
+            + delete_extra_space
+            + (
+                domain
+                | pynini.closure(
+                    accepted_username + delete_extra_space,
+                )
+                + accepted_username
+            )
+        )
+
+        # Host part followed by one or more "<symbol> <segment>" endings.
+        protocol_default = (
+            (
+                (pynini.closure(delete_extra_space + accepted_username, 1) | server)
+                | pynutil.add_weight(pynini.closure(DAMO_ALPHA, 1), weight=0.0001)
+            )
+            + pynini.closure(ending, 1)
+        ).optimize()
+        # [http(s)://] www . <rest>
+        protocol = (
+            pynini.closure(protocol_start, 0, 1)
+            + protocol_end
+            + delete_extra_space
+            + process_dot
+            + protocol_default
+        ).optimize()
+
+        # [www .] <rest>, with no protocol prefix.
+        protocol |= (
+            pynini.closure(protocol_end + delete_extra_space + process_dot, 0, 1) + protocol_default
+        )
+
+        protocol = pynutil.insert('protocol: "') + protocol.optimize() + pynutil.insert('"')
+        graph |= protocol
+        ########
+
+        final_graph = self.add_tokens(graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/fraction.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/fraction.py
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+    insert_space,
+    DAMO_CHAR,
+)
+from pynini.lib import pynutil
+
+
+class FractionFst(GraphFst):
+    """
+    Finite state transducer for classifying fraction.
+
+    Two spoken forms are handled (field values assume the cardinal graph maps 三 -> 3, 一 -> 1):
+        "三分の一" (one third): denominator first  -> denominator: "3" numerator: "1"
+        "一割る三" (one divided by three): numerator first -> numerator: "1" denominator: "3"
+
+    Args:
+        cardinal: CardinalFst
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="fraction", kind="classify")
+
+        graph_cardinal = cardinal.graph_no_exception
+        graph_four = pynini.cross("クォーター", "4")  # quarter
+
+        denominator = (
+            pynutil.insert('denominator: "') + (graph_cardinal | graph_four) + pynutil.insert('"')
+        )
+        numerator = pynutil.insert('numerator: "') + graph_cardinal + pynutil.insert('"')
+
+        # "<denominator> 分の <numerator>": the denominator is spoken first.
+        graph_bun_no = denominator + insert_space + pynutil.delete("分の") + numerator
+        # "<numerator> 割る <denominator>": division reads dividend first, so the operand
+        # order is the reverse of the 分の form. Previously both forms shared the 分の order,
+        # which swapped numerator and denominator for 割る.
+        graph_waru = numerator + insert_space + pynutil.delete("割る") + denominator
+
+        graph_fraction_component = graph_bun_no | graph_waru
+
+        self.graph_fraction_component = graph_fraction_component
+
+        graph = graph_fraction_component
+        graph = graph.optimize()
+        self.final_graph_wo_negative = graph
+
+        optional_graph_negative = pynini.closure(
+            pynutil.insert("negative: ") + pynini.cross("マイナス", '"true"') + delete_extra_space,
+            0,
+            1,
+        )
+
+        graph = optional_graph_negative + graph
+        final_graph = self.add_tokens(graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/measure.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/measure.py
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.utils import get_abs_path
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
+    DAMO_SIGMA,
+    GraphFst,
+    convert_space,
+    delete_extra_space,
+    delete_space,
+    insert_space,
+)
+from pynini.lib import pynutil
+
+
+class MeasureFst(GraphFst):
+    """
+    Finite state transducer for classifying measure
+        e.g. (English gloss) minus twelve kilograms -> measure { cardinal { negative: "true" integer: "12" } units: "kg" }
+
+    Args:
+        cardinal: CardinalFst
+        decimal: DecimalFst
+    """
+
+    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
+        super().__init__(name="measure", kind="classify")
+
+        # Optional leading "マイナス", tagged inside the cardinal/decimal sub-token.
+        minus_tag = pynini.closure(
+            pynutil.insert("negative: ") + pynini.cross("マイナス", '"true"') + delete_extra_space,
+            0,
+            1,
+        )
+
+        # Unit field: a plain unit, a "per unit" ("/" + unit), or unit followed by "per unit".
+        unit = pynini.string_file(get_abs_path("data/measurements.tsv"))
+        per_unit = pynutil.insert("/") + pynutil.delete("每") + delete_space + unit
+        compound_unit = pynutil.add_weight(unit + delete_space + per_unit, 0.01)
+        units_field = (
+            pynutil.insert('units: "') + (unit | per_unit | compound_unit) + pynutil.insert('"')
+        )
+
+        def wrap_cardinal(integer_graph):
+            # cardinal { [negative] integer: "<integer_graph>" }
+            return (
+                pynutil.insert("cardinal { ")
+                + minus_tag
+                + pynutil.insert('integer: "')
+                + integer_graph
+                + pynutil.insert('"')
+                + pynutil.insert(" }")
+            )
+
+        decimal_field = (
+            pynutil.insert("decimal { ")
+            + minus_tag
+            + decimal.final_graph_wo_negative
+            + pynutil.insert(" }")
+        )
+        # Any cardinal whose spoken form is not exactly "一"; "一" itself is handled separately.
+        not_one = (DAMO_SIGMA - "一") @ cardinal.graph_no_exception
+        exactly_one = pynini.cross("一", "1")
+
+        # The unit may follow the decimal with or without a separating space.
+        subgraph_decimal = (decimal_field + delete_extra_space + units_field) | (
+            decimal_field + units_field
+        )
+        # For cardinals, the no-space form is only accepted when the number is not "一".
+        subgraph_cardinal = (
+            (wrap_cardinal(not_one) + delete_extra_space + units_field)
+            | (wrap_cardinal(exactly_one) + delete_extra_space + units_field)
+            | (wrap_cardinal(not_one) + units_field)
+        )
+
+        final_graph = self.add_tokens(subgraph_decimal | subgraph_cardinal)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/money.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/money.py
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.utils import get_abs_path
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
+    DAMO_DIGIT,
+    DAMO_NOT_SPACE,
+    DAMO_SIGMA,
+    GraphFst,
+    convert_space,
+    delete_extra_space,
+    delete_space,
+    # get_singulars,
+    insert_space,
+)
+from pynini.lib import pynutil
+
+
+class MoneyFst(GraphFst):
+    """
+    Finite state transducer for classifying money
+        e.g. (English gloss) twelve dollars and five cents -> money { integer_part: "12" fractional_part: "05" currency: "$" }
+
+    Args:
+        cardinal: CardinalFst
+        decimal: DecimalFst
+    """
+
+    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
+        super().__init__(name="money", kind="classify")
+        # quantity, integer_part, fractional_part, currency
+
+        cardinal_graph = cardinal.graph_no_exception
+        # add support for missing hundred (only for 3 digit numbers)
+        # "one fifty" -> "one hundred fifty"
+        # Implemented by inserting "百" after the first space and keeping only readings that
+        # the cardinal graph turns into exactly three digits.
+        with_hundred = pynini.compose(
+            pynini.closure(DAMO_NOT_SPACE) + pynini.accep(" ") + pynutil.insert("百") + DAMO_SIGMA,
+            pynini.compose(cardinal_graph, DAMO_DIGIT**3),
+        )
+        cardinal_graph |= with_hundred
+        graph_decimal_final = decimal.final_graph_wo_negative
+
+        # currency.tsv is inverted so that its second column becomes the input side.
+        unit = pynini.string_file(get_abs_path("data/currency.tsv"))
+        unit_singular = pynini.invert(unit)
+        # Singular and plural share the same graph here.
+        unit_plural = unit_singular
+        # unit_plural = get_singulars(unit_singular)
+
+        graph_unit_singular = (
+            pynutil.insert('currency: "') + convert_space(unit_singular) + pynutil.insert('"')
+        )
+        graph_unit_plural = (
+            pynutil.insert('currency: "') + convert_space(unit_plural) + pynutil.insert('"')
+        )
+
+        # Two digits pass through unchanged; a single digit gets a "0" in front.
+        add_leading_zero_to_double_digit = (DAMO_DIGIT + DAMO_DIGIT) | (
+            pynutil.insert("0") + DAMO_DIGIT
+        )
+        # twelve dollars (and) fifty cents, zero cents
+        # `@` binds tighter than `+`: the weighted cardinal is first composed with the
+        # zero-padding graph, then followed by the deleted "セント".
+        cents_standalone = (
+            pynutil.insert('fractional_part: "')
+            + pynini.union(
+                pynutil.add_weight(((DAMO_SIGMA - "一") @ cardinal_graph), -0.7)
+                @ add_leading_zero_to_double_digit
+                + delete_space
+                + pynutil.delete("セント"),  # cent
+                pynini.cross("一", "01") + delete_space + pynutil.delete("セント"),  # cent
+            )
+            + pynutil.insert('"')
+        )
+
+        optional_cents_standalone = pynini.closure(
+            delete_space
+            + pynini.closure(pynutil.delete("と") + delete_space, 0, 1)  # and
+            + insert_space
+            + cents_standalone,
+            0,
+            1,
+        )
+        # twelve dollars fifty, only after integer
+        optional_cents_suffix = pynini.closure(
+            delete_extra_space
+            + pynutil.insert('fractional_part: "')
+            + pynutil.add_weight(cardinal_graph @ add_leading_zero_to_double_digit, -0.7)
+            + pynutil.insert('"'),
+            0,
+            1,
+        )
+
+        # Integer amount other than exactly "一", followed by the currency and optional cents.
+        graph_integer = (
+            pynutil.insert('integer_part: "')
+            + ((DAMO_SIGMA - "一") @ cardinal_graph)
+            + pynutil.insert('"')
+            + delete_extra_space
+            + graph_unit_plural
+            + (optional_cents_standalone | optional_cents_suffix)
+        )
+        # Amount of exactly "一".
+        graph_integer |= (
+            pynutil.insert('integer_part: "')
+            + pynini.cross("一", "1")
+            + pynutil.insert('"')
+            + delete_extra_space
+            + graph_unit_singular
+            + (optional_cents_standalone | optional_cents_suffix)
+        )
+        graph_decimal = graph_decimal_final + delete_extra_space + graph_unit_plural
+        # Cents on their own are tagged with a hard-coded "$" currency and integer part "0".
+        graph_decimal |= pynutil.insert('currency: "$" integer_part: "0" ') + cents_standalone
+        final_graph = graph_integer | graph_decimal
+
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/ordinal.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/ordinal.py
+import pynini
+from pynini import cross
+from pynini.lib.pynutil import delete, insert, add_weight
+from fun_text_processing.inverse_text_normalization.ja.utils import get_abs_path
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import DAMO_CHAR, GraphFst
+from pynini.lib import pynutil
+
+
+class OrdinalFst(GraphFst):
+    """
+    Finite state transducer for classifying ordinal
+        e.g. thirteenth -> ordinal { integer: "13" }
+
+    Args:
+        cardinal: CardinalFst
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="ordinal", kind="classify")
+
+        cardinal_graph = cardinal.graph_no_exception
+        digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))
+        ties = pynini.string_file(get_abs_path("data/ordinals/ties.tsv"))
+        teen = pynini.string_file(get_abs_path("data/ordinals/teen.tsv"))
+        zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
+        hundred_digit = pynini.string_file(get_abs_path("data/numbers/hundred_digit.tsv"))
+        addzero = insert("0")
+        tens = ties + addzero | (digit + delete("十") + (digit | addzero))
+        hundred = (
+            digit
+            + delete("百")
+            + (
+                tens
+                | teen
+                | add_weight(zero + digit, 0.1)
+                | add_weight(digit + addzero, 0.5)
+                | add_weight(addzero**2, 1.0)
+            )
+        )
+        hundred |= cross("百", "1") + (
+            tens
+            | teen
+            | add_weight(zero + digit, 0.1)
+            | add_weight(digit + addzero, 0.5)
+            | add_weight(addzero**2, 1.0)
+        )
+        hundred |= hundred_digit
+
+        ordinal = digit | teen | tens | hundred
+        graph = pynini.closure(DAMO_CHAR, 1) + ordinal
+
+        self.graph = graph @ cardinal_graph
+        final_graph = pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
+        final_graph = self.add_tokens(final_graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/preprocessor.py
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import DAMO_SIGMA, GraphFst
+from fun_text_processing.inverse_text_normalization.ja.utils import get_abs_path
+from pynini.lib import pynutil
+
+
+class PreProcessor(GraphFst):
+    def __init__(
+        self,
+        halfwidth_to_fullwidth: bool = True,
+    ):
+        super().__init__(name="PreProcessor", kind="processor")
+
+        graph = pynini.cdrewrite("", "", "", DAMO_SIGMA)
+
+        if halfwidth_to_fullwidth:
+            halfwidth_to_fullwidth_graph = pynini.string_file(
+                get_abs_path("data/char/halfwidth_to_fullwidth.tsv")
+            )
+            graph @= pynini.cdrewrite(halfwidth_to_fullwidth_graph, "", "", DAMO_SIGMA)
+
+        self.fst = graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/punctuation.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/punctuation.py
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import GraphFst
+from pynini.lib import pynutil
+
+
+class PunctuationFst(GraphFst):
+    """
+    Finite state transducer for classifying punctuation
+        e.g. a, -> tokens { name: "a" } tokens { name: "," }
+    """
+
+    def __init__(self):
+        super().__init__(name="punctuation", kind="classify")
+
+        s = "!#$%&'()*+,-./:;<=>?@^_`{|}~、。，！【】「」《》￥（）——・"
+        punct = pynini.union(*s)
+
+        graph = pynutil.insert('name: "') + punct + pynutil.insert('"')
+
+        self.fst = graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/telephone.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/telephone.py
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.utils import get_abs_path
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
+    DAMO_ALNUM,
+    DAMO_ALPHA,
+    DAMO_DIGIT,
+    GraphFst,
+    insert_space,
+)
+from pynini.lib import pynutil
+
+
+def get_serial_number(cardinal):
+    """
+    any alphanumerical character sequence with at least one number with length greater equal to 3
+    """
+    digit = pynini.compose(cardinal.graph_no_exception, DAMO_DIGIT)
+    character = digit
+    sequence = character + pynini.closure(character, 2)
+    sequence = sequence @ (pynini.closure(DAMO_ALNUM) + DAMO_DIGIT + pynini.closure(DAMO_ALNUM))
+    return sequence.optimize()
+
+
+class TelephoneFst(GraphFst):
+    """
+    Finite state transducer for classifying telephone-like digit sequences.
+
+    The final graph is the union of the following alternatives, each emitted
+    as a ``number_part`` field (the first one optionally preceded by a
+    ``country_code`` field):
+        - ten digits formatted as DDD-DDD-DDDD
+        - sixteen digits formatted as four space-separated groups of four
+        - nine digits formatted as DDD-DD-DDDD
+        - four digit groups whose spoken separator " 点 " is written as "."
+        - the graph returned by get_serial_number (added with weight 0.0001)
+
+    Args:
+        cardinal: CardinalFst
+    """
+
+    def __init__(self, cardinal: GraphFst):
+        super().__init__(name="telephone", kind="classify")
+        # country code, number_part, extension
+        # Inverse of digit.tsv, plus a mapping from "０" to either "〇" or "零".
+        # NOTE(review): the zero key is the full-width "０" while str(i) below is
+        # ASCII; confirm against digit.tsv and DAMO_DIGIT that zero is matched.
+        digit_to_str = (
+            pynini.invert(pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize())
+            # | pynini.cross("０", pynini.union("o")).optimize()
+            | pynini.cross("０", pynini.union("〇", "零")).optimize()
+        )
+
+        str_to_digit = pynini.invert(digit_to_str)
+
+        # For each i in 0..9: "<word> <word>" paired with "double <word>",
+        # where <word> is the output side of str(i) @ digit_to_str.
+        double_digit = pynini.union(
+            *[
+                pynini.cross(
+                    pynini.project(str(i) @ digit_to_str, "output")
+                    + pynini.accep(" ")
+                    + pynini.project(str(i) @ digit_to_str, "output"),
+                    pynutil.insert("double ") + pynini.project(str(i) @ digit_to_str, "output"),
+                )
+                for i in range(10)
+            ]
+        )
+        # In-place inversion: swaps the input and output sides of double_digit.
+        double_digit.invert()
+
+        # to handle cases like "one twenty three"
+        # Cardinal outputs restricted to exactly two DAMO_DIGIT symbols.
+        two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, DAMO_DIGIT**2)
+        double_digit_to_digit = (
+            pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit)
+            | two_digit_cardinal
+        )
+
+        # The small negative weight makes the double-digit reading slightly
+        # cheaper than the plain single-digit reading.
+        single_or_double_digit = (
+            pynutil.add_weight(double_digit_to_digit, -0.0001) | str_to_digit
+        ).optimize()
+        # Extend to space-separated sequences; each deleted space costs 0.0001.
+        single_or_double_digit |= (
+            single_or_double_digit
+            + pynini.closure(
+                pynutil.add_weight(pynutil.delete(" ") + single_or_double_digit, 0.0001)
+            )
+        ).optimize()
+
+        # Ten digits, hyphenated as 3-3-4.
+        number_part = pynini.compose(
+            single_or_double_digit,
+            DAMO_DIGIT**3
+            + pynutil.insert("-")
+            + DAMO_DIGIT**3
+            + pynutil.insert("-")
+            + DAMO_DIGIT**4,
+        ).optimize()
+        number_part = (
+            pynutil.insert('number_part: "') + number_part.optimize() + pynutil.insert('"')
+        )
+
+        # Digit sequences whose written form is two or three digits long.
+        cardinal_option = pynini.compose(single_or_double_digit, DAMO_DIGIT ** (2, 3))
+
+        # Optional "プラス " written as "+", then one to three single digits or
+        # a two/three digit sequence from cardinal_option.
+        country_code = (
+            pynutil.insert('country_code: "')
+            + pynini.closure(pynini.cross("プラス ", "+"), 0, 1)  # plus
+            + (
+                (pynini.closure(str_to_digit + pynutil.delete(" "), 0, 2) + str_to_digit)
+                | cardinal_option
+            )
+            + pynutil.insert('"')
+        )
+
+        optional_country_code = pynini.closure(
+            country_code + pynutil.delete(" ") + insert_space, 0, 1
+        ).optimize()
+        graph = optional_country_code + number_part
+
+        # credit card number
+        # Sixteen digits written as four groups of four separated by spaces.
+        space_four_digits = insert_space + DAMO_DIGIT**4
+        credit_card_graph = pynini.compose(
+            single_or_double_digit, DAMO_DIGIT**4 + space_four_digits**3
+        ).optimize()
+        graph |= (
+            pynutil.insert('number_part: "') + credit_card_graph.optimize() + pynutil.insert('"')
+        )
+
+        # SSN
+        # Nine digits, hyphenated as 3-2-4.
+        ssn_graph = pynini.compose(
+            single_or_double_digit,
+            DAMO_DIGIT**3
+            + pynutil.insert("-")
+            + DAMO_DIGIT**2
+            + pynutil.insert("-")
+            + DAMO_DIGIT**4,
+        ).optimize()
+        graph |= pynutil.insert('number_part: "') + ssn_graph.optimize() + pynutil.insert('"')
+
+        # ip
+        # One group of an IP-style address: an optional single digit before or
+        # after a double-digit reading, one to three single digits, or a
+        # two/three digit sequence from cardinal_option.
+        digit_or_double = (
+            pynini.closure(str_to_digit + pynutil.delete(" "), 0, 1) + double_digit_to_digit
+        )
+        digit_or_double |= double_digit_to_digit + pynini.closure(
+            pynutil.delete(" ") + str_to_digit, 0, 1
+        )
+        digit_or_double |= str_to_digit + (pynutil.delete(" ") + str_to_digit) ** (0, 2)
+        digit_or_double |= cardinal_option
+        digit_or_double = digit_or_double.optimize()
+
+        # Four groups; the spoken separator " 点 " is rewritten to ".".
+        ip_graph = digit_or_double + (pynini.cross(" 点 ", ".") + digit_or_double) ** 3  # dot
+
+        graph |= pynutil.insert('number_part: "') + ip_graph.optimize() + pynutil.insert('"')
+        # Serial-number alternative, added with weight 0.0001.
+        graph |= (
+            pynutil.insert('number_part: "')
+            + pynutil.add_weight(get_serial_number(cardinal=cardinal), weight=0.0001)
+            + pynutil.insert('"')
+        )
+
+        final_graph = self.add_tokens(graph)
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/time.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/time.py
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.taggers.cardinal import CardinalFst
+from fun_text_processing.inverse_text_normalization.ja.utils import get_abs_path, num_to_word
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
+    GraphFst,
+    convert_space,
+    delete_extra_space,
+    delete_space,
+    insert_space,
+)
+from pynini.lib import pynutil
+
+
+class TimeFst(GraphFst):
+    """
+    Finite state transducer for classifying time
+        e.g. twelve thirty -> time { hours: "12" minutes: "30" }
+        e.g. twelve past one -> time { minutes: "12" hours: "1" }
+        e.g. two o clock a m -> time { hours: "2" suffix: "a.m." }
+        e.g. quarter to two -> time { hours: "1" minutes: "45" }
+        e.g. quarter past two -> time { hours: "2" minutes: "15" }
+        e.g. half past two -> time { hours: "2" minutes: "30" }
+    """
+
+    def __init__(self):
+        super().__init__(name="time", kind="classify")
+        # hours, minutes, seconds, suffix, zone, style, speak_period
+
+        suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))
+        time_zone_graph = pynini.invert(pynini.string_file(get_abs_path("data/time/time_zone.tsv")))
+        to_hour_graph = pynini.string_file(get_abs_path("data/time/to_hour.tsv"))
+        minute_to_graph = pynini.string_file(get_abs_path("data/time/minute_to.tsv"))
+
+        # only used for < 1000 thousand -> 0 weight
+        cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)
+
+        labels_hour = [num_to_word(x) for x in range(0, 24)]
+        labels_minute_single = [num_to_word(x) for x in range(1, 10)]
+        labels_minute_double = [num_to_word(x) for x in range(10, 60)]
+
+        graph_hour = pynini.union(*labels_hour) @ cardinal
+
+        graph_minute_single = pynini.union(*labels_minute_single) @ cardinal
+        graph_minute_double = pynini.union(*labels_minute_double) @ cardinal
+        graph_minute_verbose = pynini.cross("半", "30") | pynini.cross(
+            "クォーター", "15"
+        )  # half, quarter
+        oclock = pynini.cross(pynini.union("時", "o' clock", "o clock", "o'clock", "oclock"), "")
+
+        final_graph_hour = pynutil.insert('hours: "') + graph_hour + pynutil.insert('"')
+        graph_minute = (
+            oclock + pynutil.insert("00")
+            | pynutil.delete(pynini.union("〇", "零")) + delete_space + graph_minute_single
+            | graph_minute_double
+        )
+        final_suffix = (
+            pynutil.insert('suffix: "') + convert_space(suffix_graph) + pynutil.insert('"')
+        )
+        final_suffix = delete_space + insert_space + final_suffix
+        final_suffix_optional = pynini.closure(final_suffix, 0, 1)
+        final_time_zone_optional = pynini.closure(
+            delete_space
+            + insert_space
+            + pynutil.insert('zone: "')
+            + convert_space(time_zone_graph)
+            + pynutil.insert('"'),
+            0,
+            1,
+        )
+
+        # five o' clock
+        # two o eight, two thirty five (am/pm)
+        # two pm/am
+        graph_hm = (
+            final_graph_hour
+            + delete_extra_space
+            + pynutil.insert('minutes: "')
+            + graph_minute
+            + pynutil.insert('"')
+        )
+        # 10 past four, quarter past four, half past four
+        graph_m_past_h = (
+            pynutil.insert('minutes: "')
+            + pynini.union(graph_minute_single, graph_minute_double, graph_minute_verbose)
+            + pynutil.insert('"')
+            + delete_extra_space
+            + final_graph_hour
+        )
+
+        graph_quarter_time = (
+            pynutil.insert('minutes: "')
+            + pynini.cross("クォーター", "45")  # quarter
+            + pynutil.insert('"')
+            + delete_space
+            + pynutil.delete(pynini.union("から", "to", "till"))  # to, till
+            + delete_extra_space
+            + pynutil.insert('hours: "')
+            + to_hour_graph
+            + pynutil.insert('"')
+        )
+
+        graph_m_to_h_suffix_time = (
+            pynutil.insert('minutes: "')
+            + ((graph_minute_single | graph_minute_double).optimize() @ minute_to_graph)
+            + pynutil.insert('"')
+            + pynini.closure(
+                delete_space
+                + pynutil.delete(pynini.union("分", "min", "mins", "minute", "minutes")),
+                0,
+                1,
+            )
+            + delete_space
+            + pynutil.delete(pynini.union("から", "to", "till"))  # to, till
+            + delete_extra_space
+            + pynutil.insert('hours: "')
+            + to_hour_graph
+            + pynutil.insert('"')
+            + final_suffix
+        )
+
+        graph_h = (
+            final_graph_hour
+            + delete_extra_space
+            + pynutil.insert('minutes: "')
+            + (pynutil.insert("00") | graph_minute)
+            + pynutil.insert('"')
+            + final_suffix
+            + final_time_zone_optional
+        )
+        final_graph = (
+            (graph_hm | graph_m_past_h | graph_quarter_time)
+            + final_suffix_optional
+            + final_time_zone_optional
+        )
+        final_graph |= graph_h
+        final_graph |= graph_m_to_h_suffix_time
+
+        final_graph = self.add_tokens(final_graph.optimize())
+
+        self.fst = final_graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/tokenize_and_classify.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/tokenize_and_classify.py
+import os
+
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.taggers.cardinal import CardinalFst
+from fun_text_processing.inverse_text_normalization.ja.taggers.date import DateFst
+from fun_text_processing.inverse_text_normalization.ja.taggers.decimal import DecimalFst
+from fun_text_processing.inverse_text_normalization.ja.taggers.electronic import ElectronicFst
+from fun_text_processing.inverse_text_normalization.ja.taggers.measure import MeasureFst
+from fun_text_processing.inverse_text_normalization.ja.taggers.fraction import FractionFst
+from fun_text_processing.inverse_text_normalization.ja.taggers.money import MoneyFst
+from fun_text_processing.inverse_text_normalization.ja.taggers.ordinal import OrdinalFst
+from fun_text_processing.inverse_text_normalization.ja.taggers.punctuation import PunctuationFst
+from fun_text_processing.inverse_text_normalization.ja.taggers.telephone import TelephoneFst
+from fun_text_processing.inverse_text_normalization.ja.taggers.time import TimeFst
+from fun_text_processing.inverse_text_normalization.ja.taggers.whitelist import WhiteListFst
+from fun_text_processing.inverse_text_normalization.ja.taggers.word import WordFst
+from fun_text_processing.inverse_text_normalization.ja.taggers.preprocessor import PreProcessor
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
+    GraphFst,
+    DAMO_SIGMA,
+    delete_extra_space,
+    delete_space,
+    generator_main,
+    insert_space,
+)
+from pynini.lib import pynutil
+
+import logging
+
+
+class ClassifyFst(GraphFst):
+    """
+    Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased.
+    For deployment, this grammar will be compiled and exported to OpenFst Finate State Archiv (FAR) File.
+    More details to deployment at NeMo/tools/text_processing_deployment.
+
+    Args:
+        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
+        overwrite_cache: set to True to overwrite .far files
+    """
+
+    def __init__(
+        self,
+        cache_dir: str = None,
+        overwrite_cache: bool = False,
+        enable_standalone_number: bool = True,
+        enable_0_to_9: bool = True,
+    ):
+        super().__init__(name="tokenize_and_classify", kind="classify")
+        self.convert_number = enable_standalone_number
+        self.enable_0_to_9 = enable_0_to_9
+
+        far_file = None
+        if cache_dir is not None and cache_dir != "None":
+            os.makedirs(cache_dir, exist_ok=True)
+            far_file = os.path.join(cache_dir, "_ja_itn.far")
+        if not overwrite_cache and far_file and os.path.exists(far_file):
+            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
+            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
+        else:
+            logging.info(f"Creating ClassifyFst grammars.")
+            cardinal = CardinalFst(self.convert_number, self.enable_0_to_9)
+            cardinal_graph = cardinal.fst
+
+            fraction = FractionFst(cardinal)
+            fraction_graph = fraction.fst
+
+            ordinal = OrdinalFst(cardinal)
+            ordinal_graph = ordinal.fst
+
+            decimal = DecimalFst(cardinal)
+            decimal_graph = decimal.fst
+
+            measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
+            date_graph = DateFst(ordinal=ordinal).fst
+            word_graph = WordFst().fst
+            time_graph = TimeFst().fst
+            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
+            whitelist_graph = WhiteListFst().fst
+            punct_graph = PunctuationFst().fst
+            preprocessor = PreProcessor(halfwidth_to_fullwidth=True).fst
+            electronic_graph = ElectronicFst().fst
+            telephone_graph = TelephoneFst(cardinal).fst
+
+            classify = (
+                pynutil.add_weight(whitelist_graph, 1.01)
+                | pynutil.add_weight(time_graph, 1.1)
+                | pynutil.add_weight(date_graph, 1.09)
+                | pynutil.add_weight(decimal_graph, 1.1)
+                | pynutil.add_weight(measure_graph, 1.1)
+                | pynutil.add_weight(cardinal_graph, 1.1)
+                | pynutil.add_weight(ordinal_graph, 1.1)
+                | pynutil.add_weight(fraction_graph, 1.09)
+                | pynutil.add_weight(money_graph, 1.1)
+                | pynutil.add_weight(telephone_graph, 1.1)
+                | pynutil.add_weight(electronic_graph, 1.1)
+                | pynutil.add_weight(word_graph, 100)
+            )
+
+            punct = (
+                pynutil.insert("tokens { ")
+                + pynutil.add_weight(punct_graph, weight=1.1)
+                + pynutil.insert(" }")
+            )
+            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
+            token_plus_punct = (
+                pynini.closure(punct + pynutil.insert(" "))
+                + token
+                + pynini.closure(pynutil.insert(" ") + punct)
+            )
+            graph = token_plus_punct + pynini.closure(
+                pynini.union(insert_space, delete_extra_space) + token_plus_punct
+            )
+
+            graph = delete_space + graph + delete_space
+
+            self.fst = graph.optimize()
+
+            if far_file:
+                generator_main(far_file, {"tokenize_and_classify": self.fst})
+                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
+
+            self.token_plus_punct = token_plus_punct.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/whitelist.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/whitelist.py
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.utils import get_abs_path
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import GraphFst, convert_space
+from pynini.lib import pynutil
+
+
+class WhiteListFst(GraphFst):
+    """
+    Finite state transducer for classifying whitelisted tokens
+        e.g. misses -> tokens { name: "mrs." }
+    This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv".
+    """
+
+    def __init__(self):
+        super().__init__(name="whitelist", kind="classify")
+
+        whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
+        graph = pynutil.insert('name: "') + convert_space(whitelist) + pynutil.insert('"')
+        self.fst = graph.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/word.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/taggers/word.py
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
+    DAMO_NOT_SPACE,
+    GraphFst,
+    DAMO_CHAR,
+)
+from pynini.lib import pynutil
+
+
+class WordFst(GraphFst):
+    """
+    Finite state transducer for classifying plain tokens, that do not belong to any special class. This can be considered as the default class.
+        e.g. sleep -> tokens { name: "sleep" }
+    """
+
+    def __init__(self):
+        super().__init__(name="word", kind="classify")
+        word = pynutil.insert('name: "') + DAMO_NOT_SPACE + pynutil.insert('"')
+
+        self.fst = word.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/utils.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/utils.py
+import os
+from typing import Union
+
+import inflect
+
+_inflect = inflect.engine()
+
+
+def num_to_word(x: Union[str, int]):
+    """
+    converts integer to spoken representation
+
+    Args
+        x: integer
+
+    Returns: spoken representation
+    """
+    if isinstance(x, int):
+        x = str(x)
+        x = _inflect.number_to_words(str(x)).replace("-", " ").replace(",", "")
+    return x
+
+
+def get_abs_path(rel_path):
+    """
+    Get absolute path
+
+    Args:
+        rel_path: relative path to this file
+
+    Returns absolute path
+    """
+    return os.path.dirname(os.path.abspath(__file__)) + "/" + rel_path
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/verbalizers/__init__.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/verbalizers/__init__.py
+
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/verbalizers/cardinal.py
+# -*- coding: utf-8 -*-
+
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
+    DAMO_NOT_QUOTE,
+    GraphFst,
+    delete_space,
+    DAMO_CHAR,
+)
+from pynini.lib import pynutil
+
+
+class CardinalFst(GraphFst):
+    """
+    Finite state transducer for verbalizing cardinal
+        e.g. cardinal { integer: "23" negative: "-" } -> -23
+    """
+
+    def __init__(self):
+        # enable_standalone_number: bool = True,
+        # enable_0_to_9: bool = True):
+        super().__init__(name="cardinal", kind="verbalize")
+        # self.enable_standalone_number = enable_standalone_number
+        # self.enable_0_to_9 = enable_0_to_9
+        optional_sign = pynini.closure(
+            pynutil.delete("negative:")
+            + delete_space
+            + pynutil.delete('"')
+            + DAMO_NOT_QUOTE
+            + pynutil.delete('"')
+            + delete_space,
+            0,
+            1,
+        )
+        graph = (
+            pynutil.delete("integer:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+
+        # if self.enable_standalone_number:
+        #     if self.enable_0_to_9:
+        #     else:
+        self.numbers = graph
+        graph = optional_sign + graph
+        delete_tokens = self.delete_tokens(graph)
+        self.fst = delete_tokens.optimize()
--- a/FunASR/fun_text_processing/inverse_text_normalization/ja/verbalizers/date.py
+++ b/FunASR/fun_text_processing/inverse_text_normalization/ja/verbalizers/date.py
+import pynini
+from fun_text_processing.inverse_text_normalization.ja.graph_utils import (
+    DAMO_NOT_QUOTE,
+    GraphFst,
+    delete_extra_space,
+    delete_space,
+)
+from pynini.lib import pynutil
+
+
+class DateFst(GraphFst):
+    """
+    Finite state transducer for verbalizing date, e.g.
+        date { month: "january" day: "5" year: "2012" preserve_order: true } -> february 5 2012
+        date { day: "5" month: "january" year: "2012" preserve_order: true } -> 5 february 2012
+    """
+
+    def __init__(self):
+        super().__init__(name="date", kind="verbalize")
+        month = (
+            pynutil.delete("month:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+        day = (
+            pynutil.delete("day:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + pynutil.delete('"')
+        )
+        year = (
+            pynutil.delete("year:")
+            + delete_space
+            + pynutil.delete('"')
+            + pynini.closure(DAMO_NOT_QUOTE, 1)
+            + delete_space
+            + pynutil.delete('"')
+        )
+
+        # month (day) year
+        graph_mdy = (
+            month
+            + pynini.closure(delete_extra_space + day, 0, 1)
+            + pynini.closure(delete_extra_space + year, 0, 1)
+        )
+
+        # (day) month year
+        graph_dmy = (
+            pynini.closure(day + delete_extra_space, 0, 1)
+            + month
+            + pynini.closure(delete_extra_space + year, 0, 1)
+        )
+
+        optional_preserve_order = pynini.closure(
+            pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
+            | pynutil.delete("field_order:")
+            + delete_space
+            + pynutil.delete('"')
+            + DAMO_NOT_QUOTE
+            + pynutil.delete('"')
+            + delete_space
+        )
+
+        final_graph = (graph_mdy | year | graph_dmy) + delete_space + optional_preserve_order
+
+        delete_tokens = self.delete_tokens(final_graph)
+        self.fst = delete_tokens.optimize()