"DeepSeek-OCR2-vllm/doc/DeepSeek_OCR_paper_layouts.md" did not exist on "80c11a033d1cd65a873f856e4a62a5af54e46377"
Commit 431278fa authored by “change”'s avatar “change”
Browse files

Initial commit

parent 8c252776
Pipeline #1949 failed with stages
in 0 seconds
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class MoneyFst(GraphFst):
"""
    Finite state transducer for verbalizing money
e.g. money { integer_part: "2 руб." } -> "2 руб."
"""
def __init__(self):
super().__init__(name="money", kind="verbalize")
graph = (
pynutil.delete('integer_part: "')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
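
# Illustrative quick check, not part of the original module: it assumes the
# NeMo-style GraphFst.delete_tokens wrapper (defined in graph_utils) makes the
# final fst accept the full serialized token, e.g. 'money { ... }'.
if __name__ == "__main__":
    from pynini.lib import rewrite

    print(rewrite.top_rewrite('money { integer_part: "2 руб." }', MoneyFst().fst))
    # expected output: 2 руб.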
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class OrdinalFst(GraphFst):
"""
Finite state transducer for verbalizing ordinal numbers
e.g. ordinal { integer: "2" } -> "2"
Args:
        deterministic: if True, provides a single transduction option;
            if False, multiple transduction options are generated (used for audio-based normalization)
"""
def __init__(self, deterministic: bool = True):
super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)
value = pynini.closure(DAMO_NOT_QUOTE)
graph = pynutil.delete('integer: "') + value + pynutil.delete('"')
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class TelephoneFst(GraphFst):
"""
    Finite state transducer for verbalizing telephone numbers
e.g. telephone { number_part: "8-913-983-56-01" } -> "8-913-983-56-01"
"""
def __init__(self):
super().__init__(name="telephone", kind="verbalize")
graph = (
pynutil.delete('number_part: "')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class TimeFst(GraphFst):
"""
Finite state transducer for verbalizing time
e.g. time { hours: "02:15" } -> "02:15"
"""
def __init__(self):
super().__init__(name="time", kind="verbalize")
hour = (
pynutil.delete("hours: ")
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
minutes = (
pynutil.delete("minutes: ")
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
graph_preserve_order = (
pynutil.delete('hours: "') + pynini.closure(DAMO_NOT_QUOTE, 1) + pynutil.delete('"')
)
# for cases that require permutations for the correct verbalization
graph_reverse_order = hour + delete_space + pynutil.insert(":") + minutes + delete_space
graph = graph_preserve_order | graph_reverse_order
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
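
# Illustrative quick check, not part of the original module: exercises the
# reverse-order branch, which joins separately tagged hours and minutes with
# ":" (assumes GraphFst.delete_tokens strips the 'time { ... }' wrapper).
if __name__ == "__main__":
    from pynini.lib import rewrite

    print(rewrite.top_rewrite('time { hours: "12" minutes: "30" }', TimeFst().fst))
    # expected output: 12:30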
from fun_text_processing.inverse_text_normalization.en.verbalizers.whitelist import WhiteListFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.date import DateFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.electronic import ElectronicFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.ordinal import OrdinalFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.telephone import TelephoneFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.time import TimeFst
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
class VerbalizeFst(GraphFst):
"""
Composes other verbalizer grammars.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details on deployment at NeMo/tools/text_processing_deployment.
"""
def __init__(self):
super().__init__(name="verbalize", kind="verbalize")
cardinal = CardinalFst()
cardinal_graph = cardinal.fst
ordinal = OrdinalFst()
ordinal_graph = ordinal.fst
decimal = DecimalFst()
decimal_graph = decimal.fst
whitelist_graph = WhiteListFst().fst
electronic_graph = ElectronicFst().fst
money_graph = MoneyFst().fst
date_graph = DateFst().fst
measure_graph = MeasureFst().fst
telephone_graph = TelephoneFst().fst
time_graph = TimeFst().fst
graph = (
whitelist_graph
| cardinal_graph
| ordinal_graph
| decimal_graph
| electronic_graph
| date_graph
| money_graph
| measure_graph
| telephone_graph
| time_graph
)
self.fst = graph
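
# Illustrative quick check, not part of the original module: because the class
# grammars are unioned, the combined fst verbalizes any single token type.
if __name__ == "__main__":
    from pynini.lib import rewrite

    print(rewrite.top_rewrite('ordinal { integer: "2" }', VerbalizeFst().fst))
    # expected output: 2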
import pynini
from fun_text_processing.inverse_text_normalization.en.verbalizers.word import WordFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
class VerbalizeFinalFst(GraphFst):
"""
Finite state transducer that verbalizes an entire sentence, e.g.
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now
"""
def __init__(self):
super().__init__(name="verbalize_final", kind="verbalize")
verbalize = VerbalizeFst().fst
word = WordFst().fst
types = verbalize | word
graph = (
pynutil.delete("tokens")
+ delete_space
+ pynutil.delete("{")
+ delete_space
+ types
+ delete_space
+ pynutil.delete("}")
)
graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
self.fst = graph
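
# Illustrative quick check, not part of the original module: verbalizes the
# serialized sentence from the docstring above (assumes WordFst passes plain
# 'name: "..."' tokens through and delete_extra_space collapses whitespace,
# as defined in graph_utils).
if __name__ == "__main__":
    from pynini.lib import rewrite

    tagged = 'tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" }'
    print(rewrite.top_rewrite(tagged, VerbalizeFinalFst().fst))
    # expected output: its 12:30 now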
from argparse import ArgumentParser
from fun_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
from fun_text_processing.text_normalization.data_loader_utils import (
evaluate,
known_types,
load_files,
training_data_to_sentences,
training_data_to_tokens,
)
"""
Runs evaluation on data in the format: <semiotic class>\t<unnormalized text>\t<`self` if trivial class, else normalized text>,
like the Google text normalization data https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish
"""
def parse_args():
parser = ArgumentParser()
parser.add_argument("--input", help="input file path", type=str, required=True)
parser.add_argument(
"--lang",
help="language",
choices=["en", "id", "ja", "de", "es", "pt", "ru", "fr", "vi", "ko", "zh", "fil"],
default="en",
type=str,
)
parser.add_argument(
"--cat",
dest="category",
help="focus on class only (" + ", ".join(known_types) + ")",
type=str,
default=None,
choices=known_types,
)
parser.add_argument(
"--filter", action="store_true", help="clean data for inverse normalization purposes"
)
return parser.parse_args()
if __name__ == "__main__":
# Example usage:
# python run_evaluate.py --input=<INPUT> --cat=<CATEGORY> --filter
args = parse_args()
if args.lang == "en":
from fun_text_processing.inverse_text_normalization.en.clean_eval_data import filter_loaded_data
file_path = args.input
    # Pass the selected language through (assumes the NeMo-style `lang` kwarg);
    # otherwise --lang would only affect the optional English data filter.
    inverse_normalizer = InverseNormalizer(lang=args.lang)
print("Loading training data: " + file_path)
training_data = load_files([file_path])
    if args.filter:
        # filter_loaded_data is only imported for English (see above), so guard
        # against a NameError for other languages.
        if args.lang != "en":
            raise ValueError("--filter is currently only supported for --lang=en")
        training_data = filter_loaded_data(training_data)
# Evaluate at sentence level if no specific category is provided
if args.category is None:
print("Sentence level evaluation...")
sentences_un_normalized, sentences_normalized, _ = training_data_to_sentences(training_data)
print("- Data: " + str(len(sentences_normalized)) + " sentences")
sentences_prediction = inverse_normalizer.inverse_normalize_list(sentences_normalized)
print("- Denormalized. Evaluating...")
sentences_accuracy = evaluate(
preds=sentences_prediction, labels=sentences_un_normalized, input=sentences_normalized
)
print("- Accuracy: " + str(sentences_accuracy))
# Evaluate at token level
print("Token level evaluation...")
tokens_per_type = training_data_to_tokens(training_data, category=args.category)
token_accuracy = {}
for token_type, (tokens_un_normalized, tokens_normalized) in tokens_per_type.items():
print("- Token type: " + token_type)
print(" - Data: " + str(len(tokens_normalized)) + " tokens")
tokens_prediction = inverse_normalizer.inverse_normalize_list(tokens_normalized)
print(" - Denormalized. Evaluating...")
token_accuracy[token_type] = evaluate(
tokens_prediction, tokens_un_normalized, input=tokens_normalized
)
print(" - Accuracy: " + str(token_accuracy[token_type]))
# Calculate weighted token accuracy
token_count_per_type = {token_type: len(tokens) for token_type, (tokens, _) in tokens_per_type.items()}
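    # weighted accuracy = sum(count_i * accuracy_i) / sum(count_i): each class
    # contributes in proportion to its token count. For example (illustrative
    # numbers), 90 CARDINAL tokens at 0.9 and 10 DATE tokens at 0.5 give
    # (90 * 0.9 + 10 * 0.5) / 100 = 0.86.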
token_weighted_accuracy = [
token_count_per_type[token_type] * accuracy
for token_type, accuracy in token_accuracy.items()
]
print("- Accuracy: " + str(sum(token_weighted_accuracy) / sum(token_count_per_type.values())))
print(" - Total: " + str(sum(token_count_per_type.values())), "\n")
for token_type in token_accuracy:
if token_type not in known_types:
raise ValueError("Unexpected token type: " + token_type)
# Output table summarizing evaluation results if no specific category is provided
if args.category is None:
c1 = ["Class", "sent level"] + known_types
        c2 = ["Num Tokens", str(len(sentences_normalized))] + [
str(token_count_per_type.get(known_type, 0)) for known_type in known_types
]
c3 = ["Denormalization", str(sentences_accuracy)] + [
str(token_accuracy.get(known_type, "0")) for known_type in known_types
]
for i in range(len(c1)):
print(f"{c1[i]:10s} | {c2[i]:10s} | {c3[i]:5s}")
else:
print(f"numbers\t{token_count_per_type[args.category]}")
print(f"Denormalization\t{token_accuracy[args.category]}")
from fun_text_processing.inverse_text_normalization.tl.taggers.tokenize_and_classify import (
ClassifyFst,
)
from fun_text_processing.inverse_text_normalization.tl.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.inverse_text_normalization.tl.verbalizers.verbalize_final import (
VerbalizeFinalFst,
)