Commit 70a8a9e0 authored by wangwei990215

initial commit

import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
convert_space,
delete_space,
)
from pynini.lib import pynutil
class FractionFst(GraphFst):
"""
Finite state transducer for classifying fraction
e.g. ein halb -> tokens { name: "1/2" }
e.g. ein ein halb -> tokens { name: "1 1/2" }
e.g. drei zwei ein hundertstel -> tokens { name: "3 2/100" }
Args:
itn_cardinal_tagger: ITN cardinal tagger
tn_fraction_verbalizer: TN fraction verbalizer
"""
def __init__(
self,
itn_cardinal_tagger: GraphFst,
tn_fraction_verbalizer: GraphFst,
deterministic: bool = True,
):
super().__init__(name="fraction", kind="classify", deterministic=deterministic)
tagger = tn_fraction_verbalizer.graph.invert().optimize()
delete_optional_sign = pynini.closure(
pynutil.delete("negative: ") + pynini.cross('"true" ', "-"), 0, 1
)
delete_integer_marker = (
pynutil.delete('integer_part: "')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
) @ itn_cardinal_tagger.graph_no_exception
delete_numerator_marker = (
pynutil.delete('numerator: "') + pynini.closure(DAMO_NOT_QUOTE, 1) + pynutil.delete('"')
) @ itn_cardinal_tagger.graph_no_exception
delete_denominator_marker = (
pynutil.insert("/")
+ (
pynutil.delete('denominator: "')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
@ itn_cardinal_tagger.graph_no_exception
)
graph = (
pynini.closure(delete_integer_marker + pynini.accep(" "), 0, 1)
+ delete_numerator_marker
+ delete_space
+ delete_denominator_marker
).optimize()
verbalizer = delete_optional_sign + graph
self.graph = tagger @ verbalizer
graph = pynutil.insert('name: "') + convert_space(self.graph) + pynutil.insert('"')
self.fst = graph.optimize()
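

# Minimal usage sketch (illustration only, not part of the original file): the
# dependent grammars are wired the same way tokenize_and_classify.py does
# further below, and rewrite.top_rewrite extracts the single best rewrite.
if __name__ == "__main__":
    from pynini.lib import rewrite
    from fun_text_processing.inverse_text_normalization.de.taggers.cardinal import CardinalFst
    from fun_text_processing.text_normalization.de.taggers.cardinal import (
        CardinalFst as TNCardinalTagger,
    )
    from fun_text_processing.text_normalization.de.verbalizers.fraction import (
        FractionFst as TNFractionVerbalizer,
    )
    from fun_text_processing.text_normalization.de.verbalizers.ordinal import (
        OrdinalFst as TNOrdinalVerbalizer,
    )

    tn_cardinal_tagger = TNCardinalTagger(deterministic=False)
    tn_ordinal_verbalizer = TNOrdinalVerbalizer(deterministic=False)
    tn_fraction_verbalizer = TNFractionVerbalizer(
        ordinal=tn_ordinal_verbalizer, deterministic=False
    )
    itn_cardinal_tagger = CardinalFst(tn_cardinal_tagger=tn_cardinal_tagger)

    fraction = FractionFst(
        itn_cardinal_tagger=itn_cardinal_tagger,
        tn_fraction_verbalizer=tn_fraction_verbalizer,
    )
    # expected, per the docstring example above: name: "1/2"
    print(rewrite.top_rewrite("ein halb", fraction.fst))
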
import pynini
from fun_text_processing.text_normalization.de.taggers.measure import (
singular_to_plural,
unit_singular,
)
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_SIGMA,
GraphFst,
convert_space,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
class MeasureFst(GraphFst):
"""
Finite state transducer for classifying measure. Allows for plural form for unit.
e.g. minus elf kilogramm -> measure { cardinal { negative: "true" integer: "11" } units: "kg" }
e.g. drei stunden -> measure { cardinal { integer: "3" } units: "h" }
e.g. ein halb kilogramm -> measure { decimal { integer_part: "1/2" } units: "kg" }
e.g. eins komma zwei kilogramm -> measure { decimal { integer_part: "1" fractional_part: "2" } units: "kg" }
Args:
itn_cardinal_tagger: ITN Cardinal tagger
itn_decimal_tagger: ITN Decimal tagger
itn_fraction_tagger: ITN Fraction tagger
"""
def __init__(
self,
itn_cardinal_tagger: GraphFst,
itn_decimal_tagger: GraphFst,
itn_fraction_tagger: GraphFst,
deterministic: bool = True,
):
super().__init__(name="measure", kind="classify", deterministic=deterministic)
cardinal_graph = (
pynini.cdrewrite(
pynini.cross(pynini.union("ein", "eine"), "eins"), "[BOS]", "[EOS]", DAMO_SIGMA
)
@ itn_cardinal_tagger.graph_no_exception
)
graph_unit_singular = pynini.invert(unit_singular) # singular -> abbr
unit = (
pynini.invert(singular_to_plural()) @ graph_unit_singular
) | graph_unit_singular # plural -> abbr
unit = convert_space(unit)
graph_unit_singular = convert_space(graph_unit_singular)
optional_graph_negative = pynini.closure(
pynutil.insert("negative: ") + pynini.cross("minus", '"true"') + delete_extra_space,
0,
1,
)
unit_misc = pynutil.insert("/") + pynutil.delete("pro") + delete_space + graph_unit_singular
unit = (
pynutil.insert('units: "')
+ (unit | unit_misc | pynutil.add_weight(unit + delete_space + unit_misc, 0.01))
+ pynutil.insert('"')
)
subgraph_decimal = (
pynutil.insert("decimal { ")
+ optional_graph_negative
+ itn_decimal_tagger.final_graph_wo_negative
+ pynutil.insert(" }")
+ delete_extra_space
+ unit
)
subgraph_fraction = (
pynutil.insert("decimal { ")
+ optional_graph_negative
+ pynutil.insert('integer_part: "')
+ itn_fraction_tagger.graph
+ pynutil.insert('" }')
+ delete_extra_space
+ unit
)
subgraph_cardinal = (
pynutil.insert("cardinal { ")
+ optional_graph_negative
+ pynutil.insert('integer: "')
+ cardinal_graph
+ pynutil.insert('"')
+ pynutil.insert(" }")
+ delete_extra_space
+ unit
)
final_graph = subgraph_cardinal | subgraph_decimal | subgraph_fraction
final_graph = self.add_tokens(final_graph)
self.fst = final_graph.optimize()
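

# Standalone sketch (illustration only) of the "ein"/"eine" -> "eins"
# normalization used for cardinal_graph above: the cdrewrite context is pinned
# to [BOS]/[EOS], so the rewrite only fires when the token is the whole input.
if __name__ == "__main__":
    from pynini.lib import rewrite

    to_eins = pynini.cdrewrite(
        pynini.cross(pynini.union("ein", "eine"), "eins"), "[BOS]", "[EOS]", DAMO_SIGMA
    )
    print(rewrite.top_rewrite("ein", to_eins))       # -> eins
    print(rewrite.top_rewrite("ein kilo", to_eins))  # -> ein kilo (left unchanged)
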
import pynini
from fun_text_processing.text_normalization.de.taggers.money import (
maj_singular,
min_plural,
min_singular,
)
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_DIGIT,
DAMO_SIGMA,
GraphFst,
convert_space,
delete_extra_space,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class MoneyFst(GraphFst):
"""
Finite state transducer for classifying money
    e.g. elf euro und vier cent -> money { integer_part: "11" fractional_part: "04" currency: "€" }
Args:
itn_cardinal_tagger: ITN Cardinal Tagger
itn_decimal_tagger: ITN Decimal Tagger
"""
def __init__(
self,
itn_cardinal_tagger: GraphFst,
itn_decimal_tagger: GraphFst,
deterministic: bool = True,
):
super().__init__(name="money", kind="classify", deterministic=deterministic)
cardinal_graph = (
pynini.cdrewrite(
pynini.cross(pynini.union("ein", "eine"), "eins"), "[BOS]", "[EOS]", DAMO_SIGMA
)
@ itn_cardinal_tagger.graph_no_exception
)
graph_decimal_final = itn_decimal_tagger.final_graph_wo_negative
graph_unit = pynini.invert(maj_singular)
graph_unit = pynutil.insert('currency: "') + convert_space(graph_unit) + pynutil.insert('"')
add_leading_zero_to_double_digit = (DAMO_DIGIT + DAMO_DIGIT) | (
pynutil.insert("0") + DAMO_DIGIT
)
min_unit = pynini.project(min_singular | min_plural, "output")
# elf euro (und) vier cent, vier cent
cents_standalone = (
pynutil.insert('fractional_part: "')
+ cardinal_graph @ add_leading_zero_to_double_digit
+ delete_space
+ pynutil.delete(min_unit)
+ pynutil.insert('"')
)
optional_cents_standalone = pynini.closure(
delete_space
+ pynini.closure(pynutil.delete("und") + delete_space, 0, 1)
+ insert_space
+ cents_standalone,
0,
1,
)
# elf euro vierzig, only after integer
optional_cents_suffix = pynini.closure(
delete_extra_space
+ pynutil.insert('fractional_part: "')
+ pynutil.add_weight(cardinal_graph @ add_leading_zero_to_double_digit, -0.7)
+ pynutil.insert('"'),
0,
1,
)
graph_integer = (
pynutil.insert('integer_part: "')
+ cardinal_graph
+ pynutil.insert('"')
+ delete_extra_space
+ graph_unit
+ (optional_cents_standalone | optional_cents_suffix)
)
graph_decimal = graph_decimal_final + delete_extra_space + graph_unit
graph_decimal |= pynutil.insert('currency: "€" integer_part: "0" ') + cents_standalone
final_graph = graph_integer | graph_decimal
final_graph = self.add_tokens(final_graph)
self.fst = final_graph.optimize()
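

# Standalone sketch (illustration only) of add_leading_zero_to_double_digit as
# used for cent amounts above: two digits pass through unchanged, a single
# digit gets a "0" prepended.
if __name__ == "__main__":
    from pynini.lib import rewrite

    pad = (DAMO_DIGIT + DAMO_DIGIT) | (pynutil.insert("0") + DAMO_DIGIT)
    print(rewrite.top_rewrite("4", pad))   # -> 04
    print(rewrite.top_rewrite("40", pad))  # -> 40
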
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class OrdinalFst(GraphFst):
"""
Finite state transducer for classifying ordinal
e.g. dreizehnter -> tokens { name: "13." }
Args:
itn_cardinal_tagger: ITN Cardinal Tagger
tn_ordinal_verbalizer: TN Ordinal Verbalizer
"""
def __init__(
self,
itn_cardinal_tagger: GraphFst,
tn_ordinal_verbalizer: GraphFst,
deterministic: bool = True,
):
super().__init__(name="ordinal", kind="classify", deterministic=deterministic)
tagger = tn_ordinal_verbalizer.graph.invert().optimize()
graph = (
pynutil.delete('integer: "') + pynini.closure(DAMO_NOT_QUOTE, 1) + pynutil.delete('"')
) @ itn_cardinal_tagger.graph
final_graph = tagger @ graph + pynutil.insert(".")
graph = pynutil.insert('name: "') + final_graph + pynutil.insert('"')
self.fst = graph.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
convert_space,
insert_space,
)
from pynini.lib import pynutil
class TelephoneFst(GraphFst):
"""
Finite state transducer for classifying telephone numbers, e.g.
null vier eins eins eins zwei drei vier eins zwei drei vier -> tokens { name: "(0411) 1234-1234" }
Args:
tn_cardinal_tagger: TN Cardinal Tagger
"""
def __init__(self, tn_cardinal_tagger: GraphFst, deterministic: bool = True):
super().__init__(name="telephone", kind="classify", deterministic=deterministic)
separator = pynini.accep(" ") # between components
digit = pynini.union(*list(map(str, range(1, 10)))) @ tn_cardinal_tagger.two_digit_non_zero
zero = pynini.cross("0", "null")
number_part = (
pynutil.delete("(")
+ zero
+ insert_space
+ pynini.closure(digit + insert_space, 2, 2)
+ digit
+ pynutil.delete(")")
+ separator
+ pynini.closure(digit + insert_space, 3, 3)
+ digit
+ pynutil.delete("-")
+ insert_space
+ pynini.closure(digit + insert_space, 3, 3)
+ digit
)
graph = convert_space(pynini.invert(number_part))
final_graph = pynutil.insert('name: "') + graph + pynutil.insert('"')
self.fst = final_graph.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_SIGMA, GraphFst
from pynini.lib import pynutil
class TimeFst(GraphFst):
"""
Finite state transducer for classifying time
    e.g. acht uhr e s t -> time { hours: "8" zone: "e s t" }
e.g. dreizehn uhr -> time { hours: "13" }
e.g. dreizehn uhr zehn -> time { hours: "13" minutes: "10" }
e.g. viertel vor zwölf -> time { minutes: "45" hours: "11" }
e.g. viertel nach zwölf -> time { minutes: "15" hours: "12" }
e.g. halb zwölf -> time { minutes: "30" hours: "11" }
e.g. drei vor zwölf -> time { minutes: "57" hours: "11" }
e.g. drei nach zwölf -> time { minutes: "3" hours: "12" }
    e.g. drei uhr zehn minuten zehn sekunden -> time { hours: "3" minutes: "10" seconds: "10" }
Args:
tn_time_verbalizer: TN time verbalizer
"""
def __init__(self, tn_time_verbalizer: GraphFst, deterministic: bool = True):
super().__init__(name="time", kind="classify", deterministic=deterministic)
# lazy way to make sure compounds work
optional_delete_space = pynini.closure(DAMO_SIGMA | pynutil.delete(" ", weight=0.0001))
graph = (tn_time_verbalizer.graph @ optional_delete_space).invert().optimize()
self.fst = self.add_tokens(graph).optimize()
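

# Standalone sketch (illustration only) of the optional_delete_space trick
# above: every symbol passes through unchanged, but a space may also be dropped
# at a tiny extra cost, so after inversion the compound spelling (e.g.
# "dreizehnuhr") is accepted alongside the spaced form the TN verbalizer emits.
if __name__ == "__main__":
    from pynini.lib import rewrite

    optional_delete_space = pynini.closure(DAMO_SIGMA | pynutil.delete(" ", weight=0.0001))
    # both spellings are in the lattice; the spaced one is the cheapest path
    print(rewrite.rewrites("dreizehn uhr", optional_delete_space))     # e.g. ['dreizehn uhr', 'dreizehnuhr']
    print(rewrite.top_rewrite("dreizehn uhr", optional_delete_space))  # -> dreizehn uhr
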
import os
import pynini
from fun_text_processing.inverse_text_normalization.de.taggers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.de.taggers.date import DateFst
from fun_text_processing.inverse_text_normalization.de.taggers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.de.taggers.electronic import ElectronicFst
from fun_text_processing.inverse_text_normalization.de.taggers.fraction import FractionFst
from fun_text_processing.inverse_text_normalization.de.taggers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.de.taggers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.de.taggers.ordinal import OrdinalFst
from fun_text_processing.inverse_text_normalization.de.taggers.telephone import TelephoneFst
from fun_text_processing.inverse_text_normalization.de.taggers.time import TimeFst
from fun_text_processing.inverse_text_normalization.de.taggers.whitelist import WhiteListFst
from fun_text_processing.inverse_text_normalization.en.taggers.punctuation import PunctuationFst
from fun_text_processing.inverse_text_normalization.en.taggers.word import WordFst
from fun_text_processing.text_normalization.de.taggers.cardinal import (
CardinalFst as TNCardinalTagger,
)
from fun_text_processing.text_normalization.de.taggers.date import DateFst as TNDateTagger
from fun_text_processing.text_normalization.de.taggers.decimal import DecimalFst as TNDecimalTagger
from fun_text_processing.text_normalization.de.taggers.electronic import (
ElectronicFst as TNElectronicTagger,
)
from fun_text_processing.text_normalization.de.taggers.whitelist import (
WhiteListFst as TNWhitelistTagger,
)
from fun_text_processing.text_normalization.de.verbalizers.date import DateFst as TNDateVerbalizer
from fun_text_processing.text_normalization.de.verbalizers.electronic import (
ElectronicFst as TNElectronicVerbalizer,
)
from fun_text_processing.text_normalization.de.verbalizers.fraction import (
FractionFst as TNFractionVerbalizer,
)
from fun_text_processing.text_normalization.de.verbalizers.ordinal import (
OrdinalFst as TNOrdinalVerbalizer,
)
from fun_text_processing.text_normalization.de.verbalizers.time import TimeFst as TNTimeVerbalizer
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_extra_space,
delete_space,
generator_main,
)
from pynini.lib import pynutil
import logging
class ClassifyFst(GraphFst):
"""
    Final class that composes all other classification grammars. This class can process an entire lower-cased sentence.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details on deployment can be found in NeMo/tools/text_processing_deployment.
Args:
cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
overwrite_cache: set to True to overwrite .far files
"""
def __init__(
self, cache_dir: str = None, overwrite_cache: bool = False, deterministic: bool = True
):
super().__init__(name="tokenize_and_classify", kind="classify", deterministic=deterministic)
far_file = None
if cache_dir is not None and cache_dir != "None":
os.makedirs(cache_dir, exist_ok=True)
far_file = os.path.join(cache_dir, "_de_itn.far")
if not overwrite_cache and far_file and os.path.exists(far_file):
self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
logging.info(f"ClassifyFst.fst was restored from {far_file}.")
else:
logging.info(f"Creating ClassifyFst grammars.")
tn_cardinal_tagger = TNCardinalTagger(deterministic=False)
tn_date_tagger = TNDateTagger(cardinal=tn_cardinal_tagger, deterministic=False)
tn_decimal_tagger = TNDecimalTagger(cardinal=tn_cardinal_tagger, deterministic=False)
tn_ordinal_verbalizer = TNOrdinalVerbalizer(deterministic=False)
tn_fraction_verbalizer = TNFractionVerbalizer(
ordinal=tn_ordinal_verbalizer, deterministic=False
)
tn_time_verbalizer = TNTimeVerbalizer(
cardinal_tagger=tn_cardinal_tagger, deterministic=False
)
tn_date_verbalizer = TNDateVerbalizer(
ordinal=tn_ordinal_verbalizer, deterministic=False
)
tn_electronic_tagger = TNElectronicTagger(deterministic=False)
tn_electronic_verbalizer = TNElectronicVerbalizer(deterministic=False)
tn_whitelist_tagger = TNWhitelistTagger(input_case="cased", deterministic=False)
cardinal = CardinalFst(tn_cardinal_tagger=tn_cardinal_tagger)
cardinal_graph = cardinal.fst
ordinal = OrdinalFst(
itn_cardinal_tagger=cardinal, tn_ordinal_verbalizer=tn_ordinal_verbalizer
)
ordinal_graph = ordinal.fst
decimal = DecimalFst(itn_cardinal_tagger=cardinal, tn_decimal_tagger=tn_decimal_tagger)
decimal_graph = decimal.fst
fraction = FractionFst(
itn_cardinal_tagger=cardinal, tn_fraction_verbalizer=tn_fraction_verbalizer
)
fraction_graph = fraction.fst
measure_graph = MeasureFst(
itn_cardinal_tagger=cardinal,
itn_decimal_tagger=decimal,
itn_fraction_tagger=fraction,
).fst
date_graph = DateFst(
itn_cardinal_tagger=cardinal,
tn_date_verbalizer=tn_date_verbalizer,
tn_date_tagger=tn_date_tagger,
).fst
word_graph = WordFst().fst
time_graph = TimeFst(tn_time_verbalizer=tn_time_verbalizer).fst
money_graph = MoneyFst(itn_cardinal_tagger=cardinal, itn_decimal_tagger=decimal).fst
whitelist_graph = WhiteListFst(tn_whitelist_tagger=tn_whitelist_tagger).fst
punct_graph = PunctuationFst().fst
electronic_graph = ElectronicFst(
tn_electronic_tagger=tn_electronic_tagger,
tn_electronic_verbalizer=tn_electronic_verbalizer,
).fst
telephone_graph = TelephoneFst(tn_cardinal_tagger=tn_cardinal_tagger).fst
classify = (
pynutil.add_weight(cardinal_graph, 1.1)
| pynutil.add_weight(whitelist_graph, 1.0)
| pynutil.add_weight(time_graph, 1.1)
| pynutil.add_weight(date_graph, 1.1)
| pynutil.add_weight(decimal_graph, 1.1)
| pynutil.add_weight(measure_graph, 1.1)
| pynutil.add_weight(ordinal_graph, 1.1)
| pynutil.add_weight(fraction_graph, 1.1)
| pynutil.add_weight(money_graph, 1.1)
| pynutil.add_weight(telephone_graph, 1.1)
| pynutil.add_weight(electronic_graph, 1.1)
| pynutil.add_weight(word_graph, 100)
)
punct = (
pynutil.insert("tokens { ")
+ pynutil.add_weight(punct_graph, weight=1.1)
+ pynutil.insert(" }")
)
token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
token_plus_punct = (
pynini.closure(punct + pynutil.insert(" "))
+ token
+ pynini.closure(pynutil.insert(" ") + punct)
)
graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
graph = delete_space + graph + delete_space
self.fst = graph.optimize()
if far_file:
generator_main(far_file, {"tokenize_and_classify": self.fst})
logging.info(f"ClassifyFst grammars are saved to {far_file}.")
import pynini
from fun_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space
from pynini.lib import pynutil
class WhiteListFst(GraphFst):
"""
Finite state transducer for classifying whitelisted tokens
e.g. misses -> tokens { name: "Mrs." }
Args:
tn_whitelist_tagger: TN whitelist tagger
"""
def __init__(self, tn_whitelist_tagger: GraphFst, deterministic: bool = True):
super().__init__(name="whitelist", kind="classify", deterministic=deterministic)
whitelist = pynini.invert(tn_whitelist_tagger.graph)
graph = pynutil.insert('name: "') + convert_space(whitelist) + pynutil.insert('"')
self.fst = graph.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class CardinalFst(GraphFst):
"""
Finite state transducer for verbalizing cardinal
e.g. cardinal { integer: "23" negative: "-" } -> -23
Args:
tn_cardinal_verbalizer: TN cardinal verbalizer
"""
def __init__(self, tn_cardinal_verbalizer: GraphFst, deterministic: bool = True):
super().__init__(name="cardinal", kind="verbalize", deterministic=deterministic)
self.numbers = tn_cardinal_verbalizer.numbers
optional_sign = pynini.closure(
pynutil.delete('negative: "') + DAMO_NOT_QUOTE + pynutil.delete('" '), 0, 1
)
graph = optional_sign + self.numbers
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_preserve_order,
)
from pynini.lib import pynutil
class DecimalFst(GraphFst):
"""
Finite state transducer for verbalizing decimal, e.g.
    decimal { negative: "true" integer_part: "12" fractional_part: "5006" quantity: "billion" } -> -12,5006 billion
Args:
tn_decimal_verbalizer: TN decimal verbalizer
"""
def __init__(self, tn_decimal_verbalizer: GraphFst, deterministic: bool = True):
super().__init__(name="decimal", kind="verbalize", deterministic=deterministic)
delete_space = pynutil.delete(" ")
optional_sign = pynini.closure(
pynutil.delete('negative: "') + DAMO_NOT_QUOTE + pynutil.delete('"') + delete_space,
0,
1,
)
optional_integer = pynini.closure(tn_decimal_verbalizer.integer, 0, 1)
optional_fractional = pynini.closure(
delete_space + pynutil.insert(",") + tn_decimal_verbalizer.fractional_default, 0, 1
)
graph = (
optional_integer + optional_fractional + tn_decimal_verbalizer.optional_quantity
).optimize()
self.numbers = optional_sign + graph
graph = self.numbers + delete_preserve_order
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_CHAR, GraphFst, delete_space
from pynini.lib import pynutil
class MeasureFst(GraphFst):
"""
Finite state transducer for verbalizing measure, e.g.
measure { cardinal { negative: "true" integer: "12" } units: "kg" } -> -12 kg
measure { decimal { integer_part: "1/2" } units: "kg" } -> 1/2 kg
measure { decimal { integer_part: "1" fractional_part: "2" quantity: "million" } units: "kg" } -> 1,2 million kg
Args:
decimal: ITN Decimal verbalizer
cardinal: ITN Cardinal verbalizer
"""
def __init__(self, decimal: GraphFst, cardinal: GraphFst, deterministic: bool = True):
super().__init__(name="measure", kind="verbalize", deterministic=deterministic)
optional_sign = pynini.closure(pynini.cross('negative: "true"', "-"), 0, 1)
unit = (
pynutil.delete("units:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
+ delete_space
)
graph_decimal = (
pynutil.delete("decimal {")
+ delete_space
+ optional_sign
+ delete_space
+ decimal.numbers
+ delete_space
+ pynutil.delete("}")
)
graph_cardinal = (
pynutil.delete("cardinal {")
+ delete_space
+ optional_sign
+ delete_space
+ cardinal.numbers
+ delete_space
+ pynutil.delete("}")
)
graph = (graph_cardinal | graph_decimal) + delete_space + pynutil.insert(" ") + unit
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_CHAR, GraphFst, delete_space
from pynini.lib import pynutil
class MoneyFst(GraphFst):
"""
Finite state transducer for verbalizing money, e.g.
    money { integer_part: "12" fractional_part: "05" currency: "$" } -> $12,05
Args:
decimal: ITN Decimal verbalizer
"""
def __init__(self, decimal: GraphFst, deterministic: bool = True):
super().__init__(name="money", kind="verbalize", deterministic=deterministic)
unit = (
pynutil.delete("currency:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
)
graph = unit + delete_space + decimal.numbers
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_ALPHA,
DAMO_DIGIT,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class TimeFst(GraphFst):
"""
Finite state transducer for verbalizing time, e.g.
time { hours: "8" minutes: "30" zone: "e s t" } -> 08:30 Uhr est
time { hours: "8" } -> 8 Uhr
time { hours: "8" minutes: "30" seconds: "10" } -> 08:30:10 Uhr
"""
def __init__(self, deterministic: bool = True):
super().__init__(name="time", kind="verbalize", deterministic=deterministic)
add_leading_zero_to_double_digit = (DAMO_DIGIT + DAMO_DIGIT) | (
pynutil.insert("0") + DAMO_DIGIT
)
hour = pynutil.delete('hours: "') + pynini.closure(DAMO_DIGIT, 1) + pynutil.delete('"')
minute = pynutil.delete('minutes: "') + pynini.closure(DAMO_DIGIT, 1) + pynutil.delete('"')
second = pynutil.delete('seconds: "') + pynini.closure(DAMO_DIGIT, 1) + pynutil.delete('"')
zone = (
pynutil.delete('zone: "')
+ pynini.closure(DAMO_ALPHA + delete_space)
+ DAMO_ALPHA
+ pynutil.delete('"')
)
optional_zone = pynini.closure(pynini.accep(" ") + zone, 0, 1)
graph = (
delete_space
+ pynutil.insert(":")
+ (minute @ add_leading_zero_to_double_digit)
+ pynini.closure(
delete_space + pynutil.insert(":") + (second @ add_leading_zero_to_double_digit),
0,
1,
)
+ pynutil.insert(" Uhr")
+ optional_zone
)
graph_h = hour + pynutil.insert(" Uhr") + optional_zone
graph_hm = hour @ add_leading_zero_to_double_digit + graph
graph_hms = hour @ add_leading_zero_to_double_digit + graph
final_graph = graph_hm | graph_hms | graph_h
self.fst = self.delete_tokens(final_graph).optimize()
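

# Usage sketch (illustration only): verbalize a tagged time value, mirroring
# the docstring examples above.
if __name__ == "__main__":
    from pynini.lib import rewrite

    time_fst = TimeFst().fst
    print(rewrite.top_rewrite('time { hours: "8" minutes: "30" }', time_fst))  # -> 08:30 Uhr
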
from fun_text_processing.inverse_text_normalization.de.verbalizers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.de.verbalizers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.de.verbalizers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.de.verbalizers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.de.verbalizers.time import TimeFst
from fun_text_processing.text_normalization.de.verbalizers.cardinal import (
CardinalFst as TNCardinalVerbalizer,
)
from fun_text_processing.text_normalization.de.verbalizers.decimal import (
DecimalFst as TNDecimalVerbalizer,
)
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
class VerbalizeFst(GraphFst):
"""
Composes other verbalizer grammars.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details on deployment can be found in NeMo/tools/text_processing_deployment.
"""
def __init__(self, deterministic: bool = True):
super().__init__(name="verbalize", kind="verbalize", deterministic=deterministic)
tn_cardinal_verbalizer = TNCardinalVerbalizer(deterministic=False)
tn_decimal_verbalizer = TNDecimalVerbalizer(deterministic=False)
cardinal = CardinalFst(tn_cardinal_verbalizer=tn_cardinal_verbalizer)
cardinal_graph = cardinal.fst
decimal = DecimalFst(tn_decimal_verbalizer=tn_decimal_verbalizer)
decimal_graph = decimal.fst
measure_graph = MeasureFst(decimal=decimal, cardinal=cardinal).fst
money_graph = MoneyFst(decimal=decimal).fst
time_graph = TimeFst().fst
graph = time_graph | money_graph | measure_graph | decimal_graph | cardinal_graph
self.fst = graph
import pynini
from fun_text_processing.inverse_text_normalization.de.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.inverse_text_normalization.en.verbalizers.word import WordFst
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
class VerbalizeFinalFst(GraphFst):
"""
Finite state transducer that verbalizes an entire sentence, e.g.
tokens { name: "jetzt" } tokens { name: "ist" } tokens { time { hours: "12" minutes: "30" } } -> jetzt ist 12:30 Uhr
"""
def __init__(self, deterministic: bool = True):
super().__init__(name="verbalize_final", kind="verbalize", deterministic=deterministic)
verbalize = VerbalizeFst().fst
word = WordFst().fst
types = verbalize | word
graph = (
pynutil.delete("tokens")
+ delete_space
+ pynutil.delete("{")
+ delete_space
+ types
+ delete_space
+ pynutil.delete("}")
)
graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
self.fst = graph
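

# End-to-end sketch (illustration only): run the German tagger and feed its
# output to this verbalizer. The tagger import path is assumed to mirror the
# en module layout used elsewhere in this commit. A production runner would
# normally parse and re-serialize (and possibly reorder) the token string
# between the two stages; direct composition is enough for this simple example.
if __name__ == "__main__":
    from pynini.lib import rewrite
    from fun_text_processing.inverse_text_normalization.de.taggers.tokenize_and_classify import (
        ClassifyFst,
    )

    tagger = ClassifyFst(cache_dir=None)
    verbalizer = VerbalizeFinalFst()
    tagged = rewrite.top_rewrite("jetzt ist zwölf uhr dreißig", tagger.fst)
    print(rewrite.top_rewrite(tagged, verbalizer.fst))  # expected: jetzt ist 12:30 Uhr
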
from fun_text_processing.inverse_text_normalization.en.taggers.tokenize_and_classify import (
ClassifyFst,
)
from fun_text_processing.inverse_text_normalization.en.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.inverse_text_normalization.en.verbalizers.verbalize_final import (
VerbalizeFinalFst,
)
from argparse import ArgumentParser
from typing import List
import regex as re
from fun_text_processing.text_normalization.data_loader_utils import (
EOS_TYPE,
Instance,
load_files,
training_data_to_sentences,
)
"""
This file is for evaluation purposes.
filter_loaded_data() cleans data (list of instances) for inverse text normalization. Filters and cleaners can be specified for each semiotic class individually.
For example, normalized text should only contain letters and whitespace, but no punctuation.
Cardinal unnormalized instances must contain at least one digit; all other characters are removed.
"""
class Filter:
"""
Filter class
Args:
class_type: semiotic class used in dataset
process_func: function to transform text
filter_func: function to filter text
"""
def __init__(self, class_type: str, process_func: object, filter_func: object):
self.class_type = class_type
self.process_func = process_func
self.filter_func = filter_func
def filter(self, instance: Instance) -> bool:
"""
filter function
Args:
filters given instance with filter function
Returns: True if given instance fulfills criteria or does not belong to class type
"""
if instance.token_type != self.class_type:
return True
return self.filter_func(instance)
def process(self, instance: Instance) -> Instance:
"""
process function
Args:
processes given instance with process function
Returns: processed instance if instance belongs to expected class type or original instance
"""
if instance.token_type != self.class_type:
return instance
return self.process_func(instance)
def filter_cardinal_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_cardinal_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
un_normalized = re.sub(r"[^0-9]", "", un_normalized)
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_ordinal_1(instance: Instance) -> bool:
ok = re.search(r"(st|nd|rd|th)\s*$", instance.un_normalized)
    return ok is not None
def process_ordinal_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
un_normalized = re.sub(r"[,\s]", "", un_normalized)
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_decimal_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_decimal_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
un_normalized = re.sub(r",", "", un_normalized)
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_measure_1(instance: Instance) -> bool:
ok = True
return ok
def process_measure_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
un_normalized = re.sub(r",", "", un_normalized)
un_normalized = re.sub(r"m2", "m²", un_normalized)
un_normalized = re.sub(r"(\d)([^\d.\s])", r"\1 \2", un_normalized)
normalized = re.sub(r"[^a-z\s]", "", normalized)
normalized = re.sub(r"per ([a-z\s]*)s$", r"per \1", normalized)
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_money_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_money_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
un_normalized = re.sub(r",", "", un_normalized)
un_normalized = re.sub(r"a\$", r"$", un_normalized)
un_normalized = re.sub(r"us\$", r"$", un_normalized)
un_normalized = re.sub(r"(\d)m\s*$", r"\1 million", un_normalized)
un_normalized = re.sub(r"(\d)bn?\s*$", r"\1 billion", un_normalized)
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
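# For illustration: with the rewrites in process_money_1 above, an un_normalized
# value like "us$3bn" becomes "$3 billion", and "a$1,200" becomes "$1200".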
def filter_time_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_time_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
un_normalized = re.sub(r": ", ":", un_normalized)
un_normalized = re.sub(r"(\d)\s?a\s?m\s?", r"\1 a.m.", un_normalized)
un_normalized = re.sub(r"(\d)\s?p\s?m\s?", r"\1 p.m.", un_normalized)
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_plain_1(instance: Instance) -> bool:
ok = True
return ok
def process_plain_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_punct_1(instance: Instance) -> bool:
ok = True
return ok
def process_punct_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_date_1(instance: Instance) -> bool:
ok = True
return ok
def process_date_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
un_normalized = re.sub(r",", "", un_normalized)
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_letters_1(instance: Instance) -> bool:
ok = True
return ok
def process_letters_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_verbatim_1(instance: Instance) -> bool:
ok = True
return ok
def process_verbatim_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_digit_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_digit_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_telephone_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_telephone_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_electronic_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_electronic_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_fraction_1(instance: Instance) -> bool:
ok = re.search(r"[0-9]", instance.un_normalized)
    return ok is not None
def process_fraction_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
def filter_address_1(instance: Instance) -> bool:
ok = True
return ok
def process_address_1(instance: Instance) -> Instance:
un_normalized = instance.un_normalized
normalized = instance.normalized
normalized = re.sub(r"[^a-z ]", "", normalized)
return Instance(
token_type=instance.token_type, un_normalized=un_normalized, normalized=normalized
)
filters = []
filters.append(
Filter(class_type="CARDINAL", process_func=process_cardinal_1, filter_func=filter_cardinal_1)
)
filters.append(
Filter(class_type="ORDINAL", process_func=process_ordinal_1, filter_func=filter_ordinal_1)
)
filters.append(
Filter(class_type="DECIMAL", process_func=process_decimal_1, filter_func=filter_decimal_1)
)
filters.append(
Filter(class_type="MEASURE", process_func=process_measure_1, filter_func=filter_measure_1)
)
filters.append(Filter(class_type="MONEY", process_func=process_money_1, filter_func=filter_money_1))
filters.append(Filter(class_type="TIME", process_func=process_time_1, filter_func=filter_time_1))
filters.append(Filter(class_type="DATE", process_func=process_date_1, filter_func=filter_date_1))
filters.append(Filter(class_type="PLAIN", process_func=process_plain_1, filter_func=filter_plain_1))
filters.append(Filter(class_type="PUNCT", process_func=process_punct_1, filter_func=filter_punct_1))
filters.append(
Filter(class_type="LETTERS", process_func=process_letters_1, filter_func=filter_letters_1)
)
filters.append(
Filter(class_type="VERBATIM", process_func=process_verbatim_1, filter_func=filter_verbatim_1)
)
filters.append(Filter(class_type="DIGIT", process_func=process_digit_1, filter_func=filter_digit_1))
filters.append(
Filter(class_type="TELEPHONE", process_func=process_telephone_1, filter_func=filter_telephone_1)
)
filters.append(
Filter(
class_type="ELECTRONIC", process_func=process_electronic_1, filter_func=filter_electronic_1
)
)
filters.append(
Filter(class_type="FRACTION", process_func=process_fraction_1, filter_func=filter_fraction_1)
)
filters.append(
Filter(class_type="ADDRESS", process_func=process_address_1, filter_func=filter_address_1)
)
filters.append(Filter(class_type=EOS_TYPE, process_func=lambda x: x, filter_func=lambda x: True))
def filter_loaded_data(data: List[Instance], verbose: bool = False) -> List[Instance]:
"""
Filters list of instances
    Args:
        data: list of instances
        verbose: if True, print each processed instance
    Returns: filtered and transformed list of instances
"""
updates_instances = []
for instance in data:
updated_instance = False
for fil in filters:
if fil.class_type == instance.token_type and fil.filter(instance):
instance = fil.process(instance)
updated_instance = True
if updated_instance:
if verbose:
print(instance)
updates_instances.append(instance)
return updates_instances
def parse_args():
parser = ArgumentParser()
parser.add_argument(
"--input", help="input file path", type=str, default="./en_with_types/output-00001-of-00100"
)
parser.add_argument("--verbose", help="print filtered instances", action="store_true")
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
file_path = args.input
print("Loading training data: " + file_path)
instance_list = load_files([file_path]) # List of instances
filtered_instance_list = filter_loaded_data(instance_list, args.verbose)
training_data_to_sentences(filtered_instance_list)
$ dollar
$ us dollar
$ united states dollar
£ british pound
€ euro
₩ won
nzd new zealand dollar
rs rupee
chf swiss franc
dkk danish kroner
fim finnish markka
aed arab emirates dirham
¥ yen
czk czech koruna
mro mauritanian ouguiya
pkr pakistani rupee
crc costa rican colon
hk$ hong kong dollar
npr nepalese rupee
awg aruban florin
nok norwegian kroner
tzs tanzanian shilling
sek swedish kronor
cyp cypriot pound
r real
sar saudi riyal
cve cape verde escudo
rsd serbian dinar
dm german mark
shp saint helena pounds
php philippine peso
cad canadian dollar
ssp south sudanese pound
scr seychelles rupee
mvr maldivian rufiyaa
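
# Sketch (illustration only) of how a two-column list like the one above is
# typically compiled into an FST. The file name "currency.tsv" is a placeholder
# and the columns are assumed to be tab-separated (symbol or abbreviation,
# then spoken form).
import pynini
from pynini.lib import rewrite

currency = pynini.string_file("currency.tsv")  # e.g. "€" -> "euro"
spoken_to_symbol = pynini.invert(currency)     # e.g. "euro" -> "€"
print(rewrite.top_rewrite("euro", spoken_to_symbol))  # expected: €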