import pynini
from fun_text_processing.inverse_text_normalization.pt.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class TimeFst(GraphFst):
"""
Finite state transducer for classifying time
    e.g. às quinze pro meio dia -> time { hours: "11" minutes: "45" morphosyntactic_features: "às" }
    e.g. às quinze pra meia noite -> time { hours: "23" minutes: "45" morphosyntactic_features: "às" }
    e.g. às quinze pra uma -> time { hours: "12" minutes: "45" morphosyntactic_features: "às" }
    e.g. às dez pras duas -> time { hours: "1" minutes: "50" morphosyntactic_features: "às" }
    e.g. às quinze pras duas -> time { hours: "1" minutes: "45" morphosyntactic_features: "às" }
e.g. ao meio dia -> time { hours: "12" minutes: "00" morphosyntactic_features: "ao" }
e.g. ao meio dia e meia -> time { hours: "12" minutes: "30" morphosyntactic_features: "ao" }
e.g. ao meio dia e meio -> time { hours: "12" minutes: "30" morphosyntactic_features: "ao" }
e.g. à meia noite e quinze -> time { hours: "0" minutes: "15" morphosyntactic_features: "à" }
e.g. à meia noite e meia -> time { hours: "0" minutes: "30" morphosyntactic_features: "à" }
e.g. à uma e trinta -> time { hours: "1" minutes: "30" morphosyntactic_features: "à" }
e.g. às onze e trinta -> time { hours: "11" minutes: "30" morphosyntactic_features: "às" }
e.g. às três horas e trinta minutos -> time { hours: "3" minutes: "30" morphosyntactic_features: "às" }
"""
def __init__(self):
super().__init__(name="time", kind="classify")
# graph_hour_to_am = pynini.string_file(get_abs_path("data/time/hour_to_am.tsv"))
# graph_hour_to_pm = pynini.string_file(get_abs_path("data/time/hour_to_pm.tsv"))
graph_hours_to = pynini.string_file(get_abs_path("data/time/hours_to.tsv"))
graph_minutes_to = pynini.string_file(get_abs_path("data/time/minutes_to.tsv"))
graph_suffix_am = pynini.string_file(get_abs_path("data/time/time_suffix_am.tsv"))
graph_suffix_pm = pynini.string_file(get_abs_path("data/time/time_suffix_pm.tsv"))
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
graph_twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv"))
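        # Spoken cardinals 1-100 rendered as digit strings, e.g. "trinta e cinco" -> "35"
        # (ties + "e" + digit) and bare "trinta" -> "30" (ties with an appended zero).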
graph_1_to_100 = pynini.union(
graph_digit,
graph_twenties,
graph_teen,
(graph_ties + pynutil.insert("0")),
(graph_ties + pynutil.delete(" e ") + graph_digit),
)
        # Note that graph_2_to_23 starts at 2: "one o'clock" ("uma") is handled
        # separately because it is grammatically singular.
digits_2_to_23 = [str(digits) for digits in range(2, 24)]
digits_1_to_59 = [str(digits) for digits in range(1, 60)]
graph_2_to_23 = graph_1_to_100 @ pynini.union(*digits_2_to_23)
graph_1_to_59 = graph_1_to_100 @ pynini.union(*digits_1_to_59)
graph_uma = pynini.cross("uma", "1")
# Mapping 'horas'
graph_hour = pynutil.delete(pynini.accep("hora") + pynini.accep("s").ques)
graph_minute = pynutil.delete(pynini.accep("minuto") + pynini.accep("s").ques)
# Mapping 'meio dia' and 'meia noite'
graph_meio_dia = pynini.cross("meio dia", "12")
graph_meia_noite = pynini.cross("meia noite", "0")
# Mapping 'e meia'
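        # Both "e meia" and "e meio" read as "and a half" (30 minutes); the masculine
        # "meio" agrees with "meio dia" (see graph_meio_min below).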
graph_e = delete_space + pynutil.delete(" e ") + delete_space
graph_e_meia = graph_e + pynini.cross("meia", "30")
graph_e_meio = graph_e + pynini.cross("meio", "30")
# à uma e meia -> 1:30
# às três e meia -> 3:30
graph_hours_at_prefix_singular = (
pynutil.insert('morphosyntactic_features: "')
+ (pynini.cross("à", "à") | pynini.cross("a", "à"))
+ pynutil.insert('" ')
+ delete_space
)
graph_hours_at_singular = (
graph_hours_at_prefix_singular
+ pynutil.insert('hours: "')
+ graph_uma
+ pynutil.insert('"')
+ (delete_space + graph_hour).ques
)
graph_hours_at_prefix_plural = (
pynutil.insert('morphosyntactic_features: "')
+ (pynini.cross("às", "às") | pynini.cross("as", "às"))
+ pynutil.insert('" ')
+ delete_space
)
graph_hours_at_plural = (
graph_hours_at_prefix_plural
+ pynutil.insert('hours: "')
+ graph_2_to_23
+ pynutil.insert('"')
+ (delete_space + graph_hour).ques
)
final_graph_hour_at = graph_hours_at_singular | graph_hours_at_plural
graph_minutes_component_without_zero = (
graph_e + graph_1_to_59 + (delete_space + graph_minute).ques
)
graph_minutes_component_without_zero |= (
graph_e_meia + pynutil.delete(delete_space + pynini.accep("hora")).ques
)
final_graph_minute = (
pynutil.insert(' minutes: "')
+ graph_minutes_component_without_zero
+ pynutil.insert('"')
)
graph_hm = final_graph_hour_at + final_graph_minute
# à uma hora -> 1:00
graph_hours_at_singular_with_hour = (
graph_hours_at_prefix_singular
+ pynutil.insert('hours: "')
+ graph_uma
+ pynutil.insert('"')
+ delete_space
+ graph_hour
)
graph_hours_at_plural_with_hour = (
graph_hours_at_prefix_plural
+ pynutil.insert('hours: "')
+ graph_2_to_23
+ pynutil.insert('"')
+ delete_space
+ graph_hour
)
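        # Inserting the default minutes "00" carries a small extra weight, so an
        # analysis that consumes explicit minute words is preferred when one exists.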
graph_hm |= (
graph_hours_at_singular_with_hour | graph_hours_at_plural_with_hour
) + pynutil.insert(' minutes: "00"', weight=0.2)
# meio dia e meia -> 12:30
# meia noite e meia -> 0:30
graph_minutes_without_zero = (
pynutil.insert(' minutes: "')
+ graph_minutes_component_without_zero
+ pynutil.insert('"')
)
graph_meio_min = (
pynutil.insert('hours: "')
+ (graph_meio_dia | graph_meia_noite)
+ pynutil.insert('"')
+ graph_minutes_without_zero
)
graph_meio_min |= (
pynutil.insert('hours: "')
+ graph_meio_dia
+ pynutil.insert('" minutes: "')
+ graph_e_meio
+ pynutil.insert('"')
)
graph_hm |= graph_meio_min
        # às quinze para as quatro -> às 3:45
        # NOTE: "para a uma" ("to one") is ambiguous between 0:XX and 12:XX, which
        # can lead to a wrong reading ("meio dia e ..." vs. "meia noite e ...")
graph_para_a = (
pynutil.delete("para")
| pynutil.delete("para a")
| pynutil.delete("para as")
| pynutil.delete("pra")
| pynutil.delete("pras")
)
graph_para_o = pynutil.delete("para") | pynutil.delete("para o") | pynutil.delete("pro")
graph_pra_min = (
pynutil.insert('morphosyntactic_features: "')
+ (
pynini.cross("à", "à")
| pynini.cross("às", "às")
| pynini.cross("a", "à")
| pynini.cross("as", "às")
)
+ pynutil.insert('" ')
+ delete_space
)
graph_pra_min += (
pynutil.insert('minutes: "')
+ (graph_1_to_59 @ graph_minutes_to)
+ pynutil.insert('" ')
+ (delete_space + graph_minute).ques
)
graph_pra_hour = (
pynutil.insert('hours: "')
+ (graph_2_to_23 @ graph_hours_to)
+ pynutil.insert('"')
+ (delete_space + graph_hour).ques
)
graph_pra_hour |= (
pynutil.insert('hours: "') + (graph_meia_noite @ graph_hours_to) + pynutil.insert('"')
)
graph_pra = graph_pra_min + delete_space + graph_para_a + delete_space + graph_pra_hour
# às quinze pro meio dia -> às 11:45
graph_pro = graph_pra_min + delete_space + graph_para_o + delete_space
graph_pro += (
pynutil.insert(' hours: "') + (graph_meio_dia @ graph_hours_to) + pynutil.insert('"')
)
graph_mh = graph_pra | graph_pro
# optional suffix
final_suffix = (
pynutil.insert('suffix: "') + (graph_suffix_am | graph_suffix_pm) + pynutil.insert('"')
)
final_suffix_optional = pynini.closure(delete_space + insert_space + final_suffix, 0, 1)
final_graph = pynini.union((graph_hm | graph_mh) + final_suffix_optional).optimize()
final_graph = self.add_tokens(final_graph)
self.fst = final_graph.optimize()
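# A hedged usage sketch (an addition, not part of the original grammar): the tagger
# can be exercised on its own with standard pynini calls, assuming the byte-mode
# FSTs used throughout this package.
def _demo_time_tagger(text: str = "às onze e trinta") -> str:
    # Compose the input with the tagger and keep the cheapest path; the result is
    # a serialized token such as
    # 'time { morphosyntactic_features: "às" hours: "11" minutes: "30" }'.
    lattice = pynini.compose(text.lower(), TimeFst().fst)
    return pynini.shortestpath(lattice).string()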
import os
import pynini
from fun_text_processing.inverse_text_normalization.pt.taggers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.pt.taggers.date import DateFst
from fun_text_processing.inverse_text_normalization.pt.taggers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.pt.taggers.electronic import ElectronicFst
from fun_text_processing.inverse_text_normalization.pt.taggers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.pt.taggers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.pt.taggers.ordinal import OrdinalFst
from fun_text_processing.inverse_text_normalization.pt.taggers.punctuation import PunctuationFst
from fun_text_processing.inverse_text_normalization.pt.taggers.telephone import TelephoneFst
from fun_text_processing.inverse_text_normalization.pt.taggers.time import TimeFst
from fun_text_processing.inverse_text_normalization.pt.taggers.whitelist import WhiteListFst
from fun_text_processing.inverse_text_normalization.pt.taggers.word import WordFst
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_extra_space,
delete_space,
generator_main,
)
from pynini.lib import pynutil
import logging
class ClassifyFst(GraphFst):
"""
    Final class that composes all other classification grammars. This class can process an entire lower-cased sentence.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details on deployment are in NeMo/tools/text_processing_deployment.
Args:
cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
overwrite_cache: set to True to overwrite .far files
"""
def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
super().__init__(name="tokenize_and_classify", kind="classify")
far_file = None
if cache_dir is not None and cache_dir != "None":
os.makedirs(cache_dir, exist_ok=True)
far_file = os.path.join(cache_dir, "_pt_itn.far")
if not overwrite_cache and far_file and os.path.exists(far_file):
self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
logging.info(f"ClassifyFst.fst was restored from {far_file}.")
else:
logging.info(f"Creating ClassifyFst grammars.")
cardinal = CardinalFst(use_strict_e=True)
cardinal_graph = cardinal.fst
ordinal_graph = OrdinalFst().fst
decimal = DecimalFst(cardinal)
decimal_graph = decimal.fst
measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
date_graph = DateFst(cardinal=cardinal).fst
word_graph = WordFst().fst
time_graph = TimeFst().fst
money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
whitelist_graph = WhiteListFst().fst
punct_graph = PunctuationFst().fst
electronic_graph = ElectronicFst().fst
telephone_graph = TelephoneFst().fst
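            # Lower weights win under shortest-path decoding, so the specific grammars
            # (whitelist, time, date, ...) outrank the generic cardinal graph, and the
            # catch-all word graph (weight 100) fires only when nothing else matches.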
classify = (
pynutil.add_weight(whitelist_graph, 1.01)
| pynutil.add_weight(time_graph, 1.09)
| pynutil.add_weight(date_graph, 1.09)
| pynutil.add_weight(decimal_graph, 1.09)
| pynutil.add_weight(measure_graph, 1.1)
| pynutil.add_weight(cardinal_graph, 1.1)
| pynutil.add_weight(ordinal_graph, 1.1)
| pynutil.add_weight(money_graph, 1.1)
| pynutil.add_weight(telephone_graph, 1.1)
| pynutil.add_weight(electronic_graph, 1.1)
| pynutil.add_weight(word_graph, 100)
)
punct = (
pynutil.insert("tokens { ")
+ pynutil.add_weight(punct_graph, weight=1.1)
+ pynutil.insert(" }")
)
token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
token_plus_punct = (
pynini.closure(punct + pynutil.insert(" "))
+ token
+ pynini.closure(pynutil.insert(" ") + punct)
)
graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
graph = delete_space + graph + delete_space
self.fst = graph.optimize()
if far_file:
generator_main(far_file, {"tokenize_and_classify": self.fst})
logging.info(f"ClassifyFst grammars are saved to {far_file}.")
import pynini
from fun_text_processing.inverse_text_normalization.pt.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space
from pynini.lib import pynutil
class WhiteListFst(GraphFst):
"""
    Finite state transducer for classifying whitelisted tokens,
    e.g. sexta feira -> tokens { name: "sexta-feira" }
    This class has the highest priority among all classifier grammars. Whitelisted tokens are defined in and loaded from "data/whitelist.tsv".
"""
def __init__(self):
super().__init__(name="whitelist", kind="classify")
whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
graph = pynutil.insert('name: "') + convert_space(whitelist) + pynutil.insert('"')
self.fst = graph.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_SPACE, GraphFst
from pynini.lib import pynutil
class WordFst(GraphFst):
"""
    Finite state transducer for classifying plain tokens that do not belong to any special class; it can be considered the default class.
e.g. sleep -> tokens { name: "sleep" }
"""
def __init__(self):
super().__init__(name="word", kind="classify")
word = pynutil.insert('name: "') + pynini.closure(DAMO_NOT_SPACE, 1) + pynutil.insert('"')
self.fst = word.optimize()
import os
def get_abs_path(rel_path):
"""
    Get an absolute path from a path given relative to this module's directory.
    Args:
        rel_path: path relative to this module's directory
    Returns the absolute path.
    """
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
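# For example, get_abs_path("data/numbers/digit.tsv") resolves against this
# module's directory, wherever the package is installed.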
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class CardinalFst(GraphFst):
"""
Finite state transducer for verbalizing cardinal
e.g. cardinal { negative: "-" integer: "23" } -> -23
"""
def __init__(self):
super().__init__(name="cardinal", kind="verbalize")
optional_sign = pynini.closure(
pynutil.delete("negative:")
+ delete_space
+ pynutil.delete('"')
+ DAMO_NOT_QUOTE
+ pynutil.delete('"')
+ delete_space,
0,
1,
)
graph = (
pynutil.delete("integer:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
self.numbers = graph
graph = optional_sign + graph
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_extra_space,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class DateFst(GraphFst):
"""
Finite state transducer for verbalizing date, e.g.
date { day: "1" month: "enero" preserve_order: true } -> 1 de enero
"""
def __init__(self):
super().__init__(name="date", kind="verbalize")
month = (
pynutil.delete("month:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
day = (
pynutil.delete("day:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
year = (
pynutil.delete("year:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
# day month
graph_dmy = (
day
+ delete_extra_space
+ pynutil.insert("de")
+ insert_space
+ month
+ (delete_extra_space + pynutil.insert("de") + insert_space + year).ques
)
graph_dmy |= (
day
+ delete_space
+ pynutil.insert("/")
+ month
+ pynutil.delete(' morphosyntactic_features: "/"')
+ (delete_space + pynutil.insert("/") + year).ques
)
optional_preserve_order = pynini.closure(
pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
| pynutil.delete("field_order:")
+ delete_space
+ pynutil.delete('"')
+ DAMO_NOT_QUOTE
+ pynutil.delete('"')
+ delete_space
)
final_graph = graph_dmy + delete_space + optional_preserve_order
delete_tokens = self.delete_tokens(final_graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class DecimalFst(GraphFst):
"""
Finite state transducer for verbalizing decimal,
e.g. decimal { negative: "true" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" } -> -1,26
e.g. decimal { negative: "true" integer_part: "1" morphosyntactic_features: "." fractional_part: "26" } -> -1.26
e.g. decimal { negative: "false" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" quantity: "millón" } -> 1,26 millón
e.g. decimal { negative: "false" integer_part: "2" quantity: "millones" } -> 2 millones
"""
def __init__(self):
super().__init__(name="decimal", kind="verbalize")
        optional_sign = pynini.closure(pynini.cross('negative: "true"', "-") + delete_space, 0, 1)
integer = (
pynutil.delete("integer_part:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
optional_integer = pynini.closure(integer + delete_space, 0, 1)
decimal_point = pynini.cross('morphosyntactic_features: ","', ",")
decimal_point |= pynini.cross('morphosyntactic_features: "."', ".")
fractional = (
decimal_point
+ delete_space
+ pynutil.delete("fractional_part:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
optional_fractional = pynini.closure(fractional + delete_space, 0, 1)
quantity = (
pynutil.delete("quantity:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
optional_quantity = pynini.closure(pynutil.insert(" ") + quantity + delete_space, 0, 1)
graph = optional_integer + optional_fractional + optional_quantity
self.numbers = graph
        graph = optional_sign + graph
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
"""
Finite state transducer for verbalizing electronic
e.g. tokens { electronic { username: "cdf1" domain: "abc.edu" } } -> cdf1@abc.edu
e.g. tokens { electronic { protocol: "www.abc.edu" } } -> www.abc.edu
"""
def __init__(self):
super().__init__(name="electronic", kind="verbalize")
user_name = (
pynutil.delete("username:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
domain = (
pynutil.delete("domain:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
protocol = (
pynutil.delete("protocol:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
graph = user_name + delete_space + pynutil.insert("@") + domain
graph |= protocol
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_CHAR, GraphFst, delete_space
from pynini.lib import pynutil
class MeasureFst(GraphFst):
"""
Finite state transducer for verbalizing measure, e.g.
measure { cardinal { negative: "true" integer: "12" } units: "kg" } -> -12 kg
Args:
decimal: DecimalFst
cardinal: CardinalFst
"""
def __init__(self, decimal: GraphFst, cardinal: GraphFst):
super().__init__(name="measure", kind="verbalize")
optional_sign = pynini.closure(pynini.cross('negative: "true"', "-"), 0, 1)
unit = (
pynutil.delete("units:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
+ delete_space
)
graph_decimal = (
pynutil.delete("decimal {")
+ delete_space
+ optional_sign
+ delete_space
+ decimal.numbers
+ delete_space
+ pynutil.delete("}")
)
graph_cardinal = (
pynutil.delete("cardinal {")
+ delete_space
+ optional_sign
+ delete_space
+ cardinal.numbers
+ delete_space
+ pynutil.delete("}")
)
graph = (graph_cardinal | graph_decimal) + delete_space + pynutil.insert(" ") + unit
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_CHAR,
GraphFst,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class MoneyFst(GraphFst):
"""
Finite state transducer for verbalizing money, e.g.
money { integer_part: "12" morphosyntactic_features: "," fractional_part: "05" currency: "$" } -> $12,05
Args:
decimal: DecimalFst
"""
def __init__(self, decimal: GraphFst):
super().__init__(name="money", kind="verbalize")
unit = (
pynutil.delete("currency:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
)
graph = unit + delete_space + insert_space + decimal.numbers
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class OrdinalFst(GraphFst):
"""
Finite state transducer for verbalizing ordinal, e.g.
ordinal { integer: "13" morphosyntactic_features: "o" } -> 13º
"""
def __init__(self):
super().__init__(name="ordinal", kind="verbalize")
graph = (
pynutil.delete("integer:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
replace_suffix = pynini.union(
pynini.cross(' morphosyntactic_features: "o"', "º"),
pynini.cross(' morphosyntactic_features: "a"', "ª"),
)
graph = graph + replace_suffix
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class TelephoneFst(GraphFst):
"""
Finite state transducer for verbalizing telephone, e.g.
telephone { number_part: "123-123-5678" }
-> 123-123-5678
"""
def __init__(self):
super().__init__(name="telephone", kind="verbalize")
number_part = (
pynutil.delete('number_part: "')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
delete_tokens = self.delete_tokens(number_part)
self.fst = delete_tokens.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_DIGIT,
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class TimeFst(GraphFst):
"""
Finite state transducer for verbalizing time,
e.g. time { hours: "à 1" minutes: "10" } -> à 1:10
e.g. time { hours: "às 2" minutes: "45" } -> às 2:45
"""
def __init__(self):
super().__init__(name="time", kind="verbalize")
add_leading_zero_to_double_digit = (DAMO_DIGIT + DAMO_DIGIT) | (
pynutil.insert("0") + DAMO_DIGIT
)
prefix = (
pynutil.delete("morphosyntactic_features:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
+ delete_space
+ insert_space
)
optional_prefix = pynini.closure(prefix, 0, 1)
hour = (
pynutil.delete("hours:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_DIGIT, 1)
+ pynutil.delete('"')
)
minute = (
pynutil.delete("minutes:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_DIGIT, 1)
+ pynutil.delete('"')
)
suffix = (
delete_space
+ insert_space
+ pynutil.delete("suffix:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
optional_suffix = pynini.closure(suffix, 0, 1)
graph = (
optional_prefix
+ hour
+ delete_space
+ pynutil.insert(":")
+ (minute @ add_leading_zero_to_double_digit)
+ optional_suffix
)
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
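# A hedged usage sketch (an addition): verbalizing one serialized time token. The
# field order must match the grammar above (optional prefix, hours, minutes).
def _demo_time_verbalizer() -> str:
    tag = 'time { morphosyntactic_features: "às" hours: "2" minutes: "45" }'
    lattice = pynini.compose(tag, TimeFst().fst)
    # -> "às 2:45"
    return pynini.shortestpath(lattice).string()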
from fun_text_processing.inverse_text_normalization.pt.verbalizers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.date import DateFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.electronic import ElectronicFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.ordinal import OrdinalFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.telephone import TelephoneFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.time import TimeFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.whitelist import WhiteListFst
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
class VerbalizeFst(GraphFst):
"""
Composes other verbalizer grammars.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details on deployment are in NeMo/tools/text_processing_deployment.
"""
def __init__(self):
super().__init__(name="verbalize", kind="verbalize")
cardinal = CardinalFst()
cardinal_graph = cardinal.fst
ordinal_graph = OrdinalFst().fst
decimal = DecimalFst()
decimal_graph = decimal.fst
measure_graph = MeasureFst(decimal=decimal, cardinal=cardinal).fst
money_graph = MoneyFst(decimal=decimal).fst
time_graph = TimeFst().fst
date_graph = DateFst().fst
whitelist_graph = WhiteListFst().fst
telephone_graph = TelephoneFst().fst
electronic_graph = ElectronicFst().fst
graph = (
time_graph
| date_graph
| money_graph
| measure_graph
| ordinal_graph
| decimal_graph
| cardinal_graph
| whitelist_graph
| telephone_graph
| electronic_graph
)
self.fst = graph
import pynini
from fun_text_processing.inverse_text_normalization.pt.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.word import WordFst
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
class VerbalizeFinalFst(GraphFst):
"""
Finite state transducer that verbalizes an entire sentence, e.g.
tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now
"""
def __init__(self):
super().__init__(name="verbalize_final", kind="verbalize")
verbalize = VerbalizeFst().fst
word = WordFst().fst
types = verbalize | word
graph = (
pynutil.delete("tokens")
+ delete_space
+ pynutil.delete("{")
+ delete_space
+ types
+ delete_space
+ pynutil.delete("}")
)
graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
self.fst = graph
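# A hedged end-to-end sketch (an addition, not part of the original file): chaining
# the tagger and this verbalizer by direct FST composition. The tagger's module
# path is assumed here; the production pipeline puts a token parser/permuter
# between the two stages, so direct composition only works when the emitted field
# order already matches what the verbalizers expect (as it does for time).
def _demo_inverse_normalize(text: str = "às onze e trinta") -> str:
    # Assumed import path for the ClassifyFst defined earlier in this commit.
    from fun_text_processing.inverse_text_normalization.pt.taggers.tokenize_and_classify import (
        ClassifyFst,
    )
    tagged = pynini.shortestpath(pynini.compose(text.lower(), ClassifyFst().fst)).string()
    # e.g. "às 11:30"
    return pynini.shortestpath(pynini.compose(tagged, VerbalizeFinalFst().fst)).string()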
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_CHAR,
DAMO_SIGMA,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class WhiteListFst(GraphFst):
"""
Finite state transducer for verbalizing whitelist
e.g. tokens { name: "sexta feira" } -> "sexta-feira"
"""
def __init__(self):
super().__init__(name="whitelist", kind="verbalize")
graph = (
pynutil.delete("name:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
)
graph = graph @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)
self.fst = graph.optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_CHAR,
DAMO_SIGMA,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class WordFst(GraphFst):
"""
Finite state transducer for verbalizing plain tokens
e.g. tokens { name: "sleep" } -> sleep
"""
def __init__(self):
super().__init__(name="word", kind="verbalize")
chars = pynini.closure(DAMO_CHAR - " ", 1)
char = (
pynutil.delete("name:")
+ delete_space
+ pynutil.delete('"')
+ chars
+ pynutil.delete('"')
)
graph = char @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)
self.fst = graph.optimize()