Commit 431278fa authored by “change”'s avatar “change”
Browse files

Initial commit

parent 8c252776
Pipeline #1949 failed with stages
in 0 seconds
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class CardinalFst(GraphFst):
    """
    Verbalizer graph for cardinal tokens: field names and quotes are dropped, values are kept,
        e.g. cardinal { negative: "-" integer: "23" } -> -23

    After construction, `numbers` holds the graph for the `integer:` field alone (no sign)
    and `fst` holds the optimized graph for the complete token.
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="verbalize")
        quote = pynutil.delete('"')

        # integer: "<one or more DAMO_NOT_QUOTE symbols>" -> the quoted content
        unsigned = (
            pynutil.delete("integer:")
            + delete_space
            + quote
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + quote
        )
        # Stored before the sign is attached so the sign-less graph stays available.
        self.numbers = unsigned

        # negative: "<a single DAMO_NOT_QUOTE match>" -> that content, passed through as written
        sign = (
            pynutil.delete("negative:")
            + delete_space
            + quote
            + DAMO_NOT_QUOTE
            + quote
            + delete_space
        )
        # The sign field may be missing entirely.
        signed = pynini.closure(sign, 0, 1) + unsigned

        self.fst = self.delete_tokens(signed).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_extra_space,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class DateFst(GraphFst):
    """
    Verbalizer graph for date tokens; the day and month values are joined with "de",
        e.g. date { day: "1" month: "enero" preserve_order: true } -> 1 de enero
    """

    def __init__(self):
        super().__init__(name="date", kind="verbalize")

        def quoted_value(key):
            # <key> "<value>" -> <value>
            return (
                pynutil.delete(key)
                + delete_space
                + pynutil.delete('"')
                + pynini.closure(DAMO_NOT_QUOTE, 1)
                + pynutil.delete('"')
            )

        day = quoted_value("day:")
        month = quoted_value("month:")

        # Only the day-then-month order is handled.
        day_month = day + delete_extra_space + pynutil.insert("de") + insert_space + month

        # Ordering hints that may trail the date fields; both kinds are deleted from the output.
        preserve_order = (
            pynutil.delete("preserve_order:")
            + delete_space
            + pynutil.delete("true")
            + delete_space
        )
        field_order = (
            pynutil.delete("field_order:")
            + delete_space
            + pynutil.delete('"')
            + DAMO_NOT_QUOTE
            + pynutil.delete('"')
            + delete_space
        )
        # Unbounded closure: any number of hints, in any order, or none.
        trailing_hints = pynini.closure(preserve_order | field_order)

        date = day_month + delete_space + trailing_hints
        self.fst = self.delete_tokens(date).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class DecimalFst(GraphFst):
    """
    Verbalizer graph for decimal tokens,
        e.g. decimal { negative: "true" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" } -> -1,26
        e.g. decimal { negative: "true" integer_part: "1" morphosyntactic_features: "." fractional_part: "26" } -> -1.26
        e.g. decimal { integer_part: "1" morphosyntactic_features: "," fractional_part: "26" quantity: "millón" } -> 1,26 millón
        e.g. decimal { integer_part: "2" quantity: "millones" } -> 2 millones

    After construction, `numbers` holds the sign-less graph and `fst` the optimized graph
    for the complete token.
    """

    def __init__(self):
        super().__init__(name="decimal", kind="verbalize")

        def unquote(field):
            # <field> "<value>" -> <value>
            return (
                pynutil.delete(field)
                + delete_space
                + pynutil.delete('"')
                + pynini.closure(DAMO_NOT_QUOTE, 1)
                + pynutil.delete('"')
            )

        # The separator to print is carried in morphosyntactic_features: either "," or ".".
        separator = pynini.cross('morphosyntactic_features: ","', ",") | pynini.cross(
            'morphosyntactic_features: "."', "."
        )

        integer_part = unquote("integer_part:")
        fractional_part = separator + delete_space + unquote("fractional_part:")
        # A quantity word is set off from the digits by one inserted space.
        quantity = pynutil.insert(" ") + unquote("quantity:")

        # Each of the three pieces is independently optional.
        unsigned = (
            pynini.closure(integer_part + delete_space, 0, 1)
            + pynini.closure(fractional_part + delete_space, 0, 1)
            + pynini.closure(quantity + delete_space, 0, 1)
        )
        self.numbers = unsigned

        # The only spelling of the sign field matched here is negative: "true", rewritten to "-".
        minus = pynini.closure(pynini.cross('negative: "true"', "-") + delete_space, 0, 1)

        self.fst = self.delete_tokens(minus + unsigned).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
    """
    Verbalizer graph for electronic tokens (e-mail style addresses and protocol strings),
        e.g. tokens { electronic { username: "cdf1" domain: "abc.edu" } } -> cdf1@abc.edu
        e.g. tokens { electronic { protocol: "www.abc.edu" } } -> www.abc.edu
    """

    def __init__(self):
        super().__init__(name="electronic", kind="verbalize")

        def field_value(key):
            # <key> "<value>" -> <value>
            return (
                pynutil.delete(key)
                + delete_space
                + pynutil.delete('"')
                + pynini.closure(DAMO_NOT_QUOTE, 1)
                + pynutil.delete('"')
            )

        # username + domain form an address; the "@" is not in the token and is inserted here.
        address = field_value("username:") + delete_space + pynutil.insert("@") + field_value("domain:")
        # A protocol field is emitted on its own, unchanged.
        electronic = address | field_value("protocol:")

        self.fst = self.delete_tokens(electronic).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_CHAR, GraphFst, delete_space
from pynini.lib import pynutil
class MeasureFst(GraphFst):
    """
    Verbalizer graph for measure tokens: a number followed by one space and the unit,
        e.g. measure { cardinal { negative: "true" integer: "12" } units: "kg" } -> -12 kg

    Args:
        decimal: DecimalFst (its `numbers` graph is reused)
        cardinal: CardinalFst (its `numbers` graph is reused)
    """

    def __init__(self, decimal: GraphFst, cardinal: GraphFst):
        super().__init__(name="measure", kind="verbalize")

        # negative: "true" -> "-"; the field may be absent.
        minus = pynini.closure(pynini.cross('negative: "true"', "-"), 0, 1)

        def unwrap(opening, numbers):
            # "<kind> {" [sign] <numbers> "}" -> [sign]<numbers>
            return (
                pynutil.delete(opening)
                + delete_space
                + minus
                + delete_space
                + numbers
                + delete_space
                + pynutil.delete("}")
            )

        cardinal_value = unwrap("cardinal {", cardinal.numbers)
        decimal_value = unwrap("decimal {", decimal.numbers)

        # units: "<value without spaces>" -> <value>
        units = (
            pynutil.delete("units:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_CHAR - " ", 1)
            + pynutil.delete('"')
            + delete_space
        )

        measure = (cardinal_value | decimal_value) + delete_space + pynutil.insert(" ") + units
        self.fst = self.delete_tokens(measure).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_CHAR, GraphFst, delete_space
from pynini.lib import pynutil
class MoneyFst(GraphFst):
    """
    Verbalizer graph for money tokens: the currency value is emitted first, then the amount,
        e.g. money { currency: "$" integer_part: "12" morphosyntactic_features: "," fractional_part: "05" } -> $12,05

    Args:
        decimal: DecimalFst (its `numbers` graph verbalizes the amount)
    """

    def __init__(self, decimal: GraphFst):
        super().__init__(name="money", kind="verbalize")
        quote = pynutil.delete('"')

        # currency: "<value without spaces>" -> <value>
        currency = (
            pynutil.delete("currency:")
            + delete_space
            + quote
            + pynini.closure(DAMO_CHAR - " ", 1)
            + quote
        )

        # The currency field must precede the amount fields in the token.
        money = currency + delete_space + decimal.numbers
        self.fst = self.delete_tokens(money).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class OrdinalFst(GraphFst):
    """
    Verbalizer graph for ordinal tokens: the number followed by an ordinal marker,
        e.g. ordinal { integer: "13" morphosyntactic_features: "o" } -> 13.º
    """

    def __init__(self):
        super().__init__(name="ordinal", kind="verbalize")

        # integer: "<value>" -> <value>
        number = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )

        # morphosyntactic_features value -> written ordinal marker.
        # The matched text starts with the single space that separates it from the integer field.
        markers = (("o", ".º"), ("a", ".ª"), ("er", ".ᵉʳ"))
        marker = pynini.union(
            *[
                pynini.cross(f' morphosyntactic_features: "{feature}"', written)
                for feature, written in markers
            ]
        )

        # The marker is mandatory: a token without one of the three features is not accepted.
        self.fst = self.delete_tokens(number + marker).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class TelephoneFst(GraphFst):
    """
    Verbalizer graph for telephone tokens: the quoted number is emitted unchanged,
        e.g. telephone { number_part: "123-123-5678" } -> 123-123-5678
    """

    def __init__(self):
        super().__init__(name="telephone", kind="verbalize")

        # The key, the single space after it and the opening quote are matched as one literal.
        opening = pynutil.delete('number_part: "')
        digits = pynini.closure(DAMO_NOT_QUOTE, 1)
        closing = pynutil.delete('"')

        telephone = opening + digits + closing
        self.fst = self.delete_tokens(telephone).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_CHAR,
DAMO_DIGIT,
GraphFst,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class TimeFst(GraphFst):
    """
    Verbalizer graph for time tokens: hours and minutes are joined with ":",
        e.g. time { hours: "la 1" minutes: "10" } -> la 1:10
        e.g. time { hours: "la 1" minutes: "45" } -> la 1:45
    """

    def __init__(self):
        super().__init__(name="time", kind="verbalize")
        quote = pynutil.delete('"')

        def spaced_field(key):
            # [spaces] <key> "<value without spaces>" -> " " + <value>
            return (
                delete_space
                + insert_space
                + pynutil.delete(key)
                + delete_space
                + pynutil.delete('"')
                + pynini.closure(DAMO_CHAR - " ", 1)
                + pynutil.delete('"')
            )

        # Accepts exactly one or two digits; a lone digit gets a "0" in front.
        two_digit = (DAMO_DIGIT + DAMO_DIGIT) | (pynutil.insert("0") + DAMO_DIGIT)

        # The hours value carries its article ("la " or "las "), which is kept in the output.
        hours = (
            pynutil.delete("hours:")
            + delete_space
            + quote
            + pynini.union("la ", "las ")
            + pynini.closure(DAMO_DIGIT, 1)
            + quote
        )
        minutes = (
            pynutil.delete("minutes:")
            + delete_space
            + quote
            + pynini.closure(DAMO_DIGIT, 1)
            + quote
        )

        time = (
            hours
            + delete_space
            + pynutil.insert(":")
            + (minutes @ two_digit)
            # suffix and zone are each optional and, when present, appear in this order
            + pynini.closure(spaced_field("suffix:"), 0, 1)
            + pynini.closure(spaced_field("zone:"), 0, 1)
        )
        self.fst = self.delete_tokens(time).optimize()
from fun_text_processing.inverse_text_normalization.es.verbalizers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.date import DateFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.electronic import ElectronicFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.ordinal import OrdinalFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.telephone import TelephoneFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.time import TimeFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.whitelist import WhiteListFst
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
class VerbalizeFst(GraphFst):
    """
    Union of the individual verbalizer grammars.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details on deployment at NeMo/tools/text_processing_deployment.
    """

    def __init__(self):
        super().__init__(name="verbalize", kind="verbalize")

        # Cardinal and decimal are kept as objects because measure and money reuse them.
        cardinal = CardinalFst()
        ordinal = OrdinalFst()
        decimal = DecimalFst()
        measure = MeasureFst(decimal=decimal, cardinal=cardinal)
        money = MoneyFst(decimal=decimal)
        time = TimeFst()
        date = DateFst()
        whitelist = WhiteListFst()
        telephone = TelephoneFst()
        electronic = ElectronicFst()

        # Order in which the alternatives are unioned.
        branches = (
            time,
            date,
            money,
            measure,
            ordinal,
            decimal,
            cardinal,
            whitelist,
            telephone,
            electronic,
        )
        graph = branches[0].fst
        for branch in branches[1:]:
            graph = graph | branch.fst

        self.fst = graph
import pynini
from fun_text_processing.inverse_text_normalization.es.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.word import WordFst
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
class VerbalizeFinalFst(GraphFst):
    """
    Verbalizer graph for a whole sentence, i.e. a sequence of `tokens { ... }` groups,
        e.g. tokens { name: "son" } tokens { time { hours: "las 12" minutes: "30" } } tokens { name: "ahora" } -> son las 12:30 ahora
    """

    def __init__(self):
        super().__init__(name="verbalize_final", kind="verbalize")

        # A token body is either one of the semiotic classes or a plain word.
        body = VerbalizeFst().fst | WordFst().fst

        # tokens { <body> } -> <body>
        token = (
            pynutil.delete("tokens")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + body
            + delete_space
            + pynutil.delete("}")
        )

        # One or more tokens: every token but the last is followed by delete_extra_space,
        # and spaces around the whole sequence are deleted.
        leading_tokens = pynini.closure(token + delete_extra_space)
        self.fst = delete_space + leading_tokens + token + delete_space
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_CHAR,
DAMO_SIGMA,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class WhiteListFst(GraphFst):
    """
    Verbalizer graph for whitelisted tokens,
        e.g. tokens { name: "uds." } -> uds.
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="verbalize")

        # name: "<value without plain spaces>" -> <value>
        name_value = (
            pynutil.delete("name:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_CHAR - " ", 1)
            + pynutil.delete('"')
        )

        # Every no-break space (U+00A0) in the value becomes a regular space in the output.
        nbsp_to_space = pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)

        self.fst = (name_value @ nbsp_to_space).optimize()
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_CHAR,
DAMO_SIGMA,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class WordFst(GraphFst):
    """
    Verbalizer graph for plain word tokens,
        e.g. tokens { name: "sleep" } -> sleep
    """

    def __init__(self):
        super().__init__(name="word", kind="verbalize")
        quote = pynutil.delete('"')

        # The value is one or more characters, none of which is a plain space.
        value = pynini.closure(DAMO_CHAR - " ", 1)
        word = pynutil.delete("name:") + delete_space + quote + value + quote

        # Every no-break space (U+00A0) in the value becomes a regular space in the output.
        nbsp_to_space = pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)

        self.fst = (word @ nbsp_to_space).optimize()
import os
from time import perf_counter
from argparse import ArgumentParser
from fun_text_processing.text_normalization.en.graph_utils import generator_main
def parse_args():
    """
    Parse the command-line options of the grammar export script.

    Returns:
        argparse namespace with `language` (one of the listed codes, default "en")
        and `export_dir` (default "./")
    """
    languages = ["de", "en", "es", "fr", "id", "ja", "ko", "pt", "ru", "vi", "zh"]

    cli = ArgumentParser()
    cli.add_argument("--language", type=str, default="en", choices=languages, help="language")
    cli.add_argument(
        "--export_dir",
        type=str,
        default="./",
        help="path to export directory. Default to current directory.",
    )
    return cli.parse_args()
def get_grammars(lang: str = "en"):
    """
    Build the inverse-text-normalization tagger and verbalizer graphs for one language.

    Args:
        lang: language code; any value outside the supported set falls back to "en"

    Returns:
        tuple (ClassifyFst().fst, VerbalizeFinalFst().fst) for the selected language

    Raises:
        ImportError: if the grammar modules for the selected language cannot be imported
    """
    # Function-scope import keeps this block self-contained.
    from importlib import import_module

    supported = ("de", "en", "es", "fr", "id", "ja", "ko", "pt", "ru", "vi", "zh")
    if lang not in supported:
        # Unknown codes have always resolved to the English grammars; keep that contract.
        lang = "en"

    # Every language package follows the same layout, so the module paths are computed
    # instead of spelled out per language. Only the selected language is imported.
    package = f"fun_text_processing.inverse_text_normalization.{lang}"
    tagger_module = import_module(f"{package}.taggers.tokenize_and_classify")
    verbalizer_module = import_module(f"{package}.verbalizers.verbalize_final")

    return tagger_module.ClassifyFst().fst, verbalizer_module.VerbalizeFinalFst().fst
if __name__ == "__main__":
    # Script entry: build the grammars for the requested language and pass each graph,
    # together with its .far path inside the export directory, to generator_main.
    cli_args = parse_args()
    out_dir = cli_args.export_dir
    os.makedirs(out_dir, exist_ok=True)

    tagger_path = os.path.join(out_dir, f"{cli_args.language}_itn_tagger.far")
    verbalizer_path = os.path.join(out_dir, f"{cli_args.language}_itn_verbalizer.far")

    # The reported time covers grammar construction and both generator_main calls.
    started = perf_counter()
    tagger_graph, verbalizer_graph = get_grammars(cli_args.language)
    generator_main(tagger_path, {"tokenize_and_classify": tagger_graph})
    generator_main(verbalizer_path, {"verbalize": verbalizer_graph})
    elapsed = round(perf_counter() - started, 2)
    print(f"Time to generate graph: {elapsed} sec")
# Note on French spelling
Due to a 1990 orthographic reform, there are currently two conventions for written French numbers:
1. **Reformed** All composite words are joined by a hyphen:
e.g. `1122 -> mille-cent-vingt-deux`
2. **Traditional** Hyphenation only occurs (with exception) for numbers from 17 to 99 (inclusive):
e.g. `1122 -> mille cent vingt-deux`
As available training data for upstream ASR will vary in use of convention, NeMo's French ITN accommodates either style for normalization, e.g.
```
python inverse_normalize.py "mille-cent-vingt-deux" --language="fr" --> 1122
python inverse_normalize.py "mille cent vingt-deux" --language="fr" --> 1122
```
As a result, there exists some ambiguity in the case of currency conversions, namely minor denominations of the dollar e.g.
```
300 -> "trois-cents" # Reformed spelling
300 -> "trois cents" # Traditional spelling
3 ¢ -> "trois cents" # Valid for both
```
Cardinals take priority in such cases.
```
python inverse_normalize.py "trois cents" --language="fr" -> 300
```
from fun_text_processing.inverse_text_normalization.fr.taggers.tokenize_and_classify import (
ClassifyFst,
)
from fun_text_processing.inverse_text_normalization.fr.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.inverse_text_normalization.fr.verbalizers.verbalize_final import (
VerbalizeFinalFst,
)
com
es
uk
fr
net
br
in
ru
de
it
edu
co
ar
bo
cl
co
ec
fk
gf
fy
pe
py
sr
ve
uy
\ No newline at end of file
g mail gmail
gmail
n vidia nvidia
nvidia
outlook
hotmail
yahoo
aol
gmx
msn
live
yandex
orange
wanadoo
web
google
comcast
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment