Commit 70a8a9e0 (wangwei990215): initial commit
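# ==== fun_text_processing/inverse_text_normalization/es/verbalizers/decimal.py ====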
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class DecimalFst(GraphFst):
"""
Finite state transducer for verbalizing decimal,
e.g. decimal { negative: "true" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" } -> -1,26
e.g. decimal { negative: "true" integer_part: "1" morphosyntactic_features: "." fractional_part: "26" } -> -1.26
e.g. decimal { negative: "false" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" quantity: "millón" } -> 1,26 millón
e.g. decimal { negative: "false" integer_part: "2" quantity: "millones" } -> 2 millones
"""
def __init__(self):
super().__init__(name="decimal", kind="verbalize")
        optional_sign = pynini.closure(pynini.cross('negative: "true"', "-") + delete_space, 0, 1)
integer = (
pynutil.delete("integer_part:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
optional_integer = pynini.closure(integer + delete_space, 0, 1)
decimal_point = pynini.cross('morphosyntactic_features: ","', ",")
decimal_point |= pynini.cross('morphosyntactic_features: "."', ".")
fractional = (
decimal_point
+ delete_space
+ pynutil.delete("fractional_part:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
optional_fractional = pynini.closure(fractional + delete_space, 0, 1)
quantity = (
pynutil.delete("quantity:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
optional_quantity = pynini.closure(pynutil.insert(" ") + quantity + delete_space, 0, 1)
graph = optional_integer + optional_fractional + optional_quantity
self.numbers = graph
        graph = optional_sign + graph
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
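
if __name__ == "__main__":
    # Hedged usage sketch (not part of the original file): verbalize one of the
    # docstring examples by composing the tagged string with the FST.
    tagged = 'decimal { negative: "true" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" }'
    print(pynini.shortestpath(tagged @ DecimalFst().fst).string())  # -> -1,26

# ==== fun_text_processing/inverse_text_normalization/es/verbalizers/electronic.py ====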
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class ElectronicFst(GraphFst):
"""
Finite state transducer for verbalizing electronic
e.g. tokens { electronic { username: "cdf1" domain: "abc.edu" } } -> cdf1@abc.edu
e.g. tokens { electronic { protocol: "www.abc.edu" } } -> www.abc.edu
"""
def __init__(self):
super().__init__(name="electronic", kind="verbalize")
user_name = (
pynutil.delete("username:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
domain = (
pynutil.delete("domain:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
protocol = (
pynutil.delete("protocol:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
graph = user_name + delete_space + pynutil.insert("@") + domain
graph |= protocol
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
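# ==== fun_text_processing/inverse_text_normalization/es/verbalizers/measure.py ====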
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_CHAR, GraphFst, delete_space
from pynini.lib import pynutil
class MeasureFst(GraphFst):
"""
Finite state transducer for verbalizing measure, e.g.
measure { cardinal { negative: "true" integer: "12" } units: "kg" } -> -12 kg
Args:
decimal: DecimalFst
cardinal: CardinalFst
"""
def __init__(self, decimal: GraphFst, cardinal: GraphFst):
super().__init__(name="measure", kind="verbalize")
optional_sign = pynini.closure(pynini.cross('negative: "true"', "-"), 0, 1)
unit = (
pynutil.delete("units:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
+ delete_space
)
graph_decimal = (
pynutil.delete("decimal {")
+ delete_space
+ optional_sign
+ delete_space
+ decimal.numbers
+ delete_space
+ pynutil.delete("}")
)
graph_cardinal = (
pynutil.delete("cardinal {")
+ delete_space
+ optional_sign
+ delete_space
+ cardinal.numbers
+ delete_space
+ pynutil.delete("}")
)
graph = (graph_cardinal | graph_decimal) + delete_space + pynutil.insert(" ") + unit
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
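# ==== fun_text_processing/inverse_text_normalization/es/verbalizers/money.py ====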
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_CHAR, GraphFst, delete_space
from pynini.lib import pynutil
class MoneyFst(GraphFst):
"""
Finite state transducer for verbalizing money, e.g.
money { integer_part: "12" morphosyntactic_features: "," fractional_part: "05" currency: "$" } -> $12,05
Args:
decimal: DecimalFst
"""
def __init__(self, decimal: GraphFst):
super().__init__(name="money", kind="verbalize")
unit = (
pynutil.delete("currency:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
)
graph = unit + delete_space + decimal.numbers
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
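# ==== fun_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py ====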
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_NOT_QUOTE,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class OrdinalFst(GraphFst):
"""
Finite state transducer for verbalizing ordinal, e.g.
ordinal { integer: "13" morphosyntactic_features: "o" } -> 13.º
"""
def __init__(self):
super().__init__(name="ordinal", kind="verbalize")
graph = (
pynutil.delete("integer:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
replace_suffix = pynini.union(
pynini.cross(' morphosyntactic_features: "o"', ".º"),
pynini.cross(' morphosyntactic_features: "a"', ".ª"),
pynini.cross(' morphosyntactic_features: "er"', ".ᵉʳ"),
)
graph = graph + replace_suffix
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
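# ==== fun_text_processing/inverse_text_normalization/es/verbalizers/telephone.py ====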
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil
class TelephoneFst(GraphFst):
"""
Finite state transducer for verbalizing telephone, e.g.
telephone { number_part: "123-123-5678" }
-> 123-123-5678
"""
def __init__(self):
super().__init__(name="telephone", kind="verbalize")
number_part = (
pynutil.delete('number_part: "')
+ pynini.closure(DAMO_NOT_QUOTE, 1)
+ pynutil.delete('"')
)
delete_tokens = self.delete_tokens(number_part)
self.fst = delete_tokens.optimize()
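# ==== fun_text_processing/inverse_text_normalization/es/verbalizers/time.py ====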
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_CHAR,
DAMO_DIGIT,
GraphFst,
delete_space,
insert_space,
)
from pynini.lib import pynutil
class TimeFst(GraphFst):
"""
Finite state transducer for verbalizing time,
e.g. time { hours: "la 1" minutes: "10" } -> la 1:10
e.g. time { hours: "la 1" minutes: "45" } -> la 1:45
"""
def __init__(self):
super().__init__(name="time", kind="verbalize")
add_leading_zero_to_double_digit = (DAMO_DIGIT + DAMO_DIGIT) | (
pynutil.insert("0") + DAMO_DIGIT
)
# hour includes preposition ("la" or "las")
hour = (
pynutil.delete("hours:")
+ delete_space
+ pynutil.delete('"')
+ pynini.union("la ", "las ")
+ pynini.closure(DAMO_DIGIT, 1)
+ pynutil.delete('"')
)
minute = (
pynutil.delete("minutes:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_DIGIT, 1)
+ pynutil.delete('"')
)
suffix = (
delete_space
+ insert_space
+ pynutil.delete("suffix:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
)
optional_suffix = pynini.closure(suffix, 0, 1)
zone = (
delete_space
+ insert_space
+ pynutil.delete("zone:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
)
optional_zone = pynini.closure(zone, 0, 1)
graph = (
hour
+ delete_space
+ pynutil.insert(":")
+ (minute @ add_leading_zero_to_double_digit)
+ optional_suffix
+ optional_zone
)
delete_tokens = self.delete_tokens(graph)
self.fst = delete_tokens.optimize()
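# ==== fun_text_processing/inverse_text_normalization/es/verbalizers/verbalize.py ====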
from fun_text_processing.inverse_text_normalization.es.verbalizers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.date import DateFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.electronic import ElectronicFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.ordinal import OrdinalFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.telephone import TelephoneFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.time import TimeFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.whitelist import WhiteListFst
from fun_text_processing.text_normalization.en.graph_utils import GraphFst
class VerbalizeFst(GraphFst):
"""
Composes other verbalizer grammars.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details on deployment can be found at NeMo/tools/text_processing_deployment.
"""
def __init__(self):
super().__init__(name="verbalize", kind="verbalize")
cardinal = CardinalFst()
cardinal_graph = cardinal.fst
ordinal_graph = OrdinalFst().fst
decimal = DecimalFst()
decimal_graph = decimal.fst
measure_graph = MeasureFst(decimal=decimal, cardinal=cardinal).fst
money_graph = MoneyFst(decimal=decimal).fst
time_graph = TimeFst().fst
date_graph = DateFst().fst
whitelist_graph = WhiteListFst().fst
telephone_graph = TelephoneFst().fst
electronic_graph = ElectronicFst().fst
graph = (
time_graph
| date_graph
| money_graph
| measure_graph
| ordinal_graph
| decimal_graph
| cardinal_graph
| whitelist_graph
| telephone_graph
| electronic_graph
)
self.fst = graph
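# ==== fun_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py ====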
import pynini
from fun_text_processing.inverse_text_normalization.es.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.inverse_text_normalization.es.verbalizers.word import WordFst
from fun_text_processing.text_normalization.en.graph_utils import (
GraphFst,
delete_extra_space,
delete_space,
)
from pynini.lib import pynutil
class VerbalizeFinalFst(GraphFst):
"""
Finite state transducer that verbalizes an entire sentence, e.g.
    tokens { name: "son" } tokens { time { hours: "las 12" minutes: "30" } } -> son las 12:30
"""
def __init__(self):
super().__init__(name="verbalize_final", kind="verbalize")
verbalize = VerbalizeFst().fst
word = WordFst().fst
types = verbalize | word
graph = (
pynutil.delete("tokens")
+ delete_space
+ pynutil.delete("{")
+ delete_space
+ types
+ delete_space
+ pynutil.delete("}")
)
graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
self.fst = graph
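
if __name__ == "__main__":
    # Hedged usage sketch (not part of the original file): verbalize a full
    # tagged sentence. Note that the Spanish TimeFst expects an hours field
    # with a "la "/"las " prefix, so the tagged string below follows that format.
    tagged = 'tokens { name: "son" } tokens { time { hours: "las 12" minutes: "30" } }'
    print(pynini.shortestpath(tagged @ VerbalizeFinalFst().fst).string())  # -> son las 12:30

# ==== fun_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py ====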
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_CHAR,
DAMO_SIGMA,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class WhiteListFst(GraphFst):
"""
Finite state transducer for verbalizing whitelist
e.g. tokens { name: "uds." } -> uds.
"""
def __init__(self):
super().__init__(name="whitelist", kind="verbalize")
graph = (
pynutil.delete("name:")
+ delete_space
+ pynutil.delete('"')
+ pynini.closure(DAMO_CHAR - " ", 1)
+ pynutil.delete('"')
)
graph = graph @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)
self.fst = graph.optimize()
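# ==== fun_text_processing/inverse_text_normalization/es/verbalizers/word.py ====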
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
DAMO_CHAR,
DAMO_SIGMA,
GraphFst,
delete_space,
)
from pynini.lib import pynutil
class WordFst(GraphFst):
"""
Finite state transducer for verbalizing plain tokens
e.g. tokens { name: "sleep" } -> sleep
"""
def __init__(self):
super().__init__(name="word", kind="verbalize")
chars = pynini.closure(DAMO_CHAR - " ", 1)
char = (
pynutil.delete("name:")
+ delete_space
+ pynutil.delete('"')
+ chars
+ pynutil.delete('"')
)
graph = char @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)
self.fst = graph.optimize()
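# ==== FAR export script (file path not shown in this commit view) ====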
import importlib
import os
from time import perf_counter
from argparse import ArgumentParser
from fun_text_processing.text_normalization.en.graph_utils import generator_main
def parse_args():
parser = ArgumentParser()
parser.add_argument(
"--language",
help="language",
choices=["de", "en", "es", "fr", "id", "ja", "ko", "pt", "ru", "vi", "zh"],
default="en",
type=str,
)
parser.add_argument(
"--export_dir",
        help="path to export directory. Defaults to the current directory.",
default="./",
type=str,
)
return parser.parse_args()
def get_grammars(lang: str = "en"):
    """Build the tagger and verbalizer grammars for the given language.

    Every language follows the same module layout, so the per-language imports
    are resolved dynamically; unsupported codes fall back to English, matching
    the original behavior.
    """
    if lang not in ("de", "en", "es", "fr", "id", "ja", "ko", "pt", "ru", "vi", "zh"):
        lang = "en"
    base = f"fun_text_processing.inverse_text_normalization.{lang}"
    taggers = importlib.import_module(base + ".taggers.tokenize_and_classify")
    verbalizers = importlib.import_module(base + ".verbalizers.verbalize_final")
    return taggers.ClassifyFst().fst, verbalizers.VerbalizeFinalFst().fst
if __name__ == "__main__":
args = parse_args()
export_dir = args.export_dir
os.makedirs(export_dir, exist_ok=True)
tagger_far_file = os.path.join(export_dir, args.language + "_itn_tagger.far")
verbalizer_far_file = os.path.join(export_dir, args.language + "_itn_verbalizer.far")
start_time = perf_counter()
tagger_fst, verbalizer_fst = get_grammars(args.language)
generator_main(tagger_far_file, {"tokenize_and_classify": tagger_fst})
generator_main(verbalizer_far_file, {"verbalize": verbalizer_fst})
print(f"Time to generate graph: {round(perf_counter() - start_time, 2)} sec")
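
# Hedged sketch (not part of the original script): the exported archives can be
# loaded back with pynini.Far, using the same keys passed to generator_main:
#
#     import pynini
#     far = pynini.Far("es_itn_tagger.far", mode="r")
#     tagger_fst = far["tokenize_and_classify"]
#
# The French ITN README follows (file path not shown in this commit view).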
# Note on French spelling
Due to a 1990 orthographic reform, there are currently two conventions for written French numbers:
1. **Reformed:** all elements of a compound number are joined by hyphens:
e.g. `1122 -> mille-cent-vingt-deux`
2. **Traditional:** hyphenation occurs (with exceptions) only for numbers from 17 to 99 (inclusive):
e.g. `1122 -> mille cent vingt-deux`
Since available training data for upstream ASR varies in which convention it uses, NeMo's French ITN accommodates either style for normalization, e.g.
```
python inverse_normalize.py "mille-cent-vingt-deux" --language="fr" --> 1122
python inverse_normalize.py "mille cent vingt-deux" --language="fr" --> 1122
```
As a result, there is some ambiguity in currency conversions, namely around the minor denomination of the dollar, e.g.
```
300 -> "trois-cents" # Reformed spelling
300 -> "trois cents" # Traditional spelling
3 ¢ -> "trois cents" # Valid for both
```
Cardinals take priority in such cases.
```
python inverse_normalize.py "trois cents" --language="fr" --> 300
```
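Internally, this priority is typically enforced with path weights. A minimal sketch (hedged; it uses generic pynini calls rather than the actual grammar):
```
import pynini
from pynini.lib import pynutil

# Lower weight wins under shortestpath, so the cardinal reading is preferred.
cardinal = pynutil.add_weight(pynini.cross("trois cents", "300"), 1.0)
money = pynutil.add_weight(pynini.cross("trois cents", "3 ¢"), 1.1)
print(pynini.shortestpath("trois cents" @ (cardinal | money)).string())  # -> 300
```
# ==== fr package imports (likely a package __init__.py; exact path not shown in this commit view) ====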
from fun_text_processing.inverse_text_normalization.fr.taggers.tokenize_and_classify import (
ClassifyFst,
)
from fun_text_processing.inverse_text_normalization.fr.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.inverse_text_normalization.fr.verbalizers.verbalize_final import (
VerbalizeFinalFst,
)
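# ==== data file: allowed domain endings, one per line (path not shown in this commit view) ====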
com
es
uk
fr
net
br
in
ru
de
it
edu
co
ar
bo
cl
co
ec
fk
gf
fy
pe
py
sr
ve
uy
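# ==== data file: spoken provider names -> canonical domain names (path not shown in this commit view) ====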
g mail gmail
gmail
n vidia nvidia
nvidia
outlook
hotmail
yahoo
aol
gmx
msn
live
yandex
orange
wanadoo
web
google
comcast
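# ==== data file: spoken symbol names -> symbols (path not shown in this commit view) ====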
chez @
at @
à @
arobase @
point .
barre oblique /
tiret -
tiret bas _
souligné _
sous-tiret _
blanc souligné _
underscore _
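# ==== data file: French fraction words -> cardinal denominators (path not shown in this commit view) ====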
demie deux
demies deux
demi deux
demis deux
tiers trois
quart quatre
quarts quatre
quatrièmes quatre
quatrième quatre
cinquième cinq
cinquièmes cinq
neuvième neuf
neuvièmes neuf
onzième onze
onzièmes onze
douzième douze
douzièmes douze
treizième treize
treizièmes treize
quatorzième quatorze
quatorzièmes quatorze
quinzième quinze
quinzièmes quinze
seizième seize
seizièmes seize
trentième trente
trentièmes trente
quarantième quarante
quarantièmes quarante
cinquantième cinquante
cinquantièmes cinquante
soixantième soixante
soixantièmes soixante
septantième septante
septantièmes septante
huitantième huitante
huitantièmes huitante
nonantième nonante
nonantièmes nonante
millième mille
millièmes mille
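# ==== end of data files ====
# Hedged sketch (not part of the commit): data files like those above are
# typically compiled into FSTs with pynini.string_file, which reads
# tab-separated input/output pairs; single-column lines map to themselves.
# The file name below is hypothetical.
import pynini

symbols = pynini.string_file("symbols.tsv")  # e.g. "arobase" -> "@"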