Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
FunASR
Commits
431278fa
Commit
431278fa
authored
Nov 22, 2024
by
“change”
Browse files
Initial commit
parent
8c252776
Pipeline
#1949
failed with stages
in 0 seconds
Changes
788
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
735 additions
and
0 deletions
+735
-0
fun_text_processing/inverse_text_normalization/es/verbalizers/cardinal.py
...ing/inverse_text_normalization/es/verbalizers/cardinal.py
+38
-0
fun_text_processing/inverse_text_normalization/es/verbalizers/date.py
...cessing/inverse_text_normalization/es/verbalizers/date.py
+51
-0
fun_text_processing/inverse_text_normalization/es/verbalizers/decimal.py
...sing/inverse_text_normalization/es/verbalizers/decimal.py
+56
-0
fun_text_processing/inverse_text_normalization/es/verbalizers/electronic.py
...g/inverse_text_normalization/es/verbalizers/electronic.py
+45
-0
fun_text_processing/inverse_text_normalization/es/verbalizers/measure.py
...sing/inverse_text_normalization/es/verbalizers/measure.py
+47
-0
fun_text_processing/inverse_text_normalization/es/verbalizers/money.py
...essing/inverse_text_normalization/es/verbalizers/money.py
+26
-0
fun_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py
...sing/inverse_text_normalization/es/verbalizers/ordinal.py
+35
-0
fun_text_processing/inverse_text_normalization/es/verbalizers/telephone.py
...ng/inverse_text_normalization/es/verbalizers/telephone.py
+22
-0
fun_text_processing/inverse_text_normalization/es/verbalizers/time.py
...cessing/inverse_text_normalization/es/verbalizers/time.py
+70
-0
fun_text_processing/inverse_text_normalization/es/verbalizers/verbalize.py
...ng/inverse_text_normalization/es/verbalizers/verbalize.py
+48
-0
fun_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py
...erse_text_normalization/es/verbalizers/verbalize_final.py
+33
-0
fun_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py
...ng/inverse_text_normalization/es/verbalizers/whitelist.py
+27
-0
fun_text_processing/inverse_text_normalization/es/verbalizers/word.py
...cessing/inverse_text_normalization/es/verbalizers/word.py
+29
-0
fun_text_processing/inverse_text_normalization/export_models.py
...xt_processing/inverse_text_normalization/export_models.py
+127
-0
fun_text_processing/inverse_text_normalization/fr/README.md
fun_text_processing/inverse_text_normalization/fr/README.md
+30
-0
fun_text_processing/inverse_text_normalization/fr/__init__.py
...text_processing/inverse_text_normalization/fr/__init__.py
+7
-0
fun_text_processing/inverse_text_normalization/fr/data/__init__.py
...processing/inverse_text_normalization/fr/data/__init__.py
+1
-0
fun_text_processing/inverse_text_normalization/fr/data/electronic/__init__.py
...inverse_text_normalization/fr/data/electronic/__init__.py
+1
-0
fun_text_processing/inverse_text_normalization/fr/data/electronic/domain.tsv
.../inverse_text_normalization/fr/data/electronic/domain.tsv
+25
-0
fun_text_processing/inverse_text_normalization/fr/data/electronic/server_name.tsv
...rse_text_normalization/fr/data/electronic/server_name.tsv
+17
-0
No files found.
Too many changes to show.
To preserve performance only
788 of 788+
files are displayed.
Plain diff
Email patch
fun_text_processing/inverse_text_normalization/es/verbalizers/cardinal.py
0 → 100644
View file @
431278fa
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
(
DAMO_NOT_QUOTE
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class CardinalFst(GraphFst):
    """
    Finite state transducer for verbalizing cardinal
        e.g. cardinal { negative: "-" integer: "23" } -> -23
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="verbalize")
        quote = pynutil.delete('"')
        # Optionally emit the sign character carried by the `negative` field.
        maybe_sign = pynini.closure(
            pynutil.delete("negative:")
            + delete_space
            + quote
            + DAMO_NOT_QUOTE
            + quote
            + delete_space,
            0,
            1,
        )
        # Emit the digit string carried by the `integer` field.
        digits = (
            pynutil.delete("integer:")
            + delete_space
            + quote
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + quote
        )
        # Exposed for composition by other verbalizers (e.g. MeasureFst).
        self.numbers = digits
        delete_tokens = self.delete_tokens(maybe_sign + digits)
        self.fst = delete_tokens.optimize()
fun_text_processing/inverse_text_normalization/es/verbalizers/date.py
0 → 100644
View file @
431278fa
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
(
DAMO_NOT_QUOTE
,
GraphFst
,
delete_extra_space
,
delete_space
,
insert_space
,
)
from
pynini.lib
import
pynutil
class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing date, e.g.
        date { day: "1" month: "enero" preserve_order: true } -> 1 de enero
    """

    def __init__(self):
        super().__init__(name="date", kind="verbalize")

        def quoted_field(label):
            # Drop `label "` ... `"` and keep the quoted content.
            return (
                pynutil.delete(label)
                + delete_space
                + pynutil.delete('"')
                + pynini.closure(DAMO_NOT_QUOTE, 1)
                + pynutil.delete('"')
            )

        month_fst = quoted_field("month:")
        day_fst = quoted_field("day:")
        # Render as "<day> de <month>".
        day_month = (
            day_fst + delete_extra_space + pynutil.insert("de") + insert_space + month_fst
        )
        # Silently consume either ordering hint emitted by the tagger.
        drop_order_hint = pynini.closure(
            pynutil.delete("preserve_order:")
            + delete_space
            + pynutil.delete("true")
            + delete_space
            | pynutil.delete("field_order:")
            + delete_space
            + pynutil.delete('"')
            + DAMO_NOT_QUOTE
            + pynutil.delete('"')
            + delete_space
        )
        final_graph = day_month + delete_space + drop_order_hint
        self.fst = self.delete_tokens(final_graph).optimize()
fun_text_processing/inverse_text_normalization/es/verbalizers/decimal.py
0 → 100644
View file @
431278fa
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
(
DAMO_NOT_QUOTE
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class DecimalFst(GraphFst):
    """
    Finite state transducer for verbalizing decimal,
        e.g. decimal { negative: "true" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" } -> -1,26
        e.g. decimal { negative: "true" integer_part: "1" morphosyntactic_features: "." fractional_part: "26" } -> -1.26
        e.g. decimal { negative: "false" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" quantity: "millón" } -> 1,26 millón
        e.g. decimal { negative: "false" integer_part: "2" quantity: "millones" } -> 2 millones
    """

    def __init__(self):
        super().__init__(name="decimal", kind="verbalize")
        # Fix: local was misspelled `optionl_sign`; renamed for consistency with
        # the other verbalizers (CardinalFst, MeasureFst). Behavior unchanged.
        optional_sign = pynini.closure(
            pynini.cross('negative: "true"', "-") + delete_space, 0, 1
        )
        integer = (
            pynutil.delete("integer_part:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        optional_integer = pynini.closure(integer + delete_space, 0, 1)
        # The tagger records the decimal separator ("," or ".") as a
        # morphosyntactic feature; pass it through verbatim.
        decimal_point = pynini.cross('morphosyntactic_features: ","', ",")
        decimal_point |= pynini.cross('morphosyntactic_features: "."', ".")
        fractional = (
            decimal_point
            + delete_space
            + pynutil.delete("fractional_part:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        optional_fractional = pynini.closure(fractional + delete_space, 0, 1)
        # Magnitude word such as "millón"/"millones", rendered after a space.
        quantity = (
            pynutil.delete("quantity:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        optional_quantity = pynini.closure(
            pynutil.insert(" ") + quantity + delete_space, 0, 1
        )
        graph = optional_integer + optional_fractional + optional_quantity
        # Exposed (without the sign) for composition by MeasureFst / MoneyFst.
        self.numbers = graph
        graph = optional_sign + graph
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
fun_text_processing/inverse_text_normalization/es/verbalizers/electronic.py
0 → 100644
View file @
431278fa
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
(
DAMO_NOT_QUOTE
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for verbalizing electronic
        e.g. tokens { electronic { username: "cdf1" domain: "abc.edu" } } -> cdf1@abc.edu
        e.g. tokens { electronic { protocol: "www.abc.edu" } } -> www.abc.edu
    """

    def __init__(self):
        super().__init__(name="electronic", kind="verbalize")

        def quoted_field(label):
            # Drop `label "` ... `"` and keep the quoted content.
            return (
                pynutil.delete(label)
                + delete_space
                + pynutil.delete('"')
                + pynini.closure(DAMO_NOT_QUOTE, 1)
                + pynutil.delete('"')
            )

        user = quoted_field("username:")
        host = quoted_field("domain:")
        proto = quoted_field("protocol:")
        # Either an email assembled as "user@host", or a bare protocol/URL.
        graph = (user + delete_space + pynutil.insert("@") + host) | proto
        self.fst = self.delete_tokens(graph).optimize()
fun_text_processing/inverse_text_normalization/es/verbalizers/measure.py
0 → 100644
View file @
431278fa
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
DAMO_CHAR
,
GraphFst
,
delete_space
from
pynini.lib
import
pynutil
class MeasureFst(GraphFst):
    """
    Finite state transducer for verbalizing measure, e.g.
        measure { cardinal { negative: "true" integer: "12" } units: "kg" } -> -12 kg

    Args:
        decimal: DecimalFst
        cardinal: CardinalFst
    """

    def __init__(self, decimal: GraphFst, cardinal: GraphFst):
        super().__init__(name="measure", kind="verbalize")
        maybe_sign = pynini.closure(pynini.cross('negative: "true"', "-"), 0, 1)
        # Unit string (no spaces allowed inside the quoted value).
        unit = (
            pynutil.delete("units:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_CHAR - " ", 1)
            + pynutil.delete('"')
            + delete_space
        )

        def unwrap(opener, numbers):
            # Strip `opener ... }` around an optional sign plus a number verbalizer.
            return (
                pynutil.delete(opener)
                + delete_space
                + maybe_sign
                + delete_space
                + numbers
                + delete_space
                + pynutil.delete("}")
            )

        graph_decimal = unwrap("decimal {", decimal.numbers)
        graph_cardinal = unwrap("cardinal {", cardinal.numbers)
        graph = (
            (graph_cardinal | graph_decimal)
            + delete_space
            + pynutil.insert(" ")
            + unit
        )
        self.fst = self.delete_tokens(graph).optimize()
fun_text_processing/inverse_text_normalization/es/verbalizers/money.py
0 → 100644
View file @
431278fa
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
DAMO_CHAR
,
GraphFst
,
delete_space
from
pynini.lib
import
pynutil
class MoneyFst(GraphFst):
    """
    Finite state transducer for verbalizing money, e.g.
        money { integer_part: "12" morphosyntactic_features: "," fractional_part: "05" currency: "$" } -> $12,05

    Args:
        decimal: DecimalFst
    """

    def __init__(self, decimal: GraphFst):
        super().__init__(name="money", kind="verbalize")
        # Currency symbol (no spaces allowed inside the quoted value).
        currency = (
            pynutil.delete("currency:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_CHAR - " ", 1)
            + pynutil.delete('"')
        )
        # Symbol first, then the (unsigned) decimal amount: "$12,05".
        graph = currency + delete_space + decimal.numbers
        self.fst = self.delete_tokens(graph).optimize()
fun_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py
0 → 100644
View file @
431278fa
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
(
DAMO_NOT_QUOTE
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class OrdinalFst(GraphFst):
    """
    Finite state transducer for verbalizing ordinal, e.g.
        ordinal { integer: "13" morphosyntactic_features: "o" } -> 13.º
    """

    def __init__(self):
        super().__init__(name="ordinal", kind="verbalize")
        integer_part = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        # Map the gender/number feature onto the written ordinal suffix.
        suffix_map = pynini.union(
            pynini.cross(' morphosyntactic_features: "o"', ".º"),
            pynini.cross(' morphosyntactic_features: "a"', ".ª"),
            pynini.cross(' morphosyntactic_features: "er"', ".ᵉʳ"),
        )
        self.fst = self.delete_tokens(integer_part + suffix_map).optimize()
fun_text_processing/inverse_text_normalization/es/verbalizers/telephone.py
0 → 100644
View file @
431278fa
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
DAMO_NOT_QUOTE
,
GraphFst
from
pynini.lib
import
pynutil
class TelephoneFst(GraphFst):
    """
    Finite state transducer for verbalizing telephone, e.g.
        telephone { number_part: "123-123-5678" }
        -> 123-123-5678
    """

    def __init__(self):
        super().__init__(name="telephone", kind="verbalize")
        # Keep the quoted number verbatim, dropping the field wrapper.
        body = pynini.closure(DAMO_NOT_QUOTE, 1)
        number_part = pynutil.delete('number_part: "') + body + pynutil.delete('"')
        self.fst = self.delete_tokens(number_part).optimize()
fun_text_processing/inverse_text_normalization/es/verbalizers/time.py
0 → 100644
View file @
431278fa
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
(
DAMO_CHAR
,
DAMO_DIGIT
,
GraphFst
,
delete_space
,
insert_space
,
)
from
pynini.lib
import
pynutil
class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing time,
        e.g. time { hours: "la 1" minutes: "10" } -> la 1:10
        e.g. time { hours: "la 1" minutes: "45" } -> la 1:45
    """

    def __init__(self):
        super().__init__(name="time", kind="verbalize")
        # Pad a lone digit with a leading zero; two digits pass through.
        pad_to_two_digits = (DAMO_DIGIT + DAMO_DIGIT) | (
            pynutil.insert("0") + DAMO_DIGIT
        )
        quote = pynutil.delete('"')
        # hour includes preposition ("la" or "las")
        hour = (
            pynutil.delete("hours:")
            + delete_space
            + quote
            + pynini.union("la ", "las ")
            + pynini.closure(DAMO_DIGIT, 1)
            + quote
        )
        minute = (
            pynutil.delete("minutes:")
            + delete_space
            + quote
            + pynini.closure(DAMO_DIGIT, 1)
            + quote
        )

        def optional_trailing(label):
            # Optional trailing field (no inner spaces), rendered after one space.
            field = (
                delete_space
                + insert_space
                + pynutil.delete(label)
                + delete_space
                + pynutil.delete('"')
                + pynini.closure(DAMO_CHAR - " ", 1)
                + pynutil.delete('"')
            )
            return pynini.closure(field, 0, 1)

        graph = (
            hour
            + delete_space
            + pynutil.insert(":")
            + (minute @ pad_to_two_digits)
            + optional_trailing("suffix:")
            + optional_trailing("zone:")
        )
        self.fst = self.delete_tokens(graph).optimize()
fun_text_processing/inverse_text_normalization/es/verbalizers/verbalize.py
0 → 100644
View file @
431278fa
from
fun_text_processing.inverse_text_normalization.es.verbalizers.cardinal
import
CardinalFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.date
import
DateFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.decimal
import
DecimalFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.electronic
import
ElectronicFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.measure
import
MeasureFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.money
import
MoneyFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.ordinal
import
OrdinalFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.telephone
import
TelephoneFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.time
import
TimeFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.whitelist
import
WhiteListFst
from
fun_text_processing.text_normalization.en.graph_utils
import
GraphFst
class VerbalizeFst(GraphFst):
    """
    Composes other verbalizer grammars.
    For deployment, this grammar will be compiled and exported to an OpenFst
    Finite State Archive (FAR) file.
    More details to deployment at NeMo/tools/text_processing_deployment.
    """

    def __init__(self):
        super().__init__(name="verbalize", kind="verbalize")
        # Cardinal and decimal instances are shared with the verbalizers
        # that compose their `.numbers` sub-graphs.
        cardinal = CardinalFst()
        decimal = DecimalFst()
        # Union of all semiotic-class verbalizers, in the original priority order.
        component_fsts = [
            TimeFst().fst,
            DateFst().fst,
            MoneyFst(decimal=decimal).fst,
            MeasureFst(decimal=decimal, cardinal=cardinal).fst,
            OrdinalFst().fst,
            decimal.fst,
            cardinal.fst,
            WhiteListFst().fst,
            TelephoneFst().fst,
            ElectronicFst().fst,
        ]
        graph = component_fsts[0]
        for component in component_fsts[1:]:
            graph |= component
        self.fst = graph
fun_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py
0 → 100644
View file @
431278fa
import
pynini
from
fun_text_processing.inverse_text_normalization.es.verbalizers.verbalize
import
VerbalizeFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.word
import
WordFst
from
fun_text_processing.text_normalization.en.graph_utils
import
(
GraphFst
,
delete_extra_space
,
delete_space
,
)
from
pynini.lib
import
pynutil
class VerbalizeFinalFst(GraphFst):
    """
    Finite state transducer that verbalizes an entire sentence, e.g.
        tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now
    """

    def __init__(self):
        super().__init__(name="verbalize_final", kind="verbalize")
        token_types = VerbalizeFst().fst | WordFst().fst
        # Strip the `tokens { ... }` wrapper around a single token.
        one_token = (
            pynutil.delete("tokens")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + token_types
            + delete_space
            + pynutil.delete("}")
        )
        # One or more tokens joined by single spaces; outer whitespace dropped.
        sentence = (
            delete_space
            + pynini.closure(one_token + delete_extra_space)
            + one_token
            + delete_space
        )
        self.fst = sentence
fun_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py
0 → 100644
View file @
431278fa
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
(
DAMO_CHAR
,
DAMO_SIGMA
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class WhiteListFst(GraphFst):
    """
    Finite state transducer for verbalizing whitelist
        e.g. tokens { name: "uds." } -> uds.
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="verbalize")
        name_field = (
            pynutil.delete("name:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_CHAR - " ", 1)
            + pynutil.delete('"')
        )
        # Rewrite non-breaking spaces to plain spaces anywhere in the output.
        nbsp_to_space = pynini.cdrewrite(
            pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA
        )
        self.fst = (name_field @ nbsp_to_space).optimize()
fun_text_processing/inverse_text_normalization/es/verbalizers/word.py
0 → 100644
View file @
431278fa
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
(
DAMO_CHAR
,
DAMO_SIGMA
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class WordFst(GraphFst):
    """
    Finite state transducer for verbalizing plain tokens
        e.g. tokens { name: "sleep" } -> sleep
    """

    def __init__(self):
        super().__init__(name="word", kind="verbalize")
        token_chars = pynini.closure(DAMO_CHAR - " ", 1)
        name_field = (
            pynutil.delete("name:")
            + delete_space
            + pynutil.delete('"')
            + token_chars
            + pynutil.delete('"')
        )
        # Rewrite non-breaking spaces to plain spaces anywhere in the output.
        nbsp_to_space = pynini.cdrewrite(
            pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA
        )
        self.fst = (name_field @ nbsp_to_space).optimize()
fun_text_processing/inverse_text_normalization/export_models.py
0 → 100644
View file @
431278fa
import
os
from
time
import
perf_counter
from
argparse
import
ArgumentParser
from
fun_text_processing.text_normalization.en.graph_utils
import
generator_main
def parse_args():
    """Parse command-line options: target language and FAR export directory."""
    parser = ArgumentParser()
    supported_languages = ["de", "en", "es", "fr", "id", "ja", "ko", "pt", "ru", "vi", "zh"]
    parser.add_argument(
        "--language",
        help="language",
        choices=supported_languages,
        default="en",
        type=str,
    )
    parser.add_argument(
        "--export_dir",
        help="path to export directory. Default to current directory.",
        default="./",
        type=str,
    )
    return parser.parse_args()
def get_grammars(lang: str = "en"):
    """
    Load the ITN tagger and verbalizer grammars for ``lang``.

    Replaces the original ~240-line if/elif chain — one branch per language,
    each importing the same two module paths with only the language segment
    changed — with a single dynamic import. Unknown language codes fall back
    to English, exactly as the original ``else`` branch did.

    Args:
        lang: two-letter language code (de/en/es/fr/id/ja/ko/pt/ru/vi/zh).

    Returns:
        Tuple ``(ClassifyFst().fst, VerbalizeFinalFst().fst)`` for the language.
    """
    import importlib

    supported = {"de", "en", "es", "fr", "id", "ja", "ko", "pt", "ru", "vi", "zh"}
    if lang not in supported:
        lang = "en"  # same fallback as the original else branch
    base = f"fun_text_processing.inverse_text_normalization.{lang}"
    tagger_mod = importlib.import_module(base + ".taggers.tokenize_and_classify")
    verbalizer_mod = importlib.import_module(base + ".verbalizers.verbalize_final")
    return tagger_mod.ClassifyFst().fst, verbalizer_mod.VerbalizeFinalFst().fst
def _main():
    """Export tagger and verbalizer FAR archives for the selected language."""
    args = parse_args()
    out_dir = args.export_dir
    os.makedirs(out_dir, exist_ok=True)
    # One FAR archive each for the tagger and the verbalizer grammars.
    tagger_far_file = os.path.join(out_dir, args.language + "_itn_tagger.far")
    verbalizer_far_file = os.path.join(out_dir, args.language + "_itn_verbalizer.far")
    start_time = perf_counter()
    tagger_fst, verbalizer_fst = get_grammars(args.language)
    generator_main(tagger_far_file, {"tokenize_and_classify": tagger_fst})
    generator_main(verbalizer_far_file, {"verbalize": verbalizer_fst})
    print(f"Time to generate graph: {round(perf_counter() - start_time, 2)} sec")


if __name__ == "__main__":
    _main()
fun_text_processing/inverse_text_normalization/fr/README.md
0 → 100644
View file @
431278fa
# Note on French spelling
Due to a 1990 orthographic reform, there are currently two conventions for written French numbers:
1.
**Reformed**
All composite words are joined by a hyphen:
e.g.
`1122 -> mille-cent-vingt-deux`
2.
**Traditional**
Hyphenation only occurs (with exception) for numbers from 17 to 99 (inclusive):
e.g.
`1122 -> mille cent vingt-deux`
As available training data for upstream ASR will vary in use of convention, NeMo's French ITN accommodates either style for normalization e.g.
```
python inverse_normalize.py "mille-cent-vingt-deux" --language="fr" --> 1122
python inverse_normalize.py "mille cent vingt-deux" --language="fr" --> 1122
```
As a result, there exists some ambiguity in the case of currency conversions, namely minor denominations of the dollar e.g.
```
300 -> "trois-cents" # Reformed spelling
300 -> "trois cents" # Traditional spelling
3 ¢ -> "trois cents" # Valid for both
```
Cardinals take priority in such cases.
```
python inverse_normalize.py "trois cents" --language="fr" -> 300
```
fun_text_processing/inverse_text_normalization/fr/__init__.py
0 → 100644
View file @
431278fa
from
fun_text_processing.inverse_text_normalization.fr.taggers.tokenize_and_classify
import
(
ClassifyFst
,
)
from
fun_text_processing.inverse_text_normalization.fr.verbalizers.verbalize
import
VerbalizeFst
from
fun_text_processing.inverse_text_normalization.fr.verbalizers.verbalize_final
import
(
VerbalizeFinalFst
,
)
fun_text_processing/inverse_text_normalization/fr/data/__init__.py
0 → 100644
View file @
431278fa
fun_text_processing/inverse_text_normalization/fr/data/electronic/__init__.py
0 → 100644
View file @
431278fa
fun_text_processing/inverse_text_normalization/fr/data/electronic/domain.tsv
0 → 100644
View file @
431278fa
com
es
uk
fr
net
br
in
ru
de
it
edu
co
ar
bo
cl
co
ec
fk
gf
gy
pe
py
sr
ve
uy
\ No newline at end of file
fun_text_processing/inverse_text_normalization/fr/data/electronic/server_name.tsv
0 → 100644
View file @
431278fa
g mail gmail
gmail
n vidia nvidia
nvidia
outlook
hotmail
yahoo
aol
gmx
msn
live
yandex
orange
wanadoo
web
google
comcast
\ No newline at end of file
Prev
1
…
17
18
19
20
21
22
23
24
25
…
40
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment