Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
Paraformer_FunASR_pytorch
Commits
70a8a9e0
Commit
70a8a9e0
authored
Oct 03, 2024
by
wangwei990215
Browse files
initial commit
parents
Pipeline
#1738
failed with stages
in 0 seconds
Changes
827
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
703 additions
and
0 deletions
+703
-0
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/decimal.py
...sing/inverse_text_normalization/es/verbalizers/decimal.py
+56
-0
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/electronic.py
...g/inverse_text_normalization/es/verbalizers/electronic.py
+45
-0
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/measure.py
...sing/inverse_text_normalization/es/verbalizers/measure.py
+47
-0
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/money.py
...essing/inverse_text_normalization/es/verbalizers/money.py
+26
-0
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py
...sing/inverse_text_normalization/es/verbalizers/ordinal.py
+35
-0
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/telephone.py
...ng/inverse_text_normalization/es/verbalizers/telephone.py
+22
-0
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/time.py
...cessing/inverse_text_normalization/es/verbalizers/time.py
+70
-0
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/verbalize.py
...ng/inverse_text_normalization/es/verbalizers/verbalize.py
+48
-0
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py
...erse_text_normalization/es/verbalizers/verbalize_final.py
+33
-0
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py
...ng/inverse_text_normalization/es/verbalizers/whitelist.py
+27
-0
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/word.py
...cessing/inverse_text_normalization/es/verbalizers/word.py
+29
-0
FunASR/fun_text_processing/inverse_text_normalization/export_models.py
...xt_processing/inverse_text_normalization/export_models.py
+127
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/README.md
...n_text_processing/inverse_text_normalization/fr/README.md
+30
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/__init__.py
...text_processing/inverse_text_normalization/fr/__init__.py
+7
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/__init__.py
...processing/inverse_text_normalization/fr/data/__init__.py
+1
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/electronic/__init__.py
...inverse_text_normalization/fr/data/electronic/__init__.py
+1
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/electronic/domain.tsv
.../inverse_text_normalization/fr/data/electronic/domain.tsv
+26
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/electronic/server_name.tsv
...rse_text_normalization/fr/data/electronic/server_name.tsv
+18
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/electronic/symbols.tsv
...inverse_text_normalization/fr/data/electronic/symbols.tsv
+13
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/fractions.tsv
...ocessing/inverse_text_normalization/fr/data/fractions.tsv
+42
-0
No files found.
Too many changes to show.
To preserve performance only
827 of 827+
files are displayed.
Plain diff
Email patch
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/decimal.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
(
DAMO_NOT_QUOTE
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class DecimalFst(GraphFst):
    """
    Finite state transducer for verbalizing decimal,
    e.g. decimal { negative: "true" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" } -> -1,26
    e.g. decimal { negative: "true" integer_part: "1" morphosyntactic_features: "." fractional_part: "26" } -> -1.26
    e.g. decimal { negative: "false" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" quantity: "millón" } -> 1,26 millón
    e.g. decimal { negative: "false" integer_part: "2" quantity: "millones" } -> 2 millones
    """

    def __init__(self):
        super().__init__(name="decimal", kind="verbalize")

        # Fix: local was misspelled `optionl_sign`; renamed for consistency
        # with the sibling verbalizers (e.g. measure.py uses `optional_sign`).
        optional_sign = pynini.closure(
            pynini.cross('negative: "true"', "-") + delete_space, 0, 1
        )

        def delete_quoted(field):
            # `<field> "<payload>"` -> payload (label and quotes removed).
            return (
                pynutil.delete(field)
                + delete_space
                + pynutil.delete('"')
                + pynini.closure(DAMO_NOT_QUOTE, 1)
                + pynutil.delete('"')
            )

        integer = delete_quoted("integer_part:")
        optional_integer = pynini.closure(integer + delete_space, 0, 1)

        # The decimal separator is carried in morphosyntactic_features and
        # may be either a comma or a period.
        decimal_point = pynini.cross('morphosyntactic_features: ","', ",")
        decimal_point |= pynini.cross('morphosyntactic_features: "."', ".")

        fractional = decimal_point + delete_space + delete_quoted("fractional_part:")
        optional_fractional = pynini.closure(fractional + delete_space, 0, 1)

        quantity = delete_quoted("quantity:")
        optional_quantity = pynini.closure(
            pynutil.insert(" ") + quantity + delete_space, 0, 1
        )

        graph = optional_integer + optional_fractional + optional_quantity
        # Exposed unsigned so money/measure verbalizers can reuse it.
        self.numbers = graph

        graph = optional_sign + graph
        self.fst = self.delete_tokens(graph).optimize()
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/electronic.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
(
DAMO_NOT_QUOTE
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for verbalizing electronic addresses,
    e.g. tokens { electronic { username: "cdf1" domain: "abc.edu" } } -> cdf1@abc.edu
    e.g. tokens { electronic { protocol: "www.abc.edu" } } -> www.abc.edu
    """

    def __init__(self):
        super().__init__(name="electronic", kind="verbalize")

        def quoted_field(label):
            # `<label> "<payload>"` -> payload (label and quotes removed).
            return (
                pynutil.delete(label)
                + delete_space
                + pynutil.delete('"')
                + pynini.closure(DAMO_NOT_QUOTE, 1)
                + pynutil.delete('"')
            )

        user_name = quoted_field("username:")
        domain = quoted_field("domain:")
        protocol = quoted_field("protocol:")

        # Either username@domain (the "@" is inserted) or a bare protocol.
        graph = user_name + delete_space + pynutil.insert("@") + domain
        graph |= protocol

        self.fst = self.delete_tokens(graph).optimize()
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/measure.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
DAMO_CHAR
,
GraphFst
,
delete_space
from
pynini.lib
import
pynutil
class MeasureFst(GraphFst):
    """
    Finite state transducer for verbalizing measure, e.g.
    measure { cardinal { negative: "true" integer: "12" } units: "kg" } -> -12 kg

    Args:
        decimal: DecimalFst
        cardinal: CardinalFst
    """

    def __init__(self, decimal: GraphFst, cardinal: GraphFst):
        super().__init__(name="measure", kind="verbalize")

        optional_sign = pynini.closure(pynini.cross('negative: "true"', "-"), 0, 1)

        # units: "<unit>" -> <unit>; trailing whitespace is consumed here.
        unit = (
            pynutil.delete("units:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_CHAR - " ", 1)
            + pynutil.delete('"')
            + delete_space
        )

        def number_block(opener, numbers):
            # `<kind> { [negative: ...] <numbers> }` -> signed verbalized number.
            return (
                pynutil.delete(opener)
                + delete_space
                + optional_sign
                + delete_space
                + numbers
                + delete_space
                + pynutil.delete("}")
            )

        graph_decimal = number_block("decimal {", decimal.numbers)
        graph_cardinal = number_block("cardinal {", cardinal.numbers)

        graph = (
            (graph_cardinal | graph_decimal)
            + delete_space
            + pynutil.insert(" ")
            + unit
        )
        self.fst = self.delete_tokens(graph).optimize()
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/money.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
DAMO_CHAR
,
GraphFst
,
delete_space
from
pynini.lib
import
pynutil
class MoneyFst(GraphFst):
    """
    Finite state transducer for verbalizing money, e.g.
    money { integer_part: "12" morphosyntactic_features: "," fractional_part: "05" currency: "$" } -> $12,05

    Args:
        decimal: DecimalFst
    """

    def __init__(self, decimal: GraphFst):
        super().__init__(name="money", kind="verbalize")

        # currency: "<symbol>" -> <symbol> (label and quotes removed).
        currency = (
            pynutil.delete("currency:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_CHAR - " ", 1)
            + pynutil.delete('"')
        )

        # Currency symbol prefixes the (unsigned) decimal number.
        graph = currency + delete_space + decimal.numbers
        self.fst = self.delete_tokens(graph).optimize()
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/ordinal.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
(
DAMO_NOT_QUOTE
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class OrdinalFst(GraphFst):
    """
    Finite state transducer for verbalizing ordinal, e.g.
    ordinal { integer: "13" morphosyntactic_features: "o" } -> 13.º
    """

    def __init__(self):
        super().__init__(name="ordinal", kind="verbalize")

        # integer: "<digits>" -> <digits> (label and quotes removed).
        number = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )

        # Spoken gender/form feature -> written ordinal suffix.
        suffix_table = (("o", ".º"), ("a", ".ª"), ("er", ".ᵉʳ"))
        replace_suffix = pynini.union(
            *[
                pynini.cross(f' morphosyntactic_features: "{feature}"', rendered)
                for feature, rendered in suffix_table
            ]
        )

        graph = number + replace_suffix
        self.fst = self.delete_tokens(graph).optimize()
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/telephone.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
DAMO_NOT_QUOTE
,
GraphFst
from
pynini.lib
import
pynutil
class TelephoneFst(GraphFst):
    """
    Finite state transducer for verbalizing telephone, e.g.
    telephone { number_part: "123-123-5678" }
    -> 123-123-5678
    """

    def __init__(self):
        super().__init__(name="telephone", kind="verbalize")

        # Keep the quoted payload verbatim; drop the field label and quotes.
        open_marker = pynutil.delete('number_part: "')
        close_marker = pynutil.delete('"')
        payload = pynini.closure(DAMO_NOT_QUOTE, 1)

        number_part = open_marker + payload + close_marker
        self.fst = self.delete_tokens(number_part).optimize()
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/time.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
(
DAMO_CHAR
,
DAMO_DIGIT
,
GraphFst
,
delete_space
,
insert_space
,
)
from
pynini.lib
import
pynutil
class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing time,
    e.g. time { hours: "la 1" minutes: "10" } -> la 1:10
    e.g. time { hours: "la 1" minutes: "45" } -> la 1:45
    """

    def __init__(self):
        super().__init__(name="time", kind="verbalize")

        # Pad a lone minutes digit with a leading zero (e.g. 5 -> 05).
        pad_to_two_digits = (DAMO_DIGIT + DAMO_DIGIT) | (
            pynutil.insert("0") + DAMO_DIGIT
        )

        # hours carries the Spanish article as well ("la"/"las") plus digits.
        hour = (
            pynutil.delete("hours:")
            + delete_space
            + pynutil.delete('"')
            + pynini.union("la ", "las ")
            + pynini.closure(DAMO_DIGIT, 1)
            + pynutil.delete('"')
        )

        minute = (
            pynutil.delete("minutes:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_DIGIT, 1)
            + pynutil.delete('"')
        )

        def optional_trailing(label):
            # Optional `<label> "<token>"` rendered after a single space.
            field = (
                delete_space
                + insert_space
                + pynutil.delete(label)
                + delete_space
                + pynutil.delete('"')
                + pynini.closure(DAMO_CHAR - " ", 1)
                + pynutil.delete('"')
            )
            return pynini.closure(field, 0, 1)

        graph = (
            hour
            + delete_space
            + pynutil.insert(":")
            + (minute @ pad_to_two_digits)
            + optional_trailing("suffix:")
            + optional_trailing("zone:")
        )
        self.fst = self.delete_tokens(graph).optimize()
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/verbalize.py
0 → 100644
View file @
70a8a9e0
from
fun_text_processing.inverse_text_normalization.es.verbalizers.cardinal
import
CardinalFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.date
import
DateFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.decimal
import
DecimalFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.electronic
import
ElectronicFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.measure
import
MeasureFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.money
import
MoneyFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.ordinal
import
OrdinalFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.telephone
import
TelephoneFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.time
import
TimeFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.whitelist
import
WhiteListFst
from
fun_text_processing.text_normalization.en.graph_utils
import
GraphFst
class VerbalizeFst(GraphFst):
    """
    Composes other verbalizer grammars.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details to deployment at NeMo/tools/text_processing_deployment.
    """

    def __init__(self):
        super().__init__(name="verbalize", kind="verbalize")
        # Instances (not just .fst) are kept where a grammar is reused as a
        # sub-component of another: decimal and cardinal feed measure/money.
        cardinal = CardinalFst()
        cardinal_graph = cardinal.fst
        ordinal_graph = OrdinalFst().fst
        decimal = DecimalFst()
        decimal_graph = decimal.fst
        measure_graph = MeasureFst(decimal=decimal, cardinal=cardinal).fst
        money_graph = MoneyFst(decimal=decimal).fst
        time_graph = TimeFst().fst
        date_graph = DateFst().fst
        whitelist_graph = WhiteListFst().fst
        telephone_graph = TelephoneFst().fst
        electronic_graph = ElectronicFst().fst
        # Union of all semiotic-class verbalizers.
        graph = (
            time_graph
            | date_graph
            | money_graph
            | measure_graph
            | ordinal_graph
            | decimal_graph
            | cardinal_graph
            | whitelist_graph
            | telephone_graph
            | electronic_graph
        )
        self.fst = graph
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/verbalize_final.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.es.verbalizers.verbalize
import
VerbalizeFst
from
fun_text_processing.inverse_text_normalization.es.verbalizers.word
import
WordFst
from
fun_text_processing.text_normalization.en.graph_utils
import
(
GraphFst
,
delete_extra_space
,
delete_space
,
)
from
pynini.lib
import
pynutil
class VerbalizeFinalFst(GraphFst):
    """
    Finite state transducer that verbalizes an entire sentence, e.g.
    tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now
    """

    def __init__(self):
        super().__init__(name="verbalize_final", kind="verbalize")

        # A token body is either a semiotic-class verbalization or a plain word.
        token_body = VerbalizeFst().fst | WordFst().fst

        # `tokens { ... }` wrapper stripped around one token body.
        single_token = (
            pynutil.delete("tokens")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + token_body
            + delete_space
            + pynutil.delete("}")
        )

        # One or more tokens, single spaces between, outer whitespace removed.
        sentence = (
            delete_space
            + pynini.closure(single_token + delete_extra_space)
            + single_token
            + delete_space
        )
        self.fst = sentence
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/whitelist.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
(
DAMO_CHAR
,
DAMO_SIGMA
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class WhiteListFst(GraphFst):
    """
    Finite state transducer for verbalizing whitelist
    e.g. tokens { name: "uds." } -> uds.
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="verbalize")

        # name: "<token>" -> <token> (label and quotes removed).
        name_token = (
            pynutil.delete("name:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_CHAR - " ", 1)
            + pynutil.delete('"')
        )

        # Normalize non-breaking spaces to plain spaces everywhere in the output.
        nbsp_to_space = pynini.cdrewrite(
            pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA
        )

        self.fst = (name_token @ nbsp_to_space).optimize()
FunASR/fun_text_processing/inverse_text_normalization/es/verbalizers/word.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.text_normalization.en.graph_utils
import
(
DAMO_CHAR
,
DAMO_SIGMA
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class WordFst(GraphFst):
    """
    Finite state transducer for verbalizing plain tokens
    e.g. tokens { name: "sleep" } -> sleep
    """

    def __init__(self):
        super().__init__(name="word", kind="verbalize")

        word_chars = pynini.closure(DAMO_CHAR - " ", 1)

        # name: "<token>" -> <token> (label and quotes removed).
        bare_token = (
            pynutil.delete("name:")
            + delete_space
            + pynutil.delete('"')
            + word_chars
            + pynutil.delete('"')
        )

        # Normalize non-breaking spaces to plain spaces everywhere in the output.
        nbsp_to_space = pynini.cdrewrite(
            pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA
        )

        self.fst = (bare_token @ nbsp_to_space).optimize()
FunASR/fun_text_processing/inverse_text_normalization/export_models.py
0 → 100644
View file @
70a8a9e0
import
os
from
time
import
perf_counter
from
argparse
import
ArgumentParser
from
fun_text_processing.text_normalization.en.graph_utils
import
generator_main
def parse_args():
    """Parse command-line options for the FAR export script."""
    languages = ["de", "en", "es", "fr", "id", "ja", "ko", "pt", "ru", "vi", "zh"]
    parser = ArgumentParser()
    parser.add_argument(
        "--language",
        type=str,
        default="en",
        choices=languages,
        help="language",
    )
    parser.add_argument(
        "--export_dir",
        type=str,
        default="./",
        help="path to export directory. Default to current directory.",
    )
    return parser.parse_args()
def get_grammars(lang: str = "en"):
    """
    Return the ITN grammar FSTs for *lang*.

    The original implementation was a 12-branch if/elif chain whose branches
    differed only in the language-code substring of the import path; it is
    replaced by a single dynamic import, preserving the fall-back-to-English
    behavior of the original ``else`` branch.

    Args:
        lang: two-letter language code; unsupported codes fall back to "en".

    Returns:
        A ``(tagger_fst, verbalizer_fst)`` tuple: the tokenize-and-classify
        FST and the final verbalizer FST for the selected language.
    """
    import importlib

    supported = {"de", "en", "es", "fr", "id", "ja", "ko", "pt", "ru", "vi", "zh"}
    if lang not in supported:
        lang = "en"  # original chain's else branch imported the English grammars

    base = f"fun_text_processing.inverse_text_normalization.{lang}"
    tagger_module = importlib.import_module(base + ".taggers.tokenize_and_classify")
    verbalizer_module = importlib.import_module(base + ".verbalizers.verbalize_final")
    return tagger_module.ClassifyFst().fst, verbalizer_module.VerbalizeFinalFst().fst
if __name__ == "__main__":
    args = parse_args()

    # Make sure the output directory exists before writing FAR files.
    export_dir = args.export_dir
    os.makedirs(export_dir, exist_ok=True)

    tagger_far_file = os.path.join(export_dir, args.language + "_itn_tagger.far")
    verbalizer_far_file = os.path.join(export_dir, args.language + "_itn_verbalizer.far")

    start_time = perf_counter()
    tagger_fst, verbalizer_fst = get_grammars(args.language)
    # Compile and export each grammar under its FAR rule name.
    generator_main(tagger_far_file, {"tokenize_and_classify": tagger_fst})
    generator_main(verbalizer_far_file, {"verbalize": verbalizer_fst})
    print(f"Time to generate graph: {round(perf_counter() - start_time, 2)} sec")
FunASR/fun_text_processing/inverse_text_normalization/fr/README.md
0 → 100644
View file @
70a8a9e0
# Note on French spelling
Due to a 1990 orthographic reform, there are currently two conventions for written French numbers:
1.
**Reformed**
All composite words are joined by a hyphen:
e.g.
`1122 -> mille-cent-vingt-deux`
2.
**Traditional**
Hyphenation only occurs (with exception) for numbers from 17 to 99 (inclusive):
e.g.
`1122 -> mille cent vingt-deux`
As available training data for upstream ASR will vary in use of convention, NeMo's French ITN accommodates either style for normalization e.g.
```
python inverse_normalize.py "mille-cent-vingt-deux" --language="fr" --> 1122
python inverse_normalize.py "mille cent vingt-deux" --language="fr" --> 1122
```
As a result, there exists some ambiguity in the case of currency conversions, namely minor denominations of the dollar e.g.
```
300 -> "trois-cents" # Reformed spelling
300 -> "trois cents" # Traditional spelling
3 ¢ -> "trois cents" # Valid for both
```
Cardinals take priority in such cases.
```
python inverse_normalize.py "trois cents" --language="fr" -> 300
```
FunASR/fun_text_processing/inverse_text_normalization/fr/__init__.py
0 → 100644
View file @
70a8a9e0
from
fun_text_processing.inverse_text_normalization.fr.taggers.tokenize_and_classify
import
(
ClassifyFst
,
)
from
fun_text_processing.inverse_text_normalization.fr.verbalizers.verbalize
import
VerbalizeFst
from
fun_text_processing.inverse_text_normalization.fr.verbalizers.verbalize_final
import
(
VerbalizeFinalFst
,
)
FunASR/fun_text_processing/inverse_text_normalization/fr/data/__init__.py
0 → 100644
View file @
70a8a9e0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/electronic/__init__.py
0 → 100644
View file @
70a8a9e0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/electronic/domain.tsv
0 → 100644
View file @
70a8a9e0
com
es
uk
fr
net
br
in
ru
de
it
edu
co
ar
bo
cl
co
ec
fk
gf
gy
pe
py
sr
ve
uy
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/fr/data/electronic/server_name.tsv
0 → 100644
View file @
70a8a9e0
g mail gmail
gmail
n vidia nvidia
nvidia
outlook
hotmail
yahoo
aol
gmx
msn
live
yandex
orange
wanadoo
web
google
comcast
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/fr/data/electronic/symbols.tsv
0 → 100644
View file @
70a8a9e0
chez @
at @
à @
arobase @
point .
barre oblique /
tiret -
tiret bas _
souligné _
sous-tiret _
blanc souligné _
underscore _
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/fr/data/fractions.tsv
0 → 100644
View file @
70a8a9e0
demie deux
demies deux
demi deux
demis deux
tiers trois
quart quatre
quarts quatre
quatrièmes quatre
quatrième quatre
cinquième cinq
cinquièmes cinq
neuvième neuf
neuvièmes neuf
onzième onze
onzièmes onze
douzième douze
douzièmes douze
treizième treize
treizièmes treize
quatorzième quatorze
quatorzièmes quatorze
quinzième quinze
quinzièmes quinze
seizième seize
seizièmes seize
trentième trente
trentièmes trente
quarantième quarante
quarantièmes quarante
cinquantième cinquante
cinquantièmes cinquante
soixantième soixante
soixantièmes soixante
septantième septante
septantièmes septante
huitantième huitante
huitantièmes huitante
nonantième nonante
nonantièmes nonante
millième mille
millièmes mille
\ No newline at end of file
Prev
1
…
17
18
19
20
21
22
23
24
25
…
42
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment