ModelZoo / Paraformer_FunASR_pytorch · Commits

Commit 70a8a9e0 · authored Oct 03, 2024 by wangwei990215

initial commit

Pipeline #1738 failed in 0 seconds · Changes: 827 · Pipelines: 1
Showing 20 changed files with 927 additions and 0 deletions:

FunASR/fun_text_processing/inverse_text_normalization/pt/taggers/time.py (+223, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/taggers/tokenize_and_classify.py (+102, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/taggers/whitelist.py (+19, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/taggers/word.py (+15, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/utils.py (+13, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/__init__.py (+1, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/cardinal.py (+38, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/date.py (+73, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/decimal.py (+56, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/electronic.py (+45, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/measure.py (+47, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/money.py (+31, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/ordinal.py (+34, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/telephone.py (+22, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/time.py (+70, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/verbalize.py (+48, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/verbalize_final.py (+33, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/whitelist.py (+27, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/word.py (+29, -0)
FunASR/fun_text_processing/inverse_text_normalization/ru/__init__.py (+1, -0)
FunASR/fun_text_processing/inverse_text_normalization/pt/taggers/time.py (new file, mode 100755)
import pynini
from fun_text_processing.inverse_text_normalization.pt.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import (
    GraphFst,
    delete_space,
    insert_space,
)
from pynini.lib import pynutil


class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time
    e.g. quinze pro meio dia -> time { hours: "11" minutes: "45" }
    e.g. quinze pra meia noite -> time { hours: "23" minutes: "45" }
    e.g. quinze pra uma -> time { hours: "12" minutes: "45" }
    e.g. dez pras duas -> time { hours: "1" minutes: "50" }
    e.g. quinze pras duas -> time { hours: "1" minutes: "45" }
    e.g. ao meio dia -> time { hours: "12" minutes: "00" morphosyntactic_features: "ao" }
    e.g. ao meio dia e meia -> time { hours: "12" minutes: "30" morphosyntactic_features: "ao" }
    e.g. ao meio dia e meio -> time { hours: "12" minutes: "30" morphosyntactic_features: "ao" }
    e.g. à meia noite e quinze -> time { hours: "0" minutes: "15" morphosyntactic_features: "à" }
    e.g. à meia noite e meia -> time { hours: "0" minutes: "30" morphosyntactic_features: "à" }
    e.g. à uma e trinta -> time { hours: "1" minutes: "30" morphosyntactic_features: "à" }
    e.g. às onze e trinta -> time { hours: "11" minutes: "30" morphosyntactic_features: "às" }
    e.g. às três horas e trinta minutos -> time { hours: "3" minutes: "30" morphosyntactic_features: "às" }
    """

    def __init__(self):
        super().__init__(name="time", kind="classify")

        # graph_hour_to_am = pynini.string_file(get_abs_path("data/time/hour_to_am.tsv"))
        # graph_hour_to_pm = pynini.string_file(get_abs_path("data/time/hour_to_pm.tsv"))
        graph_hours_to = pynini.string_file(get_abs_path("data/time/hours_to.tsv"))
        graph_minutes_to = pynini.string_file(get_abs_path("data/time/minutes_to.tsv"))
        graph_suffix_am = pynini.string_file(get_abs_path("data/time/time_suffix_am.tsv"))
        graph_suffix_pm = pynini.string_file(get_abs_path("data/time/time_suffix_pm.tsv"))

        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_twenties = pynini.string_file(get_abs_path("data/numbers/twenties.tsv"))

        graph_1_to_100 = pynini.union(
            graph_digit,
            graph_twenties,
            graph_teen,
            (graph_ties + pynutil.insert("0")),
            (graph_ties + pynutil.delete(" e ") + graph_digit),
        )

        # note that graph_hour will start from 2 hours
        # "1 o'clock" will be treated differently because it is singular
        digits_2_to_23 = [str(digits) for digits in range(2, 24)]
        digits_1_to_59 = [str(digits) for digits in range(1, 60)]

        graph_2_to_23 = graph_1_to_100 @ pynini.union(*digits_2_to_23)
        graph_1_to_59 = graph_1_to_100 @ pynini.union(*digits_1_to_59)
        graph_uma = pynini.cross("uma", "1")

        # Mapping 'horas'
        graph_hour = pynutil.delete(pynini.accep("hora") + pynini.accep("s").ques)
        graph_minute = pynutil.delete(pynini.accep("minuto") + pynini.accep("s").ques)

        # Mapping 'meio dia' and 'meia noite'
        graph_meio_dia = pynini.cross("meio dia", "12")
        graph_meia_noite = pynini.cross("meia noite", "0")

        # Mapping 'e meia'
        graph_e = delete_space + pynutil.delete(" e ") + delete_space
        graph_e_meia = graph_e + pynini.cross("meia", "30")
        graph_e_meio = graph_e + pynini.cross("meio", "30")

        # à uma e meia -> 1:30
        # às três e meia -> 3:30
        graph_hours_at_prefix_singular = (
            pynutil.insert('morphosyntactic_features: "')
            + (pynini.cross("à", "à") | pynini.cross("a", "à"))
            + pynutil.insert('" ')
            + delete_space
        )
        graph_hours_at_singular = (
            graph_hours_at_prefix_singular
            + pynutil.insert('hours: "')
            + graph_uma
            + pynutil.insert('"')
            + (delete_space + graph_hour).ques
        )
        graph_hours_at_prefix_plural = (
            pynutil.insert('morphosyntactic_features: "')
            + (pynini.cross("às", "às") | pynini.cross("as", "às"))
            + pynutil.insert('" ')
            + delete_space
        )
        graph_hours_at_plural = (
            graph_hours_at_prefix_plural
            + pynutil.insert('hours: "')
            + graph_2_to_23
            + pynutil.insert('"')
            + (delete_space + graph_hour).ques
        )
        final_graph_hour_at = graph_hours_at_singular | graph_hours_at_plural

        graph_minutes_component_without_zero = (
            graph_e + graph_1_to_59 + (delete_space + graph_minute).ques
        )
        graph_minutes_component_without_zero |= (
            graph_e_meia + pynutil.delete(delete_space + pynini.accep("hora")).ques
        )
        final_graph_minute = (
            pynutil.insert(' minutes: "')
            + graph_minutes_component_without_zero
            + pynutil.insert('"')
        )
        graph_hm = final_graph_hour_at + final_graph_minute

        # à uma hora -> 1:00
        graph_hours_at_singular_with_hour = (
            graph_hours_at_prefix_singular
            + pynutil.insert('hours: "')
            + graph_uma
            + pynutil.insert('"')
            + delete_space
            + graph_hour
        )
        graph_hours_at_plural_with_hour = (
            graph_hours_at_prefix_plural
            + pynutil.insert('hours: "')
            + graph_2_to_23
            + pynutil.insert('"')
            + delete_space
            + graph_hour
        )
        graph_hm |= (
            graph_hours_at_singular_with_hour | graph_hours_at_plural_with_hour
        ) + pynutil.insert(' minutes: "00"', weight=0.2)

        # meio dia e meia -> 12:30
        # meia noite e meia -> 0:30
        graph_minutes_without_zero = (
            pynutil.insert(' minutes: "')
            + graph_minutes_component_without_zero
            + pynutil.insert('"')
        )
        graph_meio_min = (
            pynutil.insert('hours: "')
            + (graph_meio_dia | graph_meia_noite)
            + pynutil.insert('"')
            + graph_minutes_without_zero
        )
        graph_meio_min |= (
            pynutil.insert('hours: "')
            + graph_meio_dia
            + pynutil.insert('" minutes: "')
            + graph_e_meio
            + pynutil.insert('"')
        )
        graph_hm |= graph_meio_min

        # às quinze para as quatro -> às 3:45
        # NOTE: case 'para à uma' ('to one') could be either 0:XX or 12:XX
        # leading to wrong reading ('meio dia e ...' or 'meia noite e ...')
        graph_para_a = (
            pynutil.delete("para")
            | pynutil.delete("para a")
            | pynutil.delete("para as")
            | pynutil.delete("pra")
            | pynutil.delete("pras")
        )
        graph_para_o = pynutil.delete("para") | pynutil.delete("para o") | pynutil.delete("pro")

        graph_pra_min = (
            pynutil.insert('morphosyntactic_features: "')
            + (
                pynini.cross("à", "à")
                | pynini.cross("às", "às")
                | pynini.cross("a", "à")
                | pynini.cross("as", "às")
            )
            + pynutil.insert('" ')
            + delete_space
        )
        graph_pra_min += (
            pynutil.insert('minutes: "')
            + (graph_1_to_59 @ graph_minutes_to)
            + pynutil.insert('" ')
            + (delete_space + graph_minute).ques
        )
        graph_pra_hour = (
            pynutil.insert('hours: "')
            + (graph_2_to_23 @ graph_hours_to)
            + pynutil.insert('"')
            + (delete_space + graph_hour).ques
        )
        graph_pra_hour |= (
            pynutil.insert('hours: "') + (graph_meia_noite @ graph_hours_to) + pynutil.insert('"')
        )
        graph_pra = graph_pra_min + delete_space + graph_para_a + delete_space + graph_pra_hour

        # às quinze pro meio dia -> às 11:45
        graph_pro = graph_pra_min + delete_space + graph_para_o + delete_space
        graph_pro += (
            pynutil.insert(' hours: "') + (graph_meio_dia @ graph_hours_to) + pynutil.insert('"')
        )
        graph_mh = graph_pra | graph_pro

        # optional suffix
        final_suffix = (
            pynutil.insert('suffix: "') + (graph_suffix_am | graph_suffix_pm) + pynutil.insert('"')
        )
        final_suffix_optional = pynini.closure(delete_space + insert_space + final_suffix, 0, 1)

        final_graph = pynini.union((graph_hm | graph_mh) + final_suffix_optional).optimize()

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
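A quick way to sanity-check a classifier grammar like this one is to compose an input string with the FST and print the single best path. The snippet below is an illustrative sketch, not part of this commit; it assumes pynini is installed and the data/*.tsv files referenced above resolve.

import pynini

from fun_text_processing.inverse_text_normalization.pt.taggers.time import TimeFst

def apply_fst(text, fst):
    # Compose the input with the grammar and keep the lowest-weight path.
    lattice = pynini.compose(text, fst)
    return pynini.shortestpath(lattice, nshortest=1, unique=True).string()

print(apply_fst("às onze e trinta", TimeFst().fst))
# per the docstring: time { hours: "11" minutes: "30" morphosyntactic_features: "às" }
# (the serialized field order may differ)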
FunASR/fun_text_processing/inverse_text_normalization/pt/taggers/tokenize_and_classify.py (new file, mode 100644)
import os

import pynini
from fun_text_processing.inverse_text_normalization.pt.taggers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.pt.taggers.date import DateFst
from fun_text_processing.inverse_text_normalization.pt.taggers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.pt.taggers.electronic import ElectronicFst
from fun_text_processing.inverse_text_normalization.pt.taggers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.pt.taggers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.pt.taggers.ordinal import OrdinalFst
from fun_text_processing.inverse_text_normalization.pt.taggers.punctuation import PunctuationFst
from fun_text_processing.inverse_text_normalization.pt.taggers.telephone import TelephoneFst
from fun_text_processing.inverse_text_normalization.pt.taggers.time import TimeFst
from fun_text_processing.inverse_text_normalization.pt.taggers.whitelist import WhiteListFst
from fun_text_processing.inverse_text_normalization.pt.taggers.word import WordFst
from fun_text_processing.text_normalization.en.graph_utils import (
    GraphFst,
    delete_extra_space,
    delete_space,
    generator_main,
)
from pynini.lib import pynutil

import logging


class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an entire sentence that is lowercased.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details on deployment at NeMo/tools/text_processing_deployment.

    Args:
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
    """

    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
        super().__init__(name="tokenize_and_classify", kind="classify")

        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "_pt_itn.far")
        if not overwrite_cache and far_file and os.path.exists(far_file):
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            logging.info(f"ClassifyFst.fst was restored from {far_file}.")
        else:
            logging.info("Creating ClassifyFst grammars.")
            cardinal = CardinalFst(use_strict_e=True)
            cardinal_graph = cardinal.fst

            ordinal_graph = OrdinalFst().fst

            decimal = DecimalFst(cardinal)
            decimal_graph = decimal.fst

            measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
            date_graph = DateFst(cardinal=cardinal).fst
            word_graph = WordFst().fst
            time_graph = TimeFst().fst
            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
            whitelist_graph = WhiteListFst().fst
            punct_graph = PunctuationFst().fst
            electronic_graph = ElectronicFst().fst
            telephone_graph = TelephoneFst().fst

            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.09)
                | pynutil.add_weight(date_graph, 1.09)
                | pynutil.add_weight(decimal_graph, 1.09)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(money_graph, 1.1)
                | pynutil.add_weight(telephone_graph, 1.1)
                | pynutil.add_weight(electronic_graph, 1.1)
                | pynutil.add_weight(word_graph, 100)
            )

            punct = (
                pynutil.insert("tokens { ")
                + pynutil.add_weight(punct_graph, weight=1.1)
                + pynutil.insert(" }")
            )
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" "))
                + token
                + pynini.closure(pynutil.insert(" ") + punct)
            )

            graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info(f"ClassifyFst grammars are saved to {far_file}.")
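In the classify union, pynutil.add_weight sets the path cost used by shortest-path search: lower is preferred, so the whitelist (1.01) outranks the semiotic classes (1.09 to 1.1), and the catch-all word graph (100) only fires when nothing else matches. A hypothetical smoke test, assuming the remaining taggers from this commit are importable and their data files resolve:

import pynini

from fun_text_processing.inverse_text_normalization.pt.taggers.tokenize_and_classify import ClassifyFst

classify = ClassifyFst(cache_dir=None).fst
# "às onze e trinta" should be claimed by the time grammar (weight 1.09),
# not by the word fallback (weight 100).
lattice = pynini.compose("às onze e trinta", classify)
print(pynini.shortestpath(lattice).string())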
FunASR/fun_text_processing/inverse_text_normalization/pt/taggers/whitelist.py (new file, mode 100644)
import pynini
from fun_text_processing.inverse_text_normalization.pt.utils import get_abs_path
from fun_text_processing.text_normalization.en.graph_utils import GraphFst, convert_space
from pynini.lib import pynutil


class WhiteListFst(GraphFst):
    """
    Finite state transducer for classifying whitelisted tokens
    e.g. usted -> tokens { name: "ud." }
    This class has the highest priority among all classifier grammars. Whitelisted tokens are defined in and loaded from "data/whitelist.tsv".
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="classify")

        whitelist = pynini.string_file(get_abs_path("data/whitelist.tsv")).invert()
        graph = pynutil.insert('name: "') + convert_space(whitelist) + pynutil.insert('"')
        self.fst = graph.optimize()
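pynini.string_file reads a two-column TSV mapping input to output; because the grammar calls .invert(), the file is expected to list the written form first and the spoken form second. A hypothetical row consistent with the docstring example (the actual data/whitelist.tsv is not shown in this diff):

# data/whitelist.tsv (hypothetical): written form <TAB> spoken form
# ud.	usted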
FunASR/fun_text_processing/inverse_text_normalization/pt/taggers/word.py (new file, mode 100644)
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_SPACE, GraphFst
from pynini.lib import pynutil


class WordFst(GraphFst):
    """
    Finite state transducer for classifying plain tokens that do not belong to any special class. This can be considered the default class.
    e.g. sleep -> tokens { name: "sleep" }
    """

    def __init__(self):
        super().__init__(name="word", kind="classify")
        word = pynutil.insert('name: "') + pynini.closure(DAMO_NOT_SPACE, 1) + pynutil.insert('"')
        self.fst = word.optimize()
FunASR/fun_text_processing/inverse_text_normalization/pt/utils.py (new file, mode 100644)
import os


def get_abs_path(rel_path):
    """
    Get absolute path

    Args:
        rel_path: relative path to this file

    Returns absolute path
    """
    return os.path.dirname(os.path.abspath(__file__)) + "/" + rel_path
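For example (illustrative call, resolved relative to this utils.py):

get_abs_path("data/time/hours_to.tsv")
# -> "<...>/fun_text_processing/inverse_text_normalization/pt/data/time/hours_to.tsv"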
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/__init__.py (new file, mode 100644)
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/cardinal.py (new file, mode 100644)
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
    DAMO_NOT_QUOTE,
    GraphFst,
    delete_space,
)
from pynini.lib import pynutil


class CardinalFst(GraphFst):
    """
    Finite state transducer for verbalizing cardinal
    e.g. cardinal { negative: "-" integer: "23" } -> -23
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="verbalize")
        optional_sign = pynini.closure(
            pynutil.delete("negative:")
            + delete_space
            + pynutil.delete('"')
            + DAMO_NOT_QUOTE
            + pynutil.delete('"')
            + delete_space,
            0,
            1,
        )
        graph = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        self.numbers = graph
        graph = optional_sign + graph
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
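Verbalizers consume the serialized token fields and strip the field names and quotes. Composing the docstring example through the FST is a handy check; this sketch is illustrative and not part of the commit:

import pynini

from fun_text_processing.inverse_text_normalization.pt.verbalizers.cardinal import CardinalFst

fst = CardinalFst().fst
lattice = pynini.compose('cardinal { negative: "-" integer: "23" }', fst)
print(pynini.shortestpath(lattice).string())  # -23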
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/date.py (new file, mode 100644)
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
    DAMO_NOT_QUOTE,
    GraphFst,
    delete_extra_space,
    delete_space,
    insert_space,
)
from pynini.lib import pynutil


class DateFst(GraphFst):
    """
    Finite state transducer for verbalizing date, e.g.
    date { day: "1" month: "enero" preserve_order: true } -> 1 de enero
    """

    def __init__(self):
        super().__init__(name="date", kind="verbalize")
        month = (
            pynutil.delete("month:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        day = (
            pynutil.delete("day:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        year = (
            pynutil.delete("year:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )

        # day month
        graph_dmy = (
            day
            + delete_extra_space
            + pynutil.insert("de")
            + insert_space
            + month
            + (delete_extra_space + pynutil.insert("de") + insert_space + year).ques
        )
        graph_dmy |= (
            day
            + delete_space
            + pynutil.insert("/")
            + month
            + pynutil.delete(' morphosyntactic_features: "/"')
            + (delete_space + pynutil.insert("/") + year).ques
        )

        optional_preserve_order = pynini.closure(
            pynutil.delete("preserve_order:") + delete_space + pynutil.delete("true") + delete_space
            | pynutil.delete("field_order:")
            + delete_space
            + pynutil.delete('"')
            + DAMO_NOT_QUOTE
            + pynutil.delete('"')
            + delete_space
        )

        final_graph = graph_dmy + delete_space + optional_preserve_order
        delete_tokens = self.delete_tokens(final_graph)
        self.fst = delete_tokens.optimize()
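The first branch renders spelled-out dates ("1 de enero"); the second renders slash dates when the tagger emitted morphosyntactic_features: "/". An illustrative check against the docstring example (not part of the commit):

import pynini

from fun_text_processing.inverse_text_normalization.pt.verbalizers.date import DateFst

fst = DateFst().fst
lattice = pynini.compose('date { day: "1" month: "enero" preserve_order: true }', fst)
print(pynini.shortestpath(lattice).string())  # 1 de enero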
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/decimal.py (new file, mode 100644)
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
    DAMO_NOT_QUOTE,
    GraphFst,
    delete_space,
)
from pynini.lib import pynutil


class DecimalFst(GraphFst):
    """
    Finite state transducer for verbalizing decimal,
    e.g. decimal { negative: "true" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" } -> -1,26
    e.g. decimal { negative: "true" integer_part: "1" morphosyntactic_features: "." fractional_part: "26" } -> -1.26
    e.g. decimal { negative: "false" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" quantity: "millón" } -> 1,26 millón
    e.g. decimal { negative: "false" integer_part: "2" quantity: "millones" } -> 2 millones
    """

    def __init__(self):
        super().__init__(name="decimal", kind="verbalize")
        optional_sign = pynini.closure(pynini.cross('negative: "true"', "-") + delete_space, 0, 1)
        integer = (
            pynutil.delete("integer_part:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        optional_integer = pynini.closure(integer + delete_space, 0, 1)
        decimal_point = pynini.cross('morphosyntactic_features: ","', ",")
        decimal_point |= pynini.cross('morphosyntactic_features: "."', ".")
        fractional = (
            decimal_point
            + delete_space
            + pynutil.delete("fractional_part:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        optional_fractional = pynini.closure(fractional + delete_space, 0, 1)
        quantity = (
            pynutil.delete("quantity:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        optional_quantity = pynini.closure(pynutil.insert(" ") + quantity + delete_space, 0, 1)
        graph = optional_integer + optional_fractional + optional_quantity
        self.numbers = graph
        graph = optional_sign + graph
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
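The decimal separator itself travels in morphosyntactic_features, so the same grammar verbalizes both comma and dot decimals. An illustrative check (not part of the commit):

import pynini

from fun_text_processing.inverse_text_normalization.pt.verbalizers.decimal import DecimalFst

fst = DecimalFst().fst
s = 'decimal { negative: "true" integer_part: "1" morphosyntactic_features: "," fractional_part: "26" }'
print(pynini.shortestpath(pynini.compose(s, fst)).string())  # -1,26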
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/electronic.py (new file, mode 100644)
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
    DAMO_NOT_QUOTE,
    GraphFst,
    delete_space,
)
from pynini.lib import pynutil


class ElectronicFst(GraphFst):
    """
    Finite state transducer for verbalizing electronic
    e.g. tokens { electronic { username: "cdf1" domain: "abc.edu" } } -> cdf1@abc.edu
    e.g. tokens { electronic { protocol: "www.abc.edu" } } -> www.abc.edu
    """

    def __init__(self):
        super().__init__(name="electronic", kind="verbalize")
        user_name = (
            pynutil.delete("username:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        domain = (
            pynutil.delete("domain:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        protocol = (
            pynutil.delete("protocol:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        graph = user_name + delete_space + pynutil.insert("@") + domain
        graph |= protocol
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
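Note the docstring shows the outer tokens { ... } wrapper, which is stripped later by VerbalizeFinalFst; this class alone consumes the bare electronic { ... } fields. An illustrative check (not part of the commit):

import pynini

from fun_text_processing.inverse_text_normalization.pt.verbalizers.electronic import ElectronicFst

fst = ElectronicFst().fst
lattice = pynini.compose('electronic { username: "cdf1" domain: "abc.edu" }', fst)
print(pynini.shortestpath(lattice).string())  # cdf1@abc.edu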
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/measure.py (new file, mode 100644)
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_CHAR, GraphFst, delete_space
from pynini.lib import pynutil


class MeasureFst(GraphFst):
    """
    Finite state transducer for verbalizing measure, e.g.
    measure { cardinal { negative: "true" integer: "12" } units: "kg" } -> -12 kg

    Args:
        decimal: DecimalFst
        cardinal: CardinalFst
    """

    def __init__(self, decimal: GraphFst, cardinal: GraphFst):
        super().__init__(name="measure", kind="verbalize")
        optional_sign = pynini.closure(pynini.cross('negative: "true"', "-"), 0, 1)
        unit = (
            pynutil.delete("units:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_CHAR - " ", 1)
            + pynutil.delete('"')
            + delete_space
        )
        graph_decimal = (
            pynutil.delete("decimal {")
            + delete_space
            + optional_sign
            + delete_space
            + decimal.numbers
            + delete_space
            + pynutil.delete("}")
        )
        graph_cardinal = (
            pynutil.delete("cardinal {")
            + delete_space
            + optional_sign
            + delete_space
            + cardinal.numbers
            + delete_space
            + pynutil.delete("}")
        )
        graph = (graph_cardinal | graph_decimal) + delete_space + pynutil.insert(" ") + unit
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
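MeasureFst composes the unsigned .numbers subgraphs exposed by the cardinal and decimal verbalizers rather than their full .fst, handling the sign itself. An illustrative check against the docstring example (not part of the commit):

import pynini

from fun_text_processing.inverse_text_normalization.pt.verbalizers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.measure import MeasureFst

fst = MeasureFst(decimal=DecimalFst(), cardinal=CardinalFst()).fst
s = 'measure { cardinal { negative: "true" integer: "12" } units: "kg" }'
print(pynini.shortestpath(pynini.compose(s, fst)).string())  # -12 kg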
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/money.py (new file, mode 100644)
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
    DAMO_CHAR,
    GraphFst,
    delete_space,
    insert_space,
)
from pynini.lib import pynutil


class MoneyFst(GraphFst):
    """
    Finite state transducer for verbalizing money, e.g.
    money { integer_part: "12" morphosyntactic_features: "," fractional_part: "05" currency: "$" } -> $12,05

    Args:
        decimal: DecimalFst
    """

    def __init__(self, decimal: GraphFst):
        super().__init__(name="money", kind="verbalize")
        unit = (
            pynutil.delete("currency:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_CHAR - " ", 1)
            + pynutil.delete('"')
        )
        graph = unit + delete_space + insert_space + decimal.numbers
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/ordinal.py (new file, mode 100644)
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
    DAMO_NOT_QUOTE,
    GraphFst,
    delete_space,
)
from pynini.lib import pynutil


class OrdinalFst(GraphFst):
    """
    Finite state transducer for verbalizing ordinal, e.g.
    ordinal { integer: "13" morphosyntactic_features: "o" } -> 13º
    """

    def __init__(self):
        super().__init__(name="ordinal", kind="verbalize")
        graph = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        replace_suffix = pynini.union(
            pynini.cross(' morphosyntactic_features: "o"', "º"),
            pynini.cross(' morphosyntactic_features: "a"', "ª"),
        )
        graph = graph + replace_suffix
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
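The gender marker arrives in morphosyntactic_features and is rewritten to the ordinal indicator (º or ª). An illustrative check (not part of the commit):

import pynini

from fun_text_processing.inverse_text_normalization.pt.verbalizers.ordinal import OrdinalFst

fst = OrdinalFst().fst
lattice = pynini.compose('ordinal { integer: "13" morphosyntactic_features: "o" }', fst)
print(pynini.shortestpath(lattice).string())  # 13º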
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/telephone.py (new file, mode 100644)
import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil


class TelephoneFst(GraphFst):
    """
    Finite state transducer for verbalizing telephone, e.g.
    telephone { number_part: "123-123-5678" } -> 123-123-5678
    """

    def __init__(self):
        super().__init__(name="telephone", kind="verbalize")
        number_part = (
            pynutil.delete('number_part: "')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        delete_tokens = self.delete_tokens(number_part)
        self.fst = delete_tokens.optimize()
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/time.py (new file, mode 100755)
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
    DAMO_DIGIT,
    DAMO_NOT_QUOTE,
    GraphFst,
    delete_space,
    insert_space,
)
from pynini.lib import pynutil


class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing time,
    e.g. time { hours: "à 1" minutes: "10" } -> à 1:10
    e.g. time { hours: "às 2" minutes: "45" } -> às 2:45
    """

    def __init__(self):
        super().__init__(name="time", kind="verbalize")
        add_leading_zero_to_double_digit = (DAMO_DIGIT + DAMO_DIGIT) | (
            pynutil.insert("0") + DAMO_DIGIT
        )
        prefix = (
            pynutil.delete("morphosyntactic_features:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
            + delete_space
            + insert_space
        )
        optional_prefix = pynini.closure(prefix, 0, 1)
        hour = (
            pynutil.delete("hours:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_DIGIT, 1)
            + pynutil.delete('"')
        )
        minute = (
            pynutil.delete("minutes:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_DIGIT, 1)
            + pynutil.delete('"')
        )
        suffix = (
            delete_space
            + insert_space
            + pynutil.delete("suffix:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        optional_suffix = pynini.closure(suffix, 0, 1)
        graph = (
            optional_prefix
            + hour
            + delete_space
            + pynutil.insert(":")
            + (minute @ add_leading_zero_to_double_digit)
            + optional_suffix
        )
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
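Composing minutes with add_leading_zero_to_double_digit pads single-digit minutes, so "5" comes out as "05". An illustrative check (not part of the commit; the field order here follows the grammar, with morphosyntactic_features first):

import pynini

from fun_text_processing.inverse_text_normalization.pt.verbalizers.time import TimeFst

fst = TimeFst().fst
lattice = pynini.compose('time { morphosyntactic_features: "às" hours: "2" minutes: "5" }', fst)
print(pynini.shortestpath(lattice).string())  # às 2:05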
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/verbalize.py (new file, mode 100644)
from fun_text_processing.inverse_text_normalization.pt.verbalizers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.date import DateFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.electronic import ElectronicFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.ordinal import OrdinalFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.telephone import TelephoneFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.time import TimeFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.whitelist import WhiteListFst
from fun_text_processing.text_normalization.en.graph_utils import GraphFst


class VerbalizeFst(GraphFst):
    """
    Composes other verbalizer grammars.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details on deployment at NeMo/tools/text_processing_deployment.
    """

    def __init__(self):
        super().__init__(name="verbalize", kind="verbalize")
        cardinal = CardinalFst()
        cardinal_graph = cardinal.fst
        ordinal_graph = OrdinalFst().fst
        decimal = DecimalFst()
        decimal_graph = decimal.fst
        measure_graph = MeasureFst(decimal=decimal, cardinal=cardinal).fst
        money_graph = MoneyFst(decimal=decimal).fst
        time_graph = TimeFst().fst
        date_graph = DateFst().fst
        whitelist_graph = WhiteListFst().fst
        telephone_graph = TelephoneFst().fst
        electronic_graph = ElectronicFst().fst
        graph = (
            time_graph
            | date_graph
            | money_graph
            | measure_graph
            | ordinal_graph
            | decimal_graph
            | cardinal_graph
            | whitelist_graph
            | telephone_graph
            | electronic_graph
        )
        self.fst = graph
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/verbalize_final.py (new file, mode 100644)
import pynini
from fun_text_processing.inverse_text_normalization.pt.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.inverse_text_normalization.pt.verbalizers.word import WordFst
from fun_text_processing.text_normalization.en.graph_utils import (
    GraphFst,
    delete_extra_space,
    delete_space,
)
from pynini.lib import pynutil


class VerbalizeFinalFst(GraphFst):
    """
    Finite state transducer that verbalizes an entire sentence, e.g.
    tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now
    """

    def __init__(self):
        super().__init__(name="verbalize_final", kind="verbalize")
        verbalize = VerbalizeFst().fst
        word = WordFst().fst
        types = verbalize | word
        graph = (
            pynutil.delete("tokens")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + types
            + delete_space
            + pynutil.delete("}")
        )
        graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
        self.fst = graph
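An illustrative end-to-end check using the docstring example (not part of the commit; assumes all the verbalizers above are importable):

import pynini

from fun_text_processing.inverse_text_normalization.pt.verbalizers.verbalize_final import VerbalizeFinalFst

fst = VerbalizeFinalFst().fst
s = 'tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" }'
print(pynini.shortestpath(pynini.compose(s, fst)).string())  # its 12:30 now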
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/whitelist.py (new file, mode 100644)
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
    DAMO_CHAR,
    DAMO_SIGMA,
    GraphFst,
    delete_space,
)
from pynini.lib import pynutil


class WhiteListFst(GraphFst):
    """
    Finite state transducer for verbalizing whitelist
    e.g. tokens { name: "sexta feira" } -> "sexta-feira"
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="verbalize")
        graph = (
            pynutil.delete("name:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_CHAR - " ", 1)
            + pynutil.delete('"')
        )
        graph = graph @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)
        self.fst = graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/pt/verbalizers/word.py (new file, mode 100644)
import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
    DAMO_CHAR,
    DAMO_SIGMA,
    GraphFst,
    delete_space,
)
from pynini.lib import pynutil


class WordFst(GraphFst):
    """
    Finite state transducer for verbalizing plain tokens
    e.g. tokens { name: "sleep" } -> sleep
    """

    def __init__(self):
        super().__init__(name="word", kind="verbalize")
        chars = pynini.closure(DAMO_CHAR - " ", 1)
        char = (
            pynutil.delete("name:")
            + delete_space
            + pynutil.delete('"')
            + chars
            + pynutil.delete('"')
        )
        graph = char @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)
        self.fst = graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/ru/__init__.py (new file, mode 100644)