OpenDAS / FunASR
Commit 431278fa, authored Nov 22, 2024 by “change”

Initial commit

Parent: 8c252776
Pipeline #1949 failed with stages in 0 seconds
Changes: 788 files
Showing 8 changed files with 303 additions and 0 deletions (+303, -0):
fun_text_processing/inverse_text_normalization/ru/verbalizers/money.py (+21, -0)
fun_text_processing/inverse_text_normalization/ru/verbalizers/ordinal.py (+22, -0)
fun_text_processing/inverse_text_normalization/ru/verbalizers/telephone.py (+21, -0)
fun_text_processing/inverse_text_normalization/ru/verbalizers/time.py (+40, -0)
fun_text_processing/inverse_text_normalization/ru/verbalizers/verbalize.py (+50, -0)
fun_text_processing/inverse_text_normalization/ru/verbalizers/verbalize_final.py (+33, -0)
fun_text_processing/inverse_text_normalization/run_evaluate.py (+109, -0)
fun_text_processing/inverse_text_normalization/tl/__init__.py (+7, -0)
fun_text_processing/inverse_text_normalization/ru/verbalizers/money.py (new file, mode 100644)

import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil


class MoneyFst(GraphFst):
    """
    Finite state transducer for verbalizing money
    e.g. money { integer_part: "2 руб." } -> "2 руб."
    """

    def __init__(self):
        super().__init__(name="money", kind="verbalize")
        graph = (
            pynutil.delete('integer_part: "')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
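
For a quick sanity check, a verbalizer like this can be exercised with pynini's shortest-path idiom. A minimal sketch, assuming pynini is installed and that delete_tokens (inherited from GraphFst) strips the surrounding money { ... } frame, as in NeMo-style grammars:

import pynini

from fun_text_processing.inverse_text_normalization.ru.verbalizers.money import MoneyFst

# Compose a tagged token with the verbalizer FST and decode the shortest path.
fst = MoneyFst().fst
tagged = 'money { integer_part: "2 руб." }'
print(pynini.shortestpath(tagged @ fst).string())  # expected: 2 руб.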
fun_text_processing/inverse_text_normalization/ru/verbalizers/ordinal.py (new file, mode 100644)

import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil


class OrdinalFst(GraphFst):
    """
    Finite state transducer for verbalizing ordinal numbers
    e.g. ordinal { integer: "2" } -> "2"

    Args:
        deterministic: if True, provides a single transduction option;
            if False, multiple transductions are generated (used for audio-based normalization)
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="ordinal", kind="verbalize", deterministic=deterministic)
        value = pynini.closure(DAMO_NOT_QUOTE)
        graph = pynutil.delete('integer: "') + value + pynutil.delete('"')
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
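
The deterministic flag only controls how many transduction options survive; usage is otherwise the same as for the money verbalizer. A minimal sketch under the same assumptions:

import pynini

from fun_text_processing.inverse_text_normalization.ru.verbalizers.ordinal import OrdinalFst

# deterministic=True keeps a single transduction option per input.
fst = OrdinalFst(deterministic=True).fst
print(pynini.shortestpath('ordinal { integer: "2" }' @ fst).string())  # expected: 2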
fun_text_processing/inverse_text_normalization/ru/verbalizers/telephone.py (new file, mode 100644)

import pynini
from fun_text_processing.text_normalization.en.graph_utils import DAMO_NOT_QUOTE, GraphFst
from pynini.lib import pynutil


class TelephoneFst(GraphFst):
    """
    Finite state transducer for verbalizing telephone
    e.g. telephone { number_part: "8-913-983-56-01" } -> "8-913-983-56-01"
    """

    def __init__(self):
        super().__init__(name="telephone", kind="verbalize")
        graph = (
            pynutil.delete('number_part: "')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
fun_text_processing/inverse_text_normalization/ru/verbalizers/time.py (new file, mode 100644)

import pynini
from fun_text_processing.text_normalization.en.graph_utils import (
    DAMO_NOT_QUOTE,
    GraphFst,
    delete_space,
)
from pynini.lib import pynutil


class TimeFst(GraphFst):
    """
    Finite state transducer for verbalizing time
    e.g. time { hours: "02:15" } -> "02:15"
    """

    def __init__(self):
        super().__init__(name="time", kind="verbalize")
        hour = (
            pynutil.delete("hours: ")
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        minutes = (
            pynutil.delete("minutes: ")
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        graph_preserve_order = (
            pynutil.delete('hours: "')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        # for cases that require permutations for the correct verbalization
        graph_reverse_order = hour + delete_space + pynutil.insert(":") + minutes + delete_space
        graph = graph_preserve_order | graph_reverse_order
        delete_tokens = self.delete_tokens(graph)
        self.fst = delete_tokens.optimize()
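
The union covers both input shapes: a pre-joined hours field passes through unchanged, while separate hours and minutes fields are joined with ":". A minimal sketch, assuming pynini is installed:

import pynini

from fun_text_processing.inverse_text_normalization.ru.verbalizers.time import TimeFst

fst = TimeFst().fst
# Pre-joined field, handled by graph_preserve_order:
print(pynini.shortestpath('time { hours: "02:15" }' @ fst).string())  # expected: 02:15
# Separate fields, joined by graph_reverse_order:
print(pynini.shortestpath('time { hours: "12" minutes: "30" }' @ fst).string())  # expected: 12:30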
fun_text_processing/inverse_text_normalization/ru/verbalizers/verbalize.py (new file, mode 100644)

from fun_text_processing.inverse_text_normalization.en.verbalizers.whitelist import WhiteListFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.cardinal import CardinalFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.date import DateFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.decimal import DecimalFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.electronic import ElectronicFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.measure import MeasureFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.money import MoneyFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.ordinal import OrdinalFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.telephone import TelephoneFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.time import TimeFst
from fun_text_processing.text_normalization.en.graph_utils import GraphFst


class VerbalizeFst(GraphFst):
    """
    Composes other verbalizer grammars.
    For deployment, this grammar will be compiled and exported to an OpenFst Finite State Archive (FAR) file.
    More details on deployment at NeMo/tools/text_processing_deployment.
    """

    def __init__(self):
        super().__init__(name="verbalize", kind="verbalize")
        cardinal = CardinalFst()
        cardinal_graph = cardinal.fst
        ordinal = OrdinalFst()
        ordinal_graph = ordinal.fst
        decimal = DecimalFst()
        decimal_graph = decimal.fst
        whitelist_graph = WhiteListFst().fst
        electronic_graph = ElectronicFst().fst
        money_graph = MoneyFst().fst
        date_graph = DateFst().fst
        measure_graph = MeasureFst().fst
        telephone_graph = TelephoneFst().fst
        time_graph = TimeFst().fst
        graph = (
            whitelist_graph
            | cardinal_graph
            | ordinal_graph
            | decimal_graph
            | electronic_graph
            | date_graph
            | money_graph
            | measure_graph
            | telephone_graph
            | time_graph
        )
        self.fst = graph
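
Because the class is a plain union, a single FST accepts a token of any supported semiotic class. A minimal sketch, assuming the sibling ru verbalizers imported above are importable:

import pynini

from fun_text_processing.inverse_text_normalization.ru.verbalizers.verbalize import VerbalizeFst

fst = VerbalizeFst().fst
# Any single supported class verbalizes through the same union:
for tagged in ('money { integer_part: "2 руб." }', 'time { hours: "02:15" }'):
    print(pynini.shortestpath(tagged @ fst).string())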
fun_text_processing/inverse_text_normalization/ru/verbalizers/verbalize_final.py (new file, mode 100644)

import pynini
from fun_text_processing.inverse_text_normalization.en.verbalizers.word import WordFst
from fun_text_processing.inverse_text_normalization.ru.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.text_normalization.en.graph_utils import (
    GraphFst,
    delete_extra_space,
    delete_space,
)
from pynini.lib import pynutil


class VerbalizeFinalFst(GraphFst):
    """
    Finite state transducer that verbalizes an entire sentence, e.g.
    tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" } -> its 12:30 now
    """

    def __init__(self):
        super().__init__(name="verbalize_final", kind="verbalize")
        verbalize = VerbalizeFst().fst
        word = WordFst().fst
        types = verbalize | word
        graph = (
            pynutil.delete("tokens")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + types
            + delete_space
            + pynutil.delete("}")
        )
        graph = delete_space + pynini.closure(graph + delete_extra_space) + graph + delete_space
        self.fst = graph
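
Per the docstring, the closure over the per-token graph verbalizes a whole sentence in one pass. A minimal sketch under the same assumptions as the earlier examples:

import pynini

from fun_text_processing.inverse_text_normalization.ru.verbalizers.verbalize_final import VerbalizeFinalFst

fst = VerbalizeFinalFst().fst
tagged = 'tokens { name: "its" } tokens { time { hours: "12" minutes: "30" } } tokens { name: "now" }'
print(pynini.shortestpath(tagged @ fst).string())  # expected: its 12:30 now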
fun_text_processing/inverse_text_normalization/run_evaluate.py (new file, mode 100644)

from argparse import ArgumentParser

from fun_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer
from fun_text_processing.text_normalization.data_loader_utils import (
    evaluate,
    known_types,
    load_files,
    training_data_to_sentences,
    training_data_to_tokens,
)

"""
Runs evaluation on data in the format: <semiotic class>\t<unnormalized text>\t<`self` if trivial class or normalized text>,
like the Google text normalization data: https://www.kaggle.com/richardwilliamsproat/text-normalization-for-english-russian-and-polish
"""


def parse_args():
    parser = ArgumentParser()
    parser.add_argument("--input", help="input file path", type=str, required=True)
    parser.add_argument(
        "--lang",
        help="language",
        choices=["en", "id", "ja", "de", "es", "pt", "ru", "fr", "vi", "ko", "zh", "fil"],
        default="en",
        type=str,
    )
    parser.add_argument(
        "--cat",
        dest="category",
        help="focus on class only (" + ", ".join(known_types) + ")",
        type=str,
        default=None,
        choices=known_types,
    )
    parser.add_argument(
        "--filter", action="store_true", help="clean data for inverse normalization purposes"
    )
    return parser.parse_args()


if __name__ == "__main__":
    # Example usage:
    # python run_evaluate.py --input=<INPUT> --cat=<CATEGORY> --filter
    args = parse_args()
    if args.lang == "en":
        from fun_text_processing.inverse_text_normalization.en.clean_eval_data import (
            filter_loaded_data,
        )

    file_path = args.input
    inverse_normalizer = InverseNormalizer()

    print("Loading training data: " + file_path)
    training_data = load_files([file_path])

    if args.filter:
        training_data = filter_loaded_data(training_data)

    # Evaluate at sentence level if no specific category is provided
    if args.category is None:
        print("Sentence level evaluation...")
        sentences_un_normalized, sentences_normalized, _ = training_data_to_sentences(training_data)
        print("- Data: " + str(len(sentences_normalized)) + " sentences")
        sentences_prediction = inverse_normalizer.inverse_normalize_list(sentences_normalized)
        print("- Denormalized. Evaluating...")
        sentences_accuracy = evaluate(
            preds=sentences_prediction, labels=sentences_un_normalized, input=sentences_normalized
        )
        print("- Accuracy: " + str(sentences_accuracy))

    # Evaluate at token level
    print("Token level evaluation...")
    tokens_per_type = training_data_to_tokens(training_data, category=args.category)
    token_accuracy = {}
    for token_type, (tokens_un_normalized, tokens_normalized) in tokens_per_type.items():
        print("- Token type: " + token_type)
        print("  - Data: " + str(len(tokens_normalized)) + " tokens")
        tokens_prediction = inverse_normalizer.inverse_normalize_list(tokens_normalized)
        print("  - Denormalized. Evaluating...")
        token_accuracy[token_type] = evaluate(
            tokens_prediction, tokens_un_normalized, input=tokens_normalized
        )
        print("  - Accuracy: " + str(token_accuracy[token_type]))

    # Calculate weighted token accuracy
    token_count_per_type = {
        token_type: len(tokens) for token_type, (tokens, _) in tokens_per_type.items()
    }
    token_weighted_accuracy = [
        token_count_per_type[token_type] * accuracy
        for token_type, accuracy in token_accuracy.items()
    ]
    print("- Accuracy: " + str(sum(token_weighted_accuracy) / sum(token_count_per_type.values())))
    print(" - Total: " + str(sum(token_count_per_type.values())), "\n")

    for token_type in token_accuracy:
        if token_type not in known_types:
            raise ValueError("Unexpected token type: " + token_type)

    # Output table summarizing evaluation results if no specific category is provided
    if args.category is None:
        c1 = ["Class", "sent level"] + known_types
        # len() is wrapped in str() so that the :10s format below is valid
        c2 = ["Num Tokens", str(len(sentences_normalized))] + [
            str(token_count_per_type.get(known_type, 0)) for known_type in known_types
        ]
        c3 = ["Denormalization", str(sentences_accuracy)] + [
            str(token_accuracy.get(known_type, "0")) for known_type in known_types
        ]
        for i in range(len(c1)):
            print(f"{c1[i]:10s} | {c2[i]:10s} | {c3[i]:5s}")
    else:
        print(f"numbers\t{token_count_per_type[args.category]}")
        print(f"Denormalization\t{token_accuracy[args.category]}")
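
The script drives the same InverseNormalizer API that can also be called directly. A minimal sketch; the constructor arguments (for example, a language selector) are an assumption and may differ across FunASR versions, so none are passed here:

from fun_text_processing.inverse_text_normalization.inverse_normalize import InverseNormalizer

# Constructing without arguments mirrors the script above; whether a language
# argument is accepted is an assumption, so it is omitted.
inverse_normalizer = InverseNormalizer()
print(inverse_normalizer.inverse_normalize_list(["twenty three"]))  # e.g. ["23"]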
fun_text_processing/inverse_text_normalization/tl/__init__.py (new file, mode 100755)

from fun_text_processing.inverse_text_normalization.tl.taggers.tokenize_and_classify import (
    ClassifyFst,
)
from fun_text_processing.inverse_text_normalization.tl.verbalizers.verbalize import VerbalizeFst
from fun_text_processing.inverse_text_normalization.tl.verbalizers.verbalize_final import (
    VerbalizeFinalFst,
)
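
These exports follow the usual NeMo-style tagger/verbalizer split. A minimal sketch of how they are typically wired together; the no-argument constructors and the sample input are assumptions and may differ in FunASR:

import pynini

from fun_text_processing.inverse_text_normalization.tl import ClassifyFst, VerbalizeFinalFst

tagger = ClassifyFst().fst            # spoken text -> tagged tokens
verbalizer = VerbalizeFinalFst().fst  # tagged tokens -> written form

def inverse_normalize(text: str) -> str:
    # Two-stage shortest-path decode: tag first, then verbalize.
    tagged = pynini.shortestpath(text @ tagger).string()
    return pynini.shortestpath(tagged @ verbalizer).string()

print(inverse_normalize("isang daan"))  # hypothetical Tagalog input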