Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
Paraformer_FunASR_pytorch
Commits
70a8a9e0
Commit
70a8a9e0
authored
Oct 03, 2024
by
wangwei990215
Browse files
initial commit
parents
Pipeline
#1738
failed with stages
in 0 seconds
Changes
827
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
467 additions
and
0 deletions
+467
-0
FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/decimal.py
...ocessing/inverse_text_normalization/tl/taggers/decimal.py
+99
-0
FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/electronic.py
...ssing/inverse_text_normalization/tl/taggers/electronic.py
+101
-0
FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/fraction.py
...cessing/inverse_text_normalization/tl/taggers/fraction.py
+11
-0
FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/measure.py
...ocessing/inverse_text_normalization/tl/taggers/measure.py
+97
-0
FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/money.py
...processing/inverse_text_normalization/tl/taggers/money.py
+110
-0
FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/ordinal.py
...ocessing/inverse_text_normalization/tl/taggers/ordinal.py
+29
-0
FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/punctuation.py
...sing/inverse_text_normalization/tl/taggers/punctuation.py
+20
-0
No files found.
Too many changes to show.
To preserve performance only
827 of 827+
files are displayed.
Plain diff
Email patch
FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/decimal.py
0 → 100755
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.tl.utils
import
get_abs_path
from
fun_text_processing.inverse_text_normalization.tl.graph_utils
import
(
DAMO_DIGIT
,
GraphFst
,
delete_extra_space
,
delete_space
,
)
from
pynini.lib
import
pynutil
def get_quantity(decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike") -> "pynini.FstLike":
    """
    Builds the FST that tags a number followed by a large-quantity word.

    Two input shapes are accepted (examples shown with English words for readability):
        one million -> integer_part: "1" quantity: "million"
        one point five million -> integer_part: "1" fractional_part: "5" quantity: "million"

    The quantity words actually matched are "milyon", "bilyon", "trilyon",
    "quadrilyon", "quintilyon" and "sextilyon"; after a decimal, "libo" is
    accepted as well.

    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST

    Returns:
        FST accepting either "<cardinal> <quantity>" or "<decimal> <quantity>".
    """
    quantity_word = pynini.union("milyon", "bilyon", "trilyon", "quadrilyon", "quintilyon", "sextilyon")

    # Drop any leading zeros from the cardinal output and require that what
    # remains starts with a non-zero digit.
    strip_leading_zeros = (
        pynutil.delete(pynini.closure("0")) + pynini.difference(DAMO_DIGIT, "0") + pynini.closure(DAMO_DIGIT)
    )
    whole_number = cardinal_up_to_hundred @ strip_leading_zeros

    # "<cardinal> <quantity word>"
    cardinal_with_quantity = (
        pynutil.insert('integer_part: "')
        + whole_number
        + pynutil.insert('"')
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + quantity_word
        + pynutil.insert('"')
    )

    # "<decimal> <quantity word>"; the decimal FST already emits its own fields.
    decimal_with_quantity = (
        decimal
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + (quantity_word | "libo")
        + pynutil.insert('"')
    )

    return cardinal_with_quantity | decimal_with_quantity
class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal
    (examples are written with English words for readability; the separator
    word matched by this grammar is "punto" and the sign word is "minus")
        e.g. minus twelve point five o o six billion -> decimal { negative: "true" integer_part: "12" fractional_part: "5006" quantity: "billion" }
        e.g. one billion -> decimal { integer_part: "1" quantity: "billion" }

    Attributes set by the constructor:
        graph: spoken digit sequence -> digit string (used for the fractional part)
        final_graph_wo_negative: unsigned decimal, optionally followed by a quantity word
        fst: the complete, token-wrapped and optimized tagger

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="decimal", kind="classify")

        cardinal_graph = cardinal.graph_no_exception

        # One spoken digit: an entry of digit.tsv or zero.tsv, or the letter "o" read as 0.
        digit_word = (
            pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
            | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
            | pynini.cross("o", "0")
        )
        # One or more spoken digits; the separators between them are removed by delete_space.
        graph_decimal = pynini.closure(digit_word + delete_space) + digit_word
        self.graph = graph_decimal

        # The separator word is consumed and produces no output.
        point = pynini.cross("punto", "")

        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("minus", '"true"') + delete_extra_space,
            0,
            1,
        )

        graph_fractional = pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"')
        graph_integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')

        # The integer part is optional, so input starting directly with "punto" is accepted.
        final_graph_wo_sign = (
            pynini.closure(graph_integer + delete_extra_space, 0, 1)
            + point
            + delete_extra_space
            + graph_fractional
        )

        # The quantity grammar was previously constructed twice with identical
        # arguments; build it once and reuse it. Union and concatenation return
        # new FSTs, so sharing the operand is safe.
        quantity_graph = get_quantity(
            final_graph_wo_sign, cardinal.graph_hundred_component_at_least_one_none_zero_digit
        )

        # Unsigned variant, kept as an attribute so other taggers can reuse it.
        self.final_graph_wo_negative = final_graph_wo_sign | quantity_graph

        final_graph = optional_graph_negative + final_graph_wo_sign
        final_graph |= optional_graph_negative + quantity_graph
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/electronic.py
0 → 100755
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.tl.utils
import
get_abs_path
from
fun_text_processing.inverse_text_normalization.tl.graph_utils
import
(
DAMO_ALPHA
,
GraphFst
,
insert_space
,
)
from
pynini.lib
import
pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for classifying electronic: as URLs, email addresses, etc.
        e.g. c d f one at a b c dot e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }

    E-mail addresses are emitted as `username: "..." domain: "..."`;
    URLs are emitted as a single `protocol: "..."` field.
    """

    def __init__(self):
        super().__init__(name="electronic", kind="classify")

        # Removes exactly one space character.
        drop_space = pynutil.delete(" ")
        dot = pynini.cross("dot", ".")

        # A single letter, or a spoken digit / zero word mapped through the number tables.
        alnum = (
            DAMO_ALPHA
            | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
            | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        )
        # symbols.tsv is inverted before use.
        symbol = pynini.string_file(get_abs_path("data/electronic/symbols.tsv")).invert()
        username_char = alnum | symbol

        # ---------------------------------------------------------------- e-mail
        # Either space-separated characters (first one alphanumeric), or, with a
        # small penalty, an unbroken run of letters.
        spelled_username = alnum + pynini.closure(drop_space + username_char)
        unbroken_username = pynutil.add_weight(pynini.closure(DAMO_ALPHA, 1), weight=0.0001)
        username_field = (
            pynutil.insert('username: "') + (spelled_username | unbroken_username) + pynutil.insert('"')
        )

        spelled_alnum = pynini.closure(alnum + drop_space) + alnum
        server = spelled_alnum | pynini.string_file(get_abs_path("data/electronic/server_name.tsv"))
        domain = spelled_alnum | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))
        domain_field = (
            pynutil.insert('domain: "')
            + server
            + drop_space
            + dot
            + drop_space
            + domain
            + pynutil.insert('"')
        )

        # "<username> at <server> dot <domain>"; the word "at" is removed and a
        # space is inserted between the two output fields.
        email = username_field + drop_space + pynutil.delete("at") + insert_space + drop_space + domain_field

        # ------------------------------------------------------------------- url
        www = pynini.cross(pynini.union("w w w", "www"), "www")
        scheme = pynini.cross("h t t p", "http") | pynini.cross("h t t p s", "https")
        scheme_prefix = scheme + pynini.cross(" colon slash slash ", "://")

        # One trailing segment: a symbol followed by a domain or spelled characters.
        spelled_tail = pynini.closure(username_char + drop_space) + username_char
        url_segment = drop_space + symbol + drop_space + (domain | spelled_tail)

        # Leading part of the address. In the first alternative every repetition
        # starts by consuming a space.
        url_head = (
            pynini.closure(drop_space + username_char, 1)
            | server
            | pynutil.add_weight(pynini.closure(DAMO_ALPHA, 1), weight=0.0001)
        )
        url_body = (url_head + pynini.closure(url_segment, 1)).optimize()

        # Optional scheme, then a mandatory "www dot" prefix ...
        url_with_www = (pynini.closure(scheme_prefix, 0, 1) + www + drop_space + dot + url_body).optimize()
        # ... or no scheme and an optional "www dot" prefix.
        url_optional_www = pynini.closure(www + drop_space + dot, 0, 1) + url_body

        url = (url_with_www | url_optional_www).optimize()
        url_field = pynutil.insert('protocol: "') + url + pynutil.insert('"')

        tagged = self.add_tokens(email | url_field)
        self.fst = tagged.optimize()
FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/fraction.py
0 → 100755
View file @
70a8a9e0
from
fun_text_processing.inverse_text_normalization.tl.graph_utils
import
GraphFst
class FractionFst(GraphFst):
    """
    Finite state transducer for classifying fraction.

    This class is currently a stub: the constructor only passes the tagger
    name and kind to the base class and builds no grammar of its own.
    """

    def __init__(self):
        # Field names noted by the original author:
        # integer_part # numerator # denominator
        super().__init__(kind="classify", name="fraction")
FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/measure.py
0 → 100755
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.tl.utils
import
get_abs_path
from
fun_text_processing.inverse_text_normalization.tl.graph_utils
import
(
DAMO_SIGMA
,
GraphFst
,
convert_space
,
delete_extra_space
,
delete_space
,
get_singulars
,
)
from
pynini.lib
import
pynutil
class MeasureFst(GraphFst):
    """
    Finite state transducer for classifying measure
    (example written with English words for readability)
        e.g. minus twelve kilograms -> cardinal { negative: "true" integer: "12" } units: "kg"
    The sign, when present, is emitted inside the `cardinal { }` / `decimal { }`
    block; the result is then passed through `add_tokens`.

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="measure", kind="classify")

        number_graph = cardinal.graph_no_exception

        unit_table = pynini.string_file(get_abs_path("data/measurements.tsv"))
        singular_to_abbr = pynini.invert(unit_table)  # singular -> abbr
        plural_to_abbr = get_singulars(singular_to_abbr)  # plural -> abbr

        sign = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("minus", '"true"') + delete_extra_space,
            0,
            1,
        )

        singular_unit = convert_space(singular_to_abbr)
        plural_unit = convert_space(plural_to_abbr)
        # "per <unit>" becomes "/<abbr>"
        per_unit = pynutil.insert("/") + pynutil.delete("per") + delete_space + convert_space(singular_to_abbr)

        def units_field(base_unit):
            """Wrap a unit, a "per unit", or (weighted) "unit per unit" in a units field."""
            compound = pynutil.add_weight(base_unit + delete_space + per_unit, 0.01)
            return pynutil.insert('units: "') + (base_unit | per_unit | compound) + pynutil.insert('"')

        singular_units = units_field(singular_unit)
        plural_units = units_field(plural_unit)

        def cardinal_measure(number, units):
            """Signed cardinal block followed by the given units field."""
            return (
                pynutil.insert("cardinal { ")
                + sign
                + pynutil.insert('integer: "')
                + number
                + pynutil.insert('"')
                + pynutil.insert(" }")
                + delete_extra_space
                + units
            )

        decimal_measure = (
            pynutil.insert("decimal { ")
            + sign
            + decimal.final_graph_wo_negative
            + pynutil.insert(" }")
            + delete_extra_space
            + plural_units
        )

        # Any cardinal other than the literal word "one" takes plural units;
        # "one" takes singular units.
        plural_cardinal = cardinal_measure((DAMO_SIGMA - "one") @ number_graph, plural_units)
        single_cardinal = cardinal_measure(pynini.cross("one", "1"), singular_units)

        tagged = self.add_tokens(decimal_measure | (plural_cardinal | single_cardinal))
        self.fst = tagged.optimize()
FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/money.py
0 → 100755
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.tl.utils
import
get_abs_path
from
fun_text_processing.inverse_text_normalization.tl.graph_utils
import
(
DAMO_DIGIT
,
DAMO_NOT_SPACE
,
DAMO_SIGMA
,
GraphFst
,
convert_space
,
delete_extra_space
,
delete_space
,
get_singulars
,
insert_space
,
)
from
pynini.lib
import
pynutil
class MoneyFst(GraphFst):
    """
    Finite state transducer for classifying money
    (example written with English words for readability; the currency symbol
    comes from data/currency.tsv)
        e.g. twelve dollars and five cents -> integer_part: "12" currency: "$" fractional_part: "05"
    The result is then passed through `add_tokens`.

    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="money", kind="classify")
        # quantity, integer_part, fractional_part, currency

        cardinal_graph = cardinal.graph_no_exception

        # add support for missing hundred (only for 3 digit numbers)
        # "one fifty" -> "one hundred fifty"
        insert_hundred = (
            pynini.closure(DAMO_NOT_SPACE) + pynini.accep(" ") + pynutil.insert("hundred ") + DAMO_SIGMA
        )
        three_digit_cardinal = pynini.compose(cardinal_graph, DAMO_DIGIT ** 3)
        cardinal_graph |= pynini.compose(insert_hundred, three_digit_cardinal)

        currency_table = pynini.string_file(get_abs_path("data/currency.tsv"))
        singular_currency = pynini.invert(currency_table)
        plural_currency = get_singulars(singular_currency)

        singular_currency_field = (
            pynutil.insert('currency: "') + convert_space(singular_currency) + pynutil.insert('"')
        )
        plural_currency_field = (
            pynutil.insert('currency: "') + convert_space(plural_currency) + pynutil.insert('"')
        )

        # Two digits pass through unchanged; a single digit gets a leading "0".
        two_digits = (DAMO_DIGIT + DAMO_DIGIT) | (pynutil.insert("0") + DAMO_DIGIT)

        # twelve dollars (and) fifty cents, zero cents
        plural_cents = (
            pynutil.add_weight(((DAMO_SIGMA - "one") @ cardinal_graph), -0.7) @ two_digits
            + delete_space
            + pynutil.delete("cents")
        )
        single_cent = pynini.cross("one", "01") + delete_space + pynutil.delete("cent")
        cents_standalone = (
            pynutil.insert('fractional_part: "') + pynini.union(plural_cents, single_cent) + pynutil.insert('"')
        )

        optional_and = pynini.closure(pynutil.delete("and") + delete_space, 0, 1)
        optional_cents_standalone = pynini.closure(
            delete_space + optional_and + insert_space + cents_standalone,
            0,
            1,
        )

        # twelve dollars fifty, only after integer
        optional_cents_suffix = pynini.closure(
            delete_extra_space
            + pynutil.insert('fractional_part: "')
            + pynutil.add_weight(cardinal_graph @ two_digits, -0.7)
            + pynutil.insert('"'),
            0,
            1,
        )
        optional_cents = optional_cents_standalone | optional_cents_suffix

        def integer_amount(amount, currency_field):
            """Integer amount, currency field, then an optional cents part."""
            return (
                pynutil.insert('integer_part: "')
                + amount
                + pynutil.insert('"')
                + delete_extra_space
                + currency_field
                + optional_cents
            )

        # Any amount other than the literal word "one" takes the plural
        # currency name; "one" takes the singular.
        graph_integer = integer_amount((DAMO_SIGMA - "one") @ cardinal_graph, plural_currency_field)
        graph_integer |= integer_amount(pynini.cross("one", "1"), singular_currency_field)

        # Decimal amount followed by a plural currency name, or cents on their
        # own (tagged as "$" with integer part "0").
        graph_decimal = decimal.final_graph_wo_negative + delete_extra_space + plural_currency_field
        graph_decimal |= pynutil.insert('currency: "$" integer_part: "0" ') + cents_standalone

        tagged = self.add_tokens(graph_integer | graph_decimal)
        self.fst = tagged.optimize()
FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/ordinal.py
0 → 100755
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.tl.utils
import
get_abs_path
from
fun_text_processing.inverse_text_normalization.tl.graph_utils
import
DAMO_CHAR
,
GraphFst
from
pynini.lib
import
pynutil
class OrdinalFst(GraphFst):
    """
    Finite state transducer for classifying ordinal
    (example written with English words for readability)
        e.g. thirteenth -> ordinal { integer: "13" }

    Attributes set by the constructor:
        graph: ordinal word(s) -> digit string, without the field wrapper
        fst: the token-wrapped and optimized tagger

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="ordinal", kind="classify")

        # Rewrites for the end of the input: entries from the ordinal tables,
        # "tieth" -> "ty", or a plain "th" suffix that is dropped.
        ordinal_ending = pynini.union(
            pynini.string_file(get_abs_path("data/ordinals/digit.tsv")),
            pynini.string_file(get_abs_path("data/ordinals/teen.tsv")),
            pynini.cross("tieth", "ty"),
            pynini.cross("th", ""),
        )
        # Anything before the ending is copied through unchanged, then the
        # rewritten text is fed to the cardinal grammar.
        to_cardinal_words = pynini.closure(DAMO_CHAR) + ordinal_ending
        self.graph = to_cardinal_words @ cardinal.graph_no_exception

        tagged = self.add_tokens(pynutil.insert('integer: "') + self.graph + pynutil.insert('"'))
        self.fst = tagged.optimize()
FunASR/fun_text_processing/inverse_text_normalization/tl/taggers/punctuation.py
0 → 100755
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.tl.graph_utils
import
GraphFst
from
pynini.lib
import
pynutil
class PunctuationFst(GraphFst):
    """
    Finite state transducer for classifying punctuation
        e.g. a, -> tokens { name: "a" } tokens { name: "," }

    The FST built here accepts exactly one punctuation character and wraps it
    as `name: "<char>"`.
    """

    def __init__(self):
        super().__init__(name="punctuation", kind="classify")

        marks = "!#$%&'()*+,-./:;<=>?@^_`{|}~"
        # One alternative per character of the string above.
        any_mark = pynini.union(*marks)

        self.fst = (pynutil.insert('name: "') + any_mark + pynutil.insert('"')).optimize()
Prev
1
…
38
39
40
41
42
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment