Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
ModelZoo
Paraformer_FunASR_pytorch
Commits
70a8a9e0
Commit
70a8a9e0
authored
Oct 03, 2024
by
wangwei990215
Browse files
initial commit
parents
Pipeline
#1738
failed with stages
in 0 seconds
Changes
827
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1355 additions
and
0 deletions
+1355
-0
FunASR/fun_text_processing/inverse_text_normalization/id/id_unit_test.tsv
...processing/inverse_text_normalization/id/id_unit_test.tsv
+29
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/__init__.py
...cessing/inverse_text_normalization/id/taggers/__init__.py
+0
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py
...cessing/inverse_text_normalization/id/taggers/cardinal.py
+161
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/date.py
..._processing/inverse_text_normalization/id/taggers/date.py
+150
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/decimal.py
...ocessing/inverse_text_normalization/id/taggers/decimal.py
+100
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/electronic.py
...ssing/inverse_text_normalization/id/taggers/electronic.py
+100
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/fraction.py
...cessing/inverse_text_normalization/id/taggers/fraction.py
+11
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/measure.py
...ocessing/inverse_text_normalization/id/taggers/measure.py
+97
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/money.py
...processing/inverse_text_normalization/id/taggers/money.py
+110
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/ordinal.py
...ocessing/inverse_text_normalization/id/taggers/ordinal.py
+29
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/punctuation.py
...sing/inverse_text_normalization/id/taggers/punctuation.py
+20
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/telephone.py
...essing/inverse_text_normalization/id/taggers/telephone.py
+149
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/time.py
..._processing/inverse_text_normalization/id/taggers/time.py
+151
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/tokenize_and_classify.py
...se_text_normalization/id/taggers/tokenize_and_classify.py
+102
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/whitelist.py
...essing/inverse_text_normalization/id/taggers/whitelist.py
+19
-0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/word.py
..._processing/inverse_text_normalization/id/taggers/word.py
+15
-0
FunASR/fun_text_processing/inverse_text_normalization/id/text_cases.tsv
...t_processing/inverse_text_normalization/id/text_cases.tsv
+41
-0
FunASR/fun_text_processing/inverse_text_normalization/id/utils.py
...un_text_processing/inverse_text_normalization/id/utils.py
+33
-0
FunASR/fun_text_processing/inverse_text_normalization/id/verbalizers/__init__.py
...ing/inverse_text_normalization/id/verbalizers/__init__.py
+0
-0
FunASR/fun_text_processing/inverse_text_normalization/id/verbalizers/cardinal.py
...ing/inverse_text_normalization/id/verbalizers/cardinal.py
+38
-0
No files found.
Too many changes to show.
To preserve performance only
827 of 827+
files are displayed.
Plain diff
Email patch
FunASR/fun_text_processing/inverse_text_normalization/id/id_unit_test.tsv
0 → 100644
View file @
70a8a9e0
dua ribu dua puluh dua 2022
tiga ribu 3000
sembilan ribu sembilan ratus sembilan puluh sembilan 9999
seribu satu 1001
ribu 1000
seribu 1000
seribu dua ratus delapan puluh sembilan 1289
ribu dua ratus delapan puluh sembilan 1289
nol satu dua tiga empat lima enam tujuh delapan sembilan 01 2345-6789
empat belas 14
limabelas 15
enambelas 16
tujuh belas 17
delapan belas 18
sembilan belas 19
dua puluh 20
seratus enam 106
enam ratus 600
ratus 100
seratus 100
satu miliar 1 miliar
seratus dua puluh tiga 123
ratus dua puluh tiga 123
dua puluh empat maret 24 maret
ribu tujuh puluh enam 1076
seribu tujuh puluh enam 1076
ribu tujuh puluh enam rupiah 1076 rupiah
tujuh puluh enam 76
ditambah enam dua dua satu enam lima tiga sembilan nol enam nol lima +62 21 6539-0605
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/__init__.py
0 → 100644
View file @
70a8a9e0
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.id.utils
import
get_abs_path
,
num_to_word
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
(
DAMO_ALPHA
,
DAMO_DIGIT
,
DAMO_SIGMA
,
DAMO_SPACE
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying Indonesian cardinals,
    e.g. "dua puluh tiga" -> cardinal { integer: "23" }
    (negative form tagged as negative: "-").
    The bare word for zero ("nol") is excluded from self.graph via graph_exception.
    """

    def __init__(self, deterministic: bool = True):
        super().__init__(name="cardinal", kind="classify", deterministic=deterministic)
        # TSV lookups: spoken form -> digit string.
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        # NOTE(review): graph_hundreds is loaded but never used below — confirm
        # whether hundreds.tsv should feed the hundred component.
        graph_hundreds = pynini.string_file(get_abs_path("data/numbers/hundreds.tsv"))
        # NOTE(review): this graph_thousand is shadowed by the reassignment further
        # down before it is ever used.
        graph_thousand = pynini.string_file(get_abs_path("data/numbers/thousand.tsv"))
        # "ratus"/"seratus" (hundred) is deleted; the leading digit (if any) supplies
        # the hundreds place.
        graph_hundred = pynini.cross("ratus", "") | pynini.cross("seratus", "")
        # Hundreds place: explicit "<digit> ratus" or an inserted "0" placeholder.
        graph_hundred_component = pynini.union(
            graph_digit + delete_space + graph_hundred, pynutil.insert("0")
        )
        graph_hundred_component += delete_space
        # Tens/units places: a teen (two digits), or ties + units, each padded with
        # inserted zeros so the component always yields exactly three digits.
        graph_hundred_component += pynini.union(
            graph_teen | pynutil.insert("00"),
            (graph_ties | pynutil.insert("0"))
            + delete_space
            + (graph_digit | pynutil.insert("0")),
        )
        # Variant where a bare "ratus"/"seratus" means "one hundred" (outputs "1").
        graph_one_hundred_component = pynini.union(
            pynini.cross("ratus", "1") | pynini.cross("seratus", "1")
        )
        graph_one_hundred_component += delete_space
        graph_one_hundred_component += pynini.union(
            graph_teen | pynutil.insert("00"),
            (graph_ties | pynutil.insert("0"))
            + delete_space
            + (graph_digit | pynutil.insert("0")),
        )
        graph_hundred_component = graph_hundred_component | graph_one_hundred_component
        # Restrict to outputs containing at least one non-zero digit (used as the
        # multiplier in front of thousand/million/... group words).
        graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
            pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT)
        )
        self.graph_hundred_component_at_least_one_none_zero_digit = (
            graph_hundred_component_at_least_one_none_zero_digit
        )
        # "ribu"/"seribu" (thousand); bare form handled by the "1"-emitting variant.
        graph_thousand = pynini.cross("ribu", "") | pynini.cross("seribu", "")
        graph_one_thousand_component = pynini.union(
            pynini.cross("ribu", "1") | pynini.cross("seribu", "1")
        )
        # Each magnitude group: "<1-999> <group word>" or an inserted "000" filler
        # (lightly weighted so real matches are preferred).
        graph_thousands = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit
            + delete_space
            + (pynutil.delete("ribu") | pynutil.delete("seribu")),
            pynutil.insert("000", weight=0.1),
        )
        # Bare "ribu"/"seribu" -> "001" thousands group ("seribu" = 1000).
        graph_thousands = graph_thousands | (
            pynutil.insert("00") + graph_one_thousand_component
        )
        graph_million = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit
            + delete_space
            + (pynutil.delete("juta") | pynutil.delete("sejuta")),
            pynutil.insert("000", weight=0.1),
        )
        graph_billion = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit
            + delete_space
            + (
                pynutil.delete("miliar")
                | pynutil.delete("semiliar")
                | pynutil.delete("milyar")
                | pynutil.delete("semilyar")
            ),
            pynutil.insert("000", weight=0.1),
        )
        graph_trillion = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit
            + delete_space
            + (pynutil.delete("triliun") | pynutil.delete("setriliun")),
            pynutil.insert("000", weight=0.1),
        )
        # NOTE(review): "milion lipat empat" and "sextillion" below look like
        # placeholder/untranslated terms, and graph_quintillion deletes "triliun"
        # — the same word as graph_trillion. Confirm the intended Indonesian
        # words for quadrillion/quintillion/sextillion.
        graph_quadrillion = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit
            + delete_space
            + pynutil.delete("milion lipat empat"),
            pynutil.insert("000", weight=0.1),
        )
        graph_quintillion = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit
            + delete_space
            + pynutil.delete("triliun"),
            pynutil.insert("000", weight=0.1),
        )
        graph_sextillion = pynini.union(
            graph_hundred_component_at_least_one_none_zero_digit
            + delete_space
            + pynutil.delete("sextillion"),
            pynutil.insert("000", weight=0.1),
        )
        # Concatenate all magnitude groups (highest first); each contributes three
        # digits, real or filler. "nol" (zero) is handled separately.
        graph = pynini.union(
            graph_sextillion
            + delete_space
            + graph_quintillion
            + delete_space
            + graph_quadrillion
            + delete_space
            + graph_trillion
            + delete_space
            + graph_billion
            + delete_space
            + graph_million
            + delete_space
            + graph_thousands
            + delete_space
            + graph_hundred_component,
            # graph_digit,
            graph_zero,
        )
        # Strip leading zeros, but keep a lone "0".
        graph = graph @ pynini.union(
            pynutil.delete(pynini.closure("0"))
            + pynini.difference(DAMO_DIGIT, "0")
            + pynini.closure(DAMO_DIGIT),
            "0",
        )
        labels_exception = ["nol"]
        graph_exception = pynini.union(*labels_exception)
        # Drop the connective "dan" ("and") between words, require the input to
        # start with a letter, then apply the number grammar.
        graph = (
            pynini.cdrewrite(
                pynutil.delete("dan"), DAMO_SPACE, DAMO_SPACE, DAMO_SIGMA
            )
            @ (DAMO_ALPHA + DAMO_SIGMA)
            @ graph
        )
        self.graph_no_exception = graph
        # Exclude the bare exception words ("nol") from the public graph.
        self.graph = (
            pynini.project(graph, "input") - graph_exception.arcsort()
        ) @ graph
        # Optional leading "kurang" -> negative: "-".
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ")
            + pynini.cross("kurang", '"-"')
            + DAMO_SPACE,
            0,
            1,
        )
        final_graph = (
            optional_minus_graph
            + pynutil.insert('integer: "')
            + self.graph
            + pynutil.insert('"')
        )
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/date.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.id.utils
import
get_abs_path
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
(
DAMO_ALPHA
,
DAMO_DIGIT
,
GraphFst
,
delete_extra_space
,
delete_space
,
)
from
pynini.lib
import
pynutil
# Shared spoken-number lookups (TSV files, presumably spoken form -> digit
# string — verify against the data files), loaded once at import time and
# reused by the year/date helpers below.
graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")).optimize()
graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize()
ties_graph = pynini.string_file(get_abs_path("data/numbers/ties.tsv")).optimize()
def _get_month_graph():
    """
    Transducer for month names, e.g. march -> march
    (mapping defined entirely by data/months.tsv).
    """
    return pynini.string_file(get_abs_path("data/months.tsv"))
def _get_ties_graph():
    """
    Transducer for 20-99 e.g
    twenty three -> 23
    """
    # "+" binds tighter than "|": a ties word followed by an optional digit,
    # with "0" inserted for the units place when no digit follows.
    graph = ties_graph + (delete_space + graph_digit | pynutil.insert("0"))
    return graph
def _get_range_graph():
    """
    Transducer for decades (1**0s, 2**0s), centuries (2*00s, 1*00s), millennia (2000s)
    """
    graph_ties = _get_ties_graph()
    # "<tens/teen> ratusan" -> "<digits>00s" (century ranges).
    graph = (graph_ties | graph_teen) + delete_space + pynini.cross("ratusan", "00s")
    # "dua ribuan" -> "2000s" (millennium range).
    graph |= pynini.cross("dua", "2") + delete_space + pynini.cross("ribuan", "000s")
    # NOTE(review): the "ies"/"s" suffix handling below looks carried over from
    # an English grammar ("twenties" -> "twenty" + s) — confirm it is reachable
    # for Indonesian input.
    graph |= (
        (graph_ties | graph_teen)
        + delete_space
        + (
            pynini.closure(DAMO_ALPHA, 1)
            + (pynini.cross("ies", "y") | pynutil.delete("s"))
        )
        @ (graph_ties | pynini.cross("sepuluh", "10"))
        + pynutil.insert("s")
    )
    # Constrain to plausible year ranges: 1xxx/2xxx followed by "s".
    graph @= pynini.union("1", "2") + DAMO_DIGIT + DAMO_DIGIT + DAMO_DIGIT + "s"
    return graph
def _get_year_graph():
    """
    Transducer for year, e.g. twenty twenty -> 2020
    """

    def _get_digits_graph():
        # "oh"/"o" as a spoken zero followed by a digit, e.g. "oh five" -> "05".
        zero = pynini.cross((pynini.accep("oh") | pynini.accep("o")), "0")
        graph = zero + delete_space + graph_digit
        # NOTE(review): optimize() mutates in place here; the return value is
        # intentionally not rebound.
        graph.optimize()
        return graph

    def _get_thousands_graph():
        # Full "<digit> ribu [<digit> ratus] <tens>" years, e.g. 1984.
        graph_ties = _get_ties_graph()
        graph_hundred_component = (
            graph_digit + delete_space + pynutil.delete("ratus")
        ) | pynutil.insert("0")
        graph = (
            graph_digit
            + delete_space
            + pynutil.delete("ribu")
            + delete_space
            + graph_hundred_component
            + delete_space
            + (graph_teen | graph_ties)
        )
        return graph

    graph_ties = _get_ties_graph()
    graph_digits = _get_digits_graph()
    graph_thousands = _get_thousands_graph()
    year_graph = (
        # 20 19, 40 12, 2012 - assuming no limit on the year
        (graph_teen + delete_space + (graph_ties | graph_digits | graph_teen))
        | (graph_ties + delete_space + (graph_ties | graph_digits | graph_teen))
        | graph_thousands
    )
    year_graph.optimize()
    return year_graph
class DateFst(GraphFst):
    """
    Finite state transducer for classifying date,
    e.g. january fifth twenty twelve -> date { month: "january" day: "5" year: "2012" preserve_order: true }
    e.g. the fifth of january twenty twelve -> date { day: "5" month: "january" year: "2012" preserve_order: true }
    e.g. twenty twenty -> date { year: "2020" preserve_order: true }
    Args:
        ordinal: OrdinalFst
    """

    def __init__(self, ordinal: GraphFst):
        super().__init__(name="date", kind="classify")
        ordinal_graph = ordinal.graph
        year_graph = _get_year_graph()
        # Small penalty so bare numbers prefer the cardinal path; cancelled with
        # a matching negative weight when the year appears in a full date.
        YEAR_WEIGHT = 0.001
        year_graph = pynutil.add_weight(year_graph, YEAR_WEIGHT)
        month_graph = _get_month_graph()
        month_graph = pynutil.insert('month: "') + month_graph + pynutil.insert('"')
        # Strong negative weight favours reading a number as a day inside a date.
        day_graph = (
            pynutil.insert('day: "')
            + pynutil.add_weight(ordinal_graph, -0.7)
            + pynutil.insert('"')
        )
        graph_year = (
            delete_extra_space
            + pynutil.insert('year: "')
            + pynutil.add_weight(year_graph, -YEAR_WEIGHT)
            + pynutil.insert('"')
        )
        optional_graph_year = pynini.closure(
            graph_year,
            0,
            1,
        )
        # month [day] [year] ordering.
        graph_mdy = month_graph + (
            (delete_extra_space + day_graph)
            | graph_year
            | (delete_extra_space + day_graph + graph_year)
        )
        # NOTE(review): "the"/"of" are English function words — confirm this
        # day-month-year branch is intended for Indonesian input.
        graph_dmy = (
            pynutil.delete("the")
            + delete_space
            + day_graph
            + delete_space
            + pynutil.delete("of")
            + delete_extra_space
            + month_graph
            + optional_graph_year
        )
        # Standalone year (or decade/century range).
        graph_year = (
            pynutil.insert('year: "')
            + (year_graph | _get_range_graph())
            + pynutil.insert('"')
        )
        final_graph = graph_mdy | graph_dmy | graph_year
        final_graph += pynutil.insert(" preserve_order: true")
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/decimal.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.id.utils
import
get_abs_path
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
(
DAMO_DIGIT
,
GraphFst
,
delete_extra_space
,
delete_space
,
)
from
pynini.lib
import
pynutil
import
pdb
def get_quantity(
    decimal: "pynini.FstLike", cardinal_up_to_hundred: "pynini.FstLike"
) -> "pynini.FstLike":
    """
    Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral,
    e.g. one million -> integer_part: "1" quantity: "million"
    e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million"
    Args:
        decimal: decimal FST
        cardinal_up_to_hundred: cardinal FST
    """
    # Strip leading zeros; require a non-zero leading digit.
    numbers = cardinal_up_to_hundred @ (
        pynutil.delete(pynini.closure("0"))
        + pynini.difference(DAMO_DIGIT, "0")
        + pynini.closure(DAMO_DIGIT)
    )
    # NOTE(review): "triliun" is listed twice and "sextillion" is untranslated —
    # confirm the intended Indonesian quantity words.
    suffix = pynini.union("juta", "miliar", "triliun", "kuadriliun", "triliun", "sextillion")
    # Cardinal + quantity word, e.g. "satu juta".
    res = (
        pynutil.insert('integer_part: "')
        + numbers
        + pynutil.insert('"')
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + suffix
        + pynutil.insert('"')
    )
    # Decimal + quantity word; "ribu" (thousand) is allowed here only.
    res |= (
        decimal
        + delete_extra_space
        + pynutil.insert('quantity: "')
        + (suffix | "ribu")
        + pynutil.insert('"')
    )
    return res
class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal
    e.g. minus twelve point five o o six billion -> decimal { negative: "true" integer_part: "12" fractional_part: "5006" quantity: "billion" }
    e.g. one billion -> decimal { integer_part: "1" quantity: "billion" }
    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="decimal", kind="classify")
        cardinal_graph = cardinal.graph_no_exception
        # Fractional digits read one by one: digit words, zero words, or "o".
        graph_decimal = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        graph_decimal |= pynini.string_file(
            get_abs_path("data/numbers/zero.tsv")
        ) | pynini.cross("o", "0")
        # One or more digits, spaces removed.
        graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal
        self.graph = graph_decimal
        # NOTE(review): deletes the English word "point"; the trailing comment
        # suggests the Indonesian "titik" may have been intended — confirm.
        point = pynutil.delete("point")  # titik
        # Optional leading "kurang" -> negative: "true".
        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ")
            + pynini.cross("kurang", '"true"')
            + delete_extra_space,
            0,
            1,
        )
        graph_fractional = (
            pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"')
        )
        graph_integer = (
            pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')
        )
        # [integer] point fractional — integer part is optional (".5").
        final_graph_wo_sign = (
            pynini.closure(graph_integer + delete_extra_space, 0, 1)
            + point
            + delete_extra_space
            + graph_fractional
        )
        final_graph = optional_graph_negative + final_graph_wo_sign
        # Also allow "<number> <quantity>" forms (e.g. "satu juta"); exposed for
        # reuse by MeasureFst/MoneyFst.
        self.final_graph_wo_negative = final_graph_wo_sign | get_quantity(
            final_graph_wo_sign,
            cardinal.graph_hundred_component_at_least_one_none_zero_digit,
        )
        final_graph |= optional_graph_negative + get_quantity(
            final_graph_wo_sign,
            cardinal.graph_hundred_component_at_least_one_none_zero_digit,
        )
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/electronic.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.id.utils
import
get_abs_path
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
(
DAMO_ALPHA
,
GraphFst
,
insert_space
,
)
from
pynini.lib
import
pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for classifying electronic: as URLs, email addresses, etc.
    e.g. c d f one at a b c dot e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }
    """

    def __init__(self):
        super().__init__(name="electronic", kind="classify")
        # NOTE(review): shadows the graph_utils helper of the same name — here it
        # simply deletes a single space.
        delete_extra_space = pynutil.delete(" ")
        # Letters plus spoken digit/zero words mapped to their digit characters.
        alpha_num = (
            DAMO_ALPHA
            | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
            | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        )
        # symbols.tsv is inverted: spoken symbol name -> symbol character.
        symbols = pynini.string_file(
            get_abs_path("data/electronic/symbols.tsv")
        ).invert()
        accepted_username = alpha_num | symbols
        process_dot = pynini.cross("dot", ".")
        # Spelled-out username, or (very low weight) any letter run.
        username = (
            alpha_num + pynini.closure(delete_extra_space + accepted_username)
        ) | pynutil.add_weight(pynini.closure(DAMO_ALPHA, 1), weight=0.0001)
        username = pynutil.insert('username: "') + username + pynutil.insert('"')
        single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num
        server = single_alphanum | pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv")
        )
        domain = single_alphanum | pynini.string_file(
            get_abs_path("data/electronic/domain.tsv")
        )
        # "<server> dot <domain>" -> domain: "server.domain".
        domain_graph = (
            pynutil.insert('domain: "')
            + server
            + delete_extra_space
            + process_dot
            + delete_extra_space
            + domain
            + pynutil.insert('"')
        )
        # Email: "<username> at <domain>".
        graph = (
            username
            + delete_extra_space
            + pynutil.delete("at")
            + insert_space
            + delete_extra_space
            + domain_graph
        )
        ############# url ###
        protocol_end = pynini.cross(pynini.union("w w w", "www"), "www")
        protocol_start = (
            pynini.cross("h t t p", "http") | pynini.cross("h t t p s", "https")
        ) + pynini.cross(" colon slash slash ", "://")
        # .com,
        ending = (
            delete_extra_space
            + symbols
            + delete_extra_space
            + (
                domain
                | pynini.closure(
                    accepted_username + delete_extra_space,
                )
                + accepted_username
            )
        )
        # Host part followed by one or more ".xxx" endings.
        protocol_default = (
            (
                (pynini.closure(delete_extra_space + accepted_username, 1) | server)
                | pynutil.add_weight(pynini.closure(DAMO_ALPHA, 1), weight=0.0001)
            )
            + pynini.closure(ending, 1)
        ).optimize()
        # Full URL: [http(s)://]www.<host><endings>.
        protocol = (
            pynini.closure(protocol_start, 0, 1)
            + protocol_end
            + delete_extra_space
            + process_dot
            + protocol_default
        ).optimize()
        # URL without scheme, "www." itself optional.
        protocol |= (
            pynini.closure(protocol_end + delete_extra_space + process_dot, 0, 1)
            + protocol_default
        )
        protocol = (
            pynutil.insert('protocol: "')
            + protocol.optimize()
            + pynutil.insert('"')
        )
        graph |= protocol
        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/fraction.py
0 → 100644
View file @
70a8a9e0
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
GraphFst
class FractionFst(GraphFst):
    """
    Finite state transducer for classifying fraction.
    Currently a stub: it registers the grammar name but builds no transitions.
    """

    def __init__(self):
        super().__init__(name="fraction", kind="classify")
        # integer_part # numerator # denominator
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/measure.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.id.utils
import
get_abs_path
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
(
DAMO_SIGMA
,
GraphFst
,
convert_space
,
delete_extra_space
,
delete_space
,
get_singulars
,
)
from
pynini.lib
import
pynutil
class MeasureFst(GraphFst):
    """
    Finite state transducer for classifying measure
    e.g. minus twelve kilograms -> measure { negative: "true" cardinal { integer: "12" } units: "kg" }
    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="measure", kind="classify")
        cardinal_graph = cardinal.graph_no_exception
        graph_unit = pynini.string_file(get_abs_path("data/measurements.tsv"))
        graph_unit_singular = pynini.invert(graph_unit)  # singular -> abbr
        graph_unit_plural = get_singulars(graph_unit_singular)  # plural -> abbr
        # Optional leading "kurang" -> negative: "true".
        optional_graph_negative = pynini.closure(
            pynutil.insert("negative: ")
            + pynini.cross("kurang", '"true"')
            + delete_extra_space,
            0,
            1,
        )
        unit_singular = convert_space(graph_unit_singular)
        unit_plural = convert_space(graph_unit_plural)
        # "per <unit>" -> "/<unit>", e.g. km per hour -> km/h.
        unit_misc = (
            pynutil.insert("/")
            + pynutil.delete("per")
            + delete_space
            + convert_space(graph_unit_singular)
        )
        # Unit field: plain unit, per-unit, or (penalised) compound "unit per unit".
        unit_singular = (
            pynutil.insert('units: "')
            + (
                unit_singular
                | unit_misc
                | pynutil.add_weight(unit_singular + delete_space + unit_misc, 0.01)
            )
            + pynutil.insert('"')
        )
        unit_plural = (
            pynutil.insert('units: "')
            + (
                unit_plural
                | unit_misc
                | pynutil.add_weight(unit_plural + delete_space + unit_misc, 0.01)
            )
            + pynutil.insert('"')
        )
        # Decimal quantity + plural unit.
        subgraph_decimal = (
            pynutil.insert("decimal { ")
            + optional_graph_negative
            + decimal.final_graph_wo_negative
            + pynutil.insert(" }")
            + delete_extra_space
            + unit_plural
        )
        # Cardinal (not "satu") + plural unit.
        subgraph_cardinal = (
            pynutil.insert("cardinal { ")
            + optional_graph_negative
            + pynutil.insert('integer: "')
            + ((DAMO_SIGMA - "satu") @ cardinal_graph)
            + pynutil.insert('"')
            + pynutil.insert(" }")
            + delete_extra_space
            + unit_plural
        )
        # "satu" ("one") pairs with the singular unit form.
        subgraph_cardinal |= (
            pynutil.insert("cardinal { ")
            + optional_graph_negative
            + pynutil.insert('integer: "')
            + pynini.cross("satu", "1")
            + pynutil.insert('"')
            + pynutil.insert(" }")
            + delete_extra_space
            + unit_singular
        )
        final_graph = subgraph_decimal | subgraph_cardinal
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/money.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.id.utils
import
get_abs_path
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
(
DAMO_DIGIT
,
DAMO_NOT_SPACE
,
DAMO_SIGMA
,
GraphFst
,
convert_space
,
delete_extra_space
,
delete_space
,
get_singulars
,
insert_space
,
)
from
pynini.lib
import
pynutil
class MoneyFst(GraphFst):
    """
    Finite state transducer for classifying money
    e.g. twelve dollars and five cents -> money { integer_part: "12" fractional_part: 05 currency: "$" }
    Args:
        cardinal: CardinalFst
        decimal: DecimalFst
    """

    def __init__(self, cardinal: GraphFst, decimal: GraphFst):
        super().__init__(name="money", kind="classify")
        # quantity, integer_part, fractional_part, currency
        cardinal_graph = cardinal.graph_no_exception
        # add support for missing hundred (only for 3 digit numbers)
        # "one fifty" -> "one hundred fifty"
        with_hundred = pynini.compose(
            pynini.closure(DAMO_NOT_SPACE)
            + pynini.accep(" ")
            + pynutil.insert("ratus ")
            + DAMO_SIGMA,
            pynini.compose(cardinal_graph, DAMO_DIGIT ** 3),
        )
        cardinal_graph |= with_hundred
        graph_decimal_final = decimal.final_graph_wo_negative
        # currency.tsv inverted: spoken currency name -> symbol/abbr.
        unit = pynini.string_file(get_abs_path("data/currency.tsv"))
        unit_singular = pynini.invert(unit)
        unit_plural = get_singulars(unit_singular)
        graph_unit_singular = (
            pynutil.insert('currency: "')
            + convert_space(unit_singular)
            + pynutil.insert('"')
        )
        graph_unit_plural = (
            pynutil.insert('currency: "')
            + convert_space(unit_plural)
            + pynutil.insert('"')
        )
        # Pad single-digit cents to two digits ("5" -> "05").
        add_leading_zero_to_double_digit = (DAMO_DIGIT + DAMO_DIGIT) | (
            pynutil.insert("0") + DAMO_DIGIT
        )
        # twelve dollars (and) fifty cents, zero cents
        # "sen" = cent; "satu sen" -> "01".
        cents_standalone = (
            pynutil.insert('fractional_part: "')
            + pynini.union(
                pynutil.add_weight(((DAMO_SIGMA - "satu") @ cardinal_graph), -0.7)
                @ add_leading_zero_to_double_digit
                + delete_space
                + pynutil.delete("sen"),
                pynini.cross("satu", "01") + delete_space + pynutil.delete("sen"),
            )
            + pynutil.insert('"')
        )
        # Optional "[dan] <cents> sen" after the integer amount.
        optional_cents_standalone = pynini.closure(
            delete_space
            + pynini.closure(pynutil.delete("dan") + delete_space, 0, 1)
            + insert_space
            + cents_standalone,
            0,
            1,
        )
        # twelve dollars fifty, only after integer
        optional_cents_suffix = pynini.closure(
            delete_extra_space
            + pynutil.insert('fractional_part: "')
            + pynutil.add_weight(
                cardinal_graph @ add_leading_zero_to_double_digit, -0.7
            )
            + pynutil.insert('"'),
            0,
            1,
        )
        # "<amount != satu> <currency-plural> [cents]".
        graph_integer = (
            pynutil.insert('integer_part: "')
            + ((DAMO_SIGMA - "satu") @ cardinal_graph)
            + pynutil.insert('"')
            + delete_extra_space
            + graph_unit_plural
            + (optional_cents_standalone | optional_cents_suffix)
        )
        # "satu <currency-singular> [cents]".
        graph_integer |= (
            pynutil.insert('integer_part: "')
            + pynini.cross("satu", "1")
            + pynutil.insert('"')
            + delete_extra_space
            + graph_unit_singular
            + (optional_cents_standalone | optional_cents_suffix)
        )
        graph_decimal = graph_decimal_final + delete_extra_space + graph_unit_plural
        # NOTE(review): hard-codes "$" for bare cents — confirm Rp (rupiah) was
        # not intended for the Indonesian grammar.
        graph_decimal |= (
            pynutil.insert('currency: "$" integer_part: "0" ') + cents_standalone
        )
        final_graph = graph_integer | graph_decimal
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/ordinal.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.id.utils
import
get_abs_path
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
DAMO_CHAR
,
GraphFst
from
pynini.lib
import
pynutil
class OrdinalFst(GraphFst):
    """
    Finite state transducer for classifying ordinal
    e.g. thirteenth -> ordinal { integer: "13" }
    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="ordinal", kind="classify")
        cardinal_graph = cardinal.graph_no_exception
        graph_digit = pynini.string_file(get_abs_path("data/ordinals/digit.tsv"))
        graph_teens = pynini.string_file(get_abs_path("data/ordinals/teen.tsv"))
        # Rewrite the ordinal suffix back to the cardinal form, then reuse the
        # cardinal grammar for the digits.
        # NOTE(review): "tieth"->"ty" and "th"->"" are English suffix rules —
        # confirm they apply to the Indonesian ordinal data ("ke-" prefix is the
        # usual Indonesian pattern).
        graph = pynini.closure(DAMO_CHAR) + pynini.union(
            graph_digit,
            graph_teens,
            pynini.cross("tieth", "ty"),
            pynini.cross("th", ""),
        )
        # TODO
        self.graph = graph @ cardinal_graph
        final_graph = (
            pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
        )
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/punctuation.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
GraphFst
from
pynini.lib
import
pynutil
class PunctuationFst(GraphFst):
    """
    Finite state transducer for classifying punctuation
    e.g. a, -> tokens { name: "a" } tokens { name: "," }
    """

    def __init__(self):
        super().__init__(name="punctuation", kind="classify")
        # Accept any single mark from the punctuation inventory.
        s = "!#$%&'()*+,-./:;<=>?@^_`{|}~"
        mark_acceptor = pynini.union(*s)
        # Emit the matched mark wrapped in a name field.
        tagged = pynutil.insert('name: "') + mark_acceptor + pynutil.insert('"')
        self.fst = tagged.optimize()
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/telephone.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.id.utils
import
get_abs_path
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
(
DAMO_ALNUM
,
DAMO_ALPHA
,
DAMO_DIGIT
,
GraphFst
,
insert_space
,
)
from
pynini.lib
import
pynutil
def get_serial_number(cardinal):
    """
    any alphanumerical character sequence with at least one number with length greater equal to 3
    """
    # One token: a spoken digit rendered as "0"-"9", or a single letter.
    spoken_digit = pynini.compose(cardinal.graph_no_exception, DAMO_DIGIT)
    token = spoken_digit | DAMO_ALPHA
    # Three or more tokens, dropping the separating spaces.
    joined = token + pynini.closure(pynutil.delete(" ") + token, 2)
    # Require at least one digit somewhere in the output.
    joined = joined @ (
        pynini.closure(DAMO_ALNUM) + DAMO_DIGIT + pynini.closure(DAMO_ALNUM)
    )
    return joined.optimize()
class TelephoneFst(GraphFst):
    """
    Finite state transducer for classifying telephone numbers, e.g.
    one two three one two three five six seven eight -> { number_part: "123-123-5678" }
    This class also support card number and IP format.
    "one two three dot one double three dot o dot four o" -> { number_part: "123.133.0.40"}
    "three two double seven three two one four three two one four three double zero five" ->
        { number_part: 3277 3214 3214 3005}
    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="telephone", kind="classify")
        # country code, number_part, extension
        # Map a written digit to its spoken form; "0" additionally maps to
        # "o" / "oh" / "nol" (Indonesian zero) beyond what digit.tsv provides.
        digit_to_str = (
            pynini.invert(
                pynini.string_file(get_abs_path("data/numbers/digit.tsv")).optimize()
            )
            | pynini.cross("0", pynini.union("o", "oh", "nol")).optimize()
        )
        # Spoken word -> written digit (the direction actually used below).
        str_to_digit = pynini.invert(digit_to_str)
        # Accept "dobel <word>" for a repeated spoken digit pair, e.g.
        # "dobel tiga" standing for "tiga tiga" (built in inverted form, then
        # flipped in place by double_digit.invert() below).
        double_digit = pynini.union(
            *[
                pynini.cross(
                    pynini.project(str(i) @ digit_to_str, "output")
                    + pynini.accep(" ")
                    + pynini.project(str(i) @ digit_to_str, "output"),
                    pynutil.insert("dobel ")
                    + pynini.project(str(i) @ digit_to_str, "output"),
                )
                for i in range(10)
            ]
        )
        # NOTE: in-place mutation — double_digit now reads "dobel X" -> "XX".
        double_digit.invert()
        # to handle cases like "one twenty three"
        two_digit_cardinal = pynini.compose(cardinal.graph_no_exception, DAMO_DIGIT**2)
        # Two written digits produced either from "dobel X" or from a spoken
        # two-digit cardinal (e.g. "dua puluh tiga").
        double_digit_to_digit = (
            pynini.compose(double_digit, str_to_digit + pynutil.delete(" ") + str_to_digit)
            | two_digit_cardinal
        )
        # Slightly prefer the double-digit reading over two single digits
        # (negative weight = preferred path).
        single_or_double_digit = (
            pynutil.add_weight(double_digit_to_digit, -0.0001) | str_to_digit
        ).optimize()
        # Allow arbitrarily long digit sequences, deleting separating spaces.
        single_or_double_digit |= (
            single_or_double_digit
            + pynini.closure(
                pynutil.add_weight(pynutil.delete(" ") + single_or_double_digit, 0.0001)
            )
        ).optimize()
        # Phone layout: 2-digit area code, space, 4 digits, hyphen, 4 digits.
        number_part = pynini.compose(
            single_or_double_digit,
            DAMO_DIGIT**2
            + pynutil.insert(" ")
            + DAMO_DIGIT**4
            + pynutil.insert("-")
            + DAMO_DIGIT**4,
        ).optimize()
        number_part = (
            pynutil.insert('number_part: "') + number_part.optimize() + pynutil.insert('"')
        )
        # 2- or 3-digit cardinal, used for country codes and IP octets.
        cardinal_option = pynini.compose(single_or_double_digit, DAMO_DIGIT ** (2, 3))
        # Optional "ditambah" ("plus") prefix becomes "+"; up to 3 spoken
        # digits or a short cardinal form the country code itself.
        country_code = (
            pynutil.insert('country_code: "')
            + pynini.closure(pynini.cross("ditambah ", "+"), 0, 1)
            + (
                (
                    pynini.closure(str_to_digit + pynutil.delete(" "), 0, 2)
                    + str_to_digit
                )
                | cardinal_option
            )
            + pynutil.insert('"')
        )
        optional_country_code = pynini.closure(
            country_code + pynutil.delete(" ") + insert_space, 0, 1
        ).optimize()
        graph = optional_country_code + number_part
        # credit card number
        # 4 groups of 4 digits separated by spaces.
        space_four_digits = insert_space + DAMO_DIGIT**4
        credit_card_graph = pynini.compose(
            single_or_double_digit, DAMO_DIGIT**4 + space_four_digits**3
        ).optimize()
        graph |= (
            pynutil.insert('number_part: "')
            + credit_card_graph.optimize()
            + pynutil.insert('"')
        )
        # SSN
        # 3-2-4 digit layout with inserted hyphens.
        ssn_graph = pynini.compose(
            single_or_double_digit,
            DAMO_DIGIT**3
            + pynutil.insert("-")
            + DAMO_DIGIT**2
            + pynutil.insert("-")
            + DAMO_DIGIT**4,
        ).optimize()
        graph |= (
            pynutil.insert('number_part: "') + ssn_graph.optimize() + pynutil.insert('"')
        )
        # ip
        # One IP octet: 1-3 digits from single digits, "dobel" pairs, or a
        # short cardinal.
        digit_or_double = (
            pynini.closure(str_to_digit + pynutil.delete(" "), 0, 1)
            + double_digit_to_digit
        )
        digit_or_double |= double_digit_to_digit + pynini.closure(
            pynutil.delete(" ") + str_to_digit, 0, 1
        )
        digit_or_double |= str_to_digit + (pynutil.delete(" ") + str_to_digit) ** (0, 2)
        digit_or_double |= cardinal_option
        digit_or_double = digit_or_double.optimize()
        # Four octets joined by spoken "dot".
        ip_graph = digit_or_double + (pynini.cross(" dot ", ".") + digit_or_double) ** 3
        graph |= (
            pynutil.insert('number_part: "') + ip_graph.optimize() + pynutil.insert('"')
        )
        # Serial numbers get a small positive weight so more specific formats
        # above win when both match.
        graph |= (
            pynutil.insert('number_part: "')
            + pynutil.add_weight(get_serial_number(cardinal=cardinal), weight=0.0001)
            + pynutil.insert('"')
        )
        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/time.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.id.taggers.cardinal
import
CardinalFst
from
fun_text_processing.inverse_text_normalization.id.utils
import
get_abs_path
,
num_to_word
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
(
GraphFst
,
convert_space
,
delete_extra_space
,
delete_space
,
insert_space
,
)
from
pynini.lib
import
pynutil
class TimeFst(GraphFst):
    """
    Finite state transducer for classifying time
    e.g. twelve thirty -> time { hours: "12" minutes: "30" }
    e.g. twelve past one -> time { minutes: "12" hours: "1" }
    e.g. two o clock a m -> time { hours: "2" suffix: "a.m." }
    e.g. quarter to two -> time { hours: "1" minutes: "45" }
    e.g. quarter past two -> time { hours: "2" minutes: "15" }
    e.g. half past two -> time { hours: "2" minutes: "30" }

    NOTE(review): the docstring examples are English although the grammar
    below matches Indonesian words ("jam", "pukul", "setengah", ...) —
    presumably inherited from the English grammar this was ported from.
    """

    def __init__(self):
        super().__init__(name="time", kind="classify")
        # hours, minutes, seconds, suffix, zone, style, speak_period
        suffix_graph = pynini.string_file(get_abs_path("data/time/time_suffix.tsv"))
        time_zone_graph = pynini.invert(
            pynini.string_file(get_abs_path("data/time/time_zone.tsv"))
        )
        to_hour_graph = pynini.string_file(get_abs_path("data/time/to_hour.tsv"))
        minute_to_graph = pynini.string_file(get_abs_path("data/time/minute_to.tsv"))
        # Negative weight makes the cardinal reading preferred when composed.
        cardinal = pynutil.add_weight(CardinalFst().graph_no_exception, weight=-0.7)
        # Indonesian hour words 0-13.
        labels_hour = [
            "nol",
            "satu",
            "dua",
            "tiga",
            "empat",
            "lima",
            "enam",
            "tujuh",
            "delapan",
            "sembilan",
            "sepuluh",
            "sebelas",
            "duabelas",
            "tigabelas",
        ]
        # NOTE(review): num_to_word uses the English `inflect` engine, so
        # these minute labels are English number words ("one", "ten", ...);
        # confirm this is intended for an Indonesian grammar.
        labels_minute_single = [num_to_word(x) for x in range(1, 10)]
        labels_minute_double = [num_to_word(x) for x in range(10, 60)]
        # Restrict the cardinal grammar to the allowed spoken labels.
        graph_hour = pynini.union(*labels_hour) @ cardinal
        graph_minute_single = pynini.union(*labels_minute_single) @ cardinal
        graph_minute_double = pynini.union(*labels_minute_double) @ cardinal
        # "setengah" (half) -> 30, "seperempat" (quarter) -> 15.
        graph_minute_verbose = pynini.cross("setengah", "30") | pynini.cross(
            "seperempat", "15"
        )
        # "jam" / "pukul" (o'clock markers) are deleted from the output.
        oclock = pynini.cross(pynini.union("jam", "pukul"), "")
        final_graph_hour = pynutil.insert('hours: "') + graph_hour + pynutil.insert('"')
        # Minutes: "jam/pukul" alone means :00; "o <digit>" means :0X;
        # otherwise a two-digit minute word.
        graph_minute = (
            oclock + pynutil.insert("00")
            | pynutil.delete("o") + delete_space + graph_minute_single
            | graph_minute_double
        )
        final_suffix = (
            pynutil.insert('suffix: "') + convert_space(suffix_graph) + pynutil.insert('"')
        )
        final_suffix = delete_space + insert_space + final_suffix
        final_suffix_optional = pynini.closure(final_suffix, 0, 1)
        final_time_zone_optional = pynini.closure(
            delete_space
            + insert_space
            + pynutil.insert('zone: "')
            + convert_space(time_zone_graph)
            + pynutil.insert('"'),
            0,
            1,
        )
        # five o' clock
        # two o eight, two thirty five (am/pm)
        # two pm/am
        graph_hm = (
            final_graph_hour
            + delete_extra_space
            + pynutil.insert('minutes: "')
            + graph_minute
            + pynutil.insert('"')
        )
        # 10 past four, quarter past four, half past four
        # NOTE(review): "setengah" is deleted here as the past-marker keyword;
        # verify this matches the intended Indonesian phrasing.
        graph_m_past_h = (
            pynutil.insert('minutes: "')
            + pynini.union(graph_minute_single, graph_minute_double, graph_minute_verbose)
            + pynutil.insert('"')
            + delete_space
            + pynutil.delete("setengah")
            + delete_extra_space
            + final_graph_hour
        )
        # "seperempat to/till <hour>" -> minutes 45 of the previous hour
        # (to_hour.tsv maps the spoken hour to the preceding one).
        # NOTE(review): "to"/"till" are English keywords in this otherwise
        # Indonesian grammar — confirm they should not be "sampai" etc.
        graph_quarter_time = (
            pynutil.insert('minutes: "')
            + pynini.cross("seperempat", "45")
            + pynutil.insert('"')
            + delete_space
            + pynutil.delete(pynini.union("to", "till"))
            + delete_extra_space
            + pynutil.insert('hours: "')
            + to_hour_graph
            + pynutil.insert('"')
        )
        # "<N> (menit) sampai <hour> <suffix>" -> minute_to.tsv converts N
        # into the minutes-before-the-hour value.
        graph_m_to_h_suffix_time = (
            pynutil.insert('minutes: "')
            + ((graph_minute_single | graph_minute_double).optimize() @ minute_to_graph)
            + pynutil.insert('"')
            + pynini.closure(
                delete_space + pynutil.delete(pynini.union("min", "mins", "menit")),
                0,
                1,
            )
            + delete_space
            + pynutil.delete(pynini.union("sampai"))
            + delete_extra_space
            + pynutil.insert('hours: "')
            + to_hour_graph
            + pynutil.insert('"')
            + final_suffix
        )
        # Hour with mandatory suffix; minutes default to "00".
        graph_h = (
            final_graph_hour
            + delete_extra_space
            + pynutil.insert('minutes: "')
            + (pynutil.insert("00") | graph_minute)
            + pynutil.insert('"')
            + final_suffix
            + final_time_zone_optional
        )
        final_graph = (
            (graph_hm | graph_m_past_h | graph_quarter_time)
            + final_suffix_optional
            + final_time_zone_optional
        )
        final_graph |= graph_h
        final_graph |= graph_m_to_h_suffix_time
        final_graph = self.add_tokens(final_graph.optimize())
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/tokenize_and_classify.py
0 → 100644
View file @
70a8a9e0
import
os
import
pynini
from
fun_text_processing.inverse_text_normalization.id.taggers.cardinal
import
CardinalFst
from
fun_text_processing.inverse_text_normalization.id.taggers.date
import
DateFst
from
fun_text_processing.inverse_text_normalization.id.taggers.decimal
import
DecimalFst
from
fun_text_processing.inverse_text_normalization.id.taggers.electronic
import
ElectronicFst
from
fun_text_processing.inverse_text_normalization.id.taggers.measure
import
MeasureFst
from
fun_text_processing.inverse_text_normalization.id.taggers.money
import
MoneyFst
from
fun_text_processing.inverse_text_normalization.id.taggers.ordinal
import
OrdinalFst
from
fun_text_processing.inverse_text_normalization.id.taggers.punctuation
import
PunctuationFst
from
fun_text_processing.inverse_text_normalization.id.taggers.telephone
import
TelephoneFst
from
fun_text_processing.inverse_text_normalization.id.taggers.time
import
TimeFst
from
fun_text_processing.inverse_text_normalization.id.taggers.whitelist
import
WhiteListFst
from
fun_text_processing.inverse_text_normalization.id.taggers.word
import
WordFst
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
(
GraphFst
,
delete_extra_space
,
delete_space
,
generator_main
,
)
from
pynini.lib
import
pynutil
import
logging
class ClassifyFst(GraphFst):
    """
    Final class that composes all other classification grammars. This class can process an entire sentence, that is lower cased.
    For deployment, this grammar will be compiled and exported to OpenFst Finite State Archive (FAR) File.
    More details to deployment at NeMo/tools/text_processing_deployment.

    Args:
        cache_dir: path to a dir with .far grammar file. Set to None to avoid using cache.
        overwrite_cache: set to True to overwrite .far files
    """

    def __init__(self, cache_dir: str = None, overwrite_cache: bool = False):
        super().__init__(name="tokenize_and_classify", kind="classify")

        # Resolve the FAR cache location, creating the directory if needed.
        # The string "None" is treated like None (callers may pass it from CLI).
        far_file = None
        if cache_dir is not None and cache_dir != "None":
            os.makedirs(cache_dir, exist_ok=True)
            far_file = os.path.join(cache_dir, "_id_itn.far")

        if not overwrite_cache and far_file and os.path.exists(far_file):
            # Fast path: restore the precompiled grammar from the FAR archive.
            self.fst = pynini.Far(far_file, mode="r")["tokenize_and_classify"]
            # Lazy %-style args instead of f-strings: the message is only
            # formatted if this log level is actually emitted.
            logging.info("ClassifyFst.fst was restored from %s.", far_file)
        else:
            logging.info("Creating ClassifyFst grammars.")
            # Build the individual semiotic-class grammars. Cardinal/ordinal/
            # decimal feed into the composite grammars (measure, money, ...).
            cardinal = CardinalFst()
            cardinal_graph = cardinal.fst

            ordinal = OrdinalFst(cardinal)
            ordinal_graph = ordinal.fst

            decimal = DecimalFst(cardinal)
            decimal_graph = decimal.fst

            measure_graph = MeasureFst(cardinal=cardinal, decimal=decimal).fst
            date_graph = DateFst(ordinal=ordinal).fst
            word_graph = WordFst().fst
            time_graph = TimeFst().fst
            money_graph = MoneyFst(cardinal=cardinal, decimal=decimal).fst
            whitelist_graph = WhiteListFst().fst
            punct_graph = PunctuationFst().fst
            electronic_graph = ElectronicFst().fst
            telephone_graph = TelephoneFst(cardinal).fst

            # Union of all classifiers. Lower weight = higher priority:
            # whitelist (1.01) beats date (1.09) beats the 1.1 group; the
            # catch-all word grammar (100) only fires when nothing else does.
            classify = (
                pynutil.add_weight(whitelist_graph, 1.01)
                | pynutil.add_weight(time_graph, 1.1)
                | pynutil.add_weight(date_graph, 1.09)
                | pynutil.add_weight(decimal_graph, 1.1)
                | pynutil.add_weight(measure_graph, 1.1)
                | pynutil.add_weight(cardinal_graph, 1.1)
                | pynutil.add_weight(ordinal_graph, 1.1)
                | pynutil.add_weight(money_graph, 1.1)
                | pynutil.add_weight(telephone_graph, 1.1)
                | pynutil.add_weight(electronic_graph, 1.1)
                | pynutil.add_weight(word_graph, 100)
            )

            # Wrap each classification (and punctuation) in a "tokens { ... }"
            # frame; punctuation may appear before and/or after a token.
            punct = (
                pynutil.insert("tokens { ")
                + pynutil.add_weight(punct_graph, weight=1.1)
                + pynutil.insert(" }")
            )
            token = pynutil.insert("tokens { ") + classify + pynutil.insert(" }")
            token_plus_punct = (
                pynini.closure(punct + pynutil.insert(" "))
                + token
                + pynini.closure(pynutil.insert(" ") + punct)
            )

            # A sentence is one or more tokens; outer/extra spaces are dropped.
            graph = token_plus_punct + pynini.closure(delete_extra_space + token_plus_punct)
            graph = delete_space + graph + delete_space

            self.fst = graph.optimize()

            if far_file:
                # Persist the compiled grammar so later runs take the fast path.
                generator_main(far_file, {"tokenize_and_classify": self.fst})
                logging.info("ClassifyFst grammars are saved to %s.", far_file)
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/whitelist.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.id.utils
import
get_abs_path
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
GraphFst
,
convert_space
from
pynini.lib
import
pynutil
class WhiteListFst(GraphFst):
    """
    Finite state transducer for classifying whitelisted tokens
    e.g. misses -> tokens { name: "mrs." }
    This class has highest priority among all classifier grammars. Whitelisted tokens are defined and loaded from "data/whitelist.tsv".
    """

    def __init__(self):
        super().__init__(name="whitelist", kind="classify")
        # Load the spoken->written mapping; the TSV stores written->spoken,
        # so invert it before use.
        whitelist_mapping = pynini.string_file(get_abs_path("data/whitelist.tsv"))
        spoken_to_written = whitelist_mapping.invert()
        # Emit the match as a quoted `name` attribute.
        open_attr = pynutil.insert('name: "')
        close_attr = pynutil.insert('"')
        graph = open_attr + convert_space(spoken_to_written) + close_attr
        self.fst = graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/id/taggers/word.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
DAMO_NOT_SPACE
,
GraphFst
from
pynini.lib
import
pynutil
class WordFst(GraphFst):
    """
    Finite state transducer for classifying plain tokens, that do not belong to any special class. This can be considered as the default class.
    e.g. sleep -> tokens { name: "sleep" }
    """

    def __init__(self):
        super().__init__(name="word", kind="classify")
        # One or more non-space characters, wrapped as a quoted `name` field.
        token_chars = pynini.closure(DAMO_NOT_SPACE, 1)
        graph = (
            pynutil.insert('name: "')
            + token_chars
            + pynutil.insert('"')
        )
        self.fst = graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/id/text_cases.tsv
0 → 100644
View file @
70a8a9e0
dua ribu dua puluh dua 2022
nol satu dua tiga empat lima enam tujuh delapan sembilan 0123456789
empat belas 14
limabelas 15
enambelas 16
tujuh belas 17
delapan belas 18
sembilan belas 19
dua puluh 20
seratus enam 106
enam ratus 600
ratus 100
seratus 100
satu juta 1,000,000
satu miliar 1 miliar
seratus dua puluh tiga 123
ratus dua puluh tiga 123
dua puluh empat maret 24th March
seribu dua ratus delapan puluh sembilan 1289
lima juta tiga ribu tujuh puluh enam rupiah Rp5003076
ribu tujuh puluh enam rupiah Rp1076
tujuh puluh enam rupiah dollar $1076
ditambah enam dua dua satu enam lima tiga sembilan nol enam nol lima +62 21 6539-0605
tiga ribu 3000
sembilan ribu sembilan ratus sembilan puluh sembilan 9999
seribu satu 1001
nol 0
satu 1
dua 2
tiga 3
empat 4
lima 5
enam 6
tujuh 7
delapan 8
sembilan 9
sepuluh 10
sebelas 11
dua belas 12
tigabelas 13
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/id/utils.py
0 → 100644
View file @
70a8a9e0
import
os
from
typing
import
Union
import
inflect
_inflect
=
inflect
.
engine
()
def num_to_word(x: Union[str, int]):
    """Convert an integer (or numeric string) to its spoken representation.

    Args:
        x: integer or string form of an integer

    Returns:
        Spoken representation with hyphens replaced by spaces and commas removed.
    """
    # str() accepts both int and str inputs, so a single conversion suffices.
    spoken = _inflect.number_to_words(str(x))
    return spoken.replace("-", " ").replace(",", "")
def get_abs_path(rel_path):
    """Get the absolute path of a file relative to this module's directory.

    Args:
        rel_path: path relative to the directory containing this file

    Returns:
        The absolute path as a string.
    """
    # os.path.join handles separators portably instead of hand-built
    # string concatenation with "/".
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), rel_path)
FunASR/fun_text_processing/inverse_text_normalization/id/verbalizers/__init__.py
0 → 100644
View file @
70a8a9e0
FunASR/fun_text_processing/inverse_text_normalization/id/verbalizers/cardinal.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.id.graph_utils
import
(
DAMO_NOT_QUOTE
,
GraphFst
,
delete_space
,
)
from
pynini.lib
import
pynutil
class CardinalFst(GraphFst):
    """
    Finite state transducer for verbalizing cardinal
    e.g. cardinal { integer: "23" negative: "-" } -> -23
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="verbalize")
        # Optionally consume a `negative: "-"` attribute, keeping only the
        # single sign character between the quotes.
        sign_part = (
            pynutil.delete("negative:")
            + delete_space
            + pynutil.delete('"')
            + DAMO_NOT_QUOTE
            + pynutil.delete('"')
            + delete_space
        )
        optional_sign = pynini.closure(sign_part, 0, 1)
        # Consume `integer: "..."`, keeping the quoted digit string.
        integer_part = (
            pynutil.delete("integer:")
            + delete_space
            + pynutil.delete('"')
            + pynini.closure(DAMO_NOT_QUOTE, 1)
            + pynutil.delete('"')
        )
        # Expose the bare integer verbalizer for reuse by other verbalizers.
        self.numbers = integer_part
        full_graph = optional_sign + integer_part
        self.fst = self.delete_tokens(full_graph).optimize()
Prev
1
…
23
24
25
26
27
28
29
30
31
…
42
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment