Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
yangrong
Paraformer_FunASR_pytorch
Commits
70a8a9e0
Commit
70a8a9e0
authored
Oct 03, 2024
by
wangwei990215
Browse files
initial commit
parents
Changes
827
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
992 additions
and
0 deletions
+992
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/__init__.py
...sing/inverse_text_normalization/fr/data/roman/__init__.py
+1
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/digits_large.tsv
...inverse_text_normalization/fr/data/roman/digits_large.tsv
+10
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/hundreds_large.tsv
...verse_text_normalization/fr/data/roman/hundreds_large.tsv
+10
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/ties_large.tsv
...g/inverse_text_normalization/fr/data/roman/ties_large.tsv
+10
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/suppletive.tsv
...cessing/inverse_text_normalization/fr/data/suppletive.tsv
+0
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/__init__.py
...ssing/inverse_text_normalization/fr/data/time/__init__.py
+1
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/hour_to_night.tsv
...inverse_text_normalization/fr/data/time/hour_to_night.tsv
+13
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/hours.tsv
...cessing/inverse_text_normalization/fr/data/time/hours.tsv
+27
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/hours_to.tsv
...sing/inverse_text_normalization/fr/data/time/hours_to.tsv
+26
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/minutes.tsv
...ssing/inverse_text_normalization/fr/data/time/minutes.tsv
+64
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/minutes_to.tsv
...ng/inverse_text_normalization/fr/data/time/minutes_to.tsv
+60
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/time_suffix_am.tsv
...nverse_text_normalization/fr/data/time/time_suffix_am.tsv
+2
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/time_suffix_pm.tsv
...nverse_text_normalization/fr/data/time/time_suffix_pm.tsv
+3
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/whitelist.tsv
...ocessing/inverse_text_normalization/fr/data/whitelist.tsv
+17
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/graph_utils.py
...t_processing/inverse_text_normalization/fr/graph_utils.py
+180
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/__init__.py
...cessing/inverse_text_normalization/fr/taggers/__init__.py
+1
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/cardinal.py
...cessing/inverse_text_normalization/fr/taggers/cardinal.py
+287
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/date.py
..._processing/inverse_text_normalization/fr/taggers/date.py
+48
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/decimal.py
...ocessing/inverse_text_normalization/fr/taggers/decimal.py
+126
-0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/electronic.py
...ssing/inverse_text_normalization/fr/taggers/electronic.py
+106
-0
No files found.
Too many changes to show.
To preserve performance only
827 of 827+
files are displayed.
Plain diff
Email patch
FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/__init__.py
0 → 100644
View file @
70a8a9e0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/digits_large.tsv
0 → 100644
View file @
70a8a9e0
I 1
II 2
III 3
IV 4
V 5
VI 6
VII 7
VIII 8
IX 9
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/hundreds_large.tsv
0 → 100644
View file @
70a8a9e0
C 1
CC 2
CCC 3
CD 4
D 5
DC 6
DCC 7
DCCC 8
CM 9
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/fr/data/roman/ties_large.tsv
0 → 100644
View file @
70a8a9e0
X 1
XX 2
XXX 3
XL 4
L 5
LX 6
LXX 7
LXXX 8
XC 9
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/fr/data/suppletive.tsv
0 → 100644
View file @
70a8a9e0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/__init__.py
0 → 100644
View file @
70a8a9e0
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/hour_to_night.tsv
0 → 100644
View file @
70a8a9e0
1 13
2 14
3 15
4 16
5 17
6 18
7 19
8 20
9 21
10 22
11 23
12 0
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/hours.tsv
0 → 100644
View file @
70a8a9e0
zéro 0
une 1
deux 2
trois 3
quatre 4
cinq 5
six 6
sept 7
huit 8
neuf 9
dix 10
onze 11
douze 12
treize 13
quatorze 14
quinze 15
seize 16
dix-sept 17
dix-huit 18
dix-neuf 19
vingt 20
vingt-et-une 21
vingt et une	21
vingt-deux 22
vingt-trois 23
vingt-quatre 24
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/hours_to.tsv
0 → 100644
View file @
70a8a9e0
1 0
2 1
3 2
4 3
5 4
6 5
7 6
8 7
9 8
10 9
11 10
12 11
13 12
14 13
15 14
16 15
17 16
18 17
19 18
20 19
21 20
22 21
23 22
24 23
0 23
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/minutes.tsv
0 → 100644
View file @
70a8a9e0
une 01
deux 02
trois 03
quatre 04
cinq 05
six 06
sept 07
huit 08
neuf 09
dix 10
onze 11
douze 12
treize 13
quatorze 14
quinze 15
seize 16
dix-sept 17
dix-huit 18
dix-neuf 19
vingt 20
vingt-et-une 21
vingt et une 21
vingt-deux 22
vingt-trois 23
vingt-quatre	24
vingt-cinq 25
vingt-six 26
vingt-sept 27
vingt-huit 28
vingt-neuf 29
trente 30
trente-et-une 31
trente et une 31
trente-deux 32
trente-trois 33
trente-quatre 34
trente-cinq 35
trente-six 36
trente-sept 37
trente-huit 38
trente-neuf 39
quarante 40
quarante-et-une 41
quarante et une 41
quarante-deux 42
quarante-trois 43
quarante-quatre 44
quarante-cinq 45
quarante-six 46
quarante-sept 47
quarante-huit 48
quarante-neuf 49
cinquante 50
cinquante-et-une 51
cinquante et une 51
cinquante-deux 52
cinquante-trois 53
cinquante-quatre 54
cinquante-cinq 55
cinquante-six 56
cinquante-sept 57
cinquante-huit 58
cinquante-neuf 59
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/minutes_to.tsv
0 → 100644
View file @
70a8a9e0
01 59
02 58
03 57
04 56
05 55
06 54
07 53
08 52
09 51
10 50
11 49
12 48
13 47
14 46
15 45
16 44
17 43
18 42
19 41
20 40
21 39
22 38
23 37
24 36
25 35
26 34
27 33
28 32
29 31
30 30
31 29
32 28
33 27
34 26
35 25
36 24
37 23
38 22
39 21
40 20
41 19
42 18
43 17
44 16
45 15
46 14
47 13
48 12
49 11
50 10
51 09
52 08
53 07
54 06
55 05
56 04
57 03
58 02
59 01
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/time_suffix_am.tsv
0 → 100644
View file @
70a8a9e0
du matin
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/fr/data/time/time_suffix_pm.tsv
0 → 100644
View file @
70a8a9e0
de l'après-midi
du soir
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/fr/data/whitelist.tsv
0 → 100644
View file @
70a8a9e0
monsieur M.
messieurs MM.
madame Mᵐᵉ
mesdames Mᵐᵉˢ
mademoiselle Mˡˡᵉ
mademoiselles Mˡˡᵉˢ
docteur Dʳ
docteurs Dʳˢ
docteure Dʳᵉ
docteures Dʳᵉˢ
après jésus-christ apr. J.-C.
avant Jésus-Christ av. J.-C.
ca v.
vers v.
l’honorable le hon.
le très honorable	le très hon.
\ No newline at end of file
FunASR/fun_text_processing/inverse_text_normalization/fr/graph_utils.py
0 → 100644
View file @
70a8a9e0
import
os
import
string
from
pathlib
import
Path
from
typing
import
Dict
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.utils
import
get_abs_path
from
pynini
import
Far
from
pynini.examples
import
plurals
from
pynini.export
import
export
from
pynini.lib
import
byte
,
pynutil
,
utf8
# Character classes and shared helper FSTs used by every French ITN grammar in this package.
DAMO_CHAR = utf8.VALID_UTF8_CHAR  # any single valid UTF-8 character
DAMO_DIGIT = byte.DIGIT  # ASCII digit 0-9
DAMO_LOWER = pynini.union(*string.ascii_lowercase).optimize()
DAMO_UPPER = pynini.union(*string.ascii_uppercase).optimize()
DAMO_ALPHA = pynini.union(DAMO_LOWER, DAMO_UPPER).optimize()
DAMO_ALNUM = pynini.union(DAMO_DIGIT, DAMO_ALPHA).optimize()
DAMO_HEX = pynini.union(*string.hexdigits).optimize()
DAMO_NON_BREAKING_SPACE = "\u00A0"
DAMO_SPACE = " "
DAMO_WHITE_SPACE = pynini.union(" ", "\t", "\n", "\r", "\u00A0").optimize()
DAMO_NOT_SPACE = pynini.difference(DAMO_CHAR, DAMO_WHITE_SPACE).optimize()
DAMO_NOT_QUOTE = pynini.difference(DAMO_CHAR, r'"').optimize()
DAMO_PUNCT = pynini.union(*map(pynini.escape, string.punctuation)).optimize()
DAMO_GRAPH = pynini.union(DAMO_ALNUM, DAMO_PUNCT).optimize()
DAMO_SIGMA = pynini.closure(DAMO_CHAR)  # sigma-star: any string over DAMO_CHAR

# Whitespace helpers shared by taggers/verbalizers.
delete_space = pynutil.delete(pynini.closure(DAMO_WHITE_SPACE))
insert_space = pynutil.insert(" ")
# Collapse one-or-more whitespace characters into a single space.
delete_extra_space = pynini.cross(pynini.closure(DAMO_WHITE_SPACE, 1), " ")

# French frequently compounds numbers with hyphen.
delete_hyphen = pynutil.delete(pynini.closure("-", 0, 1))  # optionally delete one hyphen
insert_hyphen = pynutil.insert("-")

# French pluralization transducer:
#   - irregular (suppletive) forms come from data/suppletive.tsv,
#   - otherwise append "s", or "x" after a word ending in eau/eu/ou,
#   - or rewrite a final al/ail to aux.
suppletive = pynini.string_file(get_abs_path("data/suppletive.tsv"))
_s = DAMO_SIGMA + pynutil.insert("s")
_x = DAMO_SIGMA + pynini.string_map([("eau"), ("eu"), ("ou")]) + pynutil.insert("x")
_aux = DAMO_SIGMA + pynini.string_map([("al", "aux"), ("ail", "aux")])

# Priority as written: suppletive forms win first, then "-s", then "-x"/"-aux".
graph_plural = plurals._priority_union(
    suppletive,
    plurals._priority_union(_s, pynini.union(_x, _aux), DAMO_SIGMA),
    DAMO_SIGMA,
).optimize()

SINGULAR_TO_PLURAL = graph_plural
PLURAL_TO_SINGULAR = pynini.invert(graph_plural)

# ASCII case-mapping transducers (one arc per letter pair).
TO_LOWER = pynini.union(
    *[pynini.cross(x, y) for x, y in zip(string.ascii_uppercase, string.ascii_lowercase)]
)
TO_UPPER = pynini.invert(TO_LOWER)
def generator_main(file_name: str, graphs: Dict[str, pynini.FstLike]):
    """
    Export a set of grammar graphs as a single OpenFst finite state archive (FAR).

    Each entry of *graphs* is optimized and stored in the archive under its rule name.

    Args:
        file_name: path of the FAR file to create
        graphs: mapping of rule name -> pynini WFST graph to be exported
    """
    far_exporter = export.Exporter(file_name)
    for rule_name, rule_graph in graphs.items():
        far_exporter[rule_name] = rule_graph.optimize()
    far_exporter.close()
    print(f"Created {file_name}")
def get_plurals(fst):
    """
    Map the singular forms accepted by *fst* to their plural spellings.

    Args:
        fst: Fst whose input side is composed with the pluralization transducer

    Returns:
        Fst producing plurals for the given singular forms
    """
    # `pynini.compose(a, b)` is the functional spelling of `a @ b`.
    return pynini.compose(SINGULAR_TO_PLURAL, fst)
def get_singulars(fst):
    """
    Map the plural forms accepted by *fst* to their singular spellings.

    Args:
        fst: Fst whose input side is composed with the de-pluralization transducer

    Returns:
        Fst producing singulars for the given plural forms
    """
    # `pynini.compose(a, b)` is the functional spelling of `a @ b`.
    return pynini.compose(PLURAL_TO_SINGULAR, fst)
def convert_space(fst) -> "pynini.FstLike":
    """
    Rewrite every breaking space on the output side of *fst* to a non-breaking space.

    Used only in tagger grammars for transducing token values within quotes,
    e.g. name: "hello kitty". This makes the transducer significantly slower, so
    only use it when spaces may occur inside quoted values.

    Args:
        fst: input fst

    Returns:
        output fst where breaking spaces are converted to non-breaking spaces
    """
    space_to_nbsp = pynini.cdrewrite(
        pynini.cross(DAMO_SPACE, DAMO_NON_BREAKING_SPACE), "", "", DAMO_SIGMA
    )
    return pynini.compose(fst, space_to_nbsp)
class GraphFst:
    """
    Base class for all grammar fsts.

    Args:
        name: name of grammar class
        kind: either 'classify' or 'verbalize'
        deterministic: if True will provide a single transduction option,
            for False multiple transduction are generated (used for audio-based normalization)
    """

    def __init__(self, name: str, kind: str, deterministic: bool = True):
        self.name = name
        self.kind = kind
        self._fst = None  # populated from a pre-compiled FAR below, or by a subclass
        self.deterministic = deterministic

        # Pre-compiled grammar archive location: <module dir>/grammars/<kind>/<name>.far
        self.far_path = Path(os.path.dirname(__file__) + "/grammars/" + kind + "/" + name + ".far")
        if self.far_exist():
            # Reuse the pre-compiled FST instead of rebuilding the grammar from scratch.
            self._fst = Far(
                self.far_path, mode="r", arc_type="standard", far_type="default"
            ).get_fst()

    def far_exist(self) -> bool:
        """
        Returns true if FAR can be loaded
        """
        return self.far_path.exists()

    @property
    def fst(self) -> "pynini.FstLike":
        # Compiled FST for this grammar (None until built or loaded).
        return self._fst

    @fst.setter
    def fst(self, fst):
        self._fst = fst

    def add_tokens(self, fst) -> "pynini.FstLike":
        """
        Wraps class name around to given fst

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        # Produces output of the form: `<name> { <fst output> }`.
        return pynutil.insert(f"{self.name} {{ ") + fst + pynutil.insert(" }")

    def delete_tokens(self, fst) -> "pynini.FstLike":
        """
        Deletes class name wrap around output of given fst

        Args:
            fst: input fst

        Returns:
            Fst: fst
        """
        res = (
            pynutil.delete(f"{self.name}")
            + delete_space
            + pynutil.delete("{")
            + delete_space
            + fst
            + delete_space
            + pynutil.delete("}")
        )
        # Restore non-breaking spaces (as inserted by convert_space) back to regular spaces.
        return res @ pynini.cdrewrite(pynini.cross("\u00A0", " "), "", "", DAMO_SIGMA)
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/__init__.py
0 → 100644
View file @
70a8a9e0
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/cardinal.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
DAMO_CHAR
,
DAMO_DIGIT
,
DAMO_NOT_SPACE
,
DAMO_SIGMA
,
DAMO_SPACE
,
GraphFst
,
delete_hyphen
,
)
from
fun_text_processing.inverse_text_normalization.fr.utils
import
get_abs_path
from
pynini.lib
import
pynutil
def rewrite(cardinal: "pynini.FstLike") -> "pynini.FstLike":
    """
    Function to rewrite cardinals written in traditional orthography (no '-' for numbers >100)
    to current orthography ('-' between all words in number string)
    e.g. deux mille cent vingt-trois -> deux-mille-cent-vingt-trois.

    In cases where original orthography is current, or string is mixture of two orthographies,
    will render invalid form that will not pass through CardinalFst
    e.g. deux-mille cent-vingt-trois -> "deux##vingt-trois" ('#' is not accepted in cardinal FST and will fail to convert.)

    Args:
        cardinal: cardinal FST
    """
    # Traditional orthography does not hyphenate numbers > 100, this will insert hyphens in
    # those contexts.
    targets = pynini.string_map(
        [
            "et",  # for 'et un/onze'
            "cent",
            "mille",
            "million",
            "milliard",
            "billion",
            "billiard",
            "trillion",
            "trilliard",
        ]
    )
    # Also accept the plural spellings ('cents', 'millions', ...).
    targets += pynini.accep("s").ques

    no_spaces = pynini.closure(DAMO_NOT_SPACE)

    # Valid numbers in reformed orthography will have no spaces.
    new_orthography_sigma = no_spaces

    # Old orthography will not have these strings. Replacing with character to mark.
    targets_for_filtering = ("-" + targets) | ("-" + targets + "-") | (targets + "-")

    filter = pynini.cdrewrite(
        pynini.cross(targets_for_filtering, "#"), "", "", DAMO_SIGMA
    )  # Invalid for cardinal

    # Marked character removed from sigma_star.
    old_orthography_sigma = pynini.difference(DAMO_CHAR, "#")
    # NOTE(review): pynini's Fst.closure() mutates in place, turning the single-character
    # acceptor into sigma-star for the composition below — confirm against pynini docs.
    old_orthography_sigma.closure()

    # Only accept strings that occur in old orthography. (This avoids tying two non-related numbers together.)
    # e.g. mille cent-une -> mille-cent-une
    filter @= old_orthography_sigma

    # Now know replacements will only work around targets
    replace_left = pynini.cdrewrite(pynini.cross(" ", "-"), "", targets, DAMO_SIGMA)
    replace_right = pynini.cdrewrite(pynini.cross(" ", "-"), targets, "", DAMO_SIGMA)

    replace = replace_left @ replace_right

    # Either the string is already fully hyphenated (new orthography) or it passes
    # through the old-orthography filter + space-to-hyphen replacement.
    graph = new_orthography_sigma | (filter @ replace)

    return graph @ cardinal
class CardinalFst(GraphFst):
    """
    Finite state transducer for classifying cardinals
        e.g. moins vingt-trois -> cardinal { negative: "-" integer: "23"}

    This class converts cardinals up to (but not including) "un-quatrillion",
    i.e up to "one septillion" in English (10^{24}).
    Cardinals below nine are not converted (in order to avoid
    "j'ai un pomme." --> "j'ai 1 pomme" and any other odd conversions.)

    This transducer accommodates both traditional hyphenation of numbers ('-' for most numbers <100)
    and current hyphenation (all elements of number are hyphenated), prioritizing the latter.
        e.g cent cinquante et un -> cardinal { integer: "151"}
            cent-cinquante-et-un -> cardinal { integer: "151"}
    This is done through a context dependent rewrite that attempts to map old spelling to new.
        e.g. cent cinquante et un -> cent-cinquante-et-un
    """

    def __init__(self):
        super().__init__(name="cardinal", kind="classify")
        # Base spoken-form -> digit mappings from TSV data files.
        graph_zero = pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        graph_digit = pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
        graph_teens = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
        graph_ties = pynini.string_file(get_abs_path("data/numbers/ties.tsv"))
        graph_ties_unique = pynini.string_file(get_abs_path("data/numbers/ties_unique.tsv"))

        # Tens components
        # ties optionally followed by a hyphenated digit, else pad ones place with "0".
        graph_tens_component = graph_ties + ((delete_hyphen + graph_digit) | pynutil.insert("0"))
        graph_tens_component = pynini.union(graph_tens_component, graph_teens, graph_ties_unique)

        # Two-digit output, padding a lone digit (or nothing) with leading zeros.
        graph_tens_component_with_leading_zeros = pynini.union(
            graph_tens_component,
            (pynutil.insert("0") + (graph_digit | pynutil.insert("0", weight=0.01))),
        )

        # Hundreds components
        graph_cent_singular = pynutil.delete("cent")  # Used in hundreds place
        # Only used as terminus of hundred sequence. deux cents -> 200, deux cent un -> 201
        graph_cent_plural = pynini.cross("cents", "00")

        # Digits except 'un'/'une' (French says 'cent', not 'un cent').
        graph_digit_no_one = pynini.project(pynini.union("un", "une"), "input")
        graph_digit_no_one = (
            pynini.project(graph_digit, "input") - graph_digit_no_one.arcsort()
        ) @ graph_digit

        # Regular way: [1-9] * 100
        graph_hundreds_component_singular = (
            graph_digit_no_one + delete_hyphen + graph_cent_singular
        )
        # Bare 'cent' means 100.
        graph_hundreds_component_singular = pynini.union(
            graph_hundreds_component_singular, pynini.cross("cent", "1")
        )
        graph_hundreds_component_singular += delete_hyphen
        graph_hundreds_component_singular += graph_tens_component_with_leading_zeros

        graph_hundreds_component_plural = graph_digit_no_one + delete_hyphen + graph_cent_plural

        # Full three-digit component, padding with "0" when there is no hundreds word.
        graph_hundreds_component = pynini.union(
            graph_hundreds_component_singular,
            graph_hundreds_component_plural,
            pynutil.insert("0") + graph_tens_component_with_leading_zeros,
        )

        # Restrict to outputs containing at least one non-zero digit (used before
        # large-denomination words, where an all-zero component is meaningless).
        graph_hundreds_component_at_least_one_none_zero_digit = graph_hundreds_component @ (
            pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT)
        )
        self.graph_hundreds_component_at_least_one_none_zero_digit = rewrite(
            graph_hundreds_component_at_least_one_none_zero_digit
        ).optimize()

        # Graph thousands (we'll need this for cases of mille millions, mille milliards...)
        # Tens of hundreds. e.g. 1900 = nineteen hundred / 'dix neuf cents'
        graph_tens_of_hundreds_component_singular = (
            graph_tens_component + delete_hyphen + graph_cent_singular
        )
        graph_tens_of_hundreds_component_singular += (
            delete_hyphen + graph_tens_component_with_leading_zeros
        )
        graph_tens_of_hundreds_component_plural = (
            graph_tens_component + delete_hyphen + graph_cent_plural
        )
        graph_tens_of_hundred_component = (
            graph_tens_of_hundreds_component_plural | graph_tens_of_hundreds_component_singular
        )

        graph_thousands = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + pynutil.delete("mille"),
            # because 'mille', not 'un mille'
            pynutil.insert("001") + pynutil.delete("mille"),
            pynutil.insert("000", weight=0.1),
        )

        # All other large amounts
        graph_millions = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("million") | pynutil.delete("millions")),
            pynutil.insert("000", weight=0.1),
        )

        graph_milliards = pynini.union(  # French for English 'billion'
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("milliard") | pynutil.delete("milliards")),
            pynutil.insert("000", weight=0.1),
        )

        graph_billions = pynini.union(  # NOTE: this is English 'trillion.'
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("billions") | pynutil.delete("billion")),
            pynutil.insert("000", weight=0.1),
        )

        graph_mille_billion = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + pynutil.delete("mille"),
            # because we say 'mille', not 'un mille'
            pynutil.insert("001") + pynutil.delete("mille"),
        )
        graph_mille_billion += delete_hyphen + (
            graph_millions | pynutil.insert("000") + pynutil.delete("billions")
        )
        # allow for 'mil millones'
        graph_mille_billion |= pynutil.insert("000000", weight=0.1)
        # NOTE(review): graph_mille_billion is built but never referenced in the final
        # union below — confirm whether this is intentional dead code.

        graph_billiards = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("billiards") | pynutil.delete("billiard")),
            pynutil.insert("000", weight=0.1),
        )

        graph_trillions = pynini.union(  # One thousand English trillions.
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("trillions") | pynutil.delete("trillion")),
            pynutil.insert("000", weight=0.1),
        )

        graph_trilliards = pynini.union(
            graph_hundreds_component_at_least_one_none_zero_digit
            + delete_hyphen
            + (pynutil.delete("trilliards") | pynutil.delete("trilliard")),
            pynutil.insert("000", weight=0.1),
        )

        # Full cardinal: concatenation of every denomination (each contributing
        # three digits, possibly all-zero), or a tens-of-hundreds reading, or zero.
        graph = pynini.union(
            graph_trilliards
            + delete_hyphen
            + graph_trillions
            + delete_hyphen
            + graph_billiards
            + delete_hyphen
            + graph_billions
            + delete_hyphen
            + graph_milliards
            + delete_hyphen
            + graph_millions
            + delete_hyphen
            + graph_thousands
            + delete_hyphen
            + graph_hundreds_component,
            graph_tens_of_hundred_component,
            graph_zero,
        )

        # Strip leading zeros from the concatenated digit string (keep a lone "0").
        graph = graph @ pynini.union(
            pynutil.delete(pynini.closure("0"))
            + pynini.difference(DAMO_DIGIT, "0")
            + pynini.closure(DAMO_DIGIT),
            "0",
        )

        # Accept both traditional and reformed orthography on the input side.
        graph = rewrite(graph)

        self.graph_no_exception = graph.optimize()

        # save self.numbers_up_to_thousand for use in DecimalFst
        digits_up_to_thousand = DAMO_DIGIT | (DAMO_DIGIT ** 2) | (DAMO_DIGIT ** 3)
        numbers_up_to_thousand = pynini.compose(graph, digits_up_to_thousand).optimize()
        self.numbers_up_to_thousand = numbers_up_to_thousand

        # save self.numbers_up_to_million for use in DecimalFst
        digits_up_to_million = (
            DAMO_DIGIT
            | (DAMO_DIGIT ** 2)
            | (DAMO_DIGIT ** 3)
            | (DAMO_DIGIT ** 4)
            | (DAMO_DIGIT ** 5)
            | (DAMO_DIGIT ** 6)
        )
        numbers_up_to_million = pynini.compose(graph, digits_up_to_million).optimize()
        self.numbers_up_to_million = numbers_up_to_million

        # don't convert cardinals from zero to nine inclusive
        graph_exception = pynini.project(pynini.union(graph_digit, graph_zero), "input")

        self.graph = (pynini.project(graph, "input") - graph_exception.arcsort()) @ graph

        # Optional negative marker: "moins" -> negative: "-"
        optional_minus_graph = pynini.closure(
            pynutil.insert("negative: ") + pynini.cross("moins", '"-"') + DAMO_SPACE, 0, 1
        )

        final_graph = (
            optional_minus_graph + pynutil.insert('integer: "') + self.graph + pynutil.insert('"')
        )

        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/date.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
GraphFst
,
delete_extra_space
,
)
from
fun_text_processing.inverse_text_normalization.fr.utils
import
get_abs_path
from
pynini.lib
import
pynutil
class DateFst(GraphFst):
    """
    Finite state transducer for classifying date, in the form of (day) month (year) or year
        e.g. le vingt-quatre juillet deux-mille-treize -> date { day: "24" month: "juli" year: "2013" preserve_order: true }
        e.g. le premier janvier -> date { day: "1" month: "janvier" preserve_order: true }
    Also will convert colloquialism of spelling in which tens of hundreds are used to express date. (e.g. nineteen hundred and four)
        e.g. le vingt mais dix-neuf-cent-quatre -> date { day: "20" month: "mais" year: "1904" preserve_order: true }

    NOTE(review): the sample outputs "juli"/"mais" in the examples above look like typos
    for "juillet"/"mai" — verify against data/months.tsv.

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="date", kind="classify")

        # Years reuse the full cardinal graph (handles dix-neuf-cent-quatre style too).
        self.cardinal = cardinal.graph_no_exception
        year_graph = self.cardinal

        # Month names mapped via TSV lookup.
        month_graph = pynini.string_file(get_abs_path("data/months.tsv"))
        month_graph = pynutil.insert('month: "') + month_graph + pynutil.insert('"')

        # Premier is only ordinal used for dates
        day_graph = self.cardinal | pynini.cross("premier", "1")
        day_graph = pynutil.insert('day: "') + day_graph + pynutil.insert('"')
        optional_graph_year = pynini.closure(
            delete_extra_space + pynutil.insert('year: "') + year_graph + pynutil.insert('"'),
            0,
            1,
        )
        graph_dmy = day_graph + delete_extra_space + month_graph + optional_graph_year
        final_graph = graph_dmy
        final_graph += pynutil.insert(" preserve_order: true")
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/decimal.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
DAMO_DIGIT
,
GraphFst
,
delete_extra_space
,
delete_hyphen
,
delete_space
,
)
from
fun_text_processing.inverse_text_normalization.fr.utils
import
get_abs_path
from
pynini.lib
import
pynutil
def get_quantity(
    decimal: "pynini.FstLike", cardinal_up_to_thousand: "pynini.FstLike"
) -> "pynini.FstLike":
    """
    Returns FST that transforms either a cardinal or decimal followed by a quantity into a numeral,
    e.g. one million -> integer_part: "1" quantity: "million"
    e.g. one point five million -> integer_part: "1" fractional_part: "5" quantity: "million"
    Will tag cases up to denominations of tens of hundreds of thousand. 'douze cent mille millions' -> 1 200 000 millions

    Args:
        decimal: decimal FST
        cardinal_up_to_thousand: cardinal FST (integer part preceding the quantity word)
    """
    # Strip leading zeros so the integer part has no zero padding.
    numbers = cardinal_up_to_thousand @ (
        pynutil.delete(pynini.closure("0"))
        + pynini.difference(DAMO_DIGIT, "0")
        + pynini.closure(DAMO_DIGIT)
    )
    # Quantity words (singular and plural) kept verbatim in the output.
    suffix = pynini.union(
        "million",
        "millions",
        "milliard",
        "milliards",
        "billion",
        "billions",
        "billiard",
        "billiards",
        "trillion",
        "trillions",
        "trilliard",
        "trilliards",
    )
    res = (
        pynutil.insert('integer_part: "')
        + numbers
        + pynutil.insert('"')
        # Can be written either as 'deux-millions' or 'deux millions' depending on whether
        # it registers as a noun or part of cardinal.
        + (pynini.union(delete_hyphen, delete_extra_space))
        + pynutil.insert(' quantity: "')
        + suffix
        + pynutil.insert('"')
    )
    # Decimal followed by quantity, e.g. 'un virgule cinq millions'.
    res |= (
        decimal
        + delete_extra_space
        + pynutil.insert(' quantity: "')
        + suffix
        + pynutil.insert('"')
    )
    return res
class DecimalFst(GraphFst):
    """
    Finite state transducer for classifying decimal
        Decimal point is "," (virgule).
        e.g. moins un virgule deux six -> decimal { negative: "true" integer_part: "1" fractional_part: "26" }

    This decimal rule assumes that decimals can be pronounced as:
        (a cardinal) + ('virgule') plus (any sequence of cardinals <1 million, including 'zero')

    Also writes large numbers in shortened form, e.g.
        e.g. un virgule deux-six-million -> decimal { negative: "false" integer_part: "1" fractional_part: "26" quantity: "million" }
        e.g. deux-million -> decimal { negative: "false" integer_part: "2" quantity: "millions" }
        e.g. moins cent-vingt-quatre-millions -> decimal { negative: "true" integer_part: "124" quantity: "millions" }

    Args:
        cardinal: CardinalFst
    """

    def __init__(self, cardinal: GraphFst):
        super().__init__(name="decimal", kind="classify")

        # number after decimal point can be any series of cardinals <1 million, including 'zero'
        graph_decimal = cardinal.numbers_up_to_million
        graph_decimal = pynini.closure(graph_decimal + delete_space) + graph_decimal
        self.graph = graph_decimal

        # decimal point is denoted by 'virgule'
        graph_fractional_separator = pynutil.delete("virgule")

        # Possible negatives: "moins" -> negative: "true"
        optional_graph_negative = (
            pynutil.insert("negative: ") + pynini.cross("moins", '"true"') + delete_extra_space
        )
        optional_graph_negative = optional_graph_negative.ques

        # Fractional portion
        graph_fractional = (
            pynutil.insert('fractional_part: "') + graph_decimal + pynutil.insert('"')
        )

        # Integers (cardinal graph skips 0-9, so add 'zero' back explicitly)
        cardinal_graph = cardinal.graph_no_exception | pynini.string_file(
            get_abs_path("data/numbers/zero.tsv")
        )
        graph_integer = pynutil.insert('integer_part: "') + cardinal_graph + pynutil.insert('"')

        # Final graphs: optional integer part, mandatory 'virgule' + fractional part.
        final_graph_wo_sign = (
            pynini.closure(graph_integer + delete_extra_space, 0, 1)
            + graph_fractional_separator
            + delete_extra_space
            + graph_fractional
        )
        final_graph = optional_graph_negative + final_graph_wo_sign

        # Expose unsigned graph (with quantity support) for reuse by other grammars.
        self.final_graph_wo_negative = final_graph_wo_sign | get_quantity(
            final_graph_wo_sign, cardinal.graph_hundreds_component_at_least_one_none_zero_digit
        )
        final_graph |= optional_graph_negative + get_quantity(
            final_graph_wo_sign, cardinal.graph_hundreds_component_at_least_one_none_zero_digit
        )
        final_graph = self.add_tokens(final_graph)
        self.fst = final_graph.optimize()
FunASR/fun_text_processing/inverse_text_normalization/fr/taggers/electronic.py
0 → 100644
View file @
70a8a9e0
import
pynini
from
fun_text_processing.inverse_text_normalization.fr.graph_utils
import
(
DAMO_ALPHA
,
GraphFst
,
insert_space
,
)
from
fun_text_processing.inverse_text_normalization.fr.utils
import
get_abs_path
from
pynini.lib
import
pynutil
class ElectronicFst(GraphFst):
    """
    Finite state transducer for classifying 'electronic' semiotic classes, i.e.
    email address (which get converted to "username" and "domain" fields),
    and URLS (which get converted to a "protocol" field).
        e.g. c d f une arobase a b c point e d u -> tokens { electronic { username: "cdf1" domain: "abc.edu" } }
        e.g. double vé double vé double vé a b c point e d u -> tokens { electronic { protocol: "www.abc.edu" } }
    """

    def __init__(self):
        super().__init__(name="electronic", kind="classify")

        # Local helper: deletes exactly one space between spelled-out characters.
        # (Shadows nothing imported here; distinct from graph_utils.delete_extra_space.)
        delete_extra_space = pynutil.delete(" ")

        # A letter, or a spoken digit (including zero) mapped to its numeral.
        alpha_num = (
            DAMO_ALPHA
            | pynini.string_file(get_abs_path("data/numbers/digit.tsv"))
            | pynini.string_file(get_abs_path("data/numbers/zero.tsv"))
        )

        symbols = pynini.string_file(get_abs_path("data/electronic/symbols.tsv"))

        # Spoken forms of '@' (all deleted; the separator space becomes the '@' join).
        ampersand = pynini.string_map([("arobase"), ("chez"), ("at"), ("à")])

        accepted_username = alpha_num | symbols
        process_dot = pynini.cross("point", ".")

        # Username: at least two alphanumeric characters, symbols allowed in between.
        username = (
            pynutil.insert('username: "')
            + alpha_num
            + delete_extra_space
            + pynini.closure(accepted_username + delete_extra_space)
            + alpha_num
            + pynutil.insert('"')
        )
        single_alphanum = pynini.closure(alpha_num + delete_extra_space) + alpha_num
        server = single_alphanum | pynini.string_file(
            get_abs_path("data/electronic/server_name.tsv")
        )
        domain = single_alphanum | pynini.string_file(get_abs_path("data/electronic/domain.tsv"))

        # Domain: <server> point <domain>, e.g. 'a b c point e d u' -> abc.edu
        domain_graph = (
            pynutil.insert('domain: "')
            + server
            + delete_extra_space
            + process_dot
            + delete_extra_space
            + domain
            + pynutil.insert('"')
        )

        # Email: username + spoken '@' + domain.
        graph = (
            username
            + delete_extra_space
            + pynutil.delete(ampersand)
            + insert_space
            + delete_extra_space
            + domain_graph
        )

        ############# url ###
        protocol_end = pynini.cross(
            pynini.union("www", "w w w", "double vé double vé double vé"), "www"
        )
        protocol_start = pynini.cross(pynini.union("http", "h t t p", "ache té té pé"), "http")
        protocol_start |= pynini.cross(
            pynini.union("https", "h t t p s", "ache té té pé esse"), "https"
        )
        # Spoken forms of '://'.
        protocol_start += pynini.cross(
            pynini.union(
                " deux-points barre oblique barre oblique ",
                " deux-points barre barre ",
                " deux-points double barre ",
                " deux-points slash slash ",
            ),
            "://",
        )
        # e.g. .com, .es
        ending = (
            delete_extra_space
            + symbols
            + delete_extra_space
            + (
                domain
                | pynini.closure(accepted_username + delete_extra_space) + accepted_username
            )
        )

        # Full URL: optional http(s)://, then www, dot, host, one or more endings.
        protocol = (
            pynini.closure(protocol_start, 0, 1)
            + protocol_end
            + delete_extra_space
            + process_dot
            + delete_extra_space
            + (pynini.closure(delete_extra_space + accepted_username, 1) | server)
            + pynini.closure(ending, 1)
        )
        protocol = pynutil.insert('protocol: "') + protocol + pynutil.insert('"')
        graph |= protocol
        ########

        final_graph = self.add_tokens(graph)
        self.fst = final_graph.optimize()
Prev
1
…
19
20
21
22
23
24
25
26
27
…
42
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment