Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
hehl2
Torchaudio
Commits
37dbf29f
"git@developer.sourcefind.cn:OpenDAS/vision.git" did not exist on "082f37ee54e47c373081196b964a77ca7c8726ac"
Unverified
Commit
37dbf29f
authored
Jul 26, 2021
by
yangarbiter
Committed by
GitHub
Jul 26, 2021
Browse files
Add text preprocessing utilities for TTS pipeline (#1639)
parent
c49db739
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
202 additions
and
0 deletions
+202
-0
examples/pipeline_tacotron2/text/__init__.py
examples/pipeline_tacotron2/text/__init__.py
+0
-0
examples/pipeline_tacotron2/text/numbers.py
examples/pipeline_tacotron2/text/numbers.py
+95
-0
examples/pipeline_tacotron2/text/test_text.py
examples/pipeline_tacotron2/text/test_text.py
+22
-0
examples/pipeline_tacotron2/text/text_preprocessing.py
examples/pipeline_tacotron2/text/text_preprocessing.py
+85
-0
No files found.
examples/pipeline_tacotron2/text/__init__.py
0 → 100644
View file @
37dbf29f
examples/pipeline_tacotron2/text/numbers.py
0 → 100644
View file @
37dbf29f
# *****************************************************************************
# Copyright (c) 2017 Keith Ito
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# *****************************************************************************
"""
Modified from https://github.com/keithito/tacotron
"""
import
inflect
import
re
# Shared inflect engine; the helpers below call its number_to_words().
_inflect = inflect.engine()
# Digit groups with embedded commas, e.g. "1,000"; must start and end with a digit.
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
# Decimal numbers with digits on both sides of the point, e.g. "3.14".
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
# Pound amounts: "£" followed by digits/commas ending in a digit; group 1 is the amount.
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
# Dollar amounts: "$" followed by digits, dots and commas ending in a digit; group 1 is the amount.
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
# Ordinals written with digits, e.g. "1st", "22nd", "3rd", "4th".
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
# Any run of digits.
_number_re = re.compile(r'[0-9]+')
def
_remove_commas
(
m
:
re
.
Match
)
->
str
:
return
m
.
group
(
1
).
replace
(
','
,
''
)
def
_expand_decimal_point
(
m
:
re
.
Match
)
->
str
:
return
m
.
group
(
1
).
replace
(
'.'
,
' point '
)
def
_expand_dollars
(
m
:
re
.
Match
)
->
str
:
match
=
m
.
group
(
1
)
parts
=
match
.
split
(
'.'
)
if
len
(
parts
)
>
2
:
return
match
+
' dollars'
# Unexpected format
dollars
=
int
(
parts
[
0
])
if
parts
[
0
]
else
0
cents
=
int
(
parts
[
1
])
if
len
(
parts
)
>
1
and
parts
[
1
]
else
0
if
dollars
and
cents
:
dollar_unit
=
'dollar'
if
dollars
==
1
else
'dollars'
cent_unit
=
'cent'
if
cents
==
1
else
'cents'
return
'%s %s, %s %s'
%
(
dollars
,
dollar_unit
,
cents
,
cent_unit
)
elif
dollars
:
dollar_unit
=
'dollar'
if
dollars
==
1
else
'dollars'
return
'%s %s'
%
(
dollars
,
dollar_unit
)
elif
cents
:
cent_unit
=
'cent'
if
cents
==
1
else
'cents'
return
'%s %s'
%
(
cents
,
cent_unit
)
else
:
return
'zero dollars'
def _expand_ordinal(m: re.Match) -> str:
    """Convert a matched digit ordinal (e.g. "1st") into words.

    Args:
        m (re.Match): Match whose full text is the ordinal to convert.

    Returns:
        str: Whatever ``inflect``'s ``number_to_words`` returns for the
        matched text (presumably "first" for "1st" -- confirm against inflect).
    """
    ordinal_text = m.group(0)
    return _inflect.number_to_words(ordinal_text)
def _expand_number(m: re.Match) -> str:
    """Convert a matched run of digits into words.

    Values strictly between 1000 and 3000 get special treatment (they look
    like this is meant for reading years -- confirm); everything else is
    passed to ``inflect`` with the "and" word suppressed.

    Args:
        m (re.Match): Match whose full text is a run of digits.

    Returns:
        str: The number written out in words.
    """
    value = int(m.group(0))

    # Outside the special range: plain conversion without "and".
    if not 1000 < value < 3000:
        return _inflect.number_to_words(value, andword='')

    if value == 2000:
        return 'two thousand'
    if 2000 < value < 2010:
        # 2001..2009: "two thousand" followed by the last two digits in words.
        return 'two thousand ' + _inflect.number_to_words(value % 100)
    if value % 100 == 0:
        # Exact hundreds, e.g. 1900 -> "<nineteen> hundred".
        return _inflect.number_to_words(value // 100) + ' hundred'

    # Remaining values: inflect is asked to group digits in twos, with zero
    # spoken as "oh"; the ", " it puts between groups becomes a single space.
    grouped = _inflect.number_to_words(value, andword='', zero='oh', group=2)
    return grouped.replace(', ', ' ')
def normalize_numbers(text: str) -> str:
    """Expand the numeric expressions in ``text`` by a fixed series of regex passes.

    The passes run in this order, each on the output of the previous one:
    remove commas inside numbers, rewrite pound amounts, rewrite dollar
    amounts, spell out decimal points, expand ordinals, expand remaining
    digit runs.

    Args:
        text (str): Input text.

    Returns:
        str: The text after all substitutions.
    """
    # Order matters: e.g. commas must be gone before dollar amounts are parsed,
    # and plain digit runs are handled last.
    passes = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in passes:
        text = pattern.sub(replacement, text)
    return text
examples/pipeline_tacotron2/text/test_text.py
0 → 100644
View file @
37dbf29f
import
unittest
from
parameterized
import
parameterized
from
.text_preprocessing
import
text_to_sequence
class TestTextPreprocessor(unittest.TestCase):
    """Checks that ``text_to_sequence`` maps sentences to the expected symbol IDs."""

    @parameterized.expand(
        [
            ["dr. Strange?", [15, 26, 14, 31, 26, 29, 11, 30, 31, 29, 12, 25, 18, 16, 10]],
            ["ML, is fun.", [24, 23, 6, 11, 20, 30, 11, 17, 32, 25, 7]],
            ["I love torchaudio!", [20, 11, 23, 26, 33, 16, 11, 31, 26, 29, 14, 19, 12, 32, 15, 20, 26, 2]],
            # 'one thousand dollars, twenty cents'
            ["$1,000.20", [26, 25, 16, 11, 31, 19, 26, 32, 30, 12, 25, 15, 11, 15, 26, 23, 23, 12, 29, 30,
                           6, 11, 31, 34, 16, 25, 31, 36, 11, 14, 16, 25, 31, 30]],
        ]
    )
    def test_text_to_sequence(self, sent, seq):
        # assertEqual (rather than a bare assert) still runs under `python -O`
        # and reports both lists when they differ.
        self.assertEqual(text_to_sequence(sent), seq)
examples/pipeline_tacotron2/text/text_preprocessing.py
0 → 100644
View file @
37dbf29f
# *****************************************************************************
# Copyright (c) 2017 Keith Ito
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# *****************************************************************************
"""
Modified from https://github.com/keithito/tacotron
"""
from
typing
import
List
import
re
from
unidecode
import
unidecode
from
.numbers
import
normalize_numbers
# Regular expression matching a run of one or more whitespace characters;
# text_to_sequence replaces each such run with a single space.
_whitespace_re = re.compile(r'\s+')
# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations
=
[(
re
.
compile
(
'
\\
b%s
\\
.'
%
x
[
0
],
re
.
IGNORECASE
),
x
[
1
])
for
x
in
[
(
'mrs'
,
'misess'
),
(
'mr'
,
'mister'
),
(
'dr'
,
'doctor'
),
(
'st'
,
'saint'
),
(
'co'
,
'company'
),
(
'jr'
,
'junior'
),
(
'maj'
,
'major'
),
(
'gen'
,
'general'
),
(
'drs'
,
'doctors'
),
(
'rev'
,
'reverend'
),
(
'lt'
,
'lieutenant'
),
(
'hon'
,
'honorable'
),
(
'sgt'
,
'sergeant'
),
(
'capt'
,
'captain'
),
(
'esq'
,
'esquire'
),
(
'ltd'
,
'limited'
),
(
'col'
,
'colonel'
),
(
'ft'
,
'fort'
),
]]
_pad
=
'_'
_punctuation
=
'!
\'
(),.:;? '
_special
=
'-'
_letters
=
'abcdefghijklmnopqrstuvwxyz'
symbols
=
[
_pad
]
+
list
(
_special
)
+
list
(
_punctuation
)
+
list
(
_letters
)
_symbol_to_id
=
{
s
:
i
for
i
,
s
in
enumerate
(
symbols
)}
def text_to_sequence(sent: str) -> List[int]:
    r'''Turn a sentence into the list of symbol IDs for its normalized text.

    The sentence is transliterated to ASCII, lower-cased, has its numbers and
    abbreviations expanded and its whitespace collapsed; each remaining
    character is then looked up in the symbol table. Characters that are not
    in the table are dropped without error.

    Args:
        sent (str): The input sentence to convert to a sequence.

    Returns:
        List of integers corresponding to the symbols in the sentence.
    '''
    # ASCII transliteration and lower-casing happen before number expansion.
    normalized = normalize_numbers(unidecode(sent).lower())

    # Expand abbreviations one pattern at a time, in table order.
    for pattern, expansion in _abbreviations:
        normalized = pattern.sub(expansion, normalized)

    # Collapse every whitespace run into a single space.
    normalized = _whitespace_re.sub(' ', normalized)

    # IDs are ints, so None from .get() can only mean "symbol not in the table".
    lookup = _symbol_to_id.get
    return [idx for idx in map(lookup, normalized) if idx is not None]
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment