Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Torchaudio
Commits
37dbf29f
Unverified
Commit
37dbf29f
authored
Jul 26, 2021
by
yangarbiter
Committed by
GitHub
Jul 26, 2021
Browse files
Add text preprocessing utilities for TTS pipeline (#1639)
parent
c49db739
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
202 additions
and
0 deletions
+202
-0
examples/pipeline_tacotron2/text/__init__.py
examples/pipeline_tacotron2/text/__init__.py
+0
-0
examples/pipeline_tacotron2/text/numbers.py
examples/pipeline_tacotron2/text/numbers.py
+95
-0
examples/pipeline_tacotron2/text/test_text.py
examples/pipeline_tacotron2/text/test_text.py
+22
-0
examples/pipeline_tacotron2/text/text_preprocessing.py
examples/pipeline_tacotron2/text/text_preprocessing.py
+85
-0
No files found.
examples/pipeline_tacotron2/text/__init__.py
0 → 100644
View file @
37dbf29f
examples/pipeline_tacotron2/text/numbers.py
0 → 100644
View file @
37dbf29f
# *****************************************************************************
# Copyright (c) 2017 Keith Ito
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# *****************************************************************************
"""
Modified from https://github.com/keithito/tacotron
"""
import
inflect
import
re
# Shared inflect engine; its number_to_words() is used below to spell out
# ordinals and cardinal numbers.
_inflect = inflect.engine()

# A digit, then one or more digits/commas, then a digit (e.g. "1,000").
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
# Digits on both sides of a decimal point (e.g. "3.14").
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
# A pound sign followed by digits/commas ending in a digit; amount in group 1.
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
# A dollar sign followed by digits, dots and commas ending in a digit;
# amount in group 1.
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
# Digits immediately followed by an ordinal suffix (e.g. "2nd").
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
# Any run of one or more digits.
_number_re = re.compile(r'[0-9]+')
def _remove_commas(m: re.Match) -> str:
    """Return the text captured by group 1 of ``m`` with every comma deleted.

    Args:
        m (re.Match): Match whose first group holds a comma-separated number.

    Returns:
        str: The captured text without commas.
    """
    digits_with_commas = m.group(1)
    # Splitting on ',' and re-joining drops every comma in one pass.
    return ''.join(digits_with_commas.split(','))
def _expand_decimal_point(m: re.Match) -> str:
    """Return group 1 of ``m`` with each ``.`` spelled out as `` point ``.

    Args:
        m (re.Match): Match whose first group holds a decimal number.

    Returns:
        str: The captured text with every dot replaced by ``' point '``.
    """
    number_text = m.group(1)
    return ' point '.join(number_text.split('.'))
def _expand_dollars(m: re.Match) -> str:
    """Rewrite a matched dollar amount as dollar/cent phrases.

    Args:
        m (re.Match): Match whose group 1 holds the amount without the
            leading ``$``: digits, optionally containing ``.`` and ``,``.

    Returns:
        str: ``"<n> dollar(s), <k> cent(s)"``, ``"<n> dollar(s)"``,
        ``"<k> cent(s)"``, or ``"zero dollars"`` when both parts are zero.
        Numbers stay as digits. If the amount contains more than one ``.``
        the captured text is returned unchanged followed by ``" dollars"``.
    """
    match = m.group(1)
    # The dollars pattern admits thousands separators, but int() rejects them
    # (e.g. "$,5" or a direct call with "1,000.20"), so drop commas first.
    parts = match.replace(',', '').split('.')
    if len(parts) > 2:
        return match + ' dollars'  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    dollar_unit = 'dollar' if dollars == 1 else 'dollars'
    cent_unit = 'cent' if cents == 1 else 'cents'
    if dollars and cents:
        return f'{dollars} {dollar_unit}, {cents} {cent_unit}'
    if dollars:
        return f'{dollars} {dollar_unit}'
    if cents:
        return f'{cents} {cent_unit}'
    return 'zero dollars'
def _expand_ordinal(m: re.Match) -> str:
    """Spell out a matched ordinal using the shared inflect engine.

    Args:
        m (re.Match): Match whose full text (group 0) is digits followed by
            an ordinal suffix.

    Returns:
        str: Whatever ``number_to_words`` yields for the matched text.
    """
    ordinal_text = m.group(0)
    return _inflect.number_to_words(ordinal_text)
def _expand_number(m: re.Match) -> str:
    """Spell out a matched run of digits.

    Values strictly between 1000 and 3000 get special handling: 2000 is
    "two thousand", 2001-2009 are "two thousand <n>", exact hundreds are
    "<n> hundred", and the rest are read in two-digit groups with "oh" for
    zero. Every other value is passed to ``number_to_words`` with no "and".

    Args:
        m (re.Match): Match whose full text (group 0) is a run of digits.

    Returns:
        str: The number written out in words.
    """
    value = int(m.group(0))
    to_words = _inflect.number_to_words

    # Outside the special range: plain cardinal reading.
    if not 1000 < value < 3000:
        return to_words(value, andword='')

    if value == 2000:
        return 'two thousand'
    if 2000 < value < 2010:
        return 'two thousand ' + to_words(value % 100)

    hundreds, remainder = divmod(value, 100)
    if remainder == 0:
        return to_words(hundreds) + ' hundred'

    # Read as two-digit groups, then drop the separator between groups.
    grouped = to_words(value, andword='', zero='oh', group=2)
    return grouped.replace(', ', ' ')
def normalize_numbers(text: str) -> str:
    """Rewrite numeric expressions in ``text`` as words.

    The substitutions run in a fixed order: strip commas from numbers,
    rewrite pound amounts, expand dollar amounts, spell out decimal points,
    expand ordinals, and finally expand any remaining digit runs.

    Args:
        text (str): Input text.

    Returns:
        str: The text after all substitutions.
    """
    # Order matters: later patterns operate on the output of earlier ones.
    substitutions = (
        (_comma_number_re, _remove_commas),
        (_pounds_re, r'\1 pounds'),
        (_dollars_re, _expand_dollars),
        (_decimal_number_re, _expand_decimal_point),
        (_ordinal_re, _expand_ordinal),
        (_number_re, _expand_number),
    )
    for pattern, replacement in substitutions:
        text = pattern.sub(replacement, text)
    return text
examples/pipeline_tacotron2/text/test_text.py
0 → 100644
View file @
37dbf29f
import
unittest
from
parameterized
import
parameterized
from
.text_preprocessing
import
text_to_sequence
class TestTextPreprocessor(unittest.TestCase):
    """Checks ``text_to_sequence`` against known symbol-ID sequences."""

    @parameterized.expand(
        [
            ["dr. Strange?", [15, 26, 14, 31, 26, 29, 11, 30, 31, 29, 12, 25, 18, 16, 10]],
            ["ML, is fun.", [24, 23, 6, 11, 20, 30, 11, 17, 32, 25, 7]],
            ["I love torchaudio!", [20, 11, 23, 26, 33, 16, 11, 31, 26, 29, 14, 19, 12, 32, 15, 20, 26, 2]],
            # 'one thousand dollars, twenty cents'
            ["$1,000.20", [26, 25, 16, 11, 31, 19, 26, 32, 30, 12, 25, 15, 11, 15, 26, 23, 23, 12, 29, 30,
                           6, 11, 31, 34, 16, 25, 31, 36, 11, 14, 16, 25, 31, 30]],
        ]
    )
    def test_text_to_sequence(self, sent, seq):
        """The sequence produced for ``sent`` must equal the expected ``seq``."""
        # assertEqual still runs under ``python -O`` (bare asserts are
        # stripped) and reports a diff of the two lists on failure.
        self.assertEqual(text_to_sequence(sent), seq)
examples/pipeline_tacotron2/text/text_preprocessing.py
0 → 100644
View file @
37dbf29f
# *****************************************************************************
# Copyright (c) 2017 Keith Ito
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# *****************************************************************************
"""
Modified from https://github.com/keithito/tacotron
"""
from
typing
import
List
import
re
from
unidecode
import
unidecode
from
.numbers
import
normalize_numbers
# Pattern matching one or more consecutive whitespace characters.
_whitespace_re = re.compile(r'\s+')

# (compiled pattern, expansion) pairs. Each pattern matches the abbreviation
# at a word boundary followed by a literal period, ignoring case.
_abbreviations = [
    (re.compile('\\b%s\\.' % abbreviation, re.IGNORECASE), expansion)
    for abbreviation, expansion in [
        ('mrs', 'misess'),
        ('mr', 'mister'),
        ('dr', 'doctor'),
        ('st', 'saint'),
        ('co', 'company'),
        ('jr', 'junior'),
        ('maj', 'major'),
        ('gen', 'general'),
        ('drs', 'doctors'),
        ('rev', 'reverend'),
        ('lt', 'lieutenant'),
        ('hon', 'honorable'),
        ('sgt', 'sergeant'),
        ('capt', 'captain'),
        ('esq', 'esquire'),
        ('ltd', 'limited'),
        ('col', 'colonel'),
        ('ft', 'fort'),
    ]
]
# Character classes that make up the symbol vocabulary.
_pad = '_'
_punctuation = '!\'(),.:;? '
_special = '-'
_letters = 'abcdefghijklmnopqrstuvwxyz'

# Vocabulary order: pad, special, punctuation, then letters.
symbols = [_pad, *_special, *_punctuation, *_letters]
# Maps each symbol to its position in ``symbols``.
_symbol_to_id = dict(zip(symbols, range(len(symbols))))
def text_to_sequence(sent: str) -> List[int]:
    r'''Map a sentence to the list of symbol IDs for its characters.

    The sentence is transliterated with ``unidecode``, lower-cased, passed
    through ``normalize_numbers``, has its abbreviations expanded, and has
    whitespace runs collapsed to a single space. Characters that are not in
    the symbol table are skipped.

    Args:
        sent (str): The input sentence to convert to a sequence.

    Returns:
        List of integers corresponding to the symbols in the sentence.
    '''
    # ASCII transliteration -> lower case -> number expansion.
    cleaned = normalize_numbers(unidecode(sent).lower())

    # Expand abbreviations in table order.
    for pattern, expansion in _abbreviations:
        cleaned = pattern.sub(expansion, cleaned)

    # Collapse each run of whitespace into one space.
    cleaned = _whitespace_re.sub(' ', cleaned)

    ids = []
    for ch in cleaned:
        symbol_id = _symbol_to_id.get(ch)
        if symbol_id is not None:  # unknown characters are dropped
            ids.append(symbol_id)
    return ids
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment