Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Torchaudio
Commits
aa56d30c
Unverified
Commit
aa56d30c
authored
Dec 27, 2020
by
Aziz
Committed by
GitHub
Dec 27, 2020
Browse files
Fix CommonVoice for French (#1126)
Resolves #1125 where dataset metadata does not contain an extension.
parent
9c484027
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
133 additions
and
42 deletions
+133
-42
test/torchaudio_unittest/datasets/commonvoice_test.py
test/torchaudio_unittest/datasets/commonvoice_test.py
+112
-32
test/torchaudio_unittest/datasets/utils_test.py
test/torchaudio_unittest/datasets/utils_test.py
+13
-4
torchaudio/datasets/commonvoice.py
torchaudio/datasets/commonvoice.py
+8
-6
No files found.
test/torchaudio_unittest/datasets/commonvoice_test.py
View file @
aa56d30c
import
os
import
csv
import
csv
import
os
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
Tuple
,
Dict
from
torch
audio.datasets
import
COMMONVOICE
from
torch
import
Tensor
from
torchaudio_unittest.common_utils
import
(
from
torchaudio_unittest.common_utils
import
(
TempDirMixin
,
TempDirMixin
,
TorchaudioTestCase
,
TorchaudioTestCase
,
...
@@ -11,47 +12,96 @@ from torchaudio_unittest.common_utils import (
...
@@ -11,47 +12,96 @@ from torchaudio_unittest.common_utils import (
normalize_wav
,
normalize_wav
,
)
)
from
torchaudio.datasets
import
COMMONVOICE
class
TestCommonVoice
(
TempDirMixin
,
TorchaudioTestCase
):
_ORIGINAL_EXT_AUDIO
=
COMMONVOICE
.
_ext_audio
backend
=
'default'
_SAMPLE_RATE
=
48000
_HEADERS
=
[
u
"client_ids"
,
u
"path"
,
u
"sentence"
,
u
"up_votes"
,
u
"down_votes"
,
u
"age"
,
u
"gender"
,
u
"accent"
]
root_dir
=
None
data
=
[]
def
get_mock_dataset_en
(
root_dir
)
->
Tuple
[
Tensor
,
int
,
Dict
[
str
,
str
]]:
_headers
=
[
u
"client_ids"
,
u
"path"
,
u
"sentence"
,
u
"up_votes"
,
u
"down_votes"
,
u
"age"
,
u
"gender"
,
u
"accent"
]
mocked_data
=
[
]
# Note: extension is changed to wav for the sake of test
# Note: extension is changed to wav for the sake of test
# Note: the first content is missing values for `age`, `gender` and `accent` as in the original data.
# Note: the first content is missing values for `age`, `gender` and `accent` as in the original data.
_train_csv_contents
=
[
_en
_train_csv_contents
=
[
[
"9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c"
,
[
"9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c"
,
"common_voice_en_18885784.wav"
,
"common_voice_en_18885784.wav"
,
"He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery."
,
"2"
,
"0"
,
""
,
""
,
""
],
"He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery."
,
"2"
,
"0"
,
""
,
""
,
""
],
[
"c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20"
,
[
"c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20"
,
"common_voice_en_556542.wav"
,
"Once more into the breach"
,
"2"
,
"0"
,
"thirties"
,
"male"
,
"us"
],
"common_voice_en_556542.wav"
,
"Once more into the breach"
,
"2"
,
"0"
,
"thirties"
,
"male"
,
"us"
],
[
"f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c"
,
[
"f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c"
,
"common_voice_en_18607573.wav"
,
"common_voice_en_18607573.wav"
,
"Caddy, show Miss Clare and Miss Summerson their rooms."
,
"2"
,
"0"
,
"twenties"
,
"male"
,
"canada"
],
"Caddy, show Miss Clare and Miss Summerson their rooms."
,
"2"
,
"0"
,
"twenties"
,
"male"
,
"canada"
],
]
# Tsv file name difference does not mean different subset, testing as a whole dataset here
tsv_filename
=
os
.
path
.
join
(
root_dir
,
"train.tsv"
)
audio_base_path
=
os
.
path
.
join
(
root_dir
,
"clips"
)
os
.
makedirs
(
audio_base_path
,
exist_ok
=
True
)
with
open
(
tsv_filename
,
"w"
,
newline
=
''
)
as
tsv
:
writer
=
csv
.
writer
(
tsv
,
delimiter
=
'
\t
'
)
writer
.
writerow
(
_HEADERS
)
for
i
,
content
in
enumerate
(
_en_train_csv_contents
):
writer
.
writerow
(
content
)
# Generate and store audio
audio_path
=
os
.
path
.
join
(
audio_base_path
,
content
[
1
])
data
=
get_whitenoise
(
sample_rate
=
_SAMPLE_RATE
,
duration
=
1
,
n_channels
=
1
,
seed
=
i
,
dtype
=
'float32'
)
save_wav
(
audio_path
,
data
,
_SAMPLE_RATE
)
# Append data entry
mocked_data
.
append
((
normalize_wav
(
data
),
_SAMPLE_RATE
,
dict
(
zip
(
_HEADERS
,
content
))))
return
mocked_data
def
get_mock_dataset_fr
(
root_dir
)
->
Tuple
[
Tensor
,
int
,
Dict
[
str
,
str
]]:
mocked_data
=
[]
_fr_train_csv_contents
=
[
[
"a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef"
"18343441c601cae0597a4b0d3144"
,
"89e67e7682b36786a0b4b4022c4d42090c86edd96c78c12d30088e62522b8fe466ea4912e6a1055dfb91b296a0743e0a2bbe"
"16cebac98ee5349e3e8262cb9329"
,
"Or sur ce point nous n’avons aucune réponse de votre part."
,
"2"
,
"0"
,
"twenties"
,
"male"
,
"france"
],
[
"a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef18"
"343441c601cae0597a4b0d3144"
,
"87d71819a26179e93acfee149d0b21b7bf5e926e367d80b2b3792d45f46e04853a514945783ff764c1fc237b4eb0ee2b0a7a7"
"cbd395acbdfcfa9d76a6e199bbd"
,
"Monsieur de La Verpillière, laissez parler le ministre"
,
"2"
,
"0"
,
"twenties"
,
"male"
,
"france"
],
]
]
sample_rate
=
48000
# Tsv file name difference does not mean different subset, testing as a whole dataset here
tsv_filename
=
os
.
path
.
join
(
root_dir
,
"train.tsv"
)
audio_base_path
=
os
.
path
.
join
(
root_dir
,
"clips"
)
os
.
makedirs
(
audio_base_path
,
exist_ok
=
True
)
with
open
(
tsv_filename
,
"w"
,
newline
=
''
)
as
tsv
:
writer
=
csv
.
writer
(
tsv
,
delimiter
=
'
\t
'
)
writer
.
writerow
(
_HEADERS
)
for
i
,
content
in
enumerate
(
_fr_train_csv_contents
):
content
[
2
]
=
str
(
content
[
2
].
encode
(
"utf-8"
))
writer
.
writerow
(
content
)
# Generate and store audio
audio_path
=
os
.
path
.
join
(
audio_base_path
,
content
[
1
]
+
_ORIGINAL_EXT_AUDIO
)
data
=
get_whitenoise
(
sample_rate
=
_SAMPLE_RATE
,
duration
=
1
,
n_channels
=
1
,
seed
=
i
,
dtype
=
'float32'
)
save_wav
(
audio_path
,
data
,
_SAMPLE_RATE
)
# Append data entry
mocked_data
.
append
((
normalize_wav
(
data
),
_SAMPLE_RATE
,
dict
(
zip
(
_HEADERS
,
content
))))
return
mocked_data
class
TestCommonVoiceEN
(
TempDirMixin
,
TorchaudioTestCase
):
backend
=
'default'
root_dir
=
None
@
classmethod
@
classmethod
def
setUpClass
(
cls
):
def
setUpClass
(
cls
):
cls
.
root_dir
=
cls
.
get_base_temp_dir
()
cls
.
root_dir
=
cls
.
get_base_temp_dir
()
# Tsv file name difference does not mean different subset, testing as a whole dataset here
cls
.
data
=
get_mock_dataset_en
(
cls
.
root_dir
)
tsv_filename
=
os
.
path
.
join
(
cls
.
root_dir
,
"train.tsv"
)
COMMONVOICE
.
_ext_audio
=
".wav"
audio_base_path
=
os
.
path
.
join
(
cls
.
root_dir
,
"clips"
)
os
.
makedirs
(
audio_base_path
,
exist_ok
=
True
)
@
classmethod
with
open
(
tsv_filename
,
"w"
,
newline
=
''
)
as
tsv
:
def
tearDownClass
(
cls
):
writer
=
csv
.
writer
(
tsv
,
delimiter
=
'
\t
'
)
COMMONVOICE
.
_ext_audio
=
_ORIGINAL_EXT_AUDIO
writer
.
writerow
(
cls
.
_headers
)
for
i
,
content
in
enumerate
(
cls
.
_train_csv_contents
):
writer
.
writerow
(
content
)
# Generate and store audio
audio_path
=
os
.
path
.
join
(
audio_base_path
,
content
[
1
])
data
=
get_whitenoise
(
sample_rate
=
cls
.
sample_rate
,
duration
=
1
,
n_channels
=
1
,
seed
=
i
,
dtype
=
'float32'
)
save_wav
(
audio_path
,
data
,
cls
.
sample_rate
)
# Append data entry
cls
.
data
.
append
((
normalize_wav
(
data
),
cls
.
sample_rate
,
dict
(
zip
(
cls
.
_headers
,
content
))))
def
_test_commonvoice
(
self
,
dataset
):
def
_test_commonvoice
(
self
,
dataset
):
n_ite
=
0
n_ite
=
0
...
@@ -59,7 +109,7 @@ class TestCommonVoice(TempDirMixin, TorchaudioTestCase):
...
@@ -59,7 +109,7 @@ class TestCommonVoice(TempDirMixin, TorchaudioTestCase):
expected_dictionary
=
self
.
data
[
i
][
2
]
expected_dictionary
=
self
.
data
[
i
][
2
]
expected_data
=
self
.
data
[
i
][
0
]
expected_data
=
self
.
data
[
i
][
0
]
self
.
assertEqual
(
expected_data
,
waveform
,
atol
=
5e-5
,
rtol
=
1e-8
)
self
.
assertEqual
(
expected_data
,
waveform
,
atol
=
5e-5
,
rtol
=
1e-8
)
assert
sample_rate
==
TestCommonVoice
.
sample_rate
assert
sample_rate
==
_SAMPLE_RATE
assert
dictionary
==
expected_dictionary
assert
dictionary
==
expected_dictionary
n_ite
+=
1
n_ite
+=
1
assert
n_ite
==
len
(
self
.
data
)
assert
n_ite
==
len
(
self
.
data
)
...
@@ -71,3 +121,33 @@ class TestCommonVoice(TempDirMixin, TorchaudioTestCase):
...
@@ -71,3 +121,33 @@ class TestCommonVoice(TempDirMixin, TorchaudioTestCase):
def
test_commonvoice_path
(
self
):
def
test_commonvoice_path
(
self
):
dataset
=
COMMONVOICE
(
Path
(
self
.
root_dir
))
dataset
=
COMMONVOICE
(
Path
(
self
.
root_dir
))
self
.
_test_commonvoice
(
dataset
)
self
.
_test_commonvoice
(
dataset
)
class
TestCommonVoiceFR
(
TempDirMixin
,
TorchaudioTestCase
):
backend
=
'default'
root_dir
=
None
@
classmethod
def
setUpClass
(
cls
):
cls
.
root_dir
=
cls
.
get_base_temp_dir
()
cls
.
data
=
get_mock_dataset_fr
(
cls
.
root_dir
)
COMMONVOICE
.
_ext_audio
=
".mp3"
@
classmethod
def
tearDownClass
(
cls
):
COMMONVOICE
.
_ext_audio
=
_ORIGINAL_EXT_AUDIO
def
_test_commonvoice
(
self
,
dataset
):
n_ite
=
0
for
i
,
(
waveform
,
sample_rate
,
dictionary
)
in
enumerate
(
dataset
):
expected_dictionary
=
self
.
data
[
i
][
2
]
expected_data
=
self
.
data
[
i
][
0
]
self
.
assertEqual
(
expected_data
,
waveform
,
atol
=
5e-5
,
rtol
=
1e-8
)
assert
sample_rate
==
_SAMPLE_RATE
assert
dictionary
==
expected_dictionary
n_ite
+=
1
assert
n_ite
==
len
(
self
.
data
)
def
test_commonvoice_str
(
self
):
dataset
=
COMMONVOICE
(
self
.
root_dir
)
self
.
_test_commonvoice
(
dataset
)
test/torchaudio_unittest/datasets/utils_test.py
View file @
aa56d30c
from
torchaudio.datasets
import
utils
as
dataset_utils
from
torchaudio.datasets.commonvoice
import
COMMONVOICE
from
torchaudio_unittest.common_utils
import
(
from
torchaudio_unittest.common_utils
import
(
TempDirMixin
,
TorchaudioTestCase
,
TorchaudioTestCase
,
get_asset_path
,
get_asset_path
,
)
)
from
torchaudio.datasets
import
utils
as
dataset_utils
from
torchaudio.datasets.commonvoice
import
COMMONVOICE
original_ext_audio
=
COMMONVOICE
.
_ext_audio
class
TestIterator
(
TorchaudioTestCase
):
class
TestIterator
(
TorchaudioTestCase
):
@
classmethod
def
setUpClass
(
cls
):
COMMONVOICE
.
_ext_audio
=
".wav"
@
classmethod
def
tearDownClass
(
cls
):
COMMONVOICE
.
_ext_audio
=
original_ext_audio
backend
=
'default'
backend
=
'default'
path
=
get_asset_path
(
'CommonVoice'
,
'cv-corpus-4-2019-12-10'
,
'tt'
)
path
=
get_asset_path
(
'CommonVoice'
,
'cv-corpus-4-2019-12-10'
,
'tt'
)
...
...
torchaudio/datasets/commonvoice.py
View file @
aa56d30c
import
os
import
csv
import
csv
import
os
import
warnings
import
warnings
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
List
,
Dict
,
Tuple
,
Union
,
Optional
from
typing
import
List
,
Dict
,
Tuple
,
Union
,
Optional
import
torchaudio
from
torch
import
Tensor
from
torch
import
Tensor
from
torch.utils.data
import
Dataset
from
torch.utils.data
import
Dataset
import
torchaudio
def
load_commonvoice_item
(
line
:
List
[
str
],
def
load_commonvoice_item
(
line
:
List
[
str
],
header
:
List
[
str
],
header
:
List
[
str
],
path
:
str
,
path
:
str
,
folder_audio
:
str
)
->
Tuple
[
Tensor
,
int
,
Dict
[
str
,
str
]]:
folder_audio
:
str
,
ext_audio
:
str
)
->
Tuple
[
Tensor
,
int
,
Dict
[
str
,
str
]]:
# Each line as the following data:
# Each line as the following data:
# client_id, path, sentence, up_votes, down_votes, age, gender, accent
# client_id, path, sentence, up_votes, down_votes, age, gender, accent
assert
header
[
1
]
==
"path"
assert
header
[
1
]
==
"path"
fileid
=
line
[
1
]
fileid
=
line
[
1
]
filename
=
os
.
path
.
join
(
path
,
folder_audio
,
fileid
)
filename
=
os
.
path
.
join
(
path
,
folder_audio
,
fileid
)
if
not
filename
.
endswith
(
ext_audio
):
filename
+=
ext_audio
waveform
,
sample_rate
=
torchaudio
.
load
(
filename
)
waveform
,
sample_rate
=
torchaudio
.
load
(
filename
)
dic
=
dict
(
zip
(
header
,
line
))
dic
=
dict
(
zip
(
header
,
line
))
...
@@ -95,7 +97,7 @@ class COMMONVOICE(Dataset):
...
@@ -95,7 +97,7 @@ class COMMONVOICE(Dataset):
``up_votes``, ``down_votes``, ``age``, ``gender`` and ``accent``.
``up_votes``, ``down_votes``, ``age``, ``gender`` and ``accent``.
"""
"""
line
=
self
.
_walker
[
n
]
line
=
self
.
_walker
[
n
]
return
load_commonvoice_item
(
line
,
self
.
_header
,
self
.
_path
,
self
.
_folder_audio
)
return
load_commonvoice_item
(
line
,
self
.
_header
,
self
.
_path
,
self
.
_folder_audio
,
self
.
_ext_audio
)
def
__len__
(
self
)
->
int
:
def
__len__
(
self
)
->
int
:
return
len
(
self
.
_walker
)
return
len
(
self
.
_walker
)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment