Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Torchaudio
Commits
914a846d
Unverified
Commit
914a846d
authored
Sep 15, 2020
by
Jaime Ferrando Huertas
Committed by
GitHub
Sep 15, 2020
Browse files
Add tedlium dataset (#882)
parent
b6a61c3f
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
375 additions
and
5 deletions
+375
-5
docs/source/datasets.rst
docs/source/datasets.rst
+11
-4
test/torchaudio_unittest/datasets/tedlium_test.py
test/torchaudio_unittest/datasets/tedlium_test.py
+153
-0
torchaudio/datasets/__init__.py
torchaudio/datasets/__init__.py
+3
-1
torchaudio/datasets/tedlium.py
torchaudio/datasets/tedlium.py
+208
-0
No files found.
docs/source/datasets.rst
View file @
914a846d
...
@@ -4,11 +4,11 @@ torchaudio.datasets
...
@@ -4,11 +4,11 @@ torchaudio.datasets
All datasets are subclasses of :class:`torch.utils.data.Dataset`
All datasets are subclasses of :class:`torch.utils.data.Dataset`
i.e, they have ``__getitem__`` and ``__len__`` methods implemented.
i.e, they have ``__getitem__`` and ``__len__`` methods implemented.
Hence, they can all be passed to a :class:`torch.utils.data.DataLoader`
Hence, they can all be passed to a :class:`torch.utils.data.DataLoader`
which can load multiple samples parallelly using ``torch.multiprocessing`` workers.
which can load multiple samples parallelly using ``torch.multiprocessing`` workers.
For example: ::
For example: ::
yesno_data = torchaudio.datasets.YESNO('.', download=True)
yesno_data = torchaudio.datasets.YESNO('.', download=True)
data_loader = torch.utils.data.DataLoader(yesno_data,
data_loader = torch.utils.data.DataLoader(yesno_data,
batch_size=1,
batch_size=1,
shuffle=True,
shuffle=True,
num_workers=args.nThreads)
num_workers=args.nThreads)
...
@@ -22,7 +22,7 @@ All the datasets have almost similar API. They all have two common arguments:
...
@@ -22,7 +22,7 @@ All the datasets have almost similar API. They all have two common arguments:
``transform`` and ``target_transform`` to transform the input and target respectively.
``transform`` and ``target_transform`` to transform the input and target respectively.
.. currentmodule:: torchaudio.datasets
.. currentmodule:: torchaudio.datasets
CMUARCTIC
CMUARCTIC
...
@@ -81,6 +81,13 @@ SPEECHCOMMANDS
...
@@ -81,6 +81,13 @@ SPEECHCOMMANDS
:special-members:
:special-members:
TEDLIUM
~~~~~~~~~~~~~~
.. autoclass:: TEDLIUM
  :members: __getitem__, phoneme_dict
  :special-members:
VCTK
VCTK
~~~~
~~~~
...
...
test/torchaudio_unittest/datasets/tedlium_test.py
0 → 100644
View file @
914a846d
import
os
from
torchaudio.datasets
import
tedlium
from
torchaudio_unittest.common_utils
import
(
TempDirMixin
,
TorchaudioTestCase
,
get_whitenoise
,
save_wav
,
normalize_wav
,
)
# Used to generate a unique utterance for each dummy audio file
# STM format: <talk_id> <channel> <speaker_id> <start> <end> <identifier> <transcript>\n
# Five 2-second segments covering a 10-second whitenoise clip.
UTTERANCES = [
    "AaronHuey_2010X 1 AaronHuey_2010X 0.0 2.0 <o,f0,female> script1\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 2.0 4.0 <o,f0,female> script2\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 4.0 6.0 <o,f0,female> script3\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 6.0 8.0 <o,f0,female> script4\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 8.0 10.0 <o,f0,female> script5\n",
]

# Dummy pronunciation-dictionary entries ("<word> <phoneme> [<phoneme> ...]"),
# written to each release's .dic file and compared against dataset.phoneme_dict.
PHONEME = [
    "a AH",
    "a(2) EY",
    "aachen AA K AH N",
    "aad AE D",
    "aaden EY D AH N",
    "aadmi AE D M IY",
    "aae EY EY",
]
class TestTedlium(TempDirMixin, TorchaudioTestCase):
    """Tests TEDLIUM against a small generated dataset tree for all three releases.

    ``setUpClass`` builds, under a temp dir, the on-disk layout each release
    expects (``stm``/``sph`` subfolders, a ``.dic`` phoneme dictionary) and
    records the expected samples in ``cls.samples`` for comparison.
    """

    backend = "default"
    root_dir = None
    # Expected (waveform, sample_rate, transcript, talk_id, speaker_id, identifier)
    # tuples, keyed by release name.
    samples = {}

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.root_dir = dataset_dir = os.path.join(cls.root_dir, "tedlium")
        os.makedirs(dataset_dir, exist_ok=True)
        sample_rate = 16000  # 16kHz
        seed = 0
        for release in ["release1", "release2", "release3"]:
            # One 10-second mono clip per release; a distinct seed keeps the
            # releases' audio distinguishable from each other.
            data = get_whitenoise(sample_rate=sample_rate, duration=10.00, n_channels=1, dtype="float32", seed=seed)
            if release in ["release1", "release2"]:
                # Releases 1 & 2 nest the data under the default subset folder.
                release_dir = os.path.join(
                    dataset_dir,
                    tedlium._RELEASE_CONFIGS[release]["folder_in_archive"],
                    tedlium._RELEASE_CONFIGS[release]["subset"],
                )
            else:
                # Release 3 has no subsets; data lives under data_path instead.
                release_dir = os.path.join(
                    dataset_dir,
                    tedlium._RELEASE_CONFIGS[release]["folder_in_archive"],
                    tedlium._RELEASE_CONFIGS[release]["data_path"],
                )
            os.makedirs(release_dir, exist_ok=True)
            os.makedirs(os.path.join(release_dir, "stm"), exist_ok=True)  # Subfolder for transcripts
            os.makedirs(os.path.join(release_dir, "sph"), exist_ok=True)  # Subfolder for audio files
            filename = f"{release}.sph"
            path = os.path.join(os.path.join(release_dir, "sph"), filename)
            save_wav(path, data, sample_rate)
            trans_filename = f"{release}.stm"
            trans_path = os.path.join(os.path.join(release_dir, "stm"), trans_filename)
            with open(trans_path, "w") as f:
                f.write("".join(UTTERANCES))
            dict_filename = f"{release}.dic"
            dict_path = os.path.join(release_dir, dict_filename)
            with open(dict_path, "w") as f:
                f.write("\n".join(PHONEME))
            # Create a samples list to compare with
            cls.samples[release] = []
            for utterance in UTTERANCES:
                talk_id, _, speaker_id, start_time, end_time, identifier, transcript = utterance.split(" ", 6)
                start_time = int(float(start_time)) * sample_rate
                end_time = int(float(end_time)) * sample_rate
                sample = (
                    data[:, start_time:end_time],
                    sample_rate,
                    transcript,
                    talk_id,
                    speaker_id,
                    identifier,
                )
                cls.samples[release].append(sample)
            seed += 1

    def _validate_release(self, release):
        """Shared body for the per-release tests: iterate the dataset, compare
        every field of every sample against the recorded expectations, then
        point the dataset at the generated .dic file and check phoneme_dict."""
        dataset = tedlium.TEDLIUM(self.root_dir, release=release)
        num_samples = 0
        for i, (data, sample_rate, transcript, talk_id, speaker_id, identifier) in enumerate(dataset):
            self.assertEqual(data, self.samples[release][i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.samples[release][i][1]
            assert transcript == self.samples[release][i][2]
            assert talk_id == self.samples[release][i][3]
            assert speaker_id == self.samples[release][i][4]
            assert identifier == self.samples[release][i][5]
            num_samples += 1
        assert num_samples == len(self.samples[release])
        # The real releases ship their dictionary under a different name; the
        # fixture writes "<release>.dic", so redirect before reading it.
        dataset._dict_path = os.path.join(dataset._path, f"{release}.dic")
        phoneme_dict = dataset.phoneme_dict
        phonemes = [f"{key} {' '.join(value)}" for key, value in phoneme_dict.items()]
        assert phonemes == PHONEME

    def test_tedlium_release1(self):
        self._validate_release("release1")

    def test_tedlium_release2(self):
        self._validate_release("release2")

    def test_tedlium_release3(self):
        self._validate_release("release3")
torchaudio/datasets/__init__.py
View file @
914a846d
...
@@ -8,6 +8,7 @@ from .yesno import YESNO
...
@@ -8,6 +8,7 @@ from .yesno import YESNO
from
.ljspeech
import
LJSPEECH
from
.ljspeech
import
LJSPEECH
from
.cmuarctic
import
CMUARCTIC
from
.cmuarctic
import
CMUARCTIC
from
.libritts
import
LIBRITTS
from
.libritts
import
LIBRITTS
from
.tedlium
import
TEDLIUM
__all__
=
(
__all__
=
(
"COMMONVOICE"
,
"COMMONVOICE"
,
...
@@ -19,7 +20,8 @@ __all__ = (
...
@@ -19,7 +20,8 @@ __all__ = (
"LJSPEECH"
,
"LJSPEECH"
,
"GTZAN"
,
"GTZAN"
,
"CMUARCTIC"
,
"CMUARCTIC"
,
"LIBRITTS"
"LIBRITTS"
,
"diskcache_iterator"
,
"diskcache_iterator"
,
"bg_iterator"
,
"bg_iterator"
,
"TEDLIUM"
,
)
)
torchaudio/datasets/tedlium.py
0 → 100644
View file @
914a846d
import
os
from
typing
import
Tuple
import
torchaudio
from
torch
import
Tensor
from
torch.utils.data
import
Dataset
from
torchaudio.datasets.utils
import
(
download_url
,
extract_archive
,
)
# Per-release download/layout configuration for the three public TED-LIUM releases.
#   folder_in_archive:  top-level directory inside the extracted archive
#   url:                openslr.org download location of the archive
#   checksum:           hash passed to download_url as hash_value
#                       (64 hex chars — presumably SHA256; confirm against download_url)
#   data_path:          extra path component between the release folder and the data
#                       (only release 3 uses one)
#   subset:             default subset used when the caller passes None
#   supported_subsets:  subset values TEDLIUM.__init__ accepts for this release
#   dict:               filename of the phoneme dictionary shipped with the release
_RELEASE_CONFIGS = {
    "release1": {
        "folder_in_archive": "TEDLIUM_release1",
        "url": "http://www.openslr.org/resources/7/TEDLIUM_release1.tar.gz",
        "checksum": "30301975fd8c5cac4040c261c0852f57cfa8adbbad2ce78e77e4986957445f27",
        "data_path": "",
        "subset": "train",
        "supported_subsets": ["train", "test", "dev"],
        "dict": "TEDLIUM.150K.dic",
    },
    "release2": {
        "folder_in_archive": "TEDLIUM_release2",
        "url": "http://www.openslr.org/resources/19/TEDLIUM_release2.tar.gz",
        "checksum": "93281b5fcaaae5c88671c9d000b443cb3c7ea3499ad12010b3934ca41a7b9c58",
        "data_path": "",
        "subset": "train",
        "supported_subsets": ["train", "test", "dev"],
        "dict": "TEDLIUM.152k.dic",
    },
    "release3": {
        "folder_in_archive": "TEDLIUM_release-3",
        "url": "http://www.openslr.org/resources/51/TEDLIUM_release-3.tgz",
        "checksum": "ad1e454d14d1ad550bc2564c462d87c7a7ec83d4dc2b9210f22ab4973b9eccdb",
        "data_path": "data/",
        "subset": None,
        "supported_subsets": [None],
        "dict": "TEDLIUM.152k.dic",
    },
}
class TEDLIUM(Dataset):
    """Dataset for the TED-LIUM speech corpus. Supports releases 1, 2 and 3.

    Each item is a tuple
    ``(waveform, sample_rate, transcript, talk_id, speaker_id, identifier)``.

    Args:
        root (str): Path containing dataset or target path where its downloaded if needed
        release (str, optional): TEDLIUM identifier (release1,release2,release3). Defaults to "release1".
        subset (str, optional): train/dev/test for releases 1&2, None for release3. Defaults to train/None
        download (bool, optional): Download dataset in case it is not found in root path. Defaults to False.
        audio_ext (str, optional): Overwrite audio extension when loading items. Defaults to ".sph".

    Customization points:
        _load_audio: default per-sentence audio loader; overwrite it to customize how
            individual sentences are cut out of a full TED talk audio file
        phoneme_dict: property returning the phoneme dictionary of the release
    """

    def __init__(
        self,
        root: str,
        release: str = "release1",
        subset: str = None,
        download: bool = False,
        audio_ext=".sph",
    ) -> None:
        """Constructor for TEDLIUM dataset.

        Args:
            root (str): Path containing dataset or target path where its downloaded if needed
            release (str, optional): TEDLIUM identifier (release1,release2,release3). Defaults to "release1".
            subset (str, optional): train/dev/test for releases 1&2, None for release3. Defaults to train/None
            download (bool, optional): Download dataset in case it is not found in root path. Defaults to False.
            audio_ext (str, optional): Overwrite audio extension when loading items. Defaults to ".sph".

        Raises:
            RuntimeError: If the release identifier or subset does not match a supported value.
        """
        self._ext_audio = audio_ext
        if release in _RELEASE_CONFIGS:
            folder_in_archive = _RELEASE_CONFIGS[release]["folder_in_archive"]
            url = _RELEASE_CONFIGS[release]["url"]
            # Fall back to the release's default subset when none is given.
            subset = subset if subset else _RELEASE_CONFIGS[release]["subset"]
        else:
            raise RuntimeError(
                "The release {} does not match any of the supported tedlium releases{} ".format(
                    release, _RELEASE_CONFIGS.keys(),
                )
            )
        if subset not in _RELEASE_CONFIGS[release]["supported_subsets"]:
            raise RuntimeError(
                "The subset {} does not match any of the supported tedlium subsets{} ".format(
                    subset, _RELEASE_CONFIGS[release]["supported_subsets"],
                )
            )

        basename = os.path.basename(url)
        archive = os.path.join(root, basename)
        basename = basename.split(".")[0]

        self._path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["data_path"])
        if subset in ["train", "dev", "test"]:
            self._path = os.path.join(self._path, subset)

        if download:
            if not os.path.isdir(self._path):
                if not os.path.isfile(archive):
                    checksum = _RELEASE_CONFIGS[release]["checksum"]
                    download_url(url, root, hash_value=checksum)
                extract_archive(archive)

        # Build the sample index: one (fileid, line) entry per utterance of every
        # .stm transcript file found under <path>/stm.
        self._filelist = []
        stm_dir = os.path.join(self._path, "stm")
        for file in sorted(os.listdir(stm_dir)):
            if file.endswith(".stm"):
                stm_path = os.path.join(self._path, "stm", file)
                with open(stm_path) as f:
                    num_lines = len(f.readlines())
                fileid = file.replace(".stm", "")
                self._filelist.extend((fileid, line) for line in range(num_lines))

        # Dictionary is read lazily on first access of the phoneme_dict property.
        self._dict_path = os.path.join(root, folder_in_archive, _RELEASE_CONFIGS[release]["dict"])
        self._phoneme_dict = None

    def _load_tedlium_item(self, fileid: str, line: int, path: str) -> Tuple[Tensor, int, str, str, str, str]:
        """Loads a TEDLIUM dataset sample given a file name and corresponding sentence name.

        Args:
            fileid (str): File id to identify both text and audio files corresponding to the sample
            line (int): Line identifier for the sample inside the text file
            path (str): Dataset root path

        Returns:
            Tuple of (waveform, sample_rate, transcript, talk_id, speaker_id, identifier)
        """
        transcript_path = os.path.join(path, "stm", fileid)
        with open(transcript_path + ".stm") as f:
            transcript = f.readlines()[line]
            talk_id, _, speaker_id, start_time, end_time, identifier, transcript = transcript.split(" ", 6)

        wave_path = os.path.join(path, "sph", fileid)
        waveform, sample_rate = self._load_audio(wave_path + self._ext_audio, start_time=start_time, end_time=end_time)

        return (waveform, sample_rate, transcript, talk_id, speaker_id, identifier)

    def _load_audio(self, path: str, start_time: float, end_time: float, sample_rate: int = 16000) -> Tuple[Tensor, int]:
        """Default load function used in TEDLIUM dataset, you can overwrite this function to customize functionality
        and load individual sentences from a full ted audio talk file.

        Args:
            path (str): Path to audio file
            start_time (float): Time in seconds where the sample sentence starts
            end_time (float): Time in seconds where the sample sentence finishes
            sample_rate (int, optional): Sample rate used to convert the times to frame offsets. Defaults to 16000.

        Returns:
            Tuple of (waveform, sample_rate)
        """
        # stm times are strings in seconds; convert to frame indices.
        start_time = int(float(start_time) * sample_rate)
        end_time = int(float(end_time) * sample_rate)
        if torchaudio.get_audio_backend() == "sox_io":
            # sox_io can seek and decode only the requested segment.
            return torchaudio.load(path, frame_offset=start_time, num_frames=end_time - start_time)
        # Other backends: load the whole file, then slice the waveform.
        # (torchaudio.load returns a (Tensor, int) tuple, so the waveform must be
        # unpacked before slicing.)
        waveform, sample_rate = torchaudio.load(path)
        return waveform[:, start_time:end_time], sample_rate

    def __getitem__(self, n: int) -> Tuple[Tensor, int, str, str, str, str]:
        """Loads the n-th TEDLIUM sample.

        Args:
            n (int): Index of sample to be loaded

        Returns:
            Tuple of (waveform, sample_rate, transcript, talk_id, speaker_id, identifier)
        """
        fileid, line = self._filelist[n]
        return self._load_tedlium_item(fileid, line, self._path)

    def __len__(self) -> int:
        """Returns the number of samples (utterances) in the dataset.

        Returns:
            int: TEDLIUM dataset length
        """
        return len(self._filelist)

    @property
    def phoneme_dict(self):
        """Returns the phoneme dictionary of a TEDLIUM release.

        Returns:
            dict: Mapping from word (str) to its phonemes (tuple of str) for the
            current release. A defensive copy is returned so callers cannot
            mutate the cached dictionary.
        """
        # Read phoneme dictionary lazily, only once.
        if not self._phoneme_dict:
            self._phoneme_dict = {}
            with open(self._dict_path, "r", encoding="utf-8") as f:
                for line in f.readlines():
                    content = line.strip().split()
                    self._phoneme_dict[content[0]] = tuple(content[1:])  # content[1:] can be empty list
        return self._phoneme_dict.copy()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment