Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
Torchaudio
Commits
9dcc7a15
Commit
9dcc7a15
authored
Apr 25, 2022
by
flyingdown
Browse files
init v0.10.0
parent
db2b0b79
Pipeline
#254
failed with stages
in 0 seconds
Changes
416
Pipelines
2
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
2308 additions
and
0 deletions
+2308
-0
test/torchaudio_unittest/common_utils/rnnt_utils.py
test/torchaudio_unittest/common_utils/rnnt_utils.py
+603
-0
test/torchaudio_unittest/common_utils/sox_utils.py
test/torchaudio_unittest/common_utils/sox_utils.py
+106
-0
test/torchaudio_unittest/common_utils/wav_utils.py
test/torchaudio_unittest/common_utils/wav_utils.py
+92
-0
test/torchaudio_unittest/compliance_kaldi_test.py
test/torchaudio_unittest/compliance_kaldi_test.py
+76
-0
test/torchaudio_unittest/datasets/__init__.py
test/torchaudio_unittest/datasets/__init__.py
+0
-0
test/torchaudio_unittest/datasets/cmuarctic_test.py
test/torchaudio_unittest/datasets/cmuarctic_test.py
+84
-0
test/torchaudio_unittest/datasets/cmudict_test.py
test/torchaudio_unittest/datasets/cmudict_test.py
+218
-0
test/torchaudio_unittest/datasets/commonvoice_test.py
test/torchaudio_unittest/datasets/commonvoice_test.py
+148
-0
test/torchaudio_unittest/datasets/datasets_test.py
test/torchaudio_unittest/datasets/datasets_test.py
+15
-0
test/torchaudio_unittest/datasets/gtzan_test.py
test/torchaudio_unittest/datasets/gtzan_test.py
+127
-0
test/torchaudio_unittest/datasets/librispeech_test.py
test/torchaudio_unittest/datasets/librispeech_test.py
+128
-0
test/torchaudio_unittest/datasets/libritts_test.py
test/torchaudio_unittest/datasets/libritts_test.py
+89
-0
test/torchaudio_unittest/datasets/ljspeech_test.py
test/torchaudio_unittest/datasets/ljspeech_test.py
+92
-0
test/torchaudio_unittest/datasets/speechcommands_test.py
test/torchaudio_unittest/datasets/speechcommands_test.py
+161
-0
test/torchaudio_unittest/datasets/tedlium_test.py
test/torchaudio_unittest/datasets/tedlium_test.py
+150
-0
test/torchaudio_unittest/datasets/utils_test.py
test/torchaudio_unittest/datasets/utils_test.py
+37
-0
test/torchaudio_unittest/datasets/vctk_test.py
test/torchaudio_unittest/datasets/vctk_test.py
+107
-0
test/torchaudio_unittest/datasets/yesno_test.py
test/torchaudio_unittest/datasets/yesno_test.py
+67
-0
test/torchaudio_unittest/example/__init__.py
test/torchaudio_unittest/example/__init__.py
+8
-0
test/torchaudio_unittest/example/souce_sepration/__init__.py
test/torchaudio_unittest/example/souce_sepration/__init__.py
+0
-0
No files found.
Too many changes to show.
To preserve performance only
416 of 416+
files are displayed.
Plain diff
Email patch
test/torchaudio_unittest/common_utils/rnnt_utils.py
0 → 100644
View file @
9dcc7a15
import
unittest
import
random
import
torch
import
numpy
as
np
from
torchaudio.functional
import
rnnt_loss
CPU_DEVICE
=
torch
.
device
(
"cpu"
)
class _NumpyTransducer(torch.autograd.Function):
    """Reference RNN-T (transducer) loss implemented in numpy.

    Serves as ground truth for validating torchaudio's native ``rnnt_loss``.
    The alpha/beta recursions follow Graves, "Sequence Transduction with
    Recurrent Neural Networks" (https://arxiv.org/pdf/1211.3711.pdf).
    """

    @staticmethod
    def forward(
        ctx,
        log_probs,
        logit_lengths,
        target_lengths,
        targets,
        blank=-1,
    ):
        """Compute per-sequence RNN-T costs.

        Args:
            ctx: autograd context used to stash gradients for backward.
            log_probs: (B, max_T, max_U, D) log-probabilities.
            logit_lengths: valid source (time) length per source sequence.
            target_lengths: valid target length per target sequence.
            targets: (B, max_target_length) label indices.
            blank: index of the blank label (-1 means the last class).
        """
        device = log_probs.device
        # All DP math is done on CPU with numpy; results are moved back to `device`.
        log_probs = log_probs.cpu().data.numpy()
        logit_lengths = logit_lengths.cpu().data.numpy()
        target_lengths = target_lengths.cpu().data.numpy()
        targets = targets.cpu().data.numpy()

        gradients, costs, _, _ = __class__.compute(
            log_probs=log_probs,
            logit_lengths=logit_lengths,
            target_lengths=target_lengths,
            targets=targets,
            blank=blank,
        )

        costs = torch.FloatTensor(costs).to(device=device)
        gradients = torch.FloatTensor(gradients).to(device=device)
        # Cache the full (B, T, U, D) gradient for use in backward().
        ctx.grads = torch.autograd.Variable(gradients)

        return costs

    @staticmethod
    def backward(ctx, grad_output):
        """Scale the cached gradients by the incoming per-sequence grad.

        The trailing ``None``s cover the remaining forward() inputs, which
        receive no gradient.
        """
        # Broadcast the (B,) upstream gradient over the (T, U, D) axes.
        grad_output = grad_output.view(-1, 1, 1, 1).to(ctx.grads)
        return ctx.grads.mul(grad_output), None, None, None, None, None, None, None, None

    @staticmethod
    def compute_alpha_one_sequence(log_probs, targets, blank=-1):
        """Forward (alpha) DP over one (max_T, max_U, D) lattice.

        Returns ``(alpha, cost)`` where ``cost`` is the negative total
        log-likelihood of the sequence.
        """
        max_T, max_U, D = log_probs.shape
        alpha = np.zeros((max_T, max_U), dtype=np.float32)
        # First column: only blank transitions advance time.
        for t in range(1, max_T):
            alpha[t, 0] = alpha[t - 1, 0] + log_probs[t - 1, 0, blank]

        # First row: only label emissions advance the target position.
        for u in range(1, max_U):
            alpha[0, u] = alpha[0, u - 1] + log_probs[0, u - 1, targets[u - 1]]

        # Interior: log-sum-exp of "skip" (blank) and "emit" (label) paths.
        for t in range(1, max_T):
            for u in range(1, max_U):
                skip = alpha[t - 1, u] + log_probs[t - 1, u, blank]
                emit = alpha[t, u - 1] + log_probs[t, u - 1, targets[u - 1]]
                alpha[t, u] = np.logaddexp(skip, emit)

        # Total cost includes the final blank emitted from the terminal node.
        cost = -(alpha[-1, -1] + log_probs[-1, -1, blank])
        return alpha, cost

    @staticmethod
    def compute_beta_one_sequence(log_probs, targets, blank=-1):
        """Backward (beta) DP over one (max_T, max_U, D) lattice.

        Returns ``(beta, cost)``; ``-beta[0, 0]`` equals the sequence cost.
        """
        max_T, max_U, D = log_probs.shape
        beta = np.zeros((max_T, max_U), dtype=np.float32)
        # Terminal node: probability of the closing blank.
        beta[-1, -1] = log_probs[-1, -1, blank]

        # Last column: only blank transitions remain.
        for t in reversed(range(max_T - 1)):
            beta[t, -1] = beta[t + 1, -1] + log_probs[t, -1, blank]

        # Last row: only label emissions remain.
        for u in reversed(range(max_U - 1)):
            beta[-1, u] = beta[-1, u + 1] + log_probs[-1, u, targets[u]]

        # Interior: combine the blank ("skip") and label ("emit") futures.
        for t in reversed(range(max_T - 1)):
            for u in reversed(range(max_U - 1)):
                skip = beta[t + 1, u] + log_probs[t, u, blank]
                emit = beta[t, u + 1] + log_probs[t, u, targets[u]]
                beta[t, u] = np.logaddexp(skip, emit)

        cost = -beta[0, 0]
        return beta, cost

    @staticmethod
    def compute_gradients_one_sequence(
        log_probs, alpha, beta, targets, blank=-1
    ):
        """Combine alpha/beta into d(cost)/d(log_probs) for one sequence."""
        max_T, max_U, D = log_probs.shape
        # Unreachable (t, u, d) entries keep -inf, i.e. zero gradient after exp().
        gradients = np.full(log_probs.shape, float("-inf"))
        cost = -beta[0, 0]

        # Blank transitions: terminal node, then all (t, u) -> (t+1, u) moves.
        gradients[-1, -1, blank] = alpha[-1, -1]
        gradients[:-1, :, blank] = alpha[:-1, :] + beta[1:, :]

        # Label emissions: (t, u) -> (t, u+1) moves for each target label l.
        for u, l in enumerate(targets):
            gradients[:, u, l] = alpha[:, u] + beta[:, u + 1]

        # Normalize by the sequence likelihood (adding cost == subtracting log P).
        gradients = -(np.exp(gradients + log_probs + cost))
        return gradients

    @staticmethod
    def compute(
        log_probs,
        logit_lengths,
        target_lengths,
        targets,
        blank=-1,
    ):
        """Run the per-sequence DP over a batch.

        Returns ``(gradients, costs, alphas, betas)`` as numpy arrays.
        """
        gradients = np.zeros_like(log_probs)
        B_tgt, max_T, max_U, D = log_probs.shape
        B_src = logit_lengths.shape[0]
        # H: presumably the number of hypotheses per source sequence
        # (B_tgt = H * B_src) — TODO confirm against callers.
        H = int(B_tgt / B_src)
        alphas = np.zeros((B_tgt, max_T, max_U))
        betas = np.zeros((B_tgt, max_T, max_U))
        betas.fill(float("-inf"))
        alphas.fill(float("-inf"))
        costs = np.zeros(B_tgt)
        for b_tgt in range(B_tgt):
            b_src = int(b_tgt / H)
            T = int(logit_lengths[b_src])
            # NOTE: see https://arxiv.org/pdf/1211.3711.pdf Section 2.1
            U = int(target_lengths[b_tgt]) + 1
            seq_log_probs = log_probs[b_tgt, :T, :U, :]
            seq_targets = targets[b_tgt, : int(target_lengths[b_tgt])]
            alpha, alpha_cost = __class__.compute_alpha_one_sequence(
                log_probs=seq_log_probs, targets=seq_targets, blank=blank
            )

            beta, beta_cost = __class__.compute_beta_one_sequence(
                log_probs=seq_log_probs, targets=seq_targets, blank=blank
            )

            seq_gradients = __class__.compute_gradients_one_sequence(
                log_probs=seq_log_probs,
                alpha=alpha,
                beta=beta,
                targets=seq_targets,
                blank=blank,
            )
            # Sanity check: forward and backward recursions must agree.
            np.testing.assert_almost_equal(alpha_cost, beta_cost, decimal=2)
            gradients[b_tgt, :T, :U, :] = seq_gradients
            costs[b_tgt] = beta_cost
            alphas[b_tgt, :T, :U] = alpha
            betas[b_tgt, :T, :U] = beta

        return gradients, costs, alphas, betas
class NumpyTransducerLoss(torch.nn.Module):
    """Transducer loss module backed by the numpy reference implementation.

    Applies a log-softmax over the class dimension of the logits and
    delegates the loss computation to :class:`_NumpyTransducer`.
    """

    def __init__(self, blank=-1):
        super().__init__()
        # Index of the blank label; -1 selects the last class.
        self.blank = blank

    def forward(
        self,
        logits,
        logit_lengths,
        target_lengths,
        targets,
    ):
        """Return the per-sequence RNN-T costs for a batch of logits."""
        normalized = torch.nn.functional.log_softmax(logits, dim=-1)
        return _NumpyTransducer.apply(
            normalized,
            logit_lengths,
            target_lengths,
            targets,
            self.blank,
        )
def compute_with_numpy_transducer(data):
    """Evaluate the numpy reference transducer on ``data``.

    Returns ``(costs, gradients)`` on CPU; gradients are taken from the
    backward hook that the data builders install on ``data["logits"]``.
    """
    loss_module = NumpyTransducerLoss(
        blank=data["blank"],
    )
    costs = loss_module(
        logits=data["logits"],
        logit_lengths=data["logit_lengths"],
        target_lengths=data["target_lengths"],
        targets=data["targets"],
    )

    torch.sum(costs).backward()
    return costs.cpu(), data["logits"].saved_grad.cpu()
def compute_with_pytorch_transducer(data):
    """Evaluate torchaudio's native ``rnnt_loss`` on ``data``.

    Returns ``(costs, gradients)`` on CPU; gradients are taken from the
    backward hook that the data builders install on ``data["logits"]``.
    """
    costs = rnnt_loss(
        logits=data["logits"],
        logit_lengths=data["logit_lengths"],
        target_lengths=data["target_lengths"],
        targets=data["targets"],
        blank=data["blank"],
        reduction="none",
    )

    torch.sum(costs).backward()
    return costs.cpu(), data["logits"].saved_grad.cpu()
def get_basic_data(device):
    """Return ``(logits, targets, logit_lengths, target_lengths)`` for the
    classic warp-transducer reference example.

    # Example provided
    # in 6f73a2513dc784c59eec153a45f40bc528355b18
    # of https://github.com/HawkAaron/warp-transducer
    """
    values = [
        [
            [
                [0.1, 0.6, 0.1, 0.1, 0.1],
                [0.1, 0.1, 0.6, 0.1, 0.1],
                [0.1, 0.1, 0.2, 0.8, 0.1],
            ],
            [
                [0.1, 0.6, 0.1, 0.1, 0.1],
                [0.1, 0.1, 0.2, 0.1, 0.1],
                [0.7, 0.1, 0.2, 0.1, 0.1],
            ],
        ]
    ]
    logits = torch.tensor(values, dtype=torch.float32, device=device)
    targets = torch.tensor([[1, 2]], dtype=torch.int, device=device)
    logit_lengths = torch.tensor([2], dtype=torch.int, device=device)
    target_lengths = torch.tensor([2], dtype=torch.int, device=device)

    # Enable gradient tracking so tests can exercise backward().
    logits.requires_grad_(True)

    return logits, targets, logit_lengths, target_lengths
def get_B1_T10_U3_D4_data(
    random=False,
    dtype=torch.float32,
    device=CPU_DEVICE,
):
    """Build an RNN-T data dict with logits of shape (2, 10, 3, 4).

    When ``random`` is False every logit is 0.1; otherwise logits are drawn
    uniformly.  A backward hook stores the gradient in ``logits.saved_grad``.
    """
    num_seqs, num_frames, num_labels, num_classes = 2, 10, 3, 4
    logits = torch.rand(
        num_seqs, num_frames, num_labels, num_classes, dtype=dtype, device=device
    )
    if not random:
        logits.fill_(0.1)
    logits.requires_grad_(True)

    def _capture_grad(grad):
        logits.saved_grad = grad.clone()

    logits.register_hook(_capture_grad)

    return {
        "logits": logits,
        "logit_lengths": torch.tensor([10, 10], dtype=torch.int32, device=device),
        "target_lengths": torch.tensor([2, 2], dtype=torch.int32, device=device),
        "targets": torch.tensor([[1, 2], [1, 2]], dtype=torch.int32, device=device),
        "blank": 0,
    }
def get_B1_T2_U3_D5_data(dtype=torch.float32, device=CPU_DEVICE):
    """Fixed B=1, T=2, U=3, D=5 example with precomputed reference values.

    Returns ``(data, ref_costs, ref_gradients)`` where ``data`` is the usual
    RNN-T input dict; gradients of the logits are captured into
    ``logits.saved_grad`` by a backward hook.
    """
    logits = torch.tensor(
        [
            0.1, 0.6, 0.1, 0.1, 0.1,
            0.1, 0.1, 0.6, 0.1, 0.1,
            0.1, 0.1, 0.2, 0.8, 0.1,
            0.1, 0.6, 0.1, 0.1, 0.1,
            0.1, 0.1, 0.2, 0.1, 0.1,
            0.7, 0.1, 0.2, 0.1, 0.1,
        ],
        dtype=dtype,
        device=device,
    ).reshape(1, 2, 3, 5)
    logits.requires_grad_(True)

    def grad_hook(grad):
        # Preserve a copy of the incoming gradient for test inspection.
        logits.saved_grad = grad.clone()

    logits.register_hook(grad_hook)

    targets = torch.tensor([[1, 2]], dtype=torch.int32, device=device)
    logit_lengths = torch.tensor([2], dtype=torch.int32, device=device)
    target_lengths = torch.tensor([2], dtype=torch.int32, device=device)

    # Blank is the last class (index -1) for this example.
    blank = -1

    # Reference cost/gradients (precomputed expected outputs for this input).
    ref_costs = torch.tensor([5.09566688538], dtype=dtype)
    ref_gradients = torch.tensor(
        [
            0.17703132, -0.39992708, 0.17703132, 0.17703132, -0.13116692,
            0.12247062, 0.12247062, -0.181684, 0.12247062, -0.1857276,
            0.06269141, 0.06269141, 0.06928471, 0.12624498, -0.32091248,
            0.05456069, -0.2182428, 0.05456069, 0.05456069, 0.05456069,
            0.12073967, 0.12073967, -0.48295838, 0.12073967, 0.12073967,
            0.30741188, 0.16871123, 0.18645471, 0.16871123, -0.83128875,
        ],
        dtype=dtype,
    ).reshape(1, 2, 3, 5)

    data = {
        "logits": logits,
        "targets": targets,
        "logit_lengths": logit_lengths,
        "target_lengths": target_lengths,
        "blank": blank,
    }

    return data, ref_costs, ref_gradients
def get_B2_T4_U3_D3_data(dtype=torch.float32, device=CPU_DEVICE):
    """Fixed B=2, T=4, U=3, D=3 example with precomputed reference values.

    Returns ``(data, ref_costs, ref_gradients)``; gradients of the logits are
    captured into ``logits.saved_grad`` by a backward hook.
    """
    # Test from D21322854
    logits = torch.tensor(
        [
            0.065357, 0.787530, 0.081592,
            0.529716, 0.750675, 0.754135,
            0.609764, 0.868140, 0.622532,
            0.668522, 0.858039, 0.164539,
            0.989780, 0.944298, 0.603168,
            0.946783, 0.666203, 0.286882,
            0.094184, 0.366674, 0.736168,
            0.166680, 0.714154, 0.399400,
            0.535982, 0.291821, 0.612642,
            0.324241, 0.800764, 0.524106,
            0.779195, 0.183314, 0.113745,
            0.240222, 0.339470, 0.134160,
            0.505562, 0.051597, 0.640290,
            0.430733, 0.829473, 0.177467,
            0.320700, 0.042883, 0.302803,
            0.675178, 0.569537, 0.558474,
            0.083132, 0.060165, 0.107958,
            0.748615, 0.943918, 0.486356,
            0.418199, 0.652408, 0.024243,
            0.134582, 0.366342, 0.295830,
            0.923670, 0.689929, 0.741898,
            0.250005, 0.603430, 0.987289,
            0.592606, 0.884672, 0.543450,
            0.660770, 0.377128, 0.358021,
        ],
        dtype=dtype,
        device=device,
    ).reshape(2, 4, 3, 3)
    logits.requires_grad_(True)

    def grad_hook(grad):
        # Preserve a copy of the incoming gradient for test inspection.
        logits.saved_grad = grad.clone()

    logits.register_hook(grad_hook)

    targets = torch.tensor([[1, 2], [1, 1]], dtype=torch.int32, device=device)
    logit_lengths = torch.tensor([4, 4], dtype=torch.int32, device=device)
    target_lengths = torch.tensor([2, 2], dtype=torch.int32, device=device)

    # Blank is class 0 for this example.
    blank = 0

    # Reference cost/gradients (precomputed expected outputs for this input).
    ref_costs = torch.tensor(
        [4.2806528590890736, 3.9384369822503591], dtype=dtype
    )
    ref_gradients = torch.tensor(
        [
            -0.186844, -0.062555, 0.249399,
            -0.203377, 0.202399, 0.000977,
            -0.141016, 0.079123, 0.061893,
            -0.011552, -0.081280, 0.092832,
            -0.154257, 0.229433, -0.075176,
            -0.246593, 0.146405, 0.100188,
            -0.012918, -0.061593, 0.074512,
            -0.055986, 0.219831, -0.163845,
            -0.497627, 0.209240, 0.288387,
            0.013605, -0.030220, 0.016615,
            0.113925, 0.062781, -0.176706,
            -0.667078, 0.367659, 0.299419,
            -0.356344, -0.055347, 0.411691,
            -0.096922, 0.029459, 0.067463,
            -0.063518, 0.027654, 0.035863,
            -0.154499, -0.073942, 0.228441,
            -0.166790, -0.000088, 0.166878,
            -0.172370, 0.105565, 0.066804,
            0.023875, -0.118256, 0.094381,
            -0.104707, -0.108934, 0.213642,
            -0.369844, 0.180118, 0.189726,
            0.025714, -0.079462, 0.053748,
            0.122328, -0.238789, 0.116460,
            -0.598687, 0.302203, 0.296484,
        ],
        dtype=dtype,
    ).reshape(2, 4, 3, 3)

    data = {
        "logits": logits,
        "targets": targets,
        "logit_lengths": logit_lengths,
        "target_lengths": target_lengths,
        "blank": blank,
    }

    return data, ref_costs, ref_gradients
def get_random_data(
    max_B=8,
    max_T=128,
    max_U=32,
    max_D=40,
    blank=-1,
    dtype=torch.float32,
    device=CPU_DEVICE,
    seed=None,
):
    """Build a randomly-sized RNN-T input dict.

    ``seed`` controls only the torch RNG (tensor contents); the python RNG
    that picks the B/T/U/D sizes is always reset to 0, so the shapes are
    the same across calls.  Only ``blank == -1`` is supported.
    """
    if seed is not None:
        torch.manual_seed(seed=seed)

    if blank != -1:
        raise ValueError("blank != -1 is not supported yet.")

    # Fixed python seed: the drawn sizes do not vary with `seed`.
    random.seed(0)
    batch = random.randint(1, max_B - 1)
    frames = random.randint(5, max_T - 1)
    labels = random.randint(5, max_U - 1)
    classes = random.randint(2, max_D - 1)

    logit_lengths = torch.randint(
        low=5, high=frames + 1, size=(batch,), dtype=torch.int32, device=device
    )
    target_lengths = torch.randint(
        low=5, high=labels + 1, size=(batch,), dtype=torch.int32, device=device
    )
    max_src_length = torch.max(logit_lengths)
    max_tgt_length = torch.max(target_lengths)

    targets = torch.randint(
        low=0,
        high=classes - 1,
        size=(batch, max_tgt_length),
        dtype=torch.int32,
        device=device,
    )
    logits = torch.rand(
        size=(batch, max_src_length, max_tgt_length + 1, classes),
        dtype=dtype,
        device=device,
    ).requires_grad_(True)

    def _capture_grad(grad):
        logits.saved_grad = grad.clone()

    logits.register_hook(_capture_grad)

    return {
        "logits": logits,
        "targets": targets,
        "logit_lengths": logit_lengths,
        "target_lengths": target_lengths,
        "blank": blank,
    }
def skipIfNoRNNT(test_item):
    """Decorator: skip ``test_item`` when the RNN-T loss op is unavailable.

    Accessing ``torch.ops.torchaudio.rnnt_loss`` raises ``RuntimeError`` when
    the torchaudio C++ extension was built without transducer support; in
    that case a ``unittest.skip`` wrapper is returned instead.
    """
    try:
        torch.ops.torchaudio.rnnt_loss
        return test_item
    except RuntimeError:
        return unittest.skip("torchaudio C++ extension is not compiled with RNN transducer loss")
test/torchaudio_unittest/common_utils/sox_utils.py
0 → 100644
View file @
9dcc7a15
import
sys
import
subprocess
import
warnings
def get_encoding(dtype):
    """Map a numpy-style dtype string to the matching sox encoding name.

    Raises ``KeyError`` for unsupported dtypes.
    """
    return {
        'float32': 'floating-point',
        'int32': 'signed-integer',
        'int16': 'signed-integer',
        'uint8': 'unsigned-integer',
    }[dtype]
def get_bit_depth(dtype):
    """Map a numpy-style dtype string to its bit depth for sox.

    Raises ``KeyError`` for unsupported dtypes.
    """
    return {
        'float32': 32,
        'int32': 32,
        'int16': 16,
        'uint8': 8,
    }[dtype]
def gen_audio_file(
    path, sample_rate, num_channels,
    *, encoding=None, bit_depth=None, compression=None, attenuation=None, duration=1,
    comment_file=None,
):
    """Generate synthetic audio file with `sox` command."""
    if path.endswith('.wav'):
        warnings.warn('Use get_wav_data and save_wav to generate wav file for accurate result.')
    command = [
        'sox',
        '-V3',  # verbose
        '--no-dither',  # disable automatic dithering
        '-R',
        # -R is supposed to be repeatable, though the implementation looks suspicious
        # and not setting the seed to a fixed value.
        # https://fossies.org/dox/sox-14.4.2/sox_8c_source.html
        # search "sox_globals.repeatable"
    ]
    if bit_depth is not None:
        command += ['--bits', str(bit_depth)]
    command += [
        '--rate', str(sample_rate),
        '--null',  # no input
        '--channels', str(num_channels),
    ]
    if compression is not None:
        command += ['--compression', str(compression)]
    # NOTE(review): '--bits' is appended twice when bit_depth is given. In sox,
    # format options apply to the file name that follows them, so the first
    # occurrence precedes '--null' (the input) and this one precedes the output
    # path — presumably intentional, but confirm.
    if bit_depth is not None:
        command += ['--bits', str(bit_depth)]
    if encoding is not None:
        command += ['--encoding', str(encoding)]
    if comment_file is not None:
        command += ['--comment-file', str(comment_file)]
    command += [
        str(path),
        'synth', str(duration),  # synthesizes for the given duration [sec]
        'sawtooth', '1',
        # saw tooth covers the both ends of value range, which is a good property for test.
        # similar to linspace(-1., 1.)
        # this introduces bigger boundary effect than sine when converted to mp3
    ]
    if attenuation is not None:
        command += ['vol', f'-{attenuation}dB']
    # Echo the exact command to stderr for debugging failed generations.
    print(' '.join(command), file=sys.stderr)
    subprocess.run(command, check=True)
def convert_audio_file(
        src_path, dst_path,
        *, encoding=None, bit_depth=None, compression=None):
    """Convert audio file with `sox` command.

    The optional ``encoding``/``bit_depth``/``compression`` are passed through
    to sox as output-format options (they precede the output path).
    """
    command = ['sox', '-V3', '--no-dither', '-R', str(src_path)]
    if encoding is not None:
        command += ['--encoding', str(encoding)]
    if bit_depth is not None:
        command += ['--bits', str(bit_depth)]
    if compression is not None:
        command += ['--compression', str(compression)]
    command += [dst_path]
    # Echo the exact command to stderr for debugging failed conversions.
    print(' '.join(command), file=sys.stderr)
    subprocess.run(command, check=True)
def
_flattern
(
effects
):
if
not
effects
:
return
effects
if
isinstance
(
effects
[
0
],
str
):
return
effects
return
[
item
for
sublist
in
effects
for
item
in
sublist
]
def run_sox_effect(input_file, output_file, effect, *, output_sample_rate=None, output_bitdepth=None):
    """Run sox effects.

    ``effect`` may be a flat list of strings or a list of per-effect lists;
    it is flattened and appended after the output file, per sox CLI syntax.
    An optional trailing ``rate`` effect resamples the output.
    """
    effect = _flattern(effect)
    command = ['sox', '-V', '--no-dither', input_file]
    if output_bitdepth:
        command += ['--bits', str(output_bitdepth)]
    command += [output_file] + effect
    if output_sample_rate:
        command += ['rate', str(output_sample_rate)]
    # Echo the exact command for debugging failed runs.
    print(' '.join(command))
    subprocess.run(command, check=True)
test/torchaudio_unittest/common_utils/wav_utils.py
0 → 100644
View file @
9dcc7a15
from
typing
import
Optional
import
torch
import
scipy.io.wavfile
def normalize_wav(tensor: torch.Tensor) -> torch.Tensor:
    """Scale integer PCM samples to float32 in [-1.0, 1.0].

    Positive and negative samples are divided by their respective maxima so
    that both ends of the integer range map exactly to +/-1.  float32 input
    (and any unrecognized dtype) is returned unchanged.
    """
    dtype = tensor.dtype
    if dtype == torch.int32:
        tensor = tensor.to(torch.float32)
        tensor[tensor > 0] /= 2147483647.
        tensor[tensor < 0] /= 2147483648.
    elif dtype == torch.int16:
        tensor = tensor.to(torch.float32)
        tensor[tensor > 0] /= 32767.
        tensor[tensor < 0] /= 32768.
    elif dtype == torch.uint8:
        # uint8 wav data is offset-binary: recenter around zero first.
        tensor = tensor.to(torch.float32) - 128
        tensor[tensor > 0] /= 127.
        tensor[tensor < 0] /= 128.
    return tensor
def get_wav_data(
    dtype: str,
    num_channels: int,
    *,
    num_frames: Optional[int] = None,
    normalize: bool = True,
    channels_first: bool = True,
):
    """Generate linear signal of the given dtype and num_channels

    Data range is
        [-1.0, 1.0] for float32,
        [-2147483648, 2147483647] for int32
        [-32768, 32767] for int16
        [0, 255] for uint8

    num_frames allow to change the linear interpolation parameter.
    Default values are 256 for uint8, else 1 << 16.
    1 << 16 as default is so that int16 value range is completely covered.
    """
    dtype_ = getattr(torch, dtype)

    if num_frames is None:
        # 256 frames cover every uint8 value; 1 << 16 covers all of int16.
        num_frames = 256 if dtype == 'uint8' else 1 << 16

    endpoints = {
        'uint8': (0, 255),
        'int8': (-128, 127),
        'float32': (-1., 1.),
        'float64': (-1., 1.),
        'int32': (-2147483648, 2147483647),
        'int16': (-32768, 32767),
    }
    if dtype not in endpoints:
        raise NotImplementedError(f'Unsupported dtype {dtype}')
    lo, hi = endpoints[dtype]
    base = torch.linspace(lo, hi, num_frames, dtype=dtype_)

    data = base.repeat([num_channels, 1])
    if not channels_first:
        data = data.transpose(1, 0)
    if normalize:
        data = normalize_wav(data)
    return data
def
load_wav
(
path
:
str
,
normalize
=
True
,
channels_first
=
True
)
->
torch
.
Tensor
:
"""Load wav file without torchaudio"""
sample_rate
,
data
=
scipy
.
io
.
wavfile
.
read
(
path
)
data
=
torch
.
from_numpy
(
data
.
copy
())
if
data
.
ndim
==
1
:
data
=
data
.
unsqueeze
(
1
)
if
normalize
:
data
=
normalize_wav
(
data
)
if
channels_first
:
data
=
data
.
transpose
(
1
,
0
)
return
data
,
sample_rate
def save_wav(path, data, sample_rate, channels_first=True):
    """Save a wav file without torchaudio.

    ``data`` is a (channel, time) tensor when ``channels_first`` (default);
    scipy expects (time, channel), so it is transposed before writing.
    """
    samples = data.transpose(1, 0) if channels_first else data
    scipy.io.wavfile.write(path, sample_rate, samples.numpy())
test/torchaudio_unittest/compliance_kaldi_test.py
0 → 100644
View file @
9dcc7a15
import
torch
import
torchaudio.compliance.kaldi
as
kaldi
from
torchaudio_unittest
import
common_utils
def extract_window(window, wave, f, frame_length, frame_shift, snip_edges):
    """Fill row ``f`` of ``window`` with frame ``f`` extracted from ``wave``.

    # just a copy of ExtractWindow from feature-window.cc in python
    When ``snip_edges`` is False, out-of-range samples are filled by
    reflecting the waveform at its boundaries.
    """
    def first_sample_of_frame(frame, window_size, window_shift, snip_edges):
        # Index of the first sample of `frame`: frames are left-aligned when
        # snipping edges, otherwise centered on the frame midpoint.
        if snip_edges:
            return frame * window_shift
        else:
            midpoint_of_frame = frame * window_shift + window_shift // 2
            beginning_of_frame = midpoint_of_frame - window_size // 2
            return beginning_of_frame

    sample_offset = 0
    num_samples = sample_offset + wave.size(0)
    start_sample = first_sample_of_frame(f, frame_length, frame_shift, snip_edges)
    end_sample = start_sample + frame_length

    if snip_edges:
        # Snipped frames must lie entirely inside the waveform.
        assert(start_sample >= sample_offset and end_sample <= num_samples)
    else:
        assert(sample_offset == 0 or start_sample >= sample_offset)

    wave_start = start_sample - sample_offset
    wave_end = wave_start + frame_length
    if wave_start >= 0 and wave_end <= wave.size(0):
        # Fast path: the whole frame is in range — copy a slice.
        window[f, :] = wave[wave_start:(wave_start + frame_length)]
    else:
        # Slow path: reflect indices at the boundaries until they land
        # inside [0, wave_dim); mirrors Kaldi's edge handling.
        wave_dim = wave.size(0)
        for s in range(frame_length):
            s_in_wave = s + wave_start
            while s_in_wave < 0 or s_in_wave >= wave_dim:
                if s_in_wave < 0:
                    s_in_wave = -s_in_wave - 1
                else:
                    s_in_wave = 2 * wave_dim - 1 - s_in_wave
            window[f, s] = wave[s_in_wave]
class Test_Kaldi(common_utils.TempDirMixin, common_utils.TorchaudioTestCase):
    """Tests for ``torchaudio.compliance.kaldi`` framing helpers."""

    def _test_get_strided_helper(self, num_samples, window_size, window_shift, snip_edges):
        """Check ``kaldi._get_strided`` against the reference extract_window."""
        waveform = torch.arange(num_samples).float()
        output = kaldi._get_strided(waveform, window_size, window_shift, snip_edges)

        # Expected frame count, from NumFrames in feature-window.cc.
        n = window_size
        if snip_edges:
            m = 0 if num_samples < window_size else 1 + (num_samples - window_size) // window_shift
        else:
            m = (num_samples + (window_shift // 2)) // window_shift

        self.assertTrue(output.dim() == 2)
        self.assertTrue(output.shape[0] == m and output.shape[1] == n)

        # Rebuild every frame with the python reference and compare.
        window = torch.empty((m, window_size))
        for r in range(m):
            extract_window(window, waveform, r, window_size, window_shift, snip_edges)
        self.assertEqual(window, output)

    def test_get_strided(self):
        # generate any combination where 0 < window_size <= num_samples and
        # 0 < window_shift.
        for num_samples in range(1, 20):
            for window_size in range(1, num_samples + 1):
                for window_shift in range(1, 2 * num_samples + 1):
                    for snip_edges in range(0, 2):
                        self._test_get_strided_helper(
                            num_samples, window_size, window_shift, snip_edges
                        )

    def test_mfcc_empty(self):
        # Passing in an empty tensor should result in an error
        self.assertRaises(AssertionError, kaldi.mfcc, torch.empty(0))
test/torchaudio_unittest/datasets/__init__.py
0 → 100644
View file @
9dcc7a15
test/torchaudio_unittest/datasets/cmuarctic_test.py
0 → 100644
View file @
9dcc7a15
import
os
from
pathlib
import
Path
from
torchaudio.datasets
import
cmuarctic
from
torchaudio_unittest.common_utils
import
(
TempDirMixin
,
TorchaudioTestCase
,
get_whitenoise
,
save_wav
,
normalize_wav
,
)
def get_mock_dataset(root_dir):
    """
    root_dir: directory to the mocked dataset

    Creates a fake CMU ARCTIC layout (speaker ``cmu_us_aew_arctic``) with ten
    whitenoise utterances and a ``txt.done.data`` transcript file, and returns
    the expected samples as ``(waveform, sample_rate, transcript, utterance_id)``
    tuples in creation order.
    """
    mocked_data = []
    sample_rate = 16000
    transcript = "This is a test transcript."

    base_dir = os.path.join(root_dir, "ARCTIC", "cmu_us_aew_arctic")
    txt_dir = os.path.join(base_dir, "etc")
    os.makedirs(txt_dir, exist_ok=True)
    txt_file = os.path.join(txt_dir, "txt.done.data")
    audio_dir = os.path.join(base_dir, "wav")
    os.makedirs(audio_dir, exist_ok=True)
    seed = 42

    with open(txt_file, "w") as txt:
        for c in ["a", "b"]:
            for i in range(5):
                utterance_id = f"arctic_{c}{i:04d}"
                path = os.path.join(audio_dir, f"{utterance_id}.wav")
                # Distinct seed per utterance so waveforms differ.
                data = get_whitenoise(
                    sample_rate=sample_rate,
                    duration=3,
                    n_channels=1,
                    dtype="int16",
                    seed=seed,
                )
                save_wav(path, data, sample_rate)
                sample = (
                    normalize_wav(data),
                    sample_rate,
                    transcript,
                    utterance_id.split("_")[1],
                )
                mocked_data.append(sample)
                # NOTE(review): whitespace inside this literal was collapsed by
                # the page scrape; reconstructed as the upstream
                # '( id "transcript" )' format — confirm against the dataset
                # parser.
                txt.write(f'( {utterance_id} "{transcript}" )\n')
                seed += 1
    return mocked_data
class TestCMUARCTIC(TempDirMixin, TorchaudioTestCase):
    """Tests CMUARCTIC dataset loading against the mocked on-disk data."""

    backend = "default"
    # Populated once per class by setUpClass.
    root_dir = None
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.samples = get_mock_dataset(cls.root_dir)

    def _test_cmuarctic(self, dataset):
        """Iterate ``dataset`` and compare every item to the mocked samples."""
        n_ite = 0
        for i, (waveform, sample_rate, transcript, utterance_id) in enumerate(dataset):
            expected_sample = self.samples[i]
            assert sample_rate == expected_sample[1]
            assert transcript == expected_sample[2]
            assert utterance_id == expected_sample[3]
            # Loose tolerance: the waveform went through an int16 wav roundtrip.
            self.assertEqual(expected_sample[0], waveform, atol=5e-5, rtol=1e-8)
            n_ite += 1
        assert n_ite == len(self.samples)

    def test_cmuarctic_str(self):
        # Root given as a plain string.
        dataset = cmuarctic.CMUARCTIC(self.root_dir)
        self._test_cmuarctic(dataset)

    def test_cmuarctic_path(self):
        # Root given as a pathlib.Path.
        dataset = cmuarctic.CMUARCTIC(Path(self.root_dir))
        self._test_cmuarctic(dataset)
test/torchaudio_unittest/datasets/cmudict_test.py
0 → 100644
View file @
9dcc7a15
import
os
from
pathlib
import
Path
from
torchaudio.datasets
import
CMUDict
from
torchaudio_unittest.common_utils
import
(
TempDirMixin
,
TorchaudioTestCase
,
)
def get_mock_dataset(root_dir, return_punc=False):
    """
    root_dir: directory to the mocked dataset

    Writes a fake ``cmudict-0.7b`` dictionary plus its ``.symbols`` file into
    ``root_dir`` and returns the expected ``(word, [phones])`` entries —
    punctuation entries included first when ``return_punc`` is True.

    NOTE(review): the page scrape collapsed runs of whitespace inside the
    string literals below.  The word/phones separator is reconstructed as TWO
    spaces (required for the two-element ``ent.split(...)`` unpacking at the
    bottom to succeed), and "&AMPERSAND" is reconstructed from "&ERSAND"
    (the scrape swallowed the "&AMP" HTML entity) — confirm against upstream.
    """
    header = [
        ";;; # CMUdict -- Major Version: 0.07",
        ";;; ",
        ";;; # $HeadURL$",
    ]
    puncs = [
        "!EXCLAMATION-POINT  EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T",
        "\"CLOSE-QUOTE  K L OW1 Z K W OW1 T",
        "#HASH-MARK  HH AE1 M AA2 R K",
        "%PERCENT  P ER0 S EH1 N T",
        "&AMPERSAND  AE1 M P ER0 S AE2 N D",
        "'END-INNER-QUOTE  EH1 N D IH1 N ER0 K W OW1 T",
        "(BEGIN-PARENS  B IH0 G IH1 N P ER0 EH1 N Z",
        ")CLOSE-PAREN  K L OW1 Z P ER0 EH1 N",
        "+PLUS  P L UH1 S",
        ",COMMA  K AA1 M AH0",
        "--DASH  D AE1 SH",
        "!EXCLAMATION-POINT  EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T",
        "/SLASH  S L AE1 SH",
        ":COLON  K OW1 L AH0 N",
        ";SEMI-COLON  S EH1 M IY0 K OW1 L AH0 N",
        "?QUESTION-MARK  K W EH1 S CH AH0 N M AA1 R K",
        "{BRACE  B R EY1 S",
        "}CLOSE-BRACE  K L OW1 Z B R EY1 S",
        "...ELLIPSIS  IH2 L IH1 P S IH0 S",
    ]
    # Expected dataset output for each punctuation entry above, in order.
    punc_outputs = [
        "!",
        "\"",
        "#",
        "%",
        "&",
        "'",
        "(",
        ")",
        "+",
        ",",
        "--",
        "!",
        "/",
        ":",
        ";",
        "?",
        "{",
        "}",
        "...",
    ]
    words = [
        "3-D  TH R IY1 D IY2",
        "'BOUT  B AW1 T",
        "'CAUSE  K AH0 Z",
        "'TWAS  T W AH1 Z",
        "A  AH0",
        "B  B IY1",
        "C  S IY1",
        "D  D IY1",
        "E  IY1",
        "F  EH1 F",
        "G  JH IY1",
        "H  EY1 CH",
        "I  AY1",
        "J  JH EY1",
        "K  K EY1",
        "L  EH1 L",
        "M  EH1 M",
        "N  EH1 N",
        "O  OW1",
        "P  P IY1",
        "Q  K Y UW1",
        "R  AA1 R",
        "S  EH1 S",
        "T  T IY1",
        "U  Y UW1",
        "V  V IY1",
        "X  EH1 K S",
        "Y  W AY1",
        "Z  Z IY1",
    ]
    mocked_symbols = [
        "AA1",
        "AA2",
        "AE1",
        "AE2",
        "AH0",
        "AH1",
        "AY1",
        "B",
        "CH",
        "D",
        "EH1",
        "EH2",
        "ER0",
        "EY1",
        "F",
        "G",
        "HH",
        "IH0",
        "IH1",
        "IY0",
        "IY1",
        "IY2",
        "JH",
        "K",
        "L",
        "M",
        "N",
        "OW1",
        "OY2",
        "P",
        "R",
        "S",
        "SH",
        "T",
        "TH",
        "UH1",
        "UW0",
        "UW1",
        "V",
        "W",
        "Y",
        "Z",
    ]

    dict_file = os.path.join(root_dir, "cmudict-0.7b")
    symbol_file = os.path.join(root_dir, "cmudict-0.7b.symbols")

    # Write the dictionary: header, punctuation entries, then word entries.
    with open(dict_file, "w") as fileobj:
        for section in [header, puncs, words]:
            for line in section:
                fileobj.write(line)
                fileobj.write("\n")
    with open(symbol_file, "w") as txt:
        txt.write("\n".join(mocked_symbols))

    # Build the expected (word, phones-list) pairs.
    mocked_data = []
    if return_punc:
        for i, ent in enumerate(puncs):
            _, phones = ent.split("  ")
            mocked_data.append((punc_outputs[i], phones.split(" ")))
    for ent in words:
        word, phones = ent.split("  ")
        mocked_data.append((word, phones.split(" ")))

    return mocked_data
class TestCMUDict(TempDirMixin, TorchaudioTestCase):
    """Tests CMUDict dataset parsing against the mocked dictionary files."""

    # Two mock roots: one built without punctuation entries, one with.
    root_dir = None
    root_punc_dir = None
    samples = []
    punc_samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = os.path.join(cls.get_base_temp_dir(), "normal")
        os.mkdir(cls.root_dir)
        cls.samples = get_mock_dataset(cls.root_dir)
        cls.root_punc_dir = os.path.join(cls.get_base_temp_dir(), "punc")
        os.mkdir(cls.root_punc_dir)
        cls.punc_samples = get_mock_dataset(cls.root_punc_dir, return_punc=True)

    def _test_cmudict(self, dataset):
        """Test if the dataset is reading the mocked data correctly."""
        n_item = 0
        for i, (word, phones) in enumerate(dataset):
            expected_word, expected_phones = self.samples[i]
            assert word == expected_word
            assert phones == expected_phones
            n_item += 1
        assert n_item == len(self.samples)

    def _test_punc_cmudict(self, dataset):
        """Test if the dataset is reading the mocked data with punctuations correctly."""
        n_item = 0
        for i, (word, phones) in enumerate(dataset):
            expected_word, expected_phones = self.punc_samples[i]
            assert word == expected_word
            assert phones == expected_phones
            n_item += 1
        assert n_item == len(self.punc_samples)

    # NOTE(review): the four test method names below say "cmuarctic" but they
    # exercise CMUDict — apparent copy-paste misnomer; kept as-is since the
    # test runner discovers them by name.
    def test_cmuarctic_path_with_punctuation(self):
        dataset = CMUDict(Path(self.root_punc_dir), exclude_punctuations=False)
        self._test_punc_cmudict(dataset)

    def test_cmuarctic_str_with_punctuation(self):
        dataset = CMUDict(self.root_punc_dir, exclude_punctuations=False)
        self._test_punc_cmudict(dataset)

    def test_cmuarctic_path(self):
        # Punctuation-excluding read of the punc dir yields only the words.
        dataset = CMUDict(Path(self.root_punc_dir), exclude_punctuations=True)
        self._test_cmudict(dataset)

    def test_cmuarctic_str(self):
        dataset = CMUDict(self.root_punc_dir, exclude_punctuations=True)
        self._test_cmudict(dataset)
test/torchaudio_unittest/datasets/commonvoice_test.py
0 → 100644
View file @
9dcc7a15
import
csv
import
os
from
pathlib
import
Path
from
typing
import
Tuple
,
Dict
from
torch
import
Tensor
from
torchaudio_unittest.common_utils
import
(
TempDirMixin
,
TorchaudioTestCase
,
get_whitenoise
,
save_wav
,
normalize_wav
,
)
from
torchaudio.datasets
import
COMMONVOICE
# Remember the real audio extension so the test monkey-patch can be undone.
_ORIGINAL_EXT_AUDIO = COMMONVOICE._ext_audio
# Sample rate of the generated white-noise clips.
_SAMPLE_RATE = 48000
# Column header written to the mocked train.tsv.
# NOTE(review): real CommonVoice TSVs use "client_id" (singular) — confirm
# "client_ids" is intentional here.
_HEADERS = [u"client_ids", u"path", u"sentence", u"up_votes", u"down_votes", u"age", u"gender", u"accent"]
# English metadata rows; the first row deliberately leaves age/gender/accent
# empty, mirroring missing values in the original data.
_EN_TRAIN_CSV_CONTENTS = [
    ["9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
     "common_voice_en_18885784.wav",
     "He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.",
     "2", "0", "", "", ""],
    ["c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
     "common_voice_en_556542.wav",
     "Once more into the breach",
     "2", "0", "thirties", "male", "us"],
    ["f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
     "common_voice_en_18607573.wav",
     "Caddy, show Miss Clare and Miss Summerson their rooms.",
     "2", "0", "twenties", "male", "canada"],
]
# French metadata rows; the long hash strings are split with implicit string
# concatenation purely for line length.
_FR_TRAIN_CSV_CONTENTS = [
    ["a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef"
     "18343441c601cae0597a4b0d3144",
     "89e67e7682b36786a0b4b4022c4d42090c86edd96c78c12d30088e62522b8fe466ea4912e6a1055dfb91b296a0743e0a2bbe"
     "16cebac98ee5349e3e8262cb9329",
     "Or sur ce point nous n’avons aucune réponse de votre part.",
     "2", "0", "twenties", "male", "france"],
    ["a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef18"
     "343441c601cae0597a4b0d3144",
     "87d71819a26179e93acfee149d0b21b7bf5e926e367d80b2b3792d45f46e04853a514945783ff764c1fc237b4eb0ee2b0a7a7"
     "cbd395acbdfcfa9d76a6e199bbd",
     "Monsieur de La Verpillière, laissez parler le ministre",
     "2", "0", "twenties", "male", "france"],
]
def get_mock_dataset(root_dir, train_csv_contents, ext_audio) -> Tuple[Tensor, int, Dict[str, str]]:
    """
    prepares mocked dataset

    Writes a ``train.tsv`` metadata file plus one white-noise clip per row
    under ``root_dir`` and returns the expected samples as a list of
    ``(normalized waveform, sample rate, metadata dict)`` tuples.

    Fix: the row is copied before the sentence column is re-encoded.  The
    previous in-place ``content[2] = ...`` mutated the caller's module-level
    ``*_TRAIN_CSV_CONTENTS`` constants, so calling this function twice with
    the same contents would double-encode the sentences.
    """
    mocked_data = []
    # Note: extension is changed to wav for the sake of test
    # Note: the first content is missing values for `age`, `gender` and `accent` as in the original data.
    # Tsv file name difference does not mean different subset, testing as a whole dataset here
    tsv_filename = os.path.join(root_dir, "train.tsv")
    audio_base_path = os.path.join(root_dir, "clips")
    os.makedirs(audio_base_path, exist_ok=True)
    with open(tsv_filename, "w", newline='') as tsv:
        writer = csv.writer(tsv, delimiter='\t')
        writer.writerow(_HEADERS)
        for i, content in enumerate(train_csv_contents):
            # Copy so the caller's (shared, module-level) row is not mutated.
            content = list(content)
            content[2] = str(content[2].encode("utf-8"))
            writer.writerow(content)
            # Ensure the on-disk clip carries the expected audio extension.
            if not content[1].endswith(ext_audio):
                audio_path = os.path.join(audio_base_path, content[1] + ext_audio)
            else:
                audio_path = os.path.join(audio_base_path, content[1])
            data = get_whitenoise(
                sample_rate=_SAMPLE_RATE, duration=1, n_channels=1, seed=i, dtype='float32')
            save_wav(audio_path, data, _SAMPLE_RATE)
            # Append data entry
            mocked_data.append((normalize_wav(data), _SAMPLE_RATE, dict(zip(_HEADERS, content))))
    return mocked_data
def get_mock_dataset_en(root_dir, ext_audio) -> Tuple[Tensor, int, Dict[str, str]]:
    """Build the mocked English CommonVoice dataset under ``root_dir``."""
    return get_mock_dataset(root_dir, _EN_TRAIN_CSV_CONTENTS, ext_audio)
def get_mock_dataset_fr(root_dir, ext_audio) -> Tuple[Tensor, int, Dict[str, str]]:
    """Build the mocked French CommonVoice dataset under ``root_dir``."""
    return get_mock_dataset(root_dir, _FR_TRAIN_CSV_CONTENTS, ext_audio)
class BaseTestCommonVoice(TempDirMixin):
    """Shared fixture for CommonVoice tests.

    Monkey-patches ``COMMONVOICE._ext_audio`` to ".wav" for the lifetime of
    the class (the mocks store wav clips) and provides the comparison helper.
    Subclasses populate ``cls.data`` with the expected samples.
    """
    root_dir = None
    data = None

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.root_dir = cls.get_base_temp_dir()
        # The mock writes wav files, so make the dataset look for ".wav".
        COMMONVOICE._ext_audio = ".wav"

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        # Undo the class-wide monkey-patch.
        COMMONVOICE._ext_audio = _ORIGINAL_EXT_AUDIO

    def _test_commonvoice(self, dataset):
        """Iterate ``dataset`` and compare every sample with ``self.data``."""
        n_ite = 0
        for i, (waveform, sample_rate, dictionary) in enumerate(dataset):
            expected_data, _, expected_dictionary = self.data[i]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == _SAMPLE_RATE
            assert dictionary == expected_dictionary
            n_ite += 1
        assert n_ite == len(self.data)
class TestCommonVoiceEN(BaseTestCommonVoice, TorchaudioTestCase):
    """English CommonVoice checks, with both str and pathlib.Path roots."""
    backend = 'default'
    root_dir = None

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.data = get_mock_dataset_en(cls.root_dir, COMMONVOICE._ext_audio)

    def test_commonvoice_str(self):
        self._test_commonvoice(COMMONVOICE(self.root_dir))

    def test_commonvoice_path(self):
        self._test_commonvoice(COMMONVOICE(Path(self.root_dir)))
class TestCommonVoiceFR(BaseTestCommonVoice, TorchaudioTestCase):
    """French CommonVoice checks, with both str and pathlib.Path roots.

    Fix: added ``test_commonvoice_path`` which was present in the English
    counterpart (``TestCommonVoiceEN``) but missing here, so the Path-based
    constructor was never exercised for the French data.
    """
    backend = 'default'
    root_dir = None

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.data = get_mock_dataset_fr(cls.root_dir, COMMONVOICE._ext_audio)

    def test_commonvoice_str(self):
        dataset = COMMONVOICE(self.root_dir)
        self._test_commonvoice(dataset)

    def test_commonvoice_path(self):
        dataset = COMMONVOICE(Path(self.root_dir))
        self._test_commonvoice(dataset)
test/torchaudio_unittest/datasets/datasets_test.py
0 → 100644
View file @
9dcc7a15
from
torchaudio.datasets.vctk
import
VCTK
from
torchaudio_unittest.common_utils
import
(
TorchaudioTestCase
,
get_asset_path
,
)
class TestDatasets(TorchaudioTestCase):
    """Smoke test: VCTK can be built from the bundled test assets and indexed."""
    backend = 'default'
    path = get_asset_path()

    def test_vctk(self):
        dataset = VCTK(self.path)
        dataset[0]  # loading the first sample must not raise
test/torchaudio_unittest/datasets/gtzan_test.py
0 → 100644
View file @
9dcc7a15
import
os
from
pathlib
import
Path
from
torchaudio.datasets
import
gtzan
from
torchaudio_unittest.common_utils
import
(
TempDirMixin
,
TorchaudioTestCase
,
get_whitenoise
,
save_wav
,
normalize_wav
,
)
def get_mock_dataset(root_dir):
    """
    root_dir: directory to the mocked dataset

    Generates 100 short white-noise clips per GTZAN genre and returns four
    expected-sample lists: (all, training, validation, testing).  Each sample
    is ``(normalized waveform, sample rate, genre)``; a clip is added to a
    split list when its filename appears in the corresponding filtered split.
    """
    all_samples = []
    train_samples = []
    valid_samples = []
    test_samples = []
    sample_rate = 22050
    seed = 0
    for genre in gtzan.gtzan_genres:
        genre_dir = os.path.join(root_dir, 'genres', genre)
        os.makedirs(genre_dir, exist_ok=True)
        for track in range(100):
            filename = f'{genre}.{track:05d}'
            wav_path = os.path.join(genre_dir, f'{filename}.wav')
            waveform = get_whitenoise(
                sample_rate=sample_rate, duration=0.01, n_channels=1, dtype='int16', seed=seed)
            save_wav(wav_path, waveform, sample_rate)
            sample = (normalize_wav(waveform), sample_rate, genre)
            all_samples.append(sample)
            if filename in gtzan.filtered_test:
                test_samples.append(sample)
            if filename in gtzan.filtered_train:
                train_samples.append(sample)
            if filename in gtzan.filtered_valid:
                valid_samples.append(sample)
            seed += 1
    return (all_samples, train_samples, valid_samples, test_samples)
class TestGTZAN(TempDirMixin, TorchaudioTestCase):
    """Tests GTZAN against a mocked on-disk dataset.

    ``samples`` holds every generated clip; ``training`` / ``validation`` /
    ``testing`` hold the clips that belong to the corresponding filtered split.

    Fix: the four verbatim copies of the sample-comparison loop (whole
    dataset, training, validation, testing) are collapsed into a single
    ``_assert_dataset`` helper; the ``_test_*`` entry points are kept as thin
    wrappers so existing call sites are unchanged.
    """
    backend = 'default'
    root_dir = None
    samples = []
    training = []
    validation = []
    testing = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        mocked_data = get_mock_dataset(cls.root_dir)
        cls.samples = mocked_data[0]
        cls.training = mocked_data[1]
        cls.validation = mocked_data[2]
        cls.testing = mocked_data[3]

    def _assert_dataset(self, dataset, expected):
        """Check that ``dataset`` yields exactly the ``expected`` samples, in order."""
        n_ite = 0
        for i, (waveform, sample_rate, label) in enumerate(dataset):
            self.assertEqual(waveform, expected[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == expected[i][1]
            assert label == expected[i][2]
            n_ite += 1
        assert n_ite == len(expected)

    def _test_training(self, dataset):
        self._assert_dataset(dataset, self.training)

    def _test_validation(self, dataset):
        self._assert_dataset(dataset, self.validation)

    def _test_testing(self, dataset):
        self._assert_dataset(dataset, self.testing)

    def test_no_subset(self):
        dataset = gtzan.GTZAN(self.root_dir)
        self._assert_dataset(dataset, self.samples)

    def test_training_str(self):
        train_dataset = gtzan.GTZAN(self.root_dir, subset='training')
        self._test_training(train_dataset)

    def test_validation_str(self):
        val_dataset = gtzan.GTZAN(self.root_dir, subset='validation')
        self._test_validation(val_dataset)

    def test_testing_str(self):
        test_dataset = gtzan.GTZAN(self.root_dir, subset='testing')
        self._test_testing(test_dataset)

    def test_training_path(self):
        root_dir = Path(self.root_dir)
        train_dataset = gtzan.GTZAN(root_dir, subset='training')
        self._test_training(train_dataset)

    def test_validation_path(self):
        root_dir = Path(self.root_dir)
        val_dataset = gtzan.GTZAN(root_dir, subset='validation')
        self._test_validation(val_dataset)

    def test_testing_path(self):
        root_dir = Path(self.root_dir)
        test_dataset = gtzan.GTZAN(root_dir, subset='testing')
        self._test_testing(test_dataset)
test/torchaudio_unittest/datasets/librispeech_test.py
0 → 100644
View file @
9dcc7a15
import
os
from
pathlib
import
Path
from
torchaudio_unittest.common_utils
import
(
TempDirMixin
,
TorchaudioTestCase
,
get_whitenoise
,
save_wav
,
normalize_wav
,
)
from
torchaudio.datasets
import
librispeech
# Used to generate a unique transcript for each dummy audio file:
# _NUMBERS[d] spells out digit d, so ids map to distinct word sequences.
_NUMBERS = ['ZERO', 'ONE', 'TWO', 'THREE', 'FOUR', 'FIVE', 'SIX', 'SEVEN', 'EIGHT', 'NINE']
def get_mock_dataset(root_dir):
    """
    root_dir: directory to the mocked dataset

    Lays out 5 speakers x 3 chapters x 10 utterances of white noise under the
    LibriSpeech directory structure, writes the matching ``*.trans.txt``
    transcript file per chapter, and returns the expected samples as tuples of
    (waveform, sample_rate, transcript, speaker_id, chapter_id, utterance_id).
    """
    mocked_data = []
    dataset_dir = os.path.join(root_dir, librispeech.FOLDER_IN_ARCHIVE, librispeech.URL)
    os.makedirs(dataset_dir, exist_ok=True)
    sample_rate = 16000  # 16kHz
    seed = 0
    for speaker_id in range(5):
        speaker_path = os.path.join(dataset_dir, str(speaker_id))
        os.makedirs(speaker_path, exist_ok=True)
        for chapter_id in range(3):
            chapter_path = os.path.join(speaker_path, str(chapter_id))
            os.makedirs(chapter_path, exist_ok=True)
            trans_content = []
            for utterance_id in range(10):
                filename = f'{speaker_id}-{chapter_id}-{utterance_id:04d}.wav'
                path = os.path.join(chapter_path, filename)
                # Unique transcript per file: the three ids spelled out as words.
                transcript = ' '.join(
                    [_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]]
                )
                trans_content.append(f'{speaker_id}-{chapter_id}-{utterance_id:04d} {transcript}')
                data = get_whitenoise(
                    sample_rate=sample_rate, duration=0.01, n_channels=1, dtype='float32', seed=seed)
                save_wav(path, data, sample_rate)
                sample = (normalize_wav(data), sample_rate, transcript, speaker_id, chapter_id, utterance_id)
                mocked_data.append(sample)
                # New seed per file so every clip is distinct.
                seed += 1
            trans_filename = f'{speaker_id}-{chapter_id}.trans.txt'
            trans_path = os.path.join(chapter_path, trans_filename)
            with open(trans_path, 'w') as f:
                f.write('\n'.join(trans_content))
    return mocked_data
class TestLibriSpeech(TempDirMixin, TorchaudioTestCase):
    """Tests LIBRISPEECH against a mocked dataset that stores wav (not flac) audio.

    The tests temporarily monkey-patch ``LIBRISPEECH._ext_audio`` to ".wav".

    Fix: the patch is now reverted in a ``finally`` block inside each test.
    Previously the reset lived at the end of ``_test_librispeech`` and was
    skipped whenever an assertion failed, leaving the reset to
    ``tearDownClass`` only; the helper also no longer has that hidden side
    effect.
    """
    backend = 'default'
    root_dir = None
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.samples = get_mock_dataset(cls.root_dir)

    @classmethod
    def tearDownClass(cls):
        # In case of test failure
        librispeech.LIBRISPEECH._ext_audio = '.flac'

    def _test_librispeech(self, dataset):
        """Compare every sample yielded by ``dataset`` with the mocked data."""
        num_samples = 0
        for i, (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id) in enumerate(dataset):
            self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.samples[i][1]
            assert transcript == self.samples[i][2]
            assert speaker_id == self.samples[i][3]
            assert chapter_id == self.samples[i][4]
            assert utterance_id == self.samples[i][5]
            num_samples += 1
        assert num_samples == len(self.samples)

    def test_librispeech_str(self):
        librispeech.LIBRISPEECH._ext_audio = '.wav'
        try:
            dataset = librispeech.LIBRISPEECH(self.root_dir)
            self._test_librispeech(dataset)
        finally:
            librispeech.LIBRISPEECH._ext_audio = '.flac'

    def test_librispeech_path(self):
        librispeech.LIBRISPEECH._ext_audio = '.wav'
        try:
            dataset = librispeech.LIBRISPEECH(Path(self.root_dir))
            self._test_librispeech(dataset)
        finally:
            librispeech.LIBRISPEECH._ext_audio = '.flac'
test/torchaudio_unittest/datasets/libritts_test.py
0 → 100644
View file @
9dcc7a15
import
os
from
pathlib
import
Path
from
torchaudio_unittest.common_utils
import
(
TempDirMixin
,
TorchaudioTestCase
,
get_whitenoise
,
save_wav
,
normalize_wav
,
)
from
torchaudio.datasets.libritts
import
LIBRITTS
# Id tuples used to build each mock file name (joined with "_", e.g.
# "19_198_000000_000000"); the first two entries double as the
# speaker-id/chapter-id directory names.
_UTTERANCE_IDS = [
    [19, 198, '000000', '000000'],
    [26, 495, '000004', '000000'],
]
# Contents written to every per-utterance *.original.txt file.
_ORIGINAL_TEXT = 'this is the original text.'
# Contents written to every per-utterance *.normalized.txt file.
_NORMALIZED_TEXT = 'this is the normalized text.'
def get_mock_dataset(root_dir):
    """
    root_dir: directory to the mocked dataset

    Writes one wav clip plus matching ``.original.txt`` / ``.normalized.txt``
    transcript files for each entry of ``_UTTERANCE_IDS`` under the
    ``LibriTTS/train-clean-100`` layout, and returns the expected waveforms
    together with the id/text constants for the test class to compare against.
    """
    mocked_data = []
    base_dir = os.path.join(root_dir, 'LibriTTS', 'train-clean-100')
    for i, utterance_id in enumerate(_UTTERANCE_IDS):
        filename = f'{"_".join(str(u) for u in utterance_id)}.wav'
        # Files live under <speaker_id>/<chapter_id>/.
        file_dir = os.path.join(base_dir, str(utterance_id[0]), str(utterance_id[1]))
        os.makedirs(file_dir, exist_ok=True)
        path = os.path.join(file_dir, filename)
        # Distinct clip per utterance via the seed.
        data = get_whitenoise(sample_rate=24000, duration=2, n_channels=1, dtype='int16', seed=i)
        save_wav(path, data, 24000)
        mocked_data.append(normalize_wav(data))
        original_text_filename = f'{"_".join(str(u) for u in utterance_id)}.original.txt'
        path_original = os.path.join(file_dir, original_text_filename)
        with open(path_original, 'w') as file_:
            file_.write(_ORIGINAL_TEXT)
        normalized_text_filename = f'{"_".join(str(u) for u in utterance_id)}.normalized.txt'
        path_normalized = os.path.join(file_dir, normalized_text_filename)
        with open(path_normalized, 'w') as file_:
            file_.write(_NORMALIZED_TEXT)
    return mocked_data, _UTTERANCE_IDS, _ORIGINAL_TEXT, _NORMALIZED_TEXT
class TestLibriTTS(TempDirMixin, TorchaudioTestCase):
    """Tests LIBRITTS against a mocked train-clean-100 layout."""
    backend = 'default'
    root_dir = None
    data = []
    _utterance_ids, _original_text, _normalized_text = [], [], []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        (cls.data,
         cls._utterance_ids,
         cls._original_text,
         cls._normalized_text) = get_mock_dataset(cls.root_dir)

    def _test_libritts(self, dataset):
        """Compare every yielded sample against the mocked data."""
        n_ites = 0
        for i, sample in enumerate(dataset):
            (waveform, sample_rate, original_text, normalized_text,
             speaker_id, chapter_id, utterance_id) = sample
            expected_ids = self._utterance_ids[i]
            self.assertEqual(self.data[i], waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == 24000
            assert speaker_id == expected_ids[0]
            assert chapter_id == expected_ids[1]
            assert original_text == self._original_text
            assert normalized_text == self._normalized_text
            # The utterance id is the full "_"-joined id tuple.
            assert utterance_id == "_".join(str(u) for u in expected_ids[-4:])
            n_ites += 1
        assert n_ites == len(self._utterance_ids)

    def test_libritts_str(self):
        self._test_libritts(LIBRITTS(self.root_dir))

    def test_libritts_path(self):
        self._test_libritts(LIBRITTS(Path(self.root_dir)))
test/torchaudio_unittest/datasets/ljspeech_test.py
0 → 100644
View file @
9dcc7a15
import
csv
import
os
from
pathlib
import
Path
from
torchaudio_unittest.common_utils
import
(
TempDirMixin
,
TorchaudioTestCase
,
get_whitenoise
,
normalize_wav
,
save_wav
,
)
from
torchaudio.datasets
import
ljspeech
# Raw transcripts written to the mocked metadata.csv (second column).
_TRANSCRIPTS = [
    "Test transcript 1",
    "Test transcript 2",
    "Test transcript 3",
    "In 1465 Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,",
]
# Number-expanded transcripts (third column), parallel to _TRANSCRIPTS.
_NORMALIZED_TRANSCRIPT = [
    "Test transcript one",
    "Test transcript two",
    "Test transcript three",
    "In fourteen sixty-five Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,",
]
def get_mock_dataset(root_dir):
    """
    root_dir: path to the mocked dataset

    Writes the LJSpeech-1.1 layout: one white-noise wav per transcript under
    ``wavs/`` plus a pipe-delimited ``metadata.csv``, and returns the expected
    waveforms together with the transcript constants.
    """
    mocked_data = []
    base_dir = os.path.join(root_dir, "LJSpeech-1.1")
    archive_dir = os.path.join(base_dir, "wavs")
    os.makedirs(archive_dir, exist_ok=True)
    metadata_path = os.path.join(base_dir, "metadata.csv")
    sample_rate = 22050

    # QUOTE_NONE matches the real metadata.csv, which is unquoted.
    with open(metadata_path, mode="w", newline='') as metadata_file:
        metadata_writer = csv.writer(metadata_file, delimiter="|", quoting=csv.QUOTE_NONE)
        for i, (transcript, normalized_transcript) in enumerate(
            zip(_TRANSCRIPTS, _NORMALIZED_TRANSCRIPT)
        ):
            fileid = f'LJ001-{i:04d}'
            metadata_writer.writerow([fileid, transcript, normalized_transcript])
            filename = fileid + ".wav"
            path = os.path.join(archive_dir, filename)
            data = get_whitenoise(
                sample_rate=sample_rate, duration=1, n_channels=1, dtype="int16", seed=i)
            save_wav(path, data, sample_rate)
            mocked_data.append(normalize_wav(data))
    return mocked_data, _TRANSCRIPTS, _NORMALIZED_TRANSCRIPT
class TestLJSpeech(TempDirMixin, TorchaudioTestCase):
    """Tests LJSPEECH against a mocked LJSpeech-1.1 layout.

    Fix: the sample-rate check used to read ``assert sample_rate ==
    sample_rate`` — a tautology comparing the dataset's value with itself that
    could never fail.  It now compares against 22050, the rate used by
    ``get_mock_dataset``.
    """
    backend = "default"
    root_dir = None
    data, _transcripts, _normalized_transcript = [], [], []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.data, cls._transcripts, cls._normalized_transcript = get_mock_dataset(cls.root_dir)

    def _test_ljspeech(self, dataset):
        """Compare every yielded sample against the mocked data."""
        n_ite = 0
        for i, (waveform, sample_rate, transcript, normalized_transcript) in enumerate(dataset):
            expected_transcript = self._transcripts[i]
            expected_normalized_transcript = self._normalized_transcript[i]
            expected_data = self.data[i]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            # 22050 is the rate used when generating the mock clips.
            assert sample_rate == 22050
            assert transcript == expected_transcript
            assert normalized_transcript == expected_normalized_transcript
            n_ite += 1
        assert n_ite == len(self.data)

    def test_ljspeech_str(self):
        dataset = ljspeech.LJSPEECH(self.root_dir)
        self._test_ljspeech(dataset)

    def test_ljspeech_path(self):
        dataset = ljspeech.LJSPEECH(Path(self.root_dir))
        self._test_ljspeech(dataset)
test/torchaudio_unittest/datasets/speechcommands_test.py
0 → 100644
View file @
9dcc7a15
import
os
from
pathlib
import
Path
from
torchaudio_unittest.common_utils
import
(
TempDirMixin
,
TorchaudioTestCase
,
get_whitenoise
,
normalize_wav
,
save_wav
,
)
from
torchaudio.datasets
import
speechcommands
# Keyword labels of the SpeechCommands dataset; one subdirectory of clips is
# created per label by get_mock_dataset.
_LABELS = [
    "bed",
    "bird",
    "cat",
    "dog",
    "down",
    "eight",
    "five",
    "follow",
    "forward",
    "four",
    "go",
    "happy",
    "house",
    "learn",
    "left",
    "marvin",
    "nine",
    "no",
    "off",
    "on",
    "one",
    "right",
    "seven",
    "sheila",
    "six",
    "stop",
    "three",
    "tree",
    "two",
    "up",
    "visual",
    "wow",
    "yes",
    "zero",
]
def get_mock_dataset(dataset_dir):
    """
    dataset_dir: directory to the mocked dataset

    For every label, generates 6 mock speakers with 3 utterances each and
    splits them by speaker index: speakers 0-1 -> training, 2-3 -> validation
    (listed in validation_list.txt), 4-5 -> testing (listed in
    testing_list.txt).  Returns (all, train, valid, test) sample lists.
    """
    mocked_samples = []
    mocked_train_samples = []
    mocked_valid_samples = []
    mocked_test_samples = []
    os.makedirs(dataset_dir, exist_ok=True)
    sample_rate = 16000  # 16kHz sample rate
    seed = 0

    valid_file = os.path.join(dataset_dir, "validation_list.txt")
    test_file = os.path.join(dataset_dir, "testing_list.txt")
    with open(valid_file, "w") as valid, open(test_file, "w") as test:
        for label in _LABELS:
            path = os.path.join(dataset_dir, label)
            os.makedirs(path, exist_ok=True)
            for j in range(6):
                # generate hash ID for speaker
                speaker = "{:08x}".format(j)

                for utterance in range(3):
                    filename = f"{speaker}{speechcommands.HASH_DIVIDER}{utterance}.wav"
                    file_path = os.path.join(path, filename)
                    seed += 1
                    data = get_whitenoise(
                        sample_rate=sample_rate,
                        duration=0.01,
                        n_channels=1,
                        dtype="int16",
                        seed=seed,
                    )
                    save_wav(file_path, data, sample_rate)
                    sample = (
                        normalize_wav(data),
                        sample_rate,
                        label,
                        speaker,
                        utterance,
                    )
                    mocked_samples.append(sample)
                    # Split by speaker index; validation/testing membership is
                    # what the real dataset reads from the two list files.
                    if j < 2:
                        mocked_train_samples.append(sample)
                    elif j < 4:
                        valid.write(f'{label}/{filename}\n')
                        mocked_valid_samples.append(sample)
                    elif j < 6:
                        test.write(f'{label}/{filename}\n')
                        mocked_test_samples.append(sample)
    return mocked_samples, mocked_train_samples, mocked_valid_samples, mocked_test_samples
class TestSpeechCommands(TempDirMixin, TorchaudioTestCase):
    """Tests SPEECHCOMMANDS against a mocked dataset and its official subsets."""
    backend = "default"
    root_dir = None
    samples = []
    train_samples = []
    valid_samples = []
    test_samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        dataset_dir = os.path.join(
            cls.root_dir, speechcommands.FOLDER_IN_ARCHIVE, speechcommands.URL)
        mocked = get_mock_dataset(dataset_dir)
        cls.samples, cls.train_samples, cls.valid_samples, cls.test_samples = mocked

    def _testSpeechCommands(self, dataset, data_samples):
        """Check that ``dataset`` yields exactly ``data_samples``, in order."""
        num_samples = 0
        for i, (data, sample_rate, label, speaker_id, utterance_number) in enumerate(dataset):
            expected = data_samples[i]
            self.assertEqual(data, expected[0], atol=5e-5, rtol=1e-8)
            assert sample_rate == expected[1]
            assert label == expected[2]
            assert speaker_id == expected[3]
            assert utterance_number == expected[4]
            num_samples += 1
        assert num_samples == len(data_samples)

    def testSpeechCommands_str(self):
        self._testSpeechCommands(speechcommands.SPEECHCOMMANDS(self.root_dir), self.samples)

    def testSpeechCommands_path(self):
        self._testSpeechCommands(speechcommands.SPEECHCOMMANDS(Path(self.root_dir)), self.samples)

    def testSpeechCommandsSubsetTrain(self):
        dataset = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="training")
        self._testSpeechCommands(dataset, self.train_samples)

    def testSpeechCommandsSubsetValid(self):
        dataset = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="validation")
        self._testSpeechCommands(dataset, self.valid_samples)

    def testSpeechCommandsSubsetTest(self):
        dataset = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="testing")
        self._testSpeechCommands(dataset, self.test_samples)

    def testSpeechCommandsSum(self):
        # The sizes of the three official subsets must add up to the whole dataset.
        dataset_all = speechcommands.SPEECHCOMMANDS(self.root_dir)
        dataset_train = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="training")
        dataset_valid = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="validation")
        dataset_test = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="testing")
        assert len(dataset_train) + len(dataset_valid) + len(dataset_test) == len(dataset_all)
test/torchaudio_unittest/datasets/tedlium_test.py
0 → 100644
View file @
9dcc7a15
import
os
import
platform
from
pathlib
import
Path
from
torchaudio_unittest.common_utils
import
(
TempDirMixin
,
TorchaudioTestCase
,
get_whitenoise
,
save_wav
,
skipIfNoSox
)
from
torchaudio.datasets
import
tedlium
# Used to generate a unique utterance for each dummy audio file.  Each STM
# line is: talk_id channel speaker_id start_time end_time <identifier> text.
_UTTERANCES = [
    "AaronHuey_2010X 1 AaronHuey_2010X 0.0 2.0 <o,f0,female> script1\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 2.0 4.0 <o,f0,female> script2\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 4.0 6.0 <o,f0,female> script3\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 6.0 8.0 <o,f0,female> script4\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 8.0 10.0 <o,f0,female> script5\n",
]
# Entries written to the mocked pronunciation dictionary ("word phones...").
_PHONEME = [
    "a AH",
    "a(2) EY",
    "aachen AA K AH N",
    "aad AE D",
    "aaden EY D AH N",
    "aadmi AE D M IY",
    "aae EY EY",
]
def get_mock_dataset(dataset_dir):
    """
    dataset_dir: directory of the mocked dataset

    For each TEDLIUM release this writes one 10-second white-noise recording
    (``sph/``), its STM transcript file (``stm/``) and a pronunciation
    dictionary, and returns ``{release: [sample, ...]}`` where each sample is
    (waveform segment, sample_rate, transcript, talk_id, speaker_id, identifier).

    Cleanup: the two near-identical ``release_dir`` branches are merged (only
    the config key differed) and the redundant nested
    ``os.path.join(os.path.join(...))`` calls are flattened — ``os.path.join``
    is variadic.  Paths produced are identical to before.
    """
    mocked_samples = {}
    os.makedirs(dataset_dir, exist_ok=True)
    sample_rate = 16000  # 16kHz
    seed = 0
    for release in ["release1", "release2", "release3"]:
        data = get_whitenoise(
            sample_rate=sample_rate, duration=10.00, n_channels=1, dtype="float32", seed=seed)
        # release1/2 nest their data under the "subset" folder, release3 under "data_path".
        config = tedlium._RELEASE_CONFIGS[release]
        subdir_key = "subset" if release in ["release1", "release2"] else "data_path"
        release_dir = os.path.join(dataset_dir, config["folder_in_archive"], config[subdir_key])
        os.makedirs(release_dir, exist_ok=True)
        os.makedirs(os.path.join(release_dir, "stm"), exist_ok=True)  # Subfolder for transcripts
        os.makedirs(os.path.join(release_dir, "sph"), exist_ok=True)  # Subfolder for audio files

        # Audio file for the talk.
        filename = f"{release}.sph"
        path = os.path.join(release_dir, "sph", filename)
        save_wav(path, data, sample_rate)

        # STM transcript file (one line per utterance; lines carry their own "\n").
        trans_filename = f"{release}.stm"
        trans_path = os.path.join(release_dir, "stm", trans_filename)
        with open(trans_path, "w") as f:
            f.write("".join(_UTTERANCES))

        # Pronunciation dictionary.
        dict_filename = f"{release}.dic"
        dict_path = os.path.join(release_dir, dict_filename)
        with open(dict_path, "w") as f:
            f.write("\n".join(_PHONEME))

        # Create a samples list to compare with
        mocked_samples[release] = []
        for utterance in _UTTERANCES:
            talk_id, _, speaker_id, start_time, end_time, identifier, transcript = utterance.split(" ", 6)
            # Convert second offsets into frame offsets to slice the waveform.
            start_time = int(float(start_time)) * sample_rate
            end_time = int(float(end_time)) * sample_rate
            sample = (
                data[:, start_time:end_time],
                sample_rate,
                transcript,
                talk_id,
                speaker_id,
                identifier,
            )
            mocked_samples[release].append(sample)
        seed += 1
    return mocked_samples
class Tedlium(TempDirMixin):
    """Base fixture for TEDLIUM tests; subclasses select the audio backend."""
    root_dir = None
    samples = {}

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.root_dir = dataset_dir = os.path.join(cls.root_dir, "tedlium")
        cls.samples = get_mock_dataset(dataset_dir)

    def _test_tedlium(self, dataset, release):
        """Check one release's samples, then verify the parsed phoneme dict."""
        expected = self.samples[release]
        num_samples = 0
        for i, (data, sample_rate, transcript, talk_id, speaker_id, identifier) in enumerate(dataset):
            self.assertEqual(data, expected[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == expected[i][1]
            assert transcript == expected[i][2]
            assert talk_id == expected[i][3]
            assert speaker_id == expected[i][4]
            assert identifier == expected[i][5]
            num_samples += 1
        assert num_samples == len(expected)
        # Point the dataset at the mocked dictionary file and check it parses
        # back into exactly the entries that were written.
        dataset._dict_path = os.path.join(dataset._path, f"{release}.dic")
        phoneme_dict = dataset.phoneme_dict
        rendered_phonemes = [f"{key} {' '.join(value)}" for key, value in phoneme_dict.items()]
        assert rendered_phonemes == _PHONEME

    def test_tedlium_release1_str(self):
        release = "release1"
        self._test_tedlium(tedlium.TEDLIUM(self.root_dir, release=release), release)

    def test_tedlium_release1_path(self):
        release = "release1"
        self._test_tedlium(tedlium.TEDLIUM(Path(self.root_dir), release=release), release)

    def test_tedlium_release2(self):
        release = "release2"
        self._test_tedlium(tedlium.TEDLIUM(self.root_dir, release=release), release)

    def test_tedlium_release3(self):
        release = "release3"
        self._test_tedlium(tedlium.TEDLIUM(self.root_dir, release=release), release)
class TestTedliumSoundfile(Tedlium, TorchaudioTestCase):
    # Run all Tedlium checks with the "soundfile" audio backend.
    backend = "soundfile"
# The sox_io backend is not usable on Windows, so this test class is only
# defined on other platforms (and still skipped if sox is unavailable).
if platform.system() != "Windows":
    @skipIfNoSox
    class TestTedliumSoxIO(Tedlium, TorchaudioTestCase):
        # Run all Tedlium checks with the "sox_io" audio backend.
        backend = "sox_io"
test/torchaudio_unittest/datasets/utils_test.py
0 → 100644
View file @
9dcc7a15
import
torch
from
torchaudio_unittest.common_utils
import
(
TorchaudioTestCase
,
TempDirMixin
)
from
torchaudio.datasets
import
utils
as
dataset_utils
class Dataset(torch.utils.data.Dataset):
    """Tiny map-style dataset used to exercise the iterator helpers.

    Item ``n`` is a 2-channel, 256-sample waveform filled with the value ``n``,
    paired with a fixed 8000 Hz sample rate. The dataset reports two items and
    is also iterable.
    """

    def __getitem__(self, n):
        # Constant-valued waveform makes items cheap to build and easy to check.
        return n * torch.ones(2, 256), 8000

    def __len__(self) -> int:
        return 2

    def __iter__(self):
        return (self[index] for index in range(len(self)))
class TestIterator(TorchaudioTestCase, TempDirMixin):
    """Smoke tests for the iterator helpers in torchaudio.datasets.utils."""

    backend = 'default'

    def test_disckcache_iterator(self):
        # NOTE(review): the method name carries a typo ("disckcache"); kept
        # as-is since renaming would change test discovery.
        cached = dataset_utils.diskcache_iterator(Dataset(), self.get_base_temp_dir())
        # First access saves the item to the on-disk cache ...
        cached[0]
        # ... second access loads it back.
        cached[0]

    def test_bg_iterator(self):
        # Drain the background-loading iterator to ensure it terminates cleanly.
        background = dataset_utils.bg_iterator(Dataset(), 5)
        for _ in background:
            pass
test/torchaudio_unittest/datasets/vctk_test.py
0 → 100644
View file @
9dcc7a15
import
os
from
pathlib
import
Path
from
torchaudio.datasets
import
vctk
from
torchaudio_unittest.common_utils
import
(
TempDirMixin
,
TorchaudioTestCase
,
get_whitenoise
,
save_wav
,
normalize_wav
,
)
# Used to generate a unique transcript for each dummy audio file.
# Utterance n of every speaker is paired with _TRANSCRIPT[n - 1].
_TRANSCRIPT = [
    'Please call Stella',
    'Ask her to bring these things',
    'with her from the store',
    'Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob',
    'We also need a small plastic snake and a big toy frog for the kids',
    'She can scoop these things into three red bags, and we will go meet her Wednesday at the train station',
    'When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow',
    'The rainbow is a division of white light into many beautiful colors',
    'These take the shape of a long round arch, with its path high above, and its two ends apparently beyond the horizon',
    'There is, according to legend, a boiling pot of gold at one end'
]
def get_mock_dataset(root_dir):
    """
    root_dir: root directory of the mocked data

    Builds a miniature VCTK-Corpus-0.92 layout (speakers p225-p229, ten
    utterances each) and returns the expected samples in iteration order as
    (waveform, sample_rate, transcript, speaker_id, utterance_id) tuples.
    """
    expected = []
    corpus_dir = os.path.join(root_dir, 'VCTK-Corpus-0.92')
    os.makedirs(corpus_dir, exist_ok=True)
    sample_rate = 48000
    seed = 0
    for speaker in range(225, 230):
        speaker_id = 'p' + str(speaker)
        wav_dir = os.path.join(corpus_dir, 'wav48_silence_trimmed', speaker_id)
        txt_dir = os.path.join(corpus_dir, 'txt', speaker_id)
        for directory in (wav_dir, txt_dir):
            os.makedirs(directory, exist_ok=True)
        for utterance_id in range(1, 11):
            filename = f'{speaker_id}_{utterance_id:03d}_mic2'
            wav_path = os.path.join(wav_dir, filename + '.wav')
            # Distinct seed per file so every waveform is unique.
            waveform = get_whitenoise(
                sample_rate=sample_rate, duration=0.01, n_channels=1,
                dtype='float32', seed=seed)
            save_wav(wav_path, waveform, sample_rate)
            # Transcript file name drops the trailing '_mic2' suffix.
            txt_path = os.path.join(txt_dir, filename[:-5] + '.txt')
            transcript = _TRANSCRIPT[utterance_id - 1]
            with open(txt_path, 'w') as f:
                f.write(transcript)
            expected.append(
                (normalize_wav(waveform), sample_rate, transcript, speaker_id, utterance_id))
            seed += 1
    return expected
class TestVCTK(TempDirMixin, TorchaudioTestCase):
    """Tests for the VCTK_092 dataset against a mocked corpus on disk."""

    backend = 'default'
    root_dir = None
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.samples = get_mock_dataset(cls.root_dir)

    def _test_vctk(self, dataset):
        """Iterate `dataset` and compare each sample with the mocked data."""
        count = 0
        for idx, (waveform, sample_rate, transcript, speaker_id, utterance_id) in enumerate(dataset):
            expected = self.samples[idx]
            # Loose tolerances absorb any save/load round-trip error.
            self.assertEqual(waveform, expected[0], atol=5e-5, rtol=1e-8)
            assert sample_rate == expected[1]
            assert transcript == expected[2]
            assert speaker_id == expected[3]
            assert int(utterance_id) == expected[4]
            count += 1
        assert count == len(self.samples)

    def test_vctk_str(self):
        # Root directory given as str.
        self._test_vctk(vctk.VCTK_092(self.root_dir, audio_ext=".wav"))

    def test_vctk_path(self):
        # Root directory given as pathlib.Path.
        self._test_vctk(vctk.VCTK_092(Path(self.root_dir), audio_ext=".wav"))
test/torchaudio_unittest/datasets/yesno_test.py
0 → 100644
View file @
9dcc7a15
import
os
from
pathlib
import
Path
from
torchaudio.datasets
import
yesno
from
torchaudio_unittest.common_utils
import
(
TempDirMixin
,
TorchaudioTestCase
,
get_whitenoise
,
save_wav
,
normalize_wav
,
)
def get_mock_data(root_dir, labels):
    """
    root_dir: path
    labels: list of labels

    Writes one mocked 8 kHz white-noise wav per label into
    ``root_dir/waves_yesno`` (file name encodes the label digits joined by
    underscores) and returns the normalized waveforms in label order.
    """
    mocked_data = []
    wave_dir = os.path.join(root_dir, 'waves_yesno')
    os.makedirs(wave_dir, exist_ok=True)
    for seed, label in enumerate(labels):
        filename = f'{"_".join(str(digit) for digit in label)}.wav'
        wav_path = os.path.join(wave_dir, filename)
        # Per-file seed keeps the waveforms distinct but deterministic.
        waveform = get_whitenoise(
            sample_rate=8000, duration=6, n_channels=1, dtype='int16', seed=seed)
        save_wav(wav_path, waveform, 8000)
        mocked_data.append(normalize_wav(waveform))
    return mocked_data
class TestYesNo(TempDirMixin, TorchaudioTestCase):
    """Tests for the YESNO dataset against mocked on-disk data."""

    backend = 'default'
    root_dir = None
    data = []
    # Each label is the 8-digit yes(1)/no(0) sequence encoded in the file name.
    labels = [
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 1],
        [0, 1, 0, 1, 0, 1, 1, 0],
        [1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1],
    ]

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.data = get_mock_data(cls.root_dir, cls.labels)

    def _test_yesno(self, dataset):
        """Iterate `dataset` and compare each item with the mocked data."""
        count = 0
        for idx, (waveform, sample_rate, label) in enumerate(dataset):
            # Loose tolerances absorb the int16 save/load round trip.
            self.assertEqual(self.data[idx], waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == 8000
            assert label == self.labels[idx]
            count += 1
        assert count == len(self.data)

    def test_yesno_str(self):
        # Root directory given as str.
        self._test_yesno(yesno.YESNO(self.root_dir))

    def test_yesno_path(self):
        # Root directory given as pathlib.Path.
        self._test_yesno(yesno.YESNO(Path(self.root_dir)))
test/torchaudio_unittest/example/__init__.py
0 → 100644
View file @
9dcc7a15
import os
import sys

# Make the repository's top-level `examples` directory importable so the
# example tests in this package can import the example code under test.
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..', 'examples'))
test/torchaudio_unittest/example/souce_sepration/__init__.py
0 → 100644
View file @
9dcc7a15
Prev
1
…
9
10
11
12
13
14
15
16
17
…
21
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment