hehl2 / Torchaudio · Commits

Commit 9dcc7a15, authored Apr 25, 2022 by flyingdown

    init v0.10.0

Parent: db2b0b79
Changes: 416 files. Showing 20 changed files with 2308 additions and 0 deletions (+2308 / -0).
Too many changes to show; to preserve performance only 416 of 416+ files are displayed.

    test/torchaudio_unittest/common_utils/rnnt_utils.py              +603  -0
    test/torchaudio_unittest/common_utils/sox_utils.py               +106  -0
    test/torchaudio_unittest/common_utils/wav_utils.py                +92  -0
    test/torchaudio_unittest/compliance_kaldi_test.py                 +76  -0
    test/torchaudio_unittest/datasets/__init__.py                      +0  -0
    test/torchaudio_unittest/datasets/cmuarctic_test.py               +84  -0
    test/torchaudio_unittest/datasets/cmudict_test.py                +218  -0
    test/torchaudio_unittest/datasets/commonvoice_test.py            +148  -0
    test/torchaudio_unittest/datasets/datasets_test.py                +15  -0
    test/torchaudio_unittest/datasets/gtzan_test.py                  +127  -0
    test/torchaudio_unittest/datasets/librispeech_test.py            +128  -0
    test/torchaudio_unittest/datasets/libritts_test.py                +89  -0
    test/torchaudio_unittest/datasets/ljspeech_test.py                +92  -0
    test/torchaudio_unittest/datasets/speechcommands_test.py         +161  -0
    test/torchaudio_unittest/datasets/tedlium_test.py                +150  -0
    test/torchaudio_unittest/datasets/utils_test.py                   +37  -0
    test/torchaudio_unittest/datasets/vctk_test.py                   +107  -0
    test/torchaudio_unittest/datasets/yesno_test.py                   +67  -0
    test/torchaudio_unittest/example/__init__.py                       +8  -0
    test/torchaudio_unittest/example/souce_sepration/__init__.py       +0  -0
test/torchaudio_unittest/common_utils/rnnt_utils.py  0 → 100644
import unittest
import random

import torch
import numpy as np
from torchaudio.functional import rnnt_loss

CPU_DEVICE = torch.device("cpu")


class _NumpyTransducer(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        log_probs,
        logit_lengths,
        target_lengths,
        targets,
        blank=-1,
    ):
        device = log_probs.device
        log_probs = log_probs.cpu().data.numpy()
        logit_lengths = logit_lengths.cpu().data.numpy()
        target_lengths = target_lengths.cpu().data.numpy()
        targets = targets.cpu().data.numpy()

        gradients, costs, _, _ = __class__.compute(
            log_probs=log_probs,
            logit_lengths=logit_lengths,
            target_lengths=target_lengths,
            targets=targets,
            blank=blank,
        )

        costs = torch.FloatTensor(costs).to(device=device)
        gradients = torch.FloatTensor(gradients).to(device=device)
        ctx.grads = torch.autograd.Variable(gradients)

        return costs

    @staticmethod
    def backward(ctx, grad_output):
        grad_output = grad_output.view(-1, 1, 1, 1).to(ctx.grads)
        return ctx.grads.mul(grad_output), None, None, None, None, None, None, None, None

    @staticmethod
    def compute_alpha_one_sequence(log_probs, targets, blank=-1):
        max_T, max_U, D = log_probs.shape
        alpha = np.zeros((max_T, max_U), dtype=np.float32)
        for t in range(1, max_T):
            alpha[t, 0] = alpha[t - 1, 0] + log_probs[t - 1, 0, blank]

        for u in range(1, max_U):
            alpha[0, u] = alpha[0, u - 1] + log_probs[0, u - 1, targets[u - 1]]

        for t in range(1, max_T):
            for u in range(1, max_U):
                skip = alpha[t - 1, u] + log_probs[t - 1, u, blank]
                emit = alpha[t, u - 1] + log_probs[t, u - 1, targets[u - 1]]
                alpha[t, u] = np.logaddexp(skip, emit)

        cost = -(alpha[-1, -1] + log_probs[-1, -1, blank])
        return alpha, cost

    @staticmethod
    def compute_beta_one_sequence(log_probs, targets, blank=-1):
        max_T, max_U, D = log_probs.shape
        beta = np.zeros((max_T, max_U), dtype=np.float32)
        beta[-1, -1] = log_probs[-1, -1, blank]

        for t in reversed(range(max_T - 1)):
            beta[t, -1] = beta[t + 1, -1] + log_probs[t, -1, blank]

        for u in reversed(range(max_U - 1)):
            beta[-1, u] = beta[-1, u + 1] + log_probs[-1, u, targets[u]]

        for t in reversed(range(max_T - 1)):
            for u in reversed(range(max_U - 1)):
                skip = beta[t + 1, u] + log_probs[t, u, blank]
                emit = beta[t, u + 1] + log_probs[t, u, targets[u]]
                beta[t, u] = np.logaddexp(skip, emit)

        cost = -beta[0, 0]
        return beta, cost

    @staticmethod
    def compute_gradients_one_sequence(log_probs, alpha, beta, targets, blank=-1):
        max_T, max_U, D = log_probs.shape
        gradients = np.full(log_probs.shape, float("-inf"))
        cost = -beta[0, 0]

        gradients[-1, -1, blank] = alpha[-1, -1]
        gradients[:-1, :, blank] = alpha[:-1, :] + beta[1:, :]

        for u, l in enumerate(targets):
            gradients[:, u, l] = alpha[:, u] + beta[:, u + 1]

        gradients = -(np.exp(gradients + log_probs + cost))
        return gradients

    @staticmethod
    def compute(
        log_probs,
        logit_lengths,
        target_lengths,
        targets,
        blank=-1,
    ):
        gradients = np.zeros_like(log_probs)
        B_tgt, max_T, max_U, D = log_probs.shape
        B_src = logit_lengths.shape[0]

        H = int(B_tgt / B_src)

        alphas = np.zeros((B_tgt, max_T, max_U))
        betas = np.zeros((B_tgt, max_T, max_U))
        betas.fill(float("-inf"))
        alphas.fill(float("-inf"))
        costs = np.zeros(B_tgt)
        for b_tgt in range(B_tgt):
            b_src = int(b_tgt / H)
            T = int(logit_lengths[b_src])
            # NOTE: see https://arxiv.org/pdf/1211.3711.pdf Section 2.1
            U = int(target_lengths[b_tgt]) + 1

            seq_log_probs = log_probs[b_tgt, :T, :U, :]
            seq_targets = targets[b_tgt, : int(target_lengths[b_tgt])]
            alpha, alpha_cost = __class__.compute_alpha_one_sequence(
                log_probs=seq_log_probs, targets=seq_targets, blank=blank
            )

            beta, beta_cost = __class__.compute_beta_one_sequence(
                log_probs=seq_log_probs, targets=seq_targets, blank=blank
            )

            seq_gradients = __class__.compute_gradients_one_sequence(
                log_probs=seq_log_probs,
                alpha=alpha,
                beta=beta,
                targets=seq_targets,
                blank=blank,
            )
            np.testing.assert_almost_equal(alpha_cost, beta_cost, decimal=2)
            gradients[b_tgt, :T, :U, :] = seq_gradients
            costs[b_tgt] = beta_cost
            alphas[b_tgt, :T, :U] = alpha
            betas[b_tgt, :T, :U] = beta

        return gradients, costs, alphas, betas


class NumpyTransducerLoss(torch.nn.Module):
    def __init__(self, blank=-1):
        super().__init__()
        self.blank = blank

    def forward(
        self,
        logits,
        logit_lengths,
        target_lengths,
        targets,
    ):
        log_probs = torch.nn.functional.log_softmax(logits, dim=-1)
        return _NumpyTransducer.apply(
            log_probs,
            logit_lengths,
            target_lengths,
            targets,
            self.blank,
        )


def compute_with_numpy_transducer(data):
    costs = NumpyTransducerLoss(
        blank=data["blank"],
    )(
        logits=data["logits"],
        logit_lengths=data["logit_lengths"],
        target_lengths=data["target_lengths"],
        targets=data["targets"],
    )

    loss = torch.sum(costs)
    loss.backward()
    costs = costs.cpu()
    gradients = data["logits"].saved_grad.cpu()
    return costs, gradients


def compute_with_pytorch_transducer(data):
    costs = rnnt_loss(
        logits=data["logits"],
        logit_lengths=data["logit_lengths"],
        target_lengths=data["target_lengths"],
        targets=data["targets"],
        blank=data["blank"],
        reduction="none",
    )

    loss = torch.sum(costs)
    loss.backward()
    costs = costs.cpu()
    gradients = data["logits"].saved_grad.cpu()
    return costs, gradients


def get_basic_data(device):
    # Example provided
    # in 6f73a2513dc784c59eec153a45f40bc528355b18
    # of https://github.com/HawkAaron/warp-transducer
    logits = torch.tensor(
        [
            [
                [
                    [0.1, 0.6, 0.1, 0.1, 0.1],
                    [0.1, 0.1, 0.6, 0.1, 0.1],
                    [0.1, 0.1, 0.2, 0.8, 0.1],
                ],
                [
                    [0.1, 0.6, 0.1, 0.1, 0.1],
                    [0.1, 0.1, 0.2, 0.1, 0.1],
                    [0.7, 0.1, 0.2, 0.1, 0.1],
                ],
            ]
        ],
        dtype=torch.float32,
        device=device,
    )
    targets = torch.tensor([[1, 2]], dtype=torch.int, device=device)
    logit_lengths = torch.tensor([2], dtype=torch.int, device=device)
    target_lengths = torch.tensor([2], dtype=torch.int, device=device)

    logits.requires_grad_(True)

    return logits, targets, logit_lengths, target_lengths


def get_B1_T10_U3_D4_data(
    random=False,
    dtype=torch.float32,
    device=CPU_DEVICE,
):
    B, T, U, D = 2, 10, 3, 4

    logits = torch.rand(B, T, U, D, dtype=dtype, device=device)
    if not random:
        logits.fill_(0.1)
    logits.requires_grad_(True)

    def grad_hook(grad):
        logits.saved_grad = grad.clone()

    logits.register_hook(grad_hook)

    data = {}
    data["logits"] = logits
    data["logit_lengths"] = torch.tensor([10, 10], dtype=torch.int32, device=device)
    data["target_lengths"] = torch.tensor([2, 2], dtype=torch.int32, device=device)
    data["targets"] = torch.tensor([[1, 2], [1, 2]], dtype=torch.int32, device=device)
    data["blank"] = 0

    return data


def get_B1_T2_U3_D5_data(dtype=torch.float32, device=CPU_DEVICE):
    logits = torch.tensor(
        [
            0.1, 0.6, 0.1, 0.1, 0.1,
            0.1, 0.1, 0.6, 0.1, 0.1,
            0.1, 0.1, 0.2, 0.8, 0.1,
            0.1, 0.6, 0.1, 0.1, 0.1,
            0.1, 0.1, 0.2, 0.1, 0.1,
            0.7, 0.1, 0.2, 0.1, 0.1,
        ],
        dtype=dtype,
        device=device,
    ).reshape(1, 2, 3, 5)
    logits.requires_grad_(True)

    def grad_hook(grad):
        logits.saved_grad = grad.clone()

    logits.register_hook(grad_hook)

    targets = torch.tensor([[1, 2]], dtype=torch.int32, device=device)
    logit_lengths = torch.tensor([2], dtype=torch.int32, device=device)
    target_lengths = torch.tensor([2], dtype=torch.int32, device=device)

    blank = -1

    ref_costs = torch.tensor([5.09566688538], dtype=dtype)
    ref_gradients = torch.tensor(
        [
            0.17703132, -0.39992708, 0.17703132, 0.17703132, -0.13116692,
            0.12247062, 0.12247062, -0.181684, 0.12247062, -0.1857276,
            0.06269141, 0.06269141, 0.06928471, 0.12624498, -0.32091248,
            0.05456069, -0.2182428, 0.05456069, 0.05456069, 0.05456069,
            0.12073967, 0.12073967, -0.48295838, 0.12073967, 0.12073967,
            0.30741188, 0.16871123, 0.18645471, 0.16871123, -0.83128875,
        ],
        dtype=dtype,
    ).reshape(1, 2, 3, 5)

    data = {
        "logits": logits,
        "targets": targets,
        "logit_lengths": logit_lengths,
        "target_lengths": target_lengths,
        "blank": blank,
    }

    return data, ref_costs, ref_gradients


def get_B2_T4_U3_D3_data(dtype=torch.float32, device=CPU_DEVICE):
    # Test from D21322854
    logits = torch.tensor(
        [
            0.065357, 0.787530, 0.081592, 0.529716, 0.750675, 0.754135,
            0.609764, 0.868140, 0.622532, 0.668522, 0.858039, 0.164539,
            0.989780, 0.944298, 0.603168, 0.946783, 0.666203, 0.286882,
            0.094184, 0.366674, 0.736168, 0.166680, 0.714154, 0.399400,
            0.535982, 0.291821, 0.612642, 0.324241, 0.800764, 0.524106,
            0.779195, 0.183314, 0.113745, 0.240222, 0.339470, 0.134160,
            0.505562, 0.051597, 0.640290, 0.430733, 0.829473, 0.177467,
            0.320700, 0.042883, 0.302803, 0.675178, 0.569537, 0.558474,
            0.083132, 0.060165, 0.107958, 0.748615, 0.943918, 0.486356,
            0.418199, 0.652408, 0.024243, 0.134582, 0.366342, 0.295830,
            0.923670, 0.689929, 0.741898, 0.250005, 0.603430, 0.987289,
            0.592606, 0.884672, 0.543450, 0.660770, 0.377128, 0.358021,
        ],
        dtype=dtype,
        device=device,
    ).reshape(2, 4, 3, 3)
    logits.requires_grad_(True)

    def grad_hook(grad):
        logits.saved_grad = grad.clone()

    logits.register_hook(grad_hook)

    targets = torch.tensor([[1, 2], [1, 1]], dtype=torch.int32, device=device)
    logit_lengths = torch.tensor([4, 4], dtype=torch.int32, device=device)
    target_lengths = torch.tensor([2, 2], dtype=torch.int32, device=device)

    blank = 0

    ref_costs = torch.tensor([4.2806528590890736, 3.9384369822503591], dtype=dtype)
    ref_gradients = torch.tensor(
        [
            -0.186844, -0.062555, 0.249399, -0.203377, 0.202399, 0.000977,
            -0.141016, 0.079123, 0.061893, -0.011552, -0.081280, 0.092832,
            -0.154257, 0.229433, -0.075176, -0.246593, 0.146405, 0.100188,
            -0.012918, -0.061593, 0.074512, -0.055986, 0.219831, -0.163845,
            -0.497627, 0.209240, 0.288387, 0.013605, -0.030220, 0.016615,
            0.113925, 0.062781, -0.176706, -0.667078, 0.367659, 0.299419,
            -0.356344, -0.055347, 0.411691, -0.096922, 0.029459, 0.067463,
            -0.063518, 0.027654, 0.035863, -0.154499, -0.073942, 0.228441,
            -0.166790, -0.000088, 0.166878, -0.172370, 0.105565, 0.066804,
            0.023875, -0.118256, 0.094381, -0.104707, -0.108934, 0.213642,
            -0.369844, 0.180118, 0.189726, 0.025714, -0.079462, 0.053748,
            0.122328, -0.238789, 0.116460, -0.598687, 0.302203, 0.296484,
        ],
        dtype=dtype,
    ).reshape(2, 4, 3, 3)

    data = {
        "logits": logits,
        "targets": targets,
        "logit_lengths": logit_lengths,
        "target_lengths": target_lengths,
        "blank": blank,
    }

    return data, ref_costs, ref_gradients


def get_random_data(
    max_B=8,
    max_T=128,
    max_U=32,
    max_D=40,
    blank=-1,
    dtype=torch.float32,
    device=CPU_DEVICE,
    seed=None,
):
    if seed is not None:
        torch.manual_seed(seed=seed)

    if blank != -1:
        raise ValueError("blank != -1 is not supported yet.")

    random.seed(0)
    B = random.randint(1, max_B - 1)
    T = random.randint(5, max_T - 1)
    U = random.randint(5, max_U - 1)
    D = random.randint(2, max_D - 1)

    logit_lengths = torch.randint(low=5, high=T + 1, size=(B,), dtype=torch.int32, device=device)
    target_lengths = torch.randint(low=5, high=U + 1, size=(B,), dtype=torch.int32, device=device)
    max_src_length = torch.max(logit_lengths)
    max_tgt_length = torch.max(target_lengths)

    targets = torch.randint(
        low=0, high=D - 1, size=(B, max_tgt_length), dtype=torch.int32, device=device
    )
    logits = torch.rand(
        size=(B, max_src_length, max_tgt_length + 1, D),
        dtype=dtype,
        device=device,
    ).requires_grad_(True)

    def grad_hook(grad):
        logits.saved_grad = grad.clone()

    logits.register_hook(grad_hook)

    return {
        "logits": logits,
        "targets": targets,
        "logit_lengths": logit_lengths,
        "target_lengths": target_lengths,
        "blank": blank,
    }


def skipIfNoRNNT(test_item):
    try:
        torch.ops.torchaudio.rnnt_loss
        return test_item
    except RuntimeError:
        return unittest.skip("torchaudio C++ extension is not compiled with RNN transducer loss")
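Editor's note (not part of the diff): these helpers are designed to be paired in the RNN-T loss tests. A data dict from one of the get_*_data factories is fed to both the NumPy reference and the torchaudio implementation, and the results are checked against the bundled reference values. A minimal sketch of that pattern, assuming the torchaudio C++ rnnt_loss extension is available for the PyTorch path; tolerances here are illustrative, not the ones the actual tests use.

# Sketch: compare the NumPy reference and torchaudio rnnt_loss on the
# reference case from D21322854, using only the helpers defined above.
data, ref_costs, ref_gradients = get_B2_T4_U3_D3_data()
np_costs, np_gradients = compute_with_numpy_transducer(data)

# Gradients accumulate on `logits`, so build a fresh data dict before
# running the PyTorch implementation on the same case.
data, _, _ = get_B2_T4_U3_D3_data()
pt_costs, pt_gradients = compute_with_pytorch_transducer(data)

torch.testing.assert_close(np_costs, ref_costs, atol=1e-4, rtol=1e-4)
torch.testing.assert_close(pt_costs, ref_costs, atol=1e-4, rtol=1e-4)
torch.testing.assert_close(pt_gradients, ref_gradients, atol=1e-4, rtol=1e-4)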
test/torchaudio_unittest/common_utils/sox_utils.py  0 → 100644
import sys
import subprocess
import warnings


def get_encoding(dtype):
    encodings = {
        'float32': 'floating-point',
        'int32': 'signed-integer',
        'int16': 'signed-integer',
        'uint8': 'unsigned-integer',
    }
    return encodings[dtype]


def get_bit_depth(dtype):
    bit_depths = {
        'float32': 32,
        'int32': 32,
        'int16': 16,
        'uint8': 8,
    }
    return bit_depths[dtype]


def gen_audio_file(
        path, sample_rate, num_channels, *,
        encoding=None, bit_depth=None, compression=None, attenuation=None, duration=1,
        comment_file=None,
):
    """Generate synthetic audio file with `sox` command."""
    if path.endswith('.wav'):
        warnings.warn('Use get_wav_data and save_wav to generate wav file for accurate result.')
    command = [
        'sox',
        '-V3',  # verbose
        '--no-dither',  # disable automatic dithering
        '-R',
        # -R is supposed to be repeatable, though the implementation looks suspicious
        # and not setting the seed to a fixed value.
        # https://fossies.org/dox/sox-14.4.2/sox_8c_source.html
        # search "sox_globals.repeatable"
    ]
    if bit_depth is not None:
        command += ['--bits', str(bit_depth)]
    command += [
        '--rate', str(sample_rate),
        '--null',  # no input
        '--channels', str(num_channels),
    ]
    if compression is not None:
        command += ['--compression', str(compression)]
    if bit_depth is not None:
        command += ['--bits', str(bit_depth)]
    if encoding is not None:
        command += ['--encoding', str(encoding)]
    if comment_file is not None:
        command += ['--comment-file', str(comment_file)]
    command += [
        str(path),
        'synth', str(duration),  # synthesizes for the given duration [sec]
        'sawtooth', '1',
        # saw tooth covers the both ends of value range, which is a good property for test.
        # similar to linspace(-1., 1.)
        # this introduces bigger boundary effect than sine when converted to mp3
    ]
    if attenuation is not None:
        command += ['vol', f'-{attenuation}dB']
    print(' '.join(command), file=sys.stderr)
    subprocess.run(command, check=True)


def convert_audio_file(
        src_path, dst_path, *,
        encoding=None, bit_depth=None, compression=None):
    """Convert audio file with `sox` command."""
    command = ['sox', '-V3', '--no-dither', '-R', str(src_path)]
    if encoding is not None:
        command += ['--encoding', str(encoding)]
    if bit_depth is not None:
        command += ['--bits', str(bit_depth)]
    if compression is not None:
        command += ['--compression', str(compression)]
    command += [dst_path]
    print(' '.join(command), file=sys.stderr)
    subprocess.run(command, check=True)


def _flattern(effects):
    if not effects:
        return effects
    if isinstance(effects[0], str):
        return effects
    return [item for sublist in effects for item in sublist]


def run_sox_effect(input_file, output_file, effect, *, output_sample_rate=None, output_bitdepth=None):
    """Run sox effects"""
    effect = _flattern(effect)
    command = ['sox', '-V', '--no-dither', input_file]
    if output_bitdepth:
        command += ['--bits', str(output_bitdepth)]
    command += [output_file] + effect
    if output_sample_rate:
        command += ['rate', str(output_sample_rate)]
    print(' '.join(command))
    subprocess.run(command, check=True)
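Editor's note (illustrative, not part of the diff): a typical way the helpers above would be combined when building non-wav fixtures. This assumes the `sox` binary is on PATH and compiled with FLAC support; the /tmp paths are placeholders.

# Generate a 2-channel, 8 kHz FLAC fixture, then convert it to a
# 32-bit-float WAV using the dtype lookup helpers.
gen_audio_file('/tmp/fixture.flac', 8000, 2, bit_depth=16, duration=1)
convert_audio_file(
    '/tmp/fixture.flac',
    '/tmp/fixture.wav',
    encoding=get_encoding('float32'),
    bit_depth=get_bit_depth('float32'),
)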
test/torchaudio_unittest/common_utils/wav_utils.py  0 → 100644
from typing import Optional

import torch
import scipy.io.wavfile


def normalize_wav(tensor: torch.Tensor) -> torch.Tensor:
    if tensor.dtype == torch.float32:
        pass
    elif tensor.dtype == torch.int32:
        tensor = tensor.to(torch.float32)
        tensor[tensor > 0] /= 2147483647.
        tensor[tensor < 0] /= 2147483648.
    elif tensor.dtype == torch.int16:
        tensor = tensor.to(torch.float32)
        tensor[tensor > 0] /= 32767.
        tensor[tensor < 0] /= 32768.
    elif tensor.dtype == torch.uint8:
        tensor = tensor.to(torch.float32) - 128
        tensor[tensor > 0] /= 127.
        tensor[tensor < 0] /= 128.
    return tensor


def get_wav_data(
        dtype: str,
        num_channels: int,
        *,
        num_frames: Optional[int] = None,
        normalize: bool = True,
        channels_first: bool = True,
):
    """Generate linear signal of the given dtype and num_channels

    Data range is
        [-1.0, 1.0] for float32,
        [-2147483648, 2147483647] for int32
        [-32768, 32767] for int16
        [0, 255] for uint8

    num_frames allow to change the linear interpolation parameter.
    Default values are 256 for uint8, else 1 << 16.
    1 << 16 as default is so that int16 value range is completely covered.
    """
    dtype_ = getattr(torch, dtype)

    if num_frames is None:
        if dtype == 'uint8':
            num_frames = 256
        else:
            num_frames = 1 << 16

    if dtype == 'uint8':
        base = torch.linspace(0, 255, num_frames, dtype=dtype_)
    elif dtype == 'int8':
        base = torch.linspace(-128, 127, num_frames, dtype=dtype_)
    elif dtype == 'float32':
        base = torch.linspace(-1., 1., num_frames, dtype=dtype_)
    elif dtype == 'float64':
        base = torch.linspace(-1., 1., num_frames, dtype=dtype_)
    elif dtype == 'int32':
        base = torch.linspace(-2147483648, 2147483647, num_frames, dtype=dtype_)
    elif dtype == 'int16':
        base = torch.linspace(-32768, 32767, num_frames, dtype=dtype_)
    else:
        raise NotImplementedError(f'Unsupported dtype {dtype}')
    data = base.repeat([num_channels, 1])
    if not channels_first:
        data = data.transpose(1, 0)
    if normalize:
        data = normalize_wav(data)
    return data


def load_wav(path: str, normalize=True, channels_first=True) -> torch.Tensor:
    """Load wav file without torchaudio"""
    sample_rate, data = scipy.io.wavfile.read(path)
    data = torch.from_numpy(data.copy())
    if data.ndim == 1:
        data = data.unsqueeze(1)
    if normalize:
        data = normalize_wav(data)
    if channels_first:
        data = data.transpose(1, 0)
    return data, sample_rate


def save_wav(path, data, sample_rate, channels_first=True):
    """Save wav file without torchaudio"""
    if channels_first:
        data = data.transpose(1, 0)
    scipy.io.wavfile.write(path, sample_rate, data.numpy())
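Editor's note (illustrative, not part of the diff): the point of these helpers is a torchaudio-free round trip, so a saved fixture should load back to exactly the values the tests expect. A quick sanity sketch; the /tmp path is a placeholder.

# Save an un-normalized int16 stereo ramp with scipy, load it back, and
# check it matches the normalized reference produced by normalize_wav.
data = get_wav_data('int16', num_channels=2, num_frames=100, normalize=False)
save_wav('/tmp/ramp.wav', data, 8000)
loaded, sample_rate = load_wav('/tmp/ramp.wav')
assert sample_rate == 8000
assert torch.equal(normalize_wav(data), loaded)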
test/torchaudio_unittest/compliance_kaldi_test.py  0 → 100644
import torch
import torchaudio.compliance.kaldi as kaldi

from torchaudio_unittest import common_utils


def extract_window(window, wave, f, frame_length, frame_shift, snip_edges):
    # just a copy of ExtractWindow from feature-window.cc in python
    def first_sample_of_frame(frame, window_size, window_shift, snip_edges):
        if snip_edges:
            return frame * window_shift
        else:
            midpoint_of_frame = frame * window_shift + window_shift // 2
            beginning_of_frame = midpoint_of_frame - window_size // 2
            return beginning_of_frame

    sample_offset = 0
    num_samples = sample_offset + wave.size(0)
    start_sample = first_sample_of_frame(f, frame_length, frame_shift, snip_edges)
    end_sample = start_sample + frame_length

    if snip_edges:
        assert start_sample >= sample_offset and end_sample <= num_samples
    else:
        assert sample_offset == 0 or start_sample >= sample_offset

    wave_start = start_sample - sample_offset
    wave_end = wave_start + frame_length
    if wave_start >= 0 and wave_end <= wave.size(0):
        window[f, :] = wave[wave_start:(wave_start + frame_length)]
    else:
        wave_dim = wave.size(0)
        for s in range(frame_length):
            s_in_wave = s + wave_start
            while s_in_wave < 0 or s_in_wave >= wave_dim:
                if s_in_wave < 0:
                    s_in_wave = -s_in_wave - 1
                else:
                    s_in_wave = 2 * wave_dim - 1 - s_in_wave
            window[f, s] = wave[s_in_wave]


class Test_Kaldi(common_utils.TempDirMixin, common_utils.TorchaudioTestCase):

    def _test_get_strided_helper(self, num_samples, window_size, window_shift, snip_edges):
        waveform = torch.arange(num_samples).float()
        output = kaldi._get_strided(waveform, window_size, window_shift, snip_edges)

        # from NumFrames in feature-window.cc
        n = window_size
        if snip_edges:
            m = 0 if num_samples < window_size else 1 + (num_samples - window_size) // window_shift
        else:
            m = (num_samples + (window_shift // 2)) // window_shift

        self.assertTrue(output.dim() == 2)
        self.assertTrue(output.shape[0] == m and output.shape[1] == n)

        window = torch.empty((m, window_size))
        for r in range(m):
            extract_window(window, waveform, r, window_size, window_shift, snip_edges)
        self.assertEqual(window, output)

    def test_get_strided(self):
        # generate any combination where 0 < window_size <= num_samples and
        # 0 < window_shift.
        for num_samples in range(1, 20):
            for window_size in range(1, num_samples + 1):
                for window_shift in range(1, 2 * num_samples + 1):
                    for snip_edges in range(0, 2):
                        self._test_get_strided_helper(num_samples, window_size, window_shift, snip_edges)

    def test_mfcc_empty(self):
        # Passing in an empty tensor should result in an error
        self.assertRaises(AssertionError, kaldi.mfcc, torch.empty(0))
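Editor's note (illustrative, not part of the diff): for `snip_edges=True` the framing that `kaldi._get_strided` is tested against can be reproduced with public ops. A rough sketch of the same windowing via `as_strided`, under the assumption of a contiguous 1-D input; it mirrors the private helper only in the snip_edges case.

import torch

def frame_snip_edges(waveform, window_size, window_shift):
    # Frame a 1-D waveform into overlapping rows of length window_size,
    # keeping only fully contained frames (snip_edges=True behavior,
    # same frame count formula as NumFrames above).
    num_samples = waveform.size(0)
    m = 0 if num_samples < window_size else 1 + (num_samples - window_size) // window_shift
    stride = waveform.stride(0)
    return waveform.as_strided((m, window_size), (window_shift * stride, stride))

waveform = torch.arange(10).float()
print(frame_snip_edges(waveform, window_size=4, window_shift=2))
# rows: [0..3], [2..5], [4..7], [6..9]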
test/torchaudio_unittest/datasets/__init__.py  0 → 100644  (empty file)
test/torchaudio_unittest/datasets/cmuarctic_test.py  0 → 100644
import os
from pathlib import Path

from torchaudio.datasets import cmuarctic

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)


def get_mock_dataset(root_dir):
    """
    root_dir: directory to the mocked dataset
    """
    mocked_data = []
    sample_rate = 16000
    transcript = "This is a test transcript."

    base_dir = os.path.join(root_dir, "ARCTIC", "cmu_us_aew_arctic")
    txt_dir = os.path.join(base_dir, "etc")
    os.makedirs(txt_dir, exist_ok=True)
    txt_file = os.path.join(txt_dir, "txt.done.data")
    audio_dir = os.path.join(base_dir, "wav")
    os.makedirs(audio_dir, exist_ok=True)

    seed = 42
    with open(txt_file, "w") as txt:
        for c in ["a", "b"]:
            for i in range(5):
                utterance_id = f"arctic_{c}{i:04d}"
                path = os.path.join(audio_dir, f"{utterance_id}.wav")
                data = get_whitenoise(
                    sample_rate=sample_rate,
                    duration=3,
                    n_channels=1,
                    dtype="int16",
                    seed=seed,
                )
                save_wav(path, data, sample_rate)
                sample = (
                    normalize_wav(data),
                    sample_rate,
                    transcript,
                    utterance_id.split("_")[1],
                )
                mocked_data.append(sample)
                txt.write(f'( {utterance_id} "{transcript}" )\n')
                seed += 1
    return mocked_data


class TestCMUARCTIC(TempDirMixin, TorchaudioTestCase):
    backend = "default"
    root_dir = None
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.samples = get_mock_dataset(cls.root_dir)

    def _test_cmuarctic(self, dataset):
        n_ite = 0
        for i, (waveform, sample_rate, transcript, utterance_id) in enumerate(dataset):
            expected_sample = self.samples[i]
            assert sample_rate == expected_sample[1]
            assert transcript == expected_sample[2]
            assert utterance_id == expected_sample[3]
            self.assertEqual(expected_sample[0], waveform, atol=5e-5, rtol=1e-8)
            n_ite += 1
        assert n_ite == len(self.samples)

    def test_cmuarctic_str(self):
        dataset = cmuarctic.CMUARCTIC(self.root_dir)
        self._test_cmuarctic(dataset)

    def test_cmuarctic_path(self):
        dataset = cmuarctic.CMUARCTIC(Path(self.root_dir))
        self._test_cmuarctic(dataset)
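Editor's note (illustrative, not part of the diff): every dataset test in this commit follows the same fixture pattern — write a fake corpus into a temp directory with get_mock_dataset, then point the real dataset class at it. The unittest scaffolding stripped away, the pattern looks roughly like this; it assumes a working torchaudio I/O backend, which TorchaudioTestCase normally configures.

import tempfile

# Build the fake ARCTIC layout, then iterate the real dataset over it and
# compare the metadata fields it yields against the mocked expectations.
with tempfile.TemporaryDirectory() as root_dir:
    expected = get_mock_dataset(root_dir)
    dataset = cmuarctic.CMUARCTIC(root_dir)
    for (waveform, sample_rate, transcript, utterance_id), exp in zip(dataset, expected):
        assert (sample_rate, transcript, utterance_id) == exp[1:]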
test/torchaudio_unittest/datasets/cmudict_test.py  0 → 100644
import os
from pathlib import Path

from torchaudio.datasets import CMUDict

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
)


def get_mock_dataset(root_dir, return_punc=False):
    """
    root_dir: directory to the mocked dataset
    """
    header = [
        ";;; # CMUdict -- Major Version: 0.07",
        ";;; ",
        ";;; # $HeadURL$",
    ]
    puncs = [
        "!EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T",
        "\"CLOSE-QUOTE K L OW1 Z K W OW1 T",
        "#HASH-MARK HH AE1 M AA2 R K",
        "%PERCENT P ER0 S EH1 N T",
        "&ERSAND AE1 M P ER0 S AE2 N D",
        "'END-INNER-QUOTE EH1 N D IH1 N ER0 K W OW1 T",
        "(BEGIN-PARENS B IH0 G IH1 N P ER0 EH1 N Z",
        ")CLOSE-PAREN K L OW1 Z P ER0 EH1 N",
        "+PLUS P L UH1 S",
        ",COMMA K AA1 M AH0",
        "--DASH D AE1 SH",
        "!EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH AH0 N P OY2 N T",
        "/SLASH S L AE1 SH",
        ":COLON K OW1 L AH0 N",
        ";SEMI-COLON S EH1 M IY0 K OW1 L AH0 N",
        "?QUESTION-MARK K W EH1 S CH AH0 N M AA1 R K",
        "{BRACE B R EY1 S",
        "}CLOSE-BRACE K L OW1 Z B R EY1 S",
        "...ELLIPSIS IH2 L IH1 P S IH0 S",
    ]
    punc_outputs = [
        "!", "\"", "#", "%", "&", "'", "(", ")", "+", ",",
        "--", "!", "/", ":", ";", "?", "{", "}", "...",
    ]
    words = [
        "3-D TH R IY1 D IY2",
        "'BOUT B AW1 T",
        "'CAUSE K AH0 Z",
        "'TWAS T W AH1 Z",
        "A AH0",
        "B B IY1",
        "C S IY1",
        "D D IY1",
        "E IY1",
        "F EH1 F",
        "G JH IY1",
        "H EY1 CH",
        "I AY1",
        "J JH EY1",
        "K K EY1",
        "L EH1 L",
        "M EH1 M",
        "N EH1 N",
        "O OW1",
        "P P IY1",
        "Q K Y UW1",
        "R AA1 R",
        "S EH1 S",
        "T T IY1",
        "U Y UW1",
        "V V IY1",
        "X EH1 K S",
        "Y W AY1",
        "Z Z IY1",
    ]

    mocked_symbols = [
        "AA1", "AA2", "AE1", "AE2", "AH0", "AH1", "AY1", "B", "CH", "D",
        "EH1", "EH2", "ER0", "EY1", "F", "G", "HH", "IH0", "IH1", "IY0",
        "IY1", "IY2", "JH", "K", "L", "M", "N", "OW1", "OY2", "P", "R",
        "S", "SH", "T", "TH", "UH1", "UW0", "UW1", "V", "W", "Y", "Z",
    ]

    dict_file = os.path.join(root_dir, "cmudict-0.7b")
    symbol_file = os.path.join(root_dir, "cmudict-0.7b.symbols")

    with open(dict_file, "w") as fileobj:
        for section in [header, puncs, words]:
            for line in section:
                fileobj.write(line)
                fileobj.write("\n")
    with open(symbol_file, "w") as txt:
        txt.write("\n".join(mocked_symbols))

    mocked_data = []
    if return_punc:
        for i, ent in enumerate(puncs):
            # split only on the first space: the remainder is the phone sequence
            _, phones = ent.split(" ", 1)
            mocked_data.append((punc_outputs[i], phones.split(" ")))
    for ent in words:
        word, phones = ent.split(" ", 1)
        mocked_data.append((word, phones.split(" ")))
    return mocked_data


class TestCMUDict(TempDirMixin, TorchaudioTestCase):
    root_dir = None
    root_punc_dir = None
    samples = []
    punc_samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = os.path.join(cls.get_base_temp_dir(), "normal")
        os.mkdir(cls.root_dir)
        cls.samples = get_mock_dataset(cls.root_dir)
        cls.root_punc_dir = os.path.join(cls.get_base_temp_dir(), "punc")
        os.mkdir(cls.root_punc_dir)
        cls.punc_samples = get_mock_dataset(cls.root_punc_dir, return_punc=True)

    def _test_cmudict(self, dataset):
        """Test if the dataset is reading the mocked data correctly."""
        n_item = 0
        for i, (word, phones) in enumerate(dataset):
            expected_word, expected_phones = self.samples[i]
            assert word == expected_word
            assert phones == expected_phones
            n_item += 1
        assert n_item == len(self.samples)

    def _test_punc_cmudict(self, dataset):
        """Test if the dataset is reading the mocked data with punctuations correctly."""
        n_item = 0
        for i, (word, phones) in enumerate(dataset):
            expected_word, expected_phones = self.punc_samples[i]
            assert word == expected_word
            assert phones == expected_phones
            n_item += 1
        assert n_item == len(self.punc_samples)

    def test_cmuarctic_path_with_punctuation(self):
        dataset = CMUDict(Path(self.root_punc_dir), exclude_punctuations=False)
        self._test_punc_cmudict(dataset)

    def test_cmuarctic_str_with_punctuation(self):
        dataset = CMUDict(self.root_punc_dir, exclude_punctuations=False)
        self._test_punc_cmudict(dataset)

    def test_cmuarctic_path(self):
        dataset = CMUDict(Path(self.root_punc_dir), exclude_punctuations=True)
        self._test_cmudict(dataset)

    def test_cmuarctic_str(self):
        dataset = CMUDict(self.root_punc_dir, exclude_punctuations=True)
        self._test_cmudict(dataset)
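Editor's note (illustrative, not part of the diff): unlike the audio datasets, CMUDict yields (word, phone-list) pairs, which is what the asserts above compare. A quick sketch of what an entry looks like, assuming `root_dir` already contains the files written by get_mock_dataset above.

dataset = CMUDict(root_dir, exclude_punctuations=True)
word, phones = dataset[0]
# With the mocked data above, the first non-punctuation entry is
# "3-D" with phones ["TH", "R", "IY1", "D", "IY2"].
print(word, phones)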
test/torchaudio_unittest/datasets/commonvoice_test.py  0 → 100644
import csv
import os
from pathlib import Path
from typing import Tuple, Dict

from torch import Tensor

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)
from torchaudio.datasets import COMMONVOICE

_ORIGINAL_EXT_AUDIO = COMMONVOICE._ext_audio
_SAMPLE_RATE = 48000
_HEADERS = [u"client_ids", u"path", u"sentence", u"up_votes", u"down_votes", u"age", u"gender", u"accent"]
_EN_TRAIN_CSV_CONTENTS = [
    ["9d16c5d980247861130e0480e2719f448be73d86a496c36d01a477cbdecd8cfd1399403d7a77bf458d211a70711b2da0845c",
     "common_voice_en_18885784.wav",
     "He was accorded a State funeral, and was buried in Drayton and Toowoomba Cemetery.",
     "2", "0", "", "", ""],
    ["c82eb9291328620f06025a1f8112b909099e447e485e99236cb87df008650250e79fea5ca772061fb6a370830847b9c44d20",
     "common_voice_en_556542.wav",
     "Once more into the breach",
     "2", "0", "thirties", "male", "us"],
    ["f74d880c5ad4c5917f314a604d3fc4805159d255796fb9f8defca35333ecc002bdf53dc463503c12674ea840b21b4a507b7c",
     "common_voice_en_18607573.wav",
     "Caddy, show Miss Clare and Miss Summerson their rooms.",
     "2", "0", "twenties", "male", "canada"],
]
_FR_TRAIN_CSV_CONTENTS = [
    ["a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef"
     "18343441c601cae0597a4b0d3144",
     "89e67e7682b36786a0b4b4022c4d42090c86edd96c78c12d30088e62522b8fe466ea4912e6a1055dfb91b296a0743e0a2bbe"
     "16cebac98ee5349e3e8262cb9329",
     "Or sur ce point nous n’avons aucune réponse de votre part.",
     "2", "0", "twenties", "male", "france"],
    ["a2e8e1e1cc74d08c92a53d7b9ff84e077eb90410edd85b8882f16fd037cecfcb6a19413c6c63ce6458cfea9579878fa91cef18"
     "343441c601cae0597a4b0d3144",
     "87d71819a26179e93acfee149d0b21b7bf5e926e367d80b2b3792d45f46e04853a514945783ff764c1fc237b4eb0ee2b0a7a7"
     "cbd395acbdfcfa9d76a6e199bbd",
     "Monsieur de La Verpillière, laissez parler le ministre",
     "2", "0", "twenties", "male", "france"],
]


def get_mock_dataset(root_dir, train_csv_contents, ext_audio) -> Tuple[Tensor, int, Dict[str, str]]:
    """
    prepares mocked dataset
    """
    mocked_data = []
    # Note: extension is changed to wav for the sake of test
    # Note: the first content is missing values for `age`, `gender` and `accent` as in the original data.
    # Tsv file name difference does not mean different subset, testing as a whole dataset here
    tsv_filename = os.path.join(root_dir, "train.tsv")
    audio_base_path = os.path.join(root_dir, "clips")
    os.makedirs(audio_base_path, exist_ok=True)
    with open(tsv_filename, "w", newline='') as tsv:
        writer = csv.writer(tsv, delimiter='\t')
        writer.writerow(_HEADERS)
        for i, content in enumerate(train_csv_contents):
            content[2] = str(content[2].encode("utf-8"))
            writer.writerow(content)
            if not content[1].endswith(ext_audio):
                audio_path = os.path.join(audio_base_path, content[1] + ext_audio)
            else:
                audio_path = os.path.join(audio_base_path, content[1])

            data = get_whitenoise(sample_rate=_SAMPLE_RATE, duration=1, n_channels=1, seed=i, dtype='float32')
            save_wav(audio_path, data, _SAMPLE_RATE)
            # Append data entry
            mocked_data.append((normalize_wav(data), _SAMPLE_RATE, dict(zip(_HEADERS, content))))
    return mocked_data


def get_mock_dataset_en(root_dir, ext_audio) -> Tuple[Tensor, int, Dict[str, str]]:
    """
    prepares english mocked dataset
    """
    return get_mock_dataset(root_dir, _EN_TRAIN_CSV_CONTENTS, ext_audio)


def get_mock_dataset_fr(root_dir, ext_audio) -> Tuple[Tensor, int, Dict[str, str]]:
    """
    prepares french mocked dataset
    """
    return get_mock_dataset(root_dir, _FR_TRAIN_CSV_CONTENTS, ext_audio)


class BaseTestCommonVoice(TempDirMixin):
    root_dir = None
    data = None

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.root_dir = cls.get_base_temp_dir()
        COMMONVOICE._ext_audio = ".wav"

    @classmethod
    def tearDownClass(cls):
        super().tearDownClass()
        COMMONVOICE._ext_audio = _ORIGINAL_EXT_AUDIO

    def _test_commonvoice(self, dataset):
        n_ite = 0
        for i, (waveform, sample_rate, dictionary) in enumerate(dataset):
            expected_dictionary = self.data[i][2]
            expected_data = self.data[i][0]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == _SAMPLE_RATE
            assert dictionary == expected_dictionary
            n_ite += 1
        assert n_ite == len(self.data)


class TestCommonVoiceEN(BaseTestCommonVoice, TorchaudioTestCase):
    backend = 'default'
    root_dir = None

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.data = get_mock_dataset_en(cls.root_dir, COMMONVOICE._ext_audio)

    def test_commonvoice_str(self):
        dataset = COMMONVOICE(self.root_dir)
        self._test_commonvoice(dataset)

    def test_commonvoice_path(self):
        dataset = COMMONVOICE(Path(self.root_dir))
        self._test_commonvoice(dataset)


class TestCommonVoiceFR(BaseTestCommonVoice, TorchaudioTestCase):
    backend = 'default'
    root_dir = None

    @classmethod
    def setUpClass(cls):
        super().setUpClass()
        cls.data = get_mock_dataset_fr(cls.root_dir, COMMONVOICE._ext_audio)

    def test_commonvoice_str(self):
        dataset = COMMONVOICE(self.root_dir)
        self._test_commonvoice(dataset)
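Editor's note (illustrative, not part of the diff): the `_ext_audio` swap in BaseTestCommonVoice is the trick that makes this test possible without mp3 fixtures. The real CommonVoice corpus ships compressed clips, but the mocked clips are written as .wav, so the class attribute is patched for the duration of the tests and restored in tearDownClass. The essential shape of that pattern, assuming `root_dir` was prepared by get_mock_dataset:

COMMONVOICE._ext_audio = ".wav"      # point the loader at the wav fixtures
try:
    dataset = COMMONVOICE(root_dir)  # yields (waveform, sample_rate, metadata dict)
finally:
    COMMONVOICE._ext_audio = _ORIGINAL_EXT_AUDIO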
test/torchaudio_unittest/datasets/datasets_test.py  0 → 100644
from torchaudio.datasets.vctk import VCTK

from torchaudio_unittest.common_utils import (
    TorchaudioTestCase,
    get_asset_path,
)


class TestDatasets(TorchaudioTestCase):
    backend = 'default'
    path = get_asset_path()

    def test_vctk(self):
        data = VCTK(self.path)
        data[0]
test/torchaudio_unittest/datasets/gtzan_test.py  0 → 100644
import os
from pathlib import Path

from torchaudio.datasets import gtzan

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)


def get_mock_dataset(root_dir):
    """
    root_dir: directory to the mocked dataset
    """
    mocked_samples = []
    mocked_training = []
    mocked_validation = []
    mocked_testing = []
    sample_rate = 22050
    seed = 0
    for genre in gtzan.gtzan_genres:
        base_dir = os.path.join(root_dir, 'genres', genre)
        os.makedirs(base_dir, exist_ok=True)
        for i in range(100):
            filename = f'{genre}.{i:05d}'
            path = os.path.join(base_dir, f'{filename}.wav')
            data = get_whitenoise(
                sample_rate=sample_rate, duration=0.01, n_channels=1, dtype='int16', seed=seed)
            save_wav(path, data, sample_rate)
            sample = (normalize_wav(data), sample_rate, genre)
            mocked_samples.append(sample)
            if filename in gtzan.filtered_test:
                mocked_testing.append(sample)
            if filename in gtzan.filtered_train:
                mocked_training.append(sample)
            if filename in gtzan.filtered_valid:
                mocked_validation.append(sample)
            seed += 1
    return (mocked_samples, mocked_training, mocked_validation, mocked_testing)


class TestGTZAN(TempDirMixin, TorchaudioTestCase):
    backend = 'default'
    root_dir = None
    samples = []
    training = []
    validation = []
    testing = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        mocked_data = get_mock_dataset(cls.root_dir)
        cls.samples = mocked_data[0]
        cls.training = mocked_data[1]
        cls.validation = mocked_data[2]
        cls.testing = mocked_data[3]

    def test_no_subset(self):
        dataset = gtzan.GTZAN(self.root_dir)
        n_ite = 0
        for i, (waveform, sample_rate, label) in enumerate(dataset):
            self.assertEqual(waveform, self.samples[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.samples[i][1]
            assert label == self.samples[i][2]
            n_ite += 1
        assert n_ite == len(self.samples)

    def _test_training(self, dataset):
        n_ite = 0
        for i, (waveform, sample_rate, label) in enumerate(dataset):
            self.assertEqual(waveform, self.training[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.training[i][1]
            assert label == self.training[i][2]
            n_ite += 1
        assert n_ite == len(self.training)

    def _test_validation(self, dataset):
        n_ite = 0
        for i, (waveform, sample_rate, label) in enumerate(dataset):
            self.assertEqual(waveform, self.validation[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.validation[i][1]
            assert label == self.validation[i][2]
            n_ite += 1
        assert n_ite == len(self.validation)

    def _test_testing(self, dataset):
        n_ite = 0
        for i, (waveform, sample_rate, label) in enumerate(dataset):
            self.assertEqual(waveform, self.testing[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.testing[i][1]
            assert label == self.testing[i][2]
            n_ite += 1
        assert n_ite == len(self.testing)

    def test_training_str(self):
        train_dataset = gtzan.GTZAN(self.root_dir, subset='training')
        self._test_training(train_dataset)

    def test_validation_str(self):
        val_dataset = gtzan.GTZAN(self.root_dir, subset='validation')
        self._test_validation(val_dataset)

    def test_testing_str(self):
        test_dataset = gtzan.GTZAN(self.root_dir, subset='testing')
        self._test_testing(test_dataset)

    def test_training_path(self):
        root_dir = Path(self.root_dir)
        train_dataset = gtzan.GTZAN(root_dir, subset='training')
        self._test_training(train_dataset)

    def test_validation_path(self):
        root_dir = Path(self.root_dir)
        val_dataset = gtzan.GTZAN(root_dir, subset='validation')
        self._test_validation(val_dataset)

    def test_testing_path(self):
        root_dir = Path(self.root_dir)
        test_dataset = gtzan.GTZAN(root_dir, subset='testing')
        self._test_testing(test_dataset)
test/torchaudio_unittest/datasets/librispeech_test.py  0 → 100644
import os
from pathlib import Path

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)
from torchaudio.datasets import librispeech

# Used to generate a unique transcript for each dummy audio file
_NUMBERS = ['ZERO', 'ONE', 'TWO', 'THREE', 'FOUR', 'FIVE', 'SIX', 'SEVEN', 'EIGHT', 'NINE']


def get_mock_dataset(root_dir):
    """
    root_dir: directory to the mocked dataset
    """
    mocked_data = []
    dataset_dir = os.path.join(root_dir, librispeech.FOLDER_IN_ARCHIVE, librispeech.URL)
    os.makedirs(dataset_dir, exist_ok=True)
    sample_rate = 16000  # 16kHz
    seed = 0

    for speaker_id in range(5):
        speaker_path = os.path.join(dataset_dir, str(speaker_id))
        os.makedirs(speaker_path, exist_ok=True)

        for chapter_id in range(3):
            chapter_path = os.path.join(speaker_path, str(chapter_id))
            os.makedirs(chapter_path, exist_ok=True)
            trans_content = []

            for utterance_id in range(10):
                filename = f'{speaker_id}-{chapter_id}-{utterance_id:04d}.wav'
                path = os.path.join(chapter_path, filename)

                transcript = ' '.join(
                    [_NUMBERS[x] for x in [speaker_id, chapter_id, utterance_id]]
                )
                trans_content.append(f'{speaker_id}-{chapter_id}-{utterance_id:04d} {transcript}')

                data = get_whitenoise(
                    sample_rate=sample_rate, duration=0.01, n_channels=1, dtype='float32', seed=seed)
                save_wav(path, data, sample_rate)
                sample = (normalize_wav(data), sample_rate, transcript, speaker_id, chapter_id, utterance_id)
                mocked_data.append(sample)

                seed += 1

            trans_filename = f'{speaker_id}-{chapter_id}.trans.txt'
            trans_path = os.path.join(chapter_path, trans_filename)
            with open(trans_path, 'w') as f:
                f.write('\n'.join(trans_content))
    return mocked_data


class TestLibriSpeech(TempDirMixin, TorchaudioTestCase):
    backend = 'default'
    root_dir = None
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.samples = get_mock_dataset(cls.root_dir)

    @classmethod
    def tearDownClass(cls):
        # In case of test failure
        librispeech.LIBRISPEECH._ext_audio = '.flac'

    def _test_librispeech(self, dataset):
        num_samples = 0
        for i, (data, sample_rate, transcript, speaker_id, chapter_id, utterance_id) in enumerate(dataset):
            self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.samples[i][1]
            assert transcript == self.samples[i][2]
            assert speaker_id == self.samples[i][3]
            assert chapter_id == self.samples[i][4]
            assert utterance_id == self.samples[i][5]
            num_samples += 1

        assert num_samples == len(self.samples)
        librispeech.LIBRISPEECH._ext_audio = '.flac'

    def test_librispeech_str(self):
        librispeech.LIBRISPEECH._ext_audio = '.wav'
        dataset = librispeech.LIBRISPEECH(self.root_dir)
        self._test_librispeech(dataset)

    def test_librispeech_path(self):
        librispeech.LIBRISPEECH._ext_audio = '.wav'
        dataset = librispeech.LIBRISPEECH(Path(self.root_dir))
        self._test_librispeech(dataset)
test/torchaudio_unittest/datasets/libritts_test.py  0 → 100644
import os
from pathlib import Path

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)
from torchaudio.datasets.libritts import LIBRITTS

_UTTERANCE_IDS = [
    [19, 198, '000000', '000000'],
    [26, 495, '000004', '000000'],
]
_ORIGINAL_TEXT = 'this is the original text.'
_NORMALIZED_TEXT = 'this is the normalized text.'


def get_mock_dataset(root_dir):
    """
    root_dir: directory to the mocked dataset
    """
    mocked_data = []
    base_dir = os.path.join(root_dir, 'LibriTTS', 'train-clean-100')
    for i, utterance_id in enumerate(_UTTERANCE_IDS):
        filename = f'{"_".join(str(u) for u in utterance_id)}.wav'
        file_dir = os.path.join(base_dir, str(utterance_id[0]), str(utterance_id[1]))
        os.makedirs(file_dir, exist_ok=True)
        path = os.path.join(file_dir, filename)

        data = get_whitenoise(sample_rate=24000, duration=2, n_channels=1, dtype='int16', seed=i)
        save_wav(path, data, 24000)
        mocked_data.append(normalize_wav(data))

        original_text_filename = f'{"_".join(str(u) for u in utterance_id)}.original.txt'
        path_original = os.path.join(file_dir, original_text_filename)
        with open(path_original, 'w') as file_:
            file_.write(_ORIGINAL_TEXT)

        normalized_text_filename = f'{"_".join(str(u) for u in utterance_id)}.normalized.txt'
        path_normalized = os.path.join(file_dir, normalized_text_filename)
        with open(path_normalized, 'w') as file_:
            file_.write(_NORMALIZED_TEXT)
    return mocked_data, _UTTERANCE_IDS, _ORIGINAL_TEXT, _NORMALIZED_TEXT


class TestLibriTTS(TempDirMixin, TorchaudioTestCase):
    backend = 'default'
    root_dir = None
    data = []
    _utterance_ids, _original_text, _normalized_text = [], [], []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.data, cls._utterance_ids, cls._original_text, cls._normalized_text = \
            get_mock_dataset(cls.root_dir)

    def _test_libritts(self, dataset):
        n_ites = 0
        for i, (waveform,
                sample_rate,
                original_text,
                normalized_text,
                speaker_id,
                chapter_id,
                utterance_id) in enumerate(dataset):
            expected_ids = self._utterance_ids[i]
            expected_data = self.data[i]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == 24000
            assert speaker_id == expected_ids[0]
            assert chapter_id == expected_ids[1]
            assert original_text == self._original_text
            assert normalized_text == self._normalized_text
            assert utterance_id == f'{"_".join(str(u) for u in expected_ids[-4:])}'
            n_ites += 1
        assert n_ites == len(self._utterance_ids)

    def test_libritts_str(self):
        dataset = LIBRITTS(self.root_dir)
        self._test_libritts(dataset)

    def test_libritts_path(self):
        dataset = LIBRITTS(Path(self.root_dir))
        self._test_libritts(dataset)
test/torchaudio_unittest/datasets/ljspeech_test.py  0 → 100644
import csv
import os
from pathlib import Path

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    normalize_wav,
    save_wav,
)
from torchaudio.datasets import ljspeech

_TRANSCRIPTS = [
    "Test transcript 1",
    "Test transcript 2",
    "Test transcript 3",
    "In 1465 Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,",
]

_NORMALIZED_TRANSCRIPT = [
    "Test transcript one",
    "Test transcript two",
    "Test transcript three",
    "In fourteen sixty-five Sweynheim and Pannartz began printing in the monastery of Subiaco near Rome,",
]


def get_mock_dataset(root_dir):
    """
    root_dir: path to the mocked dataset
    """
    mocked_data = []
    base_dir = os.path.join(root_dir, "LJSpeech-1.1")
    archive_dir = os.path.join(base_dir, "wavs")
    os.makedirs(archive_dir, exist_ok=True)
    metadata_path = os.path.join(base_dir, "metadata.csv")
    sample_rate = 22050

    with open(metadata_path, mode="w", newline='') as metadata_file:
        metadata_writer = csv.writer(metadata_file, delimiter="|", quoting=csv.QUOTE_NONE)
        for i, (transcript, normalized_transcript) in enumerate(
            zip(_TRANSCRIPTS, _NORMALIZED_TRANSCRIPT)
        ):
            fileid = f'LJ001-{i:04d}'
            metadata_writer.writerow([fileid, transcript, normalized_transcript])
            filename = fileid + ".wav"
            path = os.path.join(archive_dir, filename)
            data = get_whitenoise(
                sample_rate=sample_rate, duration=1, n_channels=1, dtype="int16", seed=i)
            save_wav(path, data, sample_rate)
            mocked_data.append(normalize_wav(data))
    return mocked_data, _TRANSCRIPTS, _NORMALIZED_TRANSCRIPT


class TestLJSpeech(TempDirMixin, TorchaudioTestCase):
    backend = "default"
    root_dir = None
    data, _transcripts, _normalized_transcript = [], [], []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.data, cls._transcripts, cls._normalized_transcript = get_mock_dataset(cls.root_dir)

    def _test_ljspeech(self, dataset):
        n_ite = 0
        for i, (waveform, sample_rate, transcript, normalized_transcript) in enumerate(dataset):
            expected_transcript = self._transcripts[i]
            expected_normalized_transcript = self._normalized_transcript[i]
            expected_data = self.data[i]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == sample_rate
            assert transcript == expected_transcript
            assert normalized_transcript == expected_normalized_transcript
            n_ite += 1
        assert n_ite == len(self.data)

    def test_ljspeech_str(self):
        dataset = ljspeech.LJSPEECH(self.root_dir)
        self._test_ljspeech(dataset)

    def test_ljspeech_path(self):
        dataset = ljspeech.LJSPEECH(Path(self.root_dir))
        self._test_ljspeech(dataset)
test/torchaudio_unittest/datasets/speechcommands_test.py  0 → 100644
import os
from pathlib import Path

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    normalize_wav,
    save_wav,
)
from torchaudio.datasets import speechcommands

_LABELS = [
    "bed", "bird", "cat", "dog", "down", "eight", "five", "follow",
    "forward", "four", "go", "happy", "house", "learn", "left", "marvin",
    "nine", "no", "off", "on", "one", "right", "seven", "sheila", "six",
    "stop", "three", "tree", "two", "up", "visual", "wow", "yes", "zero",
]


def get_mock_dataset(dataset_dir):
    """
    dataset_dir: directory to the mocked dataset
    """
    mocked_samples = []
    mocked_train_samples = []
    mocked_valid_samples = []
    mocked_test_samples = []
    os.makedirs(dataset_dir, exist_ok=True)
    sample_rate = 16000  # 16kHz sample rate
    seed = 0
    valid_file = os.path.join(dataset_dir, "validation_list.txt")
    test_file = os.path.join(dataset_dir, "testing_list.txt")
    with open(valid_file, "w") as valid, open(test_file, "w") as test:
        for label in _LABELS:
            path = os.path.join(dataset_dir, label)
            os.makedirs(path, exist_ok=True)
            for j in range(6):
                # generate hash ID for speaker
                speaker = "{:08x}".format(j)

                for utterance in range(3):
                    filename = f"{speaker}{speechcommands.HASH_DIVIDER}{utterance}.wav"
                    file_path = os.path.join(path, filename)
                    seed += 1
                    data = get_whitenoise(
                        sample_rate=sample_rate,
                        duration=0.01,
                        n_channels=1,
                        dtype="int16",
                        seed=seed,
                    )
                    save_wav(file_path, data, sample_rate)
                    sample = (
                        normalize_wav(data),
                        sample_rate,
                        label,
                        speaker,
                        utterance,
                    )
                    mocked_samples.append(sample)
                    if j < 2:
                        mocked_train_samples.append(sample)
                    elif j < 4:
                        valid.write(f'{label}/{filename}\n')
                        mocked_valid_samples.append(sample)
                    elif j < 6:
                        test.write(f'{label}/{filename}\n')
                        mocked_test_samples.append(sample)
    return mocked_samples, mocked_train_samples, mocked_valid_samples, mocked_test_samples


class TestSpeechCommands(TempDirMixin, TorchaudioTestCase):
    backend = "default"
    root_dir = None
    samples = []
    train_samples = []
    valid_samples = []
    test_samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        dataset_dir = os.path.join(
            cls.root_dir, speechcommands.FOLDER_IN_ARCHIVE, speechcommands.URL
        )
        cls.samples, cls.train_samples, cls.valid_samples, cls.test_samples = \
            get_mock_dataset(dataset_dir)

    def _testSpeechCommands(self, dataset, data_samples):
        num_samples = 0
        for i, (data, sample_rate, label, speaker_id, utterance_number) in enumerate(dataset):
            self.assertEqual(data, data_samples[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == data_samples[i][1]
            assert label == data_samples[i][2]
            assert speaker_id == data_samples[i][3]
            assert utterance_number == data_samples[i][4]
            num_samples += 1

        assert num_samples == len(data_samples)

    def testSpeechCommands_str(self):
        dataset = speechcommands.SPEECHCOMMANDS(self.root_dir)
        self._testSpeechCommands(dataset, self.samples)

    def testSpeechCommands_path(self):
        dataset = speechcommands.SPEECHCOMMANDS(Path(self.root_dir))
        self._testSpeechCommands(dataset, self.samples)

    def testSpeechCommandsSubsetTrain(self):
        dataset = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="training")
        self._testSpeechCommands(dataset, self.train_samples)

    def testSpeechCommandsSubsetValid(self):
        dataset = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="validation")
        self._testSpeechCommands(dataset, self.valid_samples)

    def testSpeechCommandsSubsetTest(self):
        dataset = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="testing")
        self._testSpeechCommands(dataset, self.test_samples)

    def testSpeechCommandsSum(self):
        dataset_all = speechcommands.SPEECHCOMMANDS(self.root_dir)
        dataset_train = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="training")
        dataset_valid = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="validation")
        dataset_test = speechcommands.SPEECHCOMMANDS(self.root_dir, subset="testing")

        assert len(dataset_train) + len(dataset_valid) + len(dataset_test) == len(dataset_all)
test/torchaudio_unittest/datasets/tedlium_test.py  0 → 100644
import os
import platform
from pathlib import Path

from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    skipIfNoSox,
)
from torchaudio.datasets import tedlium

# Used to generate a unique utterance for each dummy audio file
_UTTERANCES = [
    "AaronHuey_2010X 1 AaronHuey_2010X 0.0 2.0 <o,f0,female> script1\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 2.0 4.0 <o,f0,female> script2\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 4.0 6.0 <o,f0,female> script3\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 6.0 8.0 <o,f0,female> script4\n",
    "AaronHuey_2010X 1 AaronHuey_2010X 8.0 10.0 <o,f0,female> script5\n",
]

_PHONEME = [
    "a AH",
    "a(2) EY",
    "aachen AA K AH N",
    "aad AE D",
    "aaden EY D AH N",
    "aadmi AE D M IY",
    "aae EY EY",
]


def get_mock_dataset(dataset_dir):
    """
    dataset_dir: directory of the mocked dataset
    """
    mocked_samples = {}
    os.makedirs(dataset_dir, exist_ok=True)
    sample_rate = 16000  # 16kHz
    seed = 0

    for release in ["release1", "release2", "release3"]:
        data = get_whitenoise(sample_rate=sample_rate, duration=10.00, n_channels=1, dtype="float32", seed=seed)
        if release in ["release1", "release2"]:
            release_dir = os.path.join(
                dataset_dir,
                tedlium._RELEASE_CONFIGS[release]["folder_in_archive"],
                tedlium._RELEASE_CONFIGS[release]["subset"],
            )
        else:
            release_dir = os.path.join(
                dataset_dir,
                tedlium._RELEASE_CONFIGS[release]["folder_in_archive"],
                tedlium._RELEASE_CONFIGS[release]["data_path"],
            )
        os.makedirs(release_dir, exist_ok=True)
        os.makedirs(os.path.join(release_dir, "stm"), exist_ok=True)  # Subfolder for transcripts
        os.makedirs(os.path.join(release_dir, "sph"), exist_ok=True)  # Subfolder for audio files

        filename = f"{release}.sph"
        path = os.path.join(os.path.join(release_dir, "sph"), filename)
        save_wav(path, data, sample_rate)

        trans_filename = f"{release}.stm"
        trans_path = os.path.join(os.path.join(release_dir, "stm"), trans_filename)
        with open(trans_path, "w") as f:
            f.write("".join(_UTTERANCES))

        dict_filename = f"{release}.dic"
        dict_path = os.path.join(release_dir, dict_filename)
        with open(dict_path, "w") as f:
            f.write("\n".join(_PHONEME))

        # Create a samples list to compare with
        mocked_samples[release] = []
        for utterance in _UTTERANCES:
            talk_id, _, speaker_id, start_time, end_time, identifier, transcript = utterance.split(" ", 6)
            start_time = int(float(start_time)) * sample_rate
            end_time = int(float(end_time)) * sample_rate
            sample = (
                data[:, start_time:end_time],
                sample_rate,
                transcript,
                talk_id,
                speaker_id,
                identifier,
            )
            mocked_samples[release].append(sample)
        seed += 1
    return mocked_samples


class Tedlium(TempDirMixin):
    root_dir = None
    samples = {}

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.root_dir = dataset_dir = os.path.join(cls.root_dir, "tedlium")
        cls.samples = get_mock_dataset(dataset_dir)

    def _test_tedlium(self, dataset, release):
        num_samples = 0
        for i, (data, sample_rate, transcript, talk_id, speaker_id, identifier) in enumerate(dataset):
            self.assertEqual(data, self.samples[release][i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.samples[release][i][1]
            assert transcript == self.samples[release][i][2]
            assert talk_id == self.samples[release][i][3]
            assert speaker_id == self.samples[release][i][4]
            assert identifier == self.samples[release][i][5]
            num_samples += 1
        assert num_samples == len(self.samples[release])

        dataset._dict_path = os.path.join(dataset._path, f"{release}.dic")
        phoneme_dict = dataset.phoneme_dict
        phonemes = [f"{key} {' '.join(value)}" for key, value in phoneme_dict.items()]
        assert phonemes == _PHONEME
    def test_tedlium_release1_str(self):
        release = "release1"
        dataset = tedlium.TEDLIUM(self.root_dir, release=release)
        self._test_tedlium(dataset, release)

    def test_tedlium_release1_path(self):
        release = "release1"
        dataset = tedlium.TEDLIUM(Path(self.root_dir), release=release)
        self._test_tedlium(dataset, release)

    def test_tedlium_release2(self):
        release = "release2"
        dataset = tedlium.TEDLIUM(self.root_dir, release=release)
        self._test_tedlium(dataset, release)

    def test_tedlium_release3(self):
        release = "release3"
        dataset = tedlium.TEDLIUM(self.root_dir, release=release)
        self._test_tedlium(dataset, release)


class TestTedliumSoundfile(Tedlium, TorchaudioTestCase):
    backend = "soundfile"


if platform.system() != "Windows":
    @skipIfNoSox
    class TestTedliumSoxIO(Tedlium, TorchaudioTestCase):
        backend = "sox_io"
test/torchaudio_unittest/datasets/utils_test.py
0 → 100644
import torch

from torchaudio_unittest.common_utils import (
    TorchaudioTestCase,
    TempDirMixin,
)
from torchaudio.datasets import utils as dataset_utils


class Dataset(torch.utils.data.Dataset):
    def __getitem__(self, n):
        sample_rate = 8000
        waveform = n * torch.ones(2, 256)
        return waveform, sample_rate

    def __len__(self) -> int:
        return 2

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]


class TestIterator(TorchaudioTestCase, TempDirMixin):
    backend = 'default'
    def test_diskcache_iterator(self):
        data = dataset_utils.diskcache_iterator(Dataset(), self.get_base_temp_dir())
        # Save
        data[0]
        # Load
        data[0]
    def test_bg_iterator(self):
        data = dataset_utils.bg_iterator(Dataset(), 5)
        for _ in data:
            pass
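Both helpers wrap an arbitrary dataset: diskcache_iterator caches each fetched item on disk, so the second data[0] above is served from the cache rather than recomputed, while bg_iterator prefetches items into a bounded queue on a background thread. A minimal usage sketch, assuming only the two torchaudio.datasets.utils functions called above (both helpers were deprecated in later torchaudio releases):

from torchaudio.datasets import utils as dataset_utils

dataset = Dataset()  # the toy dataset defined above
# Prefetch up to 4 items on a background thread while the loop body runs.
for waveform, sample_rate in dataset_utils.bg_iterator(dataset, 4):
    pass  # consume each (waveform, sample_rate) pair here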
test/torchaudio_unittest/datasets/vctk_test.py
0 → 100644
import os
from pathlib import Path

from torchaudio.datasets import vctk
from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)

# Used to generate a unique transcript for each dummy audio file
_TRANSCRIPT = [
    'Please call Stella',
    'Ask her to bring these things',
    'with her from the store',
    'Six spoons of fresh snow peas, five thick slabs of blue cheese, and maybe a snack for her brother Bob',
    'We also need a small plastic snake and a big toy frog for the kids',
    'She can scoop these things into three red bags, and we will go meet her Wednesday at the train station',
    'When the sunlight strikes raindrops in the air, they act as a prism and form a rainbow',
    'The rainbow is a division of white light into many beautiful colors',
    'These take the shape of a long round arch, with its path high above, and its two ends apparently beyond the horizon',
    'There is, according to legend, a boiling pot of gold at one end'
]


def get_mock_dataset(root_dir):
    """
    root_dir: root directory of the mocked data
    """
    mocked_samples = []
    dataset_dir = os.path.join(root_dir, 'VCTK-Corpus-0.92')
    os.makedirs(dataset_dir, exist_ok=True)
    sample_rate = 48000
    seed = 0

    for speaker in range(225, 230):
        speaker_id = 'p' + str(speaker)
        audio_dir = os.path.join(dataset_dir, 'wav48_silence_trimmed', speaker_id)
        os.makedirs(audio_dir, exist_ok=True)
        file_dir = os.path.join(dataset_dir, 'txt', speaker_id)
        os.makedirs(file_dir, exist_ok=True)

        for utterance_id in range(1, 11):
            filename = f'{speaker_id}_{utterance_id:03d}_mic2'
            audio_file_path = os.path.join(audio_dir, filename + '.wav')
            data = get_whitenoise(sample_rate=sample_rate, duration=0.01, n_channels=1, dtype='float32', seed=seed)
            save_wav(audio_file_path, data, sample_rate)

            txt_file_path = os.path.join(file_dir, filename[:-5] + '.txt')
            transcript = _TRANSCRIPT[utterance_id - 1]
            with open(txt_file_path, 'w') as f:
                f.write(transcript)

            sample = (normalize_wav(data), sample_rate, transcript, speaker_id, utterance_id)
            mocked_samples.append(sample)
            seed += 1
    return mocked_samples


class TestVCTK(TempDirMixin, TorchaudioTestCase):
    backend = 'default'
    root_dir = None
    samples = []

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.samples = get_mock_dataset(cls.root_dir)

    def _test_vctk(self, dataset):
        num_samples = 0
        for i, (data, sample_rate, transcript, speaker_id, utterance_id) in enumerate(dataset):
            self.assertEqual(data, self.samples[i][0], atol=5e-5, rtol=1e-8)
            assert sample_rate == self.samples[i][1]
            assert transcript == self.samples[i][2]
            assert speaker_id == self.samples[i][3]
            assert int(utterance_id) == self.samples[i][4]
            num_samples += 1
        assert num_samples == len(self.samples)

    def test_vctk_str(self):
        dataset = vctk.VCTK_092(self.root_dir, audio_ext=".wav")
        self._test_vctk(dataset)

    def test_vctk_path(self):
        dataset = vctk.VCTK_092(Path(self.root_dir), audio_ext=".wav")
        self._test_vctk(dataset)
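The filename[:-5] slice above depends on the VCTK 0.92 layout: audio lives at wav48_silence_trimmed/<speaker>/<speaker>_<utt>_mic2.wav while the matching transcript is txt/<speaker>/<speaker>_<utt>.txt, so dropping the 5-character '_mic2' suffix maps one onto the other. A small sketch of that mapping, assuming only the naming convention used in get_mock_dataset:

def transcript_name(audio_filename):
    # 'p225_001_mic2.wav' -> stem 'p225_001_mic2' -> drop '_mic2' -> 'p225_001.txt'
    stem = audio_filename.rsplit('.', 1)[0]
    return stem[:-5] + '.txt'

assert transcript_name('p225_001_mic2.wav') == 'p225_001.txt'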
test/torchaudio_unittest/datasets/yesno_test.py
0 → 100644
import os
from pathlib import Path

from torchaudio.datasets import yesno
from torchaudio_unittest.common_utils import (
    TempDirMixin,
    TorchaudioTestCase,
    get_whitenoise,
    save_wav,
    normalize_wav,
)


def get_mock_data(root_dir, labels):
    """
    root_dir: path
    labels: list of labels
    """
    mocked_data = []
    base_dir = os.path.join(root_dir, 'waves_yesno')
    os.makedirs(base_dir, exist_ok=True)
    for i, label in enumerate(labels):
        filename = f'{"_".join(str(l) for l in label)}.wav'
        path = os.path.join(base_dir, filename)
        data = get_whitenoise(sample_rate=8000, duration=6, n_channels=1, dtype='int16', seed=i)
        save_wav(path, data, 8000)
        mocked_data.append(normalize_wav(data))
    return mocked_data


class TestYesNo(TempDirMixin, TorchaudioTestCase):
    backend = 'default'
    root_dir = None
    data = []
    labels = [
        [0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 1],
        [0, 1, 0, 1, 0, 1, 1, 0],
        [1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1],
    ]

    @classmethod
    def setUpClass(cls):
        cls.root_dir = cls.get_base_temp_dir()
        cls.data = get_mock_data(cls.root_dir, cls.labels)

    def _test_yesno(self, dataset):
        n_ite = 0
        for i, (waveform, sample_rate, label) in enumerate(dataset):
            expected_label = self.labels[i]
            expected_data = self.data[i]
            self.assertEqual(expected_data, waveform, atol=5e-5, rtol=1e-8)
            assert sample_rate == 8000
            assert label == expected_label
            n_ite += 1
        assert n_ite == len(self.data)

    def test_yesno_str(self):
        dataset = yesno.YESNO(self.root_dir)
        self._test_yesno(dataset)

    def test_yesno_path(self):
        dataset = yesno.YESNO(Path(self.root_dir))
        self._test_yesno(dataset)
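YESNO encodes each recording's label directly in its filename: eight 0/1 flags joined by underscores, exactly what get_mock_data writes. A hedged sketch of recovering the label list from such a name, assuming only that convention:

def label_from_filename(filename):
    # '0_1_0_1_0_1_1_0.wav' -> [0, 1, 0, 1, 0, 1, 1, 0]
    return [int(flag) for flag in filename.rsplit('.', 1)[0].split('_')]

assert label_from_filename('0_1_0_1_0_1_1_0.wav') == [0, 1, 0, 1, 0, 1, 1, 0]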
test/torchaudio_unittest/example/__init__.py
0 → 100644
import os
import sys

# Make the repository's top-level `examples` directory importable from the test suite.
sys.path.append(os.path.join(os.path.dirname(__file__), '..', '..', '..', 'examples'))
test/torchaudio_unittest/example/souce_sepration/__init__.py
0 → 100644