init v0.10.0

9dcc7a15 · flyingdown · db2b0b79 · 9dcc7a15 · 9dcc7a15 · 9dcc7a15
Commit 9dcc7a15 authored Apr 25, 2022 by flyingdown
20 changed files
--- a/test/torchaudio_unittest/sox_effect/smoke_test.py
+++ b/test/torchaudio_unittest/sox_effect/smoke_test.py
+from torchaudio import sox_effects
+from parameterized import parameterized
+
+from torchaudio_unittest.common_utils import (
+    TempDirMixin,
+    TorchaudioTestCase,
+    skipIfNoSox,
+    get_wav_data,
+    get_sinusoid,
+    save_wav,
+)
+from .common import (
+    load_params,
+)
+
+
+@skipIfNoSox
+class SmokeTest(TempDirMixin, TorchaudioTestCase):
+    """Run smoke test on various effects
+
+    The purpose of this test suite is to verify that sox_effect functionalities do not exhibit
+    abnormal behaviors.
+
+    This test suite should be able to run without any additional tools (such as sox command),
+    however without such tools, the correctness of each function cannot be verified.
+    """
+    @parameterized.expand(
+        load_params("sox_effect_test_args.jsonl"),
+        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
+    )
+    def test_apply_effects_tensor(self, args):
+        """`apply_effects_tensor` should not crash"""
+        effects = args['effects']
+        num_channels = args.get("num_channels", 2)
+        input_sr = args.get("input_sample_rate", 8000)
+        original = get_sinusoid(
+            frequency=800, sample_rate=input_sr,
+            n_channels=num_channels, dtype='float32')
+        # The result is deliberately not inspected: only a clean return is checked.
+        _found, _sr = sox_effects.apply_effects_tensor(original, input_sr, effects)
+
+    @parameterized.expand(
+        load_params("sox_effect_test_args.jsonl"),
+        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
+    )
+    def test_apply_effects_file(self, args):
+        """`apply_effects_file` should not crash when given a file path"""
+        dtype = 'int32'
+        channels_first = True
+        effects = args['effects']
+        num_channels = args.get("num_channels", 2)
+        input_sr = args.get("input_sample_rate", 8000)
+
+        input_path = self.get_temp_path('input.wav')
+        data = get_wav_data(dtype, num_channels, channels_first=channels_first)
+        save_wav(input_path, data, input_sr, channels_first=channels_first)
+
+        # No reference is generated here; the output is not compared against anything.
+        _found, _sr = sox_effects.apply_effects_file(
+            input_path, effects, normalize=False, channels_first=channels_first)
+
+    @parameterized.expand(
+        load_params("sox_effect_test_args.jsonl"),
+        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
+    )
+    def test_apply_effects_fileobj(self, args):
+        """`apply_effects_file` should not crash when given a file object"""
+        dtype = 'int32'
+        channels_first = True
+        effects = args['effects']
+        num_channels = args.get("num_channels", 2)
+        input_sr = args.get("input_sample_rate", 8000)
+
+        input_path = self.get_temp_path('input.wav')
+        data = get_wav_data(dtype, num_channels, channels_first=channels_first)
+        save_wav(input_path, data, input_sr, channels_first=channels_first)
+
+        # Same as the path-based test, but the audio is handed over as an open binary file.
+        with open(input_path, 'rb') as fileobj:
+            _found, _sr = sox_effects.apply_effects_file(
+                fileobj, effects, normalize=False, channels_first=channels_first)
--- a/test/torchaudio_unittest/sox_effect/sox_effect_test.py
+++ b/test/torchaudio_unittest/sox_effect/sox_effect_test.py
+import io
+import itertools
+from pathlib import Path
+import tarfile
+
+from parameterized import parameterized
+from torchaudio import sox_effects
+from torchaudio._internal import module_utils as _mod_utils
+
+from torchaudio_unittest.common_utils import (
+    TempDirMixin,
+    HttpServerMixin,
+    PytorchTestCase,
+    skipIfNoSox,
+    skipIfNoModule,
+    skipIfNoExec,
+    get_asset_path,
+    get_sinusoid,
+    get_wav_data,
+    save_wav,
+    load_wav,
+    sox_utils,
+)
+from .common import (
+    load_params,
+    name_func,
+)
+
+
+if _mod_utils.is_module_available("requests"):
+    import requests
+
+
+@skipIfNoSox
+class TestSoxEffects(PytorchTestCase):
+    """Test suite for sox effect initialization"""
+    def test_init(self):
+        """Calling init_sox_effects multiple times does not crash"""
+        for _ in range(3):
+            sox_effects.init_sox_effects()
+
+
+@skipIfNoSox
+class TestSoxEffectsTensor(TempDirMixin, PytorchTestCase):
+    """Checks for the `apply_effects_tensor` function"""
+    @parameterized.expand(
+        list(itertools.product(
+            ['float32', 'int32', 'int16', 'uint8'],
+            [8000, 16000],
+            [1, 2, 4, 8],
+            [True, False],
+        )),
+        name_func=name_func,
+    )
+    def test_apply_no_effect(self, dtype, sample_rate, num_channels, channels_first):
+        """An empty effect chain returns data equal to the input, as a distinct Tensor"""
+        pristine = get_wav_data(dtype, num_channels, channels_first=channels_first)
+        fed = pristine.clone()
+        result, result_sr = sox_effects.apply_effects_tensor(
+            fed, sample_rate, [], channels_first)
+
+        assert result_sr == sample_rate
+        # The Tensor handed to SoxEffect must come back unmodified
+        self.assertEqual(pristine, fed)
+        # The output must be a different object from the input
+        assert fed is not result
+        # ... while holding the same values as the input
+        self.assertEqual(fed, result)
+
+    @parameterized.expand(
+        load_params("sox_effect_test_args.jsonl"),
+        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
+    )
+    def test_apply_effects(self, args):
+        """Output of `apply_effects_tensor` equals the reference produced via `sox_utils`"""
+        chain = args['effects']
+        n_ch = args.get("num_channels", 2)
+        sr_in = args.get("input_sample_rate", 8000)
+        sr_out = args.get("output_sample_rate")
+
+        src_path = self.get_temp_path('input.wav')
+        ref_path = self.get_temp_path('reference.wav')
+
+        # Write the sinusoid to disk so the reference can be generated from the same signal
+        waveform = get_sinusoid(
+            frequency=800, sample_rate=sr_in, n_channels=n_ch, dtype='float32')
+        save_wav(src_path, waveform, sr_in)
+        sox_utils.run_sox_effect(src_path, ref_path, chain, output_sample_rate=sr_out)
+
+        expected, expected_sr = load_wav(ref_path)
+        found, sr = sox_effects.apply_effects_tensor(waveform, sr_in, chain)
+
+        assert sr == expected_sr
+        self.assertEqual(expected, found)
+
+
+@skipIfNoSox
+class TestSoxEffectsFile(TempDirMixin, PytorchTestCase):
+    """Checks for the `apply_effects_file` function"""
+    @parameterized.expand(
+        list(itertools.product(
+            ['float32', 'int32', 'int16', 'uint8'],
+            [8000, 16000],
+            [1, 2, 4, 8],
+            [False, True],
+        )),
+        name_func=name_func,
+    )
+    def test_apply_no_effect(self, dtype, sample_rate, num_channels, channels_first):
+        """An empty effect chain returns the data that was saved to the file"""
+        wav_path = self.get_temp_path('input.wav')
+        expected = get_wav_data(dtype, num_channels, channels_first=channels_first)
+        save_wav(wav_path, expected, sample_rate, channels_first=channels_first)
+
+        found, found_sr = sox_effects.apply_effects_file(
+            wav_path, [], normalize=False, channels_first=channels_first)
+
+        assert found_sr == sample_rate
+        self.assertEqual(expected, found)
+
+    @parameterized.expand(
+        load_params("sox_effect_test_args.jsonl"),
+        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
+    )
+    def test_apply_effects_str(self, args):
+        """With a `str` path, the output equals the reference produced via `sox_utils`"""
+        channels_first = True
+        chain = args['effects']
+        n_ch = args.get("num_channels", 2)
+        sr_in = args.get("input_sample_rate", 8000)
+        sr_out = args.get("output_sample_rate")
+
+        src_path = self.get_temp_path('input.wav')
+        ref_path = self.get_temp_path('reference.wav')
+        samples = get_wav_data('int32', n_ch, channels_first=channels_first)
+        save_wav(src_path, samples, sr_in, channels_first=channels_first)
+        sox_utils.run_sox_effect(src_path, ref_path, chain, output_sample_rate=sr_out)
+
+        expected, expected_sr = load_wav(ref_path)
+        found, sr = sox_effects.apply_effects_file(
+            src_path, chain, normalize=False, channels_first=channels_first)
+
+        assert sr == expected_sr
+        self.assertEqual(found, expected)
+
+    def test_apply_effects_path(self):
+        """With a `Path` object, the output equals the reference produced via `sox_utils`"""
+        channels_first = True
+        chain = [["hilbert"]]
+        sr_in = 8000
+        sr_out = 8000
+
+        src_path = self.get_temp_path('input.wav')
+        ref_path = self.get_temp_path('reference.wav')
+        samples = get_wav_data('int32', 2, channels_first=channels_first)
+        save_wav(src_path, samples, sr_in, channels_first=channels_first)
+        sox_utils.run_sox_effect(src_path, ref_path, chain, output_sample_rate=sr_out)
+
+        expected, expected_sr = load_wav(ref_path)
+        # The only difference from the `str` test: the path is wrapped in `pathlib.Path`
+        found, sr = sox_effects.apply_effects_file(
+            Path(src_path), chain, normalize=False, channels_first=channels_first)
+
+        assert sr == expected_sr
+        self.assertEqual(found, expected)
+
+
+@skipIfNoSox
+class TestFileFormats(TempDirMixin, PytorchTestCase):
+    """`apply_effects_file` gives the same result as sox on various file formats"""
+    @parameterized.expand(
+        list(itertools.product(
+            ['float32', 'int32', 'int16', 'uint8'],
+            [8000, 16000],
+            [1, 2],
+        )),
+        name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}',
+    )
+    def test_wav(self, dtype, sample_rate, num_channels):
+        """`apply_effects_file` handles wav input of several dtypes"""
+        channels_first = True
+        chain = [['band', '300', '10']]
+
+        src_path = self.get_temp_path('input.wav')
+        ref_path = self.get_temp_path('reference.wav')
+        samples = get_wav_data(dtype, num_channels, channels_first=channels_first)
+        save_wav(src_path, samples, sample_rate, channels_first=channels_first)
+        sox_utils.run_sox_effect(src_path, ref_path, chain)
+
+        expected, expected_sr = load_wav(ref_path)
+        found, sr = sox_effects.apply_effects_file(
+            src_path, chain, normalize=False, channels_first=channels_first)
+
+        assert sr == expected_sr
+        self.assertEqual(found, expected)
+
+    @parameterized.expand(
+        list(itertools.product(
+            [8000, 16000],
+            [1, 2],
+        )),
+        name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}',
+    )
+    def test_mp3(self, sample_rate, num_channels):
+        """`apply_effects_file` handles mp3 input"""
+        channels_first = True
+        chain = [['band', '300', '10']]
+
+        src_path = self.get_temp_path('input.mp3')
+        ref_path = self.get_temp_path('reference.wav')
+        sox_utils.gen_audio_file(src_path, sample_rate, num_channels)
+        sox_utils.run_sox_effect(src_path, ref_path, chain)
+
+        expected, expected_sr = load_wav(ref_path)
+        found, sr = sox_effects.apply_effects_file(
+            src_path, chain, channels_first=channels_first)
+        # Keep the actual output on disk next to the reference
+        save_wav(self.get_temp_path('result.wav'), found, sr, channels_first=channels_first)
+
+        assert sr == expected_sr
+        # Unlike the other formats, mp3 is compared with explicit tolerances
+        self.assertEqual(found, expected, atol=1e-4, rtol=1e-8)
+
+    @parameterized.expand(
+        list(itertools.product(
+            [8000, 16000],
+            [1, 2],
+        )),
+        name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}',
+    )
+    def test_flac(self, sample_rate, num_channels):
+        """`apply_effects_file` handles flac input"""
+        channels_first = True
+        chain = [['band', '300', '10']]
+
+        src_path = self.get_temp_path('input.flac')
+        ref_path = self.get_temp_path('reference.wav')
+        sox_utils.gen_audio_file(src_path, sample_rate, num_channels)
+        sox_utils.run_sox_effect(src_path, ref_path, chain, output_bitdepth=32)
+
+        expected, expected_sr = load_wav(ref_path)
+        found, sr = sox_effects.apply_effects_file(
+            src_path, chain, channels_first=channels_first)
+        # Keep the actual output on disk next to the reference
+        save_wav(self.get_temp_path('result.wav'), found, sr, channels_first=channels_first)
+
+        assert sr == expected_sr
+        self.assertEqual(found, expected)
+
+    @parameterized.expand(
+        list(itertools.product(
+            [8000, 16000],
+            [1, 2],
+        )),
+        name_func=lambda f, _, p: f'{f.__name__}_{"_".join(str(arg) for arg in p.args)}',
+    )
+    def test_vorbis(self, sample_rate, num_channels):
+        """`apply_effects_file` handles vorbis input"""
+        channels_first = True
+        chain = [['band', '300', '10']]
+
+        src_path = self.get_temp_path('input.vorbis')
+        ref_path = self.get_temp_path('reference.wav')
+        sox_utils.gen_audio_file(src_path, sample_rate, num_channels)
+        sox_utils.run_sox_effect(src_path, ref_path, chain, output_bitdepth=32)
+
+        expected, expected_sr = load_wav(ref_path)
+        found, sr = sox_effects.apply_effects_file(
+            src_path, chain, channels_first=channels_first)
+        # Keep the actual output on disk next to the reference
+        save_wav(self.get_temp_path('result.wav'), found, sr, channels_first=channels_first)
+
+        assert sr == expected_sr
+        self.assertEqual(found, expected)
+
+
+@skipIfNoSox
+class TestApplyEffectFileWithoutExtension(PytorchTestCase):
+    """`apply_effects_file` on a file whose name carries no extension"""
+    def test_mp3(self):
+        """Providing format allows to read mp3 without extension
+
+        libsox does not check header for mp3
+
+        https://github.com/pytorch/audio/issues/1040
+
+        The file was generated with the following command
+            ffmpeg -f lavfi -i "sine=frequency=1000:duration=5" -ar 16000 -f mp3 test_noext
+        """
+        asset = get_asset_path("mp3_without_ext")
+        chain = [['band', '300', '10']]
+        # Only the reported sample rate is checked; the decoded audio is discarded.
+        _, found_sr = sox_effects.apply_effects_file(asset, chain, format="mp3")
+        assert found_sr == 16000
+
+
+@skipIfNoExec('sox')
+@skipIfNoSox
+class TestFileObject(TempDirMixin, PytorchTestCase):
+    """`apply_effects_file` on file-like objects, checked against a `sox_utils` reference"""
+    # (extension, compression) pairs exercised by every test in this class
+    _CASES = [
+        ('wav', None),
+        ('mp3', 128),
+        ('mp3', 320),
+        ('flac', 0),
+        ('flac', 5),
+        ('flac', 8),
+        ('vorbis', -1),
+        ('vorbis', 10),
+        ('amb', None),
+    ]
+
+    def _make_reference(self, ext, compression):
+        """Generate `input.<ext>` plus its reference; return what the tests need to compare"""
+        sample_rate = 16000
+        effects = [['band', '300', '10']]
+        # The format is passed explicitly for mp3 only; other formats pass None
+        format_ = ext if ext == 'mp3' else None
+        input_path = self.get_temp_path(f'input.{ext}')
+        reference_path = self.get_temp_path('reference.wav')
+
+        sox_utils.gen_audio_file(
+            input_path, sample_rate, num_channels=2, compression=compression)
+        sox_utils.run_sox_effect(
+            input_path, reference_path, effects, output_bitdepth=32)
+        expected, expected_sr = load_wav(reference_path)
+        return input_path, effects, format_, expected, expected_sr
+
+    def _check(self, found, sr, expected, expected_sr):
+        """Save the actual output, then compare sample rate and data with the reference"""
+        save_wav(self.get_temp_path('result.wav'), found, sr, channels_first=True)
+        assert sr == expected_sr
+        self.assertEqual(found, expected)
+
+    @parameterized.expand(_CASES)
+    def test_fileobj(self, ext, compression):
+        """Applying effects via file object works"""
+        input_path, effects, format_, expected, expected_sr = self._make_reference(
+            ext, compression)
+
+        with open(input_path, 'rb') as fileobj:
+            found, sr = sox_effects.apply_effects_file(
+                fileobj, effects, channels_first=True, format=format_)
+        self._check(found, sr, expected, expected_sr)
+
+    @parameterized.expand(_CASES)
+    def test_bytesio(self, ext, compression):
+        """Applying effects via BytesIO object works"""
+        input_path, effects, format_, expected, expected_sr = self._make_reference(
+            ext, compression)
+
+        # Load the whole file into memory and hand over the in-memory buffer
+        with open(input_path, 'rb') as file_:
+            buffer = io.BytesIO(file_.read())
+        found, sr = sox_effects.apply_effects_file(
+            buffer, effects, channels_first=True, format=format_)
+        self._check(found, sr, expected, expected_sr)
+
+    @parameterized.expand(_CASES)
+    def test_tarfile(self, ext, compression):
+        """Applying effects to compressed audio via file-like file works"""
+        audio_file = f'input.{ext}'
+        archive_path = self.get_temp_path('archive.tar.gz')
+        input_path, effects, format_, expected, expected_sr = self._make_reference(
+            ext, compression)
+
+        with tarfile.TarFile(archive_path, 'w') as tarobj:
+            tarobj.add(input_path, arcname=audio_file)
+        # The member must be read while the archive is still open
+        with tarfile.TarFile(archive_path, 'r') as tarobj:
+            member = tarobj.extractfile(audio_file)
+            found, sr = sox_effects.apply_effects_file(
+                member, effects, channels_first=True, format=format_)
+        self._check(found, sr, expected, expected_sr)
+
+
+@skipIfNoSox
+@skipIfNoExec('sox')
+@skipIfNoModule("requests")
+class TestFileObjectHttp(HttpServerMixin, PytorchTestCase):
+    """`apply_effects_file` on a streamed HTTP response body"""
+    @parameterized.expand([
+        ('wav', None),
+        ('mp3', 128),
+        ('mp3', 320),
+        ('flac', 0),
+        ('flac', 5),
+        ('flac', 8),
+        ('vorbis', -1),
+        ('vorbis', 10),
+        ('amb', None),
+    ])
+    def test_requests(self, ext, compression):
+        """Applying effects to the raw stream of a `requests` response works"""
+        channels_first = True
+        chain = [['band', '300', '10']]
+        # The format is passed explicitly for mp3 only; other formats pass None
+        fmt = ext if ext == 'mp3' else None
+        audio_file = f'input.{ext}'
+        src_path = self.get_temp_path(audio_file)
+        ref_path = self.get_temp_path('reference.wav')
+
+        sox_utils.gen_audio_file(
+            src_path, 16000, num_channels=2, compression=compression)
+        sox_utils.run_sox_effect(src_path, ref_path, chain, output_bitdepth=32)
+        expected, expected_sr = load_wav(ref_path)
+
+        with requests.get(self.get_url(audio_file), stream=True) as resp:
+            found, sr = sox_effects.apply_effects_file(
+                resp.raw, chain, channels_first=channels_first, format=fmt)
+        save_wav(self.get_temp_path('result.wav'), found, sr, channels_first=channels_first)
+        assert sr == expected_sr
+        self.assertEqual(found, expected)
--- a/test/torchaudio_unittest/sox_effect/torchscript_test.py
+++ b/test/torchaudio_unittest/sox_effect/torchscript_test.py
+from typing import List
+
+import torch
+from torchaudio import sox_effects
+from parameterized import parameterized
+
+from torchaudio_unittest.common_utils import (
+    TempDirMixin,
+    TorchaudioTestCase,
+    skipIfNoSox,
+    get_sinusoid,
+    save_wav,
+    torch_script,
+)
+from .common import (
+    load_params,
+)
+
+
+class SoxEffectTensorTransform(torch.nn.Module):
+    """Module that forwards a Tensor to `sox_effects.apply_effects_tensor`
+
+    The effect chain, sample rate and `channels_first` flag are fixed at construction
+    time; `forward` only takes the Tensor to process.
+    """
+    # Class-level annotation for the effect chain attribute
+    effects: List[List[str]]
+
+    def __init__(self, effects: List[List[str]], sample_rate: int, channels_first: bool):
+        super().__init__()
+        self.effects = effects
+        self.sample_rate = sample_rate
+        self.channels_first = channels_first
+
+    def forward(self, tensor: torch.Tensor):
+        # Returns whatever `apply_effects_tensor` returns for the stored configuration
+        return sox_effects.apply_effects_tensor(
+            tensor, self.sample_rate, self.effects, self.channels_first)
+
+
+class SoxEffectFileTransform(torch.nn.Module):
+    """Module that forwards a file path to `sox_effects.apply_effects_file`
+
+    The effect chain and `channels_first` flag are fixed at construction time;
+    `forward` only takes the path of the file to process.
+    """
+    effects: List[List[str]]
+    channels_first: bool
+
+    def __init__(self, effects: List[List[str]], channels_first: bool):
+        super().__init__()
+        self.effects = effects
+        self.channels_first = channels_first
+
+    def forward(self, path: str):
+        # Pass `channels_first` by keyword: the third positional parameter of
+        # `apply_effects_file` is `normalize`, so a positional argument here
+        # would set the wrong option.
+        return sox_effects.apply_effects_file(
+            path, self.effects, channels_first=self.channels_first)
+
+
+@skipIfNoSox
+class TestTorchScript(TempDirMixin, TorchaudioTestCase):
+    """Scripted sox effect modules give the same output as the eager functions"""
+    @parameterized.expand(
+        load_params("sox_effect_test_args.jsonl"),
+        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
+    )
+    def test_apply_effects_tensor(self, args):
+        """Scripted `SoxEffectTensorTransform` matches `apply_effects_tensor`"""
+        effects = args['effects']
+        channels_first = True
+        num_channels = args.get("num_channels", 2)
+        input_sr = args.get("input_sample_rate", 8000)
+
+        trans = SoxEffectTensorTransform(effects, input_sr, channels_first)
+
+        trans = torch_script(trans)
+
+        wav = get_sinusoid(
+            frequency=800, sample_rate=input_sr,
+            n_channels=num_channels, dtype='float32', channels_first=channels_first)
+        found, sr_found = trans(wav)
+        expected, sr_expected = sox_effects.apply_effects_tensor(
+            wav, input_sr, effects, channels_first)
+
+        assert sr_found == sr_expected
+        self.assertEqual(expected, found)
+
+    @parameterized.expand(
+        load_params("sox_effect_test_args.jsonl"),
+        name_func=lambda f, i, p: f'{f.__name__}_{i}_{p.args[0]["effects"][0][0]}',
+    )
+    def test_apply_effects_file(self, args):
+        """Scripted `SoxEffectFileTransform` matches `apply_effects_file`"""
+        effects = args['effects']
+        channels_first = True
+        num_channels = args.get("num_channels", 2)
+        input_sr = args.get("input_sample_rate", 8000)
+
+        trans = SoxEffectFileTransform(effects, channels_first)
+        trans = torch_script(trans)
+
+        path = self.get_temp_path('input.wav')
+        wav = get_sinusoid(
+            frequency=800, sample_rate=input_sr,
+            n_channels=num_channels, dtype='float32', channels_first=channels_first)
+        save_wav(path, wav, sample_rate=input_sr, channels_first=channels_first)
+
+        found, sr_found = trans(path)
+        # `channels_first` must go by keyword: the third positional parameter of
+        # `apply_effects_file` is `normalize`.
+        expected, sr_expected = sox_effects.apply_effects_file(
+            path, effects, channels_first=channels_first)
+
+        assert sr_found == sr_expected
+        self.assertEqual(expected, found)
--- a/test/torchaudio_unittest/transforms/__init__.py
+++ b/test/torchaudio_unittest/transforms/__init__.py
--- a/test/torchaudio_unittest/transforms/autograd_cpu_test.py
+++ b/test/torchaudio_unittest/transforms/autograd_cpu_test.py
+from torchaudio_unittest.common_utils import PytorchTestCase
+from .autograd_test_impl import AutogradTestMixin, AutogradTestFloat32
+
+
+class AutogradCPUTest(AutogradTestMixin, PytorchTestCase):
+    """Runs the tests inherited from `AutogradTestMixin` with `device` set to 'cpu'"""
+    device = 'cpu'
+
+
+class AutogradRNNTCPUTest(AutogradTestFloat32, PytorchTestCase):
+    """Runs the tests inherited from `AutogradTestFloat32` with `device` set to 'cpu'"""
+    device = 'cpu'
--- a/test/torchaudio_unittest/transforms/autograd_cuda_test.py
+++ b/test/torchaudio_unittest/transforms/autograd_cuda_test.py
+from torchaudio_unittest.common_utils import (
+    PytorchTestCase,
+    skipIfNoCuda,
+)
+from .autograd_test_impl import AutogradTestMixin, AutogradTestFloat32
+
+
+@skipIfNoCuda
+class AutogradCUDATest(AutogradTestMixin, PytorchTestCase):
+    """Runs the tests inherited from `AutogradTestMixin` with `device` set to 'cuda'"""
+    device = 'cuda'
+
+
+@skipIfNoCuda
+class AutogradRNNTCUDATest(AutogradTestFloat32, PytorchTestCase):
+    """Runs the tests inherited from `AutogradTestFloat32` with `device` set to 'cuda'"""
+    device = 'cuda'
--- a/test/torchaudio_unittest/transforms/autograd_test_impl.py
+++ b/test/torchaudio_unittest/transforms/autograd_test_impl.py
+from typing import List
+import unittest
+
+from parameterized import parameterized
+import torch
+from torch.autograd import gradcheck, gradgradcheck
+import torchaudio.transforms as T
+
+from torchaudio_unittest.common_utils import (
+    TestBaseMixin,
+    get_whitenoise,
+    get_spectrogram,
+    nested_params,
+    rnnt_utils,
+)
+
+
+class _DeterministicWrapper(torch.nn.Module):
+    """Wrap a transform so the torch RNG is re-seeded before every call"""
+    def __init__(self, transform, seed=0):
+        super().__init__()
+        self.transform = transform
+        self.seed = seed
+
+    def forward(self, input: torch.Tensor):
+        # Seeding on each call is what makes the wrapped transform deterministic
+        torch.random.manual_seed(self.seed)
+        output = self.transform(input)
+        return output
+
+
+class AutogradTestMixin(TestBaseMixin):
+    def assert_grad(
+            self,
+            transform: torch.nn.Module,
+            inputs: List[torch.Tensor],
+            *,
+            nondet_tol: float = 0.0,
+    ):
+        """Run `gradcheck` and `gradgradcheck` on `transform` with `inputs`
+
+        The transform and every Tensor input are moved to `self.device` in double
+        precision; non-Tensor inputs are passed through unchanged. `nondet_tol` is
+        forwarded to `gradgradcheck` only.
+        """
+        transform = transform.to(dtype=torch.float64, device=self.device)
+
+        def _as_checkable(item):
+            # gradcheck and gradgradcheck only pass if the input tensors are of dtype
+            # `torch.double` or `torch.cdouble`, when the default eps and tolerance
+            # values are used.
+            if not torch.is_tensor(item):
+                return item
+            target_dtype = torch.cdouble if item.is_complex() else torch.double
+            item = item.to(dtype=target_dtype, device=self.device)
+            item.requires_grad = True
+            return item
+
+        checked_inputs = [_as_checkable(item) for item in inputs]
+        assert gradcheck(transform, checked_inputs)
+        assert gradgradcheck(transform, checked_inputs, nondet_tol=nondet_tol)
+
+    @parameterized.expand([
+        ({'pad': 0, 'normalized': False, 'power': None, 'return_complex': True}, ),
+        ({'pad': 3, 'normalized': False, 'power': None, 'return_complex': True}, ),
+        ({'pad': 0, 'normalized': True, 'power': None, 'return_complex': True}, ),
+        ({'pad': 3, 'normalized': True, 'power': None, 'return_complex': True}, ),
+        ({'pad': 0, 'normalized': False, 'power': None}, ),
+        ({'pad': 3, 'normalized': False, 'power': None}, ),
+        ({'pad': 0, 'normalized': True, 'power': None}, ),
+        ({'pad': 3, 'normalized': True, 'power': None}, ),
+        ({'pad': 0, 'normalized': False, 'power': 1.0}, ),
+        ({'pad': 3, 'normalized': False, 'power': 1.0}, ),
+        ({'pad': 0, 'normalized': True, 'power': 1.0}, ),
+        ({'pad': 3, 'normalized': True, 'power': 1.0}, ),
+        ({'pad': 0, 'normalized': False, 'power': 2.0}, ),
+        ({'pad': 3, 'normalized': False, 'power': 2.0}, ),
+        ({'pad': 0, 'normalized': True, 'power': 2.0}, ),
+        ({'pad': 3, 'normalized': True, 'power': 2.0}, ),
+    ])
+    def test_spectrogram(self, kwargs):
+        """Gradient check for `T.Spectrogram` built from the given keyword arguments"""
+        # replication_pad1d_backward_cuda is not deterministic and
+        # gives very small (~2.7756e-17) difference.
+        #
+        # See https://github.com/pytorch/pytorch/issues/54093
+        spectrogram_transform = T.Spectrogram(**kwargs)
+        noise = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
+        self.assert_grad(spectrogram_transform, [noise], nondet_tol=1e-10)
+
+    @parameterized.expand([(False, ), (True, )])
+    def test_inverse_spectrogram(self, return_complex):
+        """Gradient check for `T.InverseSpectrogram` on a spectrogram of white noise"""
+        # Build a realistic input from an actual waveform
+        noise = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
+        num_samples = noise.shape[-1]
+        spec = get_spectrogram(noise, n_fft=400)
+        if not return_complex:
+            spec = torch.view_as_real(spec)
+
+        # The length is passed along as a second, non-Tensor input
+        inverse = T.InverseSpectrogram(n_fft=400)
+        self.assert_grad(inverse, [spec, num_samples])
+
+    def test_melspectrogram(self):
+        """Gradient check for `T.MelSpectrogram` on white noise"""
+        # replication_pad1d_backward_cuda is not deterministic and
+        # gives very small (~2.7756e-17) difference.
+        #
+        # See https://github.com/pytorch/pytorch/issues/54093
+        rate = 8000
+        mel_transform = T.MelSpectrogram(sample_rate=rate)
+        noise = get_whitenoise(sample_rate=rate, duration=0.05, n_channels=2)
+        self.assert_grad(mel_transform, [noise], nondet_tol=1e-10)
+
+    @nested_params(
+        [0, 0.99],
+        [False, True],
+    )
+    def test_griffinlim(self, momentum, rand_init):
+        """Gradient check for `T.GriffinLim`, wrapped so each call is seeded identically"""
+        n_fft = 400
+        power = 1
+        noise = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
+        spec = get_spectrogram(noise, n_fft=n_fft, power=power)
+        griffinlim = T.GriffinLim(
+            n_fft=n_fft, n_iter=3, momentum=momentum, rand_init=rand_init, power=power)
+        self.assert_grad(_DeterministicWrapper(griffinlim), [spec])
+
+    @parameterized.expand([(False, ), (True, )])
+    def test_mfcc(self, log_mels):
+        """Gradient check for `T.MFCC` with the given `log_mels` setting"""
+        rate = 8000
+        mfcc = T.MFCC(sample_rate=rate, log_mels=log_mels)
+        noise = get_whitenoise(sample_rate=rate, duration=0.05, n_channels=2)
+        self.assert_grad(mfcc, [noise])
+
+    @parameterized.expand([(False, ), (True, )])
+    def test_lfcc(self, log_lf):
+        """Gradient check for `T.LFCC` with the given `log_lf` setting"""
+        rate = 8000
+        lfcc = T.LFCC(sample_rate=rate, log_lf=log_lf)
+        noise = get_whitenoise(sample_rate=rate, duration=0.05, n_channels=2)
+        self.assert_grad(lfcc, [noise])
+
+    def test_compute_deltas(self):
+        """Gradient check for `T.ComputeDeltas` on a random 10x20 Tensor"""
+        deltas = T.ComputeDeltas()
+        features = torch.rand(10, 20)
+        self.assert_grad(deltas, [features])
+
+    @parameterized.expand([(8000, 8000), (8000, 4000), (4000, 8000)])
+    def test_resample(self, orig_freq, new_freq):
+        """Gradient check for `T.Resample` between the given frequencies"""
+        resampler = T.Resample(orig_freq=orig_freq, new_freq=new_freq)
+        noise = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
+        self.assert_grad(resampler, [noise])
+
+    @parameterized.expand([("linear", ), ("exponential", ), ("logarithmic", ), ("quarter_sine", ), ("half_sine", )])
+    def test_fade(self, fade_shape):
+        """Gradient check for `T.Fade` with the given fade shape"""
+        fader = T.Fade(fade_shape=fade_shape)
+        noise = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
+        self.assert_grad(fader, [noise], nondet_tol=1e-10)
+
+    @parameterized.expand([(T.TimeMasking,), (T.FrequencyMasking,)])
+    def test_masking(self, masking_transform):
+        """Gradient check for a masking transform, seeded identically on each call"""
+        noise = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
+        spec = get_spectrogram(noise, n_fft=400, power=1)
+        seeded = _DeterministicWrapper(masking_transform(400))
+        self.assert_grad(seeded, [spec])
+
+    @parameterized.expand([(T.TimeMasking,), (T.FrequencyMasking,)])
+    def test_masking_iid(self, masking_transform):
+        """Gradient check for a masking transform built with `True` as second argument, on a batch"""
+        rate = 8000
+        n_fft = 400
+        # One spectrogram per seed, so the batch entries differ from each other
+        specs = []
+        for seed in range(3):
+            noise = get_whitenoise(sample_rate=rate, duration=0.05, n_channels=2, seed=seed)
+            specs.append(get_spectrogram(noise, n_fft=n_fft, power=1))
+
+        batch = torch.stack(specs)
+        assert batch.ndim == 4
+        seeded = _DeterministicWrapper(masking_transform(400, True))
+        self.assert_grad(seeded, [batch])
+
+    def test_spectral_centroid(self):
+        sample_rate = 8000
+        transform = T.SpectralCentroid(sample_rate=sample_rate)
+        waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
+        self.assert_grad(transform, [waveform], nondet_tol=1e-10)
+
+    def test_amplitude_to_db(self):
+        sample_rate = 8000
+        transform = T.AmplitudeToDB()
+        waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
+        self.assert_grad(transform, [waveform])
+
+    def test_melscale(self):
+        sample_rate = 8000
+        n_fft = 400
+        n_mels = n_fft // 2 + 1
+        transform = T.MelScale(sample_rate=sample_rate, n_mels=n_mels)
+        spec = get_spectrogram(
+            get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2),
+            n_fft=n_fft, power=1)
+        self.assert_grad(transform, [spec])
+
+    @parameterized.expand([(1.5, "amplitude"), (2, "power"), (10, "db")])
+    def test_vol(self, gain, gain_type):
+        sample_rate = 8000
+        transform = T.Vol(gain=gain, gain_type=gain_type)
+        waveform = get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=2)
+        self.assert_grad(transform, [waveform])
+
+    @parameterized.expand([
+        ({'cmn_window': 100, 'min_cmn_window': 50, 'center': False, 'norm_vars': False}, ),
+        ({'cmn_window': 100, 'min_cmn_window': 50, 'center': True, 'norm_vars': False}, ),
+        ({'cmn_window': 100, 'min_cmn_window': 50, 'center': False, 'norm_vars': True}, ),
+        ({'cmn_window': 100, 'min_cmn_window': 50, 'center': True, 'norm_vars': True}, ),
+    ])
+    def test_sliding_window_cmn(self, kwargs):
+        n_fft = 10
+        power = 1
+        spec = get_spectrogram(
+            get_whitenoise(sample_rate=200, duration=0.05, n_channels=2),
+            n_fft=n_fft, power=power)
+        spec_reshaped = spec.transpose(-1, -2)
+
+        transform = T.SlidingWindowCmn(**kwargs)
+        self.assert_grad(transform, [spec_reshaped])
+
+    @unittest.expectedFailure
+    def test_timestretch_zeros_fail(self):
+        """Test that ``T.TimeStretch`` fails gradcheck at 0
+
+        This is because ``F.phase_vocoder`` converts data from cartesian to polar coordinate,
+        which performs ``atan2(img, real)``, and gradient is not defined at 0.
+        """
+        n_fft = 16
+        transform = T.TimeStretch(n_freq=n_fft // 2 + 1, fixed_rate=0.99)
+        waveform = torch.zeros(2, 40)
+        spectrogram = get_spectrogram(waveform, n_fft=n_fft, power=None)
+        self.assert_grad(transform, [spectrogram])
+
+    @nested_params(
+        [0.7, 0.8, 0.9, 1.0, 1.3],
+        [False, True],
+    )
+    def test_timestretch_non_zero(self, rate, test_pseudo_complex):
+        """Verify that ``T.TimeStretch`` does not fail if it's not close to 0
+
+        ``T.TimeStretch`` is not differentiable around 0, so this test checks the differentiability
+        for cases where input is not zero.
+
+        As tested above, when spectrogram contains values close to zero, the gradients are unstable
+        and gradcheck fails.
+
+        In this test, we generate spectrogram from random signal, then we push the points around
+        zero away from the origin.
+
+        This process does not reflect the real use-case, and it is not practical for users, but
+        this helps us understand to what degree the function is differentiable and when not.
+
+        Args:
+            rate: Value passed to ``T.TimeStretch`` as ``fixed_rate``.
+            test_pseudo_complex: When true, the spectrogram is converted with
+                ``torch.view_as_real`` before being passed to the transform.
+        """
+        n_fft = 16
+        transform = T.TimeStretch(n_freq=n_fft // 2 + 1, fixed_rate=rate)
+        waveform = get_whitenoise(sample_rate=40, duration=1, n_channels=2)
+        spectrogram = get_spectrogram(waveform, n_fft=n_fft, power=None)
+
+        # 1e-3 is too small (on CPU)
+        epsilon = 1e-2
+        # Select the entries whose magnitude is below epsilon ...
+        too_close = spectrogram.abs() < epsilon
+        # ... and rescale each of them to magnitude epsilon (value / |value| * epsilon).
+        spectrogram[too_close] = epsilon * spectrogram[too_close] / spectrogram[too_close].abs()
+        if test_pseudo_complex:
+            spectrogram = torch.view_as_real(spectrogram)
+        self.assert_grad(transform, [spectrogram])
+
+    def test_psd(self):
+        transform = T.PSD()
+        waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
+        spectrogram = get_spectrogram(waveform, n_fft=400)
+        self.assert_grad(transform, [spectrogram])
+
+    @parameterized.expand([
+        [True],
+        [False],
+    ])
+    def test_psd_with_mask(self, multi_mask):
+        transform = T.PSD(multi_mask=multi_mask)
+        waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
+        spectrogram = get_spectrogram(waveform, n_fft=400)
+        if multi_mask:
+            mask = torch.rand(spectrogram.shape[-3:])
+        else:
+            mask = torch.rand(spectrogram.shape[-2:])
+
+        self.assert_grad(transform, [spectrogram, mask])
+
+    @parameterized.expand([
+        "ref_channel",
+        # stv_power test time too long, comment for now
+        # "stv_power",
+        # stv_evd will fail since the eigenvalues are not distinct
+        # "stv_evd",
+    ])
+    def test_mvdr(self, solution):
+        transform = T.MVDR(solution=solution)
+        waveform = get_whitenoise(sample_rate=8000, duration=0.05, n_channels=2)
+        spectrogram = get_spectrogram(waveform, n_fft=400)
+        mask_s = torch.rand(spectrogram.shape[-2:])
+        mask_n = torch.rand(spectrogram.shape[-2:])
+        self.assert_grad(transform, [spectrogram, mask_s, mask_n])
+
+
+class AutogradTestFloat32(TestBaseMixin):
+    def assert_grad(
+            self,
+            transform: torch.nn.Module,
+            inputs: List[torch.Tensor],
+    ):
+        inputs_ = []
+        for i in inputs:
+            if torch.is_tensor(i):
+                i = i.to(dtype=torch.float32, device=self.device)
+            inputs_.append(i)
+        # gradcheck with float32 requires higher atol and epsilon
+        assert gradcheck(transform, inputs, eps=1e-3, atol=1e-3, nondet_tol=0.)
+
+    @parameterized.expand([
+        (rnnt_utils.get_B1_T10_U3_D4_data, ),
+        (rnnt_utils.get_B2_T4_U3_D3_data, ),
+        (rnnt_utils.get_B1_T2_U3_D5_data, ),
+    ])
+    def test_rnnt_loss(self, data_func):
+        def get_data(data_func, device):
+            data = data_func()
+            if type(data) == tuple:
+                data = data[0]
+            return data
+
+        data = get_data(data_func, self.device)
+        inputs = (
+            data["logits"].to(torch.float32),
+            data["targets"],
+            data["logit_lengths"],
+            data["target_lengths"],
+        )
+        loss = T.RNNTLoss(blank=data["blank"])
+
+        self.assert_grad(loss, inputs)
--- a/test/torchaudio_unittest/transforms/batch_consistency_test.py
+++ b/test/torchaudio_unittest/transforms/batch_consistency_test.py
+"""Test numerical consistency among single input and batched input."""
+import torch
+from parameterized import parameterized
+from torchaudio import transforms as T
+
+from torchaudio_unittest import common_utils
+
+
+class TestTransforms(common_utils.TorchaudioTestCase):
+    """Test suite for classes defined in `transforms` module"""
+    backend = 'default'
+
+    def assert_batch_consistency(
+            self, transform, batch, *args, atol=1e-8, rtol=1e-5, seed=42,
+            **kwargs):
+        """Assert that ``transform`` gives the same result per item and on the whole batch.
+
+        The transform is applied to each ``batch[i]`` separately (results stacked)
+        and to the full batch, re-seeding torch's RNG with ``seed`` before each
+        pass. The two results, and the two input clones, are then compared with
+        the given tolerances.
+
+        Args:
+            transform: Callable under test.
+            batch: Input tensor; its first dimension is iterated over.
+            *args: Extra positional arguments forwarded to ``transform``.
+            atol: Absolute tolerance for the comparisons.
+            rtol: Relative tolerance for the comparisons.
+            seed: Seed passed to ``torch.random.manual_seed`` before each pass.
+            **kwargs: Extra keyword arguments forwarded to ``transform``.
+        """
+        n = batch.size(0)
+
+        # Compute items separately, then batch the result
+        torch.random.manual_seed(seed)
+        items_input = batch.clone()
+        items_result = torch.stack([
+            transform(items_input[i], *args, **kwargs) for i in range(n)
+        ])
+
+        # Batch the input and run
+        torch.random.manual_seed(seed)
+        batch_input = batch.clone()
+        batch_result = transform(batch_input, *args, **kwargs)
+
+        # The input clones are compared as well as the results.
+        self.assertEqual(items_input, batch_input, rtol=rtol, atol=atol)
+        self.assertEqual(items_result, batch_result, rtol=rtol, atol=atol)
+
+    def test_batch_AmplitudeToDB(self):
+        """Batch consistency of ``T.AmplitudeToDB`` on a random 4-D tensor."""
+        spec = torch.rand((3, 2, 6, 201))
+        transform = T.AmplitudeToDB()
+
+        self.assert_batch_consistency(transform, spec)
+
+    def test_batch_Resample(self):
+        """Batch consistency of ``T.Resample`` with default arguments."""
+        waveform = torch.randn(3, 2, 2786)
+        transform = T.Resample()
+
+        self.assert_batch_consistency(transform, waveform)
+
+    def test_batch_MelScale(self):
+        """Batch consistency of ``T.MelScale`` with default arguments."""
+        specgram = torch.randn(3, 2, 201, 256)
+        transform = T.MelScale()
+
+        self.assert_batch_consistency(transform, specgram)
+
+    def test_batch_InverseMelScale(self):
+        """Batch consistency of ``T.InverseMelScale`` with a relaxed absolute tolerance."""
+        n_mels = 32
+        n_stft = 5
+        # Squaring makes the random input non-negative.
+        mel_spec = torch.randn(3, 2, n_mels, 32) ** 2
+        transform = T.InverseMelScale(n_stft, n_mels)
+
+        # Because InverseMelScale runs SGD on randomly initialized values so they do not yield
+        # exactly same result. For this reason, tolerance is very relaxed here.
+        self.assert_batch_consistency(transform, mel_spec, atol=1.0, rtol=1e-5)
+
+    def test_batch_compute_deltas(self):
+        """Batch consistency of ``T.ComputeDeltas``."""
+        specgram = torch.randn(3, 2, 31, 2786)
+        transform = T.ComputeDeltas()
+
+        self.assert_batch_consistency(transform, specgram)
+
+    def test_batch_mulaw(self):
+        """Batch consistency of ``T.MuLawEncoding`` followed by ``T.MuLawDecoding``."""
+        waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
+        waveform = waveform.reshape(3, 2, -1)
+
+        # Single then transform then batch
+        expected = [T.MuLawEncoding()(waveform[i]) for i in range(3)]
+        expected = torch.stack(expected)
+
+        # Batch then transform
+        computed = T.MuLawEncoding()(waveform)
+
+        # NOTE(review): output presumably keeps the (3, 2, time) shape of the input - TODO confirm
+        self.assertEqual(computed, expected)
+
+        # Single then transform then batch
+        expected_decoded = [T.MuLawDecoding()(expected[i]) for i in range(3)]
+        expected_decoded = torch.stack(expected_decoded)
+
+        # Batch then transform
+        computed_decoded = T.MuLawDecoding()(computed)
+
+        # NOTE(review): output presumably keeps the (3, 2, time) shape of the input - TODO confirm
+        self.assertEqual(computed_decoded, expected_decoded)
+
+    def test_batch_spectrogram(self):
+        """Batch consistency of ``T.Spectrogram`` with default arguments."""
+        waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
+        waveform = waveform.reshape(3, 2, -1)
+        transform = T.Spectrogram()
+
+        self.assert_batch_consistency(transform, waveform)
+
+    def test_batch_inverse_spectrogram(self):
+        """Batch consistency of ``T.InverseSpectrogram``."""
+        waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
+        specgram = common_utils.get_spectrogram(waveform, n_fft=400)
+        # Split the leading dimension of size 6 into (3, 2).
+        specgram = specgram.reshape(3, 2, specgram.shape[-2], specgram.shape[-1])
+        transform = T.InverseSpectrogram(n_fft=400)
+
+        self.assert_batch_consistency(transform, specgram)
+
+    def test_batch_melspectrogram(self):
+        """Batch consistency of ``T.MelSpectrogram`` with default arguments."""
+        waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
+        waveform = waveform.reshape(3, 2, -1)
+        transform = T.MelSpectrogram()
+
+        self.assert_batch_consistency(transform, waveform)
+
+    def test_batch_mfcc(self):
+        """Batch consistency of ``T.MFCC`` with ``atol=1e-4``."""
+        waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
+        waveform = waveform.reshape(3, 2, -1)
+        transform = T.MFCC()
+
+        self.assert_batch_consistency(transform, waveform, atol=1e-4, rtol=1e-5)
+
+    def test_batch_lfcc(self):
+        """Batch consistency of ``T.LFCC`` with ``atol=1e-4``."""
+        waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
+        waveform = waveform.reshape(3, 2, -1)
+        transform = T.LFCC()
+
+        self.assert_batch_consistency(transform, waveform, atol=1e-4, rtol=1e-5)
+
+    @parameterized.expand([(True, ), (False, )])
+    def test_batch_TimeStretch(self, test_pseudo_complex):
+        """Batch consistency of ``T.TimeStretch`` on complex or ``view_as_real`` input."""
+        rate = 2
+        num_freq = 1025
+        num_frames = 400
+        batch = 3
+
+        spec = torch.randn(batch, num_freq, num_frames, dtype=torch.complex64)
+        if test_pseudo_complex:
+            spec = torch.view_as_real(spec)
+
+        transform = T.TimeStretch(
+            fixed_rate=rate,
+            n_freq=num_freq,
+            hop_length=512
+        )
+
+        self.assert_batch_consistency(transform, spec, atol=1e-5, rtol=1e-5)
+
+    def test_batch_Fade(self):
+        """Batch consistency of ``T.Fade`` with 3000-sample fade-in and fade-out."""
+        waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
+        waveform = waveform.reshape(3, 2, -1)
+        fade_in_len = 3000
+        fade_out_len = 3000
+        transform = T.Fade(fade_in_len, fade_out_len)
+
+        self.assert_batch_consistency(transform, waveform)
+
+    def test_batch_Vol(self):
+        """Batch consistency of ``T.Vol`` with ``gain=1.1``."""
+        waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
+        waveform = waveform.reshape(3, 2, -1)
+        transform = T.Vol(gain=1.1)
+
+        self.assert_batch_consistency(transform, waveform)
+
+    def test_batch_spectral_centroid(self):
+        """Batch consistency of ``T.SpectralCentroid``."""
+        sample_rate = 44100
+        waveform = common_utils.get_whitenoise(sample_rate=sample_rate, n_channels=6)
+        waveform = waveform.reshape(3, 2, -1)
+        transform = T.SpectralCentroid(sample_rate)
+
+        self.assert_batch_consistency(transform, waveform)
+
+    def test_batch_pitch_shift(self):
+        """Batch consistency of ``T.PitchShift`` with ``n_steps=-2``."""
+        sample_rate = 8000
+        n_steps = -2
+        waveform = common_utils.get_whitenoise(sample_rate=sample_rate, duration=0.05, n_channels=6)
+        waveform = waveform.reshape(3, 2, -1)
+        transform = T.PitchShift(sample_rate, n_steps, n_fft=400)
+
+        self.assert_batch_consistency(transform, waveform)
+
+    def test_batch_PSD(self):
+        """Batch consistency of ``T.PSD`` without a mask."""
+        waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
+        specgram = common_utils.get_spectrogram(waveform, n_fft=400)
+        specgram = specgram.reshape(3, 2, specgram.shape[-2], specgram.shape[-1])
+        transform = T.PSD()
+
+        self.assert_batch_consistency(transform, specgram)
+
+    def test_batch_PSD_with_mask(self):
+        """Batch consistency of ``T.PSD`` when a per-item mask is supplied."""
+        waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
+        waveform = waveform.to(torch.double)
+        specgram = common_utils.get_spectrogram(waveform, n_fft=400)
+        specgram = specgram.reshape(3, 2, specgram.shape[-2], specgram.shape[-1])
+        # One mask per batch item, matching the last two spectrogram dimensions.
+        mask = torch.rand((3, specgram.shape[-2], specgram.shape[-1]))
+        transform = T.PSD()
+
+        # Single then transform then batch
+        expected = [transform(specgram[i], mask[i]) for i in range(3)]
+        expected = torch.stack(expected)
+
+        # Batch then transform
+        computed = transform(specgram, mask)
+
+        self.assertEqual(computed, expected)
+
+    @parameterized.expand([
+        [True],
+        [False],
+    ])
+    def test_MVDR(self, multi_mask):
+        """Batch consistency of ``T.MVDR`` with 4-D masks (``multi_mask``) or 3-D masks."""
+        waveform = common_utils.get_whitenoise(sample_rate=8000, duration=1, n_channels=6)
+        waveform = waveform.to(torch.double)
+        specgram = common_utils.get_spectrogram(waveform, n_fft=400)
+        specgram = specgram.reshape(3, 2, specgram.shape[-2], specgram.shape[-1])
+        if multi_mask:
+            mask_s = torch.rand((3, 2, specgram.shape[-2], specgram.shape[-1]))
+            mask_n = torch.rand((3, 2, specgram.shape[-2], specgram.shape[-1]))
+        else:
+            mask_s = torch.rand((3, specgram.shape[-2], specgram.shape[-1]))
+            mask_n = torch.rand((3, specgram.shape[-2], specgram.shape[-1]))
+        transform = T.MVDR(multi_mask=multi_mask)
+
+        # Single then transform then batch
+        expected = [transform(specgram[i], mask_s[i], mask_n[i]) for i in range(3)]
+        expected = torch.stack(expected)
+
+        # Batch then transform
+        computed = transform(specgram, mask_s, mask_n)
+
+        self.assertEqual(computed, expected)
--- a/test/torchaudio_unittest/transforms/kaldi_compatibility_cpu_test.py
+++ b/test/torchaudio_unittest/transforms/kaldi_compatibility_cpu_test.py
+import torch
+
+from torchaudio_unittest import common_utils
+from .kaldi_compatibility_impl import Kaldi
+
+
+class TestKaldiFloat32(Kaldi, common_utils.PytorchTestCase):
+    """Runs the ``Kaldi`` test mixin with float32 tensors on the CPU device."""
+    dtype = torch.float32
+    device = torch.device('cpu')
+
+
+class TestKaldiFloat64(Kaldi, common_utils.PytorchTestCase):
+    """Runs the ``Kaldi`` test mixin with float64 tensors on the CPU device."""
+    dtype = torch.float64
+    device = torch.device('cpu')
--- a/test/torchaudio_unittest/transforms/kaldi_compatibility_cuda_test.py
+++ b/test/torchaudio_unittest/transforms/kaldi_compatibility_cuda_test.py
+import torch
+
+from torchaudio_unittest import common_utils
+from .kaldi_compatibility_impl import Kaldi
+
+
+@common_utils.skipIfNoCuda
+class TestKaldiFloat32(Kaldi, common_utils.PytorchTestCase):
+    """Runs the ``Kaldi`` test mixin with float32 tensors on the CUDA device."""
+    dtype = torch.float32
+    device = torch.device('cuda')
+
+
+@common_utils.skipIfNoCuda
+class TestKaldiFloat64(Kaldi, common_utils.PytorchTestCase):
+    """Runs the ``Kaldi`` test mixin with float64 tensors on the CUDA device."""
+    dtype = torch.float64
+    device = torch.device('cuda')
--- a/test/torchaudio_unittest/transforms/kaldi_compatibility_impl.py
+++ b/test/torchaudio_unittest/transforms/kaldi_compatibility_impl.py
+"""Test suites for checking numerical compatibility against Kaldi"""
+import torchaudio.compliance.kaldi
+from parameterized import parameterized
+
+from torchaudio_unittest.common_utils import (
+    TestBaseMixin,
+    TempDirMixin,
+    load_params,
+    skipIfNoExec,
+    get_asset_path,
+    load_wav,
+)
+from torchaudio_unittest.common_utils.kaldi_utils import (
+    convert_args,
+    run_kaldi,
+)
+
+
+class Kaldi(TempDirMixin, TestBaseMixin):
+    def assert_equal(self, output, *, expected, rtol=None, atol=None):
+        expected = expected.to(dtype=self.dtype, device=self.device)
+        self.assertEqual(output, expected, rtol=rtol, atol=atol)
+
+    @parameterized.expand(load_params('kaldi_test_fbank_args.jsonl'))
+    @skipIfNoExec('compute-fbank-feats')
+    def test_fbank(self, kwargs):
+        """fbank should be numerically compatible with compute-fbank-feats"""
+        wave_file = get_asset_path('kaldi_file.wav')
+        waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
+        result = torchaudio.compliance.kaldi.fbank(waveform, **kwargs)
+        command = ['compute-fbank-feats'] + convert_args(**kwargs) + ['scp:-', 'ark:-']
+        kaldi_result = run_kaldi(command, 'scp', wave_file)
+        self.assert_equal(result, expected=kaldi_result, rtol=1e-4, atol=1e-8)
+
+    @parameterized.expand(load_params('kaldi_test_spectrogram_args.jsonl'))
+    @skipIfNoExec('compute-spectrogram-feats')
+    def test_spectrogram(self, kwargs):
+        """spectrogram should be numerically compatible with compute-spectrogram-feats"""
+        wave_file = get_asset_path('kaldi_file.wav')
+        waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
+        result = torchaudio.compliance.kaldi.spectrogram(waveform, **kwargs)
+        command = ['compute-spectrogram-feats'] + convert_args(**kwargs) + ['scp:-', 'ark:-']
+        kaldi_result = run_kaldi(command, 'scp', wave_file)
+        self.assert_equal(result, expected=kaldi_result, rtol=1e-4, atol=1e-8)
+
+    @parameterized.expand(load_params('kaldi_test_mfcc_args.jsonl'))
+    @skipIfNoExec('compute-mfcc-feats')
+    def test_mfcc(self, kwargs):
+        """mfcc should be numerically compatible with compute-mfcc-feats"""
+        wave_file = get_asset_path('kaldi_file.wav')
+        waveform = load_wav(wave_file, normalize=False)[0].to(dtype=self.dtype, device=self.device)
+        result = torchaudio.compliance.kaldi.mfcc(waveform, **kwargs)
+        command = ['compute-mfcc-feats'] + convert_args(**kwargs) + ['scp:-', 'ark:-']
+        kaldi_result = run_kaldi(command, 'scp', wave_file)
+        self.assert_equal(result, expected=kaldi_result, rtol=1e-4, atol=1e-8)
--- a/test/torchaudio_unittest/transforms/librosa_compatibility_cpu_test.py
+++ b/test/torchaudio_unittest/transforms/librosa_compatibility_cpu_test.py
+import torch
+
+from torchaudio_unittest.common_utils import PytorchTestCase
+from .librosa_compatibility_test_impl import TransformsTestBase
+
+
+class TestTransforms(TransformsTestBase, PytorchTestCase):
+    """Runs ``TransformsTestBase`` with float64 tensors on the CPU device."""
+    dtype = torch.float64
+    device = torch.device('cpu')
--- a/test/torchaudio_unittest/transforms/librosa_compatibility_cuda_test.py
+++ b/test/torchaudio_unittest/transforms/librosa_compatibility_cuda_test.py
+import torch
+
+from torchaudio_unittest.common_utils import PytorchTestCase, skipIfNoCuda
+from .librosa_compatibility_test_impl import TransformsTestBase
+
+
+@skipIfNoCuda
+class TestTransforms(TransformsTestBase, PytorchTestCase):
+    """Runs ``TransformsTestBase`` with float64 tensors on the CUDA device."""
+    dtype = torch.float64
+    device = torch.device('cuda')
--- a/test/torchaudio_unittest/transforms/librosa_compatibility_test_impl.py
+++ b/test/torchaudio_unittest/transforms/librosa_compatibility_test_impl.py
+import unittest
+
+import torch
+import torchaudio.transforms as T
+from torchaudio._internal.module_utils import is_module_available
+from parameterized import param, parameterized
+
+from torchaudio_unittest.common_utils import (
+    TestBaseMixin,
+    get_whitenoise,
+    get_sinusoid,
+    get_spectrogram,
+    nested_params,
+)
+
+LIBROSA_AVAILABLE = is_module_available('librosa')
+
+if LIBROSA_AVAILABLE:
+    import librosa
+
+
+@unittest.skipIf(not LIBROSA_AVAILABLE, "Librosa not available")
+class TransformsTestBase(TestBaseMixin):
+    @parameterized.expand([
+        param(n_fft=400, hop_length=200, power=2.0),
+        param(n_fft=600, hop_length=100, power=2.0),
+        param(n_fft=400, hop_length=200, power=3.0),
+        param(n_fft=200, hop_length=50, power=2.0),
+    ])
+    def test_Spectrogram(self, n_fft, hop_length, power):
+        sample_rate = 16000
+        waveform = get_whitenoise(
+            sample_rate=sample_rate, n_channels=1,
+        ).to(self.device, self.dtype)
+
+        expected = librosa.core.spectrum._spectrogram(
+            y=waveform[0].cpu().numpy(),
+            n_fft=n_fft, hop_length=hop_length, power=power)[0]
+
+        result = T.Spectrogram(
+            n_fft=n_fft, hop_length=hop_length, power=power,
+        ).to(self.device, self.dtype)(waveform)[0]
+        self.assertEqual(result, torch.from_numpy(expected), atol=1e-5, rtol=1e-5)
+
+    def test_Spectrogram_complex(self):
+        n_fft = 400
+        hop_length = 200
+        sample_rate = 16000
+        waveform = get_whitenoise(
+            sample_rate=sample_rate, n_channels=1,
+        ).to(self.device, self.dtype)
+
+        expected = librosa.core.spectrum._spectrogram(
+            y=waveform[0].cpu().numpy(),
+            n_fft=n_fft, hop_length=hop_length, power=1)[0]
+
+        result = T.Spectrogram(
+            n_fft=n_fft, hop_length=hop_length, power=None, return_complex=True,
+        ).to(self.device, self.dtype)(waveform)[0]
+        self.assertEqual(result.abs(), torch.from_numpy(expected), atol=1e-5, rtol=1e-5)
+
+    @nested_params(
+        [
+            param(n_fft=400, hop_length=200, n_mels=64),
+            param(n_fft=600, hop_length=100, n_mels=128),
+            param(n_fft=200, hop_length=50, n_mels=32),
+        ],
+        [param(norm=norm) for norm in [None, 'slaney']],
+        [param(mel_scale=mel_scale) for mel_scale in ['htk', 'slaney']],
+    )
+    def test_MelSpectrogram(self, n_fft, hop_length, n_mels, norm, mel_scale):
+        sample_rate = 16000
+        waveform = get_sinusoid(
+            sample_rate=sample_rate, n_channels=1,
+        ).to(self.device, self.dtype)
+
+        expected = librosa.feature.melspectrogram(
+            y=waveform[0].cpu().numpy(),
+            sr=sample_rate, n_fft=n_fft,
+            hop_length=hop_length, n_mels=n_mels, norm=norm,
+            htk=mel_scale == "htk")
+        result = T.MelSpectrogram(
+            sample_rate=sample_rate, window_fn=torch.hann_window,
+            hop_length=hop_length, n_mels=n_mels,
+            n_fft=n_fft, norm=norm, mel_scale=mel_scale,
+        ).to(self.device, self.dtype)(waveform)[0]
+        self.assertEqual(result, torch.from_numpy(expected), atol=5e-4, rtol=1e-5)
+
+    def test_magnitude_to_db(self):
+        spectrogram = get_spectrogram(
+            get_whitenoise(), n_fft=400, power=2).to(self.device, self.dtype)
+        result = T.AmplitudeToDB('magnitude', 80.).to(self.device, self.dtype)(spectrogram)[0]
+        expected = librosa.core.spectrum.amplitude_to_db(spectrogram[0].cpu().numpy())
+        self.assertEqual(result, torch.from_numpy(expected))
+
+    def test_power_to_db(self):
+        spectrogram = get_spectrogram(
+            get_whitenoise(), n_fft=400, power=2).to(self.device, self.dtype)
+        result = T.AmplitudeToDB('power', 80.).to(self.device, self.dtype)(spectrogram)[0]
+        expected = librosa.core.spectrum.power_to_db(spectrogram[0].cpu().numpy())
+        self.assertEqual(result, torch.from_numpy(expected))
+
+    @nested_params([
+        param(n_fft=400, hop_length=200, n_mels=64, n_mfcc=40),
+        param(n_fft=600, hop_length=100, n_mels=128, n_mfcc=20),
+        param(n_fft=200, hop_length=50, n_mels=32, n_mfcc=25),
+    ])
+    def test_mfcc(self, n_fft, hop_length, n_mels, n_mfcc):
+        sample_rate = 16000
+        waveform = get_whitenoise(
+            sample_rate=sample_rate, n_channels=1).to(self.device, self.dtype)
+        result = T.MFCC(
+            sample_rate=sample_rate, n_mfcc=n_mfcc, norm='ortho',
+            melkwargs={'hop_length': hop_length, 'n_fft': n_fft, 'n_mels': n_mels},
+        ).to(self.device, self.dtype)(waveform)[0]
+
+        melspec = librosa.feature.melspectrogram(
+            y=waveform[0].cpu().numpy(), sr=sample_rate, n_fft=n_fft,
+            win_length=n_fft, hop_length=hop_length,
+            n_mels=n_mels, htk=True, norm=None)
+        expected = librosa.feature.mfcc(
+            S=librosa.core.spectrum.power_to_db(melspec),
+            n_mfcc=n_mfcc, dct_type=2, norm='ortho')
+        self.assertEqual(result, torch.from_numpy(expected), atol=5e-4, rtol=1e-5)
+
+    @parameterized.expand([
+        param(n_fft=400, hop_length=200),
+        param(n_fft=600, hop_length=100),
+        param(n_fft=200, hop_length=50),
+    ])
+    def test_spectral_centroid(self, n_fft, hop_length):
+        sample_rate = 16000
+        waveform = get_whitenoise(
+            sample_rate=sample_rate, n_channels=1).to(self.device, self.dtype)
+
+        result = T.SpectralCentroid(
+            sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length,
+        ).to(self.device, self.dtype)(waveform)
+        expected = librosa.feature.spectral_centroid(
+            y=waveform[0].cpu().numpy(), sr=sample_rate, n_fft=n_fft, hop_length=hop_length)
+        self.assertEqual(result, torch.from_numpy(expected), atol=5e-4, rtol=1e-5)
--- a/test/torchaudio_unittest/transforms/sox_compatibility_test.py
+++ b/test/torchaudio_unittest/transforms/sox_compatibility_test.py
+import warnings
+
+import torch
+import torchaudio.transforms as T
+from parameterized import parameterized
+
+from torchaudio_unittest.common_utils import (
+    skipIfNoSox,
+    skipIfNoExec,
+    TempDirMixin,
+    TorchaudioTestCase,
+    get_asset_path,
+    sox_utils,
+    load_wav,
+    save_wav,
+    get_whitenoise,
+)
+
+
+@skipIfNoSox
+@skipIfNoExec('sox')
+class TestFunctionalFiltering(TempDirMixin, TorchaudioTestCase):
+    def run_sox_effect(self, input_file, effect):
+        output_file = self.get_temp_path('expected.wav')
+        sox_utils.run_sox_effect(input_file, output_file, [str(e) for e in effect])
+        return load_wav(output_file)
+
+    def assert_sox_effect(self, result, input_path, effects, atol=1e-04, rtol=1e-5):
+        expected, _ = self.run_sox_effect(input_path, effects)
+        self.assertEqual(result, expected, atol=atol, rtol=rtol)
+
+    def get_whitenoise(self, sample_rate=8000):
+        noise = get_whitenoise(
+            sample_rate=sample_rate, duration=3, scale_factor=0.9,
+        )
+        path = self.get_temp_path("whitenoise.wav")
+        save_wav(path, noise, sample_rate)
+        return noise, path
+
+    @parameterized.expand([
+        ('q', 'quarter_sine'),
+        ('h', 'half_sine'),
+        ('t', 'linear'),
+    ])
+    def test_fade(self, fade_shape_sox, fade_shape):
+        fade_in_len, fade_out_len = 44100, 44100
+        data, path = self.get_whitenoise(sample_rate=44100)
+        result = T.Fade(fade_in_len, fade_out_len, fade_shape)(data)
+        self.assert_sox_effect(result, path, ['fade', fade_shape_sox, '1', '0', '1'])
+
+    @parameterized.expand([
+        ('amplitude', 1.1),
+        ('db', 2),
+        ('power', 2),
+    ])
+    def test_vol(self, gain_type, gain):
+        data, path = self.get_whitenoise()
+        result = T.Vol(gain, gain_type)(data)
+        self.assert_sox_effect(result, path, ['vol', f'{gain}', gain_type])
+
+    @parameterized.expand(['vad-go-stereo-44100.wav', 'vad-go-mono-32000.wav'])
+    def test_vad(self, filename):
+        path = get_asset_path(filename)
+        data, sample_rate = load_wav(path)
+        result = T.Vad(sample_rate)(data)
+        self.assert_sox_effect(result, path, ['vad'])
+
+    def test_vad_warning(self):
+        """vad should throw a warning if input dimension is greater than 2"""
+        sample_rate = 41100
+
+        data = torch.rand(5, 5, sample_rate)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            T.Vad(sample_rate)(data)
+        assert len(w) == 1
+
+        data = torch.rand(5, sample_rate)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            T.Vad(sample_rate)(data)
+        assert len(w) == 0
+
+        data = torch.rand(sample_rate)
+        with warnings.catch_warnings(record=True) as w:
+            warnings.simplefilter("always")
+            T.Vad(sample_rate)(data)
+        assert len(w) == 0
--- a/test/torchaudio_unittest/transforms/torchscript_consistency_cpu_test.py
+++ b/test/torchaudio_unittest/transforms/torchscript_consistency_cpu_test.py
+import torch
+
+from torchaudio_unittest.common_utils import PytorchTestCase
+from .torchscript_consistency_impl import Transforms, TransformsFloat32Only, TransformsFloat64Only
+
+
+class TestTransformsFloat32(Transforms, TransformsFloat32Only, PytorchTestCase):
+    """Runs ``Transforms`` and ``TransformsFloat32Only`` with float32 on the CPU device."""
+    dtype = torch.float32
+    device = torch.device('cpu')
+
+
+class TestTransformsFloat64(Transforms, TransformsFloat64Only, PytorchTestCase):
+    """Runs ``Transforms`` and ``TransformsFloat64Only`` with float64 on the CPU device."""
+    dtype = torch.float64
+    device = torch.device('cpu')
--- a/test/torchaudio_unittest/transforms/torchscript_consistency_cuda_test.py
+++ b/test/torchaudio_unittest/transforms/torchscript_consistency_cuda_test.py
+import torch
+
+from torchaudio_unittest.common_utils import skipIfNoCuda, PytorchTestCase
+from .torchscript_consistency_impl import Transforms, TransformsFloat32Only, TransformsFloat64Only
+
+
+@skipIfNoCuda
+class TestTransformsFloat32(Transforms, TransformsFloat32Only, PytorchTestCase):
+    """Runs ``Transforms`` and ``TransformsFloat32Only`` with float32 on the CUDA device."""
+    dtype = torch.float32
+    device = torch.device('cuda')
+
+
+@skipIfNoCuda
+class TestTransformsFloat64(Transforms, TransformsFloat64Only, PytorchTestCase):
+    """Runs ``Transforms`` and ``TransformsFloat64Only`` with float64 on the CUDA device."""
+    dtype = torch.float64
+    device = torch.device('cuda')
--- a/test/torchaudio_unittest/transforms/torchscript_consistency_impl.py
+++ b/test/torchaudio_unittest/transforms/torchscript_consistency_impl.py
+"""Test suites for jit-ability and its numerical compatibility"""
+
+import torch
+import torchaudio.transforms as T
+from parameterized import parameterized
+
+from torchaudio_unittest import common_utils
+from torchaudio_unittest.common_utils import (
+    skipIfRocm,
+    TestBaseMixin,
+    torch_script,
+)
+
+
+class Transforms(TestBaseMixin):
+    """Scripted-vs-eager consistency tests for transforms, parameterized by device and dtype"""
+    def _assert_consistency(self, transform, tensor, *args):
+        """Assert that the TorchScript version of ``transform`` matches its eager output.
+
+        ``tensor`` and ``transform`` are both converted to ``self.device`` / ``self.dtype``;
+        ``args`` are forwarded unchanged to the eager call and to the scripted call.
+        """
+        placement = dict(device=self.device, dtype=self.dtype)
+        tensor = tensor.to(**placement)
+        transform = transform.to(**placement)
+
+        scripted = torch_script(transform)
+
+        # Eager call first, scripted call second.
+        expected = transform(tensor, *args)
+        actual = scripted(tensor, *args)
+        self.assertEqual(actual, expected)
+
+    def _assert_consistency_complex(self, transform, tensor, test_pseudo_complex=False, *args):
+        """Complex-input counterpart of the scripted-vs-eager check.
+
+        ``tensor`` must be complex and is converted to ``self.complex_dtype`` on ``self.device``.
+        With ``test_pseudo_complex`` set, the transform receives the ``torch.view_as_real``
+        form of the input instead of the complex tensor itself.
+        """
+        assert tensor.is_complex()
+        tensor = tensor.to(device=self.device, dtype=self.complex_dtype)
+        transform = transform.to(device=self.device, dtype=self.dtype)
+
+        scripted = torch_script(transform)
+
+        model_input = torch.view_as_real(tensor) if test_pseudo_complex else tensor
+        expected = transform(model_input, *args)
+        actual = scripted(model_input, *args)
+        self.assertEqual(actual, expected)
+
+    def test_Spectrogram(self):
+        # Default-configured spectrogram on a random (1, 1000) input.
+        waveform = torch.rand((1, 1000))
+        transform = T.Spectrogram()
+        self._assert_consistency(transform, waveform)
+
+    def test_Spectrogram_return_complex(self):
+        # Same input shape, but with power disabled and complex output requested.
+        waveform = torch.rand((1, 1000))
+        transform = T.Spectrogram(power=None, return_complex=True)
+        self._assert_consistency(transform, waveform)
+
+    def test_InverseSpectrogram(self):
+        # Spectrogram from the helper is passed through the complex-input check.
+        n_fft, hop_length = 400, 100
+        noise = common_utils.get_whitenoise(sample_rate=8000)
+        specgram = common_utils.get_spectrogram(noise, n_fft=n_fft, hop_length=hop_length)
+        transform = T.InverseSpectrogram(n_fft=n_fft, hop_length=hop_length)
+        self._assert_consistency_complex(transform, specgram)
+
+    def test_InverseSpectrogram_pseudocomplex(self):
+        # The spectrogram is converted with view_as_real up front and goes through the real-input check.
+        n_fft, hop_length = 400, 100
+        noise = common_utils.get_whitenoise(sample_rate=8000)
+        pseudo_complex = torch.view_as_real(
+            common_utils.get_spectrogram(noise, n_fft=n_fft, hop_length=hop_length))
+        transform = T.InverseSpectrogram(n_fft=n_fft, hop_length=hop_length)
+        self._assert_consistency(transform, pseudo_complex)
+
+    @skipIfRocm
+    def test_GriffinLim(self):
+        # rand_init is turned off for this comparison.
+        specgram = torch.rand((1, 201, 6))
+        transform = T.GriffinLim(length=1000, rand_init=False)
+        self._assert_consistency(transform, specgram)
+
+    def test_AmplitudeToDB(self):
+        amplitudes = torch.rand((6, 201))
+        transform = T.AmplitudeToDB()
+        self._assert_consistency(transform, amplitudes)
+
+    def test_MelScale(self):
+        specgram = torch.rand((1, 201, 6))
+        transform = T.MelScale(n_stft=201)
+        self._assert_consistency(transform, specgram)
+
+    def test_MelSpectrogram(self):
+        waveform = torch.rand((1, 1000))
+        transform = T.MelSpectrogram()
+        self._assert_consistency(transform, waveform)
+
+    def test_MFCC(self):
+        waveform = torch.rand((1, 1000))
+        transform = T.MFCC()
+        self._assert_consistency(transform, waveform)
+
+    def test_LFCC(self):
+        waveform = torch.rand((1, 1000))
+        transform = T.LFCC()
+        self._assert_consistency(transform, waveform)
+
+    def test_Resample(self):
+        # Resample from 16000 to 8000.
+        source_rate = 16000
+        target_rate = 8000
+        noise = common_utils.get_whitenoise(sample_rate=source_rate)
+        transform = T.Resample(source_rate, target_rate)
+        self._assert_consistency(transform, noise)
+
+    def test_ComplexNorm(self):
+        pseudo_complex = torch.rand((1, 2, 201, 2))
+        transform = T.ComplexNorm()
+        self._assert_consistency(transform, pseudo_complex)
+
+    def test_MuLawEncoding(self):
+        noise = common_utils.get_whitenoise()
+        transform = T.MuLawEncoding()
+        self._assert_consistency(transform, noise)
+
+    def test_MuLawDecoding(self):
+        encoded = torch.rand((1, 10))
+        transform = T.MuLawDecoding()
+        self._assert_consistency(transform, encoded)
+
+    def test_Fade(self):
+        # Fade-in and fade-out use the same length.
+        fade_len = 3000
+        noise = common_utils.get_whitenoise()
+        transform = T.Fade(fade_len, fade_len)
+        self._assert_consistency(transform, noise)
+
+    def test_FrequencyMasking(self):
+        specgram = torch.rand((10, 2, 50, 10, 2))
+        transform = T.FrequencyMasking(freq_mask_param=60, iid_masks=False)
+        self._assert_consistency(transform, specgram)
+
+    def test_TimeMasking(self):
+        specgram = torch.rand((10, 2, 50, 10, 2))
+        transform = T.TimeMasking(time_mask_param=30, iid_masks=False)
+        self._assert_consistency(transform, specgram)
+
+    def test_Vol(self):
+        noise = common_utils.get_whitenoise()
+        transform = T.Vol(1.1)
+        self._assert_consistency(transform, noise)
+
+    def test_SlidingWindowCmn(self):
+        features = torch.rand((1000, 10))
+        transform = T.SlidingWindowCmn()
+        self._assert_consistency(transform, features)
+
+    def test_Vad(self):
+        # Uses a wav asset loaded through the test helpers; its sample rate configures the transform.
+        asset = common_utils.get_asset_path("vad-go-mono-32000.wav")
+        audio, rate = common_utils.load_wav(asset)
+        transform = T.Vad(sample_rate=rate)
+        self._assert_consistency(transform, audio)
+
+    def test_SpectralCentroid(self):
+        rate = 44100
+        noise = common_utils.get_whitenoise(sample_rate=rate)
+        transform = T.SpectralCentroid(sample_rate=rate)
+        self._assert_consistency(transform, noise)
+
+    @parameterized.expand([(True, ), (False, )])
+    def test_TimeStretch(self, test_pseudo_complex):
+        # Exercised once with the pseudo-complex (view_as_real) input and once with the complex input.
+        n_freq, hop_length, fixed_rate = 400, 512, 1.3
+        complex_spec = torch.view_as_complex(torch.rand((10, 2, n_freq, 10, 2)))
+        transform = T.TimeStretch(n_freq=n_freq, hop_length=hop_length, fixed_rate=fixed_rate)
+        self._assert_consistency_complex(transform, complex_spec, test_pseudo_complex)
+
+    def test_PitchShift(self):
+        rate, steps = 8000, 4
+        noise = common_utils.get_whitenoise(sample_rate=rate)
+        transform = T.PitchShift(sample_rate=rate, n_steps=steps)
+        self._assert_consistency(transform, noise)
+
+    def test_PSD(self):
+        # Four-channel noise -> spectrogram, moved to the test device before the check.
+        noise = common_utils.get_whitenoise(sample_rate=8000, n_channels=4)
+        specgram = common_utils.get_spectrogram(noise, n_fft=400, hop_length=100).to(self.device)
+        transform = T.PSD()
+        self._assert_consistency_complex(transform, specgram)
+
+    def test_PSD_with_mask(self):
+        # As test_PSD, with a random mask shaped like the last two spectrogram dimensions.
+        noise = common_utils.get_whitenoise(sample_rate=8000, n_channels=4)
+        specgram = common_utils.get_spectrogram(noise, n_fft=400, hop_length=100).to(self.device)
+        mask = torch.rand(specgram.shape[-2:], device=self.device)
+        transform = T.PSD()
+        self._assert_consistency_complex(transform, specgram, False, mask)
+
+
+class TransformsFloat32Only(TestBaseMixin):
+    """Consistency tests kept separate from ``Transforms`` for float32-only use.
+
+    ``_assert_consistency`` is not defined here; it has to be provided by another base
+    of the concrete test class.
+    """
+    def test_rnnt_loss(self):
+        # Hand-written logits, nested as 1 x 2 x 3 x 5.
+        logits = torch.tensor([[
+            [[0.1, 0.6, 0.1, 0.1, 0.1],
+             [0.1, 0.1, 0.6, 0.1, 0.1],
+             [0.1, 0.1, 0.2, 0.8, 0.1]],
+            [[0.1, 0.6, 0.1, 0.1, 0.1],
+             [0.1, 0.1, 0.2, 0.1, 0.1],
+             [0.7, 0.1, 0.2, 0.1, 0.1]],
+        ]])
+        # The float32 copy is only used to resolve the device for the integer tensors below;
+        # `logits` itself is handed to the consistency check as created.
+        resolved_device = logits.to(device=self.device, dtype=torch.float32).device
+        int32_on_device = dict(device=resolved_device, dtype=torch.int32)
+        targets = torch.tensor([[1, 2]], **int32_on_device)
+        logit_lengths = torch.tensor([2], **int32_on_device)
+        target_lengths = torch.tensor([2], **int32_on_device)
+
+        loss = T.RNNTLoss()
+        self._assert_consistency(loss, logits, targets, logit_lengths, target_lengths)
+
+
+class TransformsFloat64Only(TestBaseMixin):
+    """Consistency tests kept separate from ``Transforms`` for float64-only use.
+
+    ``_assert_consistency_complex`` is not defined here; it has to be provided by another
+    base of the concrete test class.
+    """
+    # Rows are generated with all `online=True` cases first, then all `online=False` cases.
+    @parameterized.expand([
+        [solution, online]
+        for online in (True, False)
+        for solution in ("ref_channel", "stv_evd", "stv_power")
+    ])
+    def test_MVDR(self, solution, online):
+        # Four-channel noise -> spectrogram, cast to complex double on the test device.
+        noise = common_utils.get_whitenoise(sample_rate=8000, n_channels=4)
+        specgram = common_utils.get_spectrogram(noise, n_fft=400, hop_length=100)
+        specgram = specgram.to(device=self.device, dtype=torch.cdouble)
+        # Two independent random masks shaped like the last two spectrogram dimensions.
+        mask_shape = specgram.shape[-2:]
+        mask_s = torch.rand(mask_shape, device=self.device)
+        mask_n = torch.rand(mask_shape, device=self.device)
+        beamformer = T.MVDR(solution=solution, online=online)
+        self._assert_consistency_complex(beamformer, specgram, False, mask_s, mask_n)
--- a/test/torchaudio_unittest/transforms/transforms_cpu_test.py
+++ b/test/torchaudio_unittest/transforms/transforms_cpu_test.py
+import torch
+
+from torchaudio_unittest.common_utils import PytorchTestCase
+from . transforms_test_impl import TransformsTestBase
+
+
+class TransformsCPUFloat32Test(TransformsTestBase, PytorchTestCase):
+    """Runs ``TransformsTestBase`` on ``'cpu'`` with ``torch.float32``."""
+    dtype = torch.float32
+    device = 'cpu'
+
+
+class TransformsCPUFloat64Test(TransformsTestBase, PytorchTestCase):
+    """Runs ``TransformsTestBase`` on ``'cpu'`` with ``torch.float64``."""
+    dtype = torch.float64
+    device = 'cpu'
--- a/test/torchaudio_unittest/transforms/transforms_cuda_test.py
+++ b/test/torchaudio_unittest/transforms/transforms_cuda_test.py
+import torch
+
+from torchaudio_unittest.common_utils import (
+    PytorchTestCase,
+    skipIfNoCuda,
+)
+from . transforms_test_impl import TransformsTestBase
+
+
+@skipIfNoCuda
+class TransformsCUDAFloat32Test(TransformsTestBase, PytorchTestCase):
+    """Runs ``TransformsTestBase`` on ``'cuda'`` with ``torch.float32``."""
+    dtype = torch.float32
+    device = 'cuda'
+
+
+@skipIfNoCuda
+class TransformsCUDAFloat64Test(TransformsTestBase, PytorchTestCase):
+    """Runs ``TransformsTestBase`` on ``'cuda'`` with ``torch.float64``."""
+    dtype = torch.float64
+    device = 'cuda'