"docs/git@developer.sourcefind.cn:OpenDAS/torchaudio.git" did not exist on "3267c7ed38088e67dd1bdb4095689d82747b0d75"
Commit 8c5c9a9b authored by Zhaoheng Ni's avatar Zhaoheng Ni Committed by Facebook GitHub Bot
Browse files

Add simulate_rir_ism method for room impulse response simulation (#2880)

Summary:
replicate of https://github.com/pytorch/audio/issues/2644

Pull Request resolved: https://github.com/pytorch/audio/pull/2880

Reviewed By: mthrok

Differential Revision: D41633911

Pulled By: nateanl

fbshipit-source-id: 73cf145d75c389e996aafe96571ab86dc21f86e5
parent 3f02b898
...@@ -72,7 +72,7 @@ fi ...@@ -72,7 +72,7 @@ fi
( (
set -x set -x
conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} 'librosa>=0.8.0' parameterized 'requests>=2.20' conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} 'librosa>=0.8.0' parameterized 'requests>=2.20'
pip install kaldi-io SoundFile coverage pytest pytest-cov 'scipy==1.7.3' transformers expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pip install kaldi-io SoundFile coverage pytest pytest-cov 'scipy==1.7.3' transformers expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics
) )
# Install fairseq # Install fairseq
git clone https://github.com/pytorch/fairseq git clone https://github.com/pytorch/fairseq
......
...@@ -90,7 +90,8 @@ esac ...@@ -90,7 +90,8 @@ esac
unidecode \ unidecode \
'protobuf<4.21.0' \ 'protobuf<4.21.0' \
demucs \ demucs \
tinytag tinytag \
pyroomacoustics
) )
# Install fairseq # Install fairseq
git clone https://github.com/pytorch/fairseq git clone https://github.com/pytorch/fairseq
......
...@@ -54,6 +54,7 @@ endif() ...@@ -54,6 +54,7 @@ endif()
# Options # Options
option(BUILD_SOX "Build libsox statically" ON) option(BUILD_SOX "Build libsox statically" ON)
option(BUILD_KALDI "Build kaldi statically" ON) option(BUILD_KALDI "Build kaldi statically" ON)
option(BUILD_RIR "Enable RIR simulation" ON)
option(BUILD_RNNT "Enable RNN transducer" ON) option(BUILD_RNNT "Enable RNN transducer" ON)
option(BUILD_CTC_DECODER "Build Flashlight CTC decoder" ON) option(BUILD_CTC_DECODER "Build Flashlight CTC decoder" ON)
option(BUILD_TORCHAUDIO_PYTHON_EXTENSION "Build Python extension" OFF) option(BUILD_TORCHAUDIO_PYTHON_EXTENSION "Build Python extension" OFF)
......
...@@ -22,3 +22,12 @@ DSP ...@@ -22,3 +22,12 @@ DSP
oscillator_bank oscillator_bank
sinc_impulse_response sinc_impulse_response
frequency_impulse_response frequency_impulse_response
Room Impulse Response Simulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: generated
:nosignatures:
simulate_rir_ism
...@@ -504,3 +504,27 @@ abstract = {End-to-end spoken language translation (SLT) has recently gained pop ...@@ -504,3 +504,27 @@ abstract = {End-to-end spoken language translation (SLT) has recently gained pop
year={2021}, year={2021},
organization={IEEE} organization={IEEE}
} }
@inproceedings{scheibler2018pyroomacoustics,
title={Pyroomacoustics: A python package for audio room simulation and array processing algorithms},
author={Scheibler, Robin and Bezzam, Eric and Dokmani{\'c}, Ivan},
booktitle={2018 IEEE international conference on acoustics, speech and signal processing (ICASSP)},
pages={351--355},
year={2018},
organization={IEEE}
}
@article{allen1979image,
title={Image method for efficiently simulating small-room acoustics},
author={Allen, Jont B and Berkley, David A},
journal={The Journal of the Acoustical Society of America},
volume={65},
number={4},
pages={943--950},
year={1979},
publisher={Acoustical Society of America}
}
@misc{wiki:Absorption_(acoustics),
author = "{Wikipedia contributors}",
title = "Absorption (acoustics) --- {W}ikipedia{,} The Free Encyclopedia",
url = "https://en.wikipedia.org/wiki/Absorption_(acoustics)",
note = "[Online]"
}
...@@ -13,6 +13,7 @@ from .case_utils import ( ...@@ -13,6 +13,7 @@ from .case_utils import (
skipIfNoMacOS, skipIfNoMacOS,
skipIfNoModule, skipIfNoModule,
skipIfNoQengine, skipIfNoQengine,
skipIfNoRIR,
skipIfNoSox, skipIfNoSox,
skipIfPy310, skipIfPy310,
skipIfRocm, skipIfRocm,
...@@ -47,6 +48,7 @@ __all__ = [ ...@@ -47,6 +48,7 @@ __all__ = [
"skipIfNoMacOS", "skipIfNoMacOS",
"skipIfNoModule", "skipIfNoModule",
"skipIfNoKaldi", "skipIfNoKaldi",
"skipIfNoRIR",
"skipIfNoSox", "skipIfNoSox",
"skipIfNoSoxBackend", "skipIfNoSoxBackend",
"skipIfRocm", "skipIfRocm",
......
...@@ -225,6 +225,11 @@ skipIfNoKaldi = _skipIf( ...@@ -225,6 +225,11 @@ skipIfNoKaldi = _skipIf(
reason="Kaldi features are not available.", reason="Kaldi features are not available.",
key="NO_KALDI", key="NO_KALDI",
) )
# Skip decorator for tests that require the RIR (room impulse response)
# extension. Parallels skipIfNoKaldi above; `key` presumably selects an
# environment-variable override, matching the NO_KALDI pattern — confirm
# against _skipIf's definition.
skipIfNoRIR = _skipIf(
    not torchaudio._extension._IS_RIR_AVAILABLE,
    reason="RIR features are not available.",
    key="NO_RIR",
)
skipIfNoCtcDecoder = _skipIf( skipIfNoCtcDecoder = _skipIf(
not is_ctc_decoder_available(), not is_ctc_decoder_available(),
reason="CTC decoder not available.", reason="CTC decoder not available.",
......
import torch import torch
from torchaudio_unittest.common_utils import PytorchTestCase from torchaudio_unittest.common_utils import PytorchTestCase
from .functional_test_impl import Functional64OnlyTestImpl, FunctionalTestImpl from .functional_test_impl import Functional64OnlyTestImpl, FunctionalCPUOnlyTestImpl, FunctionalTestImpl
class FunctionalFloat32CPUTest(FunctionalTestImpl, PytorchTestCase): class FunctionalFloat32CPUTest(FunctionalTestImpl, PytorchTestCase):
...@@ -17,3 +17,13 @@ class FunctionalFloat64CPUTest(FunctionalTestImpl, PytorchTestCase): ...@@ -17,3 +17,13 @@ class FunctionalFloat64CPUTest(FunctionalTestImpl, PytorchTestCase):
class FunctionalFloat64OnlyCPUTest(Functional64OnlyTestImpl, PytorchTestCase): class FunctionalFloat64OnlyCPUTest(Functional64OnlyTestImpl, PytorchTestCase):
dtype = torch.float64 dtype = torch.float64
device = torch.device("cpu") device = torch.device("cpu")
# CPU-only instantiations of FunctionalCPUOnlyTestImpl (the RIR tests compare
# against pyroomacoustics via .numpy(), so they only run on CPU), one per dtype.
class FunctionalCPUOnlyFloat32Test(FunctionalCPUOnlyTestImpl, PytorchTestCase):
    dtype = torch.float32
    device = torch.device("cpu")


class FunctionalCPUOnlyFloat64Test(FunctionalCPUOnlyTestImpl, PytorchTestCase):
    dtype = torch.float64
    device = torch.device("cpu")
from torchaudio._internal import module_utils as _mod_utils
if _mod_utils.is_module_available("pyroomacoustics"):
import pyroomacoustics as pra
import torch import torch
import torchaudio.prototype.functional as F import torchaudio.prototype.functional as F
from parameterized import param, parameterized from parameterized import param, parameterized
from torchaudio_unittest.common_utils import nested_params, TestBaseMixin from torchaudio_unittest.common_utils import nested_params, skipIfNoModule, skipIfNoRIR, TestBaseMixin
from .dsp_utils import freq_ir as freq_ir_np, oscillator_bank as oscillator_bank_np, sinc_ir as sinc_ir_np from .dsp_utils import freq_ir as freq_ir_np, oscillator_bank as oscillator_bank_np, sinc_ir as sinc_ir_np
...@@ -424,3 +429,83 @@ class Functional64OnlyTestImpl(TestBaseMixin): ...@@ -424,3 +429,83 @@ class Functional64OnlyTestImpl(TestBaseMixin):
except AssertionError: except AssertionError:
_debug_plot() _debug_plot()
raise raise
@skipIfNoModule("pyroomacoustics")
@skipIfNoRIR
class FunctionalCPUOnlyTestImpl(TestBaseMixin):
    # Validates torchaudio's image-source-method RIR simulation against the
    # reference implementation in pyroomacoustics. CPU-only because the
    # reference runs on CPU and tensors are handed over via ``.numpy()``.

    @parameterized.expand([(1,), (4,)])
    def test_simulate_rir_ism_single_band(self, channel):
        """Test simulate_rir_ism function in the case where absorption coefficients are identical for all walls."""
        # Random but well-separated geometry: room at least 5m per side,
        # mics and source strictly inside it.
        room_dim = torch.rand(3, dtype=self.dtype, device=self.device) + 5
        mic_array = torch.rand(channel, 3, dtype=self.dtype, device=self.device) + 1
        source = torch.rand(3, dtype=self.dtype, device=self.device) + 4
        max_order = 3
        # absorption is set as a float value indicating absorption coefficients are the same for every wall.
        absorption = 0.5
        # compute rir signal by torchaudio implementation
        actual = F.simulate_rir_ism(room_dim, source, mic_array, max_order, absorption)
        # compute rir signal by pyroomacoustics
        room = pra.ShoeBox(
            room_dim.detach().numpy(),
            fs=16000,
            materials=pra.Material(absorption),
            max_order=max_order,
            ray_tracing=False,
            air_absorption=False,
        )
        # mic_locs is a numpy array of dimension `(3, channel)`.
        mic_locs = mic_array.transpose(0, 1).double().detach().numpy()
        room.add_microphone_array(mic_locs)
        room.add_source(source.tolist())
        room.compute_rir()
        # Per-channel reference RIRs can have different lengths; zero-pad to
        # the longest one before the element-wise comparison.
        max_len = max([room.rir[i][0].shape[0] for i in range(channel)])
        expected = torch.zeros(channel, max_len, dtype=self.dtype, device=self.device)
        for i in range(channel):
            expected[i, 0 : room.rir[i][0].shape[0]] = torch.from_numpy(room.rir[i][0])
        self.assertEqual(expected, actual, atol=1e-3, rtol=1e-3)

    @parameterized.expand([(1,), (4,)])
    def test_simulate_rir_ism_multi_band(self, channel):
        """Test simulate_rir_ism in the case where absorption coefficients are different for all walls."""
        room_dim = torch.rand(3, dtype=self.dtype, device=self.device) + 5
        mic_array = torch.rand(channel, 3, dtype=self.dtype, device=self.device) + 1
        source = torch.rand(3, dtype=self.dtype, device=self.device) + 4
        max_order = 3
        # absorption is set as a Tensor with dimensions `(7, 6)` indicating there are
        # 6 walls and each wall has 7 absorption coefficients corresponds to 7 octave bands, respectively.
        absorption = torch.rand(7, 6, dtype=self.dtype, device=self.device)
        walls = ["west", "east", "south", "north", "floor", "ceiling"]
        # Build the same per-wall, per-band materials for the reference room.
        room = pra.ShoeBox(
            room_dim.detach().numpy(),
            fs=16000,
            materials={
                walls[i]: pra.Material(
                    {
                        "coeffs": absorption[:, i]
                        .reshape(
                            -1,
                        )
                        .detach()
                        .numpy(),
                        "center_freqs": [125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0],
                    }
                )
                for i in range(len(walls))
            },
            max_order=max_order,
            ray_tracing=False,
            air_absorption=False,
        )
        # mic_locs is a numpy array of dimension `(D, channel)`.
        mic_locs = mic_array.transpose(0, 1).double().detach().numpy()
        room.add_microphone_array(mic_locs)
        room.add_source(source.tolist())
        room.compute_rir()
        # Zero-pad the reference RIRs to a common length, as in the test above.
        max_len = max([room.rir[i][0].shape[0] for i in range(channel)])
        expected = torch.zeros(channel, max_len, dtype=self.dtype, device=self.device)
        for i in range(channel):
            expected[i, 0 : room.rir[i][0].shape[0]] = torch.from_numpy(room.rir[i][0])
        actual = F.simulate_rir_ism(room_dim, source, mic_array, max_order, absorption)
        self.assertEqual(expected, actual, atol=1e-3, rtol=1e-3)
import torch import torch
from torchaudio_unittest.common_utils import PytorchTestCase from torchaudio_unittest.common_utils import PytorchTestCase
from .torchscript_consistency_test_impl import TorchScriptConsistencyTestImpl from .torchscript_consistency_test_impl import TorchScriptConsistencyCPUOnlyTestImpl, TorchScriptConsistencyTestImpl
class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, PytorchTestCase): class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, PytorchTestCase):
...@@ -12,3 +12,13 @@ class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, Pytor ...@@ -12,3 +12,13 @@ class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, Pytor
class TorchScriptConsistencyCPUFloat64Test(TorchScriptConsistencyTestImpl, PytorchTestCase): class TorchScriptConsistencyCPUFloat64Test(TorchScriptConsistencyTestImpl, PytorchTestCase):
dtype = torch.float64 dtype = torch.float64
device = torch.device("cpu") device = torch.device("cpu")
# CPU-only instantiations of the TorchScript consistency suite for ops that
# only run on CPU (RIR simulation), one per dtype.
class TorchScriptConsistencyCPUOnlyFloat32Test(TorchScriptConsistencyCPUOnlyTestImpl, PytorchTestCase):
    dtype = torch.float32
    device = torch.device("cpu")


class TorchScriptConsistencyCPUOnlyFloat64Test(TorchScriptConsistencyCPUOnlyTestImpl, PytorchTestCase):
    dtype = torch.float64
    device = torch.device("cpu")
...@@ -2,7 +2,8 @@ import unittest ...@@ -2,7 +2,8 @@ import unittest
import torch import torch
import torchaudio.prototype.functional as F import torchaudio.prototype.functional as F
from torchaudio_unittest.common_utils import TestBaseMixin, torch_script from parameterized import parameterized
from torchaudio_unittest.common_utils import skipIfNoRIR, TestBaseMixin, torch_script
class TorchScriptConsistencyTestImpl(TestBaseMixin): class TorchScriptConsistencyTestImpl(TestBaseMixin):
...@@ -62,3 +63,52 @@ class TorchScriptConsistencyTestImpl(TestBaseMixin): ...@@ -62,3 +63,52 @@ class TorchScriptConsistencyTestImpl(TestBaseMixin):
def test_freq_ir(self): def test_freq_ir(self):
mags = torch.tensor([0, 0.5, 1.0], device=self.device, dtype=self.dtype) mags = torch.tensor([0, 0.5, 1.0], device=self.device, dtype=self.dtype)
self._assert_consistency(F.frequency_impulse_response, (mags,)) self._assert_consistency(F.frequency_impulse_response, (mags,))
class TorchScriptConsistencyCPUOnlyTestImpl(TestBaseMixin):
    # TorchScript consistency checks for ops with CPU-only implementations
    # (currently the RIR simulation).

    def _assert_consistency(self, func, inputs, shape_only=False):
        """Assert that scripting ``func`` does not change its output.

        Args:
            func: The callable to check.
            inputs: Positional arguments; Tensors are first moved to
                ``self.device`` / ``self.dtype``.
            shape_only (bool): If ``True``, compare only output shapes.
        """
        inputs_ = []
        for i in inputs:
            if torch.is_tensor(i):
                i = i.to(device=self.device, dtype=self.dtype)
            inputs_.append(i)
        ts_func = torch_script(func)
        # Re-seed identically before each call so any randomness inside
        # ``func`` produces the same draws in eager and scripted mode.
        torch.random.manual_seed(40)
        output = func(*inputs_)
        torch.random.manual_seed(40)
        ts_output = ts_func(*inputs_)
        if shape_only:
            ts_output = ts_output.shape
            output = output.shape
        self.assertEqual(ts_output, output)

    @skipIfNoRIR
    @parameterized.expand([(1,), (4,)])
    def test_simulate_rir_ism_single_band(self, channel):
        # Single-band case: one scalar absorption applies to all six walls.
        room_dim = torch.rand(3, dtype=self.dtype, device=self.device) + 5
        mic_array = torch.rand(channel, 3, dtype=self.dtype, device=self.device) + 1
        source = torch.rand(3, dtype=self.dtype, device=self.device) + 4
        max_order = 3
        absorption = 0.5
        center_frequency = torch.tensor([125, 250, 500, 1000, 2000, 4000, 8000], dtype=self.dtype, device=self.device)
        self._assert_consistency(
            F.simulate_rir_ism,
            (room_dim, source, mic_array, max_order, absorption, None, 81, center_frequency, 343.0, 16000.0),
        )

    @skipIfNoRIR
    @parameterized.expand([(1,), (4,)])
    def test_simulate_rir_ism_multi_band(self, channel):
        # Multi-band case: a (7, 6) Tensor gives per-octave-band, per-wall
        # absorption coefficients.
        room_dim = torch.rand(3, dtype=self.dtype, device=self.device) + 5
        mic_array = torch.rand(channel, 3, dtype=self.dtype, device=self.device) + 1
        source = torch.rand(3, dtype=self.dtype, device=self.device) + 4
        max_order = 3
        absorption = torch.rand(7, 6, dtype=self.dtype, device=self.device)
        center_frequency = torch.tensor([125, 250, 500, 1000, 2000, 4000, 8000], dtype=self.dtype, device=self.device)
        self._assert_consistency(
            F.simulate_rir_ism,
            (room_dim, source, mic_array, max_order, absorption, None, 81, center_frequency, 343.0, 16000.0),
        )
...@@ -35,6 +35,7 @@ def _get_build(var, default=False): ...@@ -35,6 +35,7 @@ def _get_build(var, default=False):
_BUILD_SOX = False if platform.system() == "Windows" else _get_build("BUILD_SOX", True) _BUILD_SOX = False if platform.system() == "Windows" else _get_build("BUILD_SOX", True)
_BUILD_KALDI = False if platform.system() == "Windows" else _get_build("BUILD_KALDI", True) _BUILD_KALDI = False if platform.system() == "Windows" else _get_build("BUILD_KALDI", True)
_BUILD_RIR = _get_build("BUILD_RIR", True)
_BUILD_RNNT = _get_build("BUILD_RNNT", True) _BUILD_RNNT = _get_build("BUILD_RNNT", True)
_BUILD_CTC_DECODER = _get_build("BUILD_CTC_DECODER", True) _BUILD_CTC_DECODER = _get_build("BUILD_CTC_DECODER", True)
_USE_FFMPEG = _get_build("USE_FFMPEG", False) _USE_FFMPEG = _get_build("USE_FFMPEG", False)
...@@ -116,6 +117,7 @@ class CMakeBuild(build_ext): ...@@ -116,6 +117,7 @@ class CMakeBuild(build_ext):
f"-DPython_INCLUDE_DIR={distutils.sysconfig.get_python_inc()}", f"-DPython_INCLUDE_DIR={distutils.sysconfig.get_python_inc()}",
f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}", f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}",
f"-DBUILD_KALDI:BOOL={'ON' if _BUILD_KALDI else 'OFF'}", f"-DBUILD_KALDI:BOOL={'ON' if _BUILD_KALDI else 'OFF'}",
f"-DBUILD_RIR:BOOL={'ON' if _BUILD_RIR else 'OFF'}",
f"-DBUILD_RNNT:BOOL={'ON' if _BUILD_RNNT else 'OFF'}", f"-DBUILD_RNNT:BOOL={'ON' if _BUILD_RNNT else 'OFF'}",
f"-DBUILD_CTC_DECODER:BOOL={'ON' if _BUILD_CTC_DECODER else 'OFF'}", f"-DBUILD_CTC_DECODER:BOOL={'ON' if _BUILD_CTC_DECODER else 'OFF'}",
"-DBUILD_TORCHAUDIO_PYTHON_EXTENSION:BOOL=ON", "-DBUILD_TORCHAUDIO_PYTHON_EXTENSION:BOOL=ON",
......
...@@ -20,6 +20,7 @@ __all__ = [ ...@@ -20,6 +20,7 @@ __all__ = [
"_check_cuda_version", "_check_cuda_version",
"_IS_TORCHAUDIO_EXT_AVAILABLE", "_IS_TORCHAUDIO_EXT_AVAILABLE",
"_IS_KALDI_AVAILABLE", "_IS_KALDI_AVAILABLE",
"_IS_RIR_AVAILABLE",
"_SOX_INITIALIZED", "_SOX_INITIALIZED",
"_FFMPEG_INITIALIZED", "_FFMPEG_INITIALIZED",
] ]
...@@ -33,9 +34,10 @@ if os.name == "nt" and (3, 8) <= sys.version_info < (3, 9): ...@@ -33,9 +34,10 @@ if os.name == "nt" and (3, 8) <= sys.version_info < (3, 9):
# In case of an error, we do not catch the failure as it suggests there is something # In case of an error, we do not catch the failure as it suggests there is something
# wrong with the installation. # wrong with the installation.
_IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio") _IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio")
# Kaldi features are implemented in _torchaudio extension, but it can be individually # Kaldi and RIR features are implemented in _torchaudio extension, but they can be individually
# turned on/off at build time. Available means that _torchaudio is loaded properly, and # turned on/off at build time. Available means that _torchaudio is loaded properly, and
# Kaldi features are found there. # Kaldi or RIR features are found there.
_IS_RIR_AVAILABLE = False
_IS_KALDI_AVAILABLE = False _IS_KALDI_AVAILABLE = False
if _IS_TORCHAUDIO_EXT_AVAILABLE: if _IS_TORCHAUDIO_EXT_AVAILABLE:
_load_lib("libtorchaudio") _load_lib("libtorchaudio")
...@@ -43,6 +45,7 @@ if _IS_TORCHAUDIO_EXT_AVAILABLE: ...@@ -43,6 +45,7 @@ if _IS_TORCHAUDIO_EXT_AVAILABLE:
import torchaudio.lib._torchaudio # noqa import torchaudio.lib._torchaudio # noqa
_check_cuda_version() _check_cuda_version()
_IS_RIR_AVAILABLE = torchaudio.lib._torchaudio.is_rir_available()
_IS_KALDI_AVAILABLE = torchaudio.lib._torchaudio.is_kaldi_available() _IS_KALDI_AVAILABLE = torchaudio.lib._torchaudio.is_kaldi_available()
...@@ -88,3 +91,11 @@ fail_if_no_sox = ( ...@@ -88,3 +91,11 @@ fail_if_no_sox = (
) )
fail_if_no_ffmpeg = no_op if _FFMPEG_INITIALIZED else _fail_since_no_ffmpeg fail_if_no_ffmpeg = no_op if _FFMPEG_INITIALIZED else _fail_since_no_ffmpeg
# Guard for RIR-dependent APIs: a no-op when the extension was compiled with
# RIR support, otherwise a wrapper that fails with an actionable build message.
# Mirrors fail_if_no_ffmpeg / fail_if_no_sox above.
fail_if_no_rir = (
    no_op
    if _IS_RIR_AVAILABLE
    else fail_with_message(
        "requires RIR extension, but TorchAudio is not compiled with it. Please build TorchAudio with RIR support."
    )
)
...@@ -41,6 +41,11 @@ if(BUILD_RNNT) ...@@ -41,6 +41,11 @@ if(BUILD_RNNT)
endif() endif()
endif() endif()
# Optionally compile the RIR simulation kernels. The INCLUDE_RIR compile
# definition lets the binding layer report availability at runtime
# (see is_rir_available()).
if(BUILD_RIR)
  list(APPEND sources rir.cpp)
  list(APPEND compile_definitions INCLUDE_RIR)
endif()
if(USE_CUDA) if(USE_CUDA)
list( list(
APPEND APPEND
......
...@@ -6,6 +6,7 @@ namespace { ...@@ -6,6 +6,7 @@ namespace {
PYBIND11_MODULE(_torchaudio, m) { PYBIND11_MODULE(_torchaudio, m) {
m.def("is_kaldi_available", &is_kaldi_available, ""); m.def("is_kaldi_available", &is_kaldi_available, "");
m.def("is_rir_available", &is_rir_available, "");
m.def("cuda_version", &cuda_version, ""); m.def("cuda_version", &cuda_version, "");
} }
......
/*
Copyright (c) 2014-2017 EPFL-LCAV
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/**
* Image source method implementation based on PyRoomAcoustics:
* https://github.com/LCAV/pyroomacoustics
*/
#include <torch/script.h>
#include <torch/torch.h>
#include <cmath>
using namespace torch::indexing;
namespace torchaudio {
namespace rir {
namespace {
/**
 * @brief Sum the impulse responses of all image sources into one room impulse
 * response, offsetting each contribution by its integer delay of arrival.
 * The implementation is based on the one in pyroomacoustics:
 * https://github.com/LCAV/pyroomacoustics/blob/master/pyroomacoustics/build_rir.pyx
 *
 * @tparam scalar_t The element type of the irs and rirs Tensors.
 * @param irs The impulse responses for all image sources. Tensor with
 * dimensions `(num_band, num_image, num_mic, ir_length)`.
 * @param delay The integer delays (in samples) for each image source. Tensor
 * with dimensions `(num_image, num_mic)`; read as `int` via data_ptr.
 * @param rir_length The length of the output signal. NOTE(review): writes are
 * not bounds-checked here, so the caller must guarantee
 * `rir_length >= ir_length + max(delay)`.
 * @param num_band The number of frequency bands for the wall materials.
 * @param num_image The number of image sources in irs.
 * @param num_mic The number of microphones in the array.
 * @param ir_length The length of each per-image impulse response.
 * @param rirs The output room impulse response, accumulated in place. Tensor
 * with dimensions `(num_band, num_mic, rir_length)`.
 */
template <typename scalar_t>
void simulate_rir_impl(
    const torch::Tensor& irs,
    const torch::Tensor& delay,
    const int64_t rir_length,
    const int64_t num_band,
    const int64_t num_image,
    const int64_t num_mic,
    const int64_t ir_length,
    torch::Tensor& rirs) {
  const scalar_t* input_data = irs.data_ptr<scalar_t>();
  const int* delay_data = delay.data_ptr<int>();
  scalar_t* output_data = rirs.data_ptr<scalar_t>();
  // The flat index i enumerates (band, image, mic) triples in row-major
  // order; decompose it back into the three coordinates.
  for (auto i = 0; i < num_band * num_image * num_mic; i++) {
    int64_t offset_input = i * ir_length;
    int64_t mic = i % num_mic;
    int64_t image = ((i - mic) / num_mic) % num_image;
    int64_t band = (i - mic - image * num_mic) / (num_image * num_mic);
    int64_t offset_output = (band * num_mic + mic) * rir_length;
    int64_t offset_delay = image * num_mic + mic;
    // Accumulate this image source's IR into the output, shifted by its
    // arrival delay.
    for (auto j = 0; j < ir_length; j++) {
      output_data[offset_output + j + delay_data[offset_delay]] +=
          input_data[offset_input + j];
    }
  }
}
/**
 * @brief Sum up impulse response signals of all image sources into one Tensor
 * based on their delays of arrival.
 *
 * @param irs The impulse responses for all image sources. Tensor with
 * dimensions `(num_band, num_image, num_mic, ir_length)`.
 * @param delay The integer delays for each image source. Tensor with
 * dimensions `(num_image, num_mic)`.
 * @param rir_length The length of the output room impulse response signal.
 * @return torch::Tensor The output room impulse response signal. Tensor with
 * dimensions `(num_band, num_mic, rir_length)`.
 */
torch::Tensor simulate_rir(
    const torch::Tensor& irs,
    const torch::Tensor& delay,
    const int64_t rir_length) {
  const int64_t num_band = irs.size(0);
  const int64_t num_image = irs.size(1);
  const int64_t num_mic = irs.size(2);
  const int64_t ir_length = irs.size(3);
  // The output starts at zero; the kernel accumulates every image source
  // into it.
  torch::Tensor rirs =
      torch::zeros({num_band, num_mic, rir_length}, irs.dtype());
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(irs.scalar_type(), "build_rir", [&] {
    simulate_rir_impl<scalar_t>(
        irs, delay, rir_length, num_band, num_image, num_mic, ir_length, rirs);
  });
  return rirs;
}
/**
 * @brief Create the band-pass filters for the octave bands.
 * The implementation is based on the one in pyroomacoustics:
 * https://github.com/LCAV/pyroomacoustics/blob/master/pyroomacoustics/acoustics.py#L261
 *
 * @tparam scalar_t The element type of the center-frequency and filter
 * Tensors.
 * @param centers The Tensor that stores the center frequencies of octave
 * bands. Tensor with dimension `(num_band,)`.
 * @param sample_rate The sample_rate of simulated room impulse response
 * signal.
 * @param n_fft The number of fft points.
 * @param filters The output band-pass filter. Tensor with dimensions
 * `(num_band, n_fft - 1)`.
 */
template <typename scalar_t>
void make_rir_filter_impl(
    torch::Tensor& centers,
    double sample_rate,
    int64_t n_fft,
    torch::Tensor& filters) {
  int64_t n = centers.size(0);
  // new_bands[i] holds the (lower, upper) frequency edges of band i: the
  // neighbouring bands' centers, except the first band starts at half its
  // own center and the last band ends at Nyquist.
  torch::Tensor new_bands = torch::zeros({n, 2}, centers.dtype());
  scalar_t* newband_data = new_bands.data_ptr<scalar_t>();
  const scalar_t* centers_data = centers.data_ptr<scalar_t>();
  for (int64_t i = 0; i < n; i++) {
    if (i == 0) {
      newband_data[i * 2] = centers_data[0] / 2;
      newband_data[i * 2 + 1] = centers_data[1];
    } else if (i == n - 1) {
      newband_data[i * 2] = centers_data[n - 2];
      newband_data[i * 2 + 1] = sample_rate / 2;
    } else {
      newband_data[i * 2] = centers_data[i - 1];
      newband_data[i * 2 + 1] = centers_data[i + 1];
    }
  }
  const auto half = 0.5;
  auto n_freq = n_fft / 2 + 1;
  // Build each band's magnitude response on the one-sided frequency grid,
  // with raised-cosine transitions between neighbouring bands.
  torch::Tensor freq_resp = torch::zeros({n_freq, n}, centers.dtype());
  torch::Tensor freq =
      torch::arange(n_freq, centers.dtype()) / n_fft * sample_rate;
  const scalar_t* freq_data = freq.data_ptr<scalar_t>();
  scalar_t* freqreq_data = freq_resp.data_ptr<scalar_t>();
  for (auto i = 0; i < n; i++) {
    for (auto j = 0; j < n_freq; j++) {
      // Rising edge: from the band's lower edge up to its center frequency.
      if (freq_data[j] >= newband_data[i * 2] &&
          freq_data[j] < centers_data[i]) {
        freqreq_data[j * n + i] =
            half * (1 + cos(2 * M_PI * freq_data[j] / centers_data[i]));
      }
      // Falling edge: from the center down to the band's upper edge
      // (skipped for the top band).
      if (i != n - 1 && freq_data[j] >= centers_data[i] &&
          freq_data[j] < newband_data[i * 2 + 1]) {
        freqreq_data[j * n + i] =
            half * (1 - cos(2 * M_PI * freq_data[j] / newband_data[i * 2 + 1]));
      }
      // The top band keeps unit gain from its center up to Nyquist.
      if (i == n - 1 && centers_data[i] <= freq_data[j]) {
        freqreq_data[j * n + i] = 1.0;
      }
    }
  }
  // Back to the time domain, center the filters with fftshift, drop the
  // first sample, and lay filters out as rows: `(num_band, n_fft - 1)`.
  filters = torch::fft::fftshift(torch::fft::irfft(freq_resp, n_fft, 0), 0);
  filters = filters.index({Slice(1)}).transpose(0, 1);
}
/**
 * @brief Create the band-pass filters for the octave bands.
 *
 * @param centers The Tensor that stores the center frequencies of octave
 * bands. Tensor with dimension `(num_band,)`.
 * @param sample_rate The sample_rate of simulated room impulse response
 * signal.
 * @param n_fft The number of fft points.
 * @return torch::Tensor The output band-pass filter. Tensor with dimensions
 * `(num_band, n_fft - 1)`.
 */
torch::Tensor make_rir_filter(
    torch::Tensor centers,
    double sample_rate,
    int64_t n_fft) {
  torch::Tensor filters;
  // Dispatch on the dtype of `centers`; the typed kernel fills `filters`.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      centers.scalar_type(), "make_filter", [&] {
        make_rir_filter_impl<scalar_t>(centers, sample_rate, n_fft, filters);
      });
  return filters;
}
// Register the CPU kernels and the operator schemas so the ops are callable
// from Python as torch.ops.torchaudio._simulate_rir / _make_rir_filter.
TORCH_LIBRARY_IMPL(torchaudio, CPU, m) {
  m.impl("torchaudio::_simulate_rir", torchaudio::rir::simulate_rir);
  m.impl("torchaudio::_make_rir_filter", torchaudio::rir::make_rir_filter);
}

TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
  m.def(
      "torchaudio::_simulate_rir(Tensor irs, Tensor delay_i, int rir_length) -> Tensor");
  m.def(
      "torchaudio::_make_rir_filter(Tensor centers, float sample_rate, int n_fft) -> Tensor");
}
} // Anonymous namespace
} // namespace rir
} // namespace torchaudio
...@@ -15,6 +15,14 @@ bool is_kaldi_available() { ...@@ -15,6 +15,14 @@ bool is_kaldi_available() {
#endif #endif
} }
// Reports whether this build includes the RIR simulation kernels
// (INCLUDE_RIR is defined by the build system when BUILD_RIR=ON).
bool is_rir_available() {
#ifdef INCLUDE_RIR
  return true;
#else
  return false;
#endif
}
c10::optional<int64_t> cuda_version() { c10::optional<int64_t> cuda_version() {
#ifdef USE_CUDA #ifdef USE_CUDA
return CUDA_VERSION; return CUDA_VERSION;
......
...@@ -3,5 +3,6 @@ ...@@ -3,5 +3,6 @@
namespace torchaudio { namespace torchaudio {
bool is_kaldi_available(); bool is_kaldi_available();
bool is_rir_available();
c10::optional<int64_t> cuda_version(); c10::optional<int64_t> cuda_version();
} // namespace torchaudio } // namespace torchaudio
...@@ -6,6 +6,7 @@ from ._dsp import ( ...@@ -6,6 +6,7 @@ from ._dsp import (
oscillator_bank, oscillator_bank,
sinc_impulse_response, sinc_impulse_response,
) )
from ._rir import simulate_rir_ism
from .functional import barkscale_fbanks from .functional import barkscale_fbanks
...@@ -17,4 +18,5 @@ __all__ = [ ...@@ -17,4 +18,5 @@ __all__ = [
"frequency_impulse_response", "frequency_impulse_response",
"oscillator_bank", "oscillator_bank",
"sinc_impulse_response", "sinc_impulse_response",
"simulate_rir_ism",
] ]
import math
from typing import Optional, Tuple, Union
import torch
import torchaudio
from torch import Tensor
def _compute_image_sources(
room: torch.Tensor,
source: torch.Tensor,
max_order: int,
absorption: torch.Tensor,
scatter: Optional[torch.Tensor] = None,
) -> Tuple[Tensor, Tensor]:
"""Compute image sources in a shoebox-like room.
Args:
room (torch.Tensor): The 1D Tensor to determine the room size. The shape is
`(D,)`, where ``D`` is 2 if room is a 2D room, or 3 if room is a 3D room.
source (torch.Tensor): The coordinate of the sound source. Tensor with dimensions
`(D)`.
max_order (int): The maximum number of reflections of the source.
absorption (torch.Tensor): The absorption coefficients of wall materials.
``absorption`` is a Tensor with dimensions `(num_band, num_wall)`.
The shape options are ``[(1, 4), (1, 6), (7, 4), (7, 6)]``.
``num_band`` is `1` if the coefficients is the same for all frequencies, or is `7`
if the coefficients are different to different frequencies. `7` refers to the default number
of octave bands. (See note in `simulate_rir_ism` method).
``num_wall`` is `4` if the room is a 2D room, representing absorption coefficients
of ``"west"``, ``"east"``, ``"south"``, and ``"north"`` walls, respectively.
Or it is `6` if the room is a 3D room, representing absorption coefficients
of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``, and ``"ceiling"``, respectively.
scatter (torch.Tensor): The scattering coefficients of wall materials.
The shape of ``scatter`` must match that of ``absorption``. If ``None``, it is not
used in image source computation. (Default: ``None``)
Returns:
(torch.Tensor): The coordinates of all image sources within ``max_order`` number of reflections.
Tensor with dimensions `(num_image_source, D)`.
(torch.Tensor): The attenuation of corresponding image sources. Tensor with dimensions
`(num_band, num_image_source)`.
"""
if scatter is None:
tr = torch.sqrt(1 - absorption)
else:
tr = torch.sqrt(1 - absorption) * torch.sqrt(1 - scatter)
ind = torch.arange(-max_order, max_order + 1, device=source.device)
if room.shape[0] == 2:
XYZ = torch.meshgrid(ind, ind, indexing="ij")
else:
XYZ = torch.meshgrid(ind, ind, ind, indexing="ij")
XYZ = torch.stack([c.reshape((-1,)) for c in XYZ], dim=-1)
XYZ = XYZ[XYZ.abs().sum(dim=-1) <= max_order]
# compute locations of image sources
d = room[None, :]
s = source[None, :]
img_loc = torch.where(XYZ % 2 == 1, d * (XYZ + 1) - s, d * XYZ + s)
# attenuation
exp_lo = abs(torch.floor((XYZ / 2)))
exp_hi = abs(torch.floor((XYZ + 1) / 2))
t_lo = tr[:, ::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1) # (num_band, left walls)
t_hi = tr[:, 1::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1) # (num_band, right walls)
att = torch.prod((t_lo**exp_lo) * (t_hi**exp_hi), dim=-1) # (num_band, num_image_source)
return img_loc, att
def _hann(x: torch.Tensor, T: int):
"""Compute the Hann window where the values are truncated based on window length.
torch.hann_window can only sample window function at integer points, the method is to sample
continuous window function at non-integer points.
Args:
x (torch.Tensor): The fractional component of time delay Tensor.
T (torch.Tensor): The window length of sinc function.
Returns:
(torch.Tensor): The hann window Tensor where values outside
the sinc window (`T`) is set to zero.
"""
y = torch.where(
torch.abs(x) <= T / 2,
0.5 * (1 + torch.cos(2 * math.pi * x / T)),
x.new_zeros(1),
)
return y
def _frac_delay(delay: torch.Tensor, delay_i: torch.Tensor, delay_filter_length: int):
    """Build windowed-sinc filters that realize fractional sample delays.

    Args:
        delay (torch.Tensor): The time delays, in (possibly fractional) samples.
        delay_i (torch.Tensor): The integer part of ``delay``.
        delay_filter_length (int): The length of the sinc interpolation filter;
            must be odd.

    Returns:
        (torch.Tensor): The impulse response filters for all image sources, with
        ``delay_filter_length`` taps appended as a trailing dimension.

    Raises:
        ValueError: If ``delay_filter_length`` is even.
    """
    if delay_filter_length % 2 == 0:
        raise ValueError("The filter length must be odd")

    half = delay_filter_length // 2
    # Tap positions, centered on the integer part of each delay.
    taps = torch.arange(-half, half + 1, device=delay.device) + delay_i[..., None]
    offsets = taps - delay[..., None]
    # Hann-windowed sinc interpolation of the fractional shift.
    return torch.special.sinc(offsets) * _hann(offsets, 2 * half)
def _validate_inputs(
room: torch.Tensor, source: torch.Tensor, mic_array: torch.Tensor, absorption: Union[float, torch.Tensor]
) -> torch.Tensor:
"""Validate dimensions of input arguments, and normalize different kinds of absorption into the same dimension.
Args:
room (torch.Tensor): Room coordinates. The shape of `room` must be `(3,)` which represents
three dimensions of the room.
source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
absorption (float or torch.Tensor): The absorption coefficients of wall materials.
If the dtype is ``float``, the absorption coefficient is identical for all walls and
all frequencies.
If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, where the values represent
absorption coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``,
and ``"ceiling"``, respectively.
If ``absorption`` is a 2D Tensor, the shape must be `(7, 6)`, where 7 represents the number of octave bands.
Returns:
(torch.Tensor): The absorption Tensor. The shape is `(1, 6)` for single octave band case,
or `(7, 6)` for multi octave band case.
"""
if room.ndim != 1:
raise ValueError(f"room must be a 1D Tensor. Found {room.shape}.")
D = room.shape[0]
if D != 3:
raise ValueError(f"room must be a 3D room. Found {room.shape}.")
num_wall = 6
if source.shape[0] != D:
raise ValueError(f"The shape of source must be `(3,)`. Found {source.shape}")
if mic_array.ndim != 2:
raise ValueError(f"mic_array must be a 2D Tensor. Found {mic_array.shape}.")
if mic_array.shape[1] != D:
raise ValueError(f"The second dimension of mic_array must be 3. Found {mic_array.shape}.")
if isinstance(absorption, float):
absorption = torch.ones(1, num_wall) * absorption
elif isinstance(absorption, Tensor) and absorption.ndim == 1:
if absorption.shape[0] != num_wall:
raise ValueError(
"The shape of absorption must be `(6,)` if it is a 1D Tensor." f"Found the shape {absorption.shape}."
)
absorption = absorption.unsqueeze(0)
elif isinstance(absorption, Tensor) and absorption.ndim == 2:
if absorption.shape != (7, num_wall):
raise ValueError(
"The shape of absorption must be `(7, 6)` if it is a 2D Tensor."
f"Found the shape of room is {D} and shape of absorption is {absorption.shape}."
)
absorption = absorption
else:
absorption = absorption
return absorption
def simulate_rir_ism(
    room: torch.Tensor,
    source: torch.Tensor,
    mic_array: torch.Tensor,
    max_order: int,
    absorption: Union[float, torch.Tensor],
    output_length: Optional[int] = None,
    delay_filter_length: int = 81,
    center_frequency: Optional[torch.Tensor] = None,
    sound_speed: float = 343.0,
    sample_rate: float = 16000.0,
) -> Tensor:
    r"""Compute Room Impulse Response (RIR) based on the *image source method* :cite:`allen1979image`.
    The implementation is based on *pyroomacoustics* :cite:`scheibler2018pyroomacoustics`.

    .. devices:: CPU

    .. properties:: TorchScript

    Args:
        room (torch.Tensor): Room coordinates. The shape of `room` must be `(3,)` which represents
            three dimensions of the room.
        source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
        mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
        max_order (int): The maximum number of reflections of the source.
        absorption (float or torch.Tensor): The *absorption* :cite:`wiki:Absorption_(acoustics)`
            coefficients of wall materials for sound energy.
            If the dtype is ``float``, the absorption coefficient is identical for all walls and
            all frequencies.
            If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, where the values represent
            absorption coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``,
            and ``"ceiling"``, respectively.
            If ``absorption`` is a 2D Tensor, the shape must be `(7, 6)`, where 7 represents the number of octave bands.
        output_length (int or None, optional): The output length of simulated RIR signal. If ``None``,
            the length is defined as

            .. math::
                \frac{\text{max\_d} \cdot \text{sample\_rate}}{\text{sound\_speed}} + \text{delay\_filter\_length}

            where ``max_d`` is the maximum distance between image sources and microphones.
        delay_filter_length (int, optional): The filter length for computing sinc function. (Default: ``81``)
        center_frequency (torch.Tensor, optional): The center frequencies of octave bands for multi-band walls.
            Only used when ``absorption`` is a 2D Tensor.
        sound_speed (float, optional): The speed of sound. (Default: ``343.0``)
        sample_rate (float, optional): The sample rate of the generated room impulse response signal.
            (Default: ``16000.0``)

    Returns:
        (torch.Tensor): The simulated room impulse response waveform. Tensor with dimensions
        `(channel, rir_length)`.

    Note:
        If ``absorption`` is a 2D Tensor and ``center_frequency`` is set to ``None``, the center frequencies
        of octave bands are fixed to ``[125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0]``.
        Users need to tune the values of ``absorption`` to the corresponding frequencies.
    """
    absorption = _validate_inputs(room, source, mic_array, absorption)
    image_locations, attenuations = _compute_image_sources(room, source, max_order, absorption)

    # Distance from every image source to every microphone: (image_source, channel)
    distances = torch.linalg.norm(image_locations[:, None, :] - mic_array[None, :, :], dim=-1)

    # Scale each image source by wall attenuation and 1/distance spreading:
    # (band, image_source, channel)
    scaled_attenuations = attenuations[..., None] / distances[None, ...]

    # Convert distances to delays in samples, then split off the integer part.
    delays = distances * sample_rate / sound_speed
    int_delays = torch.ceil(delays)

    # Short windowed-sinc IR per image source, realizing the fractional delay.
    short_irs = scaled_attenuations[..., None] * _frac_delay(delays, int_delays, delay_filter_length)[None, ...]

    rir_length = int(int_delays.max() + short_irs.shape[-1])
    rir = torch.ops.torchaudio._simulate_rir(short_irs, int_delays.type(torch.int32), rir_length)

    # Multi-band walls: band-pass each band's RIR around its octave center frequency.
    if absorption.shape[0] > 1:
        if center_frequency is None:
            center = torch.tensor(
                [125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0], dtype=room.dtype, device=room.device
            )
        else:
            center = center_frequency
        # n_fft is set to 512 by default.
        filters = torch.ops.torchaudio._make_rir_filter(center, sample_rate, n_fft=512)
        rir = torchaudio.functional.fftconvolve(rir, filters.unsqueeze(1).repeat(1, rir.shape[1], 1), mode="same")

    # Collapse the band dimension into one waveform per channel.
    rir = rir.sum(0)

    # Zero-pad or trim to the requested output length, if any.
    if output_length is not None:
        pad_amount = output_length - rir.shape[-1]
        if pad_amount > 0:
            rir = torch.nn.functional.pad(rir, (0, pad_amount), "constant", 0.0)
        else:
            rir = rir[..., :output_length]
    return rir
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment