"docs/git@developer.sourcefind.cn:OpenDAS/torchaudio.git" did not exist on "3267c7ed38088e67dd1bdb4095689d82747b0d75"
Commit 8c5c9a9b authored by Zhaoheng Ni's avatar Zhaoheng Ni Committed by Facebook GitHub Bot
Browse files

Add simulate_rir_ism method for room impulse response simulation (#2880)

Summary:
replicate of https://github.com/pytorch/audio/issues/2644

Pull Request resolved: https://github.com/pytorch/audio/pull/2880

Reviewed By: mthrok

Differential Revision: D41633911

Pulled By: nateanl

fbshipit-source-id: 73cf145d75c389e996aafe96571ab86dc21f86e5
parent 3f02b898
...@@ -72,7 +72,7 @@ fi ...@@ -72,7 +72,7 @@ fi
( (
set -x set -x
conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} 'librosa>=0.8.0' parameterized 'requests>=2.20' conda install -y -c conda-forge ${NUMBA_DEV_CHANNEL} 'librosa>=0.8.0' parameterized 'requests>=2.20'
pip install kaldi-io SoundFile coverage pytest pytest-cov 'scipy==1.7.3' transformers expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pip install kaldi-io SoundFile coverage pytest pytest-cov 'scipy==1.7.3' transformers expecttest unidecode inflect Pillow sentencepiece pytorch-lightning 'protobuf<4.21.0' demucs tinytag pyroomacoustics
) )
# Install fairseq # Install fairseq
git clone https://github.com/pytorch/fairseq git clone https://github.com/pytorch/fairseq
......
...@@ -90,7 +90,8 @@ esac ...@@ -90,7 +90,8 @@ esac
unidecode \ unidecode \
'protobuf<4.21.0' \ 'protobuf<4.21.0' \
demucs \ demucs \
tinytag tinytag \
pyroomacoustics
) )
# Install fairseq # Install fairseq
git clone https://github.com/pytorch/fairseq git clone https://github.com/pytorch/fairseq
......
...@@ -54,6 +54,7 @@ endif() ...@@ -54,6 +54,7 @@ endif()
# Options # Options
option(BUILD_SOX "Build libsox statically" ON) option(BUILD_SOX "Build libsox statically" ON)
option(BUILD_KALDI "Build kaldi statically" ON) option(BUILD_KALDI "Build kaldi statically" ON)
option(BUILD_RIR "Enable RIR simulation" ON)
option(BUILD_RNNT "Enable RNN transducer" ON) option(BUILD_RNNT "Enable RNN transducer" ON)
option(BUILD_CTC_DECODER "Build Flashlight CTC decoder" ON) option(BUILD_CTC_DECODER "Build Flashlight CTC decoder" ON)
option(BUILD_TORCHAUDIO_PYTHON_EXTENSION "Build Python extension" OFF) option(BUILD_TORCHAUDIO_PYTHON_EXTENSION "Build Python extension" OFF)
......
...@@ -22,3 +22,12 @@ DSP ...@@ -22,3 +22,12 @@ DSP
oscillator_bank oscillator_bank
sinc_impulse_response sinc_impulse_response
frequency_impulse_response frequency_impulse_response
Room Impulse Response Simulation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
.. autosummary::
:toctree: generated
:nosignatures:
simulate_rir_ism
...@@ -504,3 +504,27 @@ abstract = {End-to-end spoken language translation (SLT) has recently gained pop ...@@ -504,3 +504,27 @@ abstract = {End-to-end spoken language translation (SLT) has recently gained pop
year={2021}, year={2021},
organization={IEEE} organization={IEEE}
} }
@inproceedings{scheibler2018pyroomacoustics,
title={Pyroomacoustics: A python package for audio room simulation and array processing algorithms},
author={Scheibler, Robin and Bezzam, Eric and Dokmani{\'c}, Ivan},
booktitle={2018 IEEE international conference on acoustics, speech and signal processing (ICASSP)},
pages={351--355},
year={2018},
organization={IEEE}
}
@article{allen1979image,
title={Image method for efficiently simulating small-room acoustics},
author={Allen, Jont B and Berkley, David A},
journal={The Journal of the Acoustical Society of America},
volume={65},
number={4},
pages={943--950},
year={1979},
publisher={Acoustical Society of America}
}
@misc{wiki:Absorption_(acoustics),
author = "{Wikipedia contributors}",
title = "Absorption (acoustics) --- {W}ikipedia{,} The Free Encyclopedia",
url = "https://en.wikipedia.org/wiki/Absorption_(acoustics)",
note = "[Online]"
}
...@@ -13,6 +13,7 @@ from .case_utils import ( ...@@ -13,6 +13,7 @@ from .case_utils import (
skipIfNoMacOS, skipIfNoMacOS,
skipIfNoModule, skipIfNoModule,
skipIfNoQengine, skipIfNoQengine,
skipIfNoRIR,
skipIfNoSox, skipIfNoSox,
skipIfPy310, skipIfPy310,
skipIfRocm, skipIfRocm,
...@@ -47,6 +48,7 @@ __all__ = [ ...@@ -47,6 +48,7 @@ __all__ = [
"skipIfNoMacOS", "skipIfNoMacOS",
"skipIfNoModule", "skipIfNoModule",
"skipIfNoKaldi", "skipIfNoKaldi",
"skipIfNoRIR",
"skipIfNoSox", "skipIfNoSox",
"skipIfNoSoxBackend", "skipIfNoSoxBackend",
"skipIfRocm", "skipIfRocm",
......
...@@ -225,6 +225,11 @@ skipIfNoKaldi = _skipIf( ...@@ -225,6 +225,11 @@ skipIfNoKaldi = _skipIf(
reason="Kaldi features are not available.", reason="Kaldi features are not available.",
key="NO_KALDI", key="NO_KALDI",
) )
# Skip decorator for tests that require the RIR (room impulse response)
# extension. Parallels skipIfNoKaldi above; `key` presumably selects an
# environment-variable override, matching the NO_KALDI pattern — confirm
# against _skipIf's definition.
skipIfNoRIR = _skipIf(
    not torchaudio._extension._IS_RIR_AVAILABLE,
    reason="RIR features are not available.",
    key="NO_RIR",
)
skipIfNoCtcDecoder = _skipIf( skipIfNoCtcDecoder = _skipIf(
not is_ctc_decoder_available(), not is_ctc_decoder_available(),
reason="CTC decoder not available.", reason="CTC decoder not available.",
......
import torch import torch
from torchaudio_unittest.common_utils import PytorchTestCase from torchaudio_unittest.common_utils import PytorchTestCase
from .functional_test_impl import Functional64OnlyTestImpl, FunctionalTestImpl from .functional_test_impl import Functional64OnlyTestImpl, FunctionalCPUOnlyTestImpl, FunctionalTestImpl
class FunctionalFloat32CPUTest(FunctionalTestImpl, PytorchTestCase): class FunctionalFloat32CPUTest(FunctionalTestImpl, PytorchTestCase):
...@@ -17,3 +17,13 @@ class FunctionalFloat64CPUTest(FunctionalTestImpl, PytorchTestCase): ...@@ -17,3 +17,13 @@ class FunctionalFloat64CPUTest(FunctionalTestImpl, PytorchTestCase):
class FunctionalFloat64OnlyCPUTest(Functional64OnlyTestImpl, PytorchTestCase): class FunctionalFloat64OnlyCPUTest(Functional64OnlyTestImpl, PytorchTestCase):
dtype = torch.float64 dtype = torch.float64
device = torch.device("cpu") device = torch.device("cpu")
# CPU-only instantiations of FunctionalCPUOnlyTestImpl (the RIR tests compare
# against pyroomacoustics via .numpy(), so they only run on CPU), one per dtype.
class FunctionalCPUOnlyFloat32Test(FunctionalCPUOnlyTestImpl, PytorchTestCase):
    dtype = torch.float32
    device = torch.device("cpu")


class FunctionalCPUOnlyFloat64Test(FunctionalCPUOnlyTestImpl, PytorchTestCase):
    dtype = torch.float64
    device = torch.device("cpu")
from torchaudio._internal import module_utils as _mod_utils
if _mod_utils.is_module_available("pyroomacoustics"):
import pyroomacoustics as pra
import torch import torch
import torchaudio.prototype.functional as F import torchaudio.prototype.functional as F
from parameterized import param, parameterized from parameterized import param, parameterized
from torchaudio_unittest.common_utils import nested_params, TestBaseMixin from torchaudio_unittest.common_utils import nested_params, skipIfNoModule, skipIfNoRIR, TestBaseMixin
from .dsp_utils import freq_ir as freq_ir_np, oscillator_bank as oscillator_bank_np, sinc_ir as sinc_ir_np from .dsp_utils import freq_ir as freq_ir_np, oscillator_bank as oscillator_bank_np, sinc_ir as sinc_ir_np
...@@ -424,3 +429,83 @@ class Functional64OnlyTestImpl(TestBaseMixin): ...@@ -424,3 +429,83 @@ class Functional64OnlyTestImpl(TestBaseMixin):
except AssertionError: except AssertionError:
_debug_plot() _debug_plot()
raise raise
@skipIfNoModule("pyroomacoustics")
@skipIfNoRIR
class FunctionalCPUOnlyTestImpl(TestBaseMixin):
    # Validates torchaudio's image-source-method RIR simulation against the
    # reference implementation in pyroomacoustics. CPU-only because the
    # reference runs on CPU and tensors are handed over via ``.numpy()``.

    @parameterized.expand([(1,), (4,)])
    def test_simulate_rir_ism_single_band(self, channel):
        """Test simulate_rir_ism function in the case where absorption coefficients are identical for all walls."""
        # Random but well-separated geometry: room at least 5m per side,
        # mics and source strictly inside it.
        room_dim = torch.rand(3, dtype=self.dtype, device=self.device) + 5
        mic_array = torch.rand(channel, 3, dtype=self.dtype, device=self.device) + 1
        source = torch.rand(3, dtype=self.dtype, device=self.device) + 4
        max_order = 3
        # absorption is set as a float value indicating absorption coefficients are the same for every wall.
        absorption = 0.5
        # compute rir signal by torchaudio implementation
        actual = F.simulate_rir_ism(room_dim, source, mic_array, max_order, absorption)
        # compute rir signal by pyroomacoustics
        room = pra.ShoeBox(
            room_dim.detach().numpy(),
            fs=16000,
            materials=pra.Material(absorption),
            max_order=max_order,
            ray_tracing=False,
            air_absorption=False,
        )
        # mic_locs is a numpy array of dimension `(3, channel)`.
        mic_locs = mic_array.transpose(0, 1).double().detach().numpy()
        room.add_microphone_array(mic_locs)
        room.add_source(source.tolist())
        room.compute_rir()
        # Per-channel reference RIRs can have different lengths; zero-pad to
        # the longest one before the element-wise comparison.
        max_len = max([room.rir[i][0].shape[0] for i in range(channel)])
        expected = torch.zeros(channel, max_len, dtype=self.dtype, device=self.device)
        for i in range(channel):
            expected[i, 0 : room.rir[i][0].shape[0]] = torch.from_numpy(room.rir[i][0])
        self.assertEqual(expected, actual, atol=1e-3, rtol=1e-3)

    @parameterized.expand([(1,), (4,)])
    def test_simulate_rir_ism_multi_band(self, channel):
        """Test simulate_rir_ism in the case where absorption coefficients are different for all walls."""
        room_dim = torch.rand(3, dtype=self.dtype, device=self.device) + 5
        mic_array = torch.rand(channel, 3, dtype=self.dtype, device=self.device) + 1
        source = torch.rand(3, dtype=self.dtype, device=self.device) + 4
        max_order = 3
        # absorption is set as a Tensor with dimensions `(7, 6)` indicating there are
        # 6 walls and each wall has 7 absorption coefficients corresponds to 7 octave bands, respectively.
        absorption = torch.rand(7, 6, dtype=self.dtype, device=self.device)
        walls = ["west", "east", "south", "north", "floor", "ceiling"]
        # Build the same per-wall, per-band materials for the reference room.
        room = pra.ShoeBox(
            room_dim.detach().numpy(),
            fs=16000,
            materials={
                walls[i]: pra.Material(
                    {
                        "coeffs": absorption[:, i]
                        .reshape(
                            -1,
                        )
                        .detach()
                        .numpy(),
                        "center_freqs": [125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0],
                    }
                )
                for i in range(len(walls))
            },
            max_order=max_order,
            ray_tracing=False,
            air_absorption=False,
        )
        # mic_locs is a numpy array of dimension `(D, channel)`.
        mic_locs = mic_array.transpose(0, 1).double().detach().numpy()
        room.add_microphone_array(mic_locs)
        room.add_source(source.tolist())
        room.compute_rir()
        # Zero-pad the reference RIRs to a common length, as in the test above.
        max_len = max([room.rir[i][0].shape[0] for i in range(channel)])
        expected = torch.zeros(channel, max_len, dtype=self.dtype, device=self.device)
        for i in range(channel):
            expected[i, 0 : room.rir[i][0].shape[0]] = torch.from_numpy(room.rir[i][0])
        actual = F.simulate_rir_ism(room_dim, source, mic_array, max_order, absorption)
        self.assertEqual(expected, actual, atol=1e-3, rtol=1e-3)
import torch import torch
from torchaudio_unittest.common_utils import PytorchTestCase from torchaudio_unittest.common_utils import PytorchTestCase
from .torchscript_consistency_test_impl import TorchScriptConsistencyTestImpl from .torchscript_consistency_test_impl import TorchScriptConsistencyCPUOnlyTestImpl, TorchScriptConsistencyTestImpl
class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, PytorchTestCase): class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, PytorchTestCase):
...@@ -12,3 +12,13 @@ class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, Pytor ...@@ -12,3 +12,13 @@ class TorchScriptConsistencyCPUFloat32Test(TorchScriptConsistencyTestImpl, Pytor
class TorchScriptConsistencyCPUFloat64Test(TorchScriptConsistencyTestImpl, PytorchTestCase): class TorchScriptConsistencyCPUFloat64Test(TorchScriptConsistencyTestImpl, PytorchTestCase):
dtype = torch.float64 dtype = torch.float64
device = torch.device("cpu") device = torch.device("cpu")
# CPU-only instantiations of the TorchScript consistency suite for ops that
# only run on CPU (RIR simulation), one per dtype.
class TorchScriptConsistencyCPUOnlyFloat32Test(TorchScriptConsistencyCPUOnlyTestImpl, PytorchTestCase):
    dtype = torch.float32
    device = torch.device("cpu")


class TorchScriptConsistencyCPUOnlyFloat64Test(TorchScriptConsistencyCPUOnlyTestImpl, PytorchTestCase):
    dtype = torch.float64
    device = torch.device("cpu")
...@@ -2,7 +2,8 @@ import unittest ...@@ -2,7 +2,8 @@ import unittest
import torch import torch
import torchaudio.prototype.functional as F import torchaudio.prototype.functional as F
from torchaudio_unittest.common_utils import TestBaseMixin, torch_script from parameterized import parameterized
from torchaudio_unittest.common_utils import skipIfNoRIR, TestBaseMixin, torch_script
class TorchScriptConsistencyTestImpl(TestBaseMixin): class TorchScriptConsistencyTestImpl(TestBaseMixin):
...@@ -62,3 +63,52 @@ class TorchScriptConsistencyTestImpl(TestBaseMixin): ...@@ -62,3 +63,52 @@ class TorchScriptConsistencyTestImpl(TestBaseMixin):
def test_freq_ir(self): def test_freq_ir(self):
mags = torch.tensor([0, 0.5, 1.0], device=self.device, dtype=self.dtype) mags = torch.tensor([0, 0.5, 1.0], device=self.device, dtype=self.dtype)
self._assert_consistency(F.frequency_impulse_response, (mags,)) self._assert_consistency(F.frequency_impulse_response, (mags,))
class TorchScriptConsistencyCPUOnlyTestImpl(TestBaseMixin):
    # TorchScript consistency checks for ops with CPU-only implementations
    # (currently the RIR simulation).

    def _assert_consistency(self, func, inputs, shape_only=False):
        """Assert that scripting ``func`` does not change its output.

        Args:
            func: The callable to check.
            inputs: Positional arguments; Tensors are first moved to
                ``self.device`` / ``self.dtype``.
            shape_only (bool): If ``True``, compare only output shapes.
        """
        inputs_ = []
        for i in inputs:
            if torch.is_tensor(i):
                i = i.to(device=self.device, dtype=self.dtype)
            inputs_.append(i)
        ts_func = torch_script(func)
        # Re-seed identically before each call so any randomness inside
        # ``func`` produces the same draws in eager and scripted mode.
        torch.random.manual_seed(40)
        output = func(*inputs_)
        torch.random.manual_seed(40)
        ts_output = ts_func(*inputs_)
        if shape_only:
            ts_output = ts_output.shape
            output = output.shape
        self.assertEqual(ts_output, output)

    @skipIfNoRIR
    @parameterized.expand([(1,), (4,)])
    def test_simulate_rir_ism_single_band(self, channel):
        # Single-band case: one scalar absorption applies to all six walls.
        room_dim = torch.rand(3, dtype=self.dtype, device=self.device) + 5
        mic_array = torch.rand(channel, 3, dtype=self.dtype, device=self.device) + 1
        source = torch.rand(3, dtype=self.dtype, device=self.device) + 4
        max_order = 3
        absorption = 0.5
        center_frequency = torch.tensor([125, 250, 500, 1000, 2000, 4000, 8000], dtype=self.dtype, device=self.device)
        self._assert_consistency(
            F.simulate_rir_ism,
            (room_dim, source, mic_array, max_order, absorption, None, 81, center_frequency, 343.0, 16000.0),
        )

    @skipIfNoRIR
    @parameterized.expand([(1,), (4,)])
    def test_simulate_rir_ism_multi_band(self, channel):
        # Multi-band case: a (7, 6) Tensor gives per-octave-band, per-wall
        # absorption coefficients.
        room_dim = torch.rand(3, dtype=self.dtype, device=self.device) + 5
        mic_array = torch.rand(channel, 3, dtype=self.dtype, device=self.device) + 1
        source = torch.rand(3, dtype=self.dtype, device=self.device) + 4
        max_order = 3
        absorption = torch.rand(7, 6, dtype=self.dtype, device=self.device)
        center_frequency = torch.tensor([125, 250, 500, 1000, 2000, 4000, 8000], dtype=self.dtype, device=self.device)
        self._assert_consistency(
            F.simulate_rir_ism,
            (room_dim, source, mic_array, max_order, absorption, None, 81, center_frequency, 343.0, 16000.0),
        )
...@@ -35,6 +35,7 @@ def _get_build(var, default=False): ...@@ -35,6 +35,7 @@ def _get_build(var, default=False):
_BUILD_SOX = False if platform.system() == "Windows" else _get_build("BUILD_SOX", True) _BUILD_SOX = False if platform.system() == "Windows" else _get_build("BUILD_SOX", True)
_BUILD_KALDI = False if platform.system() == "Windows" else _get_build("BUILD_KALDI", True) _BUILD_KALDI = False if platform.system() == "Windows" else _get_build("BUILD_KALDI", True)
_BUILD_RIR = _get_build("BUILD_RIR", True)
_BUILD_RNNT = _get_build("BUILD_RNNT", True) _BUILD_RNNT = _get_build("BUILD_RNNT", True)
_BUILD_CTC_DECODER = _get_build("BUILD_CTC_DECODER", True) _BUILD_CTC_DECODER = _get_build("BUILD_CTC_DECODER", True)
_USE_FFMPEG = _get_build("USE_FFMPEG", False) _USE_FFMPEG = _get_build("USE_FFMPEG", False)
...@@ -116,6 +117,7 @@ class CMakeBuild(build_ext): ...@@ -116,6 +117,7 @@ class CMakeBuild(build_ext):
f"-DPython_INCLUDE_DIR={distutils.sysconfig.get_python_inc()}", f"-DPython_INCLUDE_DIR={distutils.sysconfig.get_python_inc()}",
f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}", f"-DBUILD_SOX:BOOL={'ON' if _BUILD_SOX else 'OFF'}",
f"-DBUILD_KALDI:BOOL={'ON' if _BUILD_KALDI else 'OFF'}", f"-DBUILD_KALDI:BOOL={'ON' if _BUILD_KALDI else 'OFF'}",
f"-DBUILD_RIR:BOOL={'ON' if _BUILD_RIR else 'OFF'}",
f"-DBUILD_RNNT:BOOL={'ON' if _BUILD_RNNT else 'OFF'}", f"-DBUILD_RNNT:BOOL={'ON' if _BUILD_RNNT else 'OFF'}",
f"-DBUILD_CTC_DECODER:BOOL={'ON' if _BUILD_CTC_DECODER else 'OFF'}", f"-DBUILD_CTC_DECODER:BOOL={'ON' if _BUILD_CTC_DECODER else 'OFF'}",
"-DBUILD_TORCHAUDIO_PYTHON_EXTENSION:BOOL=ON", "-DBUILD_TORCHAUDIO_PYTHON_EXTENSION:BOOL=ON",
......
...@@ -20,6 +20,7 @@ __all__ = [ ...@@ -20,6 +20,7 @@ __all__ = [
"_check_cuda_version", "_check_cuda_version",
"_IS_TORCHAUDIO_EXT_AVAILABLE", "_IS_TORCHAUDIO_EXT_AVAILABLE",
"_IS_KALDI_AVAILABLE", "_IS_KALDI_AVAILABLE",
"_IS_RIR_AVAILABLE",
"_SOX_INITIALIZED", "_SOX_INITIALIZED",
"_FFMPEG_INITIALIZED", "_FFMPEG_INITIALIZED",
] ]
...@@ -33,9 +34,10 @@ if os.name == "nt" and (3, 8) <= sys.version_info < (3, 9): ...@@ -33,9 +34,10 @@ if os.name == "nt" and (3, 8) <= sys.version_info < (3, 9):
# In case of an error, we do not catch the failure as it suggests there is something # In case of an error, we do not catch the failure as it suggests there is something
# wrong with the installation. # wrong with the installation.
_IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio") _IS_TORCHAUDIO_EXT_AVAILABLE = is_module_available("torchaudio.lib._torchaudio")
# Kaldi features are implemented in _torchaudio extension, but it can be individually # Kaldi and RIR features are implemented in _torchaudio extension, but they can be individually
# turned on/off at build time. Available means that _torchaudio is loaded properly, and # turned on/off at build time. Available means that _torchaudio is loaded properly, and
# Kaldi features are found there. # Kaldi or RIR features are found there.
_IS_RIR_AVAILABLE = False
_IS_KALDI_AVAILABLE = False _IS_KALDI_AVAILABLE = False
if _IS_TORCHAUDIO_EXT_AVAILABLE: if _IS_TORCHAUDIO_EXT_AVAILABLE:
_load_lib("libtorchaudio") _load_lib("libtorchaudio")
...@@ -43,6 +45,7 @@ if _IS_TORCHAUDIO_EXT_AVAILABLE: ...@@ -43,6 +45,7 @@ if _IS_TORCHAUDIO_EXT_AVAILABLE:
import torchaudio.lib._torchaudio # noqa import torchaudio.lib._torchaudio # noqa
_check_cuda_version() _check_cuda_version()
_IS_RIR_AVAILABLE = torchaudio.lib._torchaudio.is_rir_available()
_IS_KALDI_AVAILABLE = torchaudio.lib._torchaudio.is_kaldi_available() _IS_KALDI_AVAILABLE = torchaudio.lib._torchaudio.is_kaldi_available()
...@@ -88,3 +91,11 @@ fail_if_no_sox = ( ...@@ -88,3 +91,11 @@ fail_if_no_sox = (
) )
fail_if_no_ffmpeg = no_op if _FFMPEG_INITIALIZED else _fail_since_no_ffmpeg fail_if_no_ffmpeg = no_op if _FFMPEG_INITIALIZED else _fail_since_no_ffmpeg
# Guard for RIR-dependent APIs: a no-op when the extension was compiled with
# RIR support, otherwise a wrapper that fails with an actionable build message.
# Mirrors fail_if_no_ffmpeg / fail_if_no_sox above.
fail_if_no_rir = (
    no_op
    if _IS_RIR_AVAILABLE
    else fail_with_message(
        "requires RIR extension, but TorchAudio is not compiled with it. Please build TorchAudio with RIR support."
    )
)
...@@ -41,6 +41,11 @@ if(BUILD_RNNT) ...@@ -41,6 +41,11 @@ if(BUILD_RNNT)
endif() endif()
endif() endif()
# Optionally compile the RIR simulation kernels. The INCLUDE_RIR compile
# definition lets the binding layer report availability at runtime
# (see is_rir_available()).
if(BUILD_RIR)
  list(APPEND sources rir.cpp)
  list(APPEND compile_definitions INCLUDE_RIR)
endif()
if(USE_CUDA) if(USE_CUDA)
list( list(
APPEND APPEND
......
...@@ -6,6 +6,7 @@ namespace { ...@@ -6,6 +6,7 @@ namespace {
PYBIND11_MODULE(_torchaudio, m) { PYBIND11_MODULE(_torchaudio, m) {
m.def("is_kaldi_available", &is_kaldi_available, ""); m.def("is_kaldi_available", &is_kaldi_available, "");
m.def("is_rir_available", &is_rir_available, "");
m.def("cuda_version", &cuda_version, ""); m.def("cuda_version", &cuda_version, "");
} }
......
/*
Copyright (c) 2014-2017 EPFL-LCAV
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
*/
/**
* Image source method implementation based on PyRoomAcoustics:
* https://github.com/LCAV/pyroomacoustics
*/
#include <torch/script.h>
#include <torch/torch.h>
#include <cmath>
using namespace torch::indexing;
namespace torchaudio {
namespace rir {
namespace {
/**
 * @brief Sum the impulse responses of all image sources into one room impulse
 * response, offsetting each contribution by its integer delay of arrival.
 * The implementation is based on the one in pyroomacoustics:
 * https://github.com/LCAV/pyroomacoustics/blob/master/pyroomacoustics/build_rir.pyx
 *
 * @tparam scalar_t The element type of the irs and rirs Tensors.
 * @param irs The impulse responses for all image sources. Tensor with
 * dimensions `(num_band, num_image, num_mic, ir_length)`.
 * @param delay The integer delays (in samples) for each image source. Tensor
 * with dimensions `(num_image, num_mic)`; read as `int` via data_ptr.
 * @param rir_length The length of the output signal. NOTE(review): writes are
 * not bounds-checked here, so the caller must guarantee
 * `rir_length >= ir_length + max(delay)`.
 * @param num_band The number of frequency bands for the wall materials.
 * @param num_image The number of image sources in irs.
 * @param num_mic The number of microphones in the array.
 * @param ir_length The length of each per-image impulse response.
 * @param rirs The output room impulse response, accumulated in place. Tensor
 * with dimensions `(num_band, num_mic, rir_length)`.
 */
template <typename scalar_t>
void simulate_rir_impl(
    const torch::Tensor& irs,
    const torch::Tensor& delay,
    const int64_t rir_length,
    const int64_t num_band,
    const int64_t num_image,
    const int64_t num_mic,
    const int64_t ir_length,
    torch::Tensor& rirs) {
  const scalar_t* input_data = irs.data_ptr<scalar_t>();
  const int* delay_data = delay.data_ptr<int>();
  scalar_t* output_data = rirs.data_ptr<scalar_t>();
  // The flat index i enumerates (band, image, mic) triples in row-major
  // order; decompose it back into the three coordinates.
  for (auto i = 0; i < num_band * num_image * num_mic; i++) {
    int64_t offset_input = i * ir_length;
    int64_t mic = i % num_mic;
    int64_t image = ((i - mic) / num_mic) % num_image;
    int64_t band = (i - mic - image * num_mic) / (num_image * num_mic);
    int64_t offset_output = (band * num_mic + mic) * rir_length;
    int64_t offset_delay = image * num_mic + mic;
    // Accumulate this image source's IR into the output, shifted by its
    // arrival delay.
    for (auto j = 0; j < ir_length; j++) {
      output_data[offset_output + j + delay_data[offset_delay]] +=
          input_data[offset_input + j];
    }
  }
}
/**
 * @brief Sum up impulse response signals of all image sources into one Tensor
 * based on their delays of arrival.
 *
 * @param irs The impulse responses for all image sources. Tensor with
 * dimensions `(num_band, num_image, num_mic, ir_length)`.
 * @param delay The integer delays for each image source. Tensor with
 * dimensions `(num_image, num_mic)`.
 * @param rir_length The length of the output room impulse response signal.
 * @return torch::Tensor The output room impulse response signal. Tensor with
 * dimensions `(num_band, num_mic, rir_length)`.
 */
torch::Tensor simulate_rir(
    const torch::Tensor& irs,
    const torch::Tensor& delay,
    const int64_t rir_length) {
  const int64_t num_band = irs.size(0);
  const int64_t num_image = irs.size(1);
  const int64_t num_mic = irs.size(2);
  const int64_t ir_length = irs.size(3);
  // The output starts at zero; the kernel accumulates every image source
  // into it.
  torch::Tensor rirs =
      torch::zeros({num_band, num_mic, rir_length}, irs.dtype());
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(irs.scalar_type(), "build_rir", [&] {
    simulate_rir_impl<scalar_t>(
        irs, delay, rir_length, num_band, num_image, num_mic, ir_length, rirs);
  });
  return rirs;
}
/**
 * @brief Create the band-pass filters for the octave bands.
 * The implementation is based on the one in pyroomacoustics:
 * https://github.com/LCAV/pyroomacoustics/blob/master/pyroomacoustics/acoustics.py#L261
 *
 * @tparam scalar_t The element type of the center-frequency and filter
 * Tensors.
 * @param centers The Tensor that stores the center frequencies of octave
 * bands. Tensor with dimension `(num_band,)`.
 * @param sample_rate The sample_rate of simulated room impulse response
 * signal.
 * @param n_fft The number of fft points.
 * @param filters The output band-pass filter. Tensor with dimensions
 * `(num_band, n_fft - 1)`.
 */
template <typename scalar_t>
void make_rir_filter_impl(
    torch::Tensor& centers,
    double sample_rate,
    int64_t n_fft,
    torch::Tensor& filters) {
  int64_t n = centers.size(0);
  // new_bands[i] holds the (lower, upper) frequency edges of band i: the
  // neighbouring bands' centers, except the first band starts at half its
  // own center and the last band ends at Nyquist.
  torch::Tensor new_bands = torch::zeros({n, 2}, centers.dtype());
  scalar_t* newband_data = new_bands.data_ptr<scalar_t>();
  const scalar_t* centers_data = centers.data_ptr<scalar_t>();
  for (int64_t i = 0; i < n; i++) {
    if (i == 0) {
      newband_data[i * 2] = centers_data[0] / 2;
      newband_data[i * 2 + 1] = centers_data[1];
    } else if (i == n - 1) {
      newband_data[i * 2] = centers_data[n - 2];
      newband_data[i * 2 + 1] = sample_rate / 2;
    } else {
      newband_data[i * 2] = centers_data[i - 1];
      newband_data[i * 2 + 1] = centers_data[i + 1];
    }
  }
  const auto half = 0.5;
  auto n_freq = n_fft / 2 + 1;
  // Build each band's magnitude response on the one-sided frequency grid,
  // with raised-cosine transitions between neighbouring bands.
  torch::Tensor freq_resp = torch::zeros({n_freq, n}, centers.dtype());
  torch::Tensor freq =
      torch::arange(n_freq, centers.dtype()) / n_fft * sample_rate;
  const scalar_t* freq_data = freq.data_ptr<scalar_t>();
  scalar_t* freqreq_data = freq_resp.data_ptr<scalar_t>();
  for (auto i = 0; i < n; i++) {
    for (auto j = 0; j < n_freq; j++) {
      // Rising edge: from the band's lower edge up to its center frequency.
      if (freq_data[j] >= newband_data[i * 2] &&
          freq_data[j] < centers_data[i]) {
        freqreq_data[j * n + i] =
            half * (1 + cos(2 * M_PI * freq_data[j] / centers_data[i]));
      }
      // Falling edge: from the center down to the band's upper edge
      // (skipped for the top band).
      if (i != n - 1 && freq_data[j] >= centers_data[i] &&
          freq_data[j] < newband_data[i * 2 + 1]) {
        freqreq_data[j * n + i] =
            half * (1 - cos(2 * M_PI * freq_data[j] / newband_data[i * 2 + 1]));
      }
      // The top band keeps unit gain from its center up to Nyquist.
      if (i == n - 1 && centers_data[i] <= freq_data[j]) {
        freqreq_data[j * n + i] = 1.0;
      }
    }
  }
  // Back to the time domain, center the filters with fftshift, drop the
  // first sample, and lay filters out as rows: `(num_band, n_fft - 1)`.
  filters = torch::fft::fftshift(torch::fft::irfft(freq_resp, n_fft, 0), 0);
  filters = filters.index({Slice(1)}).transpose(0, 1);
}
/**
 * @brief Create the band-pass filters for the octave bands.
 *
 * @param centers The Tensor that stores the center frequencies of octave
 * bands. Tensor with dimension `(num_band,)`.
 * @param sample_rate The sample_rate of simulated room impulse response
 * signal.
 * @param n_fft The number of fft points.
 * @return torch::Tensor The output band-pass filter. Tensor with dimensions
 * `(num_band, n_fft - 1)`.
 */
torch::Tensor make_rir_filter(
    torch::Tensor centers,
    double sample_rate,
    int64_t n_fft) {
  torch::Tensor filters;
  // Dispatch on the dtype of `centers`; the typed kernel fills `filters`.
  AT_DISPATCH_FLOATING_TYPES_AND_HALF(
      centers.scalar_type(), "make_filter", [&] {
        make_rir_filter_impl<scalar_t>(centers, sample_rate, n_fft, filters);
      });
  return filters;
}
// Register the CPU kernels and the operator schemas so the ops are callable
// from Python as torch.ops.torchaudio._simulate_rir / _make_rir_filter.
TORCH_LIBRARY_IMPL(torchaudio, CPU, m) {
  m.impl("torchaudio::_simulate_rir", torchaudio::rir::simulate_rir);
  m.impl("torchaudio::_make_rir_filter", torchaudio::rir::make_rir_filter);
}

TORCH_LIBRARY_FRAGMENT(torchaudio, m) {
  m.def(
      "torchaudio::_simulate_rir(Tensor irs, Tensor delay_i, int rir_length) -> Tensor");
  m.def(
      "torchaudio::_make_rir_filter(Tensor centers, float sample_rate, int n_fft) -> Tensor");
}
} // Anonymous namespace
} // namespace rir
} // namespace torchaudio
...@@ -15,6 +15,14 @@ bool is_kaldi_available() { ...@@ -15,6 +15,14 @@ bool is_kaldi_available() {
#endif #endif
} }
// Reports whether this build includes the RIR simulation kernels
// (INCLUDE_RIR is defined by the build system when BUILD_RIR=ON).
bool is_rir_available() {
#ifdef INCLUDE_RIR
  return true;
#else
  return false;
#endif
}
c10::optional<int64_t> cuda_version() { c10::optional<int64_t> cuda_version() {
#ifdef USE_CUDA #ifdef USE_CUDA
return CUDA_VERSION; return CUDA_VERSION;
......
...@@ -3,5 +3,6 @@ ...@@ -3,5 +3,6 @@
namespace torchaudio { namespace torchaudio {
bool is_kaldi_available(); bool is_kaldi_available();
bool is_rir_available();
c10::optional<int64_t> cuda_version(); c10::optional<int64_t> cuda_version();
} // namespace torchaudio } // namespace torchaudio
...@@ -6,6 +6,7 @@ from ._dsp import ( ...@@ -6,6 +6,7 @@ from ._dsp import (
oscillator_bank, oscillator_bank,
sinc_impulse_response, sinc_impulse_response,
) )
from ._rir import simulate_rir_ism
from .functional import barkscale_fbanks from .functional import barkscale_fbanks
...@@ -17,4 +18,5 @@ __all__ = [ ...@@ -17,4 +18,5 @@ __all__ = [
"frequency_impulse_response", "frequency_impulse_response",
"oscillator_bank", "oscillator_bank",
"sinc_impulse_response", "sinc_impulse_response",
"simulate_rir_ism",
] ]
import math
from typing import Optional, Tuple, Union
import torch
import torchaudio
from torch import Tensor
def _compute_image_sources(
room: torch.Tensor,
source: torch.Tensor,
max_order: int,
absorption: torch.Tensor,
scatter: Optional[torch.Tensor] = None,
) -> Tuple[Tensor, Tensor]:
"""Compute image sources in a shoebox-like room.
Args:
room (torch.Tensor): The 1D Tensor to determine the room size. The shape is
`(D,)`, where ``D`` is 2 if room is a 2D room, or 3 if room is a 3D room.
source (torch.Tensor): The coordinate of the sound source. Tensor with dimensions
`(D)`.
max_order (int): The maximum number of reflections of the source.
absorption (torch.Tensor): The absorption coefficients of wall materials.
``absorption`` is a Tensor with dimensions `(num_band, num_wall)`.
The shape options are ``[(1, 4), (1, 6), (7, 4), (7, 6)]``.
``num_band`` is `1` if the coefficients is the same for all frequencies, or is `7`
if the coefficients are different to different frequencies. `7` refers to the default number
of octave bands. (See note in `simulate_rir_ism` method).
``num_wall`` is `4` if the room is a 2D room, representing absorption coefficients
of ``"west"``, ``"east"``, ``"south"``, and ``"north"`` walls, respectively.
Or it is `6` if the room is a 3D room, representing absorption coefficients
of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``, and ``"ceiling"``, respectively.
scatter (torch.Tensor): The scattering coefficients of wall materials.
The shape of ``scatter`` must match that of ``absorption``. If ``None``, it is not
used in image source computation. (Default: ``None``)
Returns:
(torch.Tensor): The coordinates of all image sources within ``max_order`` number of reflections.
Tensor with dimensions `(num_image_source, D)`.
(torch.Tensor): The attenuation of corresponding image sources. Tensor with dimensions
`(num_band, num_image_source)`.
"""
if scatter is None:
tr = torch.sqrt(1 - absorption)
else:
tr = torch.sqrt(1 - absorption) * torch.sqrt(1 - scatter)
ind = torch.arange(-max_order, max_order + 1, device=source.device)
if room.shape[0] == 2:
XYZ = torch.meshgrid(ind, ind, indexing="ij")
else:
XYZ = torch.meshgrid(ind, ind, ind, indexing="ij")
XYZ = torch.stack([c.reshape((-1,)) for c in XYZ], dim=-1)
XYZ = XYZ[XYZ.abs().sum(dim=-1) <= max_order]
# compute locations of image sources
d = room[None, :]
s = source[None, :]
img_loc = torch.where(XYZ % 2 == 1, d * (XYZ + 1) - s, d * XYZ + s)
# attenuation
exp_lo = abs(torch.floor((XYZ / 2)))
exp_hi = abs(torch.floor((XYZ + 1) / 2))
t_lo = tr[:, ::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1) # (num_band, left walls)
t_hi = tr[:, 1::2].unsqueeze(1).repeat(1, XYZ.shape[0], 1) # (num_band, right walls)
att = torch.prod((t_lo**exp_lo) * (t_hi**exp_hi), dim=-1) # (num_band, num_image_source)
return img_loc, att
def _hann(x: torch.Tensor, T: int):
"""Compute the Hann window where the values are truncated based on window length.
torch.hann_window can only sample window function at integer points, the method is to sample
continuous window function at non-integer points.
Args:
x (torch.Tensor): The fractional component of time delay Tensor.
T (torch.Tensor): The window length of sinc function.
Returns:
(torch.Tensor): The hann window Tensor where values outside
the sinc window (`T`) is set to zero.
"""
y = torch.where(
torch.abs(x) <= T / 2,
0.5 * (1 + torch.cos(2 * math.pi * x / T)),
x.new_zeros(1),
)
return y
def _frac_delay(delay: torch.Tensor, delay_i: torch.Tensor, delay_filter_length: int):
    """Build windowed-sinc filters that realize fractional sample delays.

    Args:
        delay (torch.Tensor): The time delays, in (possibly fractional) samples.
        delay_i (torch.Tensor): The integer part of ``delay``.
        delay_filter_length (int): The length of the sinc interpolation filter;
            must be odd.

    Returns:
        (torch.Tensor): The impulse response filters for all image sources, with
        ``delay_filter_length`` taps appended as a trailing dimension.

    Raises:
        ValueError: If ``delay_filter_length`` is even.
    """
    if delay_filter_length % 2 == 0:
        raise ValueError("The filter length must be odd")

    half = delay_filter_length // 2
    # Tap positions, centered on the integer part of each delay.
    taps = torch.arange(-half, half + 1, device=delay.device) + delay_i[..., None]
    offsets = taps - delay[..., None]
    # Hann-windowed sinc interpolation of the fractional shift.
    return torch.special.sinc(offsets) * _hann(offsets, 2 * half)
def _validate_inputs(
room: torch.Tensor, source: torch.Tensor, mic_array: torch.Tensor, absorption: Union[float, torch.Tensor]
) -> torch.Tensor:
"""Validate dimensions of input arguments, and normalize different kinds of absorption into the same dimension.
Args:
room (torch.Tensor): Room coordinates. The shape of `room` must be `(3,)` which represents
three dimensions of the room.
source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
absorption (float or torch.Tensor): The absorption coefficients of wall materials.
If the dtype is ``float``, the absorption coefficient is identical for all walls and
all frequencies.
If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, where the values represent
absorption coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``,
and ``"ceiling"``, respectively.
If ``absorption`` is a 2D Tensor, the shape must be `(7, 6)`, where 7 represents the number of octave bands.
Returns:
(torch.Tensor): The absorption Tensor. The shape is `(1, 6)` for single octave band case,
or `(7, 6)` for multi octave band case.
"""
if room.ndim != 1:
raise ValueError(f"room must be a 1D Tensor. Found {room.shape}.")
D = room.shape[0]
if D != 3:
raise ValueError(f"room must be a 3D room. Found {room.shape}.")
num_wall = 6
if source.shape[0] != D:
raise ValueError(f"The shape of source must be `(3,)`. Found {source.shape}")
if mic_array.ndim != 2:
raise ValueError(f"mic_array must be a 2D Tensor. Found {mic_array.shape}.")
if mic_array.shape[1] != D:
raise ValueError(f"The second dimension of mic_array must be 3. Found {mic_array.shape}.")
if isinstance(absorption, float):
absorption = torch.ones(1, num_wall) * absorption
elif isinstance(absorption, Tensor) and absorption.ndim == 1:
if absorption.shape[0] != num_wall:
raise ValueError(
"The shape of absorption must be `(6,)` if it is a 1D Tensor." f"Found the shape {absorption.shape}."
)
absorption = absorption.unsqueeze(0)
elif isinstance(absorption, Tensor) and absorption.ndim == 2:
if absorption.shape != (7, num_wall):
raise ValueError(
"The shape of absorption must be `(7, 6)` if it is a 2D Tensor."
f"Found the shape of room is {D} and shape of absorption is {absorption.shape}."
)
absorption = absorption
else:
absorption = absorption
return absorption
def simulate_rir_ism(
    room: torch.Tensor,
    source: torch.Tensor,
    mic_array: torch.Tensor,
    max_order: int,
    absorption: Union[float, torch.Tensor],
    output_length: Optional[int] = None,
    delay_filter_length: int = 81,
    center_frequency: Optional[torch.Tensor] = None,
    sound_speed: float = 343.0,
    sample_rate: float = 16000.0,
) -> Tensor:
    r"""Compute Room Impulse Response (RIR) based on the *image source method* :cite:`allen1979image`.
    The implementation is based on *pyroomacoustics* :cite:`scheibler2018pyroomacoustics`.

    .. devices:: CPU

    .. properties:: TorchScript

    Args:
        room (torch.Tensor): Room coordinates. The shape of `room` must be `(3,)` which represents
            three dimensions of the room.
        source (torch.Tensor): Sound source coordinates. Tensor with dimensions `(3,)`.
        mic_array (torch.Tensor): Microphone coordinates. Tensor with dimensions `(channel, 3)`.
        max_order (int): The maximum number of reflections of the source.
        absorption (float or torch.Tensor): The *absorption* :cite:`wiki:Absorption_(acoustics)`
            coefficients of wall materials for sound energy.
            If the dtype is ``float``, the absorption coefficient is identical for all walls and
            all frequencies.
            If ``absorption`` is a 1D Tensor, the shape must be `(6,)`, where the values represent
            absorption coefficients of ``"west"``, ``"east"``, ``"south"``, ``"north"``, ``"floor"``,
            and ``"ceiling"``, respectively.
            If ``absorption`` is a 2D Tensor, the shape must be `(7, 6)`, where 7 represents the number of octave bands.
        output_length (int or None, optional): The output length of simulated RIR signal. If ``None``,
            the length is defined as

            .. math::
                \frac{\text{max\_d} \cdot \text{sample\_rate}}{\text{sound\_speed}} + \text{delay\_filter\_length}

            where ``max_d`` is the maximum distance between image sources and microphones.
        delay_filter_length (int, optional): The filter length for computing sinc function. (Default: ``81``)
        center_frequency (torch.Tensor, optional): The center frequencies of octave bands for multi-band walls.
            Only used when ``absorption`` is a 2D Tensor.
        sound_speed (float, optional): The speed of sound. (Default: ``343.0``)
        sample_rate (float, optional): The sample rate of the generated room impulse response signal.
            (Default: ``16000.0``)

    Returns:
        (torch.Tensor): The simulated room impulse response waveform. Tensor with dimensions
        `(channel, rir_length)`.

    Note:
        If ``absorption`` is a 2D Tensor and ``center_frequency`` is set to ``None``, the center frequencies
        of octave bands are fixed to ``[125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0]``.
        Users need to tune the values of ``absorption`` to the corresponding frequencies.
    """
    absorption = _validate_inputs(room, source, mic_array, absorption)
    image_locations, attenuations = _compute_image_sources(room, source, max_order, absorption)

    # Distance from every image source to every microphone: (image_source, channel)
    distances = torch.linalg.norm(image_locations[:, None, :] - mic_array[None, :, :], dim=-1)

    # Scale each image source by wall attenuation and 1/distance spreading:
    # (band, image_source, channel)
    scaled_attenuations = attenuations[..., None] / distances[None, ...]

    # Convert distances to delays in samples, then split off the integer part.
    delays = distances * sample_rate / sound_speed
    int_delays = torch.ceil(delays)

    # Short windowed-sinc IR per image source, realizing the fractional delay.
    short_irs = scaled_attenuations[..., None] * _frac_delay(delays, int_delays, delay_filter_length)[None, ...]

    rir_length = int(int_delays.max() + short_irs.shape[-1])
    rir = torch.ops.torchaudio._simulate_rir(short_irs, int_delays.type(torch.int32), rir_length)

    # Multi-band walls: band-pass each band's RIR around its octave center frequency.
    if absorption.shape[0] > 1:
        if center_frequency is None:
            center = torch.tensor(
                [125.0, 250.0, 500.0, 1000.0, 2000.0, 4000.0, 8000.0], dtype=room.dtype, device=room.device
            )
        else:
            center = center_frequency
        # n_fft is set to 512 by default.
        filters = torch.ops.torchaudio._make_rir_filter(center, sample_rate, n_fft=512)
        rir = torchaudio.functional.fftconvolve(rir, filters.unsqueeze(1).repeat(1, rir.shape[1], 1), mode="same")

    # Collapse the band dimension into one waveform per channel.
    rir = rir.sum(0)

    # Zero-pad or trim to the requested output length, if any.
    if output_length is not None:
        pad_amount = output_length - rir.shape[-1]
        if pad_amount > 0:
            rir = torch.nn.functional.pad(rir, (0, pad_amount), "constant", 0.0)
        else:
            rir = rir[..., :output_length]
    return rir
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment