"""
MVDR with torchaudio
====================

**Author** `Zhaoheng Ni <zni@fb.com>`__

"""

######################################################################
# Overview
# --------
# 
# This is a tutorial on how to apply MVDR beamforming by using `torchaudio <https://github.com/pytorch/audio>`__.
# 
# Steps
# 
# - Ideal Ratio Mask (IRM) is generated by dividing the clean/noise
#   magnitude by the mixture magnitude.
# - We test all three solutions (``ref_channel``, ``stv_evd``, ``stv_power``)
#   of torchaudio's MVDR module.
# - We test the single-channel and multi-channel masks for MVDR beamforming.
#   The multi-channel mask is averaged along channel dimension when computing
#   the covariance matrices of speech and noise, respectively.


######################################################################
# Preparation
# -----------
# 
# First, we import the necessary packages and retrieve the data.
# 
# The multi-channel audio example is selected from
# `ConferencingSpeech <https://github.com/ConferencingSpeech/ConferencingSpeech2021>`__
# dataset.
# 
# The original filename is
# 
#    ``SSB07200001\#noise-sound-bible-0038\#7.86_6.16_3.00_3.14_4.84_134.5285_191.7899_0.4735\#15217\#25.16333303751458\#0.2101221178590021.wav``
# 
# which was generated with:
# 
# - ``SSB07200001.wav`` from `AISHELL-3 <https://www.openslr.org/93/>`__ (Apache License v.2.0)
# - ``noise-sound-bible-0038.wav`` from `MUSAN <http://www.openslr.org/17/>`__ (Attribution 4.0 International — CC BY 4.0)
# 

import os
import requests
import torch
import torchaudio
import IPython.display as ipd

# Seed PyTorch's RNG so repeated runs of the tutorial behave the same way.
torch.random.manual_seed(0)

# Use CUDA when PyTorch reports it as available, otherwise fall back to CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Report the library versions and the selected device, one per line.
print(torch.__version__, torchaudio.__version__, device, sep='\n')

# Audio assets used by this tutorial, fetched once into the local ``_assets``
# directory.
filenames = [
    'mix.wav',
    'reverb_clean.wav',
    'clean.wav',
]
base_url = 'https://download.pytorch.org/torchaudio/tutorial-assets/mvdr'

# The target directory only needs to be created once, not per file.
os.makedirs('_assets', exist_ok=True)

for filename in filenames:
    local_path = f'_assets/{filename}'
    # Check the path the file is actually written to; testing the bare
    # filename never matches, which forces a re-download on every run.
    if os.path.exists(local_path):
        continue
    response = requests.get(f'{base_url}/{filename}')
    # Fail loudly on an HTTP error instead of saving an error page as a .wav.
    response.raise_for_status()
    # Open the file only after a successful download so that a failed request
    # does not leave an empty file that later runs would treat as cached.
    with open(local_path, 'wb') as file:
        file.write(response.content)

######################################################################
# Generate the Ideal Ratio Mask (IRM)
# -----------------------------------
# 

######################################################################
# Loading audio data
# ~~~~~~~~~~~~~~~~~~
# 

# Load the noisy mixture, the reverberant clean speech and the dry clean
# speech, each together with its sample rate.
mix, sr = torchaudio.load('_assets/mix.wav')
reverb_clean, sr2 = torchaudio.load('_assets/reverb_clean.wav')
clean, sr3 = torchaudio.load('_assets/clean.wav')
# All three signals go through the same STFT settings and are played back at
# the same rate below, so every file (including ``clean``) must agree.
assert sr == sr2 == sr3

# The noise component is what remains of the mixture once the reverberant
# clean speech is subtracted.
noise = mix - reverb_clean

######################################################################
# 
# .. note::
#    The MVDR Module requires ``torch.cdouble`` dtype for noisy STFT.
#    We need to convert the dtype of the waveforms to ``torch.double``
# 

# Cast every waveform to double precision in one pass.
mix, noise, clean, reverb_clean = (
    waveform.to(torch.double)
    for waveform in (mix, noise, clean, reverb_clean)
)

######################################################################
# Compute STFT
# ~~~~~~~~~~~~
# 

# Forward and inverse transforms share the same FFT size and hop length.
# ``power=None`` is passed so the forward transform is not reduced to a
# magnitude/power spectrogram (see the torchaudio Spectrogram documentation).
stft = torchaudio.transforms.Spectrogram(n_fft=1024, hop_length=256, power=None)
istft = torchaudio.transforms.InverseSpectrogram(hop_length=256, n_fft=1024)

# Transform the mixture, dry clean speech, reverberant clean speech and noise.
spec_mix, spec_clean, spec_reverb_clean, spec_noise = map(
    stft, (mix, clean, reverb_clean, noise)
)

######################################################################
# Generate the Ideal Ratio Mask (IRM)
moto's avatar
moto committed
120
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
moto's avatar
moto committed
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
# 
# .. note::
#    We found that using the mask directly performs better than using the
#    square root of it. This is slightly different from the definition of IRM.
# 


def get_irms(spec_clean, spec_noise, spec_mix):
    """Compute ratio masks for speech and noise from their spectrograms.

    Each mask is the squared magnitude (power) of one component divided by
    the summed power of both components, so the two masks add up to one
    wherever the total power is non-zero.

    Args:
        spec_clean: Spectrogram of the target speech.
        spec_noise: Spectrogram of the noise.
        spec_mix: Spectrogram of the mixture. Not used by the computation;
            kept so existing callers keep working.

    Returns:
        Tuple ``(irm_speech, irm_noise)`` with the same shape as the inputs.
    """
    power_clean = spec_clean.abs() ** 2
    power_noise = spec_noise.abs() ** 2
    total_power = power_clean + power_noise
    if total_power.is_floating_point():
        # Bins where both components are exactly zero would give 0 / 0 = NaN.
        # Clamping only the denominator to the smallest normal float turns
        # those bins into 0 and leaves ordinary bins unchanged.
        total_power = total_power.clamp(min=torch.finfo(total_power.dtype).tiny)

    irm_speech = power_clean / total_power
    irm_noise = power_noise / total_power

    return irm_speech, irm_noise

######################################################################
# .. note::
#    We use reverberant clean speech as the target here,
#    you can also set it to dry clean speech.

irm_speech, irm_noise = get_irms(spec_reverb_clean, spec_noise, spec_mix)

######################################################################
# Apply MVDR
# ----------
# 

######################################################################
# Apply MVDR beamforming by using multi-channel masks
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 

# Run the beamformer once per solution, passing the full (per-channel) masks,
# and keep each enhanced waveform keyed by the solution name.
results_multi = {}
for solution in ('ref_channel', 'stv_evd', 'stv_power'):
    mvdr = torchaudio.transforms.MVDR(multi_mask=True, solution=solution, ref_channel=0)
    # Invert the enhanced STFT, requesting the mixture's original length.
    results_multi[solution] = istft(
        mvdr(spec_mix, irm_speech, irm_noise),
        length=mix.shape[-1],
    )

######################################################################
# Apply MVDR beamforming by using single-channel masks
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 
# We use the 1st channel as an example.
# The channel selection may depend on the design of the microphone array.

# Run the beamformer once per solution, passing only the first channel's
# masks, and keep each enhanced waveform keyed by the solution name.
results_single = {}
for solution in ('ref_channel', 'stv_evd', 'stv_power'):
    mvdr = torchaudio.transforms.MVDR(multi_mask=False, solution=solution, ref_channel=0)
    # Invert the enhanced STFT, requesting the mixture's original length.
    results_single[solution] = istft(
        mvdr(spec_mix, irm_speech[0], irm_noise[0]),
        length=mix.shape[-1],
    )

######################################################################
# Compute Si-SDR scores
moto's avatar
moto committed
177
# ~~~~~~~~~~~~~~~~~~~~~
moto's avatar
moto committed
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# 

def si_sdr(estimate, reference, epsilon=1e-8):
    """Return the scale-invariant signal-to-distortion ratio in dB.

    The reference is rescaled to best match the estimate, and the result is
    the power ratio (in dB) between that rescaled reference and the residual.

    Args:
        estimate: Estimated signal. Reductions run over axis 1 and the result
            is read with ``.item()``, so this presumably expects a single
            waveform shaped ``(1, time)`` - confirm against callers.
        reference: Reference signal, broadcast-compatible with ``estimate``.
        epsilon: Small constant added to the reference power when computing
            the scaling factor, to avoid dividing by zero.

    Returns:
        The score as a Python float.
    """
    # Remove the mean, taken over every element of each tensor.
    est_centered = estimate - estimate.mean()
    ref_centered = reference - reference.mean()

    # Scale factor that projects the estimate onto the reference.
    ref_power = ref_centered.pow(2).mean(dim=1, keepdim=True)
    cross_power = (est_centered * ref_centered).mean(dim=1, keepdim=True)
    projection = cross_power / (ref_power + epsilon)

    target = projection * ref_centered
    residual = est_centered - target

    target_power = target.pow(2).mean(dim=1)
    residual_power = residual.pow(2).mean(dim=1)

    score = 10 * torch.log10(target_power) - 10 * torch.log10(residual_power)
    return score.item()

######################################################################
# Results
# -------
# 

######################################################################
# Single-channel mask results
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 

# Score each single-channel-mask output against the first channel of the
# reverberant clean speech; a leading axis is added to the enhanced waveform.
for solution, enhanced in results_single.items():
    print(f"{solution}: ", si_sdr(enhanced[None, ...], reverb_clean[0:1]))

######################################################################
# Multi-channel mask results
# ~~~~~~~~~~~~~~~~~~~~~~~~~~
# 

# Score each multi-channel-mask output against the first channel of the
# reverberant clean speech; a leading axis is added to the enhanced waveform.
for solution, enhanced in results_multi.items():
    print(f"{solution}: ", si_sdr(enhanced[None, ...], reverb_clean[0:1]))

######################################################################
# Original audio
# --------------
# 

######################################################################
# Mixture speech
# ~~~~~~~~~~~~~~
# 

# Play the first channel at the sample rate the mixture was loaded with,
# rather than a hard-coded value.
ipd.Audio(mix[0], rate=sr)

######################################################################
# Noise
# ~~~~~
# 

# The noise is derived from the mixture, so play it at the mixture's loaded
# sample rate rather than a hard-coded value.
ipd.Audio(noise[0], rate=sr)

######################################################################
# Clean speech
# ~~~~~~~~~~~~
# 

# Play the first channel at the sample rate returned when ``clean.wav`` was
# loaded, rather than a hard-coded value.
ipd.Audio(clean[0], rate=sr3)

######################################################################
# Enhanced audio
# --------------
# 

######################################################################
# Multi-channel mask, ref_channel solution
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 

# The enhanced signal is derived from the mixture, so play it at the
# mixture's loaded sample rate rather than a hard-coded value.
ipd.Audio(results_multi['ref_channel'], rate=sr)

######################################################################
# Multi-channel mask, stv_evd solution
moto's avatar
moto committed
260
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
moto's avatar
moto committed
261
262
263
264
265
266
# 

# The enhanced signal is derived from the mixture, so play it at the
# mixture's loaded sample rate rather than a hard-coded value.
ipd.Audio(results_multi['stv_evd'], rate=sr)

######################################################################
# Multi-channel mask, stv_power solution
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 

# The enhanced signal is derived from the mixture, so play it at the
# mixture's loaded sample rate rather than a hard-coded value.
ipd.Audio(results_multi['stv_power'], rate=sr)

######################################################################
# Single-channel mask, ref_channel solution
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 

# The enhanced signal is derived from the mixture, so play it at the
# mixture's loaded sample rate rather than a hard-coded value.
ipd.Audio(results_single['ref_channel'], rate=sr)

######################################################################
# Single-channel mask, stv_evd solution
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 

# The enhanced signal is derived from the mixture, so play it at the
# mixture's loaded sample rate rather than a hard-coded value.
ipd.Audio(results_single['stv_evd'], rate=sr)

######################################################################
# Single-channel mask, stv_power solution
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# 

# The enhanced signal is derived from the mixture, so play it at the
# mixture's loaded sample rate rather than a hard-coded value.
ipd.Audio(results_single['stv_power'], rate=sr)