mvdr_tutorial.py 8.48 KB
Newer Older
moto's avatar
moto committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
"""
MVDR with torchaudio
====================

**Author** `Zhaoheng Ni <zni@fb.com>`__

"""

######################################################################
# Overview
# ========
# 
# This is a tutorial on how to apply MVDR beamforming by using `torchaudio <https://github.com/pytorch/audio>`__.
# 
# Steps
# 
# - The Ideal Ratio Mask (IRM) is generated by dividing the clean/noise
#   power by the sum of the clean and noise power.
# - We test all three solutions (``ref_channel``, ``stv_evd``, ``stv_power``)
#   of torchaudio's MVDR module.
# - We test the single-channel and multi-channel masks for MVDR beamforming.
#   The multi-channel mask is averaged along channel dimension when computing
#   the covariance matrices of speech and noise, respectively.


######################################################################
# Preparation
# ===========
# 
# First, we import the necessary packages and retrieve the data.
# 
# The multi-channel audio example is selected from
# `ConferencingSpeech <https://github.com/ConferencingSpeech/ConferencingSpeech2021>`__
# dataset.
# 
# The original filename is
# 
#    ``SSB07200001\#noise-sound-bible-0038\#7.86_6.16_3.00_3.14_4.84_134.5285_191.7899_0.4735\#15217\#25.16333303751458\#0.2101221178590021.wav``
# 
# which was generated with:
# 
# - ``SSB07200001.wav`` from `AISHELL-3 <https://www.openslr.org/93/>`__ (Apache License v.2.0)
# - ``noise-sound-bible-0038.wav`` from `MUSAN <http://www.openslr.org/17/>`__ (Attribution 4.0 International — CC BY 4.0)
# 

import os
import requests
import torch
import torchaudio
import IPython.display as ipd

# Seed torch's random number generator and pick CUDA when it is available.
torch.random.manual_seed(0)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Report the library versions and the selected device, one per line.
for info in (torch.__version__, torchaudio.__version__, device):
    print(info)

filenames = [
    'mix.wav',
    'reverb_clean.wav',
    'clean.wav',
]
base_url = 'https://download.pytorch.org/torchaudio/tutorial-assets/mvdr'

# Create the asset directory once, before the loop, instead of on every iteration.
os.makedirs('_assets', exist_ok=True)

for filename in filenames:
    local_path = f'_assets/{filename}'
    # Test the path that is actually written below; checking the bare
    # filename in the working directory would never find a downloaded file.
    if os.path.exists(local_path):
        continue
    response = requests.get(f'{base_url}/{filename}')
    # Fail loudly on an HTTP error rather than saving an error page as audio.
    response.raise_for_status()
    # Open the file only after the download succeeded so that a failed
    # request does not leave an empty file behind that later looks cached.
    with open(local_path, 'wb') as file:
        file.write(response.content)

######################################################################
# Generate the Ideal Ratio Mask (IRM)
# ===================================
# 

######################################################################
# Loading audio data
# ------------------
# 

mix, sr = torchaudio.load('_assets/mix.wav')
reverb_clean, sr2 = torchaudio.load('_assets/reverb_clean.wav')
clean, sr3 = torchaudio.load('_assets/clean.wav')
# Every file, including clean.wav, must share one sample rate.
assert sr == sr2 == sr3, f'sample rates differ: {sr}, {sr2}, {sr3}'

# The noise is what remains of the mixture after removing the reverberant speech.
noise = mix - reverb_clean

######################################################################
# 
# .. note::
#    The MVDR module requires the noisy STFT to have ``torch.cdouble`` dtype,
#    so we convert the waveforms to ``torch.double`` before computing the STFT.
# 

# Promote all four waveforms to double precision in one pass.
mix, noise, clean, reverb_clean = (
    waveform.double() for waveform in (mix, noise, clean, reverb_clean)
)

######################################################################
# Compute STFT
# ------------
# 

# The forward and inverse transforms use the same FFT size and hop length.
stft = torchaudio.transforms.Spectrogram(n_fft=1024, hop_length=256, power=None)
istft = torchaudio.transforms.InverseSpectrogram(
    n_fft=1024,
    hop_length=256,
)

# Transform every waveform with the same Spectrogram instance.
spec_mix, spec_clean, spec_reverb_clean, spec_noise = map(
    stft, (mix, clean, reverb_clean, noise)
)

######################################################################
# Generate the Ideal Ratio Mask (IRM)
# -----------------------------------
# 
# .. note::
#    We found that using the mask directly performs better than using the
#    square root of it. This is slightly different from the definition of IRM.
# 


def get_irms(spec_clean, spec_noise, spec_mix):
    """Compute ratio masks for speech and noise from their spectrograms.

    Each mask is the power of one source divided by the summed power of both
    sources. Bins in which both sources are exactly zero get a mask value of
    0 in both outputs instead of the NaN that ``0 / 0`` would produce.

    Args:
        spec_clean: Spectrogram of the target speech.
        spec_noise: Spectrogram of the noise.
        spec_mix: Spectrogram of the mixture. Not used in the computation;
            kept so that existing call sites continue to work.

    Returns:
        Tuple ``(irm_speech, irm_noise)`` of real-valued masks.
    """
    power_clean = spec_clean.abs() ** 2
    power_noise = spec_noise.abs() ** 2
    total_power = power_clean + power_noise

    # Where the denominator is zero both numerators are zero as well, so
    # dividing by 1 there yields a mask of 0 and leaves other bins unchanged.
    safe_total = torch.where(total_power > 0, total_power, torch.ones_like(total_power))
    irm_speech = power_clean / safe_total
    irm_noise = power_noise / safe_total

    return irm_speech, irm_noise

######################################################################
# .. note::
#    We use reverberant clean speech as the target here;
#    you can also set it to dry clean speech.

# The reverberant speech spectrogram is passed as the "clean" target.
irm_speech, irm_noise = get_irms(
    spec_clean=spec_reverb_clean,
    spec_noise=spec_noise,
    spec_mix=spec_mix,
)

######################################################################
# Apply MVDR
# ==========
# 

######################################################################
# Apply MVDR beamforming by using multi-channel masks
# ---------------------------------------------------
# 

# Run each MVDR solution with the full multi-channel masks and keep the
# resulting time-domain estimate, keyed by solution name.
results_multi = {}
for solution in ('ref_channel', 'stv_evd', 'stv_power'):
    beamformer = torchaudio.transforms.MVDR(
        ref_channel=0,
        solution=solution,
        multi_mask=True,
    )
    enhanced_spec = beamformer(spec_mix, irm_speech, irm_noise)
    # Pass the mixture length so the waveform matches the input duration.
    results_multi[solution] = istft(enhanced_spec, length=mix.shape[-1])

######################################################################
# Apply MVDR beamforming by using single-channel masks
# ----------------------------------------------------
# 
# We use the 1st channel as an example.
# The channel selection may depend on the design of the microphone array.

# Run each MVDR solution with only the first channel's masks and keep the
# resulting time-domain estimate, keyed by solution name.
results_single = {}
for solution in ('ref_channel', 'stv_evd', 'stv_power'):
    beamformer = torchaudio.transforms.MVDR(
        ref_channel=0,
        solution=solution,
        multi_mask=False,
    )
    enhanced_spec = beamformer(spec_mix, irm_speech[0], irm_noise[0])
    # Pass the mixture length so the waveform matches the input duration.
    results_single[solution] = istft(enhanced_spec, length=mix.shape[-1])

######################################################################
# Compute Si-SDR scores
# ---------------------
# 

def si_sdr(estimate, reference, epsilon=1e-8):
    """Return the scale-invariant SDR, in dB, of ``estimate`` against ``reference``.

    Both inputs have their overall mean (taken over all elements) removed.
    The reference is then rescaled by its projection coefficient onto the
    estimate, and the score is the log-ratio of the scaled reference power
    to the residual power. Reductions run along dimension 1.

    Args:
        estimate: Estimated signal; indexed as (row, sample).
        reference: Reference signal, same layout as ``estimate``.
        epsilon: Added to the reference power when computing the scale.

    Returns:
        The score as a Python float. The final ``.item()`` call requires
        the per-row result to contain exactly one element.
    """
    est_centered = estimate - estimate.mean()
    ref_centered = reference - reference.mean()

    # Projection coefficient: <estimate, reference> / (<reference, reference> + eps).
    ref_energy = ref_centered.pow(2).mean(dim=1, keepdim=True)
    cross_energy = (est_centered * ref_centered).mean(dim=1, keepdim=True)
    scale = cross_energy / (ref_energy + epsilon)

    target = scale * ref_centered
    residual = est_centered - target

    target_db = 10 * torch.log10(target.pow(2).mean(dim=1))
    residual_db = 10 * torch.log10(residual.pow(2).mean(dim=1))
    return (target_db - residual_db).item()

######################################################################
# Results
# =======
# 

######################################################################
# Single-channel mask results
# ---------------------------
# 

# Score each single-channel-mask estimate against channel 0 of the reverberant speech.
for solution, enhanced in results_single.items():
    score = si_sdr(enhanced.unsqueeze(0), reverb_clean[0:1])
    print(solution+": ", score)

######################################################################
# Multi-channel mask results
# --------------------------
# 

# Score each multi-channel-mask estimate against channel 0 of the reverberant speech.
for solution, enhanced in results_multi.items():
    score = si_sdr(enhanced.unsqueeze(0), reverb_clean[0:1])
    print(solution+": ", score)

######################################################################
# Original audio
# ==============
# 

######################################################################
# Mixture speech
# --------------
# 

# Play back at the sample rate read from mix.wav instead of assuming 16 kHz.
ipd.Audio(mix[0], rate=sr)

######################################################################
# Noise
# -----
# 

# The noise is derived from the mixture, so it is played at the mixture's loaded sample rate.
ipd.Audio(noise[0], rate=sr)

######################################################################
# Clean speech
# ------------
# 

# Play back at the sample rate read from clean.wav instead of assuming 16 kHz.
ipd.Audio(clean[0], rate=sr3)

######################################################################
# Enhanced audio
# ==============
# 

######################################################################
# Multi-channel mask, ref_channel solution
# ----------------------------------------
# 

# The estimate is reconstructed from the mixture, so use the mixture's loaded sample rate.
ipd.Audio(results_multi['ref_channel'], rate=sr)

######################################################################
# Multi-channel mask, stv_evd solution
# ------------------------------------
# 

# The estimate is reconstructed from the mixture, so use the mixture's loaded sample rate.
ipd.Audio(results_multi['stv_evd'], rate=sr)

######################################################################
# Multi-channel mask, stv_power solution
# --------------------------------------
# 

# The estimate is reconstructed from the mixture, so use the mixture's loaded sample rate.
ipd.Audio(results_multi['stv_power'], rate=sr)

######################################################################
# Single-channel mask, ref_channel solution
# -----------------------------------------
# 

# The estimate is reconstructed from the mixture, so use the mixture's loaded sample rate.
ipd.Audio(results_single['ref_channel'], rate=sr)

######################################################################
# Single-channel mask, stv_evd solution
# -------------------------------------
# 

# The estimate is reconstructed from the mixture, so use the mixture's loaded sample rate.
ipd.Audio(results_single['stv_evd'], rate=sr)

######################################################################
# Single-channel mask, stv_power solution
# ---------------------------------------
# 

# The estimate is reconstructed from the mixture, so use the mixture's loaded sample rate.
ipd.Audio(results_single['stv_power'], rate=sr)