Commit 498722b5 authored by moto's avatar moto
Browse files

Store n_bits in WaveRNN (#1847)

Move the computation of `#classes -> #bits` to the constructor of WaveRNN and attach it to the instance, so that it can be reused elsewhere.
parent 202bc4f2
...@@ -193,12 +193,10 @@ class WaveRNNInferenceWrapper(torch.nn.Module): ...@@ -193,12 +193,10 @@ class WaveRNNInferenceWrapper(torch.nn.Module):
if batched: if batched:
specgram = _fold_with_overlap(specgram, timesteps, overlap) specgram = _fold_with_overlap(specgram, timesteps, overlap)
n_bits = int(torch.log2(torch.ones(1) * self.wavernn_model.n_classes))
output = self.wavernn_model.infer(specgram).cpu() output = self.wavernn_model.infer(specgram).cpu()
if mulaw: if mulaw:
output = normalized_waveform_to_bits(output, n_bits) output = normalized_waveform_to_bits(output, self.wavernn_model.n_bits)
output = torchaudio.functional.mu_law_decoding(output, self.wavernn_model.n_classes) output = torchaudio.functional.mu_law_decoding(output, self.wavernn_model.n_classes)
if batched: if batched:
......
...@@ -6,6 +6,7 @@ from parameterized import parameterized ...@@ -6,6 +6,7 @@ from parameterized import parameterized
from torchaudio.models import ConvTasNet, DeepSpeech, Wav2Letter, WaveRNN from torchaudio.models import ConvTasNet, DeepSpeech, Wav2Letter, WaveRNN
from torchaudio.models.wavernn import MelResNet, UpsampleNetwork from torchaudio.models.wavernn import MelResNet, UpsampleNetwork
from torchaudio_unittest import common_utils from torchaudio_unittest import common_utils
from torchaudio_unittest.common_utils import torch_script
class TestWav2Letter(common_utils.TorchaudioTestCase): class TestWav2Letter(common_utils.TorchaudioTestCase):
...@@ -145,6 +146,32 @@ class TestWaveRNN(common_utils.TorchaudioTestCase): ...@@ -145,6 +146,32 @@ class TestWaveRNN(common_utils.TorchaudioTestCase):
assert out.size() == (n_batch, 1, hop_length * (n_time - kernel_size + 1)) assert out.size() == (n_batch, 1, hop_length * (n_time - kernel_size + 1))
def test_torchscript_infer(self):
    """TorchScript-compiled WaveRNN.infer matches eager-mode output bit-for-bit."""
    n_batch, n_freq, n_time = 2, 100, 200
    # NOTE(review): parameter names assumed to match torchaudio's WaveRNN
    # signature; values are identical to the original positional call.
    model = WaveRNN(
        upsample_scales=[5, 5, 8],
        n_classes=512,
        hop_length=200,
        n_res_block=10,
        n_rnn=512,
        n_fc=512,
        kernel_size=5,
        n_freq=n_freq,
        n_hidden=128,
        n_output=256,
    )
    model.eval()

    specgram = torch.rand(n_batch, n_freq, n_time)

    # infer() samples from a multinomial; reseed before each run so the
    # eager and scripted paths draw the same random numbers.
    torch.random.manual_seed(0)
    reference = model.infer(specgram)
    torch.random.manual_seed(0)
    scripted_out = torch_script(model).infer(specgram)

    self.assertEqual(reference, scripted_out)
_ConvTasNetParams = namedtuple( _ConvTasNetParams = namedtuple(
'_ConvTasNetParams', '_ConvTasNetParams',
......
from typing import List, Tuple, Dict, Any from typing import List, Tuple, Dict, Any
import math
import torch import torch
from torch import Tensor from torch import Tensor
...@@ -268,6 +269,7 @@ class WaveRNN(nn.Module): ...@@ -268,6 +269,7 @@ class WaveRNN(nn.Module):
self.n_aux = n_output // 4 self.n_aux = n_output // 4
self.hop_length = hop_length self.hop_length = hop_length
self.n_classes = n_classes self.n_classes = n_classes
self.n_bits: int = int(math.log2(self.n_classes))
total_scale = 1 total_scale = 1
for upsample_scale in upsample_scales: for upsample_scale in upsample_scales:
...@@ -365,8 +367,6 @@ class WaveRNN(nn.Module): ...@@ -365,8 +367,6 @@ class WaveRNN(nn.Module):
device = specgram.device device = specgram.device
dtype = specgram.dtype dtype = specgram.dtype
# make it compatible with torchscript
n_bits = int(torch.log2(torch.ones(1) * self.n_classes))
specgram, aux = self.upsample(specgram) specgram, aux = self.upsample(specgram)
...@@ -406,7 +406,7 @@ class WaveRNN(nn.Module): ...@@ -406,7 +406,7 @@ class WaveRNN(nn.Module):
x = torch.multinomial(posterior, 1).float() x = torch.multinomial(posterior, 1).float()
# Transform label [0, 2 ** n_bits - 1] to waveform [-1, 1] # Transform label [0, 2 ** n_bits - 1] to waveform [-1, 1]
x = 2 * x / (2 ** n_bits - 1.0) - 1.0 x = 2 * x / (2 ** self.n_bits - 1.0) - 1.0
output.append(x) output.append(x)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment