"tests/nn/moe/test_moe_layer.py" did not exist on "82dbd5d827b8b4ee2532fdbd9cad13ab2d664635"
wav2letter.py 3.17 KB
Newer Older
1
from torch import nn, Tensor
Tomás Osório's avatar
Tomás Osório committed
2

3
4
5
__all__ = [
    "Wav2Letter",
]


class Wav2Letter(nn.Module):
    r"""Wav2Letter model architecture from *Wav2Letter: an End-to-End ConvNet-based Speech
    Recognition System* :cite:`collobert2016wav2letter`.

     :math:`\text{padding} = \frac{\text{ceil}(\text{kernel} - \text{stride})}{2}`

    Args:
        num_classes (int, optional): Number of classes to be classified. (Default: ``40``)
        input_type (str, optional): Wav2Letter can use as input: ``waveform``, ``power_spectrum``
         or ``mfcc`` (Default: ``waveform``).
        num_features (int, optional): Number of input features that the network will receive (Default: ``1``).

    Raises:
        ValueError: If ``input_type`` is not one of ``"waveform"``, ``"power_spectrum"``
            or ``"mfcc"``.
    """

    def __init__(self, num_classes: int = 40, input_type: str = "waveform", num_features: int = 1) -> None:
        super().__init__()

        # For raw waveforms, a front-end conv (built below) first maps
        # num_features -> 250 channels, so the shared acoustic stack always
        # starts from 250 input channels; feature inputs feed it directly.
        acoustic_num_features = 250 if input_type == "waveform" else num_features
        acoustic_model = nn.Sequential(
            nn.Conv1d(in_channels=acoustic_num_features, out_channels=250, kernel_size=48, stride=2, padding=23),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=250, kernel_size=7, stride=1, padding=3),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=250, out_channels=2000, kernel_size=32, stride=1, padding=16),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=2000, out_channels=2000, kernel_size=1, stride=1, padding=0),
            nn.ReLU(inplace=True),
            nn.Conv1d(in_channels=2000, out_channels=num_classes, kernel_size=1, stride=1, padding=0),
            nn.ReLU(inplace=True),
        )

        if input_type == "waveform":
            # Front-end that downsamples the raw waveform into 250 channels
            # (stride 160 — presumably a 10 ms hop at 16 kHz; confirm against
            # the dataset's sample rate).
            waveform_model = nn.Sequential(
                nn.Conv1d(in_channels=num_features, out_channels=250, kernel_size=250, stride=160, padding=45),
                nn.ReLU(inplace=True),
            )
            self.acoustic_model = nn.Sequential(waveform_model, acoustic_model)
        elif input_type in ("power_spectrum", "mfcc"):
            self.acoustic_model = acoustic_model
        else:
            # Previously an unrecognized input_type left self.acoustic_model
            # unset, deferring the failure to forward() as an AttributeError.
            # Fail fast at construction time instead.
            raise ValueError(
                f"Unsupported input_type {input_type!r}; "
                "expected 'waveform', 'power_spectrum' or 'mfcc'."
            )

    def forward(self, x: Tensor) -> Tensor:
        r"""
        Args:
            x (torch.Tensor): Tensor of dimension (batch_size, num_features, input_length).

        Returns:
            Tensor: Predictor tensor of dimension (batch_size, number_of_classes, input_length).
        """
        x = self.acoustic_model(x)
        # Per-frame log-probabilities over classes (channel dim), as expected
        # by e.g. CTC loss.
        x = nn.functional.log_softmax(x, dim=1)
        return x