from dataclasses import dataclass
from typing import Any, Dict, List, Optional, Tuple

import torch
from torch import Tensor
from torch.nn import functional as F, Module
from torchaudio._internal import load_state_dict_from_url
from torchaudio.models import wav2vec2_model, Wav2Vec2Model, wavlm_model

from . import utils


__all__ = []


class _Wav2Vec2Model(Module):
    """Wrapper class for :py:class:`~torchaudio.models.Wav2Vec2Model`.

    This is used for layer normalization at the input
    """

    def __init__(self, model: Wav2Vec2Model):
        super().__init__()
        self.model = model

    def forward(self, waveforms: Tensor, lengths: Optional[Tensor] = None) -> Tuple[Tensor, Optional[Tensor]]:
        waveforms = F.layer_norm(waveforms, waveforms.shape)
        return self.model(waveforms, lengths)

    @torch.jit.export
    def extract_features(
        self,
        waveforms: Tensor,
        lengths: Optional[Tensor] = None,
        num_layers: Optional[int] = None,
    ) -> Tuple[List[Tensor], Optional[Tensor]]:
        waveforms = F.layer_norm(waveforms, waveforms.shape)
        return self.model.extract_features(waveforms, lengths, num_layers)
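    # Editorial note: a minimal sketch (not part of the public API) of what the
    # whole-tensor layer normalization above amounts to for a waveform batch
    # ``x`` of shape (batch, time):
    #
    #     x = (x - x.mean()) / torch.sqrt(x.var(unbiased=False) + 1e-5)
    #
    # i.e. the statistics are computed over all elements of ``x`` before the
    # wrapped Wav2Vec2Model is invoked.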


@dataclass
class Wav2Vec2Bundle:
    """Data class that bundles associated information to use pretrained :py:class:`~torchaudio.models.Wav2Vec2Model`.

    This class provides interfaces for instantiating the pretrained model along with
    the information necessary to retrieve pretrained weights and additional data
    to be used with the model.

    Torchaudio library instantiates objects of this class, each of which represents
    a different pretrained model. Client code should access pretrained models via these
    instances.

    Please see below for the usage and the available values.

    Example - Feature Extraction
        >>> import torchaudio
        >>>
        >>> bundle = torchaudio.pipelines.HUBERT_BASE
        >>>
        >>> # Build the model and load pretrained weight.
        >>> model = bundle.get_model()
        Downloading:
        100%|███████████████████████████████| 360M/360M [00:06<00:00, 60.6MB/s]
        >>>
        >>> # Resample audio to the expected sampling rate
        >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
        >>>
        >>> # Extract acoustic features
        >>> features, _ = model.extract_features(waveform)
    """  # noqa: E501

    _path: str
    _params: Dict[str, Any]
    _sample_rate: float
    _normalize_waveform: bool

    @property
    def sample_rate(self) -> float:
        """Sample rate of the audio that the model is trained on.

        :type: float
        """
        return self._sample_rate

    def _get_state_dict(self, dl_kwargs):
        url = f"https://download.pytorch.org/torchaudio/models/{self._path}"
        dl_kwargs = {} if dl_kwargs is None else dl_kwargs
        state_dict = load_state_dict_from_url(url, **dl_kwargs)
        return state_dict
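    # Editorial note (hedged): ``dl_kwargs`` is forwarded verbatim to
    # :func:`torch.hub.load_state_dict_from_url`, so callers can, for example,
    # redirect the download cache or silence the progress bar:
    #
    #     bundle.get_model(dl_kwargs={"model_dir": "/tmp/torchaudio_ckpts", "progress": False})
    #
    # ("model_dir" and "progress" are standard arguments of that function; the
    # path above is only an illustration).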

    def get_model(self, *, dl_kwargs=None) -> Module:
        """Construct the model and load the pretrained weight.

        The weight file is downloaded from the internet and cached with
        :func:`torch.hub.load_state_dict_from_url`

        Args:
            dl_kwargs (dictionary of keyword arguments): Passed to :func:`torch.hub.load_state_dict_from_url`.

        Returns:
            Variation of :py:class:`~torchaudio.models.Wav2Vec2Model`.

            For the models listed below, an additional layer normalization is performed on the input.

            - WAV2VEC2_LARGE_LV60K
            - WAV2VEC2_ASR_LARGE_LV60K_10M
            - WAV2VEC2_ASR_LARGE_LV60K_100H
            - WAV2VEC2_ASR_LARGE_LV60K_960H
            - WAV2VEC2_XLSR53
            - HUBERT_LARGE
            - HUBERT_XLARGE
            - HUBERT_ASR_LARGE
            - HUBERT_ASR_XLARGE
            - WAVLM_LARGE

            For all other models, a :py:class:`~torchaudio.models.Wav2Vec2Model` instance is returned.
        """
        model_type = self._params.pop("model_type", None)
        if model_type == "WavLM":
            model = wavlm_model(**self._params)
        else:
            model = wav2vec2_model(**self._params)
        model.load_state_dict(self._get_state_dict(dl_kwargs))
        if self._normalize_waveform:
            model = _Wav2Vec2Model(model)
        model.eval()
        return model
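    # Editorial note: with or without the normalizing wrapper, the returned
    # module keeps the same call signature, e.g.
    #
    #     emission, out_lengths = model(waveform, lengths)
    #     features, out_lengths = model.extract_features(waveform, lengths)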


@dataclass
class Wav2Vec2ASRBundle(Wav2Vec2Bundle):
    """Data class that bundles associated information to use pretrained
    :py:class:`~torchaudio.models.Wav2Vec2Model`.

    This class provides interfaces for instantiating the pretrained model along with
    the information necessary to retrieve pretrained weights and additional data
    to be used with the model.

    Torchaudio library instantiates objects of this class, each of which represents
    a different pretrained model. Client code should access pretrained models via these
    instances.

    Please see below for the usage and the available values.

    Example - ASR
        >>> import torchaudio
        >>>
        >>> bundle = torchaudio.pipelines.HUBERT_ASR_LARGE
        >>>
        >>> # Build the model and load pretrained weight.
        >>> model = bundle.get_model()
        Downloading:
        100%|███████████████████████████████| 1.18G/1.18G [00:17<00:00, 73.8MB/s]
        >>>
        >>> # Check the corresponding labels of the output.
        >>> labels = bundle.get_labels()
        >>> print(labels)
        ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')
        >>>
        >>> # Resample audio to the expected sampling rate
        >>> waveform = torchaudio.functional.resample(waveform, sample_rate, bundle.sample_rate)
        >>>
        >>> # Infer the label probability distribution
        >>> emissions, _ = model(waveform)
        >>>
        >>> # Pass emission to decoder
        >>> # `ctc_decode` is for illustration purposes only
        >>> transcripts = ctc_decode(emissions, labels)
    """  # noqa: E501

    _labels: Tuple[str]
    _remove_aux_axis: Tuple[int] = (1, 2, 3)

    def get_labels(
        self,
        *,
        blank: str = "-",
    ) -> Tuple[str]:
        """The output class labels (only applicable to fine-tuned bundles)

        The first is the blank token, and it is customizable.

        Args:
            blank (str, optional): Blank token. (default: ``'-'``)

        Returns:
            Tuple[str]:
            For models fine-tuned on ASR, returns the tuple of strings representing
            the output class labels.

        Example
            >>> import torchaudio
            >>> torchaudio.pipelines.HUBERT_ASR_LARGE.get_labels()
            ('-', '|', 'E', 'T', 'A', 'O', 'N', 'I', 'H', 'S', 'R', 'D', 'L', 'U', 'M', 'W', 'C', 'F', 'G', 'Y', 'P', 'B', 'V', 'K', "'", 'X', 'J', 'Q', 'Z')
        """  # noqa: E501
        return (blank, *self._labels)
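    # Editorial note: the length of this tuple matches ``aux_num_out`` of the
    # bundled model (29 for the English fine-tuned checkpoints), so index ``i``
    # along the last dimension of the emission corresponds to
    # ``get_labels()[i]``, with the blank token at index 0.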

    def _get_state_dict(self, dl_kwargs):
        state_dict = super()._get_state_dict(dl_kwargs)
        if self._remove_aux_axis:
            # Remove the seemingly unnecessary axes.
            # For the ASR task, the pretrained weights originating from fairseq have unrelated dimensions
            # at indices 1, 2, 3. They come from the Dictionary implementation of fairseq, which was
            # intended for NLP tasks and is not used during the ASR training.
            # https://github.com/pytorch/fairseq/blob/c5ff181125c7e6126b49a85e5ebdd5f5b6a07914/fairseq/data/dictionary.py#L21-L37
            # https://github.com/pytorch/fairseq/blob/c5ff181125c7e6126b49a85e5ebdd5f5b6a07914/fairseq/criterions/ctc.py#L126-L129
            #
            # Also, some pretrained weights originating from VoxPopuli have an extra dimension that is
            # almost never used and looks like a mistake: the label `1` shows up in the training dataset
            # of German (1 out of 16M), English (1 / 28M), Spanish (1 / 9.4M), Romanian (1 / 4.7M) and
            # Polish (6 / 5.8M).
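            #
            # Editorial shape sketch (hedged): with ``aux_num_out=29`` and
            # ``_remove_aux_axis=(1, 2, 3)``, the fairseq checkpoint's ``aux.weight``
            # arrives as (32, encoder_embed_dim) and leaves this method as
            # (29, encoder_embed_dim); ``aux.bias`` shrinks from 32 to 29 accordingly.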
            for key in ["aux.weight", "aux.bias"]:
                t = state_dict[key]
                state_dict[key] = torch.stack([t[i] for i in range(t.size(0)) if i not in self._remove_aux_axis])
        return state_dict
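    # Editorial note: a hedged sketch of the hypothetical ``ctc_decode`` helper
    # referenced in the class docstring above; greedy CTC decoding collapses
    # repeated frame-wise argmax indices and drops the blank (index 0):
    #
    #     def ctc_decode(emissions, labels, blank=0):
    #         indices = emissions[0].argmax(dim=-1)        # (time,)
    #         indices = torch.unique_consecutive(indices)  # collapse repeats
    #         indices = [i for i in indices.tolist() if i != blank]
    #         return "".join(labels[i] for i in indices).replace("|", " ")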


WAV2VEC2_BASE = Wav2Vec2Bundle(
    _path="wav2vec2_fairseq_base_ls960.pth",
    _params={
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 768,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 12,
        "encoder_num_heads": 12,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 3072,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.1,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.05,
        "aux_num_out": None,
    },
    _sample_rate=16000,
    _normalize_waveform=False,
)
WAV2VEC2_BASE.__doc__ = """Wav2vec 2.0 model ("base" architecture),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned.

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
"""  # noqa: E501

WAV2VEC2_ASR_BASE_10M = Wav2Vec2ASRBundle(
    _path="wav2vec2_fairseq_base_ls960_asr_ll10m.pth",
    _params={
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 768,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 12,
        "encoder_num_heads": 12,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 3072,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.1,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.05,
        "aux_num_out": 29,
    },
    _labels=utils._get_en_labels(),
    _sample_rate=16000,
    _normalize_waveform=False,
)
WAV2VEC2_ASR_BASE_10M.__doc__ = """Wav2vec 2.0 model ("base" architecture with an extra linear module),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
:cite:`librilight` ("train-10min" subset).

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501

WAV2VEC2_ASR_BASE_100H = Wav2Vec2ASRBundle(
    "wav2vec2_fairseq_base_ls960_asr_ls100.pth",
    {
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 768,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 12,
        "encoder_num_heads": 12,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 3072,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.1,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.05,
        "aux_num_out": 29,
    },
    _labels=utils._get_en_labels(),
    _sample_rate=16000,
    _normalize_waveform=False,
)

WAV2VEC2_ASR_BASE_100H.__doc__ = """Wav2vec 2.0 model ("base" architecture with an extra linear module),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 100 hours of transcribed audio from "train-clean-100" subset.

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501

WAV2VEC2_ASR_BASE_960H = Wav2Vec2ASRBundle(
    "wav2vec2_fairseq_base_ls960_asr_ls960.pth",
    {
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 768,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 12,
        "encoder_num_heads": 12,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 3072,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.1,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.05,
        "aux_num_out": 29,
    },
    _labels=utils._get_en_labels(),
    _sample_rate=16000,
    _normalize_waveform=False,
)
WAV2VEC2_ASR_BASE_960H.__doc__ = """Wav2vec 2.0 model ("base" architecture with an extra linear module),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on the same audio with the corresponding transcripts.

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501

WAV2VEC2_LARGE = Wav2Vec2Bundle(
    "wav2vec2_fairseq_large_ls960.pth",
    {
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 1024,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 24,
        "encoder_num_heads": 16,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 4096,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.2,
        "aux_num_out": None,
    },
    _sample_rate=16000,
    _normalize_waveform=False,
)
WAV2VEC2_LARGE.__doc__ = """Wav2vec 2.0 model ("large" architecture),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned.

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
"""  # noqa: E501

WAV2VEC2_ASR_LARGE_10M = Wav2Vec2ASRBundle(
    "wav2vec2_fairseq_large_ls960_asr_ll10m.pth",
    {
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 1024,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 24,
        "encoder_num_heads": 16,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 4096,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.2,
        "aux_num_out": 29,
    },
    _labels=utils._get_en_labels(),
    _sample_rate=16000,
    _normalize_waveform=False,
)
WAV2VEC2_ASR_LARGE_10M.__doc__ = """Wav2vec 2.0 model ("large" architecture with an extra linear module),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 10 minutes of transcribed audio from *Libri-Light* dataset
:cite:`librilight` ("train-10min" subset).

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501

WAV2VEC2_ASR_LARGE_100H = Wav2Vec2ASRBundle(
    "wav2vec2_fairseq_large_ls960_asr_ls100.pth",
    {
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 1024,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 24,
        "encoder_num_heads": 16,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 4096,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.2,
        "aux_num_out": 29,
    },
    _labels=utils._get_en_labels(),
    _sample_rate=16000,
    _normalize_waveform=False,
)
WAV2VEC2_ASR_LARGE_100H.__doc__ = """Wav2vec 2.0 model ("large" architecture with an extra linear module),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on 100 hours of transcribed audio from
the same dataset ("train-clean-100" subset).

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501

WAV2VEC2_ASR_LARGE_960H = Wav2Vec2ASRBundle(
    "wav2vec2_fairseq_large_ls960_asr_ls960.pth",
    {
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 1024,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 24,
        "encoder_num_heads": 16,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 4096,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.2,
        "aux_num_out": 29,
    },
    _labels=utils._get_en_labels(),
    _sample_rate=16000,
    _normalize_waveform=False,
)
WAV2VEC2_ASR_LARGE_960H.__doc__ = """Wav2vec 2.0 model ("large" architecture with an extra linear module),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), and
fine-tuned for ASR on the same audio with the corresponding transcripts.

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501

WAV2VEC2_LARGE_LV60K = Wav2Vec2Bundle(
    "wav2vec2_fairseq_large_lv60k.pth",
    {
        "extractor_mode": "layer_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": True,
        "encoder_embed_dim": 1024,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 24,
        "encoder_num_heads": 16,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 4096,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": True,
        "encoder_layer_drop": 0.0,
        "aux_num_out": None,
    },
    _sample_rate=16000,
    _normalize_waveform=True,
)
WAV2VEC2_LARGE_LV60K.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture),
pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`,
not fine-tuned.

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
"""  # noqa: E501

WAV2VEC2_ASR_LARGE_LV60K_10M = Wav2Vec2ASRBundle(
    "wav2vec2_fairseq_large_lv60k_asr_ll10m.pth",
    {
        "extractor_mode": "layer_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": True,
        "encoder_embed_dim": 1024,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 24,
        "encoder_num_heads": 16,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 4096,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": True,
        "encoder_layer_drop": 0.0,
        "aux_num_out": 29,
    },
    _labels=utils._get_en_labels(),
    _sample_rate=16000,
    _normalize_waveform=True,
)
WAV2VEC2_ASR_LARGE_LV60K_10M.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module),
pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and
fine-tuned for ASR on 10 minutes of transcribed audio from the same dataset ("train-10min" subset).

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501

WAV2VEC2_ASR_LARGE_LV60K_100H = Wav2Vec2ASRBundle(
    "wav2vec2_fairseq_large_lv60k_asr_ls100.pth",
    {
        "extractor_mode": "layer_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": True,
        "encoder_embed_dim": 1024,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 24,
        "encoder_num_heads": 16,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 4096,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": True,
        "encoder_layer_drop": 0.0,
        "aux_num_out": 29,
    },
    _labels=utils._get_en_labels(),
    _sample_rate=16000,
    _normalize_waveform=True,
)
WAV2VEC2_ASR_LARGE_LV60K_100H.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module),
pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and
fine-tuned for ASR on 100 hours of transcribed audio from
*LibriSpeech* dataset :cite:`7178964` ("train-clean-100" subset).

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501

WAV2VEC2_ASR_LARGE_LV60K_960H = Wav2Vec2ASRBundle(
    "wav2vec2_fairseq_large_lv60k_asr_ls960.pth",
    {
        "extractor_mode": "layer_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": True,
        "encoder_embed_dim": 1024,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 24,
        "encoder_num_heads": 16,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 4096,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": True,
        "encoder_layer_drop": 0.0,
        "aux_num_out": 29,
    },
    _labels=utils._get_en_labels(),
    _sample_rate=16000,
    _normalize_waveform=True,
)
WAV2VEC2_ASR_LARGE_LV60K_960H.__doc__ = """Wav2vec 2.0 model ("large-lv60k" architecture with an extra linear module),
pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* :cite:`librilight` dataset, and
fine-tuned for ASR on 960 hours of transcribed audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").

Originally published by the authors of *wav2vec 2.0* :cite:`baevski2020wav2vec` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501

WAV2VEC2_XLSR53 = Wav2Vec2Bundle(
    "wav2vec2_fairseq_large_xlsr53.pth",
    {
        "extractor_mode": "layer_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": True,
        "encoder_embed_dim": 1024,
        "encoder_projection_dropout": 0.0,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 24,
        "encoder_num_heads": 16,
        "encoder_attention_dropout": 0.0,
        "encoder_ff_interm_features": 4096,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": True,
        "encoder_layer_drop": 0.0,
        "aux_num_out": None,
    },
    _sample_rate=16000,
    _normalize_waveform=True,
)
WAV2VEC2_XLSR53.__doc__ = """Wav2vec 2.0 model ("large" architecture),
pre-trained on 56,000 hours of unlabeled audio from multiple datasets (
*Multilingual LibriSpeech* :cite:`Pratap_2020`,
*CommonVoice* :cite:`ardila2020common` and
*BABEL* :cite:`Gales2014SpeechRA`),
not fine-tuned.

Originally published by the authors of
*Unsupervised Cross-lingual Representation Learning for Speech Recognition*
:cite:`conneau2020unsupervised` under MIT License and redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/wav2vec#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
"""  # noqa: E501

HUBERT_BASE = Wav2Vec2Bundle(
    "hubert_fairseq_base_ls960.pth",
    {
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 768,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 12,
        "encoder_num_heads": 12,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 3072,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.1,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.05,
        "aux_num_out": None,
    },
    _sample_rate=16000,
    _normalize_waveform=False,
)
HUBERT_BASE.__doc__ = """HuBERT model ("base" architecture),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500"), not fine-tuned.

Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
"""  # noqa: E501

HUBERT_LARGE = Wav2Vec2Bundle(
    "hubert_fairseq_large_ll60k.pth",
    {
        "extractor_mode": "layer_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 1024,
        "encoder_projection_dropout": 0.0,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 24,
        "encoder_num_heads": 16,
        "encoder_attention_dropout": 0.0,
        "encoder_ff_interm_features": 4096,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": True,
        "encoder_layer_drop": 0.0,
        "aux_num_out": None,
    },
    _sample_rate=16000,
    _normalize_waveform=True,
)
HUBERT_LARGE.__doc__ = """HuBERT model ("large" architecture),
pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`,
not fine-tuned.

Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
"""  # noqa: E501

HUBERT_XLARGE = Wav2Vec2Bundle(
    "hubert_fairseq_xlarge_ll60k.pth",
    {
        "extractor_mode": "layer_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 1280,
        "encoder_projection_dropout": 0.0,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 48,
        "encoder_num_heads": 16,
        "encoder_attention_dropout": 0.0,
        "encoder_ff_interm_features": 5120,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": True,
        "encoder_layer_drop": 0.0,
        "aux_num_out": None,
    },
    _sample_rate=16000,
    _normalize_waveform=True,
)
HUBERT_XLARGE.__doc__ = """HuBERT model ("extra large" architecture),
pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`,
not fine-tuned.

Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
"""  # noqa: E501

HUBERT_ASR_LARGE = Wav2Vec2ASRBundle(
    "hubert_fairseq_large_ll60k_asr_ls960.pth",
    {
        "extractor_mode": "layer_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 1024,
        "encoder_projection_dropout": 0.0,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 24,
        "encoder_num_heads": 16,
        "encoder_attention_dropout": 0.0,
        "encoder_ff_interm_features": 4096,
        "encoder_ff_interm_dropout": 0.1,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": True,
        "encoder_layer_drop": 0.1,
        "aux_num_out": 29,
    },
    _labels=utils._get_en_labels(),
    _sample_rate=16000,
    _normalize_waveform=True,
)
HUBERT_ASR_LARGE.__doc__ = """HuBERT model ("large" architecture),
pre-trained on 60,000 hours of unlabeled audio from *Libri-Light* dataset :cite:`librilight`, and
fine-tuned for ASR on 960 hours of transcribed audio from *LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").

Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501

HUBERT_ASR_XLARGE = Wav2Vec2ASRBundle(
    "hubert_fairseq_xlarge_ll60k_asr_ls960.pth",
    {
        "extractor_mode": "layer_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 1280,
        "encoder_projection_dropout": 0.0,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 48,
        "encoder_num_heads": 16,
        "encoder_attention_dropout": 0.0,
        "encoder_ff_interm_features": 5120,
        "encoder_ff_interm_dropout": 0.1,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": True,
        "encoder_layer_drop": 0.1,
        "aux_num_out": 29,
    },
    _labels=utils._get_en_labels(),
    _sample_rate=16000,
    _normalize_waveform=True,
)
HUBERT_ASR_XLARGE.__doc__ = """HuBERT model ("extra large" architecture),
pre-trained on 60,000 hours of unlabeled audio from
*Libri-Light* dataset :cite:`librilight`, and
fine-tuned for ASR on 960 hours of transcribed audio from
*LibriSpeech* dataset :cite:`7178964`
(the combination of "train-clean-100", "train-clean-360", and "train-other-500").

Originally published by the authors of *HuBERT* :cite:`hsu2021hubert` under MIT License and
redistributed with the same license.
[`License <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/LICENSE>`__,
`Source <https://github.com/pytorch/fairseq/blob/ce6c9eeae163ac04b79539c78e74f292f29eaa18/examples/hubert#pre-trained-and-fine-tuned-asr-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501


VOXPOPULI_ASR_BASE_10K_DE = Wav2Vec2ASRBundle(
    "wav2vec2_voxpopuli_base_10k_asr_de.pt",
    {
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 768,
        "encoder_projection_dropout": 0.0,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 12,
        "encoder_num_heads": 12,
        "encoder_attention_dropout": 0.0,
        "encoder_ff_interm_features": 3072,
        "encoder_ff_interm_dropout": 0.1,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.1,
        "aux_num_out": 32,
    },
    _labels=utils._get_de_labels(),
    _sample_rate=16000,
    _normalize_waveform=False,
    _remove_aux_axis=(1, 2, 3, 35),
)
VOXPOPULI_ASR_BASE_10K_DE.__doc__ = """wav2vec 2.0 model ("base" architecture),
pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
("10k" subset, consisting of 23 languages), and
fine-tuned for ASR on 282 hours of transcribed audio from "de" subset.

Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
redistributed with the same license.
[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501


VOXPOPULI_ASR_BASE_10K_EN = Wav2Vec2ASRBundle(
    "wav2vec2_voxpopuli_base_10k_asr_en.pt",
    {
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 768,
        "encoder_projection_dropout": 0.0,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 12,
        "encoder_num_heads": 12,
        "encoder_attention_dropout": 0.0,
        "encoder_ff_interm_features": 3072,
        "encoder_ff_interm_dropout": 0.1,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.1,
        "aux_num_out": 28,
    },
    _labels=utils._get_vp_en_labels(),
    _sample_rate=16000,
    _normalize_waveform=False,
    _remove_aux_axis=(1, 2, 3, 31),
)
VOXPOPULI_ASR_BASE_10K_EN.__doc__ = """wav2vec 2.0 model ("base" architecture),
pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
("10k" subset, consisting of 23 languages), and
fine-tuned for ASR on 543 hours of transcribed audio from "en" subset.

Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
redistributed with the same license.
[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501


VOXPOPULI_ASR_BASE_10K_ES = Wav2Vec2ASRBundle(
    "wav2vec2_voxpopuli_base_10k_asr_es.pt",
    {
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 768,
        "encoder_projection_dropout": 0.0,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 12,
        "encoder_num_heads": 12,
        "encoder_attention_dropout": 0.0,
        "encoder_ff_interm_features": 3072,
        "encoder_ff_interm_dropout": 0.1,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.1,
        "aux_num_out": 35,
    },
    _labels=utils._get_es_labels(),
    _sample_rate=16000,
    _normalize_waveform=False,
    _remove_aux_axis=(1, 2, 3, 35),
)
VOXPOPULI_ASR_BASE_10K_ES.__doc__ = """wav2vec 2.0 model ("base" architecture),
pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
("10k" subset, consisting of 23 languages), and
fine-tuned for ASR on 166 hours of transcribed audio from "es" subset.

Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
redistributed with the same license.
[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501

VOXPOPULI_ASR_BASE_10K_FR = Wav2Vec2ASRBundle(
    "wav2vec2_voxpopuli_base_10k_asr_fr.pt",
    {
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 768,
        "encoder_projection_dropout": 0.0,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 12,
        "encoder_num_heads": 12,
        "encoder_attention_dropout": 0.0,
        "encoder_ff_interm_features": 3072,
        "encoder_ff_interm_dropout": 0.1,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.1,
        "aux_num_out": 43,
    },
    _labels=utils._get_fr_labels(),
    _sample_rate=16000,
    _normalize_waveform=False,
)
VOXPOPULI_ASR_BASE_10K_FR.__doc__ = """wav2vec 2.0 model ("base" architecture),
pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
("10k" subset, consisting of 23 languages), and
fine-tuned for ASR on 211 hours of transcribed audio from "fr" subset.

Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
redistributed with the same license.
[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501


VOXPOPULI_ASR_BASE_10K_IT = Wav2Vec2ASRBundle(
    "wav2vec2_voxpopuli_base_10k_asr_it.pt",
    {
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 768,
        "encoder_projection_dropout": 0.0,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 12,
        "encoder_num_heads": 12,
        "encoder_attention_dropout": 0.0,
        "encoder_ff_interm_features": 3072,
        "encoder_ff_interm_dropout": 0.1,
        "encoder_dropout": 0.0,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.1,
        "aux_num_out": 37,
    },
    _labels=utils._get_it_labels(),
    _sample_rate=16000,
    _normalize_waveform=False,
    _remove_aux_axis=(1, 2, 3),
)
VOXPOPULI_ASR_BASE_10K_IT.__doc__ = """wav2vec 2.0 model ("base" architecture),
pre-trained on 10k hours of unlabeled audio from *VoxPopuli* dataset :cite:`voxpopuli`
("10k" subset, consisting of 23 languages), and
fine-tuned for ASR on 91 hours of transcribed audio from "it" subset.

Originally published by the authors of *VoxPopuli* :cite:`voxpopuli` under CC BY-NC 4.0 and
redistributed with the same license.
[`License <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#license>`__,
`Source <https://github.com/facebookresearch/voxpopuli/tree/160e4d7915bad9f99b2c35b1d3833e51fd30abf2#asr-and-lm>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2ASRBundle` for the usage.
"""  # noqa: E501


WAVLM_BASE = Wav2Vec2Bundle(
    "wavlm_base.pth",
    {
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 768,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 12,
        "encoder_num_heads": 12,
        "encoder_max_distance": 800,
        "encoder_num_buckets": 320,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 3072,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.1,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.05,
        "aux_num_out": None,
        "model_type": "WavLM",
    },
    _sample_rate=16000,
    _normalize_waveform=False,
)
WAVLM_BASE.__doc__ = """WavLM Base model ("base" architecture),
pre-trained on 960 hours of unlabeled audio from *LibriSpeech* dataset :cite:`7178964`, not fine-tuned.

Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and
redistributed with the same license.
[`License <https://github.com/microsoft/unilm/blob/65f15af2a307ebb64cfb25adf54375b002e6fe8d/LICENSE>`__,
`Source <https://github.com/microsoft/unilm/tree/65f15af2a307ebb64cfb25adf54375b002e6fe8d/wavlm#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
"""  # noqa: E501


WAVLM_BASE_PLUS = Wav2Vec2Bundle(
    "wavlm_base_plus.pth",
    {
        "extractor_mode": "group_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 768,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 12,
        "encoder_num_heads": 12,
        "encoder_max_distance": 800,
        "encoder_num_buckets": 320,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 3072,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.1,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.05,
        "aux_num_out": None,
        "model_type": "WavLM",
    },
    _sample_rate=16000,
    _normalize_waveform=False,
)
WAVLM_BASE_PLUS.__doc__ = """WavLM Base+ model ("base" architecture),
pre-trained on 60,000 hours of Libri-Light dataset :cite:`librilight`, 10,000 hours of GigaSpeech :cite:`GigaSpeech2021`,
and 24,000 hours of *VoxPopuli* :cite:`voxpopuli`, not fine-tuned.

Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and
redistributed with the same license.
[`License <https://github.com/microsoft/unilm/blob/65f15af2a307ebb64cfb25adf54375b002e6fe8d/LICENSE>`__,
`Source <https://github.com/microsoft/unilm/tree/65f15af2a307ebb64cfb25adf54375b002e6fe8d/wavlm#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
"""  # noqa: E501


WAVLM_LARGE = Wav2Vec2Bundle(
    "wavlm_large.pth",
    {
        "extractor_mode": "layer_norm",
        "extractor_conv_layer_config": [
            (512, 10, 5),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 3, 2),
            (512, 2, 2),
            (512, 2, 2),
        ],
        "extractor_conv_bias": False,
        "encoder_embed_dim": 1024,
        "encoder_projection_dropout": 0.1,
        "encoder_pos_conv_kernel": 128,
        "encoder_pos_conv_groups": 16,
        "encoder_num_layers": 24,
        "encoder_num_heads": 16,
        "encoder_max_distance": 800,
        "encoder_num_buckets": 320,
        "encoder_attention_dropout": 0.1,
        "encoder_ff_interm_features": 4096,
        "encoder_ff_interm_dropout": 0.0,
        "encoder_dropout": 0.1,
        "encoder_layer_norm_first": False,
        "encoder_layer_drop": 0.05,
        "aux_num_out": None,
        "model_type": "WavLM",
    },
    _sample_rate=16000,
    _normalize_waveform=True,
)
WAVLM_LARGE.__doc__ = """WavLM Large model ("large" architecture),
pre-trained on 60,000 hours of Libri-Light dataset :cite:`librilight`, 10,000 hours of GigaSpeech :cite:`GigaSpeech2021`,
and 24,000 hours of *VoxPopuli* :cite:`voxpopuli`, not fine-tuned.

Originally published by the authors of *WavLM* :cite:`chen2022wavlm` under MIT License and
redistributed with the same license.
[`License <https://github.com/microsoft/unilm/blob/65f15af2a307ebb64cfb25adf54375b002e6fe8d/LICENSE>`__,
`Source <https://github.com/microsoft/unilm/tree/65f15af2a307ebb64cfb25adf54375b002e6fe8d/wavlm#pre-trained-models>`__]

Please refer to :py:class:`torchaudio.pipelines.Wav2Vec2Bundle` for the usage.
"""  # noqa: E501