Add HuBERT pretrain model to enable training from scratch (#2064)
Summary:
- Add three factory functions, `hubert_pretrain_base`, `hubert_pretrain_large`, and `hubert_pretrain_xlarge`, to enable training the HuBERT model from scratch.
- Add a `num_classes` argument to the `hubert_pretrain_base` factory function because the base model is trained in two iterations: in the first iteration `num_classes` is 100, and in the second it is 500.
- The model takes `waveforms`, `labels`, and `lengths` as inputs.
- The model outputs the last transformer layer's embedding along with `logit_m` and `logit_u`.

Pull Request resolved: https://github.com/pytorch/audio/pull/2064

Reviewed By: hwangjeff, mthrok

Differential Revision: D33338587

Pulled By: nateanl

fbshipit-source-id: 534bc17c576c5f344043d8ba098204b8da6e630a