Commit 2eaefe27 authored by Zhaoheng Ni, committed by Facebook GitHub Bot

Add modularized SSL training recipe (#2876)

Summary:
TorchAudio currently has one training recipe for HuBERT + LibriSpeech pre-training. It does not suit users who want to train on a customized dataset or with a new training objective (such as the contrastive loss in Wav2Vec2). This PR addresses the issue by providing a modularized training recipe for audio self-supervised learning: users can inject a customized model module, loss function, optimizer, LR scheduler, and datamodule to train an SSL model.

Pull Request resolved: https://github.com/pytorch/audio/pull/2876

Reviewed By: hwangjeff

Differential Revision: D42617414

Pulled By: nateanl

fbshipit-source-id: 6413df45a9d106ed1d5ff830bf628c54368c5792
parent c6a52355
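The recipe is organized as swappable components: a datamodule, a LightningModule, a loss function, and an LR scheduler. As a hedged sketch only (the import paths mirror the ones used by the training script in this diff, and the dataset path is a placeholder), a customized pre-training run could be assembled roughly like this:

import torch
import torchaudio
from pytorch_lightning import Trainer

from data_modules import HuBERTDataModule
from lightning import SSLPretrainModule
from losses import hubert_loss
from lr_schedulers import LinearDecayLRScheduler


class MyPretrainModule(SSLPretrainModule):
    # The subclass decides how the injected optimizer and scheduler are stepped.
    def configure_optimizers(self):
        return [self.optimizer], [{"scheduler": self.lr_scheduler, "interval": "step"}]


model = torchaudio.models.hubert_pretrain_base()
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
lr_scheduler = LinearDecayLRScheduler(optimizer, warmup_updates=32000, max_updates=250000)
module = MyPretrainModule(model, hubert_loss, optimizer, lr_scheduler)

data_module = HuBERTDataModule(
    dataset_path="/path/to/exp_dir",  # placeholder: directory containing the tsv/ and label/ subfolders
    dataset="librispeech",
    feature_type="mfcc",
    seconds_per_batch=87.5,
)
trainer = Trainer(
    max_steps=250000,
    accelerator="gpu",
    devices=1,
    strategy="ddp",
    replace_sampler_ddp=False,
    reload_dataloaders_every_n_epochs=1,
)
trainer.fit(module, datamodule=data_module)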
from ._hubert_datamodule import HuBERTDataModule
__all__ = [
"HuBERTDataModule",
]
import torch
from pytorch_lightning import LightningDataModule
from ._utils import BucketizeBatchSampler, CollateFnHubert, DistributedBatchSampler, HuBERTDataSet
class HuBERTDataModule(LightningDataModule):
hubert_cls = HuBERTDataSet
def __init__(
self,
*,
dataset_path,
dataset,
feature_type,
seconds_per_batch,
train_shuffle=True,
num_workers=10,
):
super().__init__()
self.dataset_path = dataset_path
self.dataset = dataset
self.feature_type = feature_type
self.seconds_per_batch = seconds_per_batch
self.train_shuffle = train_shuffle
self.num_workers = num_workers
def train_dataloader(self):
dataset = self.hubert_cls(self.dataset_path, self.dataset, "train")
sampler = BucketizeBatchSampler(
dataset.len_list,
num_buckets=10000,
max_token_count=self.seconds_per_batch * 16000,
min_len=32000,
max_len=250000,
shuffle=True,
)
sampler = DistributedBatchSampler(sampler, shuffle=self.train_shuffle)
sampler.set_epoch(self.trainer.current_epoch)
dataloader = torch.utils.data.DataLoader(
dataset,
batch_sampler=sampler,
collate_fn=CollateFnHubert(feature_type=self.feature_type, pad=False, rand_crop=True),
num_workers=self.num_workers,
)
return dataloader
def val_dataloader(self):
dataset = self.hubert_cls(self.dataset_path, self.dataset, "valid")
sampler = BucketizeBatchSampler(
dataset.len_list,
num_buckets=1000,
max_token_count=self.seconds_per_batch * 16000,
min_len=32000,
max_len=250000,
shuffle=False,
)
dataloader = torch.utils.data.DataLoader(
dataset,
batch_sampler=sampler,
collate_fn=CollateFnHubert(feature_type=self.feature_type, pad=False, rand_crop=True),
num_workers=self.num_workers,
)
return dataloader
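# Example (sketch; the dataset path is a placeholder): only ``train_dataloader``
# touches ``self.trainer``, so the validation loader can be built without a
# Trainer attached.
#
#   dm = HuBERTDataModule(
#       dataset_path="/path/to/exp_dir", dataset="librispeech",
#       feature_type="mfcc", seconds_per_batch=87.5,
#   )
#   val_loader = dm.val_dataloader()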
import math
from pathlib import Path
from typing import Dict, Iterator, List, Optional, Tuple, Union
import numpy as np
import torch
import torch.distributed as dist
import torchaudio
from lightning import Batch
from torch import Tensor
from torch.utils.data import BatchSampler, Dataset, DistributedSampler
class BucketizeBatchSampler(BatchSampler):
"""Buketized BatchSampler for sequential data with different lengths to reduce number of paddings.
Args:
lengths (List[int]): The lengths of the samples in the dataset.
num_buckets (int): The number of buckets to split the data samples.
min_len (int, optional): The minimum sample length to keep.
(Default: 0)
max_len (int or None, optional): The maximum sample length to keep. Inferred from ``lengths`` if not provided.
(Default: ``None``)
max_token_count (int or None, optional): The max number of tokens in one mini-batch.
(Default: ``None``)
batch_size (int or None, optional): The number of samples in one mini-batch.
(Default: ``None``)
shuffle (bool, optional): Whether to shuffle buckets for non-monotonic length sampling.
(Default: True)
drop_last (bool, optional): If ``True``, the sampler will drop the last batch if
its size would be less than ``batch_size``
(Default: False)
Note:
``max_token_count`` and ``batch_size`` are mutually exclusive. Only one argument of the two
should have value.
Note:
``drop_last`` is only valid when ``batch_size`` argument is given.
Note:
If ``shuffle`` is ``True``, the data is only shuffled once at initialization. Please set ``reload_dataloaders_every_n_epochs=1``
in the pytorch_lightning Trainer to enable shuffling every epoch.
"""
def __init__(
self,
lengths: List[int],
num_buckets: int,
min_len: int = 0,
max_len: Optional[int] = None,
max_token_count: Optional[int] = None,
batch_size: Optional[int] = None,
shuffle: bool = True,
drop_last: bool = False,
) -> None:
if max_len is None:
max_len = max(lengths)
if not (0 <= min_len <= max_len):
raise AssertionError("``min_len`` should be non-negative and smaller than ``max_len``")
if max_token_count is not None and batch_size is not None:
raise AssertionError("The ``max_token_count`` and ``batch_size`` can't be both set.")
if max_token_count is None and batch_size is None:
raise AssertionError("One of ``max_token_count`` or ``batch_size`` must be set.")
if max_token_count is not None:
assert (
max_len <= max_token_count
), "The ``max_token_count`` must be greater than or equal to the maximum value of ``lengths``."
# Filter out samples which are outside the bounds of [min_len, max_len]
filtered_length_idx = [(length, i) for i, length in enumerate(lengths) if min_len <= length <= max_len]
if len(filtered_length_idx) == 0:
raise AssertionError("``lengths`` cannot be empty after filtering.")
sorted_filtered_length_idx = sorted(filtered_length_idx, key=lambda x: x[0])
self.lengths = [e[0] for e in sorted_filtered_length_idx]
self.indices = [e[1] for e in sorted_filtered_length_idx]
self.max_token_count = max_token_count
self.batch_size = batch_size
self.shuffle = shuffle
self.drop_last = drop_last
self.buckets = self._get_buckets(self.lengths, num_buckets, min_len, max_len)
self._update_iter_list()
def _get_buckets(self, lengths: List[int], num_buckets: int, min_len: int, max_len: int) -> Dict[int, Tensor]:
"""Generate buckets based on the dataset.
Args:
lengths (List[int]): The lengths of the samples in the dataset.
num_buckets (int): The number of buckets.
min_len (int): The lower bound of the evenly spaced length intervals to determine bucket width.
max_len (int): The upper bound of the evenly spaced length intervals to determine bucket width.
Returns:
(dict[int, Tensor]): A dictionary in which the key is the bucket index, the value is
the Tensor of corresponding sample indices.
"""
buckets = {}
boundaries = torch.linspace(min_len - 1, max_len + 1, num_buckets + 1)
bucket_ids = torch.bucketize(torch.tensor(lengths), boundaries)
for i in range(bucket_ids.size(0)):
bucket_id = int(bucket_ids[i])
if bucket_id in buckets:
buckets[bucket_id].append(i)
else:
buckets[bucket_id] = [i]
for k in buckets:
buckets[k] = torch.as_tensor(buckets[k], dtype=torch.int)
buckets = {k: v for k, v in sorted(buckets.items())}
return buckets
def _update_iter_list(self) -> None:
if self.shuffle:
for k in self.buckets:
self.buckets[k] = self.buckets[k][torch.randperm(self.buckets[k].size(0))]
self.iter_list = []
total_len = 0
batch = []
max_batch_size = self.max_token_count if self.max_token_count else self.batch_size
for k in self.buckets:
for i in range(self.buckets[k].size(0)):
index = int(self.buckets[k][i])
sample_length = self.lengths[index] if self.max_token_count else 1
if total_len + sample_length <= max_batch_size:
batch.append(self.indices[index])
total_len += sample_length
else:
self.iter_list.append(batch)
batch = [self.indices[index]]
total_len = sample_length
if len(batch) > 0 and (self.max_token_count or not self.drop_last):
self.iter_list.append(batch)
def __iter__(self) -> Iterator[List[int]]:
return iter(self.iter_list)
def __len__(self):
if self.batch_size or (self.max_token_count and not self.shuffle):
return len(self.iter_list)
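# Example of the batching behavior (sketch): four toy lengths bucketized into
# token-budget batches capped at 48000 samples.
#
#   sampler = BucketizeBatchSampler(
#       [16000, 32000, 48000, 40000], num_buckets=2, max_token_count=48000, shuffle=False
#   )
#   list(sampler)  # -> [[0, 1], [3], [2]]: each batch stays within the token budget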
class DistributedBatchSampler(DistributedSampler):
"""`BucketizeBatchSampler` wrapper that distributes across each processor.
Args:
batch_sampler (BucketizeBatchSampler): the initialized bucketize batch sampler.
num_replicas (int, optional): Number of processes participating in
distributed training. By default, :attr:`world_size` is retrieved from the
current distributed group.
rank (int, optional): Rank of the current process within :attr:`num_replicas`.
By default, :attr:`rank` is retrieved from the current distributed
group.
shuffle (bool, optional): if ``True``, the list of batch indices will be shuffled.
(Default: ``True``)
seed (int, optional): random seed used to shuffle the batch_sampler if
:attr:`shuffle=True`. This number should be identical across all
processes in the distributed group. (Default: ``0``)
drop_last (bool, optional): if ``True``, then the sampler will drop the
tail of the data to make it evenly divisible across the number of
replicas. If ``False``, the sampler will add extra indices to make
the data evenly divisible across the replicas. (Default: ``False``)
Note:
If ``shuffle`` is ``True``, the data is only shuffled once. Please set ``reload_dataloaders_every_n_epochs=1``
in the pytorch_lightning Trainer, and call `sampler.set_epoch(self.current_epoch)` before the DataLoader is created
in the `train_dataloader` method, to enable shuffling every epoch.
"""
def __init__(
self,
batch_sampler: BucketizeBatchSampler,
num_replicas: Optional[int] = None,
rank: Optional[int] = None,
shuffle: bool = True,
seed: int = 0,
drop_last: bool = False,
) -> None:
self.batch_sampler = batch_sampler
if num_replicas is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
num_replicas = dist.get_world_size()
if rank is None:
if not dist.is_available():
raise RuntimeError("Requires distributed package to be available")
rank = dist.get_rank()
self.num_replicas = num_replicas
self.rank = rank
self.shuffle = shuffle
self.epoch = 0
self.seed = seed
self.drop_last = drop_last
if shuffle:
g = torch.Generator()
g.manual_seed(self.seed + self.epoch)
perm = torch.randperm(len(self.batch_sampler.iter_list), generator=g).tolist()
indices = [self.batch_sampler.iter_list[i] for i in perm]
else:
indices = self.batch_sampler.iter_list
if self.drop_last:
self.total_size = len(indices) - len(indices) % self.num_replicas
else:
padding_size = self.num_replicas - len(indices) % self.num_replicas
indices += indices[:padding_size]
self.total_size = len(indices)
self.num_samples = self.total_size // self.num_replicas
self.subset = indices[self.rank : self.total_size : self.num_replicas]
assert len(self.subset) == self.num_samples
def __iter__(self):
return iter(self.subset)
def __len__(self):
return self.num_samples
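# Example of the rank split (sketch): with 5 batches, num_replicas=2, and
# drop_last=False, one batch is repeated to reach 6; rank 0 then iterates over
# batches 0, 2, 4 and rank 1 over batches 1, 3, 5.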
class HuBERTDataSet(Dataset):
"""Create a Dataset for HuBERT model training and fine-tuning.
Args:
exp_dir (str or Path): The root directory of the ``.tsv`` file list.
dataset (str): The dataset for training. Options: [``librispeech``, ``librilight``].
subset (str): The subset of the dataset. Options: [``train``, ``valid``].
"""
def __init__(
self,
exp_dir: Union[str, Path],
dataset: str,
subset: str,
) -> None:
self.exp_dir = Path(exp_dir)
tsv_dir = self.exp_dir / "tsv"
label_dir = self.exp_dir / "label"
f_list, ind_list, len_list = self._get_lists(tsv_dir, dataset, subset)
self.f_list, self.ind_list, self.len_list = f_list, ind_list, len_list
self.labels = self._load_labels(label_dir, dataset, subset)
def __len__(self):
return len(self.f_list)
def _get_lists(
self,
tsv_dir: Path,
dataset: str,
subset: str,
) -> Tuple[List[Path], List[int], List[int]]:
"""Get the list of paths for iteration.
Args:
tsv_dir (Path): The root directory of the ``.tsv`` file list.
dataset (str): The dataset for training. Options: [``librispeech``, ``librilight``].
subset (str): The subset of the dataset. Options: [``train``, ``valid``].
Returns:
(numpy.array) List of file paths.
(numpy.array) List of indices.
(numpy.array) List of waveform lengths.
"""
f_ind_len_list = []
with open(tsv_dir / f"{dataset}_{subset}.tsv") as f:
root = f.readline().rstrip()
for index, line in enumerate(f):
path, nsample = line.split("\t")
path = f"{root}/{path}"
nsample = int(nsample)
f_ind_len_list.append((path, index, nsample))
f_list, ind_list, len_list = [], [], []
for ele in f_ind_len_list:
f_list.append(ele[0])
ind_list.append(ele[1])
len_list.append(ele[2])
return np.asarray(f_list), np.asarray(ind_list), np.asarray(len_list)
def _load_audio(self, index: int) -> Tensor:
"""Load waveform given the sample index of the dataset.
Args:
index (int): The sample index.
Returns:
(Tensor): The corresponding waveform Tensor.
"""
wav_path = self.f_list[index]
waveform, sample_rate = torchaudio.load(wav_path)
assert waveform.shape[1] == self.len_list[index]
return waveform
def _load_labels(self, label_dir: Path, dataset: str, subset: str) -> np.array:
"""Load all labels to memory into a numpy array.
Args:
label_dir (Path): The directory that contains the label file.
dataset (str): The dataset for training. Options: [``librispeech``, ``librilight``].
subset (str): The subset of the dataset. Options: [``train``, ``valid``].
Returns:
(np.array): The numpy array that contains the labels for each audio file.
"""
with open(label_dir / f"label_{subset}.pt") as f:
labels = [line.rstrip() for line in f]
labels = [labels[i] for i in self.ind_list]
return np.asarray(labels, dtype=np.string_)
def __getitem__(self, index):
waveform = self._load_audio(index)
length = waveform.shape[1]
label = [int(ele) for ele in self.labels[index].split()]
label = torch.tensor(label)
return (waveform, label, length)
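# On-disk layout expected by the methods above:
#   {exp_dir}/tsv/{dataset}_{subset}.tsv    first line: root directory of the audio files;
#                                           remaining lines: "<relative_path>\t<num_samples>"
#   {exp_dir}/label/label_{subset}.pt       one space-separated integer label sequence per
#                                           line, in the same order as the .tsv entries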
def _crop_audio_label(
waveform: Tensor,
label: Tensor,
length: Tensor,
num_frames: int,
rand_crop: bool,
) -> Tuple[Tensor, Tensor, Tensor]:
"""Collate the audio and label at the same time.
Args:
waveform (Tensor): The waveform Tensor with dimensions `(1, time)`.
label (Tensor): The label Tensor with dimensions `(1, seq)`.
length (Tensor): The length Tensor with dimension `(1,)`.
num_frames (int): The final length of the waveform.
rand_crop (bool): if ``rand_crop`` is True, the starting index of the
waveform and label is random if the length is longer than the minimum
length in the mini-batch.
Returns:
(Tuple(Tensor, Tensor, Tensor)): Returns the Tensors for the waveform,
label, and the waveform length.
"""
kernel_size = 25
stride = 20
sample_rate = 16 # samples per millisecond (16 kHz audio)
frame_offset = 0
waveform = waveform[0]
if waveform.size(0) > num_frames and rand_crop:
diff = waveform.size(0) - num_frames
frame_offset = torch.randint(diff, size=(1,))
elif waveform.size(0) < num_frames:
num_frames = waveform.size(0)
label_offset = max(
math.floor((frame_offset - kernel_size * sample_rate) / (stride * sample_rate)) + 1,
0,
)
num_label = math.floor((num_frames - kernel_size * sample_rate) / (stride * sample_rate)) + 1
waveform = waveform[frame_offset : frame_offset + num_frames]
label = label[label_offset : label_offset + num_label]
length = num_frames
return waveform, label, length
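# The label arithmetic above mirrors a 25 ms kernel / 20 ms stride feature
# extractor at 16 samples per millisecond: a 2-second crop (num_frames = 32000)
# yields floor((32000 - 25 * 16) / (20 * 16)) + 1 = 99 label frames.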
class CollateFnHubert:
"""The collate class for HuBERT pre-training and fine-tuning.
Args:
feature_type (str): The type of features for KMeans clustering.
Options: [``mfcc``, ``hubert``].
pad (bool): If ``True``, the waveforms and labels will be padded to the
max length in the mini-batch. If ``pad`` is False, the waveforms
and labels will be cropped to the minimum length in the mini-batch.
(Default: False)
rand_crop (bool): if ``True``, the starting index of the waveform
and label is random if the length is longer than the minimum
length in the mini-batch.
"""
def __init__(
self,
feature_type: str,
pad: bool = False,
rand_crop: bool = True,
) -> None:
self.feature_type = feature_type
self.pad = pad
self.rand_crop = rand_crop
def __call__(self, batch: List[Tuple[Tensor, Tensor, int]]) -> Batch:
"""
Args:
batch (List[Tuple(Tensor, Tensor, int)]):
The list of tuples that contains the waveforms, labels, and audio lengths.
Returns:
(Batch): A ``Batch`` namedtuple of ``inputs`` and ``labels``.
"inputs": Tuple of waveforms, labels, and lengths.
waveforms Tensor with dimensions `(batch, time)`.
labels Tensor with dimensions `(batch, seq)`.
lengths Tensor with dimension `(batch,)`.
"labels": Tuple containing the label Tensor with dimensions `(batch, seq)`.
"""
if self.pad:
num_frames = max([sample[0].shape[1] for sample in batch])
else:
num_frames = min([sample[0].shape[1] for sample in batch])
waveforms, labels, lengths = [], [], []
for sample in batch:
waveform, label, length = sample
# The MFCC feature is 10ms per frame, while the HuBERT's transformer output
# is 20ms per frame. Downsample the KMeans label if it's generated by MFCC features.
if self.feature_type == "mfcc":
label = label[::2]
waveform, label, length = _crop_audio_label(waveform, label, length, num_frames, self.rand_crop)
waveforms.append(waveform)
lengths.append(length)
labels.append(label)
# If zero-padding is not applied, make sure the waveform and label lengths are identical within the batch.
if not self.pad:
assert all(
[waveform.shape[0] == waveforms[0].shape[0] for waveform in waveforms]
), "The dimensions of the waveforms should be identical in the same batch."
assert all(
[label.shape[0] == labels[0].shape[0] for label in labels]
), "The dimensions of the labels should be identical in the same batch."
waveforms = torch.nn.utils.rnn.pad_sequence(waveforms, batch_first=True)
labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True)
lengths = torch.tensor(lengths)
batch = Batch((waveforms, labels, lengths), (labels,))
return batch
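# Example (sketch): collating two samples with ``pad=False`` crops both to the
# shorter waveform, so the returned tensors have shapes (2, time_min) and (2, seq_min).
#
#   collate = CollateFnHubert(feature_type="mfcc", pad=False, rand_crop=True)
#   batch = collate([(wav_a, label_a, len_a), (wav_b, label_b, len_b)])
#   batch.inputs   # (waveforms, labels, lengths)
#   batch.labels   # (labels,)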
from collections import namedtuple
from typing import Callable, Optional
import pytorch_lightning as pl
import torch
import torch.nn as nn
from torch.optim.optimizer import Optimizer
Batch = namedtuple("Batch", ["inputs", "labels"])
class SSLPretrainModule(pl.LightningModule):
def __init__(
self,
model: nn.Module,
loss_fn: Callable,
optimizer: Optimizer,
lr_scheduler: Optional[torch.optim.lr_scheduler._LRScheduler] = None,
):
super().__init__()
self.model = model
self.loss_fn = loss_fn
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
def log_metric(self, batch: Batch, output, loss, step_type):
"""Log useful information to TensorBoard. Users are expected to
override this with a customized ``log_metric`` method to log information
such as loss values, metric scores, etc.
Args:
batch (Batch): Batch tuple from the dataloader.
output: Output generated by the model.
loss (Tensor): The loss computed by ``loss_fn``.
step_type (str): Type of step. Choices are "train", "val", and "test".
"""
pass
def training_step(self, batch: Batch, batch_idx):
out = self.model(*batch.inputs)
loss, num_frame = self.loss_fn(*out, *batch.labels)
self.log_metric(batch, out, loss, "train")
# normalize the loss based on the sum of num_frame across all GPUs
num_frames = self.all_gather(num_frame)
self.log(
"Gathered number of frames",
num_frames.float().sum(),
on_step=True,
on_epoch=True,
)
loss *= num_frames.size(0) / num_frames.sum() # world size / num_frames
return loss
def validation_step(self, batch, batch_idx):
out = self.model(*batch.inputs)
loss, _ = self.loss_fn(*out, *batch.labels)
self.log_metric(batch, out, loss, "val")
return loss
from ._hubert_loss import hubert_loss
__all__ = [
"hubert_loss",
]
from typing import Optional, Tuple
import torch
import torch.nn.functional as F
from torch import Tensor
def hubert_loss(
logit_m: Optional[Tensor],
logit_u: Optional[Tensor],
feature_penalty: Tensor,
label: Optional[Tensor] = None,
masked_weight: float = 1.0,
unmasked_weight: float = 0.0,
feature_weight: float = 10.0,
reduction: str = "sum",
) -> Tuple[Tensor, float]:
"""Compute the cross-entropy loss on HuBERT masked and non-masked logits.
Args:
logit_m (Tensor or None): The masked logit Tensor of dimension `(masked_frames, final_dim)`.
logit_u (Tensor or None): The non-masked logit Tensor of dimension `(unmasked_frames, final_dim)`.
feature_penalty (Tensor): The feature mean value for additional penalty loss.
label (Tensor or None, optional): Unused by the loss; accepted so the training step can pass the batch labels. (Default: ``None``)
masked_weight (float, optional): The weight for masked cross-entropy loss (Default: ``1.0``).
unmasked_weight (float, optional): The weight for non-masked cross-entropy loss (Default: ``0.0``).
feature_weight (float, optional): The weight for feature penalty loss (Default: ``10.0``).
reduction (str, optional): The reduction method for cross-entropy loss (Default: ``"sum"``).
Returns:
(Tensor, float)
Tensor: The desired loss Tensor.
float: Number of frames used in loss computation.
"""
num_frame = 0.0
loss = 0.0
if logit_m is not None:
target_m = torch.zeros(logit_m.shape[0], dtype=torch.long, device=logit_m.device)
loss_m = F.cross_entropy(logit_m, target_m, reduction=reduction)
loss += loss_m * masked_weight
num_frame += logit_m.shape[0]
if logit_u is not None:
target_u = torch.zeros(logit_u.shape[0], dtype=torch.long, device=logit_u.device)
loss_u = F.cross_entropy(logit_u, target_u, reduction=reduction)
loss += loss_u * unmasked_weight
num_frame += logit_u.shape[0]
loss += feature_penalty * feature_weight * num_frame
return loss, num_frame
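# Example call (sketch): three masked frames over four classes, no unmasked
# frames, and a scalar feature penalty; the target class is always index 0.
#
#   logit_m = torch.randn(3, 4)
#   loss, num_frame = hubert_loss(logit_m, None, torch.tensor(0.01))
#   # num_frame == 3.0; loss = cross_entropy(logit_m, zeros) * 1.0 + 0.01 * 10.0 * 3.0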
from ._linear_decay import LinearDecayLRScheduler
__all__ = [
"LinearDecayLRScheduler",
]
import torch
from torch.optim.optimizer import Optimizer
class LinearDecayLRScheduler(torch.optim.lr_scheduler._LRScheduler):
"""Linear learning rate scheduler with warm up."""
def __init__(
self,
optimizer: Optimizer,
warmup_updates: int,
max_updates: int,
last_epoch: int = -1,
verbose: bool = False,
):
self.warmup_updates = warmup_updates
self.max_updates = max_updates
super().__init__(optimizer, last_epoch=last_epoch, verbose=verbose)
def get_lr(self):
if self._step_count <= self.warmup_updates:
return [self._step_count / self.warmup_updates * base_lr for base_lr in self.base_lrs]
elif self._step_count >= self.max_updates:
return [0.0 for _ in self.base_lrs]
else:
pct_remaining = (self.max_updates - self._step_count) / (self.max_updates - self.warmup_updates)
return [base_lr * pct_remaining for base_lr in self.base_lrs]
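# Schedule shape (sketch): with warmup_updates=2, max_updates=10, and base_lr=1.0,
# the learning rate ramps through 0.5 and 1.0 during warm-up, then decays linearly,
# e.g. step 6 -> (10 - 6) / (10 - 2) = 0.5, reaching 0.0 from step 10 onward.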
import logging
import pathlib
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, RawDescriptionHelpFormatter
from functools import partial
from typing import Tuple
import torch
import torchaudio.models
from data_modules import HuBERTDataModule
from lightning import Batch, SSLPretrainModule
from losses import hubert_loss
from lr_schedulers import LinearDecayLRScheduler
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.utilities.seed import seed_everything
class _Formatter(ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter):
# To use ArgumentDefaultsHelpFormatter as the formatter_class and
# RawDescriptionHelpFormatter to add custom formatting to description or epilog.
# Check: https://stackoverflow.com/a/18462760
pass
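# The pre-training logits place the positive (target) class at index 0 (see
# ``hubert_loss``), so a frame is counted as correct when its argmax is 0;
# frames whose logits are all identical (argmax == argmin) are excluded.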
def _compute_accuracy(logits: torch.Tensor):
with torch.no_grad():
max = logits.argmax(-1) == 0
min = logits.argmin(-1) == 0
both = max & min
corr = max.long().sum().item() - both.long().sum().item()
count = max.numel()
return corr / count
class HuBERTModule(SSLPretrainModule):
def configure_optimizers(self):
return (
[self.optimizer],
[
{
"scheduler": self.lr_scheduler,
"interval": "step",
},
],
)
def log_metric(self, batch: Batch, output: Tuple, loss: torch.Tensor, step_type: str):
logit_m, logit_u, _ = output
self.log(
f"{step_type}_loss",
loss.item(),
on_step=True,
on_epoch=True,
)
acc_m = _compute_accuracy(logit_m)
acc_u = _compute_accuracy(logit_u)
self.log(
f"{step_type}_acc_m",
acc_m,
on_step=True,
on_epoch=True,
sync_dist=True,
prog_bar=step_type == "train",
)
self.log(
f"{step_type}_acc_u",
acc_u,
on_step=True,
on_epoch=True,
sync_dist=True,
prog_bar=step_type == "train",
)
def run_train(args):
seed_everything(1337)
checkpoint_dir = args.exp_dir / f"checkpoints_{args.dataset}_{args.model_name}"
checkpoint = ModelCheckpoint(
checkpoint_dir,
monitor="val_loss",
mode="min",
save_top_k=5,
save_weights_only=False,
verbose=True,
)
train_checkpoint = ModelCheckpoint(
checkpoint_dir,
monitor="train_loss",
mode="min",
save_top_k=5,
save_weights_only=False,
verbose=True,
)
callbacks = [
checkpoint,
train_checkpoint,
]
trainer = Trainer(
default_root_dir=args.exp_dir,
max_steps=args.max_updates,
num_nodes=args.num_nodes,
devices=args.gpus,
accelerator="gpu",
strategy="ddp",
precision=args.precision,
accumulate_grad_batches=args.accumulate_grad_batches,
gradient_clip_val=args.clip_norm,
replace_sampler_ddp=False,
callbacks=callbacks,
reload_dataloaders_every_n_epochs=1,
)
if args.model_name not in ["hubert_pretrain_base", "hubert_pretrain_large", "hubert_pretrain_xlarge"]:
raise ValueError(
"Expect model_name to be one of 'hubert_pretrain_base', 'hubert_pretrain_large', 'hubert_pretrain_xlarge'."
f"Found {args.model_name}."
)
model = getattr(torchaudio.models, args.model_name)()
loss_fn = partial(
hubert_loss,
masked_weight=args.masked_weight,
unmasked_weight=args.unmasked_weight,
feature_weight=args.feature_weight,
)
optimizer = torch.optim.AdamW(
model.parameters(),
lr=args.learning_rate,
betas=args.betas,
eps=args.eps,
weight_decay=args.weight_decay,
)
lr_scheduler = LinearDecayLRScheduler(optimizer, args.warmup_updates, args.max_updates)
lightning_module = HuBERTModule(
model,
loss_fn,
optimizer,
lr_scheduler,
)
data_module = HuBERTDataModule(
dataset_path=args.dataset_path,
dataset="librispeech",
feature_type="mfcc",
seconds_per_batch=200,
train_shuffle=True,
num_workers=10,
)
trainer.fit(lightning_module, datamodule=data_module, ckpt_path=args.resume_checkpoint)
def _parse_args():
parser = ArgumentParser(
description=__doc__,
formatter_class=_Formatter,
)
parser.add_argument(
"--dataset-path",
type=pathlib.Path,
required=True,
help="Path to the feature and label directories.",
)
parser.add_argument(
"--resume-checkpoint",
type=pathlib.Path,
default=None,
help="Path to the feature and label directories. (Default: None)",
)
parser.add_argument(
"--feature-type",
choices=["mfcc", "hubert"],
type=str,
required=True,
)
parser.add_argument(
"--feature-grad-mult",
default=0.1,
type=float,
help="The scaling factor to multiply the feature extractor gradient. (Default: 0.1)",
)
parser.add_argument(
"--num-classes",
choices=[100, 500],
type=int,
required=True,
help="The ``num_class`` when building the hubert_pretrain_base model.",
)
parser.add_argument(
"--model-name",
default="hubert_pretrain_base",
choices=[
"hubert_pretrain_base",
"hubert_pretrain_large",
"hubert_pretrain_xlarge",
],
type=str,
help="The HuBERT model to train. (Default: 'hubert_pretrain_base')",
)
parser.add_argument(
"--exp-dir",
default=pathlib.Path("./exp"),
type=pathlib.Path,
help="Directory to save checkpoints and logs to. (Default: './exp')",
)
parser.add_argument(
"--dataset",
default="librispeech",
choices=["librispeech", "librilight"],
type=str,
help="The dataset for training. (Default: 'librispeech')",
)
parser.add_argument(
"--learning-rate",
default=0.0005,
type=float,
help="The peak learning rate. (Default: 0.0005)",
)
parser.add_argument(
"--betas",
default=(0.9, 0.98),
nargs=2,
type=float,
help="The coefficients for computing running averages of the gradient and its square. (Default: (0.9, 0.98))",
)
parser.add_argument(
"--eps",
default=1e-6,
type=float,
help="Epsilon value in Adam optimizer. (Default: 1e-6)",
)
parser.add_argument(
"--weight-decay",
default=0.01,
type=float,
help="Weight decay (L2 penalty) (Default: 0.01)",
)
parser.add_argument(
"--precision",
default=16,
type=lambda x: x if x == "bf16" else int(x),
choices=[16, 32, 64, "bf16"],
help="Precision of model training. (Default: 16)",
)
parser.add_argument(
"--accumulate-grad-batches",
default=1,
type=int,
help="Number of steps for accumulating gradients. (Default: 1)",
)
parser.add_argument(
"--clip-norm",
default=10.0,
type=float,
help="The gradient norm value to clip. (Default: 10.0)",
)
parser.add_argument(
"--num-nodes",
default=4,
type=int,
help="Number of nodes to use for training. (Default: 4)",
)
parser.add_argument(
"--gpus",
default=8,
type=int,
help="Number of GPUs per node to use for training. (Default: 8)",
)
parser.add_argument(
"--warmup-updates",
default=32000,
type=int,
help="Number of steps for warm up the learning rate. (Default: 32000)",
)
parser.add_argument(
"--max-updates",
default=250000,
type=int,
help="Total number of training steps. (Default: 250000)",
)
parser.add_argument(
"--seconds-per-batch",
default=87.5,
type=float,
help="Number of seconds of audio in a mini-batch. (Default: 87.5)",
)
parser.add_argument(
"--masked-weight",
default=1.0,
type=float,
help="The weight for cross-entropy loss of masksed frames. (Default: ``1.0``)",
)
parser.add_argument(
"--unmasked-weight",
default=0.0,
type=float,
help="The weight for cross-entropy loss of unmasksed frames. (Default: ``0.0``)",
)
parser.add_argument(
"--feature-weight",
default=10.0,
type=float,
help="The weight for feature penalty loss. (Default: ``10.0``)",
)
parser.add_argument("--debug", action="store_true", help="whether to use debug level for logging")
return parser.parse_args()
def _init_logger(debug):
fmt = "%(asctime)s %(message)s" if debug else "%(message)s"
level = logging.DEBUG if debug else logging.INFO
logging.basicConfig(format=fmt, level=level, datefmt="%Y-%m-%d %H:%M:%S")
def cli_main():
args = _parse_args()
_init_logger(args.debug)
run_train(args)
if __name__ == "__main__":
cli_main()
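# Example invocation (sketch; the script name and dataset path are assumptions):
#   python train.py \
#       --dataset-path /path/to/exp_dir \
#       --feature-type mfcc \
#       --num-classes 100 \
#       --num-nodes 1 \
#       --gpus 8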