Unverified Commit 6645eb61 authored by Nik's avatar Nik Committed by GitHub
Browse files

fix #14524 (IndexError when mask prob is too low) (#14525)

* fix #14524 (IndexError when mask prob is too low)

* fix formatting

* correct documentation, add option for setting min_num_masks

* change the semantic meaning of `mask_prob` in _compute_mask_indices

With this commit the meaning of `mask_prob` actually adheres to the probability for each
vector to be the start of a masked span.

* fix check_copies test

* fix documentation to semantic meaning of `upper bound of overall masking percentage`, revert changes to _compute_mask_indices

* fix typo
parent 96cc02b5
...@@ -101,17 +101,30 @@ class HubertConfig(PretrainedConfig): ...@@ -101,17 +101,30 @@ class HubertConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespective of ``mask_time_prob``. Only relevant if
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespective of ``mask_feature_prob``. Only relevant if
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`): ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
instance of :class:`~transformers.HubertForCTC`. instance of :class:`~transformers.HubertForCTC`.
...@@ -169,8 +182,10 @@ class HubertConfig(PretrainedConfig): ...@@ -169,8 +182,10 @@ class HubertConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="sum", ctc_loss_reduction="sum",
ctc_zero_infinity=False, ctc_zero_infinity=False,
use_weighted_layer_sum=False, use_weighted_layer_sum=False,
...@@ -225,8 +240,10 @@ class HubertConfig(PretrainedConfig): ...@@ -225,8 +240,10 @@ class HubertConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# ctc loss # ctc loss
self.ctc_loss_reduction = ctc_loss_reduction self.ctc_loss_reduction = ctc_loss_reduction
......
...@@ -69,13 +69,16 @@ def _compute_mask_indices( ...@@ -69,13 +69,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
...@@ -84,9 +87,11 @@ def _compute_mask_indices( ...@@ -84,9 +87,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
...@@ -113,15 +118,21 @@ def _compute_mask_indices( ...@@ -113,15 +118,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
...@@ -137,6 +148,7 @@ def _compute_mask_indices( ...@@ -137,6 +148,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
...@@ -930,7 +942,7 @@ class HubertModel(HubertPreTrainedModel): ...@@ -930,7 +942,7 @@ class HubertModel(HubertPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
...@@ -941,6 +953,7 @@ class HubertModel(HubertPreTrainedModel): ...@@ -941,6 +953,7 @@ class HubertModel(HubertPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
......
...@@ -95,17 +95,30 @@ class SEWConfig(PretrainedConfig): ...@@ -95,17 +95,30 @@ class SEWConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespective of ``mask_time_prob``. Only relevant if
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespective of ``mask_feature_prob``. Only relevant if
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`): ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
instance of :class:`~transformers.SEWForCTC`. instance of :class:`~transformers.SEWForCTC`.
...@@ -162,8 +175,10 @@ class SEWConfig(PretrainedConfig): ...@@ -162,8 +175,10 @@ class SEWConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="mean", ctc_loss_reduction="mean",
ctc_zero_infinity=False, ctc_zero_infinity=False,
use_weighted_layer_sum=False, use_weighted_layer_sum=False,
...@@ -215,8 +230,10 @@ class SEWConfig(PretrainedConfig): ...@@ -215,8 +230,10 @@ class SEWConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# ctc loss # ctc loss
self.ctc_loss_reduction = ctc_loss_reduction self.ctc_loss_reduction = ctc_loss_reduction
......
...@@ -67,13 +67,16 @@ def _compute_mask_indices( ...@@ -67,13 +67,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
...@@ -82,9 +85,11 @@ def _compute_mask_indices( ...@@ -82,9 +85,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
...@@ -111,15 +116,21 @@ def _compute_mask_indices( ...@@ -111,15 +116,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
...@@ -135,6 +146,7 @@ def _compute_mask_indices( ...@@ -135,6 +146,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
...@@ -829,7 +841,7 @@ class SEWModel(SEWPreTrainedModel): ...@@ -829,7 +841,7 @@ class SEWModel(SEWPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
...@@ -840,6 +852,7 @@ class SEWModel(SEWPreTrainedModel): ...@@ -840,6 +852,7 @@ class SEWModel(SEWPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
......
...@@ -113,17 +113,30 @@ class SEWDConfig(PretrainedConfig): ...@@ -113,17 +113,30 @@ class SEWDConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespective of ``mask_time_prob``. Only relevant if
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespective of ``mask_feature_prob``. Only relevant if
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1): diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1):
The weight of the codebook diversity loss component. The weight of the codebook diversity loss component.
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`): ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
...@@ -190,8 +203,10 @@ class SEWDConfig(PretrainedConfig): ...@@ -190,8 +203,10 @@ class SEWDConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="mean", ctc_loss_reduction="mean",
ctc_zero_infinity=False, ctc_zero_infinity=False,
use_weighted_layer_sum=False, use_weighted_layer_sum=False,
...@@ -251,8 +266,10 @@ class SEWDConfig(PretrainedConfig): ...@@ -251,8 +266,10 @@ class SEWDConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# ctc loss # ctc loss
self.ctc_loss_reduction = ctc_loss_reduction self.ctc_loss_reduction = ctc_loss_reduction
......
...@@ -73,13 +73,16 @@ def _compute_mask_indices( ...@@ -73,13 +73,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
...@@ -88,9 +91,11 @@ def _compute_mask_indices( ...@@ -88,9 +91,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
...@@ -117,15 +122,21 @@ def _compute_mask_indices( ...@@ -117,15 +122,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
...@@ -141,6 +152,7 @@ def _compute_mask_indices( ...@@ -141,6 +152,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
...@@ -1360,7 +1372,7 @@ class SEWDModel(SEWDPreTrainedModel): ...@@ -1360,7 +1372,7 @@ class SEWDModel(SEWDPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
...@@ -1371,6 +1383,7 @@ class SEWDModel(SEWDPreTrainedModel): ...@@ -1371,6 +1383,7 @@ class SEWDModel(SEWDPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
......
...@@ -101,17 +101,30 @@ class UniSpeechConfig(PretrainedConfig): ...@@ -101,17 +101,30 @@ class UniSpeechConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespective of ``mask_time_prob``. Only relevant if
``mask_time_prob * len(time_axis) / mask_time_length < mask_time_min_masks``.
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespective of ``mask_feature_prob``. Only relevant if
``mask_feature_prob * len(feature_axis) / mask_feature_length < mask_feature_min_masks``.
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320): num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
Number of entries in each quantization codebook (group). Number of entries in each quantization codebook (group).
num_codevector_groups (:obj:`int`, `optional`, defaults to 2): num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
...@@ -187,8 +200,10 @@ class UniSpeechConfig(PretrainedConfig): ...@@ -187,8 +200,10 @@ class UniSpeechConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
num_codevectors_per_group=320, num_codevectors_per_group=320,
num_codevector_groups=2, num_codevector_groups=2,
contrastive_logits_temperature=0.1, contrastive_logits_temperature=0.1,
...@@ -252,8 +267,10 @@ class UniSpeechConfig(PretrainedConfig): ...@@ -252,8 +267,10 @@ class UniSpeechConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# parameters for pretraining with codevector quantized representations # parameters for pretraining with codevector quantized representations
self.num_codevectors_per_group = num_codevectors_per_group self.num_codevectors_per_group = num_codevectors_per_group
......
...@@ -136,13 +136,16 @@ def _compute_mask_indices( ...@@ -136,13 +136,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be of a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
...@@ -151,9 +154,11 @@ def _compute_mask_indices( ...@@ -151,9 +154,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
...@@ -180,15 +185,21 @@ def _compute_mask_indices( ...@@ -180,15 +185,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
...@@ -204,6 +215,7 @@ def _compute_mask_indices( ...@@ -204,6 +215,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
...@@ -1076,7 +1088,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel): ...@@ -1076,7 +1088,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
...@@ -1087,6 +1099,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel): ...@@ -1087,6 +1099,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
......
...@@ -101,17 +101,30 @@ class UniSpeechSatConfig(PretrainedConfig): ...@@ -101,17 +101,30 @@ class UniSpeechSatConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespective of ``mask_time_prob``. Only relevant if
``mask_time_prob * len(time_axis) / mask_time_length < mask_time_min_masks``.
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespective of ``mask_feature_prob``. Only relevant if
``mask_feature_prob * len(feature_axis) / mask_feature_length < mask_feature_min_masks``.
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320): num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
Number of entries in each quantization codebook (group). Number of entries in each quantization codebook (group).
num_codevector_groups (:obj:`int`, `optional`, defaults to 2): num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
...@@ -185,8 +198,10 @@ class UniSpeechSatConfig(PretrainedConfig): ...@@ -185,8 +198,10 @@ class UniSpeechSatConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
num_codevectors_per_group=320, num_codevectors_per_group=320,
num_codevector_groups=2, num_codevector_groups=2,
contrastive_logits_temperature=0.1, contrastive_logits_temperature=0.1,
...@@ -249,8 +264,10 @@ class UniSpeechSatConfig(PretrainedConfig): ...@@ -249,8 +264,10 @@ class UniSpeechSatConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# parameters for pretraining with codevector quantized representations # parameters for pretraining with codevector quantized representations
self.num_codevectors_per_group = num_codevectors_per_group self.num_codevectors_per_group = num_codevectors_per_group
......
...@@ -137,13 +137,16 @@ def _compute_mask_indices( ...@@ -137,13 +137,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be of a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
...@@ -152,9 +155,11 @@ def _compute_mask_indices( ...@@ -152,9 +155,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
...@@ -181,15 +186,21 @@ def _compute_mask_indices( ...@@ -181,15 +186,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
...@@ -205,6 +216,7 @@ def _compute_mask_indices( ...@@ -205,6 +216,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
...@@ -1077,7 +1089,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel): ...@@ -1077,7 +1089,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
...@@ -1088,6 +1100,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel): ...@@ -1088,6 +1100,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
......
...@@ -101,17 +101,30 @@ class Wav2Vec2Config(PretrainedConfig): ...@@ -101,17 +101,30 @@ class Wav2Vec2Config(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespective of ``mask_time_prob``. Only relevant if
``mask_time_prob * len(time_axis) / mask_time_length < mask_time_min_masks``.
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespective of ``mask_feature_prob``. Only relevant if
``mask_feature_prob * len(feature_axis) / mask_feature_length < mask_feature_min_masks``.
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320): num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
Number of entries in each quantization codebook (group). Number of entries in each quantization codebook (group).
num_codevector_groups (:obj:`int`, `optional`, defaults to 2): num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
...@@ -198,8 +211,10 @@ class Wav2Vec2Config(PretrainedConfig): ...@@ -198,8 +211,10 @@ class Wav2Vec2Config(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
num_codevectors_per_group=320, num_codevectors_per_group=320,
num_codevector_groups=2, num_codevector_groups=2,
contrastive_logits_temperature=0.1, contrastive_logits_temperature=0.1,
...@@ -265,8 +280,10 @@ class Wav2Vec2Config(PretrainedConfig): ...@@ -265,8 +280,10 @@ class Wav2Vec2Config(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# parameters for pretraining with codevector quantized representations # parameters for pretraining with codevector quantized representations
self.num_codevectors_per_group = num_codevectors_per_group self.num_codevectors_per_group = num_codevectors_per_group
......
...@@ -145,13 +145,16 @@ def _compute_mask_indices( ...@@ -145,13 +145,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be of a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
...@@ -160,9 +163,11 @@ def _compute_mask_indices( ...@@ -160,9 +163,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
...@@ -189,15 +194,21 @@ def _compute_mask_indices( ...@@ -189,15 +194,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
...@@ -213,6 +224,7 @@ def _compute_mask_indices( ...@@ -213,6 +224,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
...@@ -1182,7 +1194,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel): ...@@ -1182,7 +1194,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
...@@ -1193,6 +1205,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel): ...@@ -1193,6 +1205,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
......
...@@ -854,6 +854,36 @@ class Wav2Vec2UtilsTest(unittest.TestCase): ...@@ -854,6 +854,36 @@ class Wav2Vec2UtilsTest(unittest.TestCase):
self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)])
def test_compute_mask_indices_low_prob(self):
# with these settings num_masked_spans=0.5, which means probabilistic rounding
# ensures that in 5 out of 10 method calls, num_masked_spans=0, and in
# the other 5 out of 10, cases num_masked_spans=1
n_trials = 100
batch_size = 4
sequence_length = 100
mask_prob = 0.05
mask_length = 10
count_dimensions_masked = 0
count_dimensions_not_masked = 0
for _ in range(n_trials):
mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
mask = torch.from_numpy(mask).to(torch_device)
num_masks = torch.sum(mask).item()
if num_masks > 0:
count_dimensions_masked += 1
else:
count_dimensions_not_masked += 1
# as we test for at least 10 masked dimension and at least
# 10 non-masked dimension, this test could fail with probability:
# P(100 coin flips, at most 9 heads) = 1.66e-18
self.assertGreater(count_dimensions_masked, int(n_trials * 0.1))
self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1))
def test_compute_mask_indices_overlap(self): def test_compute_mask_indices_overlap(self):
batch_size = 4 batch_size = 4
sequence_length = 80 sequence_length = 80
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment