Unverified Commit 6645eb61 authored by Nik's avatar Nik Committed by GitHub
Browse files

fix #14524 (IndexError when mask prob is too low) (#14525)

* fix #14524 (IndexError when mask prob is too low)

* fix formatting

* correct documentation, add option for setting min_num_masks

* change the semantic meaning of `mask_prob` in _compute_mask_indices

With this commit the meaning of `mask_prob` actually adheres to the probability for each
vector to be the start of a masked span.

* fix check_copies test

* fix documentation to semantic meaning of `upper bound of overall masking percentage`, revert changes to _compute_mask_indices

* fix typo
parent 96cc02b5
...@@ -101,17 +101,30 @@ class HubertConfig(PretrainedConfig): ...@@ -101,17 +101,30 @@ class HubertConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespective of ``mask_time_prob``. Only relevant if
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespective of ``mask_feature_prob``. Only relevant if
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`): ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
instance of :class:`~transformers.HubertForCTC`. instance of :class:`~transformers.HubertForCTC`.
...@@ -169,8 +182,10 @@ class HubertConfig(PretrainedConfig): ...@@ -169,8 +182,10 @@ class HubertConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="sum", ctc_loss_reduction="sum",
ctc_zero_infinity=False, ctc_zero_infinity=False,
use_weighted_layer_sum=False, use_weighted_layer_sum=False,
...@@ -225,8 +240,10 @@ class HubertConfig(PretrainedConfig): ...@@ -225,8 +240,10 @@ class HubertConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# ctc loss # ctc loss
self.ctc_loss_reduction = ctc_loss_reduction self.ctc_loss_reduction = ctc_loss_reduction
......
...@@ -69,13 +69,16 @@ def _compute_mask_indices( ...@@ -69,13 +69,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
...@@ -84,9 +87,11 @@ def _compute_mask_indices( ...@@ -84,9 +87,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
...@@ -113,15 +118,21 @@ def _compute_mask_indices( ...@@ -113,15 +118,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
...@@ -137,6 +148,7 @@ def _compute_mask_indices( ...@@ -137,6 +148,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
...@@ -930,7 +942,7 @@ class HubertModel(HubertPreTrainedModel): ...@@ -930,7 +942,7 @@ class HubertModel(HubertPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
...@@ -941,6 +953,7 @@ class HubertModel(HubertPreTrainedModel): ...@@ -941,6 +953,7 @@ class HubertModel(HubertPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
......
...@@ -95,17 +95,30 @@ class SEWConfig(PretrainedConfig): ...@@ -95,17 +95,30 @@ class SEWConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespective of ``mask_time_prob``. Only relevant if
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespective of ``mask_feature_prob``. Only relevant if
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`): ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an Specifies the reduction to apply to the output of ``torch.nn.CTCLoss``. Only relevant when training an
instance of :class:`~transformers.SEWForCTC`. instance of :class:`~transformers.SEWForCTC`.
...@@ -162,8 +175,10 @@ class SEWConfig(PretrainedConfig): ...@@ -162,8 +175,10 @@ class SEWConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="mean", ctc_loss_reduction="mean",
ctc_zero_infinity=False, ctc_zero_infinity=False,
use_weighted_layer_sum=False, use_weighted_layer_sum=False,
...@@ -215,8 +230,10 @@ class SEWConfig(PretrainedConfig): ...@@ -215,8 +230,10 @@ class SEWConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# ctc loss # ctc loss
self.ctc_loss_reduction = ctc_loss_reduction self.ctc_loss_reduction = ctc_loss_reduction
......
...@@ -67,13 +67,16 @@ def _compute_mask_indices( ...@@ -67,13 +67,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
...@@ -82,9 +85,11 @@ def _compute_mask_indices( ...@@ -82,9 +85,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
...@@ -111,15 +116,21 @@ def _compute_mask_indices( ...@@ -111,15 +116,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
...@@ -135,6 +146,7 @@ def _compute_mask_indices( ...@@ -135,6 +146,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
...@@ -829,7 +841,7 @@ class SEWModel(SEWPreTrainedModel): ...@@ -829,7 +841,7 @@ class SEWModel(SEWPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
...@@ -840,6 +852,7 @@ class SEWModel(SEWPreTrainedModel): ...@@ -840,6 +852,7 @@ class SEWModel(SEWPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
......
...@@ -113,17 +113,30 @@ class SEWDConfig(PretrainedConfig): ...@@ -113,17 +113,30 @@ class SEWDConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procedure generates ``mask_time_prob*len(time_axis)/mask_time_length`` independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespective of ``mask_time_prob``. Only relevant if
``mask_time_prob*len(time_axis)/mask_time_length < mask_time_min_masks``
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procedure generates ``mask_feature_prob*len(feature_axis)/mask_feature_length`` independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespective of ``mask_feature_prob``. Only relevant if
``mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks``
diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1): diversity_loss_weight (:obj:`int`, `optional`, defaults to 0.1):
The weight of the codebook diversity loss component. The weight of the codebook diversity loss component.
ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`): ctc_loss_reduction (:obj:`str`, `optional`, defaults to :obj:`"sum"`):
...@@ -190,8 +203,10 @@ class SEWDConfig(PretrainedConfig): ...@@ -190,8 +203,10 @@ class SEWDConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
ctc_loss_reduction="mean", ctc_loss_reduction="mean",
ctc_zero_infinity=False, ctc_zero_infinity=False,
use_weighted_layer_sum=False, use_weighted_layer_sum=False,
...@@ -251,8 +266,10 @@ class SEWDConfig(PretrainedConfig): ...@@ -251,8 +266,10 @@ class SEWDConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# ctc loss # ctc loss
self.ctc_loss_reduction = ctc_loss_reduction self.ctc_loss_reduction = ctc_loss_reduction
......
...@@ -73,13 +73,16 @@ def _compute_mask_indices( ...@@ -73,13 +73,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
...@@ -88,9 +91,11 @@ def _compute_mask_indices( ...@@ -88,9 +91,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
...@@ -117,15 +122,21 @@ def _compute_mask_indices( ...@@ -117,15 +122,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
...@@ -141,6 +152,7 @@ def _compute_mask_indices( ...@@ -141,6 +152,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
...@@ -1360,7 +1372,7 @@ class SEWDModel(SEWDPreTrainedModel): ...@@ -1360,7 +1372,7 @@ class SEWDModel(SEWDPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
...@@ -1371,6 +1383,7 @@ class SEWDModel(SEWDPreTrainedModel): ...@@ -1371,6 +1383,7 @@ class SEWDModel(SEWDPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
......
...@@ -101,17 +101,30 @@ class UniSpeechConfig(PretrainedConfig): ...@@ -101,17 +101,30 @@ class UniSpeechConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespective of ``mask_time_prob``. Only relevant if
``mask_time_prob * len(time_axis) / mask_time_length < mask_time_min_masks``.
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespective of ``mask_feature_prob``. Only relevant if
``mask_feature_prob * len(feature_axis) / mask_feature_length < mask_feature_min_masks``.
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320): num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
Number of entries in each quantization codebook (group). Number of entries in each quantization codebook (group).
num_codevector_groups (:obj:`int`, `optional`, defaults to 2): num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
...@@ -187,8 +200,10 @@ class UniSpeechConfig(PretrainedConfig): ...@@ -187,8 +200,10 @@ class UniSpeechConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
num_codevectors_per_group=320, num_codevectors_per_group=320,
num_codevector_groups=2, num_codevector_groups=2,
contrastive_logits_temperature=0.1, contrastive_logits_temperature=0.1,
...@@ -252,8 +267,10 @@ class UniSpeechConfig(PretrainedConfig): ...@@ -252,8 +267,10 @@ class UniSpeechConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# parameters for pretraining with codevector quantized representations # parameters for pretraining with codevector quantized representations
self.num_codevectors_per_group = num_codevectors_per_group self.num_codevectors_per_group = num_codevectors_per_group
......
...@@ -136,13 +136,16 @@ def _compute_mask_indices( ...@@ -136,13 +136,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be of a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
...@@ -151,9 +154,11 @@ def _compute_mask_indices( ...@@ -151,9 +154,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
...@@ -180,15 +185,21 @@ def _compute_mask_indices( ...@@ -180,15 +185,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
...@@ -204,6 +215,7 @@ def _compute_mask_indices( ...@@ -204,6 +215,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
...@@ -1076,7 +1088,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel): ...@@ -1076,7 +1088,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
...@@ -1087,6 +1099,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel): ...@@ -1087,6 +1099,7 @@ class UniSpeechModel(UniSpeechPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
......
...@@ -101,17 +101,30 @@ class UniSpeechSatConfig(PretrainedConfig): ...@@ -101,17 +101,30 @@ class UniSpeechSatConfig(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespective of ``mask_time_prob``. Only relevant if
``mask_time_prob * len(time_axis) / mask_time_length < mask_time_min_masks``.
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespective of ``mask_feature_prob``. Only relevant if
``mask_feature_prob * len(feature_axis) / mask_feature_length < mask_feature_min_masks``.
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320): num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
Number of entries in each quantization codebook (group). Number of entries in each quantization codebook (group).
num_codevector_groups (:obj:`int`, `optional`, defaults to 2): num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
...@@ -185,8 +198,10 @@ class UniSpeechSatConfig(PretrainedConfig): ...@@ -185,8 +198,10 @@ class UniSpeechSatConfig(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
num_codevectors_per_group=320, num_codevectors_per_group=320,
num_codevector_groups=2, num_codevector_groups=2,
contrastive_logits_temperature=0.1, contrastive_logits_temperature=0.1,
...@@ -249,8 +264,10 @@ class UniSpeechSatConfig(PretrainedConfig): ...@@ -249,8 +264,10 @@ class UniSpeechSatConfig(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# parameters for pretraining with codevector quantized representations # parameters for pretraining with codevector quantized representations
self.num_codevectors_per_group = num_codevectors_per_group self.num_codevectors_per_group = num_codevectors_per_group
......
...@@ -137,13 +137,16 @@ def _compute_mask_indices( ...@@ -137,13 +137,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be of a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
...@@ -152,9 +155,11 @@ def _compute_mask_indices( ...@@ -152,9 +155,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
...@@ -181,15 +186,21 @@ def _compute_mask_indices( ...@@ -181,15 +186,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
...@@ -205,6 +216,7 @@ def _compute_mask_indices( ...@@ -205,6 +216,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
...@@ -1077,7 +1089,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel): ...@@ -1077,7 +1089,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
...@@ -1088,6 +1100,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel): ...@@ -1088,6 +1100,7 @@ class UniSpeechSatModel(UniSpeechSatPreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
......
...@@ -101,17 +101,30 @@ class Wav2Vec2Config(PretrainedConfig): ...@@ -101,17 +101,30 @@ class Wav2Vec2Config(PretrainedConfig):
`SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition `SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
<https://arxiv.org/abs/1904.08779>`__. <https://arxiv.org/abs/1904.08779>`__.
mask_time_prob (:obj:`float`, `optional`, defaults to 0.05): mask_time_prob (:obj:`float`, `optional`, defaults to 0.05):
Propability of each feature vector along the time axis to be chosen as the start of the vector span to be Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
masked. Approximately ``mask_time_prob * sequence_length // mask_time_length`` feature vectors will be procecure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. reasoning from the propability of each feature vector to be chosen as the start of the vector span to be
masked, `mask_time_prob` should be ``prob_vector_start*mask_time_length``. Note that overlap may decrease
the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment is True``.
mask_time_length (:obj:`int`, `optional`, defaults to 10): mask_time_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the time axis. Length of vector span along the time axis.
mask_time_min_masks (:obj:`int`, `optional`, defaults to 2):
The minimum number of masks of length ``mask_time_length`` generated along the time axis, each time
step, irrespective of ``mask_time_prob``. Only relevant if
``mask_time_prob * len(time_axis) / mask_time_length < mask_time_min_masks``.
mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0): mask_feature_prob (:obj:`float`, `optional`, defaults to 0.0):
Propability of each feature vector along the feature axis to be chosen as the start of the vector span to Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
be masked. Approximately ``mask_time_prob * hidden_size // mask_time_length`` feature vectors will be masking procecure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
masked along the time axis. This is only relevant if ``apply_spec_augment is True``. the axis. If reasoning from the propability of each feature vector to be chosen as the start of the vector
span to be masked, `mask_feature_prob` should be ``prob_vector_start*mask_feature_length``. Note that
overlap may decrease the actual percentage of masked vectors. This is only relevant if ``apply_spec_augment
is True``.
mask_feature_length (:obj:`int`, `optional`, defaults to 10): mask_feature_length (:obj:`int`, `optional`, defaults to 10):
Length of vector span along the feature axis. Length of vector span along the feature axis.
mask_feature_min_masks (:obj:`int`, `optional`, defaults to 0):
The minimum number of masks of length ``mask_feature_length`` generated along the feature axis, each time
step, irrespective of ``mask_feature_prob``. Only relevant if
``mask_feature_prob * len(feature_axis) / mask_feature_length < mask_feature_min_masks``.
num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320): num_codevectors_per_group (:obj:`int`, `optional`, defaults to 320):
Number of entries in each quantization codebook (group). Number of entries in each quantization codebook (group).
num_codevector_groups (:obj:`int`, `optional`, defaults to 2): num_codevector_groups (:obj:`int`, `optional`, defaults to 2):
...@@ -198,8 +211,10 @@ class Wav2Vec2Config(PretrainedConfig): ...@@ -198,8 +211,10 @@ class Wav2Vec2Config(PretrainedConfig):
apply_spec_augment=True, apply_spec_augment=True,
mask_time_prob=0.05, mask_time_prob=0.05,
mask_time_length=10, mask_time_length=10,
mask_time_min_masks=2,
mask_feature_prob=0.0, mask_feature_prob=0.0,
mask_feature_length=10, mask_feature_length=10,
mask_feature_min_masks=0,
num_codevectors_per_group=320, num_codevectors_per_group=320,
num_codevector_groups=2, num_codevector_groups=2,
contrastive_logits_temperature=0.1, contrastive_logits_temperature=0.1,
...@@ -265,8 +280,10 @@ class Wav2Vec2Config(PretrainedConfig): ...@@ -265,8 +280,10 @@ class Wav2Vec2Config(PretrainedConfig):
self.apply_spec_augment = apply_spec_augment self.apply_spec_augment = apply_spec_augment
self.mask_time_prob = mask_time_prob self.mask_time_prob = mask_time_prob
self.mask_time_length = mask_time_length self.mask_time_length = mask_time_length
self.mask_time_min_masks = mask_time_min_masks
self.mask_feature_prob = mask_feature_prob self.mask_feature_prob = mask_feature_prob
self.mask_feature_length = mask_feature_length self.mask_feature_length = mask_feature_length
self.mask_feature_min_masks = mask_feature_min_masks
# parameters for pretraining with codevector quantized representations # parameters for pretraining with codevector quantized representations
self.num_codevectors_per_group = num_codevectors_per_group self.num_codevectors_per_group = num_codevectors_per_group
......
...@@ -145,13 +145,16 @@ def _compute_mask_indices( ...@@ -145,13 +145,16 @@ def _compute_mask_indices(
on CPU as part of the preprocessing during training. on CPU as part of the preprocessing during training.
Args: Args:
shape: the the shape for which to compute masks. shape: The shape for which to compute masks. This should be of a tuple of size 2 where
should be of size 2 where first element is batch size and 2nd is timesteps the first element is the batch size and the second element is the length of the axis to span.
mask_prob: probability for each token to be chosen as start of the span to be masked. this will be multiplied by mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
number of timesteps divided by length of mask span to mask approximately this percentage of all elements. independently generated mask spans of length `mask_length` is computed by
however due to overlaps, the actual number will be smaller (unless no_overlap is True) `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
actual percentage will be smaller.
mask_length: size of the mask mask_length: size of the mask
min_masks: minimum number of masked spans min_masks: minimum number of masked spans
attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
each batch dimension.
""" """
batch_size, sequence_length = shape batch_size, sequence_length = shape
...@@ -160,9 +163,11 @@ def _compute_mask_indices( ...@@ -160,9 +163,11 @@ def _compute_mask_indices(
if mask_length > sequence_length: if mask_length > sequence_length:
raise ValueError( raise ValueError(
f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length} and `sequence_length`: {sequence_length}`" f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
f" and `sequence_length`: {sequence_length}`"
) )
# epsilon is used for probabilistic rounding
epsilon = np.random.rand(1).item() epsilon = np.random.rand(1).item()
def compute_num_masked_span(input_length): def compute_num_masked_span(input_length):
...@@ -189,15 +194,21 @@ def _compute_mask_indices( ...@@ -189,15 +194,21 @@ def _compute_mask_indices(
max_num_masked_span = compute_num_masked_span(sequence_length) max_num_masked_span = compute_num_masked_span(sequence_length)
if max_num_masked_span == 0:
return spec_aug_mask
for input_length in input_lengths: for input_length in input_lengths:
# compute num of masked spans for this input # compute num of masked spans for this input
num_masked_span = compute_num_masked_span(input_length) num_masked_span = compute_num_masked_span(input_length)
# get random indices to mask # get random indices to mask
spec_aug_mask_idx = np.random.choice( spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
) )
# pick first sampled index that will serve as a dummy index to pad vector # pick first sampled index that will serve as a dummy index to pad vector
# to ensure same dimension for all batches due to probabilistic rounding
# Picking first sample just pads those vectors twice.
dummy_mask_idx = spec_aug_mask_idx[0] dummy_mask_idx = spec_aug_mask_idx[0]
spec_aug_mask_idx = np.concatenate( spec_aug_mask_idx = np.concatenate(
...@@ -213,6 +224,7 @@ def _compute_mask_indices( ...@@ -213,6 +224,7 @@ def _compute_mask_indices(
) )
spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length) spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
# add offset to the starting indexes so that the indexes now create a span
offsets = np.arange(mask_length)[None, None, :] offsets = np.arange(mask_length)[None, None, :]
offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape( offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
batch_size, max_num_masked_span * mask_length batch_size, max_num_masked_span * mask_length
...@@ -1182,7 +1194,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel): ...@@ -1182,7 +1194,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
mask_prob=self.config.mask_time_prob, mask_prob=self.config.mask_time_prob,
mask_length=self.config.mask_time_length, mask_length=self.config.mask_time_length,
attention_mask=attention_mask, attention_mask=attention_mask,
min_masks=2, min_masks=self.config.mask_time_min_masks,
) )
mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool) mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype) hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
...@@ -1193,6 +1205,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel): ...@@ -1193,6 +1205,7 @@ class Wav2Vec2Model(Wav2Vec2PreTrainedModel):
(batch_size, hidden_size), (batch_size, hidden_size),
mask_prob=self.config.mask_feature_prob, mask_prob=self.config.mask_feature_prob,
mask_length=self.config.mask_feature_length, mask_length=self.config.mask_feature_length,
min_masks=self.config.mask_feature_min_masks,
) )
mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool) mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1) mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
......
...@@ -854,6 +854,36 @@ class Wav2Vec2UtilsTest(unittest.TestCase): ...@@ -854,6 +854,36 @@ class Wav2Vec2UtilsTest(unittest.TestCase):
self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)]) self.assertListEqual(mask.sum(axis=-1).tolist(), [mask_prob * sequence_length for _ in range(batch_size)])
def test_compute_mask_indices_low_prob(self):
# with these settings num_masked_spans=0.5, which means probabilistic rounding
# ensures that in 5 out of 10 method calls, num_masked_spans=0, and in
# the other 5 out of 10, cases num_masked_spans=1
n_trials = 100
batch_size = 4
sequence_length = 100
mask_prob = 0.05
mask_length = 10
count_dimensions_masked = 0
count_dimensions_not_masked = 0
for _ in range(n_trials):
mask = _compute_mask_indices((batch_size, sequence_length), mask_prob, mask_length)
mask = torch.from_numpy(mask).to(torch_device)
num_masks = torch.sum(mask).item()
if num_masks > 0:
count_dimensions_masked += 1
else:
count_dimensions_not_masked += 1
# as we test for at least 10 masked dimension and at least
# 10 non-masked dimension, this test could fail with probability:
# P(100 coin flips, at most 9 heads) = 1.66e-18
self.assertGreater(count_dimensions_masked, int(n_trials * 0.1))
self.assertGreater(count_dimensions_not_masked, int(n_trials * 0.1))
def test_compute_mask_indices_overlap(self): def test_compute_mask_indices_overlap(self):
batch_size = 4 batch_size = 4
sequence_length = 80 sequence_length = 80
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment