Unverified commit bdbe3df8, authored by Patrick von Platen, committed by GitHub

[WavLM] Layerdrop is not allowed for first layer (#14811)

* [WavLM] Layerdrop is not allowed for first layer

* Apply suggestions from code review
parent cbf036f7
@@ -686,7 +686,6 @@ class WavLMEncoder(nn.Module):
         hidden_states = self.dropout(hidden_states)
         deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
         position_bias = None
         for i, layer in enumerate(self.layers):
@@ -696,7 +695,7 @@ class WavLMEncoder(nn.Module):
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             dropout_probability = np.random.uniform(0, 1)
-            skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
+            skip_the_layer = self.training and i > 0 and (dropout_probability < self.config.layerdrop)
             if not skip_the_layer or deepspeed_zero3_is_enabled:
                 # under deepspeed zero3 all gpus must run in sync
                 if self.gradient_checkpointing and self.training:
@@ -777,17 +776,16 @@ class WavLMEncoderStableLayerNorm(nn.Module):
         hidden_states = self.dropout(hidden_states)
         deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
         position_bias = None
-        for layer in self.layers:
+        for i, layer in enumerate(self.layers):
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             dropout_probability = np.random.uniform(0, 1)
-            skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
+            skip_the_layer = self.training and i > 0 and (dropout_probability < self.config.layerdrop)
             if not skip_the_layer or deepspeed_zero3_is_enabled:
                 # under deepspeed zero3 all gpus must run in sync
                 # XXX: could optimize this like synced_gpus in generate_utils but not sure if it's worth the code complication
...
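For context, the LayerDrop rule after this change always executes the first encoder layer and only ever skips later layers during training, presumably because the first layer produces the position_bias that subsequent layers reuse. The snippet below is a minimal, self-contained sketch of that skip rule, not the library code: DummyLayer, run_layers_with_layerdrop, layerdrop, and training are illustrative stand-ins rather than transformers identifiers.

import numpy as np
import torch
import torch.nn as nn

class DummyLayer(nn.Module):
    # Hypothetical stand-in for a WavLM encoder layer (not a transformers class).
    def __init__(self, dim):
        super().__init__()
        self.linear = nn.Linear(dim, dim)

    def forward(self, hidden_states):
        return self.linear(hidden_states)

def run_layers_with_layerdrop(layers, hidden_states, layerdrop=0.1, training=True):
    # Mirrors the skip rule from the diff: `i > 0` keeps the first layer from
    # ever being dropped; later layers are skipped with probability `layerdrop`
    # while training.
    for i, layer in enumerate(layers):
        dropout_probability = np.random.uniform(0, 1)
        skip_the_layer = training and i > 0 and (dropout_probability < layerdrop)
        if not skip_the_layer:
            hidden_states = layer(hidden_states)
    return hidden_states

layers = nn.ModuleList([DummyLayer(16) for _ in range(4)])
output = run_layers_with_layerdrop(layers, torch.randn(2, 16))
print(output.shape)  # torch.Size([2, 16])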