"git@developer.sourcefind.cn:chenpangpang/transformers.git" did not exist on "bf64b8cf095a23303315e1347d8eac0bce9d73be"
Unverified Commit d14e0af2 authored by Stas Bekman, committed by GitHub

sync LayerDrop for Wav2Vec2Encoder + tests (#12076)

parent 82a2b76c
@@ -24,6 +24,7 @@ sys.path.insert(1, str(git_repo_path))
 import dataclasses  # noqa
 import io  # noqa
+import itertools  # noqa
 import json  # noqa
 import os  # noqa
 import unittest  # noqa
@@ -50,48 +51,62 @@ from transformers.trainer_utils import set_seed  # noqa
 set_seed(42)
 
-WAV2VEC2_TINY = "patrickvonplaten/wav2vec2_tiny_random_robust"
+models = dict(base="patrickvonplaten/wav2vec2_tiny_random", robust="patrickvonplaten/wav2vec2_tiny_random_robust")
 ZERO2 = "zero2"
 ZERO3 = "zero3"
 stages = [ZERO2, ZERO3]
 
 
+def custom_name_func(func, param_num, param):
+    # customize the test name generator function as we want both params to appear in the sub-test
+    # name, as by default it shows only the first param
+    param_based_name = parameterized.to_safe_name("_".join(str(x) for x in param.args))
+    return f"{func.__name__}_{param_based_name}"
+
+
+# Cartesian-product of zero stages with models to test
+params = list(itertools.product(stages, models.keys()))
+
+
 @slow
 @require_deepspeed
 @require_torch_gpu
 class TestDeepSpeedWav2Vec2(TestCasePlus):
-    @parameterized.expand(stages)
-    def test_fp32_non_distributed(self, stage):
+    @parameterized.expand(params, name_func=custom_name_func)
+    def test_fp32_non_distributed(self, stage, model):
         self.run_and_check(
             stage=stage,
+            model=model,
             distributed=False,
             fp16=False,
         )
 
     @require_torch_multi_gpu
-    @parameterized.expand(stages)
-    def test_fp32_distributed(self, stage):
+    @parameterized.expand(params, name_func=custom_name_func)
+    def test_fp32_distributed(self, stage, model):
         self.run_and_check(
             stage=stage,
+            model=model,
             distributed=True,
             fp16=False,
         )
 
-    @parameterized.expand(stages)
-    def test_fp16_non_distributed(self, stage):
+    @parameterized.expand(params, name_func=custom_name_func)
+    def test_fp16_non_distributed(self, stage, model):
         self.run_and_check(
             stage=stage,
+            model=model,
             distributed=False,
             fp16=True,
         )
 
     @require_torch_multi_gpu
-    @parameterized.expand(stages)
-    def test_fp16_distributed(self, stage):
+    @parameterized.expand(params, name_func=custom_name_func)
+    def test_fp16_distributed(self, stage, model):
         self.run_and_check(
             stage=stage,
+            model=model,
             distributed=True,
             fp16=True,
         )
@@ -104,14 +119,16 @@ class TestDeepSpeedWav2Vec2(TestCasePlus):
     # XXX: need to do better validation beyond just that the run was successful
     def run_and_check(
         self,
-        stage,
-        model_name: str = WAV2VEC2_TINY,
+        stage: str,
+        model: str,
         eval_steps: int = 10,
         distributed: bool = True,
         quality_checks: bool = True,
         fp16: bool = True,
     ):
+        model_name = models[model]
+
         output_dir = self.run_trainer(
             stage=stage,
             model_name=model_name,
...
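
For reference, the test changes above replace the single hard-coded checkpoint with a (ZeRO stage, model flavor) matrix: itertools.product builds the parameter list and custom_name_func makes both values show up in the generated sub-test names. The snippet below is a minimal, standalone sketch of that pattern and is not part of the commit; the TestDemo class, test_combo method, and placeholder checkpoint names are invented for illustration, and only the parameterized.expand(..., name_func=...) / to_safe_name usage mirrors the diff.

# minimal sketch of the parameterized sub-test naming used above (hypothetical TestDemo class)
import itertools
import unittest

from parameterized import parameterized

stages = ["zero2", "zero3"]
models = dict(base="tiny-base-ckpt", robust="tiny-robust-ckpt")  # placeholder names

# one sub-test per (stage, model) pair:
# [("zero2", "base"), ("zero2", "robust"), ("zero3", "base"), ("zero3", "robust")]
params = list(itertools.product(stages, models.keys()))


def custom_name_func(func, param_num, param):
    # the default naming shows only the first param; include all of them in the test name
    param_based_name = parameterized.to_safe_name("_".join(str(x) for x in param.args))
    return f"{func.__name__}_{param_based_name}"


class TestDemo(unittest.TestCase):
    @parameterized.expand(params, name_func=custom_name_func)
    def test_combo(self, stage, model):
        # generated names: test_combo_zero2_base, test_combo_zero2_robust, test_combo_zero3_base, ...
        self.assertIn(stage, stages)
        self.assertIn(model, models)


if __name__ == "__main__":
    unittest.main(verbosity=2)
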
@@ -548,15 +548,18 @@ class Wav2Vec2Encoder(nn.Module):
         hidden_states = self.layer_norm(hidden_states)
         hidden_states = self.dropout(hidden_states)
 
+        deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
+
         for layer in self.layers:
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
 
             # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
             dropout_probability = np.random.uniform(0, 1)
-            if self.training and (dropout_probability < self.config.layerdrop):  # skip the layer
-                layer_outputs = (None, None)
-            else:
+
+            skip_the_layer = True if self.training and (dropout_probability < self.config.layerdrop) else False
+            if not skip_the_layer or deepspeed_zero3_is_enabled:
+                # under deepspeed zero3 all gpus must run in sync
                 if getattr(self.config, "gradient_checkpointing", False) and self.training:
                     # create gradient checkpointing function
                     def create_custom_forward(module):
@@ -576,6 +579,9 @@ class Wav2Vec2Encoder(nn.Module):
                     )
                 hidden_states = layer_outputs[0]
 
+            if skip_the_layer:
+                layer_outputs = (None, None)
+
             if output_attentions:
                 all_self_attentions = all_self_attentions + (layer_outputs[1],)
...
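
The model-side change is the actual sync fix: under DeepSpeed ZeRO-3 each layer's parameters are sharded across GPUs and gathered collectively right before the layer runs, so if one rank skips a layer via LayerDrop while another does not, those collective gathers can hang. The guard "if not skip_the_layer or deepspeed_zero3_is_enabled:" therefore runs the layer on every rank, and only the attention bookkeeping ("layer_outputs = (None, None)") still treats it as dropped; as the unchanged "hidden_states = layer_outputs[0]" line shows, the "skipped" layer's output is still used in that mode, trading LayerDrop's regularization for keeping the ranks in sync. The toy module below is a single-process sketch of that control flow, not the transformers implementation; ToyEncoder is hypothetical and zero3_enabled merely stands in for is_deepspeed_zero3_enabled().

# toy sketch of LayerDrop with the ZeRO-3 sync guard (hypothetical ToyEncoder, single process)
import numpy as np
import torch
from torch import nn


class ToyEncoder(nn.Module):
    def __init__(self, num_layers=4, dim=8, layerdrop=0.5, zero3_enabled=False):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(dim, dim) for _ in range(num_layers)])
        self.layerdrop = layerdrop
        self.zero3_enabled = zero3_enabled  # stand-in for is_deepspeed_zero3_enabled()

    def forward(self, hidden_states):
        for layer in self.layers:
            # LayerDrop (https://arxiv.org/abs/1909.11556): randomly skip layers while training
            dropout_probability = np.random.uniform(0, 1)
            skip_the_layer = self.training and dropout_probability < self.layerdrop
            if not skip_the_layer or self.zero3_enabled:
                # with the guard, every rank executes every layer when ZeRO-3 is on,
                # so the collective parameter gathers never get out of step
                hidden_states = layer(hidden_states)
        return hidden_states


encoder = ToyEncoder(zero3_enabled=True).train()
print(encoder(torch.randn(2, 8)).shape)  # torch.Size([2, 8])
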