"...src/http/git@developer.sourcefind.cn:OpenDAS/dynamo.git" did not exist on "4e6f3fef9aa485f0607693bdaea6c7aa63409f5e"
Unverified commit e830ef91 authored by flybird11111, committed by GitHub
Browse files

[ci] fix shardformer tests. (#5255)



* fix ci

fix

* revert: revert p2p

* feat: add enable_metadata_cache option

* revert: enable t5 tests

---------
Co-authored-by: Wenhao Chen <cwher@outlook.com>
parent 756c400a
...@@ -919,6 +919,7 @@ class HybridParallelPlugin(PipelinePluginBase): ...@@ -919,6 +919,7 @@ class HybridParallelPlugin(PipelinePluginBase):
custom_policy (Policy, optional): Custom policy for Shardformer. Defaults to None. custom_policy (Policy, optional): Custom policy for Shardformer. Defaults to None.
pp_style (str, optional): The style for pipeline parallelism. Defaults to '1f1b'. pp_style (str, optional): The style for pipeline parallelism. Defaults to '1f1b'.
num_model_chunks (int, optional): The number of model chunks for interleaved pipeline parallelism. Defaults to 1. num_model_chunks (int, optional): The number of model chunks for interleaved pipeline parallelism. Defaults to 1.
enable_metadata_cache (bool, optional): Whether to enable metadata cache for pipeline parallelism. Defaults to True.
""" """
def __init__( def __init__(
...@@ -956,6 +957,7 @@ class HybridParallelPlugin(PipelinePluginBase): ...@@ -956,6 +957,7 @@ class HybridParallelPlugin(PipelinePluginBase):
custom_policy: Policy = None, custom_policy: Policy = None,
pp_style: str = "1f1b", pp_style: str = "1f1b",
num_model_chunks: int = 1, num_model_chunks: int = 1,
enable_metadata_cache: bool = True,
) -> None: ) -> None:
super().__init__() super().__init__()
assert ( assert (
...@@ -1002,10 +1004,14 @@ class HybridParallelPlugin(PipelinePluginBase): ...@@ -1002,10 +1004,14 @@ class HybridParallelPlugin(PipelinePluginBase):
num_model_chunks=num_model_chunks, num_model_chunks=num_model_chunks,
num_microbatch=num_microbatches, num_microbatch=num_microbatches,
microbatch_size=microbatch_size, microbatch_size=microbatch_size,
enable_metadata_cache=enable_metadata_cache,
) )
elif pp_style == "1f1b": elif pp_style == "1f1b":
self.schedule = OneForwardOneBackwardSchedule( self.schedule = OneForwardOneBackwardSchedule(
self.stage_manager, num_microbatches=num_microbatches, microbatch_size=microbatch_size stage_manager=self.stage_manager,
num_microbatches=num_microbatches,
microbatch_size=microbatch_size,
enable_metadata_cache=enable_metadata_cache,
) )
else: else:
raise NotImplementedError() raise NotImplementedError()
......
...@@ -165,7 +165,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, ...@@ -165,7 +165,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
) )
@clear_cache_before_run() @clear_cache_before_run()
def run_gpt2_test(test_config): def run_gpt2_test(test_config):
sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt") sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt", exclude="transformers_gptj")
for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items(): for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config) check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
...@@ -200,7 +200,7 @@ def run_gpt2_test(test_config): ...@@ -200,7 +200,7 @@ def run_gpt2_test(test_config):
) )
@clear_cache_before_run() @clear_cache_before_run()
def run_gpt2_3d_test(test_config): def run_gpt2_3d_test(test_config):
sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt") sub_model_zoo = model_zoo.get_sub_registry("transformers_gpt", exclude="transformers_gptj")
for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items(): for name, (model_fn, data_gen_fn, output_transform_fn, loss_fn, _) in sub_model_zoo.items():
check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config) check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, test_config)
......
...@@ -86,6 +86,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, ...@@ -86,6 +86,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 2, "tp_size": 2,
"pp_size": 2, "pp_size": 2,
"num_microbatches": 2, "num_microbatches": 2,
"enable_metadata_cache": False,
"enable_all_optimization": True, "enable_all_optimization": True,
"use_lazy_init": True, "use_lazy_init": True,
"precision": "fp16", "precision": "fp16",
...@@ -95,6 +96,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, ...@@ -95,6 +96,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1, "tp_size": 1,
"pp_size": 2, "pp_size": 2,
"num_microbatches": 4, "num_microbatches": 4,
"enable_metadata_cache": False,
"use_lazy_init": False, "use_lazy_init": False,
"precision": "fp16", "precision": "fp16",
"initial_scale": 1, "initial_scale": 1,
...@@ -110,6 +112,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, ...@@ -110,6 +112,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1, "tp_size": 1,
"pp_size": 4, "pp_size": 4,
"num_microbatches": 4, "num_microbatches": 4,
"enable_metadata_cache": False,
"enable_all_optimization": False, "enable_all_optimization": False,
"use_lazy_init": False, "use_lazy_init": False,
"precision": "fp32", "precision": "fp32",
...@@ -128,6 +131,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, ...@@ -128,6 +131,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1, "tp_size": 1,
"pp_size": 2, "pp_size": 2,
"num_microbatches": 2, "num_microbatches": 2,
"enable_metadata_cache": False,
"enable_all_optimization": True, "enable_all_optimization": True,
"use_lazy_init": True, "use_lazy_init": True,
"zero_stage": 1, "zero_stage": 1,
...@@ -159,6 +163,7 @@ def run_t5_test(test_config): ...@@ -159,6 +163,7 @@ def run_t5_test(test_config):
"tp_size": 2, "tp_size": 2,
"pp_size": 2, "pp_size": 2,
"num_microbatches": 4, "num_microbatches": 4,
"enable_metadata_cache": False,
"enable_all_optimization": False, "enable_all_optimization": False,
"use_lazy_init": False, "use_lazy_init": False,
"precision": "fp32", "precision": "fp32",
...@@ -168,6 +173,7 @@ def run_t5_test(test_config): ...@@ -168,6 +173,7 @@ def run_t5_test(test_config):
"tp_size": 2, "tp_size": 2,
"pp_size": 2, "pp_size": 2,
"num_microbatches": 4, "num_microbatches": 4,
"enable_metadata_cache": False,
"enable_all_optimization": False, "enable_all_optimization": False,
"use_lazy_init": False, "use_lazy_init": False,
"precision": "fp16", "precision": "fp16",
......
...@@ -114,6 +114,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, ...@@ -114,6 +114,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 2, "tp_size": 2,
"pp_size": 2, "pp_size": 2,
"num_microbatches": 2, "num_microbatches": 2,
"enable_metadata_cache": False,
"enable_all_optimization": True, "enable_all_optimization": True,
"use_lazy_init": True, "use_lazy_init": True,
"precision": "fp32", "precision": "fp32",
...@@ -123,6 +124,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, ...@@ -123,6 +124,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1, "tp_size": 1,
"pp_size": 2, "pp_size": 2,
"num_microbatches": 4, "num_microbatches": 4,
"enable_metadata_cache": False,
"use_lazy_init": False, "use_lazy_init": False,
"precision": "fp32", "precision": "fp32",
"initial_scale": 1, "initial_scale": 1,
...@@ -138,6 +140,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn, ...@@ -138,6 +140,7 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
"tp_size": 1, "tp_size": 1,
"pp_size": 4, "pp_size": 4,
"num_microbatches": 4, "num_microbatches": 4,
"enable_metadata_cache": False,
"use_lazy_init": False, "use_lazy_init": False,
"precision": "fp32", "precision": "fp32",
}, },
...@@ -163,6 +166,7 @@ def run_whisper_test(test_config): ...@@ -163,6 +166,7 @@ def run_whisper_test(test_config):
"tp_size": 2, "tp_size": 2,
"pp_size": 2, "pp_size": 2,
"num_microbatches": 4, "num_microbatches": 4,
"enable_metadata_cache": False,
"enable_all_optimization": False, "enable_all_optimization": False,
"use_lazy_init": False, "use_lazy_init": False,
"precision": "fp32", "precision": "fp32",
...@@ -172,6 +176,7 @@ def run_whisper_test(test_config): ...@@ -172,6 +176,7 @@ def run_whisper_test(test_config):
"tp_size": 2, "tp_size": 2,
"pp_size": 2, "pp_size": 2,
"num_microbatches": 2, "num_microbatches": 2,
"enable_metadata_cache": False,
"enable_all_optimization": False, "enable_all_optimization": False,
"use_lazy_init": False, "use_lazy_init": False,
"precision": "fp32", "precision": "fp32",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment