Unverified Commit 5dd0c956 authored by Stas Bekman, committed by GitHub

non-native optimizers are mostly ok with zero-offload (#12690)

parent 4cdb7ee5
@@ -1061,7 +1061,8 @@ optimizers, with the exception of using the combination of HuggingFace scheduler
 | DS Optimizer | No           | Yes          |
 +--------------+--------------+--------------+
-If ``offload_optimizer`` is enabled you must use both DeepSpeed scheduler and DeepSpeed optimizer.
+It is possible to use a non-DeepSpeed optimizer when ``offload_optimizer`` is enabled, as long as it has both CPU and
+GPU implementation (except LAMB).
...
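To make the documentation change concrete, here is a minimal sketch (not part of this commit) of a ZeRO stage-2 config that enables optimizer CPU offload while omitting the `optimizer` and `scheduler` blocks, so the HF Trainer supplies its default AdamW, which has both CPU and GPU implementations. The hyperparameters, the `"auto"` placeholders and the output directory are illustrative only.

```python
# Illustrative only (not from this commit): ZeRO-2 config with optimizer CPU
# offload and no DeepSpeed "optimizer"/"scheduler" sections, so the HF Trainer
# falls back to its own AdamW optimizer and scheduler.
ds_config = {
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 2,
        "offload_optimizer": {"device": "cpu"},
    },
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
}

from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="output_dir",  # placeholder
    fp16=True,
    deepspeed=ds_config,  # a dict is accepted here (the updated test below passes one too), as is a path to a json file
)
# Trainer(model=model, args=training_args, train_dataset=train_dataset).train()
```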
@@ -315,9 +315,11 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None):
     #
     # Unless Offload is enabled in which case it's:
     # 1. DS scheduler + DS optimizer: Yes
-    # 2. HF scheduler + HF optimizer: No
-    # 3. DS scheduler + HF optimizer: No
+    # 2. HF scheduler + HF optimizer: Mostly*
+    # 3. DS scheduler + HF optimizer: Mostly*
     # 4. HF scheduler + DS optimizer: No
+    #
+    # Mostly*: All non-native DeepSpeed optimizers that have both CPU and GPU implementation should work (except LAMB)
     optimizer = None
     if "optimizer" in config:
@@ -328,7 +330,9 @@ def deepspeed_init(trainer, num_training_steps, resume_from_checkpoint=None):
             )
     else:
         if hf_deepspeed_config.is_offload():
-            raise ValueError("ZeRO Offload can only work with DeepSpeed optimizers")
+            logger.info(
+                "Detected ZeRO Offload and non-DeepSpeed optimizers: This combination should work as long as the custom optimizer has both CPU and GPU implementation (except LAMB)"
+            )
         # ds supports Adam, OneBitAdam, and Lamb optimizers and can import other optimizers from torch.
         # But trainer uses AdamW by default.
...
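The gate this hunk relaxes can be summarized in a standalone sketch. This is a hedged illustration, not the actual `deepspeed_init` code: the function name `pick_optimizer` and its arguments are invented for the example; the point is that the offload + non-DeepSpeed-optimizer combination now only produces an info log instead of a `ValueError`.

```python
import logging

logger = logging.getLogger(__name__)


def pick_optimizer(ds_config: dict, is_offload: bool, create_hf_optimizer):
    """Simplified, hypothetical sketch of the relaxed check in deepspeed_init."""
    if "optimizer" in ds_config:
        # DeepSpeed will build its own optimizer from the config section.
        return None
    if is_offload:
        # Previously this branch raised ValueError; now it only informs the user.
        logger.info(
            "Detected ZeRO Offload and non-DeepSpeed optimizers: This combination "
            "should work as long as the custom optimizer has both CPU and GPU "
            "implementation (except LAMB)"
        )
    # Fall back to the optimizer the HF Trainer creates (AdamW by default).
    return create_hf_optimizer()
```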
@@ -325,20 +325,16 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
     @parameterized.expand(stages)
     def test_hf_optimizer_with_offload(self, stage):
-        # must not allow non-DS optimizer when using ZERO-offload
+        # non-DS optimizers can be used with ZERO-offload (as long as they have both CPU and GPU implementation (except LAMB))
         ds_config_dict = self.get_config_dict(stage)
         del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
         # force cpu offload
         ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
         with mockenv_context(**self.dist_env_1_gpu):
             trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_dict)
-            with self.assertRaises(Exception) as context:
+            with CaptureLogger(deepspeed_logger) as cl:
                 trainer.train()
-            self.assertIn(
-                "ZeRO Offload can only work with DeepSpeed optimizers",
-                str(context.exception),
-                f"got exception: {context.exception}",
-            )
+            self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")

     @parameterized.expand(stages)
     def test_fake_notebook_no_launcher(self, stage):
...
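A practical follow-on for users who want to supply their own non-DeepSpeed optimizer rather than the Trainer default: the Trainer rejects the `optimizers=(optimizer, scheduler)` argument when DeepSpeed is enabled, so subclassing and overriding optimizer creation is the usual route. The snippet below is a hypothetical sketch, not part of the patch; `AdamTrainer` is an invented name, `create_optimizer` is assumed as the override hook, and `torch.optim.Adam` is chosen only because it has both CPU and GPU implementations (unlike LAMB).

```python
import torch
from transformers import Trainer


class AdamTrainer(Trainer):
    # Hypothetical subclass: provide a non-DeepSpeed optimizer that has both
    # CPU and GPU implementations, keeping it compatible with ZeRO offload.
    def create_optimizer(self):
        if self.optimizer is None:
            self.optimizer = torch.optim.Adam(
                self.model.parameters(), lr=self.args.learning_rate
            )
        return self.optimizer
```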