Unverified commit 580dd87c authored by Stas Bekman, committed by GitHub

[Deepspeed] add support for bf16 mode (#14569)



* [WIP] add support for bf16 mode

* prep for bf16

* prep for bf16

* fix; zero2/bf16 is ok

* check bf16 is available

* test fixes

* enable zero3_bf16

* config files

* docs

* split stage_dtype; merge back to non-dtype-specific config file

* fix doc

* cleanup

* cleanup

* bfloat16 => bf16 to match the PR changes

* s/zero_gather_fp16_weights_on_model_save/zero_gather_16bit_weights_on_model_save/; s/save_fp16_model/save_16bit_model/

* test fixes/skipping

* move

* fix

* Update docs/source/main_classes/deepspeed.mdx
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>

* backticks

* cleanup

* cleanup

* cleanup

* new version

* add note about grad accum in bf16
Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
parent c1f209da
@@ -367,7 +367,7 @@ cat <<'EOT' > ds_config_zero3.json
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
-       "stage3_gather_fp16_weights_on_model_save": true
+       "stage3_gather_16bit_weights_on_model_save": true
    },
    "gradient_accumulation_steps": "auto",
@@ -652,7 +652,7 @@ The following is an example of configuration for ZeRO stage 3:
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
-       "stage3_gather_fp16_weights_on_model_save": true
+       "stage3_gather_16bit_weights_on_model_save": true
    }
}
```
@@ -691,7 +691,7 @@ The following configuration values depend on the model's hidden size:
therefore set these values to `auto` and the [`Trainer`] will automatically assign the recommended
values. But, of course, feel free to set these explicitly as well.
-`stage3_gather_fp16_weights_on_model_save` enables model fp16 weights consolidation when model gets saved. With large
+`stage3_gather_16bit_weights_on_model_save` enables model fp16 weights consolidation when model gets saved. With large
models and multiple GPUs this is an expensive operation both in terms of memory and speed. It's currently required if
you plan to resume the training. Watch out for future updates that will remove this limitation and make things more
flexible.
@@ -760,8 +760,8 @@ The following configuration example enables NVMe to offload both optimizer state
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
-       "stage3_gather_fp16_weights_on_model_save": true
-   }
+       "stage3_gather_16bit_weights_on_model_save": true
+   },
}
```
@@ -966,7 +966,7 @@ Here is a full ZeRO-3 auto-configuration file `ds_config_zero3.json`:
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
-       "stage3_gather_fp16_weights_on_model_save": true
+       "stage3_gather_16bit_weights_on_model_save": true
    },
    "gradient_accumulation_steps": "auto",
@@ -1029,7 +1029,7 @@ values look like, but we highly recommend using the one with multiple `auto` set
        "stage3_param_persistence_threshold": 1e4,
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
-       "stage3_gather_fp16_weights_on_model_save": true
+       "stage3_gather_16bit_weights_on_model_save": true
    },
    "steps_per_print": 2000,
@@ -1232,6 +1232,7 @@ the much more efficient tf32 format for some operations, but the results will st
benchmarks, please, see [TensorFloat-32(TF32) on Ampere devices](https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices). The document includes
instructions on how to disable this automatic conversion if for some reason you prefer not to use it.
+With the 🤗 Trainer you can use `--tf32` to enable it, or disable it with `--tf32 0` or `--no_tf32`. By default the PyTorch default is used.
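
As an aside, a minimal sketch of the PyTorch-level switches involved; treat the exact wiring of the `--tf32` flag to these knobs as an assumption, but the flags themselves are PyTorch's documented TF32 controls:

```python
import torch

# On Ampere GPUs these two flags decide whether matmuls and cuDNN convolutions
# may use TF32 internally (results still come back as fp32 tensors).
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# to force full fp32 math instead:
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
```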
@@ -1241,7 +1242,9 @@ instructions on how to disable this automatic conversion if for some reason you
You can use automatic mixed precision with either a pytorch-like AMP way or the apex-like way:
-To configure pytorch AMP-like mode set:
+### fp16
+
+To configure pytorch AMP-like mode with fp16 (float16) set:
```json
{
@@ -1259,7 +1262,7 @@ To configure pytorch AMP-like mode set:
and the [`Trainer`] will automatically enable or disable it based on the value of
`args.fp16_backend`. The rest of config values are up to you.
-This mode gets enabled when `--fp16 --fp16_backend amp` command line args are passed.
+This mode gets enabled when `--fp16 --fp16_backend amp` or `--fp16_full_eval` command line args are passed.
You can also enable/disable this mode explicitly:
@@ -1281,6 +1284,43 @@ configuration.
Here is the [documentation](https://www.deepspeed.ai/docs/config-json/#fp16-training-options).
+### bf16
+
+If bf16 (bfloat16) is desired instead of fp16 then the following configuration section is to be used:
+
+```json
+{
+    "bf16": {
+        "enabled": "auto"
+    }
+}
+```
+
+bf16 has the same dynamic range as fp32 and thus doesn't require loss scaling.
+
+This mode gets enabled when `--bf16` or `--bf16_full_eval` command line args are passed.
+
+You can also enable/disable this mode explicitly:
+
+```json
+{
+    "bf16": {
+        "enabled": true
+    }
+}
+```
+
+<Tip>
+
+As of `deepspeed==0.6.0` the bf16 support is new and experimental.
+
+If you use [gradient accumulation](#gradient-accumulation) with bf16-enabled, you need to be aware that it'll accumulate gradients in bf16, which may not be what you want due to this format's low precision, as it may lead to a lossy accumulation.
+
+</Tip>
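
As an illustration (not part of the diff), one way to pick the 16-bit mode programmatically; the output dir and config path are placeholders, and `is_torch_bf16_available` is the same helper the new tests import:

```python
from transformers import TrainingArguments
from transformers.file_utils import is_torch_bf16_available

# prefer bf16 when the hardware/PyTorch build supports it, otherwise fall back to fp16;
# with "bf16": {"enabled": "auto"} and "fp16": {"enabled": "auto"} in the DeepSpeed
# config, the Trainer propagates whichever of these flags is set
use_bf16 = is_torch_bf16_available()
args = TrainingArguments(
    output_dir="output_dir",            # placeholder
    deepspeed="ds_config_zero3.json",   # placeholder config path
    bf16=use_bf16,
    fp16=not use_bf16,
)
```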
+### apex
To configure apex AMP-like mode set:
```json
@@ -1411,15 +1451,14 @@ When a model is saved under ZeRO-2, you end up having the normal `pytorch_model.
they are only the fp16 version of the weights.
Under ZeRO-3, things are much more complicated, since the model weights are partitioned out over multiple GPUs,
-therefore `"stage3_gather_fp16_weights_on_model_save": true` is required to get the `Trainer` to save the fp16
-version of the weights. If this setting is `False` `pytorch_model.bin` won't be created. This is because by default DeepSpeed's `state_dict` contains a placeholder and not the real weights. If we were to save this `state_dict` it
-won't be possible to load it back.
+therefore `"stage3_gather_16bit_weights_on_model_save": true` is required to get the `Trainer` to save the fp16
+version of the weights. If this setting is `False` `pytorch_model.bin` won't be created. This is because by default DeepSpeed's `state_dict` contains a placeholder and not the real weights. If we were to save this `state_dict` it won't be possible to load it back.
```json
{
    "zero_optimization": {
-       "stage3_gather_fp16_weights_on_model_save": true
+       "stage3_gather_16bit_weights_on_model_save": true
    }
}
```
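
For completeness, a hedged sketch of recovering full weights when the 16-bit gather was disabled; the model name and checkpoint path are illustrative, and `load_state_dict_from_zero_checkpoint` lives in the same DeepSpeed module as the `zero_to_fp32.py` script that gets bundled with each checkpoint:

```python
from transformers import AutoModelForSeq2SeqLM
from deepspeed.utils.zero_to_fp32 import load_state_dict_from_zero_checkpoint

# illustrative names: the checkpoint dir is whatever the Trainer saved,
# e.g. output_dir/checkpoint-100
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
checkpoint_dir = "output_dir/checkpoint-100"

# consolidates the sharded ZeRO partitions into a full fp32 state_dict and loads
# it into the model in place; running the saved zero_to_fp32.py script offline
# achieves the same result
model = load_state_dict_from_zero_checkpoint(model, checkpoint_dir)
```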
......
@@ -45,7 +45,7 @@
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
-       "stage3_gather_fp16_weights_on_model_save": true
+       "stage3_gather_16bit_weights_on_model_save": true
    },
    "gradient_accumulation_steps": "auto",
......
@@ -98,7 +98,7 @@ _deps = [
    "cookiecutter==1.7.2",
    "dataclasses",
    "datasets",
-   "deepspeed>=0.5.9",
+   "deepspeed>=0.6.0",
    "fairscale>0.3",
    "faiss-cpu",
    "fastapi",
......
@@ -73,7 +73,7 @@ class HfDeepSpeedConfig:
        # zero stage - this is done as early as possible, before model is created, to allow
        # ``is_deepspeed_zero3_enabled`` query and getting to the early deepspeed config object
-       # during ``zero.Init()`` which needs whether fp16 is enabled, dtype, etc.
+       # during ``zero.Init()`` which needs to know the dtype, and some other hparams.
        self._stage = self.get_value("zero_optimization.stage", -1)

        # offload
@@ -169,10 +169,12 @@ class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
    def __init__(self, config_file_or_dict):
        super().__init__(config_file_or_dict)
-       self._dtype = torch.float16
+       self._dtype = None
        self.mismatches = []

    def dtype(self):
+       if self._dtype is None:
+           raise ValueError("trainer_config_process() wasn't called yet to tell dtype")
        return self._dtype

    def fill_match(self, ds_key_long, hf_val, hf_key=None, must_match=True):
@@ -228,26 +230,33 @@ class HfTrainerDeepSpeedConfig(HfDeepSpeedConfig):
        # total_num_steps - will get set in trainer_config_finalize

        # fp16
-       if args.fp16:
+       if args.fp16 or args.fp16_full_eval:
            fp16_backend = "apex" if args.fp16_backend == "apex" else "amp"
        else:
            fp16_backend = None

        # amp: similar to the pytorch native amp - it has a bunch of optional params but we won't set
        # any here unless the user did the work
-       self.fill_match("fp16.enabled", fp16_backend == "amp", "fp16+fp16_backend(amp)")
+       self.fill_match(
+           "fp16.enabled",
+           ((args.fp16 or args.fp16_full_eval) and fp16_backend == "amp"),
+           "fp16|fp16_full_eval+fp16_backend(amp)",
+       )

        # apex: delegates amp work to apex (which needs to be available), but it cannot be used with any
        # ZeRO features
        self.fill_match("amp.enabled", fp16_backend == "apex", "fp16+fp16_backend(apex)")
        self.fill_match("amp.opt_level", args.fp16_opt_level, "fp16_opt_level")

-       # only if we have an explicit fp16.enabled = False then it's fp32, if it's True or this
-       # whole config section is missing then the fallback is fp16
-       if self.is_false("fp16.enabled"):
+       self.fill_match("bf16.enabled", (args.bf16 or args.bf16_full_eval), "bf16|bf16_full_eval")
+
+       # deepspeed's default mode is fp16 unless there is a config that says differently
+       if self.is_true("bf16.enabled"):
+           self._dtype = torch.bfloat16
+       elif self.is_false("fp16.enabled"):
            self._dtype = torch.float32
-       # later there will be other dtypes besides just fp16 and fp32
-       # also not quite sure what dtype should be under apex, defaulting to fp16 for now
+       else:
+           self._dtype = torch.float16

    def trainer_config_finalize(self, args, model, num_training_steps):
        """
......
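A small sketch of why this config object matters so early (the config values are illustrative): constructing it before the model exists is what lets `is_deepspeed_zero3_enabled()` and `zero.Init()` see the stage and dtype, assuming deepspeed is installed.

```python
from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_zero3_enabled

# a minimal hand-written config dict (illustrative values); a dict or a path to a
# json config file both work
ds_config = {
    "zero_optimization": {"stage": 3},
    "bf16": {"enabled": True},
    "train_micro_batch_size_per_gpu": 1,
}

# must be created - and kept alive - before the model is instantiated, so that
# from_pretrained()/zero.Init() can pick up the stage and dtype
dschf = HfDeepSpeedConfig(ds_config)
assert is_deepspeed_zero3_enabled()
```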
@@ -8,7 +8,7 @@ deps = {
    "cookiecutter": "cookiecutter==1.7.2",
    "dataclasses": "dataclasses",
    "datasets": "datasets",
-   "deepspeed": "deepspeed>=0.5.9",
+   "deepspeed": "deepspeed>=0.6.0",
    "fairscale": "fairscale>0.3",
    "faiss-cpu": "faiss-cpu",
    "fastapi": "fastapi",
......
@@ -1687,7 +1687,7 @@ class Trainer:
            self.save_model(output_dir, _internal_call=True)
            if self.deepspeed:
                # under zero3 model file itself doesn't get saved since it's bogus! Unless deepspeed
-               # config `stage3_gather_fp16_weights_on_model_save` is True
+               # config `stage3_gather_16bit_weights_on_model_save` is True
                self.deepspeed.save_checkpoint(output_dir)

        # Save optimizer and scheduler
@@ -2101,12 +2101,12 @@ class Trainer:
                    # logger.info(f"deepspeed zero3: removing {file}, see zero_to_fp32.py to recover weights")
                    os.remove(file)

-           # now save the real model if stage3_gather_fp16_weights_on_model_save=True
+           # now save the real model if stage3_gather_16bit_weights_on_model_save=True
            # if false it will not be saved.
            # This must be called on all ranks
-           if not self.deepspeed.save_fp16_model(output_dir, WEIGHTS_NAME):
+           if not self.deepspeed.save_16bit_model(output_dir, WEIGHTS_NAME):
                logger.warning(
-                   "deepspeed.save_fp16_model didn't save the model, since stage3_gather_fp16_weights_on_model_save=false. "
+                   "deepspeed.save_16bit_model didn't save the model, since stage3_gather_16bit_weights_on_model_save=false. "
                    "Saving the full checkpoint instead, use zero_to_fp32.py to recover weights"
                )
                self.deepspeed.save_checkpoint(output_dir)
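
For context, a hedged sketch of the renamed engine call used outside the Trainer; `engine` stands for whatever `deepspeed.initialize()` returned and the paths are placeholders, mirroring the Trainer code above:

```python
def save_model_16bit_or_full(engine, output_dir, weights_name="pytorch_model.bin"):
    # save_16bit_model() returns False when stage3_gather_16bit_weights_on_model_save
    # is disabled, in which case only a full (sharded) checkpoint can be written
    if not engine.save_16bit_model(output_dir, weights_name):
        engine.save_checkpoint(output_dir)
```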
......
@@ -8,6 +8,10 @@
        "min_loss_scale": 1
    },

+   "bf16": {
+       "enabled": "auto"
+   },
+
    "optimizer": {
        "type": "AdamW",
        "params": {
......
@@ -8,6 +8,10 @@
        "min_loss_scale": 1
    },

+   "bf16": {
+       "enabled": "auto"
+   },
+
    "optimizer": {
        "type": "AdamW",
        "params": {
@@ -45,7 +49,7 @@
        "stage3_param_persistence_threshold": "auto",
        "stage3_max_live_parameters": 1e9,
        "stage3_max_reuse_distance": 1e9,
-       "stage3_gather_fp16_weights_on_model_save": true
+       "stage3_gather_16bit_weights_on_model_save": true
    },
    "gradient_accumulation_steps": "auto",
......
@@ -14,6 +14,7 @@
import dataclasses
import io
+import itertools
import json
import os
import unittest
@@ -23,7 +24,7 @@ from parameterized import parameterized
from tests.trainer.test_trainer import TrainerIntegrationCommon  # noqa
from transformers import AutoModel, TrainingArguments, is_torch_available, logging
from transformers.deepspeed import HfDeepSpeedConfig, is_deepspeed_available
-from transformers.file_utils import WEIGHTS_NAME
+from transformers.file_utils import WEIGHTS_NAME, is_torch_bf16_available
from transformers.testing_utils import (
    CaptureLogger,
    CaptureStd,
@@ -120,7 +121,26 @@ def get_launcher(distributed=False):
ZERO2 = "zero2"
ZERO3 = "zero3"

+FP16 = "fp16"
+BF16 = "bf16"
+
stages = [ZERO2, ZERO3]
+if is_torch_bf16_available():
+    dtypes = [FP16, BF16]
+else:
+    dtypes = [FP16]
+
+
+def parameterized_custom_name_func(func, param_num, param):
+    # customize the test name generator function as we want both params to appear in the sub-test
+    # name, as by default it shows only the first param
+    param_based_name = parameterized.to_safe_name("_".join(str(x) for x in param.args))
+    return f"{func.__name__}_{param_based_name}"
+
+
+# Cartesian-product of zero stages with dtypes to test
+params = list(itertools.product(stages, dtypes))


@require_deepspeed
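
A quick standalone illustration (values are the same constants as above) of what the stage x dtype expansion produces and why the custom name function helps:

```python
import itertools

stages = ["zero2", "zero3"]
dtypes = ["fp16", "bf16"]
params = list(itertools.product(stages, dtypes))
print(params)
# [('zero2', 'fp16'), ('zero2', 'bf16'), ('zero3', 'fp16'), ('zero3', 'bf16')]

# with the custom name_func each tuple ends up in the sub-test name,
# e.g. test_save_checkpoints_zero3_bf16 instead of an index-based name
```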
@@ -138,8 +158,8 @@ class CoreIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
            MASTER_ADDR="localhost", MASTER_PORT=master_port, RANK="0", LOCAL_RANK="0", WORLD_SIZE="1"
        )

-   def test_init_zero3(self):
-       # test that zero.Init() works correctly under zero3
+   def test_init_zero3_fp16(self):
+       # test that zero.Init() works correctly under zero3/fp16
        ds_config = {
            "train_batch_size": 1,
            "zero_optimization": {
@@ -216,15 +236,12 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
        # use self.get_config_dict(stage) to use these to ensure the original is not modified
        with io.open(self.ds_config_file[ZERO2], "r", encoding="utf-8") as f:
            config_zero2 = json.load(f)
-           # by default use fp16
-           config_zero2["fp16"]["enabled"] = True

        with io.open(self.ds_config_file[ZERO3], "r", encoding="utf-8") as f:
            config_zero3 = json.load(f)
-           # by default use fp16
-           config_zero3["fp16"]["enabled"] = True
-           # This setting slows things down, so don't enable it by default unless needed by a test.
+           # The following setting slows things down, so don't enable it by default unless needed by a test.
            # It's in the file as a demo for users since we want everything to work out of the box even if slower.
-           config_zero3["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = False
+           config_zero3["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = False

        self.ds_config_dict = dict(
            zero2=config_zero2,
            zero3=config_zero3,
@@ -348,21 +365,23 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):

    # --- These tests need to run on both zero stages --- #

-   @parameterized.expand(stages)
-   def test_hf_optimizer_with_offload(self, stage):
+   @parameterized.expand(params, name_func=parameterized_custom_name_func)
+   def test_hf_optimizer_with_offload(self, stage, dtype):
        # non-DS optimizers can be used with ZERO-offload (as long as they have both CPU and GPU implementation (except LAMB))
        ds_config_dict = self.get_config_dict(stage)
        del ds_config_dict["optimizer"]  # force default HF Trainer optimizer
        # force cpu offload
        ds_config_dict["zero_optimization"]["offload_optimizer"]["device"] = "cpu"
        with mockenv_context(**self.dist_env_1_gpu):
-           trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=ds_config_dict)
+           kwargs = dict(local_rank=0, deepspeed=ds_config_dict)
+           kwargs[dtype] = True
+           trainer = get_regression_trainer(**kwargs)
            with CaptureLogger(deepspeed_logger) as cl:
                trainer.train()
            self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")

-   @parameterized.expand(stages)
-   def test_fake_notebook_no_launcher(self, stage):
+   @parameterized.expand(params, name_func=parameterized_custom_name_func)
+   def test_fake_notebook_no_launcher(self, stage, dtype):
        # this setup emulates a notebook where a launcher needs to be emulated by hand
        # note that unittest resets sys.stdout each test, so `CaptureStd` will work here to capture
@@ -370,13 +389,16 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
        # it's run not as a first test as `sys.stdout` will no longer be the same. So we either have
        # to reset `deepspeed_logger.handlers[0].setStream(sys.stdout)` or directly capture from the deepspeed_logger.
        with mockenv_context(**self.dist_env_1_gpu):
-           trainer = get_regression_trainer(local_rank=0, fp16=True, deepspeed=self.get_config_dict(stage))
+           kwargs = dict(local_rank=0, deepspeed=self.get_config_dict(stage))
+           kwargs[dtype] = True
+           trainer = get_regression_trainer(**kwargs)
            with CaptureLogger(deepspeed_logger) as cl:
                trainer.train()
            self.assertIn("DeepSpeed info", cl.out, "expected DeepSpeed logger output but got none")

-   @parameterized.expand(stages)
-   def test_early_get_last_lr(self, stage):
+   @parameterized.expand(params, name_func=parameterized_custom_name_func)
+   def test_early_get_last_lr(self, stage, dtype):
        # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may
        # not run for the first few dozen steps while loss scale is too large, and thus during
        # that time `get_last_lr` will fail if called during that warm up stage,
@@ -385,34 +407,36 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
        # `self.lr_scheduler.get_last_lr()` and originally it'd fail on the very first step.
        with mockenv_context(**self.dist_env_1_gpu):
            a = b = 0.0
-           trainer = get_regression_trainer(
+           kwargs = dict(
                a=a,
                b=b,
                local_rank=0,
                train_len=8,
-               fp16=True,
                deepspeed=self.get_config_dict(stage),
                per_device_train_batch_size=8,
                logging_steps=1,
            )
+           kwargs[dtype] = True
+           trainer = get_regression_trainer(**kwargs)

            trainer.train()
            post_train_a = trainer.model.a.item()

-           # XXX: for some reason the following check fails with zero3 - not a broken but a
-           # different qualitative outcome - as if optimizer did run
+           # XXX: for some reason the following check fails with zero3/fp16 and any/bf16 - not a
+           # broken but a different qualitative outcome - as if optimizer did run
            # oddly getting 1.0 for both a and b from 0.0 - there is a bug somewhere
            # print(trainer.model.a.item())
            # print(trainer.model.b.item())
            # need to investigate at some point
-           if stage == ZERO3:
+           if (stage == ZERO3 and dtype == FP16) or (dtype == BF16):
                return

            # it's enough that train didn't fail for this test, but we must check that
            # optimizer/scheduler didn't run (since if it did this test isn't testing the right thing)
            self.assertEqual(post_train_a, a)
-   @parameterized.expand(stages)
-   def test_gradient_accumulation(self, stage):
+   @parameterized.expand(params, name_func=parameterized_custom_name_func)
+   def test_gradient_accumulation(self, stage, dtype):
        # this test measures that we get identical weights and similar loss with:
        # 1. per_device_train_batch_size=8, gradient_accumulation_steps=1
        # 2. per_device_train_batch_size=4, gradient_accumulation_steps=2
@@ -433,9 +457,9 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
            b=b,
            local_rank=0,
            train_len=train_len,
-           fp16=True,
            deepspeed=self.get_config_dict(stage),
        )
+       kwargs[dtype] = True

        with mockenv_context(**self.dist_env_1_gpu):
            no_grad_accum_trainer = get_regression_trainer(
@@ -482,15 +506,7 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
        else:
            raise ValueError(f"unknown stage {stage}")

-       # XXX: this can be recoded and then removed once we require deepspeed>0.3.13
-       from packaging import version
-
-       import deepspeed
-
-       if version.parse(deepspeed.__version__) > version.parse("0.3.13"):
-           ds_file_list.append("zero_pp_rank_0_mp_rank_00_optim_states.pt")
-       else:
-           ds_file_list.append("zero_pp_rank_0_mp_rank_00optim_states.pt")
+       ds_file_list.append("zero_pp_rank_0_mp_rank_00_optim_states.pt")

        for step in range(freq, total, freq):
            checkpoint = os.path.join(output_dir, f"checkpoint-{step}")
@@ -509,37 +525,42 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
                path = os.path.join(ds_path, filename)
                self.assertTrue(os.path.isfile(path), f"[{stage}] {path} is not found")

-   @parameterized.expand(stages)
-   def test_save_checkpoints(self, stage):
+   @parameterized.expand(params, name_func=parameterized_custom_name_func)
+   def test_save_checkpoints(self, stage, dtype):
        # adapted from TrainerIntegrationTest.test_save_checkpoints

        freq = 5
        output_dir = self.get_auto_remove_tmp_dir()
        ds_config_dict = self.get_config_dict(stage)
-       ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
+       if dtype == FP16:
+           ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
+       # XXX:
        if stage == ZERO3:
-           ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True
+           ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True

        # save checkpoints
        with mockenv_context(**self.dist_env_1_gpu):
-           trainer = get_regression_trainer(
+           kwargs = dict(
                output_dir=output_dir,
                save_steps=freq,
-               fp16=True,
                deepspeed=ds_config_dict,
            )
+           kwargs[dtype] = True
+           trainer = get_regression_trainer(**kwargs)
            trainer.train()

        total = int(self.n_epochs * 64 / self.batch_size)
        self.check_saved_checkpoints_deepspeed(output_dir, freq, total, stage)

-   @parameterized.expand(stages)
-   def test_can_resume_training_errors(self, stage):
+   @parameterized.expand(params, name_func=parameterized_custom_name_func)
+   def test_can_resume_training_errors(self, stage, dtype):

        with mockenv_context(**self.dist_env_1_gpu):
            ds_config_dict = self.get_config_dict(stage)
            output_dir = self.get_auto_remove_tmp_dir()
-           trainer = get_regression_trainer(output_dir=output_dir, fp16=True, deepspeed=ds_config_dict)
+           kwargs = dict(output_dir=output_dir, deepspeed=ds_config_dict)
+           kwargs[dtype] = True
+           trainer = get_regression_trainer(**kwargs)

            # 1. fail to find any checkpoint - due a fresh output_dir
            with self.assertRaises(Exception) as context:
@@ -557,19 +578,20 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
                "Can't find a valid checkpoint at" in str(context.exception), f"got exception: {context.exception}"
            )
-   @parameterized.expand(stages)
-   def test_can_resume_training_normal(self, stage):
+   @parameterized.expand(params, name_func=parameterized_custom_name_func)
+   def test_can_resume_training_normal(self, stage, dtype):
        # adapted from TrainerIntegrationTest.test_can_resume_training
        # test normal resume for each stage separately, error-handling is tested in a different test
-       output_dir = self.get_auto_remove_tmp_dir()
+       output_dir = self.get_auto_remove_tmp_dir("./xxx", after=False)
        ds_config_dict = self.get_config_dict(stage)
-       ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
+       if dtype == FP16:
+           ds_config_dict["fp16"]["initial_scale_power"] = 1  # force optimizer on the first step
+       # XXX:
        if stage == ZERO3:
-           ds_config_dict["zero_optimization"]["stage3_gather_fp16_weights_on_model_save"] = True
+           ds_config_dict["zero_optimization"]["stage3_gather_16bit_weights_on_model_save"] = True

-       kwargs = dict(
-           output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, fp16=True, deepspeed=ds_config_dict
-       )
+       kwargs = dict(output_dir=output_dir, train_len=128, save_steps=5, learning_rate=0.1, deepspeed=ds_config_dict)
+       kwargs[dtype] = True

        with mockenv_context(**self.dist_env_1_gpu):
            trainer = get_regression_trainer(**kwargs)
@@ -607,8 +629,8 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
            # trainer.train(resume_from_checkpoint=checkpoint)
            # a workaround needs to be used that re-creates the deepspeed engine

-   @parameterized.expand(stages)
-   def test_load_state_dict_from_zero_checkpoint(self, stage):
+   @parameterized.expand(params, name_func=parameterized_custom_name_func)
+   def test_load_state_dict_from_zero_checkpoint(self, stage, dtype):
        # test that we can load fp32 weights directly from the zero checkpoint into the current model

        output_dir = self.get_auto_remove_tmp_dir()  # "./xxx", after=False, before=False)
@@ -623,9 +645,9 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
            save_strategy="steps",
            save_steps=1,
            learning_rate=0.1,
-           fp16=True,
            deepspeed=ds_config_dict,
        )
+       kwargs[dtype] = True

        with mockenv_context(**self.dist_env_1_gpu):
            trainer = get_regression_trainer(**kwargs)
@@ -648,8 +670,8 @@ class TrainerIntegrationDeepSpeed(TestCasePlus, TrainerIntegrationCommon):
        output_dir = self.get_auto_remove_tmp_dir()
        kwargs = dict(output_dir=output_dir, train_len=8, fp16=True)

-       ds_config_zero3_dict = self.get_config_dict("zero3")
-       ds_config_zero2_dict = self.get_config_dict("zero2")
+       ds_config_zero3_dict = self.get_config_dict(ZERO3)
+       ds_config_zero2_dict = self.get_config_dict(ZERO2)

        with mockenv_context(**self.dist_env_1_gpu):
            trainer = get_regression_trainer(deepspeed=ds_config_zero3_dict, **kwargs)
@@ -698,57 +720,60 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
    #

    @require_torch_multi_gpu
-   @parameterized.expand(stages)
-   def test_basic_distributed(self, stage):
-       self.run_and_check(stage=stage, distributed=True)
+   @parameterized.expand(params, name_func=parameterized_custom_name_func)
+   def test_basic_distributed(self, stage, dtype):
+       self.run_and_check(stage=stage, dtype=dtype, distributed=True)

    def test_do_eval_no_train(self):
        # testing only zero3 since zero2 makes no sense with inference
        self.run_and_check(
            stage=ZERO3,
+           dtype=FP16,
            eval_steps=1,
            distributed=False,
            do_train=False,
            do_eval=True,
        )

-   @parameterized.expand(stages)
-   def test_fp32_non_distributed(self, stage):
+   @parameterized.expand(params, name_func=parameterized_custom_name_func)
+   def test_fp32_non_distributed(self, stage, dtype):
        # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
        # therefore no quality checks, just basic completion checks are done
        self.run_and_check(
            stage=stage,
+           dtype=dtype,
            model_name=T5_TINY,
            distributed=False,
            do_train=True,
            do_eval=True,
            quality_checks=False,
-           fp16=False,
+           fp32=True,
        )

    @require_torch_multi_gpu
-   @parameterized.expand(stages)
-   def test_fp32_distributed(self, stage):
+   @parameterized.expand(params, name_func=parameterized_custom_name_func)
+   def test_fp32_distributed(self, stage, dtype):
        # real model needs too much GPU memory under stage2+fp32, so using tiny random model here -
        # therefore no quality checks, just basic completion checks are done
        self.run_and_check(
            stage=stage,
+           dtype=dtype,
            model_name=T5_TINY,
            distributed=True,
            do_train=True,
            do_eval=True,
            quality_checks=False,
-           fp16=False,
+           fp32=True,
        )

-   @parameterized.expand(stages)
-   def test_resume_train_not_from_ds_checkpoint(self, stage):
+   @parameterized.expand(params, name_func=parameterized_custom_name_func)
+   def test_resume_train_not_from_ds_checkpoint(self, stage, dtype):
        # do normal training and then resume not from the deepspeed checkpoint but explicitly from
        # the saved model dir

        do_train = True
        do_eval = False
-       kwargs = dict(stage=stage, eval_steps=1, distributed=True, do_train=do_train, do_eval=do_eval)
+       kwargs = dict(stage=stage, dtype=dtype, eval_steps=1, distributed=True, do_train=do_train, do_eval=do_eval)

        # 1. normal training
        output_dir = self.run_and_check(**kwargs)
@@ -760,19 +785,23 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        self.do_checks(output_dir, do_train=do_train, do_eval=do_eval)

    @require_torch_multi_gpu
-   @parameterized.expand(["fp16", "fp32"])
+   @parameterized.expand(["bf16", "fp16", "fp32"])
    def test_inference(self, dtype):
+       if dtype == "bf16" and not is_torch_bf16_available():
+           self.skipTest("test requires bfloat16 hardware support")
+
        # this is just inference, so no optimizer should be loaded
        # it only works for z3 (makes no sense with z1-z2)
-       fp16 = True if dtype == "fp16" else False
+       fp32 = True if dtype == "fp32" else False
        self.run_and_check(
            stage=ZERO3,
+           dtype=FP16,
            model_name=T5_TINY,
            distributed=True,
            do_train=False,
            do_eval=True,
            quality_checks=False,
-           fp16=fp16,
+           fp32=fp32,
        )

    def do_checks(self, output_dir, do_train=True, do_eval=True, quality_checks=True):
@@ -793,13 +822,14 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
    def run_and_check(
        self,
        stage,
+       dtype,
        model_name: str = T5_SMALL,
        eval_steps: int = 10,
        distributed: bool = True,
        do_train: bool = True,
        do_eval: bool = True,
        quality_checks: bool = True,
-       fp16: bool = True,
+       fp32: bool = False,
        extra_args_str: str = None,
        remove_args_str: str = None,
    ):
@@ -807,13 +837,14 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        # we are doing quality testing so using a small real model
        output_dir = self.run_trainer(
            stage=stage,
+           dtype=dtype,
            model_name=model_name,
            eval_steps=eval_steps,
            num_train_epochs=1,
            do_train=do_train,
            do_eval=do_eval,
            distributed=distributed,
-           fp16=fp16,
+           fp32=fp32,
            extra_args_str=extra_args_str,
            remove_args_str=remove_args_str,
        )
@@ -825,13 +856,14 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
    def run_trainer(
        self,
        stage: str,
+       dtype: str,
        model_name: str,
        eval_steps: int = 10,
        num_train_epochs: int = 1,
        do_train: bool = False,
        do_eval: bool = True,
        distributed: bool = True,
-       fp16: bool = True,
+       fp32: bool = False,
        extra_args_str: str = None,
        remove_args_str: str = None,
    ):
@@ -859,8 +891,8 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        """.split()
        args.extend(["--source_prefix", '"translate English to Romanian: "'])

-       if fp16:
-           args.extend(["--fp16"])
+       if not fp32:
+           args.extend([f"--{dtype}"])

        actions = 0
        if do_train:
@@ -906,8 +938,8 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        return output_dir

-   @parameterized.expand(stages)
-   def test_clm(self, stage):
+   @parameterized.expand(params, name_func=parameterized_custom_name_func)
+   def test_clm(self, stage, dtype):
        # this test exercises model.resize_token_embeddings() which requires param gathering outside
        # of forward - it's not used by `run_translation.py`, but it is in `run_clm.py`
@@ -928,10 +960,11 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
            --num_train_epochs 1
            --warmup_steps 8
            --block_size 64
-           --fp16
            --report_to none
        """.split()
+       args.extend([f"--{dtype}"])

        ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
        script = [f"{self.examples_dir_str}/pytorch/language-modeling/run_clm.py"]
        launcher = get_launcher(distributed=True)
@@ -941,7 +974,7 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
        # print(" ".join([f"\nPYTHONPATH={self.src_dir_str}"] +cmd)); die
        execute_subprocess_async(cmd, env=self.get_env())
-   def test_clm_from_config_zero3(self):
+   def test_clm_from_config_zero3_fp16(self):
        # this test exercises AutoModel.from_config(config) - to ensure zero.Init is called

        data_dir = self.tests_dir / "fixtures"
@@ -974,8 +1007,8 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
            execute_subprocess_async(cmd, env=self.get_env())
            self.assertIn("Detected DeepSpeed ZeRO-3", cs.err)

-   @parameterized.expand(stages)
-   def test_load_best_model(self, stage):
+   @parameterized.expand(params, name_func=parameterized_custom_name_func)
+   def test_load_best_model(self, stage, dtype):
        # this test exercises --load_best_model_at_end - the key is being able to resume after some training

        data_dir = self.tests_dir / "fixtures/tests_samples/wmt_en_ro"
@@ -1003,11 +1036,12 @@ class TestDeepSpeedWithLauncher(TestCasePlus):
            --per_device_train_batch_size 1
            --per_device_eval_batch_size 1
            --num_train_epochs 1
-           --fp16
            --report_to none
        """.split()
        args.extend(["--source_prefix", "translate English to Romanian: "])
+       args.extend([f"--{dtype}"])

        ds_args = f"--deepspeed {self.test_file_dir_str}/ds_config_{stage}.json".split()
        script = [f"{self.examples_dir_str}/pytorch/translation/run_translation.py"]
        launcher = get_launcher(distributed=False)
......
@@ -205,8 +205,19 @@ task_cmds = make_task_cmds()
ZERO2 = "zero2"
ZERO3 = "zero3"
stages = [ZERO2, ZERO3]
+
+# future preparation:
+# for now test just fp16, as these tests are quite slow
+# FP16 = "fp16"
+# BF16 = "bf16"
+#
+# dtypes = [FP16]
+# so just hardcoding --fp16 for now
+# if is_torch_bf16_available():
+#     dtypes += [BF16]

def parameterized_custom_name_func(func, param_num, param):
    # customize the test name generator function as we want both params to appear in the sub-test
......