Unverified Commit 41db1c2f authored by Jeff Rasley's avatar Jeff Rasley Committed by GitHub
Browse files
parent 79093d74
...@@ -2,4 +2,5 @@ torch>=1.2 ...@@ -2,4 +2,5 @@ torch>=1.2
torchvision>=0.4.0 torchvision>=0.4.0
tqdm tqdm
psutil psutil
cpufeature
tensorboardX==1.8 tensorboardX==1.8
...@@ -32,33 +32,46 @@ if torch.cuda.is_available(): ...@@ -32,33 +32,46 @@ if torch.cuda.is_available():
onebit_adam_requires.append(f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}") onebit_adam_requires.append(f"cupy-cuda{torch.version.cuda.replace('.','')[:3]}")
install_requires += onebit_adam_requires install_requires += onebit_adam_requires
# Constants for each op
LAMB = "lamb"
TRANSFORMER = "transformer"
SPARSE_ATTN = "sparse-attn"
ADAM = "cpu-adam"
# Build environment variables for custom builds # Build environment variables for custom builds
DS_BUILD_LAMB_MASK = 1 DS_BUILD_LAMB_MASK = 1
DS_BUILD_TRANSFORMER_MASK = 10 DS_BUILD_TRANSFORMER_MASK = 10
DS_BUILD_SPARSE_ATTN_MASK = 100 DS_BUILD_SPARSE_ATTN_MASK = 100
DS_BUILD_ADAM_MASK = 1000
DS_BUILD_AVX512_MASK = 10000
# Allow for build_cuda to turn on or off all ops # Allow for build_cuda to turn on or off all ops
DS_BUILD_ALL_OPS = DS_BUILD_LAMB_MASK | DS_BUILD_TRANSFORMER_MASK | DS_BUILD_SPARSE_ATTN_MASK DS_BUILD_ALL_OPS = DS_BUILD_LAMB_MASK | DS_BUILD_TRANSFORMER_MASK | DS_BUILD_SPARSE_ATTN_MASK | DS_BUILD_ADAM_MASK | DS_BUILD_AVX512_MASK
DS_BUILD_CUDA = int(os.environ.get('DS_BUILD_CUDA', 1)) * DS_BUILD_ALL_OPS DS_BUILD_CUDA = int(os.environ.get('DS_BUILD_CUDA', 1)) * DS_BUILD_ALL_OPS
# Set default of each op based on if build_cuda is set # Set default of each op based on if build_cuda is set
OP_DEFAULT = DS_BUILD_CUDA == DS_BUILD_ALL_OPS OP_DEFAULT = DS_BUILD_CUDA == DS_BUILD_ALL_OPS
DS_BUILD_ADAM = int(os.environ.get('DS_BUILD_ADAM', OP_DEFAULT)) * DS_BUILD_ADAM_MASK
DS_BUILD_LAMB = int(os.environ.get('DS_BUILD_LAMB', OP_DEFAULT)) * DS_BUILD_LAMB_MASK DS_BUILD_LAMB = int(os.environ.get('DS_BUILD_LAMB', OP_DEFAULT)) * DS_BUILD_LAMB_MASK
DS_BUILD_TRANSFORMER = int(os.environ.get('DS_BUILD_TRANSFORMER', DS_BUILD_TRANSFORMER = int(os.environ.get('DS_BUILD_TRANSFORMER',
OP_DEFAULT)) * DS_BUILD_TRANSFORMER_MASK OP_DEFAULT)) * DS_BUILD_TRANSFORMER_MASK
DS_BUILD_SPARSE_ATTN = int(os.environ.get('DS_BUILD_SPARSE_ATTN', DS_BUILD_SPARSE_ATTN = int(os.environ.get('DS_BUILD_SPARSE_ATTN',
0)) * DS_BUILD_SPARSE_ATTN_MASK 0)) * DS_BUILD_SPARSE_ATTN_MASK
DS_BUILD_AVX512 = int(os.environ.get('DS_BUILD_AVX512', 0)) * DS_BUILD_AVX512_MASK
# Final effective mask is the bitwise OR of each op # Final effective mask is the bitwise OR of each op
BUILD_MASK = (DS_BUILD_LAMB | DS_BUILD_TRANSFORMER | DS_BUILD_SPARSE_ATTN) BUILD_MASK = (DS_BUILD_LAMB | DS_BUILD_TRANSFORMER | DS_BUILD_SPARSE_ATTN
| DS_BUILD_ADAM)
install_ops = [] install_ops = dict.fromkeys([LAMB, TRANSFORMER, SPARSE_ATTN, ADAM], False)
if BUILD_MASK & DS_BUILD_LAMB: if BUILD_MASK & DS_BUILD_LAMB:
install_ops.append('lamb') install_ops[LAMB] = True
if BUILD_MASK & DS_BUILD_ADAM:
install_ops[ADAM] = True
if BUILD_MASK & DS_BUILD_TRANSFORMER: if BUILD_MASK & DS_BUILD_TRANSFORMER:
install_ops.append('transformer') install_ops[TRANSFORMER] = True
if BUILD_MASK & DS_BUILD_SPARSE_ATTN: if BUILD_MASK & DS_BUILD_SPARSE_ATTN:
install_ops.append('sparse-attn') install_ops[SPARSE_ATTN] = True
if len(install_ops) == 0: if len(install_ops) == 0:
print("Building without any cuda/cpp extensions") print("Building without any cuda/cpp extensions")
print(f'BUILD_MASK={BUILD_MASK}, install_ops={install_ops}') print(f'BUILD_MASK={BUILD_MASK}, install_ops={install_ops}')
...@@ -90,6 +103,16 @@ if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4): ...@@ -90,6 +103,16 @@ if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
version_ge_1_5 = ['-DVERSION_GE_1_5'] version_ge_1_5 = ['-DVERSION_GE_1_5']
version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5 version_dependent_macros = version_ge_1_1 + version_ge_1_3 + version_ge_1_5
import cpufeature
cpu_info = cpufeature.CPUFeature
SIMD_WIDTH = ''
if cpu_info['AVX512f'] and DS_BUILD_AVX512:
SIMD_WIDTH = '-D__AVX512__'
elif cpu_info['AVX2']:
SIMD_WIDTH = '-D__AVX256__'
print("SIMD_WIDTH = ", SIMD_WIDTH)
ext_modules = [] ext_modules = []
## Lamb ## ## Lamb ##
...@@ -109,6 +132,43 @@ if BUILD_MASK & DS_BUILD_LAMB: ...@@ -109,6 +132,43 @@ if BUILD_MASK & DS_BUILD_LAMB:
'--use_fast_math'] + version_dependent_macros '--use_fast_math'] + version_dependent_macros
})) }))
## Adam ##
if BUILD_MASK & DS_BUILD_ADAM:
ext_modules.append(
CUDAExtension(name='deepspeed.ops.adam.cpu_adam_op',
sources=[
'csrc/adam/cpu_adam.cpp',
'csrc/adam/custom_cuda_kernel.cu',
],
include_dirs=['csrc/includes',
'/usr/local/cuda/include'],
extra_compile_args={
'cxx': [
'-O3',
'-std=c++14',
'-L/usr/local/cuda/lib64',
'-lcudart',
'-lcublas',
'-g',
'-Wno-reorder',
'-march=native',
'-fopenmp',
SIMD_WIDTH
],
'nvcc': [
'-O3',
'--use_fast_math',
'-gencode',
'arch=compute_61,code=compute_61',
'-gencode',
'arch=compute_70,code=compute_70',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__'
]
}))
## Transformer ## ## Transformer ##
if BUILD_MASK & DS_BUILD_TRANSFORMER: if BUILD_MASK & DS_BUILD_TRANSFORMER:
ext_modules.append( ext_modules.append(
...@@ -177,14 +237,21 @@ if BUILD_MASK & DS_BUILD_TRANSFORMER: ...@@ -177,14 +237,21 @@ if BUILD_MASK & DS_BUILD_TRANSFORMER:
def command_exists(cmd): def command_exists(cmd):
if '|' in cmd:
cmds = cmd.split("|")
else:
cmds = [cmd]
valid = False
for cmd in cmds:
result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True) result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
return result.wait() == 0 valid = valid or result.wait() == 0
return valid
## Sparse transformer ## ## Sparse transformer ##
if BUILD_MASK & DS_BUILD_SPARSE_ATTN: if BUILD_MASK & DS_BUILD_SPARSE_ATTN:
# Check to see if llvm and cmake are installed since they are dependencies # Check to see if llvm and cmake are installed since they are dependencies
required_commands = ['llc-9', 'cmake'] required_commands = ['llvm-config|llvm-config-9', 'cmake']
command_status = list(map(command_exists, required_commands)) command_status = list(map(command_exists, required_commands))
if not all(command_status): if not all(command_status):
...@@ -194,6 +261,8 @@ if BUILD_MASK & DS_BUILD_SPARSE_ATTN: ...@@ -194,6 +261,8 @@ if BUILD_MASK & DS_BUILD_SPARSE_ATTN:
) )
warnings.warn( warnings.warn(
'Skipping sparse attention installation due to missing required packages') 'Skipping sparse attention installation due to missing required packages')
# remove from installed ops list
install_ops[SPARSE_ATTN] = False
elif TORCH_MAJOR == 1 and TORCH_MINOR >= 5: elif TORCH_MAJOR == 1 and TORCH_MINOR >= 5:
ext_modules.append( ext_modules.append(
CppExtension(name='deepspeed.ops.sparse_attention.cpp_utils', CppExtension(name='deepspeed.ops.sparse_attention.cpp_utils',
...@@ -204,6 +273,8 @@ if BUILD_MASK & DS_BUILD_SPARSE_ATTN: ...@@ -204,6 +273,8 @@ if BUILD_MASK & DS_BUILD_SPARSE_ATTN:
install_requires += sparse_attn_requires install_requires += sparse_attn_requires
else: else:
warnings.warn('Unable to meet requirements to install sparse attention') warnings.warn('Unable to meet requirements to install sparse attention')
# remove from installed ops list
install_ops[SPARSE_ATTN] = False
# Add development requirements # Add development requirements
install_requires += dev_requires install_requires += dev_requires
...@@ -224,6 +295,7 @@ with open('deepspeed/git_version_info.py', 'w') as fd: ...@@ -224,6 +295,7 @@ with open('deepspeed/git_version_info.py', 'w') as fd:
fd.write(f"version='{VERSION}+{git_hash}'\n") fd.write(f"version='{VERSION}+{git_hash}'\n")
fd.write(f"git_hash='{git_hash}'\n") fd.write(f"git_hash='{git_hash}'\n")
fd.write(f"git_branch='{git_branch}'\n") fd.write(f"git_branch='{git_branch}'\n")
fd.write(f"installed_ops={install_ops}\n")
print(f'install_requires={install_requires}') print(f'install_requires={install_requires}')
......
...@@ -3,13 +3,7 @@ ...@@ -3,13 +3,7 @@
"gradient_accumulation_steps": 1, "gradient_accumulation_steps": 1,
"steps_per_print": 1, "steps_per_print": 1,
"zero_optimization": { "zero_optimization": {
"stage":1 "stage": 1
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
}, },
"gradient_clipping": 1.0, "gradient_clipping": 1.0,
"fp16": { "fp16": {
......
...@@ -3,17 +3,11 @@ ...@@ -3,17 +3,11 @@
"gradient_accumulation_steps": 1, "gradient_accumulation_steps": 1,
"steps_per_print": 1, "steps_per_print": 1,
"zero_optimization": { "zero_optimization": {
"stage":2, "stage": 2,
"reduce_bucket_size": 7000000, "reduce_bucket_size": 7000000,
"allgather_bucket_size": 7000000, "allgather_bucket_size": 7000000,
"reduce_scatter": true "reduce_scatter": true
}, },
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"gradient_clipping": 1.0, "gradient_clipping": 1.0,
"fp16": { "fp16": {
"enabled": true, "enabled": true,
......
{
"train_batch_size": 4,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage": 2,
"reduce_bucket_size": 7000000,
"allgather_bucket_size": 7000000,
"reduce_scatter": true,
"cpu_offload": true
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
}
}
...@@ -3,13 +3,7 @@ ...@@ -3,13 +3,7 @@
"gradient_accumulation_steps": 1, "gradient_accumulation_steps": 1,
"steps_per_print": 1, "steps_per_print": 1,
"zero_optimization": { "zero_optimization": {
"stage":0 "stage": 0
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
}, },
"gradient_clipping": 1.0, "gradient_clipping": 1.0,
"fp16": { "fp16": {
......
{ {
"train_micro_batch_size_per_gpu":8, "train_micro_batch_size_per_gpu": 8,
"gradient_accumulation_steps": 3, "gradient_accumulation_steps": 3,
"steps_per_print": 1, "steps_per_print": 1,
"zero_optimization": { "zero_optimization": {
"stage":0, "stage": 0,
"reduce_bucket_size": 7000000, "reduce_bucket_size": 7000000,
"allgather_bucket_size": 7000000, "allgather_bucket_size": 7000000,
"reduce_scatter": true "reduce_scatter": true
}, },
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"gradient_clipping": 1.0, "gradient_clipping": 1.0,
"fp16": { "fp16": {
"enabled": true, "enabled": true,
...@@ -26,5 +20,4 @@ ...@@ -26,5 +20,4 @@
"partition_activations": true, "partition_activations": true,
"contiguous_memory_optimization": true "contiguous_memory_optimization": true
} }
} }
...@@ -2,14 +2,8 @@ ...@@ -2,14 +2,8 @@
"train_batch_size": 8, "train_batch_size": 8,
"gradient_accumulation_steps": 1, "gradient_accumulation_steps": 1,
"steps_per_print": 1, "steps_per_print": 1,
"zero_optimization":{ "zero_optimization": {
"stage":1 "stage": 1
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
}, },
"gradient_clipping": 1.0, "gradient_clipping": 1.0,
"fp16": { "fp16": {
......
...@@ -3,17 +3,11 @@ ...@@ -3,17 +3,11 @@
"gradient_accumulation_steps": 1, "gradient_accumulation_steps": 1,
"steps_per_print": 1, "steps_per_print": 1,
"zero_optimization": { "zero_optimization": {
"stage":2, "stage": 2,
"reduce_bucket_size": 7000000, "reduce_bucket_size": 7000000,
"allgather_bucket_size": 7000000, "allgather_bucket_size": 7000000,
"reduce_scatter": true "reduce_scatter": true
}, },
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"gradient_clipping": 1.0, "gradient_clipping": 1.0,
"fp16": { "fp16": {
"enabled": true, "enabled": true,
...@@ -26,5 +20,4 @@ ...@@ -26,5 +20,4 @@
"partition_activations": true, "partition_activations": true,
"contiguous_memory_optimization": true "contiguous_memory_optimization": true
} }
} }
{ {
"train_micro_batch_size_per_gpu":8, "train_micro_batch_size_per_gpu": 8,
"gradient_accumulation_steps": 3, "gradient_accumulation_steps": 3,
"steps_per_print": 1, "steps_per_print": 1,
"zero_optimization": { "zero_optimization": {
"stage":2, "stage": 2,
"reduce_bucket_size": 7000000, "reduce_bucket_size": 7000000,
"allgather_bucket_size": 7000000, "allgather_bucket_size": 7000000,
"reduce_scatter": true "reduce_scatter": true
}, },
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"gradient_clipping": 1.0, "gradient_clipping": 1.0,
"fp16": { "fp16": {
"enabled": true, "enabled": true,
...@@ -26,5 +20,4 @@ ...@@ -26,5 +20,4 @@
"partition_activations": true, "partition_activations": true,
"contiguous_memory_optimization": true "contiguous_memory_optimization": true
} }
} }
{
"train_batch_size": 8,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": {
"stage": 2,
"reduce_bucket_size": 7000000,
"allgather_bucket_size": 7000000,
"reduce_scatter": true,
"cpu_offload": true
},
"gradient_clipping": 1.0,
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
},
"activation_checkpointing": {
"partition_activations": true,
"contiguous_memory_optimization": true
}
}
...@@ -3,13 +3,7 @@ ...@@ -3,13 +3,7 @@
"gradient_accumulation_steps": 1, "gradient_accumulation_steps": 1,
"steps_per_print": 1, "steps_per_print": 1,
"zero_optimization": { "zero_optimization": {
"stage":2 "stage": 2
},
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
}, },
"gradient_clipping": 1.0, "gradient_clipping": 1.0,
"scheduler": { "scheduler": {
...@@ -20,7 +14,6 @@ ...@@ -20,7 +14,6 @@
"warmup_num_steps": 10 "warmup_num_steps": 10
} }
}, },
"fp16": { "fp16": {
"enabled": true, "enabled": true,
"loss_scale": 0, "loss_scale": 0,
......
...@@ -2,14 +2,10 @@ ...@@ -2,14 +2,10 @@
"train_batch_size": 16, "train_batch_size": 16,
"gradient_accumulation_steps": 1, "gradient_accumulation_steps": 1,
"steps_per_print": 1, "steps_per_print": 1,
"zero_optimization": 1, "zero_optimization": {
"disable_allgather": true, "stage": 1
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
}, },
"disable_allgather": true,
"gradient_clipping": 1.0, "gradient_clipping": 1.0,
"fp16": { "fp16": {
"enabled": true, "enabled": true,
......
...@@ -3,15 +3,9 @@ ...@@ -3,15 +3,9 @@
"gradient_accumulation_steps": 1, "gradient_accumulation_steps": 1,
"steps_per_print": 1, "steps_per_print": 1,
"zero_optimization": { "zero_optimization": {
"stage":1 "stage": 1
}, },
"disable_allgather": true, "disable_allgather": true,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
},
"gradient_clipping": 1.0, "gradient_clipping": 1.0,
"fp16": { "fp16": {
"enabled": true, "enabled": true,
......
...@@ -2,14 +2,10 @@ ...@@ -2,14 +2,10 @@
"train_batch_size": 8, "train_batch_size": 8,
"gradient_accumulation_steps": 1, "gradient_accumulation_steps": 1,
"steps_per_print": 1, "steps_per_print": 1,
"zero_optimization": 1, "zero_optimization": {
"disable_allgather": true, "stage": 1
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015
}
}, },
"disable_allgather": true,
"gradient_clipping": 1.0, "gradient_clipping": 1.0,
"fp16": { "fp16": {
"enabled": true, "enabled": true,
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
helpFunction() helpFunction()
{ {
echo "" echo ""
echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s stpes -l layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers [-d]" echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s stpes -l layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers -p [-d]"
echo -e "\t-m model parallelism" echo -e "\t-m model parallelism"
echo -e "\t-g gpus per node" echo -e "\t-g gpus per node"
echo -e "\t-n node count" echo -e "\t-n node count"
...@@ -17,6 +17,7 @@ helpFunction() ...@@ -17,6 +17,7 @@ helpFunction()
echo -e "\t-o other args" echo -e "\t-o other args"
echo -e "\t-d DeepSpeed config json file" echo -e "\t-d DeepSpeed config json file"
echo -e "\t-z Enable Zero optimization" echo -e "\t-z Enable Zero optimization"
echo -e "\t-p DeepSpeed master port"
exit 1 exit 1
} }
...@@ -27,6 +28,7 @@ ckpt_num_layers=1 ...@@ -27,6 +28,7 @@ ckpt_num_layers=1
other_args="" other_args=""
ds_opt="" ds_opt=""
zero_opt="" zero_opt=""
master_port=29600
script_path=$(realpath $0) script_path=$(realpath $0)
script_dir=$(dirname $script_path) script_dir=$(dirname $script_path)
...@@ -44,6 +46,7 @@ do ...@@ -44,6 +46,7 @@ do
q ) seq_length="$OPTARG" ;; q ) seq_length="$OPTARG" ;;
e ) heads="$OPTARG" ;; e ) heads="$OPTARG" ;;
c ) ckpt_num_layers="$OPTARG" ;; c ) ckpt_num_layers="$OPTARG" ;;
p ) master_port="$OPTARG" ;;
o ) other_args="$OPTARG" ;; o ) other_args="$OPTARG" ;;
d ) ds_opt="--deepspeed --deepspeed_config $script_dir/$OPTARG" ;; d ) ds_opt="--deepspeed --deepspeed_config $script_dir/$OPTARG" ;;
z ) zero_opt="--zero_optimization" ;; z ) zero_opt="--zero_optimization" ;;
...@@ -93,7 +96,7 @@ gpt_options=" \ ...@@ -93,7 +96,7 @@ gpt_options=" \
" "
work_dir="../../../DeepSpeedExamples/Megatron-LM/" work_dir="../../../DeepSpeedExamples/Megatron-LM/"
run_cmd="(cd ${work_dir} && deepspeed --num_nodes $nodes --num_gpus $gpus pretrain_gpt2.py ${gpt_options})" run_cmd="(cd ${work_dir} && deepspeed --master_port ${master_port} --num_nodes $nodes --num_gpus $gpus pretrain_gpt2.py ${gpt_options})"
echo ${run_cmd} echo ${run_cmd}
eval ${run_cmd} eval ${run_cmd}
......
...@@ -97,6 +97,29 @@ class GPT2CheckpointTestCase(BaseTestCase): ...@@ -97,6 +97,29 @@ class GPT2CheckpointTestCase(BaseTestCase):
succ = self.run_test(test_config, 0.01) succ = self.run_test(test_config, 0.01)
self.assertTrue(succ) self.assertTrue(succ)
def test_mp2_gpu4_node1_with_zero2_offload(self):
test_config = {
"mp": 2,
"gpus": 4,
"nodes": 1,
"bs": 8,
"steps": 1100,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": 256,
"heads": ATTN_HEADS,
"deepspeed": True,
"tag": "ds_zero2_offload",
"zero": True,
"other_args": "",
"checkpoint_name": "ckpt_mp2_gpu8_w_zero2_offload",
"checkpoint_interval": 1000,
"json": "ds_config_func_bs8_zero2_offload.json",
"cpu_optimizer": True,
}
succ = self.run_test(test_config, 0.01)
self.assertTrue(succ)
def test_mp1_gpu2_load_gpu1_node1_with_zero1(self): def test_mp1_gpu2_load_gpu1_node1_with_zero1(self):
test_config = { test_config = {
"mp": 1, "mp": 1,
...@@ -110,7 +133,7 @@ class GPT2CheckpointTestCase(BaseTestCase): ...@@ -110,7 +133,7 @@ class GPT2CheckpointTestCase(BaseTestCase):
"seq_length": 256, "seq_length": 256,
"heads": ATTN_HEADS, "heads": ATTN_HEADS,
"deepspeed": True, "deepspeed": True,
"tag": "ds_zero2", "tag": "ds_zero1",
"zero": True, "zero": True,
"other_args": "", "other_args": "",
"checkpoint_name": "ckpt_mp1_gpu2_gpu1_w_zero1", "checkpoint_name": "ckpt_mp1_gpu2_gpu1_w_zero1",
...@@ -133,7 +156,7 @@ class GPT2CheckpointTestCase(BaseTestCase): ...@@ -133,7 +156,7 @@ class GPT2CheckpointTestCase(BaseTestCase):
"seq_length": 256, "seq_length": 256,
"heads": ATTN_HEADS, "heads": ATTN_HEADS,
"deepspeed": True, "deepspeed": True,
"tag": "ds_zero2", "tag": "ds_zero1",
"zero": True, "zero": True,
"other_args": "", "other_args": "",
"checkpoint_name": "ckpt_mp1_gpu2_gpu4_w_zero1", "checkpoint_name": "ckpt_mp1_gpu2_gpu4_w_zero1",
...@@ -166,6 +189,30 @@ class GPT2CheckpointTestCase(BaseTestCase): ...@@ -166,6 +189,30 @@ class GPT2CheckpointTestCase(BaseTestCase):
succ = self.run_test(test_config, 0.01) succ = self.run_test(test_config, 0.01)
self.assertTrue(succ) self.assertTrue(succ)
def test_mp1_gpu2_load_gpu1_node1_with_zero2_offload(self):
test_config = {
"mp": 1,
"gpus": 2,
"load_gpus": 1,
"nodes": 1,
"bs": 8,
"steps": 1100,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": 256,
"heads": ATTN_HEADS,
"deepspeed": True,
"tag": "ds_zero2_offload",
"zero": True,
"other_args": "",
"checkpoint_name": "ckpt_mp1_gpu2_gpu1_w_zero2_offload",
"checkpoint_interval": 1000,
"json": "ds_config_func_bs8_zero2_offload.json",
"cpu_optimizer": True,
}
succ = self.run_test(test_config, 0.01)
self.assertTrue(succ)
def test_mp1_gpu2_load_gpu4_node1_with_zero2(self): def test_mp1_gpu2_load_gpu4_node1_with_zero2(self):
test_config = { test_config = {
"mp": 1, "mp": 1,
...@@ -189,6 +236,30 @@ class GPT2CheckpointTestCase(BaseTestCase): ...@@ -189,6 +236,30 @@ class GPT2CheckpointTestCase(BaseTestCase):
succ = self.run_test(test_config, 0.01) succ = self.run_test(test_config, 0.01)
self.assertTrue(succ) self.assertTrue(succ)
def test_mp1_gpu2_load_gpu4_node1_with_zero2_offload(self):
test_config = {
"mp": 1,
"gpus": 2,
"load_gpus": 4,
"nodes": 1,
"bs": 8,
"steps": 1100,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": 256,
"heads": ATTN_HEADS,
"deepspeed": True,
"tag": "ds_zero2_offload",
"zero": True,
"other_args": "",
"checkpoint_name": "ckpt_mp1_gpu2_gpu4_w_zero2_offload",
"checkpoint_interval": 1000,
"json": "ds_config_func_bs8_zero2_offload.json",
"cpu_optimizer": True,
}
succ = self.run_test(test_config, 0.01)
self.assertTrue(succ)
def test_mp2_gpu4_load_gpu2_node1_with_zero1(self): def test_mp2_gpu4_load_gpu2_node1_with_zero1(self):
test_config = { test_config = {
"mp": 2, "mp": 2,
...@@ -258,6 +329,30 @@ class GPT2CheckpointTestCase(BaseTestCase): ...@@ -258,6 +329,30 @@ class GPT2CheckpointTestCase(BaseTestCase):
succ = self.run_test(test_config, 0.01) succ = self.run_test(test_config, 0.01)
self.assertTrue(succ) self.assertTrue(succ)
def test_mp2_gpu4_load_gpu2_node1_with_zero2_offload(self):
test_config = {
"mp": 2,
"gpus": 4,
"load_gpus": 2,
"nodes": 1,
"bs": 8,
"steps": 1100,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": 256,
"heads": ATTN_HEADS,
"deepspeed": True,
"tag": "ds_zero2_offload",
"zero": True,
"other_args": "",
"checkpoint_name": "ckpt_mp2_gpu4_gpu2_w_zero2_offload",
"checkpoint_interval": 1000,
"json": "ds_config_func_bs8_zero2_offload.json",
"cpu_optimizer": True,
}
succ = self.run_test(test_config, 0.01)
self.assertTrue(succ)
def test_mp2_gpu2_load_gpu4_node1_with_zero2(self): def test_mp2_gpu2_load_gpu4_node1_with_zero2(self):
test_config = { test_config = {
"mp": 2, "mp": 2,
...@@ -281,6 +376,30 @@ class GPT2CheckpointTestCase(BaseTestCase): ...@@ -281,6 +376,30 @@ class GPT2CheckpointTestCase(BaseTestCase):
succ = self.run_test(test_config, 0.01) succ = self.run_test(test_config, 0.01)
self.assertTrue(succ) self.assertTrue(succ)
def test_mp2_gpu2_load_gpu4_node1_with_zero2_offload(self):
test_config = {
"mp": 2,
"gpus": 2,
"load_gpus": 4,
"nodes": 1,
"bs": 8,
"steps": 1100,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": 256,
"heads": ATTN_HEADS,
"deepspeed": True,
"tag": "ds_zero2_offload",
"zero": True,
"other_args": "",
"checkpoint_name": "ckpt_mp2_gpu2_gpu4_w_zero2_offload",
"checkpoint_interval": 1000,
"json": "ds_config_func_bs8_zero2_offload.json",
"cpu_optimizer": True,
}
succ = self.run_test(test_config, 0.01)
self.assertTrue(succ)
def test_mp2_gpu4_node1_without_zero(self): def test_mp2_gpu4_node1_without_zero(self):
test_config = { test_config = {
"mp": 2, "mp": 2,
...@@ -306,7 +425,8 @@ class GPT2CheckpointTestCase(BaseTestCase): ...@@ -306,7 +425,8 @@ class GPT2CheckpointTestCase(BaseTestCase):
def gen_name(self, test_config, prefix): def gen_name(self, test_config, prefix):
save_dir = "checkpoint_test_logs" save_dir = "checkpoint_test_logs"
tag = test_config["tag"] tag = test_config["tag"]
file_name = f"_{tag}.log" checkpoint_name = test_config["checkpoint_name"]
file_name = f"_{tag}_{checkpoint_name}.log"
return os.path.join(save_dir, prefix + file_name) return os.path.join(save_dir, prefix + file_name)
def run_test(self, test_config, r_tol): def run_test(self, test_config, r_tol):
...@@ -334,10 +454,15 @@ class GPT2CheckpointTestCase(BaseTestCase): ...@@ -334,10 +454,15 @@ class GPT2CheckpointTestCase(BaseTestCase):
except: except:
print("No old checkpoint") print("No old checkpoint")
if "cpu_optimizer" in test_config and test_config["cpu_optimizer"]:
cpu_optimizer_flag = " --cpu-optimizer"
else:
cpu_optimizer_flag = ""
#-----------------Saving Checkpoint-----------------# #-----------------Saving Checkpoint-----------------#
#building checkpoint arguments # building checkpoint arguments
test_config[ test_config[
"other_args"] = f"\"--save {checkpoint_folder} --save-interval {checkpoint_interval}\"" "other_args"] = f"\"--save {checkpoint_folder} --save-interval {checkpoint_interval} {cpu_optimizer_flag}\""
prefix = "gpt2_saving_checkpoint" prefix = "gpt2_saving_checkpoint"
...@@ -356,10 +481,11 @@ class GPT2CheckpointTestCase(BaseTestCase): ...@@ -356,10 +481,11 @@ class GPT2CheckpointTestCase(BaseTestCase):
#-----------------Loading Checkpoint-----------------# #-----------------Loading Checkpoint-----------------#
#building checkpoint arguments # building checkpoint arguments
test_config["other_args"] = f"\"--load {checkpoint_folder}\"" test_config[
"other_args"] = f"\"--load {checkpoint_folder} {cpu_optimizer_flag} \""
#set checkpoint load iteration # set checkpoint load iteration
try: try:
cmd = f"echo {checkpoint_interval} > {checkpoint_name}/latest_checkpointed_iteration.txt" cmd = f"echo {checkpoint_interval} > {checkpoint_name}/latest_checkpointed_iteration.txt"
print(f"{self.id()} running cmd: {cmd}") print(f"{self.id()} running cmd: {cmd}")
...@@ -411,20 +537,32 @@ class GPT2CheckpointTestCase(BaseTestCase): ...@@ -411,20 +537,32 @@ class GPT2CheckpointTestCase(BaseTestCase):
def checkpoint_suite(): def checkpoint_suite():
suite = unittest.TestSuite() suite = unittest.TestSuite()
suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_with_zero1'))
suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_with_zero2')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_with_zero2'))
suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_with_zero2_offload'))
# Shrink DP # Shrink DP
suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero1'))
suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2')) suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2'))
suite.addTest(
GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu1_node1_with_zero2_offload'))
suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero1'))
suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2'))
suite.addTest(
GPT2CheckpointTestCase('test_mp2_gpu4_load_gpu2_node1_with_zero2_offload'))
# Expand DP # Expand DP
suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero1'))
suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2')) suite.addTest(GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2'))
suite.addTest(
GPT2CheckpointTestCase('test_mp1_gpu2_load_gpu4_node1_with_zero2_offload'))
suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero1')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero1'))
suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2'))
suite.addTest(
GPT2CheckpointTestCase('test_mp2_gpu2_load_gpu4_node1_with_zero2_offload'))
suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_without_zero')) suite.addTest(GPT2CheckpointTestCase('test_mp2_gpu4_node1_without_zero'))
......
...@@ -14,11 +14,12 @@ LAYERS = 2 ...@@ -14,11 +14,12 @@ LAYERS = 2
HIDDEN_SIZE = 128 HIDDEN_SIZE = 128
ATTN_HEADS = 8 ATTN_HEADS = 8
SEQ_LEN = 64 SEQ_LEN = 64
MASTER_PORT = 29700
def grep_loss_from_file(file_name): def grep_loss_from_file(file_name):
loss = 0.0 loss = 0.0
print(f'grepping {file_name}')
with open(file_name, 'r') as f: with open(file_name, 'r') as f:
lines = f.readlines() lines = f.readlines()
line_filter = "validation loss at the end of training for test data | LM loss:" line_filter = "validation loss at the end of training for test data | LM loss:"
...@@ -48,6 +49,24 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -48,6 +49,24 @@ class GPT2FuncTestCase(BaseTestCase):
def tearDown(self): def tearDown(self):
os.chdir(self.save_dir) os.chdir(self.save_dir)
def test_mp1_gpu2_node1_fp16(self):
test_config = {
"mp": 1,
"gpus": 2,
"nodes": 1,
"bs": 8,
"steps": 1000,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": SEQ_LEN,
"heads": ATTN_HEADS,
"deepspeed": False,
"json": "ds_config_func_bs8_no_zero.json",
}
succ = self.run_test(test_config, 0.01)
self.assertTrue(succ)
def test_mp1_gpu1_node1_zero1(self): def test_mp1_gpu1_node1_zero1(self):
test_config = { test_config = {
"mp": 1, "mp": 1,
...@@ -171,10 +190,12 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -171,10 +190,12 @@ class GPT2FuncTestCase(BaseTestCase):
"json": "ds_config_func_bs8_zero2.json", "json": "ds_config_func_bs8_zero2.json",
} }
succ = self.run_test(test_config, 0.01) basic_run_config = test_config
succ = self.run_test(basic_run_config, 0.01)
self.assertTrue(succ) self.assertTrue(succ)
succ = self.run_partition_activations_test(test_config, 0.01) partition_activation_config = test_config
succ = self.run_partition_activations_test(partition_activation_config, 0.01)
self.assertTrue(succ) self.assertTrue(succ)
def test_mp4_gpu4_node1_zero2(self): def test_mp4_gpu4_node1_zero2(self):
...@@ -192,10 +213,48 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -192,10 +213,48 @@ class GPT2FuncTestCase(BaseTestCase):
"json": "ds_config_func_bs8_zero2.json", "json": "ds_config_func_bs8_zero2.json",
} }
succ = self.run_test(test_config, 0.01) basic_run_config = test_config
succ = self.run_test(basic_run_config, 0.01)
self.assertTrue(succ) self.assertTrue(succ)
succ = self.run_partition_activations_test(test_config, 0.01) partition_activation_config = test_config
succ = self.run_partition_activations_test(partition_activation_config, 0.01)
self.assertTrue(succ)
def test_mp1_gpu1_node1_zero2_ds_offload(self):
test_config = {
"mp": 1,
"gpus": 1,
"nodes": 1,
"bs": 4,
"steps": 1000,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": SEQ_LEN,
"heads": ATTN_HEADS,
"deepspeed": False,
"json": "ds_config_func_bs4_zero2_offload.json",
"cpu_optimizer": True,
}
succ = self.run_test(test_config, 0.02)
self.assertTrue(succ)
def test_mp1_gpu2_node1_zero2_ds_offload(self):
test_config = {
"mp": 1,
"gpus": 2,
"nodes": 1,
"bs": 8,
"steps": 1000,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": SEQ_LEN,
"heads": ATTN_HEADS,
"deepspeed": False,
"json": "ds_config_func_bs8_zero2_offload.json",
"cpu_optimizer": True,
}
succ = self.run_test(test_config, 0.02)
self.assertTrue(succ) self.assertTrue(succ)
def test_mp2_gpu4_node1_zero2_gas(self): def test_mp2_gpu4_node1_zero2_gas(self):
...@@ -220,6 +279,143 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -220,6 +279,143 @@ class GPT2FuncTestCase(BaseTestCase):
succ = self.run_partition_activations_test(test_config, 0.01) succ = self.run_partition_activations_test(test_config, 0.01)
self.assertTrue(succ) self.assertTrue(succ)
def test_mp2_gpu4_node1_zero2_ds_offload(self):
test_config = {
"mp": 2,
"gpus": 4,
"nodes": 1,
"bs": 8,
"steps": 1000,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": SEQ_LEN,
"heads": ATTN_HEADS,
"deepspeed": False,
"json": "ds_config_func_bs8_zero2_offload.json",
"cpu_optimizer": True,
}
basic_run_config = test_config
succ = self.run_test(basic_run_config, 0.02)
self.assertTrue(succ)
partition_activation_config = test_config
succ = self.run_partition_activations_test(partition_activation_config, 0.02)
self.assertTrue(succ)
def test_mp4_gpu4_node1_zero2_ds_offload(self):
test_config = {
"mp": 4,
"gpus": 4,
"nodes": 1,
"bs": 8,
"steps": 1000,
"layers": LAYERS,
"hidden_size": HIDDEN_SIZE,
"seq_length": SEQ_LEN,
"heads": ATTN_HEADS,
"deepspeed": False,
"json": "ds_config_func_bs8_zero2_offload.json",
"cpu_optimizer": True,
}
basic_run_config = test_config
succ = self.run_test(basic_run_config, 0.02)
self.assertTrue(succ)
partition_activation_config = test_config
succ = self.run_partition_activations_test(partition_activation_config, 0.02)
self.assertTrue(succ)
def test_mp1_gpu1_node1_zero2_torch_offload(self):
    """ZeRO-2 with torch-Adam CPU offload: mp=1, single GPU, single node."""
    run_config = dict(mp=1,
                      gpus=1,
                      nodes=1,
                      bs=4,
                      steps=1000,
                      layers=LAYERS,
                      hidden_size=HIDDEN_SIZE,
                      seq_length=SEQ_LEN,
                      heads=ATTN_HEADS,
                      deepspeed=False,
                      json="ds_config_func_bs4_zero2_offload.json",
                      cpu_optimizer=True,
                      test_torch_offload=True)
    self.assertTrue(self.run_test(run_config, 0.01))
def test_mp1_gpu2_node1_zero2_torch_offload(self):
    """ZeRO-2 with torch-Adam CPU offload: mp=1, 2 GPUs, single node."""
    run_config = dict(mp=1,
                      gpus=2,
                      nodes=1,
                      bs=8,
                      steps=1000,
                      layers=LAYERS,
                      hidden_size=HIDDEN_SIZE,
                      seq_length=SEQ_LEN,
                      heads=ATTN_HEADS,
                      deepspeed=False,
                      json="ds_config_func_bs8_zero2_offload.json",
                      cpu_optimizer=True,
                      test_torch_offload=True)
    self.assertTrue(self.run_test(run_config, 0.01))
def test_mp2_gpu4_node1_zero2_torch_offload(self):
    """ZeRO-2 with torch-Adam CPU offload: mp=2, 4 GPUs, single node.

    Runs the config twice: once plain, once with partition activations.
    """
    run_config = dict(mp=2,
                      gpus=4,
                      nodes=1,
                      bs=8,
                      steps=1000,
                      layers=LAYERS,
                      hidden_size=HIDDEN_SIZE,
                      seq_length=SEQ_LEN,
                      heads=ATTN_HEADS,
                      deepspeed=False,
                      json="ds_config_func_bs8_zero2_offload.json",
                      cpu_optimizer=True,
                      test_torch_offload=True)
    # plain run
    self.assertTrue(self.run_test(run_config, 0.01))
    # same config with activation partitioning enabled
    self.assertTrue(self.run_partition_activations_test(run_config, 0.01))
def test_mp4_gpu4_node1_zero2_torch_offload(self):
    """ZeRO-2 with torch-Adam CPU offload: mp=4, 4 GPUs, single node.

    Runs the config twice: once plain, once with partition activations.
    """
    test_config = {
        "mp": 4,
        "gpus": 4,
        "nodes": 1,
        "bs": 8,
        "steps": 1000,
        "layers": LAYERS,
        "hidden_size": HIDDEN_SIZE,
        "seq_length": SEQ_LEN,
        "heads": ATTN_HEADS,
        "deepspeed": False,
        "json": "ds_config_func_bs8_zero2_offload.json",
        "cpu_optimizer": True,
        "test_torch_offload": True,
    }
    basic_run_config = test_config
    succ = self.run_test(basic_run_config, 0.01)
    self.assertTrue(succ)
    partition_activation_config = test_config
    succ = self.run_partition_activations_test(partition_activation_config, 0.01)
    # Fix: this result was previously computed but never asserted, so a
    # partition-activations failure would pass silently.
    self.assertTrue(succ)
def test_optimizer_scheduler(self): def test_optimizer_scheduler(self):
test_config = { test_config = {
"mp": 1, "mp": 1,
...@@ -248,6 +444,7 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -248,6 +444,7 @@ class GPT2FuncTestCase(BaseTestCase):
deepspeed_config = test_config["json"] deepspeed_config = test_config["json"]
baseline_deepspeed_config = False baseline_deepspeed_config = False
cpu_optimizer_flag = self.gen_cpu_optimizer_flag(test_config, True)
# baseline run... # baseline run...
# turnoff deepspeed if baseline deepspeed config # turnoff deepspeed if baseline deepspeed config
...@@ -259,6 +456,7 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -259,6 +456,7 @@ class GPT2FuncTestCase(BaseTestCase):
baseline_prefix += test_config["json"][0:-5] baseline_prefix += test_config["json"][0:-5]
baseline_deepspeed_config = True baseline_deepspeed_config = True
test_config["other_args"] = f"\"{cpu_optimizer_flag}\""
base_file = self.gen_output_name(test_config, base_file = self.gen_output_name(test_config,
baseline_prefix, baseline_prefix,
baseline_config=baseline_deepspeed_config) baseline_config=baseline_deepspeed_config)
...@@ -272,8 +470,11 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -272,8 +470,11 @@ class GPT2FuncTestCase(BaseTestCase):
# DeepSpeed run... # DeepSpeed run...
test_config["deepspeed"] = True test_config["deepspeed"] = True
test_config["other_args"] = "--deepspeed-activation-checkpointing" cpu_optimizer_flag = self.gen_cpu_optimizer_flag(test_config, False)
test_config[
"other_args"] = f"\"--deepspeed-activation-checkpointing {cpu_optimizer_flag}\""
test_config["json"] = deepspeed_config test_config["json"] = deepspeed_config
print("{0}: DeepSpeed run.".format(self.id())) print("{0}: DeepSpeed run.".format(self.id()))
test_file = self.gen_output_name(test_config, prefix) test_file = self.gen_output_name(test_config, prefix)
self.run_gpt2_test(test_config, test_file) self.run_gpt2_test(test_config, test_file)
...@@ -289,6 +490,7 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -289,6 +490,7 @@ class GPT2FuncTestCase(BaseTestCase):
deepspeed_config = test_config["json"] deepspeed_config = test_config["json"]
baseline_deepspeed_config = False baseline_deepspeed_config = False
cpu_optimizer_flag = self.gen_cpu_optimizer_flag(test_config, True)
# baseline run... # baseline run...
# turn off deepspeed if a baseline deepspeed config # turn off deepspeed if a baseline deepspeed config
...@@ -300,6 +502,8 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -300,6 +502,8 @@ class GPT2FuncTestCase(BaseTestCase):
baseline_prefix = prefix + test_config["json"][0:-5] baseline_prefix = prefix + test_config["json"][0:-5]
baseline_deepspeed_config = True baseline_deepspeed_config = True
test_config["other_args"] = f"\"{cpu_optimizer_flag}\""
# baseline run... # baseline run...
base_file = self.gen_output_name(test_config, base_file = self.gen_output_name(test_config,
baseline_prefix, baseline_prefix,
...@@ -314,7 +518,8 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -314,7 +518,8 @@ class GPT2FuncTestCase(BaseTestCase):
# DeepSpeed run... # DeepSpeed run...
test_config["deepspeed"] = True test_config["deepspeed"] = True
test_config["json"] = deepspeed_config cpu_optimizer_flag = self.gen_cpu_optimizer_flag(test_config, False)
test_config["other_args"] = f"\"{cpu_optimizer_flag}\""
print("{0}: DeepSpeed run.".format(self.id())) print("{0}: DeepSpeed run.".format(self.id()))
test_file = self.gen_output_name(test_config, prefix) test_file = self.gen_output_name(test_config, prefix)
...@@ -345,9 +550,40 @@ class GPT2FuncTestCase(BaseTestCase): ...@@ -345,9 +550,40 @@ class GPT2FuncTestCase(BaseTestCase):
return True return True
def gen_cpu_optimizer_flag(self, test_config, is_baseline):
    """Build the command-line flag string for CPU-optimizer runs.

    Returns "" when the config does not request a CPU optimizer.
    Otherwise returns "--cpu-optimizer", additionally appending
    "--cpu_torch_adam" for baseline runs or when the config sets
    "test_torch_offload".
    """
    if not test_config.get('cpu_optimizer'):
        return ""
    flags = ["--cpu-optimizer"]
    # Baselines always use torch's Adam; DeepSpeed runs only do so when
    # the config explicitly tests torch offload.
    if is_baseline or test_config.get('test_torch_offload'):
        flags.append("--cpu_torch_adam")
    return " ".join(flags)
def suite(): def suite():
suite = unittest.TestSuite() suite = unittest.TestSuite()
suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_fp16'))
# Baseline = Megatron + Torch.Optim.Adam
# Test = Megatron + Torch.Optim.Adam + ZeRO-Offload
suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2_torch_offload'))
suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2_torch_offload'))
suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2_torch_offload'))
suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2_torch_offload'))
# Baseline = Megatron + Torch.Optim.Adam
# Test = Megatron + DeepSpeedAdam + ZeRO-Offload
suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero2_ds_offload'))
suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero2_ds_offload'))
suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero2_ds_offload'))
suite.addTest(GPT2FuncTestCase('test_mp4_gpu4_node1_zero2_ds_offload'))
suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero1')) suite.addTest(GPT2FuncTestCase('test_mp1_gpu1_node1_zero1'))
suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero1')) suite.addTest(GPT2FuncTestCase('test_mp1_gpu2_node1_zero1'))
suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero1')) suite.addTest(GPT2FuncTestCase('test_mp2_gpu4_node1_zero1'))
......
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam
import time

# Micro-benchmark: average wall-clock time of one DeepSpeedCPUAdam.step()
# over two CPU parameter groups (one ~1B elements, one small).
device = 'cpu'
model_size = 1 * 1024**3
group_size = [model_size, 274432]

params = [torch.nn.Parameter(torch.ones(size, device=device)) for size in group_size]
optimizer = DeepSpeedCPUAdam(params)
#torch.set_num_threads(128)

# Seed gradients before the first step.
for idx, p in enumerate(params):
    p.grad = torch.ones(group_size[idx], device=device)
#param.grad = torch.ones(model_size, device=device)

num_iters = 100
total_time = 0.0
for _ in range(num_iters):
    t0 = time.time()
    optimizer.step()
    total_time += time.time() - t0
    # Refresh gradients (untimed) so each step sees a new value.
    for idx, p in enumerate(params):
        p.grad = torch.ones(group_size[idx], device=device) * 2
    #param.grad = torch.ones(model_size, device=device) * 2

print("Elapsed Time is ", total_time / num_iters)
import torch
from deepspeed.ops.adam import DeepSpeedCPUAdam
import time

# Micro-benchmark: average wall-clock time of one DeepSpeedCPUAdam.step()
# over a ~1B-element CPU parameter, with an fp16 copy kept on GPU 0.
device = 'cpu'
model_size = 1 * 1024**3
param = torch.nn.Parameter(torch.ones(model_size, device=device))
param_fp16 = torch.nn.Parameter(
    torch.ones(model_size, dtype=torch.half, device='cuda:0'))

optimizer = DeepSpeedCPUAdam([param])
#torch.set_num_threads(128)
param.grad = torch.ones(model_size, device=device)

num_iters = 100
total_time = 0.0
for _ in range(num_iters):
    t0 = time.time()
    optimizer.step(fp16_param_groups=[param_fp16])
    total_time += time.time() - t0
    # Refresh the gradient (untimed) so each step sees a new value.
    param.grad = torch.ones(model_size, device=device) * 2

print("Elapsed Time is ", total_time / num_iters)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment