"src/diffusers/commands/diffusers_cli.py" did not exist on "27d11a0094e292a8d790714d1b5cdf5e9186814d"
Commit 98f5131b authored by Elton Zheng's avatar Elton Zheng
Browse files

add test model Megatron_GPT2

parent 3bc21cd0
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
from .run_func_test import GPT2FuncTestCase
from .run_checkpoint_test import GPT2CheckpointTestCase, checkpoint_suite
from .run_func_test import suite
{
"train_batch_size": 4,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": true,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015,
"max_grad_norm": 1.0
}
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
}
}
{
"train_batch_size": 8,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": true,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015,
"max_grad_norm": 1.0
}
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
}
}
{
"train_batch_size": 8,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": false,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015,
"max_grad_norm": 1.0
}
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
}
}
{
"train_batch_size": 4,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": true,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015,
"max_grad_norm": 1.0
}
},
"scheduler": {
"type": "WarmupLR",
"params": {
"warmup_min_lr": 0,
"warmup_max_lr": 0.001,
"warmup_num_steps": 10
}
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
}
}
{
"train_batch_size": 16,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": true,
"disable_allgather": true,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015,
"max_grad_norm": 1.0
}
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
}
}
{
"train_batch_size": 32,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": true,
"disable_allgather": true,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015,
"max_grad_norm": 1.0
}
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
}
}
{
"train_batch_size": 8,
"gradient_accumulation_steps": 1,
"steps_per_print": 1,
"zero_optimization": true,
"disable_allgather": true,
"optimizer": {
"type": "Adam",
"params": {
"lr": 0.00015,
"max_grad_norm": 1.0
}
},
"fp16": {
"enabled": true,
"loss_scale": 0,
"loss_scale_window": 1000,
"hysteresis": 2,
"min_loss_scale": 1
}
}
#! /bin/bash
# Launch a Megatron-LM GPT-2 pretraining run, optionally under DeepSpeed.
# Invoked by the Python test harness (run_gpt2_test in test_common.py).

helpFunction()
{
   echo ""
   echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s steps -l layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers [-d]"
   echo -e "\t-m model parallelism"
   echo -e "\t-g gpus per node"
   echo -e "\t-n node count"
   echo -e "\t-b batch size"
   echo -e "\t-s training steps"
   echo -e "\t-l layers"
   echo -e "\t-h hidden size"
   echo -e "\t-q sequence length"
   echo -e "\t-e attention heads"
   echo -e "\t-c checkpoint num_layers"
   echo -e "\t-o other args"
   echo -e "\t-d DeepSpeed config json file"
   echo -e "\t-z Enable Zero optimization"
   exit 1
}

# Defaults for the optional model-shape arguments.
layers=24
hidden_size=1024
seq_length=1024
ckpt_num_layers=1
other_args=""
ds_opt=""
zero_opt=""

# Resolve this script's directory so the -d config path can be given
# relative to it.  Quoted in case the path contains spaces.
script_path=$(realpath "$0")
script_dir=$(dirname "$script_path")

while getopts "m:g:n:b:s:l:h:q:e:c:o:d:z" opt
do
   case "$opt" in
      m ) mp="$OPTARG" ;;
      g ) gpus="$OPTARG" ;;
      n ) nodes="$OPTARG" ;;
      b ) bs="$OPTARG" ;;
      s ) steps="$OPTARG" ;;
      l ) layers="$OPTARG" ;;
      h ) hidden_size="$OPTARG" ;;
      q ) seq_length="$OPTARG" ;;
      e ) heads="$OPTARG" ;;
      c ) ckpt_num_layers="$OPTARG" ;;
      o ) other_args="$OPTARG" ;;
      d ) ds_opt="--deepspeed --deepspeed_config $script_dir/$OPTARG" ;;
      z ) zero_opt="--zero_optimization" ;;
      ? ) helpFunction ;;
   esac
done

# Print helpFunction in case required parameters are empty.
if [ -z "$mp" ] || [ -z "$gpus" ] || [ -z "$nodes" ] || [ -z "$bs" ] || [ -z "$steps" ]
then
   echo "Some or all of the parameters are empty";
   helpFunction
fi

# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000

gpt_options=" \
       --model-parallel-size ${mp} \
       --num-layers ${layers} \
       --hidden-size ${hidden_size} \
       --num-attention-heads ${heads} \
       --batch-size ${bs} \
       --seq-length ${seq_length} \
       --max-position-embeddings ${seq_length} \
       --train-iters ${steps} \
       --train-data webtext \
       --lazy-loader \
       --tokenizer-type GPT2BPETokenizer \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.00015 \
       --no-load-optim \
       --lr-decay-style cosine \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --checkpoint-activations \
       --checkpoint-num-layers ${ckpt_num_layers} \
       --fp16 \
       --log-interval 1 \
       ${other_args} \
       ${ds_opt} \
       ${zero_opt} \
"

work_dir="../../../examples/Megatron-LM/"
# GPU list "0,1,...,gpus-1" for the launcher's include string.
include_str=`seq 0 $(( $gpus - 1 )) | paste -sd "," -`
run_cmd="(cd ${work_dir} && deepspeed.pt -i localhost:${include_str} pretrain_gpt2.py ${gpt_options})"
echo ${run_cmd}
eval ${run_cmd}

set +x
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import unittest
import subprocess
import os
import time
import re
from .test_common import BaseTestCase
def grep_loss_from_file(file_name):
    """Extract the end-of-training validation LM loss from a log file.

    Scans ``file_name`` for the summary line printed at the end of training
    and returns the ``LM loss`` value from the last such line found.
    Returns 0.0 (and prints a warning) when no matching line exists.
    """
    loss = 0.0
    line_filter = "validation loss at the end of training for test data | LM loss:"
    # Raw string: "\." in a non-raw literal is an invalid escape sequence
    # (SyntaxWarning on modern Python).
    match_number = re.compile(r'LM loss: ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)')
    with open(file_name, 'r') as f:
        for line in f:
            if line_filter in line:
                loss = float(re.findall(match_number, line)[0])
    if loss == 0.0:
        print("no loss found in file ", file_name)
    return loss
class GPT2CheckpointTestCase(BaseTestCase):
    """Checkpoint save/load parity tests for the Megatron GPT-2 model.

    Each test trains while saving checkpoints, restarts from the saved
    checkpoint, and verifies the final LM loss matches the first run
    within a relative tolerance.
    """
    def __init__(self, methodName="DeepSpeed function test on GPT2 model"):
        super(GPT2CheckpointTestCase, self).__init__(methodName)

    def setUp(self):
        # Run from this file's directory so relative paths (configs,
        # ds_gpt2_test.sh) resolve; restored in tearDown.
        self.save_dir = os.getcwd()
        new_dir = os.path.dirname(__file__)
        if new_dir:
            os.chdir(new_dir)

    def tearDown(self):
        os.chdir(self.save_dir)

    def test_mp4_gpu16_node1_with_zero(self):
        test_config = {
            "mp": 2,
            "gpus": 4,
            "nodes": 1,
            "bs": 8,
            "steps": 1100,
            "layers": 2,
            "hidden_size": 256,
            "seq_length": 256,
            "heads": 16,
            "deepspeed": True,
            "tag": "ds_zero",
            "zero": True,
            "other_args": "",
            "checkpoint_name": "ckpt_mp4_gpu16_w_zero",
            "checkpoint_interval": 1000,
            "json": "ds_config_func_bs8.json",
        }
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_mp4_gpu16_node1_without_zero(self):
        test_config = {
            "mp": 2,
            "gpus": 4,
            "nodes": 1,
            "bs": 8,
            "steps": 1100,
            "layers": 2,
            "hidden_size": 256,
            "seq_length": 256,
            "heads": 16,
            "deepspeed": True,
            "zero": False,
            "other_args": "",
            "tag": "ds_without_zero",
            "checkpoint_name": "ckpt_mp4_gpu16_wo_zero",
            "checkpoint_interval": 1000,
            "json": "ds_config_func_bs8_no_zero.json",
        }
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def gen_name(self, test_config, prefix):
        """Build the log-file path for this run under checkpoint_test_logs/."""
        save_dir = "checkpoint_test_logs"
        tag = test_config["tag"]
        file_name = f"_{tag}.log"
        return os.path.join(save_dir, prefix + file_name)

    def run_test(self, test_config, r_tol):
        """Save-then-load checkpoint round trip; True on loss parity."""
        print("\n")
        print("{0}: starting......".format(self.id()))

        # save to current directory.
        checkpoint_folder = test_config["checkpoint_name"]
        checkpoint_interval = test_config["checkpoint_interval"]
        checkpoint_name = test_config["checkpoint_name"]

        #---------------remove old checkpoint---------------#
        # Bug fix: the original passed stdout=f/stderr=f with `f` undefined,
        # raising a NameError that a bare `except:` silently swallowed, so
        # the old checkpoint was never actually removed.
        cmd = f"rm -rf {checkpoint_name}"
        print(f"{self.id()} cmd: {cmd}")
        subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')

        #-----------------Saving Checkpoint-----------------#
        # building checkpoint arguments
        test_config[
            "other_args"] = f"\"--save {checkpoint_folder} --save-interval {checkpoint_interval}\""

        prefix = "gpt2_saving_checkpoint"
        base_file = self.gen_name(test_config, prefix)

        # remove previous test log; -f suppresses the error when absent
        # (the original wrapped a non-raising check=False call in try/except)
        subprocess.run(f"rm -f {base_file}",
                       shell=True,
                       check=False,
                       executable='/bin/bash')

        print("{0}: Run for saving checkpoint".format(self.id()))
        self.run_gpt2_test(test_config, base_file)

        #-----------------Loading Checkpoint-----------------#
        # building checkpoint arguments
        test_config["other_args"] = f"\"--load {checkpoint_folder}\""

        # set checkpoint load iteration
        cmd = f"echo {checkpoint_interval} > {checkpoint_name}/latest_checkpointed_iteration.txt"
        print(f"{self.id()} running cmd: {cmd}")
        result = subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
        if result.returncode != 0:
            print(f"{self.id()} Failed to update the checkpoint iteration file")
            return False

        prefix = "gpt2_loading_checkpoint"
        print("{0}: Second run loading checkpoint and continuing.".format(self.id()))
        test_file = self.gen_name(test_config, prefix)

        # remove previous test log
        subprocess.run(f"rm -f {test_file}",
                       shell=True,
                       check=False,
                       executable='/bin/bash')

        self.run_gpt2_test(test_config, test_file)

        return self.check_parity(base_file, test_file, r_tol)

    def has_loss_data(self, file_name):
        """True when ``file_name`` exists and contains a nonzero LM loss."""
        has_loss = False
        if os.path.exists(file_name):
            loss = grep_loss_from_file(file_name)
            if loss != 0.0:
                has_loss = True
        return has_loss

    def check_parity(self, base_file, test_file, r_tol):
        """True when both logs report a loss and they agree within r_tol."""
        base_loss = grep_loss_from_file(base_file)
        test_loss = grep_loss_from_file(test_file)
        print("baseline loss: {0}, test loss: {1}".format(base_loss, test_loss))
        if base_loss == 0.0 or test_loss == 0.0:
            return False
        if abs((base_loss - test_loss) / base_loss) > r_tol:
            return False
        return True
def checkpoint_suite():
    """Assemble the checkpoint save/load tests into a unittest suite."""
    tests = unittest.TestSuite()
    for case_name in ('test_mp4_gpu16_node1_with_zero',
                      'test_mp4_gpu16_node1_without_zero'):
        tests.addTest(GPT2CheckpointTestCase(case_name))
    return tests
# Allow running this module directly; failfast stops at the first failure.
if __name__ == '__main__':
    runner = unittest.TextTestRunner(failfast=True)
    runner.run(checkpoint_suite())
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import unittest
import subprocess
import os
import time
import re
from .test_common import BaseTestCase
def grep_loss_from_file(file_name):
    """Extract the end-of-training validation LM loss from a log file.

    Scans ``file_name`` for the summary line printed at the end of training
    and returns the ``LM loss`` value from the last such line found.
    Returns 0.0 (and prints a warning) when no matching line exists.
    """
    loss = 0.0
    line_filter = "validation loss at the end of training for test data | LM loss:"
    # Raw string: "\." in a non-raw literal is an invalid escape sequence
    # (SyntaxWarning on modern Python).
    match_number = re.compile(r'LM loss: ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)')
    with open(file_name, 'r') as f:
        for line in f:
            if line_filter in line:
                loss = float(re.findall(match_number, line)[0])
    if loss == 0.0:
        print("no loss found in file ", file_name)
    return loss
class GPT2FuncTestCase(BaseTestCase):
    """Functional parity tests for Megatron GPT-2.

    Each test trains a small model twice (plain baseline vs. DeepSpeed)
    and checks that the final LM losses agree within a relative tolerance.
    """
    def __init__(self, methodName="DeepSpeed function test on GPT2 model"):
        super(GPT2FuncTestCase, self).__init__(methodName)

    def setUp(self):
        # Run from this file's directory so relative paths (configs,
        # ds_gpt2_test.sh) resolve; restored in tearDown.
        self.save_dir = os.getcwd()
        new_dir = os.path.dirname(__file__)
        if new_dir:
            os.chdir(new_dir)

    def tearDown(self):
        os.chdir(self.save_dir)

    def _base_config(self, **overrides):
        # Shared small-model configuration; the original repeated this dict
        # in every test method.  Keyword arguments override the defaults.
        config = {
            "mp": 1,
            "gpus": 1,
            "nodes": 1,
            "bs": 4,
            "steps": 1000,
            "layers": 12,
            "hidden_size": 768,
            "seq_length": 256,
            "heads": 12,
            "deepspeed": False,
            "json": "ds_config_func_bs4.json",
        }
        config.update(overrides)
        return config

    def test_mp1_gpu1_node1(self):
        test_config = self._base_config()
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_mp1_gpu2_node1(self):
        test_config = self._base_config(gpus=2,
                                        bs=8,
                                        json="ds_config_func_bs8.json")
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_mp2_gpu4_node1(self):
        test_config = self._base_config(mp=2,
                                        gpus=4,
                                        bs=8,
                                        json="ds_config_func_bs8.json")
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

        succ = self.run_partition_activations_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_mp4_gpu4_node1(self):
        test_config = self._base_config(mp=4,
                                        gpus=4,
                                        bs=8,
                                        json="ds_config_func_bs8.json")
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

        succ = self.run_partition_activations_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_optimizer_scheduler(self):
        test_config = self._base_config(steps=20,
                                        json="ds_config_func_scheduler.json")
        self.run_test(test_config, 0.01)
        # Only 20 steps, so no loss parity is expected; reaching this point
        # without crashing is the pass condition.
        self.assertTrue(True)

    def run_partition_activations_test(self, test_config, r_tol):
        """Baseline vs. DeepSpeed run with --partition-activations."""
        print("\n")
        print("{0}: starting......".format(self.id()))

        prefix = "gpt2_partition_activation_"

        # baseline run...
        test_config["deepspeed"] = False
        base_file = self.gen_output_name(test_config, prefix)

        # skip baseline run if it exists.
        if not self.has_loss_data(base_file):
            print("{0}: baseline run.".format(self.id()))
            self.run_gpt2_test(test_config, base_file)
        else:
            print("{0}: baseline exists.".format(self.id()))

        # DeepSpeed run...
        test_config["deepspeed"] = True
        test_config["other_args"] = "--partition-activations"
        print("{0}: DeepSpeed run.".format(self.id()))
        test_file = self.gen_output_name(test_config, prefix)
        self.run_gpt2_test(test_config, test_file)

        return self.check_parity(base_file, test_file, r_tol)

    def run_test(self, test_config, r_tol):
        """Baseline vs. DeepSpeed run; True on loss parity within r_tol."""
        print("\n")
        print("{0}: starting......".format(self.id()))

        prefix = "gpt2_func"

        # baseline run...
        test_config["deepspeed"] = False
        base_file = self.gen_output_name(test_config, prefix)

        # skip baseline run if it exists.
        if not self.has_loss_data(base_file):
            print("{0}: baseline run.".format(self.id()))
            self.run_gpt2_test(test_config, base_file)
        else:
            print("{0}: baseline exists.".format(self.id()))

        # DeepSpeed run...
        test_config["deepspeed"] = True
        print("{0}: DeepSpeed run.".format(self.id()))
        test_file = self.gen_output_name(test_config, prefix)
        self.run_gpt2_test(test_config, test_file)

        return self.check_parity(base_file, test_file, r_tol)

    def has_loss_data(self, file_name):
        """True when ``file_name`` exists and contains a nonzero LM loss."""
        has_loss = False
        if os.path.exists(file_name):
            loss = grep_loss_from_file(file_name)
            if loss != 0.0:
                has_loss = True
        return has_loss

    def check_parity(self, base_file, test_file, r_tol):
        """True when both logs report a loss and they agree within r_tol."""
        base_loss = grep_loss_from_file(base_file)
        test_loss = grep_loss_from_file(test_file)
        print("baseline loss: {0}, test loss: {1}".format(base_loss, test_loss))
        if base_loss == 0.0 or test_loss == 0.0:
            return False
        if abs((base_loss - test_loss) / base_loss) > r_tol:
            return False
        return True
def suite():
    """Assemble the GPT-2 functional tests into a unittest suite."""
    tests = unittest.TestSuite()
    for case_name in ('test_mp1_gpu1_node1',
                      'test_mp1_gpu2_node1',
                      'test_mp2_gpu4_node1',
                      'test_mp4_gpu4_node1',
                      'test_optimizer_scheduler'):
        tests.addTest(GPT2FuncTestCase(case_name))
    return tests
# Allow running this module directly; failfast stops at the first failure.
if __name__ == '__main__':
    runner = unittest.TextTestRunner(failfast=True)
    runner.run(suite())
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import unittest
import subprocess
import os
import time
import re
from test_common import BaseTestCase
class GPT2PerfBaselineTestCase(BaseTestCase):
    """Baseline (no DeepSpeed) performance runs for several GPT-2 sizes.

    Each test launches a short training run and reports the average
    elapsed time per iteration parsed from the training log.
    """
    def __init__(self, methodName="DeepSpeed performance test on GPT2 model"):
        super(GPT2PerfBaselineTestCase, self).__init__(methodName)

    def test_perf_1_5B(self):
        test_config = {
            "mp": 2,
            "gpus": 16,
            "nodes": 4,
            "bs": 16,
            "steps": 100,
            "layers": 48,
            "hidden_size": 1600,
            "seq_length": 1024,
            "heads": 16,
            "deepspeed": False,
        }
        self.run_test(test_config)

    def test_perf_4B(self):
        test_config = {
            "mp": 4,
            "gpus": 16,
            "nodes": 4,
            "bs": 8,
            "steps": 100,
            "layers": 64,
            "hidden_size": 2304,
            "seq_length": 1024,
            "heads": 16,
            "deepspeed": False,
        }
        self.run_test(test_config)

    def test_perf_8B(self):
        test_config = {
            "mp": 4,
            "gpus": 16,
            "nodes": 4,
            "bs": 8,
            "steps": 100,
            "layers": 72,
            "hidden_size": 3072,
            "seq_length": 1024,
            "heads": 24,
            "deepspeed": False,
        }
        self.run_test(test_config)

    def test_perf_20B(self):
        test_config = {
            "mp": 16,
            "gpus": 16,
            "nodes": 4,
            "bs": 4,
            "steps": 50,
            "layers": 111,
            "hidden_size": 3808,
            "seq_length": 1024,
            "heads": 32,
            "ckpt_num_layers": 1,
            "deepspeed": False,
        }
        self.run_test(test_config)

    def run_test(self, test_config):
        """Run one perf config and print the average per-iteration latency."""
        print("\n")
        print("{0}: starting......".format(self.id()))
        prefix = "gpt2_perf"

        test_file = self.gen_output_name(test_config, prefix)
        self.run_gpt2_test(test_config, test_file)
        exec_time = self.grep_latency_from_file(test_file)

        if exec_time == 0.0:
            print("{0}: no latency found in file {1}".format(self.id(), test_file))
        else:
            print("{0}: execution time per iteration is {1}ms.".format(
                self.id(),
                exec_time))

    def grep_latency_from_file(self, file_name):
        """Average the 'elapsed time per iteration (ms)' values in a log.

        Returns 0.0 when the log contains no matching lines.
        """
        latency = 0.0
        count = 0
        line_filter = "elapsed time per iteration"
        # Raw string: "\(" and "\." in a non-raw literal are invalid escape
        # sequences (SyntaxWarning on modern Python).
        match_number = re.compile(
            r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)'
        )
        with open(file_name, 'r') as f:
            lines = f.readlines()
        for line in lines:
            if line_filter in line:
                ms_per_iter = re.findall(match_number, line)
                latency += float(ms_per_iter[0])
                count += 1
        if count > 0:
            latency /= count
        return latency
def suite():
    """Assemble the baseline perf tests into a unittest suite."""
    tests = unittest.TestSuite()
    for case_name in ('test_perf_1_5B',
                      'test_perf_4B',
                      'test_perf_8B',
                      'test_perf_20B'):
        tests.addTest(GPT2PerfBaselineTestCase(case_name))
    return tests
# Allow running this module directly; failfast stops at the first failure.
if __name__ == '__main__':
    runner = unittest.TextTestRunner(failfast=True)
    runner.run(suite())
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import unittest
import subprocess
import os
import time
import re
from test_common import BaseTestCase
class GPT2PerfTestCase(BaseTestCase):
    """DeepSpeed performance runs for several GPT-2 sizes.

    Each test launches a short training run under DeepSpeed and reports
    the average elapsed time per iteration parsed from the training log.
    """
    def __init__(self, methodName="DeepSpeed performance test on GPT2 model"):
        super(GPT2PerfTestCase, self).__init__(methodName)

    def test_perf_1_5B(self):
        test_config = {
            "mp": 1,
            "gpus": 16,
            "nodes": 4,
            "bs": 32,
            "steps": 100,
            "layers": 48,
            "hidden_size": 1600,
            "seq_length": 1024,
            "heads": 16,
            "deepspeed": True,
            "json": "ds_config_perf_bs32.json",
        }
        self.run_test(test_config)

    def test_perf_4B(self):
        test_config = {
            "mp": 1,
            "gpus": 16,
            "nodes": 4,
            "bs": 8,
            "steps": 100,
            "layers": 64,
            "hidden_size": 2304,
            "seq_length": 1024,
            "heads": 16,
            "deepspeed": True,
            "json": "ds_config_perf_bs8.json",
        }
        self.run_test(test_config)

    def test_perf_8B(self):
        test_config = {
            "mp": 2,
            "gpus": 16,
            "nodes": 4,
            "bs": 16,
            "steps": 100,
            "layers": 72,
            "hidden_size": 3072,
            "seq_length": 1024,
            "heads": 24,
            "deepspeed": True,
            "json": "ds_config_perf_bs16.json",
        }
        self.run_test(test_config)

    def test_perf_20B(self):
        test_config = {
            "mp": 4,
            "gpus": 16,
            "nodes": 4,
            "bs": 8,
            "steps": 50,
            "layers": 111,
            "hidden_size": 3808,
            "seq_length": 1024,
            "heads": 32,
            "ckpt_num_layers": 1,
            "deepspeed": True,
            "json": "ds_config_perf_bs8.json",
        }
        self.run_test(test_config)

    def run_test(self, test_config):
        """Run one perf config and print the average per-iteration latency."""
        print("\n")
        print("{0}: starting......".format(self.id()))
        prefix = "gpt2_perf"

        test_file = self.gen_output_name(test_config, prefix)
        self.run_gpt2_test(test_config, test_file)
        exec_time = self.grep_latency_from_file(test_file)

        if exec_time == 0.0:
            print("{0}: no latency found in file {1}".format(self.id(), test_file))
        else:
            print("{0}: execution time per iteration is {1}ms.".format(
                self.id(),
                exec_time))

    def grep_latency_from_file(self, file_name):
        """Average the 'elapsed time per iteration (ms)' values in a log.

        Returns 0.0 when the log contains no matching lines.
        """
        latency = 0.0
        count = 0
        line_filter = "elapsed time per iteration"
        # Raw string: "\(" and "\." in a non-raw literal are invalid escape
        # sequences (SyntaxWarning on modern Python).
        match_number = re.compile(
            r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)'
        )
        with open(file_name, 'r') as f:
            lines = f.readlines()
        for line in lines:
            if line_filter in line:
                ms_per_iter = re.findall(match_number, line)
                latency += float(ms_per_iter[0])
                count += 1
        if count > 0:
            latency /= count
        return latency
def suite():
    """Assemble the DeepSpeed perf tests into a unittest suite."""
    tests = unittest.TestSuite()
    for case_name in ('test_perf_1_5B',
                      'test_perf_4B',
                      'test_perf_8B',
                      'test_perf_20B'):
        tests.addTest(GPT2PerfTestCase(case_name))
    return tests
# Allow running this module directly; failfast stops at the first failure.
if __name__ == '__main__':
    runner = unittest.TextTestRunner(failfast=True)
    runner.run(suite())
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
import unittest
import subprocess
import os
import time
import re
class BaseTestCase(unittest.TestCase):
    """Shared plumbing for the Megatron GPT-2 model tests.

    Provides log-file naming, output-directory creation, environment
    cleanup, and the launcher for ds_gpt2_test.sh.
    """
    def __init__(self, methodName="DeepSpeed performance test"):
        super(BaseTestCase, self).__init__(methodName)
        self.test_dir = "./test"            # logs of DeepSpeed runs
        self.baseline_dir = "./baseline"    # logs of baseline (non-DS) runs
        # Timestamp appended only to DeepSpeed log names so repeated runs
        # do not clobber each other; baseline logs are reused across runs.
        self.timestr = time.strftime("%Y%m%d-%H%M%S")

    def gen_output_name(self, test_config, prefix):
        """Build a log-file path that encodes the run configuration.

        Baseline names omit the timestamp so an existing baseline log can
        be found and reused; DeepSpeed names include it.
        """
        other_args = test_config["other_args"] if "other_args" in test_config else ""
        zero_args = "_zero" if "zero" in test_config and test_config["zero"] else ""

        # Flatten extra CLI args into a filename-safe token.
        other_args = other_args.strip(' -\\').replace(" ", "").replace("\"", "")

        if other_args:
            other_args = "_" + other_args

        if test_config["deepspeed"]:
            file_name = "_mp{0}_gpu{1}_node{2}_bs{3}_step{4}_layer{5}_hidden{6}_seq{7}_head{8}{9}_ds{10}-{11}.log".format(
                test_config["mp"],
                test_config["gpus"],
                test_config["nodes"],
                test_config["bs"],
                test_config["steps"],
                test_config["layers"],
                test_config["hidden_size"],
                test_config["seq_length"],
                test_config["heads"],
                other_args,
                zero_args,
                self.timestr)
            save_dir = self.test_dir
        else:
            file_name = "_mp{0}_gpu{1}_node{2}_bs{3}_step{4}_layer{5}_hidden{6}_seq{7}_head{8}{9}.log".format(
                test_config["mp"],
                test_config["gpus"],
                test_config["nodes"],
                test_config["bs"],
                test_config["steps"],
                test_config["layers"],
                test_config["hidden_size"],
                test_config["seq_length"],
                test_config["heads"],
                other_args)
            save_dir = self.baseline_dir

        return os.path.join(save_dir, prefix + file_name)

    def ensure_directory_exists(self, filename):
        """Create the parent directory of ``filename`` if needed."""
        dirname = os.path.dirname(filename)
        if dirname:
            # exist_ok avoids the check-then-create race of the original
            # os.path.exists() guard; the dirname guard avoids makedirs("").
            os.makedirs(dirname, exist_ok=True)

    def clean_test_env(self):
        """Best-effort kill of stray python processes on the cluster, then
        wait for ports/processes to clear before the next run."""
        cmd = "dlts_ssh pkill -9 -f /usr/bin/python"
        print(cmd)
        subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
        time.sleep(20)

    def run_gpt2_test(self, test_config, output):
        """Launch ds_gpt2_test.sh for ``test_config``, capturing stdout and
        stderr into the ``output`` log file."""
        ds_flag = "-d " + test_config["json"] if test_config["deepspeed"] else ""
        ckpt_num = test_config[
            "ckpt_num_layers"] if "ckpt_num_layers" in test_config else 1
        other_args = "-o " + test_config[
            "other_args"] if "other_args" in test_config else ""

        cmd = "./ds_gpt2_test.sh -m {0} -g {1} -n {2} -b {3} -s {4} -l {5} -h {6} -q {7} -e {8} -c {9} {10} {11}".format(
            test_config["mp"],
            test_config["gpus"],
            test_config["nodes"],
            test_config["bs"],
            test_config["steps"],
            test_config["layers"],
            test_config["hidden_size"],
            test_config["seq_length"],
            test_config["heads"],
            ckpt_num,
            other_args,
            ds_flag)

        self.ensure_directory_exists(output)
        with open(output, "w") as f:
            print(cmd)
            subprocess.run(cmd,
                           shell=True,
                           check=False,
                           executable='/bin/bash',
                           stdout=f,
                           stderr=f)
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import sys
import unittest
sys.path.append('../examples/Megatron_GPT2')
sys.path.append('../examples/BingBertSquad')
sys.path.append('../examples/QANet-Pytorch')
sys.path.append('../examples/bing_bert')
import os
# Import the test cases here.
import Megatron_GPT2
import BingBertSquad
import bing_bert
def pytest_hack(runner_result):
    '''Bridge unittest suite results into pytest.

    pytest does not notice failures inside a unittest.TestResult run via a
    TextTestRunner, so dump the failures to stderr and assert on overall
    success.  Long-term, these model tests should be adapted to pytest.
    '''
    if runner_result.wasSuccessful():
        return
    print('SUITE UNSUCCESSFUL:', file=sys.stderr)
    for fails in runner_result.failures:
        print(fails, file=sys.stderr)
    assert runner_result.wasSuccessful()  # fail the test
def test_run():
    """Run the enabled model test suites under a fail-fast text runner."""
    runner = unittest.TextTestRunner(failfast=True)
    # Add test suites here.
    for build_suite in (Megatron_GPT2.suite, Megatron_GPT2.checkpoint_suite):
        pytest_hack(runner.run(build_suite()))
    #pytest_hack(runner.run(BingBertSquad.suite()))
    #pytest_hack(runner.run(bing_bert.checkpoint_suite()))
    #pytest_hack(runner.run(bing_bert.pretrain_suite()))
# Allow running the model-test driver directly, outside of pytest.
if __name__ == '__main__':
    test_run()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment