Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
deepspeed
Commits
98f5131b
"src/diffusers/commands/diffusers_cli.py" did not exist on "27d11a0094e292a8d790714d1b5cdf5e9186814d"
Commit
98f5131b
authored
Feb 03, 2020
by
Elton Zheng
Browse files
add test model Megatron_GPT2
parent
3bc21cd0
Changes
15
Hide whitespace changes
Inline
Side-by-side
Showing
15 changed files
with
1112 additions
and
0 deletions
+1112
-0
tests/model/Megatron_GPT2/__init__.py
tests/model/Megatron_GPT2/__init__.py
+8
-0
tests/model/Megatron_GPT2/ds_config_func_bs4.json
tests/model/Megatron_GPT2/ds_config_func_bs4.json
+21
-0
tests/model/Megatron_GPT2/ds_config_func_bs8.json
tests/model/Megatron_GPT2/ds_config_func_bs8.json
+21
-0
tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json
tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json
+21
-0
tests/model/Megatron_GPT2/ds_config_func_scheduler.json
tests/model/Megatron_GPT2/ds_config_func_scheduler.json
+30
-0
tests/model/Megatron_GPT2/ds_config_perf_bs16.json
tests/model/Megatron_GPT2/ds_config_perf_bs16.json
+22
-0
tests/model/Megatron_GPT2/ds_config_perf_bs32.json
tests/model/Megatron_GPT2/ds_config_perf_bs32.json
+22
-0
tests/model/Megatron_GPT2/ds_config_perf_bs8.json
tests/model/Megatron_GPT2/ds_config_perf_bs8.json
+22
-0
tests/model/Megatron_GPT2/ds_gpt2_test.sh
tests/model/Megatron_GPT2/ds_gpt2_test.sh
+100
-0
tests/model/Megatron_GPT2/run_checkpoint_test.py
tests/model/Megatron_GPT2/run_checkpoint_test.py
+200
-0
tests/model/Megatron_GPT2/run_func_test.py
tests/model/Megatron_GPT2/run_func_test.py
+230
-0
tests/model/Megatron_GPT2/run_perf_baseline.py
tests/model/Megatron_GPT2/run_perf_baseline.py
+133
-0
tests/model/Megatron_GPT2/run_perf_test.py
tests/model/Megatron_GPT2/run_perf_test.py
+137
-0
tests/model/Megatron_GPT2/test_common.py
tests/model/Megatron_GPT2/test_common.py
+98
-0
tests/model/run_sanity_check.py
tests/model/run_sanity_check.py
+47
-0
No files found.
tests/model/Megatron_GPT2/__init__.py
0 → 100644
View file @
98f5131b
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
from
.run_func_test
import
GPT2FuncTestCase
from
.run_checkpoint_test
import
GPT2CheckpointTestCase
,
checkpoint_suite
from
.run_func_test
import
suite
tests/model/Megatron_GPT2/ds_config_func_bs4.json
0 → 100644
View file @
98f5131b
{
"train_batch_size"
:
4
,
"gradient_accumulation_steps"
:
1
,
"steps_per_print"
:
1
,
"zero_optimization"
:
true
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
0.00015
,
"max_grad_norm"
:
1.0
}
},
"fp16"
:
{
"enabled"
:
true
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
}
}
tests/model/Megatron_GPT2/ds_config_func_bs8.json
0 → 100644
View file @
98f5131b
{
"train_batch_size"
:
8
,
"gradient_accumulation_steps"
:
1
,
"steps_per_print"
:
1
,
"zero_optimization"
:
true
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
0.00015
,
"max_grad_norm"
:
1.0
}
},
"fp16"
:
{
"enabled"
:
true
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
}
}
tests/model/Megatron_GPT2/ds_config_func_bs8_no_zero.json
0 → 100644
View file @
98f5131b
{
"train_batch_size"
:
8
,
"gradient_accumulation_steps"
:
1
,
"steps_per_print"
:
1
,
"zero_optimization"
:
false
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
0.00015
,
"max_grad_norm"
:
1.0
}
},
"fp16"
:
{
"enabled"
:
true
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
}
}
tests/model/Megatron_GPT2/ds_config_func_scheduler.json
0 → 100644
View file @
98f5131b
{
"train_batch_size"
:
4
,
"gradient_accumulation_steps"
:
1
,
"steps_per_print"
:
1
,
"zero_optimization"
:
true
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
0.00015
,
"max_grad_norm"
:
1.0
}
},
"scheduler"
:
{
"type"
:
"WarmupLR"
,
"params"
:
{
"warmup_min_lr"
:
0
,
"warmup_max_lr"
:
0.001
,
"warmup_num_steps"
:
10
}
},
"fp16"
:
{
"enabled"
:
true
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
}
}
tests/model/Megatron_GPT2/ds_config_perf_bs16.json
0 → 100644
View file @
98f5131b
{
"train_batch_size"
:
16
,
"gradient_accumulation_steps"
:
1
,
"steps_per_print"
:
1
,
"zero_optimization"
:
true
,
"disable_allgather"
:
true
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
0.00015
,
"max_grad_norm"
:
1.0
}
},
"fp16"
:
{
"enabled"
:
true
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
}
}
tests/model/Megatron_GPT2/ds_config_perf_bs32.json
0 → 100644
View file @
98f5131b
{
"train_batch_size"
:
32
,
"gradient_accumulation_steps"
:
1
,
"steps_per_print"
:
1
,
"zero_optimization"
:
true
,
"disable_allgather"
:
true
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
0.00015
,
"max_grad_norm"
:
1.0
}
},
"fp16"
:
{
"enabled"
:
true
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
}
}
tests/model/Megatron_GPT2/ds_config_perf_bs8.json
0 → 100644
View file @
98f5131b
{
"train_batch_size"
:
8
,
"gradient_accumulation_steps"
:
1
,
"steps_per_print"
:
1
,
"zero_optimization"
:
true
,
"disable_allgather"
:
true
,
"optimizer"
:
{
"type"
:
"Adam"
,
"params"
:
{
"lr"
:
0.00015
,
"max_grad_norm"
:
1.0
}
},
"fp16"
:
{
"enabled"
:
true
,
"loss_scale"
:
0
,
"loss_scale_window"
:
1000
,
"hysteresis"
:
2
,
"min_loss_scale"
:
1
}
}
tests/model/Megatron_GPT2/ds_gpt2_test.sh
0 → 100755
View file @
98f5131b
#! /bin/bash

# Launcher for Megatron GPT2 test runs, optionally under DeepSpeed.
# Parses model/cluster sizing flags, assembles the pretrain_gpt2.py
# command line, and runs it from the Megatron-LM examples directory.

# Print usage and exit; called when required flags are missing or unknown.
helpFunction()
{
    echo ""
    echo "Usage: $0 -m model-parallelism -g gpu-per-node -n node# -b batch-size -s stpes -l layers -h hidden_size -q seq_length -e heads -c ckpt_num_layers [-d]"
    echo -e "\t-m model parallelism"
    echo -e "\t-g gpus per node"
    echo -e "\t-n node count"
    echo -e "\t-b batch size"
    echo -e "\t-s training steps"
    echo -e "\t-l layers"
    echo -e "\t-h hidden size"
    echo -e "\t-q sequence length"
    echo -e "\t-e attention heads"
    echo -e "\t-c checkpoint num_layers"
    echo -e "\t-o other args"
    echo -e "\t-d DeepSpeed config json file"
    echo -e "\t-z Enable Zero optimization"
    exit 1
}

# Defaults for the optional sizing flags.
layers=24
hidden_size=1024
seq_length=1024
ckpt_num_layers=1
other_args=""
ds_opt=""
zero_opt=""

# Resolve this script's directory so -d config paths are script-relative.
script_path=$(realpath $0)
script_dir=$(dirname $script_path)

while getopts "m:g:n:b:s:l:h:q:e:c:o:d:z" opt
do
    case "$opt" in
        m ) mp="$OPTARG" ;;
        g ) gpus="$OPTARG" ;;
        n ) nodes="$OPTARG" ;;
        b ) bs="$OPTARG" ;;
        s ) steps="$OPTARG" ;;
        l ) layers="$OPTARG" ;;
        h ) hidden_size="$OPTARG" ;;
        q ) seq_length="$OPTARG" ;;
        e ) heads="$OPTARG" ;;
        c ) ckpt_num_layers="$OPTARG" ;;
        o ) other_args="$OPTARG" ;;
        d ) ds_opt="--deepspeed --deepspeed_config $script_dir/$OPTARG" ;;
        z ) zero_opt="--zero_optimization" ;;
        ? ) helpFunction ;;
    esac
done

# Print helpFunction in case parameters are empty
if [ -z "$mp" ] || [ -z "$gpus" ] || [ -z "$nodes" ] || [ -z "$bs" ] || [ -z "$steps" ]
then
    echo "Some or all of the parameters are empty";
    helpFunction
fi

# Change for multinode config
MASTER_ADDR=localhost
MASTER_PORT=6000

# Full pretrain_gpt2.py argument list; DeepSpeed / ZeRO flags are appended
# only when -d / -z were supplied (they expand to "" otherwise).
gpt_options=" \
       --model-parallel-size ${mp} \
       --num-layers ${layers} \
       --hidden-size ${hidden_size} \
       --num-attention-heads ${heads} \
       --batch-size ${bs} \
       --seq-length ${seq_length} \
       --max-position-embeddings ${seq_length} \
       --train-iters ${steps} \
       --train-data webtext \
       --lazy-loader \
       --tokenizer-type GPT2BPETokenizer \
       --split 949,50,1 \
       --distributed-backend nccl \
       --lr 0.00015 \
       --no-load-optim \
       --lr-decay-style cosine \
       --weight-decay 1e-2 \
       --clip-grad 1.0 \
       --warmup .01 \
       --checkpoint-activations \
       --checkpoint-num-layers ${ckpt_num_layers} \
       --fp16 \
       --log-interval 1 \
       ${other_args} \
       ${ds_opt} \
       ${zero_opt} \
"

work_dir="../../../examples/Megatron-LM/"
# Comma-separated local GPU indices 0..gpus-1 for the deepspeed.pt -i flag.
include_str=`seq 0 $(($gpus - 1)) | paste -sd "," -`
run_cmd="(cd ${work_dir} && deepspeed.pt -i localhost:${include_str} pretrain_gpt2.py ${gpt_options})"
echo ${run_cmd}
eval ${run_cmd}

set +x
tests/model/Megatron_GPT2/run_checkpoint_test.py
0 → 100644
View file @
98f5131b
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import
unittest
import
subprocess
import
os
import
time
import
re
from
.test_common
import
BaseTestCase
def grep_loss_from_file(file_name):
    """Return the final validation LM loss parsed from a Megatron log file.

    Scans *file_name* for the end-of-training summary line and extracts the
    numeric value following "LM loss:".

    Args:
        file_name: path of the log file produced by a GPT2 test run.

    Returns:
        The loss as a float, or 0.0 when no matching line was found
        (a warning is printed in that case).
    """
    loss = 0.0
    # Only the end-of-training summary line carries the loss we want.
    line_filter = "validation loss at the end of training for test data | LM loss:"
    # Raw string avoids the invalid-escape warning the original non-raw
    # pattern triggers on modern Python; matches ints, floats, and
    # scientific notation (e.g. "3.25E+00").
    match_number = re.compile(
        r'LM loss: ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)')

    with open(file_name, 'r') as f:
        # Stream line-by-line instead of materializing readlines().
        for line in f:
            if line_filter in line:
                found = match_number.findall(line)
                # Guard against a filter hit whose number fails to parse,
                # which would have raised IndexError in the original.
                if found:
                    loss = float(found[0])

    if loss == 0.0:
        print("no loss found in file ", file_name)
    return loss
class GPT2CheckpointTestCase(BaseTestCase):
    """Checkpoint save/load parity tests for the Megatron GPT2 model.

    Each test trains a small model while saving checkpoints, restarts from
    the saved checkpoint, and verifies the final loss of the resumed run
    matches the uninterrupted run within a relative tolerance.
    """

    def __init__(self, methodName="DeepSpeed function test on GPT2 model"):
        super(GPT2CheckpointTestCase, self).__init__(methodName)

    def setUp(self):
        # Run from this file's directory so relative paths (ds configs,
        # ds_gpt2_test.sh) resolve; the cwd is restored in tearDown.
        self.save_dir = os.getcwd()
        new_dir = os.path.dirname(__file__)
        if new_dir:
            os.chdir(new_dir)

    def tearDown(self):
        os.chdir(self.save_dir)

    def test_mp4_gpu16_node1_with_zero(self):
        test_config = {
            "mp": 2,
            "gpus": 4,
            "nodes": 1,
            "bs": 8,
            "steps": 1100,
            "layers": 2,
            "hidden_size": 256,
            "seq_length": 256,
            "heads": 16,
            "deepspeed": True,
            "tag": "ds_zero",
            "zero": True,
            "other_args": "",
            "checkpoint_name": "ckpt_mp4_gpu16_w_zero",
            "checkpoint_interval": 1000,
            "json": "ds_config_func_bs8.json",
        }
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_mp4_gpu16_node1_without_zero(self):
        test_config = {
            "mp": 2,
            "gpus": 4,
            "nodes": 1,
            "bs": 8,
            "steps": 1100,
            "layers": 2,
            "hidden_size": 256,
            "seq_length": 256,
            "heads": 16,
            "deepspeed": True,
            "zero": False,
            "other_args": "",
            "tag": "ds_without_zero",
            "checkpoint_name": "ckpt_mp4_gpu16_wo_zero",
            "checkpoint_interval": 1000,
            "json": "ds_config_func_bs8_no_zero.json",
        }
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def gen_name(self, test_config, prefix):
        """Build the log-file path for a run, keyed by the config's tag."""
        save_dir = "checkpoint_test_logs"
        tag = test_config["tag"]
        file_name = f"_{tag}.log"
        return os.path.join(save_dir, prefix + file_name)

    def run_test(self, test_config, r_tol):
        """Run a save-then-load checkpoint cycle.

        Returns True when the resumed run's loss matches the saving run's
        loss within relative tolerance *r_tol*.
        """
        print("\n")
        print("{0}: starting......".format(self.id()))

        # save to current directory.
        checkpoint_folder = test_config["checkpoint_name"]
        checkpoint_interval = test_config["checkpoint_interval"]
        checkpoint_name = test_config["checkpoint_name"]

        #---------------remove old checkpoint---------------#
        try:
            cmd = f"rm -rf {checkpoint_name}"
            print(f"{self.id()} cmd: {cmd}")
            # BUG FIX: the original passed stdout=f/stderr=f with `f`
            # undefined; the resulting NameError was swallowed by a bare
            # except, so the stale checkpoint was never actually removed.
            subprocess.run(cmd,
                           shell=True,
                           check=False,
                           executable='/bin/bash')
        except Exception:
            print("No old checkpoint")

        #-----------------Saving Checkpoint-----------------#
        # building checkpoint arguments
        test_config[
            "other_args"] = f"\"--save {checkpoint_folder} --save-interval {checkpoint_interval}\""

        prefix = "gpt2_saving_checkpoint"

        # create checkpoint run...
        base_file = self.gen_name(test_config, prefix)

        # remove previous test log
        try:
            cmd = f"rm {base_file}"
            subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
        except Exception:
            print(f"{self.id()} No old logs")

        print("{0}: Run for saving checkpoint".format(self.id()))
        self.run_gpt2_test(test_config, base_file)

        #-----------------Loading Checkpoint-----------------#
        # building checkpoint arguments
        test_config["other_args"] = f"\"--load {checkpoint_folder}\""

        # set checkpoint load iteration so Megatron resumes from the save
        try:
            cmd = f"echo {checkpoint_interval} > {checkpoint_name}/latest_checkpointed_iteration.txt"
            print(f"{self.id()} running cmd: {cmd}")
            subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
        except Exception:
            print(f"{self.id()} Failed to update the checkpoint iteration file")
            return False

        prefix = "gpt2_loading_checkpoint"
        print("{0}: Second run loading checkpoint and continuing.".format(
            self.id()))
        test_file = self.gen_name(test_config, prefix)

        # remove previous test log
        try:
            cmd = f"rm {test_file}"
            subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
        except Exception:
            print(f"{self.id()} no previous logs for")

        self.run_gpt2_test(test_config, test_file)
        return self.check_parity(base_file, test_file, r_tol)

    def has_loss_data(self, file_name):
        """True if *file_name* exists and contains a non-zero LM loss."""
        has_loss = False
        if os.path.exists(file_name):
            loss = grep_loss_from_file(file_name)
            if loss != 0.0:
                has_loss = True
        return has_loss

    def check_parity(self, base_file, test_file, r_tol):
        """Compare the two runs' losses; True when the relative difference
        is within *r_tol*. Fails if either loss is missing (0.0)."""
        base_loss = grep_loss_from_file(base_file)
        test_loss = grep_loss_from_file(test_file)
        print("baseline loss: {0}, test loss: {1}".format(base_loss, test_loss))
        if base_loss == 0.0 or test_loss == 0.0:
            return False
        if abs((base_loss - test_loss) / base_loss) > r_tol:
            return False
        return True
def checkpoint_suite():
    """Assemble the checkpoint-parity tests in a fixed order."""
    tests = unittest.TestSuite()
    for case_name in ('test_mp4_gpu16_node1_with_zero',
                      'test_mp4_gpu16_node1_without_zero'):
        tests.addTest(GPT2CheckpointTestCase(case_name))
    return tests


if __name__ == '__main__':
    # Stop at the first failure: later cases depend on working checkpoints.
    runner = unittest.TextTestRunner(failfast=True)
    runner.run(checkpoint_suite())
tests/model/Megatron_GPT2/run_func_test.py
0 → 100755
View file @
98f5131b
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import
unittest
import
subprocess
import
os
import
time
import
re
from
.test_common
import
BaseTestCase
def grep_loss_from_file(file_name):
    """Return the final validation LM loss parsed from a Megatron log file.

    Scans *file_name* for the end-of-training summary line and extracts the
    numeric value following "LM loss:".

    Args:
        file_name: path of the log file produced by a GPT2 test run.

    Returns:
        The loss as a float, or 0.0 when no matching line was found
        (a warning is printed in that case).
    """
    loss = 0.0
    # Only the end-of-training summary line carries the loss we want.
    line_filter = "validation loss at the end of training for test data | LM loss:"
    # Raw string avoids the invalid-escape warning the original non-raw
    # pattern triggers on modern Python; matches ints, floats, and
    # scientific notation (e.g. "3.25E+00").
    match_number = re.compile(
        r'LM loss: ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)')

    with open(file_name, 'r') as f:
        # Stream line-by-line instead of materializing readlines().
        for line in f:
            if line_filter in line:
                found = match_number.findall(line)
                # Guard against a filter hit whose number fails to parse,
                # which would have raised IndexError in the original.
                if found:
                    loss = float(found[0])

    if loss == 0.0:
        print("no loss found in file ", file_name)
    return loss
class GPT2FuncTestCase(BaseTestCase):
    """Functional parity tests for the Megatron GPT2 model under DeepSpeed.

    Each test runs a Megatron-only baseline (reused if its log already
    exists) and a DeepSpeed run with the same configuration, then checks
    the two final losses agree within a relative tolerance.
    """

    def __init__(self, methodName="DeepSpeed function test on GPT2 model"):
        super(GPT2FuncTestCase, self).__init__(methodName)

    def setUp(self):
        # Run from this file's directory so relative paths (ds configs,
        # ds_gpt2_test.sh) resolve; the cwd is restored in tearDown.
        self.save_dir = os.getcwd()
        new_dir = os.path.dirname(__file__)
        if new_dir:
            os.chdir(new_dir)

    def tearDown(self):
        os.chdir(self.save_dir)

    def test_mp1_gpu1_node1(self):
        test_config = {
            "mp": 1,
            "gpus": 1,
            "nodes": 1,
            "bs": 4,
            "steps": 1000,
            "layers": 12,
            "hidden_size": 768,
            "seq_length": 256,
            "heads": 12,
            "deepspeed": False,
            "json": "ds_config_func_bs4.json",
        }
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_mp1_gpu2_node1(self):
        test_config = {
            "mp": 1,
            "gpus": 2,
            "nodes": 1,
            "bs": 8,
            "steps": 1000,
            "layers": 12,
            "hidden_size": 768,
            "seq_length": 256,
            "heads": 12,
            "deepspeed": False,
            "json": "ds_config_func_bs8.json",
        }
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_mp2_gpu4_node1(self):
        test_config = {
            "mp": 2,
            "gpus": 4,
            "nodes": 1,
            "bs": 8,
            "steps": 1000,
            "layers": 12,
            "hidden_size": 768,
            "seq_length": 256,
            "heads": 12,
            "deepspeed": False,
            "json": "ds_config_func_bs8.json",
        }
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

        # Same config again, this time exercising partition-activations.
        succ = self.run_partition_activations_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_mp4_gpu4_node1(self):
        test_config = {
            "mp": 4,
            "gpus": 4,
            "nodes": 1,
            "bs": 8,
            "steps": 1000,
            "layers": 12,
            "hidden_size": 768,
            "seq_length": 256,
            "heads": 12,
            "deepspeed": False,
            "json": "ds_config_func_bs8.json",
        }
        succ = self.run_test(test_config, 0.01)
        self.assertTrue(succ)

        # Same config again, this time exercising partition-activations.
        succ = self.run_partition_activations_test(test_config, 0.01)
        self.assertTrue(succ)

    def test_optimizer_scheduler(self):
        test_config = {
            "mp": 1,
            "gpus": 1,
            "nodes": 1,
            "bs": 4,
            "steps": 20,
            "layers": 12,
            "hidden_size": 768,
            "seq_length": 256,
            "heads": 12,
            "deepspeed": False,
            "json": "ds_config_func_scheduler.json",
        }
        succ = self.run_test(test_config, 0.01)
        # assure no crash.
        self.assertTrue(True)

    def run_partition_activations_test(self, test_config, r_tol):
        """Run baseline vs DeepSpeed with --partition-activations and
        return True when the final losses agree within *r_tol*."""
        print("\n")
        print("{0}: starting......".format(self.id()))

        prefix = "gpt2_partition_activation_"

        # baseline run...
        test_config["deepspeed"] = False
        base_file = self.gen_output_name(test_config, prefix)

        # skip baseline run if it exists.
        if not self.has_loss_data(base_file):
            print("{0}: baseline run.".format(self.id()))
            self.run_gpt2_test(test_config, base_file)
        else:
            print("{0}: baseline exists.".format(self.id()))

        # DeepSpeed run...
        test_config["deepspeed"] = True
        test_config["other_args"] = "--partition-activations"
        print("{0}: DeepSpeed run.".format(self.id()))
        test_file = self.gen_output_name(test_config, prefix)
        self.run_gpt2_test(test_config, test_file)

        return self.check_parity(base_file, test_file, r_tol)

    def run_test(self, test_config, r_tol):
        """Run baseline vs DeepSpeed with identical settings and return
        True when the final losses agree within *r_tol*."""
        print("\n")
        print("{0}: starting......".format(self.id()))

        prefix = "gpt2_func"

        # baseline run...
        test_config["deepspeed"] = False
        base_file = self.gen_output_name(test_config, prefix)

        # skip baseline run if it exists.
        if not self.has_loss_data(base_file):
            print("{0}: baseline run.".format(self.id()))
            self.run_gpt2_test(test_config, base_file)
        else:
            print("{0}: baseline exists.".format(self.id()))

        # DeepSpeed run...
        test_config["deepspeed"] = True
        print("{0}: DeepSpeed run.".format(self.id()))
        test_file = self.gen_output_name(test_config, prefix)
        self.run_gpt2_test(test_config, test_file)

        return self.check_parity(base_file, test_file, r_tol)

    def has_loss_data(self, file_name):
        """True if *file_name* exists and contains a non-zero LM loss."""
        has_loss = False
        if os.path.exists(file_name):
            loss = grep_loss_from_file(file_name)
            if loss != 0.0:
                has_loss = True
        return has_loss

    def check_parity(self, base_file, test_file, r_tol):
        """Compare the two runs' losses; True when the relative difference
        is within *r_tol*. Fails if either loss is missing (0.0)."""
        base_loss = grep_loss_from_file(base_file)
        test_loss = grep_loss_from_file(test_file)
        print("baseline loss: {0}, test loss: {1}".format(base_loss, test_loss))
        if base_loss == 0.0 or test_loss == 0.0:
            return False
        if abs((base_loss - test_loss) / base_loss) > r_tol:
            return False
        return True
def suite():
    """Assemble the functional tests from smallest to largest topology."""
    tests = unittest.TestSuite()
    for case_name in ('test_mp1_gpu1_node1',
                      'test_mp1_gpu2_node1',
                      'test_mp2_gpu4_node1',
                      'test_mp4_gpu4_node1',
                      'test_optimizer_scheduler'):
        tests.addTest(GPT2FuncTestCase(case_name))
    return tests


if __name__ == '__main__':
    # Stop at the first failure: later cases reuse earlier baselines.
    runner = unittest.TextTestRunner(failfast=True)
    runner.run(suite())
tests/model/Megatron_GPT2/run_perf_baseline.py
0 → 100755
View file @
98f5131b
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import
unittest
import
subprocess
import
os
import
time
import
re
from
test_common
import
BaseTestCase
class GPT2PerfBaselineTestCase(BaseTestCase):
    """Megatron-only (no DeepSpeed) baseline performance runs for GPT2.

    Each test launches a fixed-size training run and reports the average
    per-iteration latency parsed from the run's log file.
    """

    def __init__(self, methodName="DeepSpeed performance test on GPT2 model"):
        super(GPT2PerfBaselineTestCase, self).__init__(methodName)

    def test_perf_1_5B(self):
        test_config = {
            "mp": 2,
            "gpus": 16,
            "nodes": 4,
            "bs": 16,
            "steps": 100,
            "layers": 48,
            "hidden_size": 1600,
            "seq_length": 1024,
            "heads": 16,
            "deepspeed": False,
        }
        self.run_test(test_config)

    def test_perf_4B(self):
        test_config = {
            "mp": 4,
            "gpus": 16,
            "nodes": 4,
            "bs": 8,
            "steps": 100,
            "layers": 64,
            "hidden_size": 2304,
            "seq_length": 1024,
            "heads": 16,
            "deepspeed": False,
        }
        self.run_test(test_config)

    def test_perf_8B(self):
        test_config = {
            "mp": 4,
            "gpus": 16,
            "nodes": 4,
            "bs": 8,
            "steps": 100,
            "layers": 72,
            "hidden_size": 3072,
            "seq_length": 1024,
            "heads": 24,
            "deepspeed": False,
        }
        self.run_test(test_config)

    def test_perf_20B(self):
        test_config = {
            "mp": 16,
            "gpus": 16,
            "nodes": 4,
            "bs": 4,
            "steps": 50,
            "layers": 111,
            "hidden_size": 3808,
            "seq_length": 1024,
            "heads": 32,
            "ckpt_num_layers": 1,
            "deepspeed": False,
        }
        self.run_test(test_config)

    def run_test(self, test_config):
        """Launch the run described by *test_config* and print its average
        per-iteration latency (or a warning when none is found)."""
        print("\n")
        print("{0}: starting......".format(self.id()))
        prefix = "gpt2_perf"

        test_file = self.gen_output_name(test_config, prefix)
        self.run_gpt2_test(test_config, test_file)
        exec_time = self.grep_latency_from_file(test_file)

        if exec_time == 0.0:
            print("{0}: no latency found in file {1}".format(
                self.id(), test_file))
        else:
            print("{0}: execution time per iteration is {1}ms.".format(
                self.id(), exec_time))

    def grep_latency_from_file(self, file_name):
        """Return the mean per-iteration latency in ms reported in
        *file_name*, or 0.0 when no matching line exists."""
        latency = 0.0
        count = 0
        with open(file_name, 'r') as f:
            lines = f.readlines()
            line_filter = "elapsed time per iteration"
            # BUG FIX: raw string — the original non-raw pattern used
            # invalid escape sequences ("\(", "\)") that newer Pythons
            # warn about and may eventually reject.
            match_number = re.compile(
                r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)'
            )

            for line in lines:
                if line_filter in line:
                    ms_per_iter = match_number.findall(line)
                    latency += float(ms_per_iter[0])
                    count += 1

        # Average over all reported iterations.
        if count > 0:
            latency /= count
        return latency
def suite():
    """Assemble the baseline perf runs from smallest to largest model."""
    tests = unittest.TestSuite()
    for case_name in ('test_perf_1_5B', 'test_perf_4B',
                      'test_perf_8B', 'test_perf_20B'):
        tests.addTest(GPT2PerfBaselineTestCase(case_name))
    return tests


if __name__ == '__main__':
    # Stop at the first failure: later, larger runs would waste hours.
    runner = unittest.TextTestRunner(failfast=True)
    runner.run(suite())
tests/model/Megatron_GPT2/run_perf_test.py
0 → 100755
View file @
98f5131b
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import
unittest
import
subprocess
import
os
import
time
import
re
from
test_common
import
BaseTestCase
class GPT2PerfTestCase(BaseTestCase):
    """DeepSpeed-enabled performance runs for the Megatron GPT2 model.

    Each test launches a fixed-size training run under DeepSpeed and
    reports the average per-iteration latency parsed from the log file.
    """

    def __init__(self, methodName="DeepSpeed performance test on GPT2 model"):
        super(GPT2PerfTestCase, self).__init__(methodName)

    def test_perf_1_5B(self):
        test_config = {
            "mp": 1,
            "gpus": 16,
            "nodes": 4,
            "bs": 32,
            "steps": 100,
            "layers": 48,
            "hidden_size": 1600,
            "seq_length": 1024,
            "heads": 16,
            "deepspeed": True,
            "json": "ds_config_perf_bs32.json",
        }
        self.run_test(test_config)

    def test_perf_4B(self):
        test_config = {
            "mp": 1,
            "gpus": 16,
            "nodes": 4,
            "bs": 8,
            "steps": 100,
            "layers": 64,
            "hidden_size": 2304,
            "seq_length": 1024,
            "heads": 16,
            "deepspeed": True,
            "json": "ds_config_perf_bs8.json",
        }
        self.run_test(test_config)

    def test_perf_8B(self):
        test_config = {
            "mp": 2,
            "gpus": 16,
            "nodes": 4,
            "bs": 16,
            "steps": 100,
            "layers": 72,
            "hidden_size": 3072,
            "seq_length": 1024,
            "heads": 24,
            "deepspeed": True,
            "json": "ds_config_perf_bs16.json",
        }
        self.run_test(test_config)

    def test_perf_20B(self):
        test_config = {
            "mp": 4,
            "gpus": 16,
            "nodes": 4,
            "bs": 8,
            "steps": 50,
            "layers": 111,
            "hidden_size": 3808,
            "seq_length": 1024,
            "heads": 32,
            "ckpt_num_layers": 1,
            "deepspeed": True,
            "json": "ds_config_perf_bs8.json",
        }
        self.run_test(test_config)

    def run_test(self, test_config):
        """Launch the run described by *test_config* and print its average
        per-iteration latency (or a warning when none is found)."""
        print("\n")
        print("{0}: starting......".format(self.id()))
        prefix = "gpt2_perf"

        test_file = self.gen_output_name(test_config, prefix)
        self.run_gpt2_test(test_config, test_file)
        exec_time = self.grep_latency_from_file(test_file)

        if exec_time == 0.0:
            print("{0}: no latency found in file {1}".format(
                self.id(), test_file))
        else:
            print("{0}: execution time per iteration is {1}ms.".format(
                self.id(), exec_time))

    def grep_latency_from_file(self, file_name):
        """Return the mean per-iteration latency in ms reported in
        *file_name*, or 0.0 when no matching line exists."""
        latency = 0.0
        count = 0
        with open(file_name, 'r') as f:
            lines = f.readlines()
            line_filter = "elapsed time per iteration"
            # BUG FIX: raw string — the original non-raw pattern used
            # invalid escape sequences ("\(", "\)") that newer Pythons
            # warn about and may eventually reject.
            match_number = re.compile(
                r'elapsed time per iteration \(ms\): ([-+]?[0-9]+\.?[0-9]*(?:[Ee][-+]?[0-9]+)?)'
            )

            for line in lines:
                if line_filter in line:
                    ms_per_iter = match_number.findall(line)
                    latency += float(ms_per_iter[0])
                    count += 1

        # Average over all reported iterations.
        if count > 0:
            latency /= count
        return latency
def suite():
    """Assemble the DeepSpeed perf runs from smallest to largest model."""
    tests = unittest.TestSuite()
    for case_name in ('test_perf_1_5B', 'test_perf_4B',
                      'test_perf_8B', 'test_perf_20B'):
        tests.addTest(GPT2PerfTestCase(case_name))
    return tests


if __name__ == '__main__':
    # Stop at the first failure: later, larger runs would waste hours.
    runner = unittest.TextTestRunner(failfast=True)
    runner.run(suite())
tests/model/Megatron_GPT2/test_common.py
0 → 100755
View file @
98f5131b
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
import
unittest
import
subprocess
import
os
import
time
import
re
class BaseTestCase(unittest.TestCase):
    """Shared plumbing for the Megatron GPT2 model tests.

    Provides log-file naming, directory creation, environment cleanup, and
    the subprocess launch of ds_gpt2_test.sh.
    """

    def __init__(self, methodName="DeepSpeed performance test"):
        super(BaseTestCase, self).__init__(methodName)
        self.test_dir = "./test"          # logs of DeepSpeed runs
        self.baseline_dir = "./baseline"  # logs of Megatron-only baselines
        # One timestamp per test-case instance so DeepSpeed logs of the
        # same config don't overwrite each other across sessions.
        self.timestr = time.strftime("%Y%m%d-%H%M%S")

    def gen_output_name(self, test_config, prefix):
        """Build a log-file path encoding the run configuration.

        DeepSpeed runs are timestamped and kept under test_dir; baselines
        are deterministic names under baseline_dir so they can be reused.
        """
        other_args = test_config[
            "other_args"] if "other_args" in test_config else ""
        zero_args = "_zero" if "zero" in test_config and test_config[
            "zero"] else ""

        # Flatten the extra CLI flags into a filename-safe token.
        other_args = other_args.strip(' -\\').replace(" ", "").replace("\"", "")

        if other_args:
            other_args = "_" + other_args

        if test_config["deepspeed"]:
            file_name = "_mp{0}_gpu{1}_node{2}_bs{3}_step{4}_layer{5}_hidden{6}_seq{7}_head{8}{9}_ds{10}-{11}.log".format(
                test_config["mp"], test_config["gpus"], test_config["nodes"],
                test_config["bs"], test_config["steps"], test_config["layers"],
                test_config["hidden_size"], test_config["seq_length"],
                test_config["heads"], other_args, zero_args, self.timestr)
            save_dir = self.test_dir
        else:
            file_name = "_mp{0}_gpu{1}_node{2}_bs{3}_step{4}_layer{5}_hidden{6}_seq{7}_head{8}{9}.log".format(
                test_config["mp"], test_config["gpus"], test_config["nodes"],
                test_config["bs"], test_config["steps"], test_config["layers"],
                test_config["hidden_size"], test_config["seq_length"],
                test_config["heads"], other_args)
            save_dir = self.baseline_dir

        return os.path.join(save_dir, prefix + file_name)

    def ensure_directory_exists(self, filename):
        """Create the parent directory of *filename* if it is missing."""
        dirname = os.path.dirname(filename)
        # BUG FIX: exist_ok avoids the exists()/makedirs() race of the
        # original, and the dirname guard avoids makedirs('') raising
        # when *filename* has no directory component.
        if dirname:
            os.makedirs(dirname, exist_ok=True)

    def clean_test_env(self):
        """Kill stray python processes on the cluster and let NCCL settle."""
        cmd = "dlts_ssh pkill -9 -f /usr/bin/python"
        print(cmd)
        subprocess.run(cmd, shell=True, check=False, executable='/bin/bash')
        time.sleep(20)

    def run_gpt2_test(self, test_config, output):
        """Launch ds_gpt2_test.sh for *test_config*, logging to *output*."""
        ds_flag = "-d " + test_config["json"] if test_config["deepspeed"] else ""
        ckpt_num = test_config[
            "ckpt_num_layers"] if "ckpt_num_layers" in test_config else 1
        other_args = "-o " + test_config[
            "other_args"] if "other_args" in test_config else ""

        cmd = "./ds_gpt2_test.sh -m {0} -g {1} -n {2} -b {3} -s {4} -l {5} -h {6} -q {7} -e {8} -c {9} {10} {11}".format(
            test_config["mp"], test_config["gpus"], test_config["nodes"],
            test_config["bs"], test_config["steps"], test_config["layers"],
            test_config["hidden_size"], test_config["seq_length"],
            test_config["heads"], ckpt_num, other_args, ds_flag)

        self.ensure_directory_exists(output)
        with open(output, "w") as f:
            print(cmd)
            subprocess.run(cmd,
                           shell=True,
                           check=False,
                           executable='/bin/bash',
                           stdout=f,
                           stderr=f)
tests/model/run_sanity_check.py
0 → 100755
View file @
98f5131b
# coding=utf-8
# Copyright (c) 2019, The Microsoft DeepSpeed Team. All rights reserved.
#
# Note: please copy webtext data to "Megatron-LM" folder, before running this script.
import
sys
import
unittest
sys
.
path
.
append
(
'../examples/Megatron_GPT2'
)
sys
.
path
.
append
(
'../examples/BingBertSquad'
)
sys
.
path
.
append
(
'../examples/QANet-Pytorch'
)
sys
.
path
.
append
(
'../examples/bing_bert'
)
import
os
# Import the test cases here.
import
Megatron_GPT2
import
BingBertSquad
import
bing_bert
def pytest_hack(runner_result):
    """Bridge a unittest runner result into pytest.

    Pytest does not surface failures from unittest suites driven through a
    TextTestRunner, so we dump the failures to stderr ourselves and assert
    on the overall outcome. Long-term, these model tests should be adapted
    to pytest.
    """
    if not runner_result.wasSuccessful():
        print('SUITE UNSUCCESSFUL:', file=sys.stderr)
        for failure in runner_result.failures:
            print(failure, file=sys.stderr)
    assert runner_result.wasSuccessful()  # fail the test
def test_run():
    """Pytest entry point: run every enabled model suite in sequence."""
    test_runner = unittest.TextTestRunner(failfast=True)

    # Add test suites here.
    pytest_hack(test_runner.run(Megatron_GPT2.suite()))
    pytest_hack(test_runner.run(Megatron_GPT2.checkpoint_suite()))
    #pytest_hack(runner.run(BingBertSquad.suite()))
    #pytest_hack(runner.run(bing_bert.checkpoint_suite()))
    #pytest_hack(runner.run(bing_bert.pretrain_suite()))


if __name__ == '__main__':
    test_run()
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment