OpenDAS / ColossalAI · Commits

Commit f525d1f5 (unverified)
Authored Jan 13, 2023 by ver217; committed by GitHub, Jan 13, 2023
Parent: fef5c949

[example] update gpt gemini example ci test (#2477)

Showing 2 changed files, with 15 additions and 17 deletions:
- examples/language/gpt/gemini/train_gpt_demo.py (+2, -3)
- examples/language/gpt/test_ci.sh (+13, -14)
examples/language/gpt/gemini/train_gpt_demo.py

@@ -65,6 +65,7 @@ def parse_args():
         default="gpt2_medium",
         help="model scale",
     )
+    parser.add_argument("--steps", type=int, default=10, help="num of training steps")
     args = parser.parse_args()
     return args
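The new flag slots into the demo's existing argparse setup: the default of 10 applies when the flag is omitted, while the CI script below overrides it with --steps 4. A standalone sketch of that behavior (not part of the commit):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--steps", type=int, default=10, help="num of training steps")

print(parser.parse_args([]).steps)                  # 10, the default for manual runs
print(parser.parse_args(["--steps", "4"]).steps)    # 4, what test_ci.sh passes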
@@ -236,7 +237,7 @@ def main():
     SEQ_LEN = 1024
     VOCAB_SIZE = 50257
-    NUM_STEPS = 10
+    NUM_STEPS = args.steps
     WARMUP_STEPS = 1
     assert WARMUP_STEPS < NUM_STEPS, "warmup steps should be smaller than the total steps"
     assert (NUM_STEPS - WARMUP_STEPS) % 2 == 1, "the number of valid steps should be odd to take the median"
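The second assertion exists because the demo reports the median per-step time: with an odd number of post-warmup steps, the median is an actual measured sample rather than the mean of the two middle values. A minimal sketch of that measurement logic (the function name and timings are illustrative, not from the repo):

import statistics

def median_step_time(step_times, warmup_steps=1):
    # Drop warmup iterations, whose timings are inflated by one-off setup costs.
    valid = step_times[warmup_steps:]
    assert len(valid) % 2 == 1, "odd count => the median is a real sample"
    return statistics.median(valid)

# 10 steps with 1 warmup step leaves 9 valid samples, satisfying the assert.
print(median_step_time([5.0, 1.2, 1.1, 1.3, 1.15, 1.25, 1.2, 1.1, 1.3, 1.2]))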
@@ -290,14 +291,12 @@ def main():
         from torch.distributed.optim import ZeroRedundancyOptimizer
         optimizer = ZeroRedundancyOptimizer(model.parameters(), optimizer_class=torch.optim.Adam, lr=0.01)
     elif args.distplan.startswith("zero"):
-        pg = ProcessGroup()
         model = model.half()
         partition_flag = (args.distplan == "zero2")
         optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
         optimizer = LowLevelZeroOptimizer(optimizer,
-                                          pg=pg,
                                           reduce_bucket_size=12 * 1024 * 1024,
                                           overlap_communication=True,
                                           partition_grad=partition_flag,
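The retained lines wrap a vanilla torch.optim.Adam in ColossalAI's LowLevelZeroOptimizer, and partition_grad is what separates the two plans: "zero1" partitions only optimizer states (ZeRO stage 1), while "zero2" also partitions gradients across ranks (ZeRO stage 2). A hedged sketch of the same wiring, using only the keyword arguments visible in the hunk; the import path is an assumption that varies across ColossalAI releases, and actually running it requires an initialized ColossalAI distributed context:

import torch
# NOTE: import path assumed for illustration; check the ColossalAI version
# pinned by the example for the real location of LowLevelZeroOptimizer.
from colossalai.zero import LowLevelZeroOptimizer

def build_zero_optimizer(model: torch.nn.Module, distplan: str = "zero2"):
    # "zero2" additionally partitions gradients; "zero1" partitions only
    # optimizer states.
    partition_flag = distplan == "zero2"
    base = torch.optim.Adam(model.parameters(), lr=0.01)
    return LowLevelZeroOptimizer(base,
                                 reduce_bucket_size=12 * 1024 * 1024,  # gradient bucket size in bytes
                                 overlap_communication=True,           # overlap reduction with backward
                                 partition_grad=partition_flag)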
examples/language/gpt/test_ci.sh

 pip install -r requirements.txt

-# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
-export DISTPAN="colossalai"
-
-# The following options only valid when DISTPAN="colossalai"
-export TPDEGREE=2
-export GPUNUM=4
-export PLACEMENT='cpu'
-export USE_SHARD_INIT=False
-export BATCH_SIZE=8
-export MODEL_TYPE="gpt2_medium"
-
-mkdir -p logs
-torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
+# test colossalai
+for TP in 1 2; do
+  for PLACEMENT in "cpu" "cuda" "auto" "const"; do
+    for SHARD in "True" "False"; do
+      colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan colossalai --tp_degree $TP --placement $PLACEMENT --shardinit $SHARD || exit 1
+    done
+  done
+done
+
+# test zero1&2
+for DIST in "zero1" "zero2"; do
+  colossalai run --nproc_per_node=4 ./gemini/train_gpt_demo.py --steps 4 --distplan $DIST || exit 1
+done
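The rewrite replaces the single env-var-driven torchrun invocation with an exhaustive sweep: 2 tensor-parallel degrees x 4 placement policies x 2 shard-init settings = 16 "colossalai" runs, plus one run per low-level ZeRO plan, each capped at 4 steps so CI stays fast. A small sketch (names illustrative, not from the repo) enumerating the same matrix:

from itertools import product

colossalai_runs = [
    {"tp_degree": tp, "placement": pl, "shardinit": sh, "distplan": "colossalai"}
    for tp, pl, sh in product([1, 2], ["cpu", "cuda", "auto", "const"], ["True", "False"])
]
zero_runs = [{"distplan": plan} for plan in ["zero1", "zero2"]]

# 16 colossalai configurations + 2 ZeRO configurations = 18 short runs per CI pass
print(len(colossalai_runs), "+", len(zero_runs), "=", len(colossalai_runs) + len(zero_runs))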