Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
OpenDAS
ColossalAI
Commits
49c601da
Unverified
Commit
49c601da
authored
Dec 29, 2022
by
Jiarui Fang
Committed by
GitHub
Dec 29, 2022
Browse files
[example] add benchmark.sh for gpt (#2226)
parent
3629e611
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
36 additions
and
10 deletions
+36
-10
examples/language/gpt/benchmark.sh
examples/language/gpt/benchmark.sh
+22
-0
examples/language/gpt/model_zoo.py
examples/language/gpt/model_zoo.py
+3
-1
examples/language/gpt/run.sh
examples/language/gpt/run.sh
+7
-7
examples/language/gpt/train_gpt_demo.py
examples/language/gpt/train_gpt_demo.py
+4
-2
No files found.
examples/language/gpt/benchmark.sh
0 → 100644
View file @
49c601da
# Sweep benchmark: iterate over model / batch size / GPU count / tensor-parallel
# degree and launch run.sh once per valid combination.
# run.sh reads its settings (BATCH_SIZE, GPUNUM, TPDEGREE, ...) from the
# environment via ${VAR:-default}, so every loop variable must be passed
# through the environment of the child process — a plain `bash ./run.sh`
# would not see un-exported shell loop variables.
for MODEL_NAME in "GPT2small"
do
  for BATCH_SIZE in 8
  do
    for GPUNUM in 1 2 4 8
    do
      for TPDEGREE in 1 2 4 8
      do
        # Tensor-parallel degree cannot exceed the number of available GPUs.
        if [ ${TPDEGREE} -gt ${GPUNUM} ]
        then
          continue
        fi
        echo "****************** Begin ***************************"
        # Fixed: "benchmrking" typo, and the original printed ${BS}, a
        # variable that was never set (duplicate of BATCH_SIZE).
        echo "* benchmarking MODEL_NAME ${MODEL_NAME} BS ${BATCH_SIZE} GPUNUM ${GPUNUM} TPDEGREE ${TPDEGREE}"
        # Pass the sweep parameters to run.sh through its environment.
        env MODEL_NAME=${MODEL_NAME} BATCH_SIZE=${BATCH_SIZE} GPUNUM=${GPUNUM} TPDEGREE=${TPDEGREE} bash ./run.sh
        echo "****************** Finished ***************************"
        echo ""
        echo ""
      done
    done
  done
done
examples/language/gpt/model_zoo.py
View file @
49c601da
...
@@ -53,7 +53,7 @@ def gpt2_24b(checkpoint=True):
...
@@ -53,7 +53,7 @@ def gpt2_24b(checkpoint=True):
return
GPTLMModel
(
hidden_size
=
8192
,
num_layers
=
30
,
num_attention_heads
=
16
,
checkpoint
=
checkpoint
)
return
GPTLMModel
(
hidden_size
=
8192
,
num_layers
=
30
,
num_attention_heads
=
16
,
checkpoint
=
checkpoint
)
def
model_builder
(
model_size
:
str
):
def
model_builder
(
model_size
:
str
)
->
callable
:
if
model_size
==
"gpt2_medium"
:
if
model_size
==
"gpt2_medium"
:
return
gpt2_medium
return
gpt2_medium
elif
model_size
==
"gpt2_xl"
:
elif
model_size
==
"gpt2_xl"
:
...
@@ -66,6 +66,8 @@ def model_builder(model_size: str):
...
@@ -66,6 +66,8 @@ def model_builder(model_size: str):
return
gpt2_20b
return
gpt2_20b
elif
model_size
==
"gpt2_24b"
:
elif
model_size
==
"gpt2_24b"
:
return
gpt2_24b
return
gpt2_24b
else
:
raise
TypeError
(
f
"model_builder
{
model_size
}
"
)
__all__
=
[
'model_builder'
]
__all__
=
[
'model_builder'
]
examples/language/gpt/run.sh
View file @
49c601da
# Launch a single GPT training/benchmark run with torchrun.
# Every setting honours a pre-set environment variable (as set by
# benchmark.sh) and falls back to a default via ${VAR:-default}.

# distplan in ["colossalai", "zero1", "zero2", "torch_ddp", "torch_zero"]
# Fixed: was `{$DISTPAN:-"colossalai"}` — the brace was misplaced, which
# expands to the literal string `{colossalai}` instead of applying a default.
export DISTPAN=${DISTPAN:-"colossalai"}

# The following options only valid when DISTPAN="colossalai"
export TPDEGREE=${TPDEGREE:-1}
export GPUNUM=${GPUNUM:-1}
# Fixed: the next three used `${VAR:default}` (missing `-`), which bash
# parses as substring expansion and rejects with a bad-offset error;
# the intended default-value form is `${VAR:-default}`.
export PLACEMENT=${PLACEMENT:-'const'}
export USE_SHARD_INIT=${USE_SHARD_INIT:-False}
export BATCH_SIZE=${BATCH_SIZE:-8}
# export MODEL_TYPE="gpt2_10b"
export MODEL_TYPE=${MODEL_TYPE:-"gpt2_medium"}

mkdir -p logs
# One log file per configuration, named after the full parameter set.
torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --model_type=${MODEL_TYPE} --batch_size=${BATCH_SIZE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee ./logs/${MODEL_TYPE}_${DISTPAN}_gpu_${GPUNUM}_bs_${BATCH_SIZE}_tp_${TPDEGREE}.log
examples/language/gpt/train_gpt_demo.py
View file @
49c601da
...
@@ -5,7 +5,6 @@ from time import time
...
@@ -5,7 +5,6 @@ from time import time
import
psutil
import
psutil
import
torch
import
torch
import
torch.nn
as
nn
import
torch.nn
as
nn
from
model_zoo
import
model_builder
from
packaging
import
version
from
packaging
import
version
from
torch.nn.parallel
import
DistributedDataParallel
as
DDP
from
torch.nn.parallel
import
DistributedDataParallel
as
DDP
...
@@ -17,6 +16,7 @@ from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, Proces
...
@@ -17,6 +16,7 @@ from colossalai.tensor import ColoParameter, ComputePattern, ComputeSpec, Proces
from
colossalai.utils
import
get_current_device
from
colossalai.utils
import
get_current_device
from
colossalai.utils.model.colo_init_context
import
ColoInitContext
from
colossalai.utils.model.colo_init_context
import
ColoInitContext
from
colossalai.zero.sharded_optim
import
LowLevelZeroOptimizer
from
colossalai.zero.sharded_optim
import
LowLevelZeroOptimizer
from
model_zoo
import
model_builder
def
parse_args
():
def
parse_args
():
...
@@ -55,7 +55,7 @@ def parse_args():
...
@@ -55,7 +55,7 @@ def parse_args():
parser
.
add_argument
(
parser
.
add_argument
(
"--model_type"
,
"--model_type"
,
type
=
str
,
type
=
str
,
default
=
'
gpt2_medium
'
,
default
=
"
gpt2_medium
"
,
help
=
"model model scale"
,
help
=
"model model scale"
,
)
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
...
@@ -309,6 +309,8 @@ def main():
...
@@ -309,6 +309,8 @@ def main():
if
n
>=
WARMUP_STEPS
:
if
n
>=
WARMUP_STEPS
:
tflops_list
.
append
(
step_tflops
)
tflops_list
.
append
(
step_tflops
)
logger
.
info
(
f
"max memory
{
torch
.
cuda
.
memory_allocated
()
/
1024
**
2
}
MB"
,
ranks
=
[
0
])
tflops_list
.
sort
()
tflops_list
.
sort
()
median_index
=
((
NUM_STEPS
-
WARMUP_STEPS
)
>>
1
)
+
WARMUP_STEPS
median_index
=
((
NUM_STEPS
-
WARMUP_STEPS
)
>>
1
)
+
WARMUP_STEPS
logger
.
info
(
f
"Median TFLOPS is
{
tflops_list
[
median_index
]:.
3
f
}
"
)
logger
.
info
(
f
"Median TFLOPS is
{
tflops_list
[
median_index
]:.
3
f
}
"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment