OpenDAS / ColossalAI, commit 29868a9e (unverified)

[example] update gpt readme with performance (#2206)

Authored by Jiarui Fang on Dec 27, 2022; committed via GitHub on Dec 27, 2022.
Parent commit: 1cb532ff
Showing 3 changed files with 47 additions and 10 deletions:

- examples/language/gpt/README.md (+33, -0)
- examples/language/gpt/run.sh (+2, -2)
- examples/language/gpt/train_gpt_demo.py (+12, -8)
examples/language/gpt/README.md (+33, -0)
@@ -53,3 +53,36 @@ The `train_gpt_demo.py` provides three distributed plans, you can choose the pla
 - ZeRO2 (Colossal-AI)
 - Pytorch DDP
 - Pytorch ZeRO
+
+## Performance
+
+Testbed: a machine with 8x A100 (80 GB) GPUs and 1x AMD EPYC 7543 32-core processor (512 GB RAM). GPUs are connected via PCIe.
+ColossalAI version 0.1.13.
+
+How does the batch size affect efficiency?
+
+| model    | #GPU | policy | TP | batch | TFLOPS  |
+| -------- | ---- | ------ | -- | ----- | ------- |
+| gpt2_10b | 2    | cpu    | 1  | 32    | 122.046 |
+| gpt2_10b | 2    | cpu    | 1  | 16    | 82.649  |
+| gpt2_10b | 2    | cpu    | 1  | 8     | 61.354  |
+
+How does the placement policy affect efficiency?
+
+| model    | #GPU | policy | TP | batch | TFLOPS |
+| -------- | ---- | ------ | -- | ----- | ------ |
+| gpt2_10b | 4    | auto   | 1  | 8     | 88.657 |
+| gpt2_10b | 4    | cuda   | 1  | 8     | OOM    |
+| gpt2_10b | 4    | cpu    | 1  | 8     | 61.354 |
+| gpt2_10b | 4    | const  | 1  | 8     | 82.137 |
+
+How does the tensor parallel degree affect efficiency?
+
+| model    | #GPU | policy | TP | batch | TFLOPS |
+| -------- | ---- | ------ | -- | ----- | ------ |
+| gpt2_10b | 4    | auto   | 1  | 8     | 88.657 |
+| gpt2_10b | 4    | auto   | 2  | 8     | 56.687 |
+| gpt2_10b | 4    | auto   | 4  | 8     | 29.019 |
+| gpt2_10b | 4    | auto   | 4  | 64    | 50.411 |
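For reference, the TFLOPS column is a throughput measure. A common back-of-the-envelope estimate for a transformer trained with activation checkpointing multiplies the parameter count by the tokens processed per step and a factor of 8 (2x forward, 4x backward, 2x recompute), then divides by the step time. Below is a minimal sketch of that estimate; it may differ from the exact accounting used by `train_gpt_demo.py`, and the example numbers are purely hypothetical.

```python
def estimate_tflops(model_numel: float, batch_size: int, seq_len: int, step_time_s: float) -> float:
    """Rough throughput estimate for a transformer trained with activation
    checkpointing: 2x (forward) + 4x (backward) + 2x (recompute) FLOPs per
    parameter per token. Whether the result is per-GPU or aggregate depends
    on whether batch_size is the per-GPU or the global batch."""
    flops_per_step = 8 * model_numel * batch_size * seq_len
    return flops_per_step / 1e12 / step_time_s


# Hypothetical example: a 10B-parameter model, batch 8, sequence length 1024, 10 s per step.
print(f"{estimate_tflops(10e9, 8, 1024, 10.0):.1f} TFLOPS")  # ~65.5 TFLOPS
```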
examples/language/gpt/run.sh (+2, -2)
@@ -2,9 +2,9 @@
 export DISTPAN="colossalai"

 # The following options only valid when DISTPAN="colossalai"
-export TPDEGREE=2
+export TPDEGREE=4
 export GPUNUM=4
-export PLACEMENT='cpu'
+export PLACEMENT='auto'
 export USE_SHARD_INIT=False

 env OMP_NUM_THREADS=16 torchrun --standalone --nproc_per_node=${GPUNUM} train_gpt_demo.py --tp_degree=${TPDEGREE} --placement ${PLACEMENT} --shardinit ${USE_SHARD_INIT} --distplan ${DISTPAN} 2>&1 | tee run.log
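One detail worth keeping in mind about the launch line: `--shardinit ${USE_SHARD_INIT}` hands the Python script the literal string `False`, and `bool("False")` is truthy, so the flag has to be converted from its string form explicitly. Below is a minimal, hypothetical sketch of the four options run.sh forwards; it is not the demo's actual argparse setup.

```python
import argparse


def str2bool(s: str) -> bool:
    # argparse with type=bool would turn the string "False" into True,
    # so parse booleans from their string form explicitly.
    return s.lower() in ("true", "1", "yes")


# Hypothetical parser mirroring the flags run.sh forwards.
parser = argparse.ArgumentParser()
parser.add_argument("--distplan", type=str, default="colossalai")
parser.add_argument("--tp_degree", type=int, default=1)
parser.add_argument("--placement", type=str, default="cpu")
parser.add_argument("--shardinit", type=str2bool, default=False)

# The values the updated run.sh would pass:
args = parser.parse_args(
    ["--tp_degree=4", "--placement", "auto", "--shardinit", "False", "--distplan", "colossalai"]
)
print(args)  # Namespace(distplan='colossalai', tp_degree=4, placement='auto', shardinit=False)
```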
examples/language/gpt/train_gpt_demo.py (+12, -8)
@@ -179,13 +179,17 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
 # Gemini + ZeRO DDP
 def gemini_zero_dpp(model: torch.nn.Module, pg: ProcessGroup, placememt_policy: str = "auto"):
     cai_version = colossalai.__version__
+    from colossalai.gemini import ChunkManager, GeminiManager
     if version.parse(cai_version) > version.parse("0.1.10"):
         from colossalai.nn.parallel import GeminiDDP
         model = GeminiDDP(model,
                           device=get_current_device(),
                           placement_policy=placememt_policy,
                           pin_memory=True,
-                          search_range_mb=32)
+                          hidden_dim=4096,
+                          search_range_mb=64)
+        if placememt_policy == 'const':
+            model.gemini_manager._placement_policy.set_const_memory_boundary(10 * 1024)
     elif version.parse(cai_version) <= version.parse("0.1.10") and version.parse(cai_version) >= version.parse("0.1.9"):
         from colossalai.gemini import ChunkManager, GeminiManager
         chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32)
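This hunk keeps backward compatibility by comparing the installed ColossalAI release before choosing the `GeminiDDP` arguments. A minimal sketch of that `packaging.version` gating pattern follows; the keyword values come from the diff above, but the helper itself is illustrative and not part of `train_gpt_demo.py`.

```python
from packaging import version


def gemini_ddp_kwargs(cai_version: str, placement_policy: str = "auto") -> dict:
    """Illustrative helper: choose GeminiDDP keyword arguments based on the
    installed ColossalAI release string (e.g. colossalai.__version__)."""
    kwargs = {"placement_policy": placement_policy, "pin_memory": True}
    if version.parse(cai_version) > version.parse("0.1.10"):
        # Values the updated demo passes for releases after 0.1.10.
        kwargs.update(hidden_dim=4096, search_range_mb=64)
    return kwargs


print(gemini_ddp_kwargs("0.1.13"))  # includes hidden_dim and search_range_mb
print(gemini_ddp_kwargs("0.1.9"))   # base arguments only
```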
@@ -206,9 +210,10 @@ def main():
     if args.distplan not in ["colossalai", "torch_ddp", "torch_zero", "zero1", "zero2"]:
         raise TypeError(f"{args.distplan} is error")

-    BATCH_SIZE = 8
+    BATCH_SIZE = 64
     SEQ_LEN = 1024
     VOCAB_SIZE = 50257
     NUM_STEPS = 10

     disable_existing_loggers()
@@ -227,22 +232,21 @@ def main():
         default_dist_spec = ShardSpec([-1], [args.tp_degree]) if args.shardinit else None

         # build GPT model
-        with ColoInitContext(device='cpu', default_dist_spec=default_dist_spec, default_pg=default_pg):
-            model = gpt2_medium(checkpoint=True)
+        with ColoInitContext(device=get_current_device(), default_dist_spec=default_dist_spec, default_pg=default_pg):
+            model = gpt2_10b(checkpoint=True)

         pg = default_pg
         # Tensor Parallelism (TP)
         tensor_parallelize(model, pg)

         # Gemini + ZeRO DP, Note it must be used after TP
         model = gemini_zero_dpp(model, pg, args.placement)

-        # build optimizer
+        # build highly optimized cpu optimizer
         optimizer = GeminiAdamOptimizer(model, lr=1e-3, initial_scale=2**5)
-        # optimizer = HybridAdam(model.parameters(), lr=1e-3)
-        # optimizer = ZeroOptimizer(optimizer, model, initial_scale=2**5)
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
     else:
-        model = gpt2_medium(checkpoint=True).cuda()
+        model = gpt2_10b(checkpoint=True).cuda()

     if args.distplan.startswith("torch"):
         model = DDP(model)