OpenDAS / ColossalAI · Commit c4b15661 (unverified)
Authored Feb 02, 2023 by oahzxl; committed via GitHub on Feb 02, 2023
[autochunk] add benchmark for transformer and alphafold (#2543)
Parent: 9885ec2b
Showing 10 changed files with 286 additions and 5 deletions (+286, -5)
tests/test_autochunk/test_autochunk_alphafold/benchmark_autochunk_alphafold.py (+131, -0)
tests/test_autochunk/test_autochunk_alphafold/test_autochunk_alphafold_utils.py (+0, -0)
tests/test_autochunk/test_autochunk_alphafold/test_autochunk_evoformer_block.py (+1, -1)
tests/test_autochunk/test_autochunk_alphafold/test_autochunk_evoformer_stack.py (+1, -1)
tests/test_autochunk/test_autochunk_alphafold/test_autochunk_extramsa_block.py (+1, -1)
tests/test_autochunk/test_autochunk_diffuser/test_autochunk_diffuser_utils.py (+0, -0)
tests/test_autochunk/test_autochunk_diffuser/test_autochunk_unet.py (+1, -1)
tests/test_autochunk/test_autochunk_transformer/benchmark_autochunk_transformer.py (+150, -0)
tests/test_autochunk/test_autochunk_transformer/test_autochunk_gpt.py (+1, -1)
tests/test_autochunk/test_autochunk_transformer/test_autochunk_transformer_utils.py (+0, -0)
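The two new benchmark scripts are self-contained entry points: each launches a single-process ColossalAI context (rank 0, world size 1, NCCL backend) in its __main__ block, so on a machine with a single CUDA device they can presumably be run directly, e.g. python benchmark_autochunk_alphafold.py from its test directory.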
tests/test_autochunk/test_autochunk_alphafold/benchmark_autochunk_alphafold.py (new file, mode 100644)
import time
from typing import Any, Dict, List

import torch
import torch.fx

import colossalai
from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
from colossalai.fx.graph_module import ColoGraphModule
from colossalai.fx.passes.meta_info_prop import MetaInfoProp
from colossalai.utils import free_port

if AUTOCHUNK_AVAILABLE:
    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
    from colossalai.fx.profiler import MetaTensor
    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace


def _benchmark_evoformer_stack_gm(
    data_args: tuple,
    max_memory: int,
    get_model: Any,
    get_data: Any,
) -> None:
    # build model and input
    model = get_model()
    meta_args, concrete_args = get_data(*data_args)
    if concrete_args is None:
        concrete_args = []

    # trace the meta graph and setup codegen
    meta_graph = symbolic_trace(
        model,
        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
        concrete_args={k: v for k, v in concrete_args},
    )
    interp = MetaInfoProp(meta_graph)
    meta_tensors = [MetaTensor(i[1], fake_device="cuda:0") for i in meta_args] + [i[1] for i in concrete_args]
    interp.propagate(*meta_tensors)
    codegen = AutoChunkCodeGen(
        meta_graph,
        max_memory=max_memory,
    )

    # trace and recompile
    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
    graph = ColoTracer().trace(
        model,
        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args},
        concrete_args={k: v for k, v in concrete_args},
    )
    graph.set_codegen(codegen)
    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
    gm.recompile()

    # init inputs
    inputs = [i[1] for i in meta_args] + [i[1] for i in concrete_args]
    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
    model.cuda()

    # bench
    mem = _benchmark_memory(gm, inputs)
    speed = _benchmark_speed(gm, inputs)
    print("evoformer stack gm, mem: %.2fMB, time: %.4fs, data_args: %s" % (mem, speed, str(data_args)))


def _benchmark_evoformer_stack_origin(
    data_args: tuple,
    get_model: Any,
    get_data: Any,
) -> None:
    # build model and input
    model = get_model()
    meta_args, concrete_args = get_data(*data_args)
    if concrete_args is None:
        concrete_args = []

    # init inputs
    inputs = [i[1] for i in meta_args] + [i[1] for i in concrete_args]
    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
    model.cuda()

    # bench
    mem = _benchmark_memory(model, inputs)
    speed = _benchmark_speed(model, inputs)
    print("evoformer stack origin, mem: %.2fMB, time: %.4fs, data_args: %s" % (mem, speed, str(data_args)))


def _benchmark_memory(model, inputs):
    # report peak activation memory (MB) above the pre-forward baseline
    with torch.no_grad():
        torch.cuda.reset_peak_memory_stats()
        now_mem = torch.cuda.memory_allocated() / 1024**2
        model(*[i.clone() if isinstance(i, torch.Tensor) else i for i in inputs])
        new_max_mem = torch.cuda.max_memory_allocated() / 1024**2
    return new_max_mem - now_mem


def _benchmark_speed(model, inputs, loop=5):
    with torch.no_grad():
        # warmup
        for _ in range(loop // 2 + 1):
            model(*inputs)
        torch.cuda.synchronize()
        # timed runs, averaged over `loop` iterations
        time1 = time.time()
        for _ in range(loop):
            model(*inputs)
        torch.cuda.synchronize()
        time2 = time.time()
    return (time2 - time1) / loop


def benchmark_evoformer_stack():
    from test_autochunk_evoformer_stack import get_data, get_model

    # compare the original model against autochunk under a 600MB budget,
    # a 400MB budget, and with no explicit budget (None)
    data_args = [128, 256]
    print("")
    _benchmark_evoformer_stack_origin(data_args, get_model, get_data)
    _benchmark_evoformer_stack_gm(data_args, 600, get_model, get_data)
    _benchmark_evoformer_stack_gm(data_args, 400, get_model, get_data)
    _benchmark_evoformer_stack_gm(data_args, None, get_model, get_data)


if __name__ == "__main__":
    # launch colossalai
    colossalai.launch(
        config={},
        rank=0,
        world_size=1,
        host="localhost",
        port=free_port(),
        backend="nccl",
    )
    benchmark_evoformer_stack()
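Both helpers above consume get_data in two ways: names and values via `for k, v in meta_args`, and values alone via `i[1]`, so meta_args and concrete_args are expected to be sequences of (name, tensor) pairs. A minimal sketch of a compatible provider pair, using a hypothetical toy model and shapes rather than the real Evoformer inputs from test_autochunk_evoformer_stack.py:

import torch
import torch.nn as nn

def get_model():
    # hypothetical stand-in for the real Evoformer stack builder
    return nn.Linear(64, 64)

def get_data(msa_len: int, pair_len: int):
    # each entry is an (argument_name, tensor) pair: the benchmark reads
    # names via `for k, v in meta_args` and values via `i[1]`
    meta_args = [("x", torch.randn(msa_len, 64))]
    concrete_args = None    # the benchmark normalizes None to []
    return meta_args, concrete_args

model = get_model()
meta_args, concrete_args = get_data(128, 256)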
tests/test_autochunk/test_alphafold/test_alphafold_utils.py → tests/test_autochunk/test_autochunk_alphafold/test_autochunk_alphafold_utils.py (file moved)
tests/test_autochunk/test_alphafold/test_evoformer_block.py → tests/test_autochunk/test_autochunk_alphafold/test_autochunk_evoformer_block.py

@@ -12,7 +12,7 @@ try:
 except:
     HAS_REPO = False
-from test_alphafold_utils import run_test
+from test_autochunk_alphafold_utils import run_test
 from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
tests/test_autochunk/test_alphafold/test_evoformer_stack.py → tests/test_autochunk/test_autochunk_alphafold/test_autochunk_evoformer_stack.py

@@ -12,7 +12,7 @@ try:
 except:
     HAS_REPO = False
-from test_alphafold_utils import run_test
+from test_autochunk_alphafold_utils import run_test
 from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
tests/test_autochunk/test_alphafold/test_extramsa_block.py → tests/test_autochunk/test_autochunk_alphafold/test_autochunk_extramsa_block.py

@@ -11,7 +11,7 @@ try:
     HAS_REPO = True
 except:
     HAS_REPO = False
-from test_alphafold_utils import run_test
+from test_autochunk_alphafold_utils import run_test
 from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
tests/test_autochunk/test_diffuser/test_diffuser_utils.py → tests/test_autochunk/test_autochunk_diffuser/test_autochunk_diffuser_utils.py (file moved)
tests/test_autochunk/test_diffuser/test_unet.py → tests/test_autochunk/test_autochunk_diffuser/test_autochunk_unet.py

@@ -13,7 +13,7 @@ except:
     MODELS = []
     HAS_REPO = False
-from test_diffuser_utils import run_test
+from test_autochunk_diffuser_utils import run_test
 from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
tests/test_autochunk/test_autochunk_transformer/benchmark_autochunk_transformer.py (new file, mode 100644)
import time
from typing import Any, Dict, List

import torch
import torch.fx

import colossalai
from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
from colossalai.fx.graph_module import ColoGraphModule
from colossalai.fx.passes.meta_info_prop import MetaInfoProp
from colossalai.fx.profiler import parameter_size
from colossalai.utils import free_port

if AUTOCHUNK_AVAILABLE:
    from colossalai.autochunk.autochunk_codegen import AutoChunkCodeGen
    from colossalai.fx.profiler import MetaTensor
    from colossalai.fx.tracer.experimental import ColoTracer, symbolic_trace


def _benchmark_autochunk_gpt_gm(
    model: Any,
    data: tuple,
    max_memory: int = None,
) -> None:
    model = model.cuda().eval()

    # build model and input
    meta_args, concrete_args, sequence = data
    if concrete_args is None:
        concrete_args = {}

    # trace the meta graph and setup codegen
    meta_graph = symbolic_trace(
        model,
        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args.items()},
        concrete_args={k: v for k, v in concrete_args.items()},
    )
    interp = MetaInfoProp(meta_graph)
    meta_tensors = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
    meta_tensors = [MetaTensor(i, fake_device="cuda:0") if isinstance(i, torch.Tensor) else i for i in meta_tensors]
    interp.propagate(*meta_tensors)
    codegen = AutoChunkCodeGen(
        meta_graph,
        max_memory=max_memory,
    )

    # trace and recompile
    # MetaInfoProp requires symbolic_trace but CodeGen requires ColoTracer
    graph = ColoTracer().trace(
        model.cuda().eval(),
        meta_args={k: v.to(torch.device("meta")) for k, v in meta_args.items()},
        concrete_args={k: v for k, v in concrete_args.items()},
    )
    graph.set_codegen(codegen)
    gm = ColoGraphModule(model, graph, ckpt_codegen=False)
    gm.recompile()

    # init inputs, ordered positionally by `sequence`
    inputs = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
    model.cuda().eval()

    # bench; parameter size (MB) is scaled by the script's x6 convention
    para_mem = float(parameter_size(model)) / 1024**2 * 6
    act_mem = _benchmark_memory(gm, inputs)
    speed = _benchmark_speed(gm, inputs)
    print("gpt autochunk, time: %.4fs, act mem: %.2fMB, para mem: %.2fMB, all mem: %.2fMB" %
          (speed, act_mem, para_mem, act_mem + para_mem))


def _benchmark_autochunk_gpt_origin(
    model: Any,
    data: tuple,
) -> None:
    # build model and input
    meta_args, concrete_args, sequence = data
    if concrete_args is None:
        concrete_args = {}

    # init inputs
    inputs = [meta_args[i] if i in meta_args else concrete_args[i] for i in sequence]
    inputs = [i.cuda() if isinstance(i, torch.Tensor) else i for i in inputs]
    model.cuda().eval()

    # bench
    para_mem = float(parameter_size(model)) / 1024**2 * 6
    act_mem = _benchmark_memory(model, inputs)
    speed = _benchmark_speed(model, inputs)
    print("gpt origin, time: %.4fs, act mem: %.2fMB, para mem: %.2fMB, all mem: %.2fMB" %
          (speed, act_mem, para_mem, act_mem + para_mem))
    return act_mem


def _benchmark_memory(model, inputs):
    # report peak activation memory (MB) above the pre-forward baseline
    with torch.no_grad():
        torch.cuda.reset_peak_memory_stats()
        now_mem = float(torch.cuda.memory_allocated()) / 1024**2
        model(*[i.clone() if isinstance(i, torch.Tensor) else i for i in inputs])
        new_max_mem = float(torch.cuda.max_memory_allocated()) / 1024**2
    return new_max_mem - now_mem


def _benchmark_speed(model, inputs, loop=5):
    with torch.no_grad():
        # warmup
        for _ in range(loop // 2 + 1):
            model(*inputs)
        torch.cuda.synchronize()
        # timed runs, averaged over `loop` iterations
        time1 = time.time()
        for _ in range(loop):
            model(*inputs)
        torch.cuda.synchronize()
        time2 = time.time()
    return (time2 - time1) / loop


def benchmark_autochunk_gpt(batch=1, seq=512, n_embd=768, n_head=12):
    from test_autochunk_gpt import GPT2Config, GPT2Model, get_data

    model = GPT2Model
    config = GPT2Config(n_embd=n_embd, n_position=seq, n_layer=2, n_head=n_head)
    config.max_position_embeddings = seq
    model = model(config=config)
    shape = [batch, seq]
    print("\nbatch: %d, seq: %d, n_embd: %d, n_head: %d" % (batch, seq, n_embd, n_head))

    # baseline first, then shrink the activation-memory budget until the
    # chunk search fails; finally run autochunk with no explicit budget
    max_mem = _benchmark_autochunk_gpt_origin(model, get_data(shape))
    for ratio in [0.5, 0.4, 0.3, 0.2]:
        try:
            _benchmark_autochunk_gpt_gm(model, get_data(shape), max_mem * ratio)
        except RuntimeError as e:
            if e.args[0] == 'Search failed. Try a larger memory threshold.':
                break
        except Exception as e:
            raise e
    _benchmark_autochunk_gpt_gm(model, get_data(shape), None)


if __name__ == "__main__":
    # launch colossalai
    colossalai.launch(
        config={},
        rank=0,
        world_size=1,
        host="localhost",
        port=free_port(),
        backend="nccl",
    )
    benchmark_autochunk_gpt(batch=1, seq=1024, n_embd=768, n_head=12)
    benchmark_autochunk_gpt(batch=1, seq=2048, n_embd=768, n_head=12)
    benchmark_autochunk_gpt(batch=1, seq=4096, n_embd=768, n_head=12)
    benchmark_autochunk_gpt(batch=1, seq=6144, n_embd=768, n_head=12)
    benchmark_autochunk_gpt(batch=1, seq=8192, n_embd=768, n_head=12)
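Unlike the AlphaFold script, the GPT benchmark unpacks a (meta_args, concrete_args, sequence) triple, where sequence fixes the positional argument order for model(*inputs). A hedged sketch of that contract follows; the real get_data lives in test_autochunk_gpt.py, and the argument names and dtypes here are assumptions based on the usual GPT2Model call signature:

import torch

def get_data(shape):
    # hypothetical: mirrors the triple unpacked by the two benchmark helpers
    input_ids = torch.zeros(shape, dtype=torch.int64)
    attention_mask = torch.ones(shape, dtype=torch.int64)
    meta_args = {"input_ids": input_ids, "attention_mask": attention_mask}
    concrete_args = None                        # normalized to {} by the benchmark
    sequence = ["input_ids", "attention_mask"]  # positional order for model(*inputs)
    return meta_args, concrete_args, sequence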
tests/test_autochunk/test_transformer/test_autochunk_gpt.py → tests/test_autochunk/test_autochunk_transformer/test_autochunk_gpt.py

@@ -13,7 +13,7 @@ except:
     MODELS = []
     HAS_REPO = False
-from test_transformer_utils import run_test
+from test_autochunk_transformer_utils import run_test
 from colossalai.autochunk.autochunk_codegen import AUTOCHUNK_AVAILABLE
tests/test_autochunk/test_transformer/test_transformer_utils.py → tests/test_autochunk/test_autochunk_transformer/test_autochunk_transformer_utils.py (file moved)