OpenDAS / ColossalAI · Commit 0c7d8beb (unverified)
Authored Nov 20, 2023 by Bin Jia, committed by GitHub on Nov 20, 2023
Parent: e5ce4c8e

[hotfix/hybridengine] fix bug when tp*pp size = 1 (#5069)

Showing 4 changed files with 64 additions and 14 deletions (+64 -14):
colossalai/inference/engine/engine.py       +4  -2
tests/test_infer/test_hybrid_bloom.py       +20 -4
tests/test_infer/test_hybrid_chatglm2.py    +20 -4
tests/test_infer/test_hybrid_llama.py       +20 -4
colossalai/inference/engine/engine.py (view file @ 0c7d8beb)

@@ -126,7 +126,7 @@ class CaiInferEngine:
         # Init pg mesh
         pg_mesh = ProcessGroupMesh(pp_size, tp_size)
 
-        stage_manager = PipelineStageManager(pg_mesh, PP_AXIS, True)
+        stage_manager = PipelineStageManager(pg_mesh, PP_AXIS, True if pp_size * tp_size > 1 else False)
         self.cache_manager_list = [
             self._init_manager(model, max_batch_size, max_input_len, max_output_len)
             for _ in range(micro_batch_buffer_size or pp_size)
@@ -142,7 +142,9 @@ class CaiInferEngine:
         self.verbose = verbose
         self.schedule = GenerateSchedule(stage_manager, self.mb_manager, verbose)
 
-        self.model = self._shardformer(model, model_policy, stage_manager, pg_mesh.get_group_along_axis(TP_AXIS))
+        self.model = self._shardformer(
+            model, model_policy, stage_manager, pg_mesh.get_group_along_axis(TP_AXIS) if pp_size * tp_size > 1 else None
+        )
         if quant == "gptq":
             self.gptq_manager.post_init_gptq_buffer(self.model)
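Both hunks apply the same guard: when the combined parallel size is 1 (no tensor or pipeline parallelism), the engine should not request a process group from the mesh. Below is a minimal sketch of that guard; resolve_tp_group is a hypothetical helper for illustration only, and only get_group_along_axis comes from the diff above.

# Minimal sketch of the single-device guard, assuming a ProcessGroupMesh-like
# object. resolve_tp_group is NOT part of the ColossalAI API.
from typing import Any, Optional


def resolve_tp_group(pg_mesh: Any, tp_axis: int, pp_size: int, tp_size: int) -> Optional[Any]:
    """Return a tensor-parallel process group only when more than one device is used."""
    if pp_size * tp_size > 1:
        # Multi-device case: the mesh can hand out a valid group along the TP axis.
        return pg_mesh.get_group_along_axis(tp_axis)
    # Single-device case (tp_size == pp_size == 1): no collective group exists,
    # so returning None lets the sharding code take a plain, non-distributed path.
    return None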
tests/test_infer/test_hybrid_bloom.py (view file @ 0c7d8beb)

@@ -78,17 +78,32 @@ def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
     torch.cuda.empty_cache()
 
 
-def check_tp_pipeline_inference(rank, world_size, port):
+@parameterize("tp_size", [1])
+@parameterize("pp_size", [1])
+@parameterize("max_output_len", [2])
+@parameterize("micro_batch_size", [1])
+@clear_cache_before_run()
+def run_single_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
+    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
+    torch.cuda.empty_cache()
+
+
+def check_tp_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_pipeline_inference_test()
 
 
-def check_single_inference(rank, world_size, port):
+def check_tp_or_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_inference_test()
     run_pipeline_inference_test()
 
 
+def check_single_inference(rank, world_size, port):
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_single_inference_test
+
+
 @pytest.mark.skipif(
     not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
     reason="kv-cache manager engine requires cuda version to be higher than 11.5",
@@ -97,8 +112,9 @@ def check_single_inference(rank, world_size, port):
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_pipeline_inference():
-    spawn(check_tp_pipeline_inference, nprocs=4)
-    spawn(check_single_inference, nprocs=2)
+    spawn(check_tp_pp_inference, nprocs=4)
+    spawn(check_tp_or_pp_inference, nprocs=2)
+    spawn(check_single_inference, nprocs=1)
 
 
 if __name__ == "__main__":
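The same restructuring is applied to each test file below: the old two-way split (a 4-rank tp+pp run and a 2-rank single-parallel run) gains a third, 1-rank path that drives the engine with tp_size = pp_size = 1. A hedged, self-contained sketch of that 1-rank path follows, using only helpers the tests already import (colossalai.launch and colossalai.testing.spawn); do_single_device_inference is a hypothetical stand-in for the real pipeline_inference_test call.

# Hedged sketch of the new single-device test path. do_single_device_inference
# is a placeholder for pipeline_inference_test(tp_size=1, pp_size=1, ...).
import colossalai
from colossalai.testing import spawn


def do_single_device_inference():
    # Placeholder: build the model, wrap it in the hybrid engine with
    # tp_size = pp_size = 1, and run one short generation step.
    pass


def check_single_inference(rank, world_size, port):
    # Even with a world size of 1, the test goes through colossalai.launch,
    # so the engine still sees an initialized (single-rank) distributed context.
    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    do_single_device_inference()


if __name__ == "__main__":
    # Mirrors the new spawn(check_single_inference, nprocs=1) call in the tests.
    spawn(check_single_inference, nprocs=1)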
tests/test_infer/test_hybrid_chatglm2.py (view file @ 0c7d8beb)

@@ -86,17 +86,32 @@ def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
     torch.cuda.empty_cache()
 
 
-def check_tp_pipeline_inference(rank, world_size, port):
+@parameterize("tp_size", [1])
+@parameterize("pp_size", [1])
+@parameterize("max_output_len", [2])
+@parameterize("micro_batch_size", [1])
+@clear_cache_before_run()
+def run_single_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
+    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
+    torch.cuda.empty_cache()
+
+
+def check_tp_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_pipeline_inference_test()
 
 
-def check_single_inference(rank, world_size, port):
+def check_tp_or_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_inference_test()
     run_pipeline_inference_test()
 
 
+def check_single_inference(rank, world_size, port):
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_single_inference_test
+
+
 @pytest.mark.skipif(
     not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
     reason="kv-cache manager engine requires cuda version to be higher than 11.5",
@@ -105,8 +120,9 @@ def check_single_inference(rank, world_size, port):
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_pipeline_inference():
-    spawn(check_tp_pipeline_inference, nprocs=4)
-    spawn(check_single_inference, nprocs=2)
+    spawn(check_tp_pp_inference, nprocs=4)
+    spawn(check_tp_or_pp_inference, nprocs=2)
+    spawn(check_single_inference, nprocs=1)
 
 
 if __name__ == "__main__":
tests/test_infer/test_hybrid_llama.py (view file @ 0c7d8beb)

@@ -83,17 +83,32 @@ def run_tp_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
     torch.cuda.empty_cache()
 
 
-def check_tp_pipeline_inference(rank, world_size, port):
+@parameterize("tp_size", [1])
+@parameterize("pp_size", [1])
+@parameterize("max_output_len", [2])
+@parameterize("micro_batch_size", [1])
+@clear_cache_before_run()
+def run_single_inference_test(tp_size, pp_size, max_output_len, micro_batch_size):
+    pipeline_inference_test(tp_size, pp_size, max_output_len, micro_batch_size)
+    torch.cuda.empty_cache()
+
+
+def check_tp_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_pipeline_inference_test()
 
 
-def check_single_inference(rank, world_size, port):
+def check_tp_or_pp_inference(rank, world_size, port):
     colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     run_tp_inference_test()
     run_pipeline_inference_test()
 
 
+def check_single_inference(rank, world_size, port):
+    colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    run_single_inference_test
+
+
 @pytest.mark.skipif(
     not CUDA_SUPPORT or not HAS_LIGHTLLM_KERNEL,
     reason="kv-cache manager engine requires cuda version to be higher than 11.5",
@@ -102,8 +117,9 @@ def check_single_inference(rank, world_size, port):
 @rerun_if_address_is_in_use()
 @clear_cache_before_run()
 def test_pipeline_inference():
-    spawn(check_tp_pipeline_inference, nprocs=4)
-    spawn(check_single_inference, nprocs=2)
+    spawn(check_tp_pp_inference, nprocs=4)
+    spawn(check_tp_or_pp_inference, nprocs=2)
+    spawn(check_single_inference, nprocs=1)
 
 
 if __name__ == "__main__":
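Taken together, the engine guard and the new 1-rank tests cover running the hybrid engine on a single GPU. Below is a hedged usage sketch of that configuration; the class and policy names follow the ColossalAI inference examples of this period, while the exact constructor arguments, import path, and generate() call are assumptions to check against the engine's actual signature.

# Hedged sketch: the hybrid inference engine on one GPU, i.e. the
# tp_size * pp_size == 1 case this hotfix addresses. Argument names are
# inferred from the test parameters and the __init__ context in engine.py;
# the import path and generate() call are assumptions to verify.
import colossalai
from transformers import LlamaForCausalLM, LlamaTokenizer

from colossalai.inference import CaiInferEngine, LlamaModelInferPolicy

# Single-rank launch, matching the pattern used in the tests above.
colossalai.launch(config={}, rank=0, world_size=1, host="localhost", port=29500, backend="nccl")

model_path = "path/to/llama"  # placeholder checkpoint path
model = LlamaForCausalLM.from_pretrained(model_path).half().cuda()
tokenizer = LlamaTokenizer.from_pretrained(model_path)

engine = CaiInferEngine(
    tp_size=1,  # no tensor parallelism
    pp_size=1,  # no pipeline parallelism
    model=model,
    model_policy=LlamaModelInferPolicy(),
    micro_batch_size=1,
    max_output_len=32,
)

inputs = tokenizer("Introduce some landmarks in Beijing", return_tensors="pt")
output = engine.generate(inputs)  # assumed generation entry point
print(output)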