Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
09c2eb85
Unverified
Commit
09c2eb85
authored
Jul 16, 2024
by
youkaichao
Committed by
GitHub
Jul 16, 2024
Browse files
[ci][distributed] add pipeline parallel correctness test (#6410)
parent
978aed53
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
129 additions
and
128 deletions
+129
-128
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+2
-7
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+112
-121
vllm/executor/multiproc_gpu_executor.py
vllm/executor/multiproc_gpu_executor.py
+15
-0
No files found.
.buildkite/test-pipeline.yaml
View file @
09c2eb85
...
@@ -72,7 +72,7 @@ steps:
...
@@ -72,7 +72,7 @@ steps:
commands
:
commands
:
-
# the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-
# the following commands are for the first node, with ip 192.168.10.10 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
-
TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0
pytest -v -s distributed/test_pipeline_parallel.py
-
pytest -v -s distributed/test_pipeline_parallel.py
-
# the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-
# the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
-
VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py
...
@@ -115,12 +115,7 @@ steps:
...
@@ -115,12 +115,7 @@ steps:
working_dir
:
"
/vllm-workspace/tests"
working_dir
:
"
/vllm-workspace/tests"
num_gpus
:
4
num_gpus
:
4
commands
:
commands
:
-
TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
-
pytest -v -s distributed/test_pipeline_parallel.py
-
TP_SIZE=2 PP_SIZE=2 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
-
TP_SIZE=1 PP_SIZE=3 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
-
PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=1 pytest -v -s distributed/test_pipeline_parallel.py
-
PP_SIZE=4 EAGER_MODE=1 CHUNKED_PREFILL=0 pytest -v -s distributed/test_pipeline_parallel.py
-
label
:
Engine Test
-
label
:
Engine Test
mirror_hardwares
:
[
amd
]
mirror_hardwares
:
[
amd
]
...
...
tests/distributed/test_pipeline_parallel.py
View file @
09c2eb85
import
os
import
openai
# use the official client for correctness check
import
pytest
import
pytest
from
..utils
import
RemoteOpenAIServer
from
..utils
import
RemoteOpenAIServer
# downloading lora to test lora requests
# any model with a chat template should work here
MODEL_NAME
=
"meta-llama/Meta-Llama-3-8B"
EAGER_MODE
=
bool
(
int
(
os
.
getenv
(
"EAGER_MODE"
,
0
)))
CHUNKED_PREFILL
=
bool
(
int
(
os
.
getenv
(
"CHUNKED_PREFILL"
,
0
)))
TP_SIZE
=
int
(
os
.
getenv
(
"TP_SIZE"
,
1
))
PP_SIZE
=
int
(
os
.
getenv
(
"PP_SIZE"
,
1
))
pytestmark
=
pytest
.
mark
.
asyncio
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
mark
.
parametrize
(
def
server
():
"TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME"
,
[
args
=
[
(
2
,
2
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
),
(
2
,
2
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
),
(
1
,
3
,
0
,
0
,
"meta-llama/Meta-Llama-3-8B"
),
(
1
,
4
,
0
,
1
,
"meta-llama/Meta-Llama-3-8B"
),
(
1
,
4
,
1
,
0
,
"meta-llama/Meta-Llama-3-8B"
),
])
def
test_compare_tp
(
TP_SIZE
,
PP_SIZE
,
EAGER_MODE
,
CHUNKED_PREFILL
,
MODEL_NAME
):
pp_args
=
[
"--model"
,
"--model"
,
MODEL_NAME
,
MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
# use half precision for speed and memory savings in CI environment
...
@@ -32,109 +25,107 @@ def server():
...
@@ -32,109 +25,107 @@ def server():
"--distributed-executor-backend"
,
"--distributed-executor-backend"
,
"ray"
,
"ray"
,
]
]
if
CHUNKED_PREFILL
:
args
+=
[
# compare without pipeline parallelism
"--enable-chunked-prefill"
,
# NOTE: use mp backend for TP
# PP tests might involve multiple nodes, and ray might
# schedule all workers in a node other than the head node,
# which can cause the test to fail.
tp_args
=
[
"--model"
,
MODEL_NAME
,
# use half precision for speed and memory savings in CI environment
"--dtype"
,
"bfloat16"
,
"--tensor-parallel-size"
,
str
(
max
(
TP_SIZE
,
2
)),
# use at least TP_SIZE=2 to hold the model
"--distributed-executor-backend"
,
"mp"
,
]
]
if
CHUNKED_PREFILL
:
pp_args
.
append
(
"--enable-chunked-prefill"
)
tp_args
.
append
(
"--enable-chunked-prefill"
)
if
EAGER_MODE
:
if
EAGER_MODE
:
args
+=
[
pp_args
.
append
(
"--enforce-eager"
)
"--enforce-eager"
,
tp_args
.
append
(
"--enforce-eager"
)
]
with
RemoteOpenAIServer
(
args
)
as
remote_server
:
yield
remote_server
@
pytest
.
fixture
(
scope
=
"module"
)
def
client
(
server
):
return
server
.
get_async_client
()
results
=
[]
for
args
in
[
pp_args
,
tp_args
]:
with
RemoteOpenAIServer
(
args
)
as
server
:
client
=
server
.
get_client
()
async
def
test_check_models
(
server
,
client
:
openai
.
AsyncOpenAI
):
# test models list
models
=
await
client
.
models
.
list
()
models
=
client
.
models
.
list
()
models
=
models
.
data
models
=
models
.
data
served_model
=
models
[
0
]
served_model
=
models
[
0
]
assert
served_model
.
id
==
MODEL_NAME
results
.
append
({
assert
all
(
model
.
root
==
MODEL_NAME
for
model
in
models
)
"test"
:
"models_list"
,
"id"
:
served_model
.
id
,
"root"
:
served_model
.
root
,
@
pytest
.
mark
.
parametrize
(
})
"model_name"
,
[
MODEL_NAME
],
# test with text prompt
)
completion
=
client
.
completions
.
create
(
model
=
MODEL_NAME
,
async
def
test_single_completion
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
completion
=
await
client
.
completions
.
create
(
model
=
model_name
,
prompt
=
"Hello, my name is"
,
prompt
=
"Hello, my name is"
,
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
)
temperature
=
0.0
)
assert
completion
.
id
is
not
None
results
.
append
({
assert
completion
.
choices
is
not
None
and
len
(
completion
.
choices
)
==
1
"test"
:
"single_completion"
,
assert
completion
.
choices
[
0
].
text
is
not
None
and
len
(
"text"
:
completion
.
choices
[
0
].
text
,
completion
.
choices
[
0
].
text
)
>=
5
"finish_reason"
:
completion
.
choices
[
0
].
finish_reason
,
assert
completion
.
choices
[
0
].
finish_reason
==
"length"
"usage"
:
completion
.
usage
,
assert
completion
.
usage
==
openai
.
types
.
CompletionUsage
(
})
completion_tokens
=
5
,
prompt_tokens
=
6
,
total_tokens
=
11
)
# test using token IDs
# test using token IDs
completion
=
await
client
.
completions
.
create
(
completion
=
client
.
completions
.
create
(
model
=
MODEL_NAME
,
model
=
MODEL_NAME
,
prompt
=
[
0
,
0
,
0
,
0
,
0
],
prompt
=
[
0
,
0
,
0
,
0
,
0
],
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
)
)
assert
completion
.
choices
[
0
].
text
is
not
None
and
len
(
completion
.
choices
[
0
].
text
)
>=
5
results
.
append
({
"test"
:
"token_ids"
,
"text"
:
completion
.
choices
[
0
].
text
,
"finish_reason"
:
completion
.
choices
[
0
].
finish_reason
,
"usage"
:
completion
.
usage
,
})
@
pytest
.
mark
.
parametrize
(
# just test 1 lora hereafter
"model_name"
,
[
MODEL_NAME
],
)
async
def
test_batch_completions
(
server
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
):
# test simple list
# test simple list
batch
=
await
client
.
completions
.
create
(
batch
=
client
.
completions
.
create
(
model
=
model_name
,
model
=
MODEL_NAME
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
)
)
assert
len
(
batch
.
choices
)
==
2
assert
batch
.
choices
[
0
].
text
==
batch
.
choices
[
1
].
text
# test n = 2
results
.
append
({
batch
=
await
client
.
completions
.
create
(
"test"
:
"simple_list"
,
model
=
model_name
,
"text0"
:
batch
.
choices
[
0
].
text
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
"text1"
:
batch
.
choices
[
1
].
text
,
n
=
2
,
})
max_tokens
=
5
,
temperature
=
0.0
,
extra_body
=
dict
(
# NOTE: this has to be true for n > 1 in vLLM, but not necessary
# for official client.
use_beam_search
=
True
),
)
assert
len
(
batch
.
choices
)
==
4
assert
batch
.
choices
[
0
].
text
!=
batch
.
choices
[
1
].
text
,
"beam search should be different"
assert
batch
.
choices
[
0
].
text
==
batch
.
choices
[
2
].
text
,
"two copies of the same prompt should be the same"
assert
batch
.
choices
[
1
].
text
==
batch
.
choices
[
3
].
text
,
"two copies of the same prompt should be the same"
# test streaming
# test streaming
batch
=
await
client
.
completions
.
create
(
batch
=
client
.
completions
.
create
(
model
=
model_name
,
model
=
MODEL_NAME
,
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
prompt
=
[
"Hello, my name is"
,
"Hello, my name is"
],
max_tokens
=
5
,
max_tokens
=
5
,
temperature
=
0.0
,
temperature
=
0.0
,
stream
=
True
,
stream
=
True
,
)
)
texts
=
[
""
]
*
2
texts
=
[
""
]
*
2
async
for
chunk
in
batch
:
for
chunk
in
batch
:
assert
len
(
chunk
.
choices
)
==
1
assert
len
(
chunk
.
choices
)
==
1
choice
=
chunk
.
choices
[
0
]
choice
=
chunk
.
choices
[
0
]
texts
[
choice
.
index
]
+=
choice
.
text
texts
[
choice
.
index
]
+=
choice
.
text
assert
texts
[
0
]
==
texts
[
1
]
results
.
append
({
"test"
:
"streaming"
,
"texts"
:
texts
,
})
n
=
len
(
results
)
//
2
pp_results
=
results
[:
n
]
tp_results
=
results
[
n
:]
for
pp
,
tp
in
zip
(
pp_results
,
tp_results
):
assert
pp
==
tp
vllm/executor/multiproc_gpu_executor.py
View file @
09c2eb85
import
asyncio
import
asyncio
import
os
import
os
import
signal
import
weakref
from
functools
import
partial
from
functools
import
partial
from
typing
import
Any
,
List
,
Optional
from
typing
import
Any
,
List
,
Optional
...
@@ -78,6 +80,19 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
...
@@ -78,6 +80,19 @@ class MultiprocessingGPUExecutor(DistributedGPUExecutor):
result_handler
.
start
()
result_handler
.
start
()
self
.
worker_monitor
.
start
()
self
.
worker_monitor
.
start
()
# Set up signal handlers to shutdown the executor cleanly
# sometimes gc does not work well
# Use weakref to avoid holding a reference to self
ref
=
weakref
.
ref
(
self
)
def
shutdown
(
signum
,
frame
):
if
executor
:
=
ref
():
executor
.
shutdown
()
signal
.
signal
(
signal
.
SIGINT
,
shutdown
)
signal
.
signal
(
signal
.
SIGTERM
,
shutdown
)
self
.
driver_worker
=
self
.
_create_worker
(
self
.
driver_worker
=
self
.
_create_worker
(
distributed_init_method
=
distributed_init_method
)
distributed_init_method
=
distributed_init_method
)
self
.
_run_workers
(
"init_device"
)
self
.
_run_workers
(
"init_device"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment