Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
04e55834
Unverified
Commit
04e55834
authored
Aug 02, 2024
by
youkaichao
Committed by
GitHub
Aug 02, 2024
Browse files
[ci][distributed] merge distributed test commands (#7097)
Co-authored-by:
Cyrus Leung
<
cyrus.tl.leung@gmail.com
>
parent
8c025fa7
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
78 additions
and
91 deletions
+78
-91
.buildkite/test-pipeline.yaml
.buildkite/test-pipeline.yaml
+4
-23
tests/distributed/test_basic_distributed_correctness.py
tests/distributed/test_basic_distributed_correctness.py
+33
-17
tests/distributed/test_chunked_prefill_distributed.py
tests/distributed/test_chunked_prefill_distributed.py
+14
-21
tests/distributed/test_multimodal_broadcast.py
tests/distributed/test_multimodal_broadcast.py
+27
-30
No files found.
.buildkite/test-pipeline.yaml
View file @
04e55834
...
@@ -82,20 +82,9 @@ steps:
...
@@ -82,20 +82,9 @@ steps:
num_gpus
:
2
num_gpus
:
2
commands
:
commands
:
-
VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-
VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-
TARGET_TEST_SUITE=L4 pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-
pytest -v -s distributed/test_chunked_prefill_distributed.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
-
pytest -v -s distributed/test_multimodal_broadcast.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
-
TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
-
TEST_DIST_MODEL=llava-hf/llava-v1.6-mistral-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_multimodal_broadcast.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-
TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
-
TEST_DIST_MODEL=llava-hf/llava-v1.6-mistral-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_multimodal_broadcast.py
-
pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-
pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
-
CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
...
@@ -107,11 +96,6 @@ steps:
...
@@ -107,11 +96,6 @@ steps:
fast_check
:
true
fast_check
:
true
commands
:
commands
:
-
pytest -v -s distributed/test_pynccl.py
-
pytest -v -s distributed/test_pynccl.py
# We want to test that models which use 2 GPUs work with 4 GPUs, which is why we duplicate them here.
# See https://github.com/vllm-project/vllm/pull/5473#issuecomment-2166601837 for context.
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray VLLM_USE_RAY_SPMD_WORKER=1 VLLM_USE_RAY_COMPILED_DAG=1 pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-
pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
-
pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
-
label
:
Pipeline Parallelism Test
-
label
:
Pipeline Parallelism Test
...
@@ -279,9 +263,6 @@ steps:
...
@@ -279,9 +263,6 @@ steps:
# NOTE: don't test llama model here, it seems hf implementation is buggy
# NOTE: don't test llama model here, it seems hf implementation is buggy
# see https://github.com/vllm-project/vllm/pull/5689 for details
# see https://github.com/vllm-project/vllm/pull/5689 for details
-
pytest -v -s distributed/test_custom_all_reduce.py
-
pytest -v -s distributed/test_custom_all_reduce.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-
TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
-
pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
-
pip install https://github.com/flashinfer-ai/flashinfer/releases/download/v0.1.2/flashinfer-0.1.2+cu121torch2.4-cp310-cp310-linux_x86_64.whl
-
VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-
TARGET_TEST_SUITE=A100 pytest -v -s distributed/test_basic_distributed_correctness.py
-
VLLM_ATTENTION_BACKEND=FLASHINFER TEST_DIST_MODEL=meta-llama/Meta-Llama-3-8B DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
-
pytest -v -s -x lora/test_mixtral.py
-
pytest -v -s -x lora/test_mixtral.py
tests/distributed/test_basic_distributed_correctness.py
View file @
04e55834
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
vLLM will allocate all the available memory, so we need to run the tests one
by one. The solution is to pass arguments (model name) by environment
variables.
Run:
Run:
```sh
```sh
cd $VLLM_PATH/tests
cd $VLLM_PATH/tests
TEST_DIST_MODEL=facebook/opt-125m pytest
\
pytest distributed/test_basic_distributed_correctness.py
distributed/test_basic_distributed_correctness.py
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf
\
distributed/test_basic_distributed_correctness.py
```
```
"""
"""
import
os
import
os
...
@@ -19,27 +14,48 @@ import pytest
...
@@ -19,27 +14,48 @@ import pytest
from
vllm.utils
import
cuda_device_count_stateless
from
vllm.utils
import
cuda_device_count_stateless
from
..models.utils
import
check_outputs_equal
from
..models.utils
import
check_outputs_equal
from
..utils
import
fork_new_process_for_each_test
MODELS
=
[
TARGET_TEST_SUITE
=
os
.
environ
.
get
(
"TARGET_TEST_SUITE"
,
"L4"
)
os
.
environ
[
"TEST_DIST_MODEL"
],
]
DISTRIBUTED_EXECUTOR_BACKEND
=
"DISTRIBUTED_EXECUTOR_BACKEND"
@
pytest
.
mark
.
skipif
(
cuda_device_count_stateless
()
<
2
,
@
pytest
.
mark
.
skipif
(
cuda_device_count_stateless
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
"model, distributed_executor_backend, attention_backend, test_suite"
,
[
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
5
])
(
"facebook/opt-125m"
,
"ray"
,
""
,
"L4"
),
(
"facebook/opt-125m"
,
"mp"
,
""
,
"L4"
),
(
"meta-llama/Llama-2-7b-hf"
,
"ray"
,
""
,
"L4"
),
(
"meta-llama/Llama-2-7b-hf"
,
"mp"
,
""
,
"L4"
),
(
"facebook/opt-125m"
,
"ray"
,
""
,
"A100"
),
(
"facebook/opt-125m"
,
"mp"
,
""
,
"A100"
),
(
"facebook/opt-125m"
,
"mp"
,
"FLASHINFER"
,
"A100"
),
(
"meta-llama/Meta-Llama-3-8B"
,
"ray"
,
"FLASHINFER"
,
"A100"
),
])
@
fork_new_process_for_each_test
def
test_models
(
def
test_models
(
hf_runner
,
hf_runner
,
vllm_runner
,
vllm_runner
,
example_prompts
,
example_prompts
,
model
:
str
,
model
:
str
,
dtype
:
str
,
distributed_executor_backend
:
str
,
max_tokens
:
int
,
attention_backend
:
str
,
test_suite
:
str
,
)
->
None
:
)
->
None
:
distributed_executor_backend
=
os
.
getenv
(
DISTRIBUTED_EXECUTOR_BACKEND
)
if
test_suite
!=
TARGET_TEST_SUITE
:
pytest
.
skip
(
f
"Skip test for
{
test_suite
}
"
)
if
model
==
"meta-llama/Llama-2-7b-hf"
and
distributed_executor_backend
==
"ray"
and
attention_backend
==
""
and
test_suite
==
"L4"
:
# noqa
# test ray adag
os
.
environ
[
'VLLM_USE_RAY_SPMD_WORKER'
]
=
"1"
os
.
environ
[
'VLLM_USE_RAY_COMPILED_DAG'
]
=
"1"
if
attention_backend
:
os
.
environ
[
"VLLM_ATTENTION_BACKEND"
]
=
attention_backend
dtype
=
"half"
max_tokens
=
5
# NOTE: take care of the order. run vLLM first, and then run HF.
# NOTE: take care of the order. run vLLM first, and then run HF.
# vLLM needs a fresh new process without cuda initialization.
# vLLM needs a fresh new process without cuda initialization.
...
...
tests/distributed/test_chunked_prefill_distributed.py
View file @
04e55834
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
vLLM will allocate all the available memory, so we need to run the tests one
by one. The solution is to pass arguments (model name) by environment
variables.
Run:
Run:
```sh
```sh
TEST_DIST_MODEL=facebook/opt-125m pytest
\
pytest test_chunked_prefill_distributed.py
test_chunked_prefill_distributed.py
TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf
\
test_chunked_prefill_distributed.py
```
```
"""
"""
import
os
import
pytest
import
pytest
from
vllm.utils
import
cuda_device_count_stateless
from
vllm.utils
import
cuda_device_count_stateless
from
..models.utils
import
check_outputs_equal
from
..models.utils
import
check_outputs_equal
from
..utils
import
fork_new_process_for_each_test
MODELS
=
[
os
.
environ
[
"TEST_DIST_MODEL"
],
]
DISTRIBUTED_EXECUTOR_BACKEND
=
"DISTRIBUTED_EXECUTOR_BACKEND"
@
pytest
.
mark
.
skipif
(
cuda_device_count_stateless
()
<
2
,
@
pytest
.
mark
.
skipif
(
cuda_device_count_stateless
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model, distributed_executor_backend"
,
[
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
(
"facebook/opt-125m"
,
"ray"
),
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
5
])
(
"meta-llama/Llama-2-7b-hf"
,
"ray"
),
@
pytest
.
mark
.
parametrize
(
"chunked_prefill_token_size"
,
[
16
])
(
"facebook/opt-125m"
,
"mp"
),
(
"meta-llama/Llama-2-7b-hf"
,
"mp"
),
])
@
fork_new_process_for_each_test
def
test_models
(
def
test_models
(
hf_runner
,
hf_runner
,
vllm_runner
,
vllm_runner
,
example_prompts
,
example_prompts
,
model
:
str
,
model
:
str
,
dtype
:
str
,
distributed_executor_backend
:
str
,
max_tokens
:
int
,
chunked_prefill_token_size
:
int
,
)
->
None
:
)
->
None
:
distributed_executor_backend
=
os
.
getenv
(
DISTRIBUTED_EXECUTOR_BACKEND
)
dtype
=
"half"
max_tokens
=
5
chunked_prefill_token_size
=
16
# Add a chunked prefill config.
# Add a chunked prefill config.
max_num_seqs
=
min
(
chunked_prefill_token_size
,
256
)
max_num_seqs
=
min
(
chunked_prefill_token_size
,
256
)
...
...
tests/distributed/test_multimodal_broadcast.py
View file @
04e55834
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
"""Compare the outputs of HF and distributed vLLM when using greedy sampling.
The second test will hang if more than one test is run per command, so we need
to run the tests one by one. The solution is to pass arguments (model name) by
environment variables.
Run:
Run:
```sh
```sh
TEST_DIST_MODEL=llava-hf/llava-1.5-7b-hf
\
pytest -s -v test_multimodal_broadcast.py
test_multimodal_broadcast.py
TEST_DIST_MODEL=microsoft/Phi-3-vision-128k-instruct
\
test_multimodal_broadcast.py
```
```
"""
"""
import
os
import
pytest
import
pytest
from
vllm.utils
import
cuda_device_count_stateless
from
vllm.utils
import
cuda_device_count_stateless
model
=
os
.
environ
[
"TEST_DIST_MODEL"
]
from
..utils
import
fork_new_process_for_each_test
if
model
.
startswith
(
"llava-hf/llava-1.5"
):
from
..models.test_llava
import
models
,
run_test
elif
model
.
startswith
(
"llava-hf/llava-v1.6"
):
from
..models.test_llava_next
import
models
,
run_test
else
:
raise
NotImplementedError
(
f
"Unsupported model:
{
model
}
"
)
@
pytest
.
mark
.
skipif
(
cuda_device_count_stateless
()
<
2
,
reason
=
"Need at least 2 GPUs to run the test."
)
@
pytest
.
mark
.
parametrize
(
"model, distributed_executor_backend"
,
[
(
"llava-hf/llava-1.5-7b-hf"
,
"ray"
),
(
"llava-hf/llava-v1.6-mistral-7b-hf"
,
"ray"
),
(
"llava-hf/llava-1.5-7b-hf"
,
"mp"
),
(
"llava-hf/llava-v1.6-mistral-7b-hf"
,
"mp"
),
])
@
fork_new_process_for_each_test
def
test_models
(
hf_runner
,
vllm_runner
,
image_assets
,
model
:
str
,
distributed_executor_backend
:
str
)
->
None
:
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
2
])
dtype
=
"half"
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
"half"
])
max_tokens
=
5
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
128
])
num_logprobs
=
5
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
5
])
tensor_parallel_size
=
2
def
test_models
(
hf_runner
,
vllm_runner
,
image_assets
,
tensor_parallel_size
:
int
,
dtype
:
str
,
max_tokens
:
int
,
num_logprobs
:
int
)
->
None
:
if
cuda_device_count_stateless
()
<
tensor_parallel_size
:
pytest
.
skip
(
f
"Need at least
{
tensor_parallel_size
}
GPUs to run the test."
)
distributed_executor_backend
=
os
.
getenv
(
"DISTRIBUTED_EXECUTOR_BACKEND"
)
if
model
.
startswith
(
"llava-hf/llava-1.5"
):
from
..models.test_llava
import
models
,
run_test
elif
model
.
startswith
(
"llava-hf/llava-v1.6"
):
from
..models.test_llava_next
import
models
,
run_test
else
:
raise
NotImplementedError
(
f
"Unsupported model:
{
model
}
"
)
run_test
(
run_test
(
hf_runner
,
hf_runner
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment