Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a715dfbe
Commit
a715dfbe
authored
Nov 18, 2024
by
zhuwenwen
Browse files
update test_basic_correctness.py
parent
4e06836d
Changes
1
Show whitespace changes
Inline
Side-by-side
Showing
1 changed file
with
57 additions
and
57 deletions
+57
-57
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+57
-57
No files found.
tests/basic_correctness/test_basic_correctness.py
View file @
a715dfbe
...
...
@@ -74,63 +74,63 @@ def test_models(
)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
    "model, distributed_executor_backend, attention_backend, "
    "test_suite", [
        ("facebook/opt-125m", "ray", "", "L4"),
        ("facebook/opt-125m", "mp", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
        ("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
        ("facebook/opt-125m", "ray", "", "A100"),
        ("facebook/opt-125m", "mp", "", "A100"),
        ("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
        ("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
    ])
def test_models_distributed(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    distributed_executor_backend: str,
    attention_backend: str,
    test_suite: str,
) -> None:
    """Compare greedy vLLM outputs (tensor parallel size 2) against HF.

    The case is skipped unless ``test_suite`` equals ``TARGET_TEST_SUITE``.
    Environment variables needed by a particular parametrization are set
    only for the duration of this test and restored afterwards, so they
    cannot leak into later parametrizations or other tests that run in the
    same pytest process.
    """
    if test_suite != TARGET_TEST_SUITE:
        pytest.skip(f"Skip test for {test_suite}")

    # Collect the environment overrides this parametrization needs.
    env_overrides = {}
    use_ray_adag = (model == "meta-llama/Llama-2-7b-hf"
                    and distributed_executor_backend == "ray"
                    and attention_backend == "" and test_suite == "L4")
    if use_ray_adag:
        # test ray adag
        env_overrides['VLLM_USE_RAY_SPMD_WORKER'] = "1"
        env_overrides['VLLM_USE_RAY_COMPILED_DAG'] = "1"
    if attention_backend:
        env_overrides["VLLM_ATTENTION_BACKEND"] = attention_backend

    # Remember the previous values (None means "was unset") so that exactly
    # the keys touched here can be put back in the ``finally`` below.
    previous_env = {key: os.environ.get(key) for key in env_overrides}
    os.environ.update(env_overrides)

    dtype = "half"
    max_tokens = 5

    try:
        # NOTE: take care of the order. run vLLM first, and then run HF.
        # vLLM needs a fresh new process without cuda initialization.
        # if we run HF first, the cuda initialization will be done and it
        # will hurt multiprocessing backend with fork method (the default
        # method).
        with vllm_runner(
                model,
                dtype=dtype,
                tensor_parallel_size=2,
                distributed_executor_backend=distributed_executor_backend
        ) as vllm_model:
            vllm_outputs = vllm_model.generate_greedy(example_prompts,
                                                      max_tokens)

        with hf_runner(model, dtype=dtype) as hf_model:
            hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)

        check_outputs_equal(
            outputs_0_lst=hf_outputs,
            outputs_1_lst=vllm_outputs,
            name_0="hf",
            name_1="vllm",
        )
    finally:
        # Restore only the variables this test changed.
        for key, old_value in previous_env.items():
            if old_value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = old_value
#
@multi_gpu_test(num_gpus=2)
#
@pytest.mark.parametrize(
#
"model, distributed_executor_backend, attention_backend, "
#
"test_suite", [
#
("facebook/opt-125m", "ray", "", "L4"),
#
("facebook/opt-125m", "mp", "", "L4"),
#
("meta-llama/Llama-2-7b-hf", "ray", "", "L4"),
#
("meta-llama/Llama-2-7b-hf", "mp", "", "L4"),
#
("facebook/opt-125m", "ray", "", "A100"),
#
("facebook/opt-125m", "mp", "", "A100"),
#
("facebook/opt-125m", "mp", "FLASHINFER", "A100"),
#
("meta-llama/Meta-Llama-3-8B", "ray", "FLASHINFER", "A100"),
#
])
#
def test_models_distributed(
#
hf_runner,
#
vllm_runner,
#
example_prompts,
#
model: str,
#
distributed_executor_backend: str,
#
attention_backend: str,
#
test_suite: str,
#
) -> None:
#
if test_suite != TARGET_TEST_SUITE:
#
pytest.skip(f"Skip test for {test_suite}")
#
if model == "meta-llama/Llama-2-7b-hf" and distributed_executor_backend == "ray" and attention_backend == "" and test_suite == "L4": # noqa
#
# test ray adag
#
os.environ['VLLM_USE_RAY_SPMD_WORKER'] = "1"
#
os.environ['VLLM_USE_RAY_COMPILED_DAG'] = "1"
#
if attention_backend:
#
os.environ["VLLM_ATTENTION_BACKEND"] = attention_backend
#
dtype = "half"
#
max_tokens = 5
#
# NOTE: take care of the order. run vLLM first, and then run HF.
#
# vLLM needs a fresh new process without cuda initialization.
#
# if we run HF first, the cuda initialization will be done and it
#
# will hurt multiprocessing backend with fork method (the default method).
#
with vllm_runner(model,
#
dtype=dtype,
#
tensor_parallel_size=2,
#
distributed_executor_backend=distributed_executor_backend
#
) as vllm_model:
#
vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
#
with hf_runner(model, dtype=dtype) as hf_model:
#
hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
#
check_outputs_equal(
#
outputs_0_lst=hf_outputs,
#
outputs_1_lst=vllm_outputs,
#
name_0="hf",
#
name_1="vllm",
#
)
def
test_model_with_failure
(
vllm_runner
)
->
None
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment