Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
dbd62f84
Commit
dbd62f84
authored
May 29, 2025
by
zhuwenwen
Browse files
[test]fix basic_correctness and benchmarks
parent
0e8619b8
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
57 additions
and
31 deletions
+57
-31
tests/basic_correctness/test_basic_correctness.py
tests/basic_correctness/test_basic_correctness.py
+22
-6
tests/basic_correctness/test_chunked_prefill.py
tests/basic_correctness/test_chunked_prefill.py
+8
-3
tests/basic_correctness/test_cumem.py
tests/basic_correctness/test_cumem.py
+5
-5
tests/benchmarks/test_latency_cli.py
tests/benchmarks/test_latency_cli.py
+4
-2
tests/benchmarks/test_serve_cli.py
tests/benchmarks/test_serve_cli.py
+4
-3
tests/benchmarks/test_throughput_cli.py
tests/benchmarks/test_throughput_cli.py
+5
-2
tests/test_regression.py
tests/test_regression.py
+9
-8
vllm/attention/backends/rocm_flash_attn.py
vllm/attention/backends/rocm_flash_attn.py
+0
-2
No files found.
tests/basic_correctness/test_basic_correctness.py
View file @
dbd62f84
...
@@ -16,6 +16,8 @@ from ..models.utils import check_outputs_equal
...
@@ -16,6 +16,8 @@ from ..models.utils import check_outputs_equal
from
..utils
import
multi_gpu_test
from
..utils
import
multi_gpu_test
import
os
import
os
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
vllm.utils
import
gpuname
import
vllm.envs
as
envs
MODELS
=
[
MODELS
=
[
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-2b-it"
),
os
.
path
.
join
(
models_path_prefix
,
"google/gemma-2-2b-it"
),
...
@@ -35,7 +37,11 @@ def v1(run_with_both_engines):
...
@@ -35,7 +37,11 @@ def v1(run_with_both_engines):
def
test_vllm_gc_ed
():
def
test_vllm_gc_ed
():
"""Verify vllm instance is GC'ed when it is deleted"""
"""Verify vllm instance is GC'ed when it is deleted"""
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
))
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
:
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
block_size
=
64
)
else
:
llm
=
LLM
(
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
))
weak_llm
=
weakref
.
ref
(
llm
)
weak_llm
=
weakref
.
ref
(
llm
)
del
llm
del
llm
# If there's any circular reference to vllm, this fails
# If there's any circular reference to vllm, this fails
...
@@ -79,13 +85,23 @@ def test_models(
...
@@ -79,13 +85,23 @@ def test_models(
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
with
hf_runner
(
model
,
dtype
=
dtype
)
as
hf_model
:
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
hf_outputs
=
hf_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
with
VllmRunner
(
model
,
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
:
with
VllmRunner
(
model
,
max_model_len
=
8192
,
max_model_len
=
8192
,
dtype
=
dtype
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
)
as
vllm_model
:
gpu_memory_utilization
=
0.7
,
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
block_size
=
64
)
as
vllm_model
:
max_tokens
)
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
else
:
with
VllmRunner
(
model
,
max_model_len
=
8192
,
dtype
=
dtype
,
enforce_eager
=
enforce_eager
,
gpu_memory_utilization
=
0.7
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
check_outputs_equal
(
check_outputs_equal
(
outputs_0_lst
=
hf_outputs
,
outputs_0_lst
=
hf_outputs
,
...
@@ -159,4 +175,4 @@ def test_models(
...
@@ -159,4 +175,4 @@ def test_models(
# outputs_1_lst=vllm_outputs,
# outputs_1_lst=vllm_outputs,
# name_0="hf",
# name_0="hf",
# name_1="vllm",
# name_1="vllm",
# )
# )
\ No newline at end of file
tests/basic_correctness/test_chunked_prefill.py
View file @
dbd62f84
...
@@ -21,6 +21,8 @@ from ..models.utils import check_logprobs_close, check_outputs_equal
...
@@ -21,6 +21,8 @@ from ..models.utils import check_logprobs_close, check_outputs_equal
from
..utils
import
multi_gpu_test
from
..utils
import
multi_gpu_test
import
os
import
os
from
..utils
import
models_path_prefix
from
..utils
import
models_path_prefix
from
vllm.utils
import
gpuname
import
vllm.envs
as
envs
if
TYPE_CHECKING
:
if
TYPE_CHECKING
:
from
.conftest
import
HfRunner
,
VllmRunner
from
.conftest
import
HfRunner
,
VllmRunner
...
@@ -50,7 +52,7 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch):
...
@@ -50,7 +52,7 @@ def use_v0_only(monkeypatch: pytest.MonkeyPatch):
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# NOTE: Increasing this in this suite will fail CI because we currently cannot
# reset distributed env properly. Use a value > 1 just when you test.
# reset distributed env properly. Use a value > 1 just when you test.
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"attention_backend"
,
[
"FLASHINFER"
,
"FLASH_ATTN"
]
)
@
pytest
.
mark
.
parametrize
(
"attention_backend"
,
[
"FLASHINFER"
,
"FLASH_ATTN"
]
if
not
current_platform
.
is_rocm
()
else
[
"FLASH_ATTN"
])
def
test_models
(
def
test_models
(
hf_runner
:
HfRunner
,
hf_runner
:
HfRunner
,
vllm_runner
:
VllmRunner
,
vllm_runner
:
VllmRunner
,
...
@@ -85,6 +87,7 @@ def test_models(
...
@@ -85,6 +87,7 @@ def test_models(
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
max_tokens
)
max_tokens
)
...
@@ -100,7 +103,7 @@ def test_models(
...
@@ -100,7 +103,7 @@ def test_models(
@
multi_gpu_test
(
num_gpus
=
2
)
@
multi_gpu_test
(
num_gpus
=
2
)
@
pytest
.
mark
.
parametrize
(
"distributed_executor_backend"
,
[
"ray"
,
"mp"
])
@
pytest
.
mark
.
parametrize
(
"distributed_executor_backend"
,
[
"ray"
,
"mp"
])
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"model"
,
MODELS
)
@
pytest
.
mark
.
parametrize
(
"attention_backend"
,
[
"FLASHINFER"
,
"FLASH_ATTN"
])
@
pytest
.
mark
.
parametrize
(
"attention_backend"
,
[
"FLASHINFER"
,
"FLASH_ATTN"
]
if
not
current_platform
.
is_rocm
()
else
[
"FLASH_ATTN"
]
)
def
test_models_distributed
(
def
test_models_distributed
(
hf_runner
:
HfRunner
,
hf_runner
:
HfRunner
,
vllm_runner
:
VllmRunner
,
vllm_runner
:
VllmRunner
,
...
@@ -142,6 +145,7 @@ def test_models_distributed(
...
@@ -142,6 +145,7 @@ def test_models_distributed(
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
distributed_executor_backend
=
distributed_executor_backend
,
distributed_executor_backend
=
distributed_executor_backend
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
generate_greedy
(
vllm_outputs
=
vllm_model
.
generate_greedy
(
example_prompts
,
example_prompts
,
...
@@ -267,6 +271,7 @@ def test_with_prefix_caching(
...
@@ -267,6 +271,7 @@ def test_with_prefix_caching(
tensor_parallel_size
=
tensor_parallel_size
,
tensor_parallel_size
=
tensor_parallel_size
,
enforce_eager
=
enforce_eager
,
enforce_eager
=
enforce_eager
,
max_num_seqs
=
max_num_seqs
,
max_num_seqs
=
max_num_seqs
,
block_size
=
64
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
else
16
,
)
as
vllm_model
:
)
as
vllm_model
:
outputs
[
enable
]
=
[]
outputs
[
enable
]
=
[]
for
prompt
in
full_prompts
:
for
prompt
in
full_prompts
:
...
@@ -338,4 +343,4 @@ def test_with_prefix_caching_cpu(
...
@@ -338,4 +343,4 @@ def test_with_prefix_caching_cpu(
chunk_size
,
chunk_size
,
1
,
1
,
dtype
,
dtype
,
)
)
\ No newline at end of file
tests/basic_correctness/test_cumem.py
View file @
dbd62f84
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
os
import
pytest
import
pytest
import
torch
import
torch
...
@@ -7,8 +8,7 @@ from vllm import LLM, SamplingParams
...
@@ -7,8 +8,7 @@ from vllm import LLM, SamplingParams
from
vllm.device_allocator.cumem
import
CuMemAllocator
from
vllm.device_allocator.cumem
import
CuMemAllocator
from
vllm.utils
import
GiB_bytes
from
vllm.utils
import
GiB_bytes
from
..utils
import
create_new_process_for_each_test
from
..utils
import
create_new_process_for_each_test
,
models_path_prefix
@
create_new_process_for_each_test
()
@
create_new_process_for_each_test
()
def
test_python_error
():
def
test_python_error
():
...
@@ -119,9 +119,9 @@ def test_cumem_with_cudagraph():
...
@@ -119,9 +119,9 @@ def test_cumem_with_cudagraph():
"model, use_v1"
,
"model, use_v1"
,
[
[
# sleep mode with safetensors
# sleep mode with safetensors
(
"meta-llama/Llama-3.2-1B"
,
True
),
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
,
True
),
# sleep mode with pytorch checkpoint
# sleep mode with pytorch checkpoint
(
"facebook/opt-125m"
,
False
),
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B"
)
,
False
),
])
])
def
test_end_to_end
(
monkeypatch
:
pytest
.
MonkeyPatch
,
model
:
str
,
use_v1
:
bool
):
def
test_end_to_end
(
monkeypatch
:
pytest
.
MonkeyPatch
,
model
:
str
,
use_v1
:
bool
):
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
...
@@ -175,4 +175,4 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
...
@@ -175,4 +175,4 @@ def test_end_to_end(monkeypatch: pytest.MonkeyPatch, model: str, use_v1: bool):
output3
=
llm
.
generate
(
prompt
,
sampling_params
)
output3
=
llm
.
generate
(
prompt
,
sampling_params
)
# cmp output
# cmp output
assert
output
[
0
].
outputs
[
0
].
text
==
output3
[
0
].
outputs
[
0
].
text
assert
output
[
0
].
outputs
[
0
].
text
==
output3
[
0
].
outputs
[
0
].
text
\ No newline at end of file
tests/benchmarks/test_latency_cli.py
View file @
dbd62f84
...
@@ -2,8 +2,10 @@
...
@@ -2,8 +2,10 @@
import
subprocess
import
subprocess
import
pytest
import
pytest
import
os
from
..utils
import
models_path_prefix
MODEL_NAME
=
"meta-llama/Llama-3.2-1B-Instruct"
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
@
pytest
.
mark
.
benchmark
@
pytest
.
mark
.
benchmark
...
@@ -16,4 +18,4 @@ def test_bench_latency():
...
@@ -16,4 +18,4 @@ def test_bench_latency():
print
(
result
.
stdout
)
print
(
result
.
stdout
)
print
(
result
.
stderr
)
print
(
result
.
stderr
)
assert
result
.
returncode
==
0
,
f
"Benchmark failed:
{
result
.
stderr
}
"
assert
result
.
returncode
==
0
,
f
"Benchmark failed:
{
result
.
stderr
}
"
\ No newline at end of file
tests/benchmarks/test_serve_cli.py
View file @
dbd62f84
...
@@ -2,10 +2,11 @@
...
@@ -2,10 +2,11 @@
import
subprocess
import
subprocess
import
pytest
import
pytest
import
os
from
..utils
import
RemoteOpenAIServer
from
..utils
import
RemoteOpenAIServer
,
models_path_prefix
MODEL_NAME
=
"meta-llama/Llama-3.2-1B-Instruct"
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
...
@@ -41,4 +42,4 @@ def test_bench_serve(server):
...
@@ -41,4 +42,4 @@ def test_bench_serve(server):
print
(
result
.
stdout
)
print
(
result
.
stdout
)
print
(
result
.
stderr
)
print
(
result
.
stderr
)
assert
result
.
returncode
==
0
,
f
"Benchmark failed:
{
result
.
stderr
}
"
assert
result
.
returncode
==
0
,
f
"Benchmark failed:
{
result
.
stderr
}
"
\ No newline at end of file
tests/benchmarks/test_throughput_cli.py
View file @
dbd62f84
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
import
subprocess
import
subprocess
import
os
import
pytest
import
pytest
MODEL_NAME
=
"meta-llama/Llama-3.2-1B-Instruct"
from
..utils
import
models_path_prefix
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
@
pytest
.
mark
.
benchmark
@
pytest
.
mark
.
benchmark
...
@@ -16,4 +19,4 @@ def test_bench_throughput():
...
@@ -16,4 +19,4 @@ def test_bench_throughput():
print
(
result
.
stdout
)
print
(
result
.
stdout
)
print
(
result
.
stderr
)
print
(
result
.
stderr
)
assert
result
.
returncode
==
0
,
f
"Benchmark failed:
{
result
.
stderr
}
"
assert
result
.
returncode
==
0
,
f
"Benchmark failed:
{
result
.
stderr
}
"
\ No newline at end of file
tests/test_regression.py
View file @
dbd62f84
...
@@ -13,8 +13,9 @@ import torch
...
@@ -13,8 +13,9 @@ import torch
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
utils
import
models_path_prefix
from
.
utils
import
models_path_prefix
from
vllm.utils
import
SUPPORT_TC
,
gpuname
from
vllm.utils
import
SUPPORT_TC
,
gpuname
import
vllm.envs
as
envs
@
pytest
.
mark
.
skip
(
reason
=
"In V1, we reject tokens > max_seq_len"
)
@
pytest
.
mark
.
skip
(
reason
=
"In V1, we reject tokens > max_seq_len"
)
...
@@ -37,15 +38,15 @@ def test_max_tokens_none():
...
@@ -37,15 +38,15 @@ def test_max_tokens_none():
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
top_p
=
0.1
,
top_p
=
0.1
,
max_tokens
=
None
)
max_tokens
=
None
)
if
not
gpuname
.
startswith
(
'BW'
):
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
max_num_batched_tokens
=
4096
,
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
)
tensor_parallel_size
=
1
,
block_size
=
64
)
else
:
else
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
max_num_batched_tokens
=
4096
,
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
,
tensor_parallel_size
=
1
)
block_size
=
64
)
prompts
=
[
"Just say hello!"
]
prompts
=
[
"Just say hello!"
]
outputs
=
llm
.
generate
(
prompts
,
sampling_params
=
sampling_params
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
=
sampling_params
)
...
@@ -70,10 +71,10 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
...
@@ -70,10 +71,10 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_MODELSCOPE"
,
"True"
)
m
.
setenv
(
"VLLM_USE_MODELSCOPE"
,
"True"
)
if
not
gpuname
.
startswith
(
'BW'
):
if
gpuname
.
startswith
(
'BW'
)
and
envs
.
VLLM_FLASH_ATTN_BACKEND
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
))
else
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
),
block_size
=
64
)
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
),
block_size
=
64
)
else
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
))
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
...
vllm/attention/backends/rocm_flash_attn.py
View file @
dbd62f84
...
@@ -852,8 +852,6 @@ class ROCmFlashAttentionImpl(AttentionImpl):
...
@@ -852,8 +852,6 @@ class ROCmFlashAttentionImpl(AttentionImpl):
else
:
else
:
# prefix-enabled attention -
# prefix-enabled attention -
# not applicable for encoder-only models
# not applicable for encoder-only models
# if not envs.VLLM_USE_TRITON_PREFIX_FLASH_ATTN:
# self.fa_prefix_attn_func = vllm_flash_attn_varlen_func
if
envs
.
VLLM_USE_TRITON_PREFIX_FLASH_ATTN
or
gpuname
.
startswith
(
'BW'
):
if
envs
.
VLLM_USE_TRITON_PREFIX_FLASH_ATTN
or
gpuname
.
startswith
(
'BW'
):
version_key
=
triton_key
()
version_key
=
triton_key
()
if
self
.
attn_type
!=
AttentionType
.
ENCODER_ONLY
:
if
self
.
attn_type
!=
AttentionType
.
ENCODER_ONLY
:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment