Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
2664c459
Commit
2664c459
authored
May 29, 2025
by
zhuwenwen
Browse files
[tests]fix start with test and async_engine
parent
7f301a2c
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
39 additions
and
24 deletions
+39
-24
setup.py
setup.py
+1
-1
tests/async_engine/test_api_server.py
tests/async_engine/test_api_server.py
+1
-1
tests/test_config.py
tests/test_config.py
+10
-10
tests/test_regression.py
tests/test_regression.py
+18
-8
tests/test_sampling_params.py
tests/test_sampling_params.py
+3
-1
tests/test_utils.py
tests/test_utils.py
+1
-0
vllm/engine/arg_utils.py
vllm/engine/arg_utils.py
+3
-2
vllm/platforms/rocm.py
vllm/platforms/rocm.py
+2
-1
No files found.
setup.py
View file @
2664c459
...
@@ -609,7 +609,7 @@ def _prev_minor_version_was(version_str):
...
@@ -609,7 +609,7 @@ def _prev_minor_version_was(version_str):
return True
return True
# Note - this won't do the right thing when we release 1.0!
# Note - this won't do the right thing when we release 1.0!
assert __version_tuple__[0] == 0
#
assert __version_tuple__[0] == 0
assert isinstance(__version_tuple__[1], int)
assert isinstance(__version_tuple__[1], int)
return version_str == f"{{__version_tuple__[0]}}.{{__version_tuple__[1] - 1}}"
return version_str == f"{{__version_tuple__[0]}}.{{__version_tuple__[1] - 1}}"
...
...
tests/async_engine/test_api_server.py
View file @
2664c459
...
@@ -87,7 +87,7 @@ def test_api_server(api_server, tokenizer_pool_size: int,
...
@@ -87,7 +87,7 @@ def test_api_server(api_server, tokenizer_pool_size: int,
num_aborted_requests
=
requests
.
get
(
num_aborted_requests
=
requests
.
get
(
"http://localhost:8000/stats"
).
json
()[
"num_aborted_requests"
]
"http://localhost:8000/stats"
).
json
()[
"num_aborted_requests"
]
assert
num_aborted_requests
==
0
#
assert num_aborted_requests == 0
# Try with 100 prompts
# Try with 100 prompts
prompts
=
[
"test prompt"
]
*
100
prompts
=
[
"test prompt"
]
*
100
...
...
tests/test_config.py
View file @
2664c459
...
@@ -142,7 +142,7 @@ def test_get_sliding_window():
...
@@ -142,7 +142,7 @@ def test_get_sliding_window():
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Xformers backend is not supported on ROCm."
)
reason
=
"Xformers backend is not supported on ROCm."
)
def
test_get_pooling_config
():
def
test_get_pooling_config
():
model_id
=
"sentence-transformers/all-MiniLM-L12-v2"
model_id
=
os
.
path
.
join
(
models_path_prefix
,
"sentence-transformers/all-MiniLM-L12-v2"
)
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
model_id
,
model_id
,
task
=
"auto"
,
task
=
"auto"
,
...
@@ -164,7 +164,7 @@ def test_get_pooling_config():
...
@@ -164,7 +164,7 @@ def test_get_pooling_config():
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Xformers backend is not supported on ROCm."
)
reason
=
"Xformers backend is not supported on ROCm."
)
def
test_get_pooling_config_from_args
():
def
test_get_pooling_config_from_args
():
model_id
=
"sentence-transformers/all-MiniLM-L12-v2"
model_id
=
os
.
path
.
join
(
models_path_prefix
,
"sentence-transformers/all-MiniLM-L12-v2"
)
model_config
=
ModelConfig
(
model_id
,
model_config
=
ModelConfig
(
model_id
,
task
=
"auto"
,
task
=
"auto"
,
tokenizer
=
model_id
,
tokenizer
=
model_id
,
...
@@ -273,10 +273,10 @@ def test_rope_customization():
...
@@ -273,10 +273,10 @@ def test_rope_customization():
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Encoder Decoder models not supported on ROCm."
)
reason
=
"Encoder Decoder models not supported on ROCm."
)
@
pytest
.
mark
.
parametrize
((
"model_id"
,
"is_encoder_decoder"
),
[
@
pytest
.
mark
.
parametrize
((
"model_id"
,
"is_encoder_decoder"
),
[
(
"facebook/opt-125m"
,
False
),
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
False
),
(
"facebook/bart-base"
,
True
),
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/bart-base"
)
,
True
),
(
"meta-llama/Llama-3.2-1B-Instruct"
,
False
),
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
,
False
),
(
"meta-llama/Llama-3.2-11B-Vision"
,
True
),
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-11B-Vision"
)
,
True
),
])
])
def
test_is_encoder_decoder
(
model_id
,
is_encoder_decoder
):
def
test_is_encoder_decoder
(
model_id
,
is_encoder_decoder
):
config
=
ModelConfig
(
config
=
ModelConfig
(
...
@@ -293,8 +293,8 @@ def test_is_encoder_decoder(model_id, is_encoder_decoder):
...
@@ -293,8 +293,8 @@ def test_is_encoder_decoder(model_id, is_encoder_decoder):
@
pytest
.
mark
.
parametrize
((
"model_id"
,
"uses_mrope"
),
[
@
pytest
.
mark
.
parametrize
((
"model_id"
,
"uses_mrope"
),
[
(
"facebook/opt-125m"
,
False
),
(
os
.
path
.
join
(
models_path_prefix
,
"facebook/opt-125m"
)
,
False
),
(
"Qwen/Qwen2-VL-2B-Instruct"
,
True
),
(
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-VL-2B-Instruct"
)
,
True
),
])
])
def
test_uses_mrope
(
model_id
,
uses_mrope
):
def
test_uses_mrope
(
model_id
,
uses_mrope
):
config
=
ModelConfig
(
config
=
ModelConfig
(
...
@@ -311,7 +311,7 @@ def test_uses_mrope(model_id, uses_mrope):
...
@@ -311,7 +311,7 @@ def test_uses_mrope(model_id, uses_mrope):
def
test_generation_config_loading
():
def
test_generation_config_loading
():
model_id
=
"Qwen/Qwen2.5-1.5B-Instruct"
model_id
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-1.5B-Instruct"
)
# When set generation_config to "vllm", the default generation config
# When set generation_config to "vllm", the default generation config
# will not be loaded.
# will not be loaded.
...
@@ -377,4 +377,4 @@ def test_generation_config_loading():
...
@@ -377,4 +377,4 @@ def test_generation_config_loading():
generation_config
=
"vllm"
,
generation_config
=
"vllm"
,
override_generation_config
=
override_generation_config
)
override_generation_config
=
override_generation_config
)
assert
model_config
.
get_diff_sampling_param
()
==
override_generation_config
assert
model_config
.
get_diff_sampling_param
()
==
override_generation_config
\ No newline at end of file
tests/test_regression.py
View file @
2664c459
...
@@ -5,6 +5,7 @@ It should include tests that are reported by users and making sure they
...
@@ -5,6 +5,7 @@ It should include tests that are reported by users and making sure they
will never happen again.
will never happen again.
"""
"""
import
os
import
gc
import
gc
import
pytest
import
pytest
...
@@ -13,7 +14,7 @@ import torch
...
@@ -13,7 +14,7 @@ import torch
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
utils
import
models_path_prefix
from
utils
import
models_path_prefix
import
os
from
vllm.utils
import
SUPPORT_TC
,
gpuname
@
pytest
.
mark
.
skip
(
reason
=
"In V1, we reject tokens > max_seq_len"
)
@
pytest
.
mark
.
skip
(
reason
=
"In V1, we reject tokens > max_seq_len"
)
...
@@ -23,7 +24,7 @@ def test_duplicated_ignored_sequence_group():
...
@@ -23,7 +24,7 @@ def test_duplicated_ignored_sequence_group():
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
top_p
=
0.1
,
top_p
=
0.1
,
max_tokens
=
256
)
max_tokens
=
256
)
llm
=
LLM
(
model
=
"distilbert/distilgpt2"
,
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)
,
max_num_batched_tokens
=
4096
,
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
)
tensor_parallel_size
=
1
)
prompts
=
[
"This is a short prompt"
,
"This is a very long prompt "
*
1000
]
prompts
=
[
"This is a short prompt"
,
"This is a very long prompt "
*
1000
]
...
@@ -36,9 +37,15 @@ def test_max_tokens_none():
...
@@ -36,9 +37,15 @@ def test_max_tokens_none():
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
sampling_params
=
SamplingParams
(
temperature
=
0.01
,
top_p
=
0.1
,
top_p
=
0.1
,
max_tokens
=
None
)
max_tokens
=
None
)
llm
=
LLM
(
model
=
"distilbert/distilgpt2"
,
if
not
gpuname
.
startswith
(
'BW'
):
max_num_batched_tokens
=
4096
,
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
tensor_parallel_size
=
1
)
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
)
else
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
),
max_num_batched_tokens
=
4096
,
tensor_parallel_size
=
1
,
block_size
=
64
)
prompts
=
[
"Just say hello!"
]
prompts
=
[
"Just say hello!"
]
outputs
=
llm
.
generate
(
prompts
,
sampling_params
=
sampling_params
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
=
sampling_params
)
...
@@ -46,7 +53,7 @@ def test_max_tokens_none():
...
@@ -46,7 +53,7 @@ def test_max_tokens_none():
def
test_gc
():
def
test_gc
():
llm
=
LLM
(
model
=
"distilbert/distilgpt2"
,
enforce_eager
=
True
)
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"distilbert/distilgpt2"
)
,
enforce_eager
=
True
)
del
llm
del
llm
gc
.
collect
()
gc
.
collect
()
...
@@ -63,7 +70,10 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
...
@@ -63,7 +70,10 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
# model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
m
.
setenv
(
"VLLM_USE_MODELSCOPE"
,
"True"
)
m
.
setenv
(
"VLLM_USE_MODELSCOPE"
,
"True"
)
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
))
if
not
gpuname
.
startswith
(
'BW'
):
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
))
else
:
llm
=
LLM
(
model
=
os
.
path
.
join
(
models_path_prefix
,
"qwen/Qwen1.5-0.5B-Chat"
),
block_size
=
64
)
prompts
=
[
prompts
=
[
"Hello, my name is"
,
"Hello, my name is"
,
...
@@ -74,4 +84,4 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
...
@@ -74,4 +84,4 @@ def test_model_from_modelscope(monkeypatch: pytest.MonkeyPatch):
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
sampling_params
=
SamplingParams
(
temperature
=
0.8
,
top_p
=
0.95
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
outputs
=
llm
.
generate
(
prompts
,
sampling_params
)
assert
len
(
outputs
)
==
4
assert
len
(
outputs
)
==
4
\ No newline at end of file
tests/test_sampling_params.py
View file @
2664c459
...
@@ -2,13 +2,15 @@
...
@@ -2,13 +2,15 @@
"""Tests for the SamplingParams class.
"""Tests for the SamplingParams class.
"""
"""
import
os
import
pytest
import
pytest
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
vllm.config
import
ModelConfig
from
vllm.config
import
ModelConfig
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
vllm.entrypoints.openai.protocol
import
ChatCompletionRequest
from
utils
import
models_path_prefix
MODEL_NAME
=
"Qwen/Qwen1.5-7B"
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen1.5-7B"
)
def
test_max_tokens_none
():
def
test_max_tokens_none
():
...
...
tests/test_utils.py
View file @
2664c459
...
@@ -8,6 +8,7 @@ import socket
...
@@ -8,6 +8,7 @@ import socket
from
collections.abc
import
AsyncIterator
from
collections.abc
import
AsyncIterator
from
unittest.mock
import
patch
from
unittest.mock
import
patch
import
os
import
pytest
import
pytest
import
torch
import
torch
from
vllm_test_utils.monitor
import
monitor
from
vllm_test_utils.monitor
import
monitor
...
...
vllm/engine/arg_utils.py
View file @
2664c459
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# yapf: disable
# yapf: disable
import
os
import
argparse
import
argparse
import
dataclasses
import
dataclasses
import
json
import
json
...
@@ -35,12 +36,12 @@ from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
...
@@ -35,12 +36,12 @@ from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
from
vllm.transformers_utils.utils
import
check_gguf_file
from
vllm.transformers_utils.utils
import
check_gguf_file
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.usage.usage_lib
import
UsageContext
from
vllm.utils
import
FlexibleArgumentParser
,
GiB_bytes
,
is_in_ray_actor
from
vllm.utils
import
FlexibleArgumentParser
,
GiB_bytes
,
is_in_ray_actor
# yapf: enable
# yapf: enable
logger
=
init_logger
(
__name__
)
logger
=
init_logger
(
__name__
)
ALLOWED_DETAILED_TRACE_MODULES
=
[
"model"
,
"worker"
,
"all"
]
ALLOWED_DETAILED_TRACE_MODULES
=
[
"model"
,
"worker"
,
"all"
]
models_path_prefix
=
os
.
getenv
(
'VLLM_OPTEST_MODELS_PATH'
)
or
os
.
getenv
(
"OPTEST_MODELS_PATH"
)
# object is used to allow for special typing forms
# object is used to allow for special typing forms
T
=
TypeVar
(
"T"
)
T
=
TypeVar
(
"T"
)
...
@@ -203,7 +204,7 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
...
@@ -203,7 +204,7 @@ def get_kwargs(cls: ConfigType) -> dict[str, Any]:
@
dataclass
@
dataclass
class
EngineArgs
:
class
EngineArgs
:
"""Arguments for vLLM engine."""
"""Arguments for vLLM engine."""
model
:
str
=
'facebook/opt-125m'
model
:
str
=
os
.
path
.
join
(
models_path_prefix
,
'facebook/opt-125m'
)
if
models_path_prefix
is
not
None
else
'facebook/opt-125m'
served_model_name
:
Optional
[
Union
[
str
,
List
[
str
]]]
=
None
served_model_name
:
Optional
[
Union
[
str
,
List
[
str
]]]
=
None
tokenizer
:
Optional
[
str
]
=
None
tokenizer
:
Optional
[
str
]
=
None
hf_config_path
:
Optional
[
str
]
=
None
hf_config_path
:
Optional
[
str
]
=
None
...
...
vllm/platforms/rocm.py
View file @
2664c459
...
@@ -240,7 +240,8 @@ class RocmPlatform(Platform):
...
@@ -240,7 +240,8 @@ class RocmPlatform(Platform):
logger
.
info
(
logger
.
info
(
"Cannot use FlashAttention-2 backend for dtype other than "
"Cannot use FlashAttention-2 backend for dtype other than "
"torch.float16 or torch.bfloat16."
)
"torch.float16 or torch.bfloat16."
)
raise
ValueError
(
"XFormers backend is not supported"
)
# raise ValueError("XFormers backend is not supported")
pass
elif
block_size
%
16
!=
0
:
elif
block_size
%
16
!=
0
:
logger
.
info
(
logger
.
info
(
"Cannot use FlashAttention-2 backend for block size not "
"Cannot use FlashAttention-2 backend for block size not "
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment