Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
99324e25
Commit
99324e25
authored
Jul 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.2' into v0.9.2-ori
parents
cc7f22a8
a5dd03c1
Changes
475
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
304 additions
and
181 deletions
+304
-181
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_custom_ops.py
...ummy_platform/vllm_add_dummy_platform/dummy_custom_ops.py
+20
-0
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
..._dummy_platform/vllm_add_dummy_platform/dummy_platform.py
+20
-3
tests/plugins_tests/test_platform_plugins.py
tests/plugins_tests/test_platform_plugins.py
+14
-0
tests/pplx_utils.py
tests/pplx_utils.py
+0
-123
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+2
-1
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+8
-2
tests/quantization/test_register_quantization_config.py
tests/quantization/test_register_quantization_config.py
+1
-0
tests/quantization/test_rtn.py
tests/quantization/test_rtn.py
+29
-0
tests/quantization/test_torchao.py
tests/quantization/test_torchao.py
+15
-0
tests/samplers/test_typical_acceptance_sampler.py
tests/samplers/test_typical_acceptance_sampler.py
+1
-1
tests/spec_decode/e2e/test_eagle_correctness.py
tests/spec_decode/e2e/test_eagle_correctness.py
+9
-1
tests/spec_decode/e2e/test_integration.py
tests/spec_decode/e2e/test_integration.py
+10
-1
tests/spec_decode/e2e/test_logprobs.py
tests/spec_decode/e2e/test_logprobs.py
+17
-1
tests/spec_decode/e2e/test_medusa_correctness.py
tests/spec_decode/e2e/test_medusa_correctness.py
+1
-1
tests/spec_decode/e2e/test_mlp_correctness.py
tests/spec_decode/e2e/test_mlp_correctness.py
+3
-0
tests/spec_decode/e2e/test_mtp_correctness.py
tests/spec_decode/e2e/test_mtp_correctness.py
+1
-1
tests/spec_decode/e2e/test_multistep_correctness.py
tests/spec_decode/e2e/test_multistep_correctness.py
+24
-0
tests/spec_decode/e2e/test_ngram_correctness.py
tests/spec_decode/e2e/test_ngram_correctness.py
+20
-2
tests/standalone_tests/pytorch_nightly_dependency.sh
tests/standalone_tests/pytorch_nightly_dependency.sh
+42
-0
tests/test_config.py
tests/test_config.py
+67
-44
No files found.
Too many changes to show.
To preserve performance only
475 of 475+
files are displayed.
Plain diff
Email patch
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_custom_ops.py
0 → 100644
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
torch
from
vllm.model_executor.layers.rotary_embedding
import
RotaryEmbedding
# Register CustomRotaryEmbedding to CustomOP.
@RotaryEmbedding.register_oot
class DummyRotaryEmbedding(RotaryEmbedding):
    """Dummy out-of-tree variant of ``RotaryEmbedding`` for plugin tests.

    The class is registered through ``RotaryEmbedding.register_oot`` and adds
    no computation of its own: construction and the out-of-tree forward path
    both defer to the parent class. Its only addition is the
    ``addition_config`` marker attribute set in ``__init__``.
    """

    def __init__(self, *args, **kwargs):
        """Forward all arguments to the parent, then set the marker flag."""
        super().__init__(*args, **kwargs)
        # Marker that only this subclass sets; lets callers tell it apart
        # from a plain RotaryEmbedding instance.
        self.addition_config = True

    def forward_oot(self, *args,
                    **kwargs) -> tuple[torch.Tensor, torch.Tensor]:
        """Return whatever the parent's ``forward_oot`` returns."""
        parent_result = super().forward_oot(*args, **kwargs)
        return parent_result
tests/plugins/vllm_add_dummy_platform/vllm_add_dummy_platform/dummy_platform.py
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
TYPE_CHECKING
from
vllm.platforms.
cuda
import
Cuda
Platform
from
vllm.platforms.
interface
import
Platform
,
Platform
Enum
if
TYPE_CHECKING
:
from
vllm.config
import
VllmConfig
else
:
VllmConfig
=
None
from
vllm
import
envs
class
DummyPlatform
(
CudaPlatform
):
class
DummyPlatform
(
Platform
):
_enum
=
PlatformEnum
.
OOT
device_name
=
"DummyDevice"
device_name
=
"DummyDevice"
device_type
:
str
=
"privateuseone"
dispatch_key
:
str
=
"PrivateUse1"
@
classmethod
def
check_and_update_config
(
cls
,
vllm_config
:
VllmConfig
)
->
None
:
if
envs
.
VLLM_USE_V1
:
compilation_config
=
vllm_config
.
compilation_config
# Activate custom ops for v1.
compilation_config
.
custom_ops
=
[
"all"
]
def
get_attn_backend_cls
(
self
,
backend_name
,
head_size
,
dtype
,
def
get_attn_backend_cls
(
self
,
backend_name
,
head_size
,
dtype
,
kv_cache_dtype
,
block_size
,
use_v1
,
use_mla
):
kv_cache_dtype
,
block_size
,
use_v1
,
use_mla
):
return
"vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend"
# noqa E501
return
"vllm_add_dummy_platform.dummy_attention_backend.DummyAttentionBackend"
# noqa E501
\ No newline at end of file
tests/plugins_tests/test_platform_plugins.py
View file @
99324e25
...
@@ -5,6 +5,7 @@ import pytest
...
@@ -5,6 +5,7 @@ import pytest
import
torch
import
torch
from
vllm.attention.selector
import
get_attn_backend
from
vllm.attention.selector
import
get_attn_backend
from
vllm.plugins
import
load_general_plugins
from
vllm.utils
import
STR_BACKEND_ENV_VAR
,
STR_INVALID_VAL
from
vllm.utils
import
STR_BACKEND_ENV_VAR
,
STR_INVALID_VAL
...
@@ -32,3 +33,16 @@ def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
...
@@ -32,3 +33,16 @@ def test_oot_attention_backend(monkeypatch: pytest.MonkeyPatch):
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
STR_INVALID_VAL
)
m
.
setenv
(
STR_BACKEND_ENV_VAR
,
STR_INVALID_VAL
)
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
"auto"
,
16
,
False
)
backend
=
get_attn_backend
(
16
,
torch
.
float16
,
"auto"
,
16
,
False
)
assert
backend
.
get_name
()
==
"Dummy_Backend"
assert
backend
.
get_name
()
==
"Dummy_Backend"
def test_oot_custom_op(monkeypatch: pytest.MonkeyPatch):
    """Check that building ``RotaryEmbedding`` yields the dummy custom op.

    After the general plugins are loaded, the constructed layer must report
    the class name ``DummyRotaryEmbedding`` and carry the
    ``addition_config`` attribute.
    """
    # simulate workload by running an example
    load_general_plugins()
    from vllm.model_executor.layers.rotary_embedding import RotaryEmbedding

    rope = RotaryEmbedding(16, 16, 16, 16, True, torch.float16)

    # Compare by class name so the test does not import the plugin class.
    actual_cls_name = rope.__class__.__name__
    assert actual_cls_name == "DummyRotaryEmbedding", (
        f"Expected DummyRotaryEmbedding, got {actual_cls_name}, "
        "possibly because the custom op is not registered correctly.")

    has_marker = hasattr(rope, "addition_config")
    assert has_marker, (
        "Expected DummyRotaryEmbedding to have an 'addition_config' attribute, "
        "which is set by the custom op.")
tests/pplx_utils.py
deleted
100644 → 0
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
dataclasses
import
os
import
traceback
from
typing
import
Callable
import
torch
from
torch.multiprocessing
import
(
spawn
)
# pyright: ignore[reportPrivateImportUsage]
from
typing_extensions
import
Concatenate
,
ParamSpec
P
=
ParamSpec
(
"P"
)
@dataclasses.dataclass
class ProcessGroupInfo:
    """Describes one worker's place in a launched process group.

    ``_worker_parallel_launch`` builds an instance after initialising
    ``torch.distributed`` and passes it as the first argument to the
    user-supplied worker callable.
    """
    # Total number of processes in the group.
    world_size: int
    # Number of processes on the current node.
    world_local_size: int
    # Global rank, computed as ``node_rank * world_local_size + local_rank``.
    rank: int
    # Rank of the node this process runs on.
    node_rank: int
    # Index of this process on its node; also used as the CUDA device index.
    local_rank: int
    # The ``torch.device("cuda", local_rank)`` assigned to this process.
    device: torch.device
def _worker_parallel_launch(
    local_rank: int,
    world_size: int,
    world_local_size: int,
    node_rank: int,
    init_method: str,
    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
    *args: P.args,
    **kwargs: P.kwargs,
) -> None:
    """Per-process entry point: set up the process group and run ``worker``.

    Args:
        local_rank: Index of this process on its node; selects the CUDA
            device.
        world_size: Total number of processes in the group.
        world_local_size: Number of processes on this node.
        node_rank: Rank of this node.
        init_method: Passed through to
            ``torch.distributed.init_process_group``.
        worker: Callable invoked with a ``ProcessGroupInfo`` followed by
            ``*args`` and ``**kwargs``.

    Any exception raised by ``worker`` is printed together with its
    traceback and then re-raised. The process group is destroyed whether
    or not ``worker`` succeeds.
    """
    dist = torch.distributed
    global_rank = node_rank * world_local_size + local_rank

    torch.cuda.set_device(local_rank)
    cuda_device = torch.device("cuda", local_rank)

    dist.init_process_group(
        backend="cpu:gloo,cuda:nccl",
        init_method=init_method,
        rank=global_rank,
        world_size=world_size,
        device_id=cuda_device,
    )

    # All-reduce a one-element tensor before running the worker;
    # presumably a start-up barrier across ranks.
    sync_tensor = torch.tensor([global_rank], device=cuda_device)
    dist.all_reduce(sync_tensor)

    try:
        group_info = ProcessGroupInfo(
            world_size=world_size,
            world_local_size=world_local_size,
            rank=global_rank,
            node_rank=node_rank,
            local_rank=local_rank,
            device=cuda_device,
        )
        worker(group_info, *args, **kwargs)
    except Exception as ex:
        # Print inside the child process before propagating the failure.
        print(ex)
        traceback.print_exc()
        raise
    finally:
        dist.destroy_process_group()
def parallel_launch(
    world_size: int,
    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
    *args: P.args,
    **kwargs: P.kwargs,
) -> None:
    """Spawn ``world_size`` local processes that each run ``worker``.

    All processes are treated as living on a single node (node rank 0) and
    rendezvous over ``tcp://localhost:29500``. Keyword arguments are not
    supported: only positional ``args`` are forwarded to the spawned
    processes.

    Args:
        world_size: Number of processes to spawn; also used as the local
            world size.
        worker: Callable run in every process via
            ``_worker_parallel_launch``.
        *args: Extra positional arguments forwarded to ``worker``.
    """
    assert not kwargs

    # Arguments of _worker_parallel_launch after its first one
    # (local_rank), which spawn() provides as the process index.
    launch_args = (
        world_size,  # world_size
        world_size,  # world_local_size
        0,  # node_rank
        "tcp://localhost:29500",  # init_method
        worker,
    ) + args

    spawn(
        _worker_parallel_launch,
        args=launch_args,
        nprocs=world_size,
        join=True,
    )
def parallel_launch_from_env(
    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
    *args: P.args,
    **kwargs: P.kwargs,
) -> None:
    """
    Launches a worker function in parallel across all processes in the current
    environment. The environment must have the following variables set:
    - WORLD_SIZE: The total number of processes.
    - WORLD_LOCAL_SIZE: The number of processes on the current node.
    - NODE_RANK: The rank of the current node.
    - MASTER_ADDR: The address of the master process.
    - MASTER_PORT: The port of the master process.

    Keyword arguments are not supported; only positional ``args`` are
    forwarded to ``worker``. One process is spawned per local rank, and the
    process group is initialised with the ``env://`` method.
    """
    assert not kwargs

    env = os.environ
    world_size = int(env["WORLD_SIZE"])
    world_local_size = int(env["WORLD_LOCAL_SIZE"])
    node_rank = int(env["NODE_RANK"])

    # These are not read here, only checked for presence before spawning.
    for required_var in ("MASTER_ADDR", "MASTER_PORT"):
        assert required_var in env

    # spawn() provides the leading local_rank argument itself.
    launch_args = (
        world_size,
        world_local_size,
        node_rank,
        "env://",
        worker,
    ) + args

    spawn(
        _worker_parallel_launch,
        args=launch_args,
        nprocs=world_local_size,
        join=True,
    )
tests/quantization/test_bitsandbytes.py
View file @
99324e25
...
@@ -159,8 +159,9 @@ def test_4bit_bnb_embedding_model(
...
@@ -159,8 +159,9 @@ def test_4bit_bnb_embedding_model(
with
vllm_runner
(
model_name
,
with
vllm_runner
(
model_name
,
task
=
"embed"
,
task
=
"embed"
,
dtype
=
dtype
,
dtype
=
dtype
,
gpu_memory_utilization
=
0.5
,
quantization
=
"bitsandbytes"
)
as
vllm_model
:
quantization
=
"bitsandbytes"
)
as
vllm_model
:
vllm_outputs
=
vllm_model
.
e
ncode
(
example_prompts
)
vllm_outputs
=
vllm_model
.
e
mbed
(
example_prompts
)
check_embeddings_close
(
check_embeddings_close
(
embeddings_0_lst
=
hf_outputs
,
embeddings_0_lst
=
hf_outputs
,
embeddings_1_lst
=
vllm_outputs
,
embeddings_1_lst
=
vllm_outputs
,
...
...
tests/quantization/test_compressed_tensors.py
View file @
99324e25
...
@@ -17,7 +17,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
...
@@ -17,7 +17,7 @@ from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tenso
CompressedTensorsW4A4Fp4
,
CompressedTensorsW4A16Fp4
,
CompressedTensorsW4A4Fp4
,
CompressedTensorsW4A16Fp4
,
CompressedTensorsW4A16Sparse24
,
CompressedTensorsW8A8Fp8
,
CompressedTensorsW4A16Sparse24
,
CompressedTensorsW8A8Fp8
,
CompressedTensorsW8A8Int8
,
CompressedTensorsW8A16Fp8
,
CompressedTensorsW8A8Int8
,
CompressedTensorsW8A16Fp8
,
CompressedTensorsWNA16
)
CompressedTensorsWNA16
,
cutlass_fp4_supported
)
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
sparse_cutlass_supported
)
sparse_cutlass_supported
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
@@ -667,7 +667,13 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):
...
@@ -667,7 +667,13 @@ def test_compressed_tensors_nvfp4(vllm_runner, args):
qkv_proj
=
layer
.
self_attn
.
qkv_proj
qkv_proj
=
layer
.
self_attn
.
qkv_proj
assert
isinstance
(
qkv_proj
.
quant_method
,
assert
isinstance
(
qkv_proj
.
quant_method
,
CompressedTensorsLinearMethod
)
CompressedTensorsLinearMethod
)
assert
isinstance
(
qkv_proj
.
scheme
,
scheme
)
if
isinstance
(
qkv_proj
.
scheme
,
scheme
)
or
isinstance
(
qkv_proj
.
scheme
,
CompressedTensorsW4A16Fp4
)
and
not
cutlass_fp4_supported
():
assert
True
else
:
raise
AssertionError
(
"FP4 Scheme Mismatch"
)
assert
qkv_proj
.
scheme
.
group_size
==
16
assert
qkv_proj
.
scheme
.
group_size
==
16
llm
.
apply_model
(
check_model
)
llm
.
apply_model
(
check_model
)
...
...
tests/quantization/test_register_quantization_config.py
View file @
99324e25
...
@@ -53,6 +53,7 @@ class CustomQuantConfig(QuantizationConfig):
...
@@ -53,6 +53,7 @@ class CustomQuantConfig(QuantizationConfig):
def
__init__
(
self
,
num_bits
:
int
=
8
)
->
None
:
def
__init__
(
self
,
num_bits
:
int
=
8
)
->
None
:
"""Initialize the quantization config."""
"""Initialize the quantization config."""
super
().
__init__
()
self
.
num_bits
=
num_bits
self
.
num_bits
=
num_bits
def
get_name
(
self
)
->
QuantizationMethods
:
def
get_name
(
self
)
->
QuantizationMethods
:
...
...
tests/quantization/test_rtn.py
0 → 100644
View file @
99324e25
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Copyright © 2025, Oracle and/or its affiliates.
"""Tests RTN quantization startup and generation,
doesn't test correctness
"""
import
pytest
from
tests.quantization.utils
import
is_quant_method_supported
MODELS
=
[
"microsoft/Phi-3-mini-4k-instruct"
]
@pytest.mark.skipif(not is_quant_method_supported("rtn"),
                    reason="RTN is not supported on this GPU type.")
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("dtype", ["bfloat16"])
@pytest.mark.parametrize("max_tokens", [10])
def test_model_rtn_startup(
    hf_runner,
    vllm_runner,
    example_prompts,
    model: str,
    dtype: str,
    max_tokens: int,
) -> None:
    """Smoke-test that a model starts and generates with RTN quantization.

    The generated text is not inspected; the test passes as long as loading
    and greedy generation complete without raising.
    """
    runner_kwargs = {"dtype": dtype, "quantization": "rtn"}
    with vllm_runner(model, **runner_kwargs) as llm:
        llm.generate_greedy(example_prompts, max_tokens)
tests/quantization/test_torchao.py
View file @
99324e25
...
@@ -60,5 +60,20 @@ def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
...
@@ -60,5 +60,20 @@ def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
print
(
output
)
print
(
output
)
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
def test_qwenvl_int8wo_model_loading_with_params(vllm_runner):
    """Load a torchao int8 weight-only Qwen2.5-VL checkpoint and generate.

    The test passes when greedy generation returns a non-empty result.
    """
    torch._dynamo.reset()

    model_name = "mobicham/Qwen2.5-VL-3B-Instruct_int8wo_ao"
    runner_kwargs = {
        "model_name": model_name,
        "quantization": "torchao",
        "dtype": "bfloat16",
        "pt_load_map_location": "cuda:0",
    }
    prompts = ["The capital of France is"]

    with vllm_runner(**runner_kwargs) as llm:
        output = llm.generate_greedy(prompts, max_tokens=32)

        assert output
        print(output)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
pytest
.
main
([
__file__
])
pytest
.
main
([
__file__
])
tests/samplers/test_typical_acceptance_sampler.py
View file @
99324e25
...
@@ -248,7 +248,7 @@ def test_temperature_zero_target_distribution(seed: int, device: str):
...
@@ -248,7 +248,7 @@ def test_temperature_zero_target_distribution(seed: int, device: str):
size
=
(
batch_size
,
1
),
size
=
(
batch_size
,
1
),
dtype
=
torch
.
int64
)
dtype
=
torch
.
int64
)
# The target probability distribution is a temperature zero distribution
# The target probability distribution is a temperature zero distribution
# with zero entroy. Since our draft token ids don't match the probability
# with zero entro
p
y. Since our draft token ids don't match the probability
# 1.0 tokens in the target distribution we will reject all of them and
# 1.0 tokens in the target distribution we will reject all of them and
# fallback to the greedy sampling for selecting 1 token for each sequence.
# fallback to the greedy sampling for selecting 1 token for each sequence.
# Verify the same.
# Verify the same.
...
...
tests/spec_decode/e2e/test_eagle_correctness.py
View file @
99324e25
...
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
...
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
* Test greedy equality under various number of speculative tokens.
* Test greedy equality under various number of speculative tokens.
With those tests, we can say at least, EAGLE would not break the
With those tests, we can say at least, EAGLE would not break the
correctess for the target model outputs.
correct
n
ess for the target model outputs.
"""
"""
import
pytest
import
pytest
...
@@ -370,6 +370,10 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
...
@@ -370,6 +370,10 @@ def test_llama2_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
# 2 for small prompt, 256//16 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
16
,
"max_model_len"
:
(
2
+
256
//
16
)
*
16
,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
...
@@ -420,6 +424,10 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
...
@@ -420,6 +424,10 @@ def test_llama3_eagle_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
# 2 for small prompt, 256//16 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
16
,
"max_model_len"
:
(
2
+
256
//
16
)
*
16
,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
...
...
tests/spec_decode/e2e/test_integration.py
View file @
99324e25
...
@@ -14,10 +14,13 @@ MAIN_MODEL = "JackFram/llama-68m"
...
@@ -14,10 +14,13 @@ MAIN_MODEL = "JackFram/llama-68m"
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
"model_name"
:
"JackFram/llama-68m"
,
# Verify equality when cuda graphs allowed.
# Verify equality when cuda graphs allowed.
"enforce_eager"
:
False
,
"enforce_eager"
:
False
,
"model_name"
:
"JackFram/llama-68m"
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
"per_test_common_llm_kwargs"
,
...
@@ -59,6 +62,9 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
...
@@ -59,6 +62,9 @@ def test_spec_decode_cuda_graph(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
...
@@ -117,6 +123,9 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
...
@@ -117,6 +123,9 @@ def test_speculative_model_quantization_config(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
tests/spec_decode/e2e/test_logprobs.py
View file @
99324e25
...
@@ -17,7 +17,10 @@ from .conftest import run_equality_correctness_test
...
@@ -17,7 +17,10 @@ from .conftest import run_equality_correctness_test
"model_name"
:
"JackFram/llama-160m"
,
"model_name"
:
"JackFram/llama-160m"
,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -75,6 +78,9 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
...
@@ -75,6 +78,9 @@ def test_logprobs_equality(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -128,6 +134,9 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
...
@@ -128,6 +134,9 @@ def test_logprobs_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -182,6 +191,9 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
...
@@ -182,6 +191,9 @@ def test_logprobs_when_skip_speculation(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -256,8 +268,12 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
...
@@ -256,8 +268,12 @@ def test_logprobs_temp_1(vllm_runner, common_llm_kwargs,
"common_llm_kwargs"
,
"common_llm_kwargs"
,
[{
[{
"model_name"
:
"JackFram/llama-160m"
,
"model_name"
:
"JackFram/llama-160m"
,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
tests/spec_decode/e2e/test_medusa_correctness.py
View file @
99324e25
...
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
...
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
* Test greedy equality under various number of speculative tokens.
* Test greedy equality under various number of speculative tokens.
With those tests, we can say at least, Medusa would not break the
With those tests, we can say at least, Medusa would not break the
correctess for the target model outputs.
correct
n
ess for the target model outputs.
"""
"""
import
pytest
import
pytest
...
...
tests/spec_decode/e2e/test_mlp_correctness.py
View file @
99324e25
...
@@ -494,6 +494,9 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
...
@@ -494,6 +494,9 @@ def test_mlp_disable_queue(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# Precision
"dtype"
:
PRECISION
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
tests/spec_decode/e2e/test_mtp_correctness.py
View file @
99324e25
...
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
...
@@ -18,7 +18,7 @@ However, we still need to verify below scenario could be passed:
* Test greedy equality under various number of speculative tokens.
* Test greedy equality under various number of speculative tokens.
With those tests, we can say at least, mtp would not break the
With those tests, we can say at least, mtp would not break the
correctess for the target model outputs.
correct
n
ess for the target model outputs.
"""
"""
import
pytest
import
pytest
...
...
tests/spec_decode/e2e/test_multistep_correctness.py
View file @
99324e25
...
@@ -57,6 +57,9 @@ from .conftest import (get_output_from_llm_generator,
...
@@ -57,6 +57,9 @@ from .conftest import (get_output_from_llm_generator,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
"per_test_common_llm_kwargs"
,
...
@@ -139,6 +142,9 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
...
@@ -139,6 +142,9 @@ def test_spec_decode_e2e_with_detokenization(test_llm_generator,
# Print spec metrics.
# Print spec metrics.
"disable_log_stats"
:
False
,
"disable_log_stats"
:
False
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
"per_test_common_llm_kwargs"
,
...
@@ -216,6 +222,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
...
@@ -216,6 +222,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_bs1(
# Print spec metrics.
# Print spec metrics.
"disable_log_stats"
:
False
,
"disable_log_stats"
:
False
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
"per_test_common_llm_kwargs"
,
...
@@ -279,6 +288,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
...
@@ -279,6 +288,9 @@ def test_spec_decode_e2e_greedy_correctness_tiny_model_large_bs(
[{
[{
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
"per_test_common_llm_kwargs"
,
...
@@ -464,6 +476,8 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
...
@@ -464,6 +476,8 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
{
{
...
@@ -523,6 +537,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
...
@@ -523,6 +537,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
"per_test_common_llm_kwargs"
,
...
@@ -589,6 +605,8 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
...
@@ -589,6 +605,8 @@ def test_spec_decode_different_block_size(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -655,6 +673,8 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
...
@@ -655,6 +673,8 @@ def test_skip_speculation(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -706,6 +726,8 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
...
@@ -706,6 +726,8 @@ def test_disable_speculation(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -763,6 +785,8 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
...
@@ -763,6 +785,8 @@ def test_many_k(vllm_runner, common_llm_kwargs, per_test_common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
tests/spec_decode/e2e/test_ngram_correctness.py
View file @
99324e25
...
@@ -22,8 +22,8 @@ However, we still need to verify below scenario could be passed:
...
@@ -22,8 +22,8 @@ However, we still need to verify below scenario could be passed:
* Test greedy equality under preemption
* Test greedy equality under preemption
* Test greedy equality under various ngram sizes / speculative sizes
* Test greedy equality under various ngram sizes / speculative sizes
With those tests, we can say at least, ngram spec would not break the
correctess
With those tests, we can say at least, ngram spec would not break the
for the target model outputs.
correctness
for the target model outputs.
"""
"""
import
pytest
import
pytest
...
@@ -40,6 +40,9 @@ from .conftest import run_equality_correctness_test
...
@@ -40,6 +40,9 @@ from .conftest import run_equality_correctness_test
# Print spec metrics.
# Print spec metrics.
"disable_log_stats"
:
False
,
"disable_log_stats"
:
False
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
{
{
...
@@ -97,6 +100,9 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
...
@@ -97,6 +100,9 @@ def test_ngram_e2e_greedy_correctness(vllm_runner, common_llm_kwargs,
# Print spec metrics.
# Print spec metrics.
"disable_log_stats"
:
False
,
"disable_log_stats"
:
False
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
{
{
...
@@ -160,6 +166,9 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
...
@@ -160,6 +166,9 @@ def test_ngram_e2e_greedy_logprobs(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
{
{
...
@@ -221,6 +230,9 @@ def test_ngram_e2e_greedy_correctness_with_preemption(
...
@@ -221,6 +230,9 @@ def test_ngram_e2e_greedy_correctness_with_preemption(
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -281,6 +293,9 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
...
@@ -281,6 +293,9 @@ def test_ngram_different_k(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
@@ -337,6 +352,9 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
...
@@ -337,6 +352,9 @@ def test_ngram_disable_queue(vllm_runner, common_llm_kwargs,
# Skip cuda graph recording for fast test.
# Skip cuda graph recording for fast test.
"enforce_eager"
:
True
,
"enforce_eager"
:
True
,
# The original model is float32, keep it for numerical stability.
"dtype"
:
"float32"
,
}])
}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
@
pytest
.
mark
.
parametrize
(
"baseline_llm_kwargs"
,
[{}])
...
...
tests/standalone_tests/pytorch_nightly_dependency.sh
0 → 100644
View file @
99324e25
#!/bin/sh
# This script tests if the nightly torch packages are not overridden by the dependencies
set
-e
set
-x
cd
/vllm-workspace/
rm
-rf
.venv
uv venv .venv
source
.venv/bin/activate
# check the environment
uv pip freeze
echo
">>> Installing nightly torch packages"
uv pip
install
--quiet
torch torchvision torchaudio
--pre
--extra-index-url
https://download.pytorch.org/whl/nightly/cu128
echo
">>> Capturing torch-related versions before requirements install"
uv pip freeze |
grep
-E
'^torch|^torchvision|^torchaudio'
|
sort
>
before.txt
echo
"Before:"
cat
before.txt
echo
">>> Installing requirements/nightly_torch_test.txt"
uv pip
install
--quiet
-r
requirements/nightly_torch_test.txt
echo
">>> Capturing torch-related versions after requirements install"
uv pip freeze |
grep
-E
'^torch|^torchvision|^torchaudio'
|
sort
>
after.txt
echo
"After:"
cat
after.txt
echo
">>> Comparing versions"
if
diff before.txt after.txt
;
then
echo
"torch version not overridden."
else
echo
"torch version overridden by nightly_torch_test.txt,
\
if the dependency is not triggered by the pytroch nightly test,
\
please add the dependency to the list 'white_list' in tools/generate_nightly_torch_test.py"
exit
1
fi
tests/test_config.py
View file @
99324e25
...
@@ -2,49 +2,16 @@
...
@@ -2,49 +2,16 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
MISSING
,
Field
,
asdict
,
dataclass
,
field
from
dataclasses
import
MISSING
,
Field
,
asdict
,
dataclass
,
field
from
typing
import
Literal
,
Union
import
pytest
import
pytest
from
vllm.compilation.backends
import
VllmBackend
from
vllm.compilation.backends
import
VllmBackend
from
vllm.config
import
(
LoadConfig
,
ModelConfig
,
PoolerConfig
,
VllmConfig
,
from
vllm.config
import
(
LoadConfig
,
ModelConfig
,
PoolerConfig
,
VllmConfig
,
config
,
get_field
)
get_field
)
from
vllm.model_executor.layers.pooler
import
PoolingType
from
vllm.model_executor.layers.pooler
import
PoolingType
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
class
TestConfig1
:
pass
@
dataclass
class
TestConfig2
:
a
:
int
"""docstring"""
@
dataclass
class
TestConfig3
:
a
:
int
=
1
@
dataclass
class
TestConfig4
:
a
:
Union
[
Literal
[
1
],
Literal
[
2
]]
=
1
"""docstring"""
@
pytest
.
mark
.
parametrize
((
"test_config"
,
"expected_error"
),
[
(
TestConfig1
,
"must be a dataclass"
),
(
TestConfig2
,
"must have a default"
),
(
TestConfig3
,
"must have a docstring"
),
(
TestConfig4
,
"must use a single Literal"
),
])
def
test_config
(
test_config
,
expected_error
):
with
pytest
.
raises
(
Exception
,
match
=
expected_error
):
config
(
test_config
)
def
test_compile_config_repr_succeeds
():
def
test_compile_config_repr_succeeds
():
# setup: VllmBackend mutates the config object
# setup: VllmBackend mutates the config object
config
=
VllmConfig
()
config
=
VllmConfig
()
...
@@ -57,23 +24,23 @@ def test_compile_config_repr_succeeds():
...
@@ -57,23 +24,23 @@ def test_compile_config_repr_succeeds():
assert
'inductor_passes'
in
val
assert
'inductor_passes'
in
val
def
test_get_field
():
@
dataclass
class
_TestConfigFields
:
a
:
int
b
:
dict
=
field
(
default_factory
=
dict
)
c
:
str
=
"default"
@
dataclass
class
TestConfig
:
a
:
int
b
:
dict
=
field
(
default_factory
=
dict
)
c
:
str
=
"default"
def
test_get_field
():
with
pytest
.
raises
(
ValueError
):
with
pytest
.
raises
(
ValueError
):
get_field
(
TestConfig
,
"a"
)
get_field
(
_
TestConfig
Fields
,
"a"
)
b
=
get_field
(
TestConfig
,
"b"
)
b
=
get_field
(
_
TestConfig
Fields
,
"b"
)
assert
isinstance
(
b
,
Field
)
assert
isinstance
(
b
,
Field
)
assert
b
.
default
is
MISSING
assert
b
.
default
is
MISSING
assert
b
.
default_factory
is
dict
assert
b
.
default_factory
is
dict
c
=
get_field
(
TestConfig
,
"c"
)
c
=
get_field
(
_
TestConfig
Fields
,
"c"
)
assert
isinstance
(
c
,
Field
)
assert
isinstance
(
c
,
Field
)
assert
c
.
default
==
"default"
assert
c
.
default
==
"default"
assert
c
.
default_factory
is
MISSING
assert
c
.
default_factory
is
MISSING
...
@@ -85,7 +52,7 @@ def test_get_field():
...
@@ -85,7 +52,7 @@ def test_get_field():
(
"distilbert/distilgpt2"
,
"generate"
,
"generate"
),
(
"distilbert/distilgpt2"
,
"generate"
,
"generate"
),
(
"intfloat/multilingual-e5-small"
,
"pooling"
,
"embed"
),
(
"intfloat/multilingual-e5-small"
,
"pooling"
,
"embed"
),
(
"jason9693/Qwen2.5-1.5B-apeach"
,
"pooling"
,
"classify"
),
(
"jason9693/Qwen2.5-1.5B-apeach"
,
"pooling"
,
"classify"
),
(
"cross-encoder/ms-marco-MiniLM-L-6-v2"
,
"pooling"
,
"
score
"
),
(
"cross-encoder/ms-marco-MiniLM-L-6-v2"
,
"pooling"
,
"
classify
"
),
(
"Qwen/Qwen2.5-Math-RM-72B"
,
"pooling"
,
"reward"
),
(
"Qwen/Qwen2.5-Math-RM-72B"
,
"pooling"
,
"reward"
),
(
"openai/whisper-small"
,
"transcription"
,
"transcription"
),
(
"openai/whisper-small"
,
"transcription"
,
"transcription"
),
],
],
...
@@ -105,6 +72,32 @@ def test_auto_task(model_id, expected_runner_type, expected_task):
...
@@ -105,6 +72,32 @@ def test_auto_task(model_id, expected_runner_type, expected_task):
assert
config
.
task
==
expected_task
assert
config
.
task
==
expected_task
@
pytest
.
mark
.
parametrize
(
(
"model_id"
,
"expected_runner_type"
,
"expected_task"
),
[
(
"distilbert/distilgpt2"
,
"pooling"
,
"embed"
),
(
"intfloat/multilingual-e5-small"
,
"pooling"
,
"embed"
),
(
"jason9693/Qwen2.5-1.5B-apeach"
,
"pooling"
,
"classify"
),
(
"cross-encoder/ms-marco-MiniLM-L-6-v2"
,
"pooling"
,
"classify"
),
(
"Qwen/Qwen2.5-Math-RM-72B"
,
"pooling"
,
"embed"
),
(
"openai/whisper-small"
,
"pooling"
,
"embed"
),
],
)
def
test_score_task
(
model_id
,
expected_runner_type
,
expected_task
):
config
=
ModelConfig
(
model_id
,
task
=
"score"
,
tokenizer
=
model_id
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
seed
=
0
,
dtype
=
"float16"
,
)
assert
config
.
runner_type
==
expected_runner_type
assert
config
.
task
==
expected_task
@
pytest
.
mark
.
parametrize
((
"model_id"
,
"bad_task"
),
[
@
pytest
.
mark
.
parametrize
((
"model_id"
,
"bad_task"
),
[
(
"Qwen/Qwen2.5-Math-RM-72B"
,
"generate"
),
(
"Qwen/Qwen2.5-Math-RM-72B"
,
"generate"
),
])
])
...
@@ -438,3 +431,33 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
...
@@ -438,3 +431,33 @@ def test_load_config_pt_load_map_location(pt_load_map_location):
config
=
VllmConfig
(
load_config
=
load_config
)
config
=
VllmConfig
(
load_config
=
load_config
)
assert
config
.
load_config
.
pt_load_map_location
==
pt_load_map_location
assert
config
.
load_config
.
pt_load_map_location
==
pt_load_map_location
@
pytest
.
mark
.
parametrize
(
(
"model_id"
,
"max_model_len"
,
"expected_max_len"
,
"should_raise"
),
[
(
"BAAI/bge-reranker-base"
,
None
,
512
,
False
),
(
"BAAI/bge-reranker-base"
,
256
,
256
,
False
),
(
"BAAI/bge-reranker-base"
,
513
,
512
,
True
),
(
"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
,
None
,
131072
,
False
),
(
"deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"
,
131073
,
131072
,
True
),
])
def
test_get_and_verify_max_len
(
model_id
,
max_model_len
,
expected_max_len
,
should_raise
):
"""Test get_and_verify_max_len with different configurations."""
model_config
=
ModelConfig
(
model_id
,
task
=
"auto"
,
tokenizer
=
model_id
,
tokenizer_mode
=
"auto"
,
trust_remote_code
=
False
,
seed
=
0
,
dtype
=
"float16"
,
revision
=
None
,
)
if
should_raise
:
with
pytest
.
raises
(
ValueError
):
model_config
.
get_and_verify_max_len
(
max_model_len
)
else
:
actual_max_len
=
model_config
.
get_and_verify_max_len
(
max_model_len
)
assert
actual_max_len
==
expected_max_len
Prev
1
…
16
17
18
19
20
21
22
23
24
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment