Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7a985548
Commit
7a985548
authored
May 22, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.0' into v0.9.0-ori
parents
45d3785c
dc1440cf
Changes
486
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
649 additions
and
34 deletions
+649
-34
tests/multimodal/test_utils.py
tests/multimodal/test_utils.py
+17
-0
tests/multimodal/test_video.py
tests/multimodal/test_video.py
+41
-0
tests/neuron/1_core/test_neuron_model_runner.py
tests/neuron/1_core/test_neuron_model_runner.py
+126
-0
tests/neuron/1_core/test_rotary_embedding.py
tests/neuron/1_core/test_rotary_embedding.py
+21
-12
tests/neuron/2_core/test_mistral.py
tests/neuron/2_core/test_mistral.py
+32
-0
tests/plugins/lora_resolvers/__init__.py
tests/plugins/lora_resolvers/__init__.py
+0
-0
tests/plugins/lora_resolvers/test_filesystem_resolver.py
tests/plugins/lora_resolvers/test_filesystem_resolver.py
+65
-0
tests/quantization/test_auto_round.py
tests/quantization/test_auto_round.py
+30
-0
tests/quantization/test_bitsandbytes.py
tests/quantization/test_bitsandbytes.py
+64
-2
tests/quantization/test_compressed_tensors.py
tests/quantization/test_compressed_tensors.py
+23
-3
tests/quantization/test_quark.py
tests/quantization/test_quark.py
+26
-0
tests/quantization/test_register_quantization_config.py
tests/quantization/test_register_quantization_config.py
+2
-2
tests/quantization/test_torchao.py
tests/quantization/test_torchao.py
+38
-0
tests/reasoning/test_qwen3_reasoning_parser.py
tests/reasoning/test_qwen3_reasoning_parser.py
+141
-0
tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
...i_model_streamer_test/test_runai_model_streamer_loader.py
+2
-3
tests/samplers/test_rejection_sampler.py
tests/samplers/test_rejection_sampler.py
+12
-2
tests/samplers/test_sampler.py
tests/samplers/test_sampler.py
+3
-1
tests/spec_decode/e2e/test_medusa_correctness.py
tests/spec_decode/e2e/test_medusa_correctness.py
+1
-1
tests/spec_decode/e2e/test_mlp_correctness.py
tests/spec_decode/e2e/test_mlp_correctness.py
+2
-2
tests/spec_decode/e2e/test_multistep_correctness.py
tests/spec_decode/e2e/test_multistep_correctness.py
+3
-6
No files found.
Too many changes to show.
To preserve performance only
486 of 486+
files are displayed.
Plain diff
Email patch
tests/multimodal/test_utils.py
View file @
7a985548
...
...
@@ -26,6 +26,11 @@ TEST_IMAGE_URLS = [
"https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png"
,
]
TEST_VIDEO_URLS
=
[
"https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
,
"https://filesamples.com/samples/video/avi/sample_640x360.avi"
,
]
@
pytest
.
fixture
(
scope
=
"module"
)
def
url_images
()
->
dict
[
str
,
Image
.
Image
]:
...
...
@@ -134,6 +139,18 @@ async def test_fetch_image_local_files(image_url: str):
f
"file://
{
temp_dir
}
/../
{
os
.
path
.
basename
(
image_url
)
}
"
)
@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_fetch_video_http(video_url: str, num_frames: int):
    """Fetch the same video synchronously and asynchronously and compare.

    Both results must be element-wise equal according to ``np.array_equal``.
    """
    media_connector = MediaConnector()
    fetch_kwargs = {"num_frames": num_frames}

    frames_sync = media_connector.fetch_video(video_url, **fetch_kwargs)
    frames_async = await media_connector.fetch_video_async(
        video_url, **fetch_kwargs)

    assert np.array_equal(frames_sync, frames_async)
# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
class
TestCase
(
NamedTuple
):
mm_positions
:
"MultiModalPlaceholderDict"
...
...
tests/multimodal/test_video.py
0 → 100644
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
import
numpy
as
np
import
numpy.typing
as
npt
import
pytest
from
vllm.multimodal.video
import
VIDEO_LOADER_REGISTRY
,
VideoLoader
NUM_FRAMES
=
10
FAKE_OUTPUT_1
=
np
.
random
.
rand
(
NUM_FRAMES
,
1280
,
720
,
3
)
FAKE_OUTPUT_2
=
np
.
random
.
rand
(
NUM_FRAMES
,
1280
,
720
,
3
)
@VIDEO_LOADER_REGISTRY.register("test_video_loader_1")
class TestVideoLoader1(VideoLoader):
    """Stub video loader registered under the key ``test_video_loader_1``."""

    @classmethod
    def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray:
        """Return the canned ``FAKE_OUTPUT_1`` array; both inputs are unused."""
        canned_frames = FAKE_OUTPUT_1
        return canned_frames
@VIDEO_LOADER_REGISTRY.register("test_video_loader_2")
class TestVideoLoader2(VideoLoader):
    """Stub video loader registered under the key ``test_video_loader_2``."""

    @classmethod
    def load_bytes(cls, data: bytes, num_frames: int = -1) -> npt.NDArray:
        """Return the canned ``FAKE_OUTPUT_2`` array; both inputs are unused."""
        canned_frames = FAKE_OUTPUT_2
        return canned_frames
def test_video_loader_registry():
    """Each registered stub loader must be retrievable by its key.

    The loader returned by ``VIDEO_LOADER_REGISTRY.load`` is invoked with a
    dummy payload and its result is compared against the matching canned
    array.
    """
    expected_by_key = (
        ("test_video_loader_1", FAKE_OUTPUT_1),
        ("test_video_loader_2", FAKE_OUTPUT_2),
    )
    for loader_key, expected_frames in expected_by_key:
        loader = VIDEO_LOADER_REGISTRY.load(loader_key)
        frames = loader.load_bytes(b"test")
        np.testing.assert_array_equal(frames, expected_frames)
def test_video_loader_type_doesnt_exist():
    """Loading a key that was never registered must raise AssertionError."""
    unregistered_key = "non_existing_video_loader"
    with pytest.raises(AssertionError):
        VIDEO_LOADER_REGISTRY.load(unregistered_key)
tests/neuron/1_core/test_neuron_model_runner.py
0 → 100644
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
import
os
from
unittest.mock
import
MagicMock
from
vllm.config
import
VllmConfig
from
vllm.engine.arg_utils
import
EngineArgs
from
vllm.platforms
import
current_platform
from
vllm.platforms.neuron
import
NeuronFramework
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
SequenceData
,
SequenceGroupMetadata
from
vllm.worker.neuron_model_runner
import
NeuronModelRunner
os
.
environ
[
'VLLM_NEURON_FRAMEWORK'
]
=
NeuronFramework
.
TRANSFORMERS_NEURONX
.
value
def _create_neuron_model_runner(model: str, *args,
                                **kwargs) -> NeuronModelRunner:
    """Build a ``NeuronModelRunner`` for ``model``.

    ``model`` plus any extra positional/keyword arguments are forwarded to
    ``EngineArgs``. Only the model, parallel, scheduler and device sub-configs
    of the resulting engine config are copied into the ``VllmConfig`` handed
    to the runner.
    """
    full_config = EngineArgs(model, *args, **kwargs).create_engine_config()
    runner_config = VllmConfig(
        model_config=full_config.model_config,
        parallel_config=full_config.parallel_config,
        scheduler_config=full_config.scheduler_config,
        device_config=full_config.device_config,
    )
    return NeuronModelRunner(vllm_config=runner_config)
def test_update_neuron_sampling_params_not_full_batch():
    """One sequence in a batch of two: the unused slot keeps defaults."""
    os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"

    runner_kwargs = dict(seed=0, dtype="float16", max_num_seqs=2)
    model_runner = _create_neuron_model_runner("facebook/opt-125m",
                                               **runner_kwargs)
    assert not model_runner._on_device_sampling_disabled

    # Sampling-parameter updating is only checked when TNx is the framework;
    # NxDI handles sampling parameter updating inside the model.
    if not current_platform.use_transformers_neuronx():
        return

    model_mock = MagicMock()
    model_runner.model = model_mock

    only_group = SequenceGroupMetadata(
        request_id="test_0",
        is_prompt=True,
        seq_data={0: SequenceData.from_seqs([1, 2, 3])},
        sampling_params=SamplingParams(temperature=0.5, top_k=1, top_p=0.5),
        block_tables={0: [1]},
    )
    model_runner.prepare_model_input([only_group])

    # Neuron sampling parameters are indexed by block_tables indices.
    # The first block_id of sequence 0 is 1, so its parameters are placed
    # at index 1. The expected layout is therefore:
    #   Index 0: default sampling parameters
    #   Index 1: sequence 0's sampling parameters
    neuron_sampling_params = model_runner.model_config.neuron_sampling_params
    default_top_k = model_runner._MAX_NEURON_SAMPLING_TOP_K

    assert neuron_sampling_params.temperature == [1.0, 0.5]
    assert neuron_sampling_params.top_k == [default_top_k, 1]
    assert neuron_sampling_params.top_p == [1.0, 0.5]

    update_config = model_mock.model.update_generation_config
    update_config.assert_called_once_with(neuron_sampling_params)
def test_update_neuron_sampling_params_full_batch():
    """Two sequences fill a batch of two: both slots get request parameters.

    Each sequence's sampling parameters are expected at the index given by
    the first block id in its block table.
    """
    os.environ["NEURON_ON_DEVICE_SAMPLING_DISABLED"] = "0"

    model_runner = _create_neuron_model_runner(
        "facebook/opt-125m",
        seed=0,
        dtype="float16",
        max_num_seqs=2,
    )
    assert not model_runner._on_device_sampling_disabled

    # Test sampling param updating only when TNx is framework
    # NxDI handles sampling parameter updating inside model
    if current_platform.use_transformers_neuronx():
        model_mock = MagicMock()
        model_runner.model = model_mock

        seq_group_metadata_list = [
            SequenceGroupMetadata(
                request_id="test_0",
                is_prompt=True,
                seq_data={0: SequenceData.from_seqs([1, 2, 3])},
                sampling_params=SamplingParams(temperature=0.5,
                                               top_k=1,
                                               top_p=0.5),
                block_tables={0: [1]},
            ),
            SequenceGroupMetadata(
                # Distinct id: the two groups model two separate requests
                # (the original reused "test_0" for both).
                request_id="test_1",
                is_prompt=True,
                seq_data={1: SequenceData.from_seqs([4, 5, 6])},
                sampling_params=SamplingParams(temperature=0.2,
                                               top_k=2,
                                               top_p=0.2),
                block_tables={1: [0]},
            ),
        ]

        model_runner.prepare_model_input(seq_group_metadata_list)

        # Index neuron sampling parameters based on block_tables indices.
        # The first block_id of sequence 0 is 1, so its parameters are
        # placed at index 1; sequence 1's first block_id is 0. So the
        # sampling parameters will be:
        #   Index 0: sequence 1's sampling parameters
        #   Index 1: sequence 0's sampling parameters
        neuron_sampling_params = (
            model_runner.model_config.neuron_sampling_params)
        assert neuron_sampling_params.temperature == [0.2, 0.5]
        assert neuron_sampling_params.top_k == [2, 1]
        assert neuron_sampling_params.top_p == [0.2, 0.5]
        model_mock.model.update_generation_config.assert_called_once_with(
            neuron_sampling_params)
tests/neuron/1_core/test_rotary_embedding.py
View file @
7a985548
...
...
@@ -11,14 +11,16 @@ from vllm.platforms import current_platform
@
pytest
.
mark
.
parametrize
(
"max_position,is_neox_style,rotary_dim,head_size,seq_len"
,
[
(
16
,
False
,
32
,
32
,
1024
),
(
16
,
False
,
32
,
128
,
1024
),
(
16
,
True
,
32
,
32
,
1024
),
(
16
,
True
,
32
,
128
,
1024
),
"max_position,is_neox_style,rotary_dim,head_size,seq_len,use_key"
,
[
(
16
,
False
,
32
,
32
,
1024
,
True
),
(
16
,
False
,
32
,
128
,
1024
,
True
),
(
16
,
True
,
32
,
32
,
1024
,
True
),
(
16
,
True
,
32
,
128
,
1024
,
True
),
(
16
,
False
,
32
,
128
,
1024
,
False
),
(
16
,
True
,
32
,
128
,
1024
,
False
),
])
def
test_rotary_embedding_opcheck
(
max_position
,
is_neox_style
,
rotary_dim
,
head_size
,
seq_len
):
head_size
,
seq_len
,
use_key
):
import
torch_xla.core.xla_model
as
xm
device
=
xm
.
xla_device
()
...
...
@@ -40,19 +42,26 @@ def test_rotary_embedding_opcheck(max_position, is_neox_style, rotary_dim,
num_heads
*
head_size
,
dtype
=
torch
.
float32
,
device
=
"cpu"
)
key
=
torch
.
randn_like
(
query
)
key
=
torch
.
randn_like
(
query
)
if
use_key
else
None
assert
positions
.
is_cpu
,
\
"reference input tensor is expected to be CPU tensor."
ref_query
,
ref_key
=
rot
.
to
(
device
=
"cpu"
).
forward_native
(
positions
,
query
,
key
)
out_query
,
out_key
=
rot
.
to
(
device
=
device
).
forward_neuron
(
positions
.
to
(
device
=
device
),
query
.
to
(
device
=
device
),
key
.
to
(
device
=
device
))
assert
out_query
.
is_xla
and
out_key
.
is_xla
,
\
"output tensor is expected to be XLA tensor"
key
.
to
(
device
=
device
)
if
key
is
not
None
else
None
)
if
use_key
:
assert
out_query
.
is_xla
and
out_key
.
is_xla
,
\
"output tensor is expected to be XLA tensor"
torch
.
testing
.
assert_close
(
out_key
.
cpu
(),
ref_key
,
atol
=
1e-2
,
rtol
=
1e-2
)
else
:
assert
out_key
is
None
,
"expected returned key to be None"
assert
out_query
.
is_xla
,
\
"output tensor is expected to be XLA tensor"
torch
.
testing
.
assert_close
(
out_query
.
cpu
(),
ref_query
,
atol
=
1e-2
,
rtol
=
1e-2
)
torch
.
testing
.
assert_close
(
out_key
.
cpu
(),
ref_key
,
atol
=
1e-2
,
rtol
=
1e-2
)
tests/neuron/2_core/test_mistral.py
0 → 100644
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
from
vllm
import
LLM
,
SamplingParams
def test_mistral():
    """Generate from Mistral-7B on the neuron device and compare exact text.

    Two prompts are generated with ``top_k=1`` and each completion is
    compared verbatim against a recorded expected string.
    """
    llm = LLM(model="mistralai/Mistral-7B-v0.1",
              tensor_parallel_size=2,
              max_num_seqs=4,
              max_model_len=512,
              use_v2_block_manager=True,
              override_neuron_config={
                  "sequence_parallel_enabled": False,
                  "skip_warmup": True
              },
              device="neuron")

    prompts = [
        "The president of the United States is",
        "The capital of France is",
    ]
    outputs = llm.generate(prompts, SamplingParams(top_k=1))

    expected_outputs = [
        " the most powerful person in the world. He is the head of state "
        "and head",
        " a city of many faces. It is a city of history, culture, art"
    ]

    # zip() stops at the shorter input, so without this check a missing
    # completion would let the loop below pass without comparing anything.
    assert len(outputs) == len(expected_outputs)

    for expected_output, output in zip(expected_outputs, outputs):
        generated_text = output.outputs[0].text
        assert (expected_output == generated_text)
tests/
models/embedding/vision_language
/__init__.py
→
tests/
plugins/lora_resolvers
/__init__.py
View file @
7a985548
File moved
tests/plugins/lora_resolvers/test_filesystem_resolver.py
0 → 100644
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
import
os
import
shutil
import
pytest
from
huggingface_hub
import
snapshot_download
from
vllm.plugins.lora_resolvers.filesystem_resolver
import
FilesystemResolver
MODEL_NAME
=
"mistralai/Mistral-7B-v0.1"
LORA_NAME
=
"typeof/zephyr-7b-beta-lora"
PA_NAME
=
"swapnilbp/llama_tweet_ptune"
@pytest.fixture(scope='module')
def adapter_cache(request, tmpdir_factory):
    """Return the path ``<module tmp dir>/adapter_cache`` for this module.

    Only the parent temp directory is made here (via ``mktemp``); the
    ``adapter_cache`` path is joined on top of it and mimics the layout of
    the adapter cache.
    """
    module_tmp_dir = tmpdir_factory.mktemp(request.module.__name__)
    return module_tmp_dir / "adapter_cache"
@pytest.fixture(scope="module")
def zephyr_lora_files():
    """Download the ``LORA_NAME`` repo snapshot once per module.

    Returns whatever ``snapshot_download`` returns for that repo.
    """
    snapshot_path = snapshot_download(repo_id=LORA_NAME)
    return snapshot_path
@pytest.fixture(scope="module")
def pa_files():
    """Download the ``PA_NAME`` repo snapshot once per module.

    Returns whatever ``snapshot_download`` returns for that repo.
    """
    snapshot_path = snapshot_download(repo_id=PA_NAME)
    return snapshot_path
@pytest.mark.asyncio
async def test_filesystem_resolver(adapter_cache, zephyr_lora_files):
    """A LoRA adapter copied into the cache must resolve by name."""
    # Place the downloaded adapter at <adapter_cache>/<LORA_NAME>.
    shutil.copytree(zephyr_lora_files, adapter_cache / LORA_NAME)

    fs_resolver = FilesystemResolver(adapter_cache)
    assert fs_resolver is not None

    lora_request = await fs_resolver.resolve_lora(MODEL_NAME, LORA_NAME)
    assert lora_request is not None
    assert lora_request.lora_name == LORA_NAME

    expected_path = os.path.join(adapter_cache, LORA_NAME)
    assert lora_request.lora_path == expected_path
@pytest.mark.asyncio
async def test_missing_adapter(adapter_cache):
    """Resolving a name with no files in the cache must yield ``None``."""
    fs_resolver = FilesystemResolver(adapter_cache)
    assert fs_resolver is not None

    unknown_adapter = "foobar"
    missing_lora_request = await fs_resolver.resolve_lora(
        MODEL_NAME, unknown_adapter)
    assert missing_lora_request is None
@pytest.mark.asyncio
async def test_nonlora_adapter(adapter_cache, pa_files):
    """The ``PA_NAME`` adapter files must not resolve as a LoRA request."""
    # Place the downloaded adapter at <adapter_cache>/<PA_NAME>.
    shutil.copytree(pa_files, adapter_cache / PA_NAME)

    fs_resolver = FilesystemResolver(adapter_cache)
    assert fs_resolver is not None

    pa_request = await fs_resolver.resolve_lora(MODEL_NAME, PA_NAME)
    assert pa_request is None
tests/quantization/test_auto_round.py
0 → 100644
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
"""Test model set-up and inference for quantized HF models supported
on the AutoRound.
Validating the configuration and printing results for manual checking.
Run `pytest tests/quantization/test_auto_round.py`.
"""
import
pytest
from
vllm.platforms
import
current_platform
MODELS
=
[
"OPEA/Qwen2.5-0.5B-Instruct-int4-sym-inc"
,
##auto_round:auto_gptq
"Intel/Qwen2-0.5B-Instruct-int4-sym-AutoRound"
##auto_round:auto_awq
]
@pytest.mark.skipif(
    not (current_platform.is_cpu() or current_platform.is_xpu()
         or current_platform.is_cuda()),
    reason="only supports CPU/XPU/CUDA backend.")
@pytest.mark.parametrize("model", MODELS)
def test_auto_round(vllm_runner, model):
    """Greedy-generate a few tokens and print them for manual inspection.

    The only automated check is that the generation result is truthy.
    """
    prompts = ["The capital of France is"]
    with vllm_runner(model) as llm:
        output = llm.generate_greedy(prompts, max_tokens=8)
        assert output
        # Second field of the first result entry.
        print(f"{output[0][1]}")
tests/quantization/test_bitsandbytes.py
View file @
7a985548
...
...
@@ -8,9 +8,11 @@ import gc
import
pytest
import
torch
from
transformers
import
BitsAndBytesConfig
from
tests.quantization.utils
import
is_quant_method_supported
from
..models.utils
import
check_embeddings_close
from
..utils
import
compare_two_settings
,
create_new_process_for_each_test
models_4bit_to_test
=
[
...
...
@@ -19,6 +21,10 @@ models_4bit_to_test = [
"quantize inflight model with both HF and Mistral format weights"
)
]
models_4bit_to_embedding_test
=
[
(
"intfloat/e5-mistral-7b-instruct"
,
"quantize embedding model inflight"
),
]
models_pre_qaunt_4bit_to_test
=
[
(
'PrunaAI/Einstein-v6.1-Llama3-8B-bnb-4bit-smashed'
,
'read pre-quantized 4-bit FP4 model'
),
...
...
@@ -31,6 +37,12 @@ models_pre_quant_8bit_to_test = [
(
"yec019/fbopt-350m-8bit"
,
"read pre-quantized 8-bit opt model"
),
]
models_pre_quant_8bit_to_test
=
[
(
'meta-llama/Llama-Guard-3-8B-INT8'
,
'read pre-quantized llama 8-bit model'
),
(
"yec019/fbopt-350m-8bit"
,
"read pre-quantized 8-bit opt model"
),
]
@
pytest
.
mark
.
skipif
(
not
is_quant_method_supported
(
"bitsandbytes"
),
reason
=
'bitsandbytes is not supported on this GPU type.'
)
...
...
@@ -39,7 +51,8 @@ models_pre_quant_8bit_to_test = [
def
test_load_4bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_name
,
description
)
->
None
:
hf_model_kwargs
=
{
"load_in_4bit"
:
True
}
hf_model_kwargs
=
dict
(
quantization_config
=
BitsAndBytesConfig
(
load_in_4bit
=
True
))
validate_generated_texts
(
hf_runner
,
vllm_runner
,
example_prompts
[:
1
],
model_name
,
False
,
hf_model_kwargs
)
...
...
@@ -77,7 +90,8 @@ def test_load_8bit_bnb_model(hf_runner, vllm_runner, example_prompts,
def
test_load_tp_4bit_bnb_model
(
hf_runner
,
vllm_runner
,
example_prompts
,
model_name
,
description
)
->
None
:
hf_model_kwargs
=
{
"load_in_4bit"
:
True
}
hf_model_kwargs
=
dict
(
quantization_config
=
BitsAndBytesConfig
(
load_in_4bit
=
True
))
validate_generated_texts
(
hf_runner
,
vllm_runner
,
example_prompts
[:
1
],
...
...
@@ -113,6 +127,54 @@ def test_load_pp_4bit_bnb_model(model_name, description) -> None:
compare_two_settings
(
model_name
,
common_args
,
pp_args
)
@pytest.mark.skipif(not is_quant_method_supported("bitsandbytes"),
                    reason='bitsandbytes is not supported on this GPU type.')
@pytest.mark.parametrize("model_name, description",
                         models_4bit_to_embedding_test)
@pytest.mark.parametrize("dtype", ["half"])
@create_new_process_for_each_test()
def test_4bit_bnb_embedding_model(
    model_name,
    description,
    hf_runner,
    vllm_runner,
    example_prompts,
    dtype: str,
) -> None:
    """Compare HF and vLLM embeddings with in-flight 4-bit bitsandbytes.

    Both runners encode the same stripped prompts and the two embedding
    lists are compared with ``check_embeddings_close`` at ``tol=5e-2``.
    """
    # The example_prompts have a trailing "\n", for example:
    # "Write a short story about a robot that dreams for the first time.\n"
    # sentence_transformers strips its input texts, see:
    # https://github.com/UKPLab/sentence-transformers/blob/v3.1.1/sentence_transformers/models/Transformer.py#L159
    # which would make the input_ids differ between hf_model and vllm_model,
    # so the prompts are stripped up front for both runners.
    stripped_prompts = [str(prompt).strip() for prompt in example_prompts]

    # Inflight 4bit quantization
    bnb_config = BitsAndBytesConfig(load_in_4bit=True)
    hf_model_kwargs = {"quantization_config": bnb_config}

    with hf_runner(
            model_name,
            dtype=dtype,
            model_kwargs=hf_model_kwargs,
            is_sentence_transformer=True,
    ) as hf_model:
        hf_outputs = hf_model.encode(stripped_prompts)

    with vllm_runner(model_name,
                     task="embed",
                     dtype=dtype,
                     quantization="bitsandbytes") as vllm_model:
        vllm_outputs = vllm_model.encode(stripped_prompts)

    check_embeddings_close(
        embeddings_0_lst=hf_outputs,
        embeddings_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
        tol=5e-2,
    )
def
log_generated_texts
(
prompts
,
outputs
,
runner_name
):
logged_texts
=
[]
for
i
,
(
_
,
generated_text
)
in
enumerate
(
outputs
):
...
...
tests/quantization/test_compressed_tensors.py
View file @
7a985548
...
...
@@ -13,9 +13,9 @@ from compressed_tensors.quantization import QuantizationType
from
tests.models.utils
import
check_logprobs_close
from
vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors
import
(
# noqa: E501
CompressedTensors24
,
CompressedTensorsLinearMethod
,
CompressedTensorsW4A16
Sparse2
4
,
CompressedTensorsW
8A8Fp8
,
CompressedTensorsW8A8
Int
8
,
CompressedTensorsW8A
16Fp
8
,
CompressedTensorsWNA16
)
CompressedTensorsW4A16
Fp
4
,
CompressedTensorsW
4A16Sparse24
,
CompressedTensorsW8A8
Fp
8
,
CompressedTensorsW8A
8Int
8
,
CompressedTensorsW8A16Fp8
,
CompressedTensorsWNA16
)
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
sparse_cutlass_supported
)
from
vllm.platforms
import
current_platform
...
...
@@ -648,3 +648,23 @@ def test_compressed_tensors_2of4_sparse_compressed(vllm_runner, args_2of4):
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
print
(
output
)
assert
output
def test_compressed_tensors_nvfp4a16(vllm_runner):
    """Load the FP4 weight-only checkpoint and inspect its first qkv_proj.

    The layer must use ``CompressedTensorsLinearMethod`` with a
    ``CompressedTensorsW4A16Fp4`` scheme of group size 16, and greedy
    generation must return a truthy result.
    """
    # run weight only example
    model_id = "nm-testing/TinyLlama-1.1B-Chat-v1.0-FP4"

    def check_model(loaded_model):
        first_layer = loaded_model.model.layers[0]
        qkv_proj = first_layer.self_attn.qkv_proj

        assert isinstance(qkv_proj.quant_method,
                          CompressedTensorsLinearMethod)
        scheme = qkv_proj.scheme
        assert isinstance(scheme, CompressedTensorsW4A16Fp4)
        assert scheme.group_size == 16

    with vllm_runner(model_id, enforce_eager=True) as llm:
        llm.apply_model(check_model)

        output = llm.generate_greedy("Hello my name is", max_tokens=20)
        print(output)
        assert output
tests/quantization/test_quark.py
View file @
7a985548
...
...
@@ -5,6 +5,7 @@ Run `pytest tests/quantization/test_quark.py`.
"""
import
pytest
import
torch
from
vllm.model_executor.layers.quantization.quark.quark
import
(
# noqa: E501
QuarkLinearMethod
,
QuarkW8A8Fp8
,
QuarkW8A8Int8
)
...
...
@@ -63,3 +64,28 @@ def test_quark_int8_w_per_tensor_a_per_tensor(vllm_runner, tp):
output
=
llm
.
generate_greedy
(
"Hello my name is"
,
max_tokens
=
20
)
assert
output
def test_quark_fp8_parity(vllm_runner):
    """The quark and fp8 quant-method checkpoints must load identically.

    Both models are loaded with the same engine kwargs; their state dicts
    must have the same keys and every tensor pair must satisfy
    ``torch.equal``.
    """
    quark_model_id = "amd-quark/llama-tiny-fp8-quark-quant-method"
    fp8_model_id = "amd-quark/llama-tiny-fp8-quant-method"

    llm_kwargs = dict(
        tensor_parallel_size=1,
        enforce_eager=True,
        gpu_memory_utilization=0.1,
    )

    def _state_dict_of(handle):
        # Reach through the engine to the underlying model on the driver
        # worker and snapshot its parameters.
        worker = handle.model.llm_engine.model_executor.driver_worker
        return worker.model_runner.model.state_dict()

    with (vllm_runner(quark_model_id, **llm_kwargs) as quark_handle,
          vllm_runner(fp8_model_id, **llm_kwargs) as fp8_handle):
        quark_state_dict = _state_dict_of(quark_handle)
        fp8_state_dict = _state_dict_of(fp8_handle)

        # NOTE(review): comparisons are done while both runners are open;
        # confirm this matches the intended placement in the original file.
        assert fp8_state_dict.keys() == quark_state_dict.keys()

        for key in fp8_state_dict:
            assert torch.equal(fp8_state_dict[key], quark_state_dict[key])
tests/quantization/test_register_quantization_config.py
View file @
7a985548
...
...
@@ -14,7 +14,7 @@ import torch.nn.functional as F
from
vllm.model_executor.layers.linear
import
LinearBase
# noqa: E501
from
vllm.model_executor.layers.linear
import
UnquantizedLinearMethod
from
vllm.model_executor.layers.quantization
import
(
get_quantization_config
,
register_quantization_config
)
QuantizationMethods
,
get_quantization_config
,
register_quantization_config
)
from
vllm.model_executor.layers.quantization.base_config
import
(
# noqa: E501
QuantizationConfig
)
...
...
@@ -54,7 +54,7 @@ class CustomQuantConfig(QuantizationConfig):
"""Initialize the quantization config."""
self
.
num_bits
=
num_bits
def
get_name
(
self
)
->
str
:
def
get_name
(
self
)
->
QuantizationMethods
:
"""Name of the quantization method."""
return
"custom_quant"
...
...
tests/quantization/test_torchao.py
View file @
7a985548
...
...
@@ -3,6 +3,7 @@ import importlib.metadata
import
importlib.util
import
pytest
import
torch
DTYPE
=
[
"bfloat16"
]
...
...
@@ -21,5 +22,42 @@ def test_pre_quantized_model(vllm_runner):
print
(
output
)
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
@pytest.mark.parametrize(
    "pt_load_map_location",
    [
        "cuda:0",
        # {"": "cuda"},
    ])
def test_opt_125m_int4wo_model_loading_with_params(vllm_runner,
                                                   pt_load_map_location):
    """Load the int4 weight-only OPT-125m checkpoint with a map location.

    The only automated check is that greedy generation returns a truthy
    result; the output is printed for manual inspection.
    """
    torch._dynamo.reset()

    runner_kwargs = dict(
        model_name="jerryzh168/opt-125m-int4wo",
        quantization="torchao",
        dtype="bfloat16",
        pt_load_map_location=pt_load_map_location,
    )
    prompts = ["The capital of France is"]
    with vllm_runner(**runner_kwargs) as llm:
        output = llm.generate_greedy(prompts, max_tokens=32)

        assert output
        print(output)
@pytest.mark.skipif(not TORCHAO_AVAILABLE, reason="torchao is not available")
def test_opt_125m_int4wo_model_per_module_quant(vllm_runner):
    """Load the per-module int4 weight-only OPT-125m checkpoint.

    The only automated check is that greedy generation returns a truthy
    result; the output is printed for manual inspection.
    """
    torch._dynamo.reset()

    runner_kwargs = dict(
        model_name="jerryzh168/opt-125m-int4wo-per-module",
        quantization="torchao",
        dtype="bfloat16",
        pt_load_map_location="cuda:0",
    )
    prompts = ["The capital of France is"]
    with vllm_runner(**runner_kwargs) as llm:
        output = llm.generate_greedy(prompts, max_tokens=32)

        assert output
        print(output)
if
__name__
==
"__main__"
:
pytest
.
main
([
__file__
])
tests/reasoning/test_qwen3_reasoning_parser.py
0 → 100644
View file @
7a985548
# SPDX-License-Identifier: Apache-2.0
import
pytest
from
transformers
import
AutoTokenizer
from
tests.reasoning.utils
import
run_reasoning_extraction
from
vllm.reasoning
import
ReasoningParser
,
ReasoningParserManager
parser_name
=
"qwen3"
start_token
=
"<think>"
end_token
=
"</think>"
REASONING_MODEL_NAME
=
"Qwen/Qwen3-0.6B"
@pytest.fixture(scope="module")
def qwen3_tokenizer():
    """Load the tokenizer for ``REASONING_MODEL_NAME`` once per module."""
    tokenizer = AutoTokenizer.from_pretrained(REASONING_MODEL_NAME)
    return tokenizer
# With <think></think> tags, non-streaming
WITH_THINK
=
{
"output"
:
"<think>This is a reasoning section</think>This is the rest"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
"This is the rest"
,
}
# With <think></think> tags, streaming
WITH_THINK_STREAM
=
{
"output"
:
"<think>This is a reasoning section</think>This is the rest"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
"This is the rest"
,
}
# Without <think></think> tags, non-streaming
WITHOUT_THINK
=
{
"output"
:
"This is the rest"
,
"reasoning_content"
:
None
,
"content"
:
"This is the rest"
,
}
# Without <think></think> tags, streaming
WITHOUT_THINK_STREAM
=
{
"output"
:
"This is the rest"
,
"reasoning_content"
:
None
,
"content"
:
"This is the rest"
,
}
COMPLETE_REASONING
=
{
"output"
:
"<think>This is a reasoning section</think>"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
None
,
}
MULTILINE_REASONING
=
{
"output"
:
"<think>This is a reasoning
\n
section</think>This is the rest
\n
That"
,
"reasoning_content"
:
"This is a reasoning
\n
section"
,
"content"
:
"This is the rest
\n
That"
,
}
ONLY_OPEN_TAG
=
{
"output"
:
"<think>This is a reasoning section"
,
"reasoning_content"
:
None
,
"content"
:
"<think>This is a reasoning section"
,
}
ONLY_OPEN_TAG_STREAM
=
{
"output"
:
"<think>This is a reasoning section"
,
"reasoning_content"
:
"This is a reasoning section"
,
"content"
:
None
,
}
TEST_CASES
=
[
pytest
.
param
(
False
,
WITH_THINK
,
id
=
"with_think"
,
),
pytest
.
param
(
True
,
WITH_THINK_STREAM
,
id
=
"with_think_stream"
,
),
pytest
.
param
(
False
,
WITHOUT_THINK
,
id
=
"without_think"
,
),
pytest
.
param
(
True
,
WITHOUT_THINK_STREAM
,
id
=
"without_think_stream"
,
),
pytest
.
param
(
False
,
COMPLETE_REASONING
,
id
=
"complete_reasoning"
,
),
pytest
.
param
(
True
,
COMPLETE_REASONING
,
id
=
"complete_reasoning_stream"
,
),
pytest
.
param
(
False
,
MULTILINE_REASONING
,
id
=
"multiline_reasoning"
,
),
pytest
.
param
(
True
,
MULTILINE_REASONING
,
id
=
"multiline_reasoning_stream"
,
),
pytest
.
param
(
False
,
ONLY_OPEN_TAG
,
id
=
"only_open_tag"
,
),
pytest
.
param
(
True
,
ONLY_OPEN_TAG_STREAM
,
id
=
"only_open_tag_stream"
,
),
]
@pytest.mark.parametrize("streaming, param_dict", TEST_CASES)
def test_reasoning(
    streaming: bool,
    param_dict: dict,
    qwen3_tokenizer,
):
    """Run the qwen3 reasoning parser on one case and check both fields.

    The case's ``output`` text is tokenized, each token is converted back to
    its own string piece, and the pieces are fed to
    ``run_reasoning_extraction`` in streaming or non-streaming mode. The
    result must match the case's ``reasoning_content`` and ``content``.
    """
    raw_tokens = qwen3_tokenizer.tokenize(param_dict["output"])

    # One string piece per token, so the helper can replay them one by one.
    output_tokens: list[str] = []
    for raw_token in raw_tokens:
        piece = qwen3_tokenizer.convert_tokens_to_string([raw_token])
        output_tokens.append(piece)

    parser_cls = ReasoningParserManager.get_reasoning_parser(parser_name)
    parser: ReasoningParser = parser_cls(qwen3_tokenizer)

    reasoning, content = run_reasoning_extraction(parser,
                                                  output_tokens,
                                                  streaming=streaming)

    assert reasoning == param_dict["reasoning_content"]
    assert content == param_dict["content"]
tests/runai_model_streamer_test/test_runai_model_streamer_loader.py
View file @
7a985548
...
...
@@ -2,8 +2,7 @@
from
vllm
import
SamplingParams
from
vllm.config
import
LoadConfig
,
LoadFormat
from
vllm.model_executor.model_loader.loader
import
(
RunaiModelStreamerLoader
,
get_model_loader
)
from
vllm.model_executor.model_loader
import
get_model_loader
test_model
=
"openai-community/gpt2"
...
...
@@ -24,7 +23,7 @@ def get_runai_model_loader():
def
test_get_model_loader_with_runai_flag
():
model_loader
=
get_runai_model_loader
()
assert
isinstance
(
model_loader
,
RunaiModelStreamerLoader
)
assert
model_loader
.
__class__
.
__name__
==
"
RunaiModelStreamerLoader
"
def
test_runai_model_loader_download_files
(
vllm_runner
):
...
...
tests/samplers/test_rejection_sampler.py
View file @
7a985548
...
...
@@ -169,7 +169,10 @@ def test_no_crash_with_varying_dims(k: int, vocab_size: int, batch_size: int,
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
1
,
8
,
32
,
128
])
@
pytest
.
mark
.
parametrize
(
"n_rep"
,
[
100
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"use_flashinfer"
,
[
True
,
False
])
# @pytest.mark.parametrize("use_flashinfer", [True, False])
# Not testing FlashInfer now, since 0.2.3 API removed the ability
# to pass in uniform samples.
@
pytest
.
mark
.
parametrize
(
"use_flashinfer"
,
[
False
])
@
torch
.
inference_mode
()
def
test_deterministic_when_seeded
(
k
:
int
,
vocab_size
:
int
,
batch_size
:
int
,
frac_seeded
:
float
,
n_rep
:
int
,
device
:
str
,
...
...
@@ -214,7 +217,10 @@ def test_deterministic_when_seeded(k: int, vocab_size: int, batch_size: int,
@
pytest
.
mark
.
parametrize
(
"vocab_size"
,
[
30_000
,
50_000
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
3
,
8
,
32
,
128
])
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
@
pytest
.
mark
.
parametrize
(
"use_flashinfer"
,
[
True
,
False
])
# @pytest.mark.parametrize("use_flashinfer", [True, False])
# Not testing FlashInfer now, since 0.2.3 API removed the ability
# to pass in uniform samples.
@
pytest
.
mark
.
parametrize
(
"use_flashinfer"
,
[
False
])
@
torch
.
inference_mode
()
def
test_mixed_seeded_batch
(
k
:
int
,
vocab_size
:
int
,
batch_size
:
int
,
device
:
str
,
use_flashinfer
:
bool
):
...
...
@@ -284,6 +290,10 @@ def test_compare_nonflashinfer_backend(k: int, vocab_size: int,
Test the flashinfer and nonflashinfer backend generate
the same output metrics.
"""
pytest
.
skip
(
"Not testing FlashInfer now, since 0.2.3 API removed "
"the ability to pass in uniform samples."
)
torch
.
set_default_device
(
device
)
torch
.
manual_seed
(
0
)
draft_probs
=
torch
.
rand
(
batch_size
,
k
,
vocab_size
,
dtype
=
torch
.
float32
)
...
...
tests/samplers/test_sampler.py
View file @
7a985548
...
...
@@ -478,7 +478,7 @@ def test_sampler_mixed(seed: int, device: str):
sampling_params
=
SamplingParams
(
temperature
=
random
.
random
()
+
0.1
,
top_p
=
min
(
random
.
random
()
+
0.1
,
1
),
top_k
=
random
.
randint
(
0
,
10
)
or
-
1
,
top_k
=
random
.
randint
(
0
,
10
),
n
=
n
,
presence_penalty
=
random
.
randint
(
0
,
1
),
)
...
...
@@ -647,6 +647,8 @@ def test_flashinfer_fallback(seed: int, device: str):
if
not
envs
.
VLLM_USE_FLASHINFER_SAMPLER
:
pytest
.
skip
(
"Flashinfer sampler is disabled"
)
pytest
.
skip
(
"After FlashInfer 0.2.3, sampling will never fail"
)
set_random_seed
(
seed
)
torch
.
set_default_device
(
device
)
batch_size
=
random
.
randint
(
1
,
256
)
...
...
tests/spec_decode/e2e/test_medusa_correctness.py
View file @
7a985548
...
...
@@ -205,7 +205,7 @@ def test_medusa_e2e_greedy_correctness_cuda_graph(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"block_size"
:
8
,
"block_size"
:
16
,
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
...
...
tests/spec_decode/e2e/test_mlp_correctness.py
View file @
7a985548
...
...
@@ -267,7 +267,7 @@ def test_mlp_e2e_seeded_correctness(vllm_runner, common_llm_kwargs,
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"block_size"
:
8
,
"block_size"
:
16
,
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
...
...
@@ -321,7 +321,7 @@ def test_mlp_e2e_greedy_correctness_with_preemption(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"block_size"
:
8
,
"block_size"
:
16
,
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
...
...
tests/spec_decode/e2e/test_multistep_correctness.py
View file @
7a985548
...
...
@@ -456,7 +456,7 @@ def test_spec_decode_e2e_greedy_correctness_real_model_large_bs(
@
pytest
.
mark
.
parametrize
(
"common_llm_kwargs"
,
[{
"block_size"
:
8
,
"block_size"
:
16
,
# 2 for small prompt, 256//8 for generated.
"num_gpu_blocks_override"
:
2
+
256
//
8
,
"max_model_len"
:
(
2
+
256
//
8
)
*
8
,
...
...
@@ -526,11 +526,8 @@ def test_spec_decode_e2e_greedy_correctness_with_preemption(
@
pytest
.
mark
.
parametrize
(
"per_test_common_llm_kwargs"
,
[
# As of this writing, vLLM only compiles with these 3 block sizes by
# default.
{
"block_size"
:
8
,
},
# https://github.com/triton-lang/triton/issues/2266 tl.dot
# doesn't support embedding < 16
{
"block_size"
:
16
,
},
...
...
Prev
1
…
19
20
21
22
23
24
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment