Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
d2b52805
Commit
d2b52805
authored
Sep 07, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc1' into v0.10.2rc1-ori
parents
9a521c23
5438967f
Changes
501
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
665 additions
and
198 deletions
+665
-198
tests/v1/engine/test_output_processor.py
tests/v1/engine/test_output_processor.py
+10
-20
tests/v1/engine/test_processor_multi_modal_uuids.py
tests/v1/engine/test_processor_multi_modal_uuids.py
+229
-0
tests/v1/entrypoints/llm/test_struct_output_generate.py
tests/v1/entrypoints/llm/test_struct_output_generate.py
+144
-52
tests/v1/entrypoints/openai/responses/test_basic.py
tests/v1/entrypoints/openai/responses/test_basic.py
+13
-0
tests/v1/executor/__init__.py
tests/v1/executor/__init__.py
+0
-0
tests/v1/executor/test_executor.py
tests/v1/executor/test_executor.py
+116
-0
tests/v1/kv_connector/unit/test_nixl_connector.py
tests/v1/kv_connector/unit/test_nixl_connector.py
+85
-1
tests/v1/kv_connector/unit/utils.py
tests/v1/kv_connector/unit/utils.py
+1
-4
tests/v1/logits_processors/utils.py
tests/v1/logits_processors/utils.py
+9
-28
tests/v1/sample/test_logprobs.py
tests/v1/sample/test_logprobs.py
+5
-5
tests/v1/spec_decode/test_tree_attention.py
tests/v1/spec_decode/test_tree_attention.py
+2
-0
tests/v1/test_async_llm_dp.py
tests/v1/test_async_llm_dp.py
+4
-2
tests/v1/test_serial_utils.py
tests/v1/test_serial_utils.py
+13
-9
tests/v1/tpu/worker/test_tpu_model_runner.py
tests/v1/tpu/worker/test_tpu_model_runner.py
+6
-6
tests/v1/worker/test_gpu_input_batch.py
tests/v1/worker/test_gpu_input_batch.py
+1
-0
tests/v1/worker/test_gpu_model_runner.py
tests/v1/worker/test_gpu_model_runner.py
+13
-10
tests/weight_loading/models.txt
tests/weight_loading/models.txt
+0
-4
tests/worker/test_model_input.py
tests/worker/test_model_input.py
+0
-54
tools/check_pickle_imports.py
tools/check_pickle_imports.py
+1
-1
tools/ep_kernels/install_python_libraries.sh
tools/ep_kernels/install_python_libraries.sh
+13
-2
No files found.
Too many changes to show.
To preserve performance only
501 of 501+
files are displayed.
Plain diff
Email patch
tests/v1/engine/test_output_processor.py
View file @
d2b52805
...
...
@@ -52,11 +52,9 @@ def test_incremental_detokenization(request_output_kind: RequestOutputKind,
requests
=
[
EngineCoreRequest
(
request_id
=
f
"request-
{
idx
}
"
,
prompt_token_ids
=
prompt_tokens
,
arrival_time
=
0
,
mm_kwargs
=
None
,
mm_hashes
=
None
,
mm_placeholders
=
None
,
mm_features
=
None
,
eos_token_id
=
None
,
arrival_time
=
0
,
lora_request
=
None
,
cache_salt
=
None
,
data_parallel_rank
=
None
,
...
...
@@ -401,11 +399,9 @@ def test_logprobs_processor(request_output_kind: RequestOutputKind,
requests
=
[
EngineCoreRequest
(
request_id
=
request_id_list
[
idx
],
prompt_token_ids
=
prompt_tokens
,
arrival_time
=
0
,
mm_kwargs
=
None
,
mm_hashes
=
None
,
mm_placeholders
=
None
,
mm_features
=
None
,
eos_token_id
=
None
,
arrival_time
=
0
,
lora_request
=
None
,
cache_salt
=
None
,
data_parallel_rank
=
None
,
...
...
@@ -566,11 +562,9 @@ def test_stop_token(include_stop_str_in_output: bool,
request
=
EngineCoreRequest
(
request_id
=
request_id
,
prompt_token_ids
=
prompt_tokens
,
arrival_time
=
0
,
mm_kwargs
=
None
,
mm_hashes
=
None
,
mm_placeholders
=
None
,
mm_features
=
None
,
eos_token_id
=
eos_token_id
,
arrival_time
=
0
,
lora_request
=
None
,
cache_salt
=
None
,
data_parallel_rank
=
None
,
...
...
@@ -665,11 +659,9 @@ def test_stop_string(include_stop_str_in_output: bool,
EngineCoreRequest
(
request_id
=
request_id_list
[
idx
],
prompt_token_ids
=
prompt_tokens
,
arrival_time
=
0
,
mm_kwargs
=
None
,
mm_hashes
=
None
,
mm_placeholders
=
None
,
mm_features
=
None
,
eos_token_id
=
None
,
arrival_time
=
0
,
lora_request
=
None
,
cache_salt
=
None
,
data_parallel_rank
=
None
,
...
...
@@ -781,11 +773,9 @@ def test_iteration_stats(dummy_test_vectors):
EngineCoreRequest
(
request_id
=
f
"request-
{
idx
}
"
,
prompt_token_ids
=
prompt_tokens
,
arrival_time
=
0
,
mm_kwargs
=
None
,
mm_hashes
=
None
,
mm_placeholders
=
None
,
mm_features
=
None
,
eos_token_id
=
None
,
arrival_time
=
0
,
lora_request
=
None
,
cache_salt
=
None
,
data_parallel_rank
=
None
,
...
...
tests/v1/engine/test_processor_multi_modal_uuids.py
0 → 100644
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.config
import
CacheConfig
,
DeviceConfig
,
ModelConfig
,
VllmConfig
from
vllm.platforms.interface
import
UnspecifiedPlatform
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.engine
import
processor
as
processor_mod
from
vllm.v1.engine.processor
import
Processor
# Multimodal inputs shared by every test in this module; resolved once at
# import time from the bundled vLLM assets.
cherry_pil_image = ImageAsset("cherry_blossom").pil_image
stop_pil_image = ImageAsset("stop_sign").pil_image
baby_reading_np_ndarrays = VideoAsset("baby_reading").np_ndarrays
# Mock processor for testing
def _mk_processor(monkeypatch,
                  *,
                  mm_cache_gb: float = 4.0,
                  enable_prefix_caching: bool = True) -> Processor:
    """
    Build a Processor from stubbed-out configuration objects.

    Config hooks are replaced with no-op lambdas through ``monkeypatch`` so
    that the configs can be constructed inside a unit test.

    Args:
        monkeypatch: pytest's monkeypatch fixture; used for every stub.
        mm_cache_gb: Value stored as ``mm_processor_cache_gb`` on both the
            model config and the mock multimodal config.
        enable_prefix_caching: Forwarded to ``CacheConfig``.

    Returns:
        A Processor built with ``tokenizer=None``.
    """
    # (owner, attribute name, replacement) triples, applied in this order.
    stubs = [
        (ModelConfig, "try_get_generation_config", lambda self: {}),
        (ModelConfig, "__post_init__", lambda self: None),
        (UnspecifiedPlatform, "is_async_output_supported",
         classmethod(lambda cls, enforce_eager: True)),
        (ModelConfig, "verify_async_output_proc",
         lambda self, parallel_config, speculative_config, device_config:
         None),
        (ModelConfig, "verify_with_parallel_config",
         lambda self, parallel_config: None),
        (processor_mod, "processor_cache_from_config",
         lambda vllm_config, mm_registry: None),
        (VllmConfig, "__post_init__", lambda self: None),
    ]
    for owner, attr_name, replacement in stubs:
        monkeypatch.setattr(owner, attr_name, replacement, raising=True)

    model_config = ModelConfig(
        skip_tokenizer_init=True,
        max_model_len=128,
        mm_processor_cache_gb=mm_cache_gb,
        generation_config="vllm",
        tokenizer="dummy",
    )

    # Minimal multimodal_config to satisfy references in
    # Processor.process_inputs.
    class _MockMMConfig:

        def __init__(self, gb: float):
            self.mm_processor_cache_gb = gb

    mock_mm_config = _MockMMConfig(mm_cache_gb)
    model_config.multimodal_config = mock_mm_config  # type: ignore[attr-defined]

    cache_config = CacheConfig(enable_prefix_caching=enable_prefix_caching)
    device_config = DeviceConfig(device="cpu")
    vllm_config = VllmConfig(
        model_config=model_config,
        cache_config=cache_config,
        device_config=device_config,
    )

    # Pass tokenizer=None; InputPreprocessor handles None when
    # skip_tokenizer_init is True.
    return Processor(vllm_config, tokenizer=None)  # type: ignore[arg-type]
def test_multi_modal_uuids_length_mismatch_raises(monkeypatch):
    """Two images with a single UUID must be rejected with a ValueError."""
    processor = _mk_processor(monkeypatch)

    images = [cherry_pil_image, stop_pil_image]
    # Mismatch: 2 items but only 1 uuid provided
    image_uuids = ["hash_cherry"]
    prompt = {
        "prompt": "USER: <image>\nDescribe\nASSISTANT:",
        "multi_modal_data": {
            "image": images
        },
        "multi_modal_uuids": {
            "image": image_uuids
        },
    }

    expected_error = pytest.raises(ValueError,
                                   match="must have same length as data")
    with expected_error:
        processor.process_inputs(
            request_id="req-1",
            prompt=prompt,  # type: ignore[arg-type]
            params=SamplingParams(),
        )
def test_multi_modal_uuids_missing_modality_raises(monkeypatch):
    """UUIDs covering only one of two supplied modalities must be rejected."""
    processor = _mk_processor(monkeypatch)

    # Two modalities provided in data
    mm_data = {
        "image": [cherry_pil_image],
        "video": [baby_reading_np_ndarrays],
    }
    # Only image uuids provided; video missing should raise
    mm_uuids = {"image": ["hash_cherry"]}
    prompt = {
        "prompt": "USER: <image><video>\nDescribe\nASSISTANT:",
        "multi_modal_data": mm_data,
        "multi_modal_uuids": mm_uuids,
    }

    expected_error = pytest.raises(
        ValueError, match="must be provided if multi_modal_data")
    with expected_error:
        processor.process_inputs(
            request_id="req-2",
            prompt=prompt,  # type: ignore[arg-type]
            params=SamplingParams(),
        )
@pytest.mark.parametrize(
    "mm_cache_gb, enable_prefix_caching",
    [
        (4.0, True),  # default behavior
        (4.0, False),  # prefix caching disabled
        (0.0, True),  # processor cache disabled
    ],
)
def test_multi_modal_uuids_accepts_none_and_passes_through(
        monkeypatch, mm_cache_gb: float, enable_prefix_caching: bool):
    """
    User UUIDs, including None entries, reach preprocess() unchanged.

    Runs for each parametrized cache configuration listed above.
    """
    processor = _mk_processor(monkeypatch,
                              mm_cache_gb=mm_cache_gb,
                              enable_prefix_caching=enable_prefix_caching)

    # Records the overrides handed to InputPreprocessor.preprocess
    seen: dict[str, object] = {}

    def fake_preprocess(prompt,
                        *,
                        tokenization_kwargs=None,
                        lora_request=None,
                        mm_hash_overrides=None):
        seen["mm_hash_overrides"] = mm_hash_overrides
        # Minimal processed inputs for decoder-only flow
        return {"type": "token", "prompt_token_ids": [1]}

    # Monkeypatch only the bound preprocess method on this instance
    monkeypatch.setattr(processor.input_preprocessor,
                        "preprocess",
                        fake_preprocess,
                        raising=True)

    # Use a consistent two-image scenario across all configurations
    mm_uuids = {"image": [None, "hash_stop"], "video": None}
    mm_data = {
        "image": [cherry_pil_image, stop_pil_image],
        "video": baby_reading_np_ndarrays,
    }
    prompt = {
        "prompt": "USER: <image><image>\nTwo images\nASSISTANT:",
        "multi_modal_data": mm_data,
        "multi_modal_uuids": mm_uuids,
    }

    processor.process_inputs(
        request_id="req-3",
        prompt=prompt,  # type: ignore[arg-type]
        params=SamplingParams(),
    )

    assert seen["mm_hash_overrides"] == mm_uuids
def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
    """
    With both caches off, overrides are derived from the request id.

    The user-supplied UUIDs are expected to be replaced by
    ``"<request_id>-<modality>-<index>"`` strings.
    """
    # When both processor cache is 0 and prefix caching disabled, the
    # processor builds overrides from request id instead of using user UUIDs.
    processor = _mk_processor(monkeypatch,
                              mm_cache_gb=0.0,
                              enable_prefix_caching=False)

    seen: dict[str, object] = {}

    def fake_preprocess(prompt,
                        *,
                        tokenization_kwargs=None,
                        lora_request=None,
                        mm_hash_overrides=None):
        seen["mm_hash_overrides"] = mm_hash_overrides
        return {"type": "token", "prompt_token_ids": [1]}

    monkeypatch.setattr(processor.input_preprocessor,
                        "preprocess",
                        fake_preprocess,
                        raising=True)

    request_id = "req-42"
    mm_uuids = {"image": ["hash_cherry", "hash_stop"], "video": "hash_video"}
    mm_data = {
        "image": [cherry_pil_image, stop_pil_image],
        "video": baby_reading_np_ndarrays,
    }
    prompt = {
        "prompt": "USER: <image><image><video>\nDescribe\nASSISTANT:",
        "multi_modal_data": mm_data,
        "multi_modal_uuids": mm_uuids,
    }

    processor.process_inputs(
        request_id=request_id,
        prompt=prompt,  # type: ignore[arg-type]
        params=SamplingParams(),
    )

    # Expect request-id-based overrides are passed through
    expected_overrides = {
        "image": [f"{request_id}-image-{idx}" for idx in range(2)],
        "video": [f"{request_id}-video-0"],
    }
    assert seen["mm_hash_overrides"] == expected_overrides
tests/v1/entrypoints/llm/test_struct_output_generate.py
View file @
d2b52805
...
...
@@ -11,9 +11,11 @@ from typing import TYPE_CHECKING, Any
import
jsonschema
import
pytest
import
regex
as
re
import
torch
from
pydantic
import
BaseModel
from
tests.reasoning.utils
import
run_reasoning_extraction
from
vllm.distributed
import
cleanup_dist_env_and_memory
from
vllm.entrypoints.llm
import
LLM
from
vllm.outputs
import
RequestOutput
from
vllm.platforms
import
current_platform
...
...
@@ -39,8 +41,11 @@ EAGLE_SPEC_CONFIG = {
PARAMS_MODELS_BACKENDS_TOKENIZER_MODE
=
[
(
"mistralai/Ministral-8B-Instruct-2410"
,
"xgrammar"
,
"auto"
,
None
),
(
"mistralai/Ministral-8B-Instruct-2410"
,
"guidance"
,
"auto"
,
None
),
(
"mistralai/Ministral-8B-Instruct-2410"
,
"lm-format-enforcer"
,
"auto"
,
None
),
(
"mistralai/Ministral-8B-Instruct-2410"
,
"xgrammar"
,
"mistral"
,
None
),
(
"Qwen/Qwen2.5-1.5B-Instruct"
,
"xgrammar"
,
"auto"
,
None
),
(
"Qwen/Qwen2.5-1.5B-Instruct"
,
"lm-format-enforcer"
,
"auto"
,
None
),
(
"mistralai/Ministral-8B-Instruct-2410"
,
"outlines"
,
"auto"
,
None
),
(
"mistralai/Ministral-8B-Instruct-2410"
,
"outlines"
,
"mistral"
,
None
),
(
"mistralai/Ministral-8B-Instruct-2410"
,
"outlines"
,
"auto"
,
...
...
@@ -127,13 +132,15 @@ def test_structured_output(
temperature
=
1.0
,
max_tokens
=
4096
,
guided_decoding
=
GuidedDecodingParams
(
json
=
sample_json_schema
))
outputs
=
llm
.
generate
(
prompts
=
[
(
f
"Give an example JSON for an employee profile that fits this "
f
"schema. Make the response as short as possible. Schema: "
f
"
{
sample_json_schema
}
"
)
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
prompt
=
(
"Give an example JSON for an employee profile that fits this "
"schema. Make the response as short as possible. Schema: "
f
"
{
sample_json_schema
}
"
)
outputs
=
llm
.
generate
(
[
prompt
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
)
assert
outputs
is
not
None
...
...
@@ -144,7 +151,8 @@ def test_structured_output(
generated_text
=
output
.
outputs
[
0
].
text
assert
generated_text
is
not
None
assert
"
\n
"
not
in
generated_text
if
guided_decoding_backend
!=
'lm-format-enforcer'
:
assert
"
\n
"
not
in
generated_text
print
(
f
"Prompt:
{
prompt
!
r
}
, Generated text:
{
generated_text
!
r
}
"
)
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
sample_json_schema
)
...
...
@@ -191,20 +199,24 @@ def test_structured_output(
with
pytest
.
raises
(
ValueError
,
match
=
"The provided JSON schema contains features "
"not supported by xgrammar."
):
prompt
=
(
f
"Give an example JSON for an employee profile that "
f
"fits this schema:
{
unsupported_json_schema
}
. "
f
"Make the response as short as possible."
)
llm
.
generate
(
prompts
=
[(
f
"Give an example JSON for an employee profile that "
f
"fits this schema:
{
unsupported_json_schema
}
. "
f
"Make the response as short as possible."
)]
*
2
,
[
prompt
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
use_tqdm
=
True
,
)
else
:
outputs
=
llm
.
generate
(
prompts
=
(
"Give an example JSON object for a grade "
"that fits this schema: "
f
"
{
unsupported_json_schema
}
. Make the response as short as "
"possible."
),
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
prompt
=
(
f
"Give an example JSON object for a grade that "
f
"fits this schema:
{
unsupported_json_schema
}
. "
f
"Make the response as short as possible."
)
outputs
=
llm
.
generate
(
prompt
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
)
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
...
...
@@ -217,7 +229,7 @@ def test_structured_output(
parsed_json
=
json
.
loads
(
generated_text
)
assert
isinstance
(
parsed_json
,
dict
)
if
guided_decoding_backend
!=
"outlines"
:
if
guided_decoding_backend
not
in
[
"outlines"
,
"lm-format-enforcer"
]
:
#
# Test 4: Generate SQL statement using EBNF grammar
#
...
...
@@ -227,10 +239,9 @@ def test_structured_output(
max_tokens
=
1000
,
guided_decoding
=
GuidedDecodingParams
(
grammar
=
sample_sql_ebnf
))
outputs
=
llm
.
generate
(
prompts
=
(
"Generate a sql statement that selects col_1 from "
"table_1 where it is equal to 1. Make the response as short as "
"possible."
),
(
"Generate a sql statement that selects col_1 from "
"table_1 where it is equal to 1. Make the response as short as "
"possible."
),
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
)
...
...
@@ -261,10 +272,9 @@ def test_structured_output(
max_tokens
=
1000
,
guided_decoding
=
GuidedDecodingParams
(
grammar
=
sample_sql_lark
))
outputs
=
llm
.
generate
(
prompts
=
(
"Generate a sql statement that selects col_1 from "
"table_1 where it is equal to 1. Make the response as short as "
"possible."
),
(
"Generate a sql statement that selects col_1 from "
"table_1 where it is equal to 1. Make the response as short as "
"possible."
),
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
)
...
...
@@ -301,7 +311,6 @@ def test_structured_output(
guided_decoding
=
GuidedDecodingParams
(
grammar
=
"not a grammar"
))
with
pytest
.
raises
(
ValueError
,
match
=
"Failed to convert the grammar "
):
llm
.
generate
(
prompts
=
(
"Generate a sql statement that selects col_1 from "
"table_1 where it is equal to 1. Make the response as short "
"as possible."
),
...
...
@@ -316,11 +325,11 @@ def test_structured_output(
temperature
=
0.8
,
top_p
=
0.95
,
guided_decoding
=
GuidedDecodingParams
(
regex
=
sample_regex
))
prompt
=
(
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
. "
f
"Make the response as short as possible."
)
outputs
=
llm
.
generate
(
prompts
=
[
(
f
"Give an example IPv4 address with this regex:
{
sample_regex
}
. "
f
"Make the response as short as possible."
)
]
*
2
,
[
prompt
]
*
2
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
)
...
...
@@ -343,11 +352,13 @@ def test_structured_output(
temperature
=
0.8
,
top_p
=
0.95
,
guided_decoding
=
GuidedDecodingParams
(
choice
=
sample_guided_choice
))
outputs
=
llm
.
generate
(
prompts
=
(
"The best language for type-safe systems programming is "
"(Make the response as short as possible.) "
),
(
"The best language for type-safe systems programming is "
"(Make the response as short as possible.) "
),
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
use_tqdm
=
True
,
)
assert
outputs
is
not
None
for
output
in
outputs
:
assert
output
is
not
None
...
...
@@ -367,12 +378,14 @@ def test_structured_output(
temperature
=
1.0
,
max_tokens
=
1000
,
guided_decoding
=
GuidedDecodingParams
(
json
=
json_schema
))
outputs
=
llm
.
generate
(
prompts
=
(
"Generate a JSON with the brand, model and car_type of the most "
"iconic car from the 90's. Make the response as short as "
"possible."
),
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
outputs
=
llm
.
generate
(
(
"Generate a JSON with the brand, model and car_type of the most "
"iconic car from the 90's. Make the response as short as "
"possible."
),
sampling_params
=
sampling_params
,
use_tqdm
=
True
,
)
assert
outputs
is
not
None
...
...
@@ -411,10 +424,11 @@ def test_structured_output(
guided_decoding
=
GuidedDecodingParams
(
json
=
json_schema
))
outputs
=
llm
.
generate
(
prompts
=
(
"Generate a description of a frog using 50 characters. "
"Make the response as short as possible."
),
(
"Generate a description of a frog using 50 characters. "
"Make the response as short as possible."
),
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
use_tqdm
=
True
,
)
assert
outputs
is
not
None
...
...
@@ -429,7 +443,7 @@ def test_structured_output(
output_json
=
json
.
loads
(
generated_text
)
jsonschema
.
validate
(
instance
=
output_json
,
schema
=
json_schema
)
if
guided_decoding_backend
!=
"outlines"
:
if
guided_decoding_backend
not
in
[
"outlines"
,
"lm-format-enforcer"
]
:
#
# Test 11: Generate structured output using structural_tag format
#
...
...
@@ -498,7 +512,7 @@ Make the response as short as possible.
"""
# Change this once other backends support structural_tag
outputs
=
llm
.
generate
(
prompts
=
prompt
,
outputs
=
llm
.
generate
(
prompt
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
assert
outputs
is
not
None
...
...
@@ -639,15 +653,13 @@ def test_structured_output_auto_mode(
f
"
{
unsupported_json_schema
}
. Make the response as short as possible."
)
# This would fail with the default of "xgrammar", but in "auto"
# we will handle fallback automatically.
outputs
=
llm
.
generate
(
prompts
=
prompts
,
outputs
=
llm
.
generate
(
prompts
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
)
# Make sure `auto` backend handling doesn't mess up sampling_params
# and that we can reuse it without error.
outputs
.
extend
(
llm
.
generate
(
prompts
=
prompts
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
))
llm
.
generate
(
prompts
,
sampling_params
=
sampling_params
,
use_tqdm
=
True
))
assert
outputs
is
not
None
for
output
in
outputs
:
...
...
@@ -705,7 +717,7 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
max_tokens
=
256
,
guided_decoding
=
guided_params
)
outputs
=
llm
.
generate
(
prompts
=
prompt
,
sampling_params
=
sampling_params
)
outputs
=
llm
.
generate
(
prompt
,
sampling_params
=
sampling_params
)
assert
outputs
is
not
None
generated_text
=
outputs
[
0
].
outputs
[
0
].
text
assert
generated_text
is
not
None
...
...
@@ -721,3 +733,83 @@ def test_guidance_no_additional_properties(monkeypatch: pytest.MonkeyPatch):
assert
"a4"
not
in
generated
assert
"a5"
not
in
generated
assert
"a6"
not
in
generated
@pytest.mark.parametrize("guided_decoding_backend",
                         ["guidance", "xgrammar", "outlines"])
def test_structured_output_batched_with_non_guided_requests(
    monkeypatch: pytest.MonkeyPatch,
    sample_json_schema: dict[str, Any],
    guided_decoding_backend: str,
):
    """
    Mix one guided and one free-form request in a single generate() call.

    The guided output must validate against ``sample_json_schema``; the
    free-form output must contain "12,742" and must not parse as JSON.
    """
    monkeypatch.setenv("VLLM_USE_V1", "1")

    # Don't use eager execution on TPUs because we want to test for no
    # recompilation at runtime
    enforce_eager = not current_platform.is_tpu()
    disable_any_whitespace = guided_decoding_backend in {
        "xgrammar", "guidance"
    }

    llm = LLM(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        enforce_eager=enforce_eager,
        max_model_len=1024,
        guided_decoding_backend=guided_decoding_backend,
        guided_decoding_disable_any_whitespace=disable_any_whitespace,
    )

    guided_prompt = (
        "Give an example JSON for an employee profile that fits this "
        "schema. Make the response as short as possible. Schema: "
        f"{sample_json_schema}")
    non_guided_prompt = "The diameter of the Earth in kilometers is "
    prompts = [guided_prompt, non_guided_prompt]

    guided_params = SamplingParams(
        temperature=1.0,
        max_tokens=400,
        guided_decoding=GuidedDecodingParams(json=sample_json_schema))
    # No max tokens, temp=0 to assert on contents
    free_form_params = SamplingParams(
        seed=42,
        temperature=0,
        top_p=1.0,
    )
    sampling_params = [guided_params, free_form_params]

    outputs = llm.generate(prompts=prompts,
                           sampling_params=sampling_params,
                           use_tqdm=True)

    assert outputs is not None

    # Free memory as soon as possible as failed assertions
    # will short circuit and not free up memory
    del llm
    torch.cuda.empty_cache()
    cleanup_dist_env_and_memory()

    for index, output in enumerate(outputs):
        assert output is not None
        assert isinstance(output, RequestOutput)
        prompt = output.prompt

        generated_text = output.outputs[0].text
        assert generated_text is not None
        print(f"Prompt:\n{prompt!r}\nGenerated text:\n{generated_text!r}")

        if index != 0:
            # Second prompt is not guided, expect valid output
            # Cannot assert on exact output, but we can expect it to be factual
            assert "12,742" in generated_text

            # non-guided requests should not return a valid JSON here
            with pytest.raises(ValueError):
                json.loads(generated_text)
            continue

        # First prompt is guided, expect valid JSON
        assert "\n" not in generated_text
        output_json = json.loads(generated_text)
        jsonschema.validate(instance=output_json, schema=sample_json_schema)
tests/v1/entrypoints/openai/responses/test_basic.py
View file @
d2b52805
...
...
@@ -73,3 +73,16 @@ async def test_chat_with_input_type(client: openai.AsyncOpenAI):
],
)
print
(
response
)
assert
response
.
status
==
"completed"
@pytest.mark.asyncio
async def test_logprobs(client: openai.AsyncOpenAI):
    """Request 5 top logprobs and check the last content item carries them."""
    response = await client.responses.create(
        include=["message.output_text.logprobs"],
        input="What is 13 * 24?",
        top_logprobs=5,
    )
    print(response)

    last_content = response.output[-1].content[-1]
    # Must be non-empty before indexing into it.
    assert last_content.logprobs
    first_token_top = last_content.logprobs[0].top_logprobs
    assert len(first_token_top) == 5
tests/
prefix_caching
/__init__.py
→
tests/
v1/executor
/__init__.py
View file @
d2b52805
File moved
tests/v1/executor/test_executor.py
0 → 100644
View file @
d2b52805
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
os
from
typing
import
Any
,
Callable
,
Optional
,
Union
import
pytest
from
vllm.engine.arg_utils
import
AsyncEngineArgs
,
EngineArgs
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.engine.async_llm
import
AsyncLLM
from
vllm.v1.engine.llm_engine
import
LLMEngine
from
vllm.v1.executor.multiproc_executor
import
MultiprocExecutor
class Mock:
    """Empty placeholder class passed as a bogus executor backend."""
class CustomMultiprocExecutor(MultiprocExecutor):
    """MultiprocExecutor that leaves a ``.marker`` file when it is used."""

    def collective_rpc(self,
                       method: Union[str, Callable],
                       timeout: Optional[float] = None,
                       args: tuple = (),
                       kwargs: Optional[dict] = None,
                       non_block: bool = False,
                       unique_reply_rank: Optional[int] = None) -> list[Any]:
        """
        Write the marker file, then delegate to the base implementation.

        Every argument is forwarded unchanged. ``non_block`` and
        ``unique_reply_rank`` used to be dropped here, which made this
        override behave differently from the executor it wraps.
        NOTE(review): assumes the base ``collective_rpc`` accepts these two
        keyword names, as this override's own signature suggests -- confirm.
        """
        # Drop marker to show that this was ran
        with open(".marker", "w"):
            ...
        return super().collective_rpc(method,
                                      timeout,
                                      args,
                                      kwargs,
                                      non_block=non_block,
                                      unique_reply_rank=unique_reply_rank)
# Same class under a second name, so the async test can refer to it by a
# distinct dotted path.
CustomMultiprocExecutorAsync = CustomMultiprocExecutor

# Model name passed to every engine built in this module.
MODEL = "Qwen/Qwen3-0.6B"
def test_custom_executor_type_checking():
    """Passing ``Mock`` as the executor backend must raise ValueError."""
    # Sync and async entry points are checked with identical arguments.
    entry_points = (
        (EngineArgs, LLMEngine),
        (AsyncEngineArgs, AsyncLLM),
    )
    for args_cls, engine_cls in entry_points:
        with pytest.raises(ValueError):
            engine_args = args_cls(
                model=MODEL,
                gpu_memory_utilization=0.2,
                max_model_len=8192,
                distributed_executor_backend=Mock,
            )
            engine_cls.from_engine_args(engine_args)
@pytest.mark.parametrize("distributed_executor_backend", [
    CustomMultiprocExecutor,
    "tests.v1.executor.test_executor.CustomMultiprocExecutor"
])
def test_custom_executor(distributed_executor_backend, tmp_path):
    """
    Run one engine step and check that the custom executor was invoked.

    The backend is given both as a class and as a dotted path. The test runs
    inside ``tmp_path`` so the ``.marker`` file lands in a scratch directory.
    """
    original_dir = os.path.abspath(".")
    os.chdir(tmp_path)
    try:
        assert not os.path.exists(".marker")

        sampling_params = SamplingParams(max_tokens=1)
        engine_args = EngineArgs(
            model=MODEL,
            gpu_memory_utilization=0.2,
            max_model_len=8192,
            distributed_executor_backend=distributed_executor_backend,
            enforce_eager=True,  # reduce test time
        )
        engine = LLMEngine.from_engine_args(engine_args)

        engine.add_request("0", "foo", sampling_params)
        engine.step()

        assert os.path.exists(".marker")
    finally:
        # Always restore the working directory for later tests.
        os.chdir(original_dir)
@pytest.mark.parametrize("distributed_executor_backend", [
    CustomMultiprocExecutorAsync,
    "tests.v1.executor.test_executor.CustomMultiprocExecutorAsync"
])
def test_custom_executor_async(distributed_executor_backend, tmp_path):
    """
    Drain one AsyncLLM generation and check the custom executor was invoked.

    The backend is given both as a class and as a dotted path. The test runs
    inside ``tmp_path`` so the ``.marker`` file lands in a scratch directory.
    """
    original_dir = os.path.abspath(".")
    os.chdir(tmp_path)
    try:
        assert not os.path.exists(".marker")

        sampling_params = SamplingParams(max_tokens=1)
        engine_args = AsyncEngineArgs(
            model=MODEL,
            gpu_memory_utilization=0.2,
            max_model_len=8192,
            distributed_executor_backend=distributed_executor_backend,
            enforce_eager=True,  # reduce test time
        )
        engine = AsyncLLM.from_engine_args(engine_args)

        async def _drain():
            stream = engine.generate(request_id="0",
                                     prompt="foo",
                                     sampling_params=sampling_params)
            # Outputs are discarded; only the executor side effect matters.
            async for _ in stream:
                pass

        asyncio.run(_drain())

        assert os.path.exists(".marker")
    finally:
        # Always restore the working directory for later tests.
        os.chdir(original_dir)
tests/v1/kv_connector/unit/test_nixl_connector.py
View file @
d2b52805
...
...
@@ -14,6 +14,7 @@ from unittest.mock import patch
import
pytest
import
ray
import
torch
from
vllm
import
LLM
from
vllm.config
import
KVTransferConfig
...
...
@@ -22,6 +23,7 @@ from vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector import (
NixlConnectorWorker
)
from
vllm.forward_context
import
ForwardContext
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.attention.backends.flash_attn
import
FlashAttentionBackend
from
.utils
import
create_request
,
create_scheduler
,
create_vllm_config
...
...
@@ -98,7 +100,6 @@ class FakeNixlWrapper:
def
set_cycles_before_xfer_done
(
self
,
cycles
:
int
):
"""Set the number of cycles before a transfer is considered done."""
self
.
_cycles_before_xfer_done
=
cycles
@
contextlib
.
contextmanager
...
...
@@ -562,3 +563,86 @@ def _run_abort_timeout_test(llm_kwargs: dict, timeout: int):
sampling_params
)
# Request-0 times out and is cleared!
assert
'0'
not
in
req_to_blocks
def test_register_kv_caches(dist_init):
    """
    Check the data register_kv_caches() hands to the nixl wrapper.

    Verified against a mocked wrapper:
    1. get_reg_descs() receives caches_data with the expected base address
       and size for each entry.
    2. get_xfer_descs() receives blocks_data with the expected entry count
       and block length.
    """

    vllm_config = create_vllm_config()

    # Create test kv cache tensors using proper backend shape
    kv_cache_shape = FlashAttentionBackend.get_kv_cache_shape(num_blocks=2,
                                                              block_size=16,
                                                              num_kv_heads=4,
                                                              head_size=64)
    shared_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16)
    unique_tensor = torch.zeros(*kv_cache_shape, dtype=torch.float16)
    # "layer0" and "layer2" deliberately refer to the same tensor object.
    kv_caches = {
        "layer0": shared_tensor,
        "layer1": unique_tensor,
        "layer2": shared_tensor,
    }

    # Store tensor info for validation
    first_slice = shared_tensor[0]
    expected_tensor_size = first_slice.element_size() * first_slice.numel()
    expected_base_addrs = [
        tensor[idx].data_ptr() for tensor in (shared_tensor, unique_tensor)
        for idx in (0, 1)
    ]

    with patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.NixlWrapper") as mock_nixl_wrapper, \
        patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Event"), \
        patch("vllm.distributed.kv_transfer.kv_connector.v1.nixl_connector.threading.Thread"):  # noqa: E501

        # Create connector
        connector = NixlConnector(vllm_config, KVConnectorRole.WORKER)
        connector.connector_worker = FakeNixlConnectorWorker(
            vllm_config, connector.engine_id, hand_shake_latency=0)

        # Get the mock instance
        mock_wrapper_instance = mock_nixl_wrapper.return_value
        connector.connector_worker.nixl_wrapper = mock_wrapper_instance

        # Execute register_kv_caches
        connector.register_kv_caches(kv_caches)

        # Verify get_reg_descs was called with caches_data
        reg_descs_mock = mock_wrapper_instance.get_reg_descs
        assert reg_descs_mock.called
        caches_data, _ = reg_descs_mock.call_args[0]
        assert len(caches_data) == 4

        for i, (cache_entry, expected_addr) in enumerate(
                zip(caches_data, expected_base_addrs)):
            base_addr, size, _tp_rank, _ = cache_entry
            assert size == expected_tensor_size, \
                f"Entry {i}: Expected tensor size {expected_tensor_size}, " \
                f"got {size}"
            assert base_addr == expected_addr, \
                f"Entry {i}: Expected base address {expected_addr}, " \
                f"got {base_addr}"

        # Verify get_xfer_descs was called with blocks_data
        xfer_descs_mock = mock_wrapper_instance.get_xfer_descs
        assert xfer_descs_mock.called
        blocks_data, _ = xfer_descs_mock.call_args[0]

        # Validate blocks_data structure and size
        expected_blocks_count = 8
        assert len(blocks_data) == expected_blocks_count, \
            f"Expected {expected_blocks_count} blocks, " \
            f"got {len(blocks_data)}"

        expected_block_len = expected_tensor_size // 2
        for i, block_entry in enumerate(blocks_data):
            # Each entry must unpack into exactly three fields.
            _start_addr, block_len, _rank = block_entry
            assert block_len == expected_block_len, \
                f"Block entry {i}: Expected block len {expected_block_len}, " \
                f"got {block_len}"
tests/v1/kv_connector/unit/utils.py
View file @
d2b52805
...
...
@@ -162,9 +162,7 @@ def create_request(request_id: int,
prompt_token_ids
=
prompt_token_ids
,
sampling_params
=
sampling_params
,
pooling_params
=
None
,
multi_modal_kwargs
=
None
,
multi_modal_placeholders
=
None
,
multi_modal_hashes
=
None
,
mm_features
=
None
,
eos_token_id
=
EOS_TOKEN_ID
,
block_hasher
=
get_request_block_hasher
(
block_size
,
hash_fn
),
)
...
...
@@ -200,7 +198,6 @@ def create_model_runner_output(
req_ids
=
req_ids
,
req_id_to_index
=
req_id_to_index
,
sampled_token_ids
=
sampled_token_ids
,
spec_token_ids
=
None
,
logprobs
=
None
,
prompt_logprobs_dict
=
{},
pooler_output
=
None
,
...
...
tests/v1/logits_processors/utils.py
View file @
d2b52805
...
...
@@ -8,10 +8,9 @@ from typing import Optional
import
torch
from
vllm.config
import
VllmConfig
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.sample.logits_processor
import
(
LOGITSPROCS_GROUP
,
BatchUpdate
,
LogitsProcessor
,
MoveDirectionality
)
LogitsProcessor
)
from
vllm.v1.sample.logits_processor.builtin
import
process_dict_updates
MODEL_NAME
=
"facebook/opt-125m"
POOLING_MODEL_NAME
=
"BAAI/bge-base-en-v1.5"
...
...
@@ -45,37 +44,19 @@ class DummyLogitsProcessor(LogitsProcessor):
def
__init__
(
self
,
vllm_config
:
"VllmConfig"
,
device
:
torch
.
device
,
is_pin_memory
:
bool
):
self
.
req_info
:
dict
[
int
,
SamplingParams
]
=
{}
self
.
req_info
:
dict
[
int
,
int
]
=
{}
def
is_argmax_invariant
(
self
)
->
bool
:
"""Never impacts greedy sampling"""
return
False
def
update_state
(
self
,
batch_update
:
Optional
[
BatchUpdate
]):
if
not
batch_update
:
return
# Process added requests.
for
index
,
params
,
_
,
_
in
batch_update
.
added
:
assert
params
is
not
None
if
params
.
extra_args
and
(
target_token
:
=
params
.
extra_args
.
get
(
"target_token"
)):
self
.
req_info
[
index
]
=
target_token
if
self
.
req_info
:
# Process removed requests.
for
index
in
batch_update
.
removed
:
self
.
req_info
.
pop
(
index
,
None
)
# Process moved requests, unidirectional move (a->b) and swap
# (a<->b)
for
adx
,
bdx
,
direct
in
batch_update
.
moved
:
a_val
=
self
.
req_info
.
pop
(
adx
,
None
)
b_val
=
self
.
req_info
.
pop
(
bdx
,
None
)
if
a_val
is
not
None
:
self
.
req_info
[
bdx
]
=
a_val
if
direct
==
MoveDirectionality
.
SWAP
and
b_val
is
not
None
:
self
.
req_info
[
adx
]
=
b_val
process_dict_updates
(
self
.
req_info
,
batch_update
,
lambda
params
,
_
,
__
:
params
.
extra_args
and
(
params
.
extra_args
.
get
(
"target_token"
)),
)
def
apply
(
self
,
logits
:
torch
.
Tensor
)
->
torch
.
Tensor
:
if
not
self
.
req_info
:
...
...
tests/v1/sample/test_logprobs.py
View file @
d2b52805
...
...
@@ -456,9 +456,7 @@ def test_all_logprobs(example_prompts, monkeypatch: pytest.MonkeyPatch):
assert
len
(
logprob
)
==
vocab_size
@
pytest
.
mark
.
parametrize
(
"logprobs_mode"
,
[
"raw_logprobs"
,
"raw_logits"
,
"processed_logprobs"
,
"processed_logits"
])
@
pytest
.
mark
.
parametrize
(
"logprobs_mode"
,
list
(
LogprobsMode
))
def
test_logprobs_mode
(
logprobs_mode
:
LogprobsMode
,
monkeypatch
:
pytest
.
MonkeyPatch
):
"""Test with LLM engine with different logprobs_mode.
...
...
@@ -487,12 +485,14 @@ def test_logprobs_mode(logprobs_mode: LogprobsMode,
for
logprobs
in
output
.
logprobs
:
for
token_id
in
logprobs
:
logprob
=
logprobs
[
token_id
]
if
"logprobs"
in
logprobs_mode
:
if
logprobs_mode
in
(
LogprobsMode
.
RAW_LOGPROBS
,
LogprobsMode
.
PROCESSED_LOGPROBS
):
assert
logprob
.
logprob
<=
0
if
logprob
.
logprob
>
0
:
positive_values
=
positive_values
+
1
total_token_with_logprobs
=
total_token_with_logprobs
+
1
assert
total_token_with_logprobs
>=
len
(
results
[
0
].
outputs
)
if
"logits"
in
logprobs_mode
:
if
logprobs_mode
in
(
LogprobsMode
.
RAW_LOGITS
,
LogprobsMode
.
PROCESSED_LOGITS
):
assert
positive_values
>
0
del
llm
tests/v1/spec_decode/test_tree_attention.py
View file @
d2b52805
...
...
@@ -50,6 +50,7 @@ def forward_attention(
dtype
=
torch
.
int32
,
)
context_lens
=
seq_lens
-
query_lens
max_seq_len
=
int
(
seq_lens
.
max
())
max_query_len
=
q_len
num_actual_tokens
=
query_start_loc
[
-
1
]
...
...
@@ -81,6 +82,7 @@ def forward_attention(
num_reqs
=
batch_size
,
num_actual_tokens
=
num_actual_tokens
,
max_query_len
=
max_query_len
,
max_seq_len
=
max_seq_len
,
block_table_tensor
=
block_table
,
slot_mapping
=
slot_mapping
,
)
...
...
tests/v1/test_async_llm_dp.py
View file @
d2b52805
...
...
@@ -75,9 +75,10 @@ async def generate(
],
)
@
pytest
.
mark
.
parametrize
(
"data_parallel_backend"
,
[
"mp"
,
"ray"
])
@
pytest
.
mark
.
parametrize
(
"async_scheduling"
,
[
True
,
False
])
@
pytest
.
mark
.
asyncio
async
def
test_load
(
output_kind
:
RequestOutputKind
,
data_parallel_backend
:
str
):
async
def
test_load
(
output_kind
:
RequestOutputKind
,
data_parallel_backend
:
str
,
async_scheduling
:
bool
):
stats_loggers
=
{}
...
...
@@ -105,6 +106,7 @@ async def test_load(output_kind: RequestOutputKind,
prompt
=
"This is a test of data parallel"
engine_args
.
data_parallel_backend
=
data_parallel_backend
engine_args
.
async_scheduling
=
async_scheduling
engine
=
AsyncLLM
.
from_engine_args
(
engine_args
,
stat_loggers
=
[
SimpleStatsLogger
])
after
.
callback
(
engine
.
shutdown
)
...
...
tests/v1/test_serial_utils.py
View file @
d2b52805
...
...
@@ -11,7 +11,8 @@ import torch
from
vllm.multimodal.inputs
import
(
MultiModalBatchedField
,
MultiModalFieldElem
,
MultiModalFlatField
,
MultiModalKwargs
,
MultiModalKwargsItem
,
MultiModalKwargsItem
,
MultiModalKwargsItems
,
MultiModalSharedField
,
NestedTensors
)
from
vllm.v1.serial_utils
import
MsgpackDecoder
,
MsgpackEncoder
...
...
@@ -96,7 +97,7 @@ def test_encode_decode(monkeypatch: pytest.MonkeyPatch):
class
MyRequest
(
msgspec
.
Struct
):
mm
:
Optional
[
list
[
MultiModalKwargs
]]
mm
:
Optional
[
list
[
MultiModalKwargs
Items
]]
def
test_multimodal_kwargs
():
...
...
@@ -119,7 +120,7 @@ def test_multimodal_kwargs():
audio
=
MultiModalKwargsItem
.
from_elems
([
e1
])
video
=
MultiModalKwargsItem
.
from_elems
([
e2
])
image
=
MultiModalKwargsItem
.
from_elems
([
e3
,
e4
])
mm
=
MultiModalKwargs
([
audio
,
video
,
image
])
mm
=
MultiModalKwargs
Items
.
from_seq
([
audio
,
video
,
image
])
# pack mm kwargs into a mock request so that it can be decoded properly
req
=
MyRequest
([
mm
])
...
...
@@ -133,19 +134,22 @@ def test_multimodal_kwargs():
total_len
=
sum
(
memoryview
(
x
).
cast
(
"B"
).
nbytes
for
x
in
encoded
)
# expected total encoding length, should be 14255, +-20 for minor changes
assert
14250
<=
total_len
<=
14300
decoded
:
MultiModalKwargs
=
decoder
.
decode
(
encoded
).
mm
[
0
]
# expected total encoding length, should be 14306, +-20 for minor changes
assert
14275
<=
total_len
<=
14325
decoded
=
decoder
.
decode
(
encoded
).
mm
[
0
]
assert
isinstance
(
decoded
,
MultiModalKwargsItems
)
# check all modalities were recovered and do some basic sanity checks
assert
len
(
decoded
.
modalities
)
==
3
images
=
decoded
.
get_items
(
"image"
)
assert
len
(
decoded
)
==
3
images
=
decoded
[
"image"
]
assert
len
(
images
)
==
1
assert
len
(
images
[
0
].
items
())
==
2
assert
list
(
images
[
0
].
keys
())
==
[
"i0"
,
"i1"
]
# check the tensor contents and layout in the main dict
assert
all
(
nested_equal
(
mm
[
k
],
decoded
[
k
])
for
k
in
mm
)
mm_data
=
mm
.
get_data
()
decoded_data
=
decoded
.
get_data
()
assert
all
(
nested_equal
(
mm_data
[
k
],
decoded_data
[
k
])
for
k
in
mm_data
)
def
nested_equal
(
a
:
NestedTensors
,
b
:
NestedTensors
):
...
...
tests/v1/tpu/worker/test_tpu_model_runner.py
View file @
d2b52805
...
...
@@ -85,7 +85,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
scheduled_encoder_inputs
=
{},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
set
(),
free_encoder_
input_id
s
=
[],
free_encoder_
mm_hashe
s
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
,
)
...
...
@@ -164,7 +164,7 @@ def test_update_states_request_finished(model_runner):
scheduled_encoder_inputs
=
{},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
{
req_id
},
free_encoder_
input_id
s
=
[],
free_encoder_
mm_hashe
s
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
,
)
...
...
@@ -194,7 +194,7 @@ def test_update_states_request_resumed(model_runner):
scheduled_encoder_inputs
=
{},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
set
(),
free_encoder_
input_id
s
=
[],
free_encoder_
mm_hashe
s
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
,
)
...
...
@@ -221,7 +221,7 @@ def test_update_states_request_resumed(model_runner):
scheduled_encoder_inputs
=
{},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
set
(),
free_encoder_
input_id
s
=
[],
free_encoder_
mm_hashe
s
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
,
)
...
...
@@ -252,7 +252,7 @@ def test_update_states_no_changes(model_runner):
scheduled_encoder_inputs
=
{},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
set
(),
free_encoder_
input_id
s
=
[],
free_encoder_
mm_hashe
s
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
,
)
...
...
@@ -287,7 +287,7 @@ def test_update_states_request_unscheduled(model_runner):
scheduled_encoder_inputs
=
{},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
set
(),
free_encoder_
input_id
s
=
[],
free_encoder_
mm_hashe
s
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
,
)
...
...
tests/v1/worker/test_gpu_input_batch.py
View file @
d2b52805
...
...
@@ -205,6 +205,7 @@ def _construct_cached_request_state(req_id_suffix: int):
pooling_params
=
None
,
mm_kwargs
=
[],
mm_positions
=
[],
mm_hashes
=
[],
block_ids
=
([],
),
generator
=
None
,
num_computed_tokens
=
len
(
output_token_ids
),
...
...
tests/v1/worker/test_gpu_model_runner.py
View file @
d2b52805
...
...
@@ -141,7 +141,7 @@ def _schedule_new_request(*req_ids: str) -> SchedulerOutput:
scheduled_encoder_inputs
=
{},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
set
(),
free_encoder_
input_id
s
=
[],
free_encoder_
mm_hashe
s
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
,
)
...
...
@@ -207,7 +207,7 @@ def test_update_states_request_finished(model_runner, dist_init):
scheduled_encoder_inputs
=
{},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
{
req_id
},
free_encoder_
input_id
s
=
[],
free_encoder_
mm_hashe
s
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
,
)
...
...
@@ -239,7 +239,7 @@ def test_update_states_request_resumed(model_runner, dist_init):
scheduled_encoder_inputs
=
{},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
set
(),
free_encoder_
input_id
s
=
[],
free_encoder_
mm_hashe
s
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
,
)
...
...
@@ -266,7 +266,7 @@ def test_update_states_request_resumed(model_runner, dist_init):
scheduled_encoder_inputs
=
{},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
set
(),
free_encoder_
input_id
s
=
[],
free_encoder_
mm_hashe
s
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
,
)
...
...
@@ -347,7 +347,7 @@ def test_update_states_no_changes(model_runner, dist_init):
scheduled_encoder_inputs
=
{},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
set
(),
free_encoder_
input_id
s
=
[],
free_encoder_
mm_hashe
s
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
,
)
...
...
@@ -384,7 +384,7 @@ def test_update_states_request_unscheduled(model_runner, dist_init):
scheduled_encoder_inputs
=
{},
num_common_prefix_blocks
=
0
,
finished_req_ids
=
set
(),
free_encoder_
input_id
s
=
[],
free_encoder_
mm_hashe
s
=
[],
structured_output_request_ids
=
{},
grammar_bitmask
=
None
,
)
...
...
@@ -680,6 +680,7 @@ def test_init_kv_cache_with_kv_sharing_valid():
kv_cache_spec
[
layer_0
].
page_size_bytes
runner
.
initialize_kv_cache
(
kv_cache_config
)
kv_cache_config_after_init
=
runner
.
kv_cache_config
layer_0_kv
=
vllm_ctx
[
layer_0
].
kv_cache
[
0
]
layer_1_kv
=
vllm_ctx
[
layer_1
].
kv_cache
[
0
]
...
...
@@ -687,10 +688,12 @@ def test_init_kv_cache_with_kv_sharing_valid():
assert
id
(
layer_1_kv
)
==
id
(
layer_0_kv
)
# check layer 1 added to kv cache group's layer names
assert
len
(
kv_cache_config
.
kv_cache_groups
)
==
1
assert
len
(
kv_cache_config
.
kv_cache_groups
[
0
].
layer_names
)
==
2
assert
kv_cache_config
.
kv_cache_groups
[
0
].
layer_names
[
0
]
==
layer_0
assert
kv_cache_config
.
kv_cache_groups
[
0
].
layer_names
[
1
]
==
layer_1
assert
len
(
kv_cache_config_after_init
.
kv_cache_groups
)
==
1
assert
len
(
kv_cache_config_after_init
.
kv_cache_groups
[
0
].
layer_names
)
==
2
assert
kv_cache_config_after_init
.
kv_cache_groups
[
0
].
layer_names
[
0
]
==
layer_0
assert
kv_cache_config_after_init
.
kv_cache_groups
[
0
].
layer_names
[
1
]
==
layer_1
def
test_hybrid_attention_mamba_tensor_shapes
(
monkeypatch
):
...
...
tests/weight_loading/models.txt
View file @
d2b52805
...
...
@@ -26,9 +26,5 @@ compressed-tensors, nm-testing/SparseLlama-3.1-8B-gsm8k-pruned.2of4-W8A8-testing
awq, casperhansen/mixtral-instruct-awq, main
awq_marlin, casperhansen/mixtral-instruct-awq, main
fp8, neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV, main
marlin, nm-testing/zephyr-beta-7b-marlin-g128, main
marlin, robertgshaw2/zephyr-7b-beta-channelwise-marlin, main
qqq, HandH1998/QQQ-Llama-3-8b-g128, main
qqq, HandH1998/QQQ-Llama-3-8b, main
hqq, nm-testing/Llama-3.2-1B-Instruct-HQQ, main
None, mgleize/fairseq2-dummy-Llama-3.2-1B, main
\ No newline at end of file
tests/worker/test_model_input.py
View file @
d2b52805
...
...
@@ -9,10 +9,7 @@ from vllm.attention import AttentionMetadata, AttentionMetadataBuilder
from
vllm.attention.backends.abstract
import
AttentionBackend
from
vllm.attention.backends.utils
import
CommonAttentionState
from
vllm.model_executor
import
SamplingMetadata
from
vllm.model_executor.pooling_metadata
import
PoolingMetadata
from
vllm.worker.model_runner
import
ModelInputForGPUWithSamplingMetadata
from
vllm.worker.pooling_model_runner
import
(
ModelInputForGPUWithPoolingMetadata
)
class
MockAttentionBackend
(
AttentionBackend
):
...
...
@@ -114,54 +111,3 @@ def test_model_runner_input():
assert
(
received_model_input
.
sampling_metadata
.
selected_token_indices
==
sampling_metadata
.
selected_token_indices
)
assert
received_model_input
.
sampling_metadata
.
seq_groups
is
None
def
test_embedding_model_runner_input
():
pooling_metadata
=
PoolingMetadata
(
seq_groups
=
[[
0
]],
seq_data
=
{},
prompt_lens
=
[
1
],
)
attn_metadata
=
AttentionMetadata
(
num_prefills
=
1
,
num_prefill_tokens
=
2
,
num_decode_tokens
=
3
,
slot_mapping
=
torch
.
zeros
(
1
),
multi_modal_placeholder_index_maps
=
None
,
enable_kv_scales_calculation
=
True
,
)
model_input
=
ModelInputForGPUWithPoolingMetadata
(
input_tokens
=
torch
.
ones
(
10
),
input_positions
=
torch
.
ones
(
10
),
pooling_metadata
=
pooling_metadata
,
attn_metadata
=
attn_metadata
)
assert
isinstance
(
model_input
,
ModelInputForGPUWithPoolingMetadata
)
# Test round trip serialization.
tensor_dict
=
model_input
.
as_broadcastable_tensor_dict
()
attn_backend
=
MockAttentionBackend
()
received_model_input
=
(
ModelInputForGPUWithPoolingMetadata
.
from_broadcasted_tensor_dict
(
tensor_dict
,
attn_backend
=
attn_backend
))
# Check that received copy has correct values.
assert
isinstance
(
received_model_input
,
ModelInputForGPUWithPoolingMetadata
)
assert
received_model_input
.
input_tokens
is
not
None
assert
(
received_model_input
.
input_tokens
==
model_input
.
input_tokens
).
all
()
assert
received_model_input
.
input_positions
is
not
None
assert
(
received_model_input
.
input_positions
==
model_input
.
input_positions
).
all
()
assert
received_model_input
.
multi_modal_kwargs
is
None
assert
(
received_model_input
.
multi_modal_kwargs
==
model_input
.
multi_modal_kwargs
)
assert
received_model_input
.
lora_requests
is
None
assert
received_model_input
.
lora_requests
==
model_input
.
lora_requests
assert
received_model_input
.
lora_mapping
is
None
assert
received_model_input
.
lora_mapping
==
model_input
.
lora_mapping
for
field
in
dataclasses
.
fields
(
AttentionMetadata
):
assert
getattr
(
received_model_input
.
attn_metadata
,
field
.
name
,
None
)
==
getattr
(
attn_metadata
,
field
.
name
,
None
)
# Pooling metadata is not broadcast.
assert
received_model_input
.
pooling_metadata
is
None
tools/check_pickle_imports.py
View file @
d2b52805
...
...
@@ -37,7 +37,7 @@ ALLOWED_FILES = set([
'vllm/distributed/utils.py'
,
'vllm/distributed/parallel_state.py'
,
'vllm/engine/multiprocessing/client.py'
,
'vllm/distributed/device_communicators/
custom_
all_reduce_utils.py'
,
'vllm/distributed/device_communicators/all_reduce_utils.py'
,
'vllm/distributed/device_communicators/shm_broadcast.py'
,
'vllm/engine/multiprocessing/engine.py'
,
'benchmarks/kernels/graph_machete_bench.py'
,
...
...
tools/ep_kernels/install_python_libraries.sh
View file @
d2b52805
...
...
@@ -77,6 +77,7 @@ clone_repo() {
local
repo_url
=
$1
local
dir_name
=
$2
local
key_file
=
$3
local
commit_hash
=
$4
if
[
-d
"
$dir_name
"
]
;
then
# Check if directory has uncommitted changes (dirty)
...
...
@@ -87,17 +88,27 @@ clone_repo() {
echo
"
$dir_name
directory exists but clone appears incomplete, cleaning up and re-cloning"
rm
-rf
"
$dir_name
"
git clone
"
$repo_url
"
if
[
-n
"
$commit_hash
"
]
;
then
cd
"
$dir_name
"
git checkout
"
$commit_hash
"
cd
..
fi
else
echo
"
$dir_name
directory exists and appears complete; manually update if needed"
fi
else
git clone
"
$repo_url
"
if
[
-n
"
$commit_hash
"
]
;
then
cd
"
$dir_name
"
git checkout
"
$commit_hash
"
cd
..
fi
fi
}
# build and install pplx, require pytorch installed
pushd
$WORKSPACE
clone_repo
"https://github.com/ppl-ai/pplx-kernels"
"pplx-kernels"
"setup.py"
clone_repo
"https://github.com/ppl-ai/pplx-kernels"
"pplx-kernels"
"setup.py"
"c336faf"
cd
pplx-kernels
# see https://github.com/pypa/pip/issues/9955#issuecomment-838065925
# PIP_NO_BUILD_ISOLATION=0 disables build isolation
...
...
@@ -106,7 +117,7 @@ popd
# build and install deepep, require pytorch installed
pushd
$WORKSPACE
clone_repo
"https://github.com/deepseek-ai/DeepEP"
"DeepEP"
"setup.py"
clone_repo
"https://github.com/deepseek-ai/DeepEP"
"DeepEP"
"setup.py"
"e3908bf"
cd
DeepEP
export
NVSHMEM_DIR
=
$WORKSPACE
/nvshmem_install
PIP_NO_BUILD_ISOLATION
=
0 pip
install
-vvv
-e
.
...
...
Prev
1
…
14
15
16
17
18
19
20
21
22
…
26
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment