Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
7e63ef82
Commit
7e63ef82
authored
Jan 21, 2026
by
zhuwenwen
Browse files
Merge tag 'v0.14.0' into v0.14.0-dev
parents
8cbcac5d
b17039bc
Changes
681
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
610 additions
and
220 deletions
+610
-220
tests/v1/e2e/test_async_scheduling.py
tests/v1/e2e/test_async_scheduling.py
+33
-37
tests/v1/e2e/test_async_spec_decode.py
tests/v1/e2e/test_async_spec_decode.py
+1
-1
tests/v1/e2e/test_spec_decode.py
tests/v1/e2e/test_spec_decode.py
+24
-19
tests/v1/e2e/untest_cascade_attention.py
tests/v1/e2e/untest_cascade_attention.py
+18
-17
tests/v1/ec_connector/integration/test_epd_correctness.py
tests/v1/ec_connector/integration/test_epd_correctness.py
+3
-7
tests/v1/engine/test_async_llm.py
tests/v1/engine/test_async_llm.py
+63
-2
tests/v1/engine/test_engine_core.py
tests/v1/engine/test_engine_core.py
+7
-1
tests/v1/engine/test_engine_core_client.py
tests/v1/engine/test_engine_core_client.py
+106
-4
tests/v1/engine/test_fast_incdec_prefix_err.py
tests/v1/engine/test_fast_incdec_prefix_err.py
+1
-0
tests/v1/engine/test_output_processor.py
tests/v1/engine/test_output_processor.py
+126
-56
tests/v1/engine/test_parallel_sampling.py
tests/v1/engine/test_parallel_sampling.py
+37
-56
tests/v1/engine/test_preprocess_error_handling.py
tests/v1/engine/test_preprocess_error_handling.py
+63
-0
tests/v1/engine/test_process_multi_modal_uuids.py
tests/v1/engine/test_process_multi_modal_uuids.py
+15
-5
tests/v1/engine/utils.py
tests/v1/engine/utils.py
+7
-1
tests/v1/entrypoints/llm/test_struct_output_generate.py
tests/v1/entrypoints/llm/test_struct_output_generate.py
+6
-2
tests/v1/entrypoints/openai/serving_responses/test_image.py
tests/v1/entrypoints/openai/serving_responses/test_image.py
+5
-5
tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
..._connector/nixl_integration/config_sweep_accuracy_test.sh
+54
-0
tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
+20
-2
tests/v1/kv_connector/unit/test_backwards_compatibility.py
tests/v1/kv_connector/unit/test_backwards_compatibility.py
+1
-1
tests/v1/kv_connector/unit/test_config.py
tests/v1/kv_connector/unit/test_config.py
+20
-4
No files found.
Too many changes to show.
To preserve performance only
681 of 681+
files are displayed.
Plain diff
Email patch
tests/v1/e2e/test_async_scheduling.py
View file @
7e63ef82
...
@@ -30,8 +30,9 @@ example_prompts = [first_prompt, "In one word, the capital of France is "] + [
...
@@ -30,8 +30,9 @@ example_prompts = [first_prompt, "In one word, the capital of France is "] + [
default_params
=
dict
(
default_params
=
dict
(
temperature
=
0.0
,
# greedy
temperature
=
0.0
,
# greedy
max_tokens
=
23
,
max_tokens
=
30
,
min_tokens
=
18
,
# spec decoding currently doesn't support min_tokens
# min_tokens=28,
)
)
...
@@ -50,6 +51,14 @@ def test_without_spec_decoding(
...
@@ -50,6 +51,14 @@ def test_without_spec_decoding(
dict
(
logprobs
=
2
),
dict
(
logprobs
=
2
),
dict
(
logprobs
=
2
,
presence_penalty
=-
1.0
),
dict
(
logprobs
=
2
,
presence_penalty
=-
1.0
),
dict
(
structured_outputs
=
struct_outputs
),
dict
(
structured_outputs
=
struct_outputs
),
dict
(
structured_outputs
=
struct_outputs
,
logprobs
=
2
,
),
dict
(
structured_outputs
=
struct_outputs
,
presence_penalty
=-
1.0
,
),
dict
(
dict
(
structured_outputs
=
struct_outputs
,
structured_outputs
=
struct_outputs
,
logprobs
=
2
,
logprobs
=
2
,
...
@@ -86,7 +95,7 @@ def test_without_spec_decoding(
...
@@ -86,7 +95,7 @@ def test_without_spec_decoding(
run_tests
(
monkeypatch
,
MODEL
,
test_configs
,
test_sampling_params
)
run_tests
(
monkeypatch
,
MODEL
,
test_configs
,
test_sampling_params
)
def
test_with_spec_decoding
(
monkeypatch
:
pytest
.
MonkeyPatch
):
def
test_with_spec_decoding
(
sample_json_schema
,
monkeypatch
:
pytest
.
MonkeyPatch
):
"""Test consistency and acceptance rates with some different combos of
"""Test consistency and acceptance rates with some different combos of
preemption, executor, async scheduling, prefill chunking,
preemption, executor, async scheduling, prefill chunking,
spec decoding model length.
spec decoding model length.
...
@@ -100,9 +109,20 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
...
@@ -100,9 +109,20 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
# Set small draft model len to force doesn't-fit-in-drafter case.
# Set small draft model len to force doesn't-fit-in-drafter case.
spec_config_short
=
spec_config
|
{
"max_model_len"
:
50
}
spec_config_short
=
spec_config
|
{
"max_model_len"
:
50
}
struct_outputs
=
StructuredOutputsParams
(
json
=
sample_json_schema
)
test_sampling_params
=
[
test_sampling_params
=
[
dict
(),
dict
(),
dict
(
presence_penalty
=-
1.0
),
dict
(
bad_words
=
[
"the"
,
" the"
]),
dict
(
logprobs
=
2
),
dict
(
logprobs
=
2
),
dict
(
logprobs
=
2
,
presence_penalty
=-
1.0
),
dict
(
structured_outputs
=
struct_outputs
),
dict
(
structured_outputs
=
struct_outputs
,
logprobs
=
2
,
presence_penalty
=-
1.0
,
),
]
]
# test_preemption, executor, async_scheduling,
# test_preemption, executor, async_scheduling,
...
@@ -142,18 +162,12 @@ def run_tests(
...
@@ -142,18 +162,12 @@ def run_tests(
"""Test consistency of combos of async scheduling, preemption,
"""Test consistency of combos of async scheduling, preemption,
uni/multiproc executor with spec decoding."""
uni/multiproc executor with spec decoding."""
# Determine attention config based on platform
attention_config
=
{
"backend"
:
"FLEX_ATTENTION"
}
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
# avoid precision errors
if
current_platform
.
is_rocm
():
if
is_testing_with_spec_decoding
:
# Use TRITON_ATTN for spec decoding test for consistency
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_ATTN"
)
else
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"ROCM_AITER_FA"
)
else
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
# lock matmul precision to full FP32 (IEEE)
# lock matmul precision to full FP32 (IEEE)
m
.
setenv
(
"VLLM_FLOAT32_MATMUL_PRECISION"
,
"
ieee
"
)
m
.
setenv
(
"VLLM_FLOAT32_MATMUL_PRECISION"
,
"
highest
"
)
# m.setenv("VLLM_BATCH_INVARIANT", "1")
# m.setenv("VLLM_BATCH_INVARIANT", "1")
outputs
:
list
[
tuple
[
str
,
list
,
list
]]
=
[]
outputs
:
list
[
tuple
[
str
,
list
,
list
]]
=
[]
for
n
,
(
for
n
,
(
...
@@ -174,6 +188,7 @@ def run_tests(
...
@@ -174,6 +188,7 @@ def run_tests(
spec_config
,
spec_config
,
test_prefill_chunking
=
test_prefill_chunking
,
test_prefill_chunking
=
test_prefill_chunking
,
is_testing_with_spec_decoding
=
is_testing_with_spec_decoding
,
is_testing_with_spec_decoding
=
is_testing_with_spec_decoding
,
attention_config
=
attention_config
,
)
)
outputs
.
append
(
test_results
)
outputs
.
append
(
test_results
)
...
@@ -204,15 +219,7 @@ def run_tests(
...
@@ -204,15 +219,7 @@ def run_tests(
name_1
=
f
"config=[
{
test_config
}
], params=
{
params
}
"
,
name_1
=
f
"config=[
{
test_config
}
], params=
{
params
}
"
,
)
)
# On ROCm with TRITON_ATTN (spec decoding test), skip strict
assert
_all_logprobs_match
(
base_logprobs
,
test_logprobs
)
# logprobs comparison when logprobs are requested
skip_logprobs_check
=
(
current_platform
.
is_rocm
()
and
params
.
get
(
"logprobs"
)
and
is_testing_with_spec_decoding
)
if
not
skip_logprobs_check
:
assert
_all_logprobs_match
(
base_logprobs
,
test_logprobs
)
if
(
if
(
base_acceptance_rate
is
not
None
base_acceptance_rate
is
not
None
...
@@ -262,6 +269,7 @@ def run_test(
...
@@ -262,6 +269,7 @@ def run_test(
spec_config
:
dict
[
str
,
Any
]
|
None
,
spec_config
:
dict
[
str
,
Any
]
|
None
,
test_prefill_chunking
:
bool
,
test_prefill_chunking
:
bool
,
is_testing_with_spec_decoding
:
bool
=
False
,
is_testing_with_spec_decoding
:
bool
=
False
,
attention_config
:
dict
[
str
,
Any
]
|
None
=
None
,
):
):
spec_decoding
=
spec_config
is
not
None
spec_decoding
=
spec_config
is
not
None
cache_arg
:
dict
[
str
,
Any
]
=
(
cache_arg
:
dict
[
str
,
Any
]
=
(
...
@@ -281,14 +289,6 @@ def run_test(
...
@@ -281,14 +289,6 @@ def run_test(
print
(
f
"---- TESTING
{
test_str
}
:
{
test_config
}
"
)
print
(
f
"---- TESTING
{
test_str
}
:
{
test_config
}
"
)
print
(
"-"
*
80
)
print
(
"-"
*
80
)
# On ROCm: use float16 for first test (ROCM_AITER_FA), but float32 for
# spec decoding test (TRITON_ATTN) for better precision.
# On others: always use float32.
if
current_platform
.
is_rocm
()
and
not
is_testing_with_spec_decoding
:
dtype
=
"float16"
else
:
dtype
=
"float32"
with
VllmRunner
(
with
VllmRunner
(
model
,
model
,
max_model_len
=
512
,
max_model_len
=
512
,
...
@@ -298,9 +298,10 @@ def run_test(
...
@@ -298,9 +298,10 @@ def run_test(
# enforce_eager=True,
# enforce_eager=True,
async_scheduling
=
async_scheduling
,
async_scheduling
=
async_scheduling
,
distributed_executor_backend
=
executor
,
distributed_executor_backend
=
executor
,
dtype
=
dtype
,
dtype
=
"float32"
,
speculative_config
=
spec_config
,
speculative_config
=
spec_config
,
disable_log_stats
=
False
,
disable_log_stats
=
False
,
attention_config
=
attention_config
,
**
cache_arg
,
**
cache_arg
,
)
as
vllm_model
:
)
as
vllm_model
:
results
=
[]
results
=
[]
...
@@ -358,12 +359,7 @@ def _all_logprobs_match(req_a, req_b) -> bool:
...
@@ -358,12 +359,7 @@ def _all_logprobs_match(req_a, req_b) -> bool:
def
_logprobs_match
(
lps_a
:
dict
[
int
,
Logprob
],
lps_b
:
dict
[
int
,
Logprob
])
->
bool
:
def
_logprobs_match
(
lps_a
:
dict
[
int
,
Logprob
],
lps_b
:
dict
[
int
,
Logprob
])
->
bool
:
if
current_platform
.
is_rocm
():
rel_tol
,
abs_tol
=
1e-3
,
1e-6
# ROCm has higher numerical variance
# due to use of float16.
rel_tol
,
abs_tol
=
5e-2
,
1e-5
else
:
rel_tol
,
abs_tol
=
1e-3
,
1e-6
return
(
return
(
len
(
lps_a
)
==
len
(
lps_b
)
len
(
lps_a
)
==
len
(
lps_b
)
and
lps_a
.
keys
()
==
lps_b
.
keys
()
and
lps_a
.
keys
()
==
lps_b
.
keys
()
...
...
tests/v1/e2e/test_async_spec_decode.py
View file @
7e63ef82
...
@@ -19,7 +19,7 @@ def sync_tracker():
...
@@ -19,7 +19,7 @@ def sync_tracker():
Fixture that patches CommonAttentionMetadata.seq_lens_cpu to detect
Fixture that patches CommonAttentionMetadata.seq_lens_cpu to detect
lazy init syncs. Prints stack traces immediately when syncs occur.
lazy init syncs. Prints stack traces immediately when syncs occur.
"""
"""
from
vllm.v1.attention.backend
s.utils
import
CommonAttentionMetadata
from
vllm.v1.attention.backend
import
CommonAttentionMetadata
# Shared counter for cross-process communication (inherited by fork)
# Shared counter for cross-process communication (inherited by fork)
sync_count
=
multiprocessing
.
Value
(
"i"
,
0
)
sync_count
=
multiprocessing
.
Value
(
"i"
,
0
)
...
...
tests/v1/e2e/test_spec_decode.py
View file @
7e63ef82
...
@@ -445,25 +445,26 @@ def test_eagle_correctness(
...
@@ -445,25 +445,26 @@ def test_eagle_correctness(
should be the same when using eagle speculative decoding.
should be the same when using eagle speculative decoding.
model_setup: (method, model_name, eagle_model_name, tp_size)
model_setup: (method, model_name, eagle_model_name, tp_size)
"""
"""
with
monkeypatch
.
context
()
as
m
:
# Determine attention config
if
"Llama-4-Scout"
in
model_setup
[
1
]
and
attn_backend
==
"FLASH_ATTN"
:
# Scout requires default backend selection because vision encoder has
# Scout requires default backend selection
# head_dim 88 being incompatible with FLASH_ATTN and needs to fall back
# because vision encoder has head_dim 88 being incompatible
# to Flex Attn
# with FLASH_ATTN and needs to fall back to Flex Attn
if
"Llama-4-Scout"
in
model_setup
[
1
]
and
attn_backend
==
"FLASH_ATTN"
:
if
current_platform
.
is_rocm
():
# pass if not ROCm
# TODO: Enable Flex Attn for spec_decode on ROCm
if
current_platform
.
is_rocm
():
pytest
.
skip
(
"Flex Attn for spec_decode not supported on ROCm currently"
)
# TODO: Enable Flex Attn for spec_decode on ROCm
attention_config
=
None
# Let it fall back to default
pytest
.
skip
(
"Flex Attn for spec_decode not supported on ROCm currently"
)
else
:
else
:
attention_config
=
{
"backend"
:
attn_backend
}
m
.
setenv
(
"VLLM_MLA_DISABLE"
,
"1"
)
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attn_backend
)
if
attn_backend
==
"TRITON_ATTN"
and
not
current_platform
.
is_rocm
():
pytest
.
skip
(
"TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform"
)
if
attn_backend
==
"TRITON_ATTN"
and
not
current_platform
.
is_rocm
():
with
monkeypatch
.
context
()
as
m
:
pytest
.
skip
(
m
.
setenv
(
"VLLM_MLA_DISABLE"
,
"1"
)
"TRITON_ATTN does not support "
"multi-token eagle spec decode on current platform"
)
if
attn_backend
==
"ROCM_AITER_FA"
and
current_platform
.
is_rocm
():
if
attn_backend
==
"ROCM_AITER_FA"
and
current_platform
.
is_rocm
():
if
"deepseek"
in
model_setup
[
1
].
lower
():
if
"deepseek"
in
model_setup
[
1
].
lower
():
...
@@ -478,7 +479,10 @@ def test_eagle_correctness(
...
@@ -478,7 +479,10 @@ def test_eagle_correctness(
max_num_batched_tokens
=
128
if
enable_chunked_prefill
else
max_model_len
max_num_batched_tokens
=
128
if
enable_chunked_prefill
else
max_model_len
ref_llm
=
LLM
(
ref_llm
=
LLM
(
model
=
model_name
,
max_model_len
=
max_model_len
,
tensor_parallel_size
=
tp_size
model
=
model_name
,
max_model_len
=
max_model_len
,
tensor_parallel_size
=
tp_size
,
attention_config
=
attention_config
,
)
)
ref_outputs
=
ref_llm
.
chat
(
test_prompts
,
sampling_config
)
ref_outputs
=
ref_llm
.
chat
(
test_prompts
,
sampling_config
)
del
ref_llm
del
ref_llm
...
@@ -499,6 +503,7 @@ def test_eagle_correctness(
...
@@ -499,6 +503,7 @@ def test_eagle_correctness(
max_num_batched_tokens
=
max_num_batched_tokens
,
max_num_batched_tokens
=
max_num_batched_tokens
,
enable_chunked_prefill
=
enable_chunked_prefill
,
enable_chunked_prefill
=
enable_chunked_prefill
,
model_impl
=
model_impl
,
model_impl
=
model_impl
,
attention_config
=
attention_config
,
)
)
spec_outputs
=
spec_llm
.
chat
(
test_prompts
,
sampling_config
)
spec_outputs
=
spec_llm
.
chat
(
test_prompts
,
sampling_config
)
matches
=
0
matches
=
0
...
...
tests/v1/e2e/untest_cascade_attention.py
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
pytest
import
pytest
from
vllm
import
LLM
,
SamplingParams
from
vllm
import
LLM
,
SamplingParams
from
...utils
import
create_new_process_for_each_test
from
...utils
import
create_new_process_for_each_test
from
...utils
import
models_path_prefix
@
create_new_process_for_each_test
()
@
create_new_process_for_each_test
()
@
pytest
.
mark
.
parametrize
(
"attn_backend"
,
[
"FLASH_ATTN"
,
"FLASHINFER"
])
@
pytest
.
mark
.
parametrize
(
"attn_backend"
,
[
"FLASH_ATTN"
,
"FLASHINFER"
])
def
test_cascade_attention
(
example_system_message
,
monkeypatch
,
attn_backend
):
def
test_cascade_attention
(
example_system_message
,
attn_backend
):
prompt
=
"
\n
<User>: Implement fibonacci sequence in Python.
\n
<Claude>:"
prompt
=
"
\n
<User>: Implement fibonacci sequence in Python.
\n
<Claude>:"
if
attn_backend
==
"FLASHINFER"
:
if
attn_backend
==
"FLASHINFER"
:
...
@@ -19,19 +21,18 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
...
@@ -19,19 +21,18 @@ def test_cascade_attention(example_system_message, monkeypatch, attn_backend):
"needs investigation. See issue #25679."
"needs investigation. See issue #25679."
)
)
with
monkeypatch
.
context
()
as
m
:
llm
=
LLM
(
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
attn_backend
)
model
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2-1.5B-Instruct"
),
attention_config
=
{
"backend"
:
attn_backend
}
)
llm
=
LLM
(
model
=
"Qwen/Qwen2-1.5B-Instruct"
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
100
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
max_tokens
=
100
)
# No cascade attention.
# No cascade attention.
single_prompt
=
[
example_system_message
+
prompt
]
single_prompt
=
[
example_system_message
+
prompt
]
responses
=
llm
.
generate
(
single_prompt
,
sampling_params
)
responses
=
llm
.
generate
(
single_prompt
,
sampling_params
)
ref_output
=
responses
[
0
].
outputs
[
0
].
text
ref_output
=
responses
[
0
].
outputs
[
0
].
text
# (Probably) Use cascade attention.
# (Probably) Use cascade attention.
prompts
=
[
example_system_message
+
prompt
]
*
64
prompts
=
[
example_system_message
+
prompt
]
*
64
responses
=
llm
.
generate
(
prompts
,
sampling_params
)
responses
=
llm
.
generate
(
prompts
,
sampling_params
)
for
response
in
responses
:
for
response
in
responses
:
assert
response
.
outputs
[
0
].
text
==
ref_output
assert
response
.
outputs
[
0
].
text
==
ref_output
\ No newline at end of file
tests/v1/ec_connector/integration/test_epd_correctness.py
View file @
7e63ef82
...
@@ -31,7 +31,7 @@ import openai
...
@@ -31,7 +31,7 @@ import openai
import
requests
import
requests
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.multimodal.utils
import
encode_image_
base64
from
vllm.multimodal.utils
import
encode_image_
url
MAX_OUTPUT_LEN
=
256
MAX_OUTPUT_LEN
=
256
...
@@ -49,9 +49,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
...
@@ -49,9 +49,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
"content"
:
[
"content"
:
[
{
{
"type"
:
"image_url"
,
"type"
:
"image_url"
,
"image_url"
:
{
"image_url"
:
{
"url"
:
encode_image_url
(
image_1
)},
"url"
:
f
"data:image;base64,
{
encode_image_base64
(
image_1
)
}
"
},
},
},
{
"type"
:
"text"
,
"text"
:
"What's in this image?"
},
{
"type"
:
"text"
,
"text"
:
"What's in this image?"
},
],
],
...
@@ -66,9 +64,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
...
@@ -66,9 +64,7 @@ SAMPLE_PROMPTS_MM: list[dict] = [
"content"
:
[
"content"
:
[
{
{
"type"
:
"image_url"
,
"type"
:
"image_url"
,
"image_url"
:
{
"image_url"
:
{
"url"
:
encode_image_url
(
image_2
)},
"url"
:
f
"data:image;base64,
{
encode_image_base64
(
image_2
)
}
"
},
},
},
{
{
"type"
:
"image_url"
,
"type"
:
"image_url"
,
...
...
tests/v1/engine/test_async_llm.py
View file @
7e63ef82
...
@@ -12,6 +12,13 @@ from vllm import SamplingParams
...
@@ -12,6 +12,13 @@ from vllm import SamplingParams
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.config
import
VllmConfig
from
vllm.config
import
VllmConfig
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.engine.arg_utils
import
AsyncEngineArgs
from
vllm.entrypoints.openai.protocol
import
(
ChatCompletionRequest
,
ChatCompletionResponse
,
ErrorResponse
,
)
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
from
vllm.entrypoints.openai.serving_models
import
BaseModelPath
,
OpenAIServingModels
from
vllm.inputs
import
PromptType
from
vllm.inputs
import
PromptType
from
vllm.outputs
import
RequestOutput
from
vllm.outputs
import
RequestOutput
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
...
@@ -255,7 +262,7 @@ async def test_multi_abort(output_kind: RequestOutputKind):
...
@@ -255,7 +262,7 @@ async def test_multi_abort(output_kind: RequestOutputKind):
# Use multi-abort to abort multiple requests at once
# Use multi-abort to abort multiple requests at once
abort_request_ids
=
[
request_ids
[
i
]
for
i
in
REQUEST_IDS_TO_ABORT
]
abort_request_ids
=
[
request_ids
[
i
]
for
i
in
REQUEST_IDS_TO_ABORT
]
await
engine
.
abort
(
abort_request_ids
)
await
engine
.
abort
(
abort_request_ids
,
internal
=
False
)
# Wait for all tasks to complete
# Wait for all tasks to complete
results
=
await
asyncio
.
gather
(
*
tasks
,
return_exceptions
=
True
)
results
=
await
asyncio
.
gather
(
*
tasks
,
return_exceptions
=
True
)
...
@@ -486,6 +493,60 @@ async def test_dp_rank_argument():
...
@@ -486,6 +493,60 @@ async def test_dp_rank_argument():
pass
pass
@
pytest
.
mark
.
asyncio
(
scope
=
"module"
)
async
def
test_header_dp_rank_argument
():
with
ExitStack
()
as
after
:
with
set_default_torch_num_threads
(
1
):
engine
=
AsyncLLM
.
from_engine_args
(
TEXT_ENGINE_ARGS
)
after
.
callback
(
engine
.
shutdown
)
MODEL_NAME
=
"test-model"
BASE_MODEL_PATHS
=
[
BaseModelPath
(
name
=
MODEL_NAME
,
model_path
=
MODEL_NAME
)]
# Create models first
models
=
OpenAIServingModels
(
engine_client
=
engine
,
base_model_paths
=
BASE_MODEL_PATHS
,
)
# Create serving chat instance
serving_chat
=
OpenAIServingChat
(
engine_client
=
engine
,
models
=
models
,
response_role
=
"assistant"
,
chat_template
=
None
,
chat_template_content_format
=
"auto"
,
request_logger
=
None
,
)
# Create a chat completion request
req
=
ChatCompletionRequest
(
model
=
MODEL_NAME
,
messages
=
[{
"role"
:
"user"
,
"content"
:
TEXT_PROMPT
}],
max_tokens
=
100
,
temperature
=
1.0
,
seed
=
33
,
)
# Test 1: Valid DP rank (0)
mock_raw_request
=
MagicMock
()
mock_raw_request
.
headers
=
{
"X-data-parallel-rank"
:
"0"
}
mock_raw_request
.
state
=
MagicMock
()
# Should succeed with valid rank
response
=
await
serving_chat
.
create_chat_completion
(
req
,
mock_raw_request
)
assert
isinstance
(
response
,
ChatCompletionResponse
),
(
"Expected a ChatCompletionResponse for valid DP rank"
)
# Test 2: Out-of-range DP rank (1)
mock_raw_request
.
headers
=
{
"X-data-parallel-rank"
:
"1"
}
# should return ErrorResponse for out-of-range rank
response2
=
await
serving_chat
.
create_chat_completion
(
req
,
mock_raw_request
)
assert
isinstance
(
response2
,
ErrorResponse
),
(
"Expected an ErrorResponse for out-of-range DP rank"
)
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_check_health
():
async
def
test_check_health
():
"""Test that check_health returns normally for healthy engine
"""Test that check_health returns normally for healthy engine
...
@@ -550,7 +611,7 @@ async def test_abort_final_output(output_kind: RequestOutputKind):
...
@@ -550,7 +611,7 @@ async def test_abort_final_output(output_kind: RequestOutputKind):
await
asyncio
.
sleep
(
0.5
)
await
asyncio
.
sleep
(
0.5
)
# Abort the request
# Abort the request
await
engine
.
abort
(
request_id
)
await
engine
.
abort
(
request_id
,
internal
=
False
)
# Wait for generation to complete and return final output
# Wait for generation to complete and return final output
final_output
=
await
generated
final_output
=
await
generated
...
...
tests/v1/engine/test_engine_core.py
View file @
7e63ef82
...
@@ -42,10 +42,16 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
...
@@ -42,10 +42,16 @@ TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
PROMPT
=
"I am Gyoubu Masataka Oniwa"
PROMPT
=
"I am Gyoubu Masataka Oniwa"
PROMPT_TOKENS
=
TOKENIZER
(
PROMPT
).
input_ids
PROMPT_TOKENS
=
TOKENIZER
(
PROMPT
).
input_ids
_REQUEST_COUNTER
=
0
def
make_request
()
->
EngineCoreRequest
:
def
make_request
()
->
EngineCoreRequest
:
global
_REQUEST_COUNTER
_REQUEST_COUNTER
+=
1
request_id
=
f
"request-
{
_REQUEST_COUNTER
}
"
return
EngineCoreRequest
(
return
EngineCoreRequest
(
request_id
=
str
(
uuid
.
uuid4
()),
request_id
=
request_id
,
external_req_id
=
f
"
{
request_id
}
-
{
uuid
.
uuid4
()
}
"
,
prompt_token_ids
=
PROMPT_TOKENS
,
prompt_token_ids
=
PROMPT_TOKENS
,
mm_features
=
None
,
mm_features
=
None
,
sampling_params
=
SamplingParams
(),
sampling_params
=
SamplingParams
(),
...
...
tests/v1/engine/test_engine_core_client.py
View file @
7e63ef82
...
@@ -2,12 +2,14 @@
...
@@ -2,12 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
asyncio
import
asyncio
import
importlib
import
os
import
os
import
signal
import
signal
import
time
import
time
import
uuid
import
uuid
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
threading
import
Thread
from
threading
import
Thread
from
types
import
SimpleNamespace
from
typing
import
Any
from
typing
import
Any
from
unittest.mock
import
MagicMock
from
unittest.mock
import
MagicMock
...
@@ -25,7 +27,11 @@ from vllm.usage.usage_lib import UsageContext
...
@@ -25,7 +27,11 @@ from vllm.usage.usage_lib import UsageContext
from
vllm.utils.torch_utils
import
set_default_torch_num_threads
from
vllm.utils.torch_utils
import
set_default_torch_num_threads
from
vllm.v1.engine
import
EngineCoreRequest
from
vllm.v1.engine
import
EngineCoreRequest
from
vllm.v1.engine.core
import
EngineCore
from
vllm.v1.engine.core
import
EngineCore
from
vllm.v1.engine.core_client
import
AsyncMPClient
,
EngineCoreClient
,
SyncMPClient
from
vllm.v1.engine.core_client
import
(
AsyncMPClient
,
EngineCoreClient
,
SyncMPClient
,
)
from
vllm.v1.engine.utils
import
CoreEngineProcManager
from
vllm.v1.engine.utils
import
CoreEngineProcManager
from
vllm.v1.executor.abstract
import
Executor
from
vllm.v1.executor.abstract
import
Executor
...
@@ -33,14 +39,19 @@ from ...distributed.conftest import MockSubscriber
...
@@ -33,14 +39,19 @@ from ...distributed.conftest import MockSubscriber
from
...utils
import
create_new_process_for_each_test
from
...utils
import
create_new_process_for_each_test
from
...utils
import
models_path_prefix
from
...utils
import
models_path_prefix
if
not
current_platform
.
is_cuda
():
if
not
current_platform
.
is_cuda_alike
():
pytest
.
skip
(
reason
=
"V1 currently only supported on CUDA."
,
allow_module_level
=
True
)
pytest
.
skip
(
reason
=
"V1 currently only supported on CUDA-alike platforms."
,
allow_module_level
=
True
,
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
MODEL_NAME
=
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
)
TOKENIZER
=
AutoTokenizer
.
from_pretrained
(
MODEL_NAME
)
TOKENIZER
=
AutoTokenizer
.
from_pretrained
(
MODEL_NAME
)
PROMPT
=
"Hello my name is Robert and I love quantization kernels"
PROMPT
=
"Hello my name is Robert and I love quantization kernels"
PROMPT_TOKENS
=
TOKENIZER
(
PROMPT
).
input_ids
PROMPT_TOKENS
=
TOKENIZER
(
PROMPT
).
input_ids
_REQUEST_COUNTER
=
0
def
make_request
(
def
make_request
(
params
:
SamplingParams
,
prompt_tokens_ids
:
list
[
int
]
|
None
=
None
params
:
SamplingParams
,
prompt_tokens_ids
:
list
[
int
]
|
None
=
None
...
@@ -48,8 +59,12 @@ def make_request(
...
@@ -48,8 +59,12 @@ def make_request(
if
not
prompt_tokens_ids
:
if
not
prompt_tokens_ids
:
prompt_tokens_ids
=
PROMPT_TOKENS
prompt_tokens_ids
=
PROMPT_TOKENS
global
_REQUEST_COUNTER
_REQUEST_COUNTER
+=
1
request_id
=
f
"request-
{
_REQUEST_COUNTER
}
"
return
EngineCoreRequest
(
return
EngineCoreRequest
(
request_id
=
str
(
uuid
.
uuid4
()),
request_id
=
request_id
,
external_req_id
=
f
"
{
request_id
}
-
{
uuid
.
uuid4
()
}
"
,
prompt_token_ids
=
prompt_tokens_ids
,
prompt_token_ids
=
prompt_tokens_ids
,
mm_features
=
None
,
mm_features
=
None
,
sampling_params
=
params
,
sampling_params
=
params
,
...
@@ -62,6 +77,92 @@ def make_request(
...
@@ -62,6 +77,92 @@ def make_request(
)
)
def
_reload_envs_module
():
import
vllm.envs
as
envs_mod
cache_clear
=
getattr
(
getattr
(
envs_mod
,
"__getattr__"
,
None
),
"cache_clear"
,
None
)
if
cache_clear
is
not
None
:
cache_clear
()
return
importlib
.
reload
(
envs_mod
)
def
_reload_core_client_module
():
module
=
importlib
.
import_module
(
"vllm.v1.engine.core_client"
)
return
importlib
.
reload
(
module
)
def
test_mp_client_uses_env_timeout
(
monkeypatch
:
pytest
.
MonkeyPatch
):
timeout_value
=
654
monkeypatch
.
setenv
(
"VLLM_ENGINE_READY_TIMEOUT_S"
,
str
(
timeout_value
))
# Ensure that the environment variable is loaded if caching is enabled
_reload_envs_module
()
core_client_mod
=
_reload_core_client_module
()
poll_timeouts
:
list
[
int
]
=
[]
class
ShadowSocket
:
def
poll
(
self
,
timeout
:
int
)
->
int
:
# Capture the timeout value for each poll call
poll_timeouts
.
append
(
timeout
)
return
1
def
recv_multipart
(
self
):
return
(
b
"
\x00\x00
"
,
b
"ready"
)
class
DummySocket
:
def
send_multipart
(
self
,
_msg
,
*
,
copy
:
bool
=
False
,
track
:
bool
=
False
):
if
track
:
return
SimpleNamespace
(
done
=
True
)
def
recv_multipart
(
self
,
*
,
copy
:
bool
=
False
):
return
(
b
""
,
b
""
)
def
close
(
self
,
*
,
linger
:
int
=
0
):
pass
def
bind
(
self
,
_address
):
pass
def
connect
(
self
,
_address
):
pass
def
setsockopt
(
self
,
*
_args
,
**
_kwargs
):
pass
monkeypatch
.
setattr
(
core_client_mod
.
zmq
.
Socket
,
"shadow"
,
lambda
*
_
:
ShadowSocket
())
monkeypatch
.
setattr
(
core_client_mod
,
"make_zmq_socket"
,
lambda
*
_
,
**
__
:
DummySocket
()
)
parallel_config
=
SimpleNamespace
(
data_parallel_size
=
1
,
data_parallel_rank
=
0
,
data_parallel_index
=
0
,
data_parallel_size_local
=
1
,
data_parallel_rank_local
=
None
,
data_parallel_hybrid_lb
=
False
,
data_parallel_external_lb
=
False
,
)
vllm_config
=
SimpleNamespace
(
parallel_config
=
parallel_config
)
client
=
core_client_mod
.
MPClient
(
asyncio_mode
=
False
,
vllm_config
=
vllm_config
,
executor_class
=
object
,
log_stats
=
False
,
client_addresses
=
{
"input_address"
:
"inproc://input"
,
"output_address"
:
"inproc://output"
,
},
)
try
:
# timeout_value is in seconds, but poll receives milliseconds
assert
poll_timeouts
==
[
timeout_value
*
1000
]
finally
:
client
.
shutdown
()
def
loop_until_done
(
client
:
EngineCoreClient
,
outputs
:
dict
):
def
loop_until_done
(
client
:
EngineCoreClient
,
outputs
:
dict
):
while
True
:
while
True
:
engine_core_outputs
=
client
.
get_output
().
outputs
engine_core_outputs
=
client
.
get_output
().
outputs
...
@@ -638,6 +739,7 @@ def test_kv_cache_events(
...
@@ -638,6 +739,7 @@ def test_kv_cache_events(
)
)
assert
event
.
parent_block_hash
is
None
,
"Parent block hash should be None"
assert
event
.
parent_block_hash
is
None
,
"Parent block hash should be None"
assert
event
.
lora_id
is
None
,
"Lora id should be None"
assert
event
.
lora_id
is
None
,
"Lora id should be None"
assert
event
.
lora_name
is
None
,
"Lora name should be None"
assert
len
(
event
.
token_ids
)
==
num_blocks
*
block_size
,
(
assert
len
(
event
.
token_ids
)
==
num_blocks
*
block_size
,
(
"Token ids should be the same as the custom tokens"
"Token ids should be the same as the custom tokens"
)
)
...
...
tests/v1/engine/test_fast_incdec_prefix_err.py
View file @
7e63ef82
...
@@ -29,6 +29,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
...
@@ -29,6 +29,7 @@ def test_fast_inc_detok_invalid_utf8_err_case():
params
=
SamplingParams
(
skip_special_tokens
=
True
)
params
=
SamplingParams
(
skip_special_tokens
=
True
)
request
=
EngineCoreRequest
(
request
=
EngineCoreRequest
(
request_id
=
"test"
,
request_id
=
"test"
,
external_req_id
=
"test-ext"
,
prompt_token_ids
=
prompt_token_ids
,
prompt_token_ids
=
prompt_token_ids
,
mm_features
=
None
,
mm_features
=
None
,
sampling_params
=
params
,
sampling_params
=
params
,
...
...
tests/v1/engine/test_output_processor.py
View file @
7e63ef82
...
@@ -59,12 +59,12 @@ def test_incremental_detokenization(
...
@@ -59,12 +59,12 @@ def test_incremental_detokenization(
output_processor
=
OutputProcessor
(
output_processor
=
OutputProcessor
(
dummy_test_vectors
.
tokenizer
,
log_stats
=
False
,
stream_interval
=
stream_interval
dummy_test_vectors
.
tokenizer
,
log_stats
=
False
,
stream_interval
=
stream_interval
)
)
engine_core
=
MockEngineCore
(
tokens_list
=
dummy_test_vectors
.
generation_tokens
)
# Make N requests.
# Make N requests.
requests
=
[
requests
=
[
EngineCoreRequest
(
EngineCoreRequest
(
request_id
=
f
"request-
{
idx
}
"
,
request_id
=
f
"request-
{
idx
}
-int"
,
external_req_id
=
f
"request-
{
idx
}
"
,
prompt_token_ids
=
prompt_tokens
,
prompt_token_ids
=
prompt_tokens
,
mm_features
=
None
,
mm_features
=
None
,
eos_token_id
=
None
,
eos_token_id
=
None
,
...
@@ -84,6 +84,11 @@ def test_incremental_detokenization(
...
@@ -84,6 +84,11 @@ def test_incremental_detokenization(
for
idx
,
prompt_tokens
in
enumerate
(
dummy_test_vectors
.
prompt_tokens
)
for
idx
,
prompt_tokens
in
enumerate
(
dummy_test_vectors
.
prompt_tokens
)
]
]
engine_core
=
MockEngineCore
(
tokens_list
=
dummy_test_vectors
.
generation_tokens
,
request_ids
=
[
req
.
request_id
for
req
in
requests
],
)
# Add requests to the detokenizer.
# Add requests to the detokenizer.
for
request
,
prompt
in
zip
(
requests
,
dummy_test_vectors
.
prompt_strings
):
for
request
,
prompt
in
zip
(
requests
,
dummy_test_vectors
.
prompt_strings
):
output_processor
.
add_request
(
request
,
prompt
)
output_processor
.
add_request
(
request
,
prompt
)
...
@@ -270,12 +275,28 @@ def _validate_logprobs(
...
@@ -270,12 +275,28 @@ def _validate_logprobs(
# the logprob token id at this sequence position
# the logprob token id at this sequence position
decoded_token
=
pos_logprob_dict
[
lp_tok
].
decoded_token
decoded_token
=
pos_logprob_dict
[
lp_tok
].
decoded_token
ref_decoded_token
=
_ref_convert_id_to_token
(
dtv
.
tokenizer
,
lp_tok
)
ref_decoded_token
=
_ref_convert_id_to_token
(
dtv
.
tokenizer
,
lp_tok
)
assert
decoded_token
==
ref_decoded_token
,
(
f
"Sampled logprob token id
{
lp_tok
}
decodes to"
# With UTF-8 correction logic, tokens ending with "�"
f
"
{
ref_decoded_token
}
but Logprob decoded"
# (incomplete byte sequences) are corrected to either
f
" token is
{
decoded_token
}
instead"
# empty string or proper UTF-8 characters
f
" (at position
{
idx
}
)"
if
ref_decoded_token
.
endswith
(
"�"
):
)
# Token needs UTF-8 correction
assert
not
decoded_token
.
endswith
(
"�"
),
(
f
"Sampled logprob token id
{
lp_tok
}
decodes to"
f
" '
{
ref_decoded_token
}
' (ends with replacement char)"
f
" but corrected decoded token '
{
decoded_token
}
'"
f
" still ends with replacement char"
f
" (at position
{
idx
}
). UTF-8 correction should"
f
" have removed it."
)
else
:
# No correction needed, should match exactly
assert
decoded_token
==
ref_decoded_token
,
(
f
"Sampled logprob token id
{
lp_tok
}
decodes to"
f
"
{
ref_decoded_token
}
but Logprob decoded"
f
" token is
{
decoded_token
}
instead"
f
" (at position
{
idx
}
)"
)
ref_cumulative_logprob
+=
pos_logprob_dict
[
sampled_token
].
logprob
ref_cumulative_logprob
+=
pos_logprob_dict
[
sampled_token
].
logprob
# Assert that cumulative logprobs are correct
# Assert that cumulative logprobs are correct
...
@@ -416,12 +437,28 @@ def _validate_logprobs(
...
@@ -416,12 +437,28 @@ def _validate_logprobs(
# the logprob token id at this sequence position
# the logprob token id at this sequence position
decoded_token
=
pos_logprob_dict
[
plp_tok
].
decoded_token
decoded_token
=
pos_logprob_dict
[
plp_tok
].
decoded_token
ref_decoded_token
=
_ref_convert_id_to_token
(
dtv
.
tokenizer
,
plp_tok
)
ref_decoded_token
=
_ref_convert_id_to_token
(
dtv
.
tokenizer
,
plp_tok
)
assert
decoded_token
==
ref_decoded_token
,
(
f
"Prompt logprob token id
{
plp_tok
}
decodes to"
# With UTF-8 correction logic, tokens ending with "�"
f
"
{
ref_decoded_token
}
but Logprob decoded"
# (incomplete byte sequences) are corrected to either
f
" token is
{
decoded_token
}
instead"
# empty string or proper UTF-8 characters
f
" (at position
{
idx
}
)"
if
ref_decoded_token
.
endswith
(
"�"
):
)
# Token needs UTF-8 correction
assert
not
decoded_token
.
endswith
(
"�"
),
(
f
"Prompt logprob token id
{
plp_tok
}
decodes to"
f
" '
{
ref_decoded_token
}
' (ends with replacement char)"
f
" but corrected decoded token '
{
decoded_token
}
'"
f
" still ends with replacement char"
f
" (at position
{
idx
}
). UTF-8 correction should"
f
" have removed it."
)
else
:
# No correction needed, should match exactly
assert
decoded_token
==
ref_decoded_token
,
(
f
"Prompt logprob token id
{
plp_tok
}
decodes to"
f
"
{
ref_decoded_token
}
but Logprob decoded"
f
" token is
{
decoded_token
}
instead"
f
" (at position
{
idx
}
)"
)
else
:
else
:
# Prompt logprobs disabled for this request
# Prompt logprobs disabled for this request
assert
prompt_logprobs
is
None
assert
prompt_logprobs
is
None
...
@@ -439,15 +476,6 @@ def test_logprobs_processor(
...
@@ -439,15 +476,6 @@ def test_logprobs_processor(
dummy_test_vectors
,
dummy_test_vectors
,
):
):
output_processor
=
OutputProcessor
(
dummy_test_vectors
.
tokenizer
,
log_stats
=
False
)
output_processor
=
OutputProcessor
(
dummy_test_vectors
.
tokenizer
,
log_stats
=
False
)
engine_core
=
MockEngineCore
(
tokens_list
=
dummy_test_vectors
.
generation_tokens
,
generated_logprobs_raw
=
None
if
num_sample_logprobs
is
None
else
dummy_test_vectors
.
generation_logprobs
,
prompt_logprobs_raw
=
None
if
num_prompt_logprobs
is
None
else
dummy_test_vectors
.
prompt_logprobs
,
)
# Make N requests.
# Make N requests.
request_id_list
=
[
request_id_list
=
[
...
@@ -455,7 +483,8 @@ def test_logprobs_processor(
...
@@ -455,7 +483,8 @@ def test_logprobs_processor(
]
]
requests
=
[
requests
=
[
EngineCoreRequest
(
EngineCoreRequest
(
request_id
=
request_id_list
[
idx
],
request_id
=
request_id_list
[
idx
]
+
"-int"
,
external_req_id
=
request_id_list
[
idx
],
prompt_token_ids
=
prompt_tokens
,
prompt_token_ids
=
prompt_tokens
,
mm_features
=
None
,
mm_features
=
None
,
eos_token_id
=
None
,
eos_token_id
=
None
,
...
@@ -477,6 +506,17 @@ def test_logprobs_processor(
...
@@ -477,6 +506,17 @@ def test_logprobs_processor(
for
idx
,
prompt_tokens
in
enumerate
(
dummy_test_vectors
.
prompt_tokens
)
for
idx
,
prompt_tokens
in
enumerate
(
dummy_test_vectors
.
prompt_tokens
)
]
]
engine_core
=
MockEngineCore
(
tokens_list
=
dummy_test_vectors
.
generation_tokens
,
generated_logprobs_raw
=
None
if
num_sample_logprobs
is
None
else
dummy_test_vectors
.
generation_logprobs
,
prompt_logprobs_raw
=
None
if
num_prompt_logprobs
is
None
else
dummy_test_vectors
.
prompt_logprobs
,
request_ids
=
[
req
.
request_id
for
req
in
requests
],
)
# Add requests to the detokenizer.
# Add requests to the detokenizer.
for
request
,
prompt
in
zip
(
requests
,
dummy_test_vectors
.
prompt_strings
):
for
request
,
prompt
in
zip
(
requests
,
dummy_test_vectors
.
prompt_strings
):
output_processor
.
add_request
(
request
,
prompt
)
output_processor
.
add_request
(
request
,
prompt
)
...
@@ -622,19 +662,12 @@ def test_stop_token(
...
@@ -622,19 +662,12 @@ def test_stop_token(
]
]
prompt_string
=
dummy_test_vectors
.
prompt_strings
[
0
]
prompt_string
=
dummy_test_vectors
.
prompt_strings
[
0
]
prompt_tokens
=
dummy_test_vectors
.
prompt_tokens
[
0
]
prompt_tokens
=
dummy_test_vectors
.
prompt_tokens
[
0
]
engine_core
=
MockEngineCore
(
tokens_list
=
[
generation_tokens
],
generated_logprobs_raw
=
[
generation_logprobs
]
if
do_logprobs
else
None
,
prompt_logprobs_raw
=
None
,
eos_token_id
=
eos_token_id
,
stop_token_ids
=
stop_token_ids
,
ignore_eos
=
ignore_eos
,
)
# Make request.
# Make request.
request_id
=
"request-0"
request_id
=
"request-0"
request
=
EngineCoreRequest
(
request
=
EngineCoreRequest
(
request_id
=
request_id
,
request_id
=
request_id
,
external_req_id
=
request_id
+
"-ext"
,
prompt_token_ids
=
prompt_tokens
,
prompt_token_ids
=
prompt_tokens
,
mm_features
=
None
,
mm_features
=
None
,
eos_token_id
=
eos_token_id
,
eos_token_id
=
eos_token_id
,
...
@@ -656,6 +689,16 @@ def test_stop_token(
...
@@ -656,6 +689,16 @@ def test_stop_token(
pooling_params
=
None
,
pooling_params
=
None
,
)
)
engine_core
=
MockEngineCore
(
tokens_list
=
[
generation_tokens
],
generated_logprobs_raw
=
[
generation_logprobs
]
if
do_logprobs
else
None
,
prompt_logprobs_raw
=
None
,
eos_token_id
=
eos_token_id
,
stop_token_ids
=
stop_token_ids
,
ignore_eos
=
ignore_eos
,
request_ids
=
[
request
.
request_id
],
)
# Add request to the detokenizer.
# Add request to the detokenizer.
output_processor
.
add_request
(
request
,
prompt_string
)
output_processor
.
add_request
(
request
,
prompt_string
)
...
@@ -721,13 +764,6 @@ def test_stop_string(
...
@@ -721,13 +764,6 @@ def test_stop_string(
dummy_test_vectors
,
dummy_test_vectors
,
):
):
output_processor
=
OutputProcessor
(
dummy_test_vectors
.
tokenizer
,
log_stats
=
False
)
output_processor
=
OutputProcessor
(
dummy_test_vectors
.
tokenizer
,
log_stats
=
False
)
engine_core
=
MockEngineCore
(
tokens_list
=
dummy_test_vectors
.
generation_tokens
,
generated_logprobs_raw
=
dummy_test_vectors
.
generation_logprobs
if
num_sample_logprobs
else
None
,
prompt_logprobs_raw
=
None
,
)
# Make N requests.
# Make N requests.
request_id_list
=
[
request_id_list
=
[
...
@@ -735,7 +771,8 @@ def test_stop_string(
...
@@ -735,7 +771,8 @@ def test_stop_string(
]
]
requests
=
[
requests
=
[
EngineCoreRequest
(
EngineCoreRequest
(
request_id
=
request_id_list
[
idx
],
request_id
=
request_id_list
[
idx
]
+
"-int"
,
external_req_id
=
request_id_list
[
idx
],
prompt_token_ids
=
prompt_tokens
,
prompt_token_ids
=
prompt_tokens
,
mm_features
=
None
,
mm_features
=
None
,
eos_token_id
=
None
,
eos_token_id
=
None
,
...
@@ -757,6 +794,15 @@ def test_stop_string(
...
@@ -757,6 +794,15 @@ def test_stop_string(
for
idx
,
prompt_tokens
in
enumerate
(
dummy_test_vectors
.
prompt_tokens
)
for
idx
,
prompt_tokens
in
enumerate
(
dummy_test_vectors
.
prompt_tokens
)
]
]
engine_core
=
MockEngineCore
(
tokens_list
=
dummy_test_vectors
.
generation_tokens
,
generated_logprobs_raw
=
dummy_test_vectors
.
generation_logprobs
if
num_sample_logprobs
else
None
,
prompt_logprobs_raw
=
None
,
request_ids
=
[
req
.
request_id
for
req
in
requests
],
)
# Add requests to the detokenizer.
# Add requests to the detokenizer.
for
request
,
prompt
in
zip
(
requests
,
dummy_test_vectors
.
prompt_strings
):
for
request
,
prompt
in
zip
(
requests
,
dummy_test_vectors
.
prompt_strings
):
output_processor
.
add_request
(
request
,
prompt
)
output_processor
.
add_request
(
request
,
prompt
)
...
@@ -814,9 +860,12 @@ def test_stop_string(
...
@@ -814,9 +860,12 @@ def test_stop_string(
for
idx
,
(
ref_gen_str
,
stop_str
)
in
enumerate
(
for
idx
,
(
ref_gen_str
,
stop_str
)
in
enumerate
(
zip
(
dummy_test_vectors
.
generation_strings
,
STOP_STRINGS
)
zip
(
dummy_test_vectors
.
generation_strings
,
STOP_STRINGS
)
):
):
# Request should be aborted.
# Request should be aborted (check internal ID in abort list).
internal_request_id
=
f
"request-
{
idx
}
-int"
assert
internal_request_id
in
aborted
# Use external ID for collecting outputs
request_id
=
f
"request-
{
idx
}
"
request_id
=
f
"request-
{
idx
}
"
assert
request_id
in
aborted
# Collected values that were generated.
# Collected values that were generated.
gen_str
=
gen_strings
[
request_id
]
gen_str
=
gen_strings
[
request_id
]
...
@@ -849,13 +898,13 @@ def test_stop_string(
...
@@ -849,13 +898,13 @@ def test_stop_string(
def
test_iteration_stats
(
dummy_test_vectors
):
def
test_iteration_stats
(
dummy_test_vectors
):
output_processor
=
OutputProcessor
(
dummy_test_vectors
.
tokenizer
,
log_stats
=
True
)
output_processor
=
OutputProcessor
(
dummy_test_vectors
.
tokenizer
,
log_stats
=
True
)
engine_core
=
MockEngineCore
(
dummy_test_vectors
.
generation_tokens
)
engine_core_timestamp
=
time
.
monotonic
()
engine_core_timestamp
=
time
.
monotonic
()
# Make N requests.
# Make N requests.
requests
=
[
requests
=
[
EngineCoreRequest
(
EngineCoreRequest
(
request_id
=
f
"request-
{
idx
}
"
,
request_id
=
f
"request-
{
idx
}
"
,
external_req_id
=
f
"request-
{
idx
}
-ext"
,
prompt_token_ids
=
prompt_tokens
,
prompt_token_ids
=
prompt_tokens
,
mm_features
=
None
,
mm_features
=
None
,
eos_token_id
=
None
,
eos_token_id
=
None
,
...
@@ -869,6 +918,11 @@ def test_iteration_stats(dummy_test_vectors):
...
@@ -869,6 +918,11 @@ def test_iteration_stats(dummy_test_vectors):
for
idx
,
prompt_tokens
in
enumerate
(
dummy_test_vectors
.
prompt_tokens
)
for
idx
,
prompt_tokens
in
enumerate
(
dummy_test_vectors
.
prompt_tokens
)
]
]
engine_core
=
MockEngineCore
(
dummy_test_vectors
.
generation_tokens
,
request_ids
=
[
req
.
request_id
for
req
in
requests
],
)
# Add all requests except one to the OutputProcessor.
# Add all requests except one to the OutputProcessor.
num_active
=
len
(
dummy_test_vectors
.
generation_tokens
)
-
1
num_active
=
len
(
dummy_test_vectors
.
generation_tokens
)
-
1
for
request
in
requests
[:
num_active
]:
for
request
in
requests
[:
num_active
]:
...
@@ -923,7 +977,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
...
@@ -923,7 +977,6 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
output_processor
=
OutputProcessor
(
output_processor
=
OutputProcessor
(
dummy_test_vectors
.
tokenizer
,
log_stats
=
log_stats
dummy_test_vectors
.
tokenizer
,
log_stats
=
log_stats
)
)
engine_core
=
MockEngineCore
(
dummy_test_vectors
.
generation_tokens
)
engine_core_timestamp
=
time
.
monotonic
()
engine_core_timestamp
=
time
.
monotonic
()
# Create LoRA requests
# Create LoRA requests
...
@@ -937,7 +990,8 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
...
@@ -937,7 +990,8 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
lora_assignments
=
[
lora1
,
lora2
,
None
]
lora_assignments
=
[
lora1
,
lora2
,
None
]
requests
=
[
requests
=
[
EngineCoreRequest
(
EngineCoreRequest
(
request_id
=
f
"request-
{
idx
}
"
,
request_id
=
f
"request-
{
idx
}
-int"
,
external_req_id
=
f
"request-
{
idx
}
"
,
prompt_token_ids
=
prompt_tokens
,
prompt_token_ids
=
prompt_tokens
,
mm_features
=
None
,
mm_features
=
None
,
eos_token_id
=
None
,
eos_token_id
=
None
,
...
@@ -951,6 +1005,11 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
...
@@ -951,6 +1005,11 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
for
idx
,
prompt_tokens
in
enumerate
(
dummy_test_vectors
.
prompt_tokens
)
for
idx
,
prompt_tokens
in
enumerate
(
dummy_test_vectors
.
prompt_tokens
)
]
]
engine_core
=
MockEngineCore
(
dummy_test_vectors
.
generation_tokens
,
request_ids
=
[
req
.
request_id
for
req
in
requests
],
)
# Add all requests to the OutputProcessor
# Add all requests to the OutputProcessor
for
request
in
requests
:
for
request
in
requests
:
output_processor
.
add_request
(
request
,
None
)
output_processor
.
add_request
(
request
,
None
)
...
@@ -1016,9 +1075,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
...
@@ -1016,9 +1075,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs
=
EngineCoreOutputs
(
outputs
=
EngineCoreOutputs
(
outputs
=
engine_core
.
get_outputs
(),
scheduler_stats
=
SchedulerStats
()
outputs
=
engine_core
.
get_outputs
(),
scheduler_stats
=
SchedulerStats
()
)
)
# Find and mark request-0 as finished (it uses lora-1)
# Find and mark request-0
-int
as finished (it uses lora-1)
for
output
in
outputs
.
outputs
:
for
output
in
outputs
.
outputs
:
if
output
.
request_id
==
"request-0"
:
if
output
.
request_id
==
"request-0
-int
"
:
output
.
finish_reason
=
FinishReason
.
LENGTH
output
.
finish_reason
=
FinishReason
.
LENGTH
break
break
...
@@ -1041,9 +1100,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
...
@@ -1041,9 +1100,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs
=
EngineCoreOutputs
(
outputs
=
EngineCoreOutputs
(
outputs
=
engine_core
.
get_outputs
(),
scheduler_stats
=
SchedulerStats
()
outputs
=
engine_core
.
get_outputs
(),
scheduler_stats
=
SchedulerStats
()
)
)
# Find and mark request-1 as finished (it uses lora-2)
# Find and mark request-1
-int
as finished (it uses lora-2)
for
output
in
outputs
.
outputs
:
for
output
in
outputs
.
outputs
:
if
output
.
request_id
==
"request-1"
:
if
output
.
request_id
==
"request-1
-int
"
:
output
.
finish_reason
=
FinishReason
.
LENGTH
output
.
finish_reason
=
FinishReason
.
LENGTH
break
break
...
@@ -1065,9 +1124,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
...
@@ -1065,9 +1124,9 @@ def test_lora_request_tracking(log_stats: bool, dummy_test_vectors):
outputs
=
EngineCoreOutputs
(
outputs
=
EngineCoreOutputs
(
outputs
=
engine_core
.
get_outputs
(),
scheduler_stats
=
SchedulerStats
()
outputs
=
engine_core
.
get_outputs
(),
scheduler_stats
=
SchedulerStats
()
)
)
# Find and mark request-2 as finished (it has no LoRA)
# Find and mark request-2
-int
as finished (it has no LoRA)
for
output
in
outputs
.
outputs
:
for
output
in
outputs
.
outputs
:
if
output
.
request_id
==
"request-2"
:
if
output
.
request_id
==
"request-2
-int
"
:
output
.
finish_reason
=
FinishReason
.
LENGTH
output
.
finish_reason
=
FinishReason
.
LENGTH
break
break
...
@@ -1108,7 +1167,9 @@ async def test_request_output_collector():
...
@@ -1108,7 +1167,9 @@ async def test_request_output_collector():
for
idx
in
range
(
NUM_REQS
)
for
idx
in
range
(
NUM_REQS
)
]
]
collector
=
RequestOutputCollector
(
RequestOutputKind
.
DELTA
)
collector
=
RequestOutputCollector
(
RequestOutputKind
.
DELTA
,
request_id
=
"my-request-id-int"
)
# CASE 1: Put then get.
# CASE 1: Put then get.
outputs
=
make_outputs
()
outputs
=
make_outputs
()
...
@@ -1164,7 +1225,9 @@ async def test_request_output_collector():
...
@@ -1164,7 +1225,9 @@ async def test_request_output_collector():
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_cumulative_output_collector_n
():
async
def
test_cumulative_output_collector_n
():
"""Test collector correctly handles multiple outputs by index."""
"""Test collector correctly handles multiple outputs by index."""
collector
=
RequestOutputCollector
(
RequestOutputKind
.
CUMULATIVE
)
collector
=
RequestOutputCollector
(
RequestOutputKind
.
CUMULATIVE
,
request_id
=
"my-request-id-int"
)
outputs
=
[
outputs
=
[
RequestOutput
(
RequestOutput
(
request_id
=
"my-request-id"
,
request_id
=
"my-request-id"
,
...
@@ -1243,11 +1306,13 @@ async def test_cumulative_output_collector_n():
...
@@ -1243,11 +1306,13 @@ async def test_cumulative_output_collector_n():
@
pytest
.
mark
.
parametrize
(
"runner"
,
[
"generate"
,
"pooling"
])
@
pytest
.
mark
.
parametrize
(
"runner"
,
[
"generate"
,
"pooling"
])
def
test_abort_requests
(
runner
:
str
,
dummy_test_vectors
):
@
pytest
.
mark
.
parametrize
(
"abort_by"
,
[
"internal"
,
"external"
])
def
test_abort_requests
(
runner
:
str
,
abort_by
:
str
,
dummy_test_vectors
):
output_processor
=
OutputProcessor
(
dummy_test_vectors
.
tokenizer
,
log_stats
=
True
)
output_processor
=
OutputProcessor
(
dummy_test_vectors
.
tokenizer
,
log_stats
=
True
)
requests
=
[
requests
=
[
EngineCoreRequest
(
EngineCoreRequest
(
request_id
=
f
"request-
{
idx
}
"
,
request_id
=
f
"request-
{
idx
}
"
,
external_req_id
=
f
"external-
{
idx
}
"
,
prompt_token_ids
=
prompt_tokens
,
prompt_token_ids
=
prompt_tokens
,
mm_features
=
None
,
mm_features
=
None
,
eos_token_id
=
None
,
eos_token_id
=
None
,
...
@@ -1266,8 +1331,13 @@ def test_abort_requests(runner: str, dummy_test_vectors):
...
@@ -1266,8 +1331,13 @@ def test_abort_requests(runner: str, dummy_test_vectors):
output_kind
=
request
.
sampling_params
.
output_kind
output_kind
=
request
.
sampling_params
.
output_kind
else
:
else
:
output_kind
=
request
.
pooling_params
.
output_kind
output_kind
=
request
.
pooling_params
.
output_kind
queue
=
RequestOutputCollector
(
output_kind
=
output_kind
)
queue
=
RequestOutputCollector
(
output_kind
=
output_kind
,
request_id
=
request
.
request_id
)
output_processor
.
add_request
(
request
,
None
,
queue
=
queue
)
output_processor
.
add_request
(
request
,
None
,
queue
=
queue
)
for
request
in
requests
:
for
request
in
requests
:
output_processor
.
abort_requests
([
request
.
request_id
])
if
abort_by
==
"internal"
:
output_processor
.
abort_requests
([
request
.
request_id
],
internal
=
True
)
else
:
output_processor
.
abort_requests
([
request
.
external_req_id
],
internal
=
False
)
tests/v1/engine/test_parallel_sampling.py
View file @
7e63ef82
...
@@ -4,11 +4,12 @@
...
@@ -4,11 +4,12 @@
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
vllm.outputs
import
CompletionOutput
from
vllm.outputs
import
CompletionOutput
from
vllm.sampling_params
import
RequestOutputKind
from
vllm.sampling_params
import
RequestOutputKind
from
vllm.v1.engine
import
EngineCoreRequest
from
vllm.v1.engine.parallel_sampling
import
ParentRequest
from
vllm.v1.engine.parallel_sampling
import
ParentRequest
def
test_parent_request_to_output_stream
()
->
None
:
def
test_parent_request_to_output_stream
()
->
None
:
parent_request
=
ParentRequest
(
"parent_id"
,
SamplingParams
(
n
=
2
))
parent_request
=
ParentRequest
(
make_request
(
SamplingParams
(
n
=
2
))
)
parent_request
.
child_requests
=
{
"child_id_0"
,
"child_id_1"
}
parent_request
.
child_requests
=
{
"child_id_0"
,
"child_id_1"
}
output_0
=
CompletionOutput
(
output_0
=
CompletionOutput
(
index
=
0
,
text
=
"child 0"
,
token_ids
=
[],
cumulative_logprob
=
None
,
logprobs
=
None
index
=
0
,
text
=
"child 0"
,
token_ids
=
[],
cumulative_logprob
=
None
,
logprobs
=
None
...
@@ -17,51 +18,31 @@ def test_parent_request_to_output_stream() -> None:
...
@@ -17,51 +18,31 @@ def test_parent_request_to_output_stream() -> None:
index
=
1
,
text
=
"child 1"
,
token_ids
=
[],
cumulative_logprob
=
None
,
logprobs
=
None
index
=
1
,
text
=
"child 1"
,
token_ids
=
[],
cumulative_logprob
=
None
,
logprobs
=
None
)
)
# Request not finished
# Request not finished
assert
(
"parent_id"
,
[
output_0
],
False
)
==
parent_request
.
get_outputs
(
assert
([
output_0
],
False
)
==
parent_request
.
get_outputs
(
"child_id_0"
,
output_0
)
"child_id_0"
,
output_0
assert
([
output_1
],
False
)
==
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
)
assert
([
output_0
],
False
)
==
parent_request
.
get_outputs
(
"child_id_0"
,
output_0
)
assert
(
"parent_id"
,
[
output_1
],
False
)
==
parent_request
.
get_outputs
(
assert
([
output_1
],
False
)
==
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
"child_id_1"
,
output_1
)
assert
(
"parent_id"
,
[
output_0
],
False
)
==
parent_request
.
get_outputs
(
"child_id_0"
,
output_0
)
assert
(
"parent_id"
,
[
output_1
],
False
)
==
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
# output_1 finished
# output_1 finished
output_1
.
finish_reason
=
"ended"
output_1
.
finish_reason
=
"ended"
assert
(
"parent_id"
,
[
output_0
],
False
)
==
parent_request
.
get_outputs
(
assert
([
output_0
],
False
)
==
parent_request
.
get_outputs
(
"child_id_0"
,
output_0
)
"child_id_0"
,
output_0
assert
([
output_1
],
False
)
==
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
)
assert
(
"parent_id"
,
[
output_1
],
False
)
==
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
# Finished output_1 has already been returned; it must NOT be returned again
# Finished output_1 has already been returned; it must NOT be returned again
assert
(
"parent_id"
,
[
output_0
],
False
)
==
parent_request
.
get_outputs
(
assert
([
output_0
],
False
)
==
parent_request
.
get_outputs
(
"child_id_0"
,
output_0
)
"child_id_0"
,
output_0
assert
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
==
([],
False
)
)
assert
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
==
(
"parent_id"
,
[],
False
,
)
# output_0 finished
# output_0 finished
output_0
.
finish_reason
=
"ended"
output_0
.
finish_reason
=
"ended"
assert
(
"parent_id"
,
[
output_0
],
True
)
==
parent_request
.
get_outputs
(
assert
([
output_0
],
True
)
==
parent_request
.
get_outputs
(
"child_id_0"
,
output_0
)
"child_id_0"
,
output_0
assert
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
==
([],
True
)
)
assert
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
==
(
"parent_id"
,
[],
True
)
# Finished output_0 has already been returned; it must NOT be returned again
# Finished output_0 has already been returned; it must NOT be returned again
assert
parent_request
.
get_outputs
(
"child_id_0"
,
output_0
)
==
(
"parent_id"
,
[],
True
)
assert
parent_request
.
get_outputs
(
"child_id_0"
,
output_0
)
==
([],
True
)
assert
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
==
(
"parent_id"
,
[],
True
)
assert
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
==
([],
True
)
def
test_parent_request_to_output_final_only
()
->
None
:
def
test_parent_request_to_output_final_only
()
->
None
:
parent_request
=
ParentRequest
(
parent_request
=
ParentRequest
(
"parent_id"
,
SamplingParams
(
n
=
2
,
output_kind
=
RequestOutputKind
.
FINAL_ONLY
)
make_request
(
SamplingParams
(
n
=
2
,
output_kind
=
RequestOutputKind
.
FINAL_ONLY
)
)
)
)
parent_request
.
child_requests
=
{
"child_id_0"
,
"child_id_1"
}
parent_request
.
child_requests
=
{
"child_id_0"
,
"child_id_1"
}
output_0
=
CompletionOutput
(
output_0
=
CompletionOutput
(
...
@@ -71,33 +52,33 @@ def test_parent_request_to_output_final_only() -> None:
...
@@ -71,33 +52,33 @@ def test_parent_request_to_output_final_only() -> None:
index
=
1
,
text
=
"child 1"
,
token_ids
=
[],
cumulative_logprob
=
None
,
logprobs
=
None
index
=
1
,
text
=
"child 1"
,
token_ids
=
[],
cumulative_logprob
=
None
,
logprobs
=
None
)
)
# Request not finished, return nothing
# Request not finished, return nothing
assert
parent_request
.
get_outputs
(
"child_id_0"
,
output_0
)
==
(
assert
parent_request
.
get_outputs
(
"child_id_0"
,
output_0
)
==
([],
False
)
"parent_id"
,
assert
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
==
([],
False
)
[],
False
,
)
assert
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
==
(
"parent_id"
,
[],
False
,
)
# output_1 finished, but outputs won't be returned until all child requests finished
# output_1 finished, but outputs won't be returned until all child requests finished
output_1
.
finish_reason
=
"ended"
output_1
.
finish_reason
=
"ended"
assert
parent_request
.
get_outputs
(
"child_id_0"
,
output_0
)
==
(
assert
parent_request
.
get_outputs
(
"child_id_0"
,
output_0
)
==
([],
False
)
"parent_id"
,
assert
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
==
([],
False
)
[],
False
,
)
assert
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
)
==
(
"parent_id"
,
[],
False
,
)
# output_0 finished, as all child requests finished, the output would be returned
# output_0 finished, as all child requests finished, the output would be returned
output_0
.
finish_reason
=
"ended"
output_0
.
finish_reason
=
"ended"
assert
(
"parent_id"
,
[
output_0
,
output_1
],
True
)
==
parent_request
.
get_outputs
(
assert
([
output_0
,
output_1
],
True
)
==
parent_request
.
get_outputs
(
"child_id_0"
,
output_0
"child_id_0"
,
output_0
)
)
assert
(
"parent_id"
,
[
output_0
,
output_1
],
True
)
==
parent_request
.
get_outputs
(
assert
([
output_0
,
output_1
],
True
)
==
parent_request
.
get_outputs
(
"child_id_1"
,
output_1
"child_id_1"
,
output_1
)
)
def make_request(sampling_params: SamplingParams) -> EngineCoreRequest:
    """Build the fixed ``EngineCoreRequest`` used to construct a ParentRequest.

    The request always carries the internal id ``"parent_id"`` and the
    external id ``"ext_parent_id"``; only the sampling parameters vary
    between callers. Every other field is left unset (``None``), except the
    arrival time, which is pinned to ``0.0``.

    Args:
        sampling_params: Sampling parameters to attach to the request.

    Returns:
        The newly constructed request.
    """
    parent = EngineCoreRequest(
        # Identity of the request.
        request_id="parent_id",
        external_req_id="ext_parent_id",
        # The one caller-controlled field.
        sampling_params=sampling_params,
        pooling_params=None,
        # Everything below is deliberately left at a neutral value.
        prompt_token_ids=None,
        mm_features=None,
        eos_token_id=None,
        arrival_time=0.0,
        lora_request=None,
        cache_salt=None,
        data_parallel_rank=None,
    )
    return parent
tests/v1/engine/test_preprocess_error_handling.py
0 → 100644
View file @
7e63ef82
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch.cuda
from
vllm
import
LLM
,
SamplingParams
from
vllm.platforms
import
current_platform
from
vllm.v1.engine
import
EngineCoreRequest
from
vllm.v1.engine.core
import
EngineCore
MODEL_NAME
=
"hmellor/tiny-random-LlamaForCausalLM"
def test_preprocess_error_handling(monkeypatch: pytest.MonkeyPatch):
    """Check that a preprocessing failure does not take the engine down.

    ``EngineCore.preprocess_add_request`` is patched so that it raises a
    ``ValueError`` for any request whose first prompt token id is 333 and
    defers to the real implementation otherwise. The test then asserts that:

    * the poisoned request comes back as a single finished output with no
      generated tokens and a finish reason of ``"error"``;
    * a following ordinary text prompt still produces tokens and finishes
      with ``"stop"`` or ``"length"``.
    """
    on_spawn_only_platform = current_platform.is_rocm() or current_platform.is_xpu()
    if on_spawn_only_platform:
        pytest.skip(
            "Skipped on ROCm/XPU: this test only works with 'fork', "
            "but ROCm/XPU uses 'spawn'."
        )

    cuda_ready = torch.cuda.is_initialized()
    assert not cuda_ready, (
        "fork needs to be used for the engine "
        "core process and this isn't possible if cuda is already initialized"
    )

    # Keep a handle on the real method so healthy requests still go through it.
    real_preprocess = EngineCore.preprocess_add_request

    def preprocess_failing_on_marker(self, request: EngineCoreRequest):
        # A first prompt token id of 333 is the marker for "make this one fail".
        token_ids = request.prompt_token_ids
        if token_ids and token_ids[0] == 333:
            raise ValueError("Simulated preprocessing error!")
        return real_preprocess(self, request)

    monkeypatch.setattr(
        EngineCore, "preprocess_add_request", preprocess_failing_on_marker
    )

    llm = LLM(model=MODEL_NAME)

    # LLM.generate tokenizes text prompts itself, so the marker token id is
    # supplied directly through a pre-tokenized prompt.
    from vllm.inputs import TokensPrompt

    poisoned_prompt = TokensPrompt(prompt_token_ids=[333])
    sampling = SamplingParams(max_tokens=10)
    failed = llm.generate(poisoned_prompt, sampling)  # type: ignore

    # The failure is reported as an error-finished, empty completion.
    assert len(failed) == 1
    failed_completion = failed[0].outputs[0]
    assert len(failed_completion.token_ids) == 0
    assert failed[0].finished
    assert failed_completion.finish_reason == "error"

    # The engine must keep serving normal requests afterwards.
    healthy = llm.generate("Hello, my name is", SamplingParams(max_tokens=10))
    assert len(healthy) == 1
    healthy_completion = healthy[0].outputs[0]
    assert len(healthy_completion.token_ids) > 0
    assert healthy_completion.finish_reason in ("stop", "length")
tests/v1/engine/test_process_multi_modal_uuids.py
View file @
7e63ef82
...
@@ -6,6 +6,7 @@ import pytest
...
@@ -6,6 +6,7 @@ import pytest
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.image
import
ImageAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.assets.video
import
VideoAsset
from
vllm.config
import
CacheConfig
,
DeviceConfig
,
ModelConfig
,
VllmConfig
from
vllm.config
import
CacheConfig
,
DeviceConfig
,
ModelConfig
,
VllmConfig
from
vllm.multimodal
import
MultiModalUUIDDict
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.engine
import
input_processor
as
input_processor_mod
from
vllm.v1.engine
import
input_processor
as
input_processor_mod
from
vllm.v1.engine.input_processor
import
InputProcessor
from
vllm.v1.engine.input_processor
import
InputProcessor
...
@@ -166,7 +167,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
...
@@ -166,7 +167,7 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
monkeypatch
,
mm_cache_gb
=
0.0
,
enable_prefix_caching
=
False
monkeypatch
,
mm_cache_gb
=
0.0
,
enable_prefix_caching
=
False
)
)
captured
:
dict
[
str
,
obje
ct
]
=
{}
captured
:
dict
[
str
,
MultiModalUUIDDi
ct
]
=
{}
def
fake_preprocess
(
def
fake_preprocess
(
prompt
,
*
,
tokenization_kwargs
=
None
,
lora_request
=
None
,
mm_uuids
=
None
prompt
,
*
,
tokenization_kwargs
=
None
,
lora_request
=
None
,
mm_uuids
=
None
...
@@ -196,7 +197,16 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
...
@@ -196,7 +197,16 @@ def test_multi_modal_uuids_ignored_when_caching_disabled(monkeypatch):
)
)
# Expect request-id-based overrides are passed through
# Expect request-id-based overrides are passed through
assert
captured
[
"mm_uuids"
]
==
{
mm_uuids
=
captured
[
"mm_uuids"
]
"image"
:
[
f
"
{
request_id
}
-image-0"
,
f
"
{
request_id
}
-image-1"
],
assert
set
(
mm_uuids
.
keys
())
==
{
"image"
,
"video"
}
"video"
:
[
f
"
{
request_id
}
-video-0"
],
assert
len
(
mm_uuids
[
"image"
])
==
2
}
assert
len
(
mm_uuids
[
"video"
])
==
1
assert
mm_uuids
[
"image"
][
0
].
startswith
(
f
"
{
request_id
}
-image-"
)
and
mm_uuids
[
"image"
][
0
].
endswith
(
"-0"
)
assert
mm_uuids
[
"image"
][
1
].
startswith
(
f
"
{
request_id
}
-image-"
)
and
mm_uuids
[
"image"
][
1
].
endswith
(
"-1"
)
assert
mm_uuids
[
"video"
][
0
].
startswith
(
f
"
{
request_id
}
-video-"
)
and
mm_uuids
[
"video"
][
0
].
endswith
(
"-0"
)
tests/v1/engine/utils.py
View file @
7e63ef82
...
@@ -343,6 +343,7 @@ class MockEngineCore:
...
@@ -343,6 +343,7 @@ class MockEngineCore:
eos_token_id
:
int
|
None
=
None
,
eos_token_id
:
int
|
None
=
None
,
stop_token_ids
:
list
[
int
]
|
None
=
None
,
stop_token_ids
:
list
[
int
]
|
None
=
None
,
ignore_eos
:
bool
=
False
,
ignore_eos
:
bool
=
False
,
request_ids
:
list
[
str
]
|
None
=
None
,
)
->
None
:
)
->
None
:
self
.
num_requests
=
len
(
tokens_list
)
self
.
num_requests
=
len
(
tokens_list
)
self
.
tokens_list
=
tokens_list
self
.
tokens_list
=
tokens_list
...
@@ -355,6 +356,11 @@ class MockEngineCore:
...
@@ -355,6 +356,11 @@ class MockEngineCore:
self
.
eos_token_id
=
eos_token_id
self
.
eos_token_id
=
eos_token_id
self
.
stop_token_ids
=
stop_token_ids
self
.
stop_token_ids
=
stop_token_ids
self
.
ignore_eos
=
ignore_eos
self
.
ignore_eos
=
ignore_eos
self
.
request_ids
=
(
request_ids
if
request_ids
is
not
None
else
[
f
"request-
{
i
}
"
for
i
in
range
(
self
.
num_requests
)]
)
def
get_outputs
(
self
)
->
list
[
EngineCoreOutput
]:
def
get_outputs
(
self
)
->
list
[
EngineCoreOutput
]:
do_logprobs
=
self
.
do_logprobs
do_logprobs
=
self
.
do_logprobs
...
@@ -386,7 +392,7 @@ class MockEngineCore:
...
@@ -386,7 +392,7 @@ class MockEngineCore:
prompt_logprobs
=
None
prompt_logprobs
=
None
new_token_id
=
token_ids
[
token_idx
]
new_token_id
=
token_ids
[
token_idx
]
output
=
EngineCoreOutput
(
output
=
EngineCoreOutput
(
request_id
=
f
"
request
-
{
req_idx
}
"
,
request_id
=
self
.
request
_ids
[
req_idx
]
,
new_token_ids
=
[
new_token_id
],
new_token_ids
=
[
new_token_id
],
new_logprobs
=
logprobs
,
new_logprobs
=
logprobs
,
new_prompt_logprobs_tensors
=
prompt_logprobs
,
new_prompt_logprobs_tensors
=
prompt_logprobs
,
...
...
tests/v1/entrypoints/llm/test_struct_output_generate.py
View file @
7e63ef82
...
@@ -610,7 +610,7 @@ Make the response as short as possible.
...
@@ -610,7 +610,7 @@ Make the response as short as possible.
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model_name, backend, tokenizer_mode, reasoning_parser, speculative_config"
,
# noqa: E501
"model_name, backend, tokenizer_mode, reasoning_parser, speculative_config
, async_scheduling
"
,
# noqa: E501
[
[
(
(
"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
,
"deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
,
...
@@ -618,8 +618,10 @@ Make the response as short as possible.
...
@@ -618,8 +618,10 @@ Make the response as short as possible.
"auto"
,
"auto"
,
"deepseek_r1"
,
"deepseek_r1"
,
NGRAM_SPEC_CONFIG
,
NGRAM_SPEC_CONFIG
,
False
,
),
),
(
"Qwen/Qwen3-1.7B"
,
"xgrammar"
,
"auto"
,
"deepseek_r1"
,
None
),
(
"Qwen/Qwen3-1.7B"
,
"xgrammar"
,
"auto"
,
"deepseek_r1"
,
None
,
False
),
(
"Qwen/Qwen3-1.7B"
,
"xgrammar"
,
"auto"
,
"deepseek_r1"
,
None
,
True
),
],
],
)
)
def
test_structured_output_with_reasoning_matrices
(
def
test_structured_output_with_reasoning_matrices
(
...
@@ -628,6 +630,7 @@ def test_structured_output_with_reasoning_matrices(
...
@@ -628,6 +630,7 @@ def test_structured_output_with_reasoning_matrices(
reasoning_parser
:
str
,
reasoning_parser
:
str
,
model_name
:
str
,
model_name
:
str
,
speculative_config
:
dict
[
str
,
Any
]
|
None
,
speculative_config
:
dict
[
str
,
Any
]
|
None
,
async_scheduling
:
bool
,
):
):
if
current_platform
.
is_tpu
()
and
speculative_config
:
if
current_platform
.
is_tpu
()
and
speculative_config
:
pytest
.
skip
(
"TPU does not support speculative decoding"
)
pytest
.
skip
(
"TPU does not support speculative decoding"
)
...
@@ -648,6 +651,7 @@ def test_structured_output_with_reasoning_matrices(
...
@@ -648,6 +651,7 @@ def test_structured_output_with_reasoning_matrices(
),
),
tokenizer_mode
=
tokenizer_mode
,
tokenizer_mode
=
tokenizer_mode
,
speculative_config
=
speculative_config
,
speculative_config
=
speculative_config
,
async_scheduling
=
async_scheduling
,
)
)
tokenizer
=
llm
.
get_tokenizer
()
tokenizer
=
llm
.
get_tokenizer
()
reasoner
=
ReasoningParserManager
.
get_reasoning_parser
(
reasoning_parser
)(
reasoner
=
ReasoningParserManager
.
get_reasoning_parser
(
reasoning_parser
)(
...
...
tests/v1/entrypoints/openai/serving_responses/test_image.py
View file @
7e63ef82
...
@@ -8,7 +8,7 @@ import pytest
...
@@ -8,7 +8,7 @@ import pytest
import
pytest_asyncio
import
pytest_asyncio
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
from
vllm.multimodal.utils
import
encode_image_
base64
from
vllm.multimodal.utils
import
encode_image_
url
# Use a small vision model for testing
# Use a small vision model for testing
MODEL_NAME
=
"Qwen/Qwen2.5-VL-3B-Instruct"
MODEL_NAME
=
"Qwen/Qwen2.5-VL-3B-Instruct"
...
@@ -52,9 +52,9 @@ async def client(image_server):
...
@@ -52,9 +52,9 @@ async def client(image_server):
@
pytest
.
fixture
(
scope
=
"session"
)
@
pytest
.
fixture
(
scope
=
"session"
)
def
base64
_encoded_image
(
local_asset_server
)
->
dict
[
str
,
str
]:
def
url
_encoded_image
(
local_asset_server
)
->
dict
[
str
,
str
]:
return
{
return
{
image_url
:
encode_image_
base64
(
local_asset_server
.
get_image_asset
(
image_url
))
image_url
:
encode_image_
url
(
local_asset_server
.
get_image_asset
(
image_url
))
for
image_url
in
TEST_IMAGE_ASSETS
for
image_url
in
TEST_IMAGE_ASSETS
}
}
...
@@ -95,7 +95,7 @@ async def test_single_chat_session_image_base64encoded(
...
@@ -95,7 +95,7 @@ async def test_single_chat_session_image_base64encoded(
client
:
openai
.
AsyncOpenAI
,
client
:
openai
.
AsyncOpenAI
,
model_name
:
str
,
model_name
:
str
,
raw_image_url
:
str
,
raw_image_url
:
str
,
base64
_encoded_image
:
dict
[
str
,
str
],
url
_encoded_image
:
dict
[
str
,
str
],
):
):
content_text
=
"What's in this image?"
content_text
=
"What's in this image?"
messages
=
[
messages
=
[
...
@@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
...
@@ -104,7 +104,7 @@ async def test_single_chat_session_image_base64encoded(
"content"
:
[
"content"
:
[
{
{
"type"
:
"input_image"
,
"type"
:
"input_image"
,
"image_url"
:
f
"data:image/jpeg;base64,
{
base64
_encoded_image
[
raw_image_url
]
}
"
,
# noqa: E501
"image_url"
:
url
_encoded_image
[
raw_image_url
]
,
"detail"
:
"auto"
,
"detail"
:
"auto"
,
},
},
{
"type"
:
"input_text"
,
"text"
:
content_text
},
{
"type"
:
"input_text"
,
"text"
:
content_text
},
...
...
tests/v1/kv_connector/nixl_integration/
tp_
config_sweep_accuracy_test.sh
→
tests/v1/kv_connector/nixl_integration/config_sweep_accuracy_test.sh
View file @
7e63ef82
...
@@ -5,24 +5,37 @@ set -euo pipefail
...
@@ -5,24 +5,37 @@ set -euo pipefail
SCRIPT
=
"v1/kv_connector/nixl_integration/run_accuracy_test.sh"
SCRIPT
=
"v1/kv_connector/nixl_integration/run_accuracy_test.sh"
# Define test configurations
# Define test configurations
configs
=(
tp_
configs
=(
"GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2"
"GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2"
"GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2"
"GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2"
"GPU_MEMORY_UTILIZATION=0.6 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=1"
"GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
# MLA case
"GPU_MEMORY_UTILIZATION=0.8 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
# MLA case
"GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
"GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
"
DP_EP=1
GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=
1
DECODER_TP_SIZE=
2
MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
# MLA+P-TP1, D-DPEP=2 (TP=1)
"GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=
2
DECODER_TP_SIZE=
1
MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
)
)
dp_ep_configs
=(
"DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=1 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
# MLA+P-TP1, D-DPEP=2 (TP=1)
"DP_EP=1 GPU_MEMORY_UTILIZATION=0.8 PREFILLER_TP_SIZE=2 DECODER_TP_SIZE=2 MODEL_NAMES=deepseek-ai/deepseek-vl2-tiny"
# MLA+P-TP2, D-DPEP=2 (TP=1)
)
# Select config array based on DP_EP env var
if
[[
-n
"
${
DP_EP
:-}
"
]]
;
then
configs
=(
"
${
dp_ep_configs
[@]
}
"
)
echo
"DP_EP is set, using dp_ep_configs"
else
configs
=(
"
${
tp_configs
[@]
}
"
)
fi
run_tests
()
{
run_tests
()
{
local
label
=
$1
local
label
=
$1
local
extra_
env
=
$2
local
extra_
args
=
$2
echo
"=== Running tests (
${
label
}
) ==="
echo
"=== Running tests (
${
label
}
) ==="
for
cfg
in
"
${
configs
[@]
}
"
;
do
for
cfg
in
"
${
configs
[@]
}
"
;
do
echo
"-> Running with
${
cfg
}
${
extra_
env
:+and
${
extra_
env
}}
"
echo
"-> Running with
${
cfg
}
${
extra_
args
:+and
${
extra_
args
}}
"
# Use 'env' to safely set variables without eval
# Use 'env' to safely set variables without eval
if
!
env
${
extra_env
}
${
cfg
}
bash
"
${
SCRIPT
}
"
;
then
if
!
env
${
cfg
}
bash
"
${
SCRIPT
}
"
${
extra_args
}
;
then
echo
"❌ Test failed for config:
${
cfg
}
${
extra_
env
:+
(
${
extra_
env
}
)
}
"
echo
"❌ Test failed for config:
${
cfg
}
${
extra_
args
:+
(
${
extra_
args
}
)
}
"
exit
1
exit
1
fi
fi
done
done
...
@@ -34,8 +47,8 @@ run_tests "default backend" ""
...
@@ -34,8 +47,8 @@ run_tests "default backend" ""
# Check if FLASHINFER is set (non-empty)
# Check if FLASHINFER is set (non-empty)
if
[[
-n
"
${
FLASHINFER
:-}
"
]]
;
then
if
[[
-n
"
${
FLASHINFER
:-}
"
]]
;
then
echo
"FLASHINFER is set, rerunning with
VLLM_ATTENTION_BACKEND=
FLASHINFER"
echo
"FLASHINFER is set, rerunning with
--attention-backend
FLASHINFER"
run_tests
"FLASHINFER backend"
"
VLLM_ATTENTION_BACKEND=
FLASHINFER"
run_tests
"FLASHINFER backend"
"
--attention-backend
FLASHINFER"
else
else
echo
"FLASHINFER not set, skipping FLASHINFER runs."
echo
"FLASHINFER not set, skipping FLASHINFER runs."
fi
fi
tests/v1/kv_connector/nixl_integration/run_accuracy_test.sh
View file @
7e63ef82
...
@@ -3,21 +3,29 @@ set -xe
...
@@ -3,21 +3,29 @@ set -xe
# Parse command line arguments
# Parse command line arguments
KV_BUFFER_DEVICE
=
"cuda"
# Default to cuda
KV_BUFFER_DEVICE
=
"cuda"
# Default to cuda
ATTENTION_BACKEND
=
""
# Default to empty (use vllm default)
while
[[
$#
-gt
0
]]
;
do
while
[[
$#
-gt
0
]]
;
do
case
$1
in
case
$1
in
--kv_buffer_device
)
--kv_buffer_device
)
KV_BUFFER_DEVICE
=
"
$2
"
KV_BUFFER_DEVICE
=
"
$2
"
shift
2
shift
2
;;
;;
--attention-backend
)
ATTENTION_BACKEND
=
"
$2
"
shift
2
;;
*
)
*
)
echo
"Unknown option
$1
"
echo
"Unknown option
$1
"
echo
"Usage:
$0
[--kv_buffer_device <cuda|cpu>]"
echo
"Usage:
$0
[--kv_buffer_device <cuda|cpu>]
[--attention-backend <backend>]
"
exit
1
exit
1
;;
;;
esac
esac
done
done
echo
"Running accuracy tests with kv_buffer_device=
$KV_BUFFER_DEVICE
"
echo
"Running accuracy tests with kv_buffer_device=
$KV_BUFFER_DEVICE
"
if
[[
-n
"
$ATTENTION_BACKEND
"
]]
;
then
echo
"Using attention backend:
$ATTENTION_BACKEND
"
fi
DECODER_KV_LAYOUT
=
${
DECODER_KV_LAYOUT
:-
"HND"
}
# Default to HND, optional NHD
DECODER_KV_LAYOUT
=
${
DECODER_KV_LAYOUT
:-
"HND"
}
# Default to HND, optional NHD
if
[[
"
$DECODER_KV_LAYOUT
"
==
"NHD"
]]
;
then
if
[[
"
$DECODER_KV_LAYOUT
"
==
"NHD"
]]
;
then
...
@@ -148,6 +156,11 @@ run_tests_for_model() {
...
@@ -148,6 +156,11 @@ run_tests_for_model() {
--tensor-parallel-size
$PREFILLER_TP_SIZE
\
--tensor-parallel-size
$PREFILLER_TP_SIZE
\
--kv-transfer-config '
$KV_CONFIG
'"
--kv-transfer-config '
$KV_CONFIG
'"
# Add attention backend config if specified
if
[[
-n
"
$ATTENTION_BACKEND
"
]]
;
then
BASE_CMD
=
"
${
BASE_CMD
}
--attention-backend=
$ATTENTION_BACKEND
"
fi
if
[
-n
"
$model_args
"
]
;
then
if
[
-n
"
$model_args
"
]
;
then
FULL_CMD
=
"
$BASE_CMD
$model_args
"
FULL_CMD
=
"
$BASE_CMD
$model_args
"
else
else
...
@@ -188,7 +201,12 @@ run_tests_for_model() {
...
@@ -188,7 +201,12 @@ run_tests_for_model() {
--block-size
${
DECODE_BLOCK_SIZE
}
\
--block-size
${
DECODE_BLOCK_SIZE
}
\
--gpu-memory-utilization
$GPU_MEMORY_UTILIZATION
\
--gpu-memory-utilization
$GPU_MEMORY_UTILIZATION
\
--kv-transfer-config '
$KV_CONFIG
'"
--kv-transfer-config '
$KV_CONFIG
'"
# Add attention backend config if specified
if
[[
-n
"
$ATTENTION_BACKEND
"
]]
;
then
BASE_CMD
=
"
${
BASE_CMD
}
--attention-backend=
$ATTENTION_BACKEND
"
fi
# DP-EP attention mode
# DP-EP attention mode
if
[[
-z
"
$DP_EP
"
]]
;
then
if
[[
-z
"
$DP_EP
"
]]
;
then
BASE_CMD
=
"
${
BASE_CMD
}
--tensor-parallel-size
$DECODER_TP_SIZE
"
BASE_CMD
=
"
${
BASE_CMD
}
--tensor-parallel-size
$DECODER_TP_SIZE
"
...
...
tests/v1/kv_connector/unit/test_backwards_compatibility.py
View file @
7e63ef82
...
@@ -14,12 +14,12 @@ from unittest.mock import patch
...
@@ -14,12 +14,12 @@ from unittest.mock import patch
import
pytest
import
pytest
from
vllm.attention.backends.abstract
import
AttentionMetadata
from
vllm.distributed.kv_transfer.kv_connector.factory
import
KVConnectorFactory
from
vllm.distributed.kv_transfer.kv_connector.factory
import
KVConnectorFactory
from
vllm.distributed.kv_transfer.kv_connector.v1
import
(
from
vllm.distributed.kv_transfer.kv_connector.v1
import
(
KVConnectorBase_V1
,
KVConnectorBase_V1
,
KVConnectorRole
,
KVConnectorRole
,
)
)
from
vllm.v1.attention.backend
import
AttentionMetadata
from
vllm.v1.core.sched.output
import
SchedulerOutput
from
vllm.v1.core.sched.output
import
SchedulerOutput
from
.utils
import
create_scheduler
,
create_vllm_config
from
.utils
import
create_scheduler
,
create_vllm_config
...
...
tests/v1/kv_connector/unit/test_config.py
View file @
7e63ef82
...
@@ -15,11 +15,12 @@ pytestmark = pytest.mark.cpu_test
...
@@ -15,11 +15,12 @@ pytestmark = pytest.mark.cpu_test
[
[
(
"native"
,
4.0
,
1
,
1
,
"OffloadingConnector"
,
4.0
*
(
1
<<
30
)),
(
"native"
,
4.0
,
1
,
1
,
"OffloadingConnector"
,
4.0
*
(
1
<<
30
)),
# bytes per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
# bytes per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
(
"native"
,
8.0
,
2
,
2
,
"OffloadingConnector"
,
8.0
*
(
1
<<
30
)
/
4
),
(
"native"
,
8.0
,
2
,
2
,
"OffloadingConnector"
,
8.0
*
(
1
<<
30
)),
(
"lmcache"
,
4.0
,
1
,
1
,
"LMCacheConnectorV1"
,
4.0
),
(
"lmcache"
,
4.0
,
1
,
1
,
"LMCacheConnectorV1"
,
4.0
),
# size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
# size per rank: 8.0 GiB / (2 * 2) = 2.0 GiB
(
"lmcache"
,
8.0
,
2
,
2
,
"LMCacheConnectorV1"
,
2.0
),
(
"lmcache"
,
8.0
,
2
,
2
,
"LMCacheConnectorV1"
,
2.0
),
(
None
,
None
,
1
,
1
,
None
,
None
),
# When kv_offloading_size is None, offloading is disabled (backend is ignored)
(
"native"
,
None
,
1
,
1
,
None
,
None
),
],
],
)
)
def
test_kv_connector
(
def
test_kv_connector
(
...
@@ -54,8 +55,7 @@ def test_kv_connector(
...
@@ -54,8 +55,7 @@ def test_kv_connector(
assert
kv_transfer_config
.
kv_role
==
"kv_both"
assert
kv_transfer_config
.
kv_role
==
"kv_both"
if
kv_offloading_backend
==
"native"
:
if
kv_offloading_backend
==
"native"
:
assert
kv_connector_extra_config
[
"kv_bytes_per_rank"
]
==
expected_bytes
assert
kv_connector_extra_config
[
"cpu_bytes_to_use"
]
==
expected_bytes
assert
kv_connector_extra_config
[
"num_cpu_blocks"
]
==
0
# Existing config should be preserved
# Existing config should be preserved
assert
kv_connector_extra_config
[
"existing_key"
]
==
"existing_value"
assert
kv_connector_extra_config
[
"existing_key"
]
==
"existing_value"
elif
kv_offloading_backend
==
"lmcache"
:
elif
kv_offloading_backend
==
"lmcache"
:
...
@@ -63,3 +63,19 @@ def test_kv_connector(
...
@@ -63,3 +63,19 @@ def test_kv_connector(
assert
kv_connector_extra_config
[
"lmcache.max_local_cpu_size"
]
==
expected_bytes
assert
kv_connector_extra_config
[
"lmcache.max_local_cpu_size"
]
==
expected_bytes
# Existing config should be replaced
# Existing config should be replaced
assert
"existing_key"
not
in
kv_connector_extra_config
assert
"existing_key"
not
in
kv_connector_extra_config
def
test_kv_offloading_size_only_uses_native_default
():
"""Test that setting only kv_offloading_size enables native offloading."""
vllm_config
=
VllmConfig
(
cache_config
=
CacheConfig
(
kv_offloading_size
=
4.0
,
# kv_offloading_backend not set, should default to "native"
),
)
kv_transfer_config
=
vllm_config
.
kv_transfer_config
kv_connector_extra_config
=
kv_transfer_config
.
kv_connector_extra_config
assert
kv_transfer_config
.
kv_connector
==
"OffloadingConnector"
assert
kv_transfer_config
.
kv_role
==
"kv_both"
assert
kv_connector_extra_config
[
"cpu_bytes_to_use"
]
==
4.0
*
(
1
<<
30
)
Prev
1
…
28
29
30
31
32
33
34
35
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment