Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a3f8d5dd
Commit
a3f8d5dd
authored
Dec 17, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori
parents
8d75f22e
f34eca5f
Changes
499
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
412 additions
and
94 deletions
+412
-94
tests/tokenizers_/test_detokenize.py
tests/tokenizers_/test_detokenize.py
+1
-1
tests/tokenizers_/test_registry.py
tests/tokenizers_/test_registry.py
+21
-2
tests/tool_parsers/__init__.py
tests/tool_parsers/__init__.py
+0
-0
tests/tool_parsers/test_deepseekv31_tool_parser.py
tests/tool_parsers/test_deepseekv31_tool_parser.py
+2
-2
tests/tool_parsers/test_ernie45_moe_tool_parser.py
tests/tool_parsers/test_ernie45_moe_tool_parser.py
+1
-1
tests/tool_parsers/test_glm4_moe_tool_parser.py
tests/tool_parsers/test_glm4_moe_tool_parser.py
+2
-4
tests/tool_parsers/test_jamba_tool_parser.py
tests/tool_parsers/test_jamba_tool_parser.py
+1
-3
tests/tool_parsers/test_kimi_k2_tool_parser.py
tests/tool_parsers/test_kimi_k2_tool_parser.py
+1
-3
tests/tool_parsers/test_minimax_tool_parser.py
tests/tool_parsers/test_minimax_tool_parser.py
+1
-3
tests/tool_parsers/test_mistral_tool_parser.py
tests/tool_parsers/test_mistral_tool_parser.py
+3
-6
tests/tool_parsers/test_openai_tool_parser.py
tests/tool_parsers/test_openai_tool_parser.py
+1
-1
tests/tool_parsers/test_qwen3coder_tool_parser.py
tests/tool_parsers/test_qwen3coder_tool_parser.py
+4
-6
tests/tool_parsers/test_seed_oss_tool_parser.py
tests/tool_parsers/test_seed_oss_tool_parser.py
+1
-3
tests/tool_parsers/test_xlam_tool_parser.py
tests/tool_parsers/test_xlam_tool_parser.py
+1
-3
tests/tool_use/test_tool_choice_required.py
tests/tool_use/test_tool_choice_required.py
+1
-1
tests/utils.py
tests/utils.py
+2
-2
tests/v1/attention/test_sparse_mla_backends.py
tests/v1/attention/test_sparse_mla_backends.py
+215
-36
tests/v1/core/test_encoder_cache_manager.py
tests/v1/core/test_encoder_cache_manager.py
+74
-5
tests/v1/distributed/test_dbo.py
tests/v1/distributed/test_dbo.py
+2
-0
tests/v1/e2e/test_async_scheduling.py
tests/v1/e2e/test_async_scheduling.py
+78
-12
No files found.
tests/tokenizers_/test_detokenize.py
View file @
a3f8d5dd
...
@@ -8,7 +8,7 @@ import pytest
...
@@ -8,7 +8,7 @@ import pytest
from
transformers
import
AutoTokenizer
,
PreTrainedTokenizer
,
PreTrainedTokenizerFast
from
transformers
import
AutoTokenizer
,
PreTrainedTokenizer
,
PreTrainedTokenizerFast
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.tokenizers
import
MistralTokenizer
from
vllm.tokenizers
.mistral
import
MistralTokenizer
from
vllm.v1.engine
import
EngineCoreRequest
from
vllm.v1.engine
import
EngineCoreRequest
from
vllm.v1.engine.detokenizer
import
(
from
vllm.v1.engine.detokenizer
import
(
FastIncrementalDetokenizer
,
FastIncrementalDetokenizer
,
...
...
tests/tokenizers_/test_registry.py
View file @
a3f8d5dd
...
@@ -2,7 +2,14 @@
...
@@ -2,7 +2,14 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
pathlib
import
Path
from
pathlib
import
Path
from
vllm.tokenizers
import
TokenizerLike
,
TokenizerRegistry
,
get_tokenizer
import
pytest
from
vllm.tokenizers
import
TokenizerLike
from
vllm.tokenizers.registry
import
(
TokenizerRegistry
,
get_tokenizer
,
resolve_tokenizer_args
,
)
class
TestTokenizer
(
TokenizerLike
):
class
TestTokenizer
(
TokenizerLike
):
...
@@ -40,10 +47,22 @@ class TestTokenizer(TokenizerLike):
...
@@ -40,10 +47,22 @@ class TestTokenizer(TokenizerLike):
return
True
return
True
@
pytest
.
mark
.
parametrize
(
"runner_type"
,
[
"generate"
,
"pooling"
])
def
test_resolve_tokenizer_args_idempotent
(
runner_type
):
tokenizer_mode
,
tokenizer_name
,
args
,
kwargs
=
resolve_tokenizer_args
(
"facebook/opt-125m"
,
runner_type
=
runner_type
,
)
assert
(
tokenizer_mode
,
tokenizer_name
,
args
,
kwargs
)
==
resolve_tokenizer_args
(
tokenizer_name
,
*
args
,
**
kwargs
)
def
test_customized_tokenizer
():
def
test_customized_tokenizer
():
TokenizerRegistry
.
register
(
"test_tokenizer"
,
__name__
,
TestTokenizer
.
__name__
)
TokenizerRegistry
.
register
(
"test_tokenizer"
,
__name__
,
TestTokenizer
.
__name__
)
tokenizer
=
TokenizerRegistry
.
get
_tokenizer
(
"test_tokenizer"
,
"abc"
)
tokenizer
=
TokenizerRegistry
.
load
_tokenizer
(
"test_tokenizer"
,
"abc"
)
assert
isinstance
(
tokenizer
,
TestTokenizer
)
assert
isinstance
(
tokenizer
,
TestTokenizer
)
assert
tokenizer
.
path_or_repo_id
==
"abc"
assert
tokenizer
.
path_or_repo_id
==
"abc"
assert
tokenizer
.
bos_token_id
==
0
assert
tokenizer
.
bos_token_id
==
0
...
...
tests/tool_parsers/__init__.py
0 → 100644
View file @
a3f8d5dd
tests/tool_
use
/test_deepseekv31_tool_parser.py
→
tests/tool_
parsers
/test_deepseekv31_tool_parser.py
View file @
a3f8d5dd
...
@@ -3,10 +3,10 @@
...
@@ -3,10 +3,10 @@
import
pytest
import
pytest
from
vllm.entrypoints.openai.tool_parsers.deepseekv31_tool_parser
import
(
from
vllm.tokenizers
import
get_tokenizer
from
vllm.tool_parsers.deepseekv31_tool_parser
import
(
DeepSeekV31ToolParser
,
DeepSeekV31ToolParser
,
)
)
from
vllm.tokenizers
import
get_tokenizer
MODEL
=
"deepseek-ai/DeepSeek-V3.1"
MODEL
=
"deepseek-ai/DeepSeek-V3.1"
...
...
tests/tool_
use
/test_ernie45_moe_tool_parser.py
→
tests/tool_
parsers
/test_ernie45_moe_tool_parser.py
View file @
a3f8d5dd
...
@@ -13,9 +13,9 @@ from vllm.entrypoints.openai.protocol import (
...
@@ -13,9 +13,9 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall
,
FunctionCall
,
ToolCall
,
ToolCall
,
)
)
from
vllm.entrypoints.openai.tool_parsers.ernie45_tool_parser
import
Ernie45ToolParser
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers.detokenizer_utils
import
detokenize_incrementally
from
vllm.tokenizers.detokenizer_utils
import
detokenize_incrementally
from
vllm.tool_parsers.ernie45_tool_parser
import
Ernie45ToolParser
# Use a common model that is likely to be available
# Use a common model that is likely to be available
MODEL
=
"baidu/ERNIE-4.5-21B-A3B-Thinking"
MODEL
=
"baidu/ERNIE-4.5-21B-A3B-Thinking"
...
...
tests/tool_
use
/test_glm4_moe_tool_parser.py
→
tests/tool_
parsers
/test_glm4_moe_tool_parser.py
View file @
a3f8d5dd
...
@@ -7,12 +7,10 @@ import json
...
@@ -7,12 +7,10 @@ import json
import
pytest
import
pytest
from
vllm.entrypoints.openai.protocol
import
FunctionCall
,
ToolCall
from
vllm.entrypoints.openai.protocol
import
FunctionCall
,
ToolCall
from
vllm.entrypoints.openai.tool_parsers.glm4_moe_tool_parser
import
(
from
vllm.tokenizers
import
get_tokenizer
from
vllm.tool_parsers.glm4_moe_tool_parser
import
(
Glm4MoeModelToolParser
,
Glm4MoeModelToolParser
,
)
)
from
vllm.tokenizers
import
get_tokenizer
pytestmark
=
pytest
.
mark
.
cpu_test
pytest
.
skip
(
"skip glm4_moe parser test"
,
allow_module_level
=
True
)
pytest
.
skip
(
"skip glm4_moe parser test"
,
allow_module_level
=
True
)
# Use a common model that is likely to be available
# Use a common model that is likely to be available
...
...
tests/tool_
use
/test_jamba_tool_parser.py
→
tests/tool_
parsers
/test_jamba_tool_parser.py
View file @
a3f8d5dd
...
@@ -9,11 +9,9 @@ import pytest
...
@@ -9,11 +9,9 @@ import pytest
from
partial_json_parser.core.options
import
Allow
from
partial_json_parser.core.options
import
Allow
from
vllm.entrypoints.openai.protocol
import
DeltaMessage
,
FunctionCall
,
ToolCall
from
vllm.entrypoints.openai.protocol
import
DeltaMessage
,
FunctionCall
,
ToolCall
from
vllm.entrypoints.openai.tool_parsers.jamba_tool_parser
import
JambaToolParser
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers.detokenizer_utils
import
detokenize_incrementally
from
vllm.tokenizers.detokenizer_utils
import
detokenize_incrementally
from
vllm.tool_parsers.jamba_tool_parser
import
JambaToolParser
pytestmark
=
pytest
.
mark
.
cpu_test
MODEL
=
"ai21labs/Jamba-tiny-dev"
MODEL
=
"ai21labs/Jamba-tiny-dev"
...
...
tests/tool_
use
/test_kimi_k2_tool_parser.py
→
tests/tool_
parsers
/test_kimi_k2_tool_parser.py
View file @
a3f8d5dd
...
@@ -7,10 +7,8 @@ import json
...
@@ -7,10 +7,8 @@ import json
import
pytest
import
pytest
from
vllm.entrypoints.openai.protocol
import
FunctionCall
,
ToolCall
from
vllm.entrypoints.openai.protocol
import
FunctionCall
,
ToolCall
from
vllm.entrypoints.openai.tool_parsers.kimi_k2_tool_parser
import
KimiK2ToolParser
from
vllm.tokenizers
import
get_tokenizer
from
vllm.tokenizers
import
get_tokenizer
from
vllm.tool_parsers.kimi_k2_tool_parser
import
KimiK2ToolParser
pytestmark
=
pytest
.
mark
.
cpu_test
# Use a common model that is likely to be available
# Use a common model that is likely to be available
MODEL
=
"moonshotai/Kimi-K2-Instruct"
MODEL
=
"moonshotai/Kimi-K2-Instruct"
...
...
tests/tool_
use
/test_minimax_tool_parser.py
→
tests/tool_
parsers
/test_minimax_tool_parser.py
View file @
a3f8d5dd
...
@@ -12,10 +12,8 @@ from vllm.entrypoints.openai.protocol import (
...
@@ -12,10 +12,8 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall
,
FunctionCall
,
ToolCall
,
ToolCall
,
)
)
from
vllm.entrypoints.openai.tool_parsers.minimax_tool_parser
import
MinimaxToolParser
from
vllm.tokenizers
import
get_tokenizer
from
vllm.tokenizers
import
get_tokenizer
from
vllm.tool_parsers.minimax_tool_parser
import
MinimaxToolParser
pytestmark
=
pytest
.
mark
.
cpu_test
# Use a common model that is likely to be available
# Use a common model that is likely to be available
MODEL
=
"MiniMaxAi/MiniMax-M1-40k"
MODEL
=
"MiniMaxAi/MiniMax-M1-40k"
...
...
tests/tool_
use
/test_mistral_tool_parser.py
→
tests/tool_
parsers
/test_mistral_tool_parser.py
View file @
a3f8d5dd
...
@@ -12,13 +12,10 @@ from mistral_common.protocol.instruct.tool_calls import FunctionCall, ToolCall
...
@@ -12,13 +12,10 @@ from mistral_common.protocol.instruct.tool_calls import FunctionCall, ToolCall
from
partial_json_parser.core.options
import
Allow
from
partial_json_parser.core.options
import
Allow
from
vllm.entrypoints.openai.protocol
import
DeltaMessage
,
DeltaToolCall
from
vllm.entrypoints.openai.protocol
import
DeltaMessage
,
DeltaToolCall
from
vllm.entrypoints.openai.tool_parsers.mistral_tool_parser
import
MistralToolParser
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers
import
(
MistralTokenizer
,
TokenizerLike
,
get_tokenizer
,
)
from
vllm.tokenizers.detokenizer_utils
import
detokenize_incrementally
from
vllm.tokenizers.detokenizer_utils
import
detokenize_incrementally
from
vllm.tokenizers.mistral
import
MistralTokenizer
from
vllm.tool_parsers.mistral_tool_parser
import
MistralToolParser
@
pytest
.
fixture
(
scope
=
"module"
)
@
pytest
.
fixture
(
scope
=
"module"
)
...
...
tests/tool_
use
/test_openai_tool_parser.py
→
tests/tool_
parsers
/test_openai_tool_parser.py
View file @
a3f8d5dd
...
@@ -15,8 +15,8 @@ from openai_harmony import (
...
@@ -15,8 +15,8 @@ from openai_harmony import (
)
)
from
vllm.entrypoints.openai.protocol
import
FunctionCall
,
ToolCall
from
vllm.entrypoints.openai.protocol
import
FunctionCall
,
ToolCall
from
vllm.entrypoints.openai.tool_parsers.openai_tool_parser
import
OpenAIToolParser
from
vllm.tokenizers
import
get_tokenizer
from
vllm.tokenizers
import
get_tokenizer
from
vllm.tool_parsers.openai_tool_parser
import
OpenAIToolParser
MODEL
=
"gpt2"
MODEL
=
"gpt2"
...
...
tests/tool_
use
/test_qwen3coder_tool_parser.py
→
tests/tool_
parsers
/test_qwen3coder_tool_parser.py
View file @
a3f8d5dd
...
@@ -13,14 +13,12 @@ from vllm.entrypoints.openai.protocol import (
...
@@ -13,14 +13,12 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall
,
FunctionCall
,
ToolCall
,
ToolCall
,
)
)
from
vllm.entrypoints.openai.tool_parsers.qwen3coder_tool_parser
import
(
Qwen3CoderToolParser
,
)
from
vllm.entrypoints.openai.tool_parsers.qwen3xml_tool_parser
import
Qwen3XMLToolParser
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers.detokenizer_utils
import
detokenize_incrementally
from
vllm.tokenizers.detokenizer_utils
import
detokenize_incrementally
from
vllm.tool_parsers.qwen3coder_tool_parser
import
(
pytestmark
=
pytest
.
mark
.
cpu_test
Qwen3CoderToolParser
,
)
from
vllm.tool_parsers.qwen3xml_tool_parser
import
Qwen3XMLToolParser
MODEL
=
"Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
MODEL
=
"Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8"
...
...
tests/tool_
use
/test_seed_oss_tool_parser.py
→
tests/tool_
parsers
/test_seed_oss_tool_parser.py
View file @
a3f8d5dd
...
@@ -14,11 +14,9 @@ from vllm.entrypoints.openai.protocol import (
...
@@ -14,11 +14,9 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall
,
FunctionCall
,
ToolCall
,
ToolCall
,
)
)
from
vllm.entrypoints.openai.tool_parsers.seed_oss_tool_parser
import
SeedOssToolParser
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers.detokenizer_utils
import
detokenize_incrementally
from
vllm.tokenizers.detokenizer_utils
import
detokenize_incrementally
from
vllm.tool_parsers.seed_oss_tool_parser
import
SeedOssToolParser
pytestmark
=
pytest
.
mark
.
cpu_test
# Use a common model that is likely to be available
# Use a common model that is likely to be available
MODEL
=
"ByteDance-Seed/Seed-OSS-36B-Instruct"
MODEL
=
"ByteDance-Seed/Seed-OSS-36B-Instruct"
...
...
tests/tool_
use
/test_xlam_tool_parser.py
→
tests/tool_
parsers
/test_xlam_tool_parser.py
View file @
a3f8d5dd
...
@@ -12,11 +12,9 @@ from vllm.entrypoints.openai.protocol import (
...
@@ -12,11 +12,9 @@ from vllm.entrypoints.openai.protocol import (
FunctionCall
,
FunctionCall
,
ToolCall
,
ToolCall
,
)
)
from
vllm.entrypoints.openai.tool_parsers.xlam_tool_parser
import
xLAMToolParser
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers
import
TokenizerLike
,
get_tokenizer
from
vllm.tokenizers.detokenizer_utils
import
detokenize_incrementally
from
vllm.tokenizers.detokenizer_utils
import
detokenize_incrementally
from
vllm.tool_parsers.xlam_tool_parser
import
xLAMToolParser
pytestmark
=
pytest
.
mark
.
cpu_test
# Use a common model that is likely to be available
# Use a common model that is likely to be available
MODEL
=
"Salesforce/Llama-xLAM-2-8B-fc-r"
MODEL
=
"Salesforce/Llama-xLAM-2-8B-fc-r"
...
...
tests/tool_use/test_tool_choice_required.py
View file @
a3f8d5dd
...
@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.protocol import (
...
@@ -12,7 +12,7 @@ from vllm.entrypoints.openai.protocol import (
ChatCompletionToolsParam
,
ChatCompletionToolsParam
,
)
)
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
from
vllm.entrypoints.openai.serving_chat
import
OpenAIServingChat
from
vllm.
entrypoints.openai.
tool_parsers.utils
import
get_json_schema_from_tools
from
vllm.tool_parsers.utils
import
get_json_schema_from_tools
pytestmark
=
pytest
.
mark
.
cpu_test
pytestmark
=
pytest
.
mark
.
cpu_test
...
...
tests/utils.py
View file @
a3f8d5dd
...
@@ -119,7 +119,7 @@ class RemoteOpenAIServer:
...
@@ -119,7 +119,7 @@ class RemoteOpenAIServer:
vllm_serve_args
:
list
[
str
],
vllm_serve_args
:
list
[
str
],
*
,
*
,
env_dict
:
dict
[
str
,
str
]
|
None
=
None
,
env_dict
:
dict
[
str
,
str
]
|
None
=
None
,
seed
:
int
|
None
=
0
,
seed
:
int
=
0
,
auto_port
:
bool
=
True
,
auto_port
:
bool
=
True
,
max_wait_seconds
:
float
|
None
=
None
,
max_wait_seconds
:
float
|
None
=
None
,
override_hf_configs
:
dict
[
str
,
Any
]
|
None
=
None
,
override_hf_configs
:
dict
[
str
,
Any
]
|
None
=
None
,
...
@@ -283,7 +283,7 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer):
...
@@ -283,7 +283,7 @@ class RemoteOpenAIServerCustom(RemoteOpenAIServer):
child_process_fxn
:
Callable
[[
dict
[
str
,
str
]
|
None
,
str
,
list
[
str
]],
None
],
child_process_fxn
:
Callable
[[
dict
[
str
,
str
]
|
None
,
str
,
list
[
str
]],
None
],
*
,
*
,
env_dict
:
dict
[
str
,
str
]
|
None
=
None
,
env_dict
:
dict
[
str
,
str
]
|
None
=
None
,
seed
:
int
|
None
=
0
,
seed
:
int
=
0
,
auto_port
:
bool
=
True
,
auto_port
:
bool
=
True
,
max_wait_seconds
:
float
|
None
=
None
,
max_wait_seconds
:
float
|
None
=
None
,
)
->
None
:
)
->
None
:
...
...
tests/v1/attention/test_sparse_mla_backends.py
View file @
a3f8d5dd
...
@@ -22,10 +22,14 @@ from tests.v1.attention.utils import (
...
@@ -22,10 +22,14 @@ from tests.v1.attention.utils import (
)
)
from
vllm
import
_custom_ops
as
ops
from
vllm
import
_custom_ops
as
ops
from
vllm.attention.ops
import
flashmla
from
vllm.attention.ops
import
flashmla
from
vllm.config
import
set_current_vllm_config
from
vllm.model_executor.layers.linear
import
ColumnParallelLinear
from
vllm.model_executor.layers.linear
import
ColumnParallelLinear
from
vllm.utils.math_utils
import
cdiv
from
vllm.utils.math_utils
import
cdiv
from
vllm.v1.attention.backends.mla.flashmla_sparse
import
FlashMLASparseBackend
from
vllm.v1.attention.backends.mla.flashmla_sparse
import
(
from
vllm.v1.attention.backends.mla.indexer
import
split_prefill_chunks
FlashMLASparseBackend
,
triton_convert_req_index_to_global_index
,
)
from
vllm.v1.attention.backends.utils
import
split_prefill_chunks
SPARSE_BACKEND_BATCH_SPECS
=
{
SPARSE_BACKEND_BATCH_SPECS
=
{
name
:
BATCH_SPECS
[
name
]
name
:
BATCH_SPECS
[
name
]
...
@@ -114,8 +118,12 @@ def _quantize_dequantize_fp8_ds_mla(
...
@@ -114,8 +118,12 @@ def _quantize_dequantize_fp8_ds_mla(
@
pytest
.
mark
.
parametrize
(
"batch_name"
,
list
(
SPARSE_BACKEND_BATCH_SPECS
.
keys
()))
@
pytest
.
mark
.
parametrize
(
"batch_name"
,
list
(
SPARSE_BACKEND_BATCH_SPECS
.
keys
()))
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"fp8_ds_mla"
,
"auto"
])
@
pytest
.
mark
.
parametrize
(
"kv_cache_dtype"
,
[
"fp8_ds_mla"
,
"auto"
])
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
1
,
2
,
4
])
@
pytest
.
mark
.
parametrize
(
"tensor_parallel_size"
,
[
1
,
2
,
4
])
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
get_device_capability
()
<
(
9
,
0
),
reason
=
"FlashMLASparseBackend requires CUDA 9.0 or higher"
,
)
def
test_sparse_backend_decode_correctness
(
def
test_sparse_backend_decode_correctness
(
dist_init
,
batch_name
,
kv_cache_dtype
,
tensor_parallel_size
dist_init
,
batch_name
,
kv_cache_dtype
,
tensor_parallel_size
,
workspace_init
):
):
if
not
torch
.
cuda
.
is_available
():
if
not
torch
.
cuda
.
is_available
():
pytest
.
skip
(
"CUDA is required for sparse MLA decode test"
)
pytest
.
skip
(
"CUDA is required for sparse MLA decode test"
)
...
@@ -320,28 +328,29 @@ def test_sparse_backend_decode_correctness(
...
@@ -320,28 +328,29 @@ def test_sparse_backend_decode_correctness(
mock_kv_b_proj
.
weight
=
torch
.
nn
.
Parameter
(
kv_b_proj_weight
.
T
.
contiguous
())
mock_kv_b_proj
.
weight
=
torch
.
nn
.
Parameter
(
kv_b_proj_weight
.
T
.
contiguous
())
impl_cls
=
FlashMLASparseBackend
.
get_impl_cls
()
impl_cls
=
FlashMLASparseBackend
.
get_impl_cls
()
impl
=
impl_cls
(
with
set_current_vllm_config
(
vllm_config
):
num_heads
=
num_heads
,
impl
=
impl_cls
(
head_size
=
head_size
,
num_heads
=
num_heads
,
scale
=
scale
,
head_size
=
head_size
,
num_kv_heads
=
1
,
scale
=
scale
,
alibi_slopes
=
None
,
num_kv_heads
=
1
,
sliding_window
=
None
,
alibi_slopes
=
None
,
kv_cache_dtype
=
vllm_config
.
cache_config
.
cache_dtype
,
sliding_window
=
None
,
logits_soft_cap
=
None
,
kv_cache_dtype
=
vllm_config
.
cache_config
.
cache_dtype
,
attn_type
=
"decoder"
,
logits_soft_cap
=
None
,
kv_sharing_target_layer_name
=
None
,
attn_type
=
"decoder"
,
q_lora_rank
=
None
,
kv_sharing_target_layer_name
=
None
,
kv_lora_rank
=
kv_lora_rank
,
q_lora_rank
=
None
,
qk_nope_head_dim
=
qk_nope_head_dim
,
kv_lora_rank
=
kv_lora_rank
,
qk_rope_head_dim
=
qk_rope_head_dim
,
qk_nope_head_dim
=
qk_nope_head_dim
,
qk_head_dim
=
qk_nope_head_dim
+
qk_rope_head_dim
,
qk_rope_head_dim
=
qk_rope_head_dim
,
v_head_dim
=
v_head_dim
,
qk_head_dim
=
qk_nope_head_dim
+
qk_rope_head_dim
,
kv_b_proj
=
mock_kv_b_proj
,
v_head_dim
=
v_head_dim
,
indexer
=
mock_indexer
,
kv_b_proj
=
mock_kv_b_proj
,
)
indexer
=
mock_indexer
,
)
impl
.
process_weights_after_loading
(
dtype
)
impl
.
process_weights_after_loading
(
dtype
)
layer
=
MockAttentionLayer
(
device
)
layer
=
MockAttentionLayer
(
device
)
out_buffer
=
torch
.
empty
(
out_buffer
=
torch
.
empty
(
...
@@ -366,22 +375,192 @@ def test_sparse_backend_decode_correctness(
...
@@ -366,22 +375,192 @@ def test_sparse_backend_decode_correctness(
torch
.
testing
.
assert_close
(
backend_output
,
sdpa_reference
,
rtol
=
0.5
,
atol
=
0.5
)
torch
.
testing
.
assert_close
(
backend_output
,
sdpa_reference
,
rtol
=
0.5
,
atol
=
0.5
)
def
_triton_convert_reference_impl
(
req_ids
:
torch
.
Tensor
,
block_table
:
torch
.
Tensor
,
token_indices
:
torch
.
Tensor
,
block_size
:
int
,
num_topk_tokens
:
int
,
HAS_PREFILL_WORKSPACE
:
bool
=
False
,
prefill_workspace_request_ids
:
torch
.
Tensor
|
None
=
None
,
prefill_workspace_starts
:
torch
.
Tensor
|
None
=
None
,
)
->
torch
.
Tensor
:
"""Reference implementation for triton_convert_req_index_to_global_index."""
num_tokens
=
req_ids
.
shape
[
0
]
max_blocks_per_req
=
block_table
.
shape
[
1
]
result
=
torch
.
empty
(
num_tokens
,
num_topk_tokens
,
dtype
=
torch
.
int32
,
device
=
req_ids
.
device
)
for
token_id
in
range
(
num_tokens
):
req_id
=
req_ids
[
token_id
].
item
()
# Determine if this token uses workspace or paged cache
use_prefill_workspace
=
False
workspace_start
=
0
if
HAS_PREFILL_WORKSPACE
and
prefill_workspace_request_ids
is
not
None
:
assert
prefill_workspace_starts
is
not
None
prefill_req_id
=
prefill_workspace_request_ids
[
token_id
].
item
()
if
prefill_req_id
>=
0
:
use_prefill_workspace
=
True
workspace_start
=
prefill_workspace_starts
[
prefill_req_id
].
item
()
for
idx_id
in
range
(
num_topk_tokens
):
token_idx
=
token_indices
[
token_id
,
idx_id
].
item
()
if
token_idx
==
-
1
:
result
[
token_id
,
idx_id
]
=
-
1
elif
use_prefill_workspace
:
# Prefill + using prefill workspace: map to workspace offset
result
[
token_id
,
idx_id
]
=
workspace_start
+
token_idx
else
:
# Decode: map to paged cache
block_id
=
token_idx
//
block_size
if
block_id
>=
max_blocks_per_req
:
result
[
token_id
,
idx_id
]
=
-
1
else
:
block_num
=
block_table
[
req_id
,
block_id
].
item
()
offset
=
token_idx
%
block_size
result
[
token_id
,
idx_id
]
=
block_num
*
block_size
+
offset
return
result
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
,
64
,
128
])
@
pytest
.
mark
.
parametrize
(
"num_topk_tokens"
,
[
128
,
256
,
512
])
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
get_device_capability
()
<
(
9
,
0
),
reason
=
"FlashMLASparseBackend requires CUDA 9.0 or higher"
,
)
def
test_triton_convert_req_index_to_global_index_decode_only
(
block_size
,
num_topk_tokens
):
device
=
torch
.
device
(
"cuda"
)
num_tokens
=
8
num_requests
=
4
max_blocks_per_req
=
10
req_id
=
torch
.
randint
(
0
,
num_requests
,
(
num_tokens
,),
dtype
=
torch
.
int32
,
device
=
device
)
block_table
=
torch
.
randint
(
0
,
100
,
(
num_requests
,
max_blocks_per_req
),
dtype
=
torch
.
int32
,
device
=
device
)
token_indices
=
torch
.
randint
(
0
,
block_size
*
max_blocks_per_req
,
(
num_tokens
,
num_topk_tokens
),
dtype
=
torch
.
int32
,
device
=
device
,
)
# Set some to -1 to test masking
token_indices
[
0
,
:
10
]
=
-
1
token_indices
[
3
,
50
:
60
]
=
-
1
# Set some to out of bounds
token_indices
[
2
,
100
:
110
]
=
max_blocks_per_req
*
block_size
token_indices
[
6
,
150
:
160
]
=
max_blocks_per_req
*
block_size
result
=
triton_convert_req_index_to_global_index
(
req_id
,
block_table
,
token_indices
,
BLOCK_SIZE
=
block_size
,
NUM_TOPK_TOKENS
=
num_topk_tokens
,
)
reference_result
=
_triton_convert_reference_impl
(
req_id
,
block_table
,
token_indices
,
block_size
,
num_topk_tokens
,
)
torch
.
testing
.
assert_close
(
result
,
reference_result
,
rtol
=
0
,
atol
=
0
)
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
16
])
@
pytest
.
mark
.
skipif
(
torch
.
cuda
.
get_device_capability
()
<
(
9
,
0
),
reason
=
"FlashMLASparseBackend requires CUDA 9.0 or higher"
,
)
def
test_triton_convert_req_index_to_global_index_with_prefill_workspace
(
block_size
):
device
=
torch
.
device
(
"cuda"
)
num_requests
=
4
max_blocks_per_req
=
8
num_topk_tokens
=
128
# First 6 tokens are decode (reqs 0, 1), last 6 are prefill (reqs 2, 3)
req_id
=
torch
.
tensor
(
[
0
,
0
,
0
,
1
,
1
,
1
,
2
,
2
,
2
,
3
,
3
,
3
],
dtype
=
torch
.
int32
,
device
=
device
)
prefill_workspace_request_ids
=
torch
.
tensor
(
[
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
-
1
,
0
,
0
,
0
,
1
,
1
,
1
],
dtype
=
torch
.
int32
,
device
=
device
)
# Workspace starts for the 2 prefill reqs: req 2 starts at 0, req 3 starts at 100
prefill_workspace_starts
=
torch
.
tensor
([
0
,
100
],
dtype
=
torch
.
int32
,
device
=
device
)
block_table
=
torch
.
randint
(
0
,
50
,
(
num_requests
,
max_blocks_per_req
),
dtype
=
torch
.
int32
,
device
=
device
)
token_indices
=
torch
.
randint
(
0
,
block_size
*
max_blocks_per_req
,
(
req_id
.
shape
[
0
],
num_topk_tokens
),
dtype
=
torch
.
int32
,
device
=
device
,
)
# Set some to -1 to test masking
token_indices
[
0
,
:
10
]
=
-
1
token_indices
[
3
,
50
:
60
]
=
-
1
# Set some to out of bounds
token_indices
[
2
,
100
:
110
]
=
max_blocks_per_req
*
block_size
token_indices
[
6
,
150
:
160
]
=
max_blocks_per_req
*
block_size
result
=
triton_convert_req_index_to_global_index
(
req_id
,
block_table
,
token_indices
,
BLOCK_SIZE
=
block_size
,
NUM_TOPK_TOKENS
=
num_topk_tokens
,
HAS_PREFILL_WORKSPACE
=
True
,
prefill_workspace_request_ids
=
prefill_workspace_request_ids
,
prefill_workspace_starts
=
prefill_workspace_starts
,
)
reference_result
=
_triton_convert_reference_impl
(
req_id
,
block_table
,
token_indices
,
block_size
,
num_topk_tokens
,
HAS_PREFILL_WORKSPACE
=
True
,
prefill_workspace_request_ids
=
prefill_workspace_request_ids
,
prefill_workspace_starts
=
prefill_workspace_starts
,
)
torch
.
testing
.
assert_close
(
result
,
reference_result
,
rtol
=
0
,
atol
=
0
)
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"seq_lens,max_buf,
start,
expected"
,
"seq_lens,max_buf,expected"
,
[
[
# Basic split: totals per chunk ≤ max_buf
# Basic split: totals per chunk ≤ max_buf
(
torch
.
tensor
([
2
,
3
,
4
,
2
]),
5
,
0
,
[(
0
,
2
),
(
2
,
3
),
(
3
,
4
)]),
(
torch
.
tensor
([
2
,
3
,
4
,
2
]),
5
,
[(
0
,
2
),
(
2
,
3
),
(
3
,
4
)]),
# Non-zero start index
# Exact fits should split between items when adding the next would overflow
(
torch
.
tensor
([
2
,
3
,
4
,
2
]),
5
,
1
,
[(
1
,
2
),
(
2
,
3
),
(
3
,
4
)]),
(
torch
.
tensor
([
5
,
5
,
5
]),
5
,
[(
0
,
1
),
(
1
,
2
),
(
2
,
3
)]),
# Exact fits should split between items when adding the next would
# overflow
(
torch
.
tensor
([
5
,
5
,
5
]),
5
,
0
,
[(
0
,
1
),
(
1
,
2
),
(
2
,
3
)]),
# All requests fit in a single chunk
# All requests fit in a single chunk
(
torch
.
tensor
([
1
,
1
,
1
]),
10
,
0
,
[(
0
,
3
)]),
(
torch
.
tensor
([
1
,
1
,
1
]),
10
,
[(
0
,
3
)]),
# Large buffer
with non-zero start
# Large buffer
(
torch
.
tensor
([
4
,
4
,
4
]),
100
,
1
,
[(
1
,
3
)]),
(
torch
.
tensor
([
4
,
4
,
4
]),
100
,
[(
0
,
3
)]),
],
],
)
)
def
test_split_prefill_chunks
(
seq_lens
,
max_buf
,
start
,
expected
):
def
test_split_prefill_chunks
(
seq_lens
,
max_buf
,
expected
):
out
=
split_prefill_chunks
(
seq_lens
,
max_buf
,
start
)
out
=
split_prefill_chunks
(
seq_lens
,
max_buf
)
assert
out
==
expected
assert
out
==
expected
tests/v1/core/test_encoder_cache_manager.py
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
import
torch
from
vllm.multimodal.inputs
import
MultiModalFeatureSpec
,
PlaceholderRange
from
vllm.multimodal.inputs
import
MultiModalFeatureSpec
,
PlaceholderRange
from
vllm.v1.core.encoder_cache_manager
import
EncoderCacheManager
from
vllm.v1.core.encoder_cache_manager
import
EncoderCacheManager
...
@@ -23,7 +24,7 @@ class MockRequest:
...
@@ -23,7 +24,7 @@ class MockRequest:
)
)
self
.
mm_features
.
append
(
feature
)
self
.
mm_features
.
append
(
feature
)
def
get_num_encoder_
token
s
(
self
,
input_id
:
int
)
->
int
:
def
get_num_encoder_
embed
s
(
self
,
input_id
:
int
)
->
int
:
return
self
.
_token_counts
[
input_id
]
return
self
.
_token_counts
[
input_id
]
...
@@ -162,8 +163,8 @@ def test_schedule_request_multi_images_respect_space_limit():
...
@@ -162,8 +163,8 @@ def test_schedule_request_multi_images_respect_space_limit():
num_tokens_to_schedule
=
0
num_tokens_to_schedule
=
0
assert
manager
.
can_allocate
(
req
,
0
,
compute_budget
,
num_tokens_to_schedule
)
assert
manager
.
can_allocate
(
req
,
0
,
compute_budget
,
num_tokens_to_schedule
)
num_tokens_to_schedule
+=
req
.
get_num_encoder_
token
s
(
0
)
num_tokens_to_schedule
+=
req
.
get_num_encoder_
embed
s
(
0
)
compute_budget
-=
req
.
get_num_encoder_
token
s
(
0
)
compute_budget
-=
req
.
get_num_encoder_
embed
s
(
0
)
assert
not
manager
.
can_allocate
(
req
,
1
,
compute_budget
,
num_tokens_to_schedule
)
assert
not
manager
.
can_allocate
(
req
,
1
,
compute_budget
,
num_tokens_to_schedule
)
...
@@ -174,7 +175,75 @@ def test_schedule_request_multi_images_respect_compute_limit():
...
@@ -174,7 +175,75 @@ def test_schedule_request_multi_images_respect_compute_limit():
compute_budget
=
10
compute_budget
=
10
num_tokens_to_schedule
=
0
num_tokens_to_schedule
=
0
assert
manager
.
can_allocate
(
req
,
0
,
compute_budget
,
num_tokens_to_schedule
)
assert
manager
.
can_allocate
(
req
,
0
,
compute_budget
,
num_tokens_to_schedule
)
num_tokens_to_schedule
+=
req
.
get_num_encoder_
token
s
(
0
)
num_tokens_to_schedule
+=
req
.
get_num_encoder_
embed
s
(
0
)
compute_budget
-=
req
.
get_num_encoder_
token
s
(
0
)
compute_budget
-=
req
.
get_num_encoder_
embed
s
(
0
)
assert
not
manager
.
can_allocate
(
req
,
1
,
compute_budget
,
num_tokens_to_schedule
)
assert
not
manager
.
can_allocate
(
req
,
1
,
compute_budget
,
num_tokens_to_schedule
)
def
test_encoder_cache_with_is_embed_mask
():
class
MockRequestWithMask
(
MockRequest
):
def
get_num_encoder_embeds
(
self
,
input_id
:
int
)
->
int
:
return
self
.
mm_features
[
input_id
].
mm_position
.
get_num_embeds
is_embed
=
torch
.
zeros
(
100
,
dtype
=
torch
.
bool
)
is_embed
[
torch
.
tensor
([
5
,
15
,
25
,
35
,
45
,
55
,
65
,
75
])]
=
True
request
=
MockRequestWithMask
(
"r1"
,
[
"img1"
],
[
100
])
request
.
mm_features
[
0
]
=
MultiModalFeatureSpec
(
data
=
None
,
modality
=
"image"
,
identifier
=
"img1"
,
mm_position
=
PlaceholderRange
(
offset
=
0
,
length
=
100
,
is_embed
=
is_embed
),
)
manager
=
EncoderCacheManager
(
cache_size
=
100
)
manager
.
allocate
(
request
,
0
)
assert
manager
.
num_free_slots
==
92
assert
"img1"
in
manager
.
cached
old_size
=
100
new_size
=
request
.
mm_features
[
0
].
mm_position
.
get_num_embeds
assert
new_size
==
8
savings_ratio
=
old_size
/
new_size
assert
savings_ratio
==
12.5
def
test_encoder_cache_mask_based_retrieval
():
class
MockRequestWithMask
(
MockRequest
):
def
get_num_encoder_embeds
(
self
,
input_id
:
int
)
->
int
:
return
self
.
mm_features
[
input_id
].
mm_position
.
get_num_embeds
is_embed
=
torch
.
tensor
(
[
False
,
False
,
True
,
True
,
False
,
True
,
True
,
True
,
False
,
False
]
)
request
=
MockRequestWithMask
(
"r1"
,
[
"img1"
],
[
10
])
request
.
mm_features
[
0
]
=
MultiModalFeatureSpec
(
data
=
None
,
modality
=
"image"
,
identifier
=
"img1"
,
mm_position
=
PlaceholderRange
(
offset
=
0
,
length
=
10
,
is_embed
=
is_embed
),
)
manager
=
EncoderCacheManager
(
cache_size
=
50
)
manager
.
allocate
(
request
,
0
)
assert
request
.
mm_features
[
0
].
mm_position
.
get_num_embeds
==
5
start_idx
=
2
end_idx
=
8
num_embeds_before
=
is_embed
[:
start_idx
].
sum
().
item
()
num_embeds_in_range
=
is_embed
[
start_idx
:
end_idx
].
sum
().
item
()
assert
num_embeds_before
==
0
assert
num_embeds_in_range
==
5
start_idx
=
0
end_idx
=
5
num_embeds_before
=
is_embed
[:
start_idx
].
sum
().
item
()
if
start_idx
>
0
else
0
num_embeds_in_range
=
is_embed
[
start_idx
:
end_idx
].
sum
().
item
()
assert
num_embeds_before
==
0
assert
num_embeds_in_range
==
2
tests/v1/distributed/test_dbo.py
View file @
a3f8d5dd
...
@@ -13,6 +13,7 @@ import torch
...
@@ -13,6 +13,7 @@ import torch
from
tests.evals.gsm8k.gsm8k_eval
import
evaluate_gsm8k
from
tests.evals.gsm8k.gsm8k_eval
import
evaluate_gsm8k
from
tests.utils
import
RemoteOpenAIServer
from
tests.utils
import
RemoteOpenAIServer
from
vllm.utils.import_utils
import
has_deep_ep
# Detect Blackwell / B200 (compute capability 10.x)
# Detect Blackwell / B200 (compute capability 10.x)
try
:
try
:
...
@@ -44,6 +45,7 @@ DEEPEP_BACKENDS = [
...
@@ -44,6 +45,7 @@ DEEPEP_BACKENDS = [
]
]
@
pytest
.
mark
.
skipif
(
not
has_deep_ep
(),
reason
=
"These tests require deep_ep to run"
)
@
pytest
.
mark
.
parametrize
(
"all2all_backend"
,
DEEPEP_BACKENDS
)
@
pytest
.
mark
.
parametrize
(
"all2all_backend"
,
DEEPEP_BACKENDS
)
@
pytest
.
mark
.
xfail
(
@
pytest
.
mark
.
xfail
(
IS_BLACKWELL
,
IS_BLACKWELL
,
...
...
tests/v1/e2e/test_async_scheduling.py
View file @
a3f8d5dd
...
@@ -8,6 +8,7 @@ import torch._dynamo.config as dynamo_config
...
@@ -8,6 +8,7 @@ import torch._dynamo.config as dynamo_config
from
vllm
import
SamplingParams
from
vllm
import
SamplingParams
from
vllm.logprobs
import
Logprob
from
vllm.logprobs
import
Logprob
from
vllm.platforms
import
current_platform
from
vllm.sampling_params
import
StructuredOutputsParams
from
vllm.sampling_params
import
StructuredOutputsParams
from
vllm.v1.metrics.reader
import
Metric
from
vllm.v1.metrics.reader
import
Metric
...
@@ -70,6 +71,18 @@ def test_without_spec_decoding(
...
@@ -70,6 +71,18 @@ def test_without_spec_decoding(
(
True
,
"uni"
,
True
,
None
,
True
),
(
True
,
"uni"
,
True
,
None
,
True
),
]
]
if
current_platform
.
is_rocm
():
# On ROCm, only test with structured_outputs (deterministic)
# and skip chunk_prefill (more variable).
test_configs
=
[
cfg
for
cfg
in
test_configs
if
not
cfg
[
4
]
# skip chunk_prefill=True
]
test_sampling_params
=
[
p
for
p
in
test_sampling_params
if
p
.
get
(
"structured_outputs"
)
is
not
None
]
run_tests
(
monkeypatch
,
MODEL
,
test_configs
,
test_sampling_params
)
run_tests
(
monkeypatch
,
MODEL
,
test_configs
,
test_sampling_params
)
...
@@ -108,7 +121,14 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
...
@@ -108,7 +121,14 @@ def test_with_spec_decoding(monkeypatch: pytest.MonkeyPatch):
(
True
,
"uni"
,
True
,
spec_config_short
,
True
),
(
True
,
"uni"
,
True
,
spec_config_short
,
True
),
]
]
run_tests
(
monkeypatch
,
MTP_MODEL
,
test_configs
,
test_sampling_params
)
# On ROCm, use TRITON_ATTN + float32 for better numerical consistency
run_tests
(
monkeypatch
,
MTP_MODEL
,
test_configs
,
test_sampling_params
,
is_testing_with_spec_decoding
=
True
,
)
@
dynamo_config
.
patch
(
cache_size_limit
=
16
)
@
dynamo_config
.
patch
(
cache_size_limit
=
16
)
...
@@ -117,15 +137,23 @@ def run_tests(
...
@@ -117,15 +137,23 @@ def run_tests(
model
:
str
,
model
:
str
,
test_configs
:
list
[
tuple
],
test_configs
:
list
[
tuple
],
test_sampling_params
:
list
[
dict
[
str
,
Any
]],
test_sampling_params
:
list
[
dict
[
str
,
Any
]],
is_testing_with_spec_decoding
:
bool
=
False
,
):
):
"""Test consistency of combos of async scheduling, preemption,
"""Test consistency of combos of async scheduling, preemption,
uni/multiproc executor with spec decoding."""
uni/multiproc executor with spec decoding."""
with
monkeypatch
.
context
()
as
m
:
with
monkeypatch
.
context
()
as
m
:
# avoid precision errors
# avoid precision errors
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
if
current_platform
.
is_rocm
():
# lock matmul precision to full FP32
if
is_testing_with_spec_decoding
:
m
.
setenv
(
"VLLM_FLOAT32_MATMUL_PRECISION"
,
"highest"
)
# Use TRITON_ATTN for spec decoding test for consistency
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"TRITON_ATTN"
)
else
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"ROCM_AITER_FA"
)
else
:
m
.
setenv
(
"VLLM_ATTENTION_BACKEND"
,
"FLEX_ATTENTION"
)
# lock matmul precision to full FP32 (IEEE)
m
.
setenv
(
"VLLM_FLOAT32_MATMUL_PRECISION"
,
"ieee"
)
# m.setenv("VLLM_BATCH_INVARIANT", "1")
# m.setenv("VLLM_BATCH_INVARIANT", "1")
outputs
:
list
[
tuple
[
str
,
list
,
list
]]
=
[]
outputs
:
list
[
tuple
[
str
,
list
,
list
]]
=
[]
for
n
,
(
for
n
,
(
...
@@ -145,6 +173,7 @@ def run_tests(
...
@@ -145,6 +173,7 @@ def run_tests(
async_scheduling
,
async_scheduling
,
spec_config
,
spec_config
,
test_prefill_chunking
=
test_prefill_chunking
,
test_prefill_chunking
=
test_prefill_chunking
,
is_testing_with_spec_decoding
=
is_testing_with_spec_decoding
,
)
)
outputs
.
append
(
test_results
)
outputs
.
append
(
test_results
)
...
@@ -174,17 +203,34 @@ def run_tests(
...
@@ -174,17 +203,34 @@ def run_tests(
name_0
=
f
"baseline=[
{
baseline_config
}
], params=
{
params
}
"
,
name_0
=
f
"baseline=[
{
baseline_config
}
], params=
{
params
}
"
,
name_1
=
f
"config=[
{
test_config
}
], params=
{
params
}
"
,
name_1
=
f
"config=[
{
test_config
}
], params=
{
params
}
"
,
)
)
assert
_all_logprobs_match
(
base_logprobs
,
test_logprobs
)
# On ROCm with TRITON_ATTN (spec decoding test), skip strict
# logprobs comparison when logprobs are requested
skip_logprobs_check
=
(
current_platform
.
is_rocm
()
and
params
.
get
(
"logprobs"
)
and
is_testing_with_spec_decoding
)
if
not
skip_logprobs_check
:
assert
_all_logprobs_match
(
base_logprobs
,
test_logprobs
)
if
(
if
(
base_acceptance_rate
is
not
None
base_acceptance_rate
is
not
None
and
test_acceptance_rate
is
not
None
and
test_acceptance_rate
is
not
None
):
):
if
"spec_mml=None"
in
test_config
:
if
"spec_mml=None"
in
test_config
:
# Preemption causes more variance in acceptance rates
if
(
current_platform
.
is_rocm
()
and
"preemption=True"
in
test_config
):
tolerance
=
0.10
else
:
tolerance
=
0.05
assert
(
assert
(
test_acceptance_rate
>
base_acceptance_rate
test_acceptance_rate
>
base_acceptance_rate
or
test_acceptance_rate
or
test_acceptance_rate
==
pytest
.
approx
(
base_acceptance_rate
,
rel
=
5e-2
)
==
pytest
.
approx
(
base_acceptance_rate
,
rel
=
tolerance
)
)
)
else
:
else
:
# Currently the reported acceptance rate is expected to be
# Currently the reported acceptance rate is expected to be
...
@@ -215,6 +261,7 @@ def run_test(
...
@@ -215,6 +261,7 @@ def run_test(
async_scheduling
:
bool
,
async_scheduling
:
bool
,
spec_config
:
dict
[
str
,
Any
]
|
None
,
spec_config
:
dict
[
str
,
Any
]
|
None
,
test_prefill_chunking
:
bool
,
test_prefill_chunking
:
bool
,
is_testing_with_spec_decoding
:
bool
=
False
,
):
):
spec_decoding
=
spec_config
is
not
None
spec_decoding
=
spec_config
is
not
None
cache_arg
:
dict
[
str
,
Any
]
=
(
cache_arg
:
dict
[
str
,
Any
]
=
(
...
@@ -233,6 +280,15 @@ def run_test(
...
@@ -233,6 +280,15 @@ def run_test(
print
(
"-"
*
80
)
print
(
"-"
*
80
)
print
(
f
"---- TESTING
{
test_str
}
:
{
test_config
}
"
)
print
(
f
"---- TESTING
{
test_str
}
:
{
test_config
}
"
)
print
(
"-"
*
80
)
print
(
"-"
*
80
)
# On ROCm: use float16 for first test (ROCM_AITER_FA), but float32 for
# spec decoding test (TRITON_ATTN) for better precision.
# On others: always use float32.
if
current_platform
.
is_rocm
()
and
not
is_testing_with_spec_decoding
:
dtype
=
"float16"
else
:
dtype
=
"float32"
with
VllmRunner
(
with
VllmRunner
(
model
,
model
,
max_model_len
=
512
,
max_model_len
=
512
,
...
@@ -242,7 +298,7 @@ def run_test(
...
@@ -242,7 +298,7 @@ def run_test(
# enforce_eager=True,
# enforce_eager=True,
async_scheduling
=
async_scheduling
,
async_scheduling
=
async_scheduling
,
distributed_executor_backend
=
executor
,
distributed_executor_backend
=
executor
,
dtype
=
"float32"
,
# avoid precision errors
dtype
=
dtype
,
speculative_config
=
spec_config
,
speculative_config
=
spec_config
,
disable_log_stats
=
False
,
disable_log_stats
=
False
,
**
cache_arg
,
**
cache_arg
,
...
@@ -302,11 +358,21 @@ def _all_logprobs_match(req_a, req_b) -> bool:
...
@@ -302,11 +358,21 @@ def _all_logprobs_match(req_a, req_b) -> bool:
def
_logprobs_match
(
lps_a
:
dict
[
int
,
Logprob
],
lps_b
:
dict
[
int
,
Logprob
])
->
bool
:
def
_logprobs_match
(
lps_a
:
dict
[
int
,
Logprob
],
lps_b
:
dict
[
int
,
Logprob
])
->
bool
:
return
len
(
lps_a
)
==
len
(
lps_b
)
and
all
(
if
current_platform
.
is_rocm
():
a
.
decoded_token
==
b
.
decoded_token
# ROCm has higher numerical variance
and
a
.
rank
==
b
.
rank
# due to use of float16.
and
a
.
logprob
==
pytest
.
approx
(
b
.
logprob
,
rel
=
1e-3
,
abs
=
1e-6
)
rel_tol
,
abs_tol
=
5e-2
,
1e-5
for
a
,
b
in
((
lps_a
[
x
],
lps_b
[
x
])
for
x
in
lps_a
)
else
:
rel_tol
,
abs_tol
=
1e-3
,
1e-6
return
(
len
(
lps_a
)
==
len
(
lps_b
)
and
lps_a
.
keys
()
==
lps_b
.
keys
()
and
all
(
a
.
decoded_token
==
b
.
decoded_token
and
a
.
rank
==
b
.
rank
and
a
.
logprob
==
pytest
.
approx
(
b
.
logprob
,
rel
=
rel_tol
,
abs
=
abs_tol
)
for
a
,
b
in
((
lps_a
[
x
],
lps_b
[
x
])
for
x
in
lps_a
)
)
)
)
...
...
Prev
1
…
5
6
7
8
9
10
11
12
13
…
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment