Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
38d80967
Commit
38d80967
authored
Sep 12, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.10.2rc2' into v0.10.2rc2-ori
parents
33650733
880c741b
Changes
544
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
762 additions
and
223 deletions
+762
-223
tests/compile/piecewise/test_multiple_graphs.py
tests/compile/piecewise/test_multiple_graphs.py
+4
-26
tests/compile/piecewise/test_simple.py
tests/compile/piecewise/test_simple.py
+8
-35
tests/compile/piecewise/test_toy_llama.py
tests/compile/piecewise/test_toy_llama.py
+2
-25
tests/compile/silly_attention.py
tests/compile/silly_attention.py
+63
-0
tests/compile/test_basic_correctness.py
tests/compile/test_basic_correctness.py
+18
-15
tests/compile/test_decorator.py
tests/compile/test_decorator.py
+4
-27
tests/compile/test_fusion.py
tests/compile/test_fusion.py
+16
-11
tests/compile/test_fusion_attn.py
tests/compile/test_fusion_attn.py
+87
-42
tests/compile/test_silu_mul_quant_fusion.py
tests/compile/test_silu_mul_quant_fusion.py
+44
-26
tests/conftest.py
tests/conftest.py
+132
-0
tests/core/block/e2e/test_correctness.py
tests/core/block/e2e/test_correctness.py
+2
-2
tests/core/test_scheduler.py
tests/core/test_scheduler.py
+3
-2
tests/detokenizer/test_stop_string_while_stop_model_terminates.py
...tokenizer/test_stop_string_while_stop_model_terminates.py
+103
-0
tests/distributed/conftest.py
tests/distributed/conftest.py
+1
-1
tests/distributed/test_context_parallel.py
tests/distributed/test_context_parallel.py
+263
-0
tests/distributed/test_pipeline_parallel.py
tests/distributed/test_pipeline_parallel.py
+6
-0
tests/distributed/test_sequence_parallel.py
tests/distributed/test_sequence_parallel.py
+3
-0
tests/encoder_decoder/test_e2e_correctness.py
tests/encoder_decoder/test_e2e_correctness.py
+1
-0
tests/engine/test_arg_utils.py
tests/engine/test_arg_utils.py
+1
-10
tests/engine/test_executor.py
tests/engine/test_executor.py
+1
-1
No files found.
Too many changes to show.
To preserve performance only
544 of 544+
files are displayed.
Plain diff
Email patch
tests/compile/piecewise/test_multiple_graphs.py
View file @
38d80967
...
@@ -4,9 +4,9 @@
...
@@ -4,9 +4,9 @@
Test (piecewise) compilation with a simple model where multiple submodules
Test (piecewise) compilation with a simple model where multiple submodules
are compiled and graph captured separately.
are compiled and graph captured separately.
"""
"""
import
torch
import
torch
from
torch
import
nn
from
torch
import
nn
from
torch.library
import
Library
from
vllm.compilation.backends
import
set_model_tag
from
vllm.compilation.backends
import
set_model_tag
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.counter
import
compilation_counter
...
@@ -15,10 +15,9 @@ from vllm.compilation.decorators import (ignore_torch_compile,
...
@@ -15,10 +15,9 @@ from vllm.compilation.decorators import (ignore_torch_compile,
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
CUDAGraphMode
,
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
CUDAGraphMode
,
VllmConfig
,
set_current_vllm_config
)
VllmConfig
,
set_current_vllm_config
)
from
vllm.forward_context
import
BatchDescriptor
,
set_forward_context
from
vllm.forward_context
import
BatchDescriptor
,
set_forward_context
from
vllm.utils
import
direct_register_custom_op
#
create a library to hold the custom op
#
This import automatically registers `torch.ops.silly.attention`
silly_lib
=
Library
(
"silly"
,
"FRAGMENT"
)
# noqa
from
..
import
silly_attention
# noqa
: F401
BATCH_SIZE
=
32
BATCH_SIZE
=
32
MLP_SIZE
=
128
MLP_SIZE
=
128
...
@@ -26,27 +25,6 @@ HIDDEN_SIZE = 1024
...
@@ -26,27 +25,6 @@ HIDDEN_SIZE = 1024
RANDOM_SEED
=
0
RANDOM_SEED
=
0
def
silly_attention
(
q
:
torch
.
Tensor
,
k
:
torch
.
Tensor
,
v
:
torch
.
Tensor
,
out
:
torch
.
Tensor
)
->
None
:
out
.
copy_
(
q
)
out
+=
k
out
+=
v
def
silly_attention_fake
(
q
:
torch
.
Tensor
,
k
:
torch
.
Tensor
,
v
:
torch
.
Tensor
,
out
:
torch
.
Tensor
)
->
None
:
return
direct_register_custom_op
(
op_name
=
"attention"
,
op_func
=
silly_attention
,
mutates_args
=
[
"out"
],
fake_impl
=
silly_attention_fake
,
target_lib
=
silly_lib
,
)
@
support_torch_compile
@
support_torch_compile
class
ParentModel
(
nn
.
Module
):
class
ParentModel
(
nn
.
Module
):
...
@@ -134,7 +112,7 @@ class SimpleModelWithTwoGraphs(ParentModel):
...
@@ -134,7 +112,7 @@ class SimpleModelWithTwoGraphs(ParentModel):
# Test will fail without set_model_tag here with error:
# Test will fail without set_model_tag here with error:
# "ValueError: too many values to unpack (expected 3)"
# "ValueError: too many values to unpack (expected 3)"
# This is because CompiledAttention and CompiledAttentionTwo
# This is because CompiledAttention and CompiledAttentionTwo
# have different implmentations but the same torch.compile
# have different impl
e
mentations but the same torch.compile
# cache dir will be used as default prefix is 'model_tag'
# cache dir will be used as default prefix is 'model_tag'
with
set_model_tag
(
"attn_one"
):
with
set_model_tag
(
"attn_one"
):
self
.
attn_one
=
CompiledAttention
(
self
.
attn_one
=
CompiledAttention
(
...
...
tests/compile/piecewise/test_simple.py
View file @
38d80967
...
@@ -4,10 +4,10 @@
...
@@ -4,10 +4,10 @@
Test the piecewise compilation with a simple model so that we
Test the piecewise compilation with a simple model so that we
can exactly calculate the expected output and side effects.
can exactly calculate the expected output and side effects.
"""
"""
import
pytest
import
pytest
import
torch
import
torch
from
torch
import
nn
from
torch
import
nn
from
torch.library
import
Library
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
...
@@ -15,35 +15,9 @@ from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
...
@@ -15,35 +15,9 @@ from vllm.config import (CompilationConfig, CompilationLevel, CUDAGraphMode,
VllmConfig
,
set_current_vllm_config
)
VllmConfig
,
set_current_vllm_config
)
from
vllm.envs
import
VLLM_USE_V1
from
vllm.envs
import
VLLM_USE_V1
from
vllm.forward_context
import
BatchDescriptor
,
set_forward_context
from
vllm.forward_context
import
BatchDescriptor
,
set_forward_context
from
vllm.utils
import
direct_register_custom_op
global_counter
=
0
# create a library to hold the custom op
silly_lib
=
Library
(
"silly"
,
"FRAGMENT"
)
# noqa
def
silly_attention
(
q
:
torch
.
Tensor
,
k
:
torch
.
Tensor
,
v
:
torch
.
Tensor
,
out
:
torch
.
Tensor
)
->
None
:
global
global_counter
global_counter
+=
1
print
(
f
"
{
global_counter
=
}
"
)
out
.
copy_
(
q
)
out
[
0
]
+=
1
def
silly_attention_fake
(
q
:
torch
.
Tensor
,
k
:
torch
.
Tensor
,
v
:
torch
.
Tensor
,
out
:
torch
.
Tensor
)
->
None
:
return
direct_register_custom_op
(
# This import automatically registers `torch.ops.silly.attention`
op_name
=
"attention"
,
from
..silly_attention
import
get_global_counter
,
reset_global_counter
op_func
=
silly_attention
,
mutates_args
=
[
"out"
],
fake_impl
=
silly_attention_fake
,
target_lib
=
silly_lib
,
)
@
support_torch_compile
@
support_torch_compile
...
@@ -59,8 +33,7 @@ class SillyModel(nn.Module):
...
@@ -59,8 +33,7 @@ class SillyModel(nn.Module):
def
forward
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
def
forward
(
self
,
x
:
torch
.
Tensor
)
->
torch
.
Tensor
:
"""
"""
Overall effect:
Overall effect:
x += 1
x = 3 * x + 19
x[0] += 2
global_counter += 2
global_counter += 2
"""
"""
x
=
x
+
1
x
=
x
+
1
...
@@ -78,6 +51,7 @@ class SillyModel(nn.Module):
...
@@ -78,6 +51,7 @@ class SillyModel(nn.Module):
@
pytest
.
mark
.
parametrize
(
"use_inductor"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_inductor"
,
[
True
,
False
])
@
torch
.
inference_mode
()
def
test_simple_piecewise_compile
(
use_inductor
):
def
test_simple_piecewise_compile
(
use_inductor
):
assert
VLLM_USE_V1
assert
VLLM_USE_V1
...
@@ -121,13 +95,12 @@ def test_simple_piecewise_compile(use_inductor):
...
@@ -121,13 +95,12 @@ def test_simple_piecewise_compile(use_inductor):
model
(
torch
.
randn
(
1
).
cuda
())
model
(
torch
.
randn
(
1
).
cuda
())
input
=
torch
.
zeros
(
2
).
cuda
()
input
=
torch
.
zeros
(
2
).
cuda
()
global
global_counter
reset_global_counter
()
global_counter
=
0
with
set_forward_context
(
with
set_forward_context
(
None
,
None
,
vllm_config
=
vllm_config
,
vllm_config
=
vllm_config
,
cudagraph_runtime_mode
=
CUDAGraphMode
.
PIECEWISE
,
cudagraph_runtime_mode
=
CUDAGraphMode
.
PIECEWISE
,
batch_descriptor
=
BatchDescriptor
(
num_tokens
=
2
,
)):
batch_descriptor
=
BatchDescriptor
(
num_tokens
=
2
,
)):
output
=
model
(
input
)
output
=
model
(
input
)
assert
global_counter
==
2
assert
get_
global_counter
()
==
2
assert
torch
.
allclose
(
output
.
cpu
(),
torch
.
tensor
([
3.
,
1.
]))
assert
torch
.
allclose
(
output
.
cpu
(),
torch
.
tensor
([
19.0
,
19.0
]))
tests/compile/piecewise/test_toy_llama.py
View file @
38d80967
...
@@ -14,38 +14,15 @@ from typing import Any, Optional
...
@@ -14,38 +14,15 @@ from typing import Any, Optional
import
pytest
import
pytest
import
torch
import
torch
from
torch
import
nn
from
torch
import
nn
from
torch.library
import
Library
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.compilation.decorators
import
support_torch_compile
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
CUDAGraphMode
,
from
vllm.config
import
(
CompilationConfig
,
CompilationLevel
,
CUDAGraphMode
,
VllmConfig
,
set_current_vllm_config
)
VllmConfig
,
set_current_vllm_config
)
from
vllm.forward_context
import
BatchDescriptor
,
set_forward_context
from
vllm.forward_context
import
BatchDescriptor
,
set_forward_context
from
vllm.utils
import
direct_register_custom_op
# create a library to hold the custom op
# This import automatically registers `torch.ops.silly.attention`
silly_lib
=
Library
(
"silly"
,
"FRAGMENT"
)
# noqa
from
..
import
silly_attention
# noqa: F401
def
silly_attention
(
q
:
torch
.
Tensor
,
k
:
torch
.
Tensor
,
v
:
torch
.
Tensor
,
out
:
torch
.
Tensor
)
->
None
:
out
.
copy_
(
q
)
out
+=
k
out
+=
v
def
silly_attention_fake
(
q
:
torch
.
Tensor
,
k
:
torch
.
Tensor
,
v
:
torch
.
Tensor
,
out
:
torch
.
Tensor
)
->
None
:
return
direct_register_custom_op
(
op_name
=
"attention"
,
op_func
=
silly_attention
,
mutates_args
=
[
"out"
],
fake_impl
=
silly_attention_fake
,
target_lib
=
silly_lib
,
)
@
dataclass
@
dataclass
...
...
tests/compile/silly_attention.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Shared PyTorch custom silly attention for compilation tests.
Centralizes custom operation definitions to avoid duplicate registrations.
"""
import
torch
from
torch.library
import
Library
from
vllm.utils
import
direct_register_custom_op
# Shared library for all compilation test operations
# Using "silly" namespace to match existing test expectations
# Importing this file automatically registers
# torch ops for testing (like silly.attention)
silly_lib
=
Library
(
"silly"
,
"FRAGMENT"
)
# Global counter that counts the number of times attention is invoked
_global_counter
=
0
def get_global_counter() -> int:
    """Get the current global counter value.

    Returns the module-level ``_global_counter``, which ``silly_attention``
    increments by one on every call and ``reset_global_counter`` sets to 0.
    """
    return _global_counter
def reset_global_counter() -> None:
    """Reset the global counter to 0.

    Rebinds the module-level ``_global_counter`` so that a test can count
    attention invocations starting from a known value.
    """
    global _global_counter
    _global_counter = 0
def silly_attention(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                    out: torch.Tensor) -> None:
    """
    Unified attention implementation that depends on
    all inputs and affects the output.

    Always increments a global counter that tests can use or ignore.

    Args:
        q: First input tensor.
        k: Second input tensor.
        v: Third input tensor.
        out: Destination tensor; overwritten in place with ``q + k + v``.

    Returns:
        None. The result is delivered by mutating ``out``.
    """
    global _global_counter

    # Always increment the global counter
    _global_counter += 1

    # Unified implementation that depends on all inputs
    out.copy_(q + k + v)
def silly_attention_fake(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
                         out: torch.Tensor) -> None:
    """Fake implementation for testing.

    No-op counterpart of ``silly_attention`` with the same signature. It is
    the function passed as ``fake_impl`` when the op is registered; it
    ignores all of its arguments, leaves ``out`` untouched, and returns None.
    """
    return
# Register the unified attention operation
# The op is named "attention" and attached to silly_lib (the "silly"
# library fragment). "out" is declared as the only mutated argument,
# matching silly_attention, which writes its result into `out` in place.
direct_register_custom_op(
    op_name="attention",
    op_func=silly_attention,
    mutates_args=["out"],
    fake_impl=silly_attention_fake,
    target_lib=silly_lib,
)
tests/compile/test_basic_correctness.py
View file @
38d80967
...
@@ -23,7 +23,7 @@ class TestSetting:
...
@@ -23,7 +23,7 @@ class TestSetting:
fullgraph
:
bool
fullgraph
:
bool
# we cannot afford testing the full Catesian product
# we cannot afford testing the full Ca
r
tesian product
# of all models and all levels
# of all models and all levels
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"test_setting"
,
"test_setting"
,
...
@@ -62,8 +62,12 @@ class TestSetting:
...
@@ -62,8 +62,12 @@ class TestSetting:
TestSetting
(
TestSetting
(
model
=
"BAAI/bge-multilingual-gemma2"
,
model
=
"BAAI/bge-multilingual-gemma2"
,
model_args
=
[
model_args
=
[
"--runner"
,
"pooling"
,
"--dtype"
,
"bfloat16"
,
"--runner"
,
"--max-model-len"
,
"2048"
"pooling"
,
"--dtype"
,
"bfloat16"
,
"--max-model-len"
,
"2048"
,
],
],
pp_size
=
1
,
pp_size
=
1
,
tp_size
=
1
,
tp_size
=
1
,
...
@@ -71,17 +75,15 @@ class TestSetting:
...
@@ -71,17 +75,15 @@ class TestSetting:
method
=
"encode"
,
method
=
"encode"
,
fullgraph
=
True
,
fullgraph
=
True
,
),
),
# TODO: bert models are not supported in V1 yet
TestSetting
(
# # encoder-based embedding model (BERT)
model
=
"BAAI/bge-base-en-v1.5"
,
# TestSetting(
model_args
=
[
"--runner"
,
"pooling"
],
# model="BAAI/bge-base-en-v1.5",
pp_size
=
1
,
# model_args=["--runner", "pooling"],
tp_size
=
1
,
# pp_size=1,
attn_backend
=
"FLASH_ATTN"
,
# tp_size=1,
method
=
"encode"
,
# attn_backend="XFORMERS",
fullgraph
=
True
,
# method="encode",
),
# fullgraph=True,
# ),
# vision language model
# vision language model
TestSetting
(
TestSetting
(
model
=
"microsoft/Phi-3.5-vision-instruct"
,
model
=
"microsoft/Phi-3.5-vision-instruct"
,
...
@@ -92,7 +94,8 @@ class TestSetting:
...
@@ -92,7 +94,8 @@ class TestSetting:
method
=
"generate_with_image"
,
method
=
"generate_with_image"
,
fullgraph
=
False
,
fullgraph
=
False
,
),
),
])
],
)
def
test_compile_correctness
(
def
test_compile_correctness
(
monkeypatch
:
pytest
.
MonkeyPatch
,
monkeypatch
:
pytest
.
MonkeyPatch
,
test_setting
:
TestSetting
,
test_setting
:
TestSetting
,
...
...
tests/compile/test_decorator.py
View file @
38d80967
...
@@ -2,7 +2,6 @@
...
@@ -2,7 +2,6 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
torch
import
torch
from
torch
import
nn
from
torch
import
nn
from
torch.library
import
Library
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.counter
import
compilation_counter
from
vllm.compilation.decorators
import
(
ignore_torch_compile
,
from
vllm.compilation.decorators
import
(
ignore_torch_compile
,
...
@@ -10,36 +9,14 @@ from vllm.compilation.decorators import (ignore_torch_compile,
...
@@ -10,36 +9,14 @@ from vllm.compilation.decorators import (ignore_torch_compile,
from
vllm.config
import
(
CacheConfig
,
CompilationConfig
,
CompilationLevel
,
from
vllm.config
import
(
CacheConfig
,
CompilationConfig
,
CompilationLevel
,
CUDAGraphMode
,
VllmConfig
,
set_current_vllm_config
)
CUDAGraphMode
,
VllmConfig
,
set_current_vllm_config
)
from
vllm.forward_context
import
BatchDescriptor
,
set_forward_context
from
vllm.forward_context
import
BatchDescriptor
,
set_forward_context
from
vllm.utils
import
direct_register_custom_op
#
create a library to hold the custom op
#
This import automatically registers `torch.ops.silly.attention`
silly_lib
=
Library
(
"silly"
,
"FRAGMENT"
)
# noqa
from
.
import
silly_attention
# noqa
: F401
BATCH_SIZE
=
32
BATCH_SIZE
=
32
MLP_SIZE
=
128
MLP_SIZE
=
128
def
silly_attention
(
q
:
torch
.
Tensor
,
k
:
torch
.
Tensor
,
v
:
torch
.
Tensor
,
out
:
torch
.
Tensor
)
->
None
:
out
.
copy_
(
q
)
out
+=
k
out
+=
v
def
silly_attention_fake
(
q
:
torch
.
Tensor
,
k
:
torch
.
Tensor
,
v
:
torch
.
Tensor
,
out
:
torch
.
Tensor
)
->
None
:
return
direct_register_custom_op
(
op_name
=
"attention"
,
op_func
=
silly_attention
,
mutates_args
=
[
"out"
],
fake_impl
=
silly_attention_fake
,
target_lib
=
silly_lib
,
)
@
torch
.
inference_mode
@
torch
.
inference_mode
def
run_model
(
vllm_config
:
VllmConfig
,
model
:
nn
.
Module
,
def
run_model
(
vllm_config
:
VllmConfig
,
model
:
nn
.
Module
,
cudagraph_runtime_mode
:
CUDAGraphMode
):
cudagraph_runtime_mode
:
CUDAGraphMode
):
...
@@ -151,7 +128,7 @@ def test_ignore_torch_compile_decorator():
...
@@ -151,7 +128,7 @@ def test_ignore_torch_compile_decorator():
run_model
(
vllm_config
,
mod_C
,
cudagraph_runtime_mode
)
run_model
(
vllm_config
,
mod_C
,
cudagraph_runtime_mode
)
#
Only enable torch.compile if
#
Only enable torch.compile if
# vllm_config.cache_config.kv_sharing_fast_prefill=True
# vllm_config.cache_config.kv_sharing_fast_prefill=True
@
support_torch_compile
(
enable_if
=
lambda
vllm_config
:
vllm_config
.
cache_config
.
@
support_torch_compile
(
enable_if
=
lambda
vllm_config
:
vllm_config
.
cache_config
.
kv_sharing_fast_prefill
)
kv_sharing_fast_prefill
)
...
@@ -173,7 +150,7 @@ class B(nn.Module):
...
@@ -173,7 +150,7 @@ class B(nn.Module):
return
x
return
x
#
Only enable torch.compile if
#
Only enable torch.compile if
# vllm_config.cache_config.kv_sharing_fast_prefill=False
# vllm_config.cache_config.kv_sharing_fast_prefill=False
@
support_torch_compile
(
enable_if
=
lambda
vllm_config
:
not
vllm_config
.
@
support_torch_compile
(
enable_if
=
lambda
vllm_config
:
not
vllm_config
.
cache_config
.
kv_sharing_fast_prefill
)
cache_config
.
kv_sharing_fast_prefill
)
...
...
tests/compile/test_fusion.py
View file @
38d80967
...
@@ -15,9 +15,10 @@ from vllm.model_executor.layers.layernorm import RMSNorm
...
@@ -15,9 +15,10 @@ from vllm.model_executor.layers.layernorm import RMSNorm
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
GroupShape
,
QuantKey
,
ScaleDesc
)
GroupShape
,
QuantKey
,
ScaleDesc
)
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
Fp8LinearOp
,
maybe_create_device_identity
)
Fp8LinearOp
,
cutlass_fp8_supported
,
maybe_create_device_identity
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
..utils
import
override_cutlass_fp8_supported
from
.backend
import
TestBackend
from
.backend
import
TestBackend
FP8_DTYPE
=
current_platform
.
fp8_dtype
()
FP8_DTYPE
=
current_platform
.
fp8_dtype
()
...
@@ -26,9 +27,9 @@ FP8_DTYPE = current_platform.fp8_dtype()
...
@@ -26,9 +27,9 @@ FP8_DTYPE = current_platform.fp8_dtype()
class
TestModel
(
torch
.
nn
.
Module
):
class
TestModel
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
hidden_size
:
int
,
eps
:
float
,
static
:
bool
,
def
__init__
(
self
,
hidden_size
:
int
,
eps
:
float
,
static
:
bool
,
force_
fp8_e4m3fnuz
:
bool
,
*
args
,
**
kwargs
):
cuda_
force_
torch
:
bool
,
*
args
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
)
super
().
__init__
(
*
args
,
**
kwargs
)
self
.
force_
fp8_e4m3fnuz
=
force_fp8_e4m3fnuz
self
.
cuda_
force_
torch
=
cuda_force_torch
self
.
norm
=
[
RMSNorm
(
hidden_size
,
eps
)
for
_
in
range
(
3
)]
self
.
norm
=
[
RMSNorm
(
hidden_size
,
eps
)
for
_
in
range
(
3
)]
self
.
wscale
=
[
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
for
_
in
range
(
2
)]
self
.
wscale
=
[
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
for
_
in
range
(
2
)]
group_shape
=
GroupShape
.
PER_TENSOR
if
static
else
GroupShape
.
PER_TOKEN
group_shape
=
GroupShape
.
PER_TENSOR
if
static
else
GroupShape
.
PER_TOKEN
...
@@ -42,11 +43,12 @@ class TestModel(torch.nn.Module):
...
@@ -42,11 +43,12 @@ class TestModel(torch.nn.Module):
torch
.
rand
(
hidden_size
,
hidden_size
).
to
(
dtype
=
FP8_DTYPE
).
t
()
torch
.
rand
(
hidden_size
,
hidden_size
).
to
(
dtype
=
FP8_DTYPE
).
t
()
for
_
in
range
(
2
)
for
_
in
range
(
2
)
]
]
self
.
fp8_linear
=
Fp8LinearOp
(
force_fp8_e4m3fnuz
=
force_fp8_e4m3fnuz
,
with
override_cutlass_fp8_supported
(
not
cuda_force_torch
):
act_quant_static
=
static
,
self
.
fp8_linear
=
Fp8LinearOp
(
act_quant_group_shape
=
group_shape
,
act_quant_static
=
static
,
)
act_quant_group_shape
=
group_shape
,
)
def
forward
(
self
,
x
):
def
forward
(
self
,
x
):
resid
=
torch
.
sqrt
(
x
)
resid
=
torch
.
sqrt
(
x
)
...
@@ -81,11 +83,14 @@ class TestModel(torch.nn.Module):
...
@@ -81,11 +83,14 @@ class TestModel(torch.nn.Module):
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
[
7
,
256
,
533
,
2048
,
2049
])
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
[
7
,
256
,
533
,
2048
,
2049
])
@
pytest
.
mark
.
parametrize
(
"eps"
,
[
1e-5
,
1e-6
])
@
pytest
.
mark
.
parametrize
(
"eps"
,
[
1e-5
,
1e-6
])
@
pytest
.
mark
.
parametrize
(
"static"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"static"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"force_fp8_e4m3fnuz"
,
[
True
,
False
])
# cuda_force_torch is used to test the torch code path on platforms where
# cutlass_fp8_supported() == True.
@
pytest
.
mark
.
parametrize
(
"cuda_force_torch"
,
[
True
,
False
]
if
cutlass_fp8_supported
()
else
[
True
])
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_TARGET_DEVICE
not
in
[
"cuda"
,
"rocm"
],
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_TARGET_DEVICE
not
in
[
"cuda"
,
"rocm"
],
reason
=
"Only test on CUDA and ROCm"
)
reason
=
"Only test on CUDA and ROCm"
)
def
test_fusion_rmsnorm_quant
(
dtype
,
hidden_size
,
num_tokens
,
eps
,
static
,
def
test_fusion_rmsnorm_quant
(
dtype
,
hidden_size
,
num_tokens
,
eps
,
static
,
force_
fp8_e4m3fnuz
):
cuda_
force_
torch
):
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_dtype
(
dtype
)
torch
.
set_default_dtype
(
dtype
)
torch
.
manual_seed
(
1
)
torch
.
manual_seed
(
1
)
...
@@ -102,7 +107,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
...
@@ -102,7 +107,7 @@ def test_fusion_rmsnorm_quant(dtype, hidden_size, num_tokens, eps, static,
fusion_pass
=
FusionPass
.
instance
(
vllm_config
)
fusion_pass
=
FusionPass
.
instance
(
vllm_config
)
backend
=
TestBackend
(
noop_pass
,
fusion_pass
)
backend
=
TestBackend
(
noop_pass
,
fusion_pass
)
model
=
TestModel
(
hidden_size
,
eps
,
static
,
force_
fp8_e4m3fnuz
)
model
=
TestModel
(
hidden_size
,
eps
,
static
,
cuda_
force_
torch
)
# First dimension dynamic
# First dimension dynamic
x
=
torch
.
rand
(
num_tokens
,
hidden_size
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
)
...
...
tests/compile/test_fusion_attn.py
View file @
38d80967
...
@@ -40,13 +40,12 @@ backend_unfused: Optional[TestBackend] = None
...
@@ -40,13 +40,12 @@ backend_unfused: Optional[TestBackend] = None
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model, quant_key"
,
"model, quant_key"
,
[(
"amd/Llama-3.1-8B-Instruct-FP8-KV"
,
kFp8StaticTensorSym
)])
[(
"amd/Llama-3.1-8B-Instruct-FP8-KV"
,
kFp8StaticTensorSym
)])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"use_triton_fa"
,
[
True
,
False
])
"use_triton_fa"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
])
@
pytest
.
mark
.
skipif
(
not
current_platform
.
supports_fp8
(),
reason
=
"Need FP8"
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
supports_fp8
(),
reason
=
"Need FP8"
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_
cuda_alike
(),
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_
rocm
(),
reason
=
"
Only test CUDA and
ROCm"
)
reason
=
"
V0 attn quant fusion only on
ROCm"
)
def
test_attention_fusion
(
example_prompts
,
monkeypatch
,
model
:
str
,
def
test_attention_fusion
_v0
(
example_prompts
,
monkeypatch
,
model
:
str
,
quant_key
:
QuantKey
,
use_triton_fa
:
bool
):
quant_key
:
QuantKey
,
use_triton_fa
:
bool
):
# Clean Dynamo cache to avoid reusing other test cases
# Clean Dynamo cache to avoid reusing other test cases
# (for some reason the reset at the end is not enough)
# (for some reason the reset at the end is not enough)
torch
.
_dynamo
.
reset
()
torch
.
_dynamo
.
reset
()
...
@@ -69,13 +68,17 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
...
@@ -69,13 +68,17 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
backend
=
"tests.compile.test_fusion_attn.backend_unfused"
,
backend
=
"tests.compile.test_fusion_attn.backend_unfused"
,
custom_ops
=
[
"+quant_fp8"
],
custom_ops
=
[
"+quant_fp8"
],
)
)
vllm_config
=
VllmConfig
(
compilation_config
=
compile_config
)
vllm_config
=
VllmConfig
(
compilation_config
=
compile_config
,
model_config
=
ModelConfig
(
model
=
model
,
dtype
=
torch
.
bfloat16
,
))
backend_unfused
=
TestBackend
(
NoOpEliminationPass
(
vllm_config
))
backend_unfused
=
TestBackend
(
NoOpEliminationPass
(
vllm_config
))
llm
=
LLM
(
model
,
llm
=
LLM
(
model
,
enforce_eager
=
True
,
enforce_eager
=
True
,
compilation_config
=
compile_config
,
compilation_config
=
compile_config
,
gpu_memory_utilization
=
0.
9
,
gpu_memory_utilization
=
0.
5
,
max_model_len
=
2048
)
max_model_len
=
2048
)
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
sampling_params
=
SamplingParams
(
temperature
=
0.0
,
...
@@ -93,7 +96,11 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
...
@@ -93,7 +96,11 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
backend
=
"tests.compile.test_fusion_attn.backend"
,
backend
=
"tests.compile.test_fusion_attn.backend"
,
custom_ops
=
[
"+quant_fp8"
],
custom_ops
=
[
"+quant_fp8"
],
)
)
vllm_config
=
VllmConfig
(
compilation_config
=
compile_config
)
vllm_config
=
VllmConfig
(
compilation_config
=
compile_config
,
model_config
=
ModelConfig
(
model
=
model
,
dtype
=
torch
.
bfloat16
,
))
# AttnFusionPass needs attention layers to be registered in config upon init
# AttnFusionPass needs attention layers to be registered in config upon init
# so we initialize it during compilation.
# so we initialize it during compilation.
...
@@ -102,7 +109,7 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
...
@@ -102,7 +109,7 @@ def test_attention_fusion(example_prompts, monkeypatch, model: str,
llm2
=
LLM
(
model
,
llm2
=
LLM
(
model
,
enforce_eager
=
True
,
enforce_eager
=
True
,
compilation_config
=
compile_config
,
compilation_config
=
compile_config
,
gpu_memory_utilization
=
0.
9
,
gpu_memory_utilization
=
0.
5
,
max_model_len
=
2048
)
max_model_len
=
2048
)
# check support
# check support
...
@@ -171,6 +178,8 @@ class AttentionQuantPatternModel(torch.nn.Module):
...
@@ -171,6 +178,8 @@ class AttentionQuantPatternModel(torch.nn.Module):
cache_config
=
vllm_config
.
cache_config
,
cache_config
=
vllm_config
.
cache_config
,
prefix
=
"model.layers.0.self_attn.attn"
,
prefix
=
"model.layers.0.self_attn.attn"
,
)
)
self
.
attn
.
_k_scale
=
self
.
attn
.
_k_scale
.
to
(
device
)
self
.
attn
.
_v_scale
=
self
.
attn
.
_v_scale
.
to
(
device
)
self
.
block_size
=
16
self
.
block_size
=
16
...
@@ -188,7 +197,7 @@ class AttentionQuantPatternModel(torch.nn.Module):
...
@@ -188,7 +197,7 @@ class AttentionQuantPatternModel(torch.nn.Module):
device
=
self
.
device
,
device
=
self
.
device
,
)
)
def
build_attn_metadata
(
self
,
batch_size
:
int
):
def
build_attn_metadata
(
self
,
batch_size
:
int
,
use_hnd
:
bool
):
"""Initialize attention metadata."""
"""Initialize attention metadata."""
# Create common attn metadata
# Create common attn metadata
...
@@ -205,10 +214,8 @@ class AttentionQuantPatternModel(torch.nn.Module):
...
@@ -205,10 +214,8 @@ class AttentionQuantPatternModel(torch.nn.Module):
num_blocks
=
batch_size
*
max_blocks
num_blocks
=
batch_size
*
max_blocks
# Create dummy KV cache for FlashInfer TRTLLM
# Create dummy KV cache for FlashInfer TRTLLM
# - NHD: [num_blocks, 2, block_size, num_kv_heads, head_size]
# - NHD: [num_blocks, block_size, num_kv_heads, head_size]
# - HND: [num_blocks, 2, num_kv_heads, block_size, head_size]
# - HND: [num_blocks, num_kv_heads, block_size, head_size]
# Create kv_cache in HND layout and permute to NHD layout
# (later will be permuted back to HND layout in forward pass)
kv_cache
=
torch
.
zeros
(
num_blocks
,
kv_cache
=
torch
.
zeros
(
num_blocks
,
2
,
2
,
self
.
num_kv_heads
,
self
.
num_kv_heads
,
...
@@ -216,7 +223,17 @@ class AttentionQuantPatternModel(torch.nn.Module):
...
@@ -216,7 +223,17 @@ class AttentionQuantPatternModel(torch.nn.Module):
self
.
head_size
,
self
.
head_size
,
dtype
=
self
.
kv_cache_dtype
,
dtype
=
self
.
kv_cache_dtype
,
device
=
self
.
device
)
device
=
self
.
device
)
kv_cache
=
kv_cache
.
permute
(
0
,
1
,
3
,
2
,
4
)
if
current_platform
.
is_rocm
():
# k/v as 1st dimension
if
use_hnd
:
kv_cache
=
kv_cache
.
permute
(
1
,
0
,
2
,
3
,
4
)
else
:
kv_cache
=
kv_cache
.
permute
(
1
,
0
,
3
,
2
,
4
)
else
:
# k/v as 2nd dimension
# Create kv_cache in HND layout and permute to NHD layout
# (later will be permuted back to HND layout in forward pass)
kv_cache
=
kv_cache
.
permute
(
0
,
1
,
3
,
2
,
4
)
self
.
attn
.
kv_cache
=
[
kv_cache
]
self
.
attn
.
kv_cache
=
[
kv_cache
]
# Build attn metadata
# Build attn metadata
...
@@ -296,28 +313,51 @@ class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
...
@@ -296,28 +313,51 @@ class TestAttentionNvfp4QuantPatternModel(AttentionQuantPatternModel):
out_dtype
=
attn_output
.
dtype
)
out_dtype
=
attn_output
.
dtype
)
@
pytest
.
mark
.
parametrize
(
"num_qo_heads, num_kv_heads"
,
[(
64
,
8
),
(
40
,
8
)])
if
current_platform
.
is_cuda
():
MODELS
=
[(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
,
TestAttentionFp8StaticQuantPatternModel
),
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP4"
,
TestAttentionNvfp4QuantPatternModel
)]
HEADS
=
[(
64
,
8
),
(
40
,
8
)]
elif
current_platform
.
is_rocm
():
MODELS
=
[(
"amd/Llama-3.1-8B-Instruct-FP8-KV"
,
TestAttentionFp8StaticQuantPatternModel
)]
HEADS
=
[(
32
,
8
),
(
40
,
8
)]
else
:
MODELS
=
[]
HEADS
=
[]
@
pytest
.
mark
.
parametrize
(
"num_qo_heads, num_kv_heads"
,
HEADS
)
@
pytest
.
mark
.
parametrize
(
"head_size"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"head_size"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
[
7
,
256
,
533
])
@
pytest
.
mark
.
parametrize
(
"batch_size"
,
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
bfloat16
])
[
7
,
256
,
533
]
if
current_platform
.
is_cuda
()
else
[
8
])
@
pytest
.
mark
.
parametrize
(
"model_name, model_class"
,
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
bfloat16
,
torch
.
float16
])
[(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP8"
,
@
pytest
.
mark
.
parametrize
(
"model_name, model_class"
,
MODELS
)
TestAttentionFp8StaticQuantPatternModel
),
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
_Backend
.
FLASHINFER
]
if
(
"nvidia/Llama-4-Scout-17B-16E-Instruct-FP4"
,
current_platform
.
is_cuda
()
else
[
_Backend
.
ROCM_FLASH
])
TestAttentionNvfp4QuantPatternModel
)])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"backend"
,
[
_Backend
.
FLASHINFER
])
"split_attention"
,
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda
(),
reason
=
"Only test CUDA"
)
[
False
,
True
]
if
current_platform
.
is_rocm
()
else
[
False
])
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda_alike
(),
reason
=
"Only test ROCm or CUDA"
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
supports_fp8
(),
reason
=
"Need FP8"
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
supports_fp8
(),
reason
=
"Need FP8"
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_device_capability
((
10
,
0
)),
@
pytest
.
mark
.
skipif
(
current_platform
.
is_cuda
()
reason
=
"Only test on SM100(Blackwell)"
)
and
not
current_platform
.
is_device_capability
((
10
,
0
)),
reason
=
"On CUDA only test on SM100(Blackwell)"
)
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda_alike
(),
reason
=
"Only test ROCm or CUDA"
)
def
test_attention_quant_pattern
(
num_qo_heads
:
int
,
num_kv_heads
:
int
,
def
test_attention_quant_pattern
(
num_qo_heads
:
int
,
num_kv_heads
:
int
,
head_size
:
int
,
batch_size
:
int
,
head_size
:
int
,
batch_size
:
int
,
dtype
:
torch
.
dtype
,
model_name
:
str
,
dtype
:
torch
.
dtype
,
model_name
:
str
,
model_class
:
type
[
AttentionQuantPatternModel
],
model_class
:
type
[
AttentionQuantPatternModel
],
backend
:
_Backend
,
monkeypatch
,
dist_init
):
backend
:
_Backend
,
split_attention
:
bool
,
monkeypatch
,
dist_init
):
"""Test AttentionStaticQuantPattern fusion pass"""
"""Test AttentionStaticQuantPattern fusion pass"""
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
monkeypatch
.
setenv
(
"VLLM_USE_V1"
,
"1"
)
if
split_attention
:
monkeypatch
.
setenv
(
"VLLM_V1_USE_PREFILL_DECODE_ATTENTION"
,
"1"
)
device
=
torch
.
device
(
"cuda:0"
)
device
=
torch
.
device
(
"cuda:0"
)
torch
.
manual_seed
(
42
)
torch
.
manual_seed
(
42
)
...
@@ -326,6 +366,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
...
@@ -326,6 +366,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
model_config
=
ModelConfig
(
model_config
=
ModelConfig
(
model
=
model_name
,
model
=
model_name
,
max_model_len
=
2048
,
max_model_len
=
2048
,
dtype
=
dtype
,
),
),
scheduler_config
=
SchedulerConfig
(
max_num_seqs
=
1024
),
scheduler_config
=
SchedulerConfig
(
max_num_seqs
=
1024
),
compilation_config
=
CompilationConfig
(
compilation_config
=
CompilationConfig
(
...
@@ -368,7 +409,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
...
@@ -368,7 +409,7 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
forward_ctx
=
get_forward_context
()
forward_ctx
=
get_forward_context
()
forward_ctx
.
attn_metadata
=
model_unfused
.
build_attn_metadata
(
forward_ctx
.
attn_metadata
=
model_unfused
.
build_attn_metadata
(
batch_size
)
batch_size
,
use_hnd
=
split_attention
)
# Run model directly without compilation and fusion
# Run model directly without compilation and fusion
result_unfused
=
model_unfused
(
q
,
k
,
v
)
result_unfused
=
model_unfused
(
q
,
k
,
v
)
...
@@ -389,7 +430,8 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
...
@@ -389,7 +430,8 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
model_fused
=
model_fused
.
to
(
device
)
model_fused
=
model_fused
.
to
(
device
)
forward_ctx
=
get_forward_context
()
forward_ctx
=
get_forward_context
()
forward_ctx
.
attn_metadata
=
model_fused
.
build_attn_metadata
(
batch_size
)
forward_ctx
.
attn_metadata
=
model_fused
.
build_attn_metadata
(
batch_size
,
use_hnd
=
split_attention
)
# Create test backend with fusion passes enabled
# Create test backend with fusion passes enabled
noop_pass
=
NoOpEliminationPass
(
vllm_config
)
noop_pass
=
NoOpEliminationPass
(
vllm_config
)
...
@@ -404,12 +446,19 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
...
@@ -404,12 +446,19 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
assert
model_compiled
.
attn
.
_o_scale_float
is
None
assert
model_compiled
.
attn
.
_o_scale_float
is
None
result_fused_1
=
model_compiled
(
q
,
k
,
v
)
result_fused_1
=
model_compiled
(
q
,
k
,
v
)
# After the 1st round of the forward pass, output quant scale should be
if
backend
==
_Backend
.
FLASHINFER
:
# loaded into the attn layer's _o_scale_float, the 2nd round should
# With the Flashinfer backend after the 1st round of the forward
# reuse the loaded _o_scale_float
# pass, output quant scale should be loaded into the attn layer's
assert
model_compiled
.
attn
.
_o_scale_float
is
not
None
# _o_scale_float, the 2nd round should reuse the loaded
result_fused_2
=
model_compiled
(
q
,
k
,
v
)
# _o_scale_float
assert
model_compiled
.
attn
.
_o_scale_float
is
not
None
assert
model_compiled
.
attn
.
_o_scale_float
is
not
None
result_fused_2
=
model_compiled
(
q
,
k
,
v
)
assert
model_compiled
.
attn
.
_o_scale_float
is
not
None
torch
.
testing
.
assert_close
(
result_unfused
,
result_fused_2
,
atol
=
1e-2
,
rtol
=
1e-2
)
# Check attn fusion support
# Check attn fusion support
quant_key
=
model_class
.
quant_key
quant_key
=
model_class
.
quant_key
...
@@ -444,12 +493,8 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
...
@@ -444,12 +493,8 @@ def test_attention_quant_pattern(num_qo_heads: int, num_kv_heads: int,
assert
attn_nodes_post
[
0
].
kwargs
.
get
(
"output_block_scale"
)
is
not
None
,
\
assert
attn_nodes_post
[
0
].
kwargs
.
get
(
"output_block_scale"
)
is
not
None
,
\
"Attention should have output_block_scale after FP4 fusion"
# noqa: E501
"Attention should have output_block_scale after FP4 fusion"
# noqa: E501
# Check that results are close
d
# Check that results are close
torch
.
testing
.
assert_close
(
result_unfused
,
torch
.
testing
.
assert_close
(
result_unfused
,
result_fused_1
,
result_fused_1
,
atol
=
1e-2
,
atol
=
1e-2
,
rtol
=
1e-2
)
rtol
=
1e-2
)
torch
.
testing
.
assert_close
(
result_unfused
,
result_fused_2
,
atol
=
1e-2
,
rtol
=
1e-2
)
tests/compile/test_silu_mul_quant_fusion.py
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
cast
import
pytest
import
pytest
import
torch
import
torch
import
vllm.envs
as
envs
import
vllm.envs
as
envs
from
tests.kernels.quantization.nvfp4_utils
import
quant_nvfp4_tensor
from
vllm._custom_ops
import
cutlass_scaled_fp4_mm
,
scaled_fp4_quant
from
vllm._custom_ops
import
cutlass_scaled_fp4_mm
,
scaled_fp4_quant
# yapf conflicts with isort for this block
# yapf conflicts with isort for this block
# yapf: disable
# yapf: disable
...
@@ -17,9 +20,10 @@ from vllm.model_executor.layers.activation import SiluAndMul
...
@@ -17,9 +20,10 @@ from vllm.model_executor.layers.activation import SiluAndMul
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
from
vllm.model_executor.layers.quantization.utils.quant_utils
import
(
GroupShape
,
kFp8StaticTensorSym
,
kNvfp4Quant
)
GroupShape
,
kFp8StaticTensorSym
,
kNvfp4Quant
)
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
from
vllm.model_executor.layers.quantization.utils.w8a8_utils
import
(
Fp8LinearOp
)
Fp8LinearOp
,
cutlass_fp8_supported
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
..utils
import
override_cutlass_fp8_supported
from
.backend
import
TestBackend
from
.backend
import
TestBackend
FP8_DTYPE
=
current_platform
.
fp8_dtype
()
FP8_DTYPE
=
current_platform
.
fp8_dtype
()
...
@@ -32,7 +36,7 @@ def is_nvfp4_supported():
...
@@ -32,7 +36,7 @@ def is_nvfp4_supported():
class
TestSiluMulFp8QuantModel
(
torch
.
nn
.
Module
):
class
TestSiluMulFp8QuantModel
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
hidden_size
:
int
,
force_
fp8_e4m3fnuz
:
bool
,
**
kwargs
):
def
__init__
(
self
,
hidden_size
:
int
,
cuda_
force_
torch
:
bool
,
**
kwargs
):
super
().
__init__
()
super
().
__init__
()
self
.
silu_and_mul
=
SiluAndMul
()
self
.
silu_and_mul
=
SiluAndMul
()
self
.
wscale
=
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
self
.
wscale
=
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
...
@@ -40,11 +44,11 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
...
@@ -40,11 +44,11 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
self
.
w
=
torch
.
rand
(
hidden_size
,
hidden_size
).
to
(
dtype
=
FP8_DTYPE
).
t
()
self
.
w
=
torch
.
rand
(
hidden_size
,
hidden_size
).
to
(
dtype
=
FP8_DTYPE
).
t
()
self
.
fp8_linear
=
Fp8LinearOp
(
with
override_cutlass_fp8_supported
(
not
cuda_force_torch
):
force_fp8_e4m3fnuz
=
force_fp8_e4m3fnuz
,
self
.
fp8_linear
=
Fp8LinearOp
(
act_quant_static
=
True
,
act_quant_static
=
True
,
act_quant_group_shape
=
GroupShape
.
PER_TENSOR
,
act_quant_group_shape
=
GroupShape
.
PER_TENSOR
,
)
)
def
forward
(
self
,
x
):
def
forward
(
self
,
x
):
y
=
self
.
silu_and_mul
(
x
)
y
=
self
.
silu_and_mul
(
x
)
...
@@ -63,24 +67,27 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
...
@@ -63,24 +67,27 @@ class TestSiluMulFp8QuantModel(torch.nn.Module):
class
TestSiluMulNvfp4QuantModel
(
torch
.
nn
.
Module
):
class
TestSiluMulNvfp4QuantModel
(
torch
.
nn
.
Module
):
def
__init__
(
self
,
hidden_size
:
int
,
**
kwargs
):
def
__init__
(
self
,
hidden_size
:
int
,
x
:
torch
.
Tensor
,
**
kwargs
):
super
().
__init__
()
super
().
__init__
()
self
.
silu_and_mul
=
SiluAndMul
()
self
.
silu_and_mul
=
SiluAndMul
()
self
.
w
=
torch
.
randint
(
256
,
(
hidden_size
,
hidden_size
//
2
),
dtype
=
FP4_DTYPE
)
# create nvfp4 weight
self
.
wscale
=
torch
.
randn
(
hidden_size
,
w
=
torch
.
rand
((
hidden_size
,
hidden_size
))
hidden_size
//
16
).
to
(
dtype
=
FP8_DTYPE
)
self
.
w
,
self
.
w_block_scale
,
self
.
w_global_scale
=
quant_nvfp4_tensor
(
w
)
self
.
wscale2
=
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
self
.
scale
=
torch
.
rand
(
1
,
dtype
=
torch
.
float32
)
# get global scale offline
_
,
_
,
self
.
y_global_scale
=
quant_nvfp4_tensor
(
self
.
silu_and_mul
(
x
))
self
.
alpha
=
1.0
/
(
self
.
w_global_scale
*
self
.
y_global_scale
)
def
forward
(
self
,
x
):
def
forward
(
self
,
x
):
y
=
self
.
silu_and_mul
(
x
)
y
=
self
.
silu_and_mul
(
x
)
y_quant
,
y_block_scale
=
scaled_fp4_quant
(
y
,
1
/
self
.
scale
)
y_quant
,
y_block_scale
=
scaled_fp4_quant
(
y
,
self
.
y_global_
scale
)
out
=
cutlass_scaled_fp4_mm
(
a
=
y_quant
,
out
=
cutlass_scaled_fp4_mm
(
a
=
y_quant
,
b
=
self
.
w
,
b
=
self
.
w
,
block_scale_a
=
y_block_scale
,
block_scale_a
=
y_block_scale
,
block_scale_b
=
self
.
wscale
,
block_scale_b
=
self
.
w
_block_
scale
,
alpha
=
self
.
scale
*
self
.
wscale2
,
alpha
=
self
.
alpha
,
out_dtype
=
y
.
dtype
)
out_dtype
=
y
.
dtype
)
return
out
return
out
...
@@ -94,19 +101,25 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
...
@@ -94,19 +101,25 @@ class TestSiluMulNvfp4QuantModel(torch.nn.Module):
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
[
64
])
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
[
128
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"model_class"
,
[
TestSiluMulFp8QuantModel
,
TestSiluMulNvfp4QuantModel
]
"model_class"
,
if
is_nvfp4_supported
()
else
[
TestSiluMulFp8QuantModel
])
cast
(
list
[
type
],
[
TestSiluMulFp8QuantModel
,
TestSiluMulNvfp4QuantModel
]
@
pytest
.
mark
.
parametrize
(
"force_fp8_e4m3fnuz"
,
[
True
,
False
])
if
is_nvfp4_supported
()
else
[
TestSiluMulFp8QuantModel
]))
# cuda_force_torch used to test torch code path on platforms that
# cutlass_fp8_supported() == True.
@
pytest
.
mark
.
parametrize
(
"cuda_force_torch"
,
[
True
,
False
]
if
cutlass_fp8_supported
()
else
[
True
])
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_TARGET_DEVICE
not
in
[
"cuda"
,
"rocm"
],
@
pytest
.
mark
.
skipif
(
envs
.
VLLM_TARGET_DEVICE
not
in
[
"cuda"
,
"rocm"
],
reason
=
"Only test on CUDA and ROCm"
)
reason
=
"Only test on CUDA and ROCm"
)
def
test_fusion_silu_and_mul_quant
(
num_tokens
,
hidden_size
,
model_class
,
def
test_fusion_silu_and_mul_quant
(
num_tokens
,
hidden_size
,
model_class
,
force_
fp8_e4m3fnuz
):
cuda_
force_
torch
):
if
model_class
==
TestSiluMulNvfp4QuantModel
and
force_
fp8_e4m3fnuz
:
if
model_class
==
TestSiluMulNvfp4QuantModel
and
cuda_
force_
torch
:
pytest
.
skip
(
"Duplicate tests for NVFP4"
)
pytest
.
skip
(
"Duplicate tests for NVFP4"
)
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_dtype
(
torch
.
float16
)
torch
.
set_default_dtype
(
torch
.
float16
)
x
=
torch
.
rand
(
num_tokens
,
hidden_size
*
2
)
# Reshape pass is needed for the fusion pass to work
# Reshape pass is needed for the fusion pass to work
config
=
VllmConfig
()
config
=
VllmConfig
()
config
.
compilation_config
=
CompilationConfig
(
config
.
compilation_config
=
CompilationConfig
(
...
@@ -115,10 +128,10 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
...
@@ -115,10 +128,10 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
backend
=
TestBackend
(
NoOpEliminationPass
(
config
),
fusion_pass
)
backend
=
TestBackend
(
NoOpEliminationPass
(
config
),
fusion_pass
)
model
=
model_class
(
hidden_size
=
hidden_size
,
model
=
model_class
(
hidden_size
=
hidden_size
,
force_fp8_e4m3fnuz
=
force_fp8_e4m3fnuz
)
cuda_force_torch
=
cuda_force_torch
,
x
=
x
)
# First dimension dynamic
# First dimension dynamic
x
=
torch
.
rand
(
num_tokens
,
hidden_size
*
2
)
torch
.
_dynamo
.
mark_dynamic
(
x
,
0
)
torch
.
_dynamo
.
mark_dynamic
(
x
,
0
)
result
=
model
(
x
)
result
=
model
(
x
)
...
@@ -127,10 +140,15 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
...
@@ -127,10 +140,15 @@ def test_fusion_silu_and_mul_quant(num_tokens, hidden_size, model_class,
result2
=
model2
(
x
)
result2
=
model2
(
x
)
# Check that it gives the same answer
# Check that it gives the same answer
if
model_class
==
TestSiluMulFp8QuantModel
:
atol
,
rtol
=
1e-3
,
1e-3
elif
model_class
==
TestSiluMulNvfp4QuantModel
:
atol
,
rtol
=
1e-1
,
1e-1
torch
.
testing
.
assert_close
(
result
[
0
].
to
(
dtype
=
torch
.
float16
),
torch
.
testing
.
assert_close
(
result
[
0
].
to
(
dtype
=
torch
.
float16
),
result2
[
0
].
to
(
dtype
=
torch
.
float16
),
result2
[
0
].
to
(
dtype
=
torch
.
float16
),
atol
=
1e-3
,
atol
=
atol
,
rtol
=
1e-3
)
rtol
=
rtol
)
# In pre-nodes, quant op should be present and fused kernels should not
# In pre-nodes, quant op should be present and fused kernels should not
backend
.
check_before_ops
(
model
.
ops_in_model_before
())
backend
.
check_before_ops
(
model
.
ops_in_model_before
())
...
...
tests/conftest.py
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa
from
tblib
import
pickling_support
# Install support for pickling exceptions so that we can nicely propagate
# failures from tests running in a subprocess.
# This should be run before any custom exception subclasses are defined.
pickling_support
.
install
()
import
http.server
import
json
import
json
import
math
import
math
import
mimetypes
import
os
import
os
import
socket
import
tempfile
import
tempfile
import
threading
from
collections.abc
import
Generator
from
enum
import
Enum
from
enum
import
Enum
from
typing
import
Any
,
Callable
,
Optional
,
TypedDict
,
TypeVar
,
Union
,
cast
from
typing
import
Any
,
Callable
,
Optional
,
TypedDict
,
TypeVar
,
Union
,
cast
...
@@ -32,6 +47,7 @@ from vllm.distributed import (cleanup_dist_env_and_memory,
...
@@ -32,6 +47,7 @@ from vllm.distributed import (cleanup_dist_env_and_memory,
from
vllm.inputs
import
(
ExplicitEncoderDecoderPrompt
,
TextPrompt
,
from
vllm.inputs
import
(
ExplicitEncoderDecoderPrompt
,
TextPrompt
,
to_enc_dec_tuple_list
,
zip_enc_dec_prompts
)
to_enc_dec_tuple_list
,
zip_enc_dec_prompts
)
from
vllm.logger
import
init_logger
from
vllm.logger
import
init_logger
from
vllm.multimodal.utils
import
fetch_image
from
vllm.outputs
import
RequestOutput
from
vllm.outputs
import
RequestOutput
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.sampling_params
import
BeamSearchParams
from
vllm.sequence
import
Logprob
from
vllm.sequence
import
Logprob
...
@@ -1253,3 +1269,119 @@ def cli_config_file():
...
@@ -1253,3 +1269,119 @@ def cli_config_file():
def
cli_config_file_with_model
():
def
cli_config_file_with_model
():
"""Return the path to the CLI config file with model."""
"""Return the path to the CLI config file with model."""
return
os
.
path
.
join
(
_TEST_DIR
,
"config"
,
"test_config_with_model.yaml"
)
return
os
.
path
.
join
(
_TEST_DIR
,
"config"
,
"test_config_with_model.yaml"
)
class AssetHandler(http.server.BaseHTTPRequestHandler):
    """Serve test image assets over HTTP GET.

    A request for ``/<name>.<ext>`` is answered with the bytes returned by
    ``ImageAsset(<name>).read_bytes(ext=<ext>)``. Only the ``jpg`` and
    ``png`` extensions are accepted; anything else is answered with 404.
    """

    # _IMAGE_CACHE : Dict[str, bytes] = {}

    # Content types used when ``mimetypes`` cannot classify the file name.
    # "image/jpeg" is the registered media type for JPEG ("image/jpg" is not).
    _FALLBACK_CONTENT_TYPES = {"jpg": "image/jpeg", "png": "image/png"}

    def log_message(self, *args, **kwargs):
        """Discard the per-request log line the base class would emit."""
        pass

    def do_GET(self):
        """Answer a GET request with the requested image asset.

        Sends 404 when the path has no ``<name>.<ext>`` form or the
        extension is unsupported, and 500 when loading the asset fails.
        """
        # Accepts paths like: /1280px-Venn_diagram_rgb.jpg
        filename = self.path.lstrip("/")
        if not filename or "." not in filename:
            self.send_error(404, "Missing filename (expected /<name>.<ext>)")
            return

        base, ext = filename.rsplit(".", 1)
        ext = ext.lower()

        if ext not in ["jpg", "png"]:
            self.send_error(404, f"Unsupported extension: .{ext}")
            return

        try:
            data = ImageAsset(base).read_bytes(ext=ext)
        except Exception as e:
            # Any failure while loading is reported to the client as a 500
            # instead of propagating out of the request handler.
            self.send_error(500, f"Failed to load asset: {ext} {base} {e} ")
            return

        ctype, _ = mimetypes.guess_type(filename)
        if ctype is None:
            ctype = self._FALLBACK_CONTENT_TYPES[ext]

        self.send_response(200)
        self.send_header("Content-Type", ctype)
        self.send_header("Content-Length", str(len(data)))
        self.end_headers()
        self.wfile.write(data)
def
_find_free_port
()
->
int
:
with
socket
.
socket
()
as
s
:
s
.
bind
((
"127.0.0.1"
,
0
))
return
s
.
getsockname
()[
1
]
class LocalAssetServer:
    """Context manager that runs an ``AssetHandler`` HTTP server in a thread.

    Entering the context starts the server on ``address`` with a port chosen
    by the OS; leaving it stops the server, closes its socket and joins the
    serving thread.
    """

    address: str
    port: int  # -1 until the server has been started
    server: Optional[http.server.ThreadingHTTPServer]
    thread: Optional[threading.Thread]

    def __init__(self, address: str = "127.0.0.1") -> None:
        self.address = address
        self.port = -1
        self.server = None
        self.thread = None

    def __enter__(self):
        # Bind to port 0 and read back the port the OS assigned. Probing for
        # a free port first and binding to it afterwards leaves a window in
        # which another process can take the port.
        self.server = http.server.ThreadingHTTPServer((self.address, 0),
                                                      AssetHandler)
        self.port = self.server.server_address[1]
        self.thread = threading.Thread(target=self.server.serve_forever,
                                       daemon=True)
        self.thread.start()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        if self.server is not None:
            self.server.shutdown()
            # shutdown() only stops serve_forever(); the listening socket
            # stays open until server_close() is called.
            self.server.server_close()
            # Reset to None (rather than ``del``) so the attribute declared
            # above keeps existing after the context is left.
            self.server = None

        if self.thread is not None:
            self.thread.join()
            self.thread = None

        # Never suppress an exception raised inside the ``with`` body.
        return False

    @property
    def base_url(self) -> str:
        """Return ``http://<address>:<port>`` of the started server.

        Raises:
            RuntimeError: if the server has never been started, i.e. the
                port is still the -1 placeholder set in ``__init__``.
        """
        if self.port < 0:
            raise RuntimeError(
                "LocalAssetServer has not been started; use it as a context "
                "manager before asking for URLs")
        return f"http://{self.address}:{self.port}"

    def url_for(self, name: str) -> str:
        """e.g., name='RGBA_comp.png' -> 'http://127.0.0.1:PORT/RGBA_comp.png'"""
        return f"{self.base_url}/{name}"

    def get_image_asset(self, name: str) -> Image.Image:
        """Fetch asset ``name`` from this server via ``fetch_image``."""
        return fetch_image(self.url_for(name))
@pytest.fixture(scope="session")
def local_asset_server() -> Generator[LocalAssetServer, None, None]:
    """
    Session-scoped fixture that runs a thread-based HTTP server bound to
    127.0.0.1 on a random free port for the duration of the test session.
    The server currently serves images at:
    http://127.0.0.1:<port>/<name>.<ext>
    """
    asset_server = LocalAssetServer()
    # ``LocalAssetServer.__enter__`` returns the instance itself, so the
    # object yielded here is the same one a ``with ... as`` would bind.
    with asset_server:
        yield asset_server
@pytest.fixture
def image_url(request, local_asset_server) -> str:
    """Indirect fixture: turn one asset file name into its full URL."""
    # request.param is one of the IMAGE_ASSETS filenames
    return local_asset_server.url_for(request.param)
@pytest.fixture
def image_urls(request, local_asset_server) -> list[str]:
    """Indirect fixture: takes a list of names, returns list of full URLs."""
    names: list[str] = request.param
    urls = []
    for asset_name in names:
        urls.append(local_asset_server.url_for(asset_name))
    return urls
tests/core/block/e2e/test_correctness.py
View file @
38d80967
...
@@ -439,10 +439,10 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
...
@@ -439,10 +439,10 @@ def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
@
pytest
.
mark
.
parametrize
(
"seed"
,
[
1
])
def
test_auto_prefix_caching_after_eviction_start
(
baseline_llm_generator
,
def
test_auto_prefix_caching_after_eviction_start
(
baseline_llm_generator
,
test_llm_generator
):
test_llm_generator
):
"""Verify block manager v2 with auto prefix caching could work
s
normal
"""Verify block manager v2 with auto prefix caching could work normal
ly
even when eviction started.
even when eviction started.
With APC enabled, all blocks are held by native block at the beginning.
With APC enabled, all blocks are held by native block at the beginning.
Then blocks are managed by evictor instead. If cache hit at the evitor's
Then blocks are managed by evictor instead. If cache hit at the evi
c
tor's
block, then it could be reused, or we need to recompute its kv cache.
block, then it could be reused, or we need to recompute its kv cache.
"""
"""
output_len
=
10
output_len
=
10
...
...
tests/core/test_scheduler.py
View file @
38d80967
...
@@ -10,7 +10,8 @@ import pytest # noqa
...
@@ -10,7 +10,8 @@ import pytest # noqa
import
torch
import
torch
from
torch
import
Use
# noqa
from
torch
import
Use
# noqa
from
vllm.config
import
CacheConfig
,
LoRAConfig
,
SchedulerConfig
from
vllm.config
import
CacheConfig
,
SchedulerConfig
from
vllm.config.lora
import
LoRAConfig
from
vllm.core.interfaces
import
AllocStatus
from
vllm.core.interfaces
import
AllocStatus
from
vllm.core.scheduler
import
Scheduler
,
SchedulingBudget
from
vllm.core.scheduler
import
Scheduler
,
SchedulingBudget
from
vllm.lora.request
import
LoRARequest
from
vllm.lora.request
import
LoRARequest
...
@@ -641,7 +642,7 @@ def test_schedule_decode_blocks_to_copy_update():
...
@@ -641,7 +642,7 @@ def test_schedule_decode_blocks_to_copy_update():
# Nothing is preempted.
# Nothing is preempted.
assert
output
.
blocks_to_swap_out
==
[]
assert
output
.
blocks_to_swap_out
==
[]
# Since append_slot returns the source -> dist mapping, it should
# Since append_slot returns the source -> dist mapping, it should
# applied.
#
be
applied.
assert
output
.
blocks_to_copy
==
[(
2
,
3
)]
assert
output
.
blocks_to_copy
==
[(
2
,
3
)]
...
...
tests/detokenizer/test_stop_string_while_stop_model_terminates.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
from
vllm.sampling_params
import
SamplingParams
from
vllm.v1.engine
import
EngineCoreRequest
from
vllm.v1.engine.detokenizer
import
BaseIncrementalDetokenizer
@pytest.fixture(params=[True, False])
def include_stop_str_in_output(request):
    """Parametrized fixture: runs each dependent test with True and False."""
    return request.param
class _DummyDetokenizer(BaseIncrementalDetokenizer):
    """Detokenizer stub whose decoding is a plain ``chr`` of the token id."""

    def __init__(self, request: EngineCoreRequest):
        super().__init__(request)

    def decode_next(self, next_token_id: int) -> str:
        # Map token id to single ASCII character for deterministic testing.
        return chr(next_token_id)
def _make_request(stop, include_stop_str_in_output: bool, min_tokens: int = 0):
    """Build a minimal ``EngineCoreRequest`` carrying the given stop settings.

    Only the sampling parameters are meaningful; the remaining request
    fields are filled with empty or ``None`` placeholders.
    """
    sampling = SamplingParams(
        stop=stop,
        include_stop_str_in_output=include_stop_str_in_output,
        min_tokens=min_tokens,
    )
    # Keep other fields minimal for unit test purposes.
    return EngineCoreRequest(
        request_id="test",
        prompt_token_ids=[],
        mm_features=None,
        sampling_params=sampling,
        pooling_params=None,
        eos_token_id=None,
        arrival_time=0.0,
        lora_request=None,
        cache_salt=None,
        data_parallel_rank=None,
    )
def test_stop_string_while_stop_token_terminates(
        include_stop_str_in_output: bool):
    """
    Verify stop-string handling when the same update is also terminated by
    a stop token.

    The generated token sequence contains both:
    - a stop string
    - an <eos> token

    The detokenizer should respect the stop string and truncate the output
    accordingly.

    Imagine the following sequence:
    - "abcdeZ" is generated, where "Z" is the <eos> token.
    - "cd" is the stop string.

    If include_stop_str_in_output=False, the detokenizer should truncate the
    output to "ab" because the stop string "cd" is excluded.
    If include_stop_str_in_output=True, the detokenizer should include the stop
    string "cd" in the output, resulting in "abcd".

    This verifies the behavioral change introduced in BaseIncrementalDetokenizer
    where stop-string evaluation occurs before the early-return on
    stop_terminated.
    """
    stop_string = "cd"
    eos_token = "Z"

    # Text "abcdeZ": every character becomes one token id via ord().
    generated_text = "abcde" + eos_token
    token_ids = list(map(ord, generated_text))

    # Create a request with the stop string and initialize the detokenizer.
    detok = _DummyDetokenizer(
        _make_request(stop=[stop_string],
                      include_stop_str_in_output=include_stop_str_in_output))

    # Simulate that the last token ('Z') is a stop token (stop_terminated=True).
    result = detok.update(new_token_ids=token_ids, stop_terminated=True)

    # The update should report the matched stop string.
    assert result == stop_string

    # Output text should reflect stop-string handling:
    # - include_stop_str_in_output=False => exclude "cd" => "ab"
    # - include_stop_str_in_output=True => include "cd" => "abcd"
    if include_stop_str_in_output:
        expected_text = "abcd"
    else:
        expected_text = "ab"
    assert detok.output_text == expected_text

    # The skipped final token should still be recorded in token_ids.
    assert detok.output_token_ids == token_ids

    # get_next_output_text should return the full text when finished=True.
    # (Buffering only applies during streaming when finished=False.)
    assert detok.get_next_output_text(finished=True,
                                      delta=False) == expected_text
tests/distributed/conftest.py
View file @
38d80967
...
@@ -8,7 +8,7 @@ import msgspec.msgpack
...
@@ -8,7 +8,7 @@ import msgspec.msgpack
import
pytest
import
pytest
import
zmq
import
zmq
from
vllm.config
import
KVEventsConfig
from
vllm.config
.kv_events
import
KVEventsConfig
from
vllm.distributed.kv_events
import
EventPublisherFactory
from
vllm.distributed.kv_events
import
EventPublisherFactory
from
.test_events
import
SampleBatch
from
.test_events
import
SampleBatch
...
...
tests/distributed/test_context_parallel.py
0 → 100644
View file @
38d80967
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
WARNING: This test runs in both single-node (4 GPUs) and multi-node
(2 node with 2 GPUs each) modes. If the test only uses 2 GPUs, it is
important to set the distributed backend to "mp" to avoid Ray scheduling
all workers in a node other than the head node, which can cause the test
to fail.
"""
import
json
import
os
from
dataclasses
import
dataclass
from
typing
import
Literal
,
NamedTuple
,
Optional
import
pytest
from
vllm.config
import
RunnerOption
from
vllm.logger
import
init_logger
from
..models.registry
import
HF_EXAMPLE_MODELS
from
..utils
import
compare_two_settings
,
create_new_process_for_each_test
logger = init_logger("test_context_parallel")

# True only when the VLLM_MULTI_NODE environment variable is exactly "1";
# unset defaults to "0", i.e. single-node mode.
VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1"
class ParallelSetup(NamedTuple):
    """One parallelism configuration exercised by the test."""
    # Passed as --tensor-parallel-size.
    tp_size: int
    # Passed as --pipeline-parallel-size.
    pp_size: int
    # Passed as --decode-context-parallel-size.
    dcp_size: int
    # When true, --enforce-eager is added.
    eager_mode: bool
    # When true, --enable-chunked-prefill is added.
    chunked_prefill: bool
class CPTestOptions(NamedTuple):
    """Per-model options that are independent of the parallel setup."""
    # When true, the test is skipped unless VLLM_MULTI_NODE is set.
    multi_node_only: bool
    # Passed as --load-format when set.
    load_format: Optional[str] = None
@dataclass
class CPTestSettings:
    """Bundle of parallel setups, backends and options to test for a model."""
    parallel_setups: list[ParallelSetup]
    # NOTE: the length of distributed_backends and
    # vllm_major_versions should be the same, and they
    # are first zipped together to iterate over all
    # test settings.
    distributed_backends: list[str]
    # vllm major version: "0" for V0, "1" for V1
    vllm_major_versions: list[str]
    runner: RunnerOption
    test_options: CPTestOptions

    def __post_init__(self):
        """Reject settings whose backend and version lists differ in length."""
        num_backends = len(self.distributed_backends)
        num_versions = len(self.vllm_major_versions)
        if num_backends == num_versions:
            return
        raise ValueError(
            f"Length mismatch: distributed_backends "
            f"({num_backends}) != "
            f"vllm_major_versions ({num_versions})")

    @staticmethod
    def detailed(
        *,
        tp_base: int = 4,
        pp_base: int = 1,
        dcp_base: int = 1,
        multi_node_only: bool = False,
        runner: RunnerOption = "auto",
        load_format: Optional[str] = None,
    ):
        """Build settings with DCP sizes of 2x and 4x ``dcp_base``.

        Eager mode is off, chunked prefill is on, and the pipeline size is
        ``pp_base``; the "mp" backend with the V1 engine is used.
        """
        # Same iteration order as the equivalent nested loops: eager mode,
        # then pp multiplier, then dcp multiplier, then chunked prefill.
        setups = [
            ParallelSetup(tp_size=tp_base,
                          pp_size=pp_multiplier * pp_base,
                          dcp_size=dcp_multiplier * dcp_base,
                          eager_mode=eager_mode_val,
                          chunked_prefill=chunked_prefill_val)
            for eager_mode_val in [False]
            for pp_multiplier in [1]
            for dcp_multiplier in [2, 4]
            for chunked_prefill_val in [True]
        ]
        options = CPTestOptions(multi_node_only=multi_node_only,
                                load_format=load_format)
        return CPTestSettings(
            parallel_setups=setups,
            distributed_backends=["mp"],
            vllm_major_versions=["1"],
            runner=runner,
            test_options=options,
        )

    def iter_params(self, model_id: str):
        """Yield one parameter tuple per (setup, backend/version pair)."""
        opts = self.test_options
        backend_version_pairs = zip(self.distributed_backends,
                                    self.vllm_major_versions)
        # zip() is a one-shot iterator, so materialise the pairs before
        # reusing them for every parallel setup.
        pairs = list(backend_version_pairs)
        for setup in self.parallel_setups:
            for backend, vllm_major_version in pairs:
                yield (model_id, setup, backend, vllm_major_version,
                       self.runner, opts)
def _compare_cp_with_tp(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    runner: RunnerOption,
    test_options: CPTestOptions,
    num_gpus_available: int,
    *,
    method: Literal["generate"],
    is_multimodal: bool,
):
    """Compare a decode-context-parallel run against a plain TP/PP run.

    Two argument lists are built from shared options; they differ only in
    that the first also passes ``--decode-context-parallel-size``. Both are
    handed to ``compare_two_settings``.

    The test is skipped when the installed transformers version is
    unsuitable, the model is not available online (unless the "dummy" load
    format is used), fewer than ``tp_size * pp_size`` GPUs are available, or
    the multi-node conditions below are not met.
    """
    (
        tp_size,
        pp_size,
        dcp_size,
        eager_mode,
        chunked_prefill,
    ) = parallel_setup

    multi_node_only, load_format = test_options

    model_info = HF_EXAMPLE_MODELS.find_hf_info(model_id)
    model_info.check_transformers_version(on_fail="skip")

    trust_remote_code = model_info.trust_remote_code
    tokenizer_mode = model_info.tokenizer_mode
    hf_overrides = model_info.hf_overrides

    if load_format == "dummy":
        # Avoid OOM
        text_overrides = {
            "num_hidden_layers": 4,
            "hidden_size": 512,
            "intermediate_size": 800,
            "num_attention_heads": 4,
            "num_key_value_heads": 1,
        }

        # Build a new dict instead of calling update(): ``hf_overrides`` is
        # the object stored on the registry entry, and mutating it in place
        # would leak these overrides into every later use of that entry.
        if is_multimodal:
            hf_overrides = {**hf_overrides, "text_config": text_overrides}
        else:
            hf_overrides = {**hf_overrides, **text_overrides}
    else:
        model_info.check_available_online(on_fail="skip")

    if num_gpus_available < tp_size * pp_size:
        pytest.skip(f"Need at least {tp_size} x {pp_size} GPUs")
    if VLLM_MULTI_NODE and distributed_backend == "mp":
        pytest.skip("Skipping multi-node pipeline parallel test for "
                    "multiprocessing distributed backend")
    if multi_node_only and not VLLM_MULTI_NODE:
        pytest.skip("Not in multi-node setting")

    common_args = [
        # use half precision for speed and memory savings in CI environment
        "--dtype",
        "bfloat16",
        "--max-model-len",
        "2048",
        "--max-num-seqs",
        "8",
    ]
    if chunked_prefill:
        common_args.append("--enable-chunked-prefill")
    if eager_mode:
        common_args.append("--enforce-eager")
    if runner != "auto":
        common_args.extend(["--runner", runner])
    if trust_remote_code:
        common_args.append("--trust-remote-code")
    if tokenizer_mode:
        common_args.extend(["--tokenizer-mode", tokenizer_mode])
    if load_format:
        common_args.extend(["--load-format", load_format])
    if hf_overrides:
        common_args.extend(["--hf-overrides", json.dumps(hf_overrides)])

    # Note(hc): DCP supports the V1 engine only.
    cp_env = {
        "VLLM_USE_V1": vllm_major_version,
    }
    # Separate dict with the same content, so a change made to one
    # environment mapping cannot show up in the other.
    tp_env = dict(cp_env)

    cp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--pipeline-parallel-size",
        str(pp_size),
        "--decode-context-parallel-size",
        str(dcp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    tp_args = [
        *common_args,
        "--tensor-parallel-size",
        str(tp_size),
        "--pipeline-parallel-size",
        str(pp_size),
        "--distributed-executor-backend",
        distributed_backend,
    ]

    try:
        compare_two_settings(model_id,
                             cp_args,
                             tp_args,
                             cp_env,
                             tp_env,
                             method=method,
                             max_wait_seconds=720)
    except Exception:
        # ``cp_env`` is always a dict here, so the former
        # ``cp_env is not None`` guard was always true; only the engine
        # version decides whether the failure is tolerated.
        if vllm_major_version == "0":
            # Ray Compiled Graph tests are flaky for V0,
            # so we don't want to fail the test
            logger.exception("Ray Compiled Graph tests failed")
        else:
            raise
# Maps model id -> settings to run for that model.
CP_TEXT_GENERATION_MODELS = {
    # [MLA attention only]
    "deepseek-ai/DeepSeek-V2-Lite-Chat": CPTestSettings.detailed(),
}

# Only model ids listed here are turned into test parameters below.
CP_TEST_MODELS = [
    # TODO support other models
    # [LANGUAGE GENERATION]
    "deepseek-ai/DeepSeek-V2-Lite-Chat",
]
@pytest.mark.parametrize(
    (
        "model_id",
        "parallel_setup",
        "distributed_backend",
        "vllm_major_version",
        "runner",
        "test_options",
    ),
    [
        params
        for model_id, settings in CP_TEXT_GENERATION_MODELS.items()
        for params in settings.iter_params(model_id)
        if model_id in CP_TEST_MODELS
    ],
)
@create_new_process_for_each_test()
def test_cp_generation(
    model_id: str,
    parallel_setup: ParallelSetup,
    distributed_backend: str,
    vllm_major_version: str,
    runner: RunnerOption,
    test_options: CPTestOptions,
    num_gpus_available,
):
    """Run the CP-vs-TP comparison for one parametrized text-generation case.

    The parameter sets come from ``iter_params`` of every entry in
    ``CP_TEXT_GENERATION_MODELS`` whose model id also appears in
    ``CP_TEST_MODELS``. All arguments are forwarded unchanged to
    ``_compare_cp_with_tp`` with ``method="generate"`` and
    ``is_multimodal=False``.

    ``num_gpus_available`` is presumably supplied by a pytest fixture
    defined outside this file -- TODO confirm.
    """
    # Forwarded positionally, in the order the helper is called with
    # in the original code.
    forwarded = (
        model_id,
        parallel_setup,
        distributed_backend,
        vllm_major_version,
        runner,
        test_options,
        num_gpus_available,
    )
    _compare_cp_with_tp(*forwarded, method="generate", is_multimodal=False)
tests/distributed/test_pipeline_parallel.py
View file @
38d80967
...
@@ -298,6 +298,8 @@ def _compare_tp(
...
@@ -298,6 +298,8 @@ def _compare_tp(
tokenizer_mode
=
model_info
.
tokenizer_mode
tokenizer_mode
=
model_info
.
tokenizer_mode
hf_overrides
=
model_info
.
hf_overrides
hf_overrides
=
model_info
.
hf_overrides
hf_config
=
get_config
(
model_id
,
trust_remote_code
)
hf_config
=
get_config
(
model_id
,
trust_remote_code
)
skip_tokenizer_init
=
model_info
.
skip_tokenizer_init
max_num_seqs
=
model_info
.
max_num_seqs
dtype
=
"float16"
dtype
=
"float16"
if
hf_config
.
model_type
in
_FLOAT16_NOT_SUPPORTED_MODELS
:
if
hf_config
.
model_type
in
_FLOAT16_NOT_SUPPORTED_MODELS
:
...
@@ -351,6 +353,10 @@ def _compare_tp(
...
@@ -351,6 +353,10 @@ def _compare_tp(
common_args
.
extend
([
"--load-format"
,
load_format
])
common_args
.
extend
([
"--load-format"
,
load_format
])
if
hf_overrides
:
if
hf_overrides
:
common_args
.
extend
([
"--hf-overrides"
,
json
.
dumps
(
hf_overrides
)])
common_args
.
extend
([
"--hf-overrides"
,
json
.
dumps
(
hf_overrides
)])
if
skip_tokenizer_init
:
common_args
.
append
(
"--skip-tokenizer-init"
)
if
max_num_seqs
:
common_args
.
extend
([
"--max-num-seqs"
,
f
"
{
max_num_seqs
}
"
])
specific_case
=
tp_size
==
2
and
pp_size
==
2
and
chunked_prefill
specific_case
=
tp_size
==
2
and
pp_size
==
2
and
chunked_prefill
testing_ray_compiled_graph
=
False
testing_ray_compiled_graph
=
False
...
...
tests/distributed/test_sequence_parallel.py
View file @
38d80967
...
@@ -178,6 +178,7 @@ def _compare_sp(
...
@@ -178,6 +178,7 @@ def _compare_sp(
trust_remote_code
=
model_info
.
trust_remote_code
trust_remote_code
=
model_info
.
trust_remote_code
tokenizer_mode
=
model_info
.
tokenizer_mode
tokenizer_mode
=
model_info
.
tokenizer_mode
hf_overrides
=
model_info
.
hf_overrides
hf_overrides
=
model_info
.
hf_overrides
skip_tokenizer_init
=
model_info
.
skip_tokenizer_init
if
load_format
==
"dummy"
:
if
load_format
==
"dummy"
:
# Avoid OOM
# Avoid OOM
...
@@ -227,6 +228,8 @@ def _compare_sp(
...
@@ -227,6 +228,8 @@ def _compare_sp(
common_args
.
extend
([
"--load-format"
,
load_format
])
common_args
.
extend
([
"--load-format"
,
load_format
])
if
hf_overrides
:
if
hf_overrides
:
common_args
.
extend
([
"--hf-overrides"
,
json
.
dumps
(
hf_overrides
)])
common_args
.
extend
([
"--hf-overrides"
,
json
.
dumps
(
hf_overrides
)])
if
skip_tokenizer_init
:
common_args
.
append
(
"--skip-tokenizer-init"
)
compilation_config
=
{
compilation_config
=
{
'level'
:
3
,
'level'
:
3
,
...
...
tests/encoder_decoder/test_e2e_correctness.py
View file @
38d80967
...
@@ -63,6 +63,7 @@ def clear_cache():
...
@@ -63,6 +63,7 @@ def clear_cache():
current_platform
.
is_cpu
(),
current_platform
.
is_cpu
(),
reason
=
"CPU backend is not currently supported with encoder/decoder models"
reason
=
"CPU backend is not currently supported with encoder/decoder models"
)
)
@
pytest
.
mark
.
skip
(
reason
=
"bart not supported in V1"
)
def
test_encoder_decoder_e2e
(
def
test_encoder_decoder_e2e
(
hf_runner
,
hf_runner
,
vllm_runner
,
vllm_runner
,
...
...
tests/engine/test_arg_utils.py
View file @
38d80967
...
@@ -167,7 +167,7 @@ def test_get_kwargs():
...
@@ -167,7 +167,7 @@ def test_get_kwargs():
# dict should have json tip in help
# dict should have json tip in help
json_tip
=
"Should either be a valid JSON string or JSON keys"
json_tip
=
"Should either be a valid JSON string or JSON keys"
assert
json_tip
in
kwargs
[
"json_tip"
][
"help"
]
assert
json_tip
in
kwargs
[
"json_tip"
][
"help"
]
# nested config should
should
construct the nested config
# nested config should construct the nested config
assert
kwargs
[
"nested_config"
][
"type"
](
'{"field": 2}'
)
==
NestedConfig
(
2
)
assert
kwargs
[
"nested_config"
][
"type"
](
'{"field": 2}'
)
==
NestedConfig
(
2
)
...
@@ -287,15 +287,6 @@ def test_prefix_cache_default():
...
@@ -287,15 +287,6 @@ def test_prefix_cache_default():
},
},
"mm-processor-kwargs"
"mm-processor-kwargs"
),
),
(
'{"cast_logits_dtype":"bfloat16","sequence_parallel_norm":true,"sequence_parallel_norm_threshold":2048}'
,
{
"cast_logits_dtype"
:
"bfloat16"
,
"sequence_parallel_norm"
:
True
,
"sequence_parallel_norm_threshold"
:
2048
,
},
"override-neuron-config"
),
])
])
# yapf: enable
# yapf: enable
def
test_composite_arg_parser
(
arg
,
expected
,
option
):
def
test_composite_arg_parser
(
arg
,
expected
,
option
):
...
...
tests/engine/test_executor.py
View file @
38d80967
...
@@ -25,7 +25,7 @@ class CustomUniExecutor(UniProcExecutor):
...
@@ -25,7 +25,7 @@ class CustomUniExecutor(UniProcExecutor):
timeout
:
Optional
[
float
]
=
None
,
timeout
:
Optional
[
float
]
=
None
,
args
:
tuple
=
(),
args
:
tuple
=
(),
kwargs
:
Optional
[
dict
]
=
None
)
->
list
[
Any
]:
kwargs
:
Optional
[
dict
]
=
None
)
->
list
[
Any
]:
# Drop marker to show that this was r
a
n
# Drop marker to show that this was r
u
n
with
open
(
".marker"
,
"w"
):
with
open
(
".marker"
,
"w"
):
...
...
return
super
().
collective_rpc
(
method
,
timeout
,
args
,
kwargs
)
return
super
().
collective_rpc
(
method
,
timeout
,
args
,
kwargs
)
...
...
Prev
1
…
5
6
7
8
9
10
11
12
13
…
28
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment