Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc7f22a8
Commit
cc7f22a8
authored
Jun 11, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.1' into v0.9.1-ori
parents
b9ea0c09
b6553be1
Changes
1000
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
190 additions
and
1 deletion
+190
-1
tests/kernels/quantization/test_ggml.py
tests/kernels/quantization/test_ggml.py
+1
-0
tests/kernels/quantization/test_gguf.py
tests/kernels/quantization/test_gguf.py
+1
-0
tests/kernels/quantization/test_gptq.py
tests/kernels/quantization/test_gptq.py
+1
-0
tests/kernels/quantization/test_int8_kernel.py
tests/kernels/quantization/test_int8_kernel.py
+1
-0
tests/kernels/quantization/test_int8_quant.py
tests/kernels/quantization/test_int8_quant.py
+1
-0
tests/kernels/quantization/test_machete_mm.py
tests/kernels/quantization/test_machete_mm.py
+1
-0
tests/kernels/quantization/test_marlin_gemm.py
tests/kernels/quantization/test_marlin_gemm.py
+1
-0
tests/kernels/quantization/test_nvfp4_quant.py
tests/kernels/quantization/test_nvfp4_quant.py
+1
-0
tests/kernels/quantization/test_nvfp4_scaled_mm.py
tests/kernels/quantization/test_nvfp4_scaled_mm.py
+1
-0
tests/kernels/quantization/test_rocm_skinny_gemms.py
tests/kernels/quantization/test_rocm_skinny_gemms.py
+1
-0
tests/kernels/quantization/test_triton_scaled_mm.py
tests/kernels/quantization/test_triton_scaled_mm.py
+1
-0
tests/kernels/test_apply_repetition_penalties.py
tests/kernels/test_apply_repetition_penalties.py
+76
-0
tests/kernels/test_cutlass_mla_decode.py
tests/kernels/test_cutlass_mla_decode.py
+4
-1
tests/kernels/test_flex_attention.py
tests/kernels/test_flex_attention.py
+93
-0
tests/kernels/test_fused_quant_activation.py
tests/kernels/test_fused_quant_activation.py
+1
-0
tests/kernels/test_triton_flash_attention.py
tests/kernels/test_triton_flash_attention.py
+1
-0
tests/kernels/utils.py
tests/kernels/utils.py
+1
-0
tests/kv_transfer/test_disagg.py
tests/kv_transfer/test_disagg.py
+1
-0
tests/kv_transfer/test_lookup_buffer.py
tests/kv_transfer/test_lookup_buffer.py
+1
-0
tests/kv_transfer/test_module.py
tests/kv_transfer/test_module.py
+1
-0
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
tests/kernels/quantization/test_ggml.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
gguf
import
gguf
import
pytest
import
pytest
...
...
tests/kernels/quantization/test_gguf.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
pathlib
import
Path
from
pathlib
import
Path
...
...
tests/kernels/quantization/test_gptq.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
torch
import
torch
...
...
tests/kernels/quantization/test_int8_kernel.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_int8_kernel.py
# Adapted from https://github.com/sgl-project/sglang/blob/main/test/srt/test_int8_kernel.py
import
itertools
import
itertools
...
...
tests/kernels/quantization/test_int8_quant.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
import
torch
import
torch
...
...
tests/kernels/quantization/test_machete_mm.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the machete kernel.
"""Tests for the machete kernel.
Run `pytest tests/kernels/test_machete_mm.py`.
Run `pytest tests/kernels/test_machete_mm.py`.
...
...
tests/kernels/quantization/test_marlin_gemm.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the marlin kernel.
"""Tests for the marlin kernel.
Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
Run `pytest tests/kernels/marlin/test_marlin_gemm.py`.
...
...
tests/kernels/quantization/test_nvfp4_quant.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
import
torch
import
torch
...
...
tests/kernels/quantization/test_nvfp4_scaled_mm.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
import
torch
import
torch
from
nvfp4_utils
import
(
FLOAT4_E2M1_MAX
,
FLOAT8_E4M3_MAX
,
from
nvfp4_utils
import
(
FLOAT4_E2M1_MAX
,
FLOAT8_E4M3_MAX
,
...
...
tests/kernels/quantization/test_rocm_skinny_gemms.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
import
torch
import
torch
...
...
tests/kernels/quantization/test_triton_scaled_mm.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the triton_scaled_mm kernel
"""Tests for the triton_scaled_mm kernel
Run `pytest tests/kernels/test_triton_scaled_mm.py`.
Run `pytest tests/kernels/test_triton_scaled_mm.py`.
...
...
tests/kernels/test_apply_repetition_penalties.py
0 → 100644
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
import
pytest
import
torch
from
tests.kernels.utils
import
opcheck
from
vllm._custom_ops
import
(
apply_repetition_penalties_cuda
,
apply_repetition_penalties_torch
)
from
vllm.platforms
import
current_platform
# Parametrization grids consumed by the pytest.mark.parametrize decorators on
# test_apply_repetition_penalties below.

# Number of sequences (rows of the logits tensor) per test case.
NUM_SEQS = [1, 2, 3, 4, 8, 13, 17, 32, 37, 256, 1023, 1024, 1025]
# Vocabulary sizes (columns of the logits tensor) per test case.
# [stress, stress, stress, Qwen, llama 4]
VOCAB_SIZES = [17, 256, 1019, 151936, 202048]
# Penalty value broadcast to every sequence in the batch.
REPETITION_PENALTY_VALUES = [1.05]
# RNG seeds passed to current_platform.seed_everything.
SEEDS = [0]
# Logits dtypes under test.
DTYPES = [torch.float32, torch.float16]
@pytest.mark.parametrize("num_seqs", NUM_SEQS)
@pytest.mark.parametrize("vocab_size", VOCAB_SIZES)
@pytest.mark.parametrize("repetition_penalty", REPETITION_PENALTY_VALUES)
@pytest.mark.parametrize("dtype", DTYPES)
@pytest.mark.parametrize("seed", SEEDS)
@pytest.mark.skipif(not current_platform.is_cuda(),
                    reason="This test for checking CUDA kernel")
@torch.inference_mode()
def test_apply_repetition_penalties(
    num_seqs: int,
    vocab_size: int,
    repetition_penalty: float,
    dtype: torch.dtype,
    seed: int,
) -> None:
    """
    Test the apply_repetition_penalties custom op
    against a reference implementation.

    Args:
        num_seqs: Number of rows in the logits and mask tensors.
        vocab_size: Number of columns in the logits and mask tensors.
        repetition_penalty: Penalty value used for every sequence.
        dtype: dtype of the logits and of the penalties tensor.
        seed: Seed handed to ``current_platform.seed_everything``.
    """
    current_platform.seed_everything(seed)
    # NOTE(review): this changes the process-wide default device and is not
    # restored afterwards; every tensor below is therefore created on cuda:0.
    torch.set_default_device("cuda:0")

    # Create test data
    logits = torch.randn(num_seqs, vocab_size, dtype=dtype)

    # Create masks with some random tokens marked as repeated
    prompt_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)
    output_mask = torch.zeros(num_seqs, vocab_size, dtype=torch.bool)

    # Mark some tokens as repeated in prompt and output. At least one index
    # per sequence is drawn; duplicates within a row are harmless because they
    # set the same mask entry twice.
    num_marked = max(1, vocab_size // 200)
    prompt_indices = torch.randint(0, vocab_size, (num_seqs, num_marked))
    output_indices = torch.randint(0, vocab_size, (num_seqs, num_marked))

    # Row-wise scatter: mask[i, indices[i, j]] = True for every (i, j). This
    # is equivalent to looping over the rows and indexing each one in Python.
    prompt_mask.scatter_(1, prompt_indices, True)
    output_mask.scatter_(1, output_indices, True)

    # Create repetition penalties tensor
    repetition_penalties = torch.full((num_seqs, ),
                                      repetition_penalty,
                                      dtype=dtype)

    # Run both implementations, each on its own copy of the logits; the
    # results are read back from those copies.
    logits_torch = logits.clone()
    logits_cuda = logits.clone()

    apply_repetition_penalties_torch(logits_torch, prompt_mask, output_mask,
                                     repetition_penalties)
    apply_repetition_penalties_cuda(logits_cuda, prompt_mask, output_mask,
                                    repetition_penalties)

    # Compare the CUDA kernel output to the torch reference
    torch.testing.assert_close(logits_torch,
                               logits_cuda,
                               rtol=1e-3,
                               atol=1e-3)

    # Test the operator by applying the opcheck utility
    opcheck(torch.ops._C.apply_repetition_penalties_,
            (logits.clone(), prompt_mask, output_mask, repetition_penalties))
tests/kernels/test_cutlass_mla_decode.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
import
torch
import
torch
import
torch.nn.functional
as
F
import
torch.nn.functional
as
F
...
@@ -75,7 +76,9 @@ def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int,
...
@@ -75,7 +76,9 @@ def test_cutlass_mla_decode(dtype: torch.dtype, mean_seq_len: int, bs: int,
pack_factor
=
128
//
block_size
pack_factor
=
128
//
block_size
block_num
=
((
block_num
+
pack_factor
-
1
)
//
pack_factor
)
*
pack_factor
block_num
=
((
block_num
+
pack_factor
-
1
)
//
pack_factor
)
*
pack_factor
q
=
torch
.
randn
(
bs
,
h_q
,
d
)
# Amplify input values to ensure test coverage of edge cases where CUTLASS
# kernel errors occur with split_k settings.
q
=
torch
.
randn
(
bs
,
h_q
,
d
)
*
100
block_table
=
torch
.
randint
(
0
,
block_table
=
torch
.
randint
(
0
,
bs
*
block_num
,
(
bs
,
block_num
),
bs
*
block_num
,
(
bs
,
block_num
),
dtype
=
torch
.
int32
)
dtype
=
torch
.
int32
)
...
...
tests/kernels/test_flex_attention.py
0 → 100644
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
"""Integration tests for FlexAttention backend vs default backend"""
import
random
import
numpy
as
np
import
pytest
import
torch
from
packaging
import
version
from
vllm
import
LLM
,
SamplingParams
# Installed PyTorch version, parsed so it can be ordered against
# MINIMUM_TORCH_VERSION in the skipif condition below.
TORCH_VERSION = version.parse(torch.__version__)
# The FlexAttention comparison test is skipped on PyTorch versions older
# than this.
MINIMUM_TORCH_VERSION = version.parse("2.7.0")
def set_seed(seed):
    """Seed the Python, NumPy and PyTorch RNGs with ``seed``.

    When CUDA is available, the generators of all CUDA devices are seeded
    as well.
    """
    # CPU-side generators, seeded in the same order every time.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)
@pytest.mark.skipif(
    not torch.cuda.is_available() or TORCH_VERSION < MINIMUM_TORCH_VERSION,
    reason="CUDA not available or PyTorch version < 2.7",
)
def test_flex_attention_vs_default_backend(monkeypatch):
    """Check that FlexAttention and the default backend generate equal text.

    The same prompts are generated twice with the same seed and sampling
    parameters: once with VLLM_ATTENTION_BACKEND set to FLEX_ATTENTION and
    once without it. The first output text of every prompt must match.
    """
    model_name = "Qwen/Qwen2.5-1.5B-Instruct"
    seed = 42
    max_tokens = 32
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
    ]

    sampling_params = SamplingParams(
        temperature=0.0,
        top_p=1.0,
        seed=seed,
        max_tokens=max_tokens,
    )

    # Settings shared by both runs.
    shared_env = {
        "VLLM_USE_V1": "1",
        "VLLM_ENABLE_V1_MULTIPROCESSING": "0",
    }
    engine_kwargs = {
        "tensor_parallel_size": 1,
        "num_gpu_blocks_override": 128,
        "enforce_eager": True,
    }

    # Run with flex attention
    flex_env = {**shared_env, "VLLM_ATTENTION_BACKEND": "FLEX_ATTENTION"}
    with monkeypatch.context() as patched:
        for env_name, env_value in flex_env.items():
            patched.setenv(env_name, env_value)

        set_seed(seed)

        llm_flex = LLM(model_name, **engine_kwargs)
        output_flex = llm_flex.generate(prompts, sampling_params)

    # Run with default backend
    with monkeypatch.context() as patched:
        for env_name, env_value in shared_env.items():
            patched.setenv(env_name, env_value)

        set_seed(seed)

        llm_default = LLM(model_name, **engine_kwargs)
        output_default = llm_default.generate(prompts, sampling_params)

    # Compare outputs from both backends
    paired_results = zip(output_flex, output_default)
    for idx, (flex_result, default_result) in enumerate(paired_results):
        prompt = prompts[idx]
        flex_text = flex_result.outputs[0].text
        default_text = default_result.outputs[0].text

        assert flex_text == default_text, (
            f"FlexAttention output doesn't match default for: {prompt!r}\n"
            f"FlexAttention: {flex_text!r}\n"
            f"Default: {default_text!r}")
# Allow running this test module directly as a script: hands this file's
# path to pytest.
if __name__ == "__main__":
    pytest.main([__file__])
tests/kernels/test_fused_quant_activation.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
pytest
import
torch
import
torch
...
...
tests/kernels/test_triton_flash_attention.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for the triton_flash_attention kernel
"""Tests for the triton_flash_attention kernel
Run `pytest tests/kernels/test_triton_flash_attention.py`.
Run `pytest tests/kernels/test_triton_flash_attention.py`.
...
...
tests/kernels/utils.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Kernel test utils"""
"""Kernel test utils"""
import
itertools
import
itertools
...
...
tests/kv_transfer/test_disagg.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
os
import
subprocess
import
subprocess
...
...
tests/kv_transfer/test_lookup_buffer.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
os
import
os
import
random
import
random
...
...
tests/kv_transfer/test_module.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
subprocess
import
subprocess
import
sys
import
sys
...
...
Prev
1
…
19
20
21
22
23
24
25
26
27
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment