Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
31584b45
Commit
31584b45
authored
Aug 26, 2025
by
zhuwenwen
Browse files
[fix]fix tests of kernels
parent
15347448
Changes
21
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
42 additions
and
39 deletions
+42
-39
tests/kernels/attention/test_attention.py
tests/kernels/attention/test_attention.py
+1
-1
tests/kernels/attention/test_mha_attn.py
tests/kernels/attention/test_mha_attn.py
+1
-1
tests/kernels/attention/test_triton_unified_attention.py
tests/kernels/attention/test_triton_unified_attention.py
+1
-1
tests/kernels/moe/test_batched_moe.py
tests/kernels/moe/test_batched_moe.py
+2
-2
tests/kernels/moe/test_moe.py
tests/kernels/moe/test_moe.py
+1
-1
tests/kernels/moe/untest_block_fp8.py
tests/kernels/moe/untest_block_fp8.py
+0
-0
tests/kernels/moe/untest_moe_permute_unpermute.py
tests/kernels/moe/untest_moe_permute_unpermute.py
+0
-0
tests/kernels/moe/untest_nvfp4_moe.py
tests/kernels/moe/untest_nvfp4_moe.py
+0
-0
tests/kernels/moe/untest_pplx_cutlass_moe.py
tests/kernels/moe/untest_pplx_cutlass_moe.py
+0
-0
tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
tests/kernels/moe/untest_silu_mul_fp8_quant_deep_gemm.py
+0
-0
tests/kernels/moe/untest_triton_moe_ptpc_fp8.py
tests/kernels/moe/untest_triton_moe_ptpc_fp8.py
+0
-0
tests/kernels/quantization/__init__.py
tests/kernels/quantization/__init__.py
+0
-0
tests/kernels/quantization/test_gguf.py
tests/kernels/quantization/test_gguf.py
+1
-1
tests/kernels/quantization/test_int8_quant.py
tests/kernels/quantization/test_int8_quant.py
+2
-2
tests/kernels/quantization/test_triton_scaled_mm.py
tests/kernels/quantization/test_triton_scaled_mm.py
+3
-1
tests/kernels/quantization/untest_rocmskinny_gemms.py
tests/kernels/quantization/untest_rocmskinny_gemms.py
+0
-0
tests/kernels/test_flex_attention.py
tests/kernels/test_flex_attention.py
+3
-1
tests/kernels/untest_fused_quant_activation.py
tests/kernels/untest_fused_quant_activation.py
+1
-1
tests/kernels/untest_triton_flash_attention.py
tests/kernels/untest_triton_flash_attention.py
+22
-23
tests/models/registry.py
tests/models/registry.py
+4
-4
No files found.
tests/kernels/attention/test_attention.py
View file @
31584b45
...
...
@@ -18,7 +18,7 @@ if not current_platform.is_rocm():
from
xformers
import
ops
as
xops
from
xformers.ops.fmha.attn_bias
import
BlockDiagonalCausalMask
from
vllm.attention.backends.xformers
import
_make_alibi_bias
from
vllm.attention.backends.xformers
import
_make_alibi_bias
FLOAT32_BYTES
=
torch
.
finfo
(
torch
.
float
).
bits
//
8
# This will change depending on the compute capability.
...
...
tests/kernels/attention/test_mha_attn.py
View file @
31584b45
...
...
@@ -25,7 +25,7 @@ def clear_cache():
_cached_get_attn_backend
.
cache_clear
()
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"hip"
,
"cuda"
]
)
@
pytest
.
mark
.
parametrize
(
"device"
,
[
"cpu"
,
"hip"
,
"cuda"
]
if
not
current_platform
.
is_rocm
()
else
[
"cpu"
,
"hip"
])
def
test_mha_attn_platform
(
device
:
str
):
"""
Test the attention selector between different platform and device.
...
...
tests/kernels/attention/test_triton_unified_attention.py
View file @
31584b45
...
...
@@ -15,7 +15,7 @@ BLOCK_SIZES = [16, 32]
DTYPES
=
[
torch
.
float16
,
torch
.
bfloat16
]
QDTYPES
=
[
None
,
torch
.
float8_e4m3fn
]
if
not
current_platform
.
is_rocm
()
else
[
None
,
torch
.
float8_e4m3fnuz
None
#
, torch.float8_e4m3fnuz
]
# one value large enough to test overflow in index calculation.
# one value small enough to test the schema op check
...
...
tests/kernels/moe/test_batched_moe.py
View file @
31584b45
...
...
@@ -96,7 +96,7 @@ class BatchedMMTensors:
@
pytest
.
mark
.
parametrize
(
"N"
,
[
128
,
256
,
1024
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float8_e4m3fn
,
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
])
[
torch
.
float8_e4m3fn
,
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
]
if
not
current_platform
.
is_rocm
()
else
[
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"block_shape"
,
[
None
,
[
128
,
128
]])
@
pytest
.
mark
.
parametrize
(
"per_act_token_quant"
,
[
False
,
True
])
def
test_batched_mm
(
num_experts
:
int
,
max_tokens_per_expert
:
int
,
K
:
int
,
...
...
@@ -208,7 +208,7 @@ def test_batched_mm(num_experts: int, max_tokens_per_expert: int, K: int,
@
pytest
.
mark
.
parametrize
((
"m"
,
"n"
,
"k"
),
MNK_FACTORS
)
@
pytest
.
mark
.
parametrize
(
"e"
,
NUM_EXPERTS
)
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOP_KS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float8_e4m3fn
,
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"dtype"
,
[
torch
.
float8_e4m3fn
,
torch
.
bfloat16
]
if
not
current_platform
.
is_rocm
()
else
[
torch
.
bfloat16
]
)
@
pytest
.
mark
.
parametrize
(
"per_act_token_quant"
,
[
False
,
True
])
@
pytest
.
mark
.
parametrize
(
"block_shape"
,
[
None
,
[
128
,
128
]])
@
pytest
.
mark
.
parametrize
(
"input_scales"
,
[
False
])
...
...
tests/kernels/moe/test_moe.py
View file @
31584b45
...
...
@@ -353,7 +353,7 @@ def test_fused_moe_wn16(m: int, n: int, k: int, e: int, topk: int,
[
torch
.
float32
,
torch
.
float16
,
torch
.
bfloat16
])
@
pytest
.
mark
.
parametrize
(
"padding"
,
[
True
,
False
])
@
pytest
.
mark
.
parametrize
(
"use_rocm_aiter"
,
[
True
,
False
]
if
current_platform
.
is_rocm
()
else
[
False
])
"use_rocm_aiter"
,
[
True
,
False
]
if
not
current_platform
.
is_rocm
()
else
[
False
])
@
torch
.
inference_mode
()
def
test_mixtral_moe
(
dtype
:
torch
.
dtype
,
padding
:
bool
,
use_rocm_aiter
:
bool
,
monkeypatch
):
...
...
tests/kernels/moe/test_block_fp8.py
→
tests/kernels/moe/
un
test_block_fp8.py
View file @
31584b45
File moved
tests/kernels/moe/test_moe_permute_unpermute.py
→
tests/kernels/moe/
un
test_moe_permute_unpermute.py
View file @
31584b45
File moved
tests/kernels/moe/test_nvfp4_moe.py
→
tests/kernels/moe/
un
test_nvfp4_moe.py
View file @
31584b45
File moved
tests/kernels/moe/test_pplx_cutlass_moe.py
→
tests/kernels/moe/
un
test_pplx_cutlass_moe.py
View file @
31584b45
File moved
tests/kernels/moe/test_silu_mul_fp8_quant_deep_gemm.py
→
tests/kernels/moe/
un
test_silu_mul_fp8_quant_deep_gemm.py
View file @
31584b45
File moved
tests/kernels/moe/test_triton_moe_ptpc_fp8.py
→
tests/kernels/moe/
un
test_triton_moe_ptpc_fp8.py
View file @
31584b45
File moved
tests/kernels/quantization/__init__.py
0 → 100644
View file @
31584b45
tests/kernels/quantization/test_gguf.py
View file @
31584b45
...
...
@@ -13,7 +13,7 @@ import vllm._custom_ops as ops
from
vllm.model_executor.layers.fused_moe
import
fused_experts
from
vllm.model_executor.layers.quantization.gguf
import
_fused_moe_gguf
from
vllm.platforms
import
current_platform
from
..utils
import
models_path_prefix
from
..
.
utils
import
models_path_prefix
# GGUF_SAMPLE = snapshot_download("Isotr0py/test-gguf-sample")
# GGUF_SAMPLE_MOE = snapshot_download("SzymonOzog/test-gguf-moe-sample")
...
...
tests/kernels/quantization/test_int8_quant.py
View file @
31584b45
...
...
@@ -42,7 +42,7 @@ def opcheck_int8_quant_dynamic(output, input, symmetric=True):
(
output
,
input
,
scale
,
azp
))
@
pytest
.
mark
.
skipif
(
current_platform
(),
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Currently, there is not supported on ROCm."
)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS
)
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES
)
...
...
@@ -67,7 +67,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
opcheck_int8_quant_dynamic
(
ops_out
,
x
)
@
pytest
.
mark
.
skipif
(
current_platform
(),
@
pytest
.
mark
.
skipif
(
current_platform
.
is_rocm
(),
reason
=
"Currently, there is not supported on ROCm."
)
@
pytest
.
mark
.
parametrize
(
"num_tokens"
,
NUM_TOKENS
)
@
pytest
.
mark
.
parametrize
(
"hidden_size"
,
HIDDEN_SIZES
)
...
...
tests/kernels/quantization/test_triton_scaled_mm.py
View file @
31584b45
...
...
@@ -4,6 +4,7 @@
Run `pytest tests/kernels/test_triton_scaled_mm.py`.
"""
import
os
import
importlib
from
typing
import
Optional
...
...
@@ -11,6 +12,7 @@ import pytest
import
torch
from
vllm.platforms
import
current_platform
from
...utils
import
models_path_prefix
device
=
"cuda"
...
...
@@ -45,7 +47,7 @@ def get_8bit_types():
# This test is to check regressions for int8 support on ROCm.
@
pytest
.
mark
.
parametrize
(
"model_path"
,
[
"neuralmagic/Llama-3.2-1B-quantized.w8a8"
,
os
.
path
.
join
(
models_path_prefix
,
"neuralmagic/Llama-3.2-1B-quantized.w8a8"
)
,
])
@
pytest
.
mark
.
parametrize
(
"max_tokens"
,
[
32
])
@
pytest
.
mark
.
parametrize
(
"num_logprobs"
,
[
10
])
...
...
tests/kernels/quantization/test_rocm
_
skinny_gemms.py
→
tests/kernels/quantization/
un
test_rocmskinny_gemms.py
View file @
31584b45
File moved
tests/kernels/test_flex_attention.py
View file @
31584b45
...
...
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Integration tests for FlexAttention backend vs default backend"""
import
os
import
random
import
numpy
as
np
...
...
@@ -10,6 +11,7 @@ import torch
from
packaging
import
version
from
vllm
import
LLM
,
SamplingParams
from
..utils
import
models_path_prefix
TORCH_VERSION
=
version
.
parse
(
torch
.
__version__
)
MINIMUM_TORCH_VERSION
=
version
.
parse
(
"2.7.0"
)
...
...
@@ -34,7 +36,7 @@ def test_flex_attention_vs_default_backend(monkeypatch):
This test compares the outputs from the FlexAttention backend with
the default backend, ensuring they are identical when using the same seed.
"""
model_name
=
"Qwen/Qwen2.5-1.5B-Instruct"
model_name
=
os
.
path
.
join
(
models_path_prefix
,
"Qwen/Qwen2.5-1.5B-Instruct"
)
seed
=
42
max_tokens
=
32
prompts
=
[
...
...
tests/kernels/test_fused_quant_activation.py
→
tests/kernels/
un
test_fused_quant_activation.py
View file @
31584b45
...
...
@@ -9,7 +9,7 @@ from vllm.model_executor.layers.activation import SiluAndMul
from
vllm.platforms
import
current_platform
DTYPES
=
[
torch
.
bfloat16
,
torch
.
float16
]
QUANT_DTYPES
=
[
current_platform
.
fp8_dtype
()]
QUANT_DTYPES
=
[
current_platform
.
fp8_dtype
()]
if
not
current_platform
.
is_rocm
()
else
[
None
]
NUM_TOKENS
=
[
1
,
17
,
86
,
1234
,
3045
]
# Arbitrary values for testing
HIDDEN_SIZES
=
[
16
,
48
,
128
,
1562
,
4096
]
# Arbitrary values for testing
SEEDS
=
[
0
]
...
...
tests/kernels/test_triton_flash_attention.py
→
tests/kernels/
un
test_triton_flash_attention.py
View file @
31584b45
...
...
@@ -7,8 +7,7 @@ Run `pytest tests/kernels/test_triton_flash_attention.py`.
import
pytest
import
torch
from
vllm.attention.ops.triton_flash_attention
import
(
SUPPORTED_LAYOUTS
,
MetaData
,
from
vllm.attention.ops.triton_flash_attention
import
(
MetaData
,
compute_alibi_tensor
,
scale_fp8
,
triton_attention_rocm
)
...
...
@@ -60,26 +59,26 @@ class ReferenceAttention:
ref_out
=
ref_out
.
transpose
(
1
,
2
).
clone
()
return
ref_out
def
fwd_fp8
(
self
,
q_quantized
,
k_quantized
,
v_quantized
):
q
=
(
q_quantized
.
to
(
torch
.
float16
)
*
self
.
input_metadata
.
q_descale
).
to
(
self
.
dtype
)
k
=
(
k_quantized
.
to
(
torch
.
float16
)
*
self
.
input_metadata
.
k_descale
).
to
(
self
.
dtype
)
v
=
(
v_quantized
.
to
(
torch
.
float16
)
*
self
.
input_metadata
.
v_descale
).
to
(
self
.
dtype
)
result
=
self
.
fwd
(
q
,
k
,
v
)
if
self
.
input_metadata
.
o_scale
is
not
None
:
result
,
_
=
scale_fp8
(
result
,
self
.
input_metadata
.
o_scale
)
return
result
def
fwd_fp8_kv
(
self
,
q
,
k_quantized
,
v_quantized
):
k_descale
,
v_descale
=
(
self
.
input_metadata
.
k_descale
,
self
.
input_metadata
.
v_descale
)
k_dequantized
=
(
k_quantized
.
to
(
torch
.
float32
)
*
k_descale
.
to
(
torch
.
float32
)).
to
(
self
.
dtype
)
v_dequantized
=
(
v_quantized
.
to
(
torch
.
float32
)
*
v_descale
.
to
(
torch
.
float32
)).
to
(
self
.
dtype
)
return
self
.
fwd
(
q
,
k_dequantized
,
v_dequantized
)
#
def fwd_fp8(self, q_quantized, k_quantized, v_quantized):
#
q = (q_quantized.to(torch.float16) * self.input_metadata.q_descale).to(
#
self.dtype)
#
k = (k_quantized.to(torch.float16) * self.input_metadata.k_descale).to(
#
self.dtype)
#
v = (v_quantized.to(torch.float16) * self.input_metadata.v_descale).to(
#
self.dtype)
#
result = self.fwd(q, k, v)
#
if self.input_metadata.o_scale is not None:
#
result, _ = scale_fp8(result, self.input_metadata.o_scale)
#
return result
#
def fwd_fp8_kv(self, q, k_quantized, v_quantized):
#
k_descale, v_descale = (self.input_metadata.k_descale,
#
self.input_metadata.v_descale)
#
k_dequantized = (k_quantized.to(torch.float32) *
#
k_descale.to(torch.float32)).to(self.dtype)
#
v_dequantized = (v_quantized.to(torch.float32) *
#
v_descale.to(torch.float32)).to(self.dtype)
#
return self.fwd(q, k_dequantized, v_dequantized)
def
varlen_fwd
(
self
,
q
,
k
,
v
,
is_mqa
=
False
):
ref_out
=
torch
.
empty_like
(
q
)
...
...
@@ -145,7 +144,7 @@ def input_helper(
use_o_scale
=
False
,
use_bias
=
False
,
):
assert
layout
in
SUPPORTED_LAYOUTS
,
"Got unsupported layout."
#
assert layout in SUPPORTED_LAYOUTS, "Got unsupported layout."
current_platform
.
seed_everything
(
0
)
...
...
tests/models/registry.py
View file @
31584b45
...
...
@@ -210,7 +210,7 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
"JambaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/AI21-Jamba-1.5-Mini"
),
extras
=
{
"tiny"
:
os
.
path
.
join
(
models_path_prefix
,
"ai21labs/Jamba-tiny-dev"
)}),
# noqa: E501
"LlamaForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-3.2-1B-Instruct"
),
extras
=
{
"guard"
:
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-Guard-3-1B"
,
# noqa: E501
extras
=
{
"guard"
:
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-Guard-3-1B"
)
,
# noqa: E501
"hermes"
:
os
.
path
.
join
(
models_path_prefix
,
"NousResearch/Hermes-3-Llama-3.1-8B"
),
# noqa: E501
"fp8"
:
os
.
path
.
join
(
models_path_prefix
,
"RedHatAI/Meta-Llama-3.1-8B-Instruct-FP8"
)}),
# noqa: E501
"LLaMAForCausalLM"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"decapoda-research/llama-7b-hf"
),
...
...
@@ -367,12 +367,12 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
),
"Idefics3ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceM4/Idefics3-8B-Llama3"
),
# noqa: E501
{
"tiny"
:
os
.
path
.
join
(
models_path_prefix
,
"HuggingFaceTB/SmolVLM-256M-Instruct"
)}),
# noqa: E501
"KeyeForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Kwai-Keye/Keye-VL-8B-Preview"
,
# noqa: E501
"KeyeForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"Kwai-Keye/Keye-VL-8B-Preview"
)
,
# noqa: E501
trust_remote_code
=
True
),
"KimiVLForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"moonshotai/Kimi-VL-A3B-Instruct"
),
# noqa: E501
extras
=
{
"thinking"
:
os
.
path
.
join
(
models_path_prefix
,
"moonshotai/Kimi-VL-A3B-Thinking"
)},
# noqa: E501
trust_remote_code
=
True
),
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
,
# noqa: E501
"Llama4ForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"meta-llama/Llama-4-Scout-17B-16E-Instruct"
)
,
# noqa: E501
max_model_len
=
10240
),
"LlavaForConditionalGeneration"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"llava-hf/llava-1.5-7b-hf"
),
extras
=
{
"mistral"
:
os
.
path
.
join
(
models_path_prefix
,
"mistral-community/pixtral-12b"
),
# noqa: E501
...
...
@@ -407,7 +407,7 @@ _MULTIMODAL_EXAMPLE_MODELS = {
trust_remote_code
=
True
,
max_transformers_version
=
"4.48"
,
transformers_version_reason
=
"Use of deprecated imports which have been removed."
,
# noqa: E501
extras
=
{
"phi3.5"
:
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
}
)),
# noqa: E501
extras
=
{
"phi3.5"
:
os
.
path
.
join
(
models_path_prefix
,
"microsoft/Phi-3.5-vision-instruct"
)
}
),
# noqa: E501
"Ovis"
:
_HfExamplesInfo
(
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis2-1B"
),
trust_remote_code
=
True
,
extras
=
{
"1.6-llama"
:
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis1.6-Llama3.2-3B"
),
"1.6-gemma"
:
os
.
path
.
join
(
models_path_prefix
,
"AIDC-AI/Ovis1.6-Gemma2-9B"
)}),
# noqa: E501
...
...
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment