Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
a3f8d5dd
"vllm/vscode:/vscode.git/clone" did not exist on "ec68d53b2b75eb5480270c67676b126079998f5a"
Commit
a3f8d5dd
authored
Dec 17, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.13.0rc2' into v0.13.0rc2-ori
parents
8d75f22e
f34eca5f
Changes
499
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
485 additions
and
34 deletions
+485
-34
tests/entrypoints/test_chat_utils.py
tests/entrypoints/test_chat_utils.py
+107
-6
tests/kernels/attention/test_cpu_attn.py
tests/kernels/attention/test_cpu_attn.py
+63
-10
tests/kernels/attention/test_cutlass_mla_decode.py
tests/kernels/attention/test_cutlass_mla_decode.py
+2
-2
tests/kernels/attention/test_flashinfer_trtllm_attention.py
tests/kernels/attention/test_flashinfer_trtllm_attention.py
+2
-2
tests/kernels/attention/test_triton_unified_attention.py
tests/kernels/attention/test_triton_unified_attention.py
+27
-0
tests/kernels/core/test_apply_rotary_emb.py
tests/kernels/core/test_apply_rotary_emb.py
+203
-0
tests/kernels/core/test_mrope.py
tests/kernels/core/test_mrope.py
+0
-2
tests/kernels/core/test_pos_encoding.py
tests/kernels/core/test_pos_encoding.py
+8
-4
tests/kernels/moe/modular_kernel_tools/common.py
tests/kernels/moe/modular_kernel_tools/common.py
+2
-1
tests/kernels/moe/test_batched_deepgemm.py
tests/kernels/moe/test_batched_deepgemm.py
+1
-1
tests/kernels/moe/test_batched_moe.py
tests/kernels/moe/test_batched_moe.py
+1
-0
tests/kernels/moe/test_block_fp8.py
tests/kernels/moe/test_block_fp8.py
+1
-1
tests/kernels/moe/test_cutlass_moe.py
tests/kernels/moe/test_cutlass_moe.py
+25
-2
tests/kernels/moe/test_deepep_deepgemm_moe.py
tests/kernels/moe/test_deepep_deepgemm_moe.py
+6
-0
tests/kernels/moe/test_deepep_moe.py
tests/kernels/moe/test_deepep_moe.py
+6
-0
tests/kernels/moe/test_deepgemm.py
tests/kernels/moe/test_deepgemm.py
+1
-1
tests/kernels/moe/test_flashinfer.py
tests/kernels/moe/test_flashinfer.py
+15
-0
tests/kernels/moe/test_flashinfer_moe.py
tests/kernels/moe/test_flashinfer_moe.py
+8
-1
tests/kernels/moe/test_gpt_oss_triton_kernels.py
tests/kernels/moe/test_gpt_oss_triton_kernels.py
+1
-1
tests/kernels/moe/test_modular_kernel_combinations.py
tests/kernels/moe/test_modular_kernel_combinations.py
+6
-0
No files found.
tests/entrypoints/test_chat_utils.py
View file @
a3f8d5dd
...
@@ -29,7 +29,8 @@ from vllm.multimodal.utils import (
...
@@ -29,7 +29,8 @@ from vllm.multimodal.utils import (
encode_image_base64
,
encode_image_base64
,
encode_video_base64
,
encode_video_base64
,
)
)
from
vllm.tokenizers
import
MistralTokenizer
,
get_tokenizer
from
vllm.tokenizers
import
get_tokenizer
from
vllm.tokenizers.mistral
import
MistralTokenizer
from
vllm.utils.serial_utils
import
tensor2base64
from
vllm.utils.serial_utils
import
tensor2base64
from
..models.registry
import
HF_EXAMPLE_MODELS
from
..models.registry
import
HF_EXAMPLE_MODELS
...
@@ -796,9 +797,13 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
...
@@ -796,9 +797,13 @@ def test_parse_chat_messages_empty_image_embeds_with_uuid(
"content"
:
"<|image_1|>
\n
What's in this image?"
,
"content"
:
"<|image_1|>
\n
What's in this image?"
,
}
}
]
]
assert
mm_data
is
not
None
assert
mm_data
is
not
None
assert
"image"
in
mm_data
assert
"image"
in
mm_data
assert
mm_data
[
"image"
]
is
None
assert
isinstance
(
mm_data
[
"image"
],
list
)
assert
len
(
mm_data
[
"image"
])
==
1
assert
mm_data
[
"image"
][
0
]
is
None
_assert_mm_uuids
(
mm_uuids
,
1
,
expected_uuids
=
[
uuid
])
_assert_mm_uuids
(
mm_uuids
,
1
,
expected_uuids
=
[
uuid
])
...
@@ -825,10 +830,11 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
...
@@ -825,10 +830,11 @@ def test_parse_chat_messages_empty_audio_embeds_with_uuid(
# Should have audio in mm_data as None (UUID provided)
# Should have audio in mm_data as None (UUID provided)
assert
mm_data
is
not
None
assert
mm_data
is
not
None
assert
"audio"
in
mm_data
assert
"audio"
in
mm_data
assert
mm_data
[
"audio"
]
is
None
assert
isinstance
(
mm_data
[
"audio"
],
list
)
assert
len
(
mm_data
[
"audio"
])
==
1
assert
mm_data
[
"audio"
][
0
]
is
None
# UUID should be recorded
# UUID should be recorded
assert
mm_uuids
is
not
None
assert
"audio"
in
mm_uuids
_assert_mm_uuids
(
mm_uuids
,
1
,
modality
=
"audio"
,
expected_uuids
=
[
uuid
])
_assert_mm_uuids
(
mm_uuids
,
1
,
modality
=
"audio"
,
expected_uuids
=
[
uuid
])
...
@@ -1121,10 +1127,105 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
...
@@ -1121,10 +1127,105 @@ async def test_parse_chat_messages_empty_image_embeds_with_uuid_async(
mm_data
=
await
mm_future
mm_data
=
await
mm_future
assert
mm_data
is
not
None
assert
mm_data
is
not
None
assert
"image"
in
mm_data
assert
"image"
in
mm_data
assert
mm_data
[
"image"
]
is
None
assert
isinstance
(
mm_data
[
"image"
],
list
)
assert
len
(
mm_data
[
"image"
])
==
1
assert
mm_data
[
"image"
][
0
]
is
None
_assert_mm_uuids
(
mm_uuids
,
1
,
expected_uuids
=
[
uuid
])
_assert_mm_uuids
(
mm_uuids
,
1
,
expected_uuids
=
[
uuid
])
def
test_parse_chat_messages_empty_dict_image_embeds
(
phi3v_model_config_image_embeds
,
):
"""Test that empty dictionary for image_embeds is handled without errors."""
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"image_embeds"
,
"image_embeds"
:
{}},
{
"type"
:
"text"
,
"text"
:
"What's in this image?"
},
],
}
],
phi3v_model_config_image_embeds
,
content_format
=
"string"
,
)
# Verify conversation structure
assert
conversation
==
[
{
"role"
:
"user"
,
"content"
:
"<|image_1|>
\n
What's in this image?"
,
}
]
# Verify mm_data contains an empty dictionary of embeddings
assert
mm_data
is
not
None
assert
"image"
in
mm_data
assert
isinstance
(
mm_data
[
"image"
],
dict
)
assert
len
(
mm_data
[
"image"
])
==
0
# Verify UUIDs (None since we didn't provide any)
_assert_mm_uuids
(
mm_uuids
,
1
,
expected_uuids
=
[
None
])
def
test_parse_chat_messages_multiple_dict_image_embeds
(
phi3v_model_config_image_embeds
,
):
"""Test that multiple dictionaries for image_embeds is handled without errors."""
# Create two sample image embedding tensors
batch_size
=
2
image_embedding_1
=
torch
.
randn
(
batch_size
,
256
,
1024
)
image_embedding_2
=
torch
.
randn
(
batch_size
,
3
)
conversation
,
mm_data
,
mm_uuids
=
parse_chat_messages
(
[
{
"role"
:
"user"
,
"content"
:
[
{
"type"
:
"image_embeds"
,
"image_embeds"
:
{
"image_embedding_1"
:
tensor2base64
(
p
),
"image_embedding_2"
:
tensor2base64
(
i
),
},
}
for
p
,
i
in
zip
(
image_embedding_1
,
image_embedding_2
)
]
+
[
{
"type"
:
"text"
,
"text"
:
"Describe these two images."
},
],
}
],
phi3v_model_config_image_embeds
,
content_format
=
"string"
,
)
# Verify conversation structure
assert
conversation
==
[
{
"role"
:
"user"
,
"content"
:
"<|image_1|>
\n
<|image_2|>
\n
Describe these two images."
,
}
]
# Verify mm_data contains a dictionary of multi-embeddings
assert
mm_data
is
not
None
assert
"image"
in
mm_data
assert
isinstance
(
mm_data
[
"image"
],
dict
)
assert
len
(
mm_data
[
"image"
])
==
batch_size
# Verify each embedding has the correct shape
assert
isinstance
(
mm_data
[
"image"
][
"image_embedding_1"
],
torch
.
Tensor
)
assert
mm_data
[
"image"
][
"image_embedding_1"
].
shape
==
image_embedding_1
.
shape
assert
isinstance
(
mm_data
[
"image"
][
"image_embedding_2"
],
torch
.
Tensor
)
assert
mm_data
[
"image"
][
"image_embedding_2"
].
shape
==
image_embedding_2
.
shape
# Verify UUIDs (None since we didn't provide any)
_assert_mm_uuids
(
mm_uuids
,
batch_size
,
expected_uuids
=
[
None
,
None
])
@
pytest
.
mark
.
asyncio
@
pytest
.
mark
.
asyncio
async
def
test_parse_chat_messages_multiple_images_async
(
async
def
test_parse_chat_messages_multiple_images_async
(
phi3v_model_config
,
phi3v_model_config
,
...
...
tests/kernels/attention/test_cpu_attn.py
View file @
a3f8d5dd
...
@@ -7,7 +7,8 @@ import math
...
@@ -7,7 +7,8 @@ import math
import
pytest
import
pytest
import
torch
import
torch
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
CpuArchEnum
,
current_platform
from
vllm.v1.attention.backends.cpu_attn
import
_get_attn_isa
if
not
current_platform
.
is_cpu
():
if
not
current_platform
.
is_cpu
():
pytest
.
skip
(
"skipping CPU-only tests"
,
allow_module_level
=
True
)
pytest
.
skip
(
"skipping CPU-only tests"
,
allow_module_level
=
True
)
...
@@ -36,6 +37,21 @@ SEQ_LENS = [ # (q_len, kv_len)
...
@@ -36,6 +37,21 @@ SEQ_LENS = [ # (q_len, kv_len)
]
]
def
get_attn_isa
(
block_size
:
int
|
None
=
None
,
dtype
:
torch
.
dtype
|
None
=
None
,
):
if
block_size
and
dtype
:
return
_get_attn_isa
(
dtype
,
block_size
)
else
:
if
current_platform
.
get_cpu_architecture
()
==
CpuArchEnum
.
ARM
:
return
"neon"
elif
torch
.
_C
.
_cpu
.
_is_amx_tile_supported
():
return
"amx"
else
:
return
"vec"
# rand number generation takes too much time, cache rand tensors
# rand number generation takes too much time, cache rand tensors
@
functools
.
lru_cache
(
maxsize
=
128
,
typed
=
False
)
@
functools
.
lru_cache
(
maxsize
=
128
,
typed
=
False
)
def
tensor_cache
(
def
tensor_cache
(
...
@@ -452,6 +468,49 @@ def test_varlen_with_paged_kv_normal_vec16(
...
@@ -452,6 +468,49 @@ def test_varlen_with_paged_kv_normal_vec16(
)
)
@
pytest
.
mark
.
parametrize
(
"seq_lens"
,
SEQ_LENS
)
@
pytest
.
mark
.
parametrize
(
"num_heads"
,
NUM_HEADS
)
@
pytest
.
mark
.
parametrize
(
"head_size"
,
HEAD_SIZES
)
@
pytest
.
mark
.
parametrize
(
"block_size"
,
[
96
,
128
])
@
pytest
.
mark
.
parametrize
(
"sliding_window"
,
SLIDING_WINDOWS
)
@
pytest
.
mark
.
parametrize
(
"dtype"
,
QTYPES
)
@
pytest
.
mark
.
parametrize
(
"soft_cap"
,
[
None
])
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
NUM_BLOCKS
)
@
pytest
.
mark
.
parametrize
(
"use_alibi"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"use_sink"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"isa"
,
[
"neon"
])
@
pytest
.
mark
.
skipif
(
current_platform
.
get_cpu_architecture
()
!=
CpuArchEnum
.
ARM
,
reason
=
"Not an Arm CPU."
,
)
def
test_varlen_with_paged_kv_normal_neon
(
seq_lens
:
list
[
tuple
[
int
,
int
]],
num_heads
:
tuple
[
int
,
int
],
head_size
:
int
,
sliding_window
:
int
|
None
,
dtype
:
torch
.
dtype
,
block_size
:
int
,
soft_cap
:
float
|
None
,
num_blocks
:
int
,
use_alibi
:
bool
,
use_sink
:
bool
,
isa
:
str
,
)
->
None
:
varlen_with_paged_kv
(
seq_lens
=
seq_lens
,
num_heads
=
num_heads
,
head_size
=
head_size
,
sliding_window
=
sliding_window
,
dtype
=
dtype
,
block_size
=
block_size
,
soft_cap
=
soft_cap
,
num_blocks
=
num_blocks
,
use_alibi
=
use_alibi
,
use_sink
=
use_sink
,
isa
=
isa
,
)
@
pytest
.
mark
.
parametrize
(
"seq_lens"
,
SEQ_LENS
)
@
pytest
.
mark
.
parametrize
(
"seq_lens"
,
SEQ_LENS
)
@
pytest
.
mark
.
parametrize
(
"num_heads"
,
NUM_HEADS
)
@
pytest
.
mark
.
parametrize
(
"num_heads"
,
NUM_HEADS
)
@
pytest
.
mark
.
parametrize
(
"head_size"
,
[
96
])
@
pytest
.
mark
.
parametrize
(
"head_size"
,
[
96
])
...
@@ -462,9 +521,7 @@ def test_varlen_with_paged_kv_normal_vec16(
...
@@ -462,9 +521,7 @@ def test_varlen_with_paged_kv_normal_vec16(
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
NUM_BLOCKS
)
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
NUM_BLOCKS
)
@
pytest
.
mark
.
parametrize
(
"use_alibi"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"use_alibi"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"use_sink"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"use_sink"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"isa"
,
[
get_attn_isa
()])
"isa"
,
[
"amx"
]
if
torch
.
_C
.
_cpu
.
_is_amx_tile_supported
()
else
[
"vec"
]
)
def
test_varlen_with_paged_kv_softcap
(
def
test_varlen_with_paged_kv_softcap
(
seq_lens
:
list
[
tuple
[
int
,
int
]],
seq_lens
:
list
[
tuple
[
int
,
int
]],
num_heads
:
tuple
[
int
,
int
],
num_heads
:
tuple
[
int
,
int
],
...
@@ -503,9 +560,7 @@ def test_varlen_with_paged_kv_softcap(
...
@@ -503,9 +560,7 @@ def test_varlen_with_paged_kv_softcap(
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
NUM_BLOCKS
)
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
NUM_BLOCKS
)
@
pytest
.
mark
.
parametrize
(
"use_alibi"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"use_alibi"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"use_sink"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"use_sink"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"isa"
,
[
get_attn_isa
()])
"isa"
,
[
"amx"
]
if
torch
.
_C
.
_cpu
.
_is_amx_tile_supported
()
else
[
"vec"
]
)
def
test_varlen_with_paged_kv_alibi
(
def
test_varlen_with_paged_kv_alibi
(
seq_lens
:
list
[
tuple
[
int
,
int
]],
seq_lens
:
list
[
tuple
[
int
,
int
]],
num_heads
:
tuple
[
int
,
int
],
num_heads
:
tuple
[
int
,
int
],
...
@@ -544,9 +599,7 @@ def test_varlen_with_paged_kv_alibi(
...
@@ -544,9 +599,7 @@ def test_varlen_with_paged_kv_alibi(
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
NUM_BLOCKS
)
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
NUM_BLOCKS
)
@
pytest
.
mark
.
parametrize
(
"use_alibi"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"use_alibi"
,
[
False
])
@
pytest
.
mark
.
parametrize
(
"use_sink"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
"use_sink"
,
[
True
])
@
pytest
.
mark
.
parametrize
(
@
pytest
.
mark
.
parametrize
(
"isa"
,
[
get_attn_isa
()])
"isa"
,
[
"amx"
]
if
torch
.
_C
.
_cpu
.
_is_amx_tile_supported
()
else
[
"vec"
]
)
def
test_varlen_with_paged_kv_sink
(
def
test_varlen_with_paged_kv_sink
(
seq_lens
:
list
[
tuple
[
int
,
int
]],
seq_lens
:
list
[
tuple
[
int
,
int
]],
num_heads
:
tuple
[
int
,
int
],
num_heads
:
tuple
[
int
,
int
],
...
...
tests/kernels/attention/test_cutlass_mla_decode.py
View file @
a3f8d5dd
...
@@ -32,8 +32,8 @@ def cal_diff(
...
@@ -32,8 +32,8 @@ def cal_diff(
CUTLASS_MLA_UNSUPPORTED_REASON
=
(
CUTLASS_MLA_UNSUPPORTED_REASON
=
(
"Cutlass MLA Requires compute capability of 10 or above."
"Cutlass MLA Requires compute capability of 10
0
or above."
if
not
current_platform
.
is_device_capability
(
100
)
if
not
current_platform
.
is_device_capability
_family
(
100
)
else
"Cutlass MLA is supported"
else
"Cutlass MLA is supported"
)
)
...
...
tests/kernels/attention/test_flashinfer_trtllm_attention.py
View file @
a3f8d5dd
...
@@ -11,7 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
...
@@ -11,7 +11,7 @@ from tests.kernels.quantization.nvfp4_utils import (
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.math_utils
import
round_up
from
vllm.utils.math_utils
import
round_up
if
not
current_platform
.
is_device_capability
(
100
):
if
not
current_platform
.
is_device_capability
_family
(
100
):
pytest
.
skip
(
pytest
.
skip
(
"This TRTLLM kernel requires NVIDIA Blackwell."
,
allow_module_level
=
True
"This TRTLLM kernel requires NVIDIA Blackwell."
,
allow_module_level
=
True
)
)
...
@@ -443,7 +443,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
...
@@ -443,7 +443,7 @@ def test_flashinfer_trtllm_prefill_with_baseline(
output_trtllm
=
output_trtllm
.
reshape
(
-
1
,
query
.
shape
[
1
],
query
.
shape
[
2
])
output_trtllm
=
output_trtllm
.
reshape
(
-
1
,
query
.
shape
[
1
],
query
.
shape
[
2
])
if
q_quant_dtype
==
FP8_DTYPE
and
o_quant_dtype
==
FP4_DTYPE
:
if
q_quant_dtype
==
FP8_DTYPE
and
o_quant_dtype
==
FP4_DTYPE
:
rtol
,
atol
=
1
e-1
,
2
e-1
rtol
,
atol
=
3
e-1
,
4
e-1
elif
q_quant_dtype
==
FP8_DTYPE
and
o_quant_dtype
==
FP8_DTYPE
:
elif
q_quant_dtype
==
FP8_DTYPE
and
o_quant_dtype
==
FP8_DTYPE
:
rtol
,
atol
=
4e-2
,
6e-2
rtol
,
atol
=
4e-2
,
6e-2
elif
q_quant_dtype
==
FP8_DTYPE
and
o_quant_dtype
==
dtype
:
elif
q_quant_dtype
==
FP8_DTYPE
and
o_quant_dtype
==
dtype
:
...
...
tests/kernels/attention/test_triton_unified_attention.py
View file @
a3f8d5dd
...
@@ -7,6 +7,7 @@ import torch
...
@@ -7,6 +7,7 @@ import torch
from
vllm.attention.ops.triton_unified_attention
import
unified_attention
from
vllm.attention.ops.triton_unified_attention
import
unified_attention
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.math_utils
import
next_power_of_2
NUM_HEADS
=
[(
4
,
4
),
(
8
,
2
)]
NUM_HEADS
=
[(
4
,
4
),
(
8
,
2
)]
HEAD_SIZES
=
[
128
,
256
]
HEAD_SIZES
=
[
128
,
256
]
...
@@ -22,6 +23,10 @@ QDTYPES = (
...
@@ -22,6 +23,10 @@ QDTYPES = (
# one value small enough to test the schema op check
# one value small enough to test the schema op check
NUM_BLOCKS
=
[
32768
,
2048
]
NUM_BLOCKS
=
[
32768
,
2048
]
# 0: use 2D kernel for decode
# 8: use 3D kernel for decode
SEQ_THRESHOLD_3D_VALUES
=
[
0
,
8
]
def
ref_paged_attn
(
def
ref_paged_attn
(
query
:
torch
.
Tensor
,
query
:
torch
.
Tensor
,
...
@@ -92,6 +97,7 @@ def ref_paged_attn(
...
@@ -92,6 +97,7 @@ def ref_paged_attn(
@
pytest
.
mark
.
parametrize
(
"soft_cap"
,
[
None
,
50.0
])
@
pytest
.
mark
.
parametrize
(
"soft_cap"
,
[
None
,
50.0
])
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
NUM_BLOCKS
)
@
pytest
.
mark
.
parametrize
(
"num_blocks"
,
NUM_BLOCKS
)
@
pytest
.
mark
.
parametrize
(
"q_dtype"
,
QDTYPES
)
@
pytest
.
mark
.
parametrize
(
"q_dtype"
,
QDTYPES
)
@
pytest
.
mark
.
parametrize
(
"seq_threshold_3D"
,
SEQ_THRESHOLD_3D_VALUES
)
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_triton_unified_attn
(
def
test_triton_unified_attn
(
seq_lens
:
list
[
tuple
[
int
,
int
]],
seq_lens
:
list
[
tuple
[
int
,
int
]],
...
@@ -103,6 +109,7 @@ def test_triton_unified_attn(
...
@@ -103,6 +109,7 @@ def test_triton_unified_attn(
soft_cap
:
float
|
None
,
soft_cap
:
float
|
None
,
num_blocks
:
int
,
num_blocks
:
int
,
q_dtype
:
torch
.
dtype
|
None
,
q_dtype
:
torch
.
dtype
|
None
,
seq_threshold_3D
:
int
,
)
->
None
:
)
->
None
:
torch
.
set_default_device
(
"cuda"
)
torch
.
set_default_device
(
"cuda"
)
...
@@ -152,6 +159,21 @@ def test_triton_unified_attn(
...
@@ -152,6 +159,21 @@ def test_triton_unified_attn(
k_descale
=
torch
.
rand
(
scale_shape
,
dtype
=
torch
.
float32
)
k_descale
=
torch
.
rand
(
scale_shape
,
dtype
=
torch
.
float32
)
v_descale
=
torch
.
rand
(
scale_shape
,
dtype
=
torch
.
float32
)
v_descale
=
torch
.
rand
(
scale_shape
,
dtype
=
torch
.
float32
)
num_par_softmax_segments
=
16
head_size_padded
=
next_power_of_2
(
head_size
)
softmax_segm_output
=
torch
.
empty
(
(
seq_threshold_3D
,
num_query_heads
,
num_par_softmax_segments
,
head_size_padded
),
dtype
=
torch
.
float32
,
)
softmax_segm_max
=
torch
.
empty
(
(
seq_threshold_3D
,
num_query_heads
,
num_par_softmax_segments
),
dtype
=
torch
.
float32
,
)
softmax_segm_expsum
=
torch
.
empty
(
(
seq_threshold_3D
,
num_query_heads
,
num_par_softmax_segments
),
dtype
=
torch
.
float32
,
)
unified_attention
(
unified_attention
(
q
=
maybe_quantized_query
,
q
=
maybe_quantized_query
,
k
=
maybe_quantized_key_cache
,
k
=
maybe_quantized_key_cache
,
...
@@ -169,6 +191,11 @@ def test_triton_unified_attn(
...
@@ -169,6 +191,11 @@ def test_triton_unified_attn(
q_descale
=
q_descale
,
q_descale
=
q_descale
,
k_descale
=
k_descale
,
k_descale
=
k_descale
,
v_descale
=
v_descale
,
v_descale
=
v_descale
,
seq_threshold_3D
=
seq_threshold_3D
,
num_par_softmax_segments
=
num_par_softmax_segments
,
softmax_segm_output
=
softmax_segm_output
,
softmax_segm_max
=
softmax_segm_max
,
softmax_segm_expsum
=
softmax_segm_expsum
,
)
)
ref_output
=
ref_paged_attn
(
ref_output
=
ref_paged_attn
(
...
...
tests/kernels/core/test_apply_rotary_emb.py
0 → 100644
View file @
a3f8d5dd
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for ApplyRotaryEmb CustomOp dispatch behavior.
This test ensures that RotaryEmbedding classes correctly call the appropriate
ApplyRotaryEmb methods based on the calling context:
1. RotaryEmbedding.forward_native() -> ApplyRotaryEmb.forward_native()
2. RotaryEmbedding.forward_cuda() -> ApplyRotaryEmb.forward() (auto-dispatch)
3. RotaryEmbedding.forward_hip() -> ApplyRotaryEmb.forward() (auto-dispatch)
"""
from
dataclasses
import
dataclass
import
pytest
import
torch
from
vllm.config
import
(
CompilationConfig
,
VllmConfig
,
get_cached_compilation_config
,
set_current_vllm_config
,
)
from
vllm.platforms
import
current_platform
CUDA_DEVICES
=
[
"cuda:0"
]
@
dataclass
class
RotaryEmbeddingTestCase
:
"""Test case configuration for RotaryEmbedding dispatch tests."""
name
:
str
rope_class
:
type
rope_kwargs
:
dict
method_name
:
str
# forward_native, forward_cuda, forward
positions_shape
:
tuple
# (num_tokens,) or (3, num_tokens) or (4, num_tokens)
expect_forward_native
:
bool
# Should call ApplyRotaryEmb.forward_native()
expect_forward
:
bool
# Should call ApplyRotaryEmb.forward()
def
get_test_cases
()
->
list
[
RotaryEmbeddingTestCase
]:
"""Generate test cases for all RotaryEmbedding classes."""
from
vllm.model_executor.layers.rotary_embedding.ernie45_vl_rope
import
(
Ernie4_5_VLRotaryEmbedding
,
)
from
vllm.model_executor.layers.rotary_embedding.mrope
import
MRotaryEmbedding
from
vllm.model_executor.layers.rotary_embedding.xdrope
import
XDRotaryEmbedding
common_kwargs
=
{
"head_size"
:
128
,
"rotary_dim"
:
128
,
"max_position_embeddings"
:
4096
,
"base"
:
10000
,
"is_neox_style"
:
True
,
"dtype"
:
torch
.
bfloat16
,
}
return
[
# MRotaryEmbedding tests
RotaryEmbeddingTestCase
(
name
=
"MRotaryEmbedding.forward_native"
,
rope_class
=
MRotaryEmbedding
,
rope_kwargs
=
{
**
common_kwargs
,
"mrope_section"
:
[
16
,
24
,
24
]},
method_name
=
"forward_native"
,
positions_shape
=
(
3
,
32
),
# 2D for multimodal
expect_forward_native
=
True
,
expect_forward
=
False
,
),
RotaryEmbeddingTestCase
(
name
=
"MRotaryEmbedding.forward_cuda_1d"
,
rope_class
=
MRotaryEmbedding
,
rope_kwargs
=
{
**
common_kwargs
,
"mrope_section"
:
[
16
,
24
,
24
]},
method_name
=
"forward_cuda"
,
positions_shape
=
(
32
,),
# 1D triggers apply_rotary_emb path
expect_forward_native
=
False
,
expect_forward
=
True
,
),
# XDRotaryEmbedding tests
RotaryEmbeddingTestCase
(
name
=
"XDRotaryEmbedding.forward"
,
rope_class
=
XDRotaryEmbedding
,
rope_kwargs
=
{
**
common_kwargs
,
"scaling_alpha"
:
1.0
,
"xdrope_section"
:
[
16
,
16
,
16
,
16
],
},
method_name
=
"forward"
,
positions_shape
=
(
4
,
32
),
# 4D for P/W/H/T
expect_forward_native
=
False
,
expect_forward
=
True
,
),
# Ernie4_5_VLRotaryEmbedding tests
RotaryEmbeddingTestCase
(
name
=
"Ernie4_5_VLRotaryEmbedding.forward_native"
,
rope_class
=
Ernie4_5_VLRotaryEmbedding
,
rope_kwargs
=
{
**
common_kwargs
,
"mrope_section"
:
[
22
,
22
,
20
]},
method_name
=
"forward_native"
,
positions_shape
=
(
3
,
32
),
# 2D for multimodal
expect_forward_native
=
True
,
expect_forward
=
False
,
),
]
def
run_dispatch_test
(
test_case
:
RotaryEmbeddingTestCase
,
device
:
str
,
):
"""Run a dispatch test for a RotaryEmbedding class."""
vllm_config
=
VllmConfig
(
compilation_config
=
CompilationConfig
(
custom_ops
=
[
"all"
,
"+apply_rotary_emb"
])
)
get_cached_compilation_config
.
cache_clear
()
with
set_current_vllm_config
(
vllm_config
):
rope
=
test_case
.
rope_class
(
**
test_case
.
rope_kwargs
).
to
(
device
=
device
)
apply_rotary_emb
=
rope
.
apply_rotary_emb
# Verify custom op is enabled
if
test_case
.
expect_forward_native
:
assert
(
apply_rotary_emb
.
_forward_method
!=
apply_rotary_emb
.
forward_native
),
"Test setup error: ApplyRotaryEmb custom op should be enabled"
# Setup call tracking
call_tracker
=
{
"forward_native_called"
:
False
,
"forward_called"
:
False
}
original_forward_native
=
apply_rotary_emb
.
forward_native
original_forward
=
apply_rotary_emb
.
forward
def
tracked_forward_native
(
*
args
,
**
kwargs
):
call_tracker
[
"forward_native_called"
]
=
True
return
original_forward_native
(
*
args
,
**
kwargs
)
def
tracked_forward
(
*
args
,
**
kwargs
):
call_tracker
[
"forward_called"
]
=
True
return
original_forward
(
*
args
,
**
kwargs
)
apply_rotary_emb
.
forward_native
=
tracked_forward_native
apply_rotary_emb
.
forward
=
tracked_forward
try
:
num_tokens
=
test_case
.
positions_shape
[
-
1
]
num_q_heads
=
8
num_kv_heads
=
2
head_size
=
test_case
.
rope_kwargs
[
"head_size"
]
max_position
=
test_case
.
rope_kwargs
[
"max_position_embeddings"
]
positions
=
torch
.
randint
(
0
,
max_position
//
4
,
test_case
.
positions_shape
,
device
=
device
)
query
=
torch
.
randn
(
num_tokens
,
num_q_heads
*
head_size
,
dtype
=
torch
.
bfloat16
,
device
=
device
)
key
=
torch
.
randn
(
num_tokens
,
num_kv_heads
*
head_size
,
dtype
=
torch
.
bfloat16
,
device
=
device
,
)
# Call the method under test
method
=
getattr
(
rope
,
test_case
.
method_name
)
method
(
positions
,
query
.
clone
(),
key
.
clone
())
# Verify expectations
if
test_case
.
expect_forward_native
:
assert
call_tracker
[
"forward_native_called"
],
(
f
"
{
test_case
.
name
}
should call ApplyRotaryEmb.forward_native()"
)
if
not
test_case
.
expect_forward
:
assert
not
call_tracker
[
"forward_called"
],
(
f
"
{
test_case
.
name
}
should NOT call ApplyRotaryEmb.forward(). "
"Bug: when +apply_rotary_emb is enabled, forward_native() "
"incorrectly dispatches to CUDA/HIP kernels."
)
if
test_case
.
expect_forward
:
assert
call_tracker
[
"forward_called"
],
(
f
"
{
test_case
.
name
}
should call ApplyRotaryEmb.forward()"
)
finally
:
apply_rotary_emb
.
forward_native
=
original_forward_native
apply_rotary_emb
.
forward
=
original_forward
@
pytest
.
mark
.
skipif
(
not
current_platform
.
is_cuda_alike
(),
reason
=
"Skipping CUDA/ROCm only tests."
)
@
pytest
.
mark
.
parametrize
(
"test_case"
,
get_test_cases
(),
ids
=
lambda
tc
:
tc
.
name
)
@
pytest
.
mark
.
parametrize
(
"device"
,
CUDA_DEVICES
)
def
test_rotary_embedding_dispatch
(
test_case
:
RotaryEmbeddingTestCase
,
device
:
str
,
):
"""
Test that RotaryEmbedding classes dispatch to the correct ApplyRotaryEmb method.
- forward_native methods should call ApplyRotaryEmb.forward_native()
- forward_cuda/forward methods should call ApplyRotaryEmb.forward()
"""
run_dispatch_test
(
test_case
,
device
)
tests/kernels/core/test_mrope.py
View file @
a3f8d5dd
...
@@ -116,7 +116,6 @@ def test_mrope(
...
@@ -116,7 +116,6 @@ def test_mrope(
mrope_helper_class
=
get_rope
(
mrope_helper_class
=
get_rope
(
head_size
=
head_dim
,
head_size
=
head_dim
,
rotary_dim
=
head_dim
,
max_position
=
max_position
,
max_position
=
max_position
,
is_neox_style
=
is_neox_style
,
is_neox_style
=
is_neox_style
,
rope_parameters
=
config
.
rope_parameters
,
rope_parameters
=
config
.
rope_parameters
,
...
@@ -185,7 +184,6 @@ def test_mrope_torch_compile_tracing(
...
@@ -185,7 +184,6 @@ def test_mrope_torch_compile_tracing(
mrope_helper_class
=
get_rope
(
mrope_helper_class
=
get_rope
(
head_size
=
head_dim
,
head_size
=
head_dim
,
rotary_dim
=
head_dim
,
max_position
=
max_position
,
max_position
=
max_position
,
is_neox_style
=
is_neox_style
,
is_neox_style
=
is_neox_style
,
rope_parameters
=
config
.
rope_parameters
,
rope_parameters
=
config
.
rope_parameters
,
...
...
tests/kernels/core/test_pos_encoding.py
View file @
a3f8d5dd
...
@@ -83,8 +83,12 @@ def test_rotary_embedding(
...
@@ -83,8 +83,12 @@ def test_rotary_embedding(
torch
.
set_default_device
(
device
)
torch
.
set_default_device
(
device
)
if
rotary_dim
is
None
:
if
rotary_dim
is
None
:
rotary_dim
=
head_size
rotary_dim
=
head_size
rope_parameters
=
{
"rope_type"
:
"default"
,
"rope_theta"
:
rope_theta
}
rope_parameters
=
{
rope
=
get_rope
(
head_size
,
rotary_dim
,
max_position
,
is_neox_style
,
rope_parameters
)
"rope_type"
:
"default"
,
"rope_theta"
:
rope_theta
,
"partial_rotary_factor"
:
rotary_dim
/
head_size
,
}
rope
=
get_rope
(
head_size
,
max_position
,
is_neox_style
,
rope_parameters
)
rope
=
rope
.
to
(
dtype
=
dtype
,
device
=
torch
.
get_default_device
())
rope
=
rope
.
to
(
dtype
=
dtype
,
device
=
torch
.
get_default_device
())
positions
=
torch
.
randint
(
0
,
max_position
,
(
batch_size
,
seq_len
))
positions
=
torch
.
randint
(
0
,
max_position
,
(
batch_size
,
seq_len
))
...
@@ -150,9 +154,9 @@ def test_rope_module_cache():
...
@@ -150,9 +154,9 @@ def test_rope_module_cache():
if
rotary_dim
is
None
:
if
rotary_dim
is
None
:
rotary_dim
=
head_size
rotary_dim
=
head_size
rope_parameters
[
"rope_theta"
]
=
rope_theta
rope_parameters
[
"rope_theta"
]
=
rope_theta
rope_parameters
[
"partial_rotary_factor"
]
=
rotary_dim
/
head_size
rope
=
get_rope
(
rope
=
get_rope
(
head_size
,
head_size
,
rotary_dim
,
max_position
,
max_position
,
is_neox_style
,
is_neox_style
,
rope_parameters
,
rope_parameters
,
...
@@ -177,9 +181,9 @@ def test_rope_module_cache():
...
@@ -177,9 +181,9 @@ def test_rope_module_cache():
if
rotary_dim
is
None
:
if
rotary_dim
is
None
:
rotary_dim
=
head_size
rotary_dim
=
head_size
rope_parameters
[
"rope_theta"
]
=
rope_theta
rope_parameters
[
"rope_theta"
]
=
rope_theta
rope_parameters
[
"partial_rotary_factor"
]
=
rotary_dim
/
head_size
rope
=
get_rope
(
rope
=
get_rope
(
head_size
,
head_size
,
rotary_dim
,
max_position
,
max_position
,
is_neox_style
,
is_neox_style
,
rope_parameters
,
rope_parameters
,
...
...
tests/kernels/moe/modular_kernel_tools/common.py
View file @
a3f8d5dd
...
@@ -594,7 +594,8 @@ def make_modular_kernel(
...
@@ -594,7 +594,8 @@ def make_modular_kernel(
)
)
modular_kernel
=
mk
.
FusedMoEModularKernel
(
modular_kernel
=
mk
.
FusedMoEModularKernel
(
prepare_finalize
=
prepare_finalize
,
fused_experts
=
fused_experts
prepare_finalize
=
prepare_finalize
,
fused_experts
=
fused_experts
,
)
)
return
modular_kernel
return
modular_kernel
...
...
tests/kernels/moe/test_batched_deepgemm.py
View file @
a3f8d5dd
...
@@ -27,7 +27,7 @@ BLOCK_SIZE = [128, 128]
...
@@ -27,7 +27,7 @@ BLOCK_SIZE = [128, 128]
@
pytest
.
mark
.
parametrize
(
"N"
,
[
512
,
1024
])
# intermediate dim per expert
@
pytest
.
mark
.
parametrize
(
"N"
,
[
512
,
1024
])
# intermediate dim per expert
@
pytest
.
mark
.
parametrize
(
"topk"
,
[
2
,
4
])
@
pytest
.
mark
.
parametrize
(
"topk"
,
[
2
,
4
])
def
test_batched_deepgemm_vs_triton
(
def
test_batched_deepgemm_vs_triton
(
E
:
int
,
T
:
int
,
K
:
int
,
N
:
int
,
topk
:
int
,
monkeypatch
E
:
int
,
T
:
int
,
K
:
int
,
N
:
int
,
topk
:
int
,
monkeypatch
,
workspace_init
):
):
"""Compare BatchedDeepGemmExperts to BatchedTritonExperts."""
"""Compare BatchedDeepGemmExperts to BatchedTritonExperts."""
...
...
tests/kernels/moe/test_batched_moe.py
View file @
a3f8d5dd
...
@@ -248,6 +248,7 @@ def test_fused_moe_batched_experts(
...
@@ -248,6 +248,7 @@ def test_fused_moe_batched_experts(
per_act_token_quant
:
bool
,
per_act_token_quant
:
bool
,
block_shape
:
list
[
int
]
|
None
,
block_shape
:
list
[
int
]
|
None
,
input_scales
:
bool
,
input_scales
:
bool
,
workspace_init
,
):
):
"""Note: float8_e4m3fn is not supported on CUDA architecture < 89,
"""Note: float8_e4m3fn is not supported on CUDA architecture < 89,
and those tests will be skipped on unsupported hardware."""
and those tests will be skipped on unsupported hardware."""
...
...
tests/kernels/moe/test_block_fp8.py
View file @
a3f8d5dd
...
@@ -137,7 +137,7 @@ def setup_cuda():
...
@@ -137,7 +137,7 @@ def setup_cuda():
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
pytest
.
mark
.
parametrize
(
"seed"
,
SEEDS
)
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_w8a8_block_fp8_fused_moe
(
def
test_w8a8_block_fp8_fused_moe
(
M
,
N
,
K
,
E
,
topk
,
block_size
,
dtype
,
seed
,
monkeypatch
M
,
N
,
K
,
E
,
topk
,
block_size
,
dtype
,
seed
,
monkeypatch
,
workspace_init
):
):
if
topk
>
E
:
if
topk
>
E
:
pytest
.
skip
(
f
"Skipping test; topk=
{
topk
}
> E=
{
E
}
"
)
pytest
.
skip
(
f
"Skipping test; topk=
{
topk
}
> E=
{
E
}
"
)
...
...
tests/kernels/moe/test_cutlass_moe.py
View file @
a3f8d5dd
...
@@ -274,6 +274,7 @@ def test_cutlass_moe_8_bit_no_graph(
...
@@ -274,6 +274,7 @@ def test_cutlass_moe_8_bit_no_graph(
per_act_token
:
bool
,
per_act_token
:
bool
,
per_out_ch
:
bool
,
per_out_ch
:
bool
,
monkeypatch
,
monkeypatch
,
workspace_init
,
ep_size
:
int
|
None
=
None
,
ep_size
:
int
|
None
=
None
,
):
):
current_platform
.
seed_everything
(
7
)
current_platform
.
seed_everything
(
7
)
...
@@ -329,6 +330,7 @@ def test_cutlass_moe_8_bit_cuda_graph(
...
@@ -329,6 +330,7 @@ def test_cutlass_moe_8_bit_cuda_graph(
per_act_token
:
bool
,
per_act_token
:
bool
,
per_out_ch
:
bool
,
per_out_ch
:
bool
,
monkeypatch
,
monkeypatch
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
7
)
current_platform
.
seed_everything
(
7
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
...
@@ -385,9 +387,19 @@ def test_cutlass_moe_8_bit_EP(
...
@@ -385,9 +387,19 @@ def test_cutlass_moe_8_bit_EP(
per_out_channel
:
bool
,
per_out_channel
:
bool
,
ep_size
:
int
,
ep_size
:
int
,
monkeypatch
,
monkeypatch
,
workspace_init
,
):
):
test_cutlass_moe_8_bit_no_graph
(
test_cutlass_moe_8_bit_no_graph
(
m
,
n
,
k
,
e
,
topk
,
per_act_token
,
per_out_channel
,
monkeypatch
,
ep_size
m
,
n
,
k
,
e
,
topk
,
per_act_token
,
per_out_channel
,
monkeypatch
,
workspace_init
,
ep_size
,
)
)
...
@@ -419,9 +431,19 @@ def test_cutlass_moe_8_bit_EP_large(
...
@@ -419,9 +431,19 @@ def test_cutlass_moe_8_bit_EP_large(
per_out_channel
:
bool
,
per_out_channel
:
bool
,
ep_size
:
int
,
ep_size
:
int
,
monkeypatch
,
monkeypatch
,
workspace_init
,
):
):
test_cutlass_moe_8_bit_no_graph
(
test_cutlass_moe_8_bit_no_graph
(
m
,
n
,
k
,
e
,
topk
,
per_act_token
,
per_out_channel
,
monkeypatch
,
ep_size
m
,
n
,
k
,
e
,
topk
,
per_act_token
,
per_out_channel
,
monkeypatch
,
workspace_init
,
ep_size
,
)
)
...
@@ -445,6 +467,7 @@ def test_run_cutlass_moe_fp8(
...
@@ -445,6 +467,7 @@ def test_run_cutlass_moe_fp8(
per_act_token
:
bool
,
per_act_token
:
bool
,
per_out_channel
:
bool
,
per_out_channel
:
bool
,
ep_size
:
int
,
ep_size
:
int
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
7
)
current_platform
.
seed_everything
(
7
)
with
set_current_vllm_config
(
vllm_config
):
with
set_current_vllm_config
(
vllm_config
):
...
...
tests/kernels/moe/test_deepep_deepgemm_moe.py
View file @
a3f8d5dd
...
@@ -29,6 +29,7 @@ from vllm.utils.deep_gemm import (
...
@@ -29,6 +29,7 @@ from vllm.utils.deep_gemm import (
is_deep_gemm_supported
,
is_deep_gemm_supported
,
)
)
from
vllm.utils.import_utils
import
has_deep_ep
,
has_deep_gemm
from
vllm.utils.import_utils
import
has_deep_ep
,
has_deep_gemm
from
vllm.v1.worker.workspace
import
init_workspace_manager
from
...utils
import
multi_gpu_test
from
...utils
import
multi_gpu_test
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch
...
@@ -363,6 +364,9 @@ def _test_deepep_deepgemm_moe(
...
@@ -363,6 +364,9 @@ def _test_deepep_deepgemm_moe(
w1_scale
:
torch
.
Tensor
,
w1_scale
:
torch
.
Tensor
,
w2_scale
:
torch
.
Tensor
,
w2_scale
:
torch
.
Tensor
,
):
):
device
=
torch
.
device
(
f
"cuda:
{
pgi
.
local_rank
}
"
)
init_workspace_manager
(
device
)
current_platform
.
seed_everything
(
pgi
.
rank
)
current_platform
.
seed_everything
(
pgi
.
rank
)
w1
=
w1
.
to
(
device
=
torch
.
cuda
.
current_device
())
w1
=
w1
.
to
(
device
=
torch
.
cuda
.
current_device
())
...
@@ -445,6 +449,7 @@ def test_ht_deepep_deepgemm_moe(
...
@@ -445,6 +449,7 @@ def test_ht_deepep_deepgemm_moe(
topk
:
int
,
topk
:
int
,
world_dp_size
:
tuple
[
int
,
int
],
world_dp_size
:
tuple
[
int
,
int
],
disable_deepgemm_ue8m0
,
disable_deepgemm_ue8m0
,
workspace_init
,
):
):
"""
"""
Tests for High-Throughput DeepEP + DeepGemm integration.
Tests for High-Throughput DeepEP + DeepGemm integration.
...
@@ -518,6 +523,7 @@ def test_ll_deepep_deepgemm_moe(
...
@@ -518,6 +523,7 @@ def test_ll_deepep_deepgemm_moe(
block_size
:
list
[
int
],
block_size
:
list
[
int
],
world_dp_size
:
tuple
[
int
,
int
],
world_dp_size
:
tuple
[
int
,
int
],
disable_deepgemm_ue8m0
,
disable_deepgemm_ue8m0
,
workspace_init
,
):
):
"""
"""
Tests for Low-Latency DeepEP + DeepGemm integration.
Tests for Low-Latency DeepEP + DeepGemm integration.
...
...
tests/kernels/moe/test_deepep_moe.py
View file @
a3f8d5dd
...
@@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
...
@@ -22,6 +22,7 @@ from vllm.model_executor.layers.quantization.utils.fp8_utils import (
)
)
from
vllm.platforms
import
current_platform
from
vllm.platforms
import
current_platform
from
vllm.utils.import_utils
import
has_deep_ep
from
vllm.utils.import_utils
import
has_deep_ep
from
vllm.v1.worker.workspace
import
init_workspace_manager
from
...utils
import
multi_gpu_test
from
...utils
import
multi_gpu_test
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch
from
.parallel_utils
import
ProcessGroupInfo
,
parallel_launch
...
@@ -342,6 +343,9 @@ def _deep_ep_moe(
...
@@ -342,6 +343,9 @@ def _deep_ep_moe(
use_fp8_dispatch
:
bool
,
use_fp8_dispatch
:
bool
,
per_act_token_quant
:
bool
,
per_act_token_quant
:
bool
,
):
):
device
=
torch
.
device
(
f
"cuda:
{
pgi
.
local_rank
}
"
)
init_workspace_manager
(
device
)
if
not
low_latency_mode
:
if
not
low_latency_mode
:
assert
not
use_fp8_dispatch
,
(
assert
not
use_fp8_dispatch
,
(
"FP8 dispatch interface is available only in low-latency mode"
"FP8 dispatch interface is available only in low-latency mode"
...
@@ -437,6 +441,7 @@ def test_deep_ep_moe(
...
@@ -437,6 +441,7 @@ def test_deep_ep_moe(
topk
:
int
,
topk
:
int
,
world_dp_size
:
tuple
[
int
,
int
],
world_dp_size
:
tuple
[
int
,
int
],
per_act_token_quant
:
bool
,
per_act_token_quant
:
bool
,
workspace_init
,
):
):
low_latency_mode
=
False
low_latency_mode
=
False
use_fp8_dispatch
=
False
use_fp8_dispatch
=
False
...
@@ -492,6 +497,7 @@ def test_low_latency_deep_ep_moe(
...
@@ -492,6 +497,7 @@ def test_low_latency_deep_ep_moe(
topk
:
int
,
topk
:
int
,
world_dp_size
:
tuple
[
int
,
int
],
world_dp_size
:
tuple
[
int
,
int
],
use_fp8_dispatch
:
bool
,
use_fp8_dispatch
:
bool
,
workspace_init
,
):
):
low_latency_mode
=
True
low_latency_mode
=
True
...
...
tests/kernels/moe/test_deepgemm.py
View file @
a3f8d5dd
...
@@ -143,7 +143,7 @@ NUM_EXPERTS = [32]
...
@@ -143,7 +143,7 @@ NUM_EXPERTS = [32]
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOPKS
)
@
pytest
.
mark
.
parametrize
(
"topk"
,
TOPKS
)
@
pytest
.
mark
.
parametrize
(
"num_experts"
,
NUM_EXPERTS
)
@
pytest
.
mark
.
parametrize
(
"num_experts"
,
NUM_EXPERTS
)
@
pytest
.
mark
.
skipif
(
not
is_deep_gemm_supported
(),
reason
=
"Requires deep_gemm kernels"
)
@
pytest
.
mark
.
skipif
(
not
is_deep_gemm_supported
(),
reason
=
"Requires deep_gemm kernels"
)
def
test_deepgemm_vs_triton
(
m
,
n
,
k
,
topk
,
num_experts
,
monkeypatch
):
def
test_deepgemm_vs_triton
(
m
,
n
,
k
,
topk
,
num_experts
,
monkeypatch
,
workspace_init
):
with
monkeypatch
.
context
()
as
mp
:
with
monkeypatch
.
context
()
as
mp
:
mp
.
setenv
(
"VLLM_USE_DEEP_GEMM"
,
"1"
)
mp
.
setenv
(
"VLLM_USE_DEEP_GEMM"
,
"1"
)
...
...
tests/kernels/moe/test_flashinfer.py
View file @
a3f8d5dd
...
@@ -5,6 +5,7 @@ from dataclasses import dataclass
...
@@ -5,6 +5,7 @@ from dataclasses import dataclass
import
pytest
import
pytest
import
torch
import
torch
import
vllm.model_executor.layers.fused_moe.modular_kernel
as
mk
from
vllm.config
import
ParallelConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.config
import
ParallelConfig
,
VllmConfig
,
set_current_vllm_config
from
vllm.model_executor.layers.fused_moe.config
import
(
from
vllm.model_executor.layers.fused_moe.config
import
(
FusedMoEQuantConfig
,
FusedMoEQuantConfig
,
...
@@ -107,6 +108,19 @@ class TestData:
...
@@ -107,6 +108,19 @@ class TestData:
layer
.
w2_input_scale
=
a2_scale
layer
.
w2_input_scale
=
a2_scale
layer
.
w13_weight_scale
=
w13_weight_scale
layer
.
w13_weight_scale
=
w13_weight_scale
layer
.
w2_weight_scale
=
w2_weight_scale
layer
.
w2_weight_scale
=
w2_weight_scale
# Setup dummy config.
layer
.
moe_parallel_config
=
mk
.
FusedMoEParallelConfig
(
tp_size
=
1
,
pcp_size
=
1
,
dp_size
=
1
,
ep_size
=
1
,
tp_rank
=
1
,
pcp_rank
=
1
,
dp_rank
=
1
,
ep_rank
=
1
,
use_ep
=
False
,
all2all_backend
=
"naive"
,
)
register_moe_scaling_factors
(
layer
)
register_moe_scaling_factors
(
layer
)
...
@@ -206,6 +220,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
...
@@ -206,6 +220,7 @@ def test_flashinfer_cutlass_moe_fp8_no_graph(
topk
:
int
,
topk
:
int
,
activation
:
str
,
activation
:
str
,
monkeypatch
,
monkeypatch
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
7
)
current_platform
.
seed_everything
(
7
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
monkeypatch
.
setenv
(
"VLLM_FUSED_MOE_CHUNK_SIZE"
,
"8192"
)
...
...
tests/kernels/moe/test_flashinfer_moe.py
View file @
a3f8d5dd
...
@@ -51,7 +51,14 @@ MNK_FACTORS = [
...
@@ -51,7 +51,14 @@ MNK_FACTORS = [
@
pytest
.
mark
.
parametrize
(
"activation"
,
[
"silu_and_mul"
,
"relu2"
])
@
pytest
.
mark
.
parametrize
(
"activation"
,
[
"silu_and_mul"
,
"relu2"
])
@
torch
.
inference_mode
()
@
torch
.
inference_mode
()
def
test_flashinfer_fp4_moe_no_graph
(
def
test_flashinfer_fp4_moe_no_graph
(
m
:
int
,
n
:
int
,
k
:
int
,
e
:
int
,
topk
:
int
,
dtype
:
torch
.
dtype
,
activation
:
str
m
:
int
,
n
:
int
,
k
:
int
,
e
:
int
,
topk
:
int
,
dtype
:
torch
.
dtype
,
activation
:
str
,
workspace_init
,
):
):
current_platform
.
seed_everything
(
7
)
current_platform
.
seed_everything
(
7
)
with
set_current_vllm_config
(
with
set_current_vllm_config
(
...
...
tests/kernels/moe/test_gpt_oss_triton_kernels.py
View file @
a3f8d5dd
...
@@ -269,7 +269,7 @@ class Case:
...
@@ -269,7 +269,7 @@ class Case:
)
)
@
pytest
.
mark
.
parametrize
(
"num_token"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"num_token"
,
[
2
])
@
pytest
.
mark
.
parametrize
(
"tp"
,
[
1
,
2
,
4
,
8
])
@
pytest
.
mark
.
parametrize
(
"tp"
,
[
1
,
2
,
4
,
8
])
def
test_equiv
(
num_token
,
a_dtype
,
w_dtype
,
tp
):
def
test_equiv
(
num_token
,
a_dtype
,
w_dtype
,
tp
,
workspace_init
):
from
triton_kernels.tensor_details
import
layout
from
triton_kernels.tensor_details
import
layout
if
not
hasattr
(
layout
,
"make_default_matmul_mxfp4_w_layout"
):
if
not
hasattr
(
layout
,
"make_default_matmul_mxfp4_w_layout"
):
...
...
tests/kernels/moe/test_modular_kernel_combinations.py
View file @
a3f8d5dd
...
@@ -16,6 +16,7 @@ from vllm.platforms import current_platform
...
@@ -16,6 +16,7 @@ from vllm.platforms import current_platform
from
vllm.utils.flashinfer
import
has_flashinfer_cutlass_fused_moe
from
vllm.utils.flashinfer
import
has_flashinfer_cutlass_fused_moe
from
vllm.utils.import_utils
import
has_deep_ep
,
has_deep_gemm
,
has_pplx
from
vllm.utils.import_utils
import
has_deep_ep
,
has_deep_gemm
,
has_pplx
from
vllm.utils.torch_utils
import
cuda_device_count_stateless
from
vllm.utils.torch_utils
import
cuda_device_count_stateless
from
vllm.v1.worker.workspace
import
init_workspace_manager
from
.modular_kernel_tools.common
import
(
from
.modular_kernel_tools.common
import
(
Config
,
Config
,
...
@@ -77,6 +78,10 @@ def rank_worker(
...
@@ -77,6 +78,10 @@ def rank_worker(
weights
:
WeightTensors
,
weights
:
WeightTensors
,
verbose
:
bool
,
verbose
:
bool
,
):
):
# Initialize workspace manager in child process
device
=
torch
.
device
(
f
"cuda:
{
pgi
.
local_rank
}
"
)
init_workspace_manager
(
device
)
current_platform
.
seed_everything
(
pgi
.
rank
)
current_platform
.
seed_everything
(
pgi
.
rank
)
# sanity check
# sanity check
...
@@ -300,6 +305,7 @@ def test_modular_kernel_combinations_singlegpu(
...
@@ -300,6 +305,7 @@ def test_modular_kernel_combinations_singlegpu(
chunk_size
:
int
|
None
,
chunk_size
:
int
|
None
,
world_size
:
int
,
world_size
:
int
,
pytestconfig
,
pytestconfig
,
workspace_init
,
):
):
"""Note: float8_e4m3fn is not supported on CUDA architecture < 89,
"""Note: float8_e4m3fn is not supported on CUDA architecture < 89,
and those tests will be skipped on unsupported hardware."""
and those tests will be skipped on unsupported hardware."""
...
...
Prev
1
2
3
4
5
6
7
8
9
10
…
25
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment