Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
cc7f22a8
Commit
cc7f22a8
authored
Jun 11, 2025
by
zhuwenwen
Browse files
Merge tag 'v0.9.1' into v0.9.1-ori
parents
b9ea0c09
b6553be1
Changes
1000
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
217 additions
and
10 deletions
+217
-10
tests/kernels/attention/test_prefix_prefill.py
tests/kernels/attention/test_prefix_prefill.py
+1
-0
tests/kernels/attention/test_rocm_attention_selector.py
tests/kernels/attention/test_rocm_attention_selector.py
+1
-0
tests/kernels/attention/test_triton_decode_attention.py
tests/kernels/attention/test_triton_decode_attention.py
+1
-0
tests/kernels/attention/test_triton_unified_attention.py
tests/kernels/attention/test_triton_unified_attention.py
+4
-1
tests/kernels/core/test_activation.py
tests/kernels/core/test_activation.py
+1
-0
tests/kernels/core/test_fused_quant_layernorm.py
tests/kernels/core/test_fused_quant_layernorm.py
+1
-0
tests/kernels/core/test_layernorm.py
tests/kernels/core/test_layernorm.py
+1
-0
tests/kernels/core/test_opcheck.py
tests/kernels/core/test_opcheck.py
+1
-0
tests/kernels/core/test_permute_cols.py
tests/kernels/core/test_permute_cols.py
+1
-0
tests/kernels/core/test_pos_encoding.py
tests/kernels/core/test_pos_encoding.py
+4
-3
tests/kernels/core/test_rotary_embedding.py
tests/kernels/core/test_rotary_embedding.py
+1
-0
tests/kernels/core/test_uva.py
tests/kernels/core/test_uva.py
+1
-0
tests/kernels/mamba/test_causal_conv1d.py
tests/kernels/mamba/test_causal_conv1d.py
+1
-0
tests/kernels/mamba/test_mamba_mixer2.py
tests/kernels/mamba/test_mamba_mixer2.py
+1
-0
tests/kernels/mamba/test_mamba_ssm.py
tests/kernels/mamba/test_mamba_ssm.py
+1
-0
tests/kernels/mamba/test_mamba_ssm_ssd.py
tests/kernels/mamba/test_mamba_ssm_ssd.py
+1
-0
tests/kernels/moe/__init__.py
tests/kernels/moe/__init__.py
+0
-0
tests/kernels/moe/deepep_utils.py
tests/kernels/moe/deepep_utils.py
+191
-0
tests/kernels/moe/test_batched_moe.py
tests/kernels/moe/test_batched_moe.py
+1
-0
tests/kernels/moe/test_cutlass_moe.py
tests/kernels/moe/test_cutlass_moe.py
+3
-6
No files found.
Too many changes to show.
To preserve performance only
1000 of 1000+
files are displayed.
Plain diff
Email patch
tests/kernels/attention/test_prefix_prefill.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
math
import
random
...
...
tests/kernels/attention/test_rocm_attention_selector.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
...
...
tests/kernels/attention/test_triton_decode_attention.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
...
...
tests/kernels/attention/test_triton_unified_attention.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Optional
...
...
@@ -13,7 +14,9 @@ HEAD_SIZES = [128, 256]
BLOCK_SIZES
=
[
16
,
32
]
DTYPES
=
[
torch
.
float16
,
torch
.
bfloat16
]
QDTYPES
=
[
None
,
torch
.
float8_e4m3fn
]
QDTYPES
=
[
None
,
torch
.
float8_e4m3fn
]
if
not
current_platform
.
is_rocm
()
else
[
None
,
torch
.
float8_e4m3fnuz
]
# one value large enough to test overflow in index calculation.
# one value small enough to test the schema op check
NUM_BLOCKS
=
[
32768
,
2048
]
...
...
tests/kernels/core/test_activation.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
random
...
...
tests/kernels/core/test_fused_quant_layernorm.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Optional
,
Union
...
...
tests/kernels/core/test_layernorm.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
...
...
tests/kernels/core/test_opcheck.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for miscellaneous utilities
"""
...
...
tests/kernels/core/test_permute_cols.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
...
...
tests/kernels/core/test_pos_encoding.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
itertools
import
accumulate
,
product
from
typing
import
Callable
,
Optional
...
...
@@ -70,7 +71,7 @@ def test_rotary_embedding(
device
:
str
,
use_key
:
bool
,
max_position
:
int
=
8192
,
base
:
in
t
=
10000
,
base
:
floa
t
=
10000
,
)
->
None
:
if
rotary_dim
is
None
:
rotary_dim
=
head_size
...
...
@@ -135,7 +136,7 @@ def test_batched_rotary_embedding(
device
:
str
,
use_key
:
bool
,
max_position
:
int
=
8192
,
base
:
in
t
=
10000
,
base
:
floa
t
=
10000
,
)
->
None
:
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
...
...
@@ -203,7 +204,7 @@ def test_batched_rotary_embedding_multi_lora(
device
:
str
,
use_key
:
bool
,
max_position
:
int
=
8192
,
base
:
in
t
=
10000
,
base
:
floa
t
=
10000
,
)
->
None
:
current_platform
.
seed_everything
(
seed
)
torch
.
set_default_device
(
device
)
...
...
tests/kernels/core/test_rotary_embedding.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Tests for miscellaneous utilities
"""
...
...
tests/kernels/core/test_uva.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
...
...
tests/kernels/mamba/test_causal_conv1d.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
typing
import
Optional
...
...
tests/kernels/mamba/test_mamba_mixer2.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
unittest
...
...
tests/kernels/mamba/test_mamba_ssm.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
...
...
tests/kernels/mamba/test_mamba_ssm_ssd.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
pytest
import
torch
...
...
tests/kernels/moe/__init__.py
0 → 100644
View file @
cc7f22a8
tests/kernels/moe/deepep_utils.py
0 → 100644
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
"""
DeepEP test utilities
"""
import
dataclasses
import
importlib
import
traceback
from
typing
import
Callable
,
Optional
import
torch
from
torch.distributed
import
ProcessGroup
from
torch.multiprocessing
import
(
spawn
)
# pyright: ignore[reportPrivateImportUsage]
from
typing_extensions
import
Concatenate
,
ParamSpec
# True when the optional ``deep_ep`` package can be located on this system.
# ``find_spec`` only looks the top-level package up; it does not import it.
has_deep_ep = importlib.util.find_spec("deep_ep") is not None

# The DeepEP prepare/finalize classes are imported only when ``deep_ep`` is
# available; when it is not, these two names are left undefined in this module.
if has_deep_ep:
    from vllm.model_executor.layers.fused_moe.deepep_ht_prepare_finalize import (  # noqa: E501
        DeepEPHTPrepareAndFinalize)
    from vllm.model_executor.layers.fused_moe.deepep_ll_prepare_finalize import (  # noqa: E501
        DeepEPLLPrepareAndFinalize)

## Parallel Processes Utils

# Parameter specification for the extra arguments a worker callable accepts
# after its leading ``ProcessGroupInfo`` argument.
P = ParamSpec("P")
@dataclasses.dataclass
class ProcessGroupInfo:
    """Placement of one spawned worker inside the distributed process group.

    ``_worker_parallel_launch`` builds an instance for every process and
    passes it to the worker callable as its first positional argument.
    """

    # Total number of ranks in the process group.
    world_size: int
    # Ranks per node; used as the node stride when the global rank is derived.
    world_local_size: int
    # Global rank: node_rank * world_local_size + local_rank.
    rank: int
    # Index of the node this worker runs on.
    node_rank: int
    # Index of the worker within its node; also the CUDA device index it uses.
    local_rank: int
    # Device the worker was bound to (``torch.device("cuda", local_rank)``).
    device: torch.device
def _worker_parallel_launch(
    local_rank: int,
    world_size: int,
    world_local_size: int,
    node_rank: int,
    init_method: str,
    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
    *args: P.args,
    **kwargs: P.kwargs,
) -> None:
    """Per-process entry point: join the process group, then run ``worker``.

    Args:
        local_rank: Index of this process on its node; also selects the CUDA
            device.
        world_size: Total number of ranks in the group.
        world_local_size: Number of ranks per node.
        node_rank: Index of the node this process belongs to.
        init_method: Rendezvous URL passed to ``init_process_group``.
        worker: Callable invoked as ``worker(pgi, *args, **kwargs)`` where
            ``pgi`` is the ``ProcessGroupInfo`` describing this process.
        *args: Extra positional arguments forwarded to ``worker``.
        **kwargs: Extra keyword arguments forwarded to ``worker``.

    Raises:
        Exception: Whatever ``worker`` raises is printed with its traceback
            and re-raised.
    """
    rank = node_rank * world_local_size + local_rank

    # Bind this process to its GPU before the process group is created.
    torch.cuda.set_device(local_rank)
    cuda_device = torch.device("cuda", local_rank)
    torch.distributed.init_process_group(
        backend="cpu:gloo,cuda:nccl",
        init_method=init_method,
        rank=rank,
        world_size=world_size,
        device_id=cuda_device,
    )

    # One collective over a single-element tensor before the worker starts;
    # presumably serves as a barrier so every rank has joined -- the result
    # itself is never read.
    sync_token = torch.tensor([rank], device=cuda_device)
    torch.distributed.all_reduce(sync_token)

    try:
        pg_info = ProcessGroupInfo(
            world_size=world_size,
            world_local_size=world_local_size,
            rank=rank,
            node_rank=node_rank,
            local_rank=local_rank,
            device=cuda_device,
        )
        worker(pg_info, *args, **kwargs)
    except Exception as err:
        # Print the failure from inside the child process, then propagate it.
        print(err)
        traceback.print_exc()
        raise
    finally:
        # Always tear the group down, whether the worker succeeded or not.
        torch.distributed.destroy_process_group()
def parallel_launch(
    world_size: int,
    worker: Callable[Concatenate[ProcessGroupInfo, P], None],
    *args: P.args,
    **kwargs: P.kwargs,
) -> None:
    """Run ``worker`` in ``world_size`` processes on a single node.

    Every process is started through ``_worker_parallel_launch`` with
    ``world_local_size == world_size`` and ``node_rank == 0``, and this call
    blocks until all of them have exited (``join=True``).

    Args:
        world_size: Number of processes to spawn.
        worker: Callable invoked in each process as ``worker(pgi, *args)``.
        *args: Extra positional arguments forwarded to ``worker``.
        **kwargs: Must be empty. Only a positional ``args`` tuple is handed
            to ``spawn``, so keyword arguments cannot be forwarded.

    Raises:
        AssertionError: If any keyword arguments are supplied.
    """
    import socket

    assert not kwargs, "parallel_launch only forwards positional arguments"

    # Let the OS pick an unused TCP port for the rendezvous instead of a
    # fixed one, so a launch does not fail with "address already in use"
    # when another job on this host holds the port. The port is released
    # before the workers bind it, so a small race window remains.
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        probe.bind(("", 0))
        free_port = probe.getsockname()[1]

    spawn(
        _worker_parallel_launch,
        args=(
            world_size,
            world_size,
            0,
            f"tcp://localhost:{free_port}",
            worker,
        ) + args,
        nprocs=world_size,
        join=True,
    )
## DeepEP specific utils
@dataclasses.dataclass
class DeepEPHTArgs:
    """Settings consumed by ``make_deepep_ht_a2a`` (high-throughput mode)."""

    # Experts per rank; multiplied by the rank to get that rank's expert offset.
    num_local_experts: int
@dataclasses.dataclass
class DeepEPLLArgs:
    """Settings consumed by ``make_deepep_ll_a2a`` (low-latency mode)."""

    # Forwarded to the RDMA size hint and to the prepare/finalize object.
    max_tokens_per_rank: int
    # Forwarded to the RDMA size hint.
    hidden_size: int
    # Forwarded to the RDMA size hint; also divided by the world size to get
    # the number of QPs per rank.
    num_experts: int
    # Forwarded unchanged to the prepare/finalize object.
    use_fp8_dispatch: bool
def make_deepep_ht_a2a(pg: ProcessGroup,
                       pgi: ProcessGroupInfo,
                       dp_size: int,
                       ht_args: DeepEPHTArgs,
                       q_dtype: Optional[torch.dtype] = None,
                       block_shape: Optional[list[int]] = None):
    """Build a high-throughput DeepEP prepare/finalize object.

    Args:
        pg: Process group the DeepEP buffer is created on.
        pgi: Rank and world-size information for the calling process.
        dp_size: Forwarded unchanged as ``dp_size``.
        ht_args: High-throughput settings; supplies ``num_local_experts``.
        q_dtype: Forwarded as ``quant_dtype``.
        block_shape: Forwarded as ``block_shape``.

    Returns:
        A ``DeepEPHTPrepareAndFinalize`` wrapping a newly created
        ``deep_ep.Buffer``.
    """
    # Imported lazily so this module still loads when deep_ep is missing.
    import deep_ep

    # high throughput a2a
    ht_buffer = deep_ep.Buffer(
        group=pg,
        num_nvl_bytes=1024 * 1024 * 1024,  # 1GB
        num_rdma_bytes=0,
        low_latency_mode=False,
        num_qps_per_rank=1,
    )

    # Offset passed as ``rank_expert_offset``: rank times experts per rank.
    expert_offset = pgi.rank * ht_args.num_local_experts

    return DeepEPHTPrepareAndFinalize(
        buffer=ht_buffer,
        world_size=pgi.world_size,
        rank=pgi.rank,
        dp_size=dp_size,
        rank_expert_offset=expert_offset,
        quant_dtype=q_dtype,
        block_shape=block_shape,
    )
def make_deepep_ll_a2a(pg: ProcessGroup,
                       pgi: ProcessGroupInfo,
                       dp_size: int,
                       deepep_ll_args: DeepEPLLArgs,
                       q_dtype: Optional[torch.dtype] = None,
                       block_shape: Optional[list[int]] = None):
    """Build a low-latency DeepEP prepare/finalize object.

    Args:
        pg: Process group the DeepEP buffer is created on.
        pgi: Rank and world-size information for the calling process.
        dp_size: Forwarded unchanged as ``dp_size``.
        deepep_ll_args: Low-latency settings (token budget, hidden size,
            expert count, fp8 dispatch flag).
        q_dtype: Forwarded as ``quant_dtype``.
        block_shape: Forwarded as ``block_shape``.

    Returns:
        A ``DeepEPLLPrepareAndFinalize`` wrapping a newly created
        ``deep_ep.Buffer`` in low-latency mode.
    """
    # Imported lazily so this module still loads when deep_ep is missing.
    import deep_ep

    ranks = pgi.world_size
    token_budget = deepep_ll_args.max_tokens_per_rank

    # low-latency a2a
    # DeepEP reports the RDMA buffer size for this configuration.
    rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
        token_budget,
        deepep_ll_args.hidden_size,
        ranks,
        deepep_ll_args.num_experts,
    )

    ll_buffer = deep_ep.Buffer(
        group=pg,
        num_rdma_bytes=rdma_bytes,
        low_latency_mode=True,
        num_qps_per_rank=deepep_ll_args.num_experts // ranks,
    )

    return DeepEPLLPrepareAndFinalize(
        buffer=ll_buffer,
        world_size=ranks,
        dp_size=dp_size,
        max_tokens_per_rank=token_budget,
        quant_dtype=q_dtype,
        block_shape=block_shape,
        use_fp8_dispatch=deepep_ll_args.use_fp8_dispatch,
    )
def make_deepep_a2a(pg: ProcessGroup,
                    pgi: ProcessGroupInfo,
                    dp_size: int,
                    deepep_ht_args: Optional[DeepEPHTArgs],
                    deepep_ll_args: Optional[DeepEPLLArgs],
                    q_dtype: Optional[torch.dtype] = None,
                    block_shape: Optional[list[int]] = None):
    """Build a DeepEP prepare/finalize object for whichever mode is configured.

    Exactly one of ``deepep_ht_args`` and ``deepep_ll_args`` must be given:
    the former selects ``make_deepep_ht_a2a``, the latter
    ``make_deepep_ll_a2a``. The remaining arguments are passed through.

    Raises:
        AssertionError: If both argument sets are given, or neither is.
    """
    if deepep_ht_args is None:
        # Low-latency path: its settings are then mandatory.
        assert deepep_ll_args is not None
        return make_deepep_ll_a2a(pg, pgi, dp_size, deepep_ll_args, q_dtype,
                                  block_shape)

    # High-throughput path: the two argument sets are mutually exclusive.
    assert deepep_ll_args is None
    return make_deepep_ht_a2a(pg, pgi, dp_size, deepep_ht_args, q_dtype,
                              block_shape)
tests/kernels/moe/test_batched_moe.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from
dataclasses
import
dataclass
...
...
tests/kernels/moe/test_cutlass_moe.py
View file @
cc7f22a8
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import
dataclasses
from
typing
import
Optional
...
...
@@ -192,14 +193,10 @@ def run_8_bit(moe_tensors: MOETensors8Bit,
kwargs
=
{
'a'
:
moe_tensors
.
a
,
'w1_q'
:
moe_tensors
.
w1_q
.
transpose
(
1
,
2
)
,
# type: ignore[union-attr]
'w2_q'
:
moe_tensors
.
w2_q
.
transpose
(
1
,
2
)
,
# type: ignore[union-attr]
'w1_q'
:
moe_tensors
.
w1_q
,
# type: ignore[union-attr]
'w2_q'
:
moe_tensors
.
w2_q
,
# type: ignore[union-attr]
'topk_weights'
:
topk_weights
,
'topk_ids'
:
topk_ids
,
'ab_strides1'
:
moe_tensors
.
ab_strides1
,
'c_strides1'
:
moe_tensors
.
c_strides1
,
'ab_strides2'
:
moe_tensors
.
ab_strides2
,
'c_strides2'
:
moe_tensors
.
c_strides2
,
'w1_scale'
:
moe_tensors
.
w1_scale
,
'w2_scale'
:
moe_tensors
.
w2_scale
,
'a1_scale'
:
moe_tensors
.
a_scale
...
...
Prev
1
…
17
18
19
20
21
22
23
24
25
…
50
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment