Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
ab1acdce
Commit
ab1acdce
authored
Dec 16, 2025
by
zhuwenwen
Browse files
Merge branch 'v0.9.2-dev-ds-wm-1215' into 'v0.9.2-dev-ds'
优化高吞吐模式num_sms See merge request dcutoolkit/deeplearing/vllm!297
parents
b2db7ca2
10400c58
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
22 additions
and
82 deletions
+22
-82
vllm/distributed/device_communicators/all2all.py
vllm/distributed/device_communicators/all2all.py
+2
-0
vllm/model_executor/layers/fused_moe/modular_kernel.py
vllm/model_executor/layers/fused_moe/modular_kernel.py
+10
-3
vllm/model_executor/layers/fused_moe/utils.py
vllm/model_executor/layers/fused_moe/utils.py
+9
-79
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
...ation/compressed_tensors/compressed_tensors_moe_marlin.py
+1
-0
No files found.
vllm/distributed/device_communicators/all2all.py
View file @
ab1acdce
...
@@ -173,6 +173,7 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
...
@@ -173,6 +173,7 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
if
self
.
internode
:
if
self
.
internode
:
num_rdma_bytes
=
int
(
1e9
/
2
)
#1024 * 1024 * 1024
num_rdma_bytes
=
int
(
1e9
/
2
)
#1024 * 1024 * 1024
num_qps_per_rank
=
30
#self.num_sms // 2
num_qps_per_rank
=
30
#self.num_sms // 2
self
.
num_sms
=
30
# import deep_ep
# import deep_ep
# num_nvl_bytes, num_rdma_bytes = 0, 0
# num_nvl_bytes, num_rdma_bytes = 0, 0
...
@@ -184,6 +185,7 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
...
@@ -184,6 +185,7 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
else
:
else
:
num_rdma_bytes
=
0
num_rdma_bytes
=
0
num_qps_per_rank
=
1
num_qps_per_rank
=
1
self
.
num_sms
=
60
assert
num_rdma_bytes
is
not
None
assert
num_rdma_bytes
is
not
None
assert
num_qps_per_rank
is
not
None
assert
num_qps_per_rank
is
not
None
...
...
vllm/model_executor/layers/fused_moe/modular_kernel.py
View file @
ab1acdce
...
@@ -12,7 +12,7 @@ import torch
...
@@ -12,7 +12,7 @@ import torch
import
vllm.envs
as
envs
import
vllm.envs
as
envs
from
vllm.model_executor.layers.fused_moe.config
import
FusedMoEQuantConfig
from
vllm.model_executor.layers.fused_moe.config
import
FusedMoEQuantConfig
from
vllm.model_executor.layers.fused_moe.utils
import
_resize_cache
from
vllm.model_executor.layers.fused_moe.utils
import
_resize_cache
from
vllm.utils
import
cdiv
from
vllm.utils
import
cdiv
,
async_tensor_h2d
#
#
# This file defines a set of base classes used to make MoE kernels more modular.
# This file defines a set of base classes used to make MoE kernels more modular.
...
@@ -97,6 +97,8 @@ class FusedMoEActivationFormat(Enum):
...
@@ -97,6 +97,8 @@ class FusedMoEActivationFormat(Enum):
BatchedExperts
=
"batched_experts"
,
BatchedExperts
=
"batched_experts"
,
@
dataclass
@
dataclass
class
ExpertTokensMetadata
:
class
ExpertTokensMetadata
:
"""
"""
...
@@ -110,11 +112,16 @@ class ExpertTokensMetadata:
...
@@ -110,11 +112,16 @@ class ExpertTokensMetadata:
def
make_from_list
(
def
make_from_list
(
expert_num_tokens_list
:
list
[
int
],
device
:
str
expert_num_tokens_list
:
list
[
int
],
device
:
str
)
->
"ExpertTokensMetadata"
:
)
->
"ExpertTokensMetadata"
:
# expert_num_tokens_cpu = torch.tensor(
# expert_num_tokens_list, device="cpu", dtype=torch.int32
# )
expert_num_tokens_cpu
=
torch
.
tensor
(
expert_num_tokens_cpu
=
torch
.
tensor
(
expert_num_tokens_list
,
device
=
"cpu"
,
dtype
=
torch
.
int32
expert_num_tokens_list
,
device
=
"cpu"
,
dtype
=
torch
.
int32
,
pin_memory
=
True
)
)
expert_num_tokens
=
expert_num_tokens_cpu
.
to
(
device
=
device
,
non_blocking
=
True
)
return
ExpertTokensMetadata
(
return
ExpertTokensMetadata
(
expert_num_tokens
=
expert_num_tokens
_cpu
.
to
(
device
,
non_blocking
=
True
)
,
expert_num_tokens
=
expert_num_tokens
,
expert_num_tokens_cpu
=
expert_num_tokens_cpu
,
expert_num_tokens_cpu
=
expert_num_tokens_cpu
,
)
)
...
...
vllm/model_executor/layers/fused_moe/utils.py
View file @
ab1acdce
...
@@ -547,41 +547,6 @@ def _fwd_kernel_ep_scatter_2(
...
@@ -547,41 +547,6 @@ def _fwd_kernel_ep_scatter_2(
SCALE_HIDDEN_SIZE
:
tl
.
constexpr
,
SCALE_HIDDEN_SIZE
:
tl
.
constexpr
,
SCALE_HIDDEN_SIZE_PAD
:
tl
.
constexpr
,
SCALE_HIDDEN_SIZE_PAD
:
tl
.
constexpr
,
):
):
# start_token_id = tl.program_id(0)
# grid_num = tl.num_programs(0)
# offset_in = tl.arange(0, HIDDEN_SIZE_PAD)
# mask = offset_in < HIDDEN_SIZE
# offset_in_s = tl.arange(0, SCALE_HIDDEN_SIZE_PAD)
# mask_s = offset_in_s < SCALE_HIDDEN_SIZE
# for token_id in range(start_token_id, total_token_num, grid_num):
# to_copy = tl.load(recv_x + token_id * recv_x_stride0 + offset_in, mask=mask)
# to_copy_s = tl.load(
# recv_x_scale + token_id * recv_x_scale_stride0 + offset_in_s, mask=mask_s
# )
# for topk_index in tl.range(0, topk_num, 1, num_stages=4):
# expert_id = tl.load(recv_topk + token_id * recv_topk_stride0 + topk_index)
# if HAS_EXPERT_MAP:
# expert_id = apply_expert_map(expert_id, expert_map)
# if expert_id >= 0:
# dest_token_index = tl.atomic_add(expert_start_loc + expert_id, 1)
# tl.store(
# output_index + token_id * output_index_stride0 + topk_index,
# dest_token_index,
# )
# output_tensor_ptr = (
# output_tensor + dest_token_index * output_tensor_stride0
# )
# output_tensor_scale_ptr = (
# output_tensor_scale + dest_token_index * output_tensor_scale_stride0
# )
# tl.store(output_tensor_ptr + offset_in, to_copy, mask=mask)
# tl.store(output_tensor_scale_ptr + offset_in_s, to_copy_s, mask=mask_s)
start_token_id
=
tl
.
program_id
(
0
)
start_token_id
=
tl
.
program_id
(
0
)
grid_num
=
tl
.
num_programs
(
0
)
grid_num
=
tl
.
num_programs
(
0
)
...
@@ -720,43 +685,6 @@ def _fwd_kernel_ep_gather(
...
@@ -720,43 +685,6 @@ def _fwd_kernel_ep_gather(
HAS_EXPERT_MAP
:
tl
.
constexpr
,
HAS_EXPERT_MAP
:
tl
.
constexpr
,
BLOCK_D
:
tl
.
constexpr
,
BLOCK_D
:
tl
.
constexpr
,
):
):
# cur_block = tl.program_id(0)
# start_cur_token = tl.program_id(1)
# grid_num = tl.num_programs(1)
# for cur_token in range(start_cur_token, total_token_num, grid_num):
# off_d = tl.arange(0, BLOCK_D)
# accumulator = tl.zeros([BLOCK_D], dtype=tl.float32)
# for topk_index in range(0, topk_num):
# expert_id = tl.load(
# recv_topk_ids + cur_token * recv_topk_ids_stride0 + topk_index
# )
# if HAS_EXPERT_MAP:
# expert_id = apply_expert_map(expert_id, expert_map)
# if expert_id >= 0:
# source_token_index = tl.load(
# input_index + cur_token * input_index_stride0 + topk_index
# )
# acc_weight = tl.load(
# recv_topk_weight + cur_token * recv_topk_weight_stride0 + topk_index
# )
# tmp = tl.load(
# input_tensor
# + source_token_index * input_tensor_stride0
# + cur_block * BLOCK_D
# + off_d
# )
# accumulator += tmp.to(tl.float32) * acc_weight
# tl.store(
# output_tensor
# + cur_token * output_tensor_stride0
# + cur_block * BLOCK_D
# + off_d,
# accumulator.to(output_tensor.dtype.element_ty),
# )
cur_block_int32
=
tl
.
program_id
(
0
)
cur_block_int32
=
tl
.
program_id
(
0
)
cur_block
=
cur_block_int32
.
to
(
tl
.
int64
)
cur_block
=
cur_block_int32
.
to
(
tl
.
int64
)
...
@@ -856,6 +784,7 @@ def deepgemm_moe_permute(
...
@@ -856,6 +784,7 @@ def deepgemm_moe_permute(
expert_num_tokens
:
Optional
[
torch
.
Tensor
]
=
None
,
expert_num_tokens
:
Optional
[
torch
.
Tensor
]
=
None
,
expert_num_tokens_cpu
:
Optional
[
torch
.
Tensor
]
=
None
,
expert_num_tokens_cpu
:
Optional
[
torch
.
Tensor
]
=
None
,
aq_out
:
torch
.
Tensor
|
None
=
None
,
aq_out
:
torch
.
Tensor
|
None
=
None
,
M_sum
:
int
|
None
=
None
,
):
):
assert
aq
.
ndim
==
2
assert
aq
.
ndim
==
2
assert
topk_ids
.
dtype
.
is_signed
,
"The kernel uses -1 to represent invalid topk_ids"
assert
topk_ids
.
dtype
.
is_signed
,
"The kernel uses -1 to represent invalid topk_ids"
...
@@ -864,6 +793,7 @@ def deepgemm_moe_permute(
...
@@ -864,6 +793,7 @@ def deepgemm_moe_permute(
block_m
=
block_shape
[
0
]
block_m
=
block_shape
[
0
]
if
M_sum
is
None
:
M_sum
=
compute_aligned_M
(
M_sum
=
compute_aligned_M
(
M
=
topk_ids
.
size
(
0
),
M
=
topk_ids
.
size
(
0
),
num_topk
=
topk_ids
.
size
(
1
),
num_topk
=
topk_ids
.
size
(
1
),
...
...
vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe_marlin.py
View file @
ab1acdce
...
@@ -346,6 +346,7 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
...
@@ -346,6 +346,7 @@ class CompressedTensorsW8A8Int8MarlinMoEMethod(CompressedTensorsMarlinMoEMethod)
expert_num_tokens
=
expert_num_tokens
,
expert_num_tokens
=
expert_num_tokens
,
expert_num_tokens_cpu
=
expert_num_tokens_cpu
,
expert_num_tokens_cpu
=
expert_num_tokens_cpu
,
aq_out
=
a1q_perm
,
aq_out
=
a1q_perm
,
M_sum
=
M_sum
)
)
# if expert_map is not None:
# if expert_map is not None:
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment