Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
eebc58df
Unverified
Commit
eebc58df
authored
Jan 18, 2026
by
Wentao Ye
Committed by
GitHub
Jan 18, 2026
Browse files
[Refactor] Remove unused cutlass moe problem size function (#32047)
Signed-off-by:
yewentao256
<
zhyanwentao@126.com
>
parent
16de822c
Changes
5
Hide whitespace changes
Inline
Side-by-side
Showing
5 changed files
with
0 additions
and
101 deletions
+0
-101
csrc/ops.h
csrc/ops.h
+0
-6
csrc/quantization/w8a8/cutlass/moe/moe_data.cu
csrc/quantization/w8a8/cutlass/moe/moe_data.cu
+0
-20
csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
+0
-27
csrc/torch_bindings.cpp
csrc/torch_bindings.cpp
+0
-13
vllm/_custom_ops.py
vllm/_custom_ops.py
+0
-35
No files found.
csrc/ops.h
View file @
eebc58df
...
@@ -260,12 +260,6 @@ void get_cutlass_moe_mm_data(
...
@@ -260,12 +260,6 @@ void get_cutlass_moe_mm_data(
const
int64_t
num_experts
,
const
int64_t
n
,
const
int64_t
k
,
const
int64_t
num_experts
,
const
int64_t
n
,
const
int64_t
k
,
const
std
::
optional
<
torch
::
Tensor
>&
blockscale_offsets
);
const
std
::
optional
<
torch
::
Tensor
>&
blockscale_offsets
);
void
get_cutlass_moe_mm_problem_sizes
(
const
torch
::
Tensor
&
topk_ids
,
torch
::
Tensor
&
problem_sizes1
,
torch
::
Tensor
&
problem_sizes2
,
const
int64_t
num_experts
,
const
int64_t
n
,
const
int64_t
k
,
const
std
::
optional
<
torch
::
Tensor
>&
blockscale_offsets
,
std
::
optional
<
bool
>
force_swap_ab
=
std
::
nullopt
);
void
get_cutlass_moe_mm_problem_sizes_from_expert_offsets
(
void
get_cutlass_moe_mm_problem_sizes_from_expert_offsets
(
const
torch
::
Tensor
&
expert_first_token_offset
,
const
torch
::
Tensor
&
expert_first_token_offset
,
torch
::
Tensor
&
problem_sizes1
,
torch
::
Tensor
&
problem_sizes2
,
torch
::
Tensor
&
problem_sizes1
,
torch
::
Tensor
&
problem_sizes2
,
...
...
csrc/quantization/w8a8/cutlass/moe/moe_data.cu
View file @
eebc58df
...
@@ -130,26 +130,6 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
...
@@ -130,26 +130,6 @@ inline void launch_compute_problem_sizes(const torch::Tensor& topk_ids,
}
}
}
// namespace
}
// namespace
void
get_cutlass_moe_mm_problem_sizes_caller
(
const
torch
::
Tensor
&
topk_ids
,
torch
::
Tensor
&
problem_sizes1
,
torch
::
Tensor
&
problem_sizes2
,
const
int64_t
num_experts
,
const
int64_t
n
,
const
int64_t
k
,
const
std
::
optional
<
torch
::
Tensor
>&
blockscale_offsets
,
std
::
optional
<
bool
>
force_swap_ab
=
std
::
nullopt
)
{
auto
stream
=
at
::
cuda
::
getCurrentCUDAStream
(
topk_ids
.
device
().
index
());
auto
options_int32
=
torch
::
TensorOptions
().
dtype
(
torch
::
kInt32
).
device
(
topk_ids
.
device
());
torch
::
Tensor
atomic_buffer
=
torch
::
zeros
(
num_experts
,
options_int32
);
// Swap-AB should be disabled for FP4 path
bool
may_swap_ab
=
force_swap_ab
.
value_or
((
!
blockscale_offsets
.
has_value
())
&&
(
topk_ids
.
numel
()
<=
SWAP_AB_THRESHOLD
));
launch_compute_problem_sizes
(
topk_ids
,
problem_sizes1
,
problem_sizes2
,
atomic_buffer
,
num_experts
,
n
,
k
,
stream
,
may_swap_ab
);
}
template
<
bool
SWAP_AB
>
template
<
bool
SWAP_AB
>
__global__
void
compute_problem_sizes_from_expert_offsets
(
__global__
void
compute_problem_sizes_from_expert_offsets
(
const
int64_t
*
__restrict__
expert_first_token_offset
,
const
int64_t
*
__restrict__
expert_first_token_offset
,
...
...
csrc/quantization/w8a8/cutlass/scaled_mm_entry.cu
View file @
eebc58df
...
@@ -77,12 +77,6 @@ void get_cutlass_moe_mm_data_caller(
...
@@ -77,12 +77,6 @@ void get_cutlass_moe_mm_data_caller(
const
int64_t
num_experts
,
const
int64_t
n
,
const
int64_t
k
,
const
int64_t
num_experts
,
const
int64_t
n
,
const
int64_t
k
,
const
std
::
optional
<
torch
::
Tensor
>&
blockscale_offsets
);
const
std
::
optional
<
torch
::
Tensor
>&
blockscale_offsets
);
void
get_cutlass_moe_mm_problem_sizes_caller
(
const
torch
::
Tensor
&
topk_ids
,
torch
::
Tensor
&
problem_sizes1
,
torch
::
Tensor
&
problem_sizes2
,
const
int64_t
num_experts
,
const
int64_t
n
,
const
int64_t
k
,
const
std
::
optional
<
torch
::
Tensor
>&
blockscale_offsets
,
std
::
optional
<
bool
>
force_swap_ab
=
std
::
nullopt
);
void
get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller
(
void
get_cutlass_moe_mm_problem_sizes_from_expert_offsets_caller
(
const
torch
::
Tensor
&
expert_first_token_offset
,
const
torch
::
Tensor
&
expert_first_token_offset
,
torch
::
Tensor
&
problem_sizes1
,
torch
::
Tensor
&
problem_sizes2
,
torch
::
Tensor
&
problem_sizes1
,
torch
::
Tensor
&
problem_sizes2
,
...
@@ -306,27 +300,6 @@ void get_cutlass_moe_mm_data(
...
@@ -306,27 +300,6 @@ void get_cutlass_moe_mm_data(
version_num
,
". Required capability: 90, 100, or 120"
);
version_num
,
". Required capability: 90, 100, or 120"
);
}
}
void
get_cutlass_moe_mm_problem_sizes
(
const
torch
::
Tensor
&
topk_ids
,
torch
::
Tensor
&
problem_sizes1
,
torch
::
Tensor
&
problem_sizes2
,
const
int64_t
num_experts
,
const
int64_t
n
,
const
int64_t
k
,
const
std
::
optional
<
torch
::
Tensor
>&
blockscale_offsets
,
std
::
optional
<
bool
>
force_swap_ab
=
std
::
nullopt
)
{
int32_t
version_num
=
get_sm_version_num
();
#if (defined ENABLE_CUTLASS_MOE_SM90 && ENABLE_CUTLASS_MOE_SM90) || \
(defined ENABLE_CUTLASS_MOE_SM100 && ENABLE_CUTLASS_MOE_SM100) || \
(defined ENABLE_CUTLASS_MOE_SM120 && ENABLE_CUTLASS_MOE_SM120)
get_cutlass_moe_mm_problem_sizes_caller
(
topk_ids
,
problem_sizes1
,
problem_sizes2
,
num_experts
,
n
,
k
,
blockscale_offsets
,
force_swap_ab
);
return
;
#endif
TORCH_CHECK_NOT_IMPLEMENTED
(
false
,
"No compiled get_cutlass_moe_mm_problem_sizes: no cutlass_scaled_mm "
"kernel for CUDA device capability: "
,
version_num
,
". Required capability: 90, 100, or 120"
);
}
void
get_cutlass_moe_mm_problem_sizes_from_expert_offsets
(
void
get_cutlass_moe_mm_problem_sizes_from_expert_offsets
(
const
torch
::
Tensor
&
expert_first_token_offset
,
const
torch
::
Tensor
&
expert_first_token_offset
,
torch
::
Tensor
&
problem_sizes1
,
torch
::
Tensor
&
problem_sizes2
,
torch
::
Tensor
&
problem_sizes1
,
torch
::
Tensor
&
problem_sizes2
,
...
...
csrc/torch_bindings.cpp
View file @
eebc58df
...
@@ -474,19 +474,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
...
@@ -474,19 +474,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"()"
);
"()"
);
ops
.
impl
(
"get_cutlass_moe_mm_data"
,
torch
::
kCUDA
,
&
get_cutlass_moe_mm_data
);
ops
.
impl
(
"get_cutlass_moe_mm_data"
,
torch
::
kCUDA
,
&
get_cutlass_moe_mm_data
);
// A function that computes problem sizes for each expert's multiplication
// used by the two mms called from fused MoE operation. It takes topk_ids as
// an input, and computes problem_sizes1 and problem_sizes2 only.
ops
.
def
(
"get_cutlass_moe_mm_problem_sizes(Tensor topk_ids, "
" Tensor! problem_sizes1, "
" Tensor! problem_sizes2, "
" int num_experts, int n, int k, "
" Tensor? blockscale_offsets, "
" bool? force_swap_ab) -> ()"
);
ops
.
impl
(
"get_cutlass_moe_mm_problem_sizes"
,
torch
::
kCUDA
,
&
get_cutlass_moe_mm_problem_sizes
);
// compute per-expert problem sizes from expert_first_token_offset
// compute per-expert problem sizes from expert_first_token_offset
// produced by vLLM's moe_permute kernel
// produced by vLLM's moe_permute kernel
ops
.
def
(
ops
.
def
(
...
...
vllm/_custom_ops.py
View file @
eebc58df
...
@@ -1044,41 +1044,6 @@ def get_cutlass_moe_mm_data(
...
@@ -1044,41 +1044,6 @@ def get_cutlass_moe_mm_data(
)
)
def
get_cutlass_moe_mm_problem_sizes
(
topk_ids
:
torch
.
Tensor
,
problem_sizes1
:
torch
.
Tensor
,
problem_sizes2
:
torch
.
Tensor
,
num_experts
:
int
,
n
:
int
,
k
:
int
,
blockscale_offsets
:
torch
.
Tensor
|
None
=
None
,
force_swap_ab
:
bool
|
None
=
None
,
):
"""
Compute only the per-expert problem sizes needed by the two grouped matrix
multiplications used in CUTLASS-based fused MoE.
The function takes in topk_ids (token→expert mapping) and computes:
- problem_sizes1, problem_sizes2: M×N×K sizes of each expert's
multiplication for the two grouped MMs
used in the fused MoE operation.
Optional:
- force_swap_ab: If set to True or False, explicitly enable or disable the
A/B input swap optimization. If None (default), the swap
is selected automatically based on tensor sizes.
"""
return
torch
.
ops
.
_C
.
get_cutlass_moe_mm_problem_sizes
(
topk_ids
,
problem_sizes1
,
problem_sizes2
,
num_experts
,
n
,
k
,
blockscale_offsets
,
force_swap_ab
,
)
def
get_cutlass_moe_mm_problem_sizes_from_expert_offsets
(
def
get_cutlass_moe_mm_problem_sizes_from_expert_offsets
(
expert_first_token_offset
:
torch
.
Tensor
,
expert_first_token_offset
:
torch
.
Tensor
,
problem_sizes1
:
torch
.
Tensor
,
problem_sizes1
:
torch
.
Tensor
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment