Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
4bdf4002
Unverified
Commit
4bdf4002
authored
Sep 19, 2025
by
bnellnm
Committed by
GitHub
Sep 19, 2025
Browse files
[Bugfix] Fix chunked a2_scales in modular kernels (#25264)
Signed-off-by:
Bill Nell
<
bnell@redhat.com
>
parent
7852b82b
Changes
11
Show whitespace changes
Inline
Side-by-side
Showing
11 changed files
with
23 additions
and
5 deletions
+23
-5
vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
.../model_executor/layers/fused_moe/batched_deep_gemm_moe.py
+1
-0
vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
...cutor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
+2
-1
vllm/model_executor/layers/fused_moe/cutlass_moe.py
vllm/model_executor/layers/fused_moe/cutlass_moe.py
+3
-1
vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
+2
-1
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
...model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
+1
-0
vllm/model_executor/layers/fused_moe/fused_batched_moe.py
vllm/model_executor/layers/fused_moe/fused_batched_moe.py
+3
-1
vllm/model_executor/layers/fused_moe/fused_moe.py
vllm/model_executor/layers/fused_moe/fused_moe.py
+2
-1
vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
...l_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
+1
-0
vllm/model_executor/layers/fused_moe/modular_kernel.py
vllm/model_executor/layers/fused_moe/modular_kernel.py
+5
-0
vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
+2
-0
vllm/model_executor/layers/fused_moe/trtllm_moe.py
vllm/model_executor/layers/fused_moe/trtllm_moe.py
+1
-0
No files found.
vllm/model_executor/layers/fused_moe/batched_deep_gemm_moe.py
View file @
4bdf4002
...
@@ -286,6 +286,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -286,6 +286,7 @@ class BatchedDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
global_num_experts
:
int
,
global_num_experts
:
int
,
expert_map
:
Optional
[
torch
.
Tensor
],
expert_map
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a2_scale
:
Optional
[
torch
.
Tensor
],
workspace13
:
torch
.
Tensor
,
workspace13
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
...
...
vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py
View file @
4bdf4002
...
@@ -126,6 +126,7 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -126,6 +126,7 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
global_num_experts
:
int
,
global_num_experts
:
int
,
expert_map
:
Optional
[
torch
.
Tensor
],
expert_map
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a2_scale
:
Optional
[
torch
.
Tensor
],
workspace13
:
torch
.
Tensor
,
workspace13
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
...
@@ -136,5 +137,5 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -136,5 +137,5 @@ class BatchedTritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
assert
experts
is
not
None
assert
experts
is
not
None
experts
.
apply
(
output
,
hidden_states
,
w1
,
w2
,
topk_weights
,
topk_ids
,
experts
.
apply
(
output
,
hidden_states
,
w1
,
w2
,
topk_weights
,
topk_ids
,
activation
,
global_num_experts
,
expert_map
,
a1q_scale
,
activation
,
global_num_experts
,
expert_map
,
a1q_scale
,
workspace13
,
workspace2
,
expert_tokens_meta
,
a2_scale
,
workspace13
,
workspace2
,
expert_tokens_meta
,
apply_router_weight_on_input
)
apply_router_weight_on_input
)
vllm/model_executor/layers/fused_moe/cutlass_moe.py
View file @
4bdf4002
...
@@ -241,6 +241,7 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -241,6 +241,7 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
global_num_experts
:
int
,
global_num_experts
:
int
,
expert_map
:
Optional
[
torch
.
Tensor
],
expert_map
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a2_scale
:
Optional
[
torch
.
Tensor
],
workspace13
:
torch
.
Tensor
,
workspace13
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
...
@@ -262,7 +263,7 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -262,7 +263,7 @@ class CutlassExpertsFp8Base(mk.FusedMoEPermuteExpertsUnpermute):
run_cutlass_moe_fp8
(
run_cutlass_moe_fp8
(
output
,
hidden_states
,
w1
,
w2
,
topk_ids
,
activation_callable
,
output
,
hidden_states
,
w1
,
w2
,
topk_ids
,
activation_callable
,
global_num_experts
,
expert_map
,
self
.
w1_scale
,
self
.
w2_scale
,
global_num_experts
,
expert_map
,
self
.
w1_scale
,
self
.
w2_scale
,
a1q_scale
,
self
.
a2_scale
,
self
.
ab_strides1
,
self
.
ab_strides2
,
a1q_scale
,
a2_scale
,
self
.
ab_strides1
,
self
.
ab_strides2
,
self
.
c_strides1
,
self
.
c_strides2
,
workspace13
,
workspace2
,
self
.
c_strides1
,
self
.
c_strides2
,
workspace13
,
workspace2
,
expert_num_tokens
,
expert_num_tokens
,
self
.
out_dtype
if
self
.
out_dtype
is
not
None
else
in_dtype
,
self
.
out_dtype
if
self
.
out_dtype
is
not
None
else
in_dtype
,
...
@@ -705,6 +706,7 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -705,6 +706,7 @@ class CutlassExpertsFp4(mk.FusedMoEPermuteExpertsUnpermute):
global_num_experts
:
int
,
global_num_experts
:
int
,
expert_map
:
Optional
[
torch
.
Tensor
],
expert_map
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
# unused
a1q_scale
:
Optional
[
torch
.
Tensor
],
# unused
a2_scale
:
Optional
[
torch
.
Tensor
],
# unused
workspace13
:
Optional
[
torch
.
Tensor
],
workspace13
:
Optional
[
torch
.
Tensor
],
workspace2
:
Optional
[
torch
.
Tensor
],
workspace2
:
Optional
[
torch
.
Tensor
],
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
...
...
vllm/model_executor/layers/fused_moe/deep_gemm_moe.py
View file @
4bdf4002
...
@@ -214,13 +214,14 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -214,13 +214,14 @@ class DeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
global_num_experts
:
int
,
global_num_experts
:
int
,
expert_map
:
Optional
[
torch
.
Tensor
],
expert_map
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a2_scale
:
Optional
[
torch
.
Tensor
],
workspace13
:
torch
.
Tensor
,
workspace13
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
apply_router_weight_on_input
:
bool
,
apply_router_weight_on_input
:
bool
,
):
):
assert
a1q_scale
is
not
None
assert
a1q_scale
is
not
None
assert
self
.
a2_scale
is
None
assert
a2_scale
is
None
assert
self
.
block_shape
is
not
None
assert
self
.
block_shape
is
not
None
assert
self
.
w1_scale
is
not
None
assert
self
.
w1_scale
is
not
None
assert
self
.
w2_scale
is
not
None
assert
self
.
w2_scale
is
not
None
...
...
vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py
View file @
4bdf4002
...
@@ -129,6 +129,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -129,6 +129,7 @@ class FlashInferExperts(mk.FusedMoEPermuteExpertsUnpermute):
global_num_experts
:
int
,
global_num_experts
:
int
,
expert_map
:
Optional
[
torch
.
Tensor
],
expert_map
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a2_scale
:
Optional
[
torch
.
Tensor
],
workspace13
:
Optional
[
torch
.
Tensor
],
workspace13
:
Optional
[
torch
.
Tensor
],
workspace2
:
Optional
[
torch
.
Tensor
],
workspace2
:
Optional
[
torch
.
Tensor
],
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
...
...
vllm/model_executor/layers/fused_moe/fused_batched_moe.py
View file @
4bdf4002
...
@@ -688,6 +688,7 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -688,6 +688,7 @@ class NaiveBatchedExperts(mk.FusedMoEPermuteExpertsUnpermute):
global_num_experts
:
int
,
global_num_experts
:
int
,
expert_map
:
Optional
[
torch
.
Tensor
],
expert_map
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a2_scale
:
Optional
[
torch
.
Tensor
],
workspace13
:
torch
.
Tensor
,
workspace13
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
...
@@ -879,6 +880,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -879,6 +880,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
global_num_experts
:
int
,
global_num_experts
:
int
,
expert_map
:
Optional
[
torch
.
Tensor
],
expert_map
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a2_scale
:
Optional
[
torch
.
Tensor
],
workspace13
:
torch
.
Tensor
,
workspace13
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
...
@@ -970,7 +972,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -970,7 +972,7 @@ class BatchedTritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
intermediate_cache1
.
view
(
-
1
,
N
))
intermediate_cache1
.
view
(
-
1
,
N
))
qintermediate_cache2
,
a2q_scale
=
batched_moe_kernel_quantize_input
(
qintermediate_cache2
,
a2q_scale
=
batched_moe_kernel_quantize_input
(
intermediate_cache2
,
self
.
a2_scale
,
max_num_tokens
,
E
,
N
,
intermediate_cache2
,
a2_scale
,
max_num_tokens
,
E
,
N
,
expert_num_tokens
,
self
.
quant_dtype
,
self
.
per_act_token_quant
,
expert_num_tokens
,
self
.
quant_dtype
,
self
.
per_act_token_quant
,
self
.
block_shape
)
self
.
block_shape
)
...
...
vllm/model_executor/layers/fused_moe/fused_moe.py
View file @
4bdf4002
...
@@ -1598,6 +1598,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -1598,6 +1598,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
global_num_experts
:
int
,
global_num_experts
:
int
,
expert_map
:
Optional
[
torch
.
Tensor
],
expert_map
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a2_scale
:
Optional
[
torch
.
Tensor
],
workspace13
:
torch
.
Tensor
,
workspace13
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
...
@@ -1690,7 +1691,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -1690,7 +1691,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
a2q_scale
:
Optional
[
torch
.
Tensor
]
=
None
a2q_scale
:
Optional
[
torch
.
Tensor
]
=
None
qintermediate_cache2
,
a2q_scale
=
moe_kernel_quantize_input
(
qintermediate_cache2
,
a2q_scale
=
moe_kernel_quantize_input
(
intermediate_cache2
,
self
.
a2_scale
,
self
.
quant_dtype
,
intermediate_cache2
,
a2_scale
,
self
.
quant_dtype
,
self
.
per_act_token_quant
,
self
.
block_shape
)
self
.
per_act_token_quant
,
self
.
block_shape
)
invoke_fused_moe_kernel
(
invoke_fused_moe_kernel
(
...
...
vllm/model_executor/layers/fused_moe/gpt_oss_triton_kernels_moe.py
View file @
4bdf4002
...
@@ -179,6 +179,7 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -179,6 +179,7 @@ class BatchedOAITritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
global_num_experts
:
int
,
global_num_experts
:
int
,
expert_map
:
Optional
[
torch
.
Tensor
],
expert_map
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a2_scale
:
Optional
[
torch
.
Tensor
],
workspace13
:
torch
.
Tensor
,
workspace13
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
...
...
vllm/model_executor/layers/fused_moe/modular_kernel.py
View file @
4bdf4002
...
@@ -519,6 +519,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
...
@@ -519,6 +519,7 @@ class FusedMoEPermuteExpertsUnpermute(ABC):
global_num_experts
:
int
,
global_num_experts
:
int
,
expert_map
:
Optional
[
torch
.
Tensor
],
expert_map
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a2_scale
:
Optional
[
torch
.
Tensor
],
workspace13
:
torch
.
Tensor
,
workspace13
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
expert_tokens_meta
:
Optional
[
ExpertTokensMetadata
],
expert_tokens_meta
:
Optional
[
ExpertTokensMetadata
],
...
@@ -634,6 +635,7 @@ class FusedMoEModularKernel(torch.nn.Module):
...
@@ -634,6 +635,7 @@ class FusedMoEModularKernel(torch.nn.Module):
local_num_experts
:
int
,
local_num_experts
:
int
,
expert_map
:
Optional
[
torch
.
Tensor
],
expert_map
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a2_scale
:
Optional
[
torch
.
Tensor
],
expert_tokens_meta
:
Optional
[
ExpertTokensMetadata
],
expert_tokens_meta
:
Optional
[
ExpertTokensMetadata
],
apply_router_weight_on_input
:
bool
,
apply_router_weight_on_input
:
bool
,
)
->
torch
.
Tensor
:
)
->
torch
.
Tensor
:
...
@@ -671,6 +673,7 @@ class FusedMoEModularKernel(torch.nn.Module):
...
@@ -671,6 +673,7 @@ class FusedMoEModularKernel(torch.nn.Module):
global_num_experts
=
global_num_experts
,
global_num_experts
=
global_num_experts
,
expert_map
=
expert_map
,
expert_map
=
expert_map
,
a1q_scale
=
a1q_scale
,
a1q_scale
=
a1q_scale
,
a2_scale
=
a2_scale
,
workspace13
=
workspace13
,
workspace13
=
workspace13
,
workspace2
=
workspace2
,
workspace2
=
workspace2
,
expert_tokens_meta
=
expert_tokens_meta
,
expert_tokens_meta
=
expert_tokens_meta
,
...
@@ -718,6 +721,7 @@ class FusedMoEModularKernel(torch.nn.Module):
...
@@ -718,6 +721,7 @@ class FusedMoEModularKernel(torch.nn.Module):
local_num_experts
=
local_num_experts
,
local_num_experts
=
local_num_experts
,
expert_map
=
expert_map
,
expert_map
=
expert_map
,
a1q_scale
=
a1q_scale
,
a1q_scale
=
a1q_scale
,
a2_scale
=
self
.
fused_experts
.
a2_scale
,
expert_tokens_meta
=
expert_tokens_meta
,
expert_tokens_meta
=
expert_tokens_meta
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
)
)
...
@@ -803,6 +807,7 @@ class FusedMoEModularKernel(torch.nn.Module):
...
@@ -803,6 +807,7 @@ class FusedMoEModularKernel(torch.nn.Module):
local_num_experts
=
local_num_experts
,
local_num_experts
=
local_num_experts
,
expert_map
=
expert_map
,
expert_map
=
expert_map
,
a1q_scale
=
c_a1q_scale
,
a1q_scale
=
c_a1q_scale
,
a2_scale
=
c_a2_scale
,
expert_tokens_meta
=
c_expert_tokens_meta
,
expert_tokens_meta
=
c_expert_tokens_meta
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
apply_router_weight_on_input
=
apply_router_weight_on_input
,
)
)
...
...
vllm/model_executor/layers/fused_moe/triton_deep_gemm_moe.py
View file @
4bdf4002
...
@@ -111,6 +111,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -111,6 +111,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
global_num_experts
:
int
,
global_num_experts
:
int
,
expert_map
:
Optional
[
torch
.
Tensor
],
expert_map
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a2_scale
:
Optional
[
torch
.
Tensor
],
workspace13
:
torch
.
Tensor
,
workspace13
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
...
@@ -134,6 +135,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -134,6 +135,7 @@ class TritonOrDeepGemmExperts(mk.FusedMoEPermuteExpertsUnpermute):
global_num_experts
,
global_num_experts
,
expert_map
,
expert_map
,
a1q_scale
,
a1q_scale
,
a2_scale
,
workspace13
,
workspace13
,
workspace2
,
workspace2
,
expert_tokens_meta
,
expert_tokens_meta
,
...
...
vllm/model_executor/layers/fused_moe/trtllm_moe.py
View file @
4bdf4002
...
@@ -103,6 +103,7 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
...
@@ -103,6 +103,7 @@ class TrtLlmGenExperts(mk.FusedMoEPermuteExpertsUnpermute):
global_num_experts
:
int
,
global_num_experts
:
int
,
expert_map
:
Optional
[
torch
.
Tensor
],
expert_map
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a1q_scale
:
Optional
[
torch
.
Tensor
],
a2_scale
:
Optional
[
torch
.
Tensor
],
workspace13
:
torch
.
Tensor
,
workspace13
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
workspace2
:
torch
.
Tensor
,
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
expert_tokens_meta
:
Optional
[
mk
.
ExpertTokensMetadata
],
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment