Unverified commit a5a623d9, authored by yzong-rh and committed by GitHub
Browse files

[Bugfix] Re-enable Renormalize routing for TRT-LLM MoE experts (#38859)


Signed-off-by: Yifan Zong <yzong@redhat.com>
parent f8c3af2d
...@@ -79,11 +79,8 @@ class TrtLlmBf16Experts(mk.FusedMoEExpertsMonolithic): ...@@ -79,11 +79,8 @@ class TrtLlmBf16Experts(mk.FusedMoEExpertsMonolithic):
RoutingMethodType.Default, RoutingMethodType.Default,
RoutingMethodType.DeepSeekV3, RoutingMethodType.DeepSeekV3,
RoutingMethodType.Llama4, RoutingMethodType.Llama4,
# NOTE: TRTLLM Kernel has issue with Qwen3.5 router. RoutingMethodType.Renormalize,
# Re-enable once the issue is resolved. RoutingMethodType.RenormalizeNaive,
# https://github.com/vllm-project/vllm/issues/37591
# RoutingMethodType.Renormalize,
# RoutingMethodType.RenormalizeNaive
] ]
@staticmethod @staticmethod
......
...@@ -277,13 +277,7 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit ...@@ -277,13 +277,7 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
weight_key: QuantKey | None, weight_key: QuantKey | None,
activation_key: QuantKey | None, activation_key: QuantKey | None,
) -> bool: ) -> bool:
"""Monolithic kernels need to express router support. """Monolithic kernels need to express router support."""
Renormalize/RenormalizeNaive are excluded: the monolithic kernel's
internal routing for these methods produces output uncorrelated
with the modular kernel's output and with Triton kernel's output
for Qwen3.5-35B-A3B-FP8.
See: https://github.com/vllm-project/vllm/issues/37591
"""
# NOTE(dbari): TopK routing could also be enabled, but need to validate models # NOTE(dbari): TopK routing could also be enabled, but need to validate models
# NOTE(dbari): Default is not implemented and should not be enabled until it is # NOTE(dbari): Default is not implemented and should not be enabled until it is
...@@ -295,6 +289,8 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit ...@@ -295,6 +289,8 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
return routing_method in [ return routing_method in [
RoutingMethodType.DeepSeekV3, RoutingMethodType.DeepSeekV3,
RoutingMethodType.Simulated, RoutingMethodType.Simulated,
RoutingMethodType.Renormalize,
RoutingMethodType.RenormalizeNaive,
] ]
elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym): elif (weight_key, activation_key) == (kFp8StaticTensorSym, kFp8StaticTensorSym):
# NOTE(dbari): as above, potentially allow others here. # NOTE(dbari): as above, potentially allow others here.
...@@ -302,6 +298,8 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit ...@@ -302,6 +298,8 @@ class TrtLlmFp8ExpertsMonolithic(TrtLlmFp8ExpertsBase, mk.FusedMoEExpertsMonolit
RoutingMethodType.DeepSeekV3, RoutingMethodType.DeepSeekV3,
RoutingMethodType.Llama4, RoutingMethodType.Llama4,
RoutingMethodType.Simulated, RoutingMethodType.Simulated,
RoutingMethodType.Renormalize,
RoutingMethodType.RenormalizeNaive,
] ]
else: else:
raise ValueError("Unsupported quantization scheme.") raise ValueError("Unsupported quantization scheme.")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment