# FIXME: there is a bug in the trtllm_fp8_block_scale_moe.
# It ignores the ``output`` argument. https://github.com/flashinfer-ai/flashinfer/blob/da01b1bd8f9f22aec8c0eea189ad54860b034947/flashinfer/fused_moe/core.py#L1323-L1325
# So we put the whole function call under the ``use_symmetric_memory`` context manager.
# Once the bug is fixed, only the output tensor allocation needs to be under the context manager.
-output = trtllm_fp8_block_scale_moe(
+return trtllm_fp8_block_scale_moe(
     routing_logits=router_logits.to(torch.float32),
     routing_bias=correction_bias,
     hidden_states=a_q,
...
@@ -1238,8 +1247,6 @@ class Fp8MoEMethod(FusedMoEMethodBase):
# FIXME: there is a bug in the trtllm_fp8_block_scale_moe.
# It ignores the ``output`` argument. https://github.com/flashinfer-ai/flashinfer/blob/da01b1bd8f9f22aec8c0eea189ad54860b034947/flashinfer/fused_moe/core.py#L1323-L1325
# So we put the whole function call under the ``use_symmetric_memory`` context manager.
...
@@ -693,7 +699,6 @@ class ModelOptFp8MoEMethod(FusedMoEMethodBase):
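
For context, here is a minimal, self-contained sketch of the two wrapping strategies the FIXME describes. `symmetric_memory_pool` and `buggy_kernel` are stand-ins invented for illustration only; the real code uses vLLM's `use_symmetric_memory` and flashinfer's `trtllm_fp8_block_scale_moe`.

```python
from contextlib import contextmanager

import torch

_SYMMETRIC_TENSORS: list[torch.Tensor] = []


@contextmanager
def symmetric_memory_pool():
    """Stand-in for ``use_symmetric_memory``: pretend every tensor the
    caller registers inside the block lands in a symmetric heap."""
    registered: list[torch.Tensor] = []
    yield registered
    _SYMMETRIC_TENSORS.extend(registered)


def buggy_kernel(x: torch.Tensor, output: torch.Tensor | None = None) -> torch.Tensor:
    # Mirrors the flashinfer bug: ``output`` is silently ignored and a
    # fresh result tensor is allocated internally.
    return x * 2.0


x = torch.ones(4)

# Workaround while the bug exists: wrap the *whole call*, so the tensor
# the kernel allocates internally is the one that gets registered.
with symmetric_memory_pool() as pool:
    out = buggy_kernel(x)
    pool.append(out)

# Once the bug is fixed, only the output allocation needs the context
# manager; the kernel then writes into the caller-provided buffer.
with symmetric_memory_pool() as pool:
    out = torch.empty_like(x)
    pool.append(out)
buggy_kernel(x, output=out)
```

The distinction comes down to allocation ownership: as long as the kernel allocates its own result tensor, the context manager must be active at the moment of that internal allocation, which is why the entire call currently sits inside it.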