[Model] Add LongCat-Flash

50281e36 · zhuwenwen · a441a5d9 · 50281e36 · 50281e36 · 50281e36
Commit 50281e36 authored Sep 18, 2025 by zhuwenwen
18 changed files
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -44,6 +44,9 @@ __global__ void moe_align_block_size_kernel(
  for (size_t i = tid; i < numel; i += stride) {
    int expert_id = topk_ids[i];
+    if (expert_id >= num_experts) {
+      continue;
+    }
    int warp_idx = expert_id / experts_per_warp;
    int expert_offset = expert_id % experts_per_warp;
    atomicAdd(&shared_counts[warp_idx * experts_per_warp + expert_offset], 1);
@@ -95,12 +98,15 @@ template <typename scalar_t>
 __global__ void count_and_sort_expert_tokens_kernel(
    const scalar_t* __restrict__ topk_ids,
    int32_t* __restrict__ sorted_token_ids, int32_t* __restrict__ cumsum_buffer,
-    size_t numel) {
+    size_t numel, int32_t num_experts) {
  const size_t tid = blockIdx.x * blockDim.x + threadIdx.x;
  const size_t stride = blockDim.x * gridDim.x;
  for (size_t i = tid; i < numel; i += stride) {
    int32_t expert_id = topk_ids[i];
+    if (expert_id >= num_experts) {
+      continue;
+    }
    int32_t rank_post_pad = atomicAdd(&cumsum_buffer[expert_id], 1);
    sorted_token_ids[rank_post_pad] = i;
  }
@@ -298,7 +304,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
          sort_kernel<<<actual_blocks, block_threads, 0, stream>>>(
              topk_ids.data_ptr<scalar_t>(),
              sorted_token_ids.data_ptr<int32_t>(),
-              cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel());
+              cumsum_buffer.data_ptr<int32_t>(), topk_ids.numel(), num_experts);
        }
      });
 }

--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -414,6 +414,7 @@ th {
 | `MiniMaxM1ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-M1-40k`, `MiniMaxAI/MiniMax-M1-80k`, etc. | | | ✅︎ |
 | `MiniMaxText01ForCausalLM` | MiniMax-Text | `MiniMaxAI/MiniMax-Text-01`, etc. | | | ✅︎ |
 | `Zamba2ForCausalLM` | Zamba2 | `Zyphra/Zamba2-7B-instruct`, `Zyphra/Zamba2-2.7B-instruct`, `Zyphra/Zamba2-1.2B-instruct`, etc. | | | ✅︎ |
+| `LongcatFlashForCausalLM` | LongCat-Flash | `meituan-longcat/LongCat-Flash-Chat`, `meituan-longcat/LongCat-Flash-Chat-FP8` | ✅︎ |✅︎ | ✅︎ |
 Some models are supported only via the [Transformers backend](#transformers). The purpose of the table below is to acknowledge models which we officially support in this way. The logs will say that the Transformers backend is being used, and you will see no warning that this is fallback behaviour. This means that, if you have issues with any of the models listed below, please [make an issue](https://github.com/vllm-project/vllm/issues/new/choose) and we'll do our best to fix it!

--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -359,6 +359,8 @@ _TEXT_GENERATION_EXAMPLE_MODELS = {
    "MiMoForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix, "XiaomiMiMo/MiMo-7B-RL"),
                                        trust_remote_code=True),
    "Dots1ForCausalLM": _HfExamplesInfo(os.path.join(models_path_prefix,"rednote-hilab/dots.llm1.inst")),
+    "LongcatFlashForCausalLM": _HfExamplesInfo
+                (os.path.join(models_path_prefix, "meituan-longcat/LongCat-Flash-Chat"), trust_remote_code=True),
    # [Encoder-decoder]
    "BartModel": _HfExamplesInfo(os.path.join(models_path_prefix,"facebook/bart-base")),
    "BartForConditionalGeneration": _HfExamplesInfo(os.path.join(models_path_prefix,"facebook/bart-large-cnn")),

--- a/vllm/config/__init__.py
+++ b/vllm/config/__init__.py
@@ -1403,7 +1403,8 @@ class ModelConfig:
        if not hasattr(self.hf_text_config, "model_type"):
            return False
        elif self.hf_text_config.model_type in \
-            ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp', 'kimi_k2'):
+            ('deepseek_v2', 'deepseek_v3', 'deepseek_mtp',
+              'kimi_k2', 'longcat_flash'):
            return self.hf_text_config.kv_lora_rank is not None
        elif self.hf_text_config.model_type == 'eagle':
            # if the model is an EAGLE module, check for the
@@ -1530,6 +1531,9 @@ class ModelConfig:
                or self.hf_config.model_type == "qwen3_next_mtp"):
            total_num_hidden_layers = getattr(self.hf_text_config,
                                              "num_nextn_predict_layers", 0)
+        elif (self.hf_config.model_type == "longcat_flash_mtp"):
+            total_num_hidden_layers = getattr(self.hf_text_config,
+                                              "num_nextn_predict_layers", 1)
        else:
            total_num_hidden_layers = getattr(self.hf_text_config,
                                              "num_hidden_layers", 0)
@@ -1898,7 +1902,7 @@ class DeviceConfig:
 SpeculativeMethod = Literal["ngram", "eagle", "eagle3", "medusa",
                            "mlp_speculator", "draft_model", "deepseek_mtp",
-                            "ernie_mtp", "qwen3_next_mtp"]
+                            "ernie_mtp", "qwen3_next_mtp", "longcat_flash_mtp"]
 @config
@@ -2012,6 +2016,13 @@ class SpeculativeConfig:
                "n_predict": n_predict,
                "architectures": ["DeepSeekMTPModel"]
            })
+        if hf_config.model_type == "longcat_flash":
+            hf_config.model_type = "longcat_flash_mtp"
+            n_predict = getattr(hf_config, "num_nextn_predict_layers", 1)
+            hf_config.update({
+                "n_predict": n_predict,
+                "architectures": ["LongCatFlashMTPModel"]
+            })
        if hf_config.architectures[0] == "MiMoForCausalLM":
            hf_config.model_type = "mimo_mtp"
@@ -2193,6 +2204,15 @@ class SpeculativeConfig:
                                "one layer. Might need some code changes " \
                                "to support multiple layers."
                            )
+                elif (self.draft_model_config.hf_config.model_type
+                      in ("longcat_flash_mtp")):
+                    self.method = "longcat_flash_mtp"
+                    if self.num_speculative_tokens > 1:
+                        logger.warning(
+                                "LongCat MTP models only have " \
+                                "one layer. Might need some code changes " \
+                                "to support multiple layers."
+                            )
                else:
                    self.method = "draft_model"
                    raise NotImplementedError(
@@ -2413,7 +2433,7 @@ class SpeculativeConfig:
    def use_eagle(self) -> bool:
        return self.method in ("eagle", "eagle3", "deepseek_mtp", "ernie_mtp",
-                               "qwen3_next_mtp")
+                               "qwen3_next_mtp", "longcat_flash_mtp")
    def __repr__(self) -> str:
        method = self.method

--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -1386,7 +1386,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # vLLM will use lightop for moe_fused_gate and moe_align_block_size
    "VLLM_USE_LIGHT_OP":
-        lambda: (os.environ.get("VLLM_USE_LIGHT_OP", "True").lower() in
+        lambda: (os.environ.get("VLLM_USE_LIGHT_OP", "False").lower() in
                 ("true", "1")),
    # vLLM will use global cache for moe

--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -887,6 +887,73 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
        )
+@triton.jit
+def compute_identity_kernel(
+    top_k,
+    hidden_states_ptr,
+    expert_scales_ptr,
+    num_tokens,
+    output_ptr,
+    hidden_dim,
+    scales_stride,
+    BLOCK_SIZE: tl.constexpr,
+):
+    pid = tl.program_id(0)
+    batch_id = pid // (hidden_dim // BLOCK_SIZE)
+    dim_offset = pid % (hidden_dim // BLOCK_SIZE) * BLOCK_SIZE
+    if batch_id >= num_tokens or dim_offset >= hidden_dim:
+        return
+    h = tl.load(hidden_states_ptr + batch_id * hidden_dim + dim_offset +
+                tl.arange(0, BLOCK_SIZE),
+                mask=(dim_offset + tl.arange(0, BLOCK_SIZE)) < hidden_dim)
+    result = tl.zeros([BLOCK_SIZE], dtype=tl.float32)
+    for i in range(top_k):
+        scale = tl.load(expert_scales_ptr + batch_id * scales_stride + i)
+        result += h * scale
+    tl.store(output_ptr + batch_id * hidden_dim + dim_offset +
+             tl.arange(0, BLOCK_SIZE),
+             result,
+             mask=(dim_offset + tl.arange(0, BLOCK_SIZE)) < hidden_dim)
+def zero_experts_compute_triton(expert_indices, expert_scales, num_experts,
+                                zero_expert_type, hidden_states):
+    N = expert_indices.numel()
+    top_k = expert_indices.size(-1)
+    grid = lambda meta: (triton.cdiv(N, meta['BLOCK_SIZE']), )
+    if zero_expert_type == "identity":
+        zero_expert_mask = expert_indices < num_experts
+        zero_expert_scales = expert_scales.clone()
+        zero_expert_scales[zero_expert_mask] = 0.0
+    normal_expert_mask = expert_indices >= num_experts
+    expert_indices[normal_expert_mask] = 0
+    expert_scales[normal_expert_mask] = 0.0
+    output = torch.zeros_like(hidden_states).to(hidden_states.device)
+    hidden_dim = hidden_states.size(-1)
+    num_tokens = hidden_states.size(0)
+    grid = lambda meta: (num_tokens * (hidden_dim // meta['BLOCK_SIZE']), )
+    compute_identity_kernel[grid](
+        top_k,
+        hidden_states,
+        zero_expert_scales,
+        num_tokens,
+        output,
+        hidden_dim,
+        zero_expert_scales.stride(0),
+        BLOCK_SIZE=256,
+    )
+    return output
 # Adapted from: https://github.com/sgl-project/sglang/pull/2628
 def get_config_file_name(E: int,
                         N: int,
@@ -1188,6 +1255,25 @@ def fused_topk(
    return topk_weights, topk_ids, token_expert_indices
+def fused_topk_bias(
+    hidden_states: torch.Tensor,
+    gating_output: torch.Tensor,
+    e_score_correction_bias: torch.Tensor,
+    topk: int,
+    renormalize: bool,
+):
+    n_routed_experts = gating_output.shape[-1]
+    scores = gating_output.softmax(dim=-1)
+    scores_for_choice = scores.view(
+        -1, n_routed_experts) + e_score_correction_bias.unsqueeze(0)
+    topk_indices = torch.topk(scores_for_choice, k=topk, dim=-1,
+                              sorted=False)[1]
+    topk_weights = scores.gather(1, topk_indices)
+    if renormalize:
+        topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
+    return topk_weights.to(torch.float32), topk_indices.to(torch.int32)
 def is_power_of_two(n):
    return n > 0 and math.log2(n).is_integer()

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -26,6 +26,8 @@ from vllm.model_executor.custom_op import CustomOp
 # yapf: disable
 from vllm.model_executor.layers.fused_moe.config import (
    FusedMoEConfig, FusedMoEParallelConfig)
+from vllm.model_executor.layers.fused_moe.fused_moe import (
+    zero_experts_compute_triton)
 # yapf: enable
 from vllm.model_executor.layers.fused_moe.modular_kernel import (
    FusedMoEActivationFormat, FusedMoEModularKernel,
@@ -263,6 +265,8 @@ class FusedMoEMethodBase(QuantizeMethodBase):
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
+        topk_weights: Optional[torch.Tensor] = None,
+        topk_ids: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        raise NotImplementedError
@@ -437,6 +441,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
+        topk_weights: Optional[torch.Tensor] = None,
+        topk_ids: Optional[torch.Tensor] = None,
        use_nn_moe: Optional[bool] = False,
        use_fused_gate: Optional[bool] = False,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
@@ -493,10 +499,13 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
+        topk_weights: Optional[torch.Tensor] = None,
+        topk_ids: Optional[torch.Tensor] = None,
        use_nn_moe: Optional[bool] = False,
        use_fused_gate: Optional[bool] = False,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
+        if topk_weights is None or topk_ids is None:
            topk_weights, topk_ids = FusedMoE.select_experts(
                hidden_states=x,
                router_logits=router_logits,
@@ -518,7 +527,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                use_fused_gate=use_fused_gate)
        if self.rocm_aiter_moe_enabled:
-            return self.rocm_aiter_fused_experts(
+            result = self.rocm_aiter_fused_experts(
                hidden_states=x,
                w1=layer.w13_weight,
                w2=layer.w2_weight,
@@ -531,7 +540,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
            if self.has_bias:
                raise ValueError(
                    "FusedMoEModularKernel does not support bias.")
-            return self.fused_experts(
+            result = self.fused_experts(
                hidden_states=x,
                w1=layer.w13_weight,
                w2=layer.w2_weight,
@@ -546,7 +555,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
            )
        else:
            assert fused_experts is not None
-            return fused_experts(
+            result = fused_experts(
                hidden_states=x,
                w1=layer.w13_weight,
                w2=layer.w2_weight,
@@ -561,6 +570,7 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
                expert_map=expert_map,
                use_nn_moe=use_nn_moe,
            )
+        return result
    def forward_cpu(
        self,
@@ -584,6 +594,8 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
+        topk_weights: Optional[torch.Tensor] = None,
+        topk_ids: Optional[torch.Tensor] = None,
        use_nn_moe: Optional[bool] = False,
        use_fused_gate: Optional[bool] = False,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
@@ -825,6 +837,8 @@ class FusedMoE(CustomOp):
        num_redundant_experts: int = 0,
        has_bias: bool = False,
        is_sequence_parallel=False,
+        zero_expert_num: Optional[int] = 0,
+        zero_expert_type: Optional[str] = None,
    ):
        super().__init__()
        if params_dtype is None:
@@ -848,6 +862,8 @@ class FusedMoE(CustomOp):
                vllm_parallel_config=vllm_config.parallel_config))
        self.global_num_experts = num_experts + num_redundant_experts
+        self.zero_expert_num = zero_expert_num
+        self.zero_expert_type = zero_expert_type
        # we are padding globally so EP buffer allocation works
        if quant_config and quant_config.get_name() == "mxfp4":
@@ -1544,7 +1560,8 @@ class FusedMoE(CustomOp):
            equivalent to global logical ids, so should be compatible with
            plain MoE implementations without redundant experts.
        """
-        from vllm.model_executor.layers.fused_moe.fused_moe import fused_topk
+        from vllm.model_executor.layers.fused_moe.fused_moe import (
+            fused_topk, fused_topk_bias)
        # Check if we should use a routing simulation strategy
        routing_strategy = envs.VLLM_MOE_ROUTING_SIMULATION_STRATEGY
@@ -1595,6 +1612,16 @@ class FusedMoE(CustomOp):
            if indices_type is not None:
                topk_ids = topk_ids.to(dtype=indices_type)
+        elif e_score_correction_bias is not None:
+            topk_weights, topk_ids = fused_topk_bias(
+                hidden_states=hidden_states,
+                gating_output=router_logits,
+                e_score_correction_bias=e_score_correction_bias.data,
+                topk=top_k,
+                renormalize=renormalize,
+            )
+            if routed_scaling_factor is not None:
+                topk_weights *= routed_scaling_factor
        elif custom_routing_function is None:
            topk_weights, topk_ids, token_expert_indices = fused_topk(
                hidden_states=hidden_states,
@@ -1775,6 +1802,36 @@ class FusedMoE(CustomOp):
            staged_hidden_states.copy_(hidden_states, non_blocking=True)
            staged_router_logits.copy_(router_logits, non_blocking=True)
+            zero_expert_result = None
+            if self.zero_expert_num > 0 and self.zero_expert_type is not None:
+                topk_weights, topk_ids = FusedMoE.select_experts(
+                    hidden_states=staged_hidden_states,
+                    router_logits=staged_router_logits,
+                    use_grouped_topk=self.use_grouped_topk,
+                    top_k=self.top_k,
+                    renormalize=self.renormalize,
+                    topk_group=self.topk_group,
+                    num_expert_group=self.num_expert_group,
+                    custom_routing_function=self.custom_routing_function,
+                    scoring_func=self.scoring_func,
+                    routed_scaling_factor=self.routed_scaling_factor,
+                    e_score_correction_bias=self.e_score_correction_bias,
+                    indices_type=self.quant_method.topk_indices_dtype,
+                    enable_eplb=self.enable_eplb,
+                    expert_map=self.expert_map,
+                    expert_load_view=self.expert_load_view,
+                    logical_to_physical_map=self.logical_to_physical_map,
+                    logical_replica_count=self.logical_replica_count)
+                # Compute zero_expert_result
+                zero_expert_result = zero_experts_compute_triton(
+                    expert_indices=topk_ids,
+                    expert_scales=topk_weights,
+                    num_experts=self.global_num_experts,
+                    zero_expert_type=self.zero_expert_type,
+                    hidden_states=staged_hidden_states,
+                )
            # Matrix multiply.
            final_hidden_states = self.quant_method.apply(
                layer=self,
@@ -1796,11 +1853,21 @@ class FusedMoE(CustomOp):
                expert_load_view=self.expert_load_view,
                logical_to_physical_map=self.logical_to_physical_map,
                logical_replica_count=self.logical_replica_count,
+                topk_weights=topk_weights if self.zero_expert_num > 0
+                and self.zero_expert_type is not None else None,
+                topk_ids=topk_ids if self.zero_expert_num > 0
+                and self.zero_expert_type is not None else None,
            )
            assert self.shared_experts is None or isinstance(
                final_hidden_states, tuple)
+            if zero_expert_result is not None:
+                assert isinstance(final_hidden_states, torch.Tensor), \
+                    "Shared experts and zero experts are mutually exclusive"
+                final_hidden_states = final_hidden_states + \
+                    zero_expert_result[:final_hidden_states.size(0)]
            if not skip_result_store:
                if self.shared_experts is None:
                    full_fused_final_hidden_states[
@@ -1881,6 +1948,36 @@ class FusedMoE(CustomOp):
            hidden_states, router_logits = get_ep_group().dispatch(
                hidden_states, router_logits)
+        zero_expert_result = None
+        if self.zero_expert_num > 0 and self.zero_expert_type is not None:
+            topk_weights, topk_ids = FusedMoE.select_experts(
+                hidden_states=hidden_states,
+                router_logits=router_logits,
+                use_grouped_topk=self.use_grouped_topk,
+                top_k=self.top_k,
+                renormalize=self.renormalize,
+                topk_group=self.topk_group,
+                num_expert_group=self.num_expert_group,
+                custom_routing_function=self.custom_routing_function,
+                scoring_func=self.scoring_func,
+                routed_scaling_factor=self.routed_scaling_factor,
+                e_score_correction_bias=self.e_score_correction_bias,
+                indices_type=self.quant_method.topk_indices_dtype,
+                enable_eplb=self.enable_eplb,
+                expert_map=self.expert_map,
+                expert_load_view=self.expert_load_view,
+                logical_to_physical_map=self.logical_to_physical_map,
+                logical_replica_count=self.logical_replica_count)
+            # Compute zero_expert_result
+            zero_expert_result = zero_experts_compute_triton(
+                expert_indices=topk_ids,
+                expert_scales=topk_weights,
+                num_experts=self.global_num_experts,
+                zero_expert_type=self.zero_expert_type,
+                hidden_states=hidden_states,
+            )
        # Matrix multiply.
        final_hidden_states = self.quant_method.apply(
            layer=self,
@@ -1903,6 +2000,10 @@ class FusedMoE(CustomOp):
            expert_load_view=self.expert_load_view,
            logical_to_physical_map=self.logical_to_physical_map,
            logical_replica_count=self.logical_replica_count,
+            topk_weights=topk_weights if self.zero_expert_num > 0
+            and self.zero_expert_type is not None else None,
+            topk_ids=topk_ids if self.zero_expert_num > 0
+            and self.zero_expert_type is not None else None,
            use_nn_moe=self.use_nn_moe,
            use_fused_gate=self.use_fused_gate
        )
@@ -1928,14 +2029,19 @@ class FusedMoE(CustomOp):
            return states
-        if self.shared_experts is None:
+        if self.shared_experts is not None:
-            assert not isinstance(final_hidden_states, tuple)
-            return reduce_output(final_hidden_states)
-        else:
            return (
                reduce_output(final_hidden_states[0], do_combine=False),
                reduce_output(final_hidden_states[1]),
            )
+        elif zero_expert_result is not None:
+            assert isinstance(final_hidden_states, torch.Tensor), \
+                    "Shared experts and zero experts are mutually exclusive"
+            return reduce_output(
+                final_hidden_states) + zero_expert_result[:final_hidden_states.
+                                                          size(0)]
+        else:
+            return reduce_output(final_hidden_states)
    @classmethod
    def make_expert_params_mapping(

--- a/vllm/model_executor/layers/mla.py
+++ b/vllm/model_executor/layers/mla.py
@@ -103,7 +103,6 @@ class MultiHeadLatentAttention(CustomOp):
        )
        self.prefix = prefix
-        self.debug_layer_idx = int(self.prefix.split(".")[-2])
    def forward_native(
        self,

--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -17,6 +17,8 @@ from vllm.model_executor.layers.fused_moe import (
    FusedMoE, FusedMoEActivationFormat, FusedMoEConfig, FusedMoEMethodBase,
    FusedMoEPermuteExpertsUnpermute, FusedMoEPrepareAndFinalize,
    FusedMoeWeightScaleSupported)
+from vllm.model_executor.layers.fused_moe.layer import (
+    UnquantizedFusedMoEMethod)
 from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
                                               UnquantizedLinearMethod)
 from vllm.model_executor.layers.quantization import QuantizationMethods
@@ -174,6 +176,10 @@ class Fp8Config(QuantizationConfig):
                return UnquantizedLinearMethod()
            return Fp8LinearMethod(self)
        elif isinstance(layer, FusedMoE):
+            if is_layer_skipped(prefix=prefix,
+                                ignored_layers=self.ignored_layers,
+                                fused_mapping=self.packed_modules_mapping):
+                return UnquantizedFusedMoEMethod(layer.moe_config)
            return Fp8MoEMethod(self, layer)
        elif isinstance(layer, Attention):
            return Fp8KVCacheMethod(self)
@@ -998,6 +1004,8 @@ class Fp8MoEMethod(FusedMoEMethodBase):
        expert_load_view: Optional[torch.Tensor] = None,
        logical_to_physical_map: Optional[torch.Tensor] = None,
        logical_replica_count: Optional[torch.Tensor] = None,
+        topk_weights: Optional[torch.Tensor] = None,
+        topk_ids: Optional[torch.Tensor] = None,
    ) -> Union[torch.Tensor, tuple[torch.Tensor, torch.Tensor]]:
        if enable_eplb:
            assert expert_load_view is not None
@@ -1014,7 +1022,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                assert (renormalize and use_grouped_topk
                        and custom_routing_function is None)
-                return torch.ops.vllm.flashinfer_fused_moe_blockscale_fp8(
+                result = torch.ops.vllm.flashinfer_fused_moe_blockscale_fp8(
                    routing_logits=router_logits.to(torch.float32),
                    routing_bias=e_score_correction_bias,
                    x=x,
@@ -1035,7 +1043,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            else:
                assert (not renormalize
                        and custom_routing_function is not None)
-                return apply_flashinfer_per_tensor_scale_fp8(
+                result = apply_flashinfer_per_tensor_scale_fp8(
                    layer=layer,
                    hidden_states=x,
                    router_logits=router_logits,
@@ -1046,6 +1054,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                    topk_group=topk_group,
                    apply_router_weight_on_input=apply_router_weight_on_input)
+        if topk_weights is None or topk_ids is None:
            topk_weights, topk_ids = FusedMoE.select_experts(
                hidden_states=x,
                router_logits=router_logits,
@@ -1069,7 +1078,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
        if self.rocm_aiter_moe_enabled:
            from vllm.model_executor.layers.fused_moe.rocm_aiter_fused_moe import (  # noqa: E501
                rocm_aiter_fused_experts)
-            return rocm_aiter_fused_experts(
+            result = rocm_aiter_fused_experts(
                x,
                layer.w13_weight,
                layer.w2_weight,
@@ -1089,7 +1098,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
        elif self.use_marlin:
            assert activation == "silu", (
                f"{activation} not supported for Marlin MoE.")
-            return torch.ops.vllm.fused_marlin_moe(
+            result = torch.ops.vllm.fused_marlin_moe(
                x,
                layer.w13_weight,
                layer.w2_weight,
@@ -1112,7 +1121,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            assert scoring_func == 'sigmoid', (
                f"Expected 'sigmoid' scoring func but got {scoring_func}")
            if self.fused_experts is not None:
-                return self.fused_experts(
+                result = self.fused_experts(
                    x,
                    layer.w13_weight,
                    layer.w2_weight,
@@ -1125,7 +1134,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                    apply_router_weight_on_input=apply_router_weight_on_input,
                )
            else:
-                return flashinfer_cutlass_moe_fp8(
+                result = flashinfer_cutlass_moe_fp8(
                    x,
                    layer,
                    topk_weights,
@@ -1157,10 +1166,10 @@ class Fp8MoEMethod(FusedMoEMethodBase):
            )
            if self.fused_experts is not None:
-                return self.fused_experts(**common_kwargs)
+                result = self.fused_experts(**common_kwargs)
            else:
                from vllm.model_executor.layers.fused_moe import fused_experts
-                return fused_experts(
+                result = fused_experts(
                    **common_kwargs,
                    use_fp8_w8a8=True,
                    block_shape=self.quant_config.weight_block_size,
@@ -1168,6 +1177,7 @@ class Fp8MoEMethod(FusedMoEMethodBase):
                    allow_cutlass_block_scaled_grouped_gemm=(
                        self.allow_cutlass_block_scaled_grouped_gemm),
                )
+        return result
 class Fp8KVCacheMethod(BaseKVCacheMethod):

--- a/vllm/model_executor/layers/quantization/utils/quant_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/quant_utils.py
@@ -283,6 +283,11 @@ def is_layer_skipped(
                    f"Detected some but not all shards of {prefix} "
                    "are quantized. All shards of fused layers "
                    "to have the same precision.")
+    elif "experts" in prefix:
+        return any([
+            prefix in layer_name for layer_name in ignored_layers
+            if "experts" in layer_name
+        ])
    else:
        is_skipped = prefix in ignored_layers

--- a/vllm/model_executor/models/longcat_flash.py
+++ b/vllm/model_executor/models/longcat_flash.py
--- a/vllm/model_executor/models/longcat_flash_mtp.py
+++ b/vllm/model_executor/models/longcat_flash_mtp.py
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+# Adapted from
+# https://github.com/vllm-project/vllm/blob/v0.7.3/vllm/model_executor/models/deepseek_mtp.py
+from collections.abc import Iterable
+from typing import Optional
+import torch
+import torch.nn as nn
+from transformers import PretrainedConfig
+from vllm.config import VllmConfig
+from vllm.model_executor.layers.layernorm import RMSNorm
+from vllm.model_executor.layers.linear import ReplicatedLinear
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization import QuantizationConfig
+from vllm.model_executor.layers.quantization.utils.int8_utils import (
+    block_dequant)
+from vllm.model_executor.layers.vocab_parallel_embedding import (
+    ParallelLMHead, VocabParallelEmbedding)
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.longcat_flash import FlashConfig
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.sequence import IntermediateTensors
+from .deepseek_v2 import DeepseekV2DecoderLayer
+from .interfaces import SupportsPP
+from .utils import maybe_prefix
+class LongCatMultiTokenPredictorLayer(nn.Module):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        prefix: str,
+        vllm_config: VllmConfig,
+        quant_config: Optional[QuantizationConfig] = None,
+    ) -> None:
+        super().__init__()
+        self.enorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.hnorm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.eh_proj = ReplicatedLinear(2 * config.hidden_size,
+                                        config.hidden_size,
+                                        bias=False,
+                                        quant_config=quant_config,
+                                        prefix="eh_proj")
+        self.mtp_block = DeepseekV2DecoderLayer(vllm_config, prefix)
+        self.final_layernorm = RMSNorm(config.hidden_size,
+                                       eps=config.rms_norm_eps)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        previous_hidden_states: torch.Tensor,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        spec_step_index: int = 0,
+    ) -> torch.Tensor:
+        assert inputs_embeds is not None
+        inputs_embeds = self.enorm(inputs_embeds)
+        previous_hidden_states = self.hnorm(previous_hidden_states)
+        hidden_states, _ = self.eh_proj(
+            torch.cat([inputs_embeds, previous_hidden_states], dim=-1))
+        hidden_states, residual = self.mtp_block(positions=positions,
+                                                 hidden_states=hidden_states,
+                                                 residual=None)
+        hidden_states, _ = self.final_layernorm(hidden_states, residual)
+        return hidden_states
+class LongCatMultiTokenPredictor(nn.Module):
+    def __init__(self,
+                 *,
+                 vllm_config: VllmConfig,
+                 quant_config: Optional[QuantizationConfig] = None,
+                 prefix: str = ""):
+        super().__init__()
+        config = FlashConfig(**vllm_config.model_config.hf_config.__dict__)
+        vllm_config.model_config.hf_config.intermediate_size \
+                                                = config.intermediate_size
+        self.mtp_start_layer_idx = config.num_hidden_layers * 2
+        self.num_mtp_layers = 1
+        self.layers = torch.nn.ModuleDict({
+            str(idx):
+            LongCatMultiTokenPredictorLayer(
+                config,
+                prefix=f"{prefix}.layers.{idx}",
+                vllm_config=vllm_config,
+                quant_config=quant_config,
+            )
+            for idx in range(self.mtp_start_layer_idx,
+                             self.mtp_start_layer_idx + self.num_mtp_layers)
+        })
+        self.embed_tokens = VocabParallelEmbedding(
+            config.vocab_size,
+            config.hidden_size,
+        )
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        previous_hidden_states: torch.Tensor,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor:
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+        current_step_idx = (spec_step_idx % self.num_mtp_layers)
+        return self.layers[str(self.mtp_start_layer_idx + current_step_idx)](
+            input_ids,
+            positions,
+            previous_hidden_states,
+            inputs_embeds,
+            current_step_idx,
+        )
+class LongCatFlashMTP(nn.Module, SupportsPP):
+    def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
+        super().__init__()
+        # LongCat MTP without MoE layers
+        vllm_config.model_config.hf_config.n_routed_experts = None
+        self.config = FlashConfig(
+            **vllm_config.model_config.hf_config.__dict__)
+        self.quant_config = None if "mtp" in getattr(
+            self.config, "disable_quant_module",
+            []) else vllm_config.quant_config
+        self.model = LongCatMultiTokenPredictor(vllm_config=vllm_config,
+                                                quant_config=self.quant_config,
+                                                prefix=maybe_prefix(
+                                                    prefix, "model"))
+        self.lm_head = ParallelLMHead(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            quant_config=self.quant_config,
+        )
+        self.logits_processor = LogitsProcessor(self.config.vocab_size)
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        hidden_states: torch.Tensor,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        spec_step_idx: int = 0,
+    ) -> torch.Tensor:
+        hidden_states = self.model(input_ids, positions, hidden_states,
+                                   inputs_embeds, spec_step_idx)
+        return hidden_states
+    def compute_logits(
+        self,
+        hidden_states: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+        spec_step_idx: int = 0,
+    ) -> Optional[torch.Tensor]:
+        logits = self.logits_processor(self.lm_head, hidden_states,
+                                       sampling_metadata)
+        return logits
+    def load_weights(self, weights: Iterable[tuple[str,
+                                                   torch.Tensor]]) -> set[str]:
+        stacked_params_mapping = [
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+            ("fused_qkv_a_proj", "q_a_proj", 0),
+            ("fused_qkv_a_proj", "kv_a_proj_with_mqa", 1),
+        ]
+        new_to_old_names_mapping = {
+            "model.mtp.embed_tokens.weight":
+            "model.layers.0.embed_tokens.weight",
+            "model.mtp.layers.0.eh_proj.weight": "eh_proj.weight",
+            "model.mtp.layers.0.eh_proj.weight_scale_inv":
+            "eh_proj.weight_scale_inv",
+            "model.mtp.layers.0.enorm.m.weight": "enorm.weight",
+            "model.mtp.layers.0.hnorm.m.weight": "hnorm.weight",
+            "model.mtp.layers.0.input_layernorm.weight":
+            "model.layers.0.input_layernorm.weight",
+            "model.mtp.layers.0.post_attention_layernorm.weight":
+            "model.layers.0.post_attention_layernorm.weight",
+            "model.mtp.layers.0.self_attn.kv_a_layernorm.weight":
+            "model.layers.0.self_attn.kv_a_layernorm.weight",
+            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight":
+            "model.layers.0.self_attn.kv_a_proj_with_mqa.weight",
+            "model.mtp.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv":
+            "model.layers.0.self_attn.kv_a_proj_with_mqa.weight_scale_inv",
+            "model.mtp.layers.0.self_attn.kv_b_proj.weight":
+            "model.layers.0.self_attn.kv_b_proj.weight",
+            "model.mtp.layers.0.self_attn.kv_b_proj.weight_scale_inv":
+            "model.layers.0.self_attn.kv_b_proj.weight_scale_inv",
+            "model.mtp.layers.0.self_attn.o_proj.weight":
+            "model.layers.0.self_attn.o_proj.weight",
+            "model.mtp.layers.0.self_attn.o_proj.weight_scale_inv":
+            "model.layers.0.self_attn.o_proj.weight_scale_inv",
+            "model.mtp.layers.0.self_attn.q_a_layernorm.weight":
+            "model.layers.0.self_attn.q_a_layernorm.weight",
+            "model.mtp.layers.0.self_attn.q_a_proj.weight":
+            "model.layers.0.self_attn.q_a_proj.weight",
+            "model.mtp.layers.0.self_attn.q_a_proj.weight_scale_inv":
+            "model.layers.0.self_attn.q_a_proj.weight_scale_inv",
+            "model.mtp.layers.0.self_attn.q_b_proj.weight":
+            "model.layers.0.self_attn.q_b_proj.weight",
+            "model.mtp.layers.0.self_attn.q_b_proj.weight_scale_inv":
+            "model.layers.0.self_attn.q_b_proj.weight_scale_inv",
+            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight":
+            "model.layers.0.mlp.down_proj.weight",
+            "model.mtp.layers.0.transformer_layer.mlp.down_proj.weight_scale_inv":
+            "model.layers.0.mlp.down_proj.weight_scale_inv",
+            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight":
+            "model.layers.0.mlp.gate_proj.weight",
+            "model.mtp.layers.0.transformer_layer.mlp.gate_proj.weight_scale_inv":
+            "model.layers.0.mlp.gate_proj.weight_scale_inv",
+            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight":
+            "model.layers.0.mlp.up_proj.weight",
+            "model.mtp.layers.0.transformer_layer.mlp.up_proj.weight_scale_inv":
+            "model.layers.0.mlp.up_proj.weight_scale_inv",
+            "model.mtp.norm.weight": "final_layernorm.weight",
+        }
+        params_dict = dict(self.named_parameters())
+        loaded_params: set[str] = set()
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            spec_layer = self.get_spec_layer_idx_from_weight_name(
+                self.config, name)
+            if spec_layer is None:
+                continue
+            name = self._rewrite_spec_layer_name(spec_layer, name,
+                                                 new_to_old_names_mapping)
+            for (param_name, weight_name, shard_id) in stacked_params_mapping:
+                # Skip non-stacked layers and experts (experts handled below).
+                if weight_name not in name:
+                    continue
+                # We have mlp.experts[0].gate_proj in the checkpoint.
+                # Since we handle the experts below in expert_params_mapping,
+                # we need to skip here BEFORE we update the name, otherwise
+                # name will be updated to mlp.experts[0].gate_up_proj, which
+                # will then be updated below in expert_params_mapping
+                # for mlp.experts[0].gate_gate_up_proj, which breaks load.
+                if (("mlp.experts." in name) and name not in params_dict):
+                    continue
+                name = name.replace(weight_name, param_name)
+                # QKV fusion is optional, fall back to normal
+                # weight loading if it's not enabled
+                if ((param_name == "fused_qkv_a_proj")
+                        and name not in params_dict):
+                    continue
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                param = params_dict[name]
+                weight_loader = param.weight_loader
+                weight_loader(param, loaded_weight, shard_id)
+                break
+            else:
+                # Skip loading extra bias for GPTQ models.
+                if name.endswith(".bias") and name not in params_dict:
+                    continue
+                # According to DeepSeek-V3 Technical Report, MTP modules
+                # shares embedding layer. We only load the first weights.
+                if (spec_layer != self.model.mtp_start_layer_idx
+                        and ".layers" not in name):
+                    continue
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)
+            loaded_params.add(name)
+        spec_layer_id = self.config.num_hidden_layers * 2
+        self_attn = self.model.layers[str(spec_layer_id)].mtp_block.self_attn
+        if hasattr(
+                self.quant_config,
+                "weight_block_size") and self_attn.kv_b_proj.weight.dtype in (
+                    torch.float8_e4m3fn,
+                    torch.float8_e4m3fnuz,
+                ):
+            weight_block_size = self.quant_config.weight_block_size
+            if weight_block_size is not None:
+                dtype = torch.get_default_dtype()
+                w = block_dequant(self_attn.kv_b_proj.weight,
+                                  self_attn.kv_b_proj.weight_scale_inv,
+                                  weight_block_size).to(dtype)
+            else:
+                w = self_attn.kv_b_proj.weight
+        else:
+            w = self_attn.kv_b_proj.weight
+        w_kc, w_vc = w.unflatten(
+            0, (-1, self_attn.qk_nope_head_dim + self_attn.v_head_dim)).split(
+                [self_attn.qk_nope_head_dim, self_attn.v_head_dim], dim=1)
+        self_attn.w_kc = w_kc.transpose(1, 2).contiguous().transpose(1, 2)
+        self_attn.w_vc = w_vc.contiguous().transpose(1, 2)
+        if self.config.mla_scale_q_lora:
+            self_attn.q_a_layernorm.weight.data *= (
+                self.config.hidden_size / self.config.q_lora_rank)**0.5
+        if self.config.mla_scale_kv_lora:
+            self_attn.kv_a_layernorm.weight.data *= (
+                self.config.hidden_size / self.config.kv_lora_rank)**0.5
+        return loaded_params
+    def _rewrite_spec_layer_name(self, spec_layer: int, name: str,
+                                 new_to_old_names_mapping: dict) -> str:
+        """
+        Rewrite the weight name to match the format of the original model.
+        Add .mtp_block for modules in transformer layer block for spec layer
+        and rename shared layer weights to be top level.
+        """
+        if name in new_to_old_names_mapping:
+            name = new_to_old_names_mapping[name]
+        spec_layer_weight_names = [
+            "embed_tokens", "enorm", "hnorm", "eh_proj", "shared_head"
+        ]
+        if name.startswith("enorm") or name.startswith(
+                "hnorm") or name.startswith("eh_proj") or name.startswith(
+                    "final_layernorm"):
+            name = "model.layers." + str(spec_layer) + "." + name
+        shared_weight_names = ["embed_tokens"]
+        spec_layer_weight = False
+        shared_weight = False
+        for weight_name in spec_layer_weight_names:
+            if weight_name in name:
+                spec_layer_weight = True
+                if weight_name in shared_weight_names:
+                    shared_weight = True
+                break
+        if not spec_layer_weight:
+            # treat rest weights as weights for transformer layer block
+            name = name.replace("model.layers.0.",
+                                f"model.layers.{spec_layer}.mtp_block.")
+        elif shared_weight:
+            # treat shared weights as top level weights
+            name = name.replace("model.layers.0.", "model.")
+        return name
+    def get_spec_layer_idx_from_weight_name(self, config: PretrainedConfig,
+                                            weight_name: str) -> Optional[int]:
+        if "model.mtp" in weight_name:
+            return config.num_hidden_layers * 2
+        return None
\ No newline at end of file
--- a/vllm/model_executor/models/registry.py
+++ b/vllm/model_executor/models/registry.py
@@ -104,6 +104,7 @@ _TEXT_GENERATION_MODELS = {
    "Llama4ForCausalLM": ("llama4", "Llama4ForCausalLM"),  # noqa: E501
    # For decapoda-research/llama-*
    "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
+    "LongcatFlashForCausalLM": ("longcat_flash", "LongcatFlashForCausalLM"),
    "MambaForCausalLM": ("mamba", "MambaForCausalLM"),
    "FalconMambaForCausalLM": ("mamba", "MambaForCausalLM"),
    "FalconH1ForCausalLM":("falcon_h1", "FalconH1ForCausalLM"),
@@ -285,6 +286,7 @@ _SPECULATIVE_DECODING_MODELS = {
    "EagleDeepSeekMTPModel": ("deepseek_eagle", "EagleDeepseekV3ForCausalLM"),
    "DeepSeekMTPModel": ("deepseek_mtp", "DeepSeekMTP"),
    "ErnieMTPModel": ("ernie_mtp", "ErnieMTP"),
+    "LongCatFlashMTPModel": ("longcat_flash_mtp", "LongCatFlashMTP"),
    "Glm4MoeMTPModel": ("glm4_moe_mtp", "Glm4MoeMTP"),
    "MedusaModel": ("medusa", "Medusa"),
    "Qwen3NextMTP": ("qwen3_next_mtp", "Qwen3NextMTP"),

--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -707,14 +707,14 @@ def maybe_prefix(prefix: str, name: str) -> str:
    return name if not prefix else f"{prefix}.{name}"
-def extract_layer_index(layer_name: str) -> int:
+def extract_layer_index(layer_name: str, num_attn_module: int = 1) -> int:
    """
    Extract the layer index from the module name.
    Examples:
    - "encoder.layers.0" -> 0
    - "encoder.layers.1.self_attn" -> 1
    - "2.self_attn" -> 2
-    - "model.encoder.layers.0.sub.1" -> ValueError
+    - "model.encoder.layers.0.sub.1" -> ValueError if num_attn_module == 1
    """
    subnames = layer_name.split(".")
    int_vals: list[int] = []
@@ -723,9 +723,17 @@ def extract_layer_index(layer_name: str) -> int:
            int_vals.append(int(subname))
        except ValueError:
            continue
+    if num_attn_module == 1 or "attn" not in layer_name:
        assert len(int_vals) == 1, (f"layer name {layer_name} should"
                                    " only contain one integer")
        return int_vals[0]
+    else:
+        assert len(int_vals) <= 2, (f"layer name {layer_name} should"
+                                    " contain most two integers")
+        layer_index = int_vals[0] * num_attn_module + int_vals[1] if len(
+            int_vals) == 2 else int_vals[0]
+        return layer_index
 def cast_overflow_tensors(

--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -255,7 +255,8 @@ class EagleProposer:
                hidden_states=self.hidden_states[:num_input_tokens],
                inputs_embeds=inputs_embeds,
            )
-            if self.method in ("deepseek_mtp", "ernie_mtp", "qwen3_next_mtp"):
+            if self.method in ("deepseek_mtp", "ernie_mtp", "qwen3_next_mtp",
+                               "longcat_flash_mtp"):
                last_hidden_states = ret_hidden_states
                hidden_states = last_hidden_states
            else:
@@ -264,6 +265,9 @@ class EagleProposer:
        logits = self.model.compute_logits(sample_hidden_states, None)
        positions = target_positions[last_token_indices]
+        if self.method in ("deepseek_mtp", "ernie_mtp", "longcat_flash_mtp"):
+            hidden_states = self.hidden_states[last_token_indices]
+        else:
            hidden_states = hidden_states[last_token_indices]
        if isinstance(attn_metadata, TreeAttentionMetadata):

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -3610,9 +3610,11 @@ class GPUModelRunner(LoRAModelRunnerMixin, KVConnectorModelRunnerMixin):
                         target_layer_name)
            kv_caches[layer_name] = kv_caches[target_layer_name]
+        num_attn_module = 2 \
+            if self.model_config.hf_config.model_type == "longcat_flash" else 1
        bind_kv_cache(kv_caches,
                      self.compilation_config.static_forward_context,
-                      self.kv_caches)
+                      self.kv_caches, num_attn_module)
        return kv_caches
    def maybe_add_kv_sharing_layers_to_kv_cache_groups(

--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -239,6 +239,7 @@ def bind_kv_cache(
    kv_caches: dict[str, torch.Tensor],
    forward_context: dict[str, "Attention"],
    runner_kv_caches: list[torch.Tensor],
+    num_attn_module: Optional[int] = 1,
 ) -> None:
    """
    Bind the allocated KV cache to both ModelRunner and forward context so
@@ -262,7 +263,8 @@ def bind_kv_cache(
    # Convert kv_caches dict to a list of tensors in the order of layer_index.
    index2name = defaultdict(list)
    for layer_name in kv_caches:
-        index2name[extract_layer_index(layer_name)].append(layer_name)
+        index2name[extract_layer_index(layer_name,
+                                       num_attn_module)].append(layer_name)
    for layer_index in sorted(index2name.keys()):
        layer_names = index2name[layer_index]

--- a/vllm/worker/worker.py
+++ b/vllm/worker/worker.py
@@ -76,6 +76,7 @@ class Worker(LocalOrDistributedWorkerBase):
                        "mlp_speculator",
                        "eagle",
                        "deepseek_mtp",
+                        "longcat_flash_mtp",
                        "glm4_moe_mtp",
                        "mimo_mtp",
                        "ernie_mtp",