Merge remote-tracking branch 'origin/v0.9.2-dev' into v0.9.2-dev

48a9e546 · 王敏 · 6372a1f3 · c11b09df · 48a9e546 · 48a9e546
Commit 48a9e546 authored Sep 07, 2025 by 王敏
18 changed files
--- a/tests/v1/tpu/test_tpu_qkv_linear.py
+++ b/tests/v1/tpu/test_tpu_qkv_linear.py
--- a/tests/worker/test_encoder_decoder_model_runner.py
+++ b/tests/worker/test_encoder_decoder_model_runner.py
@@ -482,6 +482,8 @@ def test_prepare_decode(batch_size, multiple_seqs_per_seq_group):
    assert torch.equal(actual, expected)
+@pytest.mark.skipif(current_platform.is_rocm(),
+                    reason="ROCM is not supported.")
 @pytest.mark.parametrize("batch_size", list(range(1, 257)))
 @pytest.mark.parametrize("multiple_seqs_per_seq_group", [True, False])
 def test_prepare_decode_cuda_graph(batch_size, multiple_seqs_per_seq_group):

--- a/tests/worker/test_model_runner.py
+++ b/tests/worker/test_model_runner.py
@@ -32,7 +32,7 @@ def test_deepseek_mla_attn_backend_module():
        trust_remote_code=True,
        enable_chunked_prefill=False,
    )
-    assert model_runner.attn_backend.__name__ == "TritonMLABackend"
+    assert model_runner.attn_backend.__name__ == "FlashMLABackend" # "TritonMLABackend"
 @pytest.mark.parametrize("batch_size", list(range(1, 257, 3)))

--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1971,7 +1971,8 @@ def wvSplitKQ(a: torch.Tensor, b: torch.Tensor, out_dtype: torch.dtype,
 # moe
 def moe_sum(input: torch.Tensor, output: torch.Tensor):
    torch.ops._moe_C.moe_sum(input, output)
+def moe_sum_opt1(input: torch.Tensor, output: torch.Tensor):
+    torch.ops._moe_C.moe_sum_opt1(input, output)
 def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
                         block_size: int, sorted_token_ids: torch.Tensor,

--- a/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
+++ b/vllm/distributed/kv_transfer/kv_connector/v1/p2p/p2p_nccl_connector.py
@@ -215,6 +215,9 @@ class P2pNcclConnector(KVConnectorBase_V1):
                inject_kv_into_layer(kv_cache_layer, kv_cache,
                                     request.slot_mapping, request.request_id)
+                tensor = self.p2p_nccl_engine.recv_store.pop(request.request_id + "#" + layer_name, None)
+                if tensor is not None:
+                    del tensor
    def wait_for_layer_load(self, layer_name: str) -> None:
        """Blocking until the KV for a specific layer is loaded into vLLM's

--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -1004,7 +1004,7 @@ class EngineArgs:
            enable_sleep_mode=self.enable_sleep_mode,
            model_impl=self.model_impl,
            override_attention_dtype=self.override_attention_dtype,
-            enable_chunked_prefill=self.enable_chunked_prefill
+            enable_chunked_prefill=self.enable_chunked_prefill,
        )
    def create_load_config(self) -> LoadConfig:

--- a/vllm/model_executor/layers/fused_moe/config.py
+++ b/vllm/model_executor/layers/fused_moe/config.py
@@ -50,6 +50,7 @@ def get_config_quant_dtype(
    use_int8_w8a8: bool,
    use_int8_w8a16: bool,
    use_int4_w4a16: bool,
+    use_int4_w4a8: bool,
 ) -> Optional[torch.dtype]:
    if use_fp8_w8a8:
        return torch.float8_e4m3fn
@@ -126,6 +127,7 @@ class FusedMoEQuantConfig:
        use_int8_w8a8: bool = False,
        use_int8_w8a16: bool = False,
        use_int4_w4a16: bool = False,
+        use_int4_w4a8: bool = False,
        per_act_token_quant: bool = False,
        per_out_ch_quant: bool = False,
        block_shape: Optional[list[int]] = None,
@@ -136,6 +138,7 @@ class FusedMoEQuantConfig:
                use_int8_w8a8,
                use_int8_w8a16,
                use_int4_w4a16,
+                use_int4_w4a8,
            ]
        ]) <= 1, "Quantization flags are mutually exclusive."
@@ -144,6 +147,7 @@ class FusedMoEQuantConfig:
            use_int8_w8a8=use_int8_w8a8,
            use_int8_w8a16=use_int8_w8a16,
            use_int4_w4a16=use_int4_w4a16,
+            use_int4_w4a8=use_int4_w4a8,
        )
        return FusedMoEQuantConfig(
            quant_dtype,

--- a/vllm/model_executor/layers/fused_moe/configs/E=128,N=176,device_name=BW200_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=176,device_name=BW200_nn.json
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 2,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "8": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "48": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    }
+}
--- a/vllm/model_executor/layers/fused_moe/configs/E=128,N=176,device_name=K100_AI_nn.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=128,N=176,device_name=K100_AI_nn.json
+{
+    "1": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "2": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "4": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "8": {
+        "BLOCK_SIZE_M": 16,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 32,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "16": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "24": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "32": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "48": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "64": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "96": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "128": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "256": {
+        "BLOCK_SIZE_M": 32,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "512": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "1024": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "1536": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "2048": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    },
+    "3072": {
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 64,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3,
+        "num_ldmatrixes": 1
+    },
+    "4096": {
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 8,
+        "num_stages": 2,
+        "num_ldmatrixes": 1
+    }
+}
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -652,7 +652,6 @@ def invoke_fused_moe_kernel(A: torch.Tensor,
                            B_scale: Optional[torch.Tensor],
                            B_zp: Optional[torch.Tensor],
                            topk_weights: Optional[torch.Tensor],
-                            topk_ids: torch.Tensor,
                            sorted_token_ids: torch.Tensor,
                            expert_ids: torch.Tensor,
                            num_tokens_post_padded: torch.Tensor,
@@ -1603,7 +1602,8 @@ def fused_experts_impl(
    qtype = get_config_quant_dtype(use_fp8_w8a8=use_fp8_w8a8,
                                   use_int8_w8a8=use_int8_w8a8,
                                   use_int8_w8a16=use_int8_w8a16,
-                                   use_int4_w4a16=use_int4_w4a16)
+                                   use_int4_w4a16=use_int4_w4a16,
+                                   use_int4_w4a8=use_int4_w4a8)
    get_config_func = functools.partial(
        try_get_optimal_moe_config,
@@ -1689,7 +1689,6 @@ def fused_experts_impl(
                                w1_scale,
                                w1_zp,
                                curr_topk_weights,
-                                curr_topk_ids,
                                sorted_token_ids,
                                expert_ids,
                                num_tokens_post_padded,
@@ -1729,7 +1728,6 @@ def fused_experts_impl(
                                w2_scale,
                                w2_zp,
                                curr_topk_weights,
-                                curr_topk_ids,
                                sorted_token_ids,
                                expert_ids,
                                num_tokens_post_padded,
@@ -1877,7 +1875,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
        use_int8_w8a8: bool = False,
        use_int8_w8a16: bool = False,
        use_int4_w4a16: bool = False,
-        use_int4_w4a8: bool =False,
+        use_int4_w4a8: bool = False,
        per_act_token_quant: bool = False,
        block_shape: Optional[List[int]] = None,
    ):
@@ -1896,7 +1894,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
        self.use_int4_w4a16 = use_int4_w4a16
        self.use_int8_w8a8 = use_int8_w8a8
        self.use_int8_w8a16 = use_int8_w8a16
-        self.use_int4_w4a8= use_int4_w4a8
+        self.use_int4_w4a8 = use_int4_w4a8
    @property
    def activation_formats(
@@ -2027,7 +2025,7 @@ class TritonExperts(mk.FusedMoEPermuteExpertsUnpermute):
                                use_int8_w8a8=self.use_int8_w8a8,
                                use_int8_w8a16=self.use_int8_w8a16,
                                use_int4_w4a16=self.use_int4_w4a16,
-                                use_int4_w4a8= self.use_int4_w4a8,
+                                use_int4_w4a8=self.use_int4_w4a8,
                                per_channel_quant=self.per_act_token_quant,
                                block_shape=self.block_shape)
@@ -2068,7 +2066,7 @@ def modular_triton_fused_moe(
    use_int8_w8a8: bool,
    use_int8_w8a16: bool,
    use_int4_w4a16: bool,
-    use_int4_w4a8:bool,
+    use_int4_w4a8: bool,
    per_act_token_quant: bool,
    block_shape: Optional[List[int]] = None,
 ) -> mk.FusedMoEModularKernel:
@@ -2079,7 +2077,7 @@ def modular_triton_fused_moe(
            use_int8_w8a8=use_int8_w8a8,
            use_int8_w8a16=use_int8_w8a16,
            use_int4_w4a16=use_int4_w4a16,
-            use_int4_w4a8= use_int4_w4a8,
+            use_int4_w4a8=use_int4_w4a8,
            per_act_token_quant=per_act_token_quant,
            block_shape=block_shape,
        ),

--- a/vllm/model_executor/layers/fused_moe/layer.py
+++ b/vllm/model_executor/layers/fused_moe/layer.py
@@ -795,7 +795,6 @@ class FusedMoE(torch.nn.Module):
        if quant_config is None:
            # Not considering quant for now, temporarily
            self.use_nn_moe = int(os.environ.get('MOE_NN', 1)) == 1
-            # self.use_nn_moe = os.environ.get('MOE_NN') == '1'
        else:
            self.use_nn_moe = False

--- a/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/rocm_aiter_fused_moe.py
@@ -36,9 +36,10 @@ class ActivationMethod(IntEnum):
 @cache
 def is_rocm_aiter_moe_enabled() -> bool:
-    return current_platform.is_rocm() \
+    return False
-        and envs.VLLM_ROCM_USE_AITER_MOE \
+    # return current_platform.is_rocm() \
-        and envs.VLLM_ROCM_USE_AITER
+    #     and envs.VLLM_ROCM_USE_AITER_MOE \
+    #     and envs.VLLM_ROCM_USE_AITER
 def rocm_aiter_asm_moe_tkw1_impl(

--- a/vllm/two_batch_overlap/v1/model_input_split_v1.py
+++ b/vllm/two_batch_overlap/v1/model_input_split_v1.py
@@ -25,6 +25,7 @@ class TBOModelInputSplit():
        self.req_num_right = 0
        self.scheduler_output_left = None
        self.scheduler_output_right = None
+        self.query_start_loc_right = None
 input_split = TBOModelInputSplit()
@@ -136,78 +137,39 @@ def prepare_tbo_atten_metadata(
    assert num_reqs > 0
    seq_len_offset = req_offset
-    if req_offset == 0: #left
-        query_start_offset = 0
-    else:
-        query_start_offset = req_offset + 1
    # Get the number of scheduled tokens for each request.
    tokens = [scheduler_output.num_scheduled_tokens[i] for i in req_ids]
    num_scheduled_tokens = np.array(tokens, dtype=np.int32)
    max_num_scheduled_tokens = max(tokens)
-    # Get request indices.
+    if req_offset > 0: #right
-    # E.g., [2, 5, 3] -> [0, 0, 1, 1, 1, 1, 1, 2, 2, 2]
+        if input_split.query_start_loc_right == None:
-    req_indices = np.repeat(runner.arange_np[:num_reqs],
+            # TODO: create when system init
-                            num_scheduled_tokens) + req_offset
+            input_split.query_start_loc_right = torch.zeros(runner.max_num_reqs + 1,
+                                            dtype=torch.int32,
+                                            device=runner.device)
-    # cu_num_tokens: [2, 5, 3] -> [2, 7, 10]
+        cu_num_tokens, arange = runner._get_cumsum_and_arange(
-    # arange: [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
+            num_scheduled_tokens)
-    cu_num_tokens, arange = runner._get_cumsum_and_arange(
-        num_scheduled_tokens)
-    # Get positions.
+        # Prepare the attention metadata.
-    positions_np = runner.positions_np[:total_num_scheduled_tokens]
+        runner.query_start_loc_np[0] = 0
-    np.add(runner.input_batch.num_computed_tokens_cpu[req_indices],
+        runner.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
-            arange,
-            out=positions_np)
-    # Calculate the slot mapping for each KV cache group.
-    for kv_cache_group_id, kv_cache_group_spec in enumerate(
+        input_split.query_start_loc_right[0: num_reqs + 1].copy_(
-            runner.kv_cache_config.kv_cache_groups):
+            runner.query_start_loc_cpu[:num_reqs + 1], non_blocking=True)
-        block_size = kv_cache_group_spec.kv_cache_spec.block_size
+        # Note: pad query_start_loc to be non-decreasing, as kernels
-        block_table: BlockTable = runner.input_batch.block_table[
+        # like FlashAttention requires that
-            kv_cache_group_id]
+        input_split.query_start_loc_right[num_reqs + 1:].fill_(
-        # E.g., [0, 1, 0, 1, 2, 3, 4, 0, 1, 2]
-        # -> [0, 0, K, K, K + 1, K + 1, K + 2, 2 * K, 2 * K, 2 * K + 1]
-        # where K is the max_num_blocks_per_req and the block size is 2.
-        # NOTE(woosuk): We can't simply use `token_indices // block_size`
-        # here because M (max_model_len) is not necessarily divisible by
-        # block_size.
-        block_table_indices = (
-            req_indices * block_table.max_num_blocks_per_req +
-            positions_np // block_size)
-        block_table_cpu = block_table.get_cpu_tensor()
-        block_numbers = block_table_cpu.flatten(
-        )[block_table_indices].numpy()
-        block_offsets = positions_np % block_size
-        np.add(
-            block_numbers * block_size,
-            block_offsets,
-            out=block_table.slot_mapping_np[:total_num_scheduled_tokens])
-    # Prepare the attention metadata.
-    runner.query_start_loc_np[0] = 0
-    runner.query_start_loc_np[1:num_reqs + 1] = cu_num_tokens
-    runner.seq_lens_np[:num_reqs] = (
-        runner.input_batch.num_computed_tokens_cpu[req_offset : req_offset + num_reqs] +
-        num_scheduled_tokens)
-    runner.query_start_loc[query_start_offset: query_start_offset + num_reqs + 1].copy_(
-        runner.query_start_loc_cpu[:num_reqs + 1], non_blocking=True)
-    # Note: pad query_start_loc to be non-decreasing, as kernels
-    # like FlashAttention requires that
-    if req_offset > 0: #right
-        runner.query_start_loc[query_start_offset + num_reqs + 1:].fill_(
            runner.query_start_loc_cpu[num_reqs].item())
-    runner.seq_lens[seq_len_offset :seq_len_offset + num_reqs].copy_(runner.seq_lens_cpu[:num_reqs],
+        query_start_loc = input_split.query_start_loc_right[: num_reqs + 1]
-                                    non_blocking=True)
-    # Fill unused with -1. Needed for reshape_and_cache
-    if req_offset > 0: #right
-        runner.seq_lens[seq_len_offset + num_reqs:].fill_(0)
-    query_start_loc = runner.query_start_loc[query_start_offset: query_start_offset + num_reqs + 1]
+    else:
+        query_start_loc = runner.query_start_loc[:num_reqs + 1]
    seq_lens = runner.seq_lens[seq_len_offset : seq_len_offset + num_reqs]
    common_attn_metadata = CommonAttentionMetadata(
@@ -240,6 +202,9 @@ def prepare_tbo_atten_metadata(
            origin_slot_mapping = metadata_builder.block_table.slot_mapping
            metadata_builder.block_table.slot_mapping = \
                origin_slot_mapping[input_split.scheduler_output_left.total_num_scheduled_tokens:]
+            origin_slot_map_cpu = metadata_builder.block_table.slot_mapping_cpu
+            metadata_builder.block_table.slot_mapping_cpu = \
+                origin_slot_map_cpu[input_split.scheduler_output_left.total_num_scheduled_tokens:]
        if isinstance(metadata_builder, MLACommonMetadataBuilder): # now support prefill only
            _num_decodes_record = metadata_builder._num_decodes
            _num_prefills_record = metadata_builder._num_prefills
@@ -257,6 +222,7 @@ def prepare_tbo_atten_metadata(
        if req_offset > 0:
            metadata_builder.block_table.block_table = origin_block_table
            metadata_builder.block_table.slot_mapping = origin_slot_mapping
+            metadata_builder.block_table.slot_mapping_cpu = origin_slot_map_cpu
        if isinstance(metadata_builder, MLACommonMetadataBuilder): # now support prefill only
            metadata_builder._num_decodes = _num_decodes_record
@@ -304,18 +270,16 @@ def tbo_split_and_execute_model(
    inputs_embeds,
    scheduler_output: "SchedulerOutput",
    intermediate_tensors: Optional[IntermediateTensors] = None,
+    skip_cuda_graphs: bool = True,
 ) -> Union[ModelRunnerOutput, IntermediateTensors]:
    use_tbo = False
    if isinstance(runner.attn_metadata_builders[0], MLACommonMetadataBuilder) and \
        runner.attn_metadata_builders[0]._num_decodes > 0: #is mla decode
        use_tbo = False
    else:
-        if len(scheduler_output.num_scheduled_tokens) > 1:
+        if len(scheduler_output.num_scheduled_tokens) > 1 and num_input_tokens > envs.VLLM_TBO_MIN_TOKENS:
            split_scheduler_output(runner, scheduler_output)
-            if input_split.scheduler_output_left.total_num_scheduled_tokens >= envs.VLLM_TBO_MIN_TOKENS and \
+            use_tbo = True
-                input_split.scheduler_output_right.total_num_scheduled_tokens >= envs.VLLM_TBO_MIN_TOKENS:
-                use_tbo = True
    if use_tbo:
        num_input_tokens_left = input_split.scheduler_output_left.total_num_scheduled_tokens
        num_input_tokens_right = num_input_tokens - num_input_tokens_left
@@ -338,11 +302,12 @@ def tbo_split_and_execute_model(
    else:
        # Run the decoder.
        # Use persistent buffers for CUDA graphs.
+        envs.VLLM_ENABLE_TBO = False
        with set_forward_context(attn_metadata,
                                runner.vllm_config,
                                num_tokens=num_input_tokens,
                                num_tokens_across_dp=num_tokens_across_dp,
-                                skip_cuda_graphs=True):
+                                skip_cuda_graphs=skip_cuda_graphs):
            runner.maybe_setup_kv_connector(scheduler_output)
            model_output = runner.model(
@@ -355,4 +320,5 @@ def tbo_split_and_execute_model(
            runner.maybe_wait_for_kv_save()
            finished_sending, finished_recving = (
                runner.get_finished_kv_transfers(scheduler_output))
+        envs.VLLM_ENABLE_TBO = True
    return model_output, finished_sending, finished_recving
\ No newline at end of file
--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -38,6 +38,7 @@ class CachedRequestState:
    block_ids: tuple[list[int], ...]
    num_computed_tokens: int
    output_token_ids: list[int]
+    spec_token_ids: list[int] = None
    mrope_positions: Optional[torch.Tensor] = None
    mrope_position_delta: Optional[int] = None
@@ -288,9 +289,16 @@ class InputBatch:
        end_idx = start_idx + len(request.output_token_ids)
        self.token_ids_cpu[req_index,
                           start_idx:end_idx] = request.output_token_ids
+        num_spec_tokens = 0
+        if request.spec_token_ids != None:
+            num_spec_tokens = len(request.spec_token_ids)
+            self.token_ids_cpu[req_index,
+                            end_idx:end_idx + num_spec_tokens] = request.spec_token_ids
        # Number of token ids in token_ids_cpu.
        # NOTE(woosuk): This may include spec decode tokens.
-        self.num_tokens[req_index] = request.num_tokens
+        self.num_tokens[req_index] = request.num_tokens + num_spec_tokens
        # Number of tokens without spec decode tokens.
        self.num_tokens_no_spec[req_index] = request.num_tokens

--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -482,6 +482,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            # Update the cached states.
            req_state.num_computed_tokens = num_computed_tokens
+            spec_token_ids = (
+                scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
            if not is_last_rank:
                # When using PP, the scheduler sends the sampled tokens back,
@@ -498,6 +500,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                elif num_new_tokens > 0:
                    req_state.output_token_ids.extend(
                        new_token_ids[-num_new_tokens:])
+                if len(spec_token_ids) > 0:
+                    req_state.spec_token_ids = spec_token_ids
            # Update the block IDs.
            if not resumed_from_preemption:
@@ -537,8 +541,6 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                self.input_batch.num_tokens[req_index] = end_token_index
            # Add spec_token_ids to token_ids_cpu.
-            spec_token_ids = (
-                scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))
            if spec_token_ids:
                num_spec_tokens = len(spec_token_ids)
                start_index = self.input_batch.num_tokens_no_spec[req_index]
@@ -635,7 +637,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        # where M is the max_model_len.
        token_indices = (positions_np +
                         req_indices * self.input_batch.token_ids_cpu.shape[1])
        # NOTE(woosuk): We use torch.index_select instead of np.take here
        # because torch.index_select is much faster than np.take for large
        # tensors.
@@ -1381,7 +1383,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            model_output, finished_sending, finished_recving = \
                 tbo_split_and_execute_model(self, attn_metadata, num_input_tokens,
                                             num_tokens_across_dp, input_ids, positions,
-                                             inputs_embeds, scheduler_output, intermediate_tensors)
+                                             inputs_embeds, scheduler_output, intermediate_tensors,
+                                             skip_cuda_graphs)
        else:
            # Run the model.
            # Use persistent buffers for CUDA graphs.
@@ -2096,8 +2099,9 @@ class GPUModelRunner(LoRAModelRunnerMixin):
                hidden_states = outputs
            if self.speculative_config and self.speculative_config.use_eagle() and not is_profile:
-                assert isinstance(self.drafter, EagleProposer)
+                #assert isinstance(self.drafter, EagleProposer)
-                self.drafter.dummy_run(num_tokens, attn_metadata)
+                if hasattr(self, 'drafter') and isinstance(self.drafter, EagleProposer):
+                    self.drafter.dummy_run(num_tokens, attn_metadata)
        # This is necessary to avoid blocking DP.
        # For dummy runs, we typically skip EPLB since we don't have any real
@@ -2231,8 +2235,8 @@ class GPUModelRunner(LoRAModelRunnerMixin):
    def profile_run(self) -> None:
        # set profiling flag to avoid torch compile
-        set_profilling(True)
+        #set_profilling(True)
-        self._sync_device()
+        #self._sync_device()
        # Profile with multimodal encoder & encoder cache.
        # TODO: handle encoder-decoder models once we support them.
@@ -2317,7 +2321,7 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        del hidden_states, output
        self.encoder_cache.clear()
        gc.collect()
-        set_profilling(False)
+        #set_profilling(False)
    def capture_model(self) -> None:
        if not self.use_cuda_graph:
@@ -2685,10 +2689,11 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        kv_caches = self.initialize_kv_cache_tensors(kv_cache_config)
        if self.speculative_config and self.speculative_config.use_eagle():
-            assert isinstance(self.drafter, EagleProposer)
+            #assert isinstance(self.drafter, EagleProposer)
            # validate all draft model layers belong to the same kv cache
            # group
-            self.drafter.validate_same_kv_cache_group(kv_cache_config)
+            if hasattr(self, 'drafter') and isinstance(self.drafter, EagleProposer):
+                self.drafter.validate_same_kv_cache_group(kv_cache_config)
        if has_kv_transfer_group():
            get_kv_transfer_group().register_kv_caches(kv_caches)

--- a/vllm/zero_overhead/v1/core.py
+++ b/vllm/zero_overhead/v1/core.py
@@ -80,6 +80,7 @@ def zero_overhead_update_from_output(scheduler:Scheduler,
                request._output_token_ids[fix_offset] = generated_token_ids
                request._all_token_ids[fix_offset] = generated_token_ids
                requsets_valid_token_len[req_id] += 1
+                generated_token_ids = [generated_token_ids]
            else:
                valid_output_end = valid_output_len + len(generated_token_ids) - request.num_output_tokens
                if valid_output_end == 0:
@@ -107,7 +108,7 @@ def zero_overhead_update_from_output(scheduler:Scheduler,
            pooler_output = None
            if pooler_outputs:
-                pooler_output = pooler_outputs[req_index]
+                pooler_output = pooler_outputs[req_idx]
                stopped = check_stop(request, scheduler.max_model_len,
                                        pooler_output, True)
                if stopped:
@@ -118,7 +119,7 @@ def zero_overhead_update_from_output(scheduler:Scheduler,
                and request.sampling_params.logprobs is not None and logprobs:
                # NOTE: once we support N tokens per step (spec decode),
                # the outer lists can be of length > 1.
-                new_logprobs = logprobs.slice(req_index, req_index + 1)
+                new_logprobs = logprobs.slice(req_idx, req_idx + 1)
            if new_token_ids and scheduler.structured_output_manager.should_advance(
                    request):

--- a/vllm/zero_overhead/v1/gpu_model_runner.py
+++ b/vllm/zero_overhead/v1/gpu_model_runner.py
@@ -472,12 +472,12 @@ class V1ZeroModelRunner(GPUModelRunner):
        # If attention doesn't support CUDA Graphs for this batch, but we
        # compiled with full CUDA graphs, we have to skip them entirely.
        skip_cuda_graphs = self.full_cuda_graph and not attention_cuda_graphs
        if envs.VLLM_ENABLE_TBO and (not self.use_cuda_graph or skip_cuda_graphs):
            model_output, finished_sending, finished_recving = \
                 tbo_split_and_execute_model(self, attn_metadata, num_input_tokens,
                                             num_tokens_across_dp, input_ids, positions,
-                                             inputs_embeds, scheduler_output, intermediate_tensors)
+                                             inputs_embeds, scheduler_output, intermediate_tensors, 
+                                             skip_cuda_graphs)
        else:
            # Run the model.
            # Use persistent buffers for CUDA graphs.

--- a/vllm/zero_overhead/v1/outputs.py
+++ b/vllm/zero_overhead/v1/outputs.py
@@ -9,6 +9,6 @@ class ZeroV1ModelRunnerOutput(ModelRunnerOutput):
    # [num_reqs]
    fix_req_ids: list[str] = None
    fix_sampled_token_ids:list[list[int]] = None
-    fix_draft_req_ids:list[list[int]] = None
+    fix_draft_req_ids:list[str] = None
    fix_draft_tokens_ids:list[list[int]] = None
    is_output_valid:bool = True
\ No newline at end of file