Remove unused code (duplicate Triton pack/unpack kernels, the v1 XFormers attention backend, stale imports)

c1c5e4f6 · zhuwenwen · d2fe5111 · c1c5e4f6 · c1c5e4f6 · c1c5e4f6
Commit c1c5e4f6 authored Dec 18, 2025 by zhuwenwen
17 changed files
--- a/vllm/attention/ops/common.py
+++ b/vllm/attention/ops/common.py
@@ -466,209 +466,4 @@ def unpack_seq_triton(
        output_shape = (N,) + original_shape[2:]
        out = out.reshape(output_shape)

-    return out
-
-
-@triton.jit
-def _pack_seq_kernel(
-        x_ptr,  # [N, D]
-        out_ptr,  # [B, Lmax, D]
-        lengths_ptr,  # *i32, [B]
-        N: tl.constexpr,
-        D: tl.constexpr,
-        Lmax: tl.constexpr,
-        PAD_VALUE: tl.constexpr,
-        BLOCK_T: tl.constexpr,  # timesteps per program
-        BLOCK_D: tl.constexpr  # features per program
-):
-    pid_b = tl.program_id(0)  # batch id
-    pid_t = tl.program_id(1)  # block over time dimension
-    pid_d = tl.program_id(2)  # block over feature dimension
-    off_t = pid_t * BLOCK_T + tl.arange(0, BLOCK_T)  # [BLOCK_T]
-    off_d = pid_d * BLOCK_D + tl.arange(0, BLOCK_D)  # [BLOCK_D]
-
-    # Compute start index and sequence length from cumulative lengths
-    in_start = 0
-    for i in range(pid_b):
-        in_start += tl.load(lengths_ptr + i)
-    seq_len = tl.load(lengths_ptr + pid_b)
-
-    # valid time positions for this block
-    t_mask = off_t < Lmax
-
-    # compute input row indices for valid (b, t)
-    in_row = in_start + off_t
-    valid_row = (off_t < seq_len) & t_mask
-
-    # Pointers
-    # x_ptr: row-major [N, D]
-    x_row_ptr = x_ptr + in_row[:, None] * D + off_d[None, :]
-
-    # out_ptr: row-major [B, Lmax, D]
-    out_row_ptr = out_ptr + (pid_b * Lmax + off_t)[:,
-                                                   None] * D + off_d[None, :]
-
-    # Initialize with PAD (cast will occur as needed based on out_ptr dtype)
-    d_mask = off_d[None, :] < D
-    pad_vals = tl.full([BLOCK_T, BLOCK_D], PAD_VALUE, tl.float32)
-    tl.store(out_row_ptr, pad_vals, mask=t_mask[:, None] & d_mask)
-
-    # Load & write only where within seq_len
-    x_vals = tl.load(x_row_ptr, mask=valid_row[:, None] & d_mask)
-    tl.store(out_row_ptr, x_vals, mask=valid_row[:, None] & d_mask)
-
-
-def pack_seq_triton(x: torch.Tensor,
-                    lengths: torch.Tensor,
-                    pad_value: float = -float('inf'),
-                    block_t: int = 64,
-                    block_d: int = 64) -> torch.Tensor:
-    """
-    Pack sequences of different lengths into a batched tensor.
-    
-    Args:
-        x: [N, ...] - input tensor where N is total number of tokens
-        lengths: [B] - sequence lengths for each batch
-        pad_value: value to use for padding
-        block_t: block size for time dimension
-        block_d: block size for feature dimension
-        
-    Returns:
-        packed: [B, Lmax, ...] - packed tensor
-    """
-
-    # Handle multi-dimensional input by reshaping to (N, -1)
-    original_shape = x.shape
-    if len(original_shape) > 2:
-        N = original_shape[0]
-        x_reshaped = x.reshape(N, -1)
-        D = x_reshaped.shape[1]
-    else:
-        N, D = x.shape
-        x_reshaped = x
-
-    B = lengths.numel()
-    Lmax = int(lengths.max().item())
-
-    # Starts are computed inside the kernel from lengths
-
-    out = torch.empty((B, Lmax, D), device=x.device, dtype=x.dtype)
-
-    grid = (B, triton.cdiv(Lmax, block_t), triton.cdiv(D, block_d))
-    _pack_seq_kernel[grid](x_reshaped,
-                           out,
-                           lengths.int(),
-                           N,
-                           D,
-                           Lmax,
-                           PAD_VALUE=float(pad_value),
-                           BLOCK_T=block_t,
-                           BLOCK_D=block_d,
-                           num_warps=4,
-                           num_stages=2)
-
-    # Reshape output back to original dimensions (except first dimension)
-    if len(original_shape) > 2:
-        output_shape = (B, Lmax) + original_shape[1:]
-        out = out.reshape(output_shape)
-
-    return out
-
-
-@triton.jit
-def _unpack_seq_triton_kernel(
-        packed_ptr,  # [B, Lmax, D]
-        out_ptr,  # [N, D]
-        lengths_ptr,  # *i32, [B]
-        B: tl.constexpr,
-        Lmax: tl.constexpr,
-        D: tl.constexpr,
-        BLOCK_T: tl.constexpr,  # timesteps per program
-        BLOCK_D: tl.constexpr  # features per program
-):
-    pid_b = tl.program_id(0)  # batch id
-    pid_t = tl.program_id(1)  # block over time dimension
-    pid_d = tl.program_id(2)  # block over feature dimension
-    off_t = pid_t * BLOCK_T + tl.arange(0, BLOCK_T)  # [BLOCK_T]
-    off_d = pid_d * BLOCK_D + tl.arange(0, BLOCK_D)  # [BLOCK_D]
-
-    # bounds: compute start from cumulative lengths
-    in_start = 0
-    for i in range(pid_b):
-        in_start += tl.load(lengths_ptr + i)
-    seq_len = tl.load(lengths_ptr + pid_b)
-
-    # valid time positions for this block
-    t_mask = off_t < Lmax
-    valid_row = (off_t < seq_len) & t_mask
-
-    # compute output row indices for valid (b, t)
-    out_row = in_start + off_t
-
-    # Pointers
-    # packed_ptr: row-major [B, Lmax, D]
-    packed_row_ptr = packed_ptr + (pid_b * Lmax +
-                                   off_t)[:, None] * D + off_d[None, :]
-
-    # out_ptr: row-major [N, D]
-    out_row_ptr = out_ptr + out_row[:, None] * D + off_d[None, :]
-
-    # Load from packed tensor and store to output
-    d_mask = off_d[None, :] < D
-    packed_vals = tl.load(packed_row_ptr, mask=valid_row[:, None] & d_mask)
-    tl.store(out_row_ptr, packed_vals, mask=valid_row[:, None] & d_mask)
-
-
-def unpack_seq_triton(packed_tensor: torch.Tensor,
-                      lengths: torch.Tensor,
-                      block_t: int = 64,
-                      block_d: int = 64) -> torch.Tensor:
-    """
-    Unpack a packed decode query tensor back to the original format.
-    Efficient Triton implementation.
-    
-    Args:
-        packed_tensor: [B, Lmax, ...] - packed tensor from pack_seq_triton
-        lengths: [B] - sequence lengths for each batch
-        block_t: block size for time dimension
-        block_d: block size for feature dimension
-        
-    Returns:
-        unpacked_tensor: [N, ...] where N = sum(lengths)
-    """
-
-    # Handle multi-dimensional input by reshaping to (B, Lmax, -1)
-    original_shape = packed_tensor.shape
-    if len(original_shape) > 3:
-        B, Lmax = original_shape[:2]
-        packed_reshaped = packed_tensor.reshape(B, Lmax, -1)
-        D = packed_reshaped.shape[2]
-    else:
-        B, Lmax, D = packed_tensor.shape
-        packed_reshaped = packed_tensor
-
-    # Calculate total number of elements
-    N = int(lengths.sum().item())
-
-    out = torch.empty((N, D),
-                      device=packed_tensor.device,
-                      dtype=packed_tensor.dtype)
-
-    grid = (B, triton.cdiv(Lmax, block_t), triton.cdiv(D, block_d))
-    _unpack_seq_triton_kernel[grid](packed_reshaped,
-                                    out,
-                                    lengths.int(),
-                                    B,
-                                    Lmax,
-                                    D,
-                                    BLOCK_T=block_t,
-                                    BLOCK_D=block_d,
-                                    num_warps=4,
-                                    num_stages=2)
-
-    # Reshape output back to original dimensions (except first dimension)
-    if len(original_shape) > 3:
-        output_shape = (N, ) + original_shape[2:]
-        out = out.reshape(output_shape)
-
-    return out
+    return out
\ No newline at end of file
--- a/vllm/attention/ops/paged_attn.py
+++ b/vllm/attention/ops/paged_attn.py
@@ -7,17 +7,11 @@ import torch
 from vllm.platforms import current_platform

 from vllm import envs
-from vllm.triton_utils import HAS_TRITON

 if current_platform.is_cuda_alike():
    from vllm import _custom_ops as ops
 elif current_platform.is_xpu():
    from vllm._ipex_ops import ipex_ops as ops
-
-if HAS_TRITON:
-    from vllm.attention.ops.prefix_prefill import context_attention_fwd
-    
-use_tc = envs.VLLM_USE_OPT_OP and envs.VLLM_USE_TC_PAGED_ATTN 
    

 class PagedAttention:

--- a/vllm/compilation/decorators.py
+++ b/vllm/compilation/decorators.py
@@ -31,7 +31,6 @@ from vllm.utils.import_utils import resolve_obj_by_qualname
 from vllm.utils.torch_utils import supports_dynamo

 from .monitor import start_monitoring_torch_compile
-from vllm.config import VllmConfig
 from vllm.forward_context import get_profilling

 logger = init_logger(__name__)

--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -1767,11 +1767,6 @@ def apply_hf_chat_template(
    )

    try:
-        resolved_kwargs = resolve_chat_template_kwargs(
-            tokenizer=tokenizer,
-            chat_template=hf_chat_template,
-            chat_template_kwargs=kwargs,
-        )
        return tokenizer.apply_chat_template(
            conversation=conversation,  # type: ignore[arg-type]
            tools=tools,  # type: ignore[arg-type]

--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -80,8 +80,6 @@ from vllm.v1.engine import EngineCoreRequest
 from vllm.v1.engine.llm_engine import LLMEngine
 from vllm.v1.sample.logits_processor import LogitsProcessor

-import vllm.envs as envs
-

 if TYPE_CHECKING:
    from vllm.v1.metrics.reader import Metric

--- a/vllm/lora/models.py
+++ b/vllm/lora/models.py
@@ -331,9 +331,6 @@ class LoRAModelManager:
        self.supported_lora_modules = get_supported_lora_modules(self.model)
        assert self.supported_lora_modules, "No supported LoRA modules found in"
        f" {self.model.__class__.__name__}."
-        
-        if lora_config.lora_target_modules is not None:
-            self.supported_lora_modules = lora_config.lora_target_modules

        self.packed_modules_mapping = process_packed_modules_mapping(self.model)
        # Used to indicate whether the model is a multimodal model

--- a/vllm/v1/attention/backends/mla/common.py
+++ b/vllm/v1/attention/backends/mla/common.py
@@ -231,7 +231,6 @@ from vllm.v1.attention.backends.utils import (
    split_decodes_and_prefills,
 )
 from vllm.v1.kv_cache_interface import AttentionSpec
-from vllm.v1.worker.block_table import BlockTable


 class QueryLenSupport(Enum):
@@ -1460,8 +1459,6 @@ class MLACommonImpl(MLACommonBaseImpl[M], Generic[M]):
            v=v,
            return_lse=True,
        )
-        # Convert from (q_len, num_heads) to (num_heads, q_len)
-        return attn_out, lse.transpose(0, 1).contiguous()

        # Convert from (q_len, num_heads) to (num_heads, q_len)
        return attn_out, lse.transpose(0, 1).contiguous()

--- a/vllm/v1/attention/backends/utils.py
+++ b/vllm/v1/attention/backends/utils.py
@@ -69,10 +69,8 @@ class CommonAttentionMetadata:
    seq_lens_cpu: torch.Tensor
    """(batch_size,), the length of each request including both computed tokens
    and newly scheduled tokens"""
-    
    num_computed_tokens_cpu: torch.Tensor
    """(batch_size,), the number of computed tokens for each request"""
-    
    num_reqs: int
    """Number of requests"""
    # TODO(lucas): rename to num_tokens since it may be padded and this is misleading
@@ -84,12 +82,7 @@ class CommonAttentionMetadata:
    """Longest context length in batch"""

    block_table_tensor: torch.Tensor
-    
-    num_speculative_tokens: int = 0
-    """Number of speculative tokens"""
-    slot_mapping: torch.Tensor = None
-    """(batch_size, seq_len), slot mapping"""
-    spec_layer_decoding: bool = False
+    slot_mapping: torch.Tensor

    causal: bool = True


--- a/vllm/v1/attention/backends/xformers.py
+++ b/vllm/v1/attention/backends/xformers.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-"""Attention layer with XFormersAttention."""
-
-from dataclasses import dataclass
-from typing import TYPE_CHECKING, Optional
-
-import torch
-
-from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
-                                              AttentionMetadata, AttentionType)
-from vllm.attention.ops.triton_unified_attention import unified_attention
-from vllm.config import VllmConfig
-from vllm.logger import init_logger
-from vllm.v1.attention.backends.utils import (
-    AttentionMetadataBuilder, CommonAttentionMetadata,
-    reorder_batch_to_split_decodes_and_prefills, split_decodes_and_prefills)
-from vllm.v1.kv_cache_interface import AttentionSpec
-
-try:
-    from xformers import ops as xops
-    from xformers.ops.fmha.attn_bias import (
-        AttentionBias, PagedBlockDiagonalCausalWithOffsetPaddedKeysMask)
-
-    XFORMERS_AVAILABLE = True
-except ImportError:
-    XFORMERS_AVAILABLE = False
-
-if TYPE_CHECKING:
-    from vllm.v1.core.sched.output import SchedulerOutput
-    from vllm.v1.worker.gpu_input_batch import InputBatch
-
-from vllm import _custom_ops as ops
-
-logger = init_logger(__name__)
-
-
-class XFormersAttentionBackend(AttentionBackend):
-
-    accept_output_buffer: bool = True
-
-    @classmethod
-    def get_supported_dtypes(cls) -> list[torch.dtype]:
-        return [torch.float16, torch.bfloat16]
-
-    @classmethod
-    def get_supported_head_sizes(cls) -> list[int]:
-        return [
-            32,
-            40,
-            48,
-            56,
-            64,
-            72,
-            80,
-            88,
-            96,
-            104,
-            112,
-            120,
-            128,
-            136,
-            144,
-            152,
-            160,
-            168,
-            176,
-            184,
-            192,
-            200,
-            208,
-            216,
-            224,
-            232,
-            240,
-            248,
-            256,
-        ]
-
-    @classmethod
-    def validate_head_size(cls, head_size: int) -> None:
-        supported_head_sizes = cls.get_supported_head_sizes()
-        if head_size not in supported_head_sizes:
-            attn_type = cls.__name__.removesuffix("Backend")
-            raise ValueError(
-                f"Head size {head_size} is not supported by {attn_type}. "
-                f"Supported head sizes are: {supported_head_sizes}. "
-                "Set VLLM_ATTENTION_BACKEND=FLEX_ATTENTION to use "
-                "FlexAttention backend which supports all head sizes.")
-
-    @staticmethod
-    def get_name() -> str:
-        return "XFORMERS"
-
-    @staticmethod
-    def get_impl_cls() -> type["XFormersAttentionImpl"]:
-        return XFormersAttentionImpl
-
-    @staticmethod
-    def get_metadata_cls() -> type["AttentionMetadata"]:
-        return XFormersAttentionMetadata
-
-    @staticmethod
-    def get_kv_cache_shape(
-        num_blocks: int,
-        block_size: int,
-        num_kv_heads: int,
-        head_size: int,
-        cache_dtype_str: str = "auto",
-    ) -> tuple[int, ...]:
-        if block_size % 16 != 0:
-            raise ValueError("Block size must be a multiple of 16.")
-        return (2, num_blocks, block_size, num_kv_heads, head_size)
-
-    @staticmethod
-    def get_builder_cls() -> type["XFormersAttentionMetadataBuilder"]:
-        return XFormersAttentionMetadataBuilder
-
-    @staticmethod
-    def use_cascade_attention(*args, **kwargs) -> bool:
-        return False
-
-
-@dataclass
-class XFormersAttentionMetadata:
-    num_actual_tokens: int  # Number of tokens excluding padding.
-    max_query_len: int
-    query_start_loc: torch.Tensor
-    max_seq_len: int
-    seq_lens: torch.Tensor
-    block_table: torch.Tensor
-    slot_mapping: torch.Tensor
-
-    num_prefill_tokens: int = 0
-    num_decode_tokens: int = 0
-    num_prefills: int = 0
-    num_decodes: int = 0
-
-    # Biases for different attention types.
-    attn_bias: Optional["AttentionBias"] = None
-
-    # Self-attention prefill/decode metadata cache
-    _cached_prefill_metadata: Optional["XFormersAttentionMetadata"] = None
-    _cached_decode_metadata: Optional["XFormersAttentionMetadata"] = None
-
-    @property
-    def prefill_metadata(self) -> Optional["XFormersAttentionMetadata"]:
-        if self.num_prefills == 0:
-            return None
-
-        if self._cached_prefill_metadata is not None:
-            # Recover cached prefill-phase attention
-            # metadata structure
-            return self._cached_prefill_metadata
-
-        q_start_loc = self.query_start_loc[self.num_decodes:]
-        q_seqlens = torch.diff(q_start_loc)
-        kv_seqlens = self.seq_lens[self.num_decodes:]
-        # Construct & cache prefill-phase attention metadata structure
-        self._cached_prefill_metadata = XFormersAttentionMetadata(
-            num_actual_tokens=self.num_prefill_tokens,
-            max_query_len=int(q_seqlens.max().item()),
-            query_start_loc=q_start_loc - q_start_loc[0],
-            max_seq_len=int(kv_seqlens.max().item()),
-            seq_lens=kv_seqlens,
-            block_table=self.block_table[self.num_decodes:],
-            slot_mapping=self.slot_mapping[self.num_decode_tokens:],
-        )
-        return self._cached_prefill_metadata
-
-    @property
-    def decode_metadata(self) -> Optional["XFormersAttentionMetadata"]:
-        if self.num_decode_tokens == 0:
-            return None
-
-        if self._cached_decode_metadata is not None:
-            # Recover cached decode-phase attention
-            # metadata structure
-            return self._cached_decode_metadata
-
-        q_start_loc = self.query_start_loc
-        q_seqlens = torch.diff(q_start_loc)
-        decode_kv_seqlens = self.seq_lens[:self.num_decodes]
-        # Construct & cache decode-phase attention metadata structure
-        self._cached_decode_metadata = XFormersAttentionMetadata(
-            num_actual_tokens=self.num_decode_tokens,
-            max_query_len=int(q_seqlens[:self.num_decodes].max().item()),
-            query_start_loc=q_start_loc[:self.num_decodes + 1],
-            max_seq_len=int(decode_kv_seqlens.max().item()),
-            seq_lens=decode_kv_seqlens,
-            block_table=self.block_table[:self.num_decodes],
-            slot_mapping=self.slot_mapping[:self.num_decode_tokens],
-            attn_bias=self.attn_bias,
-        )
-        return self._cached_decode_metadata
-
-
-class XFormersAttentionMetadataBuilder(
-        AttentionMetadataBuilder[XFormersAttentionMetadata]):
-
-    reorder_batch_threshold: int = 1
-
-    def __init__(
-        self,
-        kv_cache_spec: AttentionSpec,
-        layer_names: list[str],
-        vllm_config: VllmConfig,
-        device: torch.device,
-    ):
-        super().__init__(kv_cache_spec, layer_names, vllm_config, device)
-
-        assert XFORMERS_AVAILABLE
-        self.block_size = kv_cache_spec.block_size
-        self._num_decodes = 0
-        self._num_decode_tokens = 0
-
-    def reorder_batch(self, input_batch: "InputBatch",
-                      scheduler_output: "SchedulerOutput") -> bool:
-        return reorder_batch_to_split_decodes_and_prefills(
-            input_batch,
-            scheduler_output,
-            decode_threshold=self.reorder_batch_threshold)
-
-    def build(
-        self,
-        common_prefix_len: int,
-        common_attn_metadata: CommonAttentionMetadata,
-        fast_build: bool = False,
-    ) -> XFormersAttentionMetadata:
-        num_decodes, num_prefills, num_decode_tokens, num_prefill_tokens = (
-            split_decodes_and_prefills(
-                common_attn_metadata,
-                decode_threshold=self.reorder_batch_threshold))
-
-        num_actual_tokens = common_attn_metadata.num_actual_tokens
-        q_start_loc = common_attn_metadata.query_start_loc
-        q_seqlens = torch.diff(q_start_loc)
-        max_query_len = common_attn_metadata.max_query_len
-        kv_seqlens = common_attn_metadata.seq_lens
-        max_seq_len = common_attn_metadata.max_seq_len
-        block_table = common_attn_metadata.block_table_tensor
-        slot_mapping = common_attn_metadata.slot_mapping
-
-        bias = None
-        if num_decodes > 0:
-            # Construct the decoder bias.
-            decode_q_seqlens = q_seqlens[:num_decodes]
-            decode_kv_seqlens = kv_seqlens[:num_decodes]
-            bias = (
-                PagedBlockDiagonalCausalWithOffsetPaddedKeysMask.from_seqlens(
-                    q_seqlen=decode_q_seqlens.tolist(),
-                    kv_seqlen=decode_kv_seqlens.tolist(),
-                    page_size=self.block_size,
-                    block_tables=block_table[:num_decodes],
-                    device=block_table.device,
-                ))
-
-        return XFormersAttentionMetadata(
-            num_actual_tokens=num_actual_tokens,
-            num_prefill_tokens=num_prefill_tokens,
-            num_decode_tokens=num_decode_tokens,
-            num_prefills=num_prefills,
-            num_decodes=num_decodes,
-            max_query_len=max_query_len,
-            query_start_loc=q_start_loc,
-            max_seq_len=max_seq_len,
-            seq_lens=kv_seqlens,
-            block_table=block_table,
-            slot_mapping=slot_mapping,
-            attn_bias=bias,
-        )
-
-
-class XFormersAttentionImpl(AttentionImpl):
-
-    def __init__(
-        self,
-        num_heads: int,
-        head_size: int,
-        scale: float,
-        num_kv_heads: int,
-        alibi_slopes: Optional[list[float]],
-        sliding_window: Optional[int],
-        kv_cache_dtype: str,
-        logits_soft_cap: Optional[float] = None,
-        attn_type: AttentionType = AttentionType.DECODER,
-        kv_sharing_target_layer_name: Optional[str] = None,
-    ) -> None:
-        if kv_sharing_target_layer_name is not None:
-            raise NotImplementedError("KV sharing is not supported in V0.")
-        if alibi_slopes is not None:
-            raise NotImplementedError(
-                "XFormers does not support alibi slopes yet.")
-        self.num_heads = num_heads
-        self.head_size = head_size
-        self.scale = float(scale)
-        self.num_kv_heads = num_kv_heads
-        self.num_queries_per_kv = self.num_heads // self.num_kv_heads
-        self.kv_cache_dtype = kv_cache_dtype
-        self.kv_sharing_target_layer_name = kv_sharing_target_layer_name
-        if alibi_slopes is not None:
-            alibi_slopes = torch.tensor(alibi_slopes, dtype=torch.float32)
-        self.alibi_slopes = alibi_slopes
-        if sliding_window is None:
-            self.sliding_window = (-1, -1)
-        else:
-            self.sliding_window = (sliding_window - 1, 0)
-        if logits_soft_cap is None:
-            # Setting logits_soft_cap to 0 means no soft cap.
-            logits_soft_cap = 0
-        self.logits_soft_cap = logits_soft_cap
-
-        XFormersAttentionBackend.validate_head_size(head_size)
-
-        if attn_type != AttentionType.DECODER:
-            raise NotImplementedError("Encoder self-attention and "
-                                      "encoder/decoder cross-attention "
-                                      "are not implemented for "
-                                      "XFormersAttentionImpl.")
-
-    def forward(
-        self,
-        layer: torch.nn.Module,
-        query: torch.Tensor,
-        key: torch.Tensor,
-        value: torch.Tensor,
-        kv_cache: torch.Tensor,
-        attn_metadata: XFormersAttentionMetadata,
-        output: Optional[torch.Tensor] = None,
-        output_scale: Optional[torch.Tensor] = None,
-        output_block_scale: Optional[torch.Tensor] = None,
-    ) -> torch.Tensor:
-        """Forward pass with XFormers.
-
-        Args:
-            query: shape = [num_tokens, num_heads, head_size]
-            key: shape = [num_tokens, num_kv_heads, head_size]
-            value: shape = [num_tokens, num_kv_heads, head_size]
-            kv_cache: shape =
-                [2, num_blocks, block_size, num_kv_heads, head_size]
-            attn_metadata: Metadata for attention.
-        Returns:
-            shape = [num_tokens, num_heads * head_size]
-        """
-        assert output is not None, "Output tensor must be provided."
-
-        if output_scale is not None or output_block_scale is not None:
-            raise NotImplementedError(
-                "fused output quantization is not yet supported"
-                " for XFormersAttentionImpl")
-
-        if attn_metadata is None:
-            # Profiling run.
-            return output
-
-        # Cache the input KVs.
-        key_cache, value_cache = kv_cache.unbind(0)
-        if self.kv_sharing_target_layer_name is None:
-            # Reshape the input keys and values and store them in the cache.
-            # Skip this if sharing KV cache with an earlier attention layer.
-            # NOTE(woosuk): Here, key and value are padded while slot_mapping is
-            # not padded. However, we don't need to do key[:num_actual_tokens]
-            # and value[:num_actual_tokens] because the reshape_and_cache_flash
-            # op uses the slot_mapping's shape to determine the number of
-            # actual tokens.
-            ops.reshape_and_cache_flash(
-                key,
-                value,
-                key_cache,
-                value_cache,
-                attn_metadata.slot_mapping,
-                self.kv_cache_dtype,
-                layer._k_scale,
-                layer._v_scale,
-            )
-
-        num_actual_tokens = attn_metadata.num_actual_tokens
-        num_decode_tokens = attn_metadata.num_decode_tokens
-        if prefill_meta := attn_metadata.prefill_metadata:
-            descale_shape = (prefill_meta.query_start_loc.shape[0] - 1,
-                             key.shape[1])
-            unified_attention(
-                q=query[num_decode_tokens:num_actual_tokens],
-                k=key_cache,
-                v=value_cache,
-                out=output[num_decode_tokens:num_actual_tokens],
-                cu_seqlens_q=prefill_meta.query_start_loc,
-                max_seqlen_q=prefill_meta.max_query_len,
-                seqused_k=prefill_meta.seq_lens,
-                max_seqlen_k=prefill_meta.max_seq_len,
-                softmax_scale=self.scale,
-                causal=True,
-                alibi_slopes=self.alibi_slopes,
-                window_size=self.sliding_window,
-                block_table=prefill_meta.block_table,
-                softcap=self.logits_soft_cap,
-                q_descale=None,  # Not supported
-                k_descale=layer._k_scale.expand(descale_shape),
-                v_descale=layer._v_scale.expand(descale_shape),
-            )
-
-        if decode_meta := attn_metadata.decode_metadata:
-            # Query for decode. KV is not needed because it is already cached.
-            decode_query = query[:num_decode_tokens]
-            # Reshape query to [1, B_T, G, H, D].
-            q = decode_query.view(1, -1, self.num_kv_heads,
-                                  self.num_queries_per_kv, self.head_size)
-            # Reshape the k and v caches to [1, Bkv_T, G, H, D]
-            cache_k = key_cache.view(1, -1, self.num_kv_heads, 1,
-                                     self.head_size).expand(
-                                         1,
-                                         -1,
-                                         self.num_kv_heads,
-                                         self.num_queries_per_kv,
-                                         self.head_size,
-                                     )
-            cache_v = value_cache.view(1, -1, self.num_kv_heads, 1,
-                                       self.head_size).expand(
-                                           1,
-                                           -1,
-                                           self.num_kv_heads,
-                                           self.num_queries_per_kv,
-                                           self.head_size,
-                                       )
-
-            attn_bias = decode_meta.attn_bias
-            output[:
-                   num_decode_tokens] = xops.memory_efficient_attention_forward(
-                       q,
-                       cache_k,
-                       cache_v,
-                       attn_bias=attn_bias,
-                       p=0.0,
-                       scale=self.scale,
-                   ).view(decode_query.shape)
-
-        # Reshape the output tensor.
-        return output
--- a/vllm/v1/core/sched/scheduler.py
+++ b/vllm/v1/core/sched/scheduler.py
@@ -178,7 +178,6 @@ class Scheduler(SchedulerInterface):
        self.encoder_cache_manager = EncoderCacheManager(cache_size=encoder_cache_size)

        speculative_config = vllm_config.speculative_config
-
        self.use_eagle = False
        self.num_spec_tokens = self.num_lookahead_tokens = 0
        if speculative_config:
@@ -187,10 +186,6 @@ class Scheduler(SchedulerInterface):
                self.use_eagle = True
                self.num_lookahead_tokens = self.num_spec_tokens

-        self.compilation_config = vllm_config.compilation_config
-        self.full_cuda_graph = self.compilation_config.full_cuda_graph
-        self.use_mla = vllm_config.model_config.use_mla
-
        # Create the KV cache manager.
        self.kv_cache_manager = KVCacheManager(
            kv_cache_config=kv_cache_config,
@@ -207,7 +202,7 @@ class Scheduler(SchedulerInterface):
        self.use_pp = self.parallel_config.pipeline_parallel_size > 1
        self.use_v2_model_runner = envs.VLLM_USE_V2_MODEL_RUNNER

-    def schedule_default(self) -> SchedulerOutput:
+    def schedule(self) -> SchedulerOutput:
        # NOTE(woosuk) on the scheduling algorithm:
        # There's no "decoding phase" nor "prefill phase" in the scheduler.
        # Each request just has the num_computed_tokens and
@@ -420,12 +415,10 @@ class Scheduler(SchedulerInterface):
        if not preempted_reqs:
            while self.waiting and token_budget > 0:
                if len(self.running) == self.max_num_running_reqs:
-                    break
+                break

                request = self.waiting.peek_request()
-                if request.is_finished():
-                    self.waiting.pop_request()
-                    continue
+                
                # KVTransfer: skip request if still waiting for remote kvs.
                if request.status == RequestStatus.WAITING_FOR_REMOTE_KVS:
                    is_ready = self._update_waiting_for_remote_kv(request)

--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -14,7 +14,6 @@ from logging import DEBUG
 from typing import Any, TypeVar, cast

 import msgspec
-from vllm import envs
 import zmq

 from vllm.config import ParallelConfig, VllmConfig

--- a/vllm/v1/engine/llm_engine.py
+++ b/vllm/v1/engine/llm_engine.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

-import os
 import time
 from collections.abc import Callable, Mapping
 from copy import copy
@@ -136,8 +135,6 @@ class LLMEngine:

        # Don't keep the dummy data in memory
        self.reset_mm_cache()
-        
-        # self.tree_decoding = os.environ.get('VLLM_TREE_DECODING') == '1'

    @property
    @deprecated(

--- a/vllm/v1/spec_decode/eagle.py
+++ b/vllm/v1/spec_decode/eagle.py
@@ -25,7 +25,6 @@ from vllm.model_executor.model_loader import get_model
 from vllm.model_executor.models import supports_multimodal
 from vllm.model_executor.models.deepseek_v2 import DeepseekV32IndexerCache
 from vllm.model_executor.models.llama_eagle3 import Eagle3LlamaForCausalLM
-
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.platforms import current_platform
 from vllm.triton_utils import triton
@@ -51,7 +50,6 @@ from vllm.v1.spec_decode.utils import (
 from vllm.v1.utils import CpuGpuBuffer
 from vllm.v1.worker.dp_utils import coordinate_batch_across_dp
 from vllm.v1.worker.gpu_input_batch import CachedRequestState, InputBatch
-from vllm.v1.worker.ubatching import dbo_current_ubatch_id

 logger = init_logger(__name__)

@@ -253,8 +251,6 @@ class EagleProposer:
        # Replace the last token with the next token.
        # E.g., [b1, b2, c1, c2, c3, c3] -> [a2, b2, b3, c2, c3, c4]
        self.input_ids[last_token_indices] = next_token_ids
-        
-        seq_lens = (target_positions[last_token_indices] + 1).int()

        assert self.runner is not None

@@ -336,7 +332,6 @@ class EagleProposer:
                hidden_states=self.hidden_states[:num_input_tokens],
                inputs_embeds=inputs_embeds,
            )
-
            if self.method == "mtp":
                last_hidden_states = ret_hidden_states
                hidden_states = last_hidden_states
@@ -390,11 +385,6 @@ class EagleProposer:

        # Generate the remaining draft tokens.
        draft_token_ids_list = [draft_token_ids]
-       
-        if self.method == "deepseek_mtp":
-            hidden_states = last_hidden_states[last_token_indices]
-        else:
-            hidden_states = hidden_states[last_token_indices]

        batch_size_dp_padded, batch_size_across_dp = self._pad_batch_across_dp(
            num_tokens_unpadded=batch_size,
@@ -534,7 +524,6 @@ class EagleProposer:

        # [batch_size, num_speculative_tokens]
        draft_token_ids = torch.stack(draft_token_ids_list, dim=1)
-
        return draft_token_ids

    def prepare_next_token_ids_cpu(

--- a/vllm/v1/spec_decode/utils.py
+++ b/vllm/v1/spec_decode/utils.py
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
 from vllm.sampling_params import SamplingParams
 from vllm.triton_utils import tl, triton


--- a/vllm/v1/worker/gpu_input_batch.py
+++ b/vllm/v1/worker/gpu_input_batch.py
@@ -39,7 +39,6 @@ class CachedRequestState:
    block_ids: tuple[list[int], ...]
    num_computed_tokens: int
    output_token_ids: list[int]
-    spec_token_ids: list[int] = None

    mrope_positions: torch.Tensor | None = None
    mrope_position_delta: int | None = None
@@ -335,7 +334,7 @@ class InputBatch:
        self.is_token_ids[req_index, start_idx:end_idx] = True
        # Number of token ids in prompt (token_ids_cpu or prompt_embeds).
        # NOTE(woosuk): This may include spec decode tokens.
-        self.num_tokens[req_index] = request.num_tokens + num_spec_tokens
+        self.num_tokens[req_index] = request.num_tokens
        # Number of tokens without spec decode tokens.
        self.num_tokens_no_spec[req_index] = request.num_tokens


--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -48,7 +48,7 @@ from vllm.distributed.parallel_state import (
    is_global_first_rank,
    prepare_communication_buffer_for_model,
 )
-from vllm.forward_context import BatchDescriptor, set_forward_context, set_profilling
+from vllm.forward_context import BatchDescriptor, set_forward_context
 from vllm.logger import init_logger
 from vllm.model_executor.layers.attention_layer_base import AttentionLayerBase
 from vllm.model_executor.layers.rotary_embedding import (
@@ -888,8 +888,6 @@ class GPUModelRunner(

            # Update the cached states.
            req_state.num_computed_tokens = num_computed_tokens
-            spec_token_ids = (
-                scheduler_output.scheduled_spec_decode_tokens.get(req_id, ()))

            if not is_last_rank:
                # When using PP, the scheduler sends the sampled tokens back,
@@ -955,7 +953,7 @@ class GPUModelRunner(
            if not is_last_rank:
                # Add new_token_ids to token_ids_cpu.
                start_token_index = num_computed_tokens
-                end_token_index = num_computed_tokens + 1
+                end_token_index = num_computed_tokens + len(new_token_ids)
                self.input_batch.token_ids_cpu[
                    req_index, start_token_index:end_token_index
                ] = new_token_ids
@@ -2004,7 +2002,6 @@ class GPUModelRunner(
            self.device, non_blocking=True
        )

-
        # Compute the draft token ids.
        # draft_token_indices:      [  1,   2,   3, 105, 106, 208]
        draft_token_ids = self.input_ids.gpu[logits_indices]
@@ -3930,10 +3927,6 @@ class GPUModelRunner(
        else:
            num_reqs = min(num_tokens, max_num_reqs)
            min_tokens_per_req = num_tokens // num_reqs
-            
-            if not is_profile and self.speculative_config is not None and self.speculative_config.num_lookahead_slots > 0:
-                min_tokens_per_req = (1 + self.speculative_config.num_lookahead_slots)
-                num_reqs = num_tokens // min_tokens_per_req
            num_scheduled_tokens_list = [min_tokens_per_req] * num_reqs
            num_scheduled_tokens_list[-1] += num_tokens % num_reqs

@@ -3994,8 +3987,6 @@ class GPUModelRunner(
            self.seq_lens.np[:num_reqs] = seq_lens
            self.seq_lens.np[num_reqs:] = 0
            self.seq_lens.copy_to_gpu()
-            
-            num_speculative_tokens = 0 if self.speculative_config is None else self.speculative_config.num_lookahead_slots

            cum_num_tokens, _ = self._get_cumsum_and_arange(num_scheduled_tokens)
            self.query_start_loc.np[1 : num_reqs + 1] = cum_num_tokens
@@ -4090,10 +4081,8 @@ class GPUModelRunner(
            else:
                hidden_states = outputs

-            if self.speculative_config and self.speculative_config.use_eagle()and not is_profile:
-                # assert isinstance(self.drafter, EagleProposer)
-                if hasattr(self, 'drafter') and isinstance(self.drafter, EagleProposer):
-                    self.drafter.dummy_run(num_tokens, attn_metadata)
+            if self.speculative_config and self.speculative_config.use_eagle():
+                assert isinstance(self.drafter, EagleProposer)
                use_cudagraphs = (
                    cudagraph_runtime_mode.has_mode(CUDAGraphMode.PIECEWISE)
                    and not self.speculative_config.enforce_eager
@@ -4293,10 +4282,6 @@ class GPUModelRunner(
        return self._dummy_pooler_run_task(hidden_states, max_task)

    def profile_run(self) -> None:
-        # set profiling flag to avoid torch compile
-        # set_profilling(True)
-        # self._sync_device()
-
        # Profile with multimodal encoder & encoder cache.
        if self.supports_mm_inputs:
            mm_config = self.model_config.multimodal_config
@@ -4383,7 +4368,6 @@ class GPUModelRunner(
        del hidden_states, output
        self.encoder_cache.clear()
        gc.collect()
-        # set_profilling(False)

    def capture_model(self) -> int:
        if self.compilation_config.cudagraph_mode == CUDAGraphMode.NONE:
@@ -5324,11 +5308,10 @@ class GPUModelRunner(
        )

        if self.speculative_config and self.speculative_config.use_eagle():
-            # assert isinstance(self.drafter, EagleProposer)
+            assert isinstance(self.drafter, EagleProposer)
            # validate all draft model layers belong to the same kv cache
            # group
-            if hasattr(self, 'drafter') and isinstance(self.drafter, EagleProposer):
-                self.drafter.validate_same_kv_cache_group(kv_cache_config)
+            self.drafter.validate_same_kv_cache_group(kv_cache_config)

        if has_kv_transfer_group():
            kv_transfer_group = get_kv_transfer_group()

--- a/vllm/worker/worker_base.py
+++ b/vllm/worker/worker_base.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import os
-import numa
-import time
-from abc import abstractmethod
-
-from typing import (Any, Callable, Dict, List, Optional, Set, Tuple, TypeVar,
-                    Union, Type)
-
-import cloudpickle
-import torch.nn as nn
-
-from vllm.config import VllmConfig, set_current_vllm_config
-from vllm.logger import init_logger
-from vllm.lora.request import LoRARequest
-from vllm.sequence import ExecuteModelRequest
-from vllm.utils import (enable_trace_function_call_for_thread,
-                        resolve_obj_by_qualname, run_method,
-                        update_environment_variables,
-                        warn_for_unimplemented_methods)
-
-from vllm.v1.outputs import SamplerOutput
-
-
-logger = init_logger(__name__)
-
-_R = TypeVar("_R")
-
-
-# 设置当前进程绑定到 NUMA 节点
-def bind_to_numa(local_rank):
-    env_str = f"VLLM_RANK{local_rank}_NUMA"
-    node_count = numa.get_max_node() + 1
-    numa_node = int(os.getenv(env_str, -1))
-
-    # 未配置环境变量或配置错误则不做绑定，TODO：根据topo自动绑定方案
-    if numa_node < 0:
-        logger.warning("%s is unset or set incorrectly, vllm will not bind to numa! %s = %d", env_str, env_str, numa_node)
-        return
-
-    if numa_node > numa.get_max_node():
-        raise ValueError(f"NUMA node {numa_node} is not available.")
-
-    numa.bind([numa_node])   
-    
-    
-@warn_for_unimplemented_methods
-class WorkerBase:
-    """Worker interface that allows vLLM to cleanly separate implementations for
-    different hardware. Also abstracts control plane communication, e.g., to
-    communicate request metadata to other workers.
-    """
-    # TODO
-    tree_decoding = (os.environ.get('VLLM_TREE_DECODING') == '1')
-
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-    ) -> None:
-        self.vllm_config = vllm_config
-        self.model_config = vllm_config.model_config
-        self.cache_config = vllm_config.cache_config
-        self.lora_config = vllm_config.lora_config
-        self.load_config = vllm_config.load_config
-        self.parallel_config = vllm_config.parallel_config
-        self.scheduler_config = vllm_config.scheduler_config
-        self.device_config = vllm_config.device_config
-        self.speculative_config = vllm_config.speculative_config
-        self.observability_config = vllm_config.observability_config
-        self.kv_transfer_config = vllm_config.kv_transfer_config
-        self.compilation_config = vllm_config.compilation_config
-        from vllm.platforms import current_platform
-        self.current_platform = current_platform
-
-    def init_device(self) -> None:
-        """Initialize device state, such as loading the model or other on-device
-        memory allocations.
-        """
-        raise NotImplementedError
-
-    def initialize_cache(self, num_gpu_blocks: int,
-                         num_cpu_blocks: int) -> None:
-        """Initialize the KV cache with the given size in blocks.
-        """
-        raise NotImplementedError
-
-    def get_model(self) -> nn.Module:
-        raise NotImplementedError
-
-    def apply_model(self, fn: Callable[[nn.Module], _R]) -> _R:
-        """Apply a function on the model inside this worker."""
-        return fn(self.get_model())
-
-    def load_model(self) -> None:
-        """Load model onto target device."""
-        raise NotImplementedError
-
-    def execute_model(
-        self,
-        execute_model_req: Optional[ExecuteModelRequest] = None
-    ) -> Optional[List[SamplerOutput]]:
-        raise NotImplementedError
-
-    def start_worker_execution_loop(self) -> None:
-        """Execute model loop in parallel worker.
-
-        You can stop the loop by executing a driver worker with an empty output.
-        See `stop_remote_worker_execution_loop` for more details.
-        """
-        with self.current_platform.inference_mode():
-            while True:
-                output = self.execute_model(execute_model_req=None)
-                if output is None:
-                    return None
-
-    def determine_num_available_blocks(self) -> Tuple[int, int]:
-        """Determine the number of available blocks for the GPU KV cache and
-        swappable CPU KV cache.
-
-        The implementation may run profiling or other heuristics to determine
-        the size of caches.
-
-        Returns a Tuple[num_gpu_blocks, num_cpu_blocks], where num_gpu_blocks
-        are blocks that are "active" on the device and can be appended to.
-        num_cpu_blocks refers to "swapped" blocks in CPU memory and cannot be
-        appended to.
-        """
-        raise NotImplementedError
-
-    def get_cache_block_size_bytes(self) -> int:
-        """Return the size of a single cache block, in bytes. Used in
-        speculative decoding.
-        """
-        raise NotImplementedError
-
-    def add_lora(self, lora_request: LoRARequest) -> bool:
-        raise NotImplementedError
-
-    def remove_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError
-
-    def pin_lora(self, lora_id: int) -> bool:
-        raise NotImplementedError
-
-    def list_loras(self) -> Set[int]:
-        raise NotImplementedError
-    
-    # @property
-    # @abstractmethod
-    # def cache_engines(self) -> Optional[List[CacheEngine]]:
-    #     raise NotImplementedError
-
-    @property
-    def vocab_size(self) -> int:
-        """Get vocabulary size from model configuration."""
-        return self.model_config.get_vocab_size()
-
-    def shutdown(self) -> None:
-        """Clean up resources held by the worker."""
-        return
-
-
-class WorkerWrapperBase:
-    """
-    This class represents one process in an executor/engine. It is responsible
-    for lazily initializing the worker and handling the worker's lifecycle.
-    We first instantiate the WorkerWrapper, which remembers the worker module
-    and class name. Then, when we call `update_environment_variables`, and the
-    real initialization happens in `init_worker`.
-    """
-
-    def __init__(
-        self,
-        vllm_config: VllmConfig,
-        rpc_rank: int = 0,
-    ) -> None:
-        """
-        Initialize the worker wrapper with the given vllm_config and rpc_rank.
-        Note: rpc_rank is the rank of the worker in the executor. In most cases,
-        it is also the rank of the worker in the distributed group. However,
-        when multiple executors work together, they can be different.
-        e.g. in the case of SPMD-style offline inference with TP=2,
-        users can launch 2 engines/executors, each with only 1 worker.
-        All workers have rpc_rank=0, but they have different ranks in the TP
-        group.
-        """
-        self.rpc_rank = rpc_rank
-        self.worker: Optional[WorkerBase] = None
-        self.vllm_config: Optional[VllmConfig] = None
-        # do not store this `vllm_config`, `init_worker` will set the final
-        # one. TODO: investigate if we can remove this field in
-        # `WorkerWrapperBase`, `init_cached_hf_modules` should be
-        # unnecessary now.
-        if vllm_config.model_config is not None:
-            # it can be None in tests
-            trust_remote_code = vllm_config.model_config.trust_remote_code
-            if trust_remote_code:
-                # note: lazy import to avoid importing torch before initializing
-                from vllm.utils import init_cached_hf_modules
-                init_cached_hf_modules()
-
-    def shutdown(self) -> None:
-        if self.worker is not None:
-            self.worker.shutdown()
-
-    def adjust_rank(self, rank_mapping: Dict[int, int]) -> None:
-        """
-        Adjust the rpc_rank based on the given mapping.
-        It is only used during the initialization of the executor,
-        to adjust the rpc_rank of workers after we create all workers.
-        """
-        if self.rpc_rank in rank_mapping:
-            self.rpc_rank = rank_mapping[self.rpc_rank]
-
-    def update_environment_variables(self, envs_list: List[Dict[str,
-                                                                str]]) -> None:
-        envs = envs_list[self.rpc_rank]
-        key = 'CUDA_VISIBLE_DEVICES'
-        if key in envs and key in os.environ:
-            # overwriting CUDA_VISIBLE_DEVICES is desired behavior
-            # suppress the warning in `update_environment_variables`
-            del os.environ[key]
-        update_environment_variables(envs)
-
-    def init_worker(self, all_kwargs: List[Dict[str, Any]]) -> None:
-        """
-        Here we inject some common logic before initializing the worker.
-        Arguments are passed to the worker class constructor.
-        """
-        kwargs = all_kwargs[self.rpc_rank]
-        self.vllm_config = kwargs.get("vllm_config")
-        assert self.vllm_config is not None, (
-            "vllm_config is required to initialize the worker")
-        enable_trace_function_call_for_thread(self.vllm_config)
-
-        from vllm.plugins import load_general_plugins
-        load_general_plugins()
-
-        if isinstance(self.vllm_config.parallel_config.worker_cls, str):
-            worker_class = resolve_obj_by_qualname(
-                self.vllm_config.parallel_config.worker_cls)
-        else:
-            logger.warning(
-                "passing worker_cls as a class object is strongly deprecated,"
-                " as the serialization of class objects can be tricky and"
-                " error-prone. To be safe, please keep the class in a separate"
-                " module and pass the qualified name of the class as a string."
-            )
-            assert isinstance(self.vllm_config.parallel_config.worker_cls,
-                              bytes)
-            worker_class = cloudpickle.loads(
-                self.vllm_config.parallel_config.worker_cls)
-        if self.vllm_config.parallel_config.worker_extension_cls:
-            worker_extension_cls = resolve_obj_by_qualname(
-                self.vllm_config.parallel_config.worker_extension_cls)
-            extended_calls = []
-            if worker_extension_cls not in worker_class.__bases__:
-                # check any conflicts between worker and worker_extension_cls
-                for attr in dir(worker_extension_cls):
-                    if attr.startswith("__"):
-                        continue
-                    assert not hasattr(worker_class, attr), (
-                        f"Worker class {worker_class} already has an attribute"
-                        f" {attr}, which conflicts with the worker"
-                        f" extension class {worker_extension_cls}.")
-                    if callable(getattr(worker_extension_cls, attr)):
-                        extended_calls.append(attr)
-                # dynamically inherit the worker extension class
-                worker_class.__bases__ = worker_class.__bases__ + (
-                    worker_extension_cls, )
-                logger.info(
-                    "Injected %s into %s for extended collective_rpc calls %s",
-                    worker_extension_cls, worker_class, extended_calls)
-        with set_current_vllm_config(self.vllm_config):
-            # To make vLLM config available during worker initialization
-            self.worker = worker_class(**kwargs)
-            assert self.worker is not None
-            
-        VLLM_NUMA_BIND = int(os.getenv("VLLM_NUMA_BIND", 1))
-        if VLLM_NUMA_BIND > 0:
-            # 绑定当前进程到指定 NUMA 节点
-            bind_to_numa(kwargs['local_rank'])
-
-            pid = os.getpid()
-            logger.info("########## %d process(rank%s) is running on CPU(s): %s", pid, str(kwargs['local_rank']), str(os.sched_getaffinity(pid)))
-            logger.info("########## %d process(rank%s) is running on memnode(s): %s", pid, str(kwargs['local_rank']), str(numa.get_membind()))
-
-
-    def initialize_from_config(self, kv_cache_configs: List[Any]) -> None:
-        kv_cache_config = kv_cache_configs[self.rpc_rank]
-        with set_current_vllm_config(self.vllm_config):
-            self.worker.initialize_from_config(kv_cache_config)  # type: ignore
-
-    def init_device(self):
-        with set_current_vllm_config(self.vllm_config):
-            # To make vLLM config available during device initialization
-            self.worker.init_device()  # type: ignore
-
-    def execute_method(self, method: Union[str, bytes], *args, **kwargs):
-        try:
-            # method resolution order:
-            # if a method is defined in this class, it will be called directly.
-            # otherwise, since we define `__getattr__` and redirect attribute
-            # query to `self.worker`, the method will be called on the worker.
-            return run_method(self, method, args, kwargs)
-        except Exception as e:
-            # if the driver worker also execute methods,
-            # exceptions in the rest worker may cause deadlock in rpc like ray
-            # see https://github.com/vllm-project/vllm/issues/3455
-            # print the error and inform the user to solve the error
-            msg = (f"Error executing method {method!r}. "
-                   "This might cause deadlock in distributed execution.")
-            logger.exception(msg)
-            raise e
-
-    def __getattr__(self, attr):
-        return getattr(self.worker, attr)