[Refactor] Remove unused dead code (#38842)

Signed-off-by: yewentao256 <zhyanwentao@126.com>

[Refactor] Remove unused dead code (#38842)
Signed-off-by: yewentao256 <zhyanwentao@126.com>
4ae218c1 · Wentao Ye · GitHub · f40d9879 · 4ae218c1 · 4ae218c1
Unverified commit 4ae218c1, authored Apr 06, 2026 by Wentao Ye; committed by GitHub on Apr 06, 2026
3 changed files
--- a/vllm/model_executor/models/mlp_speculator.py
+++ b/vllm/model_executor/models/mlp_speculator.py
@@ -17,8 +17,6 @@ from vllm.model_executor.model_loader.weight_utils import default_weight_loader

 from .utils import maybe_prefix

-SQRT2 = 2**0.5
-

 class MLPSpeculatorLayerNorm(nn.Module):
    """
@@ -171,57 +169,6 @@ class MLPSpeculator(nn.Module):
            config.vocab_size, config.vocab_size, 1.0
        )

-    # NOTE(woosuk): This method is commented out because it is old code
-    # using V0. We should either port it to V1 or remove it.
-
-    # def generate_proposals(
-    #     self,
-    #     input_ids: torch.Tensor,
-    #     previous_hidden_states: torch.Tensor,
-    #     num_predict_tokens: int,
-    #     sampling_metadata: SamplingMetadata,
-    # ) -> list[SamplerOutput]:
-    #     if num_predict_tokens > self.max_speculative_tokens:
-    #         raise ValueError(f"Max speculative tokens for model is "
-    #                          f"{self.max_speculative_tokens}, but "
-    #                          f"{num_predict_tokens} were requested")
-
-    #     # b x 1 x d
-    #     previous_hidden_states = previous_hidden_states.unsqueeze(1)
-
-    #     if self.scale_input:
-    #         previous_hidden_states = self.ln0(previous_hidden_states) / SQRT2
-
-    #     # b x 1
-    #     last_tokens = input_ids.unsqueeze(1)
-
-    #     next_tokens = []
-
-    #     for head_index in range(num_predict_tokens):
-
-    #         # Project and predict
-    #         z = self.emb[head_index](last_tokens)  # b k d
-    #         states = self.proj[head_index](previous_hidden_states)
-
-    #         # Weighted add of state_weight*state and emb_weight*z
-    #         # Let subsequent LN take care of denominator
-    #         # state_weight is close to 1, so shouldn't be any precision issues
-    #         states.add_(z, alpha=self.emb_weight / self.state_weight)
-
-    #         states = self.activation(self.ln[head_index](states))  # b k d
-    #         previous_hidden_states = states
-    #         # TODO: not yet supporting top_k_tokens_per_head
-    #         states = states.flatten(0, 1)
-
-    #         logits = self.logits_processor(self.head[head_index], states,
-    #                                        sampling_metadata)
-
-    #         output = self.sampler(logits, sampling_metadata)
-    #         last_tokens = output.sampled_token_ids
-    #         next_tokens.append(output)
-
-    #     return next_tokens
-
    def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
        params_dict = dict(self.named_parameters())
        loaded_params: set[str] = set()

--- a/vllm/v1/attention/ops/flashmla.py
+++ b/vllm/v1/attention/ops/flashmla.py
@@ -151,16 +151,3 @@ def flash_mla_with_kvcache_fp8(
        descale_k,
    )
    return out, softmax_lse
-
-
-#
-# TODO: Add fake functions
-#
-# @register_fake("_flashmla_C::get_mla_metadata")
-# def _get_mla_metadata_fake(....) -> Tuple[torch.Tensor, torch.Tensor]:
-#     return ....
-#
-# @register_fake("_flashmla_C::fwd_kvcache_mla")
-# def _fwd_kvcache_mla_fake(....) -> Tuple[torch.Tensor, torch.Tensor]:
-#     return ....
-#
--- a/vllm/v1/executor/ray_distributed_executor.py
+++ b/vllm/v1/executor/ray_distributed_executor.py
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from vllm.v1.executor.ray_executor import (
-    RayDistributedExecutor as _RayDistributedExecutor,
-)
-
-# For backwards compatibility.
-RayDistributedExecutor = _RayDistributedExecutor