"docs/vscode:/vscode.git/clone" did not exist on "86a65417e3df9d4a327de745631cb8a336a3cd79"
Commit a053add9 authored by zhuwenwen
Browse files

remove unused code

parent 2b84890b
......@@ -221,8 +221,7 @@ def _support_torch_compile(
# torch.compiler.is_compiling() means we are inside the compilation
# e.g. TPU has the compilation logic in model runner, so we don't
# need to compile the model inside.
skip_cuda_graphs = get_forward_context().skip_cuda_graphs
if envs.VLLM_ENABLE_TBO and skip_cuda_graphs:
if envs.VLLM_ENABLE_TBO and get_forward_context().skip_cuda_graphs:
return self.forward(*args, **kwargs)
if self.do_not_compile or torch.compiler.is_compiling() or get_profilling():
......
......@@ -437,7 +437,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None,
use_nn_moe: Optional[bool] = False,
routed_scaling_factor: Optional[float] = None,
use_fused_gate: Optional[bool] = False,
) -> torch.Tensor:
if enable_eplb:
......@@ -468,7 +467,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map=logical_to_physical_map,
logical_replica_count=logical_replica_count,
use_nn_moe=use_nn_moe,
routed_scaling_factor=routed_scaling_factor,
use_fused_gate=use_fused_gate
)
......@@ -495,7 +493,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None,
use_nn_moe: Optional[bool] = False,
routed_scaling_factor: Optional[float] = None,
use_fused_gate: Optional[bool] = False,
) -> torch.Tensor:
......@@ -517,7 +514,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
expert_load_view=expert_load_view,
logical_to_physical_map=logical_to_physical_map,
logical_replica_count=logical_replica_count,
routed_scaling_factor=routed_scaling_factor,
use_fused_gate=use_fused_gate)
if self.rocm_aiter_moe_enabled:
......@@ -588,7 +584,6 @@ class UnquantizedFusedMoEMethod(FusedMoEMethodBase, CustomOp):
logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None,
use_nn_moe: Optional[bool] = False,
routed_scaling_factor: Optional[float] = None,
use_fused_gate: Optional[bool] = False,
):
if enable_eplb is not False or expert_load_view is not None or \
......@@ -828,7 +823,6 @@ class FusedMoE(CustomOp):
enable_eplb: bool = False,
num_redundant_experts: int = 0,
has_bias: bool = False,
routed_scaling_factor: Optional[float] = None,
):
super().__init__()
if params_dtype is None:
......@@ -909,7 +903,6 @@ class FusedMoE(CustomOp):
self.e_score_correction_bias = e_score_correction_bias
self.apply_router_weight_on_input = apply_router_weight_on_input
self.activation = activation
self.routed_scaling_factor = routed_scaling_factor
if self.scoring_func != "softmax" and not self.use_grouped_topk:
raise ValueError("Only softmax scoring function is supported for "
......@@ -1519,7 +1512,6 @@ class FusedMoE(CustomOp):
expert_load_view: Optional[torch.Tensor] = None,
logical_to_physical_map: Optional[torch.Tensor] = None,
logical_replica_count: Optional[torch.Tensor] = None,
routed_scaling_factor: Optional[float] = None,
use_fused_gate: Optional[bool] = False
) -> tuple[torch.Tensor, torch.Tensor]:
"""
......@@ -1827,7 +1819,6 @@ class FusedMoE(CustomOp):
logical_to_physical_map=self.logical_to_physical_map,
logical_replica_count=self.logical_replica_count,
use_nn_moe=self.use_nn_moe,
routed_scaling_factor=self.routed_scaling_factor,
use_fused_gate=self.use_fused_gate
)
......
......@@ -38,11 +38,9 @@ from vllm.model_executor.layers.layernorm import RMSNorm
from vllm.model_executor.layers.linear import (QKVParallelLinear,
RowParallelLinear)
from vllm.model_executor.layers.logits_processor import LogitsProcessor
from vllm.model_executor.layers.pooler import Pooler, PoolingType
from vllm.model_executor.layers.quantization import QuantizationConfig
from vllm.model_executor.layers.rotary_embedding import get_rope
from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
from vllm.model_executor.pooling_metadata import PoolingMetadata
from vllm.model_executor.sampling_metadata import SamplingMetadata
from vllm.sequence import IntermediateTensors, PoolerOutput
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment