Unverified Commit 177320a5 authored by Lianmin Zheng, committed by GitHub

Clean up imports (#5467)

parent d7bc19a4
@@ -48,7 +48,7 @@ _is_cuda = is_cuda()
 if _is_cuda:
     from sgl_kernel import awq_dequantize
 else:
-    from vllm import _custom_ops as ops
+    from vllm._custom_ops import awq_dequantize
class DeepseekModelNextN(nn.Module):
@@ -273,7 +273,7 @@ class DeepseekV3ForCausalLMNextN(DeepseekV3ForCausalLM):
                         self_attn.kv_b_proj.qzeros,
                     ).T
                 else:
-                    w = ops.awq_dequantize(
+                    w = awq_dequantize(
                         self_attn.kv_b_proj.qweight,
                         self_attn.kv_b_proj.scales,
                         self_attn.kv_b_proj.qzeros,
...
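For context, a minimal sketch of the pattern these hunks converge on: both branches bind the same name `awq_dequantize`, so call sites can drop the old `ops.` prefix regardless of which backend is installed. The sketch assumes either `sgl_kernel` (CUDA builds) or vLLM is present; it only restates the import logic shown above.

```python
# Pick the kernel provider once at import time, then use one unqualified name.
from sglang.srt.utils import is_cuda

_is_cuda = is_cuda()

if _is_cuda:
    from sgl_kernel import awq_dequantize
else:
    from vllm._custom_ops import awq_dequantize

# Call sites no longer need the `ops.` prefix:
#     w = awq_dequantize(qweight, scales, qzeros).T
```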
@@ -51,6 +51,7 @@ from sglang.srt.layers.linear import (
 )
 from sglang.srt.layers.logits_processor import LogitsProcessor
 from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, EPMoE
+from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
 from sglang.srt.layers.moe.topk import select_experts
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
@@ -80,10 +81,8 @@ _is_cuda = is_cuda()
 if _is_cuda:
     from sgl_kernel import awq_dequantize, bmm_fp8, merge_state_v2
-    from sglang.srt.layers.moe.ep_moe.token_dispatcher import DeepEPDispatcher
 else:
-    from vllm import _custom_ops as ops
+    from vllm._custom_ops import awq_dequantize

 if _is_hip:
     from sglang.srt.layers.attention.triton_ops.rocm_mla_decode_rope import (
@@ -861,7 +860,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             )
         elif self.w_kc.dtype == torch.float8_e4m3fn:
             q_nope_val, q_nope_scale = per_tensor_quant_mla_fp8(
-                q_nope.transpose(0, 1), dtype=torch.float8_e4m3fn
+                q_nope.transpose(0, 1),
             )
             q_nope_out = bmm_fp8(
                 q_nope_val, self.w_kc, q_nope_scale, self.w_scale, torch.bfloat16
@@ -892,7 +891,7 @@ class DeepseekV2AttentionMLA(nn.Module):
             )
         elif self.w_vc.dtype == torch.float8_e4m3fn:
             attn_output_val, attn_output_scale = per_tensor_quant_mla_fp8(
-                attn_output.transpose(0, 1), dtype=torch.float8_e4m3fn
+                attn_output.transpose(0, 1),
             )
             attn_bmm_output = bmm_fp8(
                 attn_output_val,
@@ -1565,7 +1564,7 @@ class DeepseekV2ForCausalLM(nn.Module):
                         self_attn.kv_b_proj.qzeros,
                     ).T
                 else:
-                    w = ops.awq_dequantize(
+                    w = awq_dequantize(
                         self_attn.kv_b_proj.qweight,
                         self_attn.kv_b_proj.scales,
                         self_attn.kv_b_proj.qzeros,
...
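The two `per_tensor_quant_mla_fp8` hunks above drop the explicit `dtype=torch.float8_e4m3fn` argument at the call sites. The helper's real signature is not shown in this diff; the toy `per_tensor_quant_demo` below is hypothetical and only illustrates the idea of relying on a default fp8 dtype instead of repeating it at every call.

```python
import torch

def per_tensor_quant_demo(x: torch.Tensor, dtype: torch.dtype = torch.float8_e4m3fn):
    """Toy per-tensor quantization: one scale for the whole tensor."""
    finfo = torch.finfo(dtype)
    scale = x.abs().amax().clamp(min=1e-12) / finfo.max
    x_q = (x / scale).clamp(finfo.min, finfo.max).to(dtype)
    return x_q, scale

x = torch.randn(16, 128)
x_q, s = per_tensor_quant_demo(x)                          # relies on the default dtype
x_q2, s2 = per_tensor_quant_demo(x, torch.float8_e4m3fn)   # equivalent, explicit
```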
-import re
 from typing import Dict, Tuple
...
@@ -10,12 +10,11 @@ import torch
 import sglang.srt.sampling.penaltylib as penaltylib
 from sglang.srt.sampling.custom_logit_processor import CustomLogitProcessor

-logger = logging.getLogger(__name__)

 if TYPE_CHECKING:
     from sglang.srt.managers.schedule_batch import ScheduleBatch

+logger = logging.getLogger(__name__)


 @dataclasses.dataclass
 class SamplingBatchInfo:
...
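The hunk above only relocates the module-level logger so it sits after all imports, including the `TYPE_CHECKING`-guarded one. A minimal sketch of the resulting layout:

```python
import logging
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Type-checking-only import; never executed at runtime, so it cannot
    # introduce an import cycle.
    from sglang.srt.managers.schedule_batch import ScheduleBatch

# Module-level objects come after the import block, including guarded imports.
logger = logging.getLogger(__name__)
```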
-# Copyright 2023-2024 SGLang Team
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-# Some shortcuts for backward compatibility.
-# They will be removed in new versions.
-from sglang.srt.entrypoints.engine import Engine
-from sglang.srt.entrypoints.http_server import kill_process_tree, launch_server
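With this backward-compatibility shim removed, downstream code imports the entrypoints from their canonical modules instead (the test runner hunk further down makes the same switch). A minimal sketch of the replacement imports, assuming an installed sglang:

```python
# Import the engine and HTTP-server entrypoints directly, rather than
# going through the removed shim module.
from sglang.srt.entrypoints.engine import Engine
from sglang.srt.entrypoints.http_server import kill_process_tree, launch_server
```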
@@ -187,6 +187,7 @@ class ServerArgs:
     n_share_experts_fusion: int = 0
     disable_shared_experts_fusion: bool = False
     disable_chunked_prefix_cache: bool = False
+    disable_fast_image_processor: bool = False

     # Debug tensor dumps
     debug_tensor_dump_output_folder: Optional[str] = None
@@ -198,9 +199,6 @@ class ServerArgs:
     disaggregation_bootstrap_port: int = 8998
     disaggregation_transfer_backend: str = "mooncake"

-    # multimodal
-    disable_fast_image_processor: bool = False

     def __post_init__(self):
         # Expert parallelism
         if self.enable_ep_moe:
@@ -1136,6 +1134,11 @@ class ServerArgs:
             action="store_true",
             help="Disable chunked prefix cache feature for deepseek, which should save overhead for short sequences.",
         )
+        parser.add_argument(
+            "--disable-fast-image-processor",
+            action="store_true",
+            help="Adopt base image processor instead of fast image processor.",
+        )

         # Server warmups
         parser.add_argument(
@@ -1187,13 +1190,6 @@ class ServerArgs:
             help="The backend for disaggregation transfer. Default is mooncake.",
         )

-        # Multimodal
-        parser.add_argument(
-            "--disable-fast-image-processor",
-            action="store_true",
-            help="Adopt base image processor instead of fast image processor.",
-        )

     @classmethod
     def from_cli_args(cls, args: argparse.Namespace):
         args.tp_size = args.tensor_parallel_size
...
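The server_args.py hunks above only move the `--disable-fast-image-processor` flag and its dataclass field next to the other `disable_*` options. A self-contained sketch of how a `store_true` flag maps onto a boolean dataclass field; `DemoArgs` is illustrative, not SGLang code:

```python
import argparse
import dataclasses

@dataclasses.dataclass
class DemoArgs:
    disable_fast_image_processor: bool = False  # mirrors the ServerArgs default

parser = argparse.ArgumentParser()
parser.add_argument(
    "--disable-fast-image-processor",
    action="store_true",  # defaults to False, flips to True when the flag is passed
    help="Adopt base image processor instead of fast image processor.",
)

args = parser.parse_args(["--disable-fast-image-processor"])
demo = DemoArgs(disable_fast_image_processor=args.disable_fast_image_processor)
assert demo.disable_fast_image_processor is True
```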
@@ -55,7 +55,6 @@ import torch.distributed
 import torch.distributed as dist
 import triton
 import zmq
-from decord import VideoReader, cpu
 from fastapi.responses import ORJSONResponse
 from packaging import version as pkg_version
 from PIL import Image
@@ -545,6 +544,9 @@ def load_audio(audio_file: str, sr: int = 16000, mono: bool = True) -> np.ndarray:
 def encode_video(video_path, frame_count_limit=None):
+    # Lazy import because decord is not available on some arm platforms.
+    from decord import VideoReader, cpu
+
     if not os.path.exists(video_path):
         logger.error(f"Video {video_path} does not exist")
         return []
...
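These utils.py hunks move the decord import from module scope into `encode_video`, so importing `sglang.srt.utils` no longer fails on platforms where decord cannot be installed. A minimal sketch of the lazy-import pattern; `encode_video_demo` is illustrative, not the actual function body:

```python
def encode_video_demo(video_path, frame_count_limit=None):
    # Deferred import: only callers that actually decode video need decord.
    from decord import VideoReader, cpu

    vr = VideoReader(video_path, ctx=cpu(0))
    num_frames = len(vr)
    if frame_count_limit is not None:
        num_frames = min(num_frames, frame_count_limit)
    return [vr[i].asnumpy() for i in range(num_frames)]
```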
@@ -26,8 +26,8 @@ from transformers import (
     AutoProcessor,
 )

+from sglang.srt.entrypoints.engine import Engine
 from sglang.srt.hf_transformers_utils import get_tokenizer
-from sglang.srt.server import Engine
 from sglang.srt.utils import load_image
 from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, calculate_rouge_l
...
@@ -3,7 +3,7 @@
 import pytest
 import torch

-from sglang.srt.custom_op import scaled_fp8_quant
+from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
 from sglang.srt.utils import is_cuda
...
@@ -93,9 +93,7 @@ class TestPerTokenGroupQuantFP8(TestFP8Base):
         A, A_quant_gt, scale_gt = self._make_A(
             M=self.M, K=self.K, group_size=self.group_size, out_dtype=self.quant_type
         )
-        A_quant, scale = per_token_group_quant_fp8(
-            x=A, group_size=self.group_size, dtype=self.quant_type
-        )
+        A_quant, scale = per_token_group_quant_fp8(x=A, group_size=self.group_size)
         torch.testing.assert_close(scale, scale_gt)
         diff = (A_quant.to(torch.float16) - A_quant_gt.to(torch.float16)).abs()
         diff_count = (diff > 1e-5).count_nonzero()
...
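For context on what the updated call exercises: per-token-group quantization gives each contiguous group of `group_size` elements in a row its own scale. The toy `per_token_group_quant_ref` below is illustrative only, not the kernel under test, and it assumes fp8-e4m3 is the helper's default dtype now that the explicit `dtype` argument is gone:

```python
import torch

def per_token_group_quant_ref(x: torch.Tensor, group_size: int, dtype=torch.float8_e4m3fn):
    M, K = x.shape
    assert K % group_size == 0
    finfo = torch.finfo(dtype)
    groups = x.view(M, K // group_size, group_size)
    # One scale per (token, group), sized so the largest element maps to finfo.max.
    scale = groups.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / finfo.max
    x_q = (groups / scale).clamp(finfo.min, finfo.max).to(dtype).view(M, K)
    return x_q, scale.squeeze(-1)
```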
@@ -3,9 +3,9 @@ import unittest
 import torch

-from sglang.srt.custom_op import scaled_fp8_quant as sgl_scaled_fp8_quant
 from sglang.srt.layers.activation import SiluAndMul
 from sglang.srt.layers.moe.fused_moe_triton.fused_moe import fused_moe
+from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
 from sglang.test.test_utils import CustomTestCase
@@ -41,7 +41,7 @@ def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk):
     B, D = a.shape
     # Perform per-token quantization
-    a_q, a_s = sgl_scaled_fp8_quant(a, use_per_token_if_dynamic=True)
+    a_q, a_s = scaled_fp8_quant(a, use_per_token_if_dynamic=True)
     # Repeat tokens to match topk
     a_q = a_q.view(B, -1, D).repeat(1, topk, 1).reshape(-1, D)
     # Also repeat the scale
@@ -69,7 +69,7 @@ def torch_w8a8_per_column_moe(a, w1, w2, w1_s, w2_s, score, topk):
     # Activation function
     act_out = SiluAndMul().forward_native(inter_out)
     # Quantize activation output with per-token
-    act_out_q, act_out_s = sgl_scaled_fp8_quant(
+    act_out_q, act_out_s = scaled_fp8_quant(
         act_out, use_per_token_if_dynamic=True
     )
...
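These last two hunks only rename the import; `scaled_fp8_quant(..., use_per_token_if_dynamic=True)` still quantizes dynamically with one scale per token (row) rather than one scale for the whole tensor. A toy sketch of that behavior, illustrative only and not the fp8_kernel implementation:

```python
import torch

def per_token_fp8_quant_ref(x: torch.Tensor, dtype=torch.float8_e4m3fn):
    finfo = torch.finfo(dtype)
    # Shape [num_tokens, 1]: each row gets its own dynamic scale.
    scale = x.abs().amax(dim=-1, keepdim=True).clamp(min=1e-12) / finfo.max
    x_q = (x / scale).clamp(finfo.min, finfo.max).to(dtype)
    return x_q, scale
```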