Unverified Commit 62797440 authored by Chang Su, committed by GitHub

[Lint] Add `python/sglang` to ruff F401 checks and remove unused imports in files (#11685)

parent 2614adf9
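
All of the hunks below apply the same two-part fix for ruff's F401 rule ("imported but unused"): imports that nothing references are deleted (or the import list is narrowed), while imports that exist only for their side effects, such as registering custom ops or probing whether an optional dependency is installed, are kept and annotated with `# noqa: F401`. The ruff configuration change that extends the F401 check scope to `python/sglang` is part of this commit but does not appear in the excerpt below. A minimal sketch of the two cases, using a hypothetical optional dependency rather than any module from this diff:

    # Case 1: a genuinely unused import is deleted (or the list is narrowed),
    # e.g. `from typing import Any, Optional` becomes `from typing import Optional`
    # once Any is no longer referenced anywhere in the module.

    # Case 2: an import kept only for its side effects, or as an availability probe,
    # is preserved, with F401 suppressed explicitly at the import site.
    try:
        import some_optional_kernel_lib  # noqa: F401  (hypothetical; imported only to register its ops)
        KERNEL_LIB_AVAILABLE = True
    except ImportError:
        KERNEL_LIB_AVAILABLE = False

A per-line `# noqa: F401` keeps the suppression local to the one import that is intentionally unused, instead of exempting the whole file from the rule.
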
from __future__ import annotations
from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, Dict, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, Optional
import torch
import torch.nn.functional as F
@@ -547,7 +547,7 @@ class Indexer(CustomOp):
forward_batch: ForwardBatch,
layer_id: int,
) -> torch.Tensor:
-import custom_ops
+import custom_ops # noqa: F401
import torch_npu
from sglang.srt.layers.dp_attention import (
......
from __future__ import annotations
-import sys
from dataclasses import dataclass
from typing import TYPE_CHECKING, Dict, List, Literal, Optional, TypeAlias
@@ -34,18 +33,18 @@ _is_hip = is_hip()
if _is_hip:
try:
-from aiter import (
+from aiter import ( # noqa: F401
flash_attn_varlen_func,
mha_batch_prefill_func,
paged_attention_ragged,
)
-from aiter.mla import mla_decode_fwd, mla_prefill_fwd
+from aiter.mla import mla_decode_fwd, mla_prefill_fwd # noqa: F401
except ImportError:
print(
"aiter is AMD specific kernel library. Please make sure aiter is installed on your AMD device."
)
else:
-from sgl_kernel.flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
+from sgl_kernel.flash_attn import flash_attn_with_kvcache
@dataclass(frozen=True)
......
@@ -372,4 +372,4 @@ if not (
logger.info(
"sgl-kernel layernorm implementation is not available on current platform. Fallback to other kernel libraries."
)
-from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm
+from vllm.model_executor.layers.layernorm import GemmaRMSNorm, RMSNorm # noqa: F401
@@ -116,8 +116,6 @@ def cutlass_fused_experts_fp8(
if is_cuda:
from sglang.srt.layers.quantization.fp8_kernel import (
-per_group_transpose,
-per_token_group_quant_fp8_hopper_moe_mn_major,
sglang_per_token_group_quant_fp8,
)
......
# SPDX-License-Identifier: Apache-2.0
"""Cutlass W4A8 MoE kernel."""
import logging
from typing import Optional
import torch
......
import logging
from typing import List, Optional
import torch
import triton
from sglang.srt.layers.quantization.fp8_kernel import per_token_group_quant_fp8
-from sglang.srt.utils import ceil_div, dispose_tensor, is_cuda
-from sglang.utils import is_in_ci
+from sglang.srt.utils import ceil_div, is_cuda
logger = logging.getLogger(__name__)
......
-from typing import Any, Dict, Optional, Union
+from typing import Optional, Union
import torch
from flashinfer.cute_dsl.blockscaled_gemm import grouped_gemm_nt_masked
......
@@ -43,13 +43,7 @@ from sglang.srt.utils import (
)
if is_flashinfer_available():
-from flashinfer import (
-RoutingMethodType,
-fp4_quantize,
-reorder_rows_for_gated_act_gemm,
-shuffle_matrix_a,
-shuffle_matrix_sf_a,
-)
+from flashinfer import RoutingMethodType, fp4_quantize
_is_hip = is_hip()
_is_cpu_amx_available = cpu_has_amx_support()
......
@@ -51,7 +51,9 @@ elif _is_hip:
if _is_cuda or _is_hip:
-from sgl_kernel import moe_align_block_size as sgl_moe_align_block_size
+from sgl_kernel import ( # noqa: F401
+moe_align_block_size as sgl_moe_align_block_size,
+)
@dataclass
......
@@ -2,7 +2,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from enum import IntEnum
from functools import cache
from typing import Optional
import torch
......
@@ -3,7 +3,7 @@ from __future__ import annotations
import logging
from contextlib import nullcontext
from dataclasses import dataclass
-from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Tuple, Union
+from typing import TYPE_CHECKING, List, NamedTuple, Optional, Tuple, Union
from sglang.srt.eplb.expert_distribution import get_global_expert_distribution_recorder
from sglang.srt.layers.moe.token_dispatcher.base import (
......
@@ -22,7 +22,7 @@ try:
except ImportError:
use_mooncake_ep = False
-from enum import Enum, IntEnum, auto
+from enum import Enum, auto
import torch
import torch.distributed as dist
......
@@ -3,7 +3,7 @@ from __future__ import annotations
import logging
import warnings
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
import torch
......
@@ -3,7 +3,6 @@ from __future__ import annotations
import inspect
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Type
import torch
......
@@ -5,7 +5,7 @@ from __future__ import annotations
import enum
import logging
from enum import Enum
-from typing import TYPE_CHECKING, List, Optional
+from typing import TYPE_CHECKING, List
import torch
from compressed_tensors import CompressionFormat
@@ -21,14 +21,7 @@ from sglang.srt.layers.quantization.utils import (
per_tensor_dequantize,
replace_parameter,
)
-from sglang.srt.utils import (
-get_bool_env_var,
-is_cpu,
-is_cuda,
-is_hip,
-is_npu,
-set_weight_attrs,
-)
+from sglang.srt.utils import get_bool_env_var, is_hip, set_weight_attrs
if TYPE_CHECKING:
from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
@@ -49,7 +42,7 @@ if _use_aiter:
from sglang.srt.layers.moe.rocm_moe_utils import rocm_fused_experts_tkw1
try:
-import vllm
+import vllm # noqa: F401
VLLM_AVAILABLE = True
except ImportError:
......
@@ -12,7 +12,7 @@ def _compute_enable_deep_gemm():
return False
try:
-import deep_gemm
+import deep_gemm # noqa: F401
except ImportError:
return False
......
@@ -5,7 +5,7 @@ from typing import Tuple
import torch
from sglang.srt.layers.quantization.deep_gemm_wrapper import compile_utils
-from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import (
+from sglang.srt.layers.quantization.deep_gemm_wrapper.configurer import ( # noqa: F401
DEEPGEMM_BLACKWELL,
DEEPGEMM_SCALE_UE8M0,
ENABLE_JIT_DEEPGEMM,
@@ -17,7 +17,7 @@ logger = logging.getLogger(__name__)
if ENABLE_JIT_DEEPGEMM:
import deep_gemm
-from deep_gemm.utils.layout import get_mn_major_tma_aligned_tensor
+from deep_gemm.utils.layout import get_mn_major_tma_aligned_tensor # noqa: F401
_SANITY_CHECK = get_bool_env_var("SGLANG_DEEPGEMM_SANITY_CHECK")
......
@@ -67,7 +67,7 @@ if _is_hip:
raise ImportError("aiter is required when SGLANG_USE_AITER is set to True")
else:
try:
-import vllm._C
+import vllm._C # noqa: F401
except ImportError:
raise ImportError("vllm is required when SGLANG_USE_AITER is set to False")
......
@@ -11,7 +11,6 @@ from torch.nn.parameter import Parameter
from sglang.srt.layers.linear import LinearBase
from sglang.srt.layers.parameter import ChannelQuantScaleParameter, ModelWeightParameter
from sglang.srt.layers.quantization.base_config import (
-FusedMoEMethodBase,
LinearMethodBase,
QuantizationConfig,
QuantizeMethodBase,
@@ -28,7 +27,7 @@ from sglang.srt.layers.quantization.marlin_utils_fp8 import (
prepare_fp8_layer_for_marlin,
)
from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
-from sglang.srt.layers.quantization.utils import is_layer_skipped, replace_parameter
+from sglang.srt.layers.quantization.utils import is_layer_skipped
from sglang.srt.utils import get_bool_env_var, is_cuda
_is_cuda = is_cuda()
......
@@ -199,7 +199,6 @@ class GPTQConfig(QuantizationConfig):
self, layer: torch.nn.Module, prefix: str
) -> Optional[LinearMethodBase]:
# Delay the import to avoid circular dependency
-from sglang.srt.layers.linear import LinearBase
from sglang.srt.layers.moe.fused_moe_triton import FusedMoE
if isinstance(layer, FusedMoE):
......