Unverified Commit 62797440 authored by Chang Su, committed by GitHub

[Lint] Add `python/sglang` to ruff F401 checks and remove unused imports in files (#11685)

parent 2614adf9
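
Ruff's F401 rule flags names that are imported but never referenced in the module. Each hunk below either narrows an import list to the names actually used or deletes the import outright; the checks can be reproduced locally with `ruff check --select F401 python/sglang` (the repository's actual lint configuration is not shown in this diff). For illustration only — a hypothetical snippet modeled on the first hunk, not code from this commit — F401 fires like this:

```python
# Hypothetical module used only to illustrate ruff's F401 rule.
from sglang.srt.utils import get_bool_env_var, get_device_name, is_cuda

_is_cuda = is_cuda()         # is_cuda is referenced: not flagged
device = get_device_name()  # get_device_name is referenced: not flagged
# get_bool_env_var is never referenced, so ruff reports something like:
#   F401 `sglang.srt.utils.get_bool_env_var` imported but unused
```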
@@ -8,7 +8,7 @@ import torch
 import triton
 import triton.language as tl
-from sglang.srt.utils import get_bool_env_var, get_device_name, is_cuda
+from sglang.srt.utils import get_device_name, is_cuda
 _is_cuda = is_cuda()
 if _is_cuda:
...
@@ -1059,16 +1059,7 @@ class ModelOptNvFp4FusedMoEMethod(FusedMoEMethodBase):
         intermediate_size,
         num_experts,
     ):
-        from flashinfer import (
-            RoutingMethodType,
-            e2m1_and_ufp8sf_scale_to_float,
-            fp4_quantize,
-            next_positive_power_of_2,
-            nvfp4_block_scale_interleave,
-            reorder_rows_for_gated_act_gemm,
-            shuffle_matrix_a,
-            shuffle_matrix_sf_a,
-        )
+        from flashinfer import nvfp4_block_scale_interleave
         from flashinfer.fused_moe.core import (
             _maybe_get_cached_w2_permute_indices,
             _maybe_get_cached_w3_w1_permute_indices,
...
@@ -2,7 +2,7 @@
 import logging
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Dict, List, Optional
 import regex as re
 import torch
...
@@ -3,16 +3,16 @@
 from __future__ import annotations
 import logging
-from typing import TYPE_CHECKING, Any, Callable, Optional
+from typing import TYPE_CHECKING, Any
 import torch
-from aiter import ActivationType, QuantType, biased_grouped_topk
+from aiter import ActivationType, QuantType
 from aiter.fused_moe import fused_moe
 from aiter.utility.fp4_utils import e8m0_shuffle
 from sglang.srt.layers.moe import MoeRunnerConfig
 from sglang.srt.layers.quantization.base_config import FusedMoEMethodBase
-from sglang.srt.utils import get_bool_env_var, is_hip, mxfp_supported, set_weight_attrs
+from sglang.srt.utils import is_hip, set_weight_attrs
 if TYPE_CHECKING:
     from sglang.srt.layers.moe.token_dispatcher import (
...
@@ -2,20 +2,13 @@
 from typing import Any, Callable, Optional
 import aiter
 import torch
-import torch.nn.functional as F
-from aiter.ops.gemm_op_a4w4 import gemm_a4w4
-from aiter.ops.shuffle import shuffle_weight
-from aiter.ops.triton.gemm_afp4wfp4 import gemm_afp4wfp4
-from aiter.ops.triton.gemm_afp4wfp4_pre_quant_atomic import gemm_afp4wfp4_pre_quant
 from aiter.ops.triton.quant import dynamic_mxfp4_quant
-from aiter.utility import dtypes
 from aiter.utility.fp4_utils import e8m0_shuffle
 from sglang.srt.layers.parameter import GroupQuantScaleParameter, PackedvLLMParameter
 from sglang.srt.layers.quantization.quark.schemes import QuarkScheme
-from sglang.srt.utils import get_bool_env_var
 __all__ = ["QuarkW4A4MXFP4"]
...
@@ -11,7 +11,6 @@ import numpy
 import torch
 from sglang.srt.layers.quantization.fp8_kernel import scaled_fp8_quant
-from sglang.srt.utils import is_cuda
 if TYPE_CHECKING:
     from sglang.srt.layers.quantization.base_config import QuantizationConfig
...
 from __future__ import annotations
 import logging
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, Optional
 import torch
 from torch.nn import Module
 from torch.nn.parameter import Parameter
 from sglang.srt.distributed.parallel_state import get_moe_expert_parallel_world_size
-from sglang.srt.layers.linear import LinearBase, UnquantizedLinearMethod
+from sglang.srt.layers.linear import UnquantizedLinearMethod
 from sglang.srt.layers.quantization.base_config import (
     FusedMoEMethodBase,
     QuantizationConfig,
@@ -17,11 +16,11 @@ from sglang.srt.layers.quantization.base_config import (
 from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod
 from sglang.srt.layers.quantization.unquant import UnquantizedLinearMethod
 from sglang.srt.layers.quantization.utils import is_layer_skipped
-from sglang.srt.utils import is_npu, set_weight_attrs
+from sglang.srt.utils import set_weight_attrs
 if TYPE_CHECKING:
     from sglang.srt.layers.moe import MoeRunnerConfig
-    from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE, EPMoE
+    from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE
     from sglang.srt.layers.moe.token_dispatcher import (
         CombineInput,
         DeepEPNormalOutput,
...
 from __future__ import annotations
 import importlib
 import sys
 from types import MappingProxyType
-from typing import (
-    TYPE_CHECKING,
-    Any,
-    Callable,
-    Dict,
-    List,
-    Mapping,
-    Optional,
-    Tuple,
-    Union,
-    cast,
-)
+from typing import TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, Union, cast
 import torch
 from torch.nn.parameter import Parameter
-from sglang.srt.distributed import (
-    get_tensor_model_parallel_rank,
-    get_tensor_model_parallel_world_size,
-)
+from sglang.srt.distributed import get_tensor_model_parallel_world_size
 from sglang.srt.layers.amx_utils import _amx_process_weight_after_loading
 from sglang.srt.layers.moe import MoeRunner, MoeRunnerBackend, MoeRunnerConfig
 from sglang.srt.layers.moe.moe_runner.triton import TritonMoeQuantInfo
...
 import logging
 import re
 from functools import lru_cache
 import torch
...
@@ -11,7 +11,6 @@ from sglang.srt.lora.triton_ops import (
 )
 from sglang.srt.lora.utils import LoRABatchInfo
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch
-from sglang.srt.server_args import ServerArgs
 class TritonLoRABackend(BaseLoRABackend):
...
@@ -20,7 +20,7 @@ import logging
 import time
 from abc import ABC, abstractmethod
 from collections import OrderedDict
-from typing import Any, Dict, List, Optional, Set
+from typing import Optional, Set
 logger = logging.getLogger(__name__)
...
@@ -16,7 +16,7 @@
 # and "Punica: Multi-Tenant LoRA Serving"
 import logging
-from typing import Dict, Iterable, List, Optional, Set, Tuple
+from typing import Dict, Iterable, List, Optional
 import torch
...
@@ -14,11 +14,10 @@ limitations under the License.
 """
 import logging
-import math
 import threading
 import time
-from queue import Empty, Full, PriorityQueue, Queue
-from typing import TYPE_CHECKING, List, NamedTuple, Optional, Set, Tuple
+from queue import Empty, Full, Queue
+from typing import TYPE_CHECKING, List, NamedTuple, Optional
 import torch
@@ -41,7 +40,7 @@ from sglang.srt.layers.dp_attention import (
     get_attention_tp_size,
     is_dp_attention_enabled,
 )
-from sglang.srt.mem_cache.memory_pool import MHATokenToKVPool, MLATokenToKVPool
+from sglang.srt.mem_cache.memory_pool import MLATokenToKVPool
 logger = logging.getLogger(__name__)
...
@@ -59,11 +59,10 @@ from sglang.srt.mem_cache.allocator import (
     SWATokenToKVPoolAllocator,
 )
 from sglang.srt.mem_cache.base_prefix_cache import BasePrefixCache
-from sglang.srt.mem_cache.chunk_cache import ChunkCache, SWAChunkCache
+from sglang.srt.mem_cache.chunk_cache import SWAChunkCache
 from sglang.srt.mem_cache.common import (
     alloc_for_decode,
     alloc_for_extend,
-    alloc_token_slots,
     evict_from_tree_cache,
 )
 from sglang.srt.mem_cache.mamba_radix_cache import MambaRadixCache
@@ -76,7 +75,6 @@ from sglang.srt.sampling.sampling_batch_info import SamplingBatchInfo
 from sglang.srt.sampling.sampling_params import SamplingParams
 from sglang.srt.server_args import ServerArgs, get_global_server_args
 from sglang.srt.utils import flatten_nested_list
-from sglang.srt.utils.common import next_power_of_2
 if TYPE_CHECKING:
     from sglang.srt.configs.model_config import ModelConfig
...
@@ -3,13 +3,10 @@ from __future__ import annotations
 import logging
 import time
-from collections import defaultdict
-from typing import TYPE_CHECKING, Dict, List, Optional, Union
-import torch
+from typing import TYPE_CHECKING, List, Optional
 from sglang.srt.disaggregation.kv_events import EventPublisherFactory, KVEventBatch
 from sglang.srt.disaggregation.utils import DisaggregationMode
-from sglang.srt.managers.io_struct import TokenizedGenerateReqInput
 from sglang.srt.managers.schedule_policy import PrefillAdder
 from sglang.srt.managers.scheduler import Req, ScheduleBatch
 from sglang.srt.metrics.collector import SchedulerMetricsCollector, SchedulerStats
...
@@ -16,7 +16,6 @@
 import asyncio
 import copy
 import dataclasses
-import json
 import logging
 import math
 import os
...
 from __future__ import annotations
 import logging
 import multiprocessing as mp
-from typing import TYPE_CHECKING, Dict, List, Optional
+from typing import TYPE_CHECKING, Optional
 from sglang.srt.layers.logits_processor import LogitsProcessorOutput
 from sglang.srt.managers.schedule_batch import Req
...
@@ -92,7 +92,7 @@ class AscendPagedTokenToKVPoolAllocator(PagedTokenToKVPoolAllocator):
 )
 if num_new_pages_item < 200:
-    import sgl_kernel_npu
+    import sgl_kernel_npu  # noqa: F401
     torch.ops.npu.alloc_extend(
         prefix_lens,
...
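
The hunk above is the one place in this commit where the import is suppressed rather than removed: importing `sgl_kernel_npu` is needed only for its side effect of registering the NPU custom ops with torch, so the module name itself is never referenced and F401 would otherwise flag it. A minimal sketch of the pattern (argument list abbreviated; the real call site passes more arguments than shown):

```python
import torch

def alloc_extend_via_npu_kernel(prefix_lens: torch.Tensor, *args) -> None:
    # Importing the kernel package registers torch.ops.npu.alloc_extend
    # as a side effect; the module name is otherwise unused, so the noqa
    # comment tells ruff not to report it as an unused import.
    import sgl_kernel_npu  # noqa: F401

    torch.ops.npu.alloc_extend(prefix_lens, *args)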
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any, List, NamedTuple, Optional, Tuple
+from typing import TYPE_CHECKING, Any, NamedTuple, Optional, Tuple
 import torch
...
 from __future__ import annotations
 from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, List, Tuple, Union
+from typing import TYPE_CHECKING, Tuple, Union
 if TYPE_CHECKING:
     from sglang.srt.mem_cache.radix_cache import TreeNode
...
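
Several of the files touched here, including the last hunk above, keep typing-only imports inside an `if TYPE_CHECKING:` block. This pattern plays well with F401: the import never runs at runtime, and ruff counts a name used in an annotation as a use, so it is not reported as unused. A minimal sketch (the `visit` function is hypothetical, added only for illustration):

```python
from __future__ import annotations

from typing import TYPE_CHECKING

if TYPE_CHECKING:
    # Imported for type annotations only; never executed at runtime.
    from sglang.srt.mem_cache.radix_cache import TreeNode

def visit(node: TreeNode) -> None:
    # With postponed annotation evaluation, "TreeNode" stays a string at
    # runtime, and ruff treats the annotation as a use, so F401 is quiet.
    ...
```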