Unverified commit 2ce87935, authored by applesaucethebun and committed by GitHub

Add typo checker in pre-commit (#6179)


Co-authored-by: Brayden Zhong <b8zhong@uwaterloo.ca>
parent de167cf5
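
The commit title says it adds a typo checker to the pre-commit hooks; the hook configuration itself is not part of the hunks shown below, which only contain the spelling corrections across the tree. As an illustration only (the actual hook, repository, and revision chosen in #6179 are not visible in this excerpt), a spell-check hook such as codespell can be wired into .pre-commit-config.yaml roughly like this:

    # Illustrative sketch only -- the hook choice, revision, and args here are
    # assumptions, not the actual configuration added by this commit.
    repos:
      - repo: https://github.com/codespell-project/codespell
        rev: v2.3.0                             # placeholder revision
        hooks:
          - id: codespell
            args: ["--write-changes"]           # let the hook auto-fix simple misspellings
            additional_dependencies: ["tomli"]  # allow reading ignore/skip settings from pyproject.toml

With a hook like this installed, running "pre-commit run --all-files" flags misspellings such as "collum", "divisable", and "Avaliable", which are exactly the kinds of corrections shown in the hunks below.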
@@ -235,7 +235,7 @@ class Fp8LinearMethod(LinearMethodBase):
 f"{input_size_per_partition} is not divisible by "
 f"weight quantization block_k = {block_k}."
 )
-# Required by collum parallel or enabling merged weights
+# Required by column parallel or enabling merged weights
 if (
 tp_size > 1 and output_size // output_size_per_partition == tp_size
 ) or len(output_partition_sizes) > 1:
@@ -491,7 +491,7 @@ class Fp8MoEMethod:
 self.quant_config.weight_block_size[1],
 )
 # NOTE(HandH1998): To ensure proper alignment of the block-wise quantization scales, the output_size of the weights for both the gate and up layers must be divisible by block_n.
-# Required by collum parallel or enabling merged weights
+# Required by column parallel or enabling merged weights
 if intermediate_size % block_n != 0:
 raise ValueError(
 f"The output_size of gate's and up's weight = "
...
@@ -104,7 +104,7 @@ def _per_token_group_quant_fp8(
 y_s_ptr,
 # Stride of input
 y_stride,
-# Collums of input
+# Columns of input
 N,
 # Avoid to divide zero
 eps,
@@ -342,7 +342,7 @@ def _static_quant_fp8(
 y_s_repeat_ptr,
 # Stride of input
 y_stride,
-# Collums of input
+# Columns of input
 N,
 # Information for float8
 fp8_min,
@@ -794,7 +794,7 @@ def w8a8_block_fp8_matmul(
 config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
 else:
 # Default config
-# Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
+# Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
 config = {
 "BLOCK_SIZE_M": 64,
 "BLOCK_SIZE_N": block_size[0],
...
@@ -76,7 +76,7 @@ def _per_token_group_quant_int8(
 y_s_ptr,
 # Stride of input
 y_stride,
-# Collums of input
+# Columns of input
 N,
 # Avoid to divide zero
 eps,
@@ -370,7 +370,7 @@ def w8a8_block_int8_matmul(
 config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
 else:
 # Default config
-# Block-wise quant: BLOCK_SIZE_K must be divisable by block_size[1]
+# Block-wise quant: BLOCK_SIZE_K must be divisible by block_size[1]
 config = {
 "BLOCK_SIZE_M": 64,
 "BLOCK_SIZE_N": block_size[0],
...
@@ -100,7 +100,7 @@ class LoRAManager:
 self.configs[name] = LoRAConfig(path)
 self.hf_target_names.update(self.configs[name].target_modules)
-# Target lora weight names for lora_a and lora_b modules repectively.
+# Target lora weight names for lora_a and lora_b modules respectively.
 # e.g., {("qkv_proj", "q_proj"), ("qkv_proj", "kv_proj")}
 self.lora_weight_names: Set[Tuple[str]] = set(
 [get_stacked_name(module) for module in self.hf_target_names]
...
@@ -50,15 +50,15 @@ class LoRAMemoryPool:
 self.uid_to_buffer_id: Dict[Optional[str], int] = {}
 # Buffer idx -> lora uid in memory pool
-# All uids are initalized as empty strings for empty buffer slots
-# Here we don't initalize to None since None is a valid uid
+# All uids are initialized as empty strings for empty buffer slots
+# Here we don't initialize to None since None is a valid uid
 self.buffer_id_to_uid: List[Optional[str]] = [""] * self.max_loras_per_batch
 def get_lora_A_shape(
 self, module_name: str, base_model: torch.nn.Module
 ) -> Tuple[int]:
 """
-Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
+Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
 """
 input_dim, _ = get_hidden_dim(module_name, self.base_hf_config, base_model)
 c = get_stacked_multiply(module_name)
@@ -75,7 +75,7 @@ class LoRAMemoryPool:
 self, module_name: str, base_model: torch.nn.Module
 ) -> Tuple[int]:
 """
-Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
+Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
 """
 _, output_dim = get_hidden_dim(module_name, self.base_hf_config, base_model)
 c = get_stacked_multiply(module_name)
...
@@ -77,7 +77,7 @@ def _gate_up_lora_b_kernel(
 k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
 )
-# Iteate to compute the block in output matrix
+# Iterate to compute the block in output matrix
 partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
 for k in range(0, tl.cdiv(K, BLOCK_K)):
 x_tile = tl.load(
...
@@ -79,7 +79,7 @@ def _qkv_lora_b_kernel(
 k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
 )
-# Iteate to compute the block in output matrix
+# Iterate to compute the block in output matrix
 partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
 for k in range(0, tl.cdiv(K, BLOCK_K)):
 x_tile = tl.load(
...
@@ -67,7 +67,7 @@ def _sgemm_lora_a_kernel(
 k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
 )
-# Iteate to compute the block in output matrix
+# Iterate to compute the block in output matrix
 partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
 for k in range(0, tl.cdiv(K, BLOCK_K)):
 x_tile = tl.load(
...
@@ -69,7 +69,7 @@ def _sgemm_lora_b_kernel(
 k_offset[:, None] * w_stride_2 + n_offset[None, :] * w_stride_1
 )
-# Iteate to compute the block in output matrix
+# Iterate to compute the block in output matrix
 partial_sum = tl.zeros((BLOCK_S, BLOCK_N), dtype=tl.float32)
 for k in range(0, tl.cdiv(K, BLOCK_K)):
 x_tile = tl.load(
...
@@ -79,7 +79,7 @@ def get_hidden_dim(
 module_name: str, config: AutoConfig, base_model: torch.nn.Module
 ) -> Tuple[int]:
 """
-Given a module_name (might be a stacked name), return the hidden dims of modules's input and output.
+Given a module_name (might be a stacked name), return the hidden dims of modules' input and output.
 """
 if hasattr(base_model, "get_hidden_dim"):
...
@@ -210,7 +210,7 @@ class DataParallelController:
 )
 # compute zmq ports for this dp rank
 rank_port_args = PortArgs.init_new(server_args, dp_rank)
-# Data parallelism resues the tensor parallelism group,
+# Data parallelism reuses the tensor parallelism group,
 # so all dp ranks should use the same nccl port.
 rank_port_args.nccl_port = port_args.nccl_port
...
@@ -12,7 +12,7 @@
 # limitations under the License.
 # ==============================================================================
 """
-The definition of objects transfered between different
+The definition of objects transferred between different
 processes (TokenizerManager, DetokenizerManager, Controller).
 """
...
@@ -51,7 +51,7 @@ class MultiModalityDataPaddingPatternTokenPairs(MultiModalityDataPaddingPattern)
 self, input_ids: List[int], mm_inputs: MultimodalInputs
 ) -> List[int]:
 """
-This function will replace the data-tokens inbetween with pad_values accordingly
+This function will replace the data-tokens in between with pad_values accordingly
 """
 pad_values = [item.pad_value for item in mm_inputs.mm_items]
 data_token_pairs = self.data_token_id_pairs
...
@@ -879,7 +879,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 error_msg = (
 f"{phase_str} out of memory. Try to lower your batch size.\n"
 f"Try to allocate {num_tokens} tokens.\n"
-f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
+f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
 )
 logger.error(error_msg)
 if self.tree_cache is not None:
@@ -920,7 +920,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 error_msg = (
 f"Prefill out of memory. Try to lower your batch size.\n"
 f"Try to allocate {extend_num_tokens} tokens.\n"
-f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
+f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
 f"{self.token_to_kv_pool_allocator.available_size()=}\n"
 f"{self.tree_cache.evictable_size()=}\n"
 )
@@ -955,7 +955,7 @@ class ScheduleBatch(ScheduleBatchDisaggregationDecodeMixin):
 error_msg = (
 f"Decode out of memory. Try to lower your batch size.\n"
 f"Try to allocate {len(seq_lens)} tokens.\n"
-f"Avaliable tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
+f"Available tokens: {self.token_to_kv_pool_allocator.available_size() + self.tree_cache.evictable_size()}\n"
 f"{self.token_to_kv_pool_allocator.available_size()=}\n"
 f"{self.tree_cache.evictable_size()=}\n"
 )
...
@@ -1325,7 +1325,7 @@ class Scheduler(
 return None
 running_bs = len(self.running_batch.reqs)
-# Igore the check if self.chunked_req is not None.
+# Ignore the check if self.chunked_req is not None.
 # In the non-PP case, when self.chunked_req is not None, num_allocatable_reqs should always be greater than 0,
 # as the space for the chunked request has just been released.
 # In PP case, a chunked req can start in one microbatch and end in another microbatch, so the max_running_requests per microbatch should not be strict.
...
@@ -1273,7 +1273,7 @@ class TokenizerManager:
 self.model_update_result.set_result(recv_obj)
 else: # self.server_args.dp_size > 1
 self.model_update_tmp.append(recv_obj)
-# set future if the all results are recevied
+# set future if the all results are received
 if len(self.model_update_tmp) == self.server_args.dp_size:
 self.model_update_result.set_result(self.model_update_tmp)
...
@@ -296,12 +296,12 @@ class CudaGraphRunner:
 self.capture()
 except RuntimeError as e:
 raise Exception(
-f"Capture cuda graph failed: {e}\n"
+f"Capture CUDA graph failed: {e}\n"
 "Possible solutions:\n"
 "1. set --mem-fraction-static to a smaller value (e.g., 0.8 or 0.7)\n"
 "2. set --cuda-graph-max-bs to a smaller value (e.g., 16)\n"
 "3. disable torch compile by not using --enable-torch-compile\n"
-"4. disable cuda graph by --disable-cuda-graph. (Not recommonded. Huge perf loss)\n"
+"4. disable CUDA graph by --disable-cuda-graph. (Not recommended. Huge performance loss)\n"
 "Open an issue on GitHub https://github.com/sgl-project/sglang/issues/new/choose \n"
 )
...
@@ -58,7 +58,7 @@ class ForwardMode(IntEnum):
 DECODE = auto()
 # Contains both EXTEND and DECODE when doing chunked prefill.
 MIXED = auto()
-# No sequence to forward. For data parallel attention, some workers wil be IDLE if no sequence are allocated.
+# No sequence to forward. For data parallel attention, some workers will be IDLE if no sequence are allocated.
 IDLE = auto()
 # Used in speculative decoding: verify a batch in the target model.
...
@@ -188,7 +188,7 @@ def trunc_normal_tf_(
 best when :math:`a \\leq \text{mean} \\leq b`.
 NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
 bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
-and the result is subsquently scaled and shifted by the mean and std args.
+and the result is subsequently scaled and shifted by the mean and std args.
 Args:
 tensor: an n-dimensional `torch.Tensor`
 mean: the mean of the normal distribution
@@ -735,7 +735,7 @@ class VisionTransformer(nn.Module):
 img_size: Input image size.
 patch_size: Patch size.
 in_chans: Number of image input channels.
-num_classes: Mumber of classes for classification head.
+num_classes: Number of classes for classification head.
 global_pool: Type of global pooling for final sequence (default: 'token').
 embed_dim: Transformer embedding dimension.
 depth: Depth of transformer.
...
@@ -1287,7 +1287,7 @@ class DeepseekV2DecoderLayer(nn.Module):
 # Fully Connected
 hidden_states = self.mlp(hidden_states)
-# TODO(ch-wan): ues reduce-scatter in MLP to avoid this scatter
+# TODO(ch-wan): use reduce-scatter in MLP to avoid this scatter
 # Scatter
 if self.dp_size != 1:
 # important: forward batch.gathered_buffer is used both after scatter and after gather.
@@ -1499,7 +1499,7 @@ class DeepseekV2ForCausalLM(nn.Module):
 else:
 assert (
 self.n_share_experts_fusion == self.tp_size
-), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performace."
+), f"Shared experts fusion optimization is enabled in DeepSeek V3/R1, set it to {self.tp_size} can get best optimized performance."
 elif self.n_share_experts_fusion == 0:
 if (
 _is_cuda
@@ -1665,7 +1665,7 @@ class DeepseekV2ForCausalLM(nn.Module):
 if is_nextn:
 if hasattr(self.config, "num_nextn_predict_layers"):
 num_nextn_layers = self.config.num_nextn_predict_layers
-assert num_nextn_layers == 1, "Only 1 nextn layer is supportted"
+assert num_nextn_layers == 1, "Only 1 nextn layer is supported"
 # compatible with old design
 nextn_layer_id = (
 0
...