Revert "fix some typos" (#6244)

e8e18dcd · Lianmin Zheng · GitHub · bad7c26f · e8e18dcd · e8e18dcd
Unverified Commit e8e18dcd authored May 12, 2025 by Lianmin Zheng Committed by GitHub May 12, 2025
15 changed files
--- a/python/sglang/srt/speculative/eagle_worker.py
+++ b/python/sglang/srt/speculative/eagle_worker.py
@@ -78,7 +78,7 @@ class EAGLEWorker(TpModelWorker):
        # Override context length with target model's context length
        server_args.context_length = target_worker.model_runner.model_config.context_len
-        # Do not capture CUDA graph in `super().__init__()`
+        # Do not capture cuda graph in `super().__init__()`
        # It will be captured later.
        backup_disable_cuda_graph = server_args.disable_cuda_graph
        server_args.disable_cuda_graph = True
@@ -136,7 +136,7 @@ class EAGLEWorker(TpModelWorker):
            # Share the embedding and lm_head
            self.draft_model_runner.model.set_embed_and_head(embed, head)
-        # Init attention backend and CUDA graphs
+        # Init attention backend and cuda graphs
        self.draft_model_runner.server_args.disable_cuda_graph = (
            backup_disable_cuda_graph
        )
@@ -148,7 +148,7 @@ class EAGLEWorker(TpModelWorker):
            self.init_cuda_graphs()
    def init_attention_backend(self):
-        # Create multi-step attn backends and CUDA graph runners
+        # Create multi-step attn backends and cuda graph runners
        if self.server_args.attention_backend == "flashinfer":
            if not global_server_args_dict["use_mla_backend"]:
                from sglang.srt.layers.attention.flashinfer_backend import (
@@ -207,7 +207,7 @@ class EAGLEWorker(TpModelWorker):
        self.draft_model_runner.draft_attn_backend = self.draft_attn_backend
    def init_cuda_graphs(self):
-        """Capture CUDA graphs."""
+        """Capture cuda graphs."""
        self.cuda_graph_runner = None
        self.cuda_graph_runner_for_draft_extend = None
@@ -218,12 +218,12 @@ class EAGLEWorker(TpModelWorker):
        tic = time.time()
        before_mem = get_available_gpu_memory(self.device, self.gpu_id)
        logger.info(
-            f"Capture draft CUDA graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
+            f"Capture draft cuda graph begin. This can take up to several minutes. avail mem={before_mem:.2f} GB"
        )
        self.cuda_graph_runner = EAGLEDraftCudaGraphRunner(self)
        after_mem = get_available_gpu_memory(self.device, self.gpu_id)
        logger.info(
-            f"Capture draft CUDA graph end. Time elapsed: {time.time() - tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem - after_mem):.2f} GB."
+            f"Capture draft cuda graph end. Time elapsed: {time.time() - tic:.2f} s. avail mem={after_mem:.2f} GB. mem usage={(before_mem - after_mem):.2f} GB."
        )
        # Capture extend

--- a/python/sglang/srt/utils.py
+++ b/python/sglang/srt/utils.py
@@ -1117,7 +1117,7 @@ prometheus_multiproc_dir: tempfile.TemporaryDirectory
 def set_prometheus_multiproc_dir():
    # Set prometheus multiprocess directory
-    # SGLang uses prometheus multiprocess mode
+    # sglang uses prometheus multiprocess mode
    # we need to set this before importing prometheus_client
    # https://prometheus.github.io/client_python/multiprocess/
    global prometheus_multiproc_dir

--- a/python/sglang/test/attention/test_flashattn_backend.py
+++ b/python/sglang/test/attention/test_flashattn_backend.py
@@ -42,7 +42,7 @@ class MockModelRunner:
            "TokenPool",
            (),
            {
-                # A typical max_bs * max_context_len for CUDA graph decode
+                # A typical max_bs * max_context_len for cuda graph decode
                "size": max_batch_size,
                # Add req_to_token attribute
                "req_to_token": torch.zeros(

--- a/python/sglang/test/attention/test_flashattn_mla_backend.py
+++ b/python/sglang/test/attention/test_flashattn_mla_backend.py
@@ -37,7 +37,7 @@ class MockModelRunner:
            "TokenPool",
            (),
            {
-                # A typical max_bs * max_context_len for CUDA graph decode
+                # A typical max_bs * max_context_len for cuda graph decode
                "size": batch_size,
                # Add req_to_token attribute
                "req_to_token": torch.zeros(

--- a/sgl-kernel/README.md
+++ b/sgl-kernel/README.md
@@ -83,11 +83,11 @@ Third-party libraries:
 ### FlashAttention FYI
-  FA3 can fail without a enough shared memory for some shapes, such as higher hidden_dim or some special cases. Right now, FA3 is supported for sm80/sm87 and sm86/sm89.
+  FA3 can fail without a enough shared memory for a some shapes, such as higher hidden_dim or some special cases. Right now, fa3 is supported for sm80/sm87 and sm86/sm89.
  The main difference between sm80/sm87 and sm86/sm89 is the shared memory size. You can follow the link below for more information: https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x.
-  And for sgl-kernel right now, we can build FA3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use FA3.
+  And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a. That means if you use **A100(tested)**/A*0/**L20(tested)**/L40/L40s/**3090(tested)** you can use fa3.
 ### Kernel Development

--- a/sgl-kernel/benchmark/bench_moe_align_block_size.py
+++ b/sgl-kernel/benchmark/bench_moe_align_block_size.py
@@ -177,7 +177,7 @@ def calculate_diff(num_tokens, num_experts=256, block_size=128, topk=8):
    expert_ids_vllm = torch.zeros_like(expert_ids_cuda)
    num_tokens_post_pad_vllm = torch.empty_like(num_tokens_post_pad_cuda)
-    # compare the performance of CUDA, triton and vllm implementation
+    # compare the performance of cuda, triton and vllm implementation
    sgl_moe_align_block_size(
        topk_ids,
        num_experts,
@@ -349,7 +349,7 @@ def benchmark(num_tokens, num_experts, topk, provider):
            ),
            quantiles=quantiles,
        )
-    else:  # vLLM
+    else:  # vllm
        try:
            ms, min_ms, max_ms = triton.testing.do_bench(
                lambda: ops.moe_align_block_size(

--- a/sgl-kernel/csrc/allreduce/custom_all_reduce.cuh
+++ b/sgl-kernel/csrc/allreduce/custom_all_reduce.cuh
@@ -280,8 +280,8 @@ class CustomAllreduce {
  std::unordered_map<void*, RankData*> buffers_;
  Signal* self_sg_;
-  // Stores rank data from all ranks. This is mainly for CUDA graph purposes.
+  // Stores rank data from all ranks. This is mainly for cuda graph purposes.
-  // For CUDA graph to work, all kernel arguments must be fixed during graph
+  // For cuda graph to work, all kernel arguments must be fixed during graph
  // capture time. However, the peer pointers are not known during graph capture
  // time. Therefore, during capture, we increment the rank data pointer and use
  // that as the argument to the kernel. The kernel arguments are stored in
@@ -291,7 +291,7 @@ class CustomAllreduce {
  //
  // The overall process looks like this:
  // 1. Graph capture.
-  // 2. Each rank obtains the IPC handles for each addresses used during CUDA
+  // 2. Each rank obtains the IPC handles for each addresses used during cuda
  // graph capture using get_graph_buffer_ipc_meta.
  // 3. (In Python) all gather the IPC handles.
  // 4. Obtain the peer pointers by opening the IPC handles, and store them in

--- a/sgl-kernel/python/sgl_kernel/__init__.py
+++ b/sgl-kernel/python/sgl_kernel/__init__.py
@@ -65,5 +65,5 @@ from sgl_kernel.speculative import (
 from sgl_kernel.version import __version__
 build_tree_kernel = (
-    None  # TODO(ying): remove this after updating the SGLang python code.
+    None  # TODO(ying): remove this after updating the sglang python code.
 )
--- a/sgl-kernel/python/sgl_kernel/flash_attn.py
+++ b/sgl-kernel/python/sgl_kernel/flash_attn.py
@@ -10,14 +10,14 @@ except:
 def is_fa3_supported(device=None) -> bool:
-    #  There some FA3 FYI
+    #  There some fa3 FYI
    #  FA3 can fail without enough shared memory for some shapes, such as higher
    #  hidden_dim or some special cases.
-    #  Right now, FA3 is supported for sm80/sm87 and sm86/sm89. The main different
+    #  Right now, fa3 is supported for sm80/sm87 and sm86/sm89. The main different
    #  between sm80/sm87 and sm86/sm89 is the shared memory size. You can follow the link below for more information:
    #  https://docs.nvidia.com/cuda/cuda-c-programming-guide/#shared-memory-8-x
-    #  And for sgl-kernel right now, we can build FA3 on sm80/sm86/sm89/sm90a.
+    #  And for sgl-kernel right now, we can build fa3 on sm80/sm86/sm89/sm90a.
-    #  That means if you use A100/A*0/L20/L40/L40s/4090 you can use FA3.
+    #  That means if you use A100/A*0/L20/L40/L40s/4090 you can use fa3.
    return (
        torch.cuda.get_device_capability(device)[0] == 9
        or torch.cuda.get_device_capability(device)[0] == 8

--- a/sgl-kernel/tests/test_merge_state_v2.py
+++ b/sgl-kernel/tests/test_merge_state_v2.py
@@ -197,7 +197,7 @@ def test_merge_attn_states(
    if not torch.cuda.is_available():
        pytest.skip(
            "Currently only support compare triton merge_attn_states "
-            "with custom CUDA merge_attn_states kernel"
+            "with custom cuda merge_attn_states kernel"
        )
    NUM_TOKENS = num_tokens

--- a/test/srt/models/lora/test_lora_cuda_graph.py
+++ b/test/srt/models/lora/test_lora_cuda_graph.py
@@ -47,8 +47,8 @@ TEST_CUDA_GRAPH_PADDING_PROMPTS = [
 class TestLoRACudaGraph(CustomTestCase):
    def _run_without_cuda_graph_on_model_cases(self, model_cases: List[LoRAModelCase]):
-        # Since we have already enabled CUDA graph by default in other LoRA tests,
+        # Since we have already enabled CUDA graph by default in other lora tests,
-        # we only need to run LoRA tests without CUDA graph here.
+        # we only need to run lora tests without CUDA graph here.
        for model_case in model_cases:
            # If skip_long_prompt is True, filter out prompts longer than 1000 characters
            prompts = (

--- a/test/srt/models/lora/utils.py
+++ b/test/srt/models/lora/utils.py
@@ -154,7 +154,7 @@ def run_lora_test_one_by_one(
        model_case (LoRAModelCase): The model case to test.
        torch_dtype (torch.dtype): The torch dtype to use.
        max_new_tokens (int): The maximum number of new tokens to generate.
-        backend (str): The LoRA backend to use.
+        backend (str): The lora backend to use.
        disable_cuda_graph (bool, optional): Whether to disable CUDA graph. Defaults to False.
        disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to True.
        mem_fraction_static (float, optional): The fraction of memory to use. Defaults to 0.88.
@@ -289,7 +289,7 @@ def run_lora_test_by_batch(
    test_tag: str = "",
 ):
    """
-    Run LoRA tests as a batch.
+    Run lora tests as a batch.
    For prompt0, prompt1, ..., promptN,
    we will use adaptor0, adaptor1, ..., adaptorN included in model case,
    We will then compare the outputs of HF and SRT with LoRA.
@@ -301,7 +301,7 @@ def run_lora_test_by_batch(
        model_case (LoRAModelCase): The model case to test.
        torch_dtype (torch.dtype): The torch dtype to use.
        max_new_tokens (int): The maximum number of new tokens to generate.
-        backend (str): The LoRA backend to use.
+        backend (str): The lora backend to use.
        disable_cuda_graph (bool, optional): Whether to disable CUDA graph. Defaults to False.
        disable_radix_cache (bool, optional): Whether to disable radix cache. Defaults to True.
        mem_fraction_static (float, optional): The fraction of memory to use. Defaults to 0.88.
@@ -372,8 +372,8 @@ def run_lora_test_by_batch(
        print("ROUGE-L score:", rouge_score)
        print("SRT output:", srt_output_str)
        print("HF output:", hf_output_str)
-        print("SRT no LoRA output:", srt_no_lora_outputs.output_strs[i].strip())
+        print("SRT no lora output:", srt_no_lora_outputs.output_strs[i].strip())
-        print("HF no LoRA output:", hf_no_lora_outputs.output_strs[i].strip())
+        print("HF no lora output:", hf_no_lora_outputs.output_strs[i].strip())
        assert srt_outputs.output_strs[i].strip(" ") == hf_outputs.output_strs[i].strip(
            " "
        ), (

--- a/test/srt/test_srt_engine_with_quant_args.py
+++ b/test/srt/test_srt_engine_with_quant_args.py
@@ -8,7 +8,7 @@ class TestSRTEngineWithQuantArgs(CustomTestCase):
    def test_1_quantization_args(self):
-        # we only test fp8 because other methods are currently dependent on vLLM. We can add other methods back to test after vLLM dependency is resolved.
+        # we only test fp8 because other methods are currently dependent on vllm. We can add other methods back to test after vllm dependency is resolved.
        quantization_args_list = [
            # "awq",
            "fp8",
@@ -34,7 +34,7 @@ class TestSRTEngineWithQuantArgs(CustomTestCase):
    def test_2_torchao_args(self):
-        # we don't test int8dq because currently there is conflict between int8dq and capture CUDA graph
+        # we don't test int8dq because currently there is conflict between int8dq and capture cuda graph
        torchao_args_list = [
            # "int8dq",
            "int8wo",

--- a/test/srt/test_triton_attention_kernels.py
+++ b/test/srt/test_triton_attention_kernels.py
@@ -277,7 +277,7 @@ class TestTritonAttention(CustomTestCase):
    def test_decode_attention(self):
        # Here we just ensure that there is no error
-        # TODO: correctness test
+        # TODO: correctnesss test
        # Test configurations
        configs = [

--- a/test/srt/test_update_weights_from_distributed.py
+++ b/test/srt/test_update_weights_from_distributed.py
@@ -189,7 +189,7 @@ def init_process_hf(
    print(f"[hf] {rank=} {broadcast_time=:.3f}s")
    param_queue.put(("broadcast_time", broadcast_time))
-    # Delete the HuggingFace models to free up memory.
+    # Delete the huggingface models to free up memory.
    del hf_instruct_model
    del hf_base_model
    gc.collect()