Unverified Commit aa957102 authored by Lianmin Zheng, committed by GitHub

Simplify tests & Fix trtllm custom allreduce registration (#4252)

parent 007f8b3d
@@ -266,7 +266,7 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
-          USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+          # USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
       - name: Benchmark single latency + torch.compile (TP=2)
         timeout-minutes: 10
......
@@ -6,10 +6,12 @@ from typing import List, Tuple
 import torch
 import torch.library
-from sglang.srt.utils import is_hip, is_hpu
+from sglang.srt.utils import get_bool_env_var, is_hip, is_hpu
 logger = logging.getLogger(__name__)
-use_vllm_custom_allreduce = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True)
+use_vllm_custom_allreduce = get_bool_env_var(
+    "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
+)
 if not is_hpu():
     # ROCm does not use vllm custom allreduce
......
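Why this hunk matters: os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True) returns the raw string from the environment (or True when the variable is unset), so exporting USE_VLLM_CUSTOM_ALLREDUCE=0 still produced a truthy value and the vLLM custom allreduce path could never actually be disabled. Below is a minimal sketch of what the get_bool_env_var helper is assumed to do; the real implementation in sglang.srt.utils may differ in detail.

import os


def get_bool_env_var(name: str, default: str = "false") -> bool:
    # Normalize the raw string so that "0", "false", and "False" all read as disabled.
    value = os.getenv(name, default)
    return value.lower() in ("true", "1")


# Old behavior: os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", default=True) yields the
# string "0", which is truthy, so the flag could not be turned off from the shell.
# New behavior: get_bool_env_var("USE_VLLM_CUSTOM_ALLREDUCE", default="true") returns
# False for USE_VLLM_CUSTOM_ALLREDUCE=0 and True when the variable is unset.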
@@ -22,7 +22,7 @@ from sglang.srt.layers.attention.utils import create_flashinfer_kv_indices_trito
 from sglang.srt.layers.dp_attention import get_attention_tp_size
 from sglang.srt.model_executor.forward_batch_info import ForwardBatch, ForwardMode
 from sglang.srt.speculative.eagle_utils import EagleDraftInput, EagleVerifyInput
-from sglang.srt.utils import is_flashinfer_available
+from sglang.srt.utils import get_bool_env_var, is_flashinfer_available
 if TYPE_CHECKING:
     from sglang.srt.layers.radix_attention import RadixAttention
......
@@ -48,6 +48,7 @@ from sglang.srt.model_loader.weight_utils import (
     safetensors_weights_iterator,
 )
 from sglang.srt.utils import (
+    get_bool_env_var,
     get_device_capability,
     is_pin_memory_available,
     set_weight_attrs,
@@ -197,7 +198,7 @@ class DefaultModelLoader(BaseModelLoader):
         Returns the path to the downloaded model, or None if the model is not
         downloaded from ModelScope."""
-        if os.environ.get("SGLANG_USE_MODELSCOPE", None) == "True":
+        if get_bool_env_var("SGLANG_USE_MODELSCOPE"):
            # download model from ModelScope hub,
            # lazy import so that modelscope is not required for normal use.
            # pylint: disable=C.
......
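For SGLANG_USE_MODELSCOPE the switch to get_bool_env_var also loosens the accepted spellings: the old check only fired when the variable was exactly the string "True". A small illustration, assuming the helper behaves as sketched earlier:

import os

# Hypothetical value a user might export; only the old exact-match check is case sensitive.
os.environ["SGLANG_USE_MODELSCOPE"] = "true"

old_check = os.environ.get("SGLANG_USE_MODELSCOPE", None) == "True"       # False
new_check = os.environ["SGLANG_USE_MODELSCOPE"].lower() in ("true", "1")  # True

print(old_check, new_check)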
@@ -100,7 +100,6 @@ void cublas_grouped_gemm(
   check_device_dtype(out_dtype, inputs);
   check_device_dtype(out_dtype, weights);
   check_device_dtype(out_dtype, outputs);
-  cudaDataType_t cuda_data_type = (out_dtype == torch::kHalf ? CUDA_R_16F : CUDA_R_16BF);
   // Weights should be transposed to (n, k) of column major
   std::vector<cublasOperation_t> transa_array(group_count, CUBLAS_OP_T);
@@ -132,7 +131,6 @@ void cublas_grouped_gemm(
   std::vector<void*> b_array = get_tensor_ptrs(inputs);
   std::vector<void*> c_array = get_tensor_ptrs(outputs);
-  auto handle = reinterpret_cast<cublasHandle_t>(cublas_handle);
   auto stream = reinterpret_cast<cudaStream_t>(cuda_stream);
   // Should allocate tensors for storage of pointers
@@ -141,6 +139,9 @@ void cublas_grouped_gemm(
   torch::Tensor d_c = create_ptr_pointer(c_array, stream);
 #if defined CUDA_VERSION && CUDA_VERSION >= 12050
+  auto handle = reinterpret_cast<cublasHandle_t>(cublas_handle);
+  cudaDataType_t cuda_data_type = (out_dtype == torch::kHalf ? CUDA_R_16F : CUDA_R_16BF);
   auto status = cublasGemmGroupedBatchedEx(
       handle,
       transa_array.data(),
......
@@ -32,11 +32,8 @@ TORCH_LIBRARY_EXPAND(sgl_kernel, m) {
   m.def("all_reduce(int fa, Tensor inp, Tensor! out) -> ()");
   m.impl("all_reduce", torch::kCUDA, &all_reduce);
-  m.def("get_graph_buffer_ipc_meta(int fa) -> (int[], int[])");
-  m.impl("get_graph_buffer_ipc_meta", torch::kCUDA, &get_graph_buffer_ipc_meta);
-  m.def("register_graph_buffers(int fa, int[][] handles, int[][] offsets) -> ()");
-  m.impl("register_graph_buffers", torch::kCUDA, &register_graph_buffers);
+  m.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
+  m.def("register_graph_buffers", &register_graph_buffers);
   /*
    * From csrc/attention
......
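The registration change above is the custom allreduce fix referenced in the commit title. get_graph_buffer_ipc_meta and register_graph_buffers take only an opaque integer handle and plain lists, never a CUDA tensor, so a kernel registered under the torch::kCUDA dispatch key has nothing to dispatch on; defining the functions directly with m.def(name, &fn) effectively installs a catch-all implementation instead. A hedged, self-contained Python analogue of the same idea, on a recent PyTorch (2.3+) with the string-based torch.library API; demo::meta_only is an invented toy op, not part of sgl_kernel:

import torch

# Toy operator with no tensor arguments, mirroring the allreduce metadata helpers.
torch.library.define("demo::meta_only", "(int fa) -> int")


# A catch-all kernel (CompositeExplicitAutograd) is selected regardless of backend,
# which is what registering the C++ function directly achieves.
@torch.library.impl("demo::meta_only", "CompositeExplicitAutograd")
def meta_only_impl(fa: int) -> int:
    return fa + 1


print(torch.ops.demo.meta_only(7))  # 8, even though no CUDA tensor is involved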
-import math
 from typing import Any, Dict, List, Optional, Tuple, Union
 import pytest
 import torch
-import torch.nn as nn
 from sgl_kernel import apply_rope_with_cos_sin_cache_inplace
......
@@ -20,30 +20,33 @@ suites = {
         TestFile("models/test_generation_models.py", 103),
         TestFile("models/test_qwen_models.py", 82),
         TestFile("models/test_reward_models.py", 83),
-        TestFile("test_gptqmodel_dynamic.py", 72),
         TestFile("models/test_gme_qwen_models.py", 45),
         TestFile("test_abort.py", 51),
+        TestFile("test_block_int8.py", 22),
         TestFile("test_chunked_prefill.py", 336),
-        TestFile("test_double_sparsity.py", 50),
+        TestFile("test_custom_allreduce.py", 1),
         TestFile("test_eagle_infer.py", 447),
-        TestFile("test_ebnf_constrained.py"),
+        TestFile("test_fp8_kernel.py", 2),
         TestFile("test_embedding_openai_server.py", 36),
-        TestFile("test_eval_accuracy_mini.py", 63),
         TestFile("test_gguf.py", 78),
+        TestFile("test_gptqmodel_dynamic.py", 72),
+        TestFile("test_hidden_states.py", 55),
+        TestFile("test_int8_kernel.py", 1),
         TestFile("test_input_embeddings.py", 38),
+        TestFile("test_json_constrained.py", 98),
+        TestFile("test_large_max_new_tokens.py", 41),
+        TestFile("test_metrics.py", 32),
         TestFile("test_mla.py", 92),
         TestFile("test_mla_deepseek_v3.py", 221),
         TestFile("test_mla_flashinfer.py", 395),
         TestFile("test_mla_fp8.py", 93),
-        TestFile("test_json_constrained.py", 98),
-        TestFile("test_large_max_new_tokens.py", 41),
-        TestFile("test_metrics.py", 32),
         TestFile("test_no_chunked_prefill.py", 126),
         TestFile("test_no_overlap_scheduler.py", 262),
         TestFile("test_openai_server.py", 124),
         TestFile("test_penalty.py", 41),
         TestFile("test_pytorch_sampling_backend.py", 66),
         TestFile("test_radix_attention.py", 167),
+        TestFile("test_reasoning_content.py", 89),
         TestFile("test_regex_constrained.py", 64),
         TestFile("test_release_memory_occupation.py", 44),
         TestFile("test_request_length_validation.py", 31),
@@ -58,7 +61,6 @@ suites = {
         TestFile("test_torchao.py", 70),
         TestFile("test_triton_attention_kernels.py", 4),
         TestFile("test_triton_attention_backend.py", 134),
-        TestFile("test_hidden_states.py", 55),
         TestFile("test_update_weights_from_disk.py", 114),
         TestFile("test_update_weights_from_tensor.py", 48),
         TestFile("test_vertex_endpoint.py", 31),
@@ -66,10 +68,6 @@ suites = {
         TestFile("test_vision_llm.py", 18.4),
         TestFile("test_vision_openai_server.py", 344),
         TestFile("test_w8a8_quantization.py", 46),
-        TestFile("test_fp8_kernel.py", 2),
-        TestFile("test_block_int8.py", 22),
-        TestFile("test_int8_kernel.py", 1),
-        TestFile("test_reasoning_content.py", 89),
     ],
     "nightly": [
         TestFile("test_nightly_gsm8k_eval.py"),
......
@@ -3,6 +3,7 @@ import unittest
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
+    get_bool_env_var,
     is_in_ci,
     run_bench_one_batch,
     write_github_step_summary,
@@ -27,9 +28,13 @@ class TestBenchOneBatch(unittest.TestCase):
             DEFAULT_MOE_MODEL_NAME_FOR_TEST, ["--tp", "2", "--cuda-graph-max-bs", "2"]
         )
+        use_vllm_custom_allreduce = get_bool_env_var(
+            "USE_VLLM_CUSTOM_ALLREDUCE", default="true"
+        )
         if is_in_ci():
             write_github_step_summary(
-                f"### test_moe_tp2_bs1\n"
+                f"### test_moe_tp2_bs1 ({use_vllm_custom_allreduce=})\n"
                 f"output_throughput : {output_throughput:.2f} token/s\n"
             )
         self.assertGreater(output_throughput, 124)
......
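The only change to the summary line is the self-documenting "=" specifier inside the f-string (Python 3.8+), so the CI step summary now records which allreduce path was active when the throughput number was produced. For example:

use_vllm_custom_allreduce = True
print(f"### test_moe_tp2_bs1 ({use_vllm_custom_allreduce=})")
# prints: ### test_moe_tp2_bs1 (use_vllm_custom_allreduce=True)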
import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=["--log-level-http", "warning", "--chunked-prefill-size", "256"],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=3000,
            num_threads=1024,
        )
        metrics = run_eval(args)
        assert metrics["score"] >= 0.705, f"{metrics}"

    def test_human_eval(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="humaneval",
            num_examples=None,
            num_threads=1024,
        )
        metrics = run_eval(args)
        assert metrics["score"] >= 0.64, f"{metrics}"

    def test_mgsm_en(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=None,
            num_threads=1024,
        )
        metrics = run_eval(args)
        assert metrics["score"] >= 0.84, f"{metrics}"


if __name__ == "__main__":
    unittest.main()
import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestEvalAccuracyLargeChunkedPrefill(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model,
            cls.base_url,
            timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
            other_args=[
                "--log-level-http",
                "warning",
                "--chunked-prefill-size",
                "256",
                "--enable-mixed-chunk",
            ],
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=3000,
            num_threads=1024,
        )
        metrics = run_eval(args)
        assert metrics["score"] >= 0.705, f"{metrics}"

    def test_human_eval(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="humaneval",
            num_examples=None,
            num_threads=1024,
        )
        metrics = run_eval(args)
        assert metrics["score"] >= 0.64, f"{metrics}"

    def test_mgsm_en(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mgsm_en",
            num_examples=None,
            num_threads=1024,
        )
        metrics = run_eval(args)
        assert metrics["score"] >= 0.84, f"{metrics}"


if __name__ == "__main__":
    unittest.main()
import unittest
from types import SimpleNamespace

from sglang.srt.utils import kill_process_tree
from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
    DEFAULT_MODEL_NAME_FOR_TEST,
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    popen_launch_server,
)


class TestEvalAccuracyMini(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model = DEFAULT_MODEL_NAME_FOR_TEST
        cls.base_url = DEFAULT_URL_FOR_TEST
        cls.process = popen_launch_server(
            cls.model, cls.base_url, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
        )

    @classmethod
    def tearDownClass(cls):
        kill_process_tree(cls.process.pid)

    def test_mmlu(self):
        args = SimpleNamespace(
            base_url=self.base_url,
            model=self.model,
            eval_name="mmlu",
            num_examples=64,
            num_threads=32,
            temperature=0.1,
        )
        metrics = run_eval(args)
        self.assertGreaterEqual(metrics["score"], 0.65)


if __name__ == "__main__":
    unittest.main()
@@ -129,6 +129,7 @@ class TestGPTQModelDynamic(unittest.TestCase):
                 "text": "The capital of France is",
                 "sampling_params": {
                     "max_new_tokens": max_new_tokens,
+                    "temperature": 0.001,
                 },
             },
         )
......
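Pinning temperature to 0.001 makes sampling effectively greedy, so the completion for the fixed prompt stays stable across runs and the test's output check does not flake. A hedged sketch of such a request against a locally launched server; the URL is an assumption for illustration, not taken from the diff:

import requests

response = requests.post(
    "http://127.0.0.1:30000/generate",  # assumed local sglang server address
    json={
        "text": "The capital of France is",
        "sampling_params": {
            "max_new_tokens": 32,
            "temperature": 0.001,  # near-greedy decoding for reproducible output
        },
    },
)
print(response.json())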