Unverified Commit 5a6400ee authored by Lianmin Zheng, committed by GitHub

Test no vllm custom allreduce (#4256)

parent cf0ccd40
@@ -266,7 +266,7 @@ jobs:
cd test/srt
python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
-# USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
- name: Benchmark single latency + torch.compile (TP=2)
timeout-minutes: 10
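The CI step above now runs the TP=2 MoE benchmark with `USE_VLLM_CUSTOM_ALLREDUCE=0`, i.e. with vllm's custom allreduce disabled. A minimal sketch of how such a toggle can be read, assuming the flag is parsed as a boolean environment variable; `env_flag_enabled` is a hypothetical helper for illustration, not sglang's actual code:

```python
# Illustrative only: sglang's real parsing of this flag may differ.
import os

def env_flag_enabled(name: str, default: bool = True) -> bool:
    """Treat "1", "true", "yes", "on" (any case) as enabled."""
    value = os.environ.get(name)
    if value is None:
        return default
    return value.strip().lower() in ("1", "true", "yes", "on")

# With USE_VLLM_CUSTOM_ALLREDUCE=0 (as in the CI step above), this is
# False, so the runtime would take a non-vllm allreduce path.
use_vllm_custom_allreduce = env_flag_enabled("USE_VLLM_CUSTOM_ALLREDUCE")
```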
@@ -44,7 +44,7 @@ runtime_common = [
srt = [
"sglang[runtime_common]",
"sgl-kernel==0.0.4",
"sgl-kernel==0.0.4.post1",
"flashinfer_python==0.2.2.post1",
"torch==2.5.1",
"vllm>=0.6.4.post1,<=0.7.2",
@@ -480,7 +480,7 @@ class ServerArgs:
"--chunked-prefill-size",
type=int,
default=ServerArgs.chunked_prefill_size,
help="The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill",
help="The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill.",
)
parser.add_argument(
"--max-prefill-tokens",
@@ -505,7 +505,7 @@ class ServerArgs:
"--cpu-offload-gb",
type=int,
default=ServerArgs.cpu_offload_gb,
help="How many GBs of RAM to reserve for CPU offloading",
help="How many GBs of RAM to reserve for CPU offloading.",
)
# Other runtime options
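The two hunks above only add trailing periods to help strings, but the flags themselves are worth a quick illustration: per the help text, `--chunked-prefill-size -1` disables chunked prefill entirely. A self-contained argparse sketch of the same two options; the defaults below are placeholders, not sglang's real defaults:

```python
# Placeholder defaults; only the help text is taken from the diff above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--chunked-prefill-size",
    type=int,
    default=8192,  # placeholder, not sglang's actual default
    help="The maximum number of tokens in a chunk for the chunked prefill. "
    "Setting this to -1 means disabling chunked prefill.",
)
parser.add_argument(
    "--cpu-offload-gb",
    type=int,
    default=0,  # placeholder, not sglang's actual default
    help="How many GBs of RAM to reserve for CPU offloading.",
)

args = parser.parse_args(["--chunked-prefill-size", "-1"])
assert args.chunked_prefill_size == -1  # -1 disables chunked prefill
```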
@@ -26,4 +26,4 @@ pip install transformers==4.45.2 sentence_transformers accelerate peft pandas da
pip install cuda-python nvidia-cuda-nvrtc-cu12
# reinstall sgl-kernel
-pip install sgl-kernel==0.0.4 --force-reinstall --no-deps
+pip install sgl-kernel==0.0.4.post1 --force-reinstall --no-deps
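After the forced reinstall above, a quick sanity check can confirm that the pinned build actually landed. A minimal sketch using the standard-library `importlib.metadata`; the expected version string is taken from the pin in this commit:

```python
# Verify the installed sgl-kernel matches the pin from this commit.
from importlib.metadata import PackageNotFoundError, version

try:
    installed = version("sgl-kernel")
except PackageNotFoundError:
    raise SystemExit("sgl-kernel is not installed")

assert installed == "0.0.4.post1", f"unexpected sgl-kernel version: {installed}"
print(f"sgl-kernel {installed} OK")
```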