Unverified Commit 5a6400ee authored by Lianmin Zheng, committed by GitHub

Test no vllm custom allreduce (#4256)

parent cf0ccd40
@@ -266,7 +266,7 @@ jobs:
           cd test/srt
           python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
-          # USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
+          USE_VLLM_CUSTOM_ALLREDUCE=0 python3 -m unittest test_bench_one_batch.TestBenchOneBatch.test_moe_tp2_bs1
       - name: Benchmark single latency + torch.compile (TP=2)
         timeout-minutes: 10
...
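The newly enabled CI step reruns the TP=2 MoE benchmark with vLLM's custom allreduce turned off via the USE_VLLM_CUSTOM_ALLREDUCE environment variable. As a minimal sketch only (the actual lookup inside sglang/vllm is not part of this diff), such a flag is typically consumed as a boolean environment switch:

    import os

    def use_vllm_custom_allreduce() -> bool:
        # Hypothetical helper: treat the variable as a boolean switch,
        # defaulting to "enabled" when it is unset.
        value = os.environ.get("USE_VLLM_CUSTOM_ALLREDUCE", "1")
        return value.strip().lower() not in ("0", "false", "off")

    if __name__ == "__main__":
        backend = "custom allreduce" if use_vllm_custom_allreduce() else "fallback allreduce"
        print(f"Selected allreduce backend: {backend}")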
@@ -44,7 +44,7 @@ runtime_common = [
 srt = [
     "sglang[runtime_common]",
-    "sgl-kernel==0.0.4",
+    "sgl-kernel==0.0.4.post1",
     "flashinfer_python==0.2.2.post1",
     "torch==2.5.1",
     "vllm>=0.6.4.post1,<=0.7.2",
...
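The srt extra now pins sgl-kernel to 0.0.4.post1. A quick verification sketch (not part of the commit) that an environment actually resolves to the pinned versions listed above, using the standard-library importlib.metadata:

    from importlib.metadata import PackageNotFoundError, version

    # Print the installed version of each pinned runtime dependency,
    # or note that it is missing.
    for pkg in ("sgl-kernel", "flashinfer_python", "torch", "vllm"):
        try:
            print(f"{pkg}: {version(pkg)}")
        except PackageNotFoundError:
            print(f"{pkg}: not installed")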
@@ -480,7 +480,7 @@ class ServerArgs:
             "--chunked-prefill-size",
             type=int,
             default=ServerArgs.chunked_prefill_size,
-            help="The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill",
+            help="The maximum number of tokens in a chunk for the chunked prefill. Setting this to -1 means disabling chunked prefill.",
         )
         parser.add_argument(
             "--max-prefill-tokens",
@@ -505,7 +505,7 @@ class ServerArgs:
             "--cpu-offload-gb",
             type=int,
             default=ServerArgs.cpu_offload_gb,
-            help="How many GBs of RAM to reserve for CPU offloading",
+            help="How many GBs of RAM to reserve for CPU offloading.",
         )

         # Other runtime options
...
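Both touched options follow the plain argparse pattern visible in the hunks above. A self-contained, illustrative reconstruction (the defaults below are placeholders, not sglang's actual ServerArgs values):

    import argparse

    parser = argparse.ArgumentParser(description="Illustrative subset of the server arguments above")
    parser.add_argument(
        "--chunked-prefill-size",
        type=int,
        default=-1,  # placeholder default; -1 disables chunked prefill, per the help text
        help="The maximum number of tokens in a chunk for the chunked prefill. "
        "Setting this to -1 means disabling chunked prefill.",
    )
    parser.add_argument(
        "--cpu-offload-gb",
        type=int,
        default=0,  # placeholder default
        help="How many GBs of RAM to reserve for CPU offloading.",
    )

    # Example invocation: override both flags on the command line.
    args = parser.parse_args(["--chunked-prefill-size", "8192", "--cpu-offload-gb", "4"])
    print(args.chunked_prefill_size, args.cpu_offload_gb)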
@@ -26,4 +26,4 @@ pip install transformers==4.45.2 sentence_transformers accelerate peft pandas da
 pip install cuda-python nvidia-cuda-nvrtc-cu12
 # reinstall sgl-kernel
-pip install sgl-kernel==0.0.4 --force-reinstall --no-deps
+pip install sgl-kernel==0.0.4.post1 --force-reinstall --no-deps