Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
03227c5f
Unverified
Commit
03227c5f
authored
May 11, 2025
by
Lianmin Zheng
Committed by
GitHub
May 11, 2025
Browse files
[CI] Reorganize the 8 gpu tests (#6192)
parent
01bdbf7f
Changes
6
Hide whitespace changes
Inline
Side-by-side
Showing
6 changed files
with
15 additions
and
82 deletions
+15
-82
.github/workflows/pr-test.yml
.github/workflows/pr-test.yml
+1
-19
python/sglang/srt/server_args.py
python/sglang/srt/server_args.py
+6
-0
scripts/ci_install_dependency.sh
scripts/ci_install_dependency.sh
+5
-8
test/srt/run_suite.py
test/srt/run_suite.py
+1
-3
test/srt/test_disaggregation.py
test/srt/test_disaggregation.py
+0
-2
test/srt/test_pp_single_node.py
test/srt/test_pp_single_node.py
+2
-50
No files found.
.github/workflows/pr-test.yml
View file @
03227c5f
...
...
@@ -92,7 +92,7 @@ jobs:
unittest-test-backend-8-gpu
:
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft ==
false
needs
:
[
unit-test-frontend
,
unit-test-backend-1-gpu
,
unit-test-backend-2-gpu
]
needs
:
[
unit-test-frontend
,
unit-test-backend-2-gpu
]
runs-on
:
8-gpu-runner
steps
:
-
name
:
Checkout code
...
...
@@ -271,24 +271,6 @@ jobs:
cd test/srt
python3 test_moe_eval_accuracy_large.py
unit-test-backend-pd
:
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft ==
false
runs-on
:
8-gpu-runner
steps
:
-
name
:
Checkout code
uses
:
actions/checkout@v4
-
name
:
Install dependencies
run
:
|
bash scripts/ci_install_dependency.sh
-
name
:
Run test
timeout-minutes
:
10
run
:
|
cd test/srt
python3 -m unittest test_disaggregation.TestDisaggregationMooncake.test_gsm8k
finish
:
if
:
always()
needs
:
[
...
...
python/sglang/srt/server_args.py
View file @
03227c5f
...
...
@@ -305,6 +305,12 @@ class ServerArgs:
if
self
.
grammar_backend
is
None
:
self
.
grammar_backend
=
"xgrammar"
if
self
.
pp_size
>
1
:
self
.
disable_overlap_schedule
=
True
logger
.
warning
(
"Overlap scheduler is disabled because of using pipeline parallelism."
)
# Data parallelism attention
if
self
.
enable_dp_attention
:
self
.
schedule_conservativeness
=
self
.
schedule_conservativeness
*
0.3
...
...
scripts/ci_install_dependency.sh
View file @
03227c5f
...
...
@@ -5,25 +5,22 @@ set -euxo pipefail
SCRIPT_DIR
=
"
$(
cd
"
$(
dirname
"
${
BASH_SOURCE
[0]
}
"
)
"
&&
pwd
)
"
bash
"
${
SCRIPT_DIR
}
/killall_sglang.sh"
# Update pip
pip
install
--upgrade
pip
# Clean up existing installations
pip uninstall
-y
flashinfer flashinfer_python sgl-kernel sglang vllm
||
true
pip uninstall
-y
flashinfer flashinfer_python sgl-kernel sglang vllm
pip cache purge
rm
-rf
/root/.cache/flashinfer
rm
-rf
/usr/local/lib/python3.10/dist-packages/flashinfer
*
rm
-rf
/usr/local/lib/python3.10/dist-packages/sgl_kernel
*
# Update pip
pip
install
--upgrade
pip
# Install sgl-kernel
pip
install
sgl-kernel
==
0.1.2.post1
--no-cache-dir
# Install the main package
pip
install
-e
"python[all]"
# Install additional dependencies
pip
install
torch_memory_saver
pip
install
transformers
==
4.51.0 sentence_transformers accelerate peft pandas datasets
timm
torchaudio
==
2.6.0
pip
install
transformers
==
4.51.0
timm
torchaudio
==
2.6.0
sentence_transformers accelerate peft pandas datasets
mooncake-transfer-engine
# For compiling xgrammar kernels
pip
install
cuda-python nvidia-cuda-nvrtc-cu12
...
...
test/srt/run_suite.py
View file @
03227c5f
...
...
@@ -85,9 +85,6 @@ suites = {
TestFile
(
"test_w8a8_quantization.py"
,
46
),
TestFile
(
"models/lora/test_lora_cuda_graph.py"
,
250
),
],
"per-commit-pd"
:
[
TestFile
(
"test_disaggregation.py"
,
90
),
],
"per-commit-2-gpu"
:
[
TestFile
(
"models/lora/test_lora_tp.py"
,
116
),
TestFile
(
"test_data_parallelism.py"
,
73
),
...
...
@@ -105,6 +102,7 @@ suites = {
# TestFile("test_deepep_low_latency.py", 50),
# TestFile("test_moe_deepep_eval_accuracy_large.py", 250),
TestFile
(
"test_local_attn.py"
,
250
),
TestFile
(
"test_disaggregation.py"
,
90
),
TestFile
(
"test_full_deepseek_v3.py"
,
250
),
TestFile
(
"test_pp_single_node.py"
,
150
),
],
...
...
test/srt/test_disaggregation.py
View file @
03227c5f
import
subprocess
import
threading
import
time
import
unittest
from
types
import
SimpleNamespace
import
requests
import
torch
from
sglang.srt.utils
import
kill_process_tree
from
sglang.test.few_shot_gsm8k
import
run_eval
as
run_eval_few_shot_gsm8k
...
...
test/srt/test_pp_single_node.py
View file @
03227c5f
...
...
@@ -9,13 +9,10 @@ import time
import
unittest
from
types
import
SimpleNamespace
import
requests
from
sglang.bench_one_batch_server
import
BenchArgs
as
OneBatchBenchArgs
from
sglang.srt.server_args
import
ServerArgs
from
sglang.srt.utils
import
kill_process_tree
from
sglang.test.few_shot_gsm8k
import
run_eval
from
sglang.test.runners
import
DEFAULT_PROMPTS
from
sglang.test.test_utils
import
(
DEFAULT_MODEL_NAME_FOR_TEST
,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
...
...
@@ -28,17 +25,16 @@ from sglang.test.test_utils import (
class
TestPPAccuracy
(
unittest
.
TestCase
):
@
classmethod
def
setUpClass
(
cls
):
# These config helps find a leak.
os
.
environ
[
"SGLANG_IS_IN_CI"
]
=
"1"
cls
.
base_url
=
"http://127.0.0.1:23333"
cls
.
process
=
popen_launch_server
(
DEFAULT_MODEL_NAME_FOR_TEST
,
cls
.
base_url
,
timeout
=
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH
,
other_args
=
[
"--tp-size"
,
2
,
"--pp-size"
,
4
,
"--disable-overlap-schedule"
,
"--chunked-prefill-size"
,
256
,
],
...
...
@@ -66,49 +62,6 @@ class TestPPAccuracy(unittest.TestCase):
time
.
sleep
(
5
)
# class TestPPAccuracyFlashInfer(unittest.TestCase):
# @classmethod
# def setUpClass(cls):
# # These config helps find a leak.
# os.environ["SGLANG_IS_IN_CI"] = "1"
# cls.base_url = "http://127.0.0.1:23333"
# cls.process = popen_launch_server(
# DEFAULT_MODEL_NAME_FOR_TEST,
# cls.base_url,
# timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
# other_args=[
# "--pp-size",
# 4,
# "--disable-overlap-schedule",
# "--attention-backend",
# "flashinfer",
# "--chunked-prefill-size",
# 256,
# ],
# )
#
# @classmethod
# def tearDownClass(cls):
# kill_process_tree(cls.process.pid)
#
# def test_gsm8k(self):
# args = SimpleNamespace(
# num_shots=5,
# data_path=None,
# num_questions=200,
# max_new_tokens=512,
# parallel=128,
# host="http://127.0.0.1",
# port=int(self.base_url.split(":")[-1]),
# )
# metrics = run_eval(args)
# print(f"{metrics=}")
#
# self.assertGreater(metrics["accuracy"], 0.75)
# # Wait a little bit so that the memory check happens.
# time.sleep(5)
class
TestFixedBugs
(
unittest
.
TestCase
):
def
test_chunked_prefill_with_small_bs
(
self
):
model
=
DEFAULT_MODEL_NAME_FOR_TEST
...
...
@@ -124,7 +77,6 @@ class TestFixedBugs(unittest.TestCase):
2
,
"--pp-size"
,
2
,
"--disable-overlap-schedule"
,
"--chunked-prefill"
,
256
,
"--max-running-requests"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment