Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
change
sglang
Commits
02404a1e
Unverified
Commit
02404a1e
authored
Jul 17, 2025
by
Cheng Wan
Committed by
GitHub
Jul 17, 2025
Browse files
[ci] recover 8-gpu deepep test (#8105)
parent
5c08a36c
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
52 additions
and
59 deletions
+52
-59
.github/workflows/pr-test.yml
.github/workflows/pr-test.yml
+21
-21
scripts/ci_install_deepep.sh
scripts/ci_install_deepep.sh
+11
-18
test/srt/test_deepep_large.py
test/srt/test_deepep_large.py
+11
-9
test/srt/test_deepep_small.py
test/srt/test_deepep_small.py
+9
-11
No files found.
.github/workflows/pr-test.yml
View file @
02404a1e
...
...
@@ -324,33 +324,33 @@ jobs:
cd test/srt
python3 run_suite.py --suite per-commit-4-gpu-deepep
#
unit-test-deepep-8-gpu:
#
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
#
github.event.pull_request.draft == false
#
runs-on: 8-gpu-runner
#
needs: [
#
unit-test-deepep-4-gpu,
#
]
#
steps:
#
- name: Checkout code
#
uses: actions/checkout@v4
#
#
- name: Install dependencies
#
run: |
#
bash scripts/ci_install_deepep.sh
#
#
- name: Run test
#
timeout-minutes: 20
#
run: |
#
cd test/srt
#
python3 run_suite.py --suite per-commit-8-gpu-deepep
unit-test-deepep-8-gpu
:
if
:
(github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft ==
false
runs-on
:
8-gpu-runner
needs
:
[
unit-test-deepep-4-gpu
,
]
steps
:
-
name
:
Checkout code
uses
:
actions/checkout@v4
-
name
:
Install dependencies
run
:
|
bash scripts/ci_install_deepep.sh
-
name
:
Run test
timeout-minutes
:
20
run
:
|
cd test/srt
python3 run_suite.py --suite per-commit-8-gpu-deepep
finish
:
if
:
always()
needs
:
[
unit-test-frontend
,
unit-test-backend-1-gpu
,
unit-test-backend-2-gpu
,
unit-test-backend-4-gpu
,
unit-test-backend-8-gpu
,
performance-test-1-gpu-part-1
,
performance-test-1-gpu-part-2
,
performance-test-2-gpu
,
accuracy-test-1-gpu
,
accuracy-test-2-gpu
,
unit-test-deepep-4-gpu
,
#
unit-test-deepep-8-gpu,
accuracy-test-1-gpu
,
accuracy-test-2-gpu
,
unit-test-deepep-4-gpu
,
unit-test-deepep-8-gpu
,
]
runs-on
:
ubuntu-latest
steps
:
...
...
scripts/ci_install_deepep.sh
View file @
02404a1e
...
...
@@ -4,30 +4,30 @@ set -euxo pipefail
bash scripts/ci_install_dependency.sh
if
python3
-c
"import deep_ep"
>
/dev/null 2>&1
;
then
echo
"deep_ep is already installed or importable. Skipping installation."
exit
0
fi
export
GDRCOPY_HOME
=
/usr/src/gdrdrv-2.4.4/
export
NVSHMEM_DIR
=
/opt/nvshmem/install
export
LD_LIBRARY_PATH
=
"
${
NVSHMEM_DIR
}
/lib:
$LD_LIBRARY_PATH
"
export
PATH
=
"
${
NVSHMEM_DIR
}
/bin:
$PATH
"
export
CUDA_HOME
=
/usr/local/cuda
if
python3
-c
"import deep_ep"
>
/dev/null 2>&1
;
then
echo
"deep_ep is already installed or importable. Skipping installation."
exit
0
fi
# Install system dependencies
apt
install
-y
curl wget git
sudo
libibverbs-dev rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 build-essential cmake
# Install GDRCopy
rm
-rf
/opt/gdrcopy
&&
mkdir
-p
/opt/gdrcopy
mkdir
-p
/opt/nvshmem
rm
-rf
/opt/nvshmem
&&
mkdir
-p
/opt/nvshmem
cd
/opt/gdrcopy
git clone https://github.com/NVIDIA/gdrcopy.git
.
git checkout v2.4.4
apt update
apt
install
-y
nvidia-dkms-535
apt
install
-y
build-essential devscripts debhelper fakeroot pkg-config dkms
apt
install
-y
check libsubunit0 libsubunit-dev
apt
install
-y
check libsubunit0 libsubunit-dev
python3-venv
cd
packages
CUDA
=
/usr/local/cuda ./build-deb-packages.sh
dpkg
-i
gdrdrv-dkms_
*
.deb
...
...
@@ -40,16 +40,11 @@ if [ ! -e "/usr/lib/x86_64-linux-gnu/libmlx5.so" ]; then
fi
apt-get update
&&
apt-get
install
-y
libfabric-dev
# Clone DeepEP
rm
-rf
/root/.cache/deepep
&&
git clone https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep
&&
cd
/root/.cache/deepep
&&
git checkout eef7ab50fa5cf0ab1dd3fce4c6493c90bdf290ac
# Install NVSHMEM
cd
/opt/nvshmem
wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz
tar
-xf
nvshmem_src_3.2.5-1.txz
rm
-rf
nvshmem
&&
mv
nvshmem_src nvshmem
cd
nvshmem
git apply /root/.cache/deepep/third-party/nvshmem.patch
wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
tar
-xf
nvshmem_src_cuda12-all-all-3.3.9.tar.gz
mv
nvshmem_src nvshmem
&&
cd
nvshmem
NVSHMEM_SHMEM_SUPPORT
=
0
\
NVSHMEM_UCX_SUPPORT
=
0
\
NVSHMEM_USE_NCCL
=
0
\
...
...
@@ -63,12 +58,10 @@ cd build
make
-j
$(
nproc
)
install
# Install DeepEP
rm
-rf
/root/.cache/deepep
&&
git clone https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep
&&
cd
/root/.cache/deepep
&&
git checkout b6ce310bb0b75079682d09bc2ebc063a074fbd58
cd
/root/.cache/deepep
&&
python3 setup.py
install
# Verify configuration
echo
"=== NCCL Configuration ==="
nvidia-smi topo
-m
nvidia-smi nvlink
-s
echo
"=== Verify GDRCOPY ==="
gdrcopy_copybw
echo
"=== Verify NVSHMEM ==="
...
...
test/srt/test_deepep_large.py
View file @
02404a1e
...
...
@@ -45,6 +45,7 @@ class TestDeepseek(CustomTestCase):
"256"
,
"--max-running-requests"
,
"2048"
,
"--disable-radix-cache"
,
],
)
...
...
@@ -54,10 +55,10 @@ class TestDeepseek(CustomTestCase):
def
test_gsm8k
(
self
):
args
=
SimpleNamespace
(
num_shots
=
8
,
num_shots
=
5
,
data_path
=
None
,
num_questions
=
12
5
0
,
parallel
=
12
5
0
,
num_questions
=
12
0
0
,
parallel
=
12
0
0
,
max_new_tokens
=
512
,
host
=
"http://127.0.0.1"
,
port
=
int
(
self
.
base_url
.
split
(
":"
)[
-
1
]),
...
...
@@ -65,7 +66,7 @@ class TestDeepseek(CustomTestCase):
metrics
=
run_eval_few_shot_gsm8k
(
args
)
print
(
f
"Eval accuracy of GSM8K:
{
metrics
=
}
"
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.9
3
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.9
2
)
class
TestDeepseekMTP
(
CustomTestCase
):
...
...
@@ -107,6 +108,7 @@ class TestDeepseekMTP(CustomTestCase):
"1"
,
"--speculative-num-draft-tokens"
,
"2"
,
"--disable-radix-cache"
,
],
)
...
...
@@ -116,10 +118,10 @@ class TestDeepseekMTP(CustomTestCase):
def
test_gsm8k
(
self
):
args
=
SimpleNamespace
(
num_shots
=
8
,
num_shots
=
5
,
data_path
=
None
,
num_questions
=
12
5
0
,
parallel
=
12
5
0
,
num_questions
=
12
0
0
,
parallel
=
12
0
0
,
max_new_tokens
=
512
,
host
=
"http://127.0.0.1"
,
port
=
int
(
self
.
base_url
.
split
(
":"
)[
-
1
]),
...
...
@@ -127,7 +129,7 @@ class TestDeepseekMTP(CustomTestCase):
metrics
=
run_eval_few_shot_gsm8k
(
args
)
print
(
f
"Eval accuracy of GSM8K:
{
metrics
=
}
"
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.9
3
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.9
2
)
server_info
=
requests
.
get
(
self
.
base_url
+
"/get_server_info"
)
avg_spec_accept_length
=
server_info
.
json
()[
"internal_states"
][
0
][
...
...
@@ -138,7 +140,7 @@ class TestDeepseekMTP(CustomTestCase):
f
"accuracy=
{
metrics
[
'accuracy'
]
=
:.
3
f
}
\n
"
f
"
{
avg_spec_accept_length
=
:.
3
f
}
\n
"
)
self
.
assertGreater
(
avg_spec_accept_length
,
1.
9
)
self
.
assertGreater
(
avg_spec_accept_length
,
1.
85
)
if
__name__
==
"__main__"
:
...
...
test/srt/test_deepep_small.py
View file @
02404a1e
...
...
@@ -36,6 +36,8 @@ class TestPureDP(CustomTestCase):
"128"
,
"--max-running-requests"
,
"128"
,
"--mem-fraction-static"
,
"0.5"
,
],
)
...
...
@@ -56,7 +58,7 @@ class TestPureDP(CustomTestCase):
metrics
=
run_eval_few_shot_gsm8k
(
args
)
print
(
metrics
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.6
2
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.6
0
)
class
TestHybridDPTP
(
CustomTestCase
):
...
...
@@ -100,7 +102,7 @@ class TestHybridDPTP(CustomTestCase):
metrics
=
run_eval_few_shot_gsm8k
(
args
)
print
(
metrics
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.6
2
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.6
0
)
class
TestTP
(
CustomTestCase
):
...
...
@@ -141,10 +143,10 @@ class TestTP(CustomTestCase):
metrics
=
run_eval_few_shot_gsm8k
(
args
)
print
(
metrics
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.6
2
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.6
0
)
#
@unittest.skip("covered in test_deepep_large.py")
@
unittest
.
skip
(
"covered in test_deepep_large.py"
)
class
TestNoGatherdBuffer
(
CustomTestCase
):
@
classmethod
def
setUpClass
(
cls
):
...
...
@@ -189,7 +191,7 @@ class TestNoGatherdBuffer(CustomTestCase):
metrics
=
run_eval_few_shot_gsm8k
(
args
)
print
(
metrics
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.6
2
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.6
0
)
class
TestTBO
(
CustomTestCase
):
...
...
@@ -236,10 +238,10 @@ class TestTBO(CustomTestCase):
metrics
=
run_eval_few_shot_gsm8k
(
args
)
print
(
metrics
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.6
2
)
self
.
assertGreater
(
metrics
[
"accuracy"
],
0.6
0
)
#
@unittest.skip("covered in TestMTPWithTBO")
@
unittest
.
skip
(
"covered in TestMTPWithTBO"
)
class
TestMTP
(
CustomTestCase
):
@
classmethod
def
setUpClass
(
cls
):
...
...
@@ -280,8 +282,6 @@ class TestMTP(CustomTestCase):
kill_process_tree
(
cls
.
process
.
pid
)
def
test_gsm8k
(
self
):
requests
.
get
(
self
.
base_url
+
"/flush_cache"
)
args
=
SimpleNamespace
(
num_shots
=
5
,
data_path
=
None
,
...
...
@@ -352,8 +352,6 @@ class TestMTPWithTBO(CustomTestCase):
kill_process_tree
(
cls
.
process
.
pid
)
def
test_gsm8k
(
self
):
requests
.
get
(
self
.
base_url
+
"/flush_cache"
)
args
=
SimpleNamespace
(
num_shots
=
5
,
data_path
=
None
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment