Unverified Commit 02404a1e authored by Cheng Wan's avatar Cheng Wan Committed by GitHub
Browse files

[ci] recover 8-gpu deepep test (#8105)

parent 5c08a36c
...@@ -324,33 +324,33 @@ jobs: ...@@ -324,33 +324,33 @@ jobs:
cd test/srt cd test/srt
python3 run_suite.py --suite per-commit-4-gpu-deepep python3 run_suite.py --suite per-commit-4-gpu-deepep
# unit-test-deepep-8-gpu: unit-test-deepep-8-gpu:
# if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
# github.event.pull_request.draft == false github.event.pull_request.draft == false
# runs-on: 8-gpu-runner runs-on: 8-gpu-runner
# needs: [ needs: [
# unit-test-deepep-4-gpu, unit-test-deepep-4-gpu,
# ] ]
# steps: steps:
# - name: Checkout code - name: Checkout code
# uses: actions/checkout@v4 uses: actions/checkout@v4
#
# - name: Install dependencies - name: Install dependencies
# run: | run: |
# bash scripts/ci_install_deepep.sh bash scripts/ci_install_deepep.sh
#
# - name: Run test - name: Run test
# timeout-minutes: 20 timeout-minutes: 20
# run: | run: |
# cd test/srt cd test/srt
# python3 run_suite.py --suite per-commit-8-gpu-deepep python3 run_suite.py --suite per-commit-8-gpu-deepep
finish: finish:
if: always() if: always()
needs: [ needs: [
unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unit-test-backend-4-gpu, unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu, unit-test-backend-4-gpu,
unit-test-backend-8-gpu, performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu, unit-test-backend-8-gpu, performance-test-1-gpu-part-1, performance-test-1-gpu-part-2, performance-test-2-gpu,
accuracy-test-1-gpu, accuracy-test-2-gpu, unit-test-deepep-4-gpu, # unit-test-deepep-8-gpu, accuracy-test-1-gpu, accuracy-test-2-gpu, unit-test-deepep-4-gpu, unit-test-deepep-8-gpu,
] ]
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
......
...@@ -4,30 +4,30 @@ set -euxo pipefail ...@@ -4,30 +4,30 @@ set -euxo pipefail
bash scripts/ci_install_dependency.sh bash scripts/ci_install_dependency.sh
if python3 -c "import deep_ep" >/dev/null 2>&1; then
echo "deep_ep is already installed or importable. Skipping installation."
exit 0
fi
export GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/ export GDRCOPY_HOME=/usr/src/gdrdrv-2.4.4/
export NVSHMEM_DIR=/opt/nvshmem/install export NVSHMEM_DIR=/opt/nvshmem/install
export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH" export LD_LIBRARY_PATH="${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH"
export PATH="${NVSHMEM_DIR}/bin:$PATH" export PATH="${NVSHMEM_DIR}/bin:$PATH"
export CUDA_HOME=/usr/local/cuda export CUDA_HOME=/usr/local/cuda
if python3 -c "import deep_ep" >/dev/null 2>&1; then
echo "deep_ep is already installed or importable. Skipping installation."
exit 0
fi
# Install system dependencies # Install system dependencies
apt install -y curl wget git sudo libibverbs-dev rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 build-essential cmake apt install -y curl wget git sudo libibverbs-dev rdma-core infiniband-diags openssh-server perftest ibverbs-providers libibumad3 libibverbs1 libnl-3-200 libnl-route-3-200 librdmacm1 build-essential cmake
# Install GDRCopy # Install GDRCopy
rm -rf /opt/gdrcopy && mkdir -p /opt/gdrcopy rm -rf /opt/gdrcopy && mkdir -p /opt/gdrcopy
mkdir -p /opt/nvshmem rm -rf /opt/nvshmem && mkdir -p /opt/nvshmem
cd /opt/gdrcopy cd /opt/gdrcopy
git clone https://github.com/NVIDIA/gdrcopy.git . git clone https://github.com/NVIDIA/gdrcopy.git .
git checkout v2.4.4 git checkout v2.4.4
apt update apt update
apt install -y nvidia-dkms-535 apt install -y nvidia-dkms-535
apt install -y build-essential devscripts debhelper fakeroot pkg-config dkms apt install -y build-essential devscripts debhelper fakeroot pkg-config dkms
apt install -y check libsubunit0 libsubunit-dev apt install -y check libsubunit0 libsubunit-dev python3-venv
cd packages cd packages
CUDA=/usr/local/cuda ./build-deb-packages.sh CUDA=/usr/local/cuda ./build-deb-packages.sh
dpkg -i gdrdrv-dkms_*.deb dpkg -i gdrdrv-dkms_*.deb
...@@ -40,16 +40,11 @@ if [ ! -e "/usr/lib/x86_64-linux-gnu/libmlx5.so" ]; then ...@@ -40,16 +40,11 @@ if [ ! -e "/usr/lib/x86_64-linux-gnu/libmlx5.so" ]; then
fi fi
apt-get update && apt-get install -y libfabric-dev apt-get update && apt-get install -y libfabric-dev
# Clone DeepEP
rm -rf /root/.cache/deepep && git clone https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep && cd /root/.cache/deepep && git checkout eef7ab50fa5cf0ab1dd3fce4c6493c90bdf290ac
# Install NVSHMEM # Install NVSHMEM
cd /opt/nvshmem cd /opt/nvshmem
wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz
tar -xf nvshmem_src_3.2.5-1.txz tar -xf nvshmem_src_cuda12-all-all-3.3.9.tar.gz
rm -rf nvshmem && mv nvshmem_src nvshmem mv nvshmem_src nvshmem && cd nvshmem
cd nvshmem
git apply /root/.cache/deepep/third-party/nvshmem.patch
NVSHMEM_SHMEM_SUPPORT=0 \ NVSHMEM_SHMEM_SUPPORT=0 \
NVSHMEM_UCX_SUPPORT=0 \ NVSHMEM_UCX_SUPPORT=0 \
NVSHMEM_USE_NCCL=0 \ NVSHMEM_USE_NCCL=0 \
...@@ -63,12 +58,10 @@ cd build ...@@ -63,12 +58,10 @@ cd build
make -j$(nproc) install make -j$(nproc) install
# Install DeepEP # Install DeepEP
rm -rf /root/.cache/deepep && git clone https://github.com/deepseek-ai/DeepEP.git /root/.cache/deepep && cd /root/.cache/deepep && git checkout b6ce310bb0b75079682d09bc2ebc063a074fbd58
cd /root/.cache/deepep && python3 setup.py install cd /root/.cache/deepep && python3 setup.py install
# Verify configuration # Verify configuration
echo "=== NCCL Configuration ==="
nvidia-smi topo -m
nvidia-smi nvlink -s
echo "=== Verify GDRCOPY ===" echo "=== Verify GDRCOPY ==="
gdrcopy_copybw gdrcopy_copybw
echo "=== Verify NVSHMEM ===" echo "=== Verify NVSHMEM ==="
......
...@@ -45,6 +45,7 @@ class TestDeepseek(CustomTestCase): ...@@ -45,6 +45,7 @@ class TestDeepseek(CustomTestCase):
"256", "256",
"--max-running-requests", "--max-running-requests",
"2048", "2048",
"--disable-radix-cache",
], ],
) )
...@@ -54,10 +55,10 @@ class TestDeepseek(CustomTestCase): ...@@ -54,10 +55,10 @@ class TestDeepseek(CustomTestCase):
def test_gsm8k(self): def test_gsm8k(self):
args = SimpleNamespace( args = SimpleNamespace(
num_shots=8, num_shots=5,
data_path=None, data_path=None,
num_questions=1250, num_questions=1200,
parallel=1250, parallel=1200,
max_new_tokens=512, max_new_tokens=512,
host="http://127.0.0.1", host="http://127.0.0.1",
port=int(self.base_url.split(":")[-1]), port=int(self.base_url.split(":")[-1]),
...@@ -65,7 +66,7 @@ class TestDeepseek(CustomTestCase): ...@@ -65,7 +66,7 @@ class TestDeepseek(CustomTestCase):
metrics = run_eval_few_shot_gsm8k(args) metrics = run_eval_few_shot_gsm8k(args)
print(f"Eval accuracy of GSM8K: {metrics=}") print(f"Eval accuracy of GSM8K: {metrics=}")
self.assertGreater(metrics["accuracy"], 0.93) self.assertGreater(metrics["accuracy"], 0.92)
class TestDeepseekMTP(CustomTestCase): class TestDeepseekMTP(CustomTestCase):
...@@ -107,6 +108,7 @@ class TestDeepseekMTP(CustomTestCase): ...@@ -107,6 +108,7 @@ class TestDeepseekMTP(CustomTestCase):
"1", "1",
"--speculative-num-draft-tokens", "--speculative-num-draft-tokens",
"2", "2",
"--disable-radix-cache",
], ],
) )
...@@ -116,10 +118,10 @@ class TestDeepseekMTP(CustomTestCase): ...@@ -116,10 +118,10 @@ class TestDeepseekMTP(CustomTestCase):
def test_gsm8k(self): def test_gsm8k(self):
args = SimpleNamespace( args = SimpleNamespace(
num_shots=8, num_shots=5,
data_path=None, data_path=None,
num_questions=1250, num_questions=1200,
parallel=1250, parallel=1200,
max_new_tokens=512, max_new_tokens=512,
host="http://127.0.0.1", host="http://127.0.0.1",
port=int(self.base_url.split(":")[-1]), port=int(self.base_url.split(":")[-1]),
...@@ -127,7 +129,7 @@ class TestDeepseekMTP(CustomTestCase): ...@@ -127,7 +129,7 @@ class TestDeepseekMTP(CustomTestCase):
metrics = run_eval_few_shot_gsm8k(args) metrics = run_eval_few_shot_gsm8k(args)
print(f"Eval accuracy of GSM8K: {metrics=}") print(f"Eval accuracy of GSM8K: {metrics=}")
self.assertGreater(metrics["accuracy"], 0.93) self.assertGreater(metrics["accuracy"], 0.92)
server_info = requests.get(self.base_url + "/get_server_info") server_info = requests.get(self.base_url + "/get_server_info")
avg_spec_accept_length = server_info.json()["internal_states"][0][ avg_spec_accept_length = server_info.json()["internal_states"][0][
...@@ -138,7 +140,7 @@ class TestDeepseekMTP(CustomTestCase): ...@@ -138,7 +140,7 @@ class TestDeepseekMTP(CustomTestCase):
f"accuracy={metrics['accuracy']=:.3f}\n" f"accuracy={metrics['accuracy']=:.3f}\n"
f"{avg_spec_accept_length=:.3f}\n" f"{avg_spec_accept_length=:.3f}\n"
) )
self.assertGreater(avg_spec_accept_length, 1.9) self.assertGreater(avg_spec_accept_length, 1.85)
if __name__ == "__main__": if __name__ == "__main__":
......
...@@ -36,6 +36,8 @@ class TestPureDP(CustomTestCase): ...@@ -36,6 +36,8 @@ class TestPureDP(CustomTestCase):
"128", "128",
"--max-running-requests", "--max-running-requests",
"128", "128",
"--mem-fraction-static",
"0.5",
], ],
) )
...@@ -56,7 +58,7 @@ class TestPureDP(CustomTestCase): ...@@ -56,7 +58,7 @@ class TestPureDP(CustomTestCase):
metrics = run_eval_few_shot_gsm8k(args) metrics = run_eval_few_shot_gsm8k(args)
print(metrics) print(metrics)
self.assertGreater(metrics["accuracy"], 0.62) self.assertGreater(metrics["accuracy"], 0.60)
class TestHybridDPTP(CustomTestCase): class TestHybridDPTP(CustomTestCase):
...@@ -100,7 +102,7 @@ class TestHybridDPTP(CustomTestCase): ...@@ -100,7 +102,7 @@ class TestHybridDPTP(CustomTestCase):
metrics = run_eval_few_shot_gsm8k(args) metrics = run_eval_few_shot_gsm8k(args)
print(metrics) print(metrics)
self.assertGreater(metrics["accuracy"], 0.62) self.assertGreater(metrics["accuracy"], 0.60)
class TestTP(CustomTestCase): class TestTP(CustomTestCase):
...@@ -141,10 +143,10 @@ class TestTP(CustomTestCase): ...@@ -141,10 +143,10 @@ class TestTP(CustomTestCase):
metrics = run_eval_few_shot_gsm8k(args) metrics = run_eval_few_shot_gsm8k(args)
print(metrics) print(metrics)
self.assertGreater(metrics["accuracy"], 0.62) self.assertGreater(metrics["accuracy"], 0.60)
# @unittest.skip("covered in test_deepep_large.py") @unittest.skip("covered in test_deepep_large.py")
class TestNoGatherdBuffer(CustomTestCase): class TestNoGatherdBuffer(CustomTestCase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
...@@ -189,7 +191,7 @@ class TestNoGatherdBuffer(CustomTestCase): ...@@ -189,7 +191,7 @@ class TestNoGatherdBuffer(CustomTestCase):
metrics = run_eval_few_shot_gsm8k(args) metrics = run_eval_few_shot_gsm8k(args)
print(metrics) print(metrics)
self.assertGreater(metrics["accuracy"], 0.62) self.assertGreater(metrics["accuracy"], 0.60)
class TestTBO(CustomTestCase): class TestTBO(CustomTestCase):
...@@ -236,10 +238,10 @@ class TestTBO(CustomTestCase): ...@@ -236,10 +238,10 @@ class TestTBO(CustomTestCase):
metrics = run_eval_few_shot_gsm8k(args) metrics = run_eval_few_shot_gsm8k(args)
print(metrics) print(metrics)
self.assertGreater(metrics["accuracy"], 0.62) self.assertGreater(metrics["accuracy"], 0.60)
# @unittest.skip("covered in TestMTPWithTBO") @unittest.skip("covered in TestMTPWithTBO")
class TestMTP(CustomTestCase): class TestMTP(CustomTestCase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
...@@ -280,8 +282,6 @@ class TestMTP(CustomTestCase): ...@@ -280,8 +282,6 @@ class TestMTP(CustomTestCase):
kill_process_tree(cls.process.pid) kill_process_tree(cls.process.pid)
def test_gsm8k(self): def test_gsm8k(self):
requests.get(self.base_url + "/flush_cache")
args = SimpleNamespace( args = SimpleNamespace(
num_shots=5, num_shots=5,
data_path=None, data_path=None,
...@@ -352,8 +352,6 @@ class TestMTPWithTBO(CustomTestCase): ...@@ -352,8 +352,6 @@ class TestMTPWithTBO(CustomTestCase):
kill_process_tree(cls.process.pid) kill_process_tree(cls.process.pid)
def test_gsm8k(self): def test_gsm8k(self):
requests.get(self.base_url + "/flush_cache")
args = SimpleNamespace( args = SimpleNamespace(
num_shots=5, num_shots=5,
data_path=None, data_path=None,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment