"git@developer.sourcefind.cn:renzhc/diffusers_dcu.git" did not exist on "7aac77affa17b6b504b0a406aacb471c5226b36d"
Unverified commit 03227c5f, authored by Lianmin Zheng, committed by GitHub
Browse files

[CI] Reorganize the 8 gpu tests (#6192)

parent 01bdbf7f
...@@ -92,7 +92,7 @@ jobs: ...@@ -92,7 +92,7 @@ jobs:
unittest-test-backend-8-gpu: unittest-test-backend-8-gpu:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false github.event.pull_request.draft == false
needs: [unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu] needs: [unit-test-frontend, unit-test-backend-2-gpu]
runs-on: 8-gpu-runner runs-on: 8-gpu-runner
steps: steps:
- name: Checkout code - name: Checkout code
...@@ -271,24 +271,6 @@ jobs: ...@@ -271,24 +271,6 @@ jobs:
cd test/srt cd test/srt
python3 test_moe_eval_accuracy_large.py python3 test_moe_eval_accuracy_large.py
unit-test-backend-pd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
runs-on: 8-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
- name: Run test
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_disaggregation.TestDisaggregationMooncake.test_gsm8k
finish: finish:
if: always() if: always()
needs: [ needs: [
......
...@@ -305,6 +305,12 @@ class ServerArgs: ...@@ -305,6 +305,12 @@ class ServerArgs:
if self.grammar_backend is None: if self.grammar_backend is None:
self.grammar_backend = "xgrammar" self.grammar_backend = "xgrammar"
if self.pp_size > 1:
self.disable_overlap_schedule = True
logger.warning(
"Overlap scheduler is disabled because of using pipeline parallelism."
)
# Data parallelism attention # Data parallelism attention
if self.enable_dp_attention: if self.enable_dp_attention:
self.schedule_conservativeness = self.schedule_conservativeness * 0.3 self.schedule_conservativeness = self.schedule_conservativeness * 0.3
......
...@@ -5,25 +5,22 @@ set -euxo pipefail ...@@ -5,25 +5,22 @@ set -euxo pipefail
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
bash "${SCRIPT_DIR}/killall_sglang.sh" bash "${SCRIPT_DIR}/killall_sglang.sh"
# Update pip
pip install --upgrade pip
# Clean up existing installations # Clean up existing installations
pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm || true pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm
pip cache purge pip cache purge
rm -rf /root/.cache/flashinfer rm -rf /root/.cache/flashinfer
rm -rf /usr/local/lib/python3.10/dist-packages/flashinfer* rm -rf /usr/local/lib/python3.10/dist-packages/flashinfer*
rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel* rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel*
# Update pip
pip install --upgrade pip
# Install sgl-kernel
pip install sgl-kernel==0.1.2.post1 --no-cache-dir
# Install the main package # Install the main package
pip install -e "python[all]" pip install -e "python[all]"
# Install additional dependencies # Install additional dependencies
pip install torch_memory_saver pip install torch_memory_saver
pip install transformers==4.51.0 sentence_transformers accelerate peft pandas datasets timm torchaudio==2.6.0 pip install transformers==4.51.0 timm torchaudio==2.6.0 sentence_transformers accelerate peft pandas datasets mooncake-transfer-engine
# For compiling xgrammar kernels # For compiling xgrammar kernels
pip install cuda-python nvidia-cuda-nvrtc-cu12 pip install cuda-python nvidia-cuda-nvrtc-cu12
......
...@@ -85,9 +85,6 @@ suites = { ...@@ -85,9 +85,6 @@ suites = {
TestFile("test_w8a8_quantization.py", 46), TestFile("test_w8a8_quantization.py", 46),
TestFile("models/lora/test_lora_cuda_graph.py", 250), TestFile("models/lora/test_lora_cuda_graph.py", 250),
], ],
"per-commit-pd": [
TestFile("test_disaggregation.py", 90),
],
"per-commit-2-gpu": [ "per-commit-2-gpu": [
TestFile("models/lora/test_lora_tp.py", 116), TestFile("models/lora/test_lora_tp.py", 116),
TestFile("test_data_parallelism.py", 73), TestFile("test_data_parallelism.py", 73),
...@@ -105,6 +102,7 @@ suites = { ...@@ -105,6 +102,7 @@ suites = {
# TestFile("test_deepep_low_latency.py", 50), # TestFile("test_deepep_low_latency.py", 50),
# TestFile("test_moe_deepep_eval_accuracy_large.py", 250), # TestFile("test_moe_deepep_eval_accuracy_large.py", 250),
TestFile("test_local_attn.py", 250), TestFile("test_local_attn.py", 250),
TestFile("test_disaggregation.py", 90),
TestFile("test_full_deepseek_v3.py", 250), TestFile("test_full_deepseek_v3.py", 250),
TestFile("test_pp_single_node.py", 150), TestFile("test_pp_single_node.py", 150),
], ],
......
import subprocess import subprocess
import threading
import time import time
import unittest import unittest
from types import SimpleNamespace from types import SimpleNamespace
import requests import requests
import torch
from sglang.srt.utils import kill_process_tree from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
......
...@@ -9,13 +9,10 @@ import time ...@@ -9,13 +9,10 @@ import time
import unittest import unittest
from types import SimpleNamespace from types import SimpleNamespace
import requests
from sglang.bench_one_batch_server import BenchArgs as OneBatchBenchArgs from sglang.bench_one_batch_server import BenchArgs as OneBatchBenchArgs
from sglang.srt.server_args import ServerArgs from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import kill_process_tree from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.runners import DEFAULT_PROMPTS
from sglang.test.test_utils import ( from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
...@@ -28,17 +25,16 @@ from sglang.test.test_utils import ( ...@@ -28,17 +25,16 @@ from sglang.test.test_utils import (
class TestPPAccuracy(unittest.TestCase): class TestPPAccuracy(unittest.TestCase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
# These config helps find a leak.
os.environ["SGLANG_IS_IN_CI"] = "1"
cls.base_url = "http://127.0.0.1:23333" cls.base_url = "http://127.0.0.1:23333"
cls.process = popen_launch_server( cls.process = popen_launch_server(
DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_MODEL_NAME_FOR_TEST,
cls.base_url, cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=[ other_args=[
"--tp-size",
2,
"--pp-size", "--pp-size",
4, 4,
"--disable-overlap-schedule",
"--chunked-prefill-size", "--chunked-prefill-size",
256, 256,
], ],
...@@ -66,49 +62,6 @@ class TestPPAccuracy(unittest.TestCase): ...@@ -66,49 +62,6 @@ class TestPPAccuracy(unittest.TestCase):
time.sleep(5) time.sleep(5)
# class TestPPAccuracyFlashInfer(unittest.TestCase):
# @classmethod
# def setUpClass(cls):
# # These config helps find a leak.
# os.environ["SGLANG_IS_IN_CI"] = "1"
# cls.base_url = "http://127.0.0.1:23333"
# cls.process = popen_launch_server(
# DEFAULT_MODEL_NAME_FOR_TEST,
# cls.base_url,
# timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
# other_args=[
# "--pp-size",
# 4,
# "--disable-overlap-schedule",
# "--attention-backend",
# "flashinfer",
# "--chunked-prefill-size",
# 256,
# ],
# )
#
# @classmethod
# def tearDownClass(cls):
# kill_process_tree(cls.process.pid)
#
# def test_gsm8k(self):
# args = SimpleNamespace(
# num_shots=5,
# data_path=None,
# num_questions=200,
# max_new_tokens=512,
# parallel=128,
# host="http://127.0.0.1",
# port=int(self.base_url.split(":")[-1]),
# )
# metrics = run_eval(args)
# print(f"{metrics=}")
#
# self.assertGreater(metrics["accuracy"], 0.75)
# # Wait a little bit so that the memory check happens.
# time.sleep(5)
class TestFixedBugs(unittest.TestCase): class TestFixedBugs(unittest.TestCase):
def test_chunked_prefill_with_small_bs(self): def test_chunked_prefill_with_small_bs(self):
model = DEFAULT_MODEL_NAME_FOR_TEST model = DEFAULT_MODEL_NAME_FOR_TEST
...@@ -124,7 +77,6 @@ class TestFixedBugs(unittest.TestCase): ...@@ -124,7 +77,6 @@ class TestFixedBugs(unittest.TestCase):
2, 2,
"--pp-size", "--pp-size",
2, 2,
"--disable-overlap-schedule",
"--chunked-prefill", "--chunked-prefill",
256, 256,
"--max-running-requests", "--max-running-requests",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment