Commit 03227c5f (unverified), authored by Lianmin Zheng, committed by GitHub
Browse files

[CI] Reorganize the 8 gpu tests (#6192)

parent 01bdbf7f
......@@ -92,7 +92,7 @@ jobs:
unittest-test-backend-8-gpu:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
needs: [unit-test-frontend, unit-test-backend-1-gpu, unit-test-backend-2-gpu]
needs: [unit-test-frontend, unit-test-backend-2-gpu]
runs-on: 8-gpu-runner
steps:
- name: Checkout code
......@@ -271,24 +271,6 @@ jobs:
cd test/srt
python3 test_moe_eval_accuracy_large.py
unit-test-backend-pd:
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') &&
github.event.pull_request.draft == false
runs-on: 8-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
- name: Install dependencies
run: |
bash scripts/ci_install_dependency.sh
- name: Run test
timeout-minutes: 10
run: |
cd test/srt
python3 -m unittest test_disaggregation.TestDisaggregationMooncake.test_gsm8k
finish:
if: always()
needs: [
......
......@@ -305,6 +305,12 @@ class ServerArgs:
if self.grammar_backend is None:
self.grammar_backend = "xgrammar"
if self.pp_size > 1:
self.disable_overlap_schedule = True
logger.warning(
"Overlap scheduler is disabled because of using pipeline parallelism."
)
# Data parallelism attention
if self.enable_dp_attention:
self.schedule_conservativeness = self.schedule_conservativeness * 0.3
......
......@@ -5,25 +5,22 @@ set -euxo pipefail
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
bash "${SCRIPT_DIR}/killall_sglang.sh"
# Update pip
pip install --upgrade pip
# Clean up existing installations
pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm || true
pip uninstall -y flashinfer flashinfer_python sgl-kernel sglang vllm
pip cache purge
rm -rf /root/.cache/flashinfer
rm -rf /usr/local/lib/python3.10/dist-packages/flashinfer*
rm -rf /usr/local/lib/python3.10/dist-packages/sgl_kernel*
# Update pip
pip install --upgrade pip
# Install sgl-kernel
pip install sgl-kernel==0.1.2.post1 --no-cache-dir
# Install the main package
pip install -e "python[all]"
# Install additional dependencies
pip install torch_memory_saver
pip install transformers==4.51.0 sentence_transformers accelerate peft pandas datasets timm torchaudio==2.6.0
pip install transformers==4.51.0 timm torchaudio==2.6.0 sentence_transformers accelerate peft pandas datasets mooncake-transfer-engine
# For compiling xgrammar kernels
pip install cuda-python nvidia-cuda-nvrtc-cu12
......
......@@ -85,9 +85,6 @@ suites = {
TestFile("test_w8a8_quantization.py", 46),
TestFile("models/lora/test_lora_cuda_graph.py", 250),
],
"per-commit-pd": [
TestFile("test_disaggregation.py", 90),
],
"per-commit-2-gpu": [
TestFile("models/lora/test_lora_tp.py", 116),
TestFile("test_data_parallelism.py", 73),
......@@ -105,6 +102,7 @@ suites = {
# TestFile("test_deepep_low_latency.py", 50),
# TestFile("test_moe_deepep_eval_accuracy_large.py", 250),
TestFile("test_local_attn.py", 250),
TestFile("test_disaggregation.py", 90),
TestFile("test_full_deepseek_v3.py", 250),
TestFile("test_pp_single_node.py", 150),
],
......
import subprocess
import threading
import time
import unittest
from types import SimpleNamespace
import requests
import torch
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval as run_eval_few_shot_gsm8k
......
......@@ -9,13 +9,10 @@ import time
import unittest
from types import SimpleNamespace
import requests
from sglang.bench_one_batch_server import BenchArgs as OneBatchBenchArgs
from sglang.srt.server_args import ServerArgs
from sglang.srt.utils import kill_process_tree
from sglang.test.few_shot_gsm8k import run_eval
from sglang.test.runners import DEFAULT_PROMPTS
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
......@@ -28,17 +25,16 @@ from sglang.test.test_utils import (
class TestPPAccuracy(unittest.TestCase):
@classmethod
def setUpClass(cls):
# This config helps find a leak.
os.environ["SGLANG_IS_IN_CI"] = "1"
cls.base_url = "http://127.0.0.1:23333"
cls.process = popen_launch_server(
DEFAULT_MODEL_NAME_FOR_TEST,
cls.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
other_args=[
"--tp-size",
2,
"--pp-size",
4,
"--disable-overlap-schedule",
"--chunked-prefill-size",
256,
],
......@@ -66,49 +62,6 @@ class TestPPAccuracy(unittest.TestCase):
time.sleep(5)
# class TestPPAccuracyFlashInfer(unittest.TestCase):
# @classmethod
# def setUpClass(cls):
# # These config helps find a leak.
# os.environ["SGLANG_IS_IN_CI"] = "1"
# cls.base_url = "http://127.0.0.1:23333"
# cls.process = popen_launch_server(
# DEFAULT_MODEL_NAME_FOR_TEST,
# cls.base_url,
# timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
# other_args=[
# "--pp-size",
# 4,
# "--disable-overlap-schedule",
# "--attention-backend",
# "flashinfer",
# "--chunked-prefill-size",
# 256,
# ],
# )
#
# @classmethod
# def tearDownClass(cls):
# kill_process_tree(cls.process.pid)
#
# def test_gsm8k(self):
# args = SimpleNamespace(
# num_shots=5,
# data_path=None,
# num_questions=200,
# max_new_tokens=512,
# parallel=128,
# host="http://127.0.0.1",
# port=int(self.base_url.split(":")[-1]),
# )
# metrics = run_eval(args)
# print(f"{metrics=}")
#
# self.assertGreater(metrics["accuracy"], 0.75)
# # Wait a little bit so that the memory check happens.
# time.sleep(5)
class TestFixedBugs(unittest.TestCase):
def test_chunked_prefill_with_small_bs(self):
model = DEFAULT_MODEL_NAME_FOR_TEST
......@@ -124,7 +77,6 @@ class TestFixedBugs(unittest.TestCase):
2,
"--pp-size",
2,
"--disable-overlap-schedule",
"--chunked-prefill",
256,
"--max-running-requests",
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment