Unverified Commit 0c0779d6 authored by Mick, committed by GitHub

ci: improve nightly-ci (#11385)

parent a55cf530
@@ -62,7 +62,7 @@ jobs:
   nightly-test-eval-vlms:
     if: github.repository == 'sgl-project/sglang'
-    runs-on: 1-gpu-runner
+    runs-on: 2-gpu-runner
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -79,7 +79,7 @@ jobs:
   nightly-test-perf-vlms:
     if: github.repository == 'sgl-project/sglang'
-    runs-on: 1-gpu-runner
+    runs-on: 2-gpu-runner
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
...
@@ -25,8 +25,10 @@ from typing import List, Optional, Tuple
 import numpy as np
 import requests
 from pydantic import BaseModel
+from transformers import AutoProcessor, PreTrainedTokenizer

 from sglang.bench_serving import (
+    get_processor,
     get_tokenizer,
     sample_mmmu_requests,
     sample_random_requests,
@@ -104,8 +106,14 @@ Note: To view the traces through perfetto-ui, please:
         if self.profile_links.extend or self.profile_links.decode:
             # Create a combined link or use the first available one
             trace_files = [self.profile_links.extend, self.profile_links.decode]
+            if any(trace_file is None for trace_file in trace_files):
+                logger.error("Some trace files are None", f"{trace_files=}")
             trace_files_relay_links = [
+                (
                     f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
+                    if trace_file
+                    else "N/A"
+                )
                 for trace_file in trace_files
             ]
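The parentheses added above turn the f-string into a per-element conditional expression, so a missing trace file renders as "N/A" instead of being passed to get_perfetto_relay_link_from_trace_file. A minimal sketch of the pattern in isolation (the file name and relay URL are placeholders, not values from the patch):

```python
trace_files = ["extend.trace.json.gz", None]  # placeholder trace paths

trace_links = [
    (f"[trace](https://relay.example/?src={trace_file})" if trace_file else "N/A")
    for trace_file in trace_files
]
assert trace_links == [
    "[trace](https://relay.example/?src=extend.trace.json.gz)",
    "N/A",
]
```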
@@ -114,10 +122,8 @@ Note: To view the traces through perfetto-ui, please:
         # Build the row
         return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"

-    @classmethod
-    def generate_markdown_report(
-        cls, trace_dir, results: List["BenchmarkResult"]
-    ) -> str:
+def generate_markdown_report(trace_dir, results: List["BenchmarkResult"]) -> str:
     """Generate a markdown report from a list of BenchmarkResult object from a single run."""
     import os
@@ -131,10 +137,13 @@ Note: To view the traces through perfetto-ui, please:
     # all results should share the same isl & osl
     for result in results:
-        base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
-        relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
-        relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
-        # base_url = "https://github.com/sgl-project/ci-data/traces"
+        base_url = os.getenv(
+            "TRACE_BASE_URL", "https://github.com/sgl-project/ci-data/traces"
+        ).rstrip("/")
+        relay_base = os.getenv(
+            "PERFETTO_RELAY_URL",
+            "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html",
+        ).rstrip("/")
         summary += result.to_markdown_row(trace_dir, base_url, relay_base)
     return summary
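With generate_markdown_report demoted from a classmethod to a module-level function, callers now import it directly from sglang.bench_one_batch_server (as the updated tests below do). A minimal usage sketch, with the trace directory path chosen only for illustration:

```python
from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report

results: list[BenchmarkResult] = []  # filled by a benchmark run
report = generate_markdown_report("/tmp/sglang_traces", results)
print(report)  # markdown table, one row per BenchmarkResult
```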
@@ -288,7 +297,7 @@ def run_one_case(
     input_len_step_percentage: float,
     run_name: str,
     result_filename: str,
-    tokenizer,
+    tokenizer: PreTrainedTokenizer | AutoProcessor,
     dataset_name="",
     profile: bool = False,
     profile_steps: int = 3,
@@ -302,9 +311,8 @@ def run_one_case(
     if dataset_name == "mmmu":
         input_requests = sample_mmmu_requests(
             num_requests=batch_size,
-            tokenizer=tokenizer,
+            processor=tokenizer,
             fixed_output_len=output_len,
-            apply_chat_template=True,
             random_sample=False,
         )
     elif dataset_name == "random":
@@ -364,6 +372,8 @@ def run_one_case(
     if dataset_name == "mmmu":
         # vlm
         input_ids = []
+        # for vlms, tokenizer is an instance of AutoProcessor
+        tokenizer = tokenizer.tokenizer
         for input_req in input_requests:
             input_ids += [tokenizer.encode(input_req.prompt)]
         payload["image_data"] = [req.image_data for req in input_requests]
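For the mmmu dataset the tokenizer argument is now an AutoProcessor, and the code above unwraps its inner tokenizer before calling encode(). A hedged sketch of what that unwrapping relies on (the model name is only an example; most Hugging Face multimodal processors expose a .tokenizer attribute, though this is not guaranteed for every processor class):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")
tokenizer = processor.tokenizer  # the wrapped text tokenizer
input_ids = tokenizer.encode("Describe the image.")
print(len(input_ids))
```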
@@ -609,6 +619,11 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
         tokenizer_path = server_info["tokenizer_path"]
     elif "prefill" in server_info:
         tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
-    tokenizer = get_tokenizer(tokenizer_path)
+    if bench_args.dataset_name == "mmmu":
+        # mmmu implies this is a MLLM
+        tokenizer = get_processor(tokenizer_path)
+    else:
+        tokenizer = get_tokenizer(tokenizer_path)

     # warmup
...
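The run_benchmark change reduces to a small dispatch on the dataset name: mmmu implies a multimodal model, so a processor is loaded instead of a plain tokenizer. A sketch of the same logic as a standalone helper (the helper name is hypothetical; get_processor and get_tokenizer are the sglang.bench_serving utilities imported above):

```python
from sglang.bench_serving import get_processor, get_tokenizer


def load_tokenizer_or_processor(tokenizer_path: str, dataset_name: str):
    """Hypothetical helper mirroring the dispatch added to run_benchmark."""
    if dataset_name == "mmmu":
        # mmmu implies a multimodal model; the processor wraps the tokenizer.
        return get_processor(tokenizer_path)
    return get_tokenizer(tokenizer_path)
```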
@@ -12,7 +12,6 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
 import argparse
 import asyncio
-import base64
 import io
 import json
 import os
@@ -671,7 +670,7 @@ def get_processor(
     if pretrained_model_name_or_path.endswith(
         ".json"
     ) or pretrained_model_name_or_path.endswith(".model"):
-        from sglang.srt.hf_transformers_utils import get_processor
+        from sglang.srt.utils.hf_transformers_utils import get_processor

         return get_processor(pretrained_model_name_or_path)
@@ -935,7 +934,7 @@ async def get_mooncake_request_over_time(
     for i in range(num_rounds):
         # Add user query for the current round
         chat_history.append(
-            {"role": "user", "content": f"Round {i+1}: {user_query_base}"}
+            {"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
         )
         # Form the full prompt from history
@@ -964,7 +963,7 @@ async def get_mooncake_request_over_time(
 def sample_mmmu_requests(
     num_requests: int,
-    processor: AutoProcessor,
+    processor: AutoProcessor | AutoTokenizer,
     fixed_output_len: Optional[int] = None,
     random_sample: bool = True,
 ) -> List[DatasetRow]:
@@ -973,9 +972,7 @@ def sample_mmmu_requests(
     Args:
         num_requests: Number of requests to sample.
-        tokenizer: Tokenizer to use for token counting.
         fixed_output_len: If provided, use this fixed output length for all requests.
-        apply_chat_template: Whether to apply the chat template to the prompt.
         random_sample: Whether to randomly sample or take the first N.

     Returns:
@@ -1282,11 +1279,11 @@ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
     )

-def create_mm_data_row(text_prompt, images, images_base64, output_len, processor):
+def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
     try:
         content_items = [
-            {"type": "image_url", "image_url": {"url": img_url}}
-            for img_url in images_base64
+            {"type": "image", "image": {"url": image_base64}}
+            for image_base64 in images_base64
         ]
         content_items.append({"type": "text", "text": text_prompt})
         prompt_str = processor.apply_chat_template(
@@ -1294,7 +1291,9 @@ def create_mm_data_row(text_prompt, images, images_base64, output_len, processor
             add_generation_prompt=True,
             tokenize=False,
         )
-    except Exception:
+    except Exception as e:
+        # Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
+        print(f"Error applying chat template: {e}, fallback to <image> tag")
         # Some tokenizers do not support list content; fall back to a placeholder in the text
         prompt_str = f"<image>{text_prompt}"
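create_mm_data_row now emits {"type": "image", ...} content entries and logs the exception before falling back to an <image> tag. A hedged sketch of the call, assuming a processor whose chat template accepts list-valued content (the model name and base64 payload are placeholders):

```python
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")  # example only
text_prompt = "What is shown in the image?"
images_base64 = ["data:image/jpeg;base64,...placeholder..."]

content_items = [{"type": "image", "image": {"url": b64}} for b64 in images_base64]
content_items.append({"type": "text", "text": text_prompt})
try:
    prompt_str = processor.apply_chat_template(
        [{"role": "user", "content": content_items}],
        add_generation_prompt=True,
        tokenize=False,
    )
except Exception as e:
    # Some tokenizers (e.g. InternVL) reject list content; fall back to an <image> tag.
    print(f"Error applying chat template: {e}, fallback to <image> tag")
    prompt_str = f"<image>{text_prompt}"
```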
@@ -1425,7 +1424,7 @@ def sample_image_requests(
     print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
     print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
     print(
-        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes//num_requests} bytes per request"
+        f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
     )
     return dataset
...
@@ -3,7 +3,7 @@ import subprocess
 import time
 import unittest

-from sglang.bench_one_batch_server import BenchmarkResult
+from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
 from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -41,7 +41,7 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):
     def test_bench_one_batch(self):
         all_benchmark_results = []
+        all_model_succeed = True
         for model_setup in self.models:
             benchmark_results = []
             with self.subTest(model=model_setup.model_path):
@@ -113,19 +113,21 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):
                         # Clean up JSON file
                         os.remove(json_output_file)
                     else:
+                        all_model_succeed = False
                         print(f"Warning: JSON output file {json_output_file} not found")
                 finally:
                     kill_process_tree(process.pid)

-            report_part = BenchmarkResult.generate_markdown_report(
-                PROFILE_DIR, benchmark_results
-            )
+            report_part = generate_markdown_report(PROFILE_DIR, benchmark_results)
            self.full_report += report_part + "\n"

        if is_in_ci():
            write_github_step_summary(self.full_report)

+        if not all_model_succeed:
+            raise AssertionError("Some models failed the perf tests.")


 if __name__ == "__main__":
     unittest.main()
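The new all_model_succeed flag changes the failure mode: a missing JSON result no longer slips through silently, but the run still covers every model before the test is failed. The pattern, reduced to a sketch with hypothetical helper names:

```python
def run_perf_suite(models, bench_one_model) -> None:
    """Hypothetical reduction of the test flow: keep going, fail once at the end."""
    all_model_succeed = True
    for model in models:
        if not bench_one_model(model):  # e.g. JSON output file not found
            all_model_succeed = False
            print(f"Warning: no benchmark results for {model}")
    if not all_model_succeed:
        raise AssertionError("Some models failed the perf tests.")
```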
 import json
 import unittest
 import warnings
-from functools import partial
 from types import SimpleNamespace

 from sglang.srt.utils import kill_process_tree
@@ -26,16 +25,19 @@ MODEL_THRESHOLDS = {
         "Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
     ): ModelEvalMetrics(0.305, 23.8),
     ModelLaunchSettings("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
-    ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
+    ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 17.7),
     ModelLaunchSettings("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
     ModelLaunchSettings("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(
         0.330, 22.3
     ),
     ModelLaunchSettings("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
-    ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
-    ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
+    ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.259, 36.3),
+    ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 17.0),
     ModelLaunchSettings("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
     ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
+    ModelLaunchSettings(
+        "Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]
+    ): ModelEvalMetrics(0.29, 29.1),
     ModelLaunchSettings(
         "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
     ): ModelEvalMetrics(0.310, 16.7),
...
@@ -3,7 +3,7 @@ import subprocess
 import unittest
 import warnings

-from sglang.bench_one_batch_server import BenchmarkResult
+from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
 from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -27,6 +27,7 @@ MODEL_DEFAULTS = [
     ModelLaunchSettings(
         "google/gemma-3-27b-it",
     ),
+    ModelLaunchSettings("Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]),
     # "OpenGVLab/InternVL2_5-2B",
     # buggy in official transformers impl
     # "openbmb/MiniCPM-V-2_6",
@@ -45,9 +46,7 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
             cls.models = []
             model_paths = parse_models(nightly_vlm_models_str)
             for model_path in model_paths:
-                cls.models.append(
-                    ModelLaunchSettings(model_path, extra_args=VLM_EXTRA_ARGS)
-                )
+                cls.models.append(ModelLaunchSettings(model_path))
         else:
             cls.models = MODEL_DEFAULTS
@@ -60,6 +59,7 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
     def test_bench_one_batch(self):
         all_benchmark_results = []
+        all_model_succeed = True
         for model_setup in self.models:
             benchmark_results = []
@@ -112,7 +112,6 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
                         f"Error running benchmark for {model_setup.model_path} with batch size:"
                     )
                     print(result.stderr)
-                    # Continue to next batch size even if one fails
                     continue

                 print(f"Output for {model_setup.model_path} with batch size:")
@@ -136,19 +135,24 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
                     )
                 else:
+                    all_model_succeed = False
                     print(f"Warning: JSON output file {json_output_file} not found")
             finally:
                 kill_process_tree(process.pid)

-            report_part = BenchmarkResult.generate_markdown_report(
-                PROFILE_DIR, benchmark_results
-            )
+            report_part = generate_markdown_report(
+                PROFILE_DIR,
+                benchmark_results,
+            )
            self.full_report += report_part + "\n"

        if is_in_ci():
            write_github_step_summary(self.full_report)

+        if not all_model_succeed:
+            raise AssertionError("Some models failed the perf tests.")


 if __name__ == "__main__":
     unittest.main()