Unverified commit 0c0779d6 authored by Mick, committed by GitHub

ci: improve nightly-ci (#11385)

parent a55cf530
@@ -62,7 +62,7 @@ jobs:
nightly-test-eval-vlms:
if: github.repository == 'sgl-project/sglang'
runs-on: 1-gpu-runner
runs-on: 2-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
@@ -79,7 +79,7 @@ jobs:
nightly-test-perf-vlms:
if: github.repository == 'sgl-project/sglang'
runs-on: 1-gpu-runner
runs-on: 2-gpu-runner
steps:
- name: Checkout code
uses: actions/checkout@v4
@@ -25,8 +25,10 @@ from typing import List, Optional, Tuple
import numpy as np
import requests
from pydantic import BaseModel
from transformers import AutoProcessor, PreTrainedTokenizer
from sglang.bench_serving import (
get_processor,
get_tokenizer,
sample_mmmu_requests,
sample_random_requests,
@@ -104,8 +106,14 @@ Note: To view the traces through perfetto-ui, please:
if self.profile_links.extend or self.profile_links.decode:
# Create a combined link or use the first available one
trace_files = [self.profile_links.extend, self.profile_links.decode]
if any(trace_file is None for trace_file in trace_files):
logger.error("Some trace files are None", f"{trace_files=}")
trace_files_relay_links = [
f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
(
f"[trace]({get_perfetto_relay_link_from_trace_file(trace_file)})"
if trace_file
else "N/A"
)
for trace_file in trace_files
]
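The hunk above guards against missing profile traces before building Perfetto links, logging an error and emitting "N/A" cells instead of broken links. A minimal standalone sketch of the same pattern; `relay_link` is a hypothetical stand-in for `get_perfetto_relay_link_from_trace_file`:

```python
import logging
from typing import List, Optional

logger = logging.getLogger(__name__)


def relay_link(trace_file: str) -> str:
    # Hypothetical stand-in for get_perfetto_relay_link_from_trace_file.
    return f"https://example.invalid/perfetto?trace={trace_file}"


def trace_cells(extend: Optional[str], decode: Optional[str]) -> List[str]:
    """Render one markdown cell per trace, using "N/A" when a trace is missing."""
    trace_files = [extend, decode]
    if any(trace_file is None for trace_file in trace_files):
        logger.error("Some trace files are None: %s", trace_files)
    return [
        f"[trace]({relay_link(trace_file)})" if trace_file else "N/A"
        for trace_file in trace_files
    ]


print(trace_cells("extend.json.gz", None))  # ['[trace](https://...)', 'N/A']
```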
@@ -114,30 +122,31 @@ Note: To view the traces through perfetto-ui, please:
# Build the row
return f"| {self.batch_size} | {self.input_len} | {self.latency:.2f} | {self.input_throughput:.2f} | {self.output_throughput:.2f} | {accept_length} | {itl:.2f} | {input_cost:.2f} | {output_cost:.2f} | {profile_link} |\n"
@classmethod
def generate_markdown_report(
cls, trace_dir, results: List["BenchmarkResult"]
) -> str:
"""Generate a markdown report from a list of BenchmarkResult object from a single run."""
import os
summary = f"### {results[0].model_path}\n"
def generate_markdown_report(trace_dir, results: List["BenchmarkResult"]) -> str:
"""Generate a markdown report from a list of BenchmarkResult object from a single run."""
import os
summary = f"### {results[0].model_path}\n"
# summary += (
# f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
# )
summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
# summary += (
# f"Input lens: {result.input_len}. Output lens: {result.output_len}.\n"
# )
summary += "| batch size | input len | latency (s) | input throughput (tok/s) | output throughput (tok/s) | acc length | ITL (ms) | input cost ($/1M) | output cost ($/1M) | profile (extend) | profile (decode)|\n"
summary += "| ---------- | --------- | ----------- | ------------------------- | ------------------------- | ---------- | -------- | ----------------- | ------------------ | --------------- | -------------- |\n"
# all results should share the same isl & osl
for result in results:
base_url = os.getenv("TRACE_BASE_URL", "").rstrip("/")
relay_base = os.getenv("PERFETTO_RELAY_URL", "").rstrip("/")
relay_base = "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html"
# base_url = "https://github.com/sgl-project/ci-data/traces"
summary += result.to_markdown_row(trace_dir, base_url, relay_base)
# all results should share the same isl & osl
for result in results:
base_url = os.getenv(
"TRACE_BASE_URL", "https://github.com/sgl-project/ci-data/traces"
).rstrip("/")
relay_base = os.getenv(
"PERFETTO_RELAY_URL",
"https://docs.sglang.ai/ci-data/pages/perfetto_relay.html",
).rstrip("/")
summary += result.to_markdown_row(trace_dir, base_url, relay_base)
return summary
return summary
@dataclasses.dataclass
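In the hunk above, `generate_markdown_report` becomes a module-level function (the test files later in this diff import it directly instead of calling `BenchmarkResult.generate_markdown_report`), and the trace/relay base URLs now come from environment variables with the previously hardcoded values as defaults. A rough sketch of just that configuration pattern:

```python
import os
from typing import Tuple


def resolve_report_urls() -> Tuple[str, str]:
    # Environment variables win; otherwise fall back to the defaults used in the diff.
    base_url = os.getenv(
        "TRACE_BASE_URL", "https://github.com/sgl-project/ci-data/traces"
    ).rstrip("/")
    relay_base = os.getenv(
        "PERFETTO_RELAY_URL",
        "https://docs.sglang.ai/ci-data/pages/perfetto_relay.html",
    ).rstrip("/")
    return base_url, relay_base


print(resolve_report_urls())
```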
@@ -288,7 +297,7 @@ def run_one_case(
input_len_step_percentage: float,
run_name: str,
result_filename: str,
tokenizer,
tokenizer: PreTrainedTokenizer | AutoProcessor,
dataset_name="",
profile: bool = False,
profile_steps: int = 3,
@@ -302,9 +311,8 @@ def run_one_case(
if dataset_name == "mmmu":
input_requests = sample_mmmu_requests(
num_requests=batch_size,
tokenizer=tokenizer,
processor=tokenizer,
fixed_output_len=output_len,
apply_chat_template=True,
random_sample=False,
)
elif dataset_name == "random":
@@ -364,6 +372,8 @@ def run_one_case(
if dataset_name == "mmmu":
# vlm
input_ids = []
# for vlms, tokenizer is an instance of AutoProcessor
tokenizer = tokenizer.tokenizer
for input_req in input_requests:
input_ids += [tokenizer.encode(input_req.prompt)]
payload["image_data"] = [req.image_data for req in input_requests]
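For the mmmu path the `tokenizer` argument is actually an `AutoProcessor`, so the hunk above reaches for its wrapped `.tokenizer` before encoding prompts. A self-contained sketch of that duck-typed unwrapping, using stub classes rather than a real HuggingFace processor:

```python
from typing import List


class StubTokenizer:
    def encode(self, text: str) -> List[int]:
        # Toy encoding: one fake token id per whitespace-separated word.
        return [len(word) for word in text.split()]


class StubProcessor:
    # Mirrors how transformers multimodal processors expose their text tokenizer.
    def __init__(self) -> None:
        self.tokenizer = StubTokenizer()


def encode_prompts(tokenizer, prompts: List[str]) -> List[List[int]]:
    if hasattr(tokenizer, "tokenizer"):
        # VLM case: we were handed a processor, so drop down to its wrapped tokenizer.
        tokenizer = tokenizer.tokenizer
    return [tokenizer.encode(p) for p in prompts]


print(encode_prompts(StubProcessor(), ["a tiny prompt"]))  # [[1, 4, 6]]
```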
@@ -609,7 +619,12 @@ def run_benchmark(server_args: ServerArgs, bench_args: BenchArgs):
tokenizer_path = server_info["tokenizer_path"]
elif "prefill" in server_info:
tokenizer_path = server_info["prefill"][0]["tokenizer_path"]
tokenizer = get_tokenizer(tokenizer_path)
if bench_args.dataset_name == "mmmu":
# mmmu implies this is a MLLM
tokenizer = get_processor(tokenizer_path)
else:
tokenizer = get_tokenizer(tokenizer_path)
# warmup
if not bench_args.skip_warmup:
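In the `run_benchmark` hunk above, an mmmu dataset implies a multimodal model, so a full processor is loaded instead of a plain tokenizer. A hedged sketch of that selection, with stubs standing in for `sglang.bench_serving.get_processor` / `get_tokenizer`:

```python
from typing import Tuple


def get_processor_stub(path: str) -> Tuple[str, str]:
    # Stand-in for sglang.bench_serving.get_processor.
    return ("processor", path)


def get_tokenizer_stub(path: str) -> Tuple[str, str]:
    # Stand-in for sglang.bench_serving.get_tokenizer.
    return ("tokenizer", path)


def load_prompt_encoder(dataset_name: str, tokenizer_path: str) -> Tuple[str, str]:
    # mmmu is a multimodal benchmark, so the model needs image preprocessing too;
    # every other dataset only needs the text tokenizer.
    if dataset_name == "mmmu":
        return get_processor_stub(tokenizer_path)
    return get_tokenizer_stub(tokenizer_path)


print(load_prompt_encoder("mmmu", "Qwen/Qwen2.5-VL-7B-Instruct"))
print(load_prompt_encoder("random", "meta-llama/Llama-3.1-8B-Instruct"))
```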
@@ -12,7 +12,6 @@ python3 -m sglang.bench_serving --backend sglang --dataset-name random --num-pro
import argparse
import asyncio
import base64
import io
import json
import os
@@ -671,7 +670,7 @@ def get_processor(
if pretrained_model_name_or_path.endswith(
".json"
) or pretrained_model_name_or_path.endswith(".model"):
from sglang.srt.hf_transformers_utils import get_processor
from sglang.srt.utils.hf_transformers_utils import get_processor
return get_processor(pretrained_model_name_or_path)
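The helper module moved from `sglang.srt.hf_transformers_utils` to `sglang.srt.utils.hf_transformers_utils`, and this commit simply updates the import. If a script had to run against both layouts, one defensive option (not part of this diff) would be an import fallback:

```python
try:
    # Newer module layout used by this commit.
    from sglang.srt.utils.hf_transformers_utils import get_processor
except ImportError:
    # Older layout, kept only for backwards compatibility.
    from sglang.srt.hf_transformers_utils import get_processor
```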
@@ -935,7 +934,7 @@ async def get_mooncake_request_over_time(
for i in range(num_rounds):
# Add user query for the current round
chat_history.append(
{"role": "user", "content": f"Round {i+1}: {user_query_base}"}
{"role": "user", "content": f"Round {i + 1}: {user_query_base}"}
)
# Form the full prompt from history
@@ -964,7 +963,7 @@ async def get_mooncake_request_over_time(
def sample_mmmu_requests(
num_requests: int,
processor: AutoProcessor,
processor: AutoProcessor | AutoTokenizer,
fixed_output_len: Optional[int] = None,
random_sample: bool = True,
) -> List[DatasetRow]:
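The new annotation `AutoProcessor | AutoTokenizer` uses PEP 604 union syntax, which is evaluated at function-definition time and therefore needs Python 3.10+ unless the module enables postponed evaluation. An equivalent spelling that also works on older interpreters, shown here only as a compatibility note:

```python
from typing import Union

from transformers import AutoProcessor, AutoTokenizer

# `AutoProcessor | AutoTokenizer` needs Python 3.10+ (or `from __future__ import annotations`);
# the typing.Union form below is equivalent and evaluates on earlier versions as well.
ProcessorOrTokenizer = Union[AutoProcessor, AutoTokenizer]
```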
@@ -973,9 +972,7 @@ def sample_mmmu_requests(
Args:
num_requests: Number of requests to sample.
tokenizer: Tokenizer to use for token counting.
fixed_output_len: If provided, use this fixed output length for all requests.
apply_chat_template: Whether to apply the chat template to the prompt.
random_sample: Whether to randomly sample or take the first N.
Returns:
@@ -1282,11 +1279,11 @@ def parse_image_resolution(image_resolution: str) -> Tuple[int, int]:
)
def create_mm_data_row(text_prompt, images, images_base64, output_len, processor):
def create_mm_data_row(text_prompt, images: list, images_base64, output_len, processor):
try:
content_items = [
{"type": "image_url", "image_url": {"url": img_url}}
for img_url in images_base64
{"type": "image", "image": {"url": image_base64}}
for image_base64 in images_base64
]
content_items.append({"type": "text", "text": text_prompt})
prompt_str = processor.apply_chat_template(
@@ -1294,7 +1291,9 @@ def create_mm_data_row(text_prompt, images, images_base64, output_len, processor
add_generation_prompt=True,
tokenize=False,
)
except Exception:
except Exception as e:
# Note (Xinyuan): This is a workaround for an issue where some tokenizers do not support content as a list. (e.g. InternVL)
print(f"Error applying chat template: {e}, fallback to <image> tag")
# Some tokenizers do not support list content; fall back to a placeholder in the text
prompt_str = f"<image>{text_prompt}"
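`create_mm_data_row` now builds `{"type": "image", ...}` content items and logs the exception before falling back to a bare `<image>` tag for tokenizers that reject list-valued content (e.g. InternVL). A self-contained sketch of that fallback, with a stub processor in place of a real HuggingFace one:

```python
from typing import List


class StrictProcessor:
    """Stub that, like some tokenizers, rejects list-valued message content."""

    def apply_chat_template(self, messages, add_generation_prompt=True, tokenize=False):
        if isinstance(messages[0]["content"], list):
            raise TypeError("content must be a string")
        return f"<s>{messages[0]['content']}</s>"


def build_prompt(processor, text_prompt: str, images_base64: List[str]) -> str:
    content_items = [
        {"type": "image", "image": {"url": image_base64}}
        for image_base64 in images_base64
    ]
    content_items.append({"type": "text", "text": text_prompt})
    try:
        return processor.apply_chat_template(
            [{"role": "user", "content": content_items}],
            add_generation_prompt=True,
            tokenize=False,
        )
    except Exception as e:
        # Fallback for processors that only accept plain-string content.
        print(f"Error applying chat template: {e}, fallback to <image> tag")
        return f"<image>{text_prompt}"


print(build_prompt(StrictProcessor(), "Describe the chart.", ["data:image/png;base64,AAAA"]))
```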
@@ -1425,7 +1424,7 @@ def sample_image_requests(
print(f"#Input tokens: {np.sum([x.prompt_len for x in dataset])}")
print(f"#Output tokens: {np.sum([x.output_len for x in dataset])}")
print(
f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes//num_requests} bytes per request"
f"\nCreated {len(dataset)} {image_content} {image_format} images with average {total_image_bytes // num_requests} bytes per request"
)
return dataset
@@ -3,7 +3,7 @@ import subprocess
import time
import unittest
from sglang.bench_one_batch_server import BenchmarkResult
from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -41,7 +41,7 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):
def test_bench_one_batch(self):
all_benchmark_results = []
all_model_succeed = True
for model_setup in self.models:
benchmark_results = []
with self.subTest(model=model_setup.model_path):
@@ -113,19 +113,21 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):
# Clean up JSON file
os.remove(json_output_file)
else:
all_model_succeed = False
print(f"Warning: JSON output file {json_output_file} not found")
finally:
kill_process_tree(process.pid)
report_part = BenchmarkResult.generate_markdown_report(
PROFILE_DIR, benchmark_results
)
report_part = generate_markdown_report(PROFILE_DIR, benchmark_results)
self.full_report += report_part + "\n"
if is_in_ci():
write_github_step_summary(self.full_report)
if not all_model_succeed:
raise AssertionError("Some models failed the perf tests.")
if __name__ == "__main__":
unittest.main()
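Both nightly perf tests now track `all_model_succeed` and raise only after every model has been benchmarked, so one missing JSON output no longer hides results from the remaining models. A minimal sketch of that accumulate-then-assert pattern, with a hypothetical `bench_one` stub in place of the real benchmark run:

```python
from typing import List


def bench_one(model: str) -> bool:
    # Stub benchmark: pretend every model except one produces its JSON output.
    return model != "broken/model"


def run_all(models: List[str]) -> None:
    all_model_succeed = True
    for model in models:
        if not bench_one(model):
            all_model_succeed = False
            print(f"Warning: benchmark for {model} failed, continuing with the rest")
    # Fail once, at the end, so every model still gets benchmarked and reported.
    if not all_model_succeed:
        raise AssertionError("Some models failed the perf tests.")


run_all(["good/model-a", "good/model-b"])      # passes
# run_all(["good/model-a", "broken/model"])    # would raise AssertionError after both runs
```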
import json
import unittest
import warnings
from functools import partial
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
@@ -26,16 +25,19 @@ MODEL_THRESHOLDS = {
"Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
): ModelEvalMetrics(0.305, 23.8),
ModelLaunchSettings("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 17.7),
ModelLaunchSettings("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
ModelLaunchSettings("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(
0.330, 22.3
),
ModelLaunchSettings("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.259, 36.3),
ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 17.0),
ModelLaunchSettings("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
ModelLaunchSettings(
"Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]
): ModelEvalMetrics(0.29, 29.1),
ModelLaunchSettings(
"unsloth/Mistral-Small-3.1-24B-Instruct-2503"
): ModelEvalMetrics(0.310, 16.7),
@@ -3,7 +3,7 @@ import subprocess
import unittest
import warnings
from sglang.bench_one_batch_server import BenchmarkResult
from sglang.bench_one_batch_server import BenchmarkResult, generate_markdown_report
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
@@ -27,6 +27,7 @@ MODEL_DEFAULTS = [
ModelLaunchSettings(
"google/gemma-3-27b-it",
),
ModelLaunchSettings("Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]),
# "OpenGVLab/InternVL2_5-2B",
# buggy in official transformers impl
# "openbmb/MiniCPM-V-2_6",
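The new `Qwen/Qwen3-VL-30B-A3B-Instruct` entry launches with `--tp=2`, which is presumably why the two VLM nightly jobs above moved from `1-gpu-runner` to `2-gpu-runner`. A hedged sketch of how such extra args can be folded into a server launch command; this is only an illustration, not the actual `ModelLaunchSettings` plumbing:

```python
from typing import List, Optional


def build_launch_cmd(model_path: str, extra_args: Optional[List[str]] = None) -> List[str]:
    # --tp=2 shards the model across two GPUs, hence the 2-GPU CI runner.
    cmd = ["python3", "-m", "sglang.launch_server", "--model-path", model_path]
    cmd += extra_args or []
    return cmd


print(build_launch_cmd("Qwen/Qwen3-VL-30B-A3B-Instruct", extra_args=["--tp=2"]))
```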
@@ -45,9 +46,7 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
cls.models = []
model_paths = parse_models(nightly_vlm_models_str)
for model_path in model_paths:
cls.models.append(
ModelLaunchSettings(model_path, extra_args=VLM_EXTRA_ARGS)
)
cls.models.append(ModelLaunchSettings(model_path))
else:
cls.models = MODEL_DEFAULTS
@@ -60,6 +59,7 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
def test_bench_one_batch(self):
all_benchmark_results = []
all_model_succeed = True
for model_setup in self.models:
benchmark_results = []
@@ -112,7 +112,6 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
f"Error running benchmark for {model_setup.model_path} with batch size:"
)
print(result.stderr)
# Continue to next batch size even if one fails
continue
print(f"Output for {model_setup.model_path} with batch size:")
@@ -136,19 +135,24 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
)
else:
all_model_succeed = False
print(f"Warning: JSON output file {json_output_file} not found")
finally:
kill_process_tree(process.pid)
report_part = BenchmarkResult.generate_markdown_report(
PROFILE_DIR, benchmark_results
report_part = generate_markdown_report(
PROFILE_DIR,
benchmark_results,
)
self.full_report += report_part + "\n"
if is_in_ci():
write_github_step_summary(self.full_report)
if not all_model_succeed:
raise AssertionError("Some models failed the perf tests.")
if __name__ == "__main__":
unittest.main()