Unverified Commit 64d1505c authored by Mick, committed by GitHub

ci: unify the model launch method of nightly ci (#11230)

parent f3764c26
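
The hunks below route every nightly test through a single ModelLaunchSettings object per model instead of per-test (model, is_fp8, is_tp2) tuples or bare model strings. A minimal sketch of the unified launch flow, illustrative only and not part of the commit, using only names that appear in this diff (the real tests wrap eval or benchmark logic around the launched server):

# Illustrative sketch only, not part of the diff.
from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
    DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    DEFAULT_URL_FOR_TEST,
    ModelLaunchSettings,
    popen_launch_server,
)

models = [
    ModelLaunchSettings("meta-llama/Llama-3.1-8B-Instruct", tp_size=1),
    ModelLaunchSettings("Qwen/Qwen2-57B-A14B-Instruct", tp_size=2),
]

for model_setup in models:
    process = popen_launch_server(
        model=model_setup.model_path,
        base_url=DEFAULT_URL_FOR_TEST,
        other_args=model_setup.extra_args,
        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
    )
    try:
        pass  # run evals / benchmarks against the launched server here
    finally:
        kill_process_tree(process.pid)
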
@@ -20,7 +20,6 @@ from functools import partial
from pathlib import Path
from types import SimpleNamespace
from typing import Any, Awaitable, Callable, List, Optional, Tuple
from urllib.parse import quote
import aiohttp
import numpy as np
@@ -1652,15 +1651,26 @@ def _ensure_remove_suffix(text: str, suffix: str):
return text.removesuffix(suffix)
class ModelDeploySetup:
def __init__(self, model_path: str, extra_args: List[str] = []):
class ModelLaunchSettings:
def __init__(
self,
model_path: str,
tp_size: int = 1,
extra_args: Optional[List[str]] = None,
env: Optional[dict] = None,
):
self.model_path = model_path
if "--enable-multimodal" not in extra_args:
extra_args.append("--enable-multimodal")
if "--trust-remote-code" not in extra_args:
extra_args.append("--trust-remote-code")
self.tp_size = tp_size
self.extra_args = list(extra_args) if extra_args else []
self.env = env
self.extra_args = extra_args
if self.tp_size > 1 and "--tp" not in self.extra_args:
self.extra_args.extend(["--tp", str(self.tp_size)])
fixed_args = ["--enable-multimodal", "--trust-remote-code"]
for fixed_arg in fixed_args:
if fixed_arg not in self.extra_args:
self.extra_args.append(fixed_arg)
class ModelEvalMetrics:
......
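
For reference, a hedged sketch of the flag normalization the new constructor above performs (assuming sglang is importable; the list order follows the code as written):

# Illustrative sketch only, not part of the diff.
from sglang.test.test_utils import ModelLaunchSettings

settings = ModelLaunchSettings("Qwen/Qwen2-VL-7B-Instruct", tp_size=2)
print(settings.extra_args)
# ['--tp', '2', '--enable-multimodal', '--trust-remote-code']
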
@@ -12,6 +12,7 @@ from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
ModelLaunchSettings,
check_evaluation_test_results,
parse_models,
popen_launch_server,
@@ -44,12 +45,19 @@ MODEL_SCORE_THRESHOLDS = {
class TestNightlyGsm8KEval(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model_groups = [
(parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
(parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
(parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
(parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
]
cls.models = []
models_tp1 = parse_models(
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
) + parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1)
for model_path in models_tp1:
cls.models.append(ModelLaunchSettings(model_path, tp_size=1))
models_tp2 = parse_models(
DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
) + parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2)
for model_path in models_tp2:
cls.models.append(ModelLaunchSettings(model_path, tp_size=2))
cls.base_url = DEFAULT_URL_FOR_TEST
def test_mgsm_en_all_models(self):
@@ -58,26 +66,24 @@ class TestNightlyGsm8KEval(unittest.TestCase):
)
is_first = True
all_results = []
model_count = 0
for model_group, is_fp8, is_tp2 in self.model_groups:
for model in model_group:
model_count += 1
with self.subTest(model=model):
other_args = ["--tp", "2"] if is_tp2 else []
if model == "meta-llama/Llama-3.1-70B-Instruct":
other_args.extend(["--mem-fraction-static", "0.9"])
process = popen_launch_server(
model=model,
other_args=other_args,
base_url=self.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
)
for model_setup in self.models:
with self.subTest(model=model_setup.model_path):
other_args = list(model_setup.extra_args)
if model_setup.model_path == "meta-llama/Llama-3.1-70B-Instruct":
other_args.extend(["--mem-fraction-static", "0.9"])
process = popen_launch_server(
model=model_setup.model_path,
other_args=other_args,
base_url=self.base_url,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
)
try:
args = SimpleNamespace(
base_url=self.base_url,
model=model,
model=model_setup.model_path,
eval_name="mgsm_en",
num_examples=None,
num_threads=1024,
@@ -85,14 +91,17 @@ class TestNightlyGsm8KEval(unittest.TestCase):
metrics = run_eval(args)
print(
f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
f"{'=' * 42}\n{model_setup.model_path} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
)
write_results_to_json(model, metrics, "w" if is_first else "a")
write_results_to_json(
model_setup.model_path, metrics, "w" if is_first else "a"
)
is_first = False
# 0.0 for empty latency
all_results.append((model, metrics["score"], 0.0))
all_results.append((model_setup.model_path, metrics["score"], 0.0))
finally:
kill_process_tree(process.pid)
try:
@@ -107,7 +116,7 @@ class TestNightlyGsm8KEval(unittest.TestCase):
all_results,
self.__class__.__name__,
model_accuracy_thresholds=MODEL_SCORE_THRESHOLDS,
model_count=model_count,
model_count=len(self.models),
)
......
@@ -8,6 +8,7 @@ from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
ModelLaunchSettings,
_parse_int_list_env,
is_in_ci,
parse_models,
@@ -21,14 +22,16 @@ PROFILE_DIR = "performance_profiles_text_models"
class TestNightlyTextModelsPerformance(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.model_groups = [
(parse_models("meta-llama/Llama-3.1-8B-Instruct"), False, False),
(parse_models("Qwen/Qwen2-57B-A14B-Instruct"), False, True),
# (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
# (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
# (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
# (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
]
cls.models = []
# TODO: replace with DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 or other model lists
for model_path in parse_models("meta-llama/Llama-3.1-8B-Instruct"):
cls.models.append(ModelLaunchSettings(model_path, tp_size=1))
for model_path in parse_models("Qwen/Qwen2-57B-A14B-Instruct"):
cls.models.append(ModelLaunchSettings(model_path, tp_size=2))
# (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
# (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
# (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
# (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
cls.base_url = DEFAULT_URL_FOR_TEST
cls.batch_sizes = [1, 1, 8, 16, 64]
cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
@@ -39,93 +42,86 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):
def test_bench_one_batch(self):
all_benchmark_results = []
for model_group, is_fp8, is_tp2 in self.model_groups:
for model in model_group:
benchmark_results = []
with self.subTest(model=model):
process = popen_launch_server(
model=model,
base_url=self.base_url,
other_args=["--tp", "2"] if is_tp2 else [],
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
for model_setup in self.models:
benchmark_results = []
with self.subTest(model=model_setup.model_path):
process = popen_launch_server(
model=model_setup.model_path,
base_url=self.base_url,
other_args=model_setup.extra_args,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
)
try:
profile_filename = (
f"{model_setup.model_path.replace('/', '_')}_{int(time.time())}"
)
try:
profile_filename = (
f"{model.replace('/', '_')}_{int(time.time())}"
)
profile_path_prefix = os.path.join(
PROFILE_DIR, profile_filename
profile_path_prefix = os.path.join(PROFILE_DIR, profile_filename)
json_output_file = f"results_{model_setup.model_path.replace('/', '_')}_{int(time.time())}.json"
command = [
"python3",
"-m",
"sglang.bench_one_batch_server",
"--model",
model_setup.model_path,
"--base-url",
self.base_url,
"--batch-size",
*[str(x) for x in self.batch_sizes],
"--input-len",
*[str(x) for x in self.input_lens],
"--output-len",
*[str(x) for x in self.output_lens],
"--show-report",
"--profile",
"--profile-by-stage",
"--profile-filename-prefix",
profile_path_prefix,
f"--output-path={json_output_file}",
"--no-append-to-github-summary",
]
print(f"Running command: {' '.join(command)}")
result = subprocess.run(command, capture_output=True, text=True)
if result.returncode != 0:
print(
f"Error running benchmark for {model_setup.model_path} with batch size:"
)
json_output_file = (
f"results_{model.replace('/', '_')}_{int(time.time())}.json"
print(result.stderr)
# Continue to next batch size even if one fails
continue
# Load and deserialize JSON results
if os.path.exists(json_output_file):
import json
with open(json_output_file, "r") as f:
json_data = json.load(f)
# Convert JSON data to BenchmarkResult objects
for data in json_data:
benchmark_result = BenchmarkResult(**data)
all_benchmark_results.append(benchmark_result)
benchmark_results.append(benchmark_result)
print(
f"Loaded {len(benchmark_results)} benchmark results from {json_output_file}"
)
command = [
"python3",
"-m",
"sglang.bench_one_batch_server",
"--model",
model,
"--base-url",
self.base_url,
"--batch-size",
*[str(x) for x in self.batch_sizes],
"--input-len",
*[str(x) for x in self.input_lens],
"--output-len",
*[str(x) for x in self.output_lens],
"--show-report",
"--profile",
"--profile-by-stage",
"--profile-filename-prefix",
profile_path_prefix,
f"--output-path={json_output_file}",
"--no-append-to-github-summary",
]
print(f"Running command: {' '.join(command)}")
result = subprocess.run(command, capture_output=True, text=True)
if result.returncode != 0:
print(
f"Error running benchmark for {model} with batch size:"
)
print(result.stderr)
# Continue to next batch size even if one fails
continue
# Load and deserialize JSON results
if os.path.exists(json_output_file):
import json
with open(json_output_file, "r") as f:
json_data = json.load(f)
# Convert JSON data to BenchmarkResult objects
for data in json_data:
benchmark_result = BenchmarkResult(**data)
all_benchmark_results.append(benchmark_result)
benchmark_results.append(benchmark_result)
print(
f"Loaded {len(benchmark_results)} benchmark results from {json_output_file}"
)
# Clean up JSON file
os.remove(json_output_file)
else:
print(
f"Warning: JSON output file {json_output_file} not found"
)
finally:
kill_process_tree(process.pid)
report_part = BenchmarkResult.generate_markdown_report(
PROFILE_DIR, benchmark_results
)
self.full_report += report_part + "\n"
# Clean up JSON file
os.remove(json_output_file)
else:
print(f"Warning: JSON output file {json_output_file} not found")
finally:
kill_process_tree(process.pid)
report_part = BenchmarkResult.generate_markdown_report(
PROFILE_DIR, benchmark_results
)
self.full_report += report_part + "\n"
if is_in_ci():
write_github_step_summary(self.full_report)
......
import json
import unittest
import warnings
from functools import partial
from types import SimpleNamespace
from sglang.srt.utils import kill_process_tree
@@ -8,8 +9,8 @@ from sglang.test.run_eval import run_eval
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
ModelDeploySetup,
ModelEvalMetrics,
ModelLaunchSettings,
check_evaluation_test_results,
popen_launch_server,
write_results_to_json,
@@ -17,25 +18,29 @@ from sglang.test.test_utils import (
MODEL_THRESHOLDS = {
# Conservative thresholds on 100 MMMU samples, especially for latency thresholds
ModelDeploySetup("deepseek-ai/deepseek-vl2-small"): ModelEvalMetrics(0.330, 56.1),
ModelDeploySetup("deepseek-ai/Janus-Pro-7B"): ModelEvalMetrics(0.285, 39.9),
ModelDeploySetup("Efficient-Large-Model/NVILA-Lite-2B-hf-0626"): ModelEvalMetrics(
0.305, 23.8
ModelLaunchSettings("deepseek-ai/deepseek-vl2-small"): ModelEvalMetrics(
0.330, 56.1
),
ModelDeploySetup("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
ModelDeploySetup("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
ModelDeploySetup("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
ModelDeploySetup("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(0.330, 22.3),
ModelDeploySetup("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
ModelDeploySetup("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
ModelDeploySetup("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
ModelDeploySetup("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
ModelDeploySetup("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
ModelDeploySetup("unsloth/Mistral-Small-3.1-24B-Instruct-2503"): ModelEvalMetrics(
0.310, 16.7
ModelLaunchSettings("deepseek-ai/Janus-Pro-7B"): ModelEvalMetrics(0.285, 40.3),
ModelLaunchSettings(
"Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
): ModelEvalMetrics(0.305, 23.8),
ModelLaunchSettings("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
ModelLaunchSettings("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
ModelLaunchSettings("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(
0.330, 22.3
),
ModelDeploySetup("XiaomiMiMo/MiMo-VL-7B-RL"): ModelEvalMetrics(0.28, 32.0),
ModelDeploySetup("zai-org/GLM-4.1V-9B-Thinking"): ModelEvalMetrics(0.280, 30.4),
ModelLaunchSettings("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
ModelLaunchSettings("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
ModelLaunchSettings(
"unsloth/Mistral-Small-3.1-24B-Instruct-2503"
): ModelEvalMetrics(0.310, 16.7),
ModelLaunchSettings("XiaomiMiMo/MiMo-VL-7B-RL"): ModelEvalMetrics(0.28, 32.0),
ModelLaunchSettings("zai-org/GLM-4.1V-9B-Thinking"): ModelEvalMetrics(0.280, 30.4),
}
......
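
Since the threshold table above is now keyed by ModelLaunchSettings objects, each entry carries its own launch flags (the constructor appends --enable-multimodal and --trust-remote-code, as shown in the first hunk). A hedged sketch of how such a mapping is typically consumed; the actual test body is outside this hunk:

# Illustrative sketch only, not part of the diff.
for model_setup, expected_metrics in MODEL_THRESHOLDS.items():
    print(model_setup.model_path, model_setup.extra_args, expected_metrics)
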
@@ -8,6 +8,7 @@ from sglang.srt.utils import kill_process_tree
from sglang.test.test_utils import (
DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
DEFAULT_URL_FOR_TEST,
ModelLaunchSettings,
_parse_int_list_env,
is_in_ci,
parse_models,
@@ -19,8 +20,13 @@ PROFILE_DIR = "performance_profiles_vlms"
MODEL_DEFAULTS = [
# Keep conservative defaults. Can be overridden by env NIGHTLY_VLM_MODELS
"Qwen/Qwen2.5-VL-7B-Instruct",
"google/gemma-3-27b-it",
ModelLaunchSettings(
"Qwen/Qwen2.5-VL-7B-Instruct",
extra_args=["--mem-fraction-static=0.7"],
),
ModelLaunchSettings(
"google/gemma-3-27b-it",
),
# "OpenGVLab/InternVL2_5-2B",
# buggy in official transformers impl
# "openbmb/MiniCPM-V-2_6",
@@ -33,9 +39,18 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
warnings.filterwarnings(
"ignore", category=ResourceWarning, message="unclosed.*socket"
)
cls.models = parse_models(
os.environ.get("NIGHTLY_VLM_MODELS", ",".join(MODEL_DEFAULTS))
)
nightly_vlm_models_str = os.environ.get("NIGHTLY_VLM_MODELS")
if nightly_vlm_models_str:
cls.models = []
model_paths = parse_models(nightly_vlm_models_str)
for model_path in model_paths:
cls.models.append(
ModelLaunchSettings(model_path, extra_args=VLM_EXTRA_ARGS)
)
else:
cls.models = MODEL_DEFAULTS
cls.base_url = DEFAULT_URL_FOR_TEST
cls.batch_sizes = _parse_int_list_env("NIGHTLY_VLM_BATCH_SIZES", "1,1,2,8,16")
@@ -46,29 +61,31 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
def test_bench_one_batch(self):
all_benchmark_results = []
for model in self.models:
for model_setup in self.models:
benchmark_results = []
with self.subTest(model=model):
with self.subTest(model=model_setup.model_path):
process = popen_launch_server(
model=model,
model=model_setup.model_path,
base_url=self.base_url,
other_args=["--mem-fraction-static=0.7"],
other_args=model_setup.extra_args,
timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
)
try:
# Run bench_one_batch_server against the launched server
profile_filename = f"{model.replace('/', '_')}"
profile_filename = f"{model_setup.model_path.replace('/', '_')}"
# path for this run
profile_path_prefix = os.path.join(PROFILE_DIR, profile_filename)
# JSON output file for this model
json_output_file = f"results_{model.replace('/', '_')}.json"
json_output_file = (
f"results_{model_setup.model_path.replace('/', '_')}.json"
)
command = [
"python3",
"-m",
"sglang.bench_one_batch_server",
f"--model={model}",
f"--model={model_setup.model_path}",
"--base-url",
self.base_url,
"--batch-size",
@@ -91,12 +108,14 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
result = subprocess.run(command, capture_output=True, text=True)
if result.returncode != 0:
print(f"Error running benchmark for {model} with batch size:")
print(
f"Error running benchmark for {model_setup.model_path} with batch size:"
)
print(result.stderr)
# Continue to next batch size even if one fails
continue
print(f"Output for {model} with batch size:")
print(f"Output for {model_setup.model_path} with batch size:")
print(result.stdout)
# Load and deserialize JSON results
......
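
One detail worth noting from the setUpClass hunk above: when NIGHTLY_VLM_MODELS is set, every listed model gets the shared VLM_EXTRA_ARGS (defined elsewhere in the test file), while the MODEL_DEFAULTS list keeps its per-model flags. A hedged sketch of the override path, with an assumed stand-in value for VLM_EXTRA_ARGS:

# Illustrative sketch only, not part of the diff.
import os

from sglang.test.test_utils import ModelLaunchSettings, parse_models

VLM_EXTRA_ARGS = ["--mem-fraction-static=0.7"]  # assumed stand-in for the real constant

override = os.environ.get(
    "NIGHTLY_VLM_MODELS", "Qwen/Qwen2.5-VL-7B-Instruct,google/gemma-3-27b-it"
)
models = [
    ModelLaunchSettings(path, extra_args=VLM_EXTRA_ARGS)
    for path in parse_models(override)
]
print([m.model_path for m in models])
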