Unverified commit 64d1505c, authored by Mick, committed by GitHub

ci: unify the model launch method of nightly ci (#11230)

parent f3764c26
@@ -20,7 +20,6 @@ from functools import partial
 from pathlib import Path
 from types import SimpleNamespace
 from typing import Any, Awaitable, Callable, List, Optional, Tuple
-from urllib.parse import quote

 import aiohttp
 import numpy as np
@@ -1652,15 +1651,26 @@ def _ensure_remove_suffix(text: str, suffix: str):
     return text.removesuffix(suffix)


-class ModelDeploySetup:
-    def __init__(self, model_path: str, extra_args: List[str] = []):
+class ModelLaunchSettings:
+    def __init__(
+        self,
+        model_path: str,
+        tp_size: int = 1,
+        extra_args: Optional[List[str]] = None,
+        env: Optional[dict] = None,
+    ):
         self.model_path = model_path
-        if "--enable-multimodal" not in extra_args:
-            extra_args.append("--enable-multimodal")
-        if "--trust-remote-code" not in extra_args:
-            extra_args.append("--trust-remote-code")
+        self.tp_size = tp_size
+        self.extra_args = list(extra_args) if extra_args else []
+        self.env = env

-        self.extra_args = extra_args
+        if self.tp_size > 1 and "--tp" not in self.extra_args:
+            self.extra_args.extend(["--tp", str(self.tp_size)])
+
+        fixed_args = ["--enable-multimodal", "--trust-remote-code"]
+        for fixed_arg in fixed_args:
+            if fixed_arg not in self.extra_args:
+                self.extra_args.append(fixed_arg)


 class ModelEvalMetrics:
...
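A minimal usage sketch (not part of the commit) of the new ModelLaunchSettings helper defined above. It assumes sglang is installed at this revision so that sglang.test.test_utils exports the class; the model name and extra arguments are illustrative values taken from elsewhere in this PR.

# Sketch only: exercises the constructor shown in the hunk above.
from sglang.test.test_utils import ModelLaunchSettings

settings = ModelLaunchSettings(
    "meta-llama/Llama-3.1-70B-Instruct",
    tp_size=2,
    extra_args=["--mem-fraction-static", "0.9"],
)

# tp_size > 1 injects "--tp 2", then the fixed flags are appended if missing:
# ['--mem-fraction-static', '0.9', '--tp', '2',
#  '--enable-multimodal', '--trust-remote-code']
print(settings.extra_args)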
@@ -12,6 +12,7 @@ from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2,
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
+    ModelLaunchSettings,
     check_evaluation_test_results,
     parse_models,
     popen_launch_server,
@@ -44,12 +45,19 @@ MODEL_SCORE_THRESHOLDS = {
 class TestNightlyGsm8KEval(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model_groups = [
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
-            (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
-        ]
+        cls.models = []
+        models_tp1 = parse_models(
+            DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
+        ) + parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1)
+        for model_path in models_tp1:
+            cls.models.append(ModelLaunchSettings(model_path, tp_size=1))
+
+        models_tp2 = parse_models(
+            DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2
+        ) + parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2)
+        for model_path in models_tp2:
+            cls.models.append(ModelLaunchSettings(model_path, tp_size=2))
+
         cls.base_url = DEFAULT_URL_FOR_TEST

     def test_mgsm_en_all_models(self):
@@ -58,26 +66,24 @@ class TestNightlyGsm8KEval(unittest.TestCase):
         )

         is_first = True
         all_results = []
-        model_count = 0

-        for model_group, is_fp8, is_tp2 in self.model_groups:
-            for model in model_group:
-                model_count += 1
-                with self.subTest(model=model):
-                    other_args = ["--tp", "2"] if is_tp2 else []
-                    if model == "meta-llama/Llama-3.1-70B-Instruct":
-                        other_args.extend(["--mem-fraction-static", "0.9"])
-                    process = popen_launch_server(
-                        model=model,
-                        other_args=other_args,
-                        base_url=self.base_url,
-                        timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
-                    )
+        for model_setup in self.models:
+            with self.subTest(model=model_setup.model_path):
+                other_args = list(model_setup.extra_args)
+
+                if model_setup.model_path == "meta-llama/Llama-3.1-70B-Instruct":
+                    other_args.extend(["--mem-fraction-static", "0.9"])
+
+                process = popen_launch_server(
+                    model=model_setup.model_path,
+                    other_args=other_args,
+                    base_url=self.base_url,
+                    timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
+                )
+                try:
                     args = SimpleNamespace(
                         base_url=self.base_url,
-                        model=model,
+                        model=model_setup.model_path,
                         eval_name="mgsm_en",
                         num_examples=None,
                         num_threads=1024,
@@ -85,14 +91,17 @@ class TestNightlyGsm8KEval(unittest.TestCase):
                     metrics = run_eval(args)
                     print(
-                        f"{'=' * 42}\n{model} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
+                        f"{'=' * 42}\n{model_setup.model_path} - metrics={metrics} score={metrics['score']}\n{'=' * 42}\n"
                     )

-                    write_results_to_json(model, metrics, "w" if is_first else "a")
+                    write_results_to_json(
+                        model_setup.model_path, metrics, "w" if is_first else "a"
+                    )
                     is_first = False

                     # 0.0 for empty latency
-                    all_results.append((model, metrics["score"], 0.0))
+                    all_results.append((model_setup.model_path, metrics["score"], 0.0))
+                finally:
                     kill_process_tree(process.pid)

         try:
@@ -107,7 +116,7 @@ class TestNightlyGsm8KEval(unittest.TestCase):
                 all_results,
                 self.__class__.__name__,
                 model_accuracy_thresholds=MODEL_SCORE_THRESHOLDS,
-                model_count=model_count,
+                model_count=len(self.models),
             )
...
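The net effect of this refactor on the nightly GSM8K matrix, sketched below outside the commit: the (model_group, is_fp8, is_tp2) tuples collapse into one flat list of ModelLaunchSettings, so model_count becomes len(models). The sketch assumes parse_models splits a comma-separated string and uses placeholder model names.

# Sketch only; placeholder names, and parse_models is assumed to split on commas.
from sglang.test.test_utils import ModelLaunchSettings, parse_models

TP1_MODELS = "org/model-a,org/model-b"  # stand-in for DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1
FP8_TP1_MODELS = "org/model-c-fp8"      # stand-in for DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1

models = [
    ModelLaunchSettings(path, tp_size=1)
    for path in parse_models(TP1_MODELS) + parse_models(FP8_TP1_MODELS)
]
# Each entry now carries its own tp_size and extra_args, so the eval loop needs
# no per-group flags and the final check uses model_count=len(models).
print(len(models), [m.model_path for m in models])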
@@ -8,6 +8,7 @@ from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
+    ModelLaunchSettings,
     _parse_int_list_env,
     is_in_ci,
     parse_models,
@@ -21,14 +22,16 @@ PROFILE_DIR = "performance_profiles_text_models"
 class TestNightlyTextModelsPerformance(unittest.TestCase):
     @classmethod
     def setUpClass(cls):
-        cls.model_groups = [
-            (parse_models("meta-llama/Llama-3.1-8B-Instruct"), False, False),
-            (parse_models("Qwen/Qwen2-57B-A14B-Instruct"), False, True),
+        cls.models = []
+        # TODO: replace with DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1 or other model lists
+        for model_path in parse_models("meta-llama/Llama-3.1-8B-Instruct"):
+            cls.models.append(ModelLaunchSettings(model_path, tp_size=1))
+        for model_path in parse_models("Qwen/Qwen2-57B-A14B-Instruct"):
+            cls.models.append(ModelLaunchSettings(model_path, tp_size=2))
         # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP1), False, False),
         # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_TP2), False, True),
         # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP1), True, False),
         # (parse_models(DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2), True, True),
-        ]
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.batch_sizes = [1, 1, 8, 16, 64]
         cls.input_lens = tuple(_parse_int_list_env("NIGHTLY_INPUT_LENS", "4096"))
@@ -39,34 +42,29 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):
     def test_bench_one_batch(self):
         all_benchmark_results = []

-        for model_group, is_fp8, is_tp2 in self.model_groups:
-            for model in model_group:
+        for model_setup in self.models:
             benchmark_results = []
-            with self.subTest(model=model):
+            with self.subTest(model=model_setup.model_path):
                 process = popen_launch_server(
-                    model=model,
+                    model=model_setup.model_path,
                     base_url=self.base_url,
-                    other_args=["--tp", "2"] if is_tp2 else [],
+                    other_args=model_setup.extra_args,
                     timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
                 )
                 try:
                     profile_filename = (
-                        f"{model.replace('/', '_')}_{int(time.time())}"
+                        f"{model_setup.model_path.replace('/', '_')}_{int(time.time())}"
                     )
-                    profile_path_prefix = os.path.join(
-                        PROFILE_DIR, profile_filename
-                    )
-                    json_output_file = (
-                        f"results_{model.replace('/', '_')}_{int(time.time())}.json"
-                    )
+                    profile_path_prefix = os.path.join(PROFILE_DIR, profile_filename)
+                    json_output_file = f"results_{model_setup.model_path.replace('/', '_')}_{int(time.time())}.json"

                     command = [
                         "python3",
                         "-m",
                         "sglang.bench_one_batch_server",
                         "--model",
-                        model,
+                        model_setup.model_path,
                         "--base-url",
                         self.base_url,
                         "--batch-size",
@@ -89,7 +87,7 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):
                     if result.returncode != 0:
                         print(
-                            f"Error running benchmark for {model} with batch size:"
+                            f"Error running benchmark for {model_setup.model_path} with batch size:"
                         )
                         print(result.stderr)
                         # Continue to next batch size even if one fails
@@ -115,9 +113,7 @@ class TestNightlyTextModelsPerformance(unittest.TestCase):
                         # Clean up JSON file
                         os.remove(json_output_file)
                     else:
-                        print(
-                            f"Warning: JSON output file {json_output_file} not found"
-                        )
+                        print(f"Warning: JSON output file {json_output_file} not found")
                 finally:
                     kill_process_tree(process.pid)
...
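For orientation, a sketch (not from the commit) of the benchmark subprocess these perf tests assemble, using only the flags visible in the hunk above; the remaining batch-size and sequence-length arguments are elided in this diff excerpt, and the URL below is a placeholder for DEFAULT_URL_FOR_TEST.

# Sketch only; the real test drives a server launched by popen_launch_server.
import subprocess

model_path = "meta-llama/Llama-3.1-8B-Instruct"  # name taken from the defaults above
base_url = "http://127.0.0.1:30000"  # placeholder for DEFAULT_URL_FOR_TEST

command = [
    "python3",
    "-m",
    "sglang.bench_one_batch_server",
    "--model",
    model_path,
    "--base-url",
    base_url,
    # "--batch-size", ...  (the test appends batch sizes and lengths here)
]
result = subprocess.run(command, capture_output=True, text=True)
if result.returncode != 0:
    print(f"Error running benchmark for {model_path} with batch size:")
    print(result.stderr)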
 import json
 import unittest
 import warnings
+from functools import partial
 from types import SimpleNamespace

 from sglang.srt.utils import kill_process_tree
@@ -8,8 +9,8 @@ from sglang.test.run_eval import run_eval
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
-    ModelDeploySetup,
     ModelEvalMetrics,
+    ModelLaunchSettings,
     check_evaluation_test_results,
     popen_launch_server,
     write_results_to_json,
@@ -17,25 +18,29 @@ from sglang.test.test_utils import (
 MODEL_THRESHOLDS = {
     # Conservative thresholds on 100 MMMU samples, especially for latency thresholds
-    ModelDeploySetup("deepseek-ai/deepseek-vl2-small"): ModelEvalMetrics(0.330, 56.1),
-    ModelDeploySetup("deepseek-ai/Janus-Pro-7B"): ModelEvalMetrics(0.285, 39.9),
-    ModelDeploySetup("Efficient-Large-Model/NVILA-Lite-2B-hf-0626"): ModelEvalMetrics(
-        0.305, 23.8
-    ),
-    ModelDeploySetup("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
-    ModelDeploySetup("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
-    ModelDeploySetup("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
-    ModelDeploySetup("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(0.330, 22.3),
-    ModelDeploySetup("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
-    ModelDeploySetup("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
-    ModelDeploySetup("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
-    ModelDeploySetup("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
-    ModelDeploySetup("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
-    ModelDeploySetup("unsloth/Mistral-Small-3.1-24B-Instruct-2503"): ModelEvalMetrics(
-        0.310, 16.7
-    ),
-    ModelDeploySetup("XiaomiMiMo/MiMo-VL-7B-RL"): ModelEvalMetrics(0.28, 32.0),
-    ModelDeploySetup("zai-org/GLM-4.1V-9B-Thinking"): ModelEvalMetrics(0.280, 30.4),
+    ModelLaunchSettings("deepseek-ai/deepseek-vl2-small"): ModelEvalMetrics(
+        0.330, 56.1
+    ),
+    ModelLaunchSettings("deepseek-ai/Janus-Pro-7B"): ModelEvalMetrics(0.285, 40.3),
+    ModelLaunchSettings(
+        "Efficient-Large-Model/NVILA-Lite-2B-hf-0626"
+    ): ModelEvalMetrics(0.305, 23.8),
+    ModelLaunchSettings("google/gemma-3-4b-it"): ModelEvalMetrics(0.360, 10.9),
+    ModelLaunchSettings("google/gemma-3n-E4B-it"): ModelEvalMetrics(0.360, 15.3),
+    ModelLaunchSettings("mistral-community/pixtral-12b"): ModelEvalMetrics(0.360, 16.6),
+    ModelLaunchSettings("moonshotai/Kimi-VL-A3B-Instruct"): ModelEvalMetrics(
+        0.330, 22.3
+    ),
+    ModelLaunchSettings("openbmb/MiniCPM-o-2_6"): ModelEvalMetrics(0.330, 29.3),
+    ModelLaunchSettings("openbmb/MiniCPM-v-2_6"): ModelEvalMetrics(0.270, 24.5),
+    ModelLaunchSettings("OpenGVLab/InternVL2_5-2B"): ModelEvalMetrics(0.300, 14.0),
+    ModelLaunchSettings("Qwen/Qwen2-VL-7B-Instruct"): ModelEvalMetrics(0.310, 83.3),
+    ModelLaunchSettings("Qwen/Qwen2.5-VL-7B-Instruct"): ModelEvalMetrics(0.340, 31.9),
+    ModelLaunchSettings(
+        "unsloth/Mistral-Small-3.1-24B-Instruct-2503"
+    ): ModelEvalMetrics(0.310, 16.7),
+    ModelLaunchSettings("XiaomiMiMo/MiMo-VL-7B-RL"): ModelEvalMetrics(0.28, 32.0),
+    ModelLaunchSettings("zai-org/GLM-4.1V-9B-Thinking"): ModelEvalMetrics(0.280, 30.4),
 }
...
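A brief note on the threshold table above: the dictionary is keyed by ModelLaunchSettings instances, which hash by object identity since the class defines no __hash__ or __eq__. That is fine as long as the table is only iterated rather than looked up with freshly constructed keys; a hedged sketch of such an iteration follows (the real test body is not shown in this diff).

# Sketch only; mirrors how a table like MODEL_THRESHOLDS can be consumed.
# ModelEvalMetrics field names are not shown in this diff, so the value is
# printed as-is rather than unpacked.
for launch_settings, expected_metrics in MODEL_THRESHOLDS.items():
    print(launch_settings.model_path, launch_settings.extra_args, expected_metrics)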
@@ -8,6 +8,7 @@ from sglang.srt.utils import kill_process_tree
 from sglang.test.test_utils import (
     DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
     DEFAULT_URL_FOR_TEST,
+    ModelLaunchSettings,
     _parse_int_list_env,
     is_in_ci,
     parse_models,
@@ -19,8 +20,13 @@ PROFILE_DIR = "performance_profiles_vlms"
 MODEL_DEFAULTS = [
     # Keep conservative defaults. Can be overridden by env NIGHTLY_VLM_MODELS
+    ModelLaunchSettings(
         "Qwen/Qwen2.5-VL-7B-Instruct",
+        extra_args=["--mem-fraction-static=0.7"],
+    ),
+    ModelLaunchSettings(
         "google/gemma-3-27b-it",
+    ),
     # "OpenGVLab/InternVL2_5-2B",
     # buggy in official transformers impl
     # "openbmb/MiniCPM-V-2_6",
@@ -33,9 +39,18 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
         warnings.filterwarnings(
             "ignore", category=ResourceWarning, message="unclosed.*socket"
         )
-        cls.models = parse_models(
-            os.environ.get("NIGHTLY_VLM_MODELS", ",".join(MODEL_DEFAULTS))
-        )
+
+        nightly_vlm_models_str = os.environ.get("NIGHTLY_VLM_MODELS")
+        if nightly_vlm_models_str:
+            cls.models = []
+            model_paths = parse_models(nightly_vlm_models_str)
+            for model_path in model_paths:
+                cls.models.append(
+                    ModelLaunchSettings(model_path, extra_args=VLM_EXTRA_ARGS)
+                )
+        else:
+            cls.models = MODEL_DEFAULTS
+
         cls.base_url = DEFAULT_URL_FOR_TEST
         cls.batch_sizes = _parse_int_list_env("NIGHTLY_VLM_BATCH_SIZES", "1,1,2,8,16")
@@ -46,29 +61,31 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
     def test_bench_one_batch(self):
         all_benchmark_results = []

-        for model in self.models:
+        for model_setup in self.models:
             benchmark_results = []
-            with self.subTest(model=model):
+            with self.subTest(model=model_setup.model_path):
                 process = popen_launch_server(
-                    model=model,
+                    model=model_setup.model_path,
                     base_url=self.base_url,
-                    other_args=["--mem-fraction-static=0.7"],
+                    other_args=model_setup.extra_args,
                     timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH,
                 )
                 try:
                     # Run bench_one_batch_server against the launched server
-                    profile_filename = f"{model.replace('/', '_')}"
+                    profile_filename = f"{model_setup.model_path.replace('/', '_')}"
                     # path for this run
                     profile_path_prefix = os.path.join(PROFILE_DIR, profile_filename)
                     # JSON output file for this model
-                    json_output_file = f"results_{model.replace('/', '_')}.json"
+                    json_output_file = (
+                        f"results_{model_setup.model_path.replace('/', '_')}.json"
+                    )

                     command = [
                         "python3",
                         "-m",
                         "sglang.bench_one_batch_server",
-                        f"--model={model}",
+                        f"--model={model_setup.model_path}",
                         "--base-url",
                         self.base_url,
                         "--batch-size",
@@ -91,12 +108,14 @@ class TestNightlyVLMModelsPerformance(unittest.TestCase):
                     result = subprocess.run(command, capture_output=True, text=True)

                     if result.returncode != 0:
-                        print(f"Error running benchmark for {model} with batch size:")
+                        print(
+                            f"Error running benchmark for {model_setup.model_path} with batch size:"
+                        )
                         print(result.stderr)
                         # Continue to next batch size even if one fails
                         continue

-                    print(f"Output for {model} with batch size:")
+                    print(f"Output for {model_setup.model_path} with batch size:")
                     print(result.stdout)

                     # Load and deserialize JSON results
...
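Finally, a sketch (not part of the commit) of the NIGHTLY_VLM_MODELS override path added above. It assumes parse_models splits a comma-separated string; VLM_EXTRA_ARGS is referenced by the new code but defined elsewhere in the test module, so an empty list stands in for it here.

# Sketch only; model names are taken from MODEL_DEFAULTS above.
import os

from sglang.test.test_utils import ModelLaunchSettings, parse_models

VLM_EXTRA_ARGS = []  # stand-in; the real list is not shown in this diff excerpt

os.environ["NIGHTLY_VLM_MODELS"] = "Qwen/Qwen2.5-VL-7B-Instruct,google/gemma-3-27b-it"
models = [
    ModelLaunchSettings(model_path, extra_args=VLM_EXTRA_ARGS)
    for model_path in parse_models(os.environ["NIGHTLY_VLM_MODELS"])
]
# With the variable unset, the test falls back to MODEL_DEFAULTS, whose first
# entry pins --mem-fraction-static=0.7 for Qwen2.5-VL-7B-Instruct.
print([m.model_path for m in models])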