Unverified commit fae35432, authored by Alec and committed by GitHub
Browse files

ci: longer timeout, change model for l40 (#2951)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
parent 6104c93f
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Setup cleanup trap
# Stop the background frontend process whose PID is stored in DYNAMO_PID.
# Installed as the handler for EXIT, INT and TERM.
cleanup() {
    # Disarm the traps first: otherwise an INT/TERM runs this handler once for
    # the signal and then a second time when the shell reaches EXIT.
    trap - EXIT INT TERM
    echo "Cleaning up background processes..."
    # DYNAMO_PID is only assigned after the frontend has been launched; skip
    # kill/wait if the trap fires before that point.
    if [ -n "${DYNAMO_PID:-}" ]; then
        # Best effort: the process may already be gone, so ignore failures.
        kill "$DYNAMO_PID" 2>/dev/null || true
        wait "$DYNAMO_PID" 2>/dev/null || true
    fi
    echo "Cleanup complete."
}
trap cleanup EXIT INT TERM

# Model id passed to the worker as both --model-path and --served-model-name.
worker_model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B"

# Flags for the SGLang worker, kept in one place.
worker_args=(
    --model-path "$worker_model"
    --served-model-name "$worker_model"
    --page-size 16
    --tp 1
    --trust-remote-code
    --skip-tokenizer-init
)

# Run clear_namespace for the "dynamo" namespace.
python3 -m dynamo.sglang.utils.clear_namespace --namespace dynamo

# Run ingress: start the frontend on HTTP port 8000 in the background and
# record its PID so cleanup() can stop it.
python3 -m dynamo.frontend --http-port=8000 &
DYNAMO_PID=$!

# Run the worker in the foreground.
python3 -m dynamo.sglang "${worker_args[@]}"
......@@ -26,10 +26,10 @@ sglang_dir = os.environ.get("SGLANG_DIR", "/workspace/components/backends/sglang
sglang_configs = {
"aggregated": SGLangConfig(
name="aggregated",
directory=sglang_dir,
script_name="agg.sh",
directory="/workspace/tests/serve",
script_name="sglang_agg.sh",
marks=[pytest.mark.gpu_1],
model="Qwen/Qwen3-0.6B",
model="deepseek-ai/DeepSeek-R1-Distill-Llama-8B",
env={},
models_port=8000,
request_payloads=[chat_payload_default(), completion_payload_default()],
......
......@@ -35,11 +35,12 @@ class EngineConfig:
name: str
directory: str
script_name: str
marks: List[Any]
request_payloads: List[BasePayload]
model: str
script_name: Optional[str] = None
command: Optional[List[str]] = None
script_args: Optional[List[str]] = None
models_port: int = 8000
timeout: int = 600
......@@ -47,6 +48,13 @@ class EngineConfig:
env: Dict[str, str] = field(default_factory=dict)
stragglers: list[str] = field(default_factory=list)
def __post_init__(self):
    """Require exactly one of ``script_name`` and ``command``.

    Raises:
        ValueError: If both are set, or if neither is set (empty values
            count as not set).
    """
    has_script = bool(self.script_name)
    has_command = bool(self.command)

    if has_script and has_command:
        raise ValueError("Cannot provide both script_name and command")
    if not (has_script or has_command):
        raise ValueError("Either script_name or command must be provided")
class EngineProcess(ManagedProcess):
"""Base class for LLM engine processes (vLLM, TRT-LLM, etc.)"""
......@@ -132,24 +140,21 @@ class EngineProcess(ManagedProcess):
logger.info(f"SUCCESS: All expected log patterns: {patterns} found")
@classmethod
def from_script(
def from_config(
cls,
config: EngineConfig,
request: Any,
extra_env: Optional[Dict[str, str]] = None,
) -> "EngineProcess":
"""Factory to create an EngineProcess configured to run a launch script."""
"""Factory to create an EngineProcess from configuration (script or command)."""
assert isinstance(config, EngineConfig), "Must use an instance of EngineConfig"
directory = config.directory
script_path = os.path.join(directory, "launch", config.script_name)
if not os.path.exists(script_path):
raise FileNotFoundError(f"Script not found: {script_path}")
command: List[str] = ["bash", script_path]
if config.script_args:
command.extend(config.script_args)
if config.script_name:
command = cls._build_script_command(config)
elif config.command:
command = config.command.copy()
else:
raise ValueError("Either script_name or command must be provided in config")
env = os.environ.copy()
if getattr(config, "env", None):
......@@ -162,7 +167,7 @@ class EngineProcess(ManagedProcess):
env=env,
timeout=config.timeout,
display_output=True,
working_dir=directory,
working_dir=config.directory,
health_check_ports=[],
health_check_urls=[
(f"http://localhost:{config.models_port}/v1/models", check_models_api),
......@@ -176,3 +181,47 @@ class EngineProcess(ManagedProcess):
stragglers=config.stragglers,
log_dir=request.node.name,
)
@classmethod
def _build_script_command(cls, config: EngineConfig) -> List[str]:
    """Return the ``bash`` argv that runs the launch script named in *config*.

    The script is looked up at ``<config.directory>/launch/<config.script_name>``.
    Any ``config.script_args`` are appended after the script path.

    Raises:
        AssertionError: If ``config.script_name`` is empty or unset.
        FileNotFoundError: If the resolved script path does not exist.
    """
    assert (
        config.script_name
    ), "Must provide script_name to run fn _build_script_command"

    launch_script = os.path.join(config.directory, "launch", config.script_name)
    if not os.path.exists(launch_script):
        raise FileNotFoundError(f"Script not found: {launch_script}")

    # A missing or empty script_args contributes nothing to the command.
    trailing_args = config.script_args or []
    return ["bash", launch_script, *trailing_args]
@classmethod
def from_script(
    cls,
    config: EngineConfig,
    request: Any,
    extra_env: Optional[Dict[str, str]] = None,
) -> "EngineProcess":
    """Create an EngineProcess that runs a launch script.

    Deprecated: kept as an alias that forwards its arguments unchanged to
    ``from_config()``; call ``from_config()`` directly instead.
    """
    return cls.from_config(config=config, request=request, extra_env=extra_env)
@classmethod
def from_command(
    cls,
    config: EngineConfig,
    request: Any,
    extra_env: Optional[Dict[str, str]] = None,
) -> "EngineProcess":
    """Create an EngineProcess that runs a direct command.

    Deprecated: kept as an alias that forwards its arguments unchanged to
    ``from_config()``; call ``from_config()`` directly instead.
    """
    return cls.from_config(config=config, request=request, extra_env=extra_env)
......@@ -30,7 +30,7 @@ class BasePayload:
expected_response: List[str]
expected_log: List[str]
repeat_count: int = 1
timeout: int = 30
timeout: int = 60
# Connection info
host: str = "localhost"
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment