"vscode:/vscode.git/clone" did not exist on "0a517dc05a29aca7a1af1157ba244475607676e3"
Unverified Commit 67329d10 authored by MatejKosec's avatar MatejKosec Committed by GitHub
Browse files

fix: profiler deployment timeout handling for MoE models (#6086)

- Wrap `wait_for_deployment_ready()` in `try`/`except TimeoutError` for both the prefill and decode profiling sweeps.
- On timeout: log an error, record it via `add_profiling_error()`, clean up the timed-out deployment, and continue to the next parallelization mapping.
- Previously, a single deployment timeout would crash the entire profiler job.
parent 9f76d060
......@@ -243,7 +243,8 @@ async def run_profile(args):
for num_gpus in profile_num_gpus:
logger.info(f"Profiling prefill with {num_gpus} GPUs...")
candidate_mappings = get_candidate_parallel_mappings(
num_gpus, args.model_info, EngineType.PREFILL
num_gpus,
args.model_info,
)
for mapping in candidate_mappings:
......@@ -296,9 +297,23 @@ async def run_profile(args):
deployment_clients.append(client) # Track for cleanup
await client.create_deployment(prefill_config_fn)
logger.info("Waiting for deployment to be ready...")
await client.wait_for_deployment_ready(
timeout=getattr(args, "deployment_timeout", 1800)
)
try:
await client.wait_for_deployment_ready(
timeout=getattr(args, "deployment_timeout", 1800)
)
except TimeoutError:
logger.error(
f"Deployment for mapping {mapping.label()} with {num_gpus} GPUs "
f"failed to become ready within timeout during prefill profiling, skipping"
)
add_profiling_error(
f"Mapping {mapping.label()} with {num_gpus} GPUs timed out "
f"during prefill profiling"
)
logger.info("Cleaning up timed-out deployment...")
await client.delete_deployment()
deployment_clients.remove(client)
continue
logger.info("Deployment is ready")
logger.info("Getting deployment logs...")
......@@ -350,7 +365,8 @@ async def run_profile(args):
for num_gpus in profile_num_gpus:
logger.info(f"Profiling decode with {num_gpus} GPUs...")
candidate_mappings = get_candidate_parallel_mappings(
num_gpus, args.model_info, EngineType.DECODE
num_gpus,
args.model_info,
)
for mapping in candidate_mappings:
......@@ -401,9 +417,23 @@ async def run_profile(args):
deployment_clients.append(client) # Track for cleanup
await client.create_deployment(decode_config_fn)
logger.info("Waiting for deployment to be ready...")
await client.wait_for_deployment_ready(
timeout=getattr(args, "deployment_timeout", 1800)
)
try:
await client.wait_for_deployment_ready(
timeout=getattr(args, "deployment_timeout", 1800)
)
except TimeoutError:
logger.error(
f"Deployment for mapping {mapping.label()} with {num_gpus} GPUs "
f"failed to become ready within timeout during decode profiling, skipping"
)
add_profiling_error(
f"Mapping {mapping.label()} with {num_gpus} GPUs timed out "
f"during decode profiling"
)
logger.info("Cleaning up timed-out deployment...")
await client.delete_deployment()
deployment_clients.remove(client)
continue
logger.info("Deployment is ready")
logger.info("Getting deployment logs...")
......
......@@ -169,10 +169,11 @@ def _validate_intermediate_size(
def get_candidate_parallel_mappings(
num_gpus: int, model_info: ModelInfo, phase: str
num_gpus: int,
model_info: ModelInfo,
) -> list[ParallelizationMapping]:
"""
Return a list of candidate parallelization mappings for a given GPU count and phase,
Return a list of candidate parallelization mappings for a given GPU count,
verified against model properties.
Verification rules:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment