fix: profiler deployment timeout handling for MoE models (#6086)

Wrap wait_for_deployment_ready() in try/except TimeoutError for both the prefill and decode profiling sweeps. On timeout: log the error, record it via add_profiling_error(), clean up the timed-out deployment, and continue to the next parallelization mapping. Previously, a single deployment timeout would crash the entire profiler job.

fix: profiler deployment timeout handling for MoE models (#6086)
Wrap wait_for_deployment_ready() in try/except TimeoutError for both the prefill and decode profiling sweeps. On timeout: log the error, record it via add_profiling_error(), clean up the timed-out deployment, and continue to the next parallelization mapping. Previously, a single deployment timeout would crash the entire profiler job.
67329d10 · MatejKosec · GitHub · 9f76d060 · 67329d10 · 67329d10
Unverified Commit 67329d10 authored Feb 09, 2026 by MatejKosec Committed by GitHub Feb 09, 2026
Showing with 41 additions and 10 deletions

benchmarks/profiler/profile_sla.py benchmarks/profiler/profile_sla.py +38 -8

benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py ...rofiler/utils/config_modifiers/parallelization_mapping.py +3 -2

No files found.
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -243,7 +243,8 @@ async def run_profile(args):
        for num_gpus in profile_num_gpus:
            logger.info(f"Profiling prefill with {num_gpus} GPUs...")
            candidate_mappings = get_candidate_parallel_mappings(
-                num_gpus, args.model_info, EngineType.PREFILL
+                num_gpus,
+                args.model_info,
            )

            for mapping in candidate_mappings:
@@ -296,9 +297,23 @@ async def run_profile(args):
                    deployment_clients.append(client)  # Track for cleanup
                    await client.create_deployment(prefill_config_fn)
                    logger.info("Waiting for deployment to be ready...")
-                    await client.wait_for_deployment_ready(
-                        timeout=getattr(args, "deployment_timeout", 1800)
-                    )
+                    try:
+                        await client.wait_for_deployment_ready(
+                            timeout=getattr(args, "deployment_timeout", 1800)
+                        )
+                    except TimeoutError:
+                        logger.error(
+                            f"Deployment for mapping {mapping.label()} with {num_gpus} GPUs "
+                            f"failed to become ready within timeout during prefill profiling, skipping"
+                        )
+                        add_profiling_error(
+                            f"Mapping {mapping.label()} with {num_gpus} GPUs timed out "
+                            f"during prefill profiling"
+                        )
+                        logger.info("Cleaning up timed-out deployment...")
+                        await client.delete_deployment()
+                        deployment_clients.remove(client)
+                        continue
                    logger.info("Deployment is ready")

                    logger.info("Getting deployment logs...")
@@ -350,7 +365,8 @@ async def run_profile(args):
        for num_gpus in profile_num_gpus:
            logger.info(f"Profiling decode with {num_gpus} GPUs...")
            candidate_mappings = get_candidate_parallel_mappings(
-                num_gpus, args.model_info, EngineType.DECODE
+                num_gpus,
+                args.model_info,
            )

            for mapping in candidate_mappings:
@@ -401,9 +417,23 @@ async def run_profile(args):
                    deployment_clients.append(client)  # Track for cleanup
                    await client.create_deployment(decode_config_fn)
                    logger.info("Waiting for deployment to be ready...")
-                    await client.wait_for_deployment_ready(
-                        timeout=getattr(args, "deployment_timeout", 1800)
-                    )
+                    try:
+                        await client.wait_for_deployment_ready(
+                            timeout=getattr(args, "deployment_timeout", 1800)
+                        )
+                    except TimeoutError:
+                        logger.error(
+                            f"Deployment for mapping {mapping.label()} with {num_gpus} GPUs "
+                            f"failed to become ready within timeout during decode profiling, skipping"
+                        )
+                        add_profiling_error(
+                            f"Mapping {mapping.label()} with {num_gpus} GPUs timed out "
+                            f"during decode profiling"
+                        )
+                        logger.info("Cleaning up timed-out deployment...")
+                        await client.delete_deployment()
+                        deployment_clients.remove(client)
+                        continue
                    logger.info("Deployment is ready")

                    logger.info("Getting deployment logs...")

--- a/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py
+++ b/benchmarks/profiler/utils/config_modifiers/parallelization_mapping.py
@@ -169,10 +169,11 @@ def _validate_intermediate_size(


 def get_candidate_parallel_mappings(
-    num_gpus: int, model_info: ModelInfo, phase: str
+    num_gpus: int,
+    model_info: ModelInfo,
 ) -> list[ParallelizationMapping]:
    """
-    Return a list of candidate parallelization mappings for a given GPU count and phase,
+    Return a list of candidate parallelization mappings for a given GPU count,
    verified against model properties.

    Verification rules: