fix: use actual service names for profiler logs and handle FileNotFoundError correctly (#6112)

Signed-off-by: Hannah Zhang <hannahz@nvidia.com>

fix: use actual service names for profiler logs and handle FileNotFoundError correctly (#6112)
Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
1aab7f6b · hhzhang16 · GitHub · 120ae649 · 1aab7f6b · 1aab7f6b
Unverified commit 1aab7f6b, authored Feb 10, 2026 by hhzhang16; committed by GitHub on Feb 10, 2026.
Showing with 37 additions and 19 deletions

benchmarks/profiler/profile_sla.py +14 -3

benchmarks/profiler/utils/config_modifiers/vllm.py +23 -16

No files found.
--- a/benchmarks/profiler/profile_sla.py
+++ b/benchmarks/profiler/profile_sla.py
@@ -26,6 +26,7 @@ from benchmarks.profiler.utils.aiperf import (
    get_decode_itl_and_thpt_per_gpu,
    get_prefill_ttft,
 )
+from benchmarks.profiler.utils.config import Config, get_service_name_by_type
 from benchmarks.profiler.utils.config_modifiers import CONFIG_MODIFIERS
 from benchmarks.profiler.utils.config_modifiers.parallelization_mapping import (
    ParallelizationMapping,
@@ -63,7 +64,7 @@ from deploy.utils.dynamo_deployment import (
    DynamoDeploymentClient,
    cleanup_remaining_deployments,
 )
-from dynamo.planner.defaults import WORKER_COMPONENT_NAMES, SubComponentType
+from dynamo.planner.defaults import SubComponentType


 @dataclass
@@ -445,8 +446,13 @@ async def run_profile(args):
                    # Compute max_concurrency and max_kv_tokens to know which
                    # num_request to sweep over.
                    attention_dp_size = mapping.get_attn_dp_size()
+                    # Get the actual decode service name from the config
+                    decode_cfg = Config.model_validate(decode_config)
+                    decode_service_name = get_service_name_by_type(
+                        decode_cfg, args.backend, SubComponentType.DECODE
+                    ).lower()
                    max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
-                        f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log",
+                        f"{work_dir}/{client.deployment_name}/{decode_service_name}/0.log",
                        attention_dp_size=attention_dp_size,
                    )
                    max_concurrency = max_kv_tokens // (args.isl + args.osl)
@@ -762,8 +768,13 @@ async def run_profile(args):
            )

            attention_dp_size = best_decode_mapping.get_attn_dp_size()
+            # Get the actual decode service name from the config
+            decode_cfg = Config.model_validate(decode_config)
+            decode_service_name = get_service_name_by_type(
+                decode_cfg, args.backend, SubComponentType.DECODE
+            ).lower()
            max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
-                f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log",
+                f"{work_dir}/{client.deployment_name}/{decode_service_name}/0.log",
                attention_dp_size=attention_dp_size,
            )


--- a/benchmarks/profiler/utils/config_modifiers/vllm.py
+++ b/benchmarks/profiler/utils/config_modifiers/vllm.py
@@ -322,11 +322,14 @@ class VllmV1ConfigModifier(BaseConfigModifier):
            with open(dynamo_log_fn, "r") as f:
                for line in f:
                    if "Maximum concurrency for" in line:
+                        try:
                            line = line.strip().split("Maximum concurrency for ")[1]
                            token_count = int(
                                line.split(" tokens per request: ")[0].replace(",", "")
                            )
-                        concurrency = float(line.split(" tokens per request: ")[1][:-1])
+                            concurrency = float(
+                                line.split(" tokens per request: ")[1][:-1]
+                            )

                            # Log shows per-rank KV cache; multiply by attention_dp_size for total
                            kv_cache_per_rank = int(token_count * concurrency)
@@ -339,6 +342,10 @@ class VllmV1ConfigModifier(BaseConfigModifier):
                            logger.warning(
                                f"Failed to parse KV cache size from line: {line}. Error: {e}"
                            )
+        except FileNotFoundError:
+            logger.warning(f"Log file not found: {dynamo_log_fn}")
+        except Exception as e:
+            logger.warning(f"Failed to read log file {dynamo_log_fn}: {e}")
        return 0

    @classmethod