Unverified commit 1aab7f6b, authored by hhzhang16, committed by GitHub
Browse files

fix: use actual service names for profiler logs and handle FileNotFoundError correctly (#6112)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 120ae649
...@@ -26,6 +26,7 @@ from benchmarks.profiler.utils.aiperf import ( ...@@ -26,6 +26,7 @@ from benchmarks.profiler.utils.aiperf import (
get_decode_itl_and_thpt_per_gpu, get_decode_itl_and_thpt_per_gpu,
get_prefill_ttft, get_prefill_ttft,
) )
from benchmarks.profiler.utils.config import Config, get_service_name_by_type
from benchmarks.profiler.utils.config_modifiers import CONFIG_MODIFIERS from benchmarks.profiler.utils.config_modifiers import CONFIG_MODIFIERS
from benchmarks.profiler.utils.config_modifiers.parallelization_mapping import ( from benchmarks.profiler.utils.config_modifiers.parallelization_mapping import (
ParallelizationMapping, ParallelizationMapping,
...@@ -63,7 +64,7 @@ from deploy.utils.dynamo_deployment import ( ...@@ -63,7 +64,7 @@ from deploy.utils.dynamo_deployment import (
DynamoDeploymentClient, DynamoDeploymentClient,
cleanup_remaining_deployments, cleanup_remaining_deployments,
) )
from dynamo.planner.defaults import WORKER_COMPONENT_NAMES, SubComponentType from dynamo.planner.defaults import SubComponentType
@dataclass @dataclass
...@@ -445,8 +446,13 @@ async def run_profile(args): ...@@ -445,8 +446,13 @@ async def run_profile(args):
# Compute max_concurrency and max_kv_tokens to know which # Compute max_concurrency and max_kv_tokens to know which
# num_request to sweep over. # num_request to sweep over.
attention_dp_size = mapping.get_attn_dp_size() attention_dp_size = mapping.get_attn_dp_size()
# Get the actual decode service name from the config
decode_cfg = Config.model_validate(decode_config)
decode_service_name = get_service_name_by_type(
decode_cfg, args.backend, SubComponentType.DECODE
).lower()
max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log", f"{work_dir}/{client.deployment_name}/{decode_service_name}/0.log",
attention_dp_size=attention_dp_size, attention_dp_size=attention_dp_size,
) )
max_concurrency = max_kv_tokens // (args.isl + args.osl) max_concurrency = max_kv_tokens // (args.isl + args.osl)
...@@ -762,8 +768,13 @@ async def run_profile(args): ...@@ -762,8 +768,13 @@ async def run_profile(args):
) )
attention_dp_size = best_decode_mapping.get_attn_dp_size() attention_dp_size = best_decode_mapping.get_attn_dp_size()
# Get the actual decode service name from the config
decode_cfg = Config.model_validate(decode_config)
decode_service_name = get_service_name_by_type(
decode_cfg, args.backend, SubComponentType.DECODE
).lower()
max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log( max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log", f"{work_dir}/{client.deployment_name}/{decode_service_name}/0.log",
attention_dp_size=attention_dp_size, attention_dp_size=attention_dp_size,
) )
......
...@@ -322,11 +322,14 @@ class VllmV1ConfigModifier(BaseConfigModifier): ...@@ -322,11 +322,14 @@ class VllmV1ConfigModifier(BaseConfigModifier):
with open(dynamo_log_fn, "r") as f: with open(dynamo_log_fn, "r") as f:
for line in f: for line in f:
if "Maximum concurrency for" in line: if "Maximum concurrency for" in line:
try:
line = line.strip().split("Maximum concurrency for ")[1] line = line.strip().split("Maximum concurrency for ")[1]
token_count = int( token_count = int(
line.split(" tokens per request: ")[0].replace(",", "") line.split(" tokens per request: ")[0].replace(",", "")
) )
concurrency = float(line.split(" tokens per request: ")[1][:-1]) concurrency = float(
line.split(" tokens per request: ")[1][:-1]
)
# Log shows per-rank KV cache; multiply by attention_dp_size for total # Log shows per-rank KV cache; multiply by attention_dp_size for total
kv_cache_per_rank = int(token_count * concurrency) kv_cache_per_rank = int(token_count * concurrency)
...@@ -339,6 +342,10 @@ class VllmV1ConfigModifier(BaseConfigModifier): ...@@ -339,6 +342,10 @@ class VllmV1ConfigModifier(BaseConfigModifier):
logger.warning( logger.warning(
f"Failed to parse KV cache size from line: {line}. Error: {e}" f"Failed to parse KV cache size from line: {line}. Error: {e}"
) )
except FileNotFoundError:
logger.warning(f"Log file not found: {dynamo_log_fn}")
except Exception as e:
logger.warning(f"Failed to read log file {dynamo_log_fn}: {e}")
return 0 return 0
@classmethod @classmethod
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment