Unverified commit 1aab7f6b, authored by hhzhang16 and committed by GitHub
Browse files

fix: use actual service names for profiler logs and handle FileNotFoundError correctly (#6112)


Signed-off-by: Hannah Zhang <hannahz@nvidia.com>
parent 120ae649
......@@ -26,6 +26,7 @@ from benchmarks.profiler.utils.aiperf import (
get_decode_itl_and_thpt_per_gpu,
get_prefill_ttft,
)
from benchmarks.profiler.utils.config import Config, get_service_name_by_type
from benchmarks.profiler.utils.config_modifiers import CONFIG_MODIFIERS
from benchmarks.profiler.utils.config_modifiers.parallelization_mapping import (
ParallelizationMapping,
......@@ -63,7 +64,7 @@ from deploy.utils.dynamo_deployment import (
DynamoDeploymentClient,
cleanup_remaining_deployments,
)
from dynamo.planner.defaults import WORKER_COMPONENT_NAMES, SubComponentType
from dynamo.planner.defaults import SubComponentType
@dataclass
......@@ -445,8 +446,13 @@ async def run_profile(args):
# Compute max_concurrency and max_kv_tokens to know which
# num_request to sweep over.
attention_dp_size = mapping.get_attn_dp_size()
# Get the actual decode service name from the config
decode_cfg = Config.model_validate(decode_config)
decode_service_name = get_service_name_by_type(
decode_cfg, args.backend, SubComponentType.DECODE
).lower()
max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log",
f"{work_dir}/{client.deployment_name}/{decode_service_name}/0.log",
attention_dp_size=attention_dp_size,
)
max_concurrency = max_kv_tokens // (args.isl + args.osl)
......@@ -762,8 +768,13 @@ async def run_profile(args):
)
attention_dp_size = best_decode_mapping.get_attn_dp_size()
# Get the actual decode service name from the config
decode_cfg = Config.model_validate(decode_config)
decode_service_name = get_service_name_by_type(
decode_cfg, args.backend, SubComponentType.DECODE
).lower()
max_kv_tokens = config_modifier.get_kv_cache_size_from_dynamo_log(
f"{work_dir}/{client.deployment_name}/{WORKER_COMPONENT_NAMES[args.backend].decode_worker_k8s_name.lower()}/0.log",
f"{work_dir}/{client.deployment_name}/{decode_service_name}/0.log",
attention_dp_size=attention_dp_size,
)
......
......@@ -322,11 +322,14 @@ class VllmV1ConfigModifier(BaseConfigModifier):
with open(dynamo_log_fn, "r") as f:
for line in f:
if "Maximum concurrency for" in line:
try:
line = line.strip().split("Maximum concurrency for ")[1]
token_count = int(
line.split(" tokens per request: ")[0].replace(",", "")
)
concurrency = float(line.split(" tokens per request: ")[1][:-1])
concurrency = float(
line.split(" tokens per request: ")[1][:-1]
)
# Log shows per-rank KV cache; multiply by attention_dp_size for total
kv_cache_per_rank = int(token_count * concurrency)
......@@ -339,6 +342,10 @@ class VllmV1ConfigModifier(BaseConfigModifier):
logger.warning(
f"Failed to parse KV cache size from line: {line}. Error: {e}"
)
except FileNotFoundError:
logger.warning(f"Log file not found: {dynamo_log_fn}")
except Exception as e:
logger.warning(f"Failed to read log file {dynamo_log_fn}: {e}")
return 0
@classmethod
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment