Unverified commit 76c96c5d, authored by Graham King and committed by GitHub
Browse files

chore(frontend): Remove the debug_perf flag used for perf work (#7024)


Signed-off-by: Graham King <grahamk@nvidia.com>
parent 10081929
......@@ -71,7 +71,6 @@ class FrontendConfig(KvRouterConfigBase):
event_plane: str
chat_processor: str
enable_anthropic_api: bool
debug_perf: bool
preprocess_workers: int
def validate(self) -> None:
......@@ -357,19 +356,6 @@ class FrontendArgGroup(ArgGroup):
choices=["dynamo", "vllm"],
)
add_negatable_bool_argument(
g,
flag_name="--dyn-debug-perf",
env_var="DYN_DEBUG_PERF",
default=False,
dest="debug_perf",
help=(
"[EXPERIMENTAL] Enable performance instrumentation for diagnosing preprocessing bottlenecks. "
"Logs per-function timing, request concurrency, and hot-path section durations. "
"'--dyn-chat-processor vllm' only."
),
)
add_argument(
g,
flag_name="--dyn-preprocess-workers",
......
......@@ -60,7 +60,7 @@ def setup_engine_factory(
"""
from .vllm_processor import EngineFactory
return EngineFactory(runtime, router_config, config, vllm_flags, config.debug_perf)
return EngineFactory(runtime, router_config, config, vllm_flags)
def parse_args() -> tuple[FrontendConfig, Optional[Namespace]]:
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Performance instrumentation for diagnosing frontend preprocessing bottlenecks.
Activated by passing --dyn-debug-perf to dynamo.frontend.
"""
from __future__ import annotations

import logging
import threading
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Concurrency gauge
# ---------------------------------------------------------------------------
# Guards both counters. ``x += 1`` on a module global is a separate load, add
# and store, and CPython may switch threads between those steps, so holding
# the GIL alone does not make the update atomic.
_gauge_lock = threading.Lock()
_active_requests = 0
_peak_requests = 0


def enter_generator() -> int:
    """Record that a request generator has started.

    Increments the active-request count and raises the recorded peak when the
    new count exceeds it. The update runs under ``_gauge_lock`` so concurrent
    callers on different threads cannot lose an increment or a peak update.

    Returns:
        The active-request count immediately after this increment.
    """
    global _active_requests, _peak_requests
    with _gauge_lock:
        _active_requests += 1
        count = _active_requests
        if count > _peak_requests:
            _peak_requests = count
    return count


def exit_generator() -> int:
    """Record that a request generator has finished.

    Decrements the active-request count under ``_gauge_lock``. The count is
    not clamped: calling this without a matching ``enter_generator`` drives
    it negative, which makes unbalanced calls visible rather than hiding them.

    Returns:
        The active-request count immediately after this decrement.
    """
    global _active_requests
    with _gauge_lock:
        _active_requests -= 1
        return _active_requests


def get_active_requests() -> int:
    """Return the current number of active requests."""
    # A plain read of an int global is a single load; no lock is needed.
    return _active_requests


def get_peak_requests() -> int:
    """Return the highest active-request count seen so far."""
    return _peak_requests
......@@ -81,7 +81,6 @@ class VllmProcessor:
output_processor: OutputProcessor,
tool_parser_class: type[ToolParser] | None,
reasoning_parser_class: type[ReasoningParser] | None,
debug_perf: bool = False,
):
self.tokenizer = tokenizer
self.input_processor = input_processor
......@@ -90,7 +89,6 @@ class VllmProcessor:
self.output_processor = output_processor
self.tool_parser_class = tool_parser_class
self.reasoning_parser_class = reasoning_parser_class
self.debug_perf = debug_perf
# Ideally we would map NVCreateChatCompletionRequest into Python so it can be type checked, but
# it has a lot of fields.
......@@ -103,36 +101,14 @@ class VllmProcessor:
model inference to a backend using the router.
"""
# ** VllmProcessor.generator called: {'messages': [{'role': 'user', 'content': 'What is the capital of Tuvalu?'}], 'model': '/home/grahamk/llms/Qwen3-0.6B', 'max_completion_tokens': 1000, 'stream': False}
if self.debug_perf:
from .perf_instrumentation import enter_generator, exit_generator
active = enter_generator()
t_start = time.monotonic()
logger.info("[perf] generator enter: active_requests=%d", active)
try:
async for item in self._generator_inner(request):
yield item
finally:
if self.debug_perf:
active = exit_generator()
elapsed_ms = (time.monotonic() - t_start) * 1000.0
logger.info(
"[perf] generator exit: total=%.2fms active_requests=%d",
elapsed_ms,
active,
)
async for item in self._generator_inner(request):
yield item
async def _generator_inner(
self, request: dict[str, Any]
) -> AsyncGenerator[dict[str, Any], None]:
request_id = random_uuid()
if self.debug_perf:
t0 = time.monotonic()
pre = await preprocess_chat_request(
request,
tokenizer=self.tokenizer,
......@@ -140,14 +116,6 @@ class VllmProcessor:
tool_parser_class=self.tool_parser_class,
)
if self.debug_perf:
t1 = time.monotonic()
logger.info(
"[perf] preprocess_chat_request: %.2fms (request=%s)",
(t1 - t0) * 1000.0,
request_id,
)
request_for_sampling = pre.request_for_sampling
tool_parser = pre.tool_parser
chat_template_kwargs = pre.chat_template_kwargs
......@@ -207,9 +175,6 @@ class VllmProcessor:
"mm_processor_kwargs"
] = request_for_sampling.mm_processor_kwargs
if self.debug_perf:
t2 = time.monotonic()
vllm_preproc: EngineCoreRequest = self.input_processor.process_inputs(
request_id,
prompt_inputs,
......@@ -222,15 +187,6 @@ class VllmProcessor:
# data_parallel_rank: int | None = None,
)
if self.debug_perf:
t3 = time.monotonic()
logger.info(
"[perf] input_processor.process_inputs: %.2fms (request=%s tokens=%d)",
(t3 - t2) * 1000.0,
request_id,
len(tokens),
)
InputProcessor.assign_request_id(vllm_preproc)
# Processed: EngineCoreRequest(request_id='a2b76a85cd65e151', prompt_token_ids=[3838, 374, 279, 6722, 315, 28649, 25510, 30], mm_features=None, sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=16, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, structured_outputs=None, extra_args=None), pooling_params=None, eos_token_id=151645, arrival_time=1769036937.9417946, lora_request=None, cache_salt=None, data_parallel_rank=None, prompt_embeds=None, client_index=0, current_wave=0, priority=0, trace_headers=None)
......@@ -315,10 +271,6 @@ class VllmProcessor:
) -> AsyncGenerator[dict[str, Any], None]:
self.output_processor.add_request(vllm_preproc, None)
token_count = 0
output_proc_total_ms = 0.0
post_proc_total_ms = 0.0
try:
if self.is_kv_router:
dynamo_stream = await self.router.generate(
......@@ -362,17 +314,10 @@ class VllmProcessor:
stop_reason=stop_reason,
)
if self.debug_perf:
t_op0 = time.monotonic()
vllm_out: OutputProcessorOutput = self.output_processor.process_outputs(
[vllm_response]
)
if self.debug_perf:
t_op1 = time.monotonic()
output_proc_total_ms += (t_op1 - t_op0) * 1000.0
if vllm_out.reqs_to_abort:
pass
......@@ -384,11 +329,6 @@ class VllmProcessor:
if choice:
choices.append(choice)
if self.debug_perf:
t_op2 = time.monotonic()
post_proc_total_ms += (t_op2 - t_op1) * 1000.0
token_count += len(engine_response["token_ids"])
if choices:
dynamo_out = {
"id": request_id,
......@@ -406,18 +346,6 @@ class VllmProcessor:
self.output_processor.abort_requests(
[vllm_preproc.request_id], internal=True
)
if self.debug_perf and token_count > 0:
logger.info(
"[perf] stream done: request=%s tokens=%d "
"output_processor_total=%.2fms (%.3fms/tok) "
"post_processor_total=%.2fms (%.3fms/tok)",
request_id,
token_count,
output_proc_total_ms,
output_proc_total_ms / token_count,
post_proc_total_ms,
post_proc_total_ms / token_count,
)
class EngineFactory:
......@@ -427,7 +355,6 @@ class EngineFactory:
router_config: RouterConfig,
config: FrontendConfig,
flags: Namespace,
debug_perf: bool = False,
):
if config.preprocess_workers != 0:
raise RuntimeError(
......@@ -438,7 +365,6 @@ class EngineFactory:
self.router_config = router_config
self.config = config
self.flags = flags
self.debug_perf = debug_perf
self.stream_interval = 20
raw_stream_interval = os.getenv("DYN_VLLM_STREAM_INTERVAL")
if raw_stream_interval:
......@@ -536,7 +462,6 @@ class EngineFactory:
output_processor,
tool_parser_class,
reasoning_parser_class,
debug_perf=self.debug_perf,
)
return PythonAsyncEngine(gen.generator, loop)
......@@ -317,8 +317,8 @@ class TRTLLMProcess:
time.sleep(2)
@pytest.mark.pre_merge
@pytest.mark.gpu_1
@pytest.mark.nightly
@pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
@pytest.mark.timeout(300)
def test_trtllm_kv_router_basic(
......@@ -420,8 +420,8 @@ def test_router_decisions_trtllm_attention_dp(
)
@pytest.mark.pre_merge
@pytest.mark.gpu_1
@pytest.mark.nightly
@pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
@pytest.mark.timeout(150) # ~3x average (~45s/test), rounded up
def test_router_decisions_trtllm_multiple_workers(
......@@ -461,8 +461,8 @@ def test_router_decisions_trtllm_multiple_workers(
)
@pytest.mark.pre_merge
@pytest.mark.gpu_1
@pytest.mark.nightly
@pytest.mark.timeout(150) # ~3x average (~45s/test), rounded up
@pytest.mark.parametrize(
"store_backend,durable_kv_events,request_plane",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment