Unverified commit 76c96c5d, authored by Graham King, committed by GitHub
Browse files

chore(frontend): Remove the debug_perf flag used for perf work (#7024)


Signed-off-by: Graham King <grahamk@nvidia.com>
parent 10081929
...@@ -71,7 +71,6 @@ class FrontendConfig(KvRouterConfigBase): ...@@ -71,7 +71,6 @@ class FrontendConfig(KvRouterConfigBase):
event_plane: str event_plane: str
chat_processor: str chat_processor: str
enable_anthropic_api: bool enable_anthropic_api: bool
debug_perf: bool
preprocess_workers: int preprocess_workers: int
def validate(self) -> None: def validate(self) -> None:
...@@ -357,19 +356,6 @@ class FrontendArgGroup(ArgGroup): ...@@ -357,19 +356,6 @@ class FrontendArgGroup(ArgGroup):
choices=["dynamo", "vllm"], choices=["dynamo", "vllm"],
) )
add_negatable_bool_argument(
g,
flag_name="--dyn-debug-perf",
env_var="DYN_DEBUG_PERF",
default=False,
dest="debug_perf",
help=(
"[EXPERIMENTAL] Enable performance instrumentation for diagnosing preprocessing bottlenecks. "
"Logs per-function timing, request concurrency, and hot-path section durations. "
"'--dyn-chat-processor vllm' only."
),
)
add_argument( add_argument(
g, g,
flag_name="--dyn-preprocess-workers", flag_name="--dyn-preprocess-workers",
......
...@@ -60,7 +60,7 @@ def setup_engine_factory( ...@@ -60,7 +60,7 @@ def setup_engine_factory(
""" """
from .vllm_processor import EngineFactory from .vllm_processor import EngineFactory
return EngineFactory(runtime, router_config, config, vllm_flags, config.debug_perf) return EngineFactory(runtime, router_config, config, vllm_flags)
def parse_args() -> tuple[FrontendConfig, Optional[Namespace]]: def parse_args() -> tuple[FrontendConfig, Optional[Namespace]]:
......
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
Performance instrumentation for diagnosing frontend preprocessing bottlenecks.
Activated by passing --dyn-debug-perf to dynamo.frontend.
"""
from __future__ import annotations

import logging
import threading
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Concurrency gauge
# ---------------------------------------------------------------------------
# Gauge state. Every mutation goes through _gauge_lock so the counters stay
# consistent when callers run on more than one thread.
_gauge_lock = threading.Lock()
_active_requests = 0
_peak_requests = 0


def enter_generator() -> int:
    """Record that a request has started and return the new active count.

    Also raises the peak-request high-water mark when the new count
    exceeds it.

    The update runs under a lock: ``_active_requests += 1`` is a separate
    load, add, and store, so the GIL alone does not make it atomic with
    respect to other Python threads.

    Returns:
        The number of active requests after the increment.
    """
    global _active_requests, _peak_requests
    with _gauge_lock:
        _active_requests += 1
        count = _active_requests
        if count > _peak_requests:
            _peak_requests = count
    return count


def exit_generator() -> int:
    """Record that a request has finished and return the new active count.

    The decrement runs under the same lock as ``enter_generator`` so that
    concurrent enters and exits cannot lose an update. The peak value is
    left untouched.

    Returns:
        The number of active requests after the decrement.
    """
    global _active_requests
    with _gauge_lock:
        _active_requests -= 1
        return _active_requests


def get_active_requests() -> int:
    """Return the current number of active requests."""
    # A single read of a module global needs no lock.
    return _active_requests


def get_peak_requests() -> int:
    """Return the highest active-request count observed so far."""
    # A single read of a module global needs no lock.
    return _peak_requests
...@@ -81,7 +81,6 @@ class VllmProcessor: ...@@ -81,7 +81,6 @@ class VllmProcessor:
output_processor: OutputProcessor, output_processor: OutputProcessor,
tool_parser_class: type[ToolParser] | None, tool_parser_class: type[ToolParser] | None,
reasoning_parser_class: type[ReasoningParser] | None, reasoning_parser_class: type[ReasoningParser] | None,
debug_perf: bool = False,
): ):
self.tokenizer = tokenizer self.tokenizer = tokenizer
self.input_processor = input_processor self.input_processor = input_processor
...@@ -90,7 +89,6 @@ class VllmProcessor: ...@@ -90,7 +89,6 @@ class VllmProcessor:
self.output_processor = output_processor self.output_processor = output_processor
self.tool_parser_class = tool_parser_class self.tool_parser_class = tool_parser_class
self.reasoning_parser_class = reasoning_parser_class self.reasoning_parser_class = reasoning_parser_class
self.debug_perf = debug_perf
# Ideally we would map NVCreateChatCompletionRequest into Python so it can be type checked, but # Ideally we would map NVCreateChatCompletionRequest into Python so it can be type checked, but
# it has a lot of fields. # it has a lot of fields.
...@@ -103,36 +101,14 @@ class VllmProcessor: ...@@ -103,36 +101,14 @@ class VllmProcessor:
model inference to a backend using the router. model inference to a backend using the router.
""" """
# ** VllmProcessor.generator called: {'messages': [{'role': 'user', 'content': 'What is the capital of Tuvalu?'}], 'model': '/home/grahamk/llms/Qwen3-0.6B', 'max_completion_tokens': 1000, 'stream': False}
if self.debug_perf:
from .perf_instrumentation import enter_generator, exit_generator
active = enter_generator()
t_start = time.monotonic()
logger.info("[perf] generator enter: active_requests=%d", active)
try:
async for item in self._generator_inner(request): async for item in self._generator_inner(request):
yield item yield item
finally:
if self.debug_perf:
active = exit_generator()
elapsed_ms = (time.monotonic() - t_start) * 1000.0
logger.info(
"[perf] generator exit: total=%.2fms active_requests=%d",
elapsed_ms,
active,
)
async def _generator_inner( async def _generator_inner(
self, request: dict[str, Any] self, request: dict[str, Any]
) -> AsyncGenerator[dict[str, Any], None]: ) -> AsyncGenerator[dict[str, Any], None]:
request_id = random_uuid() request_id = random_uuid()
if self.debug_perf:
t0 = time.monotonic()
pre = await preprocess_chat_request( pre = await preprocess_chat_request(
request, request,
tokenizer=self.tokenizer, tokenizer=self.tokenizer,
...@@ -140,14 +116,6 @@ class VllmProcessor: ...@@ -140,14 +116,6 @@ class VllmProcessor:
tool_parser_class=self.tool_parser_class, tool_parser_class=self.tool_parser_class,
) )
if self.debug_perf:
t1 = time.monotonic()
logger.info(
"[perf] preprocess_chat_request: %.2fms (request=%s)",
(t1 - t0) * 1000.0,
request_id,
)
request_for_sampling = pre.request_for_sampling request_for_sampling = pre.request_for_sampling
tool_parser = pre.tool_parser tool_parser = pre.tool_parser
chat_template_kwargs = pre.chat_template_kwargs chat_template_kwargs = pre.chat_template_kwargs
...@@ -207,9 +175,6 @@ class VllmProcessor: ...@@ -207,9 +175,6 @@ class VllmProcessor:
"mm_processor_kwargs" "mm_processor_kwargs"
] = request_for_sampling.mm_processor_kwargs ] = request_for_sampling.mm_processor_kwargs
if self.debug_perf:
t2 = time.monotonic()
vllm_preproc: EngineCoreRequest = self.input_processor.process_inputs( vllm_preproc: EngineCoreRequest = self.input_processor.process_inputs(
request_id, request_id,
prompt_inputs, prompt_inputs,
...@@ -222,15 +187,6 @@ class VllmProcessor: ...@@ -222,15 +187,6 @@ class VllmProcessor:
# data_parallel_rank: int | None = None, # data_parallel_rank: int | None = None,
) )
if self.debug_perf:
t3 = time.monotonic()
logger.info(
"[perf] input_processor.process_inputs: %.2fms (request=%s tokens=%d)",
(t3 - t2) * 1000.0,
request_id,
len(tokens),
)
InputProcessor.assign_request_id(vllm_preproc) InputProcessor.assign_request_id(vllm_preproc)
# Processed: EngineCoreRequest(request_id='a2b76a85cd65e151', prompt_token_ids=[3838, 374, 279, 6722, 315, 28649, 25510, 30], mm_features=None, sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=16, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, structured_outputs=None, extra_args=None), pooling_params=None, eos_token_id=151645, arrival_time=1769036937.9417946, lora_request=None, cache_salt=None, data_parallel_rank=None, prompt_embeds=None, client_index=0, current_wave=0, priority=0, trace_headers=None) # Processed: EngineCoreRequest(request_id='a2b76a85cd65e151', prompt_token_ids=[3838, 374, 279, 6722, 315, 28649, 25510, 30], mm_features=None, sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=16, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, structured_outputs=None, extra_args=None), pooling_params=None, eos_token_id=151645, arrival_time=1769036937.9417946, lora_request=None, cache_salt=None, data_parallel_rank=None, prompt_embeds=None, client_index=0, current_wave=0, priority=0, trace_headers=None)
...@@ -315,10 +271,6 @@ class VllmProcessor: ...@@ -315,10 +271,6 @@ class VllmProcessor:
) -> AsyncGenerator[dict[str, Any], None]: ) -> AsyncGenerator[dict[str, Any], None]:
self.output_processor.add_request(vllm_preproc, None) self.output_processor.add_request(vllm_preproc, None)
token_count = 0
output_proc_total_ms = 0.0
post_proc_total_ms = 0.0
try: try:
if self.is_kv_router: if self.is_kv_router:
dynamo_stream = await self.router.generate( dynamo_stream = await self.router.generate(
...@@ -362,17 +314,10 @@ class VllmProcessor: ...@@ -362,17 +314,10 @@ class VllmProcessor:
stop_reason=stop_reason, stop_reason=stop_reason,
) )
if self.debug_perf:
t_op0 = time.monotonic()
vllm_out: OutputProcessorOutput = self.output_processor.process_outputs( vllm_out: OutputProcessorOutput = self.output_processor.process_outputs(
[vllm_response] [vllm_response]
) )
if self.debug_perf:
t_op1 = time.monotonic()
output_proc_total_ms += (t_op1 - t_op0) * 1000.0
if vllm_out.reqs_to_abort: if vllm_out.reqs_to_abort:
pass pass
...@@ -384,11 +329,6 @@ class VllmProcessor: ...@@ -384,11 +329,6 @@ class VllmProcessor:
if choice: if choice:
choices.append(choice) choices.append(choice)
if self.debug_perf:
t_op2 = time.monotonic()
post_proc_total_ms += (t_op2 - t_op1) * 1000.0
token_count += len(engine_response["token_ids"])
if choices: if choices:
dynamo_out = { dynamo_out = {
"id": request_id, "id": request_id,
...@@ -406,18 +346,6 @@ class VllmProcessor: ...@@ -406,18 +346,6 @@ class VllmProcessor:
self.output_processor.abort_requests( self.output_processor.abort_requests(
[vllm_preproc.request_id], internal=True [vllm_preproc.request_id], internal=True
) )
if self.debug_perf and token_count > 0:
logger.info(
"[perf] stream done: request=%s tokens=%d "
"output_processor_total=%.2fms (%.3fms/tok) "
"post_processor_total=%.2fms (%.3fms/tok)",
request_id,
token_count,
output_proc_total_ms,
output_proc_total_ms / token_count,
post_proc_total_ms,
post_proc_total_ms / token_count,
)
class EngineFactory: class EngineFactory:
...@@ -427,7 +355,6 @@ class EngineFactory: ...@@ -427,7 +355,6 @@ class EngineFactory:
router_config: RouterConfig, router_config: RouterConfig,
config: FrontendConfig, config: FrontendConfig,
flags: Namespace, flags: Namespace,
debug_perf: bool = False,
): ):
if config.preprocess_workers != 0: if config.preprocess_workers != 0:
raise RuntimeError( raise RuntimeError(
...@@ -438,7 +365,6 @@ class EngineFactory: ...@@ -438,7 +365,6 @@ class EngineFactory:
self.router_config = router_config self.router_config = router_config
self.config = config self.config = config
self.flags = flags self.flags = flags
self.debug_perf = debug_perf
self.stream_interval = 20 self.stream_interval = 20
raw_stream_interval = os.getenv("DYN_VLLM_STREAM_INTERVAL") raw_stream_interval = os.getenv("DYN_VLLM_STREAM_INTERVAL")
if raw_stream_interval: if raw_stream_interval:
...@@ -536,7 +462,6 @@ class EngineFactory: ...@@ -536,7 +462,6 @@ class EngineFactory:
output_processor, output_processor,
tool_parser_class, tool_parser_class,
reasoning_parser_class, reasoning_parser_class,
debug_perf=self.debug_perf,
) )
return PythonAsyncEngine(gen.generator, loop) return PythonAsyncEngine(gen.generator, loop)
...@@ -317,8 +317,8 @@ class TRTLLMProcess: ...@@ -317,8 +317,8 @@ class TRTLLMProcess:
time.sleep(2) time.sleep(2)
@pytest.mark.pre_merge
@pytest.mark.gpu_1 @pytest.mark.gpu_1
@pytest.mark.nightly
@pytest.mark.parametrize("request_plane", ["tcp"], indirect=True) @pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
@pytest.mark.timeout(300) @pytest.mark.timeout(300)
def test_trtllm_kv_router_basic( def test_trtllm_kv_router_basic(
...@@ -420,8 +420,8 @@ def test_router_decisions_trtllm_attention_dp( ...@@ -420,8 +420,8 @@ def test_router_decisions_trtllm_attention_dp(
) )
@pytest.mark.pre_merge
@pytest.mark.gpu_1 @pytest.mark.gpu_1
@pytest.mark.nightly
@pytest.mark.parametrize("request_plane", ["tcp"], indirect=True) @pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
@pytest.mark.timeout(150) # ~3x average (~45s/test), rounded up @pytest.mark.timeout(150) # ~3x average (~45s/test), rounded up
def test_router_decisions_trtllm_multiple_workers( def test_router_decisions_trtllm_multiple_workers(
...@@ -461,8 +461,8 @@ def test_router_decisions_trtllm_multiple_workers( ...@@ -461,8 +461,8 @@ def test_router_decisions_trtllm_multiple_workers(
) )
@pytest.mark.pre_merge
@pytest.mark.gpu_1 @pytest.mark.gpu_1
@pytest.mark.nightly
@pytest.mark.timeout(150) # ~3x average (~45s/test), rounded up @pytest.mark.timeout(150) # ~3x average (~45s/test), rounded up
@pytest.mark.parametrize( @pytest.mark.parametrize(
"store_backend,durable_kv_events,request_plane", "store_backend,durable_kv_events,request_plane",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment