chore(frontend): Remove the debug_perf flag used for perf work (#7024)

Signed-off-by: Graham King <grahamk@nvidia.com>

chore(frontend): Remove the debug_perf flag used for perf work (#7024)
Signed-off-by: Graham King <grahamk@nvidia.com>
76c96c5d · Graham King · GitHub · 10081929 · 76c96c5d · 76c96c5d
Unverified commit 76c96c5d, authored Mar 06, 2026 by Graham King; committed by GitHub Mar 06, 2026.
5 changed files
--- a/components/src/dynamo/frontend/frontend_args.py
+++ b/components/src/dynamo/frontend/frontend_args.py
@@ -71,7 +71,6 @@ class FrontendConfig(KvRouterConfigBase):
    event_plane: str
    chat_processor: str
    enable_anthropic_api: bool
-    debug_perf: bool
    preprocess_workers: int

    def validate(self) -> None:
@@ -357,19 +356,6 @@ class FrontendArgGroup(ArgGroup):
            choices=["dynamo", "vllm"],
        )

-        add_negatable_bool_argument(
-            g,
-            flag_name="--dyn-debug-perf",
-            env_var="DYN_DEBUG_PERF",
-            default=False,
-            dest="debug_perf",
-            help=(
-                "[EXPERIMENTAL] Enable performance instrumentation for diagnosing preprocessing bottlenecks. "
-                "Logs per-function timing, request concurrency, and hot-path section durations. "
-                "'--dyn-chat-processor vllm' only."
-            ),
-        )
-
        add_argument(
            g,
            flag_name="--dyn-preprocess-workers",

--- a/components/src/dynamo/frontend/main.py
+++ b/components/src/dynamo/frontend/main.py
@@ -60,7 +60,7 @@ def setup_engine_factory(
    """
    from .vllm_processor import EngineFactory

-    return EngineFactory(runtime, router_config, config, vllm_flags, config.debug_perf)
+    return EngineFactory(runtime, router_config, config, vllm_flags)


 def parse_args() -> tuple[FrontendConfig, Optional[Namespace]]:

--- a/components/src/dynamo/frontend/perf_instrumentation.py
+++ b/components/src/dynamo/frontend/perf_instrumentation.py
-#  SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-#  SPDX-License-Identifier: Apache-2.0
-
-"""
-Performance instrumentation for diagnosing frontend preprocessing bottlenecks.
-
-Activated by passing --dyn-debug-perf to dynamo.frontend.
-"""
-
-from __future__ import annotations
-
-import logging
-
-logger = logging.getLogger(__name__)
-
-# ---------------------------------------------------------------------------
-# Concurrency gauge
-# ---------------------------------------------------------------------------
-
-_active_requests = 0
-_peak_requests = 0
-
-
-def enter_generator() -> int:
-    """Increment active request count. Returns current count.
-
-    Safe without a lock: only called while the GIL is held (all callers are
-    in Python code), so the read-modify-write on the global int is atomic
-    with respect to other Python threads.
-    """
-    global _active_requests, _peak_requests
-    _active_requests += 1
-    count = _active_requests
-    if count > _peak_requests:
-        _peak_requests = count
-    return count
-
-
-def exit_generator() -> int:
-    """Decrement active request count. Returns current count."""
-    global _active_requests
-    _active_requests -= 1
-    return _active_requests
-
-
-def get_active_requests() -> int:
-    return _active_requests
-
-
-def get_peak_requests() -> int:
-    return _peak_requests
--- a/components/src/dynamo/frontend/vllm_processor.py
+++ b/components/src/dynamo/frontend/vllm_processor.py
@@ -81,7 +81,6 @@ class VllmProcessor:
        output_processor: OutputProcessor,
        tool_parser_class: type[ToolParser] | None,
        reasoning_parser_class: type[ReasoningParser] | None,
-        debug_perf: bool = False,
    ):
        self.tokenizer = tokenizer
        self.input_processor = input_processor
@@ -90,7 +89,6 @@ class VllmProcessor:
        self.output_processor = output_processor
        self.tool_parser_class = tool_parser_class
        self.reasoning_parser_class = reasoning_parser_class
-        self.debug_perf = debug_perf

    # Ideally we would map NVCreateChatCompletionRequest into Python so it can be type checked, but
    # it has a lot of fields.
@@ -103,36 +101,14 @@ class VllmProcessor:
        model inference to a backend using the router.
        """

-        # ** VllmProcessor.generator called: {'messages': [{'role': 'user', 'content': 'What is the capital of Tuvalu?'}], 'model': '/home/grahamk/llms/Qwen3-0.6B', 'max_completion_tokens': 1000, 'stream': False}
-
-        if self.debug_perf:
-            from .perf_instrumentation import enter_generator, exit_generator
-
-            active = enter_generator()
-            t_start = time.monotonic()
-            logger.info("[perf] generator enter: active_requests=%d", active)
-
-        try:
-            async for item in self._generator_inner(request):
-                yield item
-        finally:
-            if self.debug_perf:
-                active = exit_generator()
-                elapsed_ms = (time.monotonic() - t_start) * 1000.0
-                logger.info(
-                    "[perf] generator exit: total=%.2fms active_requests=%d",
-                    elapsed_ms,
-                    active,
-                )
+        async for item in self._generator_inner(request):
+            yield item

    async def _generator_inner(
        self, request: dict[str, Any]
    ) -> AsyncGenerator[dict[str, Any], None]:
        request_id = random_uuid()

-        if self.debug_perf:
-            t0 = time.monotonic()
-
        pre = await preprocess_chat_request(
            request,
            tokenizer=self.tokenizer,
@@ -140,14 +116,6 @@ class VllmProcessor:
            tool_parser_class=self.tool_parser_class,
        )

-        if self.debug_perf:
-            t1 = time.monotonic()
-            logger.info(
-                "[perf] preprocess_chat_request: %.2fms (request=%s)",
-                (t1 - t0) * 1000.0,
-                request_id,
-            )
-
        request_for_sampling = pre.request_for_sampling
        tool_parser = pre.tool_parser
        chat_template_kwargs = pre.chat_template_kwargs
@@ -207,9 +175,6 @@ class VllmProcessor:
                "mm_processor_kwargs"
            ] = request_for_sampling.mm_processor_kwargs

-        if self.debug_perf:
-            t2 = time.monotonic()
-
        vllm_preproc: EngineCoreRequest = self.input_processor.process_inputs(
            request_id,
            prompt_inputs,
@@ -222,15 +187,6 @@ class VllmProcessor:
            # data_parallel_rank: int | None = None,
        )

-        if self.debug_perf:
-            t3 = time.monotonic()
-            logger.info(
-                "[perf] input_processor.process_inputs: %.2fms (request=%s tokens=%d)",
-                (t3 - t2) * 1000.0,
-                request_id,
-                len(tokens),
-            )
-
        InputProcessor.assign_request_id(vllm_preproc)

        # Processed: EngineCoreRequest(request_id='a2b76a85cd65e151', prompt_token_ids=[3838, 374, 279, 6722, 315, 28649, 25510, 30], mm_features=None, sampling_params=SamplingParams(n=1, presence_penalty=0.0, frequency_penalty=0.0, repetition_penalty=1.0, temperature=1.0, top_p=1.0, top_k=0, min_p=0.0, seed=None, stop=[], stop_token_ids=[151643], bad_words=[], include_stop_str_in_output=False, ignore_eos=False, max_tokens=16, min_tokens=0, logprobs=None, prompt_logprobs=None, skip_special_tokens=True, spaces_between_special_tokens=True, truncate_prompt_tokens=None, structured_outputs=None, extra_args=None), pooling_params=None, eos_token_id=151645, arrival_time=1769036937.9417946, lora_request=None, cache_salt=None, data_parallel_rank=None, prompt_embeds=None, client_index=0, current_wave=0, priority=0, trace_headers=None)
@@ -315,10 +271,6 @@ class VllmProcessor:
    ) -> AsyncGenerator[dict[str, Any], None]:
        self.output_processor.add_request(vllm_preproc, None)

-        token_count = 0
-        output_proc_total_ms = 0.0
-        post_proc_total_ms = 0.0
-
        try:
            if self.is_kv_router:
                dynamo_stream = await self.router.generate(
@@ -362,17 +314,10 @@ class VllmProcessor:
                    stop_reason=stop_reason,
                )

-                if self.debug_perf:
-                    t_op0 = time.monotonic()
-
                vllm_out: OutputProcessorOutput = self.output_processor.process_outputs(
                    [vllm_response]
                )

-                if self.debug_perf:
-                    t_op1 = time.monotonic()
-                    output_proc_total_ms += (t_op1 - t_op0) * 1000.0
-
                if vllm_out.reqs_to_abort:
                    pass

@@ -384,11 +329,6 @@ class VllmProcessor:
                    if choice:
                        choices.append(choice)

-                if self.debug_perf:
-                    t_op2 = time.monotonic()
-                    post_proc_total_ms += (t_op2 - t_op1) * 1000.0
-                    token_count += len(engine_response["token_ids"])
-
                if choices:
                    dynamo_out = {
                        "id": request_id,
@@ -406,18 +346,6 @@ class VllmProcessor:
                self.output_processor.abort_requests(
                    [vllm_preproc.request_id], internal=True
                )
-            if self.debug_perf and token_count > 0:
-                logger.info(
-                    "[perf] stream done: request=%s tokens=%d "
-                    "output_processor_total=%.2fms (%.3fms/tok) "
-                    "post_processor_total=%.2fms (%.3fms/tok)",
-                    request_id,
-                    token_count,
-                    output_proc_total_ms,
-                    output_proc_total_ms / token_count,
-                    post_proc_total_ms,
-                    post_proc_total_ms / token_count,
-                )


 class EngineFactory:
@@ -427,7 +355,6 @@ class EngineFactory:
        router_config: RouterConfig,
        config: FrontendConfig,
        flags: Namespace,
-        debug_perf: bool = False,
    ):
        if config.preprocess_workers != 0:
            raise RuntimeError(
@@ -438,7 +365,6 @@ class EngineFactory:
        self.router_config = router_config
        self.config = config
        self.flags = flags
-        self.debug_perf = debug_perf
        self.stream_interval = 20
        raw_stream_interval = os.getenv("DYN_VLLM_STREAM_INTERVAL")
        if raw_stream_interval:
@@ -536,7 +462,6 @@ class EngineFactory:
            output_processor,
            tool_parser_class,
            reasoning_parser_class,
-            debug_perf=self.debug_perf,
        )

        return PythonAsyncEngine(gen.generator, loop)
--- a/tests/router/test_router_e2e_with_trtllm.py
+++ b/tests/router/test_router_e2e_with_trtllm.py
@@ -317,8 +317,8 @@ class TRTLLMProcess:
        time.sleep(2)


-@pytest.mark.pre_merge
 @pytest.mark.gpu_1
+@pytest.mark.nightly
 @pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
 @pytest.mark.timeout(300)
 def test_trtllm_kv_router_basic(
@@ -420,8 +420,8 @@ def test_router_decisions_trtllm_attention_dp(
        )


-@pytest.mark.pre_merge
 @pytest.mark.gpu_1
+@pytest.mark.nightly
 @pytest.mark.parametrize("request_plane", ["tcp"], indirect=True)
 @pytest.mark.timeout(150)  # ~3x average (~45s/test), rounded up
 def test_router_decisions_trtllm_multiple_workers(
@@ -461,8 +461,8 @@ def test_router_decisions_trtllm_multiple_workers(
        )


-@pytest.mark.pre_merge
 @pytest.mark.gpu_1
+@pytest.mark.nightly
 @pytest.mark.timeout(150)  # ~3x average (~45s/test), rounded up
 @pytest.mark.parametrize(
    "store_backend,durable_kv_events,request_plane",