fix: KvCacheConfig Settings Lost When Publishing Events (#5198)

66dfc494 · Indrajit Bhosale · GitHub · 996077f4 · 66dfc494 · 66dfc494
Unverified commit 66dfc494, authored Jan 07, 2026 by Indrajit Bhosale; committed by GitHub on Jan 07, 2026.
4 changed files
--- a/components/src/dynamo/trtllm/main.py
+++ b/components/src/dynamo/trtllm/main.py
@@ -208,14 +208,14 @@ async def init(runtime: DistributedRuntime, config: Config):

    if config.publish_events_and_metrics:
        # 'event_buffer_max_size' is required to enable TRTLLM to publish kv cache events.
-        # Add it to kv_cache_config while preserving cache_transceiver_config from YAML
+        # Add it to kv_cache_config while preserving all settings from YAML
        current_kv_config = arg_map["kv_cache_config"]
        if isinstance(current_kv_config, KvCacheConfig):
-            # Convert KvCacheConfig object to dict (no cache_transceiver_config to preserve)
-            arg_map["kv_cache_config"] = {
-                "free_gpu_memory_fraction": config.free_gpu_memory_fraction,
-                "event_buffer_max_size": DEFAULT_KV_EVENT_BUFFER_MAX_SIZE,
-            }
+            # Convert KvCacheConfig object to dict, preserving all existing non-None settings
+            # (exclude_none=True), so YAML overrides are not lost when adding event_buffer_max_size
+            kv_config_dict = current_kv_config.model_dump(exclude_none=True)
+            kv_config_dict["event_buffer_max_size"] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
+            arg_map["kv_cache_config"] = kv_config_dict
        elif isinstance(current_kv_config, dict):
            # Add event_buffer_max_size while preserving cache_transceiver_config and other YAML settings
            current_kv_config[

--- a/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml
+++ b/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml
@@ -12,7 +12,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-tensor_parallel_size: 8
+tensor_parallel_size: 1
 moe_expert_parallel_size: 1
 enable_attention_dp: false
 max_num_tokens: 4096

--- a/examples/backends/trtllm/launch/agg_multimodal.sh
+++ b/examples/backends/trtllm/launch/agg_multimodal.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+# Environment variables with defaults
+export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
+export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
+export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
+export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml"}
+export MODALITY=${MODALITY:-"multimodal"}
+
+# Set up cleanup trap
+cleanup() {
+    echo "Cleaning up background processes..."
+    kill $DYNAMO_PID 2>/dev/null || true
+    wait $DYNAMO_PID 2>/dev/null || true
+    echo "Cleanup complete."
+}
+trap cleanup EXIT INT TERM
+
+
+# run frontend
+# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
+python3 -m dynamo.frontend --router-mode kv &
+DYNAMO_PID=$!
+
+# run worker
+python3 -m dynamo.trtllm \
+  --model-path "$MODEL_PATH" \
+  --served-model-name "$SERVED_MODEL_NAME" \
+  --extra-engine-args "$AGG_ENGINE_ARGS" \
+  --modality "$MODALITY" \
+  --publish-events-and-metrics
+
--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -183,6 +183,22 @@ trtllm_configs = {
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
+    "aggregated_multimodal_router": TRTLLMConfig(
+        name="aggregated_multimodal_router",
+        directory=trtllm_dir,
+        script_name="agg_multimodal.sh",
+        marks=[
+            pytest.mark.gpu_1,
+            pytest.mark.trtllm,
+            pytest.mark.multimodal,
+            pytest.mark.nightly,
+        ],
+        model="Qwen/Qwen2-VL-7B-Instruct",
+        frontend_port=DefaultPort.FRONTEND.value,
+        timeout=900,
+        delayed_start=60,
+        request_payloads=[multimodal_payload_default()],
+    ),
    "completions_only": TRTLLMConfig(
        name="completions_only",
        directory=trtllm_dir,