Unverified commit 66dfc494, authored by Indrajit Bhosale, committed by GitHub
Browse files

fix: KvCacheConfig Settings Lost When Publishing Events (#5198)

parent 996077f4
...@@ -208,14 +208,14 @@ async def init(runtime: DistributedRuntime, config: Config): ...@@ -208,14 +208,14 @@ async def init(runtime: DistributedRuntime, config: Config):
if config.publish_events_and_metrics: if config.publish_events_and_metrics:
# 'event_buffer_max_size' is required to enable TRTLLM to publish kv cache events. # 'event_buffer_max_size' is required to enable TRTLLM to publish kv cache events.
# Add it to kv_cache_config while preserving cache_transceiver_config from YAML # Add it to kv_cache_config while preserving all settings from YAML
current_kv_config = arg_map["kv_cache_config"] current_kv_config = arg_map["kv_cache_config"]
if isinstance(current_kv_config, KvCacheConfig): if isinstance(current_kv_config, KvCacheConfig):
# Convert KvCacheConfig object to dict (no cache_transceiver_config to preserve) # Convert KvCacheConfig object to dict, preserving ALL existing settings
arg_map["kv_cache_config"] = { # This ensures YAML overrides are not lost when adding event_buffer_max_size
"free_gpu_memory_fraction": config.free_gpu_memory_fraction, kv_config_dict = current_kv_config.model_dump(exclude_none=True)
"event_buffer_max_size": DEFAULT_KV_EVENT_BUFFER_MAX_SIZE, kv_config_dict["event_buffer_max_size"] = DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
} arg_map["kv_cache_config"] = kv_config_dict
elif isinstance(current_kv_config, dict): elif isinstance(current_kv_config, dict):
# Add event_buffer_max_size while preserving cache_transceiver_config and other YAML settings # Add event_buffer_max_size while preserving cache_transceiver_config and other YAML settings
current_kv_config[ current_kv_config[
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
tensor_parallel_size: 8 tensor_parallel_size: 1
moe_expert_parallel_size: 1 moe_expert_parallel_size: 1
enable_attention_dp: false enable_attention_dp: false
max_num_tokens: 4096 max_num_tokens: 4096
......
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Starts the dynamo frontend in KV router mode as a background process, then
# runs a single dynamo.trtllm worker in the foreground with event and metric
# publishing enabled. Every setting below can be overridden from the
# caller's environment.

# Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen2-VL-7B-Instruct"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen2-VL-7B-Instruct"}
export AGG_ENGINE_ARGS=${AGG_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml"}
export MODALITY=${MODALITY:-"multimodal"}

# Setup cleanup trap
cleanup() {
    echo "Cleaning up background processes..."
    # DYNAMO_PID is only assigned after the frontend has been launched. If the
    # trap fires earlier, skip kill/wait: `kill` without an operand is an
    # error, and `wait` without an operand waits on every child process.
    if [[ -n "${DYNAMO_PID:-}" ]]; then
        kill "$DYNAMO_PID" 2>/dev/null || true
        wait "$DYNAMO_PID" 2>/dev/null || true
    fi
    echo "Cleanup complete."
}
trap cleanup EXIT INT TERM

# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!

# run worker
python3 -m dynamo.trtllm \
    --model-path "$MODEL_PATH" \
    --served-model-name "$SERVED_MODEL_NAME" \
    --extra-engine-args "$AGG_ENGINE_ARGS" \
    --modality "$MODALITY" \
    --publish-events-and-metrics
...@@ -183,6 +183,22 @@ trtllm_configs = { ...@@ -183,6 +183,22 @@ trtllm_configs = {
delayed_start=60, delayed_start=60,
request_payloads=[multimodal_payload_default()], request_payloads=[multimodal_payload_default()],
), ),
"aggregated_multimodal_router": TRTLLMConfig(
name="aggregated_multimodal_router",
directory=trtllm_dir,
script_name="agg_multimodal.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.trtllm,
pytest.mark.multimodal,
pytest.mark.nightly,
],
model="Qwen/Qwen2-VL-7B-Instruct",
frontend_port=DefaultPort.FRONTEND.value,
timeout=900,
delayed_start=60,
request_payloads=[multimodal_payload_default()],
),
"completions_only": TRTLLMConfig( "completions_only": TRTLLMConfig(
name="completions_only", name="completions_only",
directory=trtllm_dir, directory=trtllm_dir,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment