Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
66dfc494
Unverified
Commit
66dfc494
authored
Jan 07, 2026
by
Indrajit Bhosale
Committed by
GitHub
Jan 07, 2026
Browse files
fix: KvCacheConfig Settings Lost When Publishing Events (#5198)
parent
996077f4
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
57 additions
and
7 deletions
+57
-7
components/src/dynamo/trtllm/main.py
components/src/dynamo/trtllm/main.py
+6
-6
examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml
...kends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml
+1
-1
examples/backends/trtllm/launch/agg_multimodal.sh
examples/backends/trtllm/launch/agg_multimodal.sh
+34
-0
tests/serve/test_trtllm.py
tests/serve/test_trtllm.py
+16
-0
No files found.
components/src/dynamo/trtllm/main.py
View file @
66dfc494
...
...
@@ -208,14 +208,14 @@ async def init(runtime: DistributedRuntime, config: Config):
if
config
.
publish_events_and_metrics
:
# 'event_buffer_max_size' is required to enable TRTLLM to publish kv cache events.
# Add it to kv_cache_config while preserving
cache_transceiver_config
from YAML
# Add it to kv_cache_config while preserving
all settings
from YAML
current_kv_config
=
arg_map
[
"kv_cache_config"
]
if
isinstance
(
current_kv_config
,
KvCacheConfig
):
# Convert KvCacheConfig object to dict
(no cache_transceiver_config to preserve)
arg_map
[
"kv_cache_config"
]
=
{
"free_gpu_memory_fraction"
:
config
.
free_gpu_memory_fraction
,
"event_buffer_max_size"
:
DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
,
}
# Convert KvCacheConfig object to dict
, preserving ALL existing settings
# This ensures YAML overrides are not lost when adding event_buffer_max_size
kv_config_dict
=
current_kv_config
.
model_dump
(
exclude_none
=
True
)
kv_config_dict
[
"event_buffer_max_size"
]
=
DEFAULT_KV_EVENT_BUFFER_MAX_SIZE
arg_map
[
"kv_cache_config"
]
=
kv_config_dict
elif
isinstance
(
current_kv_config
,
dict
):
# Add event_buffer_max_size while preserving cache_transceiver_config and other YAML settings
current_kv_config
[
...
...
examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml
View file @
66dfc494
...
...
@@ -12,7 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
tensor_parallel_size
:
8
tensor_parallel_size
:
1
moe_expert_parallel_size
:
1
enable_attention_dp
:
false
max_num_tokens
:
4096
...
...
examples/backends/trtllm/launch/agg_multimodal.sh
0 → 100755
View file @
66dfc494
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

# Launches an aggregated multimodal TRT-LLM deployment: a dynamo frontend in
# KV-router mode (background) and one dynamo.trtllm worker (foreground) that
# publishes events and metrics.

# Environment variables with defaults (each may be overridden by the caller).
# The whole expansion is quoted so values containing spaces survive `export`.
export DYNAMO_HOME="${DYNAMO_HOME:-/workspace}"
export MODEL_PATH="${MODEL_PATH:-Qwen/Qwen2-VL-7B-Instruct}"
export SERVED_MODEL_NAME="${SERVED_MODEL_NAME:-Qwen/Qwen2-VL-7B-Instruct}"
export AGG_ENGINE_ARGS="${AGG_ENGINE_ARGS:-$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/agg.yaml}"
export MODALITY="${MODALITY:-multimodal}"

# Setup cleanup trap
cleanup() {
    # Disarm the traps first: the handler is registered for EXIT as well as
    # INT/TERM, so without this it would run a second time when the shell exits.
    trap - EXIT INT TERM
    echo "Cleaning up background processes..."
    # Only touch the frontend if its PID was recorded; a bare `wait` with no
    # argument would block on every background job instead.
    if [[ -n "${DYNAMO_PID:-}" ]]; then
        kill "$DYNAMO_PID" 2>/dev/null || true
        wait "$DYNAMO_PID" 2>/dev/null || true
    fi
    echo "Cleanup complete."
}
trap cleanup EXIT INT TERM

# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend --router-mode kv &
DYNAMO_PID=$!

# run worker (foreground; the script ends when the worker exits)
python3 -m dynamo.trtllm \
    --model-path "$MODEL_PATH" \
    --served-model-name "$SERVED_MODEL_NAME" \
    --extra-engine-args "$AGG_ENGINE_ARGS" \
    --modality "$MODALITY" \
    --publish-events-and-metrics
tests/serve/test_trtllm.py
View file @
66dfc494
...
...
@@ -183,6 +183,22 @@ trtllm_configs = {
delayed_start
=
60
,
request_payloads
=
[
multimodal_payload_default
()],
),
"aggregated_multimodal_router"
:
TRTLLMConfig
(
name
=
"aggregated_multimodal_router"
,
directory
=
trtllm_dir
,
script_name
=
"agg_multimodal.sh"
,
marks
=
[
pytest
.
mark
.
gpu_1
,
pytest
.
mark
.
trtllm
,
pytest
.
mark
.
multimodal
,
pytest
.
mark
.
nightly
,
],
model
=
"Qwen/Qwen2-VL-7B-Instruct"
,
frontend_port
=
DefaultPort
.
FRONTEND
.
value
,
timeout
=
900
,
delayed_start
=
60
,
request_payloads
=
[
multimodal_payload_default
()],
),
"completions_only"
:
TRTLLMConfig
(
name
=
"completions_only"
,
directory
=
trtllm_dir
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment