chore: remove stale example assets (#7059)

7dfbe4fd · Alec · GitHub · 310f8ca9 · 310f8ca9 · 310f8ca9
Unverified Commit 7dfbe4fd authored Mar 27, 2026 by Alec Committed by GitHub Mar 28, 2026
20 changed files
--- a/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml
+++ b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-
-# moe_load_balancer settings for TRTLLM based on:
-# https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/ep_load_balancer/README.md#online-ep-load-balancer
-num_slots: 288
-layer_updates_per_iter: 2
--- a/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml
+++ b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_decode.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-backend: pytorch
-
-# WideEP related settings
-moe_config:
-  backend: WIDEEP
-  load_balancer: /mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml
-
-# TP/EP/PP/DP
-tensor_parallel_size: 16
-moe_expert_parallel_size: 16
-pipeline_parallel_size: 1
-enable_attention_dp: true
-
-max_batch_size: 256
-max_num_tokens: 256
-# 8448 = 8192 ISL + 256 OSL
-max_seq_len: 8448
-
-kv_cache_config:
-  # With dp attention disabled: high free_gpu_memory_fraction is fine.
-  # free_gpu_memory_fraction: 0.85
-  # With dp attention enabled: large ISL at high concurrency may need
-  # free_gpu_memory_fraction low to have enough available memory.
-  free_gpu_memory_fraction: 0.30
-  dtype: fp8
-
-
-# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-# NOTE: overlap_scheduler enabled by default since this commit and changed
-# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
-# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
-disable_overlap_scheduler: false
-cuda_graph_config:
-  enable_padding: true
-  # NOTE: For larger max batch size, you may want to
-  # add larger cuda graph batch sizes below to match.
-  batch_sizes:
-  - 1
-  - 2
-  - 4
-  - 8
-  - 16
-  - 32
-  - 64
-  - 128
-  - 256
-
-
-print_iter_log: true
-
-cache_transceiver_config:
-  backend: DEFAULT
--- a/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml
+++ b/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/wide_ep_prefill.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-backend: pytorch
-
-# WideEP related settings
-moe_config:
-  backend: WIDEEP
-  load_balancer: /mnt/examples/backends/trtllm/engine_configs/deepseek-r1/disagg/wide_ep/eplb.yaml
-
-# TP/EP/PP/DP
-tensor_parallel_size: 16
-moe_expert_parallel_size: 16
-pipeline_parallel_size: 1
-enable_attention_dp: true
-
-max_batch_size: 1
-max_num_tokens: 4096
-max_seq_len: 8192
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.3
-  dtype: fp8 # NOTE: This dtype must match in both prefill/decode configs
-
-# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-# NOTE: overlap_scheduler enabled by default since this commit and changed
-# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
-# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
-disable_overlap_scheduler: true
-print_iter_log: true
-
-cache_transceiver_config:
-  backend: DEFAULT
--- a/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml
+++ b/examples/backends/trtllm/engine_configs/gemma3/vswa_agg.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-tensor_parallel_size: 1
-backend: pytorch
-
-kv_cache_config:
-  max_attention_window:
-    - 512
-    - 512
-    - 512
-    - 512
-    - 512
-    - 32768
--- a/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml
+++ b/examples/backends/trtllm/engine_configs/gemma3/vswa_decode.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-tensor_parallel_size: 1
-backend: pytorch
-
-kv_cache_config:
-  max_attention_window:
-    - 512
-    - 512
-    - 512
-    - 512
-    - 512
-    - 32768
-
-cache_transceiver_config:
-  backend: DEFAULT
--- a/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml
+++ b/examples/backends/trtllm/engine_configs/gemma3/vswa_prefill.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-tensor_parallel_size: 1
-backend: pytorch
-disable_overlap_scheduler: true
-
-kv_cache_config:
-  max_attention_window:
-    - 512
-    - 512
-    - 512
-    - 512
-    - 512
-    - 32768
-
-cache_transceiver_config:
-  backend: DEFAULT
--- a/examples/backends/trtllm/engine_configs/gpt-oss-120b/agg.yaml
+++ b/examples/backends/trtllm/engine_configs/gpt-oss-120b/agg.yaml
--- a/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml
+++ b/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_agg.yml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-backend: pytorch
-tensor_parallel_size: 4
-moe_expert_parallel_size: 4
-max_batch_size: 192
-max_num_tokens: 3072
-disable_overlap_scheduler: false
-
-# Enable Speculative Decoding in the model engine
-speculative_config:
-  decoding_type: Eagle
-  max_draft_len: 3
-  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
-  eagle3_one_model: true
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.2
-  enable_block_reuse: false
-
-cuda_graph_config:
-  enable_padding: true
-  batch_sizes: [1,2,3,4,5,6,7,8,16,32,48,64,128,190,191,192]
-
-print_iter_log: true
-
--- a/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml
+++ b/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_decode.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-backend: pytorch
-tensor_parallel_size: 4
-moe_expert_parallel_size: 4
-max_batch_size: 256
-max_num_tokens: 1024
-# 8704 = 8192 ISL + 512 OSL
-max_seq_len: 8704
-disable_overlap_scheduler: true
-
-# Enable Speculative Decoding in the model engine
-speculative_config:
-  decoding_type: Eagle
-  max_draft_len: 3
-  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
-  eagle3_one_model: true
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.5
-  enable_block_reuse: false
-
-cuda_graph_config:
-  enable_padding: true
-  batch_sizes:
-  - 1
-  - 2
-  - 4
-  - 8
-  - 16
-  - 32
-  - 64
-  - 128
-  - 256
-
-print_iter_log: true
-
-cache_transceiver_config:
-  backend: DEFAULT
--- a/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml
+++ b/examples/backends/trtllm/engine_configs/llama4/eagle/eagle_prefill.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-backend: pytorch
-tensor_parallel_size: 4
-moe_expert_parallel_size: 4
-max_batch_size: 1
-max_num_tokens: 8192
-max_seq_len: 8192
-print_iter_log: true
-disable_overlap_scheduler: true
-
-# Enable Speculative Decoding in the model engine
-speculative_config:
-  decoding_type: Eagle
-  max_draft_len: 3
-  speculative_model_dir: nvidia/Llama-4-Maverick-17B-128E-Eagle3
-  eagle3_one_model: true
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.5
-  enable_block_reuse: false
-
-cache_transceiver_config:
-  backend: DEFAULT
--- a/examples/backends/trtllm/engine_configs/llama4/multimodal/agg.yaml
+++ b/examples/backends/trtllm/engine_configs/llama4/multimodal/agg.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-tensor_parallel_size: 8
-moe_expert_parallel_size: 1
-enable_attention_dp: false
-max_num_tokens: 4096
-max_batch_size: 8
-trust_remote_code: true
-backend: pytorch
-enable_chunked_prefill: true
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.3
-  enable_block_reuse: false
-
-cache_transceiver_config:
-  backend: DEFAULT
-# NOTE: pytorch_backend_config section flattened since: https://github.com/NVIDIA/TensorRT-LLM/pull/4603
-# NOTE: overlap_scheduler enabled by default since this commit and changed
-# config field from 'enable_overlap_scheduler' to 'disable_overlap_scheduler':
-# https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428
--- a/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml
+++ b/examples/backends/trtllm/engine_configs/llama4/multimodal/decode.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-tensor_parallel_size: 8
-moe_expert_parallel_size: 1
-enable_attention_dp: false
-max_num_tokens: 4096
-max_batch_size: 8
-trust_remote_code: true
-backend: pytorch
-enable_chunked_prefill: true
-disable_overlap_scheduler: false
-kv_cache_config:
-  free_gpu_memory_fraction: 0.20
-  enable_block_reuse: false
-
-cache_transceiver_config:
-  backend: DEFAULT
\ No newline at end of file
--- a/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml
+++ b/examples/backends/trtllm/engine_configs/llama4/multimodal/prefill.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-tensor_parallel_size: 8
-moe_expert_parallel_size: 1
-enable_attention_dp: false
-max_num_tokens: 4096
-max_batch_size: 8
-trust_remote_code: true
-backend: pytorch
-enable_chunked_prefill: true
-# Overlap scheduler not currently supported in prefill only workers.
-disable_overlap_scheduler: true
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.20
-  enable_block_reuse: false
-
-cache_transceiver_config:
-  backend: DEFAULT
\ No newline at end of file
--- a/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/agg.yaml
+++ b/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/agg.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-tensor_parallel_size: 1
-moe_expert_parallel_size: 1
-enable_attention_dp: false
-max_num_tokens: 8192
-max_batch_size: 16
-trust_remote_code: true
-backend: pytorch
-enable_chunked_prefill: true
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.60
-  enable_block_reuse: false
-
-cache_transceiver_config:
-  backend: DEFAULT
--- a/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/decode.yaml
+++ b/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/decode.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-tensor_parallel_size: 1
-moe_expert_parallel_size: 1
-enable_attention_dp: false
-max_num_tokens: 8192
-max_batch_size: 16
-trust_remote_code: true
-backend: pytorch
-enable_chunked_prefill: true
-disable_overlap_scheduler: false
-kv_cache_config:
-  free_gpu_memory_fraction: 0.30
-  enable_block_reuse: false
-
-cache_transceiver_config:
-  backend: DEFAULT
\ No newline at end of file
--- a/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/encode.yaml
+++ b/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/encode.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-tensor_parallel_size: 1
-moe_expert_parallel_size: 1
-enable_attention_dp: false
-max_num_tokens: 8192
-max_batch_size: 16
-trust_remote_code: true
-backend: pytorch
-enable_chunked_prefill: true
-# Overlap scheduler not currently supported in prefill only workers.
-disable_overlap_scheduler: true
-
-# Note: kv_cache_config is not needed for encode workers since MultimodalEncoder
-# only runs vision encoder + projector and doesn't need KV cache memory.
-
-cache_transceiver_config:
-  backend: DEFAULT
--- a/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/prefill.yaml
+++ b/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/prefill.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-tensor_parallel_size: 1
-moe_expert_parallel_size: 1
-enable_attention_dp: false
-max_num_tokens: 8192
-max_batch_size: 16
-trust_remote_code: true
-backend: pytorch
-enable_chunked_prefill: true
-# Overlap scheduler not currently supported in prefill only workers.
-disable_overlap_scheduler: true
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.30
-  enable_block_reuse: false
-
-cache_transceiver_config:
-  backend: DEFAULT
\ No newline at end of file
--- a/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml
+++ b/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/decode.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-tensor_parallel_size: 1
-moe_expert_parallel_size: 1
-enable_attention_dp: false
-max_num_tokens: 8192
-max_batch_size: 16
-trust_remote_code: true
-backend: pytorch
-enable_chunked_prefill: true
-disable_overlap_scheduler: false
-kv_cache_config:
-  free_gpu_memory_fraction: 0.30
-  enable_block_reuse: false
-
-cache_transceiver_config:
-  backend: DEFAULT
\ No newline at end of file
--- a/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/encode.yaml
+++ b/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/encode.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-tensor_parallel_size: 1
-moe_expert_parallel_size: 1
-enable_attention_dp: false
-max_num_tokens: 8192
-trust_remote_code: true
-backend: pytorch
-disable_overlap_scheduler: false
-
-cuda_graph_config:
-  max_batch_size: 16
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.85
-
-cache_transceiver_config:
-  backend: DEFAULT
--- a/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml
+++ b/examples/backends/trtllm/engine_configs/qwen2-vl-7b-instruct/prefill.yaml
-# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-tensor_parallel_size: 1
-moe_expert_parallel_size: 1
-enable_attention_dp: false
-max_num_tokens: 8192
-max_batch_size: 16
-trust_remote_code: true
-backend: pytorch
-enable_chunked_prefill: true
-# Overlap scheduler not currently supported in prefill only workers.
-disable_overlap_scheduler: true
-
-kv_cache_config:
-  free_gpu_memory_fraction: 0.30
-  enable_block_reuse: false
-
-cache_transceiver_config:
-  backend: DEFAULT
\ No newline at end of file