fix: Phase out llava and make EPD single GPU (#6674)

Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>

fix: Phase out llava and make EPD single GPU (#6674)
Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
4ffa1082 · Indrajit Bhosale · GitHub · 98a6d3b9 · 4ffa1082 · 4ffa1082
Unverified Commit 4ffa1082 authored Feb 27, 2026 by Indrajit Bhosale Committed by GitHub Feb 27, 2026
5 changed files
--- a/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/decode.yaml
+++ b/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/decode.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+tensor_parallel_size: 1
+moe_expert_parallel_size: 1
+enable_attention_dp: false
+max_num_tokens: 1024
+max_batch_size: 4
+max_seq_len: 4096
+trust_remote_code: true
+backend: pytorch
+enable_chunked_prefill: true
+disable_overlap_scheduler: false
+kv_cache_config:
+  free_gpu_memory_fraction: 0.10
+  enable_block_reuse: false
+cache_transceiver_config:
+  backend: DEFAULT
\ No newline at end of file
--- a/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/encode.yaml
+++ b/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/encode.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+tensor_parallel_size: 1
+moe_expert_parallel_size: 1
+enable_attention_dp: false
+max_num_tokens: 1024
+max_batch_size: 4
+trust_remote_code: true
+backend: pytorch
+enable_chunked_prefill: true
+# The overlap scheduler is not currently supported in prefill-only workers.
+disable_overlap_scheduler: true
+# Note: Encode workers use MultimodalEncoder (vision encoder + projector only),
+# which ignores most engine_args. No kv_cache_config or cache_transceiver_config
+# is needed since MultimodalEncoder doesn't allocate KV cache or transfer buffers.
--- a/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/prefill.yaml
+++ b/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/prefill.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+tensor_parallel_size: 1
+moe_expert_parallel_size: 1
+enable_attention_dp: false
+max_num_tokens: 1024
+max_batch_size: 4
+max_seq_len: 4096
+trust_remote_code: true
+backend: pytorch
+enable_chunked_prefill: true
+# The overlap scheduler is not currently supported in prefill-only workers.
+disable_overlap_scheduler: true
+kv_cache_config:
+  free_gpu_memory_fraction: 0.10
+  enable_block_reuse: false
+cache_transceiver_config:
+  backend: DEFAULT
\ No newline at end of file
--- a/examples/backends/trtllm/launch/epd_multimodal_image_and_embeddings.sh
+++ b/examples/backends/trtllm/launch/epd_multimodal_image_and_embeddings.sh
@@ -4,19 +4,19 @@
 # Environment variables with defaults
 export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
-export MODEL_PATH=${MODEL_PATH:-"llava-hf/llava-v1.6-mistral-7b-hf"}
+export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-VL-2B-Instruct"}
-export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"llava-v1.6-mistral-7b-hf"}
+export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-VL-2B-Instruct"}
-export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/prefill.yaml"}
+export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/prefill.yaml"}
-export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/decode.yaml"}
+export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/decode.yaml"}
-export ENCODE_ENGINE_ARGS=${ENCODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/encode.yaml"}
+export ENCODE_ENGINE_ARGS=${ENCODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/encode.yaml"}
 export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
-export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"}
+export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"0"}
-export ENCODE_CUDA_VISIBLE_DEVICES=${ENCODE_CUDA_VISIBLE_DEVICES:-"2"}
+export ENCODE_CUDA_VISIBLE_DEVICES=${ENCODE_CUDA_VISIBLE_DEVICES:-"0"}
 export ENCODE_ENDPOINT=${ENCODE_ENDPOINT:-"dyn://dynamo.tensorrt_llm_encode.generate"}
 export MODALITY=${MODALITY:-"multimodal"}
 export ALLOWED_LOCAL_MEDIA_PATH=${ALLOWED_LOCAL_MEDIA_PATH:-"/tmp"}
 export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}
-export CUSTOM_TEMPLATE=${CUSTOM_TEMPLATE:-"$DYNAMO_HOME/examples/backends/trtllm/templates/llava_multimodal.jinja"}
 # Setup cleanup trap
 cleanup() {
@@ -29,7 +29,8 @@ trap cleanup EXIT INT TERM
 # run frontend
-python3 -m dynamo.frontend --http-port 8000 &
+# dynamo.frontend accepts either the --http-port flag or the DYN_HTTP_PORT env var (default: 8000)
+python3 -m dynamo.frontend &
 DYNAMO_PID=$!
 # run encode worker
@@ -50,8 +51,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
  --extra-engine-args "$PREFILL_ENGINE_ARGS" \
  --modality "$MODALITY" \
  --disaggregation-mode prefill \
-  --encode-endpoint "$ENCODE_ENDPOINT" \
+  --encode-endpoint "$ENCODE_ENDPOINT" &
-  --custom-jinja-template "$CUSTOM_TEMPLATE" &
 PREFILL_PID=$!
 # run decode worker
@@ -62,8 +62,7 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
  --modality "$MODALITY" \
  --allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH" \
  --max-file-size-mb "$MAX_FILE_SIZE_MB" \
-  --disaggregation-mode decode \
+  --disaggregation-mode decode &
-  --custom-jinja-template "$CUSTOM_TEMPLATE" &
 DECODE_PID=$!
 wait $DYNAMO_PID
\ No newline at end of file
--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -199,22 +199,22 @@ trtllm_configs = {
        delayed_start=60,
        request_payloads=[multimodal_payload_default()],
    ),
-    # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for nightly CI
+    # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
-    # Uses llava model with 2 GPUs (encode shares GPU with prefill)
+    # Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
    #
    # TODO: Add Llama-4-Scout multimodal tests (agg_multimodal_llama, disagg_multimodal_llama)
    #       once CI supports gpu_8 runners and launch scripts are available
    "epd_multimodal": TRTLLMConfig(
        name="epd_multimodal",
        directory=trtllm_dir,
-        script_name="epd_multimodal_image.sh",
+        script_name="epd_multimodal_image_and_embeddings.sh",
        marks=[
-            pytest.mark.gpu_2,
+            pytest.mark.gpu_1,
            pytest.mark.trtllm,
            pytest.mark.multimodal,
-            pytest.mark.nightly,
+            pytest.mark.pre_merge,
        ],
-        model="llava-hf/llava-v1.6-mistral-7b-hf",
+        model="Qwen/Qwen3-VL-2B-Instruct",
        frontend_port=DefaultPort.FRONTEND.value,
        timeout=900,
        delayed_start=120,
@@ -225,9 +225,8 @@ trtllm_configs = {
            )
        ],
        env={
-            # Override GPU assignments to fit on 2 GPUs (encode shares with prefill)
            "PREFILL_CUDA_VISIBLE_DEVICES": "0",
-            "DECODE_CUDA_VISIBLE_DEVICES": "1",
+            "DECODE_CUDA_VISIBLE_DEVICES": "0",
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),