Unverified commit 4ffa1082, authored by Indrajit Bhosale, committed by GitHub
Browse files

fix: Phase out llava and make EPD single GPU (#6674)


Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
parent 98a6d3b9
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Worker engine arguments: all parallel sizes are 1 and attention DP is off.
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 1024
max_batch_size: 4
max_seq_len: 4096
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true
disable_overlap_scheduler: false
# KV-cache options must be nested under kv_cache_config; left at column 0 they
# become unrelated top-level keys and kv_cache_config itself parses as null.
kv_cache_config:
  free_gpu_memory_fraction: 0.10
  enable_block_reuse: false
# Nested so this `backend` does not collide with (and override) the top-level
# `backend: pytorch` key above.
cache_transceiver_config:
  backend: DEFAULT
\ No newline at end of file
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Parallelism: every parallel size is 1 and attention DP is off.
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false

# Runtime selection.
backend: pytorch
trust_remote_code: true
enable_chunked_prefill: true
# The overlap scheduler is not currently supported in prefill-only workers,
# so it stays disabled here.
disable_overlap_scheduler: true

# Request limits.
max_batch_size: 4
max_num_tokens: 1024

# Note: encode workers use MultimodalEncoder (vision encoder + projector only),
# which ignores most engine_args. Neither kv_cache_config nor
# cache_transceiver_config is needed, because MultimodalEncoder does not
# allocate a KV cache or transfer buffers.
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Worker engine arguments: all parallel sizes are 1 and attention DP is off.
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 1024
max_batch_size: 4
max_seq_len: 4096
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true
# Overlap scheduler not currently supported in prefill only workers.
disable_overlap_scheduler: true
# KV-cache options must be nested under kv_cache_config; left at column 0 they
# become unrelated top-level keys and kv_cache_config itself parses as null.
kv_cache_config:
  free_gpu_memory_fraction: 0.10
  enable_block_reuse: false
# Nested so this `backend` does not collide with (and override) the top-level
# `backend: pytorch` key above.
cache_transceiver_config:
  backend: DEFAULT
\ No newline at end of file
...@@ -4,19 +4,19 @@ ...@@ -4,19 +4,19 @@
# Environment variables with defaults # Environment variables with defaults
export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"} export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
export MODEL_PATH=${MODEL_PATH:-"llava-hf/llava-v1.6-mistral-7b-hf"} export MODEL_PATH=${MODEL_PATH:-"Qwen/Qwen3-VL-2B-Instruct"}
export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"llava-v1.6-mistral-7b-hf"} export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"Qwen/Qwen3-VL-2B-Instruct"}
export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/prefill.yaml"} export PREFILL_ENGINE_ARGS=${PREFILL_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/prefill.yaml"}
export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/decode.yaml"} export DECODE_ENGINE_ARGS=${DECODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/decode.yaml"}
export ENCODE_ENGINE_ARGS=${ENCODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/encode.yaml"} export ENCODE_ENGINE_ARGS=${ENCODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/encode.yaml"}
export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"} export PREFILL_CUDA_VISIBLE_DEVICES=${PREFILL_CUDA_VISIBLE_DEVICES:-"0"}
export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"1"} export DECODE_CUDA_VISIBLE_DEVICES=${DECODE_CUDA_VISIBLE_DEVICES:-"0"}
export ENCODE_CUDA_VISIBLE_DEVICES=${ENCODE_CUDA_VISIBLE_DEVICES:-"2"} export ENCODE_CUDA_VISIBLE_DEVICES=${ENCODE_CUDA_VISIBLE_DEVICES:-"0"}
export ENCODE_ENDPOINT=${ENCODE_ENDPOINT:-"dyn://dynamo.tensorrt_llm_encode.generate"} export ENCODE_ENDPOINT=${ENCODE_ENDPOINT:-"dyn://dynamo.tensorrt_llm_encode.generate"}
export MODALITY=${MODALITY:-"multimodal"} export MODALITY=${MODALITY:-"multimodal"}
export ALLOWED_LOCAL_MEDIA_PATH=${ALLOWED_LOCAL_MEDIA_PATH:-"/tmp"} export ALLOWED_LOCAL_MEDIA_PATH=${ALLOWED_LOCAL_MEDIA_PATH:-"/tmp"}
export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50} export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}
export CUSTOM_TEMPLATE=${CUSTOM_TEMPLATE:-"$DYNAMO_HOME/examples/backends/trtllm/templates/llava_multimodal.jinja"}
# Setup cleanup trap # Setup cleanup trap
cleanup() { cleanup() {
...@@ -29,7 +29,8 @@ trap cleanup EXIT INT TERM ...@@ -29,7 +29,8 @@ trap cleanup EXIT INT TERM
# run frontend # run frontend
python3 -m dynamo.frontend --http-port 8000 & # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3 -m dynamo.frontend &
DYNAMO_PID=$! DYNAMO_PID=$!
# run encode worker # run encode worker
...@@ -50,8 +51,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ ...@@ -50,8 +51,7 @@ CUDA_VISIBLE_DEVICES=$PREFILL_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--extra-engine-args "$PREFILL_ENGINE_ARGS" \ --extra-engine-args "$PREFILL_ENGINE_ARGS" \
--modality "$MODALITY" \ --modality "$MODALITY" \
--disaggregation-mode prefill \ --disaggregation-mode prefill \
--encode-endpoint "$ENCODE_ENDPOINT" \ --encode-endpoint "$ENCODE_ENDPOINT" &
--custom-jinja-template "$CUSTOM_TEMPLATE" &
PREFILL_PID=$! PREFILL_PID=$!
# run decode worker # run decode worker
...@@ -62,8 +62,7 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \ ...@@ -62,8 +62,7 @@ CUDA_VISIBLE_DEVICES=$DECODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
--modality "$MODALITY" \ --modality "$MODALITY" \
--allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH" \ --allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH" \
--max-file-size-mb "$MAX_FILE_SIZE_MB" \ --max-file-size-mb "$MAX_FILE_SIZE_MB" \
--disaggregation-mode decode \ --disaggregation-mode decode &
--custom-jinja-template "$CUSTOM_TEMPLATE" &
DECODE_PID=$! DECODE_PID=$!
wait $DYNAMO_PID wait $DYNAMO_PID
\ No newline at end of file
...@@ -199,22 +199,22 @@ trtllm_configs = { ...@@ -199,22 +199,22 @@ trtllm_configs = {
delayed_start=60, delayed_start=60,
request_payloads=[multimodal_payload_default()], request_payloads=[multimodal_payload_default()],
), ),
# TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for nightly CI # TensorRT-LLM EPD (Encode-Prefill-Decode) multimodal test for pre-merge CI
# Uses llava model with 2 GPUs (encode shares GPU with prefill) # Uses Qwen3-VL-2B-Instruct model with 1 GPU (all workers share same GPU)
# #
# TODO: Add Llama-4-Scout multimodal tests (agg_multimodal_llama, disagg_multimodal_llama) # TODO: Add Llama-4-Scout multimodal tests (agg_multimodal_llama, disagg_multimodal_llama)
# once CI supports gpu_8 runners and launch scripts are available # once CI supports gpu_8 runners and launch scripts are available
"epd_multimodal": TRTLLMConfig( "epd_multimodal": TRTLLMConfig(
name="epd_multimodal", name="epd_multimodal",
directory=trtllm_dir, directory=trtllm_dir,
script_name="epd_multimodal_image.sh", script_name="epd_multimodal_image_and_embeddings.sh",
marks=[ marks=[
pytest.mark.gpu_2, pytest.mark.gpu_1,
pytest.mark.trtllm, pytest.mark.trtllm,
pytest.mark.multimodal, pytest.mark.multimodal,
pytest.mark.nightly, pytest.mark.pre_merge,
], ],
model="llava-hf/llava-v1.6-mistral-7b-hf", model="Qwen/Qwen3-VL-2B-Instruct",
frontend_port=DefaultPort.FRONTEND.value, frontend_port=DefaultPort.FRONTEND.value,
timeout=900, timeout=900,
delayed_start=120, delayed_start=120,
...@@ -225,9 +225,8 @@ trtllm_configs = { ...@@ -225,9 +225,8 @@ trtllm_configs = {
) )
], ],
env={ env={
# Override GPU assignments to fit on 2 GPUs (encode shares with prefill)
"PREFILL_CUDA_VISIBLE_DEVICES": "0", "PREFILL_CUDA_VISIBLE_DEVICES": "0",
"DECODE_CUDA_VISIBLE_DEVICES": "1", "DECODE_CUDA_VISIBLE_DEVICES": "0",
"ENCODE_CUDA_VISIBLE_DEVICES": "0", "ENCODE_CUDA_VISIBLE_DEVICES": "0",
}, },
), ),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment