test: Add ci coverage for trtllm multimodal raw embeddings (#7540)

Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>

test: Add ci coverage for trtllm multimodal raw embeddings (#7540)
Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
c84c0934 · Indrajit Bhosale · GitHub · 3bfee568 · c84c0934 · c84c0934
Unverified Commit c84c0934 authored Apr 01, 2026 by Indrajit Bhosale Committed by GitHub Apr 01, 2026
4 changed files
--- a/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/agg.yaml
+++ b/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/agg.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+tensor_parallel_size: 1
+moe_expert_parallel_size: 1
+enable_attention_dp: false
+max_num_tokens: 4096
+max_batch_size: 8
+trust_remote_code: true
+backend: pytorch
+enable_chunked_prefill: true
+# LLaVA's text_config (MistralConfig) does not inherit torch_dtype from the
+# top-level LlavaNextConfig.  Without this, TRT-LLM creates Mistral-7B layers
+# in float32 (~28 GB), exceeding the 22 GB available on the L4 GPUs used in
+# our CI environment.  Propagate the checkpoint dtype so that layers are
+# created in float16 (~14 GB).
+model_kwargs:
+  text_config:
+    torch_dtype: float16
+kv_cache_config:
+  free_gpu_memory_fraction: 0.10
+  enable_block_reuse: false
+cache_transceiver_config:
+  backend: DEFAULT
--- a/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/encode.yaml
+++ b/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/encode.yaml
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+tensor_parallel_size: 1
+moe_expert_parallel_size: 1
+enable_attention_dp: false
+max_num_tokens: 4096
+max_batch_size: 8
+trust_remote_code: true
+backend: pytorch
+enable_chunked_prefill: true
+# Overlap scheduler not currently supported in prefill only workers.
+disable_overlap_scheduler: true
+# Note: kv_cache_config is not needed for encode workers since MultimodalEncoder
+# only runs vision encoder + projector and doesn't need KV cache memory.
+cache_transceiver_config:
+  backend: DEFAULT
--- a/tests/serve/launch/agg_raw_embeddings_llava.sh
+++ b/tests/serve/launch/agg_raw_embeddings_llava.sh
+#!/bin/bash
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+# LLaVA Raw-Embeddings E/PD Test
+#
+# Phase 1 — Run HuggingFace vision encoder standalone to produce
+#            pre-computed embeddings at $EMBEDDINGS_FILE (.pt tensor).
+#
+# Phase 2 — Start Encode + Aggregated PD workers for LLaVA, then
+#            accept chat/completions requests whose image_url points
+#            to the embeddings file (file:///tmp/llava_embeddings.pt).
+#
+# Known limitation: The default revision of llava-hf/llava-v1.6-mistral-7b-hf
+# may crash with certain TRT-LLM versions.  Set MODEL_REVISION to pin a
+# safe commit (e.g. 52320fb52229).
+set -e
+trap 'echo Cleaning up...; rm -f "${EMBEDDINGS_FILE:-/tmp/llava_embeddings.pt}" /tmp/_resolved_model_path.txt; kill 0' EXIT
+SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
+# ── Configuration ─────────────────────────────────────────────────────────────
+export DYNAMO_HOME=${DYNAMO_HOME:-"/workspace"}
+source "${DYNAMO_HOME}/examples/common/launch_utils.sh"
+export MODEL_PATH=${MODEL_PATH:-"llava-hf/llava-v1.6-mistral-7b-hf"}
+export SERVED_MODEL_NAME=${SERVED_MODEL_NAME:-"llava-hf/llava-v1.6-mistral-7b-hf"}
+export MODEL_REVISION=${MODEL_REVISION:-"52320fb52229"}
+export ENCODE_ENGINE_ARGS=${ENCODE_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/encode.yaml"}
+export PD_ENGINE_ARGS=${PD_ENGINE_ARGS:-"$DYNAMO_HOME/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/agg.yaml"}
+export ENCODE_CUDA_VISIBLE_DEVICES=${ENCODE_CUDA_VISIBLE_DEVICES:-"0"}
+export PD_CUDA_VISIBLE_DEVICES=${PD_CUDA_VISIBLE_DEVICES:-"1"}
+export ENCODE_ENDPOINT=${ENCODE_ENDPOINT:-"dyn://dynamo.tensorrt_llm_encode.generate"}
+export MODALITY=${MODALITY:-"multimodal"}
+export ALLOWED_LOCAL_MEDIA_PATH=${ALLOWED_LOCAL_MEDIA_PATH:-"/tmp"}
+export MAX_FILE_SIZE_MB=${MAX_FILE_SIZE_MB:-50}
+export CUSTOM_TEMPLATE=${CUSTOM_TEMPLATE:-"$DYNAMO_HOME/examples/backends/trtllm/templates/llava_multimodal.jinja"}
+# Embeddings configuration
+EMBEDDINGS_FILE="${EMBEDDINGS_FILE:-/tmp/llava_embeddings.pt}"
+TEST_IMAGE_URL="${TEST_IMAGE_URL:-https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png}"
+# Extra arguments forwarded to the PD worker (e.g. --multimodal-embedding-cache-capacity-gb 10)
+EXTRA_PD_ARGS=("$@")
+# Prevent port collisions: the test framework exports DYN_SYSTEM_PORT which all
+# child processes would inherit. Unset it so only workers that need it set their own.
+unset DYN_SYSTEM_PORT
+HTTP_PORT="${DYN_HTTP_PORT:-8000}"
+print_launch_banner --multimodal "Launching LLaVA Raw Embeddings E/PD" "$MODEL_PATH" "$HTTP_PORT" \
+    "Embeddings: ${EMBEDDINGS_FILE}"
+# ══════════════════════════════════════════════════════════════════════════════
+# Phase 1: Generate embeddings using standalone HF vision encoder
+# ══════════════════════════════════════════════════════════════════════════════
+echo ""
+echo "Phase 1: Generating vision embeddings from test image …"
+echo "         Image : ${TEST_IMAGE_URL}"
+echo "         Output: ${EMBEDDINGS_FILE}"
+echo "         Phase 1 GPU: CUDA_VISIBLE_DEVICES=0"
+# The test framework sets HF_HUB_OFFLINE=1 after predownloading models at the
+# default (main) revision.  Phase 1 needs a *specific* pinned revision, so we
+# temporarily disable offline mode for the download.  Phase 2 uses the resolved
+# local path and does not need HF hub access.
+_SAVED_HF_OFFLINE="${HF_HUB_OFFLINE:-}"
+unset HF_HUB_OFFLINE
+CUDA_VISIBLE_DEVICES=0 python3 - <<'PYEOF'
+import torch, io, os, urllib.request
+from PIL import Image
+from huggingface_hub import snapshot_download
+from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor
+model_id   = os.environ["MODEL_PATH"]
+revision   = os.environ.get("MODEL_REVISION", "") or None
+image_url  = os.environ.get("TEST_IMAGE_URL",
+    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
+output     = os.environ.get("EMBEDDINGS_FILE", "/tmp/llava_embeddings.pt")
+# ── Download / resolve model ──
+print(f"Resolving model {model_id} (revision={revision}) …")
+model_path = snapshot_download(model_id, revision=revision)
+print(f"Model path: {model_path}")
+# ── Load model (vision tower + projector) ──
+print("Loading LlavaNext model …")
+model = LlavaNextForConditionalGeneration.from_pretrained(
+    model_path, torch_dtype=torch.float16, device_map="cuda:0",
+)
+processor = LlavaNextProcessor.from_pretrained(model_path)
+# ── Download and process image ──
+print(f"Downloading test image from {image_url} …")
+with urllib.request.urlopen(image_url) as resp:
+    image = Image.open(io.BytesIO(resp.read())).convert("RGB")
+print(f"Image size: {image.size}")
+inputs = processor(text="<image>", images=image, return_tensors="pt")
+pixel_values = inputs["pixel_values"].to(device="cuda:0", dtype=torch.float16)
+# ── Run vision encoder + projector ──
+print("Running vision tower …")
+with torch.no_grad():
+    # LlavaNext may produce 5-D pixel_values: (batch, num_patches, C, H, W)
+    if pixel_values.ndim == 5:
+        b, n, c, h, w = pixel_values.shape
+        pixel_values_flat = pixel_values.reshape(b * n, c, h, w)
+    else:
+        pixel_values_flat = pixel_values
+    vision_out = model.vision_tower(pixel_values_flat, output_hidden_states=True)
+    features = vision_out.hidden_states[model.config.vision_feature_layer]
+    strategy = getattr(model.config, "vision_feature_select_strategy", "default")
+    if strategy == "default":
+        features = features[:, 1:]
+    embeddings = model.multi_modal_projector(features)
+    # Collapse (num_patches, seq_len, hidden) → (total_tokens, hidden)
+    if embeddings.ndim == 3:
+        embeddings = embeddings.reshape(-1, embeddings.shape[-1])
+print(f"Embeddings: shape={embeddings.shape}, dtype={embeddings.dtype}")
+# ── Save to disk ──
+torch.save(embeddings.cpu(), output)
+print(f"Saved embeddings → {output}")
+# ── Write resolved model path so Phase 2 uses the exact same revision ──
+model_path_file = os.environ.get("_MODEL_PATH_FILE", "/tmp/_resolved_model_path.txt")
+with open(model_path_file, "w") as f:
+    f.write(model_path)
+print(f"Resolved model path written to {model_path_file}")
+# ── Free GPU memory ──
+del model, processor, vision_out, features, embeddings, pixel_values
+torch.cuda.empty_cache()
+print("GPU memory released. Phase 1 complete ✓")
+PYEOF
+# Restore offline mode (if it was set by the test framework)
+if [ -n "$_SAVED_HF_OFFLINE" ]; then
+    export HF_HUB_OFFLINE="$_SAVED_HF_OFFLINE"
+fi
+if [ ! -f "$EMBEDDINGS_FILE" ]; then
+    echo "ERROR: Embeddings file not produced at ${EMBEDDINGS_FILE}"
+    exit 1
+fi
+echo "Embeddings generated at ${EMBEDDINGS_FILE}"
+# Override MODEL_PATH with the resolved local cache path so Phase 2 workers
+# load the exact same revision (HF hub caches are revision-specific).
+_MODEL_PATH_FILE="/tmp/_resolved_model_path.txt"
+if [ -f "$_MODEL_PATH_FILE" ]; then
+    RESOLVED_PATH=$(cat "$_MODEL_PATH_FILE")
+    echo "Using resolved model path for Phase 2: ${RESOLVED_PATH}"
+    export MODEL_PATH="$RESOLVED_PATH"
+    rm -f "$_MODEL_PATH_FILE"
+fi
+# ══════════════════════════════════════════════════════════════════════════════
+# Phase 2: Start Encode + Aggregated PD workers
+# ══════════════════════════════════════════════════════════════════════════════
+echo ""
+echo "Phase 2: Starting E/PD workers …"
+echo "  Encode worker → CUDA_VISIBLE_DEVICES=${ENCODE_CUDA_VISIBLE_DEVICES}"
+echo "  PD worker     → CUDA_VISIBLE_DEVICES=${PD_CUDA_VISIBLE_DEVICES}"
+# Frontend
+python3 -m dynamo.frontend &
+# Encode worker (vision encoder on GPU 0)
+echo "[Phase 2] Starting Encode worker on GPU ${ENCODE_CUDA_VISIBLE_DEVICES} ..."
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT1:-8081} \
+CUDA_VISIBLE_DEVICES=$ENCODE_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
+  --model-path "$MODEL_PATH" \
+  --served-model-name "$SERVED_MODEL_NAME" \
+  --extra-engine-args "$ENCODE_ENGINE_ARGS" \
+  --modality "$MODALITY" \
+  --allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH" \
+  --max-file-size-mb "$MAX_FILE_SIZE_MB" \
+  --disaggregation-mode encode &
+ENCODE_PID=$!
+echo "[Phase 2] Encode worker PID=${ENCODE_PID}"
+# Aggregated PD worker
+echo "[Phase 2] Starting PD worker on GPU ${PD_CUDA_VISIBLE_DEVICES} ..."
+DYN_SYSTEM_PORT=${DYN_SYSTEM_PORT2:-8082} \
+CUDA_VISIBLE_DEVICES=$PD_CUDA_VISIBLE_DEVICES python3 -m dynamo.trtllm \
+  --model-path "$MODEL_PATH" \
+  --served-model-name "$SERVED_MODEL_NAME" \
+  --extra-engine-args "$PD_ENGINE_ARGS" \
+  --modality "$MODALITY" \
+  --encode-endpoint "$ENCODE_ENDPOINT" \
+  --allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH" \
+  --max-file-size-mb "$MAX_FILE_SIZE_MB" \
+  --disaggregation-mode prefill_and_decode \
+  --custom-jinja-template "$CUSTOM_TEMPLATE" \
+  "${EXTRA_PD_ARGS[@]}" &
+PD_PID=$!
+echo "[Phase 2] PD worker PID=${PD_PID}"
+# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
+wait_any_exit
--- a/tests/serve/test_trtllm.py
+++ b/tests/serve/test_trtllm.py
@@ -10,6 +10,7 @@ from typing import Any
 import pytest
 from tests.serve.common import (
+    SERVE_TEST_DIR,
    WORKSPACE_DIR,
    params_with_model_mark,
    run_serve_deployment,
@@ -296,6 +297,49 @@ trtllm_configs = {
            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
        },
    ),
+    # LLaVA raw-embeddings E/PD test
+    # Validates the raw-embeddings code path where pre-computed vision embeddings
+    # (.pt tensor file) are sent via file:// URL instead of a raw image URL.
+    #
+    # Flow:
+    #   1. Launch script generates embeddings using standalone HF vision encoder
+    #   2. Encode + Aggregated PD workers start for LLaVA
+    #   3. Test sends chat/completions request with file:///tmp/llava_embeddings.pt
+    #
+    # Uses gpu_2: encode worker on GPU 0, PD worker on GPU 1.
+    # The 7B LLaVA model requires two GPUs because both encode and PD workers
+    # load the full model (~14GB each in bfloat16), exceeding a single L4's 22GB.
+    # Runs in the multi-GPU pre-merge CI (marker: pre_merge and trtllm and gpu_2).
+    "raw_embeddings_epd": TRTLLMConfig(
+        name="raw_embeddings_epd",
+        directory=SERVE_TEST_DIR,
+        script_name="agg_raw_embeddings_llava.sh",
+        marks=[
+            pytest.mark.gpu_2,
+            pytest.mark.trtllm,
+            pytest.mark.multimodal,
+            pytest.mark.pre_merge,
+            pytest.mark.timeout(
+                900
+            ),  # Embeddings generation (~60s) + model load (~120s) + inference
+        ],
+        model="llava-hf/llava-v1.6-mistral-7b-hf",
+        frontend_port=DefaultPort.FRONTEND.value,
+        timeout=600,
+        # Embeddings generation + worker startup takes longer than normal
+        delayed_start=180,
+        request_payloads=[
+            multimodal_payload_default(
+                image_url="file:///tmp/llava_embeddings.pt",
+                text="Describe what this image shows.",
+                expected_response=["bench", "person", "image", "picture"],
+            )
+        ],
+        env={
+            "ENCODE_CUDA_VISIBLE_DEVICES": "0",
+            "PD_CUDA_VISIBLE_DEVICES": "1",
+        },
+    ),
    # TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
    # Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
    # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),