Unverified Commit c84c0934 authored by Indrajit Bhosale's avatar Indrajit Bhosale Committed by GitHub
Browse files

test: Add ci coverage for trtllm multimodal raw embeddings (#7540)


Signed-off-by: Indrajit Bhosale <iamindrajitb@gmail.com>
parent 3bfee568
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Engine arguments for the aggregated prefill+decode (PD) worker.
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 4096
max_batch_size: 8
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true

# LLaVA's text_config (MistralConfig) does not inherit torch_dtype from the
# top-level LlavaNextConfig. Without this, TRT-LLM creates Mistral-7B layers
# in float32 (~28 GB), exceeding the 22 GB available on the L4 GPUs used in
# our CI environment. Propagate the checkpoint dtype so that layers are
# created in float16 (~14 GB).
model_kwargs:
  text_config:
    torch_dtype: float16

kv_cache_config:
  free_gpu_memory_fraction: 0.10
  enable_block_reuse: false

# Nested on purpose: this `backend` belongs to the cache transceiver and must
# not collide with the top-level engine `backend` key above.
cache_transceiver_config:
  backend: DEFAULT
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Engine arguments for the encode worker.
tensor_parallel_size: 1
moe_expert_parallel_size: 1
enable_attention_dp: false
max_num_tokens: 4096
max_batch_size: 8
trust_remote_code: true
backend: pytorch
enable_chunked_prefill: true

# Overlap scheduler not currently supported in prefill only workers.
disable_overlap_scheduler: true

# Note: kv_cache_config is not needed for encode workers since MultimodalEncoder
# only runs vision encoder + projector and doesn't need KV cache memory.

# Nested on purpose: this `backend` belongs to the cache transceiver and must
# not collide with the top-level engine `backend` key above.
cache_transceiver_config:
  backend: DEFAULT
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# LLaVA Raw-Embeddings E/PD Test
#
#   Phase 1 — Run the HuggingFace vision encoder standalone and save the
#             resulting embeddings tensor to $EMBEDDINGS_FILE (.pt).
#
#   Phase 2 — Start the Encode and Aggregated PD workers for LLaVA, then
#             serve chat/completions requests whose image_url points at the
#             embeddings file (file:///tmp/llava_embeddings.pt).
#
# Known limitation: the default revision of llava-hf/llava-v1.6-mistral-7b-hf
# may crash with certain TRT-LLM versions. MODEL_REVISION pins a safe commit
# (e.g. 52320fb52229).
set -e

# EXIT handler: drop the files this script creates in /tmp, then signal the
# whole process group so background workers do not outlive the script.
_llava_raw_embeddings_cleanup() {
  echo "Cleaning up..."
  rm -f "${EMBEDDINGS_FILE:-/tmp/llava_embeddings.pt}" /tmp/_resolved_model_path.txt
  kill 0
}
trap _llava_raw_embeddings_cleanup EXIT

SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"

# ── Configuration ─────────────────────────────────────────────────────────────
# DYNAMO_HOME has to be resolved first: the shared launch helpers and several
# defaults below are located relative to it.
: "${DYNAMO_HOME:=/workspace}"
export DYNAMO_HOME
source "${DYNAMO_HOME}/examples/common/launch_utils.sh"

# Every setting keeps a caller-provided value and only falls back to the
# default when the variable is unset or empty.
: "${MODEL_PATH:=llava-hf/llava-v1.6-mistral-7b-hf}"
: "${SERVED_MODEL_NAME:=llava-hf/llava-v1.6-mistral-7b-hf}"
: "${MODEL_REVISION:=52320fb52229}"
: "${ENCODE_ENGINE_ARGS:=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/encode.yaml}"
: "${PD_ENGINE_ARGS:=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/agg.yaml}"
: "${ENCODE_CUDA_VISIBLE_DEVICES:=0}"
: "${PD_CUDA_VISIBLE_DEVICES:=1}"
: "${ENCODE_ENDPOINT:=dyn://dynamo.tensorrt_llm_encode.generate}"
: "${MODALITY:=multimodal}"
: "${ALLOWED_LOCAL_MEDIA_PATH:=/tmp}"
: "${MAX_FILE_SIZE_MB:=50}"
: "${CUSTOM_TEMPLATE:=$DYNAMO_HOME/examples/backends/trtllm/templates/llava_multimodal.jinja}"
export MODEL_PATH SERVED_MODEL_NAME MODEL_REVISION \
  ENCODE_ENGINE_ARGS PD_ENGINE_ARGS \
  ENCODE_CUDA_VISIBLE_DEVICES PD_CUDA_VISIBLE_DEVICES \
  ENCODE_ENDPOINT MODALITY ALLOWED_LOCAL_MEDIA_PATH MAX_FILE_SIZE_MB \
  CUSTOM_TEMPLATE

# Embeddings configuration (plain shell variables; not exported here)
: "${EMBEDDINGS_FILE:=/tmp/llava_embeddings.pt}"
: "${TEST_IMAGE_URL:=https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png}"

# Extra arguments forwarded to the PD worker (e.g. --multimodal-embedding-cache-capacity-gb 10)
EXTRA_PD_ARGS=("$@")

# Prevent port collisions: the test framework exports DYN_SYSTEM_PORT which all
# child processes would inherit. Unset it so only workers that need it set their own.
unset DYN_SYSTEM_PORT

HTTP_PORT="${DYN_HTTP_PORT:-8000}"
banner_args=(
  --multimodal
  "Launching LLaVA Raw Embeddings E/PD"
  "$MODEL_PATH"
  "$HTTP_PORT"
  "Embeddings: ${EMBEDDINGS_FILE}"
)
print_launch_banner "${banner_args[@]}"
# ══════════════════════════════════════════════════════════════════════════════
# Phase 1: Generate embeddings using standalone HF vision encoder
# ══════════════════════════════════════════════════════════════════════════════
echo ""
echo "Phase 1: Generating vision embeddings from test image …"
echo " Image : ${TEST_IMAGE_URL}"
echo " Output: ${EMBEDDINGS_FILE}"
echo " Phase 1 GPU: CUDA_VISIBLE_DEVICES=0"

# Hand-off file: the Python step writes the resolved local snapshot path here
# and the shell reads it back afterwards. This is the same path the EXIT trap
# at the top of the script removes.
_MODEL_PATH_FILE="/tmp/_resolved_model_path.txt"

# The test framework sets HF_HUB_OFFLINE=1 after predownloading models at the
# default (main) revision. Phase 1 needs a *specific* pinned revision, so we
# temporarily disable offline mode for the download. Phase 2 uses the resolved
# local path and does not need HF hub access.
_SAVED_HF_OFFLINE="${HF_HUB_OFFLINE:-}"
unset HF_HUB_OFFLINE

# TEST_IMAGE_URL, EMBEDDINGS_FILE and _MODEL_PATH_FILE are plain shell
# variables, so pass them explicitly in the child's environment; otherwise the
# Python step would only see them when the caller happened to export them.
# The heredoc delimiter is quoted, so the shell expands nothing inside it.
CUDA_VISIBLE_DEVICES=0 \
TEST_IMAGE_URL="$TEST_IMAGE_URL" \
EMBEDDINGS_FILE="$EMBEDDINGS_FILE" \
_MODEL_PATH_FILE="$_MODEL_PATH_FILE" \
python3 - <<'PYEOF'
import torch, io, os, urllib.request
from PIL import Image
from huggingface_hub import snapshot_download
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor

model_id = os.environ["MODEL_PATH"]
revision = os.environ.get("MODEL_REVISION", "") or None
image_url = os.environ.get(
    "TEST_IMAGE_URL",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
)
output = os.environ.get("EMBEDDINGS_FILE", "/tmp/llava_embeddings.pt")

# ── Download / resolve model ──
print(f"Resolving model {model_id} (revision={revision}) …")
model_path = snapshot_download(model_id, revision=revision)
print(f"Model path: {model_path}")

# ── Load model (vision tower + projector) ──
print("Loading LlavaNext model …")
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_path, torch_dtype=torch.float16, device_map="cuda:0",
)
processor = LlavaNextProcessor.from_pretrained(model_path)

# ── Download and process image ──
print(f"Downloading test image from {image_url} …")
# Bounded wait so a stalled download fails fast instead of hanging the run.
with urllib.request.urlopen(image_url, timeout=60) as resp:
    image = Image.open(io.BytesIO(resp.read())).convert("RGB")
print(f"Image size: {image.size}")

inputs = processor(text="<image>", images=image, return_tensors="pt")
pixel_values = inputs["pixel_values"].to(device="cuda:0", dtype=torch.float16)

# ── Run vision encoder + projector ──
print("Running vision tower …")
with torch.no_grad():
    # LlavaNext may produce 5-D pixel_values: (batch, num_patches, C, H, W)
    if pixel_values.ndim == 5:
        b, n, c, h, w = pixel_values.shape
        pixel_values_flat = pixel_values.reshape(b * n, c, h, w)
    else:
        pixel_values_flat = pixel_values

    vision_out = model.vision_tower(pixel_values_flat, output_hidden_states=True)
    features = vision_out.hidden_states[model.config.vision_feature_layer]
    strategy = getattr(model.config, "vision_feature_select_strategy", "default")
    if strategy == "default":
        # Drop the first token of every sequence before projecting.
        features = features[:, 1:]
    embeddings = model.multi_modal_projector(features)

# Collapse (num_patches, seq_len, hidden) → (total_tokens, hidden)
if embeddings.ndim == 3:
    embeddings = embeddings.reshape(-1, embeddings.shape[-1])
print(f"Embeddings: shape={embeddings.shape}, dtype={embeddings.dtype}")

# ── Save to disk ──
torch.save(embeddings.cpu(), output)
print(f"Saved embeddings → {output}")

# ── Write resolved model path so Phase 2 uses the exact same revision ──
model_path_file = os.environ.get("_MODEL_PATH_FILE", "/tmp/_resolved_model_path.txt")
with open(model_path_file, "w") as f:
    f.write(model_path)
print(f"Resolved model path written to {model_path_file}")

# ── Free GPU memory ──
del model, processor, vision_out, features, embeddings, pixel_values
torch.cuda.empty_cache()
print("GPU memory released. Phase 1 complete ✓")
PYEOF

# Restore offline mode (if it was set by the test framework)
if [[ -n "$_SAVED_HF_OFFLINE" ]]; then
  export HF_HUB_OFFLINE="$_SAVED_HF_OFFLINE"
fi

if [[ ! -f "$EMBEDDINGS_FILE" ]]; then
  echo "ERROR: Embeddings file not produced at ${EMBEDDINGS_FILE}" >&2
  exit 1
fi
echo "Embeddings generated at ${EMBEDDINGS_FILE}"

# Override MODEL_PATH with the resolved local cache path so Phase 2 workers
# load the exact same revision (HF hub caches are revision-specific).
if [[ -f "$_MODEL_PATH_FILE" ]]; then
  RESOLVED_PATH=$(<"$_MODEL_PATH_FILE")
  echo "Using resolved model path for Phase 2: ${RESOLVED_PATH}"
  export MODEL_PATH="$RESOLVED_PATH"
  rm -f -- "$_MODEL_PATH_FILE"
else
  # Keep going with the unresolved MODEL_PATH, but say so: Phase 2 may then
  # load a different revision than the one the embeddings were built from.
  echo "WARNING: ${_MODEL_PATH_FILE} not found; Phase 2 will use MODEL_PATH=${MODEL_PATH}" >&2
fi
# ══════════════════════════════════════════════════════════════════════════════
# Phase 2: Start Encode + Aggregated PD workers
# ══════════════════════════════════════════════════════════════════════════════
printf '%s\n' \
  "" \
  "Phase 2: Starting E/PD workers …" \
  " Encode worker → CUDA_VISIBLE_DEVICES=${ENCODE_CUDA_VISIBLE_DEVICES}" \
  " PD worker → CUDA_VISIBLE_DEVICES=${PD_CUDA_VISIBLE_DEVICES}"

# Frontend (runs in the background next to the two workers)
python3 -m dynamo.frontend &

# Command-line arguments for the encode worker.
encode_worker_args=(
  --model-path "$MODEL_PATH"
  --served-model-name "$SERVED_MODEL_NAME"
  --extra-engine-args "$ENCODE_ENGINE_ARGS"
  --modality "$MODALITY"
  --allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH"
  --max-file-size-mb "$MAX_FILE_SIZE_MB"
  --disaggregation-mode encode
)

# Command-line arguments for the aggregated prefill+decode worker; any extra
# arguments given to this script are appended at the end.
pd_worker_args=(
  --model-path "$MODEL_PATH"
  --served-model-name "$SERVED_MODEL_NAME"
  --extra-engine-args "$PD_ENGINE_ARGS"
  --modality "$MODALITY"
  --encode-endpoint "$ENCODE_ENDPOINT"
  --allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH"
  --max-file-size-mb "$MAX_FILE_SIZE_MB"
  --disaggregation-mode prefill_and_decode
  --custom-jinja-template "$CUSTOM_TEMPLATE"
  "${EXTRA_PD_ARGS[@]}"
)

# Encode worker: gets its own DYN_SYSTEM_PORT (default 8081) and GPU selection.
echo "[Phase 2] Starting Encode worker on GPU ${ENCODE_CUDA_VISIBLE_DEVICES} ..."
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}" \
CUDA_VISIBLE_DEVICES="$ENCODE_CUDA_VISIBLE_DEVICES" \
  python3 -m dynamo.trtllm "${encode_worker_args[@]}" &
ENCODE_PID=$!
echo "[Phase 2] Encode worker PID=${ENCODE_PID}"

# Aggregated PD worker: separate DYN_SYSTEM_PORT (default 8082) and GPU selection.
echo "[Phase 2] Starting PD worker on GPU ${PD_CUDA_VISIBLE_DEVICES} ..."
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT2:-8082}" \
CUDA_VISIBLE_DEVICES="$PD_CUDA_VISIBLE_DEVICES" \
  python3 -m dynamo.trtllm "${pd_worker_args[@]}" &
PD_PID=$!
echo "[Phase 2] PD worker PID=${PD_PID}"

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
...@@ -10,6 +10,7 @@ from typing import Any ...@@ -10,6 +10,7 @@ from typing import Any
import pytest import pytest
from tests.serve.common import ( from tests.serve.common import (
SERVE_TEST_DIR,
WORKSPACE_DIR, WORKSPACE_DIR,
params_with_model_mark, params_with_model_mark,
run_serve_deployment, run_serve_deployment,
...@@ -296,6 +297,49 @@ trtllm_configs = { ...@@ -296,6 +297,49 @@ trtllm_configs = {
"ENCODE_CUDA_VISIBLE_DEVICES": "0", "ENCODE_CUDA_VISIBLE_DEVICES": "0",
}, },
), ),
# LLaVA raw-embeddings E/PD test
# Validates the raw-embeddings code path where pre-computed vision embeddings
# (.pt tensor file) are sent via file:// URL instead of a raw image URL.
#
# Flow:
# 1. Launch script generates embeddings using standalone HF vision encoder
# 2. Encode + Aggregated PD workers start for LLaVA
# 3. Test sends chat/completions request with file:///tmp/llava_embeddings.pt
#
# Uses gpu_2: encode worker on GPU 0, PD worker on GPU 1.
# The 7B LLaVA model requires two GPUs because both encode and PD workers
# load the full model (~14GB each in float16), exceeding a single L4's 22GB.
# Runs in the multi-GPU pre-merge CI (marker: pre_merge and trtllm and gpu_2).
"raw_embeddings_epd": TRTLLMConfig(
name="raw_embeddings_epd",
directory=SERVE_TEST_DIR,
script_name="agg_raw_embeddings_llava.sh",
marks=[
pytest.mark.gpu_2,
pytest.mark.trtllm,
pytest.mark.multimodal,
pytest.mark.pre_merge,
pytest.mark.timeout(
900
), # Embeddings generation (~60s) + model load (~120s) + inference
],
model="llava-hf/llava-v1.6-mistral-7b-hf",
frontend_port=DefaultPort.FRONTEND.value,
timeout=600,
# Embeddings generation + worker startup takes longer than normal
delayed_start=180,
request_payloads=[
multimodal_payload_default(
image_url="file:///tmp/llava_embeddings.pt",
text="Describe what this image shows.",
expected_response=["bench", "person", "image", "picture"],
)
],
env={
"ENCODE_CUDA_VISIBLE_DEVICES": "0",
"PD_CUDA_VISIBLE_DEVICES": "1",
},
),
# TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model. # TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
# Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos). # Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
# Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU), # Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment