Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
c84c0934
Unverified
Commit
c84c0934
authored
Apr 01, 2026
by
Indrajit Bhosale
Committed by
GitHub
Apr 01, 2026
Browse files
test: Add ci coverage for trtllm multimodal raw embeddings (#7540)
Signed-off-by:
Indrajit Bhosale
<
iamindrajitb@gmail.com
>
parent
3bfee568
Changes
4
Show whitespace changes
Inline
Side-by-side
Showing
4 changed files
with
320 additions
and
0 deletions
+320
-0
examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/agg.yaml
...s/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/agg.yaml
+38
-0
examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/encode.yaml
...rtllm/engine_configs/llava-v1.6-mistral-7b-hf/encode.yaml
+30
-0
tests/serve/launch/agg_raw_embeddings_llava.sh
tests/serve/launch/agg_raw_embeddings_llava.sh
+208
-0
tests/serve/test_trtllm.py
tests/serve/test_trtllm.py
+44
-0
No files found.
examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/agg.yaml
0 → 100644
View file @
c84c0934
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
tensor_parallel_size
:
1
moe_expert_parallel_size
:
1
enable_attention_dp
:
false
max_num_tokens
:
4096
max_batch_size
:
8
trust_remote_code
:
true
backend
:
pytorch
enable_chunked_prefill
:
true
# LLaVA's text_config (MistralConfig) does not inherit torch_dtype from the
# top-level LlavaNextConfig. Without this, TRT-LLM creates Mistral-7B layers
# in float32 (~28 GB), exceeding the 22 GB available on the L4 GPUs used in
# our CI environment. Propagate the checkpoint dtype so that layers are
# created in float16 (~14 GB).
model_kwargs
:
text_config
:
torch_dtype
:
float16
kv_cache_config
:
free_gpu_memory_fraction
:
0.10
enable_block_reuse
:
false
cache_transceiver_config
:
backend
:
DEFAULT
examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/encode.yaml
0 → 100644
View file @
c84c0934
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
tensor_parallel_size
:
1
moe_expert_parallel_size
:
1
enable_attention_dp
:
false
max_num_tokens
:
4096
max_batch_size
:
8
trust_remote_code
:
true
backend
:
pytorch
enable_chunked_prefill
:
true
# Overlap scheduler not currently supported in prefill only workers.
disable_overlap_scheduler
:
true
# Note: kv_cache_config is not needed for encode workers since MultimodalEncoder
# only runs vision encoder + projector and doesn't need KV cache memory.
cache_transceiver_config
:
backend
:
DEFAULT
tests/serve/launch/agg_raw_embeddings_llava.sh
0 → 100755
View file @
c84c0934
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# LLaVA Raw-Embeddings E/PD Test
#
# Phase 1 — Run HuggingFace vision encoder standalone to produce
# pre-computed embeddings at $EMBEDDINGS_FILE (.pt tensor).
#
# Phase 2 — Start Encode + Aggregated PD workers for LLaVA, then
# accept chat/completions requests whose image_url points
# to the embeddings file (file:///tmp/llava_embeddings.pt).
#
# Known limitation: The default revision of llava-hf/llava-v1.6-mistral-7b-hf
# may crash with certain TRT-LLM versions, so MODEL_REVISION defaults to a
# pinned safe commit (52320fb52229). Override MODEL_REVISION to use another one.
# Stop at the first command that fails.
set -e

# Whatever the exit path, delete the Phase 1 artifacts and then signal the
# whole process group so the backgrounded frontend/workers do not outlive us.
trap 'echo Cleaning up...; rm -f "${EMBEDDINGS_FILE:-/tmp/llava_embeddings.pt}" /tmp/_resolved_model_path.txt; kill 0' EXIT

# Directory holding this script, with symlinks resolved.
SCRIPT_DIR=$(dirname "$(readlink -f "$0")")

# ── Configuration ─────────────────────────────────────────────────────────────
# Every setting below keeps a value supplied by the caller's environment and
# only falls back to the default when the variable is unset or empty.
: "${DYNAMO_HOME:=/workspace}"
export DYNAMO_HOME

# Shared launch helpers; print_launch_banner is called further down.
source "${DYNAMO_HOME}/examples/common/launch_utils.sh"

# Model identity and pinned revision.
: "${MODEL_PATH:=llava-hf/llava-v1.6-mistral-7b-hf}"
: "${SERVED_MODEL_NAME:=llava-hf/llava-v1.6-mistral-7b-hf}"
: "${MODEL_REVISION:=52320fb52229}"

# Engine configuration files for the two workers.
: "${ENCODE_ENGINE_ARGS:=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/encode.yaml}"
: "${PD_ENGINE_ARGS:=$DYNAMO_HOME/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/agg.yaml}"

# GPU assignment per worker.
: "${ENCODE_CUDA_VISIBLE_DEVICES:=0}"
: "${PD_CUDA_VISIBLE_DEVICES:=1}"

# Worker wiring and media-loading limits.
: "${ENCODE_ENDPOINT:=dyn://dynamo.tensorrt_llm_encode.generate}"
: "${MODALITY:=multimodal}"
: "${ALLOWED_LOCAL_MEDIA_PATH:=/tmp}"
: "${MAX_FILE_SIZE_MB:=50}"
: "${CUSTOM_TEMPLATE:=$DYNAMO_HOME/examples/backends/trtllm/templates/llava_multimodal.jinja}"

export MODEL_PATH SERVED_MODEL_NAME MODEL_REVISION
export ENCODE_ENGINE_ARGS PD_ENGINE_ARGS
export ENCODE_CUDA_VISIBLE_DEVICES PD_CUDA_VISIBLE_DEVICES
export ENCODE_ENDPOINT MODALITY ALLOWED_LOCAL_MEDIA_PATH MAX_FILE_SIZE_MB CUSTOM_TEMPLATE

# Embeddings configuration (plain shell variables; this script does not export them).
: "${EMBEDDINGS_FILE:=/tmp/llava_embeddings.pt}"
: "${TEST_IMAGE_URL:=https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png}"

# Command-line arguments are forwarded verbatim to the PD worker
# (e.g. --multimodal-embedding-cache-capacity-gb 10).
EXTRA_PD_ARGS=("$@")

# The test framework exports DYN_SYSTEM_PORT, which every child would inherit
# and collide on. Drop it here; each worker is given its own port at launch.
unset DYN_SYSTEM_PORT

HTTP_PORT=${DYN_HTTP_PORT:-8000}

banner_args=(
  --multimodal
  "Launching LLaVA Raw Embeddings E/PD"
  "$MODEL_PATH"
  "$HTTP_PORT"
  "Embeddings: ${EMBEDDINGS_FILE}"
)
print_launch_banner "${banner_args[@]}"
# ══════════════════════════════════════════════════════════════════════════════
# Phase 1: Generate embeddings using standalone HF vision encoder
# ══════════════════════════════════════════════════════════════════════════════
# Runs an inline Python program that:
#   1. resolves MODEL_PATH at MODEL_REVISION to a local snapshot directory,
#   2. runs the vision tower + multimodal projector on one test image,
#   3. saves the resulting tensor to $EMBEDDINGS_FILE,
#   4. records the snapshot directory in /tmp/_resolved_model_path.txt.
# With `set -e` active, a failure of the Python program aborts the script.
echo ""
echo "Phase 1: Generating vision embeddings from test image …"
echo " Image : ${TEST_IMAGE_URL}"
echo " Output: ${EMBEDDINGS_FILE}"
echo " Phase 1 GPU: CUDA_VISIBLE_DEVICES=0"

# The test framework sets HF_HUB_OFFLINE=1 after predownloading models at the
# default (main) revision. Phase 1 needs a *specific* pinned revision, so we
# temporarily disable offline mode for the download. Phase 2 uses the resolved
# local path and does not need HF hub access.
_SAVED_HF_OFFLINE="${HF_HUB_OFFLINE:-}"
unset HF_HUB_OFFLINE

# EMBEDDINGS_FILE and TEST_IMAGE_URL are handed to Python explicitly rather than
# relying on the Python-side fallbacks happening to match the shell defaults.
# _MODEL_PATH_FILE is pinned to the one location that the EXIT trap removes and
# that the post-Phase-1 step reads, so an inherited override cannot make
# Phase 2 silently skip the pinned snapshot.
CUDA_VISIBLE_DEVICES=0 \
EMBEDDINGS_FILE="$EMBEDDINGS_FILE" \
TEST_IMAGE_URL="$TEST_IMAGE_URL" \
_MODEL_PATH_FILE="/tmp/_resolved_model_path.txt" \
python3 - <<'PYEOF'
import torch, io, os, urllib.request
from PIL import Image
from huggingface_hub import snapshot_download
from transformers import LlavaNextForConditionalGeneration, LlavaNextProcessor

model_id = os.environ["MODEL_PATH"]
# An empty MODEL_REVISION means "no pin": pass None to snapshot_download.
revision = os.environ.get("MODEL_REVISION", "") or None
image_url = os.environ.get("TEST_IMAGE_URL",
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png")
output = os.environ.get("EMBEDDINGS_FILE", "/tmp/llava_embeddings.pt")

# ── Download / resolve model ──
print(f"Resolving model {model_id} (revision={revision}) …")
model_path = snapshot_download(model_id, revision=revision)
print(f"Model path: {model_path}")

# ── Load model (vision tower + projector) ──
print("Loading LlavaNext model …")
model = LlavaNextForConditionalGeneration.from_pretrained(
    model_path, torch_dtype=torch.float16, device_map="cuda:0",
)
processor = LlavaNextProcessor.from_pretrained(model_path)

# ── Download and process image ──
print(f"Downloading test image from {image_url} …")
# Bound the fetch so a stalled connection fails fast instead of hanging the
# launch script until the outer test timeout fires.
with urllib.request.urlopen(image_url, timeout=60) as resp:
    image = Image.open(io.BytesIO(resp.read())).convert("RGB")
print(f"Image size: {image.size}")

inputs = processor(text="<image>", images=image, return_tensors="pt")
pixel_values = inputs["pixel_values"].to(device="cuda:0", dtype=torch.float16)

# ── Run vision encoder + projector ──
print("Running vision tower …")
with torch.no_grad():
    # LlavaNext may produce 5-D pixel_values: (batch, num_patches, C, H, W)
    if pixel_values.ndim == 5:
        b, n, c, h, w = pixel_values.shape
        pixel_values_flat = pixel_values.reshape(b * n, c, h, w)
    else:
        pixel_values_flat = pixel_values

    vision_out = model.vision_tower(pixel_values_flat, output_hidden_states=True)
    features = vision_out.hidden_states[model.config.vision_feature_layer]

    # The "default" select strategy skips the first token of every sequence.
    strategy = getattr(model.config, "vision_feature_select_strategy", "default")
    if strategy == "default":
        features = features[:, 1:]

    embeddings = model.multi_modal_projector(features)

    # Collapse (num_patches, seq_len, hidden) → (total_tokens, hidden)
    if embeddings.ndim == 3:
        embeddings = embeddings.reshape(-1, embeddings.shape[-1])

print(f"Embeddings: shape={embeddings.shape}, dtype={embeddings.dtype}")

# ── Save to disk ──
torch.save(embeddings.cpu(), output)
print(f"Saved embeddings → {output}")

# ── Write resolved model path so Phase 2 uses the exact same revision ──
model_path_file = os.environ.get("_MODEL_PATH_FILE", "/tmp/_resolved_model_path.txt")
with open(model_path_file, "w") as f:
    f.write(model_path)
print(f"Resolved model path written to {model_path_file}")

# ── Free GPU memory ──
del model, processor, vision_out, features, embeddings, pixel_values
torch.cuda.empty_cache()
print("GPU memory released. Phase 1 complete ✓")
PYEOF

# Restore offline mode (if it was set by the test framework)
if [ -n "$_SAVED_HF_OFFLINE" ]; then
    export HF_HUB_OFFLINE="$_SAVED_HF_OFFLINE"
fi

# Defensive check: never start the workers without the embeddings file the
# test request will point at.
if [ ! -f "$EMBEDDINGS_FILE" ]; then
    echo "ERROR: Embeddings file not produced at ${EMBEDDINGS_FILE}" >&2
    exit 1
fi
echo "Embeddings generated at ${EMBEDDINGS_FILE}"
# Phase 1 records the local snapshot directory it resolved in a small text
# file. When that file is present, point MODEL_PATH at the recorded directory
# so the Phase 2 workers load exactly the revision Phase 1 used (HF hub caches
# are revision-specific), then delete the file. When it is absent, MODEL_PATH
# is left as configured.
_MODEL_PATH_FILE="/tmp/_resolved_model_path.txt"
if [[ -f "${_MODEL_PATH_FILE}" ]]; then
  RESOLVED_PATH=$(<"${_MODEL_PATH_FILE}")
  printf '%s\n' "Using resolved model path for Phase 2: ${RESOLVED_PATH}"
  MODEL_PATH="${RESOLVED_PATH}"
  export MODEL_PATH
  rm -f "${_MODEL_PATH_FILE}"
fi
# ══════════════════════════════════════════════════════════════════════════════
# Phase 2: Start Encode + Aggregated PD workers
# ══════════════════════════════════════════════════════════════════════════════
# Three background processes are started in order: the frontend, the encode
# worker and the aggregated prefill/decode (PD) worker. Each worker receives its
# own DYN_SYSTEM_PORT and CUDA_VISIBLE_DEVICES through a per-command env prefix.
echo ""
echo "Phase 2: Starting E/PD workers …"
echo " Encode worker → CUDA_VISIBLE_DEVICES=${ENCODE_CUDA_VISIBLE_DEVICES}"
echo " PD worker → CUDA_VISIBLE_DEVICES=${PD_CUDA_VISIBLE_DEVICES}"

# Flags shared by both worker invocations are repeated in each list on purpose
# so every array is the complete, ordered command line of its worker.
encode_worker_args=(
  --model-path "$MODEL_PATH"
  --served-model-name "$SERVED_MODEL_NAME"
  --extra-engine-args "$ENCODE_ENGINE_ARGS"
  --modality "$MODALITY"
  --allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH"
  --max-file-size-mb "$MAX_FILE_SIZE_MB"
  --disaggregation-mode encode
)

# Caller-supplied extra arguments go last on the PD worker's command line.
pd_worker_args=(
  --model-path "$MODEL_PATH"
  --served-model-name "$SERVED_MODEL_NAME"
  --extra-engine-args "$PD_ENGINE_ARGS"
  --modality "$MODALITY"
  --encode-endpoint "$ENCODE_ENDPOINT"
  --allowed-local-media-path "$ALLOWED_LOCAL_MEDIA_PATH"
  --max-file-size-mb "$MAX_FILE_SIZE_MB"
  --disaggregation-mode prefill_and_decode
  --custom-jinja-template "$CUSTOM_TEMPLATE"
  "${EXTRA_PD_ARGS[@]}"
)

# Frontend
python3 -m dynamo.frontend &

# Encode worker: system port from DYN_SYSTEM_PORT1 (8081 when unset or empty).
echo "[Phase 2] Starting Encode worker on GPU ${ENCODE_CUDA_VISIBLE_DEVICES} ..."
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT1:-8081}" \
CUDA_VISIBLE_DEVICES="${ENCODE_CUDA_VISIBLE_DEVICES}" \
  python3 -m dynamo.trtllm "${encode_worker_args[@]}" &
ENCODE_PID=$!
echo "[Phase 2] Encode worker PID=${ENCODE_PID}"

# Aggregated PD worker: system port from DYN_SYSTEM_PORT2 (8082 when unset or empty).
echo "[Phase 2] Starting PD worker on GPU ${PD_CUDA_VISIBLE_DEVICES} ..."
DYN_SYSTEM_PORT="${DYN_SYSTEM_PORT2:-8082}" \
CUDA_VISIBLE_DEVICES="${PD_CUDA_VISIBLE_DEVICES}" \
  python3 -m dynamo.trtllm "${pd_worker_args[@]}" &
PD_PID=$!
echo "[Phase 2] PD worker PID=${PD_PID}"

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest.
# (wait_any_exit is not defined in this script — presumably it comes from the
# sourced launch_utils.sh.)
wait_any_exit
tests/serve/test_trtllm.py
View file @
c84c0934
...
@@ -10,6 +10,7 @@ from typing import Any
...
@@ -10,6 +10,7 @@ from typing import Any
import
pytest
import
pytest
from
tests.serve.common
import
(
from
tests.serve.common
import
(
SERVE_TEST_DIR
,
WORKSPACE_DIR
,
WORKSPACE_DIR
,
params_with_model_mark
,
params_with_model_mark
,
run_serve_deployment
,
run_serve_deployment
,
...
@@ -296,6 +297,49 @@ trtllm_configs = {
...
@@ -296,6 +297,49 @@ trtllm_configs = {
"ENCODE_CUDA_VISIBLE_DEVICES"
:
"0"
,
"ENCODE_CUDA_VISIBLE_DEVICES"
:
"0"
,
},
},
),
),
# LLaVA raw-embeddings E/PD test
# Validates the raw-embeddings code path where pre-computed vision embeddings
# (.pt tensor file) are sent via file:// URL instead of a raw image URL.
#
# Flow:
# 1. Launch script generates embeddings using standalone HF vision encoder
# 2. Encode + Aggregated PD workers start for LLaVA
# 3. Test sends chat/completions request with file:///tmp/llava_embeddings.pt
#
# Uses gpu_2: encode worker on GPU 0, PD worker on GPU 1.
# The 7B LLaVA model requires two GPUs because both encode and PD workers
# load the full model (~14GB each in float16), exceeding a single L4's 22GB.
# Runs in the multi-GPU pre-merge CI (marker: pre_merge and trtllm and gpu_2).
"raw_embeddings_epd"
:
TRTLLMConfig
(
name
=
"raw_embeddings_epd"
,
directory
=
SERVE_TEST_DIR
,
script_name
=
"agg_raw_embeddings_llava.sh"
,
marks
=
[
pytest
.
mark
.
gpu_2
,
pytest
.
mark
.
trtllm
,
pytest
.
mark
.
multimodal
,
pytest
.
mark
.
pre_merge
,
pytest
.
mark
.
timeout
(
900
),
# Embeddings generation (~60s) + model load (~120s) + inference
],
model
=
"llava-hf/llava-v1.6-mistral-7b-hf"
,
frontend_port
=
DefaultPort
.
FRONTEND
.
value
,
timeout
=
600
,
# Embeddings generation + worker startup takes longer than normal
delayed_start
=
180
,
request_payloads
=
[
multimodal_payload_default
(
image_url
=
"file:///tmp/llava_embeddings.pt"
,
text
=
"Describe what this image shows."
,
expected_response
=
[
"bench"
,
"person"
,
"image"
,
"picture"
],
)
],
env
=
{
"ENCODE_CUDA_VISIBLE_DEVICES"
:
"0"
,
"PD_CUDA_VISIBLE_DEVICES"
:
"1"
,
},
),
# TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
# TensorRT-LLM video diffusion test using Wan2.1-T2V-1.3B model.
# Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
# Validates the end-to-end video generation pipeline (frontend → worker → /v1/videos).
# Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
# Uses --skip-warmup (warmup at default resolution OOMs on 22 GB L4 GPU),
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment