Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
9780bf3a
Unverified
Commit
9780bf3a
authored
Mar 04, 2026
by
Qi Wang
Committed by
GitHub
Mar 04, 2026
Browse files
perf: multimodal benchmark sweep (#6795)
parent
f0bfda1e
Changes
21
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
1053 additions
and
71 deletions
+1053
-71
benchmarks/multimodal/jsonl/generate_images.py
benchmarks/multimodal/jsonl/generate_images.py
+31
-8
benchmarks/multimodal/jsonl/main.py
benchmarks/multimodal/jsonl/main.py
+1
-2
benchmarks/multimodal/sweep/.gitignore
benchmarks/multimodal/sweep/.gitignore
+1
-0
benchmarks/multimodal/sweep/README.md
benchmarks/multimodal/sweep/README.md
+112
-0
benchmarks/multimodal/sweep/__init__.py
benchmarks/multimodal/sweep/__init__.py
+2
-0
benchmarks/multimodal/sweep/__main__.py
benchmarks/multimodal/sweep/__main__.py
+33
-0
benchmarks/multimodal/sweep/args.py
benchmarks/multimodal/sweep/args.py
+65
-0
benchmarks/multimodal/sweep/config.py
benchmarks/multimodal/sweep/config.py
+125
-0
benchmarks/multimodal/sweep/experiments/embedding_cache/trtllm_e_pd.sh
...ltimodal/sweep/experiments/embedding_cache/trtllm_e_pd.sh
+65
-0
benchmarks/multimodal/sweep/experiments/embedding_cache/trtllm_e_pd.yaml
...imodal/sweep/experiments/embedding_cache/trtllm_e_pd.yaml
+29
-0
benchmarks/multimodal/sweep/experiments/embedding_cache/vllm_e_pd.yaml
...ltimodal/sweep/experiments/embedding_cache/vllm_e_pd.yaml
+36
-0
benchmarks/multimodal/sweep/experiments/embedding_cache/vllm_serve.yaml
...timodal/sweep/experiments/embedding_cache/vllm_serve.yaml
+38
-0
benchmarks/multimodal/sweep/orchestrator.py
benchmarks/multimodal/sweep/orchestrator.py
+184
-0
benchmarks/multimodal/sweep/runner.py
benchmarks/multimodal/sweep/runner.py
+119
-0
benchmarks/multimodal/sweep/server.py
benchmarks/multimodal/sweep/server.py
+124
-0
components/src/dynamo/trtllm/multimodal_processor.py
components/src/dynamo/trtllm/multimodal_processor.py
+32
-51
components/src/dynamo/vllm/multimodal_utils/prefill_worker_utils.py
.../src/dynamo/vllm/multimodal_utils/prefill_worker_utils.py
+0
-9
examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/agg.yaml
...kends/trtllm/engine_configs/qwen3-vl-2b-instruct/agg.yaml
+0
-1
examples/backends/trtllm/engine_configs/qwen3-vl-30b-a3b-instruct-fp8/agg.yaml
...llm/engine_configs/qwen3-vl-30b-a3b-instruct-fp8/agg.yaml
+29
-0
examples/backends/trtllm/engine_configs/qwen3-vl-30b-a3b-instruct-fp8/encode.yaml
.../engine_configs/qwen3-vl-30b-a3b-instruct-fp8/encode.yaml
+27
-0
No files found.
benchmarks/multimodal/jsonl/generate_images.py
View file @
9780bf3a
...
...
@@ -59,18 +59,41 @@ def sample_slots(
num_requests
:
int
,
images_per_request
:
int
,
)
->
list
[
str
]:
"""Sample image slots from a fixed pool, no duplicates within each request."""
assert
(
len
(
pool
)
>=
images_per_request
),
f
"images-pool (
{
len
(
pool
)
}
) must be >= images-per-request (
{
images_per_request
}
)"
"""Sample image slots from a fixed pool, no duplicates within each request.
Every image in the pool is guaranteed to appear at least once.
"""
pool_size
=
len
(
pool
)
total_slots
=
num_requests
*
images_per_request
slot_refs
:
list
[
str
]
=
[]
for
_
in
range
(
num_requests
):
slot_refs
.
extend
(
py_rng
.
sample
(
pool
,
images_per_request
))
assert
(
pool_size
>=
images_per_request
),
f
"images-pool (
{
pool_size
}
) must be >= images-per-request (
{
images_per_request
}
)"
assert
total_slots
>=
pool_size
,
(
f
"total slots (
{
num_requests
}
×
{
images_per_request
}
=
{
total_slots
}
) < "
f
"images-pool (
{
pool_size
}
). Increase --num-requests or --images-per-request, "
f
"or reduce --images-pool."
)
# Round-robin every pool image into requests so each appears at least once
shuffled
=
list
(
pool
)
py_rng
.
shuffle
(
shuffled
)
requests
:
list
[
list
[
str
]]
=
[[]
for
_
in
range
(
num_requests
)]
for
i
,
img
in
enumerate
(
shuffled
):
requests
[
i
%
num_requests
].
append
(
img
)
# Fill remaining slots with random pool samples (no intra-request duplicates)
for
req
in
requests
:
remaining
=
images_per_request
-
len
(
req
)
if
remaining
>
0
:
used
=
set
(
req
)
available
=
[
img
for
img
in
pool
if
img
not
in
used
]
req
.
extend
(
py_rng
.
sample
(
available
,
remaining
))
py_rng
.
shuffle
(
req
)
slot_refs
=
[
img
for
req
in
requests
for
img
in
req
]
num_unique
=
len
(
set
(
slot_refs
))
print
(
f
"Generated
{
total_slots
}
image slots from pool of
{
len
(
pool
)
}
: "
f
"Generated
{
total_slots
}
image slots from pool of
{
pool_size
}
: "
f
"
{
num_unique
}
unique in use, "
f
"
{
total_slots
-
num_unique
}
duplicate references "
f
"(
{
(
total_slots
-
num_unique
)
/
total_slots
:.
1
%
}
reuse)"
...
...
benchmarks/multimodal/jsonl/main.py
View file @
9780bf3a
...
...
@@ -46,12 +46,11 @@ def main() -> None:
)
slot_refs
=
sample_slots
(
py_rng
,
pool
,
num_requests
,
images_per_request
)
unique_images
=
len
(
set
(
slot_refs
))
output_path
=
args
.
output
if
output_path
is
None
:
output_path
=
(
Path
(
__file__
).
parent
/
f
"
{
num_requests
}
req_
{
images_per_request
}
img_
{
unique_images
}
pool_
{
args
.
user_text_tokens
}
word_
{
args
.
image_mode
}
.jsonl"
/
f
"
{
num_requests
}
req_
{
images_per_request
}
img_
{
image_pool
}
pool_
{
args
.
user_text_tokens
}
word_
{
args
.
image_mode
}
.jsonl"
)
with
open
(
output_path
,
"w"
)
as
f
:
...
...
benchmarks/multimodal/sweep/.gitignore
0 → 100644
View file @
9780bf3a
results
\ No newline at end of file
benchmarks/multimodal/sweep/README.md
0 → 100644
View file @
9780bf3a
# Multimodal Benchmark Sweep
YAML-driven benchmark orchestrator that launches serving backends, runs
[
aiperf
](
https://github.com/triton-inference-server/perf_analyzer
)
concurrency
sweeps, and optionally generates comparison plots.
## Quick Start
```
bash
# from the repo root
python
-m
benchmarks.multimodal.sweep
\
--config
benchmarks/multimodal/sweep/experiments/embedding_cache/vllm_serve.yaml
```
## How It Works
1.
Parse the YAML experiment config.
2.
For each
**input file**
× each
**benchmark config**
:
-
Launch the serving backend via the workflow script.
-
Run
`aiperf profile`
at every concurrency level.
-
Stop the server (by default the server restarts between concurrency
levels to avoid warm-cache bias — controlled by
`restart_server_every_benchmark`
).
3.
Generate comparison plots across configs for each input file.
## YAML Config Reference
```
yaml
model
:
Qwen/Qwen3-VL-30B-A3B-Instruct-FP8
concurrencies
:
[
16
,
32
,
64
,
128
,
256
]
osl
:
150
# output sequence length
request_count
:
1000
# requests per concurrency level
warmup_count
:
5
port
:
8000
timeout
:
900
# seconds to wait for server readiness
output_dir
:
benchmarks/multimodal/sweep/results/vllm_serve
# Optional env vars injected into the server process
env
:
ENABLE_ENCODER_CACHE
:
"
0"
# JSONL files produced by benchmarks/multimodal/jsonl/
input_files
:
-
benchmarks/multimodal/jsonl/1000req_1img_200pool_400word_http.jsonl
-
benchmarks/multimodal/jsonl/1000req_4img_200pool_400word_http.jsonl
# Each config launches the workflow with its own extra_args
configs
:
-
label
:
cache-off
workflow
:
examples/backends/vllm/launch/vllm_serve_embedding_cache.sh
extra_args
:
[
--no-enable-prefix-caching
,
--multimodal-embedding-cache-capacity-gb
,
"
0"
]
-
label
:
cache-on
workflow
:
examples/backends/vllm/launch/vllm_serve_embedding_cache.sh
extra_args
:
[
--no-enable-prefix-caching
,
--multimodal-embedding-cache-capacity-gb
,
"
10"
]
```
## CLI Overrides
Any top-level YAML field can be overridden from the command line:
```
bash
python
-m
benchmarks.multimodal.sweep
\
--config
experiments/embedding_cache/vllm_serve.yaml
\
--concurrencies
1,2,4
\
--osl
200
\
--request-count
50
\
--skip-plots
```
## Output Directory Structure
Given the config above with two input files and two configs (
`cache-off`
,
`cache-on`
) at concurrencies
`[16, 32]`
, the output tree looks like:
```
<output_dir>/
├── 1000req_1img_200pool_400word_http/ # ← derived from input filename
│ ├── cache-off/ # ← config label
│ │ ├── c16/ # ← concurrency level
│ │ │ ├── profile_export.jsonl
│ │ │ ├── profile_export_aiperf.json
│ │ │ ├── profile_export_aiperf.csv
│ │ │ ├── gpu_telemetry_export.jsonl
│ │ │ ├── inputs.json
│ │ │ └── logs/
│ │ │ └── aiperf.log
│ │ └── c32/
│ │ └── ...
│ ├── cache-on/
│ │ ├── c16/
│ │ │ └── ...
│ │ └── c32/
│ │ └── ...
│ └── plots/ # ← comparison plots across configs
│ └── ...
└── 1000req_4img_200pool_400word_http/
├── cache-off/
│ └── ...
├── cache-on/
│ └── ...
└── plots/
└── ...
```
## Existing Experiments
| Experiment | Config | Backend |
|---|---|---|
| Embedding cache (vLLM serve) |
`experiments/embedding_cache/vllm_serve.yaml`
| Single-node vLLM |
| Embedding cache (vLLM E+PD) |
`experiments/embedding_cache/vllm_e_pd.yaml`
| Disaggregated vLLM E+PD |
| Embedding cache (TRT-LLM E+PD) |
`experiments/embedding_cache/trtllm_e_pd.yaml`
| Disaggregated TRT-LLM E+PD |
benchmarks/multimodal/sweep/__init__.py
0 → 100644
View file @
9780bf3a
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
benchmarks/multimodal/sweep/__main__.py
0 → 100644
View file @
9780bf3a
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""
CLI entry point for the multimodal benchmark sweep.
Usage:
python -m benchmarks.multimodal.sweep --config experiment.yaml
python -m benchmarks.multimodal.sweep --config experiment.yaml --output-dir /tmp/results
python -m benchmarks.multimodal.sweep --config experiment.yaml --model MyModel --osl 200
"""
from
__future__
import
annotations
from
.args
import
parse_args
from
.config
import
load_config
,
resolve_repo_root
from
.orchestrator
import
run_sweep
def
main
(
argv
=
None
)
->
None
:
args
=
parse_args
(
argv
)
overrides
=
{
k
:
v
for
k
,
v
in
vars
(
args
).
items
()
if
k
!=
"config"
and
v
is
not
None
}
config
=
load_config
(
args
.
config
,
cli_overrides
=
overrides
or
None
)
repo_root
=
resolve_repo_root
()
config
.
validate
(
repo_root
=
repo_root
)
run_sweep
(
config
,
repo_root
=
repo_root
)
if
__name__
==
"__main__"
:
main
()
benchmarks/multimodal/sweep/args.py
0 → 100644
View file @
9780bf3a
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from
__future__
import
annotations
import
argparse
from
typing
import
List
def
_parse_concurrencies
(
value
:
str
)
->
List
[
int
]:
return
[
int
(
x
.
strip
())
for
x
in
value
.
split
(
","
)]
def
parse_args
(
argv
=
None
)
->
argparse
.
Namespace
:
parser
=
argparse
.
ArgumentParser
(
description
=
"Run a multimodal benchmark sweep from a YAML config."
,
formatter_class
=
argparse
.
RawDescriptionHelpFormatter
,
epilog
=
(
"Examples:
\n
"
" python -m benchmarks.multimodal.sweep --config experiments/cache_sweep.yaml
\n
"
" python -m benchmarks.multimodal.sweep --config exp.yaml --osl 200 --skip-plots
\n
"
),
)
parser
.
add_argument
(
"--config"
,
required
=
True
,
help
=
"Path to YAML experiment config file."
,
)
parser
.
add_argument
(
"--output-dir"
,
default
=
None
,
help
=
"Override output directory from config."
,
)
parser
.
add_argument
(
"--model"
,
default
=
None
,
help
=
"Override model name from config."
,
)
parser
.
add_argument
(
"--concurrencies"
,
type
=
_parse_concurrencies
,
default
=
None
,
help
=
"Override concurrency levels (comma-separated, e.g. '1,2,4,8')."
,
)
parser
.
add_argument
(
"--osl"
,
type
=
int
,
default
=
None
,
help
=
"Override output sequence length."
,
)
parser
.
add_argument
(
"--request-count"
,
type
=
int
,
default
=
None
,
help
=
"Override request count per concurrency level."
,
)
parser
.
add_argument
(
"--skip-plots"
,
action
=
"store_true"
,
default
=
None
,
help
=
"Skip plot generation."
,
)
return
parser
.
parse_args
(
argv
)
benchmarks/multimodal/sweep/config.py
0 → 100644
View file @
9780bf3a
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from
__future__
import
annotations
import
os
from
dataclasses
import
dataclass
,
field
from
pathlib
import
Path
from
typing
import
Any
,
Dict
,
List
,
Optional
import
yaml
@
dataclass
class
BenchmarkConfig
:
"""A single benchmark configuration: a workflow script + arguments."""
label
:
str
workflow
:
str
extra_args
:
List
[
str
]
=
field
(
default_factory
=
list
)
@
dataclass
class
SweepConfig
:
"""Top-level sweep configuration loaded from YAML with optional CLI overrides."""
model
:
str
=
"Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
concurrencies
:
List
[
int
]
=
field
(
default_factory
=
lambda
:
[
1
,
2
,
4
,
8
,
16
,
32
])
osl
:
int
=
150
request_count
:
int
=
1000
warmup_count
:
int
=
5
port
:
int
=
8000
timeout
:
int
=
600
input_files
:
List
[
str
]
=
field
(
default_factory
=
list
)
configs
:
List
[
BenchmarkConfig
]
=
field
(
default_factory
=
list
)
output_dir
:
str
=
"benchmarks/results/multimodal_default"
skip_plots
:
bool
=
False
restart_server_every_benchmark
:
bool
=
True
env
:
Dict
[
str
,
str
]
=
field
(
default_factory
=
dict
)
def
validate
(
self
,
repo_root
:
Optional
[
Path
]
=
None
)
->
None
:
"""Validate that all referenced files and scripts exist."""
if
not
self
.
input_files
:
raise
ValueError
(
"At least one input_file is required."
)
if
not
self
.
configs
:
raise
ValueError
(
"At least one benchmark config is required."
)
for
f
in
self
.
input_files
:
if
not
Path
(
f
).
is_file
():
raise
FileNotFoundError
(
f
"Input file not found:
{
f
}
"
)
for
cfg
in
self
.
configs
:
script
=
Path
(
cfg
.
workflow
)
if
repo_root
and
not
script
.
is_absolute
():
script
=
repo_root
/
script
if
not
script
.
is_file
():
raise
FileNotFoundError
(
f
"Workflow script not found:
{
script
}
(config '
{
cfg
.
label
}
')"
)
if
not
self
.
concurrencies
:
raise
ValueError
(
"At least one concurrency level is required."
)
def
_parse_benchmark_config
(
raw
:
Dict
[
str
,
Any
])
->
BenchmarkConfig
:
return
BenchmarkConfig
(
label
=
raw
[
"label"
],
workflow
=
raw
[
"workflow"
],
extra_args
=
[
str
(
a
)
for
a
in
raw
.
get
(
"extra_args"
,
[])],
)
def
load_config
(
yaml_path
:
str
,
cli_overrides
:
Optional
[
Dict
[
str
,
Any
]]
=
None
,
)
->
SweepConfig
:
"""Load a SweepConfig from a YAML file, applying optional CLI overrides."""
with
open
(
yaml_path
)
as
f
:
raw
=
yaml
.
safe_load
(
f
)
defaults
=
SweepConfig
()
configs
=
[
_parse_benchmark_config
(
c
)
for
c
in
raw
.
get
(
"configs"
,
[])]
cfg
=
SweepConfig
(
model
=
raw
.
get
(
"model"
,
defaults
.
model
),
concurrencies
=
raw
.
get
(
"concurrencies"
,
defaults
.
concurrencies
),
osl
=
raw
.
get
(
"osl"
,
defaults
.
osl
),
request_count
=
raw
.
get
(
"request_count"
,
defaults
.
request_count
),
warmup_count
=
raw
.
get
(
"warmup_count"
,
defaults
.
warmup_count
),
port
=
raw
.
get
(
"port"
,
defaults
.
port
),
timeout
=
raw
.
get
(
"timeout"
,
defaults
.
timeout
),
input_files
=
raw
.
get
(
"input_files"
,
[]),
configs
=
configs
,
output_dir
=
raw
.
get
(
"output_dir"
,
defaults
.
output_dir
),
skip_plots
=
raw
.
get
(
"skip_plots"
,
False
),
restart_server_every_benchmark
=
raw
.
get
(
"restart_server_every_benchmark"
,
defaults
.
restart_server_every_benchmark
,
),
env
=
raw
.
get
(
"env"
,
{}),
)
if
cli_overrides
:
for
key
,
value
in
cli_overrides
.
items
():
if
value
is
None
:
continue
if
hasattr
(
cfg
,
key
):
setattr
(
cfg
,
key
,
value
)
return
cfg
def
input_file_tag
(
path
:
str
)
->
str
:
"""Derive a short directory-safe tag from a JSONL filename."""
return
Path
(
path
).
stem
.
replace
(
" "
,
"_"
)
def
resolve_repo_root
()
->
Path
:
"""Walk up from CWD looking for pyproject.toml to find the repo root."""
candidate
=
Path
(
os
.
getcwd
()).
resolve
()
while
candidate
!=
candidate
.
parent
:
if
(
candidate
/
"pyproject.toml"
).
is_file
():
return
candidate
candidate
=
candidate
.
parent
return
Path
(
os
.
getcwd
()).
resolve
()
benchmarks/multimodal/sweep/experiments/embedding_cache/trtllm_e_pd.sh
0 → 100755
View file @
9780bf3a
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# 1 Encode + 1 PD worker (disaggregated E + PD)
# GPU 0: Encode (vision encoder)
# GPU 0: PD worker (prefill + decode, TP=1)
#
# Usage:
# bash trtllm_e_pd.sh
# bash trtllm_e_pd.sh --multimodal-embedding-cache-capacity-gb 10
# Environment variables with defaults
export
DYNAMO_HOME
=
${
DYNAMO_HOME
:-
"/workspace"
}
export
MODEL_PATH
=
"/huggingface/local/llava-v1.6-mistral-7b-hf-pinned"
export
SERVED_MODEL_NAME
=
${
SERVED_MODEL_NAME
:-
"llava-v1.6-mistral-7b-hf"
}
export
PREFILL_ENGINE_ARGS
=
${
PREFILL_ENGINE_ARGS
:-
"
$DYNAMO_HOME
/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/prefill.yaml"
}
export
DECODE_ENGINE_ARGS
=
${
DECODE_ENGINE_ARGS
:-
"
$DYNAMO_HOME
/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/decode.yaml"
}
export
ENCODE_ENGINE_ARGS
=
${
ENCODE_ENGINE_ARGS
:-
"
$DYNAMO_HOME
/examples/backends/trtllm/engine_configs/llava-v1.6-mistral-7b-hf/encode.yaml"
}
export
ENCODE_ENDPOINT
=
${
ENCODE_ENDPOINT
:-
"dyn://dynamo.tensorrt_llm_encode.generate"
}
export
MODALITY
=
${
MODALITY
:-
"multimodal"
}
export
ALLOWED_LOCAL_MEDIA_PATH
=
${
ALLOWED_LOCAL_MEDIA_PATH
:-
"/tmp"
}
export
MAX_FILE_SIZE_MB
=
${
MAX_FILE_SIZE_MB
:-
50
}
export
CUSTOM_TEMPLATE
=
${
CUSTOM_TEMPLATE
:-
"
$DYNAMO_HOME
/examples/backends/trtllm/templates/llava_multimodal.jinja"
}
# Extra arguments forwarded to the PD worker (e.g. --multimodal-embedding-cache-capacity-gb 10)
EXTRA_PD_ARGS
=(
"
$@
"
)
# Setup cleanup trap
cleanup
()
{
echo
"Cleaning up background processes..."
kill
$DYNAMO_PID
$ENCODE_PID
$PD_PID_1
2>/dev/null
||
true
wait
$DYNAMO_PID
$ENCODE_PID
$PD_PID_1
2>/dev/null
||
true
echo
"Cleanup complete."
}
trap
cleanup EXIT INT TERM
# run frontend
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python3
-m
dynamo.frontend &
DYNAMO_PID
=
$!
# run encode worker (vision encoder on GPU 0)
CUDA_VISIBLE_DEVICES
=
0 python3
-m
dynamo.trtllm
\
--model-path
"
$MODEL_PATH
"
\
--extra-engine-args
"
$ENCODE_ENGINE_ARGS
"
\
--modality
"
$MODALITY
"
\
--allowed-local-media-path
"
$ALLOWED_LOCAL_MEDIA_PATH
"
\
--max-file-size-mb
"
$MAX_FILE_SIZE_MB
"
\
--custom-jinja-template
"
$CUSTOM_TEMPLATE
"
\
--disaggregation-mode
encode &
ENCODE_PID
=
$!
# run PD worker 1 (GPU 1)
CUDA_VISIBLE_DEVICES
=
1 python3
-m
dynamo.trtllm
\
--model-path
"
$MODEL_PATH
"
\
--extra-engine-args
"
$PD_ENGINE_ARGS
"
\
--modality
"
$MODALITY
"
\
--encode-endpoint
"
$ENCODE_ENDPOINT
"
\
--disaggregation-mode
prefill_and_decode
\
--custom-jinja-template
"
$CUSTOM_TEMPLATE
"
\
"
${
EXTRA_PD_ARGS
[@]
}
"
&
PD_PID_1
=
$!
wait
$DYNAMO_PID
benchmarks/multimodal/sweep/experiments/embedding_cache/trtllm_e_pd.yaml
0 → 100644
View file @
9780bf3a
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Full 3-way comparison: cache-off vs cache-on vs disaggregated EPD.
#
# Usage:
# python -m benchmarks.multimodal.sweep --config benchmarks/multimodal/sweep/experiments/embedding_cache/trtllm_e_pd.yaml
model
:
Qwen/Qwen3-VL-2B-Instruct
concurrencies
:
[
16
,
32
,
64
,
128
,
256
]
osl
:
150
request_count
:
1000
warmup_count
:
5
port
:
8000
timeout
:
900
output_dir
:
benchmarks/results/embedding_cache/trtllm_e_pd
input_files
:
-
benchmarks/multimodal/jsonl/1000req_1img_200pool_400word_http.jsonl
-
benchmarks/multimodal/jsonl/1000req_1img_800pool_400word_http.jsonl
configs
:
-
label
:
cache-off
workflow
:
benchmarks/multimodal/sweep/experiments/embedding_cache/trtllm_e_pd.sh
extra_args
:
[
--multimodal-embedding-cache-capacity-gb
,
"
0"
]
-
label
:
cache-on
workflow
:
benchmarks/multimodal/sweep/experiments/embedding_cache/trtllm_e_pd.sh
extra_args
:
[
--multimodal-embedding-cache-capacity-gb
,
"
10"
]
benchmarks/multimodal/sweep/experiments/embedding_cache/vllm_e_pd.yaml
0 → 100644
View file @
9780bf3a
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Full 3-way comparison: cache-off vs cache-on vs disaggregated EPD.
#
# Usage:
# python -m benchmarks.multimodal.sweep --config benchmarks/multimodal/sweep/experiments/embedding_cache/full_comparison.yaml
model
:
Qwen/Qwen3-VL-30B-A3B-Instruct-FP8
concurrencies
:
[
16
,
32
,
64
,
128
,
256
]
osl
:
150
request_count
:
1000
warmup_count
:
5
port
:
8000
timeout
:
900
output_dir
:
benchmarks/multimodal/sweep/results/vllm_e_pd
env
:
ENABLE_ENCODER_CACHE
:
"
0"
input_files
:
-
benchmarks/multimodal/jsonl/1000req_1img_200pool_400word_http.jsonl
-
benchmarks/multimodal/jsonl/1000req_2img_200pool_400word_http.jsonl
-
benchmarks/multimodal/jsonl/1000req_4img_200pool_400word_http.jsonl
-
benchmarks/multimodal/jsonl/1000req_1img_800pool_400word_http.jsonl
-
benchmarks/multimodal/jsonl/1000req_2img_1600pool_400word_http.jsonl
-
benchmarks/multimodal/jsonl/1000req_4img_3200pool_400word_http.jsonl
configs
:
-
label
:
cache-off
workflow
:
examples/backends/vllm/launch/disagg_multimodal_e_pd.sh
extra_args
:
[
--no-enable-prefix-caching
,
--multimodal-embedding-cache-capacity-gb
,
"
0"
]
-
label
:
cache-on
workflow
:
examples/backends/vllm/launch/disagg_multimodal_e_pd.sh
extra_args
:
[
--no-enable-prefix-caching
,
--multimodal-embedding-cache-capacity-gb
,
"
10"
]
benchmarks/multimodal/sweep/experiments/embedding_cache/vllm_serve.yaml
0 → 100644
View file @
9780bf3a
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# Embedding cache ON vs OFF on a single vllm_serve workflow.
#
# Usage:
# python -m benchmarks.multimodal.sweep --config benchmarks/multimodal/sweep/experiments/embedding_cache/cache_on_off.yaml
model
:
Qwen/Qwen3-VL-30B-A3B-Instruct-FP8
concurrencies
:
[
16
,
32
,
64
,
128
,
256
]
osl
:
150
request_count
:
1000
warmup_count
:
5
port
:
8000
timeout
:
900
output_dir
:
benchmarks/multimodal/sweep/results/vllm_serve
env
:
ENABLE_ENCODER_CACHE
:
"
0"
input_files
:
-
benchmarks/multimodal/jsonl/1000req_1img_200pool_400word_base64.jsonl
-
benchmarks/multimodal/jsonl/1000req_2img_200pool_400word_base64.jsonl
-
benchmarks/multimodal/jsonl/1000req_4img_200pool_400word_base64.jsonl
-
benchmarks/multimodal/jsonl/1000req_2img_200pool_400word_http.jsonl
-
benchmarks/multimodal/jsonl/1000req_4img_200pool_400word_http.jsonl
-
benchmarks/multimodal/jsonl/1000req_1img_800pool_400word_http.jsonl
-
benchmarks/multimodal/jsonl/1000req_2img_1600pool_400word_http.jsonl
-
benchmarks/multimodal/jsonl/1000req_4img_3200pool_400word_http.jsonl
configs
:
-
label
:
cache-off
workflow
:
examples/backends/vllm/launch/vllm_serve_embedding_cache.sh
extra_args
:
[
--no-enable-prefix-caching
,
--multimodal-embedding-cache-capacity-gb
,
"
0"
]
-
label
:
cache-on
workflow
:
examples/backends/vllm/launch/vllm_serve_embedding_cache.sh
extra_args
:
[
--no-enable-prefix-caching
,
--multimodal-embedding-cache-capacity-gb
,
"
10"
]
benchmarks/multimodal/sweep/orchestrator.py
0 → 100644
View file @
9780bf3a
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from
__future__
import
annotations
from
pathlib
import
Path
from
typing
import
List
,
Optional
from
.config
import
SweepConfig
,
input_file_tag
,
resolve_repo_root
from
.runner
import
run_aiperf_single
,
run_concurrency_sweep
from
.server
import
ServerManager
def
_resolve_workflow
(
workflow
:
str
,
repo_root
:
Path
)
->
str
:
p
=
Path
(
workflow
)
if
p
.
is_absolute
():
return
str
(
p
)
return
str
(
repo_root
/
p
)
def
_print_banner
(
title
:
str
,
char
:
str
=
"="
,
width
:
int
=
70
)
->
None
:
print
(
f
"
\n
{
char
*
width
}
"
)
print
(
f
"
{
title
}
"
)
print
(
f
"
{
char
*
width
}
"
,
flush
=
True
)
def
run_sweep
(
config
:
SweepConfig
,
repo_root
:
Optional
[
Path
]
=
None
,
)
->
None
:
"""Execute the full benchmark sweep: for each input file x benchmark config."""
if
repo_root
is
None
:
repo_root
=
resolve_repo_root
()
output_base
=
Path
(
config
.
output_dir
)
restart
=
config
.
restart_server_every_benchmark
_print_banner
(
"Multimodal Benchmark Sweep"
)
print
(
f
" Model:
{
config
.
model
}
"
)
print
(
f
" Input files:
{
len
(
config
.
input_files
)
}
"
)
for
f
in
config
.
input_files
:
print
(
f
"
{
f
}
"
)
labels
=
[
c
.
label
for
c
in
config
.
configs
]
print
(
f
" Configs:
{
labels
}
"
)
print
(
f
" Concurrencies:
{
config
.
concurrencies
}
"
)
print
(
f
" OSL:
{
config
.
osl
}
"
)
print
(
f
" Requests:
{
config
.
request_count
}
per concurrency"
)
print
(
f
" Restart every:
{
restart
}
"
)
print
(
f
" Output:
{
output_base
}
"
)
print
(
flush
=
True
)
server
=
ServerManager
(
port
=
config
.
port
,
timeout
=
config
.
timeout
)
env_overrides
=
dict
(
config
.
env
)
if
config
.
env
else
{}
try
:
for
input_file
in
config
.
input_files
:
file_tag
=
input_file_tag
(
input_file
)
file_output_dir
=
output_base
/
file_tag
_print_banner
(
f
"Input:
{
Path
(
input_file
).
name
}
(
{
file_tag
}
)"
,
char
=
"#"
)
for
bench_cfg
in
config
.
configs
:
_print_banner
(
f
"[
{
file_tag
}
] Config:
{
bench_cfg
.
label
}
"
,
char
=
"-"
)
workflow_abs
=
_resolve_workflow
(
bench_cfg
.
workflow
,
repo_root
)
sweep_dir
=
file_output_dir
/
bench_cfg
.
label
if
restart
:
_sweep_with_restart
(
server
=
server
,
workflow_script
=
workflow_abs
,
config
=
config
,
bench_cfg
=
bench_cfg
,
env_overrides
=
env_overrides
,
input_file
=
input_file
,
output_dir
=
sweep_dir
,
)
else
:
server
.
start
(
workflow_script
=
workflow_abs
,
model
=
config
.
model
,
extra_args
=
bench_cfg
.
extra_args
,
env_overrides
=
env_overrides
,
)
try
:
run_concurrency_sweep
(
model
=
config
.
model
,
port
=
config
.
port
,
concurrencies
=
config
.
concurrencies
,
request_count
=
config
.
request_count
,
warmup_count
=
config
.
warmup_count
,
input_file
=
input_file
,
osl
=
config
.
osl
,
output_dir
=
sweep_dir
,
)
finally
:
server
.
stop
()
if
not
config
.
skip_plots
:
_generate_plots_for_file
(
file_output_dir
,
[
c
.
label
for
c
in
config
.
configs
],
)
finally
:
if
server
.
is_running
:
server
.
stop
()
_print_summary
(
config
,
output_base
)
def
_sweep_with_restart
(
server
:
ServerManager
,
workflow_script
:
str
,
config
:
SweepConfig
,
bench_cfg
,
env_overrides
:
dict
,
input_file
:
str
,
output_dir
:
Path
,
)
->
None
:
"""Run each concurrency level with a fresh server to avoid warm-cache effects."""
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
for
c
in
sorted
(
config
.
concurrencies
):
server
.
start
(
workflow_script
=
workflow_script
,
model
=
config
.
model
,
extra_args
=
bench_cfg
.
extra_args
,
env_overrides
=
env_overrides
,
)
try
:
run_aiperf_single
(
model
=
config
.
model
,
port
=
config
.
port
,
concurrency
=
c
,
request_count
=
config
.
request_count
,
warmup_count
=
config
.
warmup_count
,
input_file
=
input_file
,
osl
=
config
.
osl
,
artifact_dir
=
output_dir
/
f
"c
{
c
}
"
,
)
finally
:
server
.
stop
()
print
(
f
"Sweep complete. Results in
{
output_dir
}
"
,
flush
=
True
)
def
_generate_plots_for_file
(
file_output_dir
:
Path
,
labels
:
List
[
str
],
)
->
None
:
"""Generate comparison plots for one input file across all configs."""
try
:
from
benchmarks.utils.plot
import
generate_plots
plots_dir
=
file_output_dir
/
"plots"
print
(
f
"
\n
Generating plots ->
{
plots_dir
}
"
,
flush
=
True
)
generate_plots
(
base_output_dir
=
file_output_dir
,
output_dir
=
plots_dir
,
benchmark_names
=
labels
,
)
except
ImportError
:
print
(
"WARNING: benchmarks.utils.plot not importable; skipping plots."
,
flush
=
True
,
)
except
Exception
as
exc
:
print
(
f
"WARNING: Plot generation failed:
{
exc
}
"
,
flush
=
True
)
def
_print_summary
(
config
:
SweepConfig
,
output_base
:
Path
)
->
None
:
_print_banner
(
"Sweep Complete!"
)
print
(
f
" Results:
{
output_base
}
"
)
for
input_file
in
config
.
input_files
:
tag
=
input_file_tag
(
input_file
)
print
(
f
" [
{
tag
}
]:"
)
for
cfg
in
config
.
configs
:
result_dir
=
output_base
/
tag
/
cfg
.
label
print
(
f
"
{
cfg
.
label
}
:
{
result_dir
}
"
)
if
not
config
.
skip_plots
:
plots_dir
=
output_base
/
tag
/
"plots"
print
(
f
" plots:
{
plots_dir
}
"
)
print
(
flush
=
True
)
benchmarks/multimodal/sweep/runner.py
0 → 100644
View file @
9780bf3a
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from
__future__
import
annotations
import
subprocess
from
pathlib
import
Path
from
typing
import
List
def
_build_aiperf_cmd
(
model
:
str
,
port
:
int
,
concurrency
:
int
,
request_count
:
int
,
warmup_count
:
int
,
input_file
:
str
,
osl
:
int
,
artifact_dir
:
Path
,
)
->
List
[
str
]:
return
[
"aiperf"
,
"profile"
,
"-m"
,
model
,
"-u"
,
f
"http://localhost:
{
port
}
"
,
"--concurrency"
,
str
(
concurrency
),
"--request-count"
,
str
(
request_count
),
"--warmup-request-count"
,
str
(
warmup_count
),
"--input-file"
,
input_file
,
"--custom-dataset-type"
,
"single_turn"
,
"--extra-inputs"
,
f
"max_tokens:
{
osl
}
"
,
"--extra-inputs"
,
f
"min_tokens:
{
osl
}
"
,
"--extra-inputs"
,
"ignore_eos:true"
,
"--extra-inputs"
,
"stream:true"
,
"--streaming"
,
"--artifact-dir"
,
str
(
artifact_dir
),
"--ui"
,
"none"
,
"--no-server-metrics"
,
]
def
run_aiperf_single
(
model
:
str
,
port
:
int
,
concurrency
:
int
,
request_count
:
int
,
warmup_count
:
int
,
input_file
:
str
,
osl
:
int
,
artifact_dir
:
Path
,
)
->
None
:
"""Run a single aiperf profile invocation."""
artifact_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
cmd
=
_build_aiperf_cmd
(
model
=
model
,
port
=
port
,
concurrency
=
concurrency
,
request_count
=
request_count
,
warmup_count
=
warmup_count
,
input_file
=
input_file
,
osl
=
osl
,
artifact_dir
=
artifact_dir
,
)
print
(
f
" aiperf concurrency=
{
concurrency
}
->
{
artifact_dir
}
"
,
flush
=
True
)
proc
=
subprocess
.
run
(
cmd
,
capture_output
=
True
,
text
=
True
)
if
proc
.
returncode
!=
0
:
print
(
f
" aiperf FAILED (exit
{
proc
.
returncode
}
)"
,
flush
=
True
)
for
stream_name
,
stream
in
[(
"stderr"
,
proc
.
stderr
),
(
"stdout"
,
proc
.
stdout
)]:
if
stream
:
for
line
in
stream
.
strip
().
splitlines
()[
-
15
:]:
print
(
f
" [
{
stream_name
}
]
{
line
}
"
,
flush
=
True
)
raise
subprocess
.
CalledProcessError
(
proc
.
returncode
,
cmd
,
output
=
proc
.
stdout
,
stderr
=
proc
.
stderr
)
print
(
f
" aiperf concurrency=
{
concurrency
}
done."
,
flush
=
True
)
def
run_concurrency_sweep
(
model
:
str
,
port
:
int
,
concurrencies
:
List
[
int
],
request_count
:
int
,
warmup_count
:
int
,
input_file
:
str
,
osl
:
int
,
output_dir
:
Path
,
)
->
None
:
"""Run aiperf across all concurrency levels, writing results under output_dir/c{N}/."""
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
for
c
in
sorted
(
concurrencies
):
run_aiperf_single
(
model
=
model
,
port
=
port
,
concurrency
=
c
,
request_count
=
request_count
,
warmup_count
=
warmup_count
,
input_file
=
input_file
,
osl
=
osl
,
artifact_dir
=
output_dir
/
f
"c
{
c
}
"
,
)
print
(
f
"Sweep complete. Results in
{
output_dir
}
"
,
flush
=
True
)
benchmarks/multimodal/sweep/server.py
0 → 100644
View file @
9780bf3a
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
from
__future__
import
annotations
import
os
import
signal
import
subprocess
import
time
from
pathlib
import
Path
from
typing
import
List
,
Optional
class
ServerManager
:
"""Manages the lifecycle of a serving backend launched via a bash script.
Uses ``setsid`` so the server gets its own process group, allowing clean
shutdown without killing the orchestrator.
"""
def
__init__
(
self
,
port
:
int
=
8000
,
timeout
:
int
=
600
)
->
None
:
self
.
port
=
port
self
.
timeout
=
timeout
self
.
_process
:
Optional
[
subprocess
.
Popen
]
=
None
@
property
def
is_running
(
self
)
->
bool
:
return
self
.
_process
is
not
None
and
self
.
_process
.
poll
()
is
None
def
start
(
self
,
workflow_script
:
str
,
model
:
str
,
extra_args
:
Optional
[
List
[
str
]]
=
None
,
env_overrides
:
Optional
[
dict
]
=
None
,
)
->
None
:
"""Launch the workflow script and block until the model is served."""
if
self
.
is_running
:
raise
RuntimeError
(
"Server is already running. Call stop() first."
)
script
=
Path
(
workflow_script
)
if
not
script
.
is_file
():
raise
FileNotFoundError
(
f
"Workflow script not found:
{
script
}
"
)
model_flag
=
"--model-path"
if
"trtllm"
in
str
(
script
)
else
"--model"
cmd
=
[
"bash"
,
str
(
script
),
model_flag
,
model
]
if
extra_args
:
cmd
.
extend
(
extra_args
)
env
=
os
.
environ
.
copy
()
if
env_overrides
:
env
.
update
(
env_overrides
)
print
(
f
"Launching:
{
' '
.
join
(
cmd
)
}
"
,
flush
=
True
)
self
.
_process
=
subprocess
.
Popen
(
cmd
,
start_new_session
=
True
,
env
=
env
,
)
self
.
wait_for_ready
(
model
)
def
wait_for_ready
(
self
,
model
:
str
)
->
None
:
"""Poll /v1/models until the expected model name appears."""
import
urllib.error
import
urllib.request
url
=
f
"http://localhost:
{
self
.
port
}
/v1/models"
deadline
=
time
.
monotonic
()
+
self
.
timeout
print
(
f
"Waiting for server at
{
url
}
to list model '
{
model
}
' "
f
"(timeout:
{
self
.
timeout
}
s)..."
,
flush
=
True
,
)
while
time
.
monotonic
()
<
deadline
:
if
not
self
.
is_running
:
raise
RuntimeError
(
"Server process exited unexpectedly during startup "
f
"(exit code
{
self
.
_process
.
returncode
}
)."
)
try
:
req
=
urllib
.
request
.
Request
(
url
)
with
urllib
.
request
.
urlopen
(
req
,
timeout
=
5
)
as
resp
:
body
=
resp
.
read
().
decode
()
if
model
in
body
:
print
(
"Server is ready (model registered)."
,
flush
=
True
)
return
except
(
urllib
.
error
.
URLError
,
OSError
,
TimeoutError
):
pass
time
.
sleep
(
5
)
self
.
stop
()
raise
TimeoutError
(
f
"Server did not become ready within
{
self
.
timeout
}
s"
)
def
stop
(
self
)
->
None
:
"""Stop the server by killing its process group."""
if
self
.
_process
is
None
:
return
pid
=
self
.
_process
.
pid
print
(
f
"Stopping server (PID
{
pid
}
)..."
,
flush
=
True
)
try
:
os
.
killpg
(
pid
,
signal
.
SIGTERM
)
except
(
ProcessLookupError
,
PermissionError
):
try
:
self
.
_process
.
terminate
()
except
(
ProcessLookupError
,
PermissionError
):
pass
try
:
self
.
_process
.
wait
(
timeout
=
15
)
except
subprocess
.
TimeoutExpired
:
try
:
os
.
killpg
(
pid
,
signal
.
SIGKILL
)
except
(
ProcessLookupError
,
PermissionError
):
pass
self
.
_process
.
wait
(
timeout
=
5
)
print
(
f
"Server stopped (PID
{
pid
}
)."
,
flush
=
True
)
self
.
_process
=
None
time
.
sleep
(
5
)
components/src/dynamo/trtllm/multimodal_processor.py
View file @
9780bf3a
...
...
@@ -13,7 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import
asyncio
import
logging
import
time
from
io
import
BytesIO
...
...
@@ -23,7 +22,6 @@ from urllib.parse import urlparse
from
urllib.request
import
urlopen
import
torch
from
tensorrt_llm.inputs
import
default_multimodal_input_loader
from
tensorrt_llm.llmapi.tokenizer
import
tokenizer_factory
from
dynamo.common.multimodal.image_loader
import
ImageLoader
...
...
@@ -189,11 +187,6 @@ class MultimodalRequestProcessor:
"prompt": str,
"prompt_token_ids": List[int]
}
-----------------------------------------------------------------------------
TODO: Revert default_multimodal_input_loader calls having fixed TRT-LLM's
token IDs & MM data path in generate_async() for the embeddings case.
-----------------------------------------------------------------------------
"""
self
.
previous_decoded_text
=
""
...
...
@@ -221,50 +214,18 @@ class MultimodalRequestProcessor:
# mm_processor_kwargs must be a dict (not None) for TRT-LLM's processor
processed_inputs
=
{
"prompt_token_ids"
:
token_ids
,
"mm_processor_kwargs"
:
{}}
# The aforementioned fallback to default_multimodal_input_loader:
messages
=
request
.
get
(
"extra_args"
,
{}).
get
(
"messages"
,
request
.
get
(
"messages"
,
[])
)
text_prompt
,
_
,
embedding_paths
=
self
.
extract_prompt_and_media
(
messages
)
loader_kwargs
=
{}
# Two cases, both for the default_multimodal_input_loader fallback:
# 1) EPD Flow Case 2: Embeddings received via NIXL from encode worker
# EPD Flow Case 2: Embeddings received via NIXL from encode worker
# The encode worker computed vision embeddings and transferred them via RDMA/NIXL
# We need to pass these embeddings directly to TRT-LLM's generate_async
# 2) PD flow with no NIXL and no encoder
if
embeddings
is
not
None
or
embedding_paths
:
if
embeddings
is
not
None
:
logging
.
info
(
f
"Using NIXL embeddings from encoder: shape=
{
embeddings
.
shape
if
hasattr
(
embeddings
,
'shape'
)
else
'N/A'
}
"
)
loader_kwargs
[
"mm_embeddings"
]
=
[
embeddings
]
elif
embedding_paths
:
# PD flow with no NIXL and no encoder
loader_kwargs
[
"mm_embeddings"
]
=
[
self
.
load_tensor_from_path_or_url
(
path
)
for
path
in
embedding_paths
]
logging
.
info
(
f
"Using embedding paths:
{
embedding_paths
}
"
)
# NOTE: default_multimodal_input_loader downloads images and preprocesses them
# synchronously. Wrap in asyncio.to_thread to allow concurrent image loading
# across multiple requests, improving throughput at high concurrency.
fallback_processed_inputs
=
await
asyncio
.
to_thread
(
lambda
:
default_multimodal_input_loader
(
tokenizer
=
self
.
tokenizer
,
model_dir
=
self
.
model_dir
,
model_type
=
self
.
model_type
,
modality
=
self
.
modality
,
prompts
=
[
text_prompt
],
image_data_format
=
"pt"
,
device
=
"cuda"
,
**
loader_kwargs
,
)
)
# Return the first processed input if available
if
fallback_processed_inputs
:
return
fallback_processed_inputs
[
0
]
return
None
# Structure embeddings in the format TRT-LLM's generate_async expects
processed_inputs
[
"multi_modal_embeddings"
]
=
embeddings
return
processed_inputs
# PD Flow: Pre-tokenized by Rust frontend with direct media loading
# TODO: Add frontend decoding support
...
...
@@ -279,6 +240,7 @@ class MultimodalRequestProcessor:
if
image_items
and
isinstance
(
image_items
,
list
):
# Separate embedding paths from regular image URLs
# Items come from Rust in format: {"Url": "..."} or {"Decoded": ...}
embedding_paths
=
[]
image_urls
=
[]
for
item
in
image_items
:
...
...
@@ -299,7 +261,9 @@ class MultimodalRequestProcessor:
continue
# Check if this is an embedding file based on extension
if
not
url
.
endswith
((
".pt"
,
".pth"
,
".bin"
)):
if
url
.
endswith
((
".pt"
,
".pth"
,
".bin"
)):
embedding_paths
.
append
(
url
)
else
:
# Keep original item format for load_image_batch
image_urls
.
append
(
item
if
isinstance
(
item
,
dict
)
else
{
"Url"
:
item
}
...
...
@@ -321,6 +285,23 @@ class MultimodalRequestProcessor:
logging
.
error
(
f
"Failed to load images:
{
e
}
"
)
return
None
# Load embedding files (.pt, .pth, .bin) for PD flow
# These are pre-computed vision encoder outputs
if
embedding_paths
:
try
:
loaded_embeddings
=
[
self
.
load_tensor_from_path_or_url
(
path
)
for
path
in
embedding_paths
]
if
loaded_embeddings
:
processed_mm_data
[
"embedding"
]
=
loaded_embeddings
logging
.
info
(
f
"Loaded
{
len
(
loaded_embeddings
)
}
embedding file(s) from paths:
{
embedding_paths
}
"
)
except
Exception
as
e
:
logging
.
error
(
f
"Failed to load embeddings:
{
e
}
"
)
return
None
# TODO: Add support for video_url, audio_url
if
processed_mm_data
:
...
...
components/src/dynamo/vllm/multimodal_utils/prefill_worker_utils.py
View file @
9780bf3a
...
...
@@ -110,8 +110,6 @@ def _accumulate_embeddings(
)
)
else
:
# Plain tensor embeddings
logger
.
info
(
f
"Get embedding of shape
{
mm_data
[
'image'
].
shape
}
"
)
# [gluo FIXME] embedding with multiple images?
if
multi_modal_data
[
"image"
]
==
[]:
multi_modal_data
[
"image"
]
=
mm_data
[
"image"
]
...
...
@@ -250,11 +248,6 @@ async def _fetch_embeddings(
# ── 2. Fetch uncached from encode workers ────────────────────────
pending
:
_PendingRelease
|
None
=
None
if
to_fetch
:
if
cache
is
not
None
:
logger
.
info
(
f
"[
{
request_id
}
] Cache miss for
{
len
(
to_fetch
)
}
/
{
len
(
image_urls
)
}
URLs, "
"fetching from encode workers"
)
miss_urls
=
[
url
for
_
,
url
,
_
in
to_fetch
]
groups
,
pending
=
await
_fetch_from_encode_workers
(
encode_worker_client
,
...
...
@@ -274,8 +267,6 @@ async def _fetch_embeddings(
),
)
results
[
idx
]
=
group
else
:
logger
.
info
(
f
"[
{
request_id
}
] All
{
len
(
image_urls
)
}
URLs served from cache"
)
return
[
r
for
r
in
results
if
r
is
not
None
],
pending
...
...
examples/backends/trtllm/engine_configs/qwen3-vl-2b-instruct/agg.yaml
View file @
9780bf3a
...
...
@@ -17,7 +17,6 @@ enable_attention_dp: false
max_num_tokens
:
2048
max_batch_size
:
8
max_seq_len
:
8192
trust_remote_code
:
true
backend
:
pytorch
enable_chunked_prefill
:
true
...
...
examples/backends/trtllm/engine_configs/qwen3-vl-30b-a3b-instruct-fp8/agg.yaml
0 → 100644
View file @
9780bf3a
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
tensor_parallel_size
:
1
enable_attention_dp
:
false
max_num_tokens
:
2048
max_batch_size
:
8
max_seq_len
:
16384
trust_remote_code
:
true
backend
:
pytorch
enable_chunked_prefill
:
true
kv_cache_config
:
free_gpu_memory_fraction
:
0.95
enable_block_reuse
:
false
cache_transceiver_config
:
backend
:
DEFAULT
examples/backends/trtllm/engine_configs/qwen3-vl-30b-a3b-instruct-fp8/encode.yaml
0 → 100644
View file @
9780bf3a
# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
tensor_parallel_size
:
1
enable_attention_dp
:
false
max_num_tokens
:
1024
max_batch_size
:
4
trust_remote_code
:
true
backend
:
pytorch
enable_chunked_prefill
:
true
# Overlap scheduler not currently supported in prefill only workers.
disable_overlap_scheduler
:
true
# Note: Encode workers use MultimodalEncoder (vision encoder + projector only),
# which ignores most engine_args. No kv_cache_config or cache_transceiver_config
# is needed since MultimodalEncoder doesn't allocate KV cache or transfer buffers.
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment