"docs/kubernetes/api-reference.md" did not exist on "cf433e6825d83f41905da47d69ca5ee30d4eb1ba"
Unverified Commit dacb2980 authored by Ryan McCormick, committed by GitHub
Browse files

chore(multimodal): Cleanup multimodal docs and consolidate launch scripts (#7845)

parent 2075eb67
This diff is collapsed.
......@@ -2,11 +2,11 @@
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated multimodal serving with standard Dynamo preprocessing
# Aggregated multimodal image/video serving with standard Dynamo preprocessing
#
# Architecture: Single-worker PD (Prefill-Decode)
# - Frontend: Rust OpenAIPreprocessor handles image URLs (HTTP and data:// base64)
# - Worker: Standard vLLM worker with vision model support
# - Frontend: Rust OpenAIPreprocessor forwards multimodal requests
# - Worker: Standard vLLM worker with multimodal model support
#
# For EPD (Encode-Prefill-Decode) architecture with dedicated encoding worker,
# see agg_multimodal_epd.sh
......@@ -19,7 +19,7 @@ source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Default values
MODEL_NAME="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
MODEL_NAME="${DYN_MODEL_NAME:-Qwen/Qwen3-VL-30B-A3B-Instruct-FP8}"
# Parse command line arguments
# Extra arguments are passed through to the vLLM worker
......@@ -48,13 +48,41 @@ while [[ $# -gt 0 ]]; do
done
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
print_launch_banner --multimodal "Launching Aggregated Multimodal Serving" "$MODEL_NAME" "$HTTP_PORT"
# Use TCP transport (instead of default NATS)
# TCP is preferred for multimodal workloads because it overcomes:
# - NATS default 1MB max payload limit (multimodal base64 images can exceed this)
export DYN_REQUEST_PLANE=tcp
print_launch_banner --no-curl "Launching Aggregated Multimodal Serving" "$MODEL_NAME" "$HTTP_PORT" \
"Backend: dynamo.vllm --enable-multimodal" \
"Media: image_url and video_url (model support dependent)"
print_curl_footer <<CURL
curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
-H 'Content-Type: application/json' \\
-d '{
"model": "${MODEL_NAME}",
"messages": [{"role": "user", "content": [
{"type": "text", "text": "Describe the image"},
{"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/4/47/PNG_transparency_demonstration_1.png/300px-PNG_transparency_demonstration_1.png"}}
]}],
"max_tokens": 50
}'
# For video-capable models such as Qwen/Qwen3-VL-2B-Instruct:
curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
-H 'Content-Type: application/json' \\
-d '{
"model": "Qwen/Qwen3-VL-2B-Instruct",
"messages": [{"role": "user", "content": [
{"type": "text", "text": "Describe the video in detail"},
{"type": "video_url", "video_url": {"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"}}
]}],
"max_tokens": 128
}'
CURL
# Start frontend with Rust OpenAIPreprocessor
# dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
python -m dynamo.frontend &
......@@ -65,7 +93,7 @@ MAX_CONCURRENT_SEQS="${MAX_CONCURRENT_SEQS:-2}"
MODEL_EXTRA_ARGS=""
case "$MODEL_NAME" in
meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8)
MAX_MODEL_LEN="${MAX_MODEL_LEN:-108960}"
MAX_MODEL_LEN="108960"
MODEL_EXTRA_ARGS="--tensor-parallel-size=8" ;;
esac
......
......@@ -7,6 +7,9 @@ trap 'echo Cleaning up...; kill 0' EXIT
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Use TCP transport for multimodal workloads (base64 images can exceed NATS 1MB limit)
export DYN_REQUEST_PLANE=tcp
# Default values
MODEL_NAME="Qwen/Qwen3-VL-30B-A3B-Instruct-FP8"
SINGLE_GPU=false
......
......@@ -8,6 +8,9 @@ SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"
# Use TCP transport for multimodal workloads (base64 images can exceed NATS 1MB limit)
export DYN_REQUEST_PLANE=tcp
# Default values
MODEL_NAME="llava-hf/llava-1.5-7b-hf"
......@@ -17,7 +20,7 @@ MODEL_NAME="llava-hf/llava-1.5-7b-hf"
# - Enabling --enforce-eager (disables torch.compile and CUDA graph capture)
# - Hardcoding P/D KV cache to 512 MB (skips all memory profiling)
# - Limiting --max-model-len to 4096 tokens on P/D workers
# - Limiting P/D workers to image=1,video=0,audio=0 (--limit-mm-per-prompt)
# - Limiting P/D workers to image=3,video=3,audio=0 (--limit-mm-per-prompt)
# - Using lower gpu-memory-utilization fractions to share the GPU
SINGLE_GPU=false
......@@ -77,10 +80,17 @@ python -m dynamo.frontend &
EXTRA_ARGS=""
PD_EXTRA_ARGS=""
# GPU assignments (override via environment variables)
DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-0}
DYN_PREFILL_WORKER_GPU=${DYN_PREFILL_WORKER_GPU:-1}
DYN_DECODE_WORKER_GPU=${DYN_DECODE_WORKER_GPU:-2}
# GPU assignments (override via environment variables).
# In single-GPU mode all 3 workers default to GPU 0.
if [[ "$SINGLE_GPU" == "true" ]]; then
DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-0}
DYN_PREFILL_WORKER_GPU=${DYN_PREFILL_WORKER_GPU:-0}
DYN_DECODE_WORKER_GPU=${DYN_DECODE_WORKER_GPU:-0}
else
DYN_ENCODE_WORKER_GPU=${DYN_ENCODE_WORKER_GPU:-0}
DYN_PREFILL_WORKER_GPU=${DYN_PREFILL_WORKER_GPU:-1}
DYN_DECODE_WORKER_GPU=${DYN_DECODE_WORKER_GPU:-2}
fi
# GPU memory utilization for workers.
# NOTE: --kv-cache-memory-bytes (set below for P/D workers) overrides
......@@ -93,9 +103,15 @@ if [[ -n "${_PROFILE_PYTEST_VRAM_FRAC_OVERRIDE:-}" ]]; then
echo "WARNING: _PROFILE_PYTEST_VRAM_FRAC_OVERRIDE is set but has no effect here because" >&2
echo " --kv-cache-memory-bytes overrides --gpu-memory-utilization in vLLM." >&2
fi
DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.9}
DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9}
if [[ "$SINGLE_GPU" == "true" ]]; then
DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.1}
DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.4}
DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.4}
else
DYN_ENCODE_GPU_MEM=${DYN_ENCODE_GPU_MEM:-0.9}
DYN_PREFILL_GPU_MEM=${DYN_PREFILL_GPU_MEM:-0.9}
DYN_DECODE_GPU_MEM=${DYN_DECODE_GPU_MEM:-0.9}
fi
# 512 MB KV cache per P/D worker. Setting --kv-cache-memory-bytes bypasses vLLM's
# memory profiling entirely (both language model and multimodal encoder), which avoids
......@@ -105,7 +121,7 @@ PD_KV_CACHE_BYTES=$((512 * 1024 * 1024))
if [[ "$SINGLE_GPU" == "true" ]]; then
EXTRA_ARGS="--enforce-eager"
PD_EXTRA_ARGS="--max-model-len 4096 --kv-cache-memory-bytes $PD_KV_CACHE_BYTES --limit-mm-per-prompt {\"image\":1,\"video\":0,\"audio\":0}"
PD_EXTRA_ARGS="--max-model-len 4096 --kv-cache-memory-bytes $PD_KV_CACHE_BYTES --limit-mm-per-prompt {\"image\":3,\"video\":3,\"audio\":0}"
fi
# Start encode worker
......
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
set -ex

# Resolve this script's real location so the shared helpers can be sourced
# no matter which directory the caller is in.
SCRIPT_DIR="$(dirname "$(readlink -f "$0")")"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"

# Default values
HEAD_NODE=0
MODEL_NAME="meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8"
EXTRA_ARGS=()

# Parse command line arguments.
# Anything not recognized here is collected in EXTRA_ARGS and forwarded
# verbatim to the vLLM worker started on this node.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --head-node)
      HEAD_NODE=1
      shift 1
      ;;
    --model)
      # Fail with a readable message instead of letting `shift 2` abort
      # silently under `set -e` when the value is missing.
      if [[ $# -lt 2 ]]; then
        echo "Error: --model requires a value" >&2
        exit 1
      fi
      MODEL_NAME=$2
      shift 2
      ;;
    -h|--help)
      echo "Usage: $0 [OPTIONS]"
      echo ""
      echo "Disaggregated multimodal serving with separate Prefill/Decode workers for Llama 4"
      echo ""
      echo "Options:"
      echo " --head-node Run as head node. Head node will run the HTTP server, processor and prefill worker."
      echo " --model <model_name> Specify the VLM model to use (default: $MODEL_NAME)"
      echo " -h, --help Show this help message"
      echo ""
      echo "Examples:"
      echo " # On head node:"
      echo " $0 --head-node"
      echo ""
      echo " # On worker node (requires NATS_SERVER and ETCD_ENDPOINTS pointing to head node):"
      echo " $0"
      echo ""
      exit 0
      ;;
    *)
      EXTRA_ARGS+=("$1")
      shift
      ;;
  esac
done

# Installed only after argument parsing so that --help and usage errors do not
# signal the whole process group.
trap 'echo Cleaning up...; kill 0' EXIT

HTTP_PORT="${DYN_HTTP_PORT:-8000}"
if [[ $HEAD_NODE -eq 1 ]]; then
  print_launch_banner --multimodal "Launching Disaggregated Multimodal Llama 4 (Multi-Node)" "$MODEL_NAME" "$HTTP_PORT"
else
  print_launch_banner --no-curl "Launching Disaggregated Multimodal Llama 4 (Multi-Node)" "$MODEL_NAME" "$HTTP_PORT"
fi

# Use TCP transport to avoid NATS payload limits for multimodal
export DYN_REQUEST_PLANE=tcp

# Configure model-specific args.
# Flag lists are kept in arrays so every flag/value reaches the worker as
# exactly one argument, with no re-splitting or glob expansion.
GPU_MEM="0.80"
KV_BYTES="${_PROFILE_OVERRIDE_VLLM_KV_CACHE_BYTES:-}"
if [[ -n "$KV_BYTES" ]]; then
  # An explicit KV cache size was supplied: pass it through and drop the
  # utilization fraction to 0.01.
  GPU_MEM_ARGS=(--kv-cache-memory-bytes "$KV_BYTES" --gpu-memory-utilization 0.01)
else
  GPU_MEM_ARGS=(--gpu-memory-utilization "$GPU_MEM")
fi

# Only the default Maverick model gets extra flags (including the GPU memory
# flags above); any other model runs with worker defaults plus EXTRA_ARGS.
MODEL_SPECIFIC_ARGS=()
if [[ "$MODEL_NAME" == "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8" ]]; then
  MODEL_SPECIFIC_ARGS=(--tensor-parallel-size=8 --max-model-len=208960 "${GPU_MEM_ARGS[@]}")
fi

if [[ $HEAD_NODE -eq 1 ]]; then
  # run ingress
  # dynamo.frontend accepts either --http-port flag or DYN_HTTP_PORT env var (defaults to 8000)
  python -m dynamo.frontend &

  # run processor (CPU-only to avoid competing for GPU memory with workers)
  CUDA_VISIBLE_DEVICES="" \
  python -m dynamo.vllm --route-to-encoder --enable-multimodal --model "$MODEL_NAME" &

  # Prefill worker handles prompt processing and image encoding
  # Uses all 8 GPUs for tensor-parallel
  CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  VLLM_NIXL_SIDE_CHANNEL_PORT=20097 \
  python -m dynamo.vllm \
    --enable-multimodal \
    --model "$MODEL_NAME" \
    --disaggregation-mode prefill \
    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
    "${MODEL_SPECIFIC_ARGS[@]}" \
    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20080"}' \
    "${EXTRA_ARGS[@]}" &
else
  # run decode worker on non-head node
  # Uses all 8 GPUs for tensor-parallel
  # NOTE(review): no --disaggregation-mode flag is passed here, unlike the
  # prefill worker above; confirm that is intended.
  CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 \
  VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
  python -m dynamo.vllm \
    --enable-multimodal \
    --model "$MODEL_NAME" \
    --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
    "${MODEL_SPECIFIC_ARGS[@]}" \
    --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' \
    "${EXTRA_ARGS[@]}" &
fi

# Exit on first worker failure; kill 0 in the EXIT trap tears down the rest
wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Aggregated video serving with standard Dynamo preprocessing and vLLM backend.
set -euo pipefail
# Terminate every background job of this shell that is still running.
# Outputs: writes "Cleaning up..." to stdout.
# Returns: always 0; errors from kill (e.g. a job that already exited) are
#          deliberately ignored because this is best-effort teardown.
cleanup() {
  local running
  echo "Cleaning up..."
  running="$(jobs -pr)"
  if [[ -z "$running" ]]; then
    return 0
  fi
  # One PID per line: word-splitting is intended so each PID becomes its own
  # argument to kill.
  # shellcheck disable=SC2086
  kill $running 2>/dev/null || true
}
# Tear down background jobs on every exit path (normal exit, error, signal).
trap cleanup EXIT

# Locate the script and repository root, then load the shared helpers.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../../../.." && pwd)"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"

# Prepend the in-repo Python sources, preserving any PYTHONPATH already set.
export PYTHONPATH="${REPO_ROOT}/components/src:${REPO_ROOT}/lib/bindings/python/src${PYTHONPATH:+:${PYTHONPATH}}"

# Defaults; each can be overridden through the environment.
MODEL_NAME="${DYN_MODEL_NAME:-Qwen/Qwen3-VL-2B-Instruct}"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
GPU_DEVICE="${CUDA_VISIBLE_DEVICES:-0}"
MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}"
MAX_NUM_SEQS="${MAX_NUM_SEQS:-2}"
EXTRA_ARGS=()

# Parse command line arguments. Unrecognized arguments, and everything after
# '--', are forwarded to the vLLM worker.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --model)
      # Without this check a missing value aborts with a bare
      # "unbound variable" error under `set -u`.
      if [[ $# -lt 2 ]]; then
        echo "Error: --model requires a value" >&2
        exit 1
      fi
      MODEL_NAME=$2
      shift 2
      ;;
    -h|--help)
      cat <<USAGE
Usage: $0 [OPTIONS] [-- EXTRA_VLLM_ARGS]
Options:
--model <model_name> Video-capable VLM to serve (default: $MODEL_NAME)
-h, --help Show this help message
Any arguments after '--' are passed through to the vLLM worker.
USAGE
      exit 0
      ;;
    --)
      shift
      EXTRA_ARGS+=("$@")
      break
      ;;
    *)
      EXTRA_ARGS+=("$1")
      shift
      ;;
  esac
done

export DYN_REQUEST_PLANE=tcp

# Presumably yields zero or more vLLM CLI flags as a single string; confirm
# against gpu_utils.sh. It is expanded unquoted in the worker command below.
GPU_MEM_ARGS=$(build_gpu_mem_args vllm)

print_launch_banner --no-curl "Launching Aggregated Video Serving" "$MODEL_NAME" "$HTTP_PORT" \
  "Backend: dynamo.vllm --enable-multimodal" \
  "Video path: Standard TokensPrompt multi_modal_data flow"

# Unquoted here-doc: ${HTTP_PORT} and ${MODEL_NAME} are expanded and each
# doubled backslash is emitted as a single line-continuation backslash.
print_curl_footer <<CURL
curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
-H 'Content-Type: application/json' \\
-d '{
"model": "${MODEL_NAME}",
"messages": [{"role": "user", "content": [
{"type": "text", "text": "Describe the video in detail"},
{"type": "video_url", "video_url": {"url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"}}
]}],
"max_tokens": 128
}'
CURL

python -m dynamo.frontend &

# $GPU_MEM_ARGS is intentionally unquoted: it holds several whitespace-separated
# flags that must reach the worker as separate arguments.
# shellcheck disable=SC2086
CUDA_VISIBLE_DEVICES="$GPU_DEVICE" \
python -m dynamo.vllm \
  --enable-multimodal \
  --model "$MODEL_NAME" \
  --max-model-len "$MAX_MODEL_LEN" \
  --max-num-seqs "$MAX_NUM_SEQS" \
  $GPU_MEM_ARGS \
  "${EXTRA_ARGS[@]}" &

wait_any_exit
#!/bin/bash
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Disaggregated video serving with standard Dynamo preprocessing and vLLM backend.
set -euo pipefail
# Best-effort teardown: send the default kill signal to all still-running
# background jobs of this shell.
# Outputs: "Cleaning up..." on stdout.
# Returns: 0 in all cases; failures from kill are suppressed on purpose.
cleanup() {
  echo "Cleaning up..."
  local live_jobs
  live_jobs="$(jobs -pr)"
  # Nothing left running: no signal to send.
  [[ -n "$live_jobs" ]] || return 0
  # PIDs arrive newline-separated; leave the expansion unquoted so each one is
  # passed to kill as a separate argument.
  # shellcheck disable=SC2086
  kill $live_jobs 2>/dev/null || true
}
# Tear down background jobs on every exit path (normal exit, error, signal).
trap cleanup EXIT

# Locate the script and repository root, then load the shared helpers.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
REPO_ROOT="$(cd "$SCRIPT_DIR/../../../.." && pwd)"
source "$SCRIPT_DIR/../../../common/gpu_utils.sh"
source "$SCRIPT_DIR/../../../common/launch_utils.sh"

# Prepend the in-repo Python sources, preserving any PYTHONPATH already set.
export PYTHONPATH="${REPO_ROOT}/components/src:${REPO_ROOT}/lib/bindings/python/src${PYTHONPATH:+:${PYTHONPATH}}"

# Defaults; each can be overridden through the environment.
MODEL_NAME="${DYN_MODEL_NAME:-Qwen/Qwen3-VL-2B-Instruct}"
HTTP_PORT="${DYN_HTTP_PORT:-8000}"
SINGLE_GPU=false
EXTRA_ARGS=()

# Parse command line arguments. Unrecognized arguments, and everything after
# '--', are forwarded to both vLLM workers.
while [[ $# -gt 0 ]]; do
  case "$1" in
    --model)
      # Without this check a missing value aborts with a bare
      # "unbound variable" error under `set -u`.
      if [[ $# -lt 2 ]]; then
        echo "Error: --model requires a value" >&2
        exit 1
      fi
      MODEL_NAME=$2
      shift 2
      ;;
    --single-gpu)
      SINGLE_GPU=true
      shift
      ;;
    -h|--help)
      cat <<USAGE
Usage: $0 [OPTIONS] [-- EXTRA_VLLM_ARGS]
Options:
--model <model_name> Video-capable VLM to serve (default: $MODEL_NAME)
--single-gpu Run prefill and decode on one GPU for functional testing
-h, --help Show this help message
Any arguments after '--' are passed through to both vLLM workers.
USAGE
      exit 0
      ;;
    --)
      shift
      EXTRA_ARGS+=("$@")
      break
      ;;
    *)
      EXTRA_ARGS+=("$1")
      shift
      ;;
  esac
done

export DYN_REQUEST_PLANE=tcp

if [[ "$SINGLE_GPU" == "true" ]]; then
  # Both workers share one device: eager mode, a short context, a fixed
  # 512 MiB KV cache and a per-prompt media limit.
  GPU_LABEL="1 GPU"
  PREFILL_GPU="${DYN_PREFILL_WORKER_GPU:-${CUDA_VISIBLE_DEVICES:-0}}"
  DECODE_GPU="${DYN_DECODE_WORKER_GPU:-${CUDA_VISIBLE_DEVICES:-0}}"
  MAX_MODEL_LEN="${MAX_MODEL_LEN:-4096}"
  PD_KV_CACHE_BYTES=$((512 * 1024 * 1024))
  SHARED_GPU_FRACTION=$(build_gpu_mem_args vllm --workers-per-gpu 2)
  HELPER_GPU_MEM="$SHARED_GPU_FRACTION"
  DEFAULT_GPU_MEM="0.45"
  SHARED_ARGS=(
    --enforce-eager
    --max-model-len "$MAX_MODEL_LEN"
    --kv-cache-memory-bytes "$PD_KV_CACHE_BYTES"
    --limit-mm-per-prompt '{"image":1,"video":1,"audio":0}'
  )
else
  GPU_LABEL="2 GPUs"
  PREFILL_GPU="${DYN_PREFILL_WORKER_GPU:-0}"
  DECODE_GPU="${DYN_DECODE_WORKER_GPU:-1}"
  MAX_MODEL_LEN="${MAX_MODEL_LEN:-8192}"
  GPU_MEM_ARGS=$(build_gpu_mem_args vllm)
  HELPER_GPU_MEM="$GPU_MEM_ARGS"
  DEFAULT_GPU_MEM="0.9"
  SHARED_ARGS=(--max-model-len "$MAX_MODEL_LEN")
fi

# Turn the helper output into worker flags.
# NOTE(review): build_gpu_mem_args is defined in gpu_utils.sh and not visible
# here. If it emits complete CLI flags, using its output as the numeric value
# of --gpu-memory-utilization would hand vLLM an invalid fraction. Both shapes
# are accepted: output starting with '-' is forwarded as flags, anything else
# is treated as a bare fraction, falling back to the default when empty.
HELPER_MEM_ARGS=()
if [[ "$HELPER_GPU_MEM" =~ ^[[:space:]]*- ]]; then
  # read returns non-zero at end of input with -d ''; the array is still filled.
  read -r -d '' -a HELPER_MEM_ARGS <<<"$HELPER_GPU_MEM" || true
else
  HELPER_MEM_ARGS=(--gpu-memory-utilization "${HELPER_GPU_MEM:-$DEFAULT_GPU_MEM}")
fi

# An explicit per-worker fraction from the environment always wins.
PREFILL_MEM_ARGS=("${HELPER_MEM_ARGS[@]}")
if [[ -n "${DYN_PREFILL_GPU_MEM:-}" ]]; then
  PREFILL_MEM_ARGS=(--gpu-memory-utilization "$DYN_PREFILL_GPU_MEM")
fi
DECODE_MEM_ARGS=("${HELPER_MEM_ARGS[@]}")
if [[ -n "${DYN_DECODE_GPU_MEM:-}" ]]; then
  DECODE_MEM_ARGS=(--gpu-memory-utilization "$DYN_DECODE_GPU_MEM")
fi

print_launch_banner --no-curl "Launching Disaggregated Video Serving ($GPU_LABEL)" "$MODEL_NAME" "$HTTP_PORT" \
  "Backend: Prefill + decode workers via dynamo.vllm" \
  "Video path: Standard TokensPrompt multi_modal_data flow"

# Unquoted here-doc: ${HTTP_PORT} and ${MODEL_NAME} are expanded and each
# doubled backslash is emitted as a single line-continuation backslash.
print_curl_footer <<CURL
curl http://localhost:${HTTP_PORT}/v1/chat/completions \\
-H 'Content-Type: application/json' \\
-d '{
"model": "${MODEL_NAME}",
"messages": [{"role": "user", "content": [
{"type": "text", "text": "Describe the video in detail"},
{"type": "video_url", "video_url": {"url": "https://storage.googleapis.com/gtv-videos-bucket/sample/BigBuckBunny.mp4"}}
]}],
"max_tokens": 128
}'
CURL

python -m dynamo.frontend &

# Prefill worker. Memory flags come before SHARED_ARGS, in the position the
# --gpu-memory-utilization flag has always occupied.
VLLM_NIXL_SIDE_CHANNEL_PORT=20098 \
CUDA_VISIBLE_DEVICES="$PREFILL_GPU" \
python -m dynamo.vllm \
  --disaggregation-mode prefill \
  --enable-multimodal \
  --model "$MODEL_NAME" \
  "${PREFILL_MEM_ARGS[@]}" \
  "${SHARED_ARGS[@]}" \
  --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20081"}' \
  "${EXTRA_ARGS[@]}" &

# Decode worker: its own side-channel port and KV-events endpoint so the two
# workers do not collide when they run on the same host.
VLLM_NIXL_SIDE_CHANNEL_PORT=20099 \
CUDA_VISIBLE_DEVICES="$DECODE_GPU" \
python -m dynamo.vllm \
  --disaggregation-mode decode \
  --enable-multimodal \
  --model "$MODEL_NAME" \
  "${DECODE_MEM_ARGS[@]}" \
  "${SHARED_ARGS[@]}" \
  --kv-transfer-config '{"kv_connector":"NixlConnector","kv_role":"kv_both"}' \
  --kv-events-config '{"publisher":"zmq","topic":"kv-events","endpoint":"tcp://*:20082"}' \
  "${EXTRA_ARGS[@]}" &

wait_any_exit
......@@ -428,14 +428,6 @@ vllm_configs = {
model="Qwen/Qwen3-VL-2B-Instruct",
script_args=["--model", "Qwen/Qwen3-VL-2B-Instruct", "--single-gpu"],
timeout=300,
env={
"DYN_ENCODE_WORKER_GPU": "0",
"DYN_PREFILL_WORKER_GPU": "0",
"DYN_DECODE_WORKER_GPU": "0",
"DYN_ENCODE_GPU_MEM": "0.1",
"DYN_PREFILL_GPU_MEM": "0.4",
"DYN_DECODE_GPU_MEM": "0.4",
},
request_payloads=[
chat_payload(
[
......@@ -536,11 +528,11 @@ vllm_configs = {
),
],
),
# Video multimodal tests for CI using the vLLM video launch scripts.
# Video multimodal tests for CI use the canonical aggregated multimodal launcher.
"multimodal_video_agg": VLLMConfig(
name="multimodal_video_agg",
directory=vllm_dir,
script_name="video_agg.sh",
script_name="agg_multimodal.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
......@@ -568,7 +560,7 @@ vllm_configs = {
"multimodal_video_disagg": VLLMConfig(
name="multimodal_video_disagg",
directory=vllm_dir,
script_name="video_disagg.sh",
script_name="disagg_multimodal_epd.sh",
marks=[
pytest.mark.gpu_1,
pytest.mark.pre_merge,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment