Unverified commit 64574a3e, authored by Alec, committed by GitHub
Browse files

chore(deps): bump vLLM to 0.16.0 (#6614)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent 3659b76a
......@@ -487,7 +487,9 @@ def setup_vllm_engine(config, stat_logger=None):
"Continuing without KV event consolidation. "
"Ensure 'kvbm' package is installed if this feature is needed."
)
vllm_config.consolidator_endpoints = consolidator_endpoints
# Store consolidator endpoints in additional_config (vLLM 0.16+ uses strict
# dataclass fields; monkey-patching attributes onto VllmConfig is no longer safe).
vllm_config.additional_config["consolidator_endpoints"] = consolidator_endpoints
factory = []
if stat_logger:
......@@ -646,13 +648,11 @@ async def init_prefill(
consolidator_enabled = False
consolidator_port = None
if (
hasattr(vllm_config, "consolidator_endpoints")
and vllm_config.consolidator_endpoints
):
_consolidator_eps = vllm_config.additional_config.get("consolidator_endpoints")
if _consolidator_eps:
# Extract connect endpoint (third element) for clients to subscribe
# consolidator_endpoints = (vllm_endpoint, bind_endpoint, connect_endpoint)
consolidator_output_endpoint = vllm_config.consolidator_endpoints[2]
consolidator_output_endpoint = _consolidator_eps[2]
consolidator_port = int(consolidator_output_endpoint.split(":")[-1])
consolidator_enabled = True
......@@ -831,13 +831,11 @@ async def init(
consolidator_enabled = False
consolidator_port = None
if (
hasattr(vllm_config, "consolidator_endpoints")
and vllm_config.consolidator_endpoints
):
_consolidator_eps = vllm_config.additional_config.get("consolidator_endpoints")
if _consolidator_eps:
# Extract connect endpoint (third element) for clients to subscribe
# consolidator_endpoints = (vllm_endpoint, bind_endpoint, connect_endpoint)
consolidator_output_endpoint = vllm_config.consolidator_endpoints[2]
consolidator_output_endpoint = _consolidator_eps[2]
consolidator_port = int(consolidator_output_endpoint.split(":")[-1])
consolidator_enabled = True
......
......@@ -27,7 +27,6 @@ class BaseOmniHandler(BaseWorkerHandler):
def __init__(
self,
runtime,
component,
config,
default_sampling_params: Dict[str, Any],
shutdown_event: asyncio.Event | None = None,
......@@ -36,7 +35,6 @@ class BaseOmniHandler(BaseWorkerHandler):
Args:
runtime: Dynamo distributed runtime.
component: Dynamo component handle.
config: Parsed Config object from args.py.
default_sampling_params: Default sampling parameters dict.
shutdown_event: Optional asyncio event for graceful shutdown.
......@@ -56,7 +54,6 @@ class BaseOmniHandler(BaseWorkerHandler):
# TODO: Kv publishers not supported yet
# TODO: Adapt to baseworker initialization pattern
self.runtime = runtime
self.component = component
self.default_sampling_params = default_sampling_params
self.config = config
self.model_max_len = config.engine_args.max_model_len
......
......@@ -144,7 +144,6 @@ class TestVllmRendererApi:
"role",
"content",
"reasoning",
"reasoning_content",
"tool_calls",
}
actual_fields = set(DeltaMessage.model_fields)
......@@ -376,6 +375,7 @@ class TestVllmRendererApi:
"trace_headers",
"resumable",
"external_req_id",
"reasoning_ended",
)
# vllm-omni monkey-patches EngineCoreRequest with an extra field
# (only installed on amd64, not arm64)
......@@ -401,6 +401,7 @@ class TestVllmRendererApi:
"kv_transfer_params",
"trace_headers",
"num_cached_tokens",
"num_external_computed_tokens",
"routed_experts",
"num_nans_in_logits",
)
......
......@@ -42,10 +42,10 @@ vllm:
cuda13.0:
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag: 13.0.2-runtime-ubuntu24.04
vllm_ref: v0.15.1
flashinf_ref: v0.6.1
lmcache_ref: 0.3.13
vllm_omni_ref: "0.14.0"
vllm_ref: v0.16.0
flashinf_ref: v0.6.3
lmcache_ref: 0.3.14
vllm_omni_ref: "v0.16.0rc1"
max_jobs: "10"
enable_media_ffmpeg: "false"
enable_gpu_memory_service: "true"
......
......@@ -12,7 +12,7 @@
set -euo pipefail
VLLM_VER="0.15.1"
VLLM_VER="0.16.0"
VLLM_REF="v${VLLM_VER}"
# Basic Configurations
......@@ -24,9 +24,9 @@ INSTALLATION_DIR=/tmp
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX
DEEPGEMM_REF=""
CUDA_VERSION="12.9"
FLASHINF_REF="v0.6.1"
LMCACHE_REF="0.3.13"
VLLM_OMNI_REF="0.14.0"
FLASHINF_REF="v0.6.3"
LMCACHE_REF="0.3.14"
VLLM_OMNI_REF="v0.16.0rc1"
while [[ $# -gt 0 ]]; do
case $1 in
......@@ -146,25 +146,46 @@ echo "✓ vLLM repository cloned"
echo "\n=== Installing vLLM & FlashInfer ==="
# Build GitHub release wheel URL per CUDA version
# CUDA 12 wheels have no +cu suffix and use manylinux_2_31
# CUDA 13 wheels have +cu130 suffix and use manylinux_2_35
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
echo "Installing vLLM $VLLM_REF from PyPI..."
uv pip install vllm[flashinfer,runai]==$VLLM_REF --torch-backend=${TORCH_BACKEND}
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}-cp38-abi3-manylinux_2_31_${ALT_ARCH}.whl"
EXTRA_PIP_ARGS=""
elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then
echo "⚠ Skipping LMCache on CUDA 13 env since LMCache doesn't support CUDA 13 "
echo "Installing vLLM $VLLM_REF from GitHub since CUDA 13 x86_64 wheel is only present on GitHub..."
uv pip install \
--index-strategy=unsafe-best-match \
--extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND} \
https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl[flashinfer,runai] \
--torch-backend=${TORCH_BACKEND}
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl"
EXTRA_PIP_ARGS="--index-strategy=unsafe-best-match --extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND}"
else
echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}"
exit 1
fi
VLLM_GITHUB_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/${VLLM_GITHUB_WHEEL}"
# Install vLLM wheel
# CUDA 12: Try PyPI first, fall back to GitHub release
# CUDA 13: Always use GitHub release (PyPI only has cu12 wheels, --torch-backend
# does not prevent uv from resolving the cu12 variant)
echo "Installing vLLM $VLLM_VER (torch backend: $TORCH_BACKEND)..."
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
if uv pip install "vllm[flashinfer,runai]==${VLLM_VER}" ${EXTRA_PIP_ARGS} --torch-backend=${TORCH_BACKEND} 2>&1; then
echo "✓ vLLM ${VLLM_VER} installed from PyPI"
else
echo "⚠ PyPI install failed, installing from GitHub release..."
uv pip install ${EXTRA_PIP_ARGS} \
"${VLLM_GITHUB_URL}[flashinfer,runai]" \
--torch-backend=${TORCH_BACKEND}
echo "✓ vLLM ${VLLM_VER} installed from GitHub"
fi
else
echo "Installing vLLM from GitHub release (cu130 wheel not available on PyPI)..."
uv pip install ${EXTRA_PIP_ARGS} \
"${VLLM_GITHUB_URL}[flashinfer,runai]" \
--torch-backend=${TORCH_BACKEND}
echo "✓ vLLM ${VLLM_VER} installed from GitHub"
fi
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
echo "✓ vLLM installation completed"
echo "\n=== Installing vLLM-Omni ==="
......@@ -172,10 +193,19 @@ if [ -n "$VLLM_OMNI_REF" ] && [ "$ARCH" = "amd64" ]; then
# Save original vllm entrypoint before vllm-omni overwrites it
VLLM_BIN=$(which vllm)
cp "$VLLM_BIN" /tmp/vllm-entrypoint-backup
uv pip install vllm-omni==${VLLM_OMNI_REF}
# Try PyPI first, fall back to building from source
if uv pip install vllm-omni==${VLLM_OMNI_REF#v} 2>&1; then
echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from PyPI"
else
echo "⚠ PyPI install failed, building from source..."
git clone --depth 1 --branch ${VLLM_OMNI_REF} https://github.com/vllm-project/vllm-omni.git $INSTALLATION_DIR/vllm-omni
uv pip install $INSTALLATION_DIR/vllm-omni
rm -rf $INSTALLATION_DIR/vllm-omni
echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from source"
fi
# Restore original vllm CLI entrypoint (vllm-omni replaces it with its own)
cp /tmp/vllm-entrypoint-backup "$VLLM_BIN"
echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed (original vllm entrypoint preserved)"
echo "✓ Original vllm entrypoint preserved"
else
echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)"
fi
......
......@@ -10,6 +10,14 @@ Dynamo supports multimodal generation through the [vLLM-Omni](https://github.com
This guide assumes familiarity with deploying Dynamo with vLLM as described in the [vLLM backend guide](/docs/pages/backends/vllm/README.md).
### Installation
Dynamo container images include vLLM-Omni pre-installed. If you are using `pip install ai-dynamo[vllm]`, vLLM-Omni is **not** included automatically because the matching release is not yet available on PyPI. Install it separately from source:
```bash
pip install git+https://github.com/vllm-project/vllm-omni.git@v0.16.0rc1
```
## Supported Modalities
| Modality | Endpoint(s) | `--output-modalities` |
......
......@@ -14,7 +14,7 @@ The following table shows the backend framework versions included with each Dyna
| **Dynamo** | **SGLang** | **TensorRT-LLM** | **vLLM** | **NIXL** |
| :--- | :--- | :--- | :--- | :--- |
| **main (ToT)** | `0.5.9` | `1.3.0rc5` | `0.15.1` | `0.10.0` |
| **main (ToT)** | `0.5.9` | `1.3.0rc5` | `0.16.0` | `0.10.0` |
| **v1.0.0** *(in progress)* | `0.5.9` | `1.3.0rc5` | `0.15.1` | `0.10.1` |
| **v0.9.1** | `0.5.8` | `1.3.0rc3` | `0.14.1` | `0.9.0` |
| **v0.9.0** | `0.5.8` | `1.3.0rc1` | `0.14.1` | `0.9.0` |
......
......@@ -74,10 +74,8 @@ class KvConnectorLeader:
consolidator_output_endpoint = None
self._consolidator_output_port = None
if (
hasattr(vllm_config, "consolidator_endpoints")
and vllm_config.consolidator_endpoints
):
_consolidator_eps = vllm_config.additional_config.get("consolidator_endpoints")
if _consolidator_eps:
# Unpack all three endpoints
# [0]: vllm_endpoint (for consolidator to subscribe to vLLM)
# [1]: output_bind_endpoint (for consolidator to bind/publish)
......@@ -86,7 +84,7 @@ class KvConnectorLeader:
consolidator_vllm_endpoint,
consolidator_output_endpoint,
_consolidator_output_connect_endpoint, # Not needed here
) = vllm_config.consolidator_endpoints
) = _consolidator_eps
self._consolidator_output_port = int(
consolidator_output_endpoint.split(":")[-1]
)
......
......@@ -57,8 +57,11 @@ trtllm =[
vllm = [
"uvloop",
"nixl[cu12]<=0.9.0",
"vllm[flashinfer,runai]==0.15.1",
"vllm-omni==0.14.0",
"vllm[flashinfer,runai]==0.16.0",
# vllm-omni 0.16.0rc1 is not on PyPI; installed from source in container builds
# (see container/deps/vllm/install_vllm.sh). pip install ai-dynamo[vllm] will
# not include vllm-omni — install it separately from source if needed.
# "vllm-omni==0.16.0rc1",
"blake3>=1.0.0,<2.0.0",
]
......@@ -188,6 +191,7 @@ filterwarnings = [
"ignore:.*Exception ignored in.*:pytest.PytestUnraisableExceptionWarning", # Ignore unraisable exception warnings
"ignore:The pynvml package is deprecated.*:FutureWarning", # Ignore pynvml deprecation warning, temporary until upstream library updates to nvidia-ml-py
"ignore:The behavior of DataFrame concatenation with empty or all-NA entries is deprecated.*:FutureWarning", # pandas 2.x concat deprecation in AIC SDK TODO: fix in AIC
"ignore:Automatic KV events configuration is deprecated.*:FutureWarning", # Ignore Dynamo's own KV events deprecation warning in tests
# Pydantic V2 deprecation warnings from TRTLLM dependencies (raised at import time during collection)
"ignore:Support for class-based `config`.*:pydantic.warnings.PydanticDeprecatedSince20",
"ignore:Using extra keyword arguments on `Field`.*:pydantic.warnings.PydanticDeprecatedSince20",
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment