Unverified commit 64574a3e, authored by Alec and committed by GitHub
Browse files

chore(deps): bump vLLM to 0.16.0 (#6614)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
parent 3659b76a
...@@ -487,7 +487,9 @@ def setup_vllm_engine(config, stat_logger=None): ...@@ -487,7 +487,9 @@ def setup_vllm_engine(config, stat_logger=None):
"Continuing without KV event consolidation. " "Continuing without KV event consolidation. "
"Ensure 'kvbm' package is installed if this feature is needed." "Ensure 'kvbm' package is installed if this feature is needed."
) )
vllm_config.consolidator_endpoints = consolidator_endpoints # Store consolidator endpoints in additional_config (vLLM 0.16+ uses strict
# dataclass fields; monkey-patching attributes onto VllmConfig is no longer safe).
vllm_config.additional_config["consolidator_endpoints"] = consolidator_endpoints
factory = [] factory = []
if stat_logger: if stat_logger:
...@@ -646,13 +648,11 @@ async def init_prefill( ...@@ -646,13 +648,11 @@ async def init_prefill(
consolidator_enabled = False consolidator_enabled = False
consolidator_port = None consolidator_port = None
if ( _consolidator_eps = vllm_config.additional_config.get("consolidator_endpoints")
hasattr(vllm_config, "consolidator_endpoints") if _consolidator_eps:
and vllm_config.consolidator_endpoints
):
# Extract connect endpoint (third element) for clients to subscribe # Extract connect endpoint (third element) for clients to subscribe
# consolidator_endpoints = (vllm_endpoint, bind_endpoint, connect_endpoint) # consolidator_endpoints = (vllm_endpoint, bind_endpoint, connect_endpoint)
consolidator_output_endpoint = vllm_config.consolidator_endpoints[2] consolidator_output_endpoint = _consolidator_eps[2]
consolidator_port = int(consolidator_output_endpoint.split(":")[-1]) consolidator_port = int(consolidator_output_endpoint.split(":")[-1])
consolidator_enabled = True consolidator_enabled = True
...@@ -831,13 +831,11 @@ async def init( ...@@ -831,13 +831,11 @@ async def init(
consolidator_enabled = False consolidator_enabled = False
consolidator_port = None consolidator_port = None
if ( _consolidator_eps = vllm_config.additional_config.get("consolidator_endpoints")
hasattr(vllm_config, "consolidator_endpoints") if _consolidator_eps:
and vllm_config.consolidator_endpoints
):
# Extract connect endpoint (third element) for clients to subscribe # Extract connect endpoint (third element) for clients to subscribe
# consolidator_endpoints = (vllm_endpoint, bind_endpoint, connect_endpoint) # consolidator_endpoints = (vllm_endpoint, bind_endpoint, connect_endpoint)
consolidator_output_endpoint = vllm_config.consolidator_endpoints[2] consolidator_output_endpoint = _consolidator_eps[2]
consolidator_port = int(consolidator_output_endpoint.split(":")[-1]) consolidator_port = int(consolidator_output_endpoint.split(":")[-1])
consolidator_enabled = True consolidator_enabled = True
......
...@@ -27,7 +27,6 @@ class BaseOmniHandler(BaseWorkerHandler): ...@@ -27,7 +27,6 @@ class BaseOmniHandler(BaseWorkerHandler):
def __init__( def __init__(
self, self,
runtime, runtime,
component,
config, config,
default_sampling_params: Dict[str, Any], default_sampling_params: Dict[str, Any],
shutdown_event: asyncio.Event | None = None, shutdown_event: asyncio.Event | None = None,
...@@ -36,7 +35,6 @@ class BaseOmniHandler(BaseWorkerHandler): ...@@ -36,7 +35,6 @@ class BaseOmniHandler(BaseWorkerHandler):
Args: Args:
runtime: Dynamo distributed runtime. runtime: Dynamo distributed runtime.
component: Dynamo component handle.
config: Parsed Config object from args.py. config: Parsed Config object from args.py.
default_sampling_params: Default sampling parameters dict. default_sampling_params: Default sampling parameters dict.
shutdown_event: Optional asyncio event for graceful shutdown. shutdown_event: Optional asyncio event for graceful shutdown.
...@@ -56,7 +54,6 @@ class BaseOmniHandler(BaseWorkerHandler): ...@@ -56,7 +54,6 @@ class BaseOmniHandler(BaseWorkerHandler):
# TODO: Kv publishers not supported yet # TODO: Kv publishers not supported yet
# TODO: Adapt to baseworker initialization pattern # TODO: Adapt to baseworker initialization pattern
self.runtime = runtime self.runtime = runtime
self.component = component
self.default_sampling_params = default_sampling_params self.default_sampling_params = default_sampling_params
self.config = config self.config = config
self.model_max_len = config.engine_args.max_model_len self.model_max_len = config.engine_args.max_model_len
......
...@@ -144,7 +144,6 @@ class TestVllmRendererApi: ...@@ -144,7 +144,6 @@ class TestVllmRendererApi:
"role", "role",
"content", "content",
"reasoning", "reasoning",
"reasoning_content",
"tool_calls", "tool_calls",
} }
actual_fields = set(DeltaMessage.model_fields) actual_fields = set(DeltaMessage.model_fields)
...@@ -376,6 +375,7 @@ class TestVllmRendererApi: ...@@ -376,6 +375,7 @@ class TestVllmRendererApi:
"trace_headers", "trace_headers",
"resumable", "resumable",
"external_req_id", "external_req_id",
"reasoning_ended",
) )
# vllm-omni monkey-patches EngineCoreRequest with an extra field # vllm-omni monkey-patches EngineCoreRequest with an extra field
# (only installed on amd64, not arm64) # (only installed on amd64, not arm64)
...@@ -401,6 +401,7 @@ class TestVllmRendererApi: ...@@ -401,6 +401,7 @@ class TestVllmRendererApi:
"kv_transfer_params", "kv_transfer_params",
"trace_headers", "trace_headers",
"num_cached_tokens", "num_cached_tokens",
"num_external_computed_tokens",
"routed_experts", "routed_experts",
"num_nans_in_logits", "num_nans_in_logits",
) )
......
...@@ -42,10 +42,10 @@ vllm: ...@@ -42,10 +42,10 @@ vllm:
cuda13.0: cuda13.0:
base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04 base_image_tag: 25.11-cuda13.0-devel-ubuntu24.04
runtime_image_tag: 13.0.2-runtime-ubuntu24.04 runtime_image_tag: 13.0.2-runtime-ubuntu24.04
vllm_ref: v0.15.1 vllm_ref: v0.16.0
flashinf_ref: v0.6.1 flashinf_ref: v0.6.3
lmcache_ref: 0.3.13 lmcache_ref: 0.3.14
vllm_omni_ref: "0.14.0" vllm_omni_ref: "v0.16.0rc1"
max_jobs: "10" max_jobs: "10"
enable_media_ffmpeg: "false" enable_media_ffmpeg: "false"
enable_gpu_memory_service: "true" enable_gpu_memory_service: "true"
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
set -euo pipefail set -euo pipefail
VLLM_VER="0.15.1" VLLM_VER="0.16.0"
VLLM_REF="v${VLLM_VER}" VLLM_REF="v${VLLM_VER}"
# Basic Configurations # Basic Configurations
...@@ -24,9 +24,9 @@ INSTALLATION_DIR=/tmp ...@@ -24,9 +24,9 @@ INSTALLATION_DIR=/tmp
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels -- TODO: check if we need to add 12.0+PTX
DEEPGEMM_REF="" DEEPGEMM_REF=""
CUDA_VERSION="12.9" CUDA_VERSION="12.9"
FLASHINF_REF="v0.6.1" FLASHINF_REF="v0.6.3"
LMCACHE_REF="0.3.13" LMCACHE_REF="0.3.14"
VLLM_OMNI_REF="0.14.0" VLLM_OMNI_REF="v0.16.0rc1"
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
...@@ -146,25 +146,46 @@ echo "✓ vLLM repository cloned" ...@@ -146,25 +146,46 @@ echo "✓ vLLM repository cloned"
echo "\n=== Installing vLLM & FlashInfer ===" echo "\n=== Installing vLLM & FlashInfer ==="
# Build GitHub release wheel URL per CUDA version
# CUDA 12 wheels have no +cu suffix and use manylinux_2_31
# CUDA 13 wheels have +cu130 suffix and use manylinux_2_35
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
echo "Installing vLLM $VLLM_REF from PyPI..." VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}-cp38-abi3-manylinux_2_31_${ALT_ARCH}.whl"
uv pip install vllm[flashinfer,runai]==$VLLM_REF --torch-backend=${TORCH_BACKEND} EXTRA_PIP_ARGS=""
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then elif [[ "$CUDA_VERSION_MAJOR" == "13" ]]; then
echo "⚠ Skipping LMCache on CUDA 13 env since LMCache doesn't support CUDA 13 " VLLM_GITHUB_WHEEL="vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl"
echo "Installing vLLM $VLLM_REF from GitHub since CUDA 13 x86_64 wheel is only present on GitHub..." EXTRA_PIP_ARGS="--index-strategy=unsafe-best-match --extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND}"
uv pip install \
--index-strategy=unsafe-best-match \
--extra-index-url https://download.pytorch.org/whl/${TORCH_BACKEND} \
https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/vllm-${VLLM_VER}+${TORCH_BACKEND}-cp38-abi3-manylinux_2_35_${ALT_ARCH}.whl[flashinfer,runai] \
--torch-backend=${TORCH_BACKEND}
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
else else
echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}" echo "❌ Unsupported CUDA version for vLLM installation: ${CUDA_VERSION}"
exit 1 exit 1
fi fi
VLLM_GITHUB_URL="https://github.com/vllm-project/vllm/releases/download/v${VLLM_VER}/${VLLM_GITHUB_WHEEL}"
# Install vLLM wheel
# CUDA 12: Try PyPI first, fall back to GitHub release
# CUDA 13: Always use GitHub release (PyPI only has cu12 wheels, --torch-backend
# does not prevent uv from resolving the cu12 variant)
echo "Installing vLLM $VLLM_VER (torch backend: $TORCH_BACKEND)..."
if [[ "$CUDA_VERSION_MAJOR" == "12" ]]; then
if uv pip install "vllm[flashinfer,runai]==${VLLM_VER}" ${EXTRA_PIP_ARGS} --torch-backend=${TORCH_BACKEND} 2>&1; then
echo "✓ vLLM ${VLLM_VER} installed from PyPI"
else
echo "⚠ PyPI install failed, installing from GitHub release..."
uv pip install ${EXTRA_PIP_ARGS} \
"${VLLM_GITHUB_URL}[flashinfer,runai]" \
--torch-backend=${TORCH_BACKEND}
echo "✓ vLLM ${VLLM_VER} installed from GitHub"
fi
else
echo "Installing vLLM from GitHub release (cu130 wheel not available on PyPI)..."
uv pip install ${EXTRA_PIP_ARGS} \
"${VLLM_GITHUB_URL}[flashinfer,runai]" \
--torch-backend=${TORCH_BACKEND}
echo "✓ vLLM ${VLLM_VER} installed from GitHub"
fi
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
echo "✓ vLLM installation completed" echo "✓ vLLM installation completed"
echo "\n=== Installing vLLM-Omni ===" echo "\n=== Installing vLLM-Omni ==="
...@@ -172,10 +193,19 @@ if [ -n "$VLLM_OMNI_REF" ] && [ "$ARCH" = "amd64" ]; then ...@@ -172,10 +193,19 @@ if [ -n "$VLLM_OMNI_REF" ] && [ "$ARCH" = "amd64" ]; then
# Save original vllm entrypoint before vllm-omni overwrites it # Save original vllm entrypoint before vllm-omni overwrites it
VLLM_BIN=$(which vllm) VLLM_BIN=$(which vllm)
cp "$VLLM_BIN" /tmp/vllm-entrypoint-backup cp "$VLLM_BIN" /tmp/vllm-entrypoint-backup
uv pip install vllm-omni==${VLLM_OMNI_REF} # Try PyPI first, fall back to building from source
if uv pip install vllm-omni==${VLLM_OMNI_REF#v} 2>&1; then
echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from PyPI"
else
echo "⚠ PyPI install failed, building from source..."
git clone --depth 1 --branch ${VLLM_OMNI_REF} https://github.com/vllm-project/vllm-omni.git $INSTALLATION_DIR/vllm-omni
uv pip install $INSTALLATION_DIR/vllm-omni
rm -rf $INSTALLATION_DIR/vllm-omni
echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed from source"
fi
# Restore original vllm CLI entrypoint (vllm-omni replaces it with its own) # Restore original vllm CLI entrypoint (vllm-omni replaces it with its own)
cp /tmp/vllm-entrypoint-backup "$VLLM_BIN" cp /tmp/vllm-entrypoint-backup "$VLLM_BIN"
echo "✓ vLLM-Omni ${VLLM_OMNI_REF} installed (original vllm entrypoint preserved)" echo "✓ Original vllm entrypoint preserved"
else else
echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)" echo "⚠ Skipping vLLM-Omni (no ref provided or ARM64 not supported)"
fi fi
......
...@@ -10,6 +10,14 @@ Dynamo supports multimodal generation through the [vLLM-Omni](https://github.com ...@@ -10,6 +10,14 @@ Dynamo supports multimodal generation through the [vLLM-Omni](https://github.com
This guide assumes familiarity with deploying Dynamo with vLLM as described in the [vLLM backend guide](/docs/pages/backends/vllm/README.md). This guide assumes familiarity with deploying Dynamo with vLLM as described in the [vLLM backend guide](/docs/pages/backends/vllm/README.md).
### Installation
Dynamo container images include vLLM-Omni pre-installed. If you are using `pip install ai-dynamo[vllm]`, vLLM-Omni is **not** included automatically because the matching release is not yet available on PyPI. Install it separately from source:
```bash
pip install git+https://github.com/vllm-project/vllm-omni.git@v0.16.0rc1
```
## Supported Modalities ## Supported Modalities
| Modality | Endpoint(s) | `--output-modalities` | | Modality | Endpoint(s) | `--output-modalities` |
......
...@@ -14,7 +14,7 @@ The following table shows the backend framework versions included with each Dyna ...@@ -14,7 +14,7 @@ The following table shows the backend framework versions included with each Dyna
| **Dynamo** | **SGLang** | **TensorRT-LLM** | **vLLM** | **NIXL** | | **Dynamo** | **SGLang** | **TensorRT-LLM** | **vLLM** | **NIXL** |
| :--- | :--- | :--- | :--- | :--- | | :--- | :--- | :--- | :--- | :--- |
| **main (ToT)** | `0.5.9` | `1.3.0rc5` | `0.15.1` | `0.10.0` | | **main (ToT)** | `0.5.9` | `1.3.0rc5` | `0.16.0` | `0.10.0` |
| **v1.0.0** *(in progress)* | `0.5.9` | `1.3.0rc5` | `0.15.1` | `0.10.1` | | **v1.0.0** *(in progress)* | `0.5.9` | `1.3.0rc5` | `0.15.1` | `0.10.1` |
| **v0.9.1** | `0.5.8` | `1.3.0rc3` | `0.14.1` | `0.9.0` | | **v0.9.1** | `0.5.8` | `1.3.0rc3` | `0.14.1` | `0.9.0` |
| **v0.9.0** | `0.5.8` | `1.3.0rc1` | `0.14.1` | `0.9.0` | | **v0.9.0** | `0.5.8` | `1.3.0rc1` | `0.14.1` | `0.9.0` |
......
...@@ -74,10 +74,8 @@ class KvConnectorLeader: ...@@ -74,10 +74,8 @@ class KvConnectorLeader:
consolidator_output_endpoint = None consolidator_output_endpoint = None
self._consolidator_output_port = None self._consolidator_output_port = None
if ( _consolidator_eps = vllm_config.additional_config.get("consolidator_endpoints")
hasattr(vllm_config, "consolidator_endpoints") if _consolidator_eps:
and vllm_config.consolidator_endpoints
):
# Unpack all three endpoints # Unpack all three endpoints
# [0]: vllm_endpoint (for consolidator to subscribe to vLLM) # [0]: vllm_endpoint (for consolidator to subscribe to vLLM)
# [1]: output_bind_endpoint (for consolidator to bind/publish) # [1]: output_bind_endpoint (for consolidator to bind/publish)
...@@ -86,7 +84,7 @@ class KvConnectorLeader: ...@@ -86,7 +84,7 @@ class KvConnectorLeader:
consolidator_vllm_endpoint, consolidator_vllm_endpoint,
consolidator_output_endpoint, consolidator_output_endpoint,
_consolidator_output_connect_endpoint, # Not needed here _consolidator_output_connect_endpoint, # Not needed here
) = vllm_config.consolidator_endpoints ) = _consolidator_eps
self._consolidator_output_port = int( self._consolidator_output_port = int(
consolidator_output_endpoint.split(":")[-1] consolidator_output_endpoint.split(":")[-1]
) )
......
...@@ -57,8 +57,11 @@ trtllm =[ ...@@ -57,8 +57,11 @@ trtllm =[
vllm = [ vllm = [
"uvloop", "uvloop",
"nixl[cu12]<=0.9.0", "nixl[cu12]<=0.9.0",
"vllm[flashinfer,runai]==0.15.1", "vllm[flashinfer,runai]==0.16.0",
"vllm-omni==0.14.0", # vllm-omni 0.16.0rc1 is not on PyPI; installed from source in container builds
# (see container/deps/vllm/install_vllm.sh). pip install ai-dynamo[vllm] will
# not include vllm-omni — install it separately from source if needed.
# "vllm-omni==0.16.0rc1",
"blake3>=1.0.0,<2.0.0", "blake3>=1.0.0,<2.0.0",
] ]
...@@ -188,6 +191,7 @@ filterwarnings = [ ...@@ -188,6 +191,7 @@ filterwarnings = [
"ignore:.*Exception ignored in.*:pytest.PytestUnraisableExceptionWarning", # Ignore unraisable exception warnings "ignore:.*Exception ignored in.*:pytest.PytestUnraisableExceptionWarning", # Ignore unraisable exception warnings
"ignore:The pynvml package is deprecated.*:FutureWarning", # Ignore pynvml deprecation warning, temporary until upstream library updates to nvidia-ml-py "ignore:The pynvml package is deprecated.*:FutureWarning", # Ignore pynvml deprecation warning, temporary until upstream library updates to nvidia-ml-py
"ignore:The behavior of DataFrame concatenation with empty or all-NA entries is deprecated.*:FutureWarning", # pandas 2.x concat deprecation in AIC SDK TODO: fix in AIC "ignore:The behavior of DataFrame concatenation with empty or all-NA entries is deprecated.*:FutureWarning", # pandas 2.x concat deprecation in AIC SDK TODO: fix in AIC
"ignore:Automatic KV events configuration is deprecated.*:FutureWarning", # Ignore Dynamo's own KV events deprecation warning in tests
# Pydantic V2 deprecation warnings from TRTLLM dependencies (raised at import time during collection) # Pydantic V2 deprecation warnings from TRTLLM dependencies (raised at import time during collection)
"ignore:Support for class-based `config`.*:pydantic.warnings.PydanticDeprecatedSince20", "ignore:Support for class-based `config`.*:pydantic.warnings.PydanticDeprecatedSince20",
"ignore:Using extra keyword arguments on `Field`.*:pydantic.warnings.PydanticDeprecatedSince20", "ignore:Using extra keyword arguments on `Field`.*:pydantic.warnings.PydanticDeprecatedSince20",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment