chore: bump vLLM to 0.11.2 (#4476)

60feb955 · Karen Chung · GitHub · c5e8c4c2 · 60feb955 · 60feb955
Unverified Commit 60feb955 authored Dec 03, 2025 by Karen Chung Committed by GitHub Dec 03, 2025
20 changed files
--- a/.github/actions/pytest/action.yml
+++ b/.github/actions/pytest/action.yml
--- a/components/src/dynamo/vllm/args.py
+++ b/components/src/dynamo/vllm/args.py
@@ -207,6 +207,24 @@ def parse_args() -> Config:
    args = parser.parse_args()
    engine_args = AsyncEngineArgs.from_cli_args(args)

+    # Workaround for vLLM GIL contention bug with NIXL connector when using UniProcExecutor.
+    # With TP=1, vLLM defaults to UniProcExecutor which runs scheduler and worker in the same
+    # process. This causes a hot loop in _process_engine_step that doesn't release the GIL,
+    # blocking NIXL's add_remote_agent from completing. Using "mp" backend forces separate
+    # processes, avoiding the GIL contention.
+    # Note: Only apply for NIXL - other connectors (kvbm, lmcache) work fine with UniProcExecutor
+    # and forcing mp can expose race conditions in vLLM's scheduler.
+    # See: https://github.com/vllm-project/vllm/issues/29369
+    connector_list = [c.lower() for c in args.connector] if args.connector else []
+    uses_nixl = "nixl" in connector_list
+    tp_size = getattr(engine_args, "tensor_parallel_size", None) or 1
+    if uses_nixl and tp_size == 1 and engine_args.distributed_executor_backend is None:
+        logger.info(
+            "Setting --distributed-executor-backend=mp for TP=1 to avoid "
+            "UniProcExecutor GIL contention with NIXL connector"
+        )
+        engine_args.distributed_executor_backend = "mp"
+
    if engine_args.enable_prefix_caching is None:
        logger.debug(
            "--enable-prefix-caching or --no-enable-prefix-caching not specified. Defaulting to True (vLLM v1 default behavior)"

--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -11,17 +11,18 @@ ARG PYTHON_VERSION
 ARG ENABLE_KVBM

 ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
-ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
-ARG CUDA_VERSION="12.8"
+ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"
+ARG CUDA_VERSION="12.9"

 # Make sure to update the dependency version in pyproject.toml when updating this
-ARG VLLM_REF="v0.11.0"
-# FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
-ARG FLASHINF_REF="v0.3.1"
-ARG TORCH_BACKEND="cu128"
+ARG VLLM_REF="v0.11.2"
+# FlashInfer Ref used to install flashinfer-cubin and flashinfer-jit-cache
+ARG FLASHINF_REF="v0.5.2"

 # If left blank, then we will fallback to vLLM defaults
 ARG DEEPGEMM_REF=""
+# LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility
+ARG LMCACHE_REF="0.3.9.post2"

 # sccache configuration - inherit from base build
 ARG USE_SCCACHE
@@ -110,7 +111,7 @@ ARG VLLM_REF
 ARG VLLM_GIT_URL
 ARG DEEPGEMM_REF
 ARG FLASHINF_REF
-ARG TORCH_BACKEND
+ARG LMCACHE_REF
 ARG CUDA_VERSION

 ARG MAX_JOBS=16
@@ -144,7 +145,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
    export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
        cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
        chmod +x /tmp/install_vllm.sh && \
-        /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} --torch-backend $TORCH_BACKEND --cuda-version $CUDA_VERSION && \
+        /tmp/install_vllm.sh --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} ${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} --cuda-version $CUDA_VERSION && \
        /tmp/use-sccache.sh show-stats "vLLM";

 ENV LD_LIBRARY_PATH=\
@@ -236,7 +237,7 @@ RUN apt-get update && \
        # prometheus dependencies
        ca-certificates \
        # DeepGemm uses 'cuobjdump' which does not come with CUDA image
-        cuda-command-line-tools-12-8 && \
+        cuda-command-line-tools-12-9 && \
    rm -rf /var/lib/apt/lists/*

 USER dynamo

--- a/container/build.sh
+++ b/container/build.sh
@@ -106,7 +106,7 @@ VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 # Please check https://github.com/ai-dynamo/dynamo/pull/1065
 # for details and reproducer to manually test if the image
 # can be updated to later versions.
-VLLM_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
+VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04"

 NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
 NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"

--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -2,18 +2,16 @@
 # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-# This script is used to install vLLM and its dependencies
-# If installing vLLM from a release tag, we will use pip to manage the install
-# Otherwise, we will use git to checkout the vLLM source code and build it from source.
-# The dependencies are installed in the following order:
-# 1. vLLM
-# 2. LMCache
+# This script installs vLLM and its dependencies from PyPI (release versions only).
+# Installation order:
+# 1. LMCache (installed first so vLLM's dependencies take precedence)
+# 2. vLLM
 # 3. DeepGEMM
 # 4. EP kernels

 set -euo pipefail

-VLLM_REF="v0.11.0"
+VLLM_REF="v0.11.2"

 # Basic Configurations
 ARCH=$(uname -m)
@@ -21,34 +19,19 @@ MAX_JOBS=16
 INSTALLATION_DIR=/tmp

 # VLLM and Dependency Configurations
-TORCH_BACKEND="cu128"
 TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels
 DEEPGEMM_REF=""
-CUDA_VERSION="12.8" # For DEEPGEMM
-
-# These flags are applicable when installing vLLM from source code
-EDITABLE=true
-VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
-FLASHINF_REF="v0.3.1"
+CUDA_VERSION="12.9"
+FLASHINF_REF="v0.5.2"
+# LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility
+LMCACHE_REF="0.3.9.post2"

 while [[ $# -gt 0 ]]; do
    case $1 in
-        --editable)
-            EDITABLE=true
-            shift
-            ;;
-        --no-editable)
-            EDITABLE=false
-            shift
-            ;;
        --vllm-ref)
            VLLM_REF="$2"
            shift 2
            ;;
-        --vllm-git-url)
-            VLLM_GIT_URL="$2"
-            shift 2
-            ;;
        --max-jobs)
            MAX_JOBS="$2"
            shift 2
@@ -69,8 +52,8 @@ while [[ $# -gt 0 ]]; do
            FLASHINF_REF="$2"
            shift 2
            ;;
-        --torch-backend)
-            TORCH_BACKEND="$2"
+        --lmcache-ref)
+            LMCACHE_REF="$2"
            shift 2
            ;;
        --torch-cuda-arch-list)
@@ -82,19 +65,17 @@ while [[ $# -gt 0 ]]; do
            shift 2
            ;;
        -h|--help)
-            echo "Usage: $0 [--editable|--no-editable] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--torch-backend BACKEND] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
+            echo "Usage: $0 [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
            echo "Options:"
-            echo "  --editable        Install vllm in editable mode (default)"
-            echo "  --no-editable     Install vllm in non-editable mode"
-            echo "  --vllm-ref REF    Git reference to checkout (default: ${VLLM_REF})"
-            echo "  --max-jobs NUM    Maximum number of parallel jobs (default: ${MAX_JOBS})"
-            echo "  --arch ARCH       Architecture (amd64|arm64, default: auto-detect)"
-            echo "  --installation-dir DIR  Directory to install vllm (default: ${INSTALLATION_DIR})"
-            echo "  --deepgemm-ref REF  Git reference for DeepGEMM (default: ${DEEPGEMM_REF})"
-            echo "  --flashinf-ref REF  Git reference for Flash Infer (default: ${FLASHINF_REF})"
-            echo "  --torch-backend BACKEND  Torch backend to use (default: ${TORCH_BACKEND})"
-            echo "  --torch-cuda-arch-list LIST  CUDA architectures to compile for (default: ${TORCH_CUDA_ARCH_LIST})"
-            echo "  --cuda-version VERSION  CUDA version to use (default: ${CUDA_VERSION})"
+            echo "  --vllm-ref REF      vLLM release version (default: ${VLLM_REF})"
+            echo "  --max-jobs NUM      Maximum parallel jobs (default: ${MAX_JOBS})"
+            echo "  --arch ARCH         Architecture amd64|arm64 (default: auto-detect)"
+            echo "  --installation-dir DIR  Install directory (default: ${INSTALLATION_DIR})"
+            echo "  --deepgemm-ref REF  DeepGEMM git ref (default: ${DEEPGEMM_REF})"
+            echo "  --flashinf-ref REF  FlashInfer version (default: ${FLASHINF_REF})"
+            echo "  --lmcache-ref REF   LMCache version (default: ${LMCACHE_REF})"
+            echo "  --torch-cuda-arch-list LIST  CUDA architectures (default: ${TORCH_CUDA_ARCH_LIST})"
+            echo "  --cuda-version VERSION  CUDA version (default: ${CUDA_VERSION})"
            exit 0
            ;;
        *)
@@ -114,119 +95,43 @@ fi
 export MAX_JOBS=$MAX_JOBS
 export CUDA_HOME=/usr/local/cuda

+# Derive torch backend from CUDA version (e.g., "12.9" -> "cu129")
+TORCH_BACKEND="cu$(echo $CUDA_VERSION | tr -d '.')"
+
 echo "=== Installing prerequisites ==="
 uv pip install pip cuda-python

 echo "\n=== Configuration Summary ==="
-echo "  VLLM_REF=$VLLM_REF | EDITABLE=$EDITABLE | ARCH=$ARCH"
-echo "  MAX_JOBS=$MAX_JOBS | TORCH_BACKEND=$TORCH_BACKEND | CUDA_VERSION=$CUDA_VERSION"
-echo "  TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
-echo "  DEEPGEMM_REF=$DEEPGEMM_REF | FLASHINF_REF=$FLASHINF_REF"
-echo "  INSTALLATION_DIR=$INSTALLATION_DIR | VLLM_GIT_URL=$VLLM_GIT_URL"
+echo "  VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND"
+echo "  FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
+echo "  TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR"
+
+echo "\n=== Installing LMCache ==="
+if [ "$ARCH" = "amd64" ]; then
+    # LMCache installation currently fails on arm64 due to CUDA dependency issues
+    # Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
+    uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND}
+    echo "✓ LMCache ${LMCACHE_REF} installed"
+else
+    echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
+fi

 echo "\n=== Cloning vLLM repository ==="
-# We need to clone to install dependencies
+# Clone needed for DeepGEMM and EP kernels install scripts
 cd $INSTALLATION_DIR
-git clone $VLLM_GIT_URL vllm
+git clone https://github.com/vllm-project/vllm.git vllm
 cd vllm
 git checkout $VLLM_REF

-# TODO leave this here in case we need to do cherry-picks in future
-# GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064
-
 echo "\n=== Installing vLLM & FlashInfer ==="
+echo "Installing vLLM $VLLM_REF from PyPI..."

-if [[ $VLLM_REF =~ ^v ]] && { [ "$ARCH" = "amd64" ] || { [ "$ARCH" = "arm64" ] && [ "$TORCH_BACKEND" = "cu129" ]; }; }; then
-    # VLLM_REF starts with 'v' and either amd64, or arm64 with cu129 backend - use PyPI install
-    echo "Installing vLLM $VLLM_REF from PyPI... (ARCH=$ARCH, TORCH_BACKEND=$TORCH_BACKEND)"
-
-    uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=$TORCH_BACKEND
-
-else
-    # VLLM_REF does not start with 'v' or amd64 - use git checkout path
-    if [ "$ARCH" = "arm64" ]; then
-
-        # torch 2.8.0 doesn't have a aarch wheel for cu128, vLLM uses torch 2.8.0 nightly wheel builds to compile its aarch wheel against
-        # nightly can be unstable so we will not use it here
-        # for now we will use torch 2.7.1+cu128 but this requires a recompilation from source
-
-        echo "Building vLLM from source for ARM64 architecture..."
-
-        # Try to install specific PyTorch version first
-        echo "Attempting to install pinned PyTorch nightly versions..."
-        if ! uv pip install torch==2.7.1+cu128 torchaudio==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl/cu128; then
-            echo "Pinned versions failed"
-            exit 1
-        fi
-
-        # Create constraints file to pin all PyTorch-related versions
-        echo "Creating constraints file to preserve PyTorch ecosystem versions..."
-        TORCH_VERSION=$(python -c "import torch; print(torch.__version__)")
-        TORCHAUDIO_VERSION=$(python -c "import torchaudio; print(torchaudio.__version__)")
-        TORCHVISION_VERSION=$(python -c "import torchvision; print(torchvision.__version__)")
-
-        rm -rf /tmp/torch_constraints.txt
-        echo "torch==$TORCH_VERSION" > /tmp/torch_constraints.txt
-        echo "torchaudio==$TORCHAUDIO_VERSION" >> /tmp/torch_constraints.txt
-        echo "torchvision==$TORCHVISION_VERSION" >> /tmp/torch_constraints.txt
-
-        echo "Pinned versions:"
-        echo "  - torch==$TORCH_VERSION"
-        echo "  - torchaudio==$TORCHAUDIO_VERSION"
-        echo "  - torchvision==$TORCHVISION_VERSION"
-
-        python use_existing_torch.py
-        uv pip install -c /tmp/torch_constraints.txt -r requirements/build.txt
-
-        if [ "$EDITABLE" = "true" ]; then
-            MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -c /tmp/torch_constraints.txt -e . -v
-        else
-            MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -c /tmp/torch_constraints.txt . -v
-        fi
-
-        echo "\n=== Installing FlashInfer from source ==="
-        cd $INSTALLATION_DIR
-        git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
-        cd flashinfer
-        git checkout $FLASHINF_REF
-
-        # Install with constraints to prevent PyTorch upgrade
-        uv pip install -v --no-build-isolation -c /tmp/torch_constraints.txt .
-
-    else
-        echo "Building vLLM from source for AMD64 architecture..."
-
-        # When updating above VLLM_REF make sure precompiled wheel file URL is correct. Run this command:
-        # aws s3 ls s3://vllm-wheels/${VLLM_REF}/ --region us-west-2 --no-sign-request
-        export VLLM_PRECOMPILED_WHEEL_LOCATION="https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_REF}/vllm-0.10.2-cp38-abi3-manylinux1_x86_64.whl"
-
-        if [ "$EDITABLE" = "true" ]; then
-            uv pip install -e . --torch-backend=$TORCH_BACKEND
-        else
-            uv pip install . --torch-backend=$TORCH_BACKEND
-        fi
-
-        echo "\n=== Installing FlashInfer from PyPI ==="
-        uv pip install flashinfer-python==$FLASHINF_REF
-
-    fi
-fi
+uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=${TORCH_BACKEND}
+uv pip install flashinfer-cubin==$FLASHINF_REF
+uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}

 echo "✓ vLLM installation completed"

-echo "\n=== Installing LMCache ==="
-if [ "$ARCH" = "amd64" ]; then
-    # LMCache installation currently fails on arm64 due to CUDA dependency issues:
-    # OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
-    # TODO: Re-enable for arm64 after verifying lmcache compatibility and resolving the build issue.
-
-    # Alec: Likely lmcache was compiled with a different version of torch and needs to be installed from source for arm64
-    uv pip install lmcache==0.3.7
-    echo "✓ LMCache installed"
-else
-    echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
-fi
-
 echo "\n=== Installing DeepGEMM ==="
 cd $INSTALLATION_DIR/vllm/tools

@@ -239,6 +144,7 @@ echo "✓ DeepGEMM installation completed"

 echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
 cd ep_kernels/
+# TODO we will be able to specify which pplx and deepep commit we want in future
 TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh

 echo "\n✅ All installations completed successfully!"
--- a/examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py
+++ b/examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py
@@ -8,7 +8,6 @@ On the server side, run one of the following commands:
    vLLM OpenAI API server
    vllm serve <your_model> \
        --swap-space 16 \
-        --disable-log-requests

    (TGI backend)
    ./launch_tgi_server.sh <your_model> <max_batch_total_tokens>

--- a/examples/backends/vllm/deploy/agg_kvbm.yaml
+++ b/examples/backends/vllm/deploy/agg_kvbm.yaml
@@ -42,7 +42,6 @@ spec:
            - Qwen/Qwen3-8B
            - --gpu-memory-utilization
            - "0.45"
-            - --disable-log-requests
            - --max-model-len
            - "32000"
            - --enforce-eager

--- a/examples/backends/vllm/deploy/disagg_kvbm.yaml
+++ b/examples/backends/vllm/deploy/disagg_kvbm.yaml
@@ -35,7 +35,6 @@ spec:
            - Qwen/Qwen3-8B
            - --gpu-memory-utilization
            - "0.3"
-            - --disable-log-requests
            - --max-model-len
            - "32000"
            - --enforce-eager
@@ -68,7 +67,6 @@ spec:
            - --is-prefill-worker
            - --gpu-memory-utilization
            - "0.3"
-            - --disable-log-requests
            - --max-model-len
            - "32000"
            - --enforce-eager

--- a/examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml
+++ b/examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml
@@ -35,7 +35,6 @@ spec:
            - Qwen/Qwen3-8B
            - --gpu-memory-utilization
            - "0.3"
-            - --disable-log-requests
            - --max-model-len
            - "32000"
            - --enforce-eager
@@ -68,7 +67,6 @@ spec:
            - --is-prefill-worker
            - --gpu-memory-utilization
            - "0.3"
-            - --disable-log-requests
            - --max-model-len
            - "32000"
            - --enforce-eager

--- a/examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml
+++ b/examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml
@@ -37,7 +37,6 @@ spec:
            - Qwen/Qwen3-8B
            - --gpu-memory-utilization
            - "0.23"
-            - --disable-log-requests
            - --max-model-len
            - "32000"
            - --enforce-eager
@@ -72,7 +71,6 @@ spec:
            - --is-prefill-worker
            - --gpu-memory-utilization
            - "0.23"
-            - --disable-log-requests
            - --max-model-len
            - "32000"
            - --enforce-eager

--- a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/dynamo_connector.py
+++ b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/dynamo_connector.py
@@ -23,6 +23,7 @@ if TYPE_CHECKING:
    from vllm.config import VllmConfig
    from vllm.forward_context import ForwardContext
    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
    from vllm.v1.request import Request


@@ -40,8 +41,15 @@ class DynamoConnectorMetadata(KVConnectorMetadata):


 class DynamoConnector(KVConnectorBase_V1):
-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
-        super().__init__(vllm_config=vllm_config, role=role)
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: Optional["KVCacheConfig"] = None,
+    ):
+        super().__init__(
+            vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
+        )

        assert vllm_config.kv_transfer_config is not None
        assert vllm_config.kv_transfer_config.engine_id is not None
@@ -90,13 +98,19 @@ class DynamoConnector(KVConnectorBase_V1):
    def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
        self._worker.register_kv_caches(kv_caches)

+    @override
    def bind_connector_metadata(
        self, connector_metadata: DynamoConnectorMetadata
    ) -> None:
+        # Must call super() to set _connector_metadata so has_connector_metadata() returns True
+        # This is required for save_kv_layer to be called during the forward pass
+        super().bind_connector_metadata(connector_metadata)
        assert isinstance(connector_metadata.metadata, bytes)
        self._worker.bind_connector_metadata(connector_metadata.metadata)

+    @override
    def clear_connector_metadata(self) -> None:
+        super().clear_connector_metadata()
        self._worker.clear_connector_metadata()

    @override

--- a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/pd_connector.py
+++ b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector/pd_connector.py
@@ -29,6 +29,7 @@ if TYPE_CHECKING:
        LMCacheConnectorV1,
    )
    from vllm.v1.core.kv_cache_manager import KVCacheBlocks
+    from vllm.v1.kv_cache_interface import KVCacheConfig
    from vllm.v1.request import Request


@@ -46,8 +47,15 @@ class PdConnector(MultiConnector):
    - The second connector must be NIXL and will be used by decode worker to get KV blocks from prefill worker.
    """

-    def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
-        super().__init__(vllm_config=vllm_config, role=role)
+    def __init__(
+        self,
+        vllm_config: "VllmConfig",
+        role: KVConnectorRole,
+        kv_cache_config: "KVCacheConfig",
+    ):
+        super().__init__(
+            vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
+        )
        if len(self._connectors) != 2:
            raise ValueError(
                f"PdConnector requires exactly two connectors (got {len(self._connectors)})"

--- a/lib/bindings/kvbm/python/kvbm/vllm_integration/connector_worker.py
+++ b/lib/bindings/kvbm/python/kvbm/vllm_integration/connector_worker.py
@@ -14,7 +14,7 @@ from kvbm.utils import is_dyn_runtime_enabled
 from vllm.config import VllmConfig
 from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
 from vllm.model_executor.models.utils import extract_layer_index
-from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
+from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE

 if TYPE_CHECKING:
    from vllm.attention.backends.abstract import AttentionMetadata

--- a/lib/bindings/kvbm/src/block_manager/vllm/connector/leader.rs
+++ b/lib/bindings/kvbm/src/block_manager/vllm/connector/leader.rs
@@ -526,22 +526,33 @@ impl Leader for KvConnectorLeader {
        // remove the request from the inflight requests
        self.inflight_requests.remove(&request_id);

-        // if the slot has finished, we can return false to vllm, indicating all gpu blocks are free to be reused
-        // otherwise, we return true, which means there are still outstanding operations on gpu blocks which
-        // must be awaited before the gpu blocks can be reused. if we return true, then it is the worker side
-        // of the connector api which will be used to inform vllm that the request is finished.
+        // Return value semantics:
+        // - `false`: Tells vLLM all GPU blocks are free and the request can be fully cleaned up.
+        //            vLLM will immediately remove the request from its internal hash table.
+        // - `true`:  Tells vLLM there are outstanding async operations on GPU blocks.
+        //            The worker side of the connector API will later call `finish_requests()`
+        //            to notify vLLM when the request is truly complete.
+        //
+        // TODO(jthomson04): This is a temporary fix to ensure vLLM 0.11.2 compatibility.
+        //     IMPORTANT: We must ALWAYS return `true` here, even when the slot is already Finished.
+        //
+        //      Why? If we return `false`, vLLM removes the request from `self.requests` immediately.
+        //      However, our worker connector may still report completion later via `finish_requests()`.
+        //      When that happens, vLLM's scheduler.py has an assertion `req_id in self.requests`
+        //      that will fail because the request was already removed from the hash table.
+        //
+        //      By always returning `true`, we ensure vLLM keeps the request in its hash table until
+        //      our worker explicitly signals completion, avoiding the race condition.
+        //
+        //      If the slot is already Finished (no pending operations), we clean it up from our side
+        //      but still return `true` so vLLM waits for the worker's completion signal.
        if let SlotState::Finished = slot.state() {
-            // All operations complete - safe to remove slot and tell vLLM blocks are free
            self.slot_manager().remove_slot(&request_id)?;
-            Ok(false)
        } else {
            debug_assert!(matches!(slot.state(), SlotState::Finishing));
-            // Still has pending operations - keep slot alive for worker to process
-            // Don't remove slot here. Worker needs it to process the finish event.
-            // Worker will remove it after verifying all operations are complete.
-            // The lock on the slot prevents new operations from being created in offload_blocks()
-            Ok(true)
        }
+
+        Ok(true)
    }

    fn has_slot(&self, request_id: String) -> bool {

--- a/lib/bindings/kvbm/src/block_manager/vllm/connector/worker.rs
+++ b/lib/bindings/kvbm/src/block_manager/vllm/connector/worker.rs
@@ -278,11 +278,6 @@ impl Worker for KvConnectorWorker {
            self.maybe_finished_onboarding.insert(request_id);
        }

-        // delay offloading operations until the end of the forward pass
-        debug_assert!(
-            self.offloading_operations.is_empty(),
-            "offloading operations should be empty"
-        );
        self.offloading_operations = offloading_operations;

        Ok(())
@@ -304,15 +299,34 @@ impl Worker for KvConnectorWorker {
    /// Trigger block-wise completion signals after the last layer.
    fn save_kv_layer(&mut self, _layer_name: String) -> anyhow::Result<()> {
        self.layers_complete += 1;
+        tracing::debug!(
+            iteration = self.iteration,
+            layers_complete = self.layers_complete,
+            total_layers = self.kv_cache_layers.len(),
+            pending_offload_ops = self.offloading_operations.len(),
+            "save_kv_layer called"
+        );
        if self.layers_complete == self.kv_cache_layers.len() {
            let offloading_operations = std::mem::take(&mut self.offloading_operations);

+            tracing::info!(
+                iteration = self.iteration,
+                num_operations = offloading_operations.len(),
+                "All layers complete, enqueuing {} offload operations",
+                offloading_operations.len()
+            );
+
            // block on the completion of the last layer
            // todo(ryan): capture the context, pass this to the scheduler to do the await on another thread
            // or put the event on a stream and use stream waits to keep it all on device.
            event_sync_blocking(self.layer_events[self.layers_complete - 1]);
-            for operation in offloading_operations {
-                self.connector.enqueue_request(operation);
+            for operation in &offloading_operations {
+                tracing::debug!(
+                    request_id = %operation.request_id,
+                    operation_id = %operation.uuid,
+                    "Enqueuing offload operation to scheduler"
+                );
+                self.connector.enqueue_request(operation.clone());
            }
        }
        Ok(())

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,7 +56,7 @@ trtllm =[
 vllm = [
    "uvloop",
    "nixl[cu12]<=0.7.1",
-    "vllm[flashinfer]==0.11.0",
+    "vllm[flashinfer]==0.11.2",
 ]

 sglang = [

--- a/recipes/llama-3-70b/vllm/agg/deploy.yaml
+++ b/recipes/llama-3-70b/vllm/agg/deploy.yaml
@@ -43,7 +43,7 @@ spec:
            - name: HF_HOME
              value: /opt/models
          args:
-          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
+          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
          command:
          - /bin/sh
          - -c

--- a/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
+++ b/recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
@@ -43,7 +43,7 @@ spec:
            - name: HF_HOME
              value: /opt/models
          args:
-          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
+          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
          command:
          - /bin/sh
          - -c
@@ -74,7 +74,7 @@ spec:
            - name: HF_HOME
              value: /opt/models
          args:
-          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
+          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
          command:
          - /bin/sh
          - -c

--- a/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
+++ b/recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
@@ -55,7 +55,7 @@ spec:
            - name: HF_HOME
              value: /opt/models
          args:
-          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
+          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
          command:
          - /bin/sh
          - -c
@@ -98,7 +98,7 @@ spec:
            - name: HF_HOME
              value: /opt/models
          args:
-          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
+          - "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
          command:
          - /bin/sh
          - -c

--- a/tests/dependencies/test_vllm_imports.py
+++ b/tests/dependencies/test_vllm_imports.py
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Unit tests to sanity check that required dependencies can be imported."""
+
+import pytest
+
+
+@pytest.mark.vllm
+@pytest.mark.unit
+@pytest.mark.gpu_1
+def test_import_deep_ep():
+    """Check that the ``deep_ep`` module can be imported.

+    The import is attempted inside the test body; an ``ImportError`` is
+    turned into an explicit ``pytest.fail`` whose message includes the
+    text of the original import error.
+    """
+    try:
+        import deep_ep

+        assert deep_ep is not None
+    except ImportError as e:
+        pytest.fail(f"Failed to import deep_ep: {e}")
+
+@pytest.mark.vllm
+@pytest.mark.unit
+@pytest.mark.gpu_1
+def test_import_pplx_kernels():
+    """Check that the ``pplx_kernels`` module can be imported.

+    The import is attempted inside the test body; an ``ImportError`` is
+    turned into an explicit ``pytest.fail`` whose message includes the
+    text of the original import error.
+    """
+    try:
+        import pplx_kernels

+        assert pplx_kernels is not None
+    except ImportError as e:
+        pytest.fail(f"Failed to import pplx_kernels: {e}")