"lib/llm/vscode:/vscode.git/clone" did not exist on "c8276cd28b3b12f4a7779784f57cd952d4d74da3"
Unverified Commit 60feb955 authored by Karen Chung's avatar Karen Chung Committed by GitHub
Browse files

chore: bump vLLM to 0.11.2 (#4476)

parent c5e8c4c2
......@@ -139,4 +139,4 @@ runs:
path: |
test-results/pytest_test_report_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.xml
test-results/test_metadata_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.json
retention-days: 7
\ No newline at end of file
retention-days: 7
......@@ -207,6 +207,24 @@ def parse_args() -> Config:
args = parser.parse_args()
engine_args = AsyncEngineArgs.from_cli_args(args)
# Workaround for vLLM GIL contention bug with NIXL connector when using UniProcExecutor.
# With TP=1, vLLM defaults to UniProcExecutor which runs scheduler and worker in the same
# process. This causes a hot loop in _process_engine_step that doesn't release the GIL,
# blocking NIXL's add_remote_agent from completing. Using "mp" backend forces separate
# processes, avoiding the GIL contention.
# Note: Only apply for NIXL - other connectors (kvbm, lmcache) work fine with UniProcExecutor
# and forcing mp can expose race conditions in vLLM's scheduler.
# See: https://github.com/vllm-project/vllm/issues/29369
connector_list = [c.lower() for c in args.connector] if args.connector else []
uses_nixl = "nixl" in connector_list
tp_size = getattr(engine_args, "tensor_parallel_size", None) or 1
if uses_nixl and tp_size == 1 and engine_args.distributed_executor_backend is None:
logger.info(
"Setting --distributed-executor-backend=mp for TP=1 to avoid "
"UniProcExecutor GIL contention with NIXL connector"
)
engine_args.distributed_executor_backend = "mp"
if engine_args.enable_prefix_caching is None:
logger.debug(
"--enable-prefix-caching or --no-enable-prefix-caching not specified. Defaulting to True (vLLM v1 default behavior)"
......
......@@ -11,17 +11,18 @@ ARG PYTHON_VERSION
ARG ENABLE_KVBM
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.8"
ARG RUNTIME_IMAGE_TAG="12.9.0-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.9"
# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="v0.11.0"
# FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
ARG FLASHINF_REF="v0.3.1"
ARG TORCH_BACKEND="cu128"
ARG VLLM_REF="v0.11.2"
# FlashInfer Ref used to install flashinfer-cubin and flashinfer-jit-cache
ARG FLASHINF_REF="v0.5.2"
# If left blank, then we will fallback to vLLM defaults
ARG DEEPGEMM_REF=""
# LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility
ARG LMCACHE_REF="0.3.9.post2"
# sccache configuration - inherit from base build
ARG USE_SCCACHE
......@@ -110,7 +111,7 @@ ARG VLLM_REF
ARG VLLM_GIT_URL
ARG DEEPGEMM_REF
ARG FLASHINF_REF
ARG TORCH_BACKEND
ARG LMCACHE_REF
ARG CUDA_VERSION
ARG MAX_JOBS=16
......@@ -144,7 +145,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
chmod +x /tmp/install_vllm.sh && \
/tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} --torch-backend $TORCH_BACKEND --cuda-version $CUDA_VERSION && \
/tmp/install_vllm.sh --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"} ${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"} --cuda-version $CUDA_VERSION && \
/tmp/use-sccache.sh show-stats "vLLM";
ENV LD_LIBRARY_PATH=\
......@@ -236,7 +237,7 @@ RUN apt-get update && \
# prometheus dependencies
ca-certificates \
# DeepGemm uses 'cuobjdump' which does not come with CUDA image
cuda-command-line-tools-12-8 && \
cuda-command-line-tools-12-9 && \
rm -rf /var/lib/apt/lists/*
USER dynamo
......
......@@ -106,7 +106,7 @@ VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# Please check https://github.com/ai-dynamo/dynamo/pull/1065
# for details and reproducer to manually test if the image
# can be updated to later versions.
VLLM_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
VLLM_BASE_IMAGE_TAG="25.04-cuda12.9-devel-ubuntu24.04"
NONE_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
NONE_BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
......@@ -989,4 +989,4 @@ elif [[ "${LOCAL_DEV_BUILD:-}" == "true" ]]; then
fi
{ set +x; } 2>/dev/null
\ No newline at end of file
{ set +x; } 2>/dev/null
......@@ -2,18 +2,16 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# This script is used to install vLLM and its dependencies
# If installing vLLM from a release tag, we will use pip to manage the install
# Otherwise, we will use git to checkout the vLLM source code and build it from source.
# The dependencies are installed in the following order:
# 1. vLLM
# 2. LMCache
# This script installs vLLM and its dependencies from PyPI (release versions only).
# Installation order:
# 1. LMCache (installed first so vLLM's dependencies take precedence)
# 2. vLLM
# 3. DeepGEMM
# 4. EP kernels
set -euo pipefail
VLLM_REF="v0.11.0"
VLLM_REF="v0.11.2"
# Basic Configurations
ARCH=$(uname -m)
......@@ -21,34 +19,19 @@ MAX_JOBS=16
INSTALLATION_DIR=/tmp
# VLLM and Dependency Configurations
TORCH_BACKEND="cu128"
TORCH_CUDA_ARCH_LIST="9.0;10.0" # For EP Kernels
DEEPGEMM_REF=""
CUDA_VERSION="12.8" # For DEEPGEMM
# These flags are applicable when installing vLLM from source code
EDITABLE=true
VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
FLASHINF_REF="v0.3.1"
CUDA_VERSION="12.9"
FLASHINF_REF="v0.5.2"
# LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility
LMCACHE_REF="0.3.9.post2"
while [[ $# -gt 0 ]]; do
case $1 in
--editable)
EDITABLE=true
shift
;;
--no-editable)
EDITABLE=false
shift
;;
--vllm-ref)
VLLM_REF="$2"
shift 2
;;
--vllm-git-url)
VLLM_GIT_URL="$2"
shift 2
;;
--max-jobs)
MAX_JOBS="$2"
shift 2
......@@ -69,8 +52,8 @@ while [[ $# -gt 0 ]]; do
FLASHINF_REF="$2"
shift 2
;;
--torch-backend)
TORCH_BACKEND="$2"
--lmcache-ref)
LMCACHE_REF="$2"
shift 2
;;
--torch-cuda-arch-list)
......@@ -82,19 +65,17 @@ while [[ $# -gt 0 ]]; do
shift 2
;;
-h|--help)
echo "Usage: $0 [--editable|--no-editable] [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--torch-backend BACKEND] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo "Usage: $0 [--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--lmcache-ref REF] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo "Options:"
echo " --editable Install vllm in editable mode (default)"
echo " --no-editable Install vllm in non-editable mode"
echo " --vllm-ref REF Git reference to checkout (default: ${VLLM_REF})"
echo " --max-jobs NUM Maximum number of parallel jobs (default: ${MAX_JOBS})"
echo " --arch ARCH Architecture (amd64|arm64, default: auto-detect)"
echo " --installation-dir DIR Directory to install vllm (default: ${INSTALLATION_DIR})"
echo " --deepgemm-ref REF Git reference for DeepGEMM (default: ${DEEPGEMM_REF})"
echo " --flashinf-ref REF Git reference for Flash Infer (default: ${FLASHINF_REF})"
echo " --torch-backend BACKEND Torch backend to use (default: ${TORCH_BACKEND})"
echo " --torch-cuda-arch-list LIST CUDA architectures to compile for (default: ${TORCH_CUDA_ARCH_LIST})"
echo " --cuda-version VERSION CUDA version to use (default: ${CUDA_VERSION})"
echo " --vllm-ref REF vLLM release version (default: ${VLLM_REF})"
echo " --max-jobs NUM Maximum parallel jobs (default: ${MAX_JOBS})"
echo " --arch ARCH Architecture amd64|arm64 (default: auto-detect)"
echo " --installation-dir DIR Install directory (default: ${INSTALLATION_DIR})"
echo " --deepgemm-ref REF DeepGEMM git ref (default: ${DEEPGEMM_REF})"
echo " --flashinf-ref REF FlashInfer version (default: ${FLASHINF_REF})"
echo " --lmcache-ref REF LMCache version (default: ${LMCACHE_REF})"
echo " --torch-cuda-arch-list LIST CUDA architectures (default: ${TORCH_CUDA_ARCH_LIST})"
echo " --cuda-version VERSION CUDA version (default: ${CUDA_VERSION})"
exit 0
;;
*)
......@@ -114,119 +95,43 @@ fi
export MAX_JOBS=$MAX_JOBS
export CUDA_HOME=/usr/local/cuda
# Derive torch backend from CUDA version (e.g., "12.9" -> "cu129")
TORCH_BACKEND="cu$(echo $CUDA_VERSION | tr -d '.')"
echo "=== Installing prerequisites ==="
uv pip install pip cuda-python
echo "\n=== Configuration Summary ==="
echo " VLLM_REF=$VLLM_REF | EDITABLE=$EDITABLE | ARCH=$ARCH"
echo " MAX_JOBS=$MAX_JOBS | TORCH_BACKEND=$TORCH_BACKEND | CUDA_VERSION=$CUDA_VERSION"
echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST"
echo " DEEPGEMM_REF=$DEEPGEMM_REF | FLASHINF_REF=$FLASHINF_REF"
echo " INSTALLATION_DIR=$INSTALLATION_DIR | VLLM_GIT_URL=$VLLM_GIT_URL"
echo " VLLM_REF=$VLLM_REF | ARCH=$ARCH | CUDA_VERSION=$CUDA_VERSION | TORCH_BACKEND=$TORCH_BACKEND"
echo " FLASHINF_REF=$FLASHINF_REF | LMCACHE_REF=$LMCACHE_REF | DEEPGEMM_REF=$DEEPGEMM_REF"
echo " TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST | INSTALLATION_DIR=$INSTALLATION_DIR"
echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
# LMCache installation currently fails on arm64 due to CUDA dependency issues
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
uv pip install lmcache==${LMCACHE_REF} --torch-backend=${TORCH_BACKEND}
echo "✓ LMCache ${LMCACHE_REF} installed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi
echo "\n=== Cloning vLLM repository ==="
# We need to clone to install dependencies
# Clone needed for DeepGEMM and EP kernels install scripts
cd $INSTALLATION_DIR
git clone $VLLM_GIT_URL vllm
git clone https://github.com/vllm-project/vllm.git vllm
cd vllm
git checkout $VLLM_REF
# TODO leave this here in case we need to do cherry-picks in future
# GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064
echo "\n=== Installing vLLM & FlashInfer ==="
echo "Installing vLLM $VLLM_REF from PyPI..."
if [[ $VLLM_REF =~ ^v ]] && { [ "$ARCH" = "amd64" ] || { [ "$ARCH" = "arm64" ] && [ "$TORCH_BACKEND" = "cu129" ]; }; }; then
# VLLM_REF starts with 'v' and either amd64, or arm64 with cu129 backend - use PyPI install
echo "Installing vLLM $VLLM_REF from PyPI... (ARCH=$ARCH, TORCH_BACKEND=$TORCH_BACKEND)"
uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=$TORCH_BACKEND
else
# VLLM_REF does not start with 'v' or amd64 - use git checkout path
if [ "$ARCH" = "arm64" ]; then
# torch 2.8.0 doesn't have an aarch64 wheel for cu128; vLLM compiles its aarch64 wheel against torch 2.8.0 nightly wheel builds
# nightly can be unstable so we will not use it here
# for now we will use torch 2.7.1+cu128 but this requires a recompilation from source
echo "Building vLLM from source for ARM64 architecture..."
# Try to install specific PyTorch version first
echo "Attempting to install pinned PyTorch nightly versions..."
if ! uv pip install torch==2.7.1+cu128 torchaudio==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl/cu128; then
echo "Pinned versions failed"
exit 1
fi
# Create constraints file to pin all PyTorch-related versions
echo "Creating constraints file to preserve PyTorch ecosystem versions..."
TORCH_VERSION=$(python -c "import torch; print(torch.__version__)")
TORCHAUDIO_VERSION=$(python -c "import torchaudio; print(torchaudio.__version__)")
TORCHVISION_VERSION=$(python -c "import torchvision; print(torchvision.__version__)")
rm -rf /tmp/torch_constraints.txt
echo "torch==$TORCH_VERSION" > /tmp/torch_constraints.txt
echo "torchaudio==$TORCHAUDIO_VERSION" >> /tmp/torch_constraints.txt
echo "torchvision==$TORCHVISION_VERSION" >> /tmp/torch_constraints.txt
echo "Pinned versions:"
echo " - torch==$TORCH_VERSION"
echo " - torchaudio==$TORCHAUDIO_VERSION"
echo " - torchvision==$TORCHVISION_VERSION"
python use_existing_torch.py
uv pip install -c /tmp/torch_constraints.txt -r requirements/build.txt
if [ "$EDITABLE" = "true" ]; then
MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -c /tmp/torch_constraints.txt -e . -v
else
MAX_JOBS=${MAX_JOBS} uv pip install --no-build-isolation -c /tmp/torch_constraints.txt . -v
fi
echo "\n=== Installing FlashInfer from source ==="
cd $INSTALLATION_DIR
git clone https://github.com/flashinfer-ai/flashinfer.git --recursive
cd flashinfer
git checkout $FLASHINF_REF
# Install with constraints to prevent PyTorch upgrade
uv pip install -v --no-build-isolation -c /tmp/torch_constraints.txt .
else
echo "Building vLLM from source for AMD64 architecture..."
# When updating above VLLM_REF make sure precompiled wheel file URL is correct. Run this command:
# aws s3 ls s3://vllm-wheels/${VLLM_REF}/ --region us-west-2 --no-sign-request
export VLLM_PRECOMPILED_WHEEL_LOCATION="https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_REF}/vllm-0.10.2-cp38-abi3-manylinux1_x86_64.whl"
if [ "$EDITABLE" = "true" ]; then
uv pip install -e . --torch-backend=$TORCH_BACKEND
else
uv pip install . --torch-backend=$TORCH_BACKEND
fi
echo "\n=== Installing FlashInfer from PyPI ==="
uv pip install flashinfer-python==$FLASHINF_REF
fi
fi
uv pip install vllm[flashinfer]==$VLLM_REF --torch-backend=${TORCH_BACKEND}
uv pip install flashinfer-cubin==$FLASHINF_REF
uv pip install flashinfer-jit-cache==$FLASHINF_REF --extra-index-url https://flashinfer.ai/whl/${TORCH_BACKEND}
echo "✓ vLLM installation completed"
echo "\n=== Installing LMCache ==="
if [ "$ARCH" = "amd64" ]; then
# LMCache installation currently fails on arm64 due to CUDA dependency issues:
# OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
# TODO: Re-enable for arm64 after verifying lmcache compatibility and resolving the build issue.
# Alec: Likely lmcache was compiled with a different version of torch and needs to be installed from source for arm64
uv pip install lmcache==0.3.7
echo "✓ LMCache installed"
else
echo "⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi
echo "\n=== Installing DeepGEMM ==="
cd $INSTALLATION_DIR/vllm/tools
......@@ -239,6 +144,7 @@ echo "✓ DeepGEMM installation completed"
echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
cd ep_kernels/
# TODO we will be able to specify which pplx and deepep commit we want in future
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
echo "\n✅ All installations completed successfully!"
......@@ -8,7 +8,6 @@ On the server side, run one of the following commands:
vLLM OpenAI API server
vllm serve <your_model> \
--swap-space 16 \
--disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
......
......@@ -42,7 +42,6 @@ spec:
- Qwen/Qwen3-8B
- --gpu-memory-utilization
- "0.45"
- --disable-log-requests
- --max-model-len
- "32000"
- --enforce-eager
......
......@@ -35,7 +35,6 @@ spec:
- Qwen/Qwen3-8B
- --gpu-memory-utilization
- "0.3"
- --disable-log-requests
- --max-model-len
- "32000"
- --enforce-eager
......@@ -68,7 +67,6 @@ spec:
- --is-prefill-worker
- --gpu-memory-utilization
- "0.3"
- --disable-log-requests
- --max-model-len
- "32000"
- --enforce-eager
......
......@@ -35,7 +35,6 @@ spec:
- Qwen/Qwen3-8B
- --gpu-memory-utilization
- "0.3"
- --disable-log-requests
- --max-model-len
- "32000"
- --enforce-eager
......@@ -68,7 +67,6 @@ spec:
- --is-prefill-worker
- --gpu-memory-utilization
- "0.3"
- --disable-log-requests
- --max-model-len
- "32000"
- --enforce-eager
......
......@@ -37,7 +37,6 @@ spec:
- Qwen/Qwen3-8B
- --gpu-memory-utilization
- "0.23"
- --disable-log-requests
- --max-model-len
- "32000"
- --enforce-eager
......@@ -72,7 +71,6 @@ spec:
- --is-prefill-worker
- --gpu-memory-utilization
- "0.23"
- --disable-log-requests
- --max-model-len
- "32000"
- --enforce-eager
......
......@@ -23,6 +23,7 @@ if TYPE_CHECKING:
from vllm.config import VllmConfig
from vllm.forward_context import ForwardContext
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import Request
......@@ -40,8 +41,15 @@ class DynamoConnectorMetadata(KVConnectorMetadata):
class DynamoConnector(KVConnectorBase_V1):
def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
super().__init__(vllm_config=vllm_config, role=role)
def __init__(
self,
vllm_config: "VllmConfig",
role: KVConnectorRole,
kv_cache_config: Optional["KVCacheConfig"] = None,
):
super().__init__(
vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
)
assert vllm_config.kv_transfer_config is not None
assert vllm_config.kv_transfer_config.engine_id is not None
......@@ -90,13 +98,19 @@ class DynamoConnector(KVConnectorBase_V1):
def register_kv_caches(self, kv_caches: dict[str, torch.Tensor]):
self._worker.register_kv_caches(kv_caches)
@override
def bind_connector_metadata(
self, connector_metadata: DynamoConnectorMetadata
) -> None:
# Must call super() to set _connector_metadata so has_connector_metadata() returns True
# This is required for save_kv_layer to be called during the forward pass
super().bind_connector_metadata(connector_metadata)
assert isinstance(connector_metadata.metadata, bytes)
self._worker.bind_connector_metadata(connector_metadata.metadata)
@override
def clear_connector_metadata(self) -> None:
super().clear_connector_metadata()
self._worker.clear_connector_metadata()
@override
......
......@@ -29,6 +29,7 @@ if TYPE_CHECKING:
LMCacheConnectorV1,
)
from vllm.v1.core.kv_cache_manager import KVCacheBlocks
from vllm.v1.kv_cache_interface import KVCacheConfig
from vllm.v1.request import Request
......@@ -46,8 +47,15 @@ class PdConnector(MultiConnector):
- The second connector must be NIXL and will be used by decode worker to get KV blocks from prefill worker.
"""
def __init__(self, vllm_config: "VllmConfig", role: KVConnectorRole):
super().__init__(vllm_config=vllm_config, role=role)
def __init__(
self,
vllm_config: "VllmConfig",
role: KVConnectorRole,
kv_cache_config: "KVCacheConfig",
):
super().__init__(
vllm_config=vllm_config, role=role, kv_cache_config=kv_cache_config
)
if len(self._connectors) != 2:
raise ValueError(
f"PdConnector requires exactly two connectors (got {len(self._connectors)})"
......
......@@ -14,7 +14,7 @@ from kvbm.utils import is_dyn_runtime_enabled
from vllm.config import VllmConfig
from vllm.distributed.kv_transfer.kv_connector.v1.base import KVConnectorMetadata
from vllm.model_executor.models.utils import extract_layer_index
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE
from vllm.utils.torch_utils import STR_DTYPE_TO_TORCH_DTYPE
if TYPE_CHECKING:
from vllm.attention.backends.abstract import AttentionMetadata
......
......@@ -526,22 +526,33 @@ impl Leader for KvConnectorLeader {
// remove the request from the inflight requests
self.inflight_requests.remove(&request_id);
// if the slot has finished, we can return false to vllm, indicating all gpu blocks are free to be reused
// otherwise, we return true, which means there are still outstanding operations on gpu blocks which
// must be awaited before the gpu blocks can be reused. if we return true, then it is the worker side
// of the connector api which will be used to inform vllm that the request is finished.
// Return value semantics:
// - `false`: Tells vLLM all GPU blocks are free and the request can be fully cleaned up.
// vLLM will immediately remove the request from its internal hash table.
// - `true`: Tells vLLM there are outstanding async operations on GPU blocks.
// The worker side of the connector API will later call `finish_requests()`
// to notify vLLM when the request is truly complete.
//
// TODO(jthomson04): This is a temporary fix to ensure vLLM 0.11.2 compatibility.
// IMPORTANT: We must ALWAYS return `true` here, even when the slot is already Finished.
//
// Why? If we return `false`, vLLM removes the request from `self.requests` immediately.
// However, our worker connector may still report completion later via `finish_requests()`.
// When that happens, vLLM's scheduler.py has an assertion `req_id in self.requests`
// that will fail because the request was already removed from the hash table.
//
// By always returning `true`, we ensure vLLM keeps the request in its hash table until
// our worker explicitly signals completion, avoiding the race condition.
//
// If the slot is already Finished (no pending operations), we clean it up from our side
// but still return `true` so vLLM waits for the worker's completion signal.
if let SlotState::Finished = slot.state() {
// All operations complete - safe to remove slot and tell vLLM blocks are free
self.slot_manager().remove_slot(&request_id)?;
Ok(false)
} else {
debug_assert!(matches!(slot.state(), SlotState::Finishing));
// Still has pending operations - keep slot alive for worker to process
// Don't remove slot here. Worker needs it to process the finish event.
// Worker will remove it after verifying all operations are complete.
// The lock on the slot prevents new operations from being created in offload_blocks()
Ok(true)
}
Ok(true)
}
fn has_slot(&self, request_id: String) -> bool {
......
......@@ -278,11 +278,6 @@ impl Worker for KvConnectorWorker {
self.maybe_finished_onboarding.insert(request_id);
}
// delay offloading operations until the end of the forward pass
debug_assert!(
self.offloading_operations.is_empty(),
"offloading operations should be empty"
);
self.offloading_operations = offloading_operations;
Ok(())
......@@ -304,15 +299,34 @@ impl Worker for KvConnectorWorker {
/// Trigger block-wise completion signals after the last layer.
fn save_kv_layer(&mut self, _layer_name: String) -> anyhow::Result<()> {
self.layers_complete += 1;
tracing::debug!(
iteration = self.iteration,
layers_complete = self.layers_complete,
total_layers = self.kv_cache_layers.len(),
pending_offload_ops = self.offloading_operations.len(),
"save_kv_layer called"
);
if self.layers_complete == self.kv_cache_layers.len() {
let offloading_operations = std::mem::take(&mut self.offloading_operations);
tracing::info!(
iteration = self.iteration,
num_operations = offloading_operations.len(),
"All layers complete, enqueuing {} offload operations",
offloading_operations.len()
);
// block on the completion of the last layer
// todo(ryan): capture the context, pass this to the scheduler to do the await on another thread
// or put the event on a stream and use stream waits to keep it all on device.
event_sync_blocking(self.layer_events[self.layers_complete - 1]);
for operation in offloading_operations {
self.connector.enqueue_request(operation);
for operation in &offloading_operations {
tracing::debug!(
request_id = %operation.request_id,
operation_id = %operation.uuid,
"Enqueuing offload operation to scheduler"
);
self.connector.enqueue_request(operation.clone());
}
}
Ok(())
......
......@@ -56,7 +56,7 @@ trtllm =[
vllm = [
"uvloop",
"nixl[cu12]<=0.7.1",
"vllm[flashinfer]==0.11.0",
"vllm[flashinfer]==0.11.2",
]
sglang = [
......
......@@ -43,7 +43,7 @@ spec:
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
......
......@@ -43,7 +43,7 @@ spec:
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
......@@ -74,7 +74,7 @@ spec:
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 8 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
......
......@@ -55,7 +55,7 @@ spec:
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --disable-log-requests --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 2 --data-parallel-size 1 --is-prefill-worker --gpu-memory-utilization 0.95 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
......@@ -98,7 +98,7 @@ spec:
- name: HF_HOME
value: /opt/models
args:
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --disable-log-requests --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
- "python3 -m dynamo.vllm --model $MODEL_PATH --served-model-name $SERVED_MODEL_NAME --tensor-parallel-size 4 --data-parallel-size 1 --gpu-memory-utilization 0.90 --no-enable-prefix-caching --block-size 128"
command:
- /bin/sh
- -c
......
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Unit tests to sanity check that required dependencies can be imported."""
import pytest
@pytest.mark.vllm
@pytest.mark.unit
@pytest.mark.gpu_1
def test_import_deep_ep():
    """Sanity check: the ``deep_ep`` package must be importable.

    An ``ImportError`` is reported as an explicit pytest failure that carries
    the underlying import error message.
    """
    # Keep the try body limited to the single statement that can raise.
    try:
        import deep_ep
    except ImportError as exc:
        pytest.fail(f"Failed to import deep_ep: {exc}")
    else:
        assert deep_ep is not None
@pytest.mark.vllm
@pytest.mark.unit
@pytest.mark.gpu_1
def test_import_pplx_kernels():
    """Sanity check: the ``pplx_kernels`` package must be importable.

    An ``ImportError`` is reported as an explicit pytest failure that carries
    the underlying import error message.
    """
    # Keep the try body limited to the single statement that can raise.
    try:
        import pplx_kernels
    except ImportError as exc:
        pytest.fail(f"Failed to import pplx_kernels: {exc}")
    else:
        assert pplx_kernels is not None
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment