Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
60feb955
Unverified
Commit
60feb955
authored
Dec 03, 2025
by
Karen Chung
Committed by
GitHub
Dec 03, 2025
Browse files
chore: bump vLLM to 0.11.2 (#4476)
parent
c5e8c4c2
Changes
23
Show whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
183 additions
and
187 deletions
+183
-187
.github/actions/pytest/action.yml
.github/actions/pytest/action.yml
+1
-1
components/src/dynamo/vllm/args.py
components/src/dynamo/vllm/args.py
+18
-0
container/Dockerfile.vllm
container/Dockerfile.vllm
+10
-9
container/build.sh
container/build.sh
+2
-2
container/deps/vllm/install_vllm.sh
container/deps/vllm/install_vllm.sh
+44
-138
examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py
...kends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py
+0
-1
examples/backends/vllm/deploy/agg_kvbm.yaml
examples/backends/vllm/deploy/agg_kvbm.yaml
+0
-1
examples/backends/vllm/deploy/disagg_kvbm.yaml
examples/backends/vllm/deploy/disagg_kvbm.yaml
+0
-2
examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml
examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml
+0
-2
examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml
examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml
+0
-2
lib/bindings/kvbm/python/kvbm/vllm_integration/connector/dynamo_connector.py
...ython/kvbm/vllm_integration/connector/dynamo_connector.py
+16
-2
lib/bindings/kvbm/python/kvbm/vllm_integration/connector/pd_connector.py
...bm/python/kvbm/vllm_integration/connector/pd_connector.py
+10
-2
lib/bindings/kvbm/python/kvbm/vllm_integration/connector_worker.py
...ngs/kvbm/python/kvbm/vllm_integration/connector_worker.py
+1
-1
lib/bindings/kvbm/src/block_manager/vllm/connector/leader.rs
lib/bindings/kvbm/src/block_manager/vllm/connector/leader.rs
+22
-11
lib/bindings/kvbm/src/block_manager/vllm/connector/worker.rs
lib/bindings/kvbm/src/block_manager/vllm/connector/worker.rs
+21
-7
pyproject.toml
pyproject.toml
+1
-1
recipes/llama-3-70b/vllm/agg/deploy.yaml
recipes/llama-3-70b/vllm/agg/deploy.yaml
+1
-1
recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
+2
-2
recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
+2
-2
tests/dependencies/test_vllm_imports.py
tests/dependencies/test_vllm_imports.py
+32
-0
No files found.
.github/actions/pytest/action.yml
View file @
60feb955
components/src/dynamo/vllm/args.py
View file @
60feb955
...
...
@@ -207,6 +207,24 @@ def parse_args() -> Config:
args
=
parser
.
parse_args
()
engine_args
=
AsyncEngineArgs
.
from_cli_args
(
args
)
# Workaround for vLLM GIL contention bug with NIXL connector when using UniProcExecutor.
# With TP=1, vLLM defaults to UniProcExecutor which runs scheduler and worker in the same
# process. This causes a hot loop in _process_engine_step that doesn't release the GIL,
# blocking NIXL's add_remote_agent from completing. Using "mp" backend forces separate
# processes, avoiding the GIL contention.
# Note: Only apply for NIXL - other connectors (kvbm, lmcache) work fine with UniProcExecutor
# and forcing mp can expose race conditions in vLLM's scheduler.
# See: https://github.com/vllm-project/vllm/issues/29369
connector_list
=
[
c
.
lower
()
for
c
in
args
.
connector
]
if
args
.
connector
else
[]
uses_nixl
=
"nixl"
in
connector_list
tp_size
=
getattr
(
engine_args
,
"tensor_parallel_size"
,
None
)
or
1
if
uses_nixl
and
tp_size
==
1
and
engine_args
.
distributed_executor_backend
is
None
:
logger
.
info
(
"Setting --distributed-executor-backend=mp for TP=1 to avoid "
"UniProcExecutor GIL contention with NIXL connector"
)
engine_args
.
distributed_executor_backend
=
"mp"
if
engine_args
.
enable_prefix_caching
is
None
:
logger
.
debug
(
"--enable-prefix-caching or --no-enable-prefix-caching not specified. Defaulting to True (vLLM v1 default behavior)"
...
...
container/Dockerfile.vllm
View file @
60feb955
...
...
@@ -11,17 +11,18 @@ ARG PYTHON_VERSION
ARG ENABLE_KVBM
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.
8.1
-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.
8
"
ARG RUNTIME_IMAGE_TAG="12.
9.0
-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.
9
"
# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="v0.11.0"
# FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
ARG FLASHINF_REF="v0.3.1"
ARG TORCH_BACKEND="cu128"
ARG VLLM_REF="v0.11.2"
# FlashInfer Ref used to install flashinfer-cubin and flashinfer-jit-cache
ARG FLASHINF_REF="v0.5.2"
# If left blank, then we will fallback to vLLM defaults
ARG DEEPGEMM_REF=""
# LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility
ARG LMCACHE_REF="0.3.9.post2"
# sccache configuration - inherit from base build
ARG USE_SCCACHE
...
...
@@ -110,7 +111,7 @@ ARG VLLM_REF
ARG VLLM_GIT_URL
ARG DEEPGEMM_REF
ARG FLASHINF_REF
ARG
TORCH_BACKEND
ARG
LMCACHE_REF
ARG CUDA_VERSION
ARG MAX_JOBS=16
...
...
@@ -144,7 +145,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
chmod +x /tmp/install_vllm.sh && \
/tmp/install_vllm.sh
--editable
--vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"}
--torch-backend $TORCH_BACKEND
--cuda-version $CUDA_VERSION && \
/tmp/install_vllm.sh --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"}
${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"}
--cuda-version $CUDA_VERSION && \
/tmp/use-sccache.sh show-stats "vLLM";
ENV LD_LIBRARY_PATH=\
...
...
@@ -236,7 +237,7 @@ RUN apt-get update && \
# prometheus dependencies
ca-certificates \
# DeepGemm uses 'cuobjdump' which does not come with CUDA image
cuda-command-line-tools-12-
8
&& \
cuda-command-line-tools-12-
9
&& \
rm -rf /var/lib/apt/lists/*
USER dynamo
...
...
container/build.sh
View file @
60feb955
...
...
@@ -106,7 +106,7 @@ VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# Please check https://github.com/ai-dynamo/dynamo/pull/1065
# for details and reproducer to manually test if the image
# can be updated to later versions.
VLLM_BASE_IMAGE_TAG
=
"25.0
1
-cuda12.
8
-devel-ubuntu24.04"
VLLM_BASE_IMAGE_TAG
=
"25.0
4
-cuda12.
9
-devel-ubuntu24.04"
NONE_BASE_IMAGE
=
"nvcr.io/nvidia/cuda-dl-base"
NONE_BASE_IMAGE_TAG
=
"25.01-cuda12.8-devel-ubuntu24.04"
...
...
container/deps/vllm/install_vllm.sh
View file @
60feb955
...
...
@@ -2,18 +2,16 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# This script is used to install vLLM and its dependencies
# If installing vLLM from a release tag, we will use pip to manage the install
# Otherwise, we will use git to checkout the vLLM source code and build it from source.
# The dependencies are installed in the following order:
# 1. vLLM
# 2. LMCache
# This script installs vLLM and its dependencies from PyPI (release versions only).
# Installation order:
# 1. LMCache (installed first so vLLM's dependencies take precedence)
# 2. vLLM
# 3. DeepGEMM
# 4. EP kernels
set
-euo
pipefail
VLLM_REF
=
"v0.11.
0
"
VLLM_REF
=
"v0.11.
2
"
# Basic Configurations
ARCH
=
$(
uname
-m
)
...
...
@@ -21,34 +19,19 @@ MAX_JOBS=16
INSTALLATION_DIR
=
/tmp
# VLLM and Dependency Configurations
TORCH_BACKEND
=
"cu128"
TORCH_CUDA_ARCH_LIST
=
"9.0;10.0"
# For EP Kernels
DEEPGEMM_REF
=
""
CUDA_VERSION
=
"12.8"
# For DEEPGEMM
# These flags are applicable when installing vLLM from source code
EDITABLE
=
true
VLLM_GIT_URL
=
"https://github.com/vllm-project/vllm.git"
FLASHINF_REF
=
"v0.3.1"
CUDA_VERSION
=
"12.9"
FLASHINF_REF
=
"v0.5.2"
# LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility
LMCACHE_REF
=
"0.3.9.post2"
while
[[
$#
-gt
0
]]
;
do
case
$1
in
--editable
)
EDITABLE
=
true
shift
;;
--no-editable
)
EDITABLE
=
false
shift
;;
--vllm-ref
)
VLLM_REF
=
"
$2
"
shift
2
;;
--vllm-git-url
)
VLLM_GIT_URL
=
"
$2
"
shift
2
;;
--max-jobs
)
MAX_JOBS
=
"
$2
"
shift
2
...
...
@@ -69,8 +52,8 @@ while [[ $# -gt 0 ]]; do
FLASHINF_REF
=
"
$2
"
shift
2
;;
--
torch-backend
)
TORCH_BACKEND
=
"
$2
"
--
lmcache-ref
)
LMCACHE_REF
=
"
$2
"
shift
2
;;
--torch-cuda-arch-list
)
...
...
@@ -82,19 +65,17 @@ while [[ $# -gt 0 ]]; do
shift
2
;;
-h
|
--help
)
echo
"Usage:
$0
[--editable|--no-editable]
[--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--
torch-backend BACKEND
] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo
"Usage:
$0
[--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--
lmcache-ref REF
] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo
"Options:"
echo
" --editable Install vllm in editable mode (default)"
echo
" --no-editable Install vllm in non-editable mode"
echo
" --vllm-ref REF Git reference to checkout (default:
${
VLLM_REF
}
)"
echo
" --max-jobs NUM Maximum number of parallel jobs (default:
${
MAX_JOBS
}
)"
echo
" --arch ARCH Architecture (amd64|arm64, default: auto-detect)"
echo
" --installation-dir DIR Directory to install vllm (default:
${
INSTALLATION_DIR
}
)"
echo
" --deepgemm-ref REF Git reference for DeepGEMM (default:
${
DEEPGEMM_REF
}
)"
echo
" --flashinf-ref REF Git reference for Flash Infer (default:
${
FLASHINF_REF
}
)"
echo
" --torch-backend BACKEND Torch backend to use (default:
${
TORCH_BACKEND
}
)"
echo
" --torch-cuda-arch-list LIST CUDA architectures to compile for (default:
${
TORCH_CUDA_ARCH_LIST
}
)"
echo
" --cuda-version VERSION CUDA version to use (default:
${
CUDA_VERSION
}
)"
echo
" --vllm-ref REF vLLM release version (default:
${
VLLM_REF
}
)"
echo
" --max-jobs NUM Maximum parallel jobs (default:
${
MAX_JOBS
}
)"
echo
" --arch ARCH Architecture amd64|arm64 (default: auto-detect)"
echo
" --installation-dir DIR Install directory (default:
${
INSTALLATION_DIR
}
)"
echo
" --deepgemm-ref REF DeepGEMM git ref (default:
${
DEEPGEMM_REF
}
)"
echo
" --flashinf-ref REF FlashInfer version (default:
${
FLASHINF_REF
}
)"
echo
" --lmcache-ref REF LMCache version (default:
${
LMCACHE_REF
}
)"
echo
" --torch-cuda-arch-list LIST CUDA architectures (default:
${
TORCH_CUDA_ARCH_LIST
}
)"
echo
" --cuda-version VERSION CUDA version (default:
${
CUDA_VERSION
}
)"
exit
0
;;
*
)
...
...
@@ -114,119 +95,43 @@ fi
export
MAX_JOBS
=
$MAX_JOBS
export
CUDA_HOME
=
/usr/local/cuda
# Derive torch backend from CUDA version (e.g., "12.9" -> "cu129")
TORCH_BACKEND
=
"cu
$(
echo
$CUDA_VERSION
|
tr
-d
'.'
)
"
echo
"=== Installing prerequisites ==="
uv pip
install
pip cuda-python
echo
"
\n
=== Configuration Summary ==="
echo
" VLLM_REF=
$VLLM_REF
| EDITABLE=
$EDITABLE
| ARCH=
$ARCH
"
echo
" MAX_JOBS=
$MAX_JOBS
| TORCH_BACKEND=
$TORCH_BACKEND
| CUDA_VERSION=
$CUDA_VERSION
"
echo
" TORCH_CUDA_ARCH_LIST=
$TORCH_CUDA_ARCH_LIST
"
echo
" DEEPGEMM_REF=
$DEEPGEMM_REF
| FLASHINF_REF=
$FLASHINF_REF
"
echo
" INSTALLATION_DIR=
$INSTALLATION_DIR
| VLLM_GIT_URL=
$VLLM_GIT_URL
"
echo
" VLLM_REF=
$VLLM_REF
| ARCH=
$ARCH
| CUDA_VERSION=
$CUDA_VERSION
| TORCH_BACKEND=
$TORCH_BACKEND
"
echo
" FLASHINF_REF=
$FLASHINF_REF
| LMCACHE_REF=
$LMCACHE_REF
| DEEPGEMM_REF=
$DEEPGEMM_REF
"
echo
" TORCH_CUDA_ARCH_LIST=
$TORCH_CUDA_ARCH_LIST
| INSTALLATION_DIR=
$INSTALLATION_DIR
"
echo
"
\n
=== Installing LMCache ==="
if
[
"
$ARCH
"
=
"amd64"
]
;
then
# LMCache installation currently fails on arm64 due to CUDA dependency issues
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
uv pip
install
lmcache
==
${
LMCACHE_REF
}
--torch-backend
=
${
TORCH_BACKEND
}
echo
"✓ LMCache
${
LMCACHE_REF
}
installed"
else
echo
"⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi
echo
"
\n
=== Cloning vLLM repository ==="
#
W
e need
to clone to install dependencie
s
#
Clon
e need
ed
for DeepGEMM and EP kernels install script
s
cd
$INSTALLATION_DIR
git clone
$VLLM_GIT_URL
vllm
git clone
https://github.com/vllm-project/vllm.git
vllm
cd
vllm
git checkout
$VLLM_REF
# TODO leave this here in case we need to do cherry-picks in future
# GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064
echo
"
\n
=== Installing vLLM & FlashInfer ==="
echo
"Installing vLLM
$VLLM_REF
from PyPI..."
if
[[
$VLLM_REF
=
~ ^v
]]
&&
{
[
"
$ARCH
"
=
"amd64"
]
||
{
[
"
$ARCH
"
=
"arm64"
]
&&
[
"
$TORCH_BACKEND
"
=
"cu129"
]
;
}
;
}
;
then
# VLLM_REF starts with 'v' and either amd64, or arm64 with cu129 backend - use PyPI install
echo
"Installing vLLM
$VLLM_REF
from PyPI... (ARCH=
$ARCH
, TORCH_BACKEND=
$TORCH_BACKEND
)"
uv pip
install
vllm[flashinfer]
==
$VLLM_REF
--torch-backend
=
$TORCH_BACKEND
else
# VLLM_REF does not start with 'v' or amd64 - use git checkout path
if
[
"
$ARCH
"
=
"arm64"
]
;
then
# torch 2.8.0 doesn't have a aarch wheel for cu128, vLLM uses torch 2.8.0 nightly wheel builds to compile its aarch wheel against
# nightly can be unstable so we will not use it here
# for now we will use torch 2.7.1+cu128 but this requires a recompilation from source
echo
"Building vLLM from source for ARM64 architecture..."
# Try to install specific PyTorch version first
echo
"Attempting to install pinned PyTorch nightly versions..."
if
!
uv pip
install
torch
==
2.7.1+cu128
torchaudio
==
2.7.1
torchvision
==
0.22.1
--index-url
https://download.pytorch.org/whl/cu128
;
then
echo
"Pinned versions failed"
exit
1
fi
# Create constraints file to pin all PyTorch-related versions
echo
"Creating constraints file to preserve PyTorch ecosystem versions..."
TORCH_VERSION
=
$(
python
-c
"import torch; print(torch.__version__)"
)
TORCHAUDIO_VERSION
=
$(
python
-c
"import torchaudio; print(torchaudio.__version__)"
)
TORCHVISION_VERSION
=
$(
python
-c
"import torchvision; print(torchvision.__version__)"
)
rm
-rf
/tmp/torch_constraints.txt
echo
"torch==
$TORCH_VERSION
"
>
/tmp/torch_constraints.txt
echo
"torchaudio==
$TORCHAUDIO_VERSION
"
>>
/tmp/torch_constraints.txt
echo
"torchvision==
$TORCHVISION_VERSION
"
>>
/tmp/torch_constraints.txt
echo
"Pinned versions:"
echo
" - torch==
$TORCH_VERSION
"
echo
" - torchaudio==
$TORCHAUDIO_VERSION
"
echo
" - torchvision==
$TORCHVISION_VERSION
"
python use_existing_torch.py
uv pip
install
-c
/tmp/torch_constraints.txt
-r
requirements/build.txt
if
[
"
$EDITABLE
"
=
"true"
]
;
then
MAX_JOBS
=
${
MAX_JOBS
}
uv pip
install
--no-build-isolation
-c
/tmp/torch_constraints.txt
-e
.
-v
else
MAX_JOBS
=
${
MAX_JOBS
}
uv pip
install
--no-build-isolation
-c
/tmp/torch_constraints.txt
.
-v
fi
echo
"
\n
=== Installing FlashInfer from source ==="
cd
$INSTALLATION_DIR
git clone https://github.com/flashinfer-ai/flashinfer.git
--recursive
cd
flashinfer
git checkout
$FLASHINF_REF
# Install with constraints to prevent PyTorch upgrade
uv pip
install
-v
--no-build-isolation
-c
/tmp/torch_constraints.txt
.
else
echo
"Building vLLM from source for AMD64 architecture..."
# When updating above VLLM_REF make sure precompiled wheel file URL is correct. Run this command:
# aws s3 ls s3://vllm-wheels/${VLLM_REF}/ --region us-west-2 --no-sign-request
export
VLLM_PRECOMPILED_WHEEL_LOCATION
=
"https://vllm-wheels.s3.us-west-2.amazonaws.com/
${
VLLM_REF
}
/vllm-0.10.2-cp38-abi3-manylinux1_x86_64.whl"
if
[
"
$EDITABLE
"
=
"true"
]
;
then
uv pip
install
-e
.
--torch-backend
=
$TORCH_BACKEND
else
uv pip
install
.
--torch-backend
=
$TORCH_BACKEND
fi
echo
"
\n
=== Installing FlashInfer from PyPI ==="
uv pip
install
flashinfer-python
==
$FLASHINF_REF
fi
fi
uv pip
install
vllm[flashinfer]
==
$VLLM_REF
--torch-backend
=
${
TORCH_BACKEND
}
uv pip
install
flashinfer-cubin
==
$FLASHINF_REF
uv pip
install
flashinfer-jit-cache
==
$FLASHINF_REF
--extra-index-url
https://flashinfer.ai/whl/
${
TORCH_BACKEND
}
echo
"✓ vLLM installation completed"
echo
"
\n
=== Installing LMCache ==="
if
[
"
$ARCH
"
=
"amd64"
]
;
then
# LMCache installation currently fails on arm64 due to CUDA dependency issues:
# OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
# TODO: Re-enable for arm64 after verifying lmcache compatibility and resolving the build issue.
# Alec: Likely lmcache was compiled witha different version of torch and need to install it from source for arm64
uv pip
install
lmcache
==
0.3.7
echo
"✓ LMCache installed"
else
echo
"⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi
echo
"
\n
=== Installing DeepGEMM ==="
cd
$INSTALLATION_DIR
/vllm/tools
...
...
@@ -239,6 +144,7 @@ echo "✓ DeepGEMM installation completed"
echo
"
\n
=== Installing EP Kernels (PPLX and DeepEP) ==="
cd
ep_kernels/
# TODO we will be able to specify which pplx and deepep commit we want in future
TORCH_CUDA_ARCH_LIST
=
"
$TORCH_CUDA_ARCH_LIST
"
bash install_python_libraries.sh
echo
"
\n
✅ All installations completed successfully!"
examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py
View file @
60feb955
...
...
@@ -8,7 +8,6 @@ On the server side, run one of the following commands:
vLLM OpenAI API server
vllm serve <your_model> \
--swap-space 16 \
--disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
...
...
examples/backends/vllm/deploy/agg_kvbm.yaml
View file @
60feb955
...
...
@@ -42,7 +42,6 @@ spec:
-
Qwen/Qwen3-8B
-
--gpu-memory-utilization
-
"
0.45"
-
--disable-log-requests
-
--max-model-len
-
"
32000"
-
--enforce-eager
...
...
examples/backends/vllm/deploy/disagg_kvbm.yaml
View file @
60feb955
...
...
@@ -35,7 +35,6 @@ spec:
-
Qwen/Qwen3-8B
-
--gpu-memory-utilization
-
"
0.3"
-
--disable-log-requests
-
--max-model-len
-
"
32000"
-
--enforce-eager
...
...
@@ -68,7 +67,6 @@ spec:
-
--is-prefill-worker
-
--gpu-memory-utilization
-
"
0.3"
-
--disable-log-requests
-
--max-model-len
-
"
32000"
-
--enforce-eager
...
...
examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml
View file @
60feb955
...
...
@@ -35,7 +35,6 @@ spec:
-
Qwen/Qwen3-8B
-
--gpu-memory-utilization
-
"
0.3"
-
--disable-log-requests
-
--max-model-len
-
"
32000"
-
--enforce-eager
...
...
@@ -68,7 +67,6 @@ spec:
-
--is-prefill-worker
-
--gpu-memory-utilization
-
"
0.3"
-
--disable-log-requests
-
--max-model-len
-
"
32000"
-
--enforce-eager
...
...
examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml
View file @
60feb955
...
...
@@ -37,7 +37,6 @@ spec:
-
Qwen/Qwen3-8B
-
--gpu-memory-utilization
-
"
0.23"
-
--disable-log-requests
-
--max-model-len
-
"
32000"
-
--enforce-eager
...
...
@@ -72,7 +71,6 @@ spec:
-
--is-prefill-worker
-
--gpu-memory-utilization
-
"
0.23"
-
--disable-log-requests
-
--max-model-len
-
"
32000"
-
--enforce-eager
...
...
lib/bindings/kvbm/python/kvbm/vllm_integration/connector/dynamo_connector.py
View file @
60feb955
...
...
@@ -23,6 +23,7 @@ if TYPE_CHECKING:
from
vllm.config
import
VllmConfig
from
vllm.forward_context
import
ForwardContext
from
vllm.v1.core.kv_cache_manager
import
KVCacheBlocks
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
from
vllm.v1.request
import
Request
...
...
@@ -40,8 +41,15 @@ class DynamoConnectorMetadata(KVConnectorMetadata):
class
DynamoConnector
(
KVConnectorBase_V1
):
def
__init__
(
self
,
vllm_config
:
"VllmConfig"
,
role
:
KVConnectorRole
):
super
().
__init__
(
vllm_config
=
vllm_config
,
role
=
role
)
def
__init__
(
self
,
vllm_config
:
"VllmConfig"
,
role
:
KVConnectorRole
,
kv_cache_config
:
Optional
[
"KVCacheConfig"
]
=
None
,
):
super
().
__init__
(
vllm_config
=
vllm_config
,
role
=
role
,
kv_cache_config
=
kv_cache_config
)
assert
vllm_config
.
kv_transfer_config
is
not
None
assert
vllm_config
.
kv_transfer_config
.
engine_id
is
not
None
...
...
@@ -90,13 +98,19 @@ class DynamoConnector(KVConnectorBase_V1):
def
register_kv_caches
(
self
,
kv_caches
:
dict
[
str
,
torch
.
Tensor
]):
self
.
_worker
.
register_kv_caches
(
kv_caches
)
@
override
def
bind_connector_metadata
(
self
,
connector_metadata
:
DynamoConnectorMetadata
)
->
None
:
# Must call super() to set _connector_metadata so has_connector_metadata() returns True
# This is required for save_kv_layer to be called during the forward pass
super
().
bind_connector_metadata
(
connector_metadata
)
assert
isinstance
(
connector_metadata
.
metadata
,
bytes
)
self
.
_worker
.
bind_connector_metadata
(
connector_metadata
.
metadata
)
@
override
def
clear_connector_metadata
(
self
)
->
None
:
super
().
clear_connector_metadata
()
self
.
_worker
.
clear_connector_metadata
()
@
override
...
...
lib/bindings/kvbm/python/kvbm/vllm_integration/connector/pd_connector.py
View file @
60feb955
...
...
@@ -29,6 +29,7 @@ if TYPE_CHECKING:
LMCacheConnectorV1
,
)
from
vllm.v1.core.kv_cache_manager
import
KVCacheBlocks
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
from
vllm.v1.request
import
Request
...
...
@@ -46,8 +47,15 @@ class PdConnector(MultiConnector):
- The second connector must be NIXL and will be used by decode worker to get KV blocks from prefill worker.
"""
def
__init__
(
self
,
vllm_config
:
"VllmConfig"
,
role
:
KVConnectorRole
):
super
().
__init__
(
vllm_config
=
vllm_config
,
role
=
role
)
def
__init__
(
self
,
vllm_config
:
"VllmConfig"
,
role
:
KVConnectorRole
,
kv_cache_config
:
"KVCacheConfig"
,
):
super
().
__init__
(
vllm_config
=
vllm_config
,
role
=
role
,
kv_cache_config
=
kv_cache_config
)
if
len
(
self
.
_connectors
)
!=
2
:
raise
ValueError
(
f
"PdConnector requires exactly two connectors (got
{
len
(
self
.
_connectors
)
}
)"
...
...
lib/bindings/kvbm/python/kvbm/vllm_integration/connector_worker.py
View file @
60feb955
...
...
@@ -14,7 +14,7 @@ from kvbm.utils import is_dyn_runtime_enabled
from
vllm.config
import
VllmConfig
from
vllm.distributed.kv_transfer.kv_connector.v1.base
import
KVConnectorMetadata
from
vllm.model_executor.models.utils
import
extract_layer_index
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
vllm.utils
.torch_utils
import
STR_DTYPE_TO_TORCH_DTYPE
if
TYPE_CHECKING
:
from
vllm.attention.backends.abstract
import
AttentionMetadata
...
...
lib/bindings/kvbm/src/block_manager/vllm/connector/leader.rs
View file @
60feb955
...
...
@@ -526,22 +526,33 @@ impl Leader for KvConnectorLeader {
// remove the request from the inflight requests
self
.inflight_requests
.remove
(
&
request_id
);
// if the slot has finished, we can return false to vllm, indicating all gpu blocks are free to be reused
// otherwise, we return true, which means there are still outstanding operations on gpu blocks which
// must be awaited before the gpu blocks can be reused. if we return true, then it is the worker side
// of the connector api which will be used to inform vllm that the request is finished.
// Return value semantics:
// - `false`: Tells vLLM all GPU blocks are free and the request can be fully cleaned up.
// vLLM will immediately remove the request from its internal hash table.
// - `true`: Tells vLLM there are outstanding async operations on GPU blocks.
// The worker side of the connector API will later call `finish_requests()`
// to notify vLLM when the request is truly complete.
//
// TODO(jthomson04): This is a temporary fix to ensure vLLM 0.11.2 compatibility.
// IMPORTANT: We must ALWAYS return `true` here, even when the slot is already Finished.
//
// Why? If we return `false`, vLLM removes the request from `self.requests` immediately.
// However, our worker connector may still report completion later via `finish_requests()`.
// When that happens, vLLM's scheduler.py has an assertion `req_id in self.requests`
// that will fail because the request was already removed from the hash table.
//
// By always returning `true`, we ensure vLLM keeps the request in its hash table until
// our worker explicitly signals completion, avoiding the race condition.
//
// If the slot is already Finished (no pending operations), we clean it up from our side
// but still return `true` so vLLM waits for the worker's completion signal.
if
let
SlotState
::
Finished
=
slot
.state
()
{
// All operations complete - safe to remove slot and tell vLLM blocks are free
self
.slot_manager
()
.remove_slot
(
&
request_id
)
?
;
Ok
(
false
)
}
else
{
debug_assert!
(
matches!
(
slot
.state
(),
SlotState
::
Finishing
));
// Still has pending operations - keep slot alive for worker to process
// Don't remove slot here. Worker needs it to process the finish event.
// Worker will remove it after verifying all operations are complete.
// The lock on the slot prevents new operations from being created in offload_blocks()
Ok
(
true
)
}
Ok
(
true
)
}
fn
has_slot
(
&
self
,
request_id
:
String
)
->
bool
{
...
...
lib/bindings/kvbm/src/block_manager/vllm/connector/worker.rs
View file @
60feb955
...
...
@@ -278,11 +278,6 @@ impl Worker for KvConnectorWorker {
self
.maybe_finished_onboarding
.insert
(
request_id
);
}
// delay offloading operations until the end of the forward pass
debug_assert!
(
self
.offloading_operations
.is_empty
(),
"offloading operations should be empty"
);
self
.offloading_operations
=
offloading_operations
;
Ok
(())
...
...
@@ -304,15 +299,34 @@ impl Worker for KvConnectorWorker {
/// Trigger block-wise completion signals after the last layer.
fn
save_kv_layer
(
&
mut
self
,
_
layer_name
:
String
)
->
anyhow
::
Result
<
()
>
{
self
.layers_complete
+=
1
;
tracing
::
debug!
(
iteration
=
self
.iteration
,
layers_complete
=
self
.layers_complete
,
total_layers
=
self
.kv_cache_layers
.len
(),
pending_offload_ops
=
self
.offloading_operations
.len
(),
"save_kv_layer called"
);
if
self
.layers_complete
==
self
.kv_cache_layers
.len
()
{
let
offloading_operations
=
std
::
mem
::
take
(
&
mut
self
.offloading_operations
);
tracing
::
info!
(
iteration
=
self
.iteration
,
num_operations
=
offloading_operations
.len
(),
"All layers complete, enqueuing {} offload operations"
,
offloading_operations
.len
()
);
// block on the completion of the last layer
// todo(ryan): capture the context, pass this to the scheduler to do the await on another thread
// or put the event on a stream and use stream waits to keep it all on device.
event_sync_blocking
(
self
.layer_events
[
self
.layers_complete
-
1
]);
for
operation
in
offloading_operations
{
self
.connector
.enqueue_request
(
operation
);
for
operation
in
&
offloading_operations
{
tracing
::
debug!
(
request_id
=
%
operation
.request_id
,
operation_id
=
%
operation
.uuid
,
"Enqueuing offload operation to scheduler"
);
self
.connector
.enqueue_request
(
operation
.clone
());
}
}
Ok
(())
...
...
pyproject.toml
View file @
60feb955
...
...
@@ -56,7 +56,7 @@ trtllm =[
vllm
=
[
"uvloop"
,
"nixl[cu12]<=0.7.1"
,
"vllm[flashinfer]==0.11.
0
"
,
"vllm[flashinfer]==0.11.
2
"
,
]
sglang
=
[
...
...
recipes/llama-3-70b/vllm/agg/deploy.yaml
View file @
60feb955
...
...
@@ -43,7 +43,7 @@ spec:
-
name
:
HF_HOME
value
:
/opt/models
args
:
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
4
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
4
--data-parallel-size
1
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
View file @
60feb955
...
...
@@ -43,7 +43,7 @@ spec:
-
name
:
HF_HOME
value
:
/opt/models
args
:
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
8
--data-parallel-size
1
--disable-log-requests
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
8
--data-parallel-size
1
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
@@ -74,7 +74,7 @@ spec:
-
name
:
HF_HOME
value
:
/opt/models
args
:
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
8
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
8
--data-parallel-size
1
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
View file @
60feb955
...
...
@@ -55,7 +55,7 @@ spec:
-
name
:
HF_HOME
value
:
/opt/models
args
:
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
2
--data-parallel-size
1
--disable-log-requests
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
2
--data-parallel-size
1
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
@@ -98,7 +98,7 @@ spec:
-
name
:
HF_HOME
value
:
/opt/models
args
:
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
4
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
4
--data-parallel-size
1
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
tests/dependencies/test_vllm_imports.py
0 → 100644
View file @
60feb955
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Unit tests to sanity check that required dependencies can be imported."""
import
pytest
@pytest.mark.vllm
@pytest.mark.unit
@pytest.mark.gpu_1
def test_import_deep_ep():
    """Sanity check that the ``deep_ep`` package is importable."""
    try:
        import deep_ep
    except ImportError as err:
        # Surface a missing or broken install as an explicit test failure
        # that carries the underlying import error text.
        pytest.fail(f"Failed to import deep_ep: {err}")
    else:
        assert deep_ep is not None
@pytest.mark.vllm
@pytest.mark.unit
@pytest.mark.gpu_1
def test_import_pplx_kernels():
    """Sanity check that the ``pplx_kernels`` package is importable."""
    try:
        import pplx_kernels
    except ImportError as err:
        # Surface a missing or broken install as an explicit test failure
        # that carries the underlying import error text.
        pytest.fail(f"Failed to import pplx_kernels: {err}")
    else:
        assert pplx_kernels is not None
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment