Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
60feb955
"lib/llm/vscode:/vscode.git/clone" did not exist on "c8276cd28b3b12f4a7779784f57cd952d4d74da3"
Unverified
Commit
60feb955
authored
Dec 03, 2025
by
Karen Chung
Committed by
GitHub
Dec 03, 2025
Browse files
chore: bump vLLM to 0.11.2 (#4476)
parent
c5e8c4c2
Changes
23
Hide whitespace changes
Inline
Side-by-side
Showing
20 changed files
with
183 additions
and
187 deletions
+183
-187
.github/actions/pytest/action.yml
.github/actions/pytest/action.yml
+1
-1
components/src/dynamo/vllm/args.py
components/src/dynamo/vllm/args.py
+18
-0
container/Dockerfile.vllm
container/Dockerfile.vllm
+10
-9
container/build.sh
container/build.sh
+2
-2
container/deps/vllm/install_vllm.sh
container/deps/vllm/install_vllm.sh
+44
-138
examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py
...kends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py
+0
-1
examples/backends/vllm/deploy/agg_kvbm.yaml
examples/backends/vllm/deploy/agg_kvbm.yaml
+0
-1
examples/backends/vllm/deploy/disagg_kvbm.yaml
examples/backends/vllm/deploy/disagg_kvbm.yaml
+0
-2
examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml
examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml
+0
-2
examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml
examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml
+0
-2
lib/bindings/kvbm/python/kvbm/vllm_integration/connector/dynamo_connector.py
...ython/kvbm/vllm_integration/connector/dynamo_connector.py
+16
-2
lib/bindings/kvbm/python/kvbm/vllm_integration/connector/pd_connector.py
...bm/python/kvbm/vllm_integration/connector/pd_connector.py
+10
-2
lib/bindings/kvbm/python/kvbm/vllm_integration/connector_worker.py
...ngs/kvbm/python/kvbm/vllm_integration/connector_worker.py
+1
-1
lib/bindings/kvbm/src/block_manager/vllm/connector/leader.rs
lib/bindings/kvbm/src/block_manager/vllm/connector/leader.rs
+22
-11
lib/bindings/kvbm/src/block_manager/vllm/connector/worker.rs
lib/bindings/kvbm/src/block_manager/vllm/connector/worker.rs
+21
-7
pyproject.toml
pyproject.toml
+1
-1
recipes/llama-3-70b/vllm/agg/deploy.yaml
recipes/llama-3-70b/vllm/agg/deploy.yaml
+1
-1
recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
+2
-2
recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
+2
-2
tests/dependencies/test_vllm_imports.py
tests/dependencies/test_vllm_imports.py
+32
-0
No files found.
.github/actions/pytest/action.yml
View file @
60feb955
...
...
@@ -139,4 +139,4 @@ runs:
path
:
|
test-results/pytest_test_report_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.xml
test-results/test_metadata_${{ inputs.framework }}_${{ env.STR_TEST_TYPE }}_${{ inputs.platform_arch }}.json
retention-days
:
7
\ No newline at end of file
retention-days
:
7
components/src/dynamo/vllm/args.py
View file @
60feb955
...
...
@@ -207,6 +207,24 @@ def parse_args() -> Config:
args
=
parser
.
parse_args
()
engine_args
=
AsyncEngineArgs
.
from_cli_args
(
args
)
# Workaround for vLLM GIL contention bug with NIXL connector when using UniProcExecutor.
# With TP=1, vLLM defaults to UniProcExecutor which runs scheduler and worker in the same
# process. This causes a hot loop in _process_engine_step that doesn't release the GIL,
# blocking NIXL's add_remote_agent from completing. Using "mp" backend forces separate
# processes, avoiding the GIL contention.
# Note: Only apply for NIXL - other connectors (kvbm, lmcache) work fine with UniProcExecutor
# and forcing mp can expose race conditions in vLLM's scheduler.
# See: https://github.com/vllm-project/vllm/issues/29369
connector_list
=
[
c
.
lower
()
for
c
in
args
.
connector
]
if
args
.
connector
else
[]
uses_nixl
=
"nixl"
in
connector_list
tp_size
=
getattr
(
engine_args
,
"tensor_parallel_size"
,
None
)
or
1
if
uses_nixl
and
tp_size
==
1
and
engine_args
.
distributed_executor_backend
is
None
:
logger
.
info
(
"Setting --distributed-executor-backend=mp for TP=1 to avoid "
"UniProcExecutor GIL contention with NIXL connector"
)
engine_args
.
distributed_executor_backend
=
"mp"
if
engine_args
.
enable_prefix_caching
is
None
:
logger
.
debug
(
"--enable-prefix-caching or --no-enable-prefix-caching not specified. Defaulting to True (vLLM v1 default behavior)"
...
...
container/Dockerfile.vllm
View file @
60feb955
...
...
@@ -11,17 +11,18 @@ ARG PYTHON_VERSION
ARG ENABLE_KVBM
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.
8.1
-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.
8
"
ARG RUNTIME_IMAGE_TAG="12.
9.0
-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.
9
"
# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="v0.11.0"
# FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
ARG FLASHINF_REF="v0.3.1"
ARG TORCH_BACKEND="cu128"
ARG VLLM_REF="v0.11.2"
# FlashInfer Ref used to install flashinfer-cubin and flashinfer-jit-cache
ARG FLASHINF_REF="v0.5.2"
# If left blank, then we will fallback to vLLM defaults
ARG DEEPGEMM_REF=""
# LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility
ARG LMCACHE_REF="0.3.9.post2"
# sccache configuration - inherit from base build
ARG USE_SCCACHE
...
...
@@ -110,7 +111,7 @@ ARG VLLM_REF
ARG VLLM_GIT_URL
ARG DEEPGEMM_REF
ARG FLASHINF_REF
ARG
TORCH_BACKEND
ARG
LMCACHE_REF
ARG CUDA_VERSION
ARG MAX_JOBS=16
...
...
@@ -144,7 +145,7 @@ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
export SCCACHE_S3_KEY_PREFIX=${SCCACHE_S3_KEY_PREFIX:-${ARCH}} && \
cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
chmod +x /tmp/install_vllm.sh && \
/tmp/install_vllm.sh
--editable
--vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"}
--torch-backend $TORCH_BACKEND
--cuda-version $CUDA_VERSION && \
/tmp/install_vllm.sh --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt ${DEEPGEMM_REF:+--deepgemm-ref "$DEEPGEMM_REF"} ${FLASHINF_REF:+--flashinf-ref "$FLASHINF_REF"}
${LMCACHE_REF:+--lmcache-ref "$LMCACHE_REF"}
--cuda-version $CUDA_VERSION && \
/tmp/use-sccache.sh show-stats "vLLM";
ENV LD_LIBRARY_PATH=\
...
...
@@ -236,7 +237,7 @@ RUN apt-get update && \
# prometheus dependencies
ca-certificates \
# DeepGemm uses 'cuobjdump' which does not come with CUDA image
cuda-command-line-tools-12-
8
&& \
cuda-command-line-tools-12-
9
&& \
rm -rf /var/lib/apt/lists/*
USER dynamo
...
...
container/build.sh
View file @
60feb955
...
...
@@ -106,7 +106,7 @@ VLLM_BASE_IMAGE="nvcr.io/nvidia/cuda-dl-base"
# Please check https://github.com/ai-dynamo/dynamo/pull/1065
# for details and reproducer to manually test if the image
# can be updated to later versions.
VLLM_BASE_IMAGE_TAG
=
"25.0
1
-cuda12.
8
-devel-ubuntu24.04"
VLLM_BASE_IMAGE_TAG
=
"25.0
4
-cuda12.
9
-devel-ubuntu24.04"
NONE_BASE_IMAGE
=
"nvcr.io/nvidia/cuda-dl-base"
NONE_BASE_IMAGE_TAG
=
"25.01-cuda12.8-devel-ubuntu24.04"
...
...
@@ -989,4 +989,4 @@ elif [[ "${LOCAL_DEV_BUILD:-}" == "true" ]]; then
fi
{
set
+x
;
}
2>/dev/null
\ No newline at end of file
{
set
+x
;
}
2>/dev/null
container/deps/vllm/install_vllm.sh
View file @
60feb955
...
...
@@ -2,18 +2,16 @@
# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# This script is used to install vLLM and its dependencies
# If installing vLLM from a release tag, we will use pip to manage the install
# Otherwise, we will use git to checkout the vLLM source code and build it from source.
# The dependencies are installed in the following order:
# 1. vLLM
# 2. LMCache
# This script installs vLLM and its dependencies from PyPI (release versions only).
# Installation order:
# 1. LMCache (installed first so vLLM's dependencies take precedence)
# 2. vLLM
# 3. DeepGEMM
# 4. EP kernels
set
-euo
pipefail
VLLM_REF
=
"v0.11.
0
"
VLLM_REF
=
"v0.11.
2
"
# Basic Configurations
ARCH
=
$(
uname
-m
)
...
...
@@ -21,34 +19,19 @@ MAX_JOBS=16
INSTALLATION_DIR
=
/tmp
# VLLM and Dependency Configurations
TORCH_BACKEND
=
"cu128"
TORCH_CUDA_ARCH_LIST
=
"9.0;10.0"
# For EP Kernels
DEEPGEMM_REF
=
""
CUDA_VERSION
=
"12.8"
# For DEEPGEMM
# These flags are applicable when installing vLLM from source code
EDITABLE
=
true
VLLM_GIT_URL
=
"https://github.com/vllm-project/vllm.git"
FLASHINF_REF
=
"v0.3.1"
CUDA_VERSION
=
"12.9"
FLASHINF_REF
=
"v0.5.2"
# LMCache version - 0.3.9+ required for vLLM 0.11.2 compatibility
LMCACHE_REF
=
"0.3.9.post2"
while
[[
$#
-gt
0
]]
;
do
case
$1
in
--editable
)
EDITABLE
=
true
shift
;;
--no-editable
)
EDITABLE
=
false
shift
;;
--vllm-ref
)
VLLM_REF
=
"
$2
"
shift
2
;;
--vllm-git-url
)
VLLM_GIT_URL
=
"
$2
"
shift
2
;;
--max-jobs
)
MAX_JOBS
=
"
$2
"
shift
2
...
...
@@ -69,8 +52,8 @@ while [[ $# -gt 0 ]]; do
FLASHINF_REF
=
"
$2
"
shift
2
;;
--
torch-backend
)
TORCH_BACKEND
=
"
$2
"
--
lmcache-ref
)
LMCACHE_REF
=
"
$2
"
shift
2
;;
--torch-cuda-arch-list
)
...
...
@@ -82,19 +65,17 @@ while [[ $# -gt 0 ]]; do
shift
2
;;
-h
|
--help
)
echo
"Usage:
$0
[--editable|--no-editable]
[--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--
torch-backend BACKEND
] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo
"Usage:
$0
[--vllm-ref REF] [--max-jobs NUM] [--arch ARCH] [--deepgemm-ref REF] [--flashinf-ref REF] [--
lmcache-ref REF
] [--torch-cuda-arch-list LIST] [--cuda-version VERSION]"
echo
"Options:"
echo
" --editable Install vllm in editable mode (default)"
echo
" --no-editable Install vllm in non-editable mode"
echo
" --vllm-ref REF Git reference to checkout (default:
${
VLLM_REF
}
)"
echo
" --max-jobs NUM Maximum number of parallel jobs (default:
${
MAX_JOBS
}
)"
echo
" --arch ARCH Architecture (amd64|arm64, default: auto-detect)"
echo
" --installation-dir DIR Directory to install vllm (default:
${
INSTALLATION_DIR
}
)"
echo
" --deepgemm-ref REF Git reference for DeepGEMM (default:
${
DEEPGEMM_REF
}
)"
echo
" --flashinf-ref REF Git reference for Flash Infer (default:
${
FLASHINF_REF
}
)"
echo
" --torch-backend BACKEND Torch backend to use (default:
${
TORCH_BACKEND
}
)"
echo
" --torch-cuda-arch-list LIST CUDA architectures to compile for (default:
${
TORCH_CUDA_ARCH_LIST
}
)"
echo
" --cuda-version VERSION CUDA version to use (default:
${
CUDA_VERSION
}
)"
echo
" --vllm-ref REF vLLM release version (default:
${
VLLM_REF
}
)"
echo
" --max-jobs NUM Maximum parallel jobs (default:
${
MAX_JOBS
}
)"
echo
" --arch ARCH Architecture amd64|arm64 (default: auto-detect)"
echo
" --installation-dir DIR Install directory (default:
${
INSTALLATION_DIR
}
)"
echo
" --deepgemm-ref REF DeepGEMM git ref (default:
${
DEEPGEMM_REF
}
)"
echo
" --flashinf-ref REF FlashInfer version (default:
${
FLASHINF_REF
}
)"
echo
" --lmcache-ref REF LMCache version (default:
${
LMCACHE_REF
}
)"
echo
" --torch-cuda-arch-list LIST CUDA architectures (default:
${
TORCH_CUDA_ARCH_LIST
}
)"
echo
" --cuda-version VERSION CUDA version (default:
${
CUDA_VERSION
}
)"
exit
0
;;
*
)
...
...
@@ -114,119 +95,43 @@ fi
export
MAX_JOBS
=
$MAX_JOBS
export
CUDA_HOME
=
/usr/local/cuda
# Derive torch backend from CUDA version (e.g., "12.9" -> "cu129")
TORCH_BACKEND
=
"cu
$(
echo
$CUDA_VERSION
|
tr
-d
'.'
)
"
echo
"=== Installing prerequisites ==="
uv pip
install
pip cuda-python
echo
"
\n
=== Configuration Summary ==="
echo
" VLLM_REF=
$VLLM_REF
| EDITABLE=
$EDITABLE
| ARCH=
$ARCH
"
echo
" MAX_JOBS=
$MAX_JOBS
| TORCH_BACKEND=
$TORCH_BACKEND
| CUDA_VERSION=
$CUDA_VERSION
"
echo
" TORCH_CUDA_ARCH_LIST=
$TORCH_CUDA_ARCH_LIST
"
echo
" DEEPGEMM_REF=
$DEEPGEMM_REF
| FLASHINF_REF=
$FLASHINF_REF
"
echo
" INSTALLATION_DIR=
$INSTALLATION_DIR
| VLLM_GIT_URL=
$VLLM_GIT_URL
"
echo
" VLLM_REF=
$VLLM_REF
| ARCH=
$ARCH
| CUDA_VERSION=
$CUDA_VERSION
| TORCH_BACKEND=
$TORCH_BACKEND
"
echo
" FLASHINF_REF=
$FLASHINF_REF
| LMCACHE_REF=
$LMCACHE_REF
| DEEPGEMM_REF=
$DEEPGEMM_REF
"
echo
" TORCH_CUDA_ARCH_LIST=
$TORCH_CUDA_ARCH_LIST
| INSTALLATION_DIR=
$INSTALLATION_DIR
"
echo
"
\n
=== Installing LMCache ==="
if
[
"
$ARCH
"
=
"amd64"
]
;
then
# LMCache installation currently fails on arm64 due to CUDA dependency issues
# Install LMCache BEFORE vLLM so vLLM's dependencies take precedence
uv pip
install
lmcache
==
${
LMCACHE_REF
}
--torch-backend
=
${
TORCH_BACKEND
}
echo
"✓ LMCache
${
LMCACHE_REF
}
installed"
else
echo
"⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi
echo
"
\n
=== Cloning vLLM repository ==="
#
W
e need
to clone to install dependencie
s
#
Clon
e need
ed
for DeepGEMM and EP kernels install script
s
cd
$INSTALLATION_DIR
git clone
$VLLM_GIT_URL
vllm
git clone
https://github.com/vllm-project/vllm.git
vllm
cd
vllm
git checkout
$VLLM_REF
# TODO leave this here in case we need to do cherry-picks in future
# GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064
echo
"
\n
=== Installing vLLM & FlashInfer ==="
echo
"Installing vLLM
$VLLM_REF
from PyPI..."
if
[[
$VLLM_REF
=
~ ^v
]]
&&
{
[
"
$ARCH
"
=
"amd64"
]
||
{
[
"
$ARCH
"
=
"arm64"
]
&&
[
"
$TORCH_BACKEND
"
=
"cu129"
]
;
}
;
}
;
then
# VLLM_REF starts with 'v' and either amd64, or arm64 with cu129 backend - use PyPI install
echo
"Installing vLLM
$VLLM_REF
from PyPI... (ARCH=
$ARCH
, TORCH_BACKEND=
$TORCH_BACKEND
)"
uv pip
install
vllm[flashinfer]
==
$VLLM_REF
--torch-backend
=
$TORCH_BACKEND
else
# VLLM_REF does not start with 'v' or amd64 - use git checkout path
if
[
"
$ARCH
"
=
"arm64"
]
;
then
# torch 2.8.0 doesn't have a aarch wheel for cu128, vLLM uses torch 2.8.0 nightly wheel builds to compile its aarch wheel against
# nightly can be unstable so we will not use it here
# for now we will use torch 2.7.1+cu128 but this requires a recompilation from source
echo
"Building vLLM from source for ARM64 architecture..."
# Try to install specific PyTorch version first
echo
"Attempting to install pinned PyTorch nightly versions..."
if
!
uv pip
install
torch
==
2.7.1+cu128
torchaudio
==
2.7.1
torchvision
==
0.22.1
--index-url
https://download.pytorch.org/whl/cu128
;
then
echo
"Pinned versions failed"
exit
1
fi
# Create constraints file to pin all PyTorch-related versions
echo
"Creating constraints file to preserve PyTorch ecosystem versions..."
TORCH_VERSION
=
$(
python
-c
"import torch; print(torch.__version__)"
)
TORCHAUDIO_VERSION
=
$(
python
-c
"import torchaudio; print(torchaudio.__version__)"
)
TORCHVISION_VERSION
=
$(
python
-c
"import torchvision; print(torchvision.__version__)"
)
rm
-rf
/tmp/torch_constraints.txt
echo
"torch==
$TORCH_VERSION
"
>
/tmp/torch_constraints.txt
echo
"torchaudio==
$TORCHAUDIO_VERSION
"
>>
/tmp/torch_constraints.txt
echo
"torchvision==
$TORCHVISION_VERSION
"
>>
/tmp/torch_constraints.txt
echo
"Pinned versions:"
echo
" - torch==
$TORCH_VERSION
"
echo
" - torchaudio==
$TORCHAUDIO_VERSION
"
echo
" - torchvision==
$TORCHVISION_VERSION
"
python use_existing_torch.py
uv pip
install
-c
/tmp/torch_constraints.txt
-r
requirements/build.txt
if
[
"
$EDITABLE
"
=
"true"
]
;
then
MAX_JOBS
=
${
MAX_JOBS
}
uv pip
install
--no-build-isolation
-c
/tmp/torch_constraints.txt
-e
.
-v
else
MAX_JOBS
=
${
MAX_JOBS
}
uv pip
install
--no-build-isolation
-c
/tmp/torch_constraints.txt
.
-v
fi
echo
"
\n
=== Installing FlashInfer from source ==="
cd
$INSTALLATION_DIR
git clone https://github.com/flashinfer-ai/flashinfer.git
--recursive
cd
flashinfer
git checkout
$FLASHINF_REF
# Install with constraints to prevent PyTorch upgrade
uv pip
install
-v
--no-build-isolation
-c
/tmp/torch_constraints.txt
.
else
echo
"Building vLLM from source for AMD64 architecture..."
# When updating above VLLM_REF make sure precompiled wheel file URL is correct. Run this command:
# aws s3 ls s3://vllm-wheels/${VLLM_REF}/ --region us-west-2 --no-sign-request
export
VLLM_PRECOMPILED_WHEEL_LOCATION
=
"https://vllm-wheels.s3.us-west-2.amazonaws.com/
${
VLLM_REF
}
/vllm-0.10.2-cp38-abi3-manylinux1_x86_64.whl"
if
[
"
$EDITABLE
"
=
"true"
]
;
then
uv pip
install
-e
.
--torch-backend
=
$TORCH_BACKEND
else
uv pip
install
.
--torch-backend
=
$TORCH_BACKEND
fi
echo
"
\n
=== Installing FlashInfer from PyPI ==="
uv pip
install
flashinfer-python
==
$FLASHINF_REF
fi
fi
uv pip
install
vllm[flashinfer]
==
$VLLM_REF
--torch-backend
=
${
TORCH_BACKEND
}
uv pip
install
flashinfer-cubin
==
$FLASHINF_REF
uv pip
install
flashinfer-jit-cache
==
$FLASHINF_REF
--extra-index-url
https://flashinfer.ai/whl/
${
TORCH_BACKEND
}
echo
"✓ vLLM installation completed"
echo
"
\n
=== Installing LMCache ==="
if
[
"
$ARCH
"
=
"amd64"
]
;
then
# LMCache installation currently fails on arm64 due to CUDA dependency issues:
# OSError: CUDA_HOME environment variable is not set. Please set it to your CUDA install root.
# TODO: Re-enable for arm64 after verifying lmcache compatibility and resolving the build issue.
# Alec: Likely lmcache was compiled witha different version of torch and need to install it from source for arm64
uv pip
install
lmcache
==
0.3.7
echo
"✓ LMCache installed"
else
echo
"⚠ Skipping LMCache on ARM64 (compatibility issues)"
fi
echo
"
\n
=== Installing DeepGEMM ==="
cd
$INSTALLATION_DIR
/vllm/tools
...
...
@@ -239,6 +144,7 @@ echo "✓ DeepGEMM installation completed"
echo
"
\n
=== Installing EP Kernels (PPLX and DeepEP) ==="
cd
ep_kernels/
# TODO we will be able to specify which pplx and deepep commit we want in future
TORCH_CUDA_ARCH_LIST
=
"
$TORCH_CUDA_ARCH_LIST
"
bash install_python_libraries.sh
echo
"
\n
✅ All installations completed successfully!"
examples/backends/sglang/slurm_jobs/scripts/vllm/benchmark_serving.py
View file @
60feb955
...
...
@@ -8,7 +8,6 @@ On the server side, run one of the following commands:
vLLM OpenAI API server
vllm serve <your_model> \
--swap-space 16 \
--disable-log-requests
(TGI backend)
./launch_tgi_server.sh <your_model> <max_batch_total_tokens>
...
...
examples/backends/vllm/deploy/agg_kvbm.yaml
View file @
60feb955
...
...
@@ -42,7 +42,6 @@ spec:
-
Qwen/Qwen3-8B
-
--gpu-memory-utilization
-
"
0.45"
-
--disable-log-requests
-
--max-model-len
-
"
32000"
-
--enforce-eager
...
...
examples/backends/vllm/deploy/disagg_kvbm.yaml
View file @
60feb955
...
...
@@ -35,7 +35,6 @@ spec:
-
Qwen/Qwen3-8B
-
--gpu-memory-utilization
-
"
0.3"
-
--disable-log-requests
-
--max-model-len
-
"
32000"
-
--enforce-eager
...
...
@@ -68,7 +67,6 @@ spec:
-
--is-prefill-worker
-
--gpu-memory-utilization
-
"
0.3"
-
--disable-log-requests
-
--max-model-len
-
"
32000"
-
--enforce-eager
...
...
examples/backends/vllm/deploy/disagg_kvbm_2p2d.yaml
View file @
60feb955
...
...
@@ -35,7 +35,6 @@ spec:
-
Qwen/Qwen3-8B
-
--gpu-memory-utilization
-
"
0.3"
-
--disable-log-requests
-
--max-model-len
-
"
32000"
-
--enforce-eager
...
...
@@ -68,7 +67,6 @@ spec:
-
--is-prefill-worker
-
--gpu-memory-utilization
-
"
0.3"
-
--disable-log-requests
-
--max-model-len
-
"
32000"
-
--enforce-eager
...
...
examples/backends/vllm/deploy/disagg_kvbm_tp2.yaml
View file @
60feb955
...
...
@@ -37,7 +37,6 @@ spec:
-
Qwen/Qwen3-8B
-
--gpu-memory-utilization
-
"
0.23"
-
--disable-log-requests
-
--max-model-len
-
"
32000"
-
--enforce-eager
...
...
@@ -72,7 +71,6 @@ spec:
-
--is-prefill-worker
-
--gpu-memory-utilization
-
"
0.23"
-
--disable-log-requests
-
--max-model-len
-
"
32000"
-
--enforce-eager
...
...
lib/bindings/kvbm/python/kvbm/vllm_integration/connector/dynamo_connector.py
View file @
60feb955
...
...
@@ -23,6 +23,7 @@ if TYPE_CHECKING:
from
vllm.config
import
VllmConfig
from
vllm.forward_context
import
ForwardContext
from
vllm.v1.core.kv_cache_manager
import
KVCacheBlocks
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
from
vllm.v1.request
import
Request
...
...
@@ -40,8 +41,15 @@ class DynamoConnectorMetadata(KVConnectorMetadata):
class
DynamoConnector
(
KVConnectorBase_V1
):
def
__init__
(
self
,
vllm_config
:
"VllmConfig"
,
role
:
KVConnectorRole
):
super
().
__init__
(
vllm_config
=
vllm_config
,
role
=
role
)
def
__init__
(
self
,
vllm_config
:
"VllmConfig"
,
role
:
KVConnectorRole
,
kv_cache_config
:
Optional
[
"KVCacheConfig"
]
=
None
,
):
super
().
__init__
(
vllm_config
=
vllm_config
,
role
=
role
,
kv_cache_config
=
kv_cache_config
)
assert
vllm_config
.
kv_transfer_config
is
not
None
assert
vllm_config
.
kv_transfer_config
.
engine_id
is
not
None
...
...
@@ -90,13 +98,19 @@ class DynamoConnector(KVConnectorBase_V1):
def
register_kv_caches
(
self
,
kv_caches
:
dict
[
str
,
torch
.
Tensor
]):
self
.
_worker
.
register_kv_caches
(
kv_caches
)
@
override
def
bind_connector_metadata
(
self
,
connector_metadata
:
DynamoConnectorMetadata
)
->
None
:
# Must call super() to set _connector_metadata so has_connector_metadata() returns True
# This is required for save_kv_layer to be called during the forward pass
super
().
bind_connector_metadata
(
connector_metadata
)
assert
isinstance
(
connector_metadata
.
metadata
,
bytes
)
self
.
_worker
.
bind_connector_metadata
(
connector_metadata
.
metadata
)
@
override
def
clear_connector_metadata
(
self
)
->
None
:
super
().
clear_connector_metadata
()
self
.
_worker
.
clear_connector_metadata
()
@
override
...
...
lib/bindings/kvbm/python/kvbm/vllm_integration/connector/pd_connector.py
View file @
60feb955
...
...
@@ -29,6 +29,7 @@ if TYPE_CHECKING:
LMCacheConnectorV1
,
)
from
vllm.v1.core.kv_cache_manager
import
KVCacheBlocks
from
vllm.v1.kv_cache_interface
import
KVCacheConfig
from
vllm.v1.request
import
Request
...
...
@@ -46,8 +47,15 @@ class PdConnector(MultiConnector):
- The second connector must be NIXL and will be used by decode worker to get KV blocks from prefill worker.
"""
def
__init__
(
self
,
vllm_config
:
"VllmConfig"
,
role
:
KVConnectorRole
):
super
().
__init__
(
vllm_config
=
vllm_config
,
role
=
role
)
def
__init__
(
self
,
vllm_config
:
"VllmConfig"
,
role
:
KVConnectorRole
,
kv_cache_config
:
"KVCacheConfig"
,
):
super
().
__init__
(
vllm_config
=
vllm_config
,
role
=
role
,
kv_cache_config
=
kv_cache_config
)
if
len
(
self
.
_connectors
)
!=
2
:
raise
ValueError
(
f
"PdConnector requires exactly two connectors (got
{
len
(
self
.
_connectors
)
}
)"
...
...
lib/bindings/kvbm/python/kvbm/vllm_integration/connector_worker.py
View file @
60feb955
...
...
@@ -14,7 +14,7 @@ from kvbm.utils import is_dyn_runtime_enabled
from
vllm.config
import
VllmConfig
from
vllm.distributed.kv_transfer.kv_connector.v1.base
import
KVConnectorMetadata
from
vllm.model_executor.models.utils
import
extract_layer_index
from
vllm.utils
import
STR_DTYPE_TO_TORCH_DTYPE
from
vllm.utils
.torch_utils
import
STR_DTYPE_TO_TORCH_DTYPE
if
TYPE_CHECKING
:
from
vllm.attention.backends.abstract
import
AttentionMetadata
...
...
lib/bindings/kvbm/src/block_manager/vllm/connector/leader.rs
View file @
60feb955
...
...
@@ -526,22 +526,33 @@ impl Leader for KvConnectorLeader {
// remove the request from the inflight requests
self
.inflight_requests
.remove
(
&
request_id
);
// if the slot has finished, we can return false to vllm, indicating all gpu blocks are free to be reused
// otherwise, we return true, which means there are still outstanding operations on gpu blocks which
// must be awaited before the gpu blocks can be reused. if we return true, then it is the worker side
// of the connector api which will be used to inform vllm that the request is finished.
// Return value semantics:
// - `false`: Tells vLLM all GPU blocks are free and the request can be fully cleaned up.
// vLLM will immediately remove the request from its internal hash table.
// - `true`: Tells vLLM there are outstanding async operations on GPU blocks.
// The worker side of the connector API will later call `finish_requests()`
// to notify vLLM when the request is truly complete.
//
// TODO(jthomson04): This is a temporary fix to ensure vLLM 0.11.2 compatibility.
// IMPORTANT: We must ALWAYS return `true` here, even when the slot is already Finished.
//
// Why? If we return `false`, vLLM removes the request from `self.requests` immediately.
// However, our worker connector may still report completion later via `finish_requests()`.
// When that happens, vLLM's scheduler.py has an assertion `req_id in self.requests`
// that will fail because the request was already removed from the hash table.
//
// By always returning `true`, we ensure vLLM keeps the request in its hash table until
// our worker explicitly signals completion, avoiding the race condition.
//
// If the slot is already Finished (no pending operations), we clean it up from our side
// but still return `true` so vLLM waits for the worker's completion signal.
if
let
SlotState
::
Finished
=
slot
.state
()
{
// All operations complete - safe to remove slot and tell vLLM blocks are free
self
.slot_manager
()
.remove_slot
(
&
request_id
)
?
;
Ok
(
false
)
}
else
{
debug_assert!
(
matches!
(
slot
.state
(),
SlotState
::
Finishing
));
// Still has pending operations - keep slot alive for worker to process
// Don't remove slot here. Worker needs it to process the finish event.
// Worker will remove it after verifying all operations are complete.
// The lock on the slot prevents new operations from being created in offload_blocks()
Ok
(
true
)
}
Ok
(
true
)
}
fn
has_slot
(
&
self
,
request_id
:
String
)
->
bool
{
...
...
lib/bindings/kvbm/src/block_manager/vllm/connector/worker.rs
View file @
60feb955
...
...
@@ -278,11 +278,6 @@ impl Worker for KvConnectorWorker {
self
.maybe_finished_onboarding
.insert
(
request_id
);
}
// delay offloading operations until the end of the forward pass
debug_assert!
(
self
.offloading_operations
.is_empty
(),
"offloading operations should be empty"
);
self
.offloading_operations
=
offloading_operations
;
Ok
(())
...
...
@@ -304,15 +299,34 @@ impl Worker for KvConnectorWorker {
/// Trigger block-wise completion signals after the last layer.
fn
save_kv_layer
(
&
mut
self
,
_
layer_name
:
String
)
->
anyhow
::
Result
<
()
>
{
self
.layers_complete
+=
1
;
tracing
::
debug!
(
iteration
=
self
.iteration
,
layers_complete
=
self
.layers_complete
,
total_layers
=
self
.kv_cache_layers
.len
(),
pending_offload_ops
=
self
.offloading_operations
.len
(),
"save_kv_layer called"
);
if
self
.layers_complete
==
self
.kv_cache_layers
.len
()
{
let
offloading_operations
=
std
::
mem
::
take
(
&
mut
self
.offloading_operations
);
tracing
::
info!
(
iteration
=
self
.iteration
,
num_operations
=
offloading_operations
.len
(),
"All layers complete, enqueuing {} offload operations"
,
offloading_operations
.len
()
);
// block on the completion of the last layer
// todo(ryan): capture the context, pass this to the scheduler to do the await on another thread
// or put the event on a stream and use stream waits to keep it all on device.
event_sync_blocking
(
self
.layer_events
[
self
.layers_complete
-
1
]);
for
operation
in
offloading_operations
{
self
.connector
.enqueue_request
(
operation
);
for
operation
in
&
offloading_operations
{
tracing
::
debug!
(
request_id
=
%
operation
.request_id
,
operation_id
=
%
operation
.uuid
,
"Enqueuing offload operation to scheduler"
);
self
.connector
.enqueue_request
(
operation
.clone
());
}
}
Ok
(())
...
...
pyproject.toml
View file @
60feb955
...
...
@@ -56,7 +56,7 @@ trtllm =[
vllm
=
[
"uvloop"
,
"nixl[cu12]<=0.7.1"
,
"vllm[flashinfer]==0.11.
0
"
,
"vllm[flashinfer]==0.11.
2
"
,
]
sglang
=
[
...
...
recipes/llama-3-70b/vllm/agg/deploy.yaml
View file @
60feb955
...
...
@@ -43,7 +43,7 @@ spec:
-
name
:
HF_HOME
value
:
/opt/models
args
:
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
4
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
4
--data-parallel-size
1
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
recipes/llama-3-70b/vllm/disagg-multi-node/deploy.yaml
View file @
60feb955
...
...
@@ -43,7 +43,7 @@ spec:
-
name
:
HF_HOME
value
:
/opt/models
args
:
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
8
--data-parallel-size
1
--disable-log-requests
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
8
--data-parallel-size
1
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
@@ -74,7 +74,7 @@ spec:
-
name
:
HF_HOME
value
:
/opt/models
args
:
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
8
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
8
--data-parallel-size
1
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
recipes/llama-3-70b/vllm/disagg-single-node/deploy.yaml
View file @
60feb955
...
...
@@ -55,7 +55,7 @@ spec:
-
name
:
HF_HOME
value
:
/opt/models
args
:
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
2
--data-parallel-size
1
--disable-log-requests
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
2
--data-parallel-size
1
--is-prefill-worker
--gpu-memory-utilization
0.95
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
@@ -98,7 +98,7 @@ spec:
-
name
:
HF_HOME
value
:
/opt/models
args
:
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
4
--data-parallel-size
1
--disable-log-requests
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
-
"
python3
-m
dynamo.vllm
--model
$MODEL_PATH
--served-model-name
$SERVED_MODEL_NAME
--tensor-parallel-size
4
--data-parallel-size
1
--gpu-memory-utilization
0.90
--no-enable-prefix-caching
--block-size
128"
command
:
-
/bin/sh
-
-c
...
...
tests/dependencies/test_vllm_imports.py
0 → 100644
View file @
60feb955
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
"""Unit tests to sanity check that required dependencies can be imported."""
import
pytest
@pytest.mark.vllm
@pytest.mark.unit
@pytest.mark.gpu_1
def test_import_deep_ep():
    """Sanity check: the ``deep_ep`` package is importable in this environment."""
    try:
        import deep_ep
    except ImportError as e:
        # Report an import problem as an explicit test failure that carries
        # the underlying ImportError text.
        pytest.fail(f"Failed to import deep_ep: {e}")
    else:
        # Import succeeded; confirm the name is bound to a real module object.
        assert deep_ep is not None
@pytest.mark.vllm
@pytest.mark.unit
@pytest.mark.gpu_1
def test_import_pplx_kernels():
    """Sanity check: the ``pplx_kernels`` package is importable in this environment."""
    try:
        import pplx_kernels
    except ImportError as e:
        # Report an import problem as an explicit test failure that carries
        # the underlying ImportError text.
        pytest.fail(f"Failed to import pplx_kernels: {e}")
    else:
        # Import succeeded; confirm the name is bound to a real module object.
        assert pplx_kernels is not None
Prev
1
2
Next
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment