Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
dynamo
Commits
90dc7589
Unverified
Commit
90dc7589
authored
Oct 12, 2025
by
Alec
Committed by
GitHub
Oct 12, 2025
Browse files
chore: bump vllm to 0.11.0 (#3422)
Signed-off-by:
alec-flowers
<
aflowers@nvidia.com
>
parent
60975b51
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
38 additions
and
18 deletions
+38
-18
container/Dockerfile.vllm
container/Dockerfile.vllm
+2
-2
container/deps/vllm/install_vllm.sh
container/deps/vllm/install_vllm.sh
+5
-7
examples/multimodal/utils/protocol.py
examples/multimodal/utils/protocol.py
+2
-1
lib/bindings/python/src/dynamo/llm/vllm_integration/connector_leader.py
...ython/src/dynamo/llm/vllm_integration/connector_leader.py
+3
-1
tests/conftest.py
tests/conftest.py
+15
-0
tests/fault_tolerance/cancellation/test_vllm.py
tests/fault_tolerance/cancellation/test_vllm.py
+2
-2
tests/fault_tolerance/test_request_migration.py
tests/fault_tolerance/test_request_migration.py
+3
-1
tests/frontend/reasoning_effort/test_reasoning_effort.py
tests/frontend/reasoning_effort/test_reasoning_effort.py
+3
-1
tests/kvbm/test_determinism_agg.py
tests/kvbm/test_determinism_agg.py
+1
-1
tests/kvbm/test_determinism_disagg.py
tests/kvbm/test_determinism_disagg.py
+2
-2
No files found.
container/Dockerfile.vllm
View file @
90dc7589
...
@@ -15,9 +15,9 @@ ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
...
@@ -15,9 +15,9 @@ ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.8"
ARG CUDA_VERSION="12.8"
# Make sure to update the dependency version in pyproject.toml when updating this
# Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="v0.10.2"
ARG VLLM_REF="v0.11.0"
# FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
# FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
ARG FLASHINF_REF="v0.3.0"
ARG FLASHINF_REF="v0.3.1"
ARG TORCH_BACKEND="cu128"
ARG TORCH_BACKEND="cu128"
# If left blank, then we will fallback to vLLM defaults
# If left blank, then we will fallback to vLLM defaults
...
...
container/deps/vllm/install_vllm.sh
View file @
90dc7589
...
@@ -13,7 +13,7 @@
...
@@ -13,7 +13,7 @@
set
-euo
pipefail
set
-euo
pipefail
VLLM_REF="v0.10.2"
VLLM_REF="v0.11.0"
# Basic Configurations
# Basic Configurations
ARCH
=
$(
uname
-m
)
ARCH
=
$(
uname
-m
)
...
@@ -29,7 +29,7 @@ CUDA_VERSION="12.8" # For DEEPGEMM
...
@@ -29,7 +29,7 @@ CUDA_VERSION="12.8" # For DEEPGEMM
# These flags are applicable when installing vLLM from source code
# These flags are applicable when installing vLLM from source code
EDITABLE
=
true
EDITABLE
=
true
VLLM_GIT_URL
=
"https://github.com/vllm-project/vllm.git"
VLLM_GIT_URL
=
"https://github.com/vllm-project/vllm.git"
FLASHINF_REF="v0.3.0"
FLASHINF_REF="v0.3.1"
while
[[
$#
-gt
0
]]
;
do
while
[[
$#
-gt
0
]]
;
do
case
$1
in
case
$1
in
...
@@ -131,10 +131,8 @@ git clone $VLLM_GIT_URL vllm
...
@@ -131,10 +131,8 @@ git clone $VLLM_GIT_URL vllm
cd
vllm
cd
vllm
git checkout
$VLLM_REF
git checkout
$VLLM_REF
# TODO remove in future vLLM release, re-instate ignore torch script
# TODO leave this here in case we need to do cherry-picks in future
# https://github.com/vllm-project/vllm/pull/24729
# GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064
GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064
echo
"
\n
=== Installing vLLM & FlashInfer ==="
echo
"
\n
=== Installing vLLM & FlashInfer ==="
...
@@ -243,4 +241,4 @@ echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
...
@@ -243,4 +241,4 @@ echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
cd
ep_kernels/
cd
ep_kernels/
TORCH_CUDA_ARCH_LIST
=
"
$TORCH_CUDA_ARCH_LIST
"
bash install_python_libraries.sh
TORCH_CUDA_ARCH_LIST
=
"
$TORCH_CUDA_ARCH_LIST
"
bash install_python_libraries.sh
echo
"
\n
✅ All installations completed successfully!"
echo
"
\n
✅ All installations completed successfully!"
\ No newline at end of file
examples/multimodal/utils/protocol.py
View file @
90dc7589
...
@@ -22,10 +22,11 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator
...
@@ -22,10 +22,11 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator
from
pydantic_core
import
core_schema
from
pydantic_core
import
core_schema
from
typing_extensions
import
NotRequired
from
typing_extensions
import
NotRequired
from
vllm.inputs.data
import
TokensPrompt
from
vllm.inputs.data
import
TokensPrompt
from
vllm.logprobs
import
PromptLogprobs
from
vllm.multimodal.inputs
import
MultiModalUUIDDict
# noqa: F401
from
vllm.multimodal.inputs
import
MultiModalUUIDDict
# noqa: F401
from
vllm.outputs
import
CompletionOutput
from
vllm.outputs
import
CompletionOutput
from
vllm.sampling_params
import
SamplingParams
from
vllm.sampling_params
import
SamplingParams
from
vllm.sequence
import
PromptLogprobs
,
RequestMetrics
from
vllm.sequence
import
RequestMetrics
import
dynamo.nixl_connect
as
connect
import
dynamo.nixl_connect
as
connect
...
...
lib/bindings/python/src/dynamo/llm/vllm_integration/connector_leader.py
View file @
90dc7589
...
@@ -192,7 +192,9 @@ class KvConnectorLeader:
...
@@ -192,7 +192,9 @@ class KvConnectorLeader:
if
self
.
_connector
.
has_slot
(
request
.
request_id
):
if
self
.
_connector
.
has_slot
(
request
.
request_id
):
return
None
return
None
if
bool
(
request
.
mm_positions
):
if
bool
(
getattr
(
request
,
"mm_features"
,
None
))
or
bool
(
getattr
(
request
,
"mm_positions"
,
None
)
):
raise
ValueError
(
"Unsupported request - requires mm extra keys"
)
raise
ValueError
(
"Unsupported request - requires mm extra keys"
)
all_token_ids
=
request
.
all_token_ids
all_token_ids
=
request
.
all_token_ids
...
...
tests/conftest.py
View file @
90dc7589
...
@@ -40,6 +40,21 @@ logging.basicConfig(
...
@@ -40,6 +40,21 @@ logging.basicConfig(
)
)
@
pytest
.
fixture
()
def
set_ucx_tls_no_mm
():
"""Set UCX env defaults for all tests."""
mp
=
pytest
.
MonkeyPatch
()
# CI note:
# - Affected test: tests/fault_tolerance/cancellation/test_vllm.py::test_request_cancellation_vllm_decode_cancel
# - Symptom on L40 CI: UCX/NIXL mm transport assertion during worker init
# (uct_mem.c:482: mem.memh != UCT_MEM_HANDLE_NULL) when two workers
# start on the same node (maybe a shared-memory segment collision/limits).
# - Mitigation: disable UCX "mm" shared-memory transport globally for tests
mp
.
setenv
(
"UCX_TLS"
,
"^mm"
)
yield
mp
.
undo
()
def
download_models
(
model_list
=
None
,
ignore_weights
=
False
):
def
download_models
(
model_list
=
None
,
ignore_weights
=
False
):
"""Download models - can be called directly or via fixture
"""Download models - can be called directly or via fixture
...
...
tests/fault_tolerance/cancellation/test_vllm.py
View file @
90dc7589
...
@@ -193,7 +193,7 @@ def test_request_cancellation_vllm_aggregated(
...
@@ -193,7 +193,7 @@ def test_request_cancellation_vllm_aggregated(
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
def
test_request_cancellation_vllm_decode_cancel
(
def
test_request_cancellation_vllm_decode_cancel
(
request
,
runtime_services
,
predownload_models
request
,
runtime_services
,
predownload_models
,
set_ucx_tls_no_mm
):
):
"""
"""
End-to-end test for request cancellation during decode phase.
End-to-end test for request cancellation during decode phase.
...
@@ -266,7 +266,7 @@ def test_request_cancellation_vllm_decode_cancel(
...
@@ -266,7 +266,7 @@ def test_request_cancellation_vllm_decode_cancel(
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
def
test_request_cancellation_vllm_remote_prefill_cancel
(
def
test_request_cancellation_vllm_remote_prefill_cancel
(
request
,
runtime_services
,
predownload_models
request
,
runtime_services
,
predownload_models
,
set_ucx_tls_no_mm
):
):
"""
"""
End-to-end test for request cancellation during remote prefill phase.
End-to-end test for request cancellation during remote prefill phase.
...
...
tests/fault_tolerance/test_request_migration.py
View file @
90dc7589
...
@@ -290,7 +290,9 @@ def verify_migration_occurred(frontend_process: DynamoFrontendProcess) -> None:
...
@@ -290,7 +290,9 @@ def verify_migration_occurred(frontend_process: DynamoFrontendProcess) -> None:
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
gpu_1
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
e2e
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
@
pytest
.
mark
.
model
(
FAULT_TOLERANCE_MODEL_NAME
)
def
test_request_migration_vllm
(
request
,
runtime_services
,
predownload_models
):
def
test_request_migration_vllm
(
request
,
runtime_services
,
predownload_models
,
set_ucx_tls_no_mm
):
"""
"""
End-to-end test for worker fault tolerance with migration support.
End-to-end test for worker fault tolerance with migration support.
...
...
tests/frontend/reasoning_effort/test_reasoning_effort.py
View file @
90dc7589
...
@@ -58,6 +58,8 @@ class GPTOSSWorkerProcess(ManagedProcess):
...
@@ -58,6 +58,8 @@ class GPTOSSWorkerProcess(ManagedProcess):
"dynamo.vllm"
,
"dynamo.vllm"
,
"--model"
,
"--model"
,
REASONING_TEST_MODEL
,
REASONING_TEST_MODEL
,
"--connector"
,
"none"
,
# skip nixl registration, noticing long startup times in CI. Potentially a bug...
"--enforce-eager"
,
"--enforce-eager"
,
"--dyn-tool-call-parser"
,
"--dyn-tool-call-parser"
,
"harmony"
,
"harmony"
,
...
@@ -85,7 +87,7 @@ class GPTOSSWorkerProcess(ManagedProcess):
...
@@ -85,7 +87,7 @@ class GPTOSSWorkerProcess(ManagedProcess):
(
"http://localhost:8000/v1/models"
,
check_models_api
),
(
"http://localhost:8000/v1/models"
,
check_models_api
),
(
"http://localhost:8083/health"
,
self
.
is_ready
),
(
"http://localhost:8083/health"
,
self
.
is_ready
),
],
],
timeout=300,
timeout=500,
display_output
=
True
,
display_output
=
True
,
terminate_existing
=
False
,
terminate_existing
=
False
,
stragglers
=
[
"VLLM::EngineCore"
],
stragglers
=
[
"VLLM::EngineCore"
],
...
...
tests/kvbm/test_determinism_agg.py
View file @
90dc7589
...
@@ -111,7 +111,7 @@ class LLMServerManager:
...
@@ -111,7 +111,7 @@ class LLMServerManager:
"--kv-transfer-config"
,
"--kv-transfer-config"
,
'{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}'
,
'{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}'
,
os
.
environ
.
get
(
"KVBM_MODEL_ID"
,
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
),
os
.
environ
.
get
(
"KVBM_MODEL_ID"
,
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
),
"--max-seq-len",
"--max-model-len",
"8000"
,
# required to fit on L4 GPU when using 8b model
"8000"
,
# required to fit on L4 GPU when using 8b model
]
]
...
...
tests/kvbm/test_determinism_disagg.py
View file @
90dc7589
...
@@ -132,7 +132,7 @@ class LLMServerManager:
...
@@ -132,7 +132,7 @@ class LLMServerManager:
os
.
environ
.
get
(
"KVBM_MODEL_ID"
,
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
),
os
.
environ
.
get
(
"KVBM_MODEL_ID"
,
"deepseek-ai/DeepSeek-R1-Distill-Llama-8B"
),
"--block-size"
,
"--block-size"
,
"16"
,
"16"
,
"--max-seq-len",
"--max-model-len",
"8000"
,
# required to fit on L4 GPU when using 8b model
"8000"
,
# required to fit on L4 GPU when using 8b model
"--connector"
,
"--connector"
,
"nixl"
,
"nixl"
,
...
@@ -148,7 +148,7 @@ class LLMServerManager:
...
@@ -148,7 +148,7 @@ class LLMServerManager:
"--is-prefill-worker"
,
"--is-prefill-worker"
,
"--block-size"
,
"--block-size"
,
"16"
,
"16"
,
"--max-seq-len",
"--max-model-len",
"8000"
,
# required to fit on L4 GPU when using 8b model
"8000"
,
# required to fit on L4 GPU when using 8b model
"--connector"
,
"--connector"
,
"kvbm"
,
"kvbm"
,
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment