Unverified commit 90dc7589, authored by Alec, committed by GitHub
Browse files

chore: bump vllm to 0.11.0 (#3422)


Signed-off-by: alec-flowers <aflowers@nvidia.com>
parent 60975b51
...@@ -15,9 +15,9 @@ ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04" ...@@ -15,9 +15,9 @@ ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
ARG CUDA_VERSION="12.8" ARG CUDA_VERSION="12.8"
# Make sure to update the dependency version in pyproject.toml when updating this # Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="v0.10.2" ARG VLLM_REF="v0.11.0"
# FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds # FlashInfer only respected when building vLLM from source, ie when VLLM_REF does not start with 'v' or for arm64 builds
ARG FLASHINF_REF="v0.3.0" ARG FLASHINF_REF="v0.3.1"
ARG TORCH_BACKEND="cu128" ARG TORCH_BACKEND="cu128"
# If left blank, then we will fallback to vLLM defaults # If left blank, then we will fallback to vLLM defaults
......
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
set -euo pipefail set -euo pipefail
VLLM_REF="v0.10.2" VLLM_REF="v0.11.0"
# Basic Configurations # Basic Configurations
ARCH=$(uname -m) ARCH=$(uname -m)
...@@ -29,7 +29,7 @@ CUDA_VERSION="12.8" # For DEEPGEMM ...@@ -29,7 +29,7 @@ CUDA_VERSION="12.8" # For DEEPGEMM
# These flags are applicable when installing vLLM from source code # These flags are applicable when installing vLLM from source code
EDITABLE=true EDITABLE=true
VLLM_GIT_URL="https://github.com/vllm-project/vllm.git" VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
FLASHINF_REF="v0.3.0" FLASHINF_REF="v0.3.1"
while [[ $# -gt 0 ]]; do while [[ $# -gt 0 ]]; do
case $1 in case $1 in
...@@ -131,10 +131,8 @@ git clone $VLLM_GIT_URL vllm ...@@ -131,10 +131,8 @@ git clone $VLLM_GIT_URL vllm
cd vllm cd vllm
git checkout $VLLM_REF git checkout $VLLM_REF
# TODO remove in future vLLM release, re-instate ignore torch script # TODO leave this here in case we need to do cherry-picks in future
# https://github.com/vllm-project/vllm/pull/24729 # GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064
GIT_COMMITTER_NAME="Container Build" GIT_COMMITTER_EMAIL="container@buildkitsandbox.local" git cherry-pick 740f064
echo "\n=== Installing vLLM & FlashInfer ===" echo "\n=== Installing vLLM & FlashInfer ==="
...@@ -243,4 +241,4 @@ echo "\n=== Installing EP Kernels (PPLX and DeepEP) ===" ...@@ -243,4 +241,4 @@ echo "\n=== Installing EP Kernels (PPLX and DeepEP) ==="
cd ep_kernels/ cd ep_kernels/
TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh TORCH_CUDA_ARCH_LIST="$TORCH_CUDA_ARCH_LIST" bash install_python_libraries.sh
echo "\n✅ All installations completed successfully!" echo "\n✅ All installations completed successfully!"
\ No newline at end of file
...@@ -22,10 +22,11 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator ...@@ -22,10 +22,11 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator
from pydantic_core import core_schema from pydantic_core import core_schema
from typing_extensions import NotRequired from typing_extensions import NotRequired
from vllm.inputs.data import TokensPrompt from vllm.inputs.data import TokensPrompt
from vllm.logprobs import PromptLogprobs
from vllm.multimodal.inputs import MultiModalUUIDDict # noqa: F401 from vllm.multimodal.inputs import MultiModalUUIDDict # noqa: F401
from vllm.outputs import CompletionOutput from vllm.outputs import CompletionOutput
from vllm.sampling_params import SamplingParams from vllm.sampling_params import SamplingParams
from vllm.sequence import PromptLogprobs, RequestMetrics from vllm.sequence import RequestMetrics
import dynamo.nixl_connect as connect import dynamo.nixl_connect as connect
......
...@@ -192,7 +192,9 @@ class KvConnectorLeader: ...@@ -192,7 +192,9 @@ class KvConnectorLeader:
if self._connector.has_slot(request.request_id): if self._connector.has_slot(request.request_id):
return None return None
if bool(request.mm_positions): if bool(getattr(request, "mm_features", None)) or bool(
getattr(request, "mm_positions", None)
):
raise ValueError("Unsupported request - requires mm extra keys") raise ValueError("Unsupported request - requires mm extra keys")
all_token_ids = request.all_token_ids all_token_ids = request.all_token_ids
......
...@@ -40,6 +40,21 @@ logging.basicConfig( ...@@ -40,6 +40,21 @@ logging.basicConfig(
) )
@pytest.fixture()
def set_ucx_tls_no_mm():
    """Set ``UCX_TLS="^mm"`` for the duration of the requesting test.

    This fixture is opt-in (it is not autouse): only tests that list
    ``set_ucx_tls_no_mm`` among their parameters run with the variable set.
    The previous environment is restored when the test finishes.
    """
    # CI note:
    # - Affected test: tests/fault_tolerance/cancellation/test_vllm.py::test_request_cancellation_vllm_decode_cancel
    # - Symptom on L40 CI: UCX/NIXL mm transport assertion during worker init
    #   (uct_mem.c:482: mem.memh != UCT_MEM_HANDLE_NULL) when two workers
    #   start on the same node (maybe a shared-memory segment collision/limits).
    # - Mitigation: disable the UCX "mm" shared-memory transport for the tests
    #   that request this fixture.
    # The context manager undoes the env change on exit, so no explicit
    # undo() call is needed after the yield.
    with pytest.MonkeyPatch.context() as mp:
        mp.setenv("UCX_TLS", "^mm")
        yield
def download_models(model_list=None, ignore_weights=False): def download_models(model_list=None, ignore_weights=False):
"""Download models - can be called directly or via fixture """Download models - can be called directly or via fixture
......
...@@ -193,7 +193,7 @@ def test_request_cancellation_vllm_aggregated( ...@@ -193,7 +193,7 @@ def test_request_cancellation_vllm_aggregated(
@pytest.mark.e2e @pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME) @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
def test_request_cancellation_vllm_decode_cancel( def test_request_cancellation_vllm_decode_cancel(
request, runtime_services, predownload_models request, runtime_services, predownload_models, set_ucx_tls_no_mm
): ):
""" """
End-to-end test for request cancellation during decode phase. End-to-end test for request cancellation during decode phase.
...@@ -266,7 +266,7 @@ def test_request_cancellation_vllm_decode_cancel( ...@@ -266,7 +266,7 @@ def test_request_cancellation_vllm_decode_cancel(
@pytest.mark.e2e @pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME) @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
def test_request_cancellation_vllm_remote_prefill_cancel( def test_request_cancellation_vllm_remote_prefill_cancel(
request, runtime_services, predownload_models request, runtime_services, predownload_models, set_ucx_tls_no_mm
): ):
""" """
End-to-end test for request cancellation during remote prefill phase. End-to-end test for request cancellation during remote prefill phase.
......
...@@ -290,7 +290,9 @@ def verify_migration_occurred(frontend_process: DynamoFrontendProcess) -> None: ...@@ -290,7 +290,9 @@ def verify_migration_occurred(frontend_process: DynamoFrontendProcess) -> None:
@pytest.mark.gpu_1 @pytest.mark.gpu_1
@pytest.mark.e2e @pytest.mark.e2e
@pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME) @pytest.mark.model(FAULT_TOLERANCE_MODEL_NAME)
def test_request_migration_vllm(request, runtime_services, predownload_models): def test_request_migration_vllm(
request, runtime_services, predownload_models, set_ucx_tls_no_mm
):
""" """
End-to-end test for worker fault tolerance with migration support. End-to-end test for worker fault tolerance with migration support.
......
...@@ -58,6 +58,8 @@ class GPTOSSWorkerProcess(ManagedProcess): ...@@ -58,6 +58,8 @@ class GPTOSSWorkerProcess(ManagedProcess):
"dynamo.vllm", "dynamo.vllm",
"--model", "--model",
REASONING_TEST_MODEL, REASONING_TEST_MODEL,
"--connector",
"none", # skip nixl registration, noticing long startup times in CI. Potentially a bug...
"--enforce-eager", "--enforce-eager",
"--dyn-tool-call-parser", "--dyn-tool-call-parser",
"harmony", "harmony",
...@@ -85,7 +87,7 @@ class GPTOSSWorkerProcess(ManagedProcess): ...@@ -85,7 +87,7 @@ class GPTOSSWorkerProcess(ManagedProcess):
("http://localhost:8000/v1/models", check_models_api), ("http://localhost:8000/v1/models", check_models_api),
("http://localhost:8083/health", self.is_ready), ("http://localhost:8083/health", self.is_ready),
], ],
timeout=300, timeout=500,
display_output=True, display_output=True,
terminate_existing=False, terminate_existing=False,
stragglers=["VLLM::EngineCore"], stragglers=["VLLM::EngineCore"],
......
...@@ -111,7 +111,7 @@ class LLMServerManager: ...@@ -111,7 +111,7 @@ class LLMServerManager:
"--kv-transfer-config", "--kv-transfer-config",
'{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}', '{"kv_connector":"DynamoConnector","kv_role":"kv_both", "kv_connector_module_path": "dynamo.llm.vllm_integration.connector"}',
os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"), os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
"--max-seq-len", "--max-model-len",
"8000", # required to fit on L4 GPU when using 8b model "8000", # required to fit on L4 GPU when using 8b model
] ]
......
...@@ -132,7 +132,7 @@ class LLMServerManager: ...@@ -132,7 +132,7 @@ class LLMServerManager:
os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"), os.environ.get("KVBM_MODEL_ID", "deepseek-ai/DeepSeek-R1-Distill-Llama-8B"),
"--block-size", "--block-size",
"16", "16",
"--max-seq-len", "--max-model-len",
"8000", # required to fit on L4 GPU when using 8b model "8000", # required to fit on L4 GPU when using 8b model
"--connector", "--connector",
"nixl", "nixl",
...@@ -148,7 +148,7 @@ class LLMServerManager: ...@@ -148,7 +148,7 @@ class LLMServerManager:
"--is-prefill-worker", "--is-prefill-worker",
"--block-size", "--block-size",
"16", "16",
"--max-seq-len", "--max-model-len",
"8000", # required to fit on L4 GPU when using 8b model "8000", # required to fit on L4 GPU when using 8b model
"--connector", "--connector",
"kvbm", "kvbm",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment