chore: Finish vllm upgrade to 0.10.1 + cleanup (#2528)

5f57ea5f · Dmitry Tokarev · GitHub · 07cfc3a1 · 5f57ea5f · 5f57ea5f
Unverified Commit 5f57ea5f authored Aug 19, 2025 by Dmitry Tokarev Committed by GitHub Aug 19, 2025
4 changed files
--- a/components/backends/vllm/src/dynamo/vllm/args.py
+++ b/components/backends/vllm/src/dynamo/vllm/args.py
@@ -170,7 +170,7 @@ async def configure_ports_with_etcd(config: Config, etcd_client):
        logger.info(f"Allocated ZMQ KV events port: {kv_port} (worker_id={worker_id})")
    # Allocate side channel ports
-    # https://github.com/vllm-project/vllm/blob/releases/v0.10.0/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py#L372
+    # https://github.com/vllm-project/vllm/blob/releases/v0.10.1/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py#L443
    # NIXL calculates ports as: base_port + (dp_rank * tp_size) + tp_rank
    # For dp_rank, we need to reserve tp_size consecutive ports
    tp_size = config.engine_args.tensor_parallel_size or 1

--- a/container/Dockerfile.vllm
+++ b/container/Dockerfile.vllm
@@ -13,15 +13,15 @@ ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
 ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
 # Make sure to update the dependency version in pyproject.toml when updating this
-ARG VLLM_REF="77a6bf07aedf132aad2b6719f6d87abc5d3311ab"
+# VLLM_REF below is taken from v0.10.1
+ARG VLLM_REF="aab549870df50edf0512f0a59b574f692f546465"
 ARG TORCH_BACKEND="cu128"
-# Match 0.10.0 vLLM release
+# Match 0.10.1 vLLM release
-# https://github.com/vllm-project/vllm/releases/tag/v0.10.0
+# https://github.com/vllm-project/vllm/releases/tag/v0.10.1
 # Pinned to commit before https://github.com/deepseek-ai/DeepGEMM/pull/112 for DeepGEMM which seems to break on H100:
 # "RuntimeError: Failed: CUDA runtime error csrc/jit/kernel_runtime.hpp:108 '98'"
 ARG DEEPGEMM_REF="f85ec64"
-ARG FLASHINF_REF="v0.2.8rc1"
+ARG FLASHINF_REF="v0.2.11"
 # Define general architecture ARGs for supporting both x86 and aarch64 builds.
 #   ARCH: Used for package suffixes (e.g., amd64, arm64)

--- a/container/deps/vllm/install_vllm.sh
+++ b/container/deps/vllm/install_vllm.sh
@@ -20,13 +20,16 @@ set -euo pipefail
 # Parse arguments
 EDITABLE=true
-VLLM_REF="77a6bf07aedf132aad2b6719f6d87abc5d3311ab"
+VLLM_REF="aab549870df50edf0512f0a59b574f692f546465"  # from v0.10.1
+# When updating above VLLM_REF make sure precompiled wheel file URL is correct. Run this command:
+# aws s3 ls s3://vllm-wheels/${VLLM_REF}/ --region us-west-2 --no-sign-request
+VLLM_PRECOMPILED_WHEEL_LOCATION="https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_REF}/vllm-0.10.1-cp38-abi3-manylinux1_x86_64.whl"
 VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
 MAX_JOBS=16
 INSTALLATION_DIR=/tmp
 ARCH=$(uname -m)
 DEEPGEMM_REF="f85ec64"
-FLASHINF_REF="v0.2.8rc1"
+FLASHINF_REF="v0.2.11"
 TORCH_BACKEND="cu128"
 # Convert x86_64 to amd64 for consistency with Docker ARG
@@ -83,13 +86,13 @@ while [[ $# -gt 0 ]]; do
            echo "Options:"
            echo "  --editable        Install vllm in editable mode (default)"
            echo "  --no-editable     Install vllm in non-editable mode"
-            echo "  --vllm-ref REF    Git reference to checkout (default: f4135232b9a8c4845f8961fb1cd17581c56ae2ce)"
+            echo "  --vllm-ref REF    Git reference to checkout (default: ${VLLM_REF})"
-            echo "  --max-jobs NUM    Maximum number of parallel jobs (default: 16)"
+            echo "  --max-jobs NUM    Maximum number of parallel jobs (default: ${MAX_JOBS})"
            echo "  --arch ARCH       Architecture (amd64|arm64, default: auto-detect)"
-            echo "  --installation-dir DIR  Directory to install vllm (default: /tmp/vllm)"
+            echo "  --installation-dir DIR  Directory to install vllm (default: ${INSTALLATION_DIR})"
-            echo "  --deepgemm-ref REF  Git reference for DeepGEMM (default: 1876566)"
+            echo "  --deepgemm-ref REF  Git reference for DeepGEMM (default: ${DEEPGEMM_REF})"
-            echo "  --flashinf-ref REF  Git reference for Flash Infer (default: v0.2.8rc1)"
+            echo "  --flashinf-ref REF  Git reference for Flash Infer (default: ${FLASHINF_REF})"
-            echo "  --torch-backend BACKEND  Torch backend to use (default: cu128)"
+            echo "  --torch-backend BACKEND  Torch backend to use (default: ${TORCH_BACKEND})"
            exit 0
            ;;
        *)
@@ -154,7 +157,7 @@ else
        exit 1
    fi
-    export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_REF}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
+    export VLLM_PRECOMPILED_WHEEL_LOCATION="${VLLM_PRECOMPILED_WHEEL_LOCATION}"
    if [ "$EDITABLE" = "true" ]; then
 	uv pip install -e . --torch-backend=$TORCH_BACKEND

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -56,7 +56,7 @@ trtllm =[
 vllm = [
    "uvloop",
    "nixl<=0.4.1",
-    "vllm==0.10.0",
+    "vllm[flashinfer]==0.10.1",
 ]
 sglang = [