Unverified Commit 5f57ea5f, authored by Dmitry Tokarev, committed by GitHub
Browse files

chore: Finish vllm upgrade to 0.10.1 + cleanup (#2528)

parent 07cfc3a1
...@@ -170,7 +170,7 @@ async def configure_ports_with_etcd(config: Config, etcd_client): ...@@ -170,7 +170,7 @@ async def configure_ports_with_etcd(config: Config, etcd_client):
logger.info(f"Allocated ZMQ KV events port: {kv_port} (worker_id={worker_id})") logger.info(f"Allocated ZMQ KV events port: {kv_port} (worker_id={worker_id})")
# Allocate side channel ports # Allocate side channel ports
# https://github.com/vllm-project/vllm/blob/releases/v0.10.0/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py#L372 # https://github.com/vllm-project/vllm/blob/releases/v0.10.1/vllm/distributed/kv_transfer/kv_connector/v1/nixl_connector.py#L443
# NIXL calculates ports as: base_port + (dp_rank * tp_size) + tp_rank # NIXL calculates ports as: base_port + (dp_rank * tp_size) + tp_rank
# For dp_rank, we need to reserve tp_size consecutive ports # For dp_rank, we need to reserve tp_size consecutive ports
tp_size = config.engine_args.tensor_parallel_size or 1 tp_size = config.engine_args.tensor_parallel_size or 1
......
...@@ -13,15 +13,15 @@ ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" ...@@ -13,15 +13,15 @@ ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04" ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
# Make sure to update the dependency version in pyproject.toml when updating this # Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_REF="77a6bf07aedf132aad2b6719f6d87abc5d3311ab" ARG VLLM_REF="aab549870df50edf0512f0a59b574f692f546465" # from v0.10.1
ARG TORCH_BACKEND="cu128" ARG TORCH_BACKEND="cu128"
# Match 0.10.0 vLLM release # Match 0.10.1 vLLM release
# https://github.com/vllm-project/vllm/releases/tag/v0.10.0 # https://github.com/vllm-project/vllm/releases/tag/v0.10.1
# Pinned to commit before https://github.com/deepseek-ai/DeepGEMM/pull/112 for DeepGEMM which seems to break on H100: # Pinned to commit before https://github.com/deepseek-ai/DeepGEMM/pull/112 for DeepGEMM which seems to break on H100:
# "RuntimeError: Failed: CUDA runtime error csrc/jit/kernel_runtime.hpp:108 '98'" # "RuntimeError: Failed: CUDA runtime error csrc/jit/kernel_runtime.hpp:108 '98'"
ARG DEEPGEMM_REF="f85ec64" ARG DEEPGEMM_REF="f85ec64"
ARG FLASHINF_REF="v0.2.8rc1" ARG FLASHINF_REF="v0.2.11"
# Define general architecture ARGs for supporting both x86 and aarch64 builds. # Define general architecture ARGs for supporting both x86 and aarch64 builds.
# ARCH: Used for package suffixes (e.g., amd64, arm64) # ARCH: Used for package suffixes (e.g., amd64, arm64)
......
...@@ -20,13 +20,16 @@ set -euo pipefail ...@@ -20,13 +20,16 @@ set -euo pipefail
# Parse arguments # Parse arguments
EDITABLE=true EDITABLE=true
VLLM_REF="77a6bf07aedf132aad2b6719f6d87abc5d3311ab" VLLM_REF="aab549870df50edf0512f0a59b574f692f546465" # from v0.10.1
# When updating above VLLM_REF make sure precompiled wheel file URL is correct. Run this command:
# aws s3 ls s3://vllm-wheels/${VLLM_REF}/ --region us-west-2 --no-sign-request
VLLM_PRECOMPILED_WHEEL_LOCATION="https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_REF}/vllm-0.10.1-cp38-abi3-manylinux1_x86_64.whl"
VLLM_GIT_URL="https://github.com/vllm-project/vllm.git" VLLM_GIT_URL="https://github.com/vllm-project/vllm.git"
MAX_JOBS=16 MAX_JOBS=16
INSTALLATION_DIR=/tmp INSTALLATION_DIR=/tmp
ARCH=$(uname -m) ARCH=$(uname -m)
DEEPGEMM_REF="f85ec64" DEEPGEMM_REF="f85ec64"
FLASHINF_REF="v0.2.8rc1" FLASHINF_REF="v0.2.11"
TORCH_BACKEND="cu128" TORCH_BACKEND="cu128"
# Convert x86_64 to amd64 for consistency with Docker ARG # Convert x86_64 to amd64 for consistency with Docker ARG
...@@ -83,13 +86,13 @@ while [[ $# -gt 0 ]]; do ...@@ -83,13 +86,13 @@ while [[ $# -gt 0 ]]; do
echo "Options:" echo "Options:"
echo " --editable Install vllm in editable mode (default)" echo " --editable Install vllm in editable mode (default)"
echo " --no-editable Install vllm in non-editable mode" echo " --no-editable Install vllm in non-editable mode"
echo " --vllm-ref REF Git reference to checkout (default: f4135232b9a8c4845f8961fb1cd17581c56ae2ce)" echo f" --vllm-ref REF Git reference to checkout (default: ${VLLM_REF})"
echo " --max-jobs NUM Maximum number of parallel jobs (default: 16)" echo f" --max-jobs NUM Maximum number of parallel jobs (default: ${MAX_JOBS})"
echo " --arch ARCH Architecture (amd64|arm64, default: auto-detect)" echo " --arch ARCH Architecture (amd64|arm64, default: auto-detect)"
echo " --installation-dir DIR Directory to install vllm (default: /tmp/vllm)" echo f" --installation-dir DIR Directory to install vllm (default: ${INSTALLATION_DIR})"
echo " --deepgemm-ref REF Git reference for DeepGEMM (default: 1876566)" echo f" --deepgemm-ref REF Git reference for DeepGEMM (default: ${DEEPGEMM_REF})"
echo " --flashinf-ref REF Git reference for Flash Infer (default: v0.2.8rc1)" echo f" --flashinf-ref REF Git reference for Flash Infer (default: ${FLASHINF_REF})"
echo " --torch-backend BACKEND Torch backend to use (default: cu128)" echo f" --torch-backend BACKEND Torch backend to use (default: ${TORCH_BACKEND})"
exit 0 exit 0
;; ;;
*) *)
...@@ -154,7 +157,7 @@ else ...@@ -154,7 +157,7 @@ else
exit 1 exit 1
fi fi
export VLLM_PRECOMPILED_WHEEL_LOCATION=https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_REF}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl export VLLM_PRECOMPILED_WHEEL_LOCATION="${VLLM_PRECOMPILED_WHEEL_LOCATION}"
if [ "$EDITABLE" = "true" ]; then if [ "$EDITABLE" = "true" ]; then
uv pip install -e . --torch-backend=$TORCH_BACKEND uv pip install -e . --torch-backend=$TORCH_BACKEND
......
...@@ -56,7 +56,7 @@ trtllm =[ ...@@ -56,7 +56,7 @@ trtllm =[
vllm = [ vllm = [
"uvloop", "uvloop",
"nixl<=0.4.1", "nixl<=0.4.1",
"vllm==0.10.0", "vllm[flashinfer]==0.10.1",
] ]
sglang = [ sglang = [
......
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment