"tests/vscode:/vscode.git/clone" did not exist on "6df209556e1661e82cd2274b025f3634c0dab97f"
Commit e82bc4ec (unverified), authored by ptarasiewiczNV, committed by GitHub
Browse files

chore: update vLLM to 0.10.0 (#2114)


Co-authored-by: alec-flowers <aflowers@nvidia.com>
parent 615580d8
...@@ -207,7 +207,11 @@ def overwrite_args(config): ...@@ -207,7 +207,11 @@ def overwrite_args(config):
defaults = { defaults = {
"task": "generate", "task": "generate",
"skip_tokenizer_init": True, # As of vLLM >=0.10.0 the engine unconditionally calls
# `sampling_params.update_from_tokenizer(...)`, so we can no longer
# skip tokenizer initialisation. Setting this to **False** avoids
# a NoneType error when the processor accesses the tokenizer.
"skip_tokenizer_init": False,
"disable_log_requests": True, "disable_log_requests": True,
# KV routing relies on logging KV metrics # KV routing relies on logging KV metrics
"disable_log_stats": False, "disable_log_stats": False,
......
...@@ -110,6 +110,8 @@ class DecodeWorkerHandler(BaseWorkerHandler): ...@@ -110,6 +110,8 @@ class DecodeWorkerHandler(BaseWorkerHandler):
prompt = TokensPrompt(prompt_token_ids=request["token_ids"]) prompt = TokensPrompt(prompt_token_ids=request["token_ids"])
sampling_params = SamplingParams(**self.default_sampling_params) sampling_params = SamplingParams(**self.default_sampling_params)
sampling_params.detokenize = False
for key, value in request["sampling_options"].items(): for key, value in request["sampling_options"].items():
if value is not None and hasattr(sampling_params, key): if value is not None and hasattr(sampling_params, key):
setattr(sampling_params, key, value) setattr(sampling_params, key, value)
......
...@@ -25,6 +25,7 @@ class NullStatLogger(StatLoggerBase): ...@@ -25,6 +25,7 @@ class NullStatLogger(StatLoggerBase):
self, self,
scheduler_stats: Optional[SchedulerStats], scheduler_stats: Optional[SchedulerStats],
iteration_stats: Optional[IterationStats], iteration_stats: Optional[IterationStats],
engine_idx: int = 0,
): ):
pass pass
...@@ -51,7 +52,10 @@ class DynamoStatLoggerPublisher(StatLoggerBase): ...@@ -51,7 +52,10 @@ class DynamoStatLoggerPublisher(StatLoggerBase):
self.request_total_slots = request_total_slots self.request_total_slots = request_total_slots
def record( def record(
self, scheduler_stats: SchedulerStats, iteration_stats: Optional[IterationStats] self,
scheduler_stats: SchedulerStats,
iteration_stats: Optional[IterationStats],
engine_idx: int = 0,
): ):
# request_total_slots and kv_total_blocks are properties of model + gpu # request_total_slots and kv_total_blocks are properties of model + gpu
# we should only publish them once, not every metric update # we should only publish them once, not every metric update
......
...@@ -10,16 +10,15 @@ ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04" ...@@ -10,16 +10,15 @@ ARG BASE_IMAGE_TAG="25.01-cuda12.8-devel-ubuntu24.04"
ARG RELEASE_BUILD ARG RELEASE_BUILD
ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda" ARG RUNTIME_IMAGE="nvcr.io/nvidia/cuda"
ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04" ARG RUNTIME_IMAGE_TAG="12.8.1-runtime-ubuntu24.04"
ARG VLLM_REF="059d4cd"
ARG TORCH_BACKEND="cu128"
# After this commit deepgemm API changed
# 1.0.0 -> 2.0.0
ARG DEEPGEMM_REF="03d0be3"
ARG FLASHINF_REF="1d72ed4"
# Make sure to update the dependency version in pyproject.toml when updating this # Make sure to update the dependency version in pyproject.toml when updating this
ARG VLLM_VERSION="0.9.2" ARG VLLM_REF="v0.10.0"
ARG TORCH_BACKEND="cu128"
# Match 0.10.0 vLLM release
# https://github.com/vllm-project/vllm/releases/tag/v0.10.0
ARG DEEPGEMM_REF="1876566"
ARG FLASHINF_REF="v0.2.8rc1"
# Define general architecture ARGs for supporting both x86 and aarch64 builds. # Define general architecture ARGs for supporting both x86 and aarch64 builds.
# ARCH: Used for package suffixes (e.g., amd64, arm64) # ARCH: Used for package suffixes (e.g., amd64, arm64)
...@@ -42,11 +41,10 @@ ARG ARCH_ALT=x86_64 ...@@ -42,11 +41,10 @@ ARG ARCH_ALT=x86_64
FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base FROM ${BASE_IMAGE}:${BASE_IMAGE_TAG} AS base
# Redeclare ARCH, ARCH_ALT, TORCH_BACKEND, VLLM_VERSION so they're available in this stage # Redeclare ARCH, ARCH_ALT, TORCH_BACKEND so they're available in this stage
ARG ARCH ARG ARCH
ARG ARCH_ALT ARG ARCH_ALT
ARG TORCH_BACKEND ARG TORCH_BACKEND
ARG VLLM_VERSION
USER root USER root
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
...@@ -195,15 +193,11 @@ ENV CUDA_HOME=/usr/local/cuda ...@@ -195,15 +193,11 @@ ENV CUDA_HOME=/usr/local/cuda
RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \ RUN --mount=type=bind,source=./container/deps/,target=/tmp/deps \
--mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/uv \
if [ "$ARCH" = "arm64" ]; then \
# TODO - split vllm, DeepEP, DeepGeMM, PPLX installs # TODO - split vllm, DeepEP, DeepGeMM, PPLX installs
# Should be able to select how you want your build to go # Should be able to select how you want your build to go
cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \ cp /tmp/deps/vllm/install_vllm.sh /tmp/install_vllm.sh && \
chmod +x /tmp/install_vllm.sh && \ chmod +x /tmp/install_vllm.sh && \
/tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND; \ /tmp/install_vllm.sh --editable --vllm-ref $VLLM_REF --max-jobs $MAX_JOBS --arch $ARCH --installation-dir /opt --deepgemm-ref $DEEPGEMM_REF --flashinf-ref $FLASHINF_REF --torch-backend $TORCH_BACKEND;
else \
uv pip install "vllm==${VLLM_VERSION}"; \
fi
ENV LD_LIBRARY_PATH=\ ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\ /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
...@@ -464,9 +458,7 @@ COPY --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX ...@@ -464,9 +458,7 @@ COPY --from=wheel_builder $NIXL_PREFIX $NIXL_PREFIX
COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/ COPY --from=wheel_builder /workspace/dist/*.whl wheelhouse/
# Copies vllm, DeepEP, DeepGEMM, PPLX repos (all editable installs) and nvshmem binaries # Copies vllm, DeepEP, DeepGEMM, PPLX repos (all editable installs) and nvshmem binaries
RUN if [ "$ARCH" = "arm64" ]; then \ COPY --from=base /opt/vllm /opt/vllm
COPY --from=base /opt/vllm /opt/vllm; \
fi
ENV LD_LIBRARY_PATH=\ ENV LD_LIBRARY_PATH=\
/opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\ /opt/vllm/tools/ep_kernels/ep_kernels_workspace/nvshmem_install/lib:\
......
...@@ -20,12 +20,12 @@ set -euo pipefail ...@@ -20,12 +20,12 @@ set -euo pipefail
# Parse arguments # Parse arguments
EDITABLE=true EDITABLE=true
VLLM_REF="059d4cd" VLLM_REF="v0.10.0"
MAX_JOBS=16 MAX_JOBS=16
INSTALLATION_DIR=/tmp INSTALLATION_DIR=/tmp
ARCH=$(uname -m) ARCH=$(uname -m)
DEEPGEMM_REF="6c9558e" DEEPGEMM_REF="1876566"
FLASHINF_REF="1d72ed4" FLASHINF_REF="v0.2.8rc1"
TORCH_BACKEND="cu128" TORCH_BACKEND="cu128"
# Convert x86_64 to amd64 for consistency with Docker ARG # Convert x86_64 to amd64 for consistency with Docker ARG
......
...@@ -67,7 +67,7 @@ trtllm =[ ...@@ -67,7 +67,7 @@ trtllm =[
vllm = [ vllm = [
"uvloop", "uvloop",
"nixl", "nixl",
"vllm==0.9.2", "vllm==0.10.0",
] ]
sglang = [ sglang = [
......
...@@ -59,7 +59,7 @@ class VLLMConfig: ...@@ -59,7 +59,7 @@ class VLLMConfig:
endpoints: List[str] endpoints: List[str]
response_handlers: List[Callable[[Any], str]] response_handlers: List[Callable[[Any], str]]
model: str model: str
timeout: int = 60 timeout: int = 120
delayed_start: int = 0 delayed_start: int = 0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment