Commit cc7f22a8 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.1' into v0.9.1-ori

parents b9ea0c09 b6553be1
#include "dispatch_utils.h"
#include <torch/cuda.h>
#include <c10/cuda/CUDAGuard.h>
#ifndef USE_ROCM
#include <cub/cub.cuh>
#else
#include <hipcub/hipcub.hpp>
#endif
namespace vllm {
template <typename scalar_t>
__global__ void apply_repetition_penalties_kernel(
scalar_t* __restrict__ logits, // [num_seqs, vocab_size]
const bool* __restrict__ prompt_mask, // [num_seqs, vocab_size]
const bool* __restrict__ output_mask, // [num_seqs, vocab_size]
const scalar_t* __restrict__ repetition_penalties, // [num_seqs]
const int num_seqs, const int vocab_size, const int tile_size) {
// Each block handles one sequence and a tile of vocab
const int seq_idx = blockIdx.x;
if (seq_idx >= num_seqs) return;
const int tile_start = blockIdx.y * tile_size;
const int tile_end = min(tile_start + tile_size, vocab_size);
// Load repetition penalty for this sequence
const scalar_t penalty = repetition_penalties[seq_idx];
// Each thread processes multiple vocab items within the tile
for (int vocab_idx = tile_start + threadIdx.x; vocab_idx < tile_end;
vocab_idx += blockDim.x) {
const int64_t idx = static_cast<int64_t>(seq_idx) * vocab_size + vocab_idx;
const bool is_repeated = prompt_mask[idx] || output_mask[idx];
if (is_repeated) {
scalar_t logit = logits[idx];
if (logit > 0) {
logits[idx] = logit / penalty;
} else {
logits[idx] = logit * penalty;
}
}
}
}
} // namespace vllm
void apply_repetition_penalties_(
torch::Tensor& logits, // [num_seqs, vocab_size], in-place
const torch::Tensor& prompt_mask, // [num_seqs, vocab_size]
const torch::Tensor& output_mask, // [num_seqs, vocab_size]
const torch::Tensor& repetition_penalties) { // [num_seqs]
TORCH_CHECK(logits.is_contiguous());
TORCH_CHECK(prompt_mask.is_contiguous());
TORCH_CHECK(output_mask.is_contiguous());
TORCH_CHECK(repetition_penalties.is_contiguous());
int vocab_size = logits.size(-1);
int num_seqs = logits.size(0);
// Get number of SMs on the current device
int sms = 0;
cudaDeviceGetAttribute(&sms, cudaDevAttrMultiProcessorCount,
logits.get_device());
// Compute tile_num and tile_size
int tile_num =
std::min(vocab_size, std::max(1, (sms + num_seqs - 1) / num_seqs));
int tile_size = (vocab_size + tile_num - 1) / tile_num;
// Each block handles one sequence and a tile of vocab
dim3 grid(num_seqs, tile_num);
dim3 block(std::min(tile_size, 1024));
const at::cuda::OptionalCUDAGuard device_guard(device_of(logits));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
VLLM_DISPATCH_FLOATING_TYPES(
logits.scalar_type(), "apply_repetition_penalties_kernel", [&] {
vllm::apply_repetition_penalties_kernel<scalar_t>
<<<grid, block, 0, stream>>>(
logits.data_ptr<scalar_t>(), prompt_mask.data_ptr<bool>(),
output_mask.data_ptr<bool>(),
repetition_penalties.data_ptr<scalar_t>(), num_seqs, vocab_size,
tile_size);
});
}
\ No newline at end of file
......@@ -170,6 +170,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"float epsilon) -> ()");
ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);
// Apply repetition penalties to logits in-place
ops.def(
"apply_repetition_penalties_(Tensor! logits, Tensor prompt_mask, "
"Tensor output_mask, Tensor repetition_penalties) -> ()");
ops.impl("apply_repetition_penalties_", torch::kCUDA,
&apply_repetition_penalties_);
// Layernorm-quant
// Apply Root Mean Square (RMS) Normalization to the input tensor.
// ops.def(
......@@ -432,7 +439,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"cutlass_moe_mm(Tensor! out_tensors, Tensor a_tensors, Tensor b_tensors, "
" Tensor a_scales, Tensor b_scales, Tensor expert_offsets, "
" Tensor problem_sizes, Tensor a_strides, "
" Tensor b_strides, Tensor c_strides) -> ()",
" Tensor b_strides, Tensor c_strides, bool per_act_token, "
" bool per_out_ch) -> ()",
{stride_tag});
ops.impl("cutlass_moe_mm", torch::kCUDA, &cutlass_moe_mm);
......@@ -447,10 +455,26 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor! problem_sizes1, Tensor! problem_sizes2, "
" Tensor! input_permutation, "
" Tensor! output_permutation, int num_experts, "
" int n, int k) -> ()",
" int n, int k, Tensor? blockscale_offsets) -> ()",
{stride_tag});
ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);
// A function that computes data required to run fused MoE with w8a8 grouped
// GEMM and PPLX. It takes expert_num_tokens and non_zero_expert_idxs
// as an input, and computes expert_offsets (token start indices of each
// expert). In addition to this, it computes problem sizes for each expert's
// multiplication used by the two mms called from fused MoE operation.
ops.def(
"get_cutlass_pplx_moe_mm_data(Tensor! expert_offsets, "
" Tensor! problem_sizes1, "
" Tensor! problem_sizes2, "
" Tensor expert_num_tokens, "
" int num_local_experts, int padded_m, "
" int n, int k) -> ()",
{stride_tag});
ops.impl("get_cutlass_pplx_moe_mm_data", torch::kCUDA,
&get_cutlass_pplx_moe_mm_data);
// Check if cutlass scaled_mm supports block quantization (used by DeepSeekV3)
ops.def(
"cutlass_scaled_mm_supports_block_fp8(int cuda_device_capability) -> "
......
......@@ -75,6 +75,7 @@ RUN --mount=type=bind,source=.git,target=.git \
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel
......@@ -85,7 +86,7 @@ WORKDIR /workspace/vllm
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get install -y --no-install-recommends vim numactl
apt-get install -y --no-install-recommends vim numactl xz-utils
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
......@@ -108,8 +109,11 @@ FROM base AS vllm-test
WORKDIR /workspace/
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,src=requirements/test.txt,target=requirements/test.txt \
uv pip install -r requirements/test.txt
--mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/test-cpu.in && \
sed -i '/mamba_ssm/d' requirements/test-cpu.in && \
uv pip compile requirements/test-cpu.in -o requirements/cpu-test.txt && \
uv pip install -r requirements/cpu-test.txt
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
......
......@@ -34,7 +34,7 @@ RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
RUN python3 -m pip install -U \
'cmake>=3.26' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
'cmake>=3.26.1' ninja packaging 'setuptools-scm>=8' wheel jinja2 \
-r requirements/neuron.txt
ENV VLLM_TARGET_DEVICE neuron
......
......@@ -312,4 +312,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
# Logging to confirm all the packages are installed
RUN pip freeze
#################### UNITTEST IMAGE #############################
ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
###############################################################
# Stage to build openblas
###############################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder
ARG MAX_JOBS
ARG OPENBLAS_VERSION=0.3.29
RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \
&& source /opt/rh/gcc-toolset-13/enable \
&& wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \
&& unzip OpenBLAS-$OPENBLAS_VERSION.zip \
&& cd OpenBLAS-$OPENBLAS_VERSION \
&& make -j${MAX_JOBS} TARGET=POWER9 BINARY=64 USE_OPENMP=1 USE_THREAD=1 NUM_THREADS=120 DYNAMIC_ARCH=1 INTERFACE64=0 \
&& cd /tmp && touch control
###############################################################
# base stage with dependencies coming from centos mirrors
###############################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS centos-deps-builder
RUN microdnf install -y dnf && \
dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \
https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \
https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
dnf config-manager --set-enabled crb
RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel && \
dnf remove -y centos-gpg-keys-9.0-24.el9.noarch centos-stream-repos-9.0-24.el9.noarch
###############################################################
# base stage with basic dependencies
###############################################################
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS base-builder
FROM centos-deps-builder AS base-builder
ARG PYTHON_VERSION=3.12
ARG OPENBLAS_VERSION=0.3.29
......@@ -20,25 +51,27 @@ ENV UV_LINK_MODE=copy
# Note: A symlink for libatomic.so is created for gcc-13 (linker fails to find libatomic otherwise - reqd. for sentencepiece)
# Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel
# when `--jobs=<N>` is passed with podman build command
RUN microdnf install -y openssl-devel dnf \
&& dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
&& dnf config-manager --set-enabled codeready-builder-for-rhel-9-ppc64le-rpms \
COPY --from=openblas-builder /tmp/control /dev/null
RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
dnf install -y openssl-devel \
&& dnf install -y \
git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \
git tar gcc-toolset-13 automake libtool \
pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
libtiff-devel libjpeg-devel openjpeg2-devel zlib-devel \
freetype-devel lcms2-devel libwebp-devel tcl-devel tk-devel \
harfbuzz-devel fribidi-devel libraqm-devel libimagequant-devel libxcb-devel \
libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \
harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
&& dnf clean all \
&& PREFIX=/usr/local make -C /openblas install \
&& ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
&& python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
&& python -m pip install -U pip uv \
&& uv pip install wheel build "setuptools<70" setuptools_scm setuptools_rust meson-python 'cmake<4' ninja cython scikit_build_core scikit_build \
&& curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
&& curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y \
&& cd /tmp && touch control
###############################################################
# Stage to build torch family
###############################################################
......@@ -48,6 +81,8 @@ FROM base-builder AS torch-builder
ARG MAX_JOBS
ARG TORCH_VERSION=2.6.0
ARG _GLIBCXX_USE_CXX11_ABI=1
ARG OPENBLAS_VERSION=0.3.29
RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \
git clone --recursive https://github.com/pytorch/pytorch.git -b v${TORCH_VERSION} && \
......@@ -109,7 +144,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
.. && \
make install -j ${MAX_JOBS:-$(nproc)} && \
cd ../../python/ && \
uv pip install -v -r requirements-wheel-build.txt && \
uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \
pip show numpy && ls -lrt /opt/vllm/lib/python3.12/site-packages/numpy && \
PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
python setup.py build_ext \
--build-type=release --bundle-arrow-cpp \
......@@ -132,8 +168,25 @@ RUN --mount=type=cache,target=/root/.cache/uv \
cd opencv-python && \
sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \
cd opencv && git cherry-pick --no-commit $OPENCV_PATCH && cd .. && \
uv pip install scikit-build && \
python -m build --wheel --installer=uv --outdir /opencvwheels/
###############################################################
# Stage to build numactl
###############################################################
FROM base-builder AS numa-builder
# Note: Building numactl with gcc-11. Compiling with gcc-13 in this builder stage will
# trigger recompilation with gcc-11 (and require libtool) in the final stage where we do not have gcc-13
ARG MAX_JOBS
ARG NUMACTL_VERSION=2.0.19
RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_VERSION} \
&& cd numactl \
&& autoreconf -i && ./configure \
&& make -j ${MAX_JOBS:-$(nproc)}
###############################################################
# Stage to build vllm - this stage builds and installs
# vllm, tensorizer and vllm-tgis-adapter and builds uv cache
......@@ -145,6 +198,7 @@ FROM base-builder AS vllmcache-builder
COPY --from=torch-builder /tmp/control /dev/null
COPY --from=arrow-builder /tmp/control /dev/null
COPY --from=cv-builder /tmp/control /dev/null
COPY --from=numa-builder /tmp/control /dev/null
ARG VLLM_TARGET_DEVICE=cpu
ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
......@@ -160,11 +214,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
--mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
--mount=type=bind,src=.,dst=/src/,rw \
source /opt/rh/gcc-toolset-13/enable && \
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \
sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \
make -C /numactl install && \
# sentencepiece.pc is in some pkgconfig inside uv cache
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
......@@ -173,21 +229,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install /vllmwheel/*.whl
###############################################################
# Stage to build numactl
###############################################################
FROM base-builder AS numa-builder
# Note: Building numactl with gcc-11. Compiling with gcc-13 in this builder stage will
# trigger recompilation with gcc-11 (and require libtool) in the final stage where we do not have gcc-13
ARG MAX_JOBS
ARG NUMACTL_VERSION=2.0.19
RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_VERSION} \
&& cd numactl \
&& autoreconf -i && ./configure \
&& make -j ${MAX_JOBS:-$(nproc)}
###############################################################
# Stage to build lapack
###############################################################
......@@ -217,6 +258,7 @@ ENV PATH=${VIRTUAL_ENV}/bin:$PATH
ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig/
ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/lib64:/usr/local/lib:/usr/lib64:/usr/lib
ENV UV_LINK_MODE=copy
ENV OMP_NUM_THREADS=16
# create artificial dependencies between stages for independent stages to build in parallel
COPY --from=torch-builder /tmp/control /dev/null
......@@ -225,11 +267,13 @@ COPY --from=cv-builder /tmp/control /dev/null
COPY --from=vllmcache-builder /tmp/control /dev/null
COPY --from=numa-builder /tmp/control /dev/null
COPY --from=lapack-builder /tmp/control /dev/null
COPY --from=openblas-builder /tmp/control /dev/null
# install gcc-11, python, openblas, numactl, lapack
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
--mount=type=bind,from=lapack-builder,source=/lapack/,target=/lapack/,rw \
--mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
microdnf install --nodocs -y \
tar findutils openssl \
......@@ -241,8 +285,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
&& microdnf clean all \
&& python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
&& python -m pip install -U pip uv --no-cache \
&& curl -sL https://ftp2.osuosl.org/pub/ppc64el/openblas/latest/Openblas_${OPENBLAS_VERSION}_ppc64le.tar.gz | tar xvf - -C /usr/local \
&& make -C /numactl install \
&& PREFIX=/usr/local make -C /openblas install \
&& uv pip install 'cmake<4' \
&& cmake --install /lapack/build \
&& uv pip uninstall cmake
......
# default base image
ARG REMOTE_VLLM="0"
ARG USE_CYTHON="0"
ARG BUILD_RPD="1"
ARG COMMON_WORKDIR=/app
ARG BASE_IMAGE=rocm/vllm-dev:base
......@@ -15,7 +13,7 @@ RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl
# Remove sccache
RUN python3 -m pip install --upgrade pip && pip install setuptools_scm
RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
ARG COMMON_WORKDIR
WORKDIR ${COMMON_WORKDIR}
......@@ -30,18 +28,17 @@ ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
ARG VLLM_BRANCH="main"
ONBUILD RUN git clone ${VLLM_REPO} \
&& cd vllm \
&& git checkout ${VLLM_BRANCH}
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
&& git checkout FETCH_HEAD
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
# -----------------------
# vLLM build stages
FROM fetch_vllm AS build_vllm
ARG USE_CYTHON
# Build vLLM
RUN cd vllm \
&& python3 -m pip install -r requirements/rocm.txt \
&& python3 setup.py clean --all \
&& if [ ${USE_CYTHON} -eq "1" ]; then python3 tests/build_cython.py build_ext --inplace; fi \
&& python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm
ARG COMMON_WORKDIR
......@@ -90,13 +87,6 @@ RUN case "$(which python3)" in \
*) ;; esac
RUN python3 -m pip install --upgrade huggingface-hub[cli]
ARG BUILD_RPD
RUN if [ ${BUILD_RPD} -eq "1" ]; then \
git clone -b nvtx_enabled https://github.com/ROCm/rocmProfileData.git \
&& cd rocmProfileData/rpd_tracer \
&& pip install -r requirements.txt && cd ../ \
&& make && make install \
&& cd hipMarker && python3 setup.py install ; fi
# Install vLLM
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
......@@ -117,12 +107,6 @@ ENV TOKENIZERS_PARALLELISM=false
# ENV that can improve safe tensor loading, and end-to-end time
ENV SAFETENSORS_FAST_GPU=1
# User-friendly environment setting for multi-processing to avoid below RuntimeError.
# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
# you must use the 'spawn' start method
# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
# Performance environment variable.
ENV HIP_FORCE_DEV_KERNARG=1
......
......@@ -12,6 +12,7 @@ nav:
- User Guide: usage/README.md
- Developer Guide: contributing/README.md
- API Reference: api/README.md
- CLI Reference: cli/README.md
- Timeline:
- Roadmap: https://roadmap.vllm.ai
- Releases: https://github.com/vllm-project/vllm/releases
......@@ -56,6 +57,8 @@ nav:
- Contents:
- glob: api/vllm/*
preserve_directory_names: true
- CLI Reference:
- Summary: cli/README.md
- Community:
- community/*
- Blog: https://blog.vllm.ai
......
......@@ -12,8 +12,8 @@
<p style="text-align:center">
<script async defer src="https://buttons.github.io/buttons.js"></script>
<a class="github-button" href="https://github.com/vllm-project/vllm" data-show-count="true" data-size="large" aria-label="Star">Star</a>
<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
<a class="github-button" href="https://github.com/vllm-project/vllm/subscription" data-show-count="true" data-icon="octicon-eye" data-size="large" aria-label="Watch">Watch</a>
<a class="github-button" href="https://github.com/vllm-project/vllm/fork" data-show-count="true" data-icon="octicon-repo-forked" data-size="large" aria-label="Fork">Fork</a>
</p>
vLLM is a fast and easy-to-use library for LLM inference and serving.
......
# vLLM CLI Guide
The vllm command-line tool is used to run and manage vLLM models. You can start by viewing the help message with:
```
vllm --help
```
Available Commands:
```
vllm {chat,complete,serve,bench,collect-env,run-batch}
```
## serve
Start the vLLM OpenAI Compatible API server.
Examples:
```bash
# Start with a model
vllm serve meta-llama/Llama-2-7b-hf
# Specify the port
vllm serve meta-llama/Llama-2-7b-hf --port 8100
# Check with --help for more options
# To list all groups
vllm serve --help=listgroup
# To view a argument group
vllm serve --help=ModelConfig
# To view a single argument
vllm serve --help=max-num-seqs
# To search by keyword
vllm serve --help=max
```
## chat
Generate chat completions via the running API server.
Examples:
```bash
# Directly connect to localhost API without arguments
vllm chat
# Specify API url
vllm chat --url http://{vllm-serve-host}:{vllm-serve-port}/v1
# Quick chat with a single prompt
vllm chat --quick "hi"
```
## complete
Generate text completions based on the given prompt via the running API server.
Examples:
```bash
# Directly connect to localhost API without arguments
vllm complete
# Specify API url
vllm complete --url http://{vllm-serve-host}:{vllm-serve-port}/v1
# Quick complete with a single prompt
vllm complete --quick "The future of AI is"
```
## bench
Run benchmark tests for latency online serving throughput and offline inference throughput.
To use benchmark commands, please install with extra dependencies using `pip install vllm[bench]`.
Available Commands:
```bash
vllm bench {latency, serve, throughput}
```
### latency
Benchmark the latency of a single batch of requests.
Example:
```bash
vllm bench latency \
--model meta-llama/Llama-3.2-1B-Instruct \
--input-len 32 \
--output-len 1 \
--enforce-eager \
--load-format dummy
```
### serve
Benchmark the online serving throughput.
Example:
```bash
vllm bench serve \
--model meta-llama/Llama-3.2-1B-Instruct \
--host server-host \
--port server-port \
--random-input-len 32 \
--random-output-len 4 \
--num-prompts 5
```
### throughput
Benchmark offline inference throughput.
Example:
```bash
vllm bench throughput \
--model meta-llama/Llama-3.2-1B-Instruct \
--input-len 32 \
--output-len 1 \
--enforce-eager \
--load-format dummy
```
## collect-env
Start collecting environment information.
```bash
vllm collect-env
```
## run-batch
Run batch prompts and write results to file.
Examples:
```bash
# Running with a local file
vllm run-batch \
-i offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \
--model meta-llama/Meta-Llama-3-8B-Instruct
# Using remote file
vllm run-batch \
-i https://raw.githubusercontent.com/vllm-project/vllm/main/examples/offline_inference/openai_batch/openai_example_batch.jsonl \
-o results.jsonl \
--model meta-llama/Meta-Llama-3-8B-Instruct
```
## More Help
For detailed options of any subcommand, use:
```bash
vllm <subcommand> --help
```
......@@ -29,20 +29,68 @@ See <gh-file:LICENSE>.
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation.
Check out the [building from source][build-from-source] documentation for details.
### Building the docs
### Building the docs with MkDocs
Install the dependencies:
#### Introduction to MkDocs
[MkDocs](https://github.com/mkdocs/mkdocs) is a fast, simple and downright gorgeous static site generator that's geared towards building project documentation. Documentation source files are written in Markdown, and configured with a single YAML configuration file.
#### Install MkDocs and Plugins
Install MkDocs along with the [plugins](https://github.com/vllm-project/vllm/blob/main/mkdocs.yaml) used in the vLLM documentation, as well as required dependencies:
```bash
pip install -r requirements/docs.txt
```
Start the autoreloading MkDocs server:
!!! note
Ensure that your Python version is compatible with the plugins (e.g., `mkdocs-awesome-nav` requires Python 3.10+)
#### Verify Installation
Confirm that MkDocs is correctly installed:
```bash
mkdocs --version
```
Example output:
```console
mkdocs, version 1.6.1 from /opt/miniconda3/envs/mkdoc/lib/python3.10/site-packages/mkdocs (Python 3.10)
```
#### Clone the `vLLM` repository
```bash
git clone https://github.com/vllm-project/vllm.git
cd vllm
```
#### Start the Development Server
MkDocs comes with a built-in dev-server that lets you preview your documentation as you work on it. Make sure you're in the same directory as the `mkdocs.yml` configuration file, and then start the server by running the `mkdocs serve` command:
```bash
mkdocs serve
```
Example output:
```console
INFO - Documentation built in 106.83 seconds
INFO - [22:02:02] Watching paths for changes: 'docs', 'mkdocs.yaml'
INFO - [22:02:02] Serving on http://127.0.0.1:8000/
```
#### View in Your Browser
Open up [http://127.0.0.1:8000/](http://127.0.0.1:8000/) in your browser to see a live preview:.
#### Learn More
For additional features and advanced configurations, refer to the official [MkDocs Documentation](https://www.mkdocs.org/).
## Testing
```bash
......@@ -60,6 +108,9 @@ pre-commit run mypy-3.9 --hook-stage manual --all-files
# Unit tests
pytest tests/
# Run tests for a single test file with detailed output
pytest -s -v tests/test_logger.py
```
!!! tip
......
# CI Failures
What should I do when a CI job fails on my PR, but I don't think my PR caused
the failure?
- Check the dashboard of current CI test failures:
👉 [CI Failures Dashboard](https://github.com/orgs/vllm-project/projects/20)
- If your failure **is already listed**, it's likely unrelated to your PR.
Help fixing it is always welcome!
- Leave comments with links to additional instances of the failure.
- React with a 👍 to signal how many are affected.
- If your failure **is not listed**, you should **file an issue**.
## Filing a CI Test Failure Issue
- **File a bug report:**
👉 [New CI Failure Report](https://github.com/vllm-project/vllm/issues/new?template=450-ci-failure.yml)
- **Use this title format:**
```
[CI Failure]: failing-test-job - regex/matching/failing:test
```
- **For the environment field:**
```
Still failing on main as of commit abcdef123
```
- **In the description, include failing tests:**
```
FAILED failing/test.py:failing_test1 - Failure description
FAILED failing/test.py:failing_test2 - Failure description
https://github.com/orgs/vllm-project/projects/20
https://github.com/vllm-project/vllm/issues/new?template=400-bug-report.yml
FAILED failing/test.py:failing_test3 - Failure description
```
- **Attach logs** (collapsible section example):
<details>
<summary>Logs:</summary>
```text
ERROR 05-20 03:26:38 [dump_input.py:68] Dumping input data
--- Logging error ---
Traceback (most recent call last):
File "/usr/local/lib/python3.12/dist-packages/vllm/v1/engine/core.py", line 203, in execute_model
return self.model_executor.execute_model(scheduler_output)
...
FAILED failing/test.py:failing_test1 - Failure description
FAILED failing/test.py:failing_test2 - Failure description
FAILED failing/test.py:failing_test3 - Failure description
```
</details>
## Logs Wrangling
Download the full log file from Buildkite locally.
Strip timestamps and colorization:
<gh-file:.buildkite/scripts/ci-clean-log.sh>
```bash
./ci-clean-log.sh ci.log
```
Use a tool [wl-clipboard](https://github.com/bugaevc/wl-clipboard) for quick copy-pasting:
```bash
tail -525 ci_build.log | wl-copy
```
## Investigating a CI Test Failure
1. Go to 👉 [Buildkite main branch](https://buildkite.com/vllm/ci/builds?branch=main)
2. Bisect to find the first build that shows the issue.
3. Add your findings to the GitHub issue.
4. If you find a strong candidate PR, mention it in the issue and ping contributors.
## Reproducing a Failure
CI test failures may be flaky. Use a bash loop to run repeatedly:
<gh-file:.buildkite/scripts/rerun-test.sh>
```bash
./rerun-test.sh tests/v1/engine/test_engine_core_client.py::test_kv_cache_events[True-tcp]
```
## Submitting a PR
If you submit a PR to fix a CI failure:
- Link the PR to the issue:
Add `Closes #12345` to the PR description.
- Add the `ci-failure` label:
This helps track it in the [CI Failures GitHub Project](https://github.com/orgs/vllm-project/projects/20).
## Other Resources
- 🔍 [Test Reliability on `main`](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main&order=ASC&sort_by=reliability)
- 🧪 [Latest Buildkite CI Runs](https://buildkite.com/vllm/ci/builds?branch=main)
## Daily Triage
Use [Buildkite analytics (2-day view)](https://buildkite.com/organizations/vllm/analytics/suites/ci-1/tests?branch=main&period=2days) to:
- Identify recent test failures **on `main`**.
- Exclude legitimate test failures on PRs.
- (Optional) Ignore tests with 0% reliability.
Compare to the [CI Failures Dashboard](https://github.com/orgs/vllm-project/projects/20).
......@@ -46,11 +46,11 @@ You can add any other [engine-args][engine-args] you need after the image tag (`
create a custom Dockerfile on top of the base image with an extra layer that installs them:
```Dockerfile
FROM vllm/vllm-openai:v0.8.3
FROM vllm/vllm-openai:v0.9.0
# e.g. install the `audio` optional dependencies
# NOTE: Make sure the version of vLLM matches the base image!
RUN uv pip install --system vllm[audio]==0.8.3
RUN uv pip install --system vllm[audio]==0.9.0
```
!!! tip
......@@ -107,10 +107,21 @@ DOCKER_BUILDKIT=1 docker build . \
-t vllm/vllm-gh200-openai:latest \
--build-arg max_jobs=66 \
--build-arg nvcc_threads=2 \
--build-arg torch_cuda_arch_list="9.0+PTX" \
--build-arg torch_cuda_arch_list="9.0 10.0+PTX" \
--build-arg vllm_fa_cmake_gpu_arches="90-real"
```
!!! note
If you are building the `linux/arm64` image on a non-ARM host (e.g., an x86_64 machine), you need to ensure your system is set up for cross-compilation using QEMU. This allows your host machine to emulate ARM64 execution.
Run the following command on your host machine to register QEMU user static handlers:
```console
docker run --rm --privileged multiarch/qemu-user-static --reset -p yes
```
After setting up QEMU, you can use the `--platform "linux/arm64"` flag in your `docker build` command.
## Use the custom-built vLLM Docker image
To run vLLM with the custom-built Docker image:
......
......@@ -5,16 +5,6 @@ title: Using Nginx
This document shows how to launch multiple vLLM serving containers and use Nginx to act as a load balancer between the servers.
Table of contents:
1. [Build Nginx Container][nginxloadbalancer-nginx-build]
2. [Create Simple Nginx Config file][nginxloadbalancer-nginx-conf]
3. [Build vLLM Container][nginxloadbalancer-nginx-vllm-container]
4. [Create Docker Network][nginxloadbalancer-nginx-docker-network]
5. [Launch vLLM Containers][nginxloadbalancer-nginx-launch-container]
6. [Launch Nginx][nginxloadbalancer-nginx-launch-nginx]
7. [Verify That vLLM Servers Are Ready][nginxloadbalancer-nginx-verify-nginx]
[](){ #nginxloadbalancer-nginx-build }
## Build Nginx Container
......
......@@ -48,8 +48,7 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```
More API details can be found in the [Offline Inference]
(#offline-inference-api) section of the API docs.
More API details can be found in the [Offline Inference](#offline-inference-api) section of the API docs.
The code for the `LLM` class can be found in <gh-file:vllm/entrypoints/llm.py>.
......
......@@ -22,13 +22,13 @@ This document describes how vLLM deals with these challenges.
[Python multiprocessing methods](https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods) include:
- `spawn` - spawn a new Python process. This will be the default as of Python
3.14. In macOS, this is already the default.
- `spawn` - spawn a new Python process. The default on Windows and macOS.
- `fork` - Use `os.fork()` to fork the Python interpreter. This is the default
in Python versions prior to 3.14.
- `fork` - Use `os.fork()` to fork the Python interpreter. The default on
Linux for Python versions prior to 3.14.
- `forkserver` - Spawn a server process that will fork a new process on request.
The default on Linux for Python version 3.14 and newer.
### Tradeoffs
......
......@@ -104,7 +104,7 @@ class KVCacheBlock:
block_id: int
# The block hash (will be assigned when the block is full,
# and will be reset when the block is evicted).
block_hash: BlockHashType
block_hash: BlockHash
# The number of requests using this block now.
ref_cnt: int
......@@ -144,7 +144,7 @@ As a result, we will have the following components when the KV cache manager is
**Running request:** Workflow for the scheduler to schedule a running request with KV cache block allocation:
1. The scheduler calls `kv_cache_manager.append_slots()`. It does the following steps:
1. The scheduler calls `kv_cache_manager.allocate_slots()`. It does the following steps:
1. Compute the number of new required blocks, and return if there are no sufficient blocks to allocate.
2. Allocate new blocks by popping the heads of the free queue. If the head block is a cached block, this also “evicts” the block so that no other requests can reuse it anymore from now on.
3. Append token IDs to the slots in existing blocks as well as the new blocks. If a block is full, we add it to the Cache Block to cache it.
......
......@@ -10,6 +10,7 @@ The symbols used have the following meanings:
- ✅ = Full compatibility
- 🟠 = Partial compatibility
- ❌ = No compatibility
- ❔ = Unknown or TBD
!!! note
Check the ❌ or 🟠 with links to see tracking issue for unsupported feature/hardware combination.
......@@ -36,23 +37,23 @@ th:not(:first-child) {
}
</style>
| Feature | [CP][chunked-prefill] | [APC][automatic-prefix-caching] | [LoRA][lora-adapter] | <abbr title="Prompt Adapter">prmpt adptr</abbr> | [SD][spec-decode] | CUDA graph | <abbr title="Pooling Models">pooling</abbr> | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
|-----------------------------------------------------------|-------------------------|-----------------------------------|------------------------|---------------------------------------------------|---------------------|--------------|-----------------------------------------------|-------------------------------------------------------|--------------------------------------|---------------------------------------------------|-------------------------------------------------------------|--------------------|---------------------------------------------|-----------|---------------|
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
| [APC][automatic-prefix-caching] | ✅ | ✅ | | | | | | | | | | | | | |
| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | | | | | | | | | | | | |
| <abbr title="Prompt Adapter">prmpt adptr</abbr> | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | |
| [SD][spec-decode] | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
| <abbr title="Pooling Models">pooling</abbr> | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [](gh-issue:7366) | ❌ | ❌ | [](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | |
| multi-step | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | |
| <abbr title="Multimodal Inputs">mm</abbr> | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | |
| best-of | ✅ | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ✅ | ✅ | |
| beam-search | ✅ | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ❔ | ✅ | ✅ |
| Feature | [CP][chunked-prefill] | [APC][automatic-prefix-caching] | [LoRA][lora-adapter] | <abbr title="Prompt Adapter">prmpt adptr</abbr> | [SD][spec-decode] | CUDA graph | <abbr title="Pooling Models">pooling</abbr> | <abbr title="Encoder-Decoder Models">enc-dec</abbr> | <abbr title="Logprobs">logP</abbr> | <abbr title="Prompt Logprobs">prmpt logP</abbr> | <abbr title="Async Output Processing">async output</abbr> | multi-step | <abbr title="Multimodal Inputs">mm</abbr> | best-of | beam-search |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| [CP][chunked-prefill] | ✅ | | | | | | | | | | | | | | |
| [APC][automatic-prefix-caching] | ✅ | ✅ | | | | | | | | | | | | | |
| [LoRA][lora-adapter] | ✅ | ✅ | ✅ | | | | | | | | | | | | |
| <abbr title="Prompt Adapter">prmpt adptr</abbr> | ✅ | ✅ | ✅ | ✅ | | | | | | | | | | | |
| [SD][spec-decode] | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | | | | | |
| CUDA graph | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | | | | | | | | | |
| <abbr title="Pooling Models">pooling</abbr> | ❌ | ❌ | ❌ | ❌ | ❌ | ❌ | ✅ | | | | | | | | |
| <abbr title="Encoder-Decoder Models">enc-dec</abbr> | ❌ | [](gh-issue:7366) | ❌ | ❌ | [](gh-issue:7366) | ✅ | ✅ | ✅ | | | | | | | |
| <abbr title="Logprobs">logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | | | | | | |
| <abbr title="Prompt Logprobs">prmpt logP</abbr> | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ | | | | | |
| <abbr title="Async Output Processing">async output</abbr> | ✅ | ✅ | ✅ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | | | | |
| multi-step | ❌ | ✅ | ❌ | ✅ | ❌ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ | ✅ | | | |
| <abbr title="Multimodal Inputs">mm</abbr> | ✅ | [🟠](gh-pr:8348) | [🟠](gh-pr:4194) | ❔ | ❔ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ❔ | ✅ | | |
| best-of | ✅ | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ✅ | ✅ | |
| beam-search | ✅ | ✅ | ✅ | ✅ | [](gh-issue:6137) | ✅ | ❌ | ✅ | ✅ | ✅ | ❔ | [](gh-issue:7968) | ❔ | ✅ | ✅ |
[](){ #feature-x-hardware }
......@@ -75,3 +76,6 @@ th:not(:first-child) {
| multi-step | ✅ | ✅ | ✅ | ✅ | ✅ | [](gh-issue:8477) | ✅ |
| best-of | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| beam-search | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
!!! note
Please refer to [Feature support through NxD Inference backend][feature-support-through-nxd-inference-backend] for features supported on AWS Neuron hardware
......@@ -165,6 +165,7 @@ it will first look in the local directory for a directory `foobar`, and attempt
that adapter will then be available for normal use on the server.
Alternatively, follow these example steps to implement your own plugin:
1. Implement the LoRAResolver interface.
Example of a simple S3 LoRAResolver implementation:
......@@ -198,9 +199,9 @@ Alternatively, follow these example steps to implement your own plugin:
return lora_request
```
2. Register LoRAResolver plugin.
2. Register `LoRAResolver` plugin.
```python
```python
from vllm.lora.resolver import LoRAResolverRegistry
s3_resolver = S3LoRAResolver()
......
......@@ -5,13 +5,13 @@ title: Supported Hardware
The table below shows the compatibility of various quantization implementations with different hardware platforms in vLLM:
| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | AWS Inferentia | Google TPU |
| Implementation | Volta | Turing | Ampere | Ada | Hopper | AMD GPU | Intel GPU | x86 CPU | AWS Neuron | Google TPU |
|-----------------------|---------|----------|----------|-------|----------|-----------|-------------|-----------|------------------|--------------|
| AWQ | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ |
| GPTQ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ✅︎ | ✅︎ | ❌ | ❌ |
| Marlin (GPTQ/AWQ/FP8) | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | | ✅︎ |
| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | | ❌ |
| INT8 (W8A8) | ❌ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ |
| FP8 (W8A8) | ❌ | ❌ | ❌ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ✅︎ | ❌ |
| BitBLAS (GPTQ) | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| AQLM | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
| bitsandbytes | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ✅︎ | ❌ | ❌ | ❌ | ❌ | ❌ |
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment