Commit 7a985548 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.9.0' into v0.9.0-ori

parents 45d3785c dc1440cf
...@@ -77,13 +77,40 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -77,13 +77,40 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor suffix_output," " Tensor suffix_output,"
" Tensor suffix_lse) -> ()"); " Tensor suffix_lse) -> ()");
ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states); ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states);
ops.def(
"convert_vertical_slash_indexes("
" Tensor! block_count, Tensor! block_offset, "
" Tensor! column_count, Tensor! column_index, "
" Tensor q_seqlens, Tensor q_seqlens, "
" Tensor vertical_indexes, Tensor slash_indexes, "
" int context_size, int block_size_M, int block_size_N, "
" bool causal) -> ()");
ops.impl("convert_vertical_slash_indexes", torch::kCUDA,
&convert_vertical_slash_indexes);
ops.def(
"convert_vertical_slash_indexes_mergehead("
" Tensor! block_count, Tensor! block_offset, "
" Tensor! column_count, Tensor! column_index, "
" Tensor q_seqlens, Tensor q_seqlens, "
" Tensor vertical_indexes, Tensor slash_indexes, "
" Tensor vertical_indices_count, Tensor slash_indices_count, "
" int context_size, int block_size_M, int block_size_N, "
" bool causal) -> ()");
ops.impl("convert_vertical_slash_indexes_mergehead", torch::kCUDA,
&convert_vertical_slash_indexes_mergehead);
#endif #endif
// Activation ops // Activation ops
// Activation function used in SwiGLU. // Activation function used in SwiGLU.
ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()"); ops.def("silu_and_mul(Tensor! result, Tensor input) -> ()");
ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul); ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);
ops.def(
"silu_and_mul_quant(Tensor! result, Tensor input, Tensor scale) -> ()");
ops.impl("silu_and_mul_quant", torch::kCUDA, &silu_and_mul_quant);
ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()"); ops.def("mul_and_silu(Tensor! out, Tensor input) -> ()");
ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu); ops.impl("mul_and_silu", torch::kCUDA, &mul_and_silu);
...@@ -130,13 +157,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -130,13 +157,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
") -> ()"); ") -> ()");
ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer); ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);
// Compute MLA decode using cutlass.
// ops.def(
// "cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
// " Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
// " Tensor page_table, float scale) -> ()");
// ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);
// Layernorm // Layernorm
// Apply Root Mean Square (RMS) Normalization to the input tensor. // Apply Root Mean Square (RMS) Normalization to the input tensor.
ops.def( ops.def(
...@@ -179,7 +199,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -179,7 +199,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Apply GPT-NeoX or GPT-J style rotary embedding to query and key. // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
ops.def( ops.def(
"rotary_embedding(Tensor positions, Tensor! query," "rotary_embedding(Tensor positions, Tensor! query,"
" Tensor! key, int head_size," " Tensor!? key, int head_size,"
" Tensor cos_sin_cache, bool is_neox) -> ()"); " Tensor cos_sin_cache, bool is_neox) -> ()");
ops.impl("rotary_embedding", torch::kCUDA, &rotary_embedding); ops.impl("rotary_embedding", torch::kCUDA, &rotary_embedding);
...@@ -187,7 +207,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -187,7 +207,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// (supports multiple loras). // (supports multiple loras).
ops.def( ops.def(
"batched_rotary_embedding(Tensor positions, Tensor! query," "batched_rotary_embedding(Tensor positions, Tensor! query,"
" Tensor! key, int head_size," " Tensor!? key, int head_size,"
" Tensor cos_sin_cache, bool is_neox," " Tensor cos_sin_cache, bool is_neox,"
" int rot_dim," " int rot_dim,"
" Tensor cos_sin_cache_offsets) -> ()"); " Tensor cos_sin_cache_offsets) -> ()");
...@@ -298,12 +318,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -298,12 +318,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// gptq_marlin Optimized Quantized GEMM for GPTQ. // gptq_marlin Optimized Quantized GEMM for GPTQ.
ops.def( ops.def(
"gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, " "gptq_marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
"Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, " "Tensor b_scales, Tensor? global_scale, Tensor? b_zeros_or_none, Tensor? "
"int b_q_type, " "g_idx_or_none, Tensor? perm_or_none, Tensor workspace, int b_q_type, "
"SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, " "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, "
"bool has_zp, bool use_atomic_add, bool use_fp32_reduce, " "bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor",
"bool is_zp_float) -> Tensor",
{stride_tag}); {stride_tag});
// conditionally compiled so impl registration is in source file // conditionally compiled so impl registration is in source file
...@@ -345,17 +364,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -345,17 +364,15 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"int type, SymInt row, SymInt top_k, SymInt tokens) -> Tensor"); "int type, SymInt row, SymInt top_k, SymInt tokens) -> Tensor");
ops.impl("ggml_moe_a8", torch::kCUDA, &ggml_moe_a8); ops.impl("ggml_moe_a8", torch::kCUDA, &ggml_moe_a8);
ops.def(
"ggml_moe_a8_vec(Tensor X, Tensor W, "
"Tensor topk_ids, int top_k, "
"int type, SymInt row, SymInt tokens) -> Tensor");
ops.impl("ggml_moe_a8_vec", torch::kCUDA, &ggml_moe_a8_vec);
ops.def("ggml_moe_get_block_size", &ggml_moe_get_block_size); ops.def("ggml_moe_get_block_size", &ggml_moe_get_block_size);
#ifndef USE_ROCM #ifndef USE_ROCM
// fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
ops.def(
"fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
"Tensor! workspace, int num_bits, SymInt size_m, SymInt size_n, "
"SymInt size_k) -> Tensor",
{stride_tag});
// conditionally compiled so impl registration is in source file
// marlin_qqq_gemm for QQQ. // marlin_qqq_gemm for QQQ.
ops.def( ops.def(
"marlin_qqq_gemm(Tensor a, Tensor b_q_weight, " "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "
...@@ -373,6 +390,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -373,6 +390,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
{stride_tag}); {stride_tag});
ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm); ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);
// cutlass nvfp4 block scaled group GEMM
ops.def(
"cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b,"
" Tensor a_blockscale, Tensor b_blockscales, Tensor alphas,"
" Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()",
{stride_tag});
ops.impl("cutlass_fp4_group_mm", torch::kCUDA, &cutlass_fp4_group_mm);
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
// quantization, as well as bias // quantization, as well as bias
ops.def( ops.def(
...@@ -454,6 +479,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -454,6 +479,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("cutlass_sparse_compress(Tensor a) -> Tensor[]"); ops.def("cutlass_sparse_compress(Tensor a) -> Tensor[]");
ops.impl("cutlass_sparse_compress", &cutlass_sparse_compress); ops.impl("cutlass_sparse_compress", &cutlass_sparse_compress);
// CUTLASS MLA decode
ops.def(
"cutlass_mla_decode(Tensor! out, Tensor q_nope, Tensor q_pe,"
" Tensor kv_c_and_k_pe_cache, Tensor seq_lens,"
" Tensor page_table, float scale) -> ()");
ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);
// Mamba selective scan kernel // Mamba selective scan kernel
ops.def( ops.def(
"selective_scan_fwd(Tensor! u, Tensor! delta," "selective_scan_fwd(Tensor! u, Tensor! delta,"
...@@ -495,6 +527,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -495,6 +527,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor! output_scale, Tensor input_scale) -> ()"); " Tensor! output_scale, Tensor input_scale) -> ()");
ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant); ops.impl("scaled_fp4_quant", torch::kCUDA, &scaled_fp4_quant);
// Compute NVFP4 experts quantization.
ops.def(
"scaled_fp4_experts_quant(Tensor! output, Tensor! output_scale,"
"Tensor input, Tensor input_global_scale, Tensor input_offset_by_experts,"
"Tensor output_scale_offset_by_experts) -> ()");
ops.impl("scaled_fp4_experts_quant", torch::kCUDA, &scaled_fp4_experts_quant);
// Check if cutlass_scaled_mm_fp4 is supported for CUDA devices // Check if cutlass_scaled_mm_fp4 is supported for CUDA devices
// of the given capability // of the given capability
ops.def("cutlass_scaled_mm_supports_fp4(int cuda_device_capability) -> bool"); ops.def("cutlass_scaled_mm_supports_fp4(int cuda_device_capability) -> bool");
......
...@@ -5,11 +5,11 @@ ...@@ -5,11 +5,11 @@
# docs/source/contributing/dockerfile/dockerfile.md and # docs/source/contributing/dockerfile/dockerfile.md and
# docs/source/assets/contributing/dockerfile-stages-dependency.png # docs/source/assets/contributing/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.4.1 ARG CUDA_VERSION=12.8.1
#################### BASE BUILD IMAGE #################### #################### BASE BUILD IMAGE ####################
# prepare basic build environment # prepare basic build environment
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 AS base
ARG CUDA_VERSION=12.4.1 ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
ARG TARGETPLATFORM ARG TARGETPLATFORM
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
...@@ -19,7 +19,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ...@@ -19,7 +19,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \ && apt-get install -y ccache software-properties-common git curl sudo \
&& add-apt-repository ppa:deadsnakes/ppa \ && for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
...@@ -34,6 +37,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -34,6 +37,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519 # Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels # as it was causing spam when compiling the CUTLASS kernels
...@@ -66,13 +70,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -66,13 +70,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \
COPY requirements/common.txt requirements/common.txt COPY requirements/common.txt requirements/common.txt
COPY requirements/cuda.txt requirements/cuda.txt COPY requirements/cuda.txt requirements/cuda.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/cuda.txt uv pip install --system -r requirements/cuda.txt \
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# cuda arch list used by torch # cuda arch list used by torch
# can be useful for both `dev` and `test` # can be useful for both `dev` and `test`
# explicitly set the list to avoid issues with torch 2.2 # explicitly set the list to avoid issues with torch 2.2
# see https://github.com/pytorch/pytorch/pull/123243 # see https://github.com/pytorch/pytorch/pull/123243
ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX' ARG torch_cuda_arch_list='7.0 7.5 8.0 8.9 9.0 10.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Override the arch list for flash-attn to reduce the binary size # Override the arch list for flash-attn to reduce the binary size
ARG vllm_fa_cmake_gpu_arches='80-real;90-real' ARG vllm_fa_cmake_gpu_arches='80-real;90-real'
...@@ -89,9 +94,11 @@ COPY requirements/build.txt requirements/build.txt ...@@ -89,9 +94,11 @@ COPY requirements/build.txt requirements/build.txt
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt uv pip install --system -r requirements/build.txt \
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
COPY . . COPY . .
ARG GIT_REPO_CHECK=0 ARG GIT_REPO_CHECK=0
...@@ -158,22 +165,25 @@ FROM base as dev ...@@ -158,22 +165,25 @@ FROM base as dev
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
COPY requirements/lint.txt requirements/lint.txt COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt COPY requirements/dev.txt requirements/dev.txt
# Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system mamba-ssm==2.2.4 --no-build-isolation uv pip install --system -r requirements/dev.txt \
RUN --mount=type=cache,target=/root/.cache/uv \ --extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
uv pip install --system -r requirements/dev.txt
#################### DEV IMAGE #################### #################### DEV IMAGE ####################
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
# image with vLLM installed # image with vLLM installed
# TODO: Restore to base image after FlashInfer AOT wheel fixed # TODO: Restore to base image after FlashInfer AOT wheel fixed
FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base FROM nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 AS vllm-base
ARG CUDA_VERSION=12.4.1 ARG CUDA_VERSION=12.8.1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
...@@ -188,7 +198,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ...@@ -188,7 +198,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \ && for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
...@@ -203,6 +216,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -203,6 +216,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Workaround for https://github.com/openai/triton/issues/2507 and # Workaround for https://github.com/openai/triton/issues/2507 and
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully # https://github.com/pytorch/pytorch/issues/107960 -- hopefully
...@@ -223,7 +237,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -223,7 +237,8 @@ RUN --mount=type=cache,target=/root/.cache/uv \
# Install vllm wheel first, so that torch etc will be installed. # Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose uv pip install --system dist/*.whl --verbose \
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# If we need to build FlashInfer wheel before its release: # If we need to build FlashInfer wheel before its release:
# $ export FLASHINFER_ENABLE_AOT=1 # $ export FLASHINFER_ENABLE_AOT=1
...@@ -240,19 +255,32 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist ...@@ -240,19 +255,32 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \ . /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.1.post2/flashinfer_python-0.2.1.post2+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \ # uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.4/flashinfer_python-0.2.4+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
# TESTING: install FlashInfer from source to test 2.7.0 final RC
if [[ "$CUDA_VERSION" == 12.8* ]]; then \
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'; \
else \
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \
fi && \
export FLASHINFER_ENABLE_AOT=1; \
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
fi fi
COPY examples examples COPY examples examples
COPY benchmarks benchmarks COPY benchmarks benchmarks
COPY ./vllm/collect_env.py . COPY ./vllm/collect_env.py .
RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \
uv pip list
# Although we build Flashinfer with AOT mode, there's still # Although we build Flashinfer with AOT mode, there's still
# some issues w.r.t. JIT compilation. Therefore we need to # some issues w.r.t. JIT compilation. Therefore we need to
# install build dependencies for JIT compilation. # install build dependencies for JIT compilation.
# TODO: Remove this once FlashInfer AOT wheel is fixed # TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements/build.txt requirements/build.txt COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt uv pip install --system -r requirements/build.txt \
--extra-index-url https://download.pytorch.org/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
...@@ -266,11 +294,13 @@ ADD . /vllm-workspace/ ...@@ -266,11 +294,13 @@ ADD . /vllm-workspace/
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# install development dependencies (for testing)
# Workaround for #17068 # Workaround for #17068
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system mamba-ssm==2.2.4 --no-build-isolation uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt uv pip install --system -r requirements/dev.txt
......
...@@ -16,7 +16,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ...@@ -16,7 +16,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo \ && apt-get install -y ccache software-properties-common git curl sudo \
&& add-apt-repository ppa:deadsnakes/ppa \ && for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
...@@ -197,7 +200,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ...@@ -197,7 +200,10 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ && apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ && apt-get install -y ffmpeg libsm6 libxext6 libgl1 \
&& add-apt-repository ppa:deadsnakes/ppa \ && for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
...@@ -303,5 +309,7 @@ ENV HF_HUB_ENABLE_HF_TRANSFER 1 ...@@ -303,5 +309,7 @@ ENV HF_HUB_ENABLE_HF_TRANSFER 1
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/nightly_torch_test.txt uv pip install --system -r requirements/nightly_torch_test.txt
#################### UNITTEST IMAGE ############################# # Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer'
#################### UNITTEST IMAGE #############################
...@@ -21,12 +21,8 @@ ENV UV_LINK_MODE=copy ...@@ -21,12 +21,8 @@ ENV UV_LINK_MODE=copy
# Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel # Note: A dummy file 'control' is created in /tmp/ to artificially create dependencies between stages when building stages in parallel
# when `--jobs=<N>` is passed with podman build command # when `--jobs=<N>` is passed with podman build command
RUN microdnf install -y openssl-devel dnf \ RUN microdnf install -y openssl-devel dnf \
&& dnf install -y https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-gpg-keys-9.0-24.el9.noarch.rpm \ && dnf install -y https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os/Packages/centos-stream-repos-9.0-24.el9.noarch.rpm \ && dnf config-manager --set-enabled codeready-builder-for-rhel-9-ppc64le-rpms \
https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm \
&& dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/BaseOS/`arch`/os \
&& dnf config-manager --add-repo https://mirror.stream.centos.org/9-stream/AppStream/`arch`/os \
&& dnf config-manager --set-enabled crb \
&& dnf install -y \ && dnf install -y \
git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \ git tar gcc-toolset-13 automake libtool numactl-devel lapack-devel \
pkgconfig xsimd zeromq-devel kmod findutils protobuf* \ pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
......
...@@ -114,8 +114,16 @@ COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples ...@@ -114,8 +114,16 @@ COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1 ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
ENV TOKENIZERS_PARALLELISM=false ENV TOKENIZERS_PARALLELISM=false
# ENV that can improve safe tensor loading, and end-to-end time
ENV SAFETENSORS_FAST_GPU=1
# User-friendly environment setting for multi-processing to avoid below RuntimeError.
# RuntimeError: Cannot re-initialize CUDA in forked subprocess. To use CUDA with multiprocessing,
# you must use the 'spawn' start method
# See https://pytorch.org/docs/stable/notes/multiprocessing.html#cuda-in-multiprocessing
ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
# Performance environment variable. # Performance environment variable.
ENV HIP_FORCE_DEV_KERNARG=1 ENV HIP_FORCE_DEV_KERNARG=1
CMD ["/bin/bash"] CMD ["/bin/bash"]
...@@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git" ...@@ -12,7 +12,7 @@ ARG PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="1a7f4dfa" ARG FA_BRANCH="1a7f4dfa"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="7e1ed08" ARG AITER_BRANCH="5a77249"
ARG AITER_REPO="https://github.com/ROCm/aiter.git" ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base
...@@ -32,7 +32,10 @@ ENV DEBIAN_FRONTEND=noninteractive ...@@ -32,7 +32,10 @@ ENV DEBIAN_FRONTEND=noninteractive
# Install Python and other dependencies # Install Python and other dependencies
RUN apt-get update -y \ RUN apt-get update -y \
&& apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \ && apt-get install -y software-properties-common git curl sudo vim less libgfortran5 \
&& add-apt-repository ppa:deadsnakes/ppa \ && for i in 1 2 3; do \
add-apt-repository -y ppa:deadsnakes/ppa && break || \
{ echo "Attempt $i failed, retrying in 5s..."; sleep 5; }; \
done \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \ && apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv \
python${PYTHON_VERSION}-lib2to3 python-is-python3 \ python${PYTHON_VERSION}-lib2to3 python-is-python3 \
......
...@@ -16,7 +16,7 @@ ENV LANG=C.UTF-8 \ ...@@ -16,7 +16,7 @@ ENV LANG=C.UTF-8 \
RUN microdnf install -y \ RUN microdnf install -y \
which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \ which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \
libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \ libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
openssl-devel openblas openblas-devel autoconf automake libtool cmake && \ openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy && \
microdnf clean all microdnf clean all
# Python Installation # Python Installation
...@@ -123,6 +123,7 @@ ENV UV_LINK_MODE=copy ...@@ -123,6 +123,7 @@ ENV UV_LINK_MODE=copy
ENV CARGO_HOME=/root/.cargo ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
COPY . /workspace/vllm COPY . /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
......
...@@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ...@@ -23,7 +23,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
python3 -m pip install \ python3 -m pip install \
-r requirements/tpu.txt -r requirements/tpu.txt
RUN python3 setup.py develop RUN python3 -m pip install -e .
# install development dependencies (for testing) # install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils RUN python3 -m pip install -e tests/vllm_test_utils
......
...@@ -40,12 +40,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ...@@ -40,12 +40,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
python3 setup.py install python3 setup.py install
# Please refer xpu doc, we need manually install intel-extension-for-pytorch 2.6.10+xpu due to there are some conflict dependencies with torch 2.6.0+xpu
# FIXME: This will be fix in ipex 2.7. just leave this here for awareness.
RUN --mount=type=cache,target=/root/.cache/pip \
pip install intel-extension-for-pytorch==2.6.10+xpu \
--extra-index-url=https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
CMD ["/bin/bash"] CMD ["/bin/bash"]
FROM vllm-base AS vllm-openai FROM vllm-base AS vllm-openai
......
...@@ -22,3 +22,4 @@ help: ...@@ -22,3 +22,4 @@ help:
clean: clean:
@$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) @$(SPHINXBUILD) -M clean "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
rm -rf "$(SOURCEDIR)/getting_started/examples" rm -rf "$(SOURCEDIR)/getting_started/examples"
rm -rf "$(SOURCEDIR)/api/vllm"
# AsyncLLMEngine
```{eval-rst}
.. autoclass:: vllm.AsyncLLMEngine
:members:
:show-inheritance:
```
# vLLM Engine
```{eval-rst}
.. automodule:: vllm.engine
```
```{eval-rst}
.. currentmodule:: vllm.engine
```
:::{toctree}
:caption: Engines
:maxdepth: 2
llm_engine
async_llm_engine
:::
# LLMEngine
```{eval-rst}
.. autoclass:: vllm.LLMEngine
:members:
:show-inheritance:
```
# Inference Parameters
Inference parameters for vLLM APIs.
(sampling-params)=
## Sampling Parameters
```{eval-rst}
.. autoclass:: vllm.SamplingParams
:members:
```
(pooling-params)=
## Pooling Parameters
```{eval-rst}
.. autoclass:: vllm.PoolingParams
:members:
```
# Model Adapters
## Module Contents
```{eval-rst}
.. automodule:: vllm.model_executor.models.adapters
:members:
:member-order: bysource
```
# Model Development
## Submodules
:::{toctree}
:maxdepth: 1
interfaces_base
interfaces
adapters
:::
# Optional Interfaces
## Module Contents
```{eval-rst}
.. automodule:: vllm.model_executor.models.interfaces
:members:
:member-order: bysource
```
# Base Model Interfaces
## Module Contents
```{eval-rst}
.. automodule:: vllm.model_executor.models.interfaces_base
:members:
:member-order: bysource
```
(multi-modality)=
# Multi-Modality
vLLM provides experimental support for multi-modal models through the {mod}`vllm.multimodal` package.
Multi-modal inputs can be passed alongside text and token prompts to [supported models](#supported-mm-models)
via the `multi_modal_data` field in {class}`vllm.inputs.PromptType`.
Looking to add your own multi-modal model? Please follow the instructions listed [here](#supports-multimodal).
## Module Contents
```{eval-rst}
.. autodata:: vllm.multimodal.MULTIMODAL_REGISTRY
```
## Submodules
:::{toctree}
:maxdepth: 1
inputs
parse
processing
profiling
registry
:::
# Input Definitions
## User-facing inputs
```{eval-rst}
.. autodata:: vllm.multimodal.inputs.MultiModalDataDict
```
## Internal data structures
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.PlaceholderRange
:members:
:show-inheritance:
```
```{eval-rst}
.. autodata:: vllm.multimodal.inputs.NestedTensors
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldElem
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalFieldConfig
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargsItem
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalKwargs
:members:
:show-inheritance:
```
```{eval-rst}
.. autoclass:: vllm.multimodal.inputs.MultiModalInputs
:members:
:show-inheritance:
```
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment