Commit 006693ed authored by zhuwenwen
Browse files

Merge tag 'v0.11.2' into v0.11.2-ori

parents 4b51e6f1 275de341
...@@ -33,11 +33,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -33,11 +33,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
#endif #endif
// ops.def( // ops.def(
// "silu_mul_fp8_quant_deep_gemm_cuda(Tensor input, Tensor counts, Tensor! " // "persistent_masked_m_silu_mul_quant(Tensor input, Tensor counts, Tensor! "
// "y_q, Tensor! y_s, int group_size, " // "y_q, Tensor! y_s,"
// "bool use_ue8m0, int num_parallel_tokens) -> ()"); // "bool use_ue8m0) -> ()");
// ops.impl("silu_mul_fp8_quant_deep_gemm_cuda", torch::kCUDA, // ops.impl("persistent_masked_m_silu_mul_quant", torch::kCUDA,
// &silu_mul_fp8_quant_deep_gemm_cuda); // &persistent_masked_m_silu_mul_quant);
ops.def("weak_ref_tensor(Tensor input) -> Tensor"); ops.def("weak_ref_tensor(Tensor input) -> Tensor");
ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor); ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
...@@ -175,11 +175,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -175,11 +175,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"float epsilon) -> ()"); "float epsilon) -> ()");
ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm); ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);
// Polynomial Normalization. // Function for fused QK Norm and RoPE
ops.def( ops.def(
"poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float " "fused_qk_norm_rope(Tensor! qkv, int num_heads_q, "
"epsilon) -> ()"); "int num_heads_k, int num_heads_v, int head_dim, float eps, "
ops.impl("poly_norm", torch::kCUDA, &poly_norm); "Tensor q_weight, Tensor k_weight, Tensor cos_sin_cache, "
"bool is_neox, Tensor position_ids) -> ()");
ops.impl("fused_qk_norm_rope", torch::kCUDA, &fused_qk_norm_rope);
// Apply repetition penalties to logits in-place // Apply repetition penalties to logits in-place
ops.def( ops.def(
...@@ -188,6 +190,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -188,6 +190,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("apply_repetition_penalties_", torch::kCUDA, ops.impl("apply_repetition_penalties_", torch::kCUDA,
&apply_repetition_penalties_); &apply_repetition_penalties_);
// Optimized top-k per row operation
// Declares the op schema and binds the CUDA implementation. `indices` is
// declared as `Tensor!` (mutable) and the op returns `()`, so the result is
// written into the caller-provided `indices` tensor.
ops.def(
"top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
"Tensor! indices, int numRows, int stride0, "
"int stride1) -> ()");
ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);
// Variant whose schema takes `next_n` and `seq_lens` in place of the
// `rowStarts` / `rowEnds` tensors; `indices` is again the mutable output.
// NOTE(review): presumably the decode-phase counterpart of top_k_per_row --
// confirm against the kernel implementation.
ops.def(
"top_k_per_row_decode(Tensor logits, int next_n, "
"Tensor seq_lens, Tensor! indices, int numRows, "
"int stride0, int stride1) -> ()");
ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode);
// Layernorm-quant // Layernorm-quant
// Apply Root Mean Square (RMS) Normalization to the input tensor. // Apply Root Mean Square (RMS) Normalization to the input tensor.
// ops.def( // ops.def(
...@@ -401,7 +416,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -401,7 +416,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor a_blockscale, Tensor b_blockscales, Tensor alphas," " Tensor a_blockscale, Tensor b_blockscales, Tensor alphas,"
" Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()", " Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()",
{stride_tag}); {stride_tag});
ops.impl("cutlass_fp4_group_mm", torch::kCUDA, &cutlass_fp4_group_mm); // conditionally compiled so impl registration is in source file
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
// quantization, as well as bias // quantization, as well as bias
...@@ -554,7 +569,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -554,7 +569,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// to prevent the meta function registry. // to prevent the meta function registry.
ops.def( ops.def(
"gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, " "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
"Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) " "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, bool "
"use_v2_format, int bit) "
"-> Tensor", "-> Tensor",
{stride_tag}); {stride_tag});
ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm); ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
...@@ -607,7 +623,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -607,7 +623,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"Tensor? cache_indices," "Tensor? cache_indices,"
"Tensor? has_initial_state," "Tensor? has_initial_state,"
"Tensor! ssm_states," "Tensor! ssm_states,"
"int pad_slot_id) -> ()"); "int pad_slot_id,"
"int block_size,"
"Tensor? block_idx_first_scheduled_token,"
"Tensor? block_idx_last_scheduled_token,"
"Tensor? initial_state_idx) -> ()");
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd); ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
// Hadamard transforms // Hadamard transforms
...@@ -724,6 +744,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { ...@@ -724,6 +744,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
"int quant_block_size, str kv_cache_dtype) -> ()"); "int quant_block_size, str kv_cache_dtype) -> ()");
cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA, cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA,
&indexer_k_quant_and_cache); &indexer_k_quant_and_cache);
// Declares cp_gather_indexer_k_quant_cache on the cache_ops library and binds
// its CUDA implementation. `dst_k` and `dst_scale` are declared as `Tensor!`
// (mutable) and the op returns `()`, so both are outputs filled in place.
cache_ops.def(
"cp_gather_indexer_k_quant_cache(Tensor kv_cache, Tensor! dst_k, Tensor! "
"dst_scale, Tensor block_table, Tensor cu_seq_lens) -> ()");
cache_ops.impl("cp_gather_indexer_k_quant_cache", torch::kCUDA,
&cp_gather_indexer_k_quant_cache);
} }
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) { TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
......
...@@ -29,6 +29,22 @@ struct _typeConvert { ...@@ -29,6 +29,22 @@ struct _typeConvert {
static constexpr bool exists = false; static constexpr bool exists = false;
}; };
// Pass-through specialization for float: the element type is already float,
// so every convert() overload hands its argument back unchanged.
template <>
struct _typeConvert<float> {
  // Scalar, 2-wide and 4-wide (128-bit vectorization) representations.
  using hip_type = float;
  using packed_hip_type = float2;
  using packed_hip_type4 = float4;

  static constexpr bool exists = true;

  __device__ static __forceinline__ hip_type convert(hip_type value) {
    return value;
  }
  __device__ static __forceinline__ packed_hip_type convert(
      packed_hip_type pair) {
    return pair;
  }
  __device__ static __forceinline__ packed_hip_type4 convert(
      packed_hip_type4 quad) {
    return quad;
  }
};
#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)) #if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000))
// CUDA < 12.0 runs into issues with packed type conversion // CUDA < 12.0 runs into issues with packed type conversion
template <> template <>
...@@ -37,41 +53,44 @@ struct _typeConvert<c10::Half> { ...@@ -37,41 +53,44 @@ struct _typeConvert<c10::Half> {
using hip_type = __half; using hip_type = __half;
using packed_hip_type = __half2; using packed_hip_type = __half2;
__device__ static inline float convert(hip_type x) { return __half2float(x); } __device__ static __forceinline__ float convert(hip_type x) {
__device__ static inline float2 convert(packed_hip_type x) { return __half2float(x);
}
__device__ static __forceinline__ float2 convert(packed_hip_type x) {
return __half22float2(x); return __half22float2(x);
} }
__device__ static inline hip_type convert(float x) { __device__ static __forceinline__ hip_type convert(float x) {
return __float2half_rn(x); return __float2half_rn(x);
} }
__device__ static inline packed_hip_type convert(float2 x) { __device__ static __forceinline__ packed_hip_type convert(float2 x) {
return __float22half2_rn(x); return __float22half2_rn(x);
} }
}; };
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800) || defined(USE_ROCM)
// CUDA_ARCH < 800 does not have BF16 support // CUDA_ARCH < 800 does not have BF16 support
// TODO: Add in ROCm support once public headers handle bf16 maturely // ROCm 7.0+ supports bfloat16
template <> template <>
struct _typeConvert<c10::BFloat16> { struct _typeConvert<c10::BFloat16> {
static constexpr bool exists = true; static constexpr bool exists = true;
using hip_type = __nv_bfloat16; using hip_type = __nv_bfloat16;
using packed_hip_type = __nv_bfloat162; using packed_hip_type = __nv_bfloat162;
__device__ static inline float convert(hip_type x) { __device__ static __forceinline__ float convert(hip_type x) {
return __bfloat162float(x); return __bfloat162float(x);
} }
__device__ static inline float2 convert(packed_hip_type x) { __device__ static __forceinline__ float2 convert(packed_hip_type x) {
return __bfloat1622float2(x); return __bfloat1622float2(x);
} }
__device__ static inline hip_type convert(float x) { __device__ static __forceinline__ hip_type convert(float x) {
return __float2bfloat16(x); return __float2bfloat16(x);
} }
__device__ static inline packed_hip_type convert(float2 x) { __device__ static __forceinline__ packed_hip_type convert(float2 x) {
return __float22bfloat162_rn(x); return __float22bfloat162_rn(x);
} }
}; };
#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #endif // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800) ||
// defined(USE_ROCM)
#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= #endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >=
// 12000)) // 12000))
...@@ -95,10 +114,15 @@ struct alignas(16) _f16Vec { ...@@ -95,10 +114,15 @@ struct alignas(16) _f16Vec {
if constexpr (width % 2 == 0) { if constexpr (width % 2 == 0) {
#pragma unroll #pragma unroll
for (int i = 0; i < width; i += 2) { for (int i = 0; i < width; i += 2) {
T2 temp{data[i], data[i + 1]}; if constexpr (std::is_same_v<T2, float2>) {
temp += T2{other.data[i], other.data[i + 1]}; data[i] += other.data[i];
data[i] = temp.x; data[i + 1] += other.data[i + 1];
data[i + 1] = temp.y; } else {
T2 temp{data[i], data[i + 1]};
temp += T2{other.data[i], other.data[i + 1]};
data[i] = temp.x;
data[i + 1] = temp.y;
}
} }
} else { } else {
#pragma unroll #pragma unroll
...@@ -111,10 +135,15 @@ struct alignas(16) _f16Vec { ...@@ -111,10 +135,15 @@ struct alignas(16) _f16Vec {
if constexpr (width % 2 == 0) { if constexpr (width % 2 == 0) {
#pragma unroll #pragma unroll
for (int i = 0; i < width; i += 2) { for (int i = 0; i < width; i += 2) {
T2 temp{data[i], data[i + 1]}; if constexpr (std::is_same_v<T2, float2>) {
temp *= T2{other.data[i], other.data[i + 1]}; data[i] *= other.data[i];
data[i] = temp.x; data[i + 1] *= other.data[i + 1];
data[i + 1] = temp.y; } else {
T2 temp{data[i], data[i + 1]};
temp *= T2{other.data[i], other.data[i + 1]};
data[i] = temp.x;
data[i + 1] = temp.y;
}
} }
} else { } else {
#pragma unroll #pragma unroll
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
# docs/contributing/dockerfile/dockerfile.md and # docs/contributing/dockerfile/dockerfile.md and
# docs/assets/contributing/dockerfile-stages-dependency.png # docs/assets/contributing/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.8.1 ARG CUDA_VERSION=12.9.1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
# By parameterizing the base images, we allow third-party to use their own # By parameterizing the base images, we allow third-party to use their own
...@@ -15,7 +15,7 @@ ARG PYTHON_VERSION=3.12 ...@@ -15,7 +15,7 @@ ARG PYTHON_VERSION=3.12
# Example: # Example:
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 # docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# Important: We build with an old version of Ubuntu to maintain broad # Important: We build with an old version of Ubuntu to maintain broad
# compatibility with other Linux OSes. The main reason for this is that the # compatibility with other Linux OSes. The main reason for this is that the
# glibc version is baked into the distro, and binaries built with one glibc # glibc version is baked into the distro, and binaries built with one glibc
# version are not backwards compatible with OSes that use an earlier version. # version are not backwards compatible with OSes that use an earlier version.
...@@ -229,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ ...@@ -229,7 +229,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
# Check the size of the wheel if RUN_WHEEL_CHECK is true # Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=450 ARG VLLM_MAX_SIZE_MB=500
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
...@@ -273,6 +273,7 @@ WORKDIR /vllm-workspace ...@@ -273,6 +273,7 @@ WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM ARG TARGETPLATFORM
# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
ARG GDRCOPY_CUDA_VERSION=12.8 ARG GDRCOPY_CUDA_VERSION=12.8
# Keep in line with FINAL_BASE_IMAGE # Keep in line with FINAL_BASE_IMAGE
ARG GDRCOPY_OS_VERSION=Ubuntu22_04 ARG GDRCOPY_OS_VERSION=Ubuntu22_04
...@@ -356,65 +357,14 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist ...@@ -356,65 +357,14 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
uv pip install --system dist/*.whl --verbose \ uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# If we need to build FlashInfer wheel before its release: # Install FlashInfer pre-compiled kernel cache and binaries
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ # https://docs.flashinfer.ai/installation.html
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0' RUN --mount=type=cache,target=/root/.cache/uv \
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive uv pip install --system flashinfer-cubin==0.5.2 \
# $ cd flashinfer && uv pip install --system flashinfer-jit-cache==0.5.2 \
# $ git checkout v0.2.6.post1 --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
# $ python -m flashinfer.aot && flashinfer show-config
# $ python -m build --no-isolation --wheel
# $ ls -la dist
# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# Install FlashInfer from source
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with "flashinfer" extra in setup.py
ARG FLASHINFER_GIT_REF="v0.3.1"
# Flag to control whether to compile FlashInfer AOT kernels
# Set to "true" to enable AOT compilation:
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
ARG FLASHINFER_AOT_COMPILE=false
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
. /etc/environment
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \
${FLASHINFER_GIT_REPO} flashinfer
pushd flashinfer
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
if [[ "${CUDA_VERSION}" == 11.* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
# CUDA 12.8+ supports 10.0a and 12.0
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
fi
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
# Build AOT kernels
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer.aot
# Install with no-build-isolation since we already built AOT kernels
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
uv pip install --system --no-build-isolation . \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# Download pre-compiled cubins
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
else
echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode"
uv pip install --system . \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
fi
popd
rm -rf flashinfer
BASH
COPY examples examples COPY examples examples
COPY benchmarks benchmarks COPY benchmarks benchmarks
COPY ./vllm/collect_env.py . COPY ./vllm/collect_env.py .
...@@ -436,7 +386,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -436,7 +386,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
ARG DEEPGEMM_GIT_REF ARG DEEPGEMM_GIT_REF
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
COPY tools/install_gdrcopy.sh install_gdrcopy.sh COPY tools/install_gdrcopy.sh install_gdrcopy.sh
RUN set -eux; \ RUN set -eux; \
...@@ -451,9 +401,15 @@ RUN set -eux; \ ...@@ -451,9 +401,15 @@ RUN set -eux; \
# Install EP kernels(pplx-kernels and DeepEP) # Install EP kernels(pplx-kernels and DeepEP)
COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh
ENV CUDA_HOME=/usr/local/cuda ENV CUDA_HOME=/usr/local/cuda
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \ RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a 10.0a+PTX}" \
&& bash install_python_libraries.sh && bash install_python_libraries.sh
# CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
# return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
# consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
# Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override.
ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
#################### TEST IMAGE #################### #################### TEST IMAGE ####################
...@@ -467,6 +423,7 @@ ARG PYTHON_VERSION ...@@ -467,6 +423,7 @@ ARG PYTHON_VERSION
ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
...@@ -479,7 +436,8 @@ ENV UV_LINK_MODE=copy ...@@ -479,7 +436,8 @@ ENV UV_LINK_MODE=copy
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
if [ "$CUDA_MAJOR" -ge 12 ]; then \ if [ "$CUDA_MAJOR" -ge 12 ]; then \
uv pip install --system -r requirements/dev.txt; \ uv pip install --system -r requirements/dev.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
fi fi
# install development dependencies (for testing) # install development dependencies (for testing)
...@@ -526,7 +484,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -526,7 +484,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
else \ else \
BITSANDBYTES_VERSION="0.46.1"; \ BITSANDBYTES_VERSION="0.46.1"; \
fi; \ fi; \
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' boto3 runai-model-streamer runai-model-streamer[s3] uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.0'
ENV VLLM_USAGE_SOURCE production-docker-image ENV VLLM_USAGE_SOURCE production-docker-image
...@@ -539,5 +497,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"] ...@@ -539,5 +497,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"]
FROM vllm-openai-base AS vllm-openai FROM vllm-openai-base AS vllm-openai
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["vllm", "serve"]
#################### OPENAI API SERVER #################### #################### OPENAI API SERVER ####################
...@@ -13,10 +13,11 @@ ...@@ -13,10 +13,11 @@
# vllm-dev: used for development # vllm-dev: used for development
# #
# Build arguments: # Build arguments:
# PYTHON_VERSION=3.12 (default)|3.11|3.10|3.9 # PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10
# VLLM_CPU_DISABLE_AVX512=false (default)|true # VLLM_CPU_DISABLE_AVX512=false (default)|true
# VLLM_CPU_AVX512BF16=false (default)|true # VLLM_CPU_AVX512BF16=false (default)|true
# VLLM_CPU_AVX512VNNI=false (default)|true # VLLM_CPU_AVX512VNNI=false (default)|true
# VLLM_CPU_AMXBF16=false (default)|true
# #
######################### COMMON BASE IMAGE ######################### ######################### COMMON BASE IMAGE #########################
...@@ -31,7 +32,7 @@ ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ...@@ -31,7 +32,7 @@ ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get update -y \ apt-get update -y \
&& apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \ && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \ gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh && curl -LsSf https://astral.sh/uv/install.sh | sh
...@@ -47,7 +48,7 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH" ...@@ -47,7 +48,7 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
# Install Python dependencies # Install Python dependencies
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
ENV UV_INDEX_STRATEGY="unsafe-best-match" ENV UV_INDEX_STRATEGY="unsafe-best-match"
...@@ -79,6 +80,9 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc ...@@ -79,6 +80,9 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc
######################### BUILD IMAGE ######################### ######################### BUILD IMAGE #########################
FROM base AS vllm-build FROM base AS vllm-build
# Build argument with a default of 32, exported as MAX_JOBS for the rest of
# this stage. Override with: docker build --build-arg max_jobs=<N> ...
ARG max_jobs=32
ENV MAX_JOBS=${max_jobs}
ARG GIT_REPO_CHECK=0 ARG GIT_REPO_CHECK=0
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512=0 ARG VLLM_CPU_DISABLE_AVX512=0
...@@ -89,6 +93,9 @@ ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16} ...@@ -89,6 +93,9 @@ ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
# Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ... # Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ...
ARG VLLM_CPU_AVX512VNNI=0 ARG VLLM_CPU_AVX512VNNI=0
ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI} ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
# Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ...
ARG VLLM_CPU_AMXBF16=0
ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
...@@ -104,20 +111,24 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -104,20 +111,24 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \ --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
######################### TEST DEPS ######################### ######################### TEST DEPS #########################
FROM base AS vllm-test-deps FROM base AS vllm-test-deps
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
# TODO: Update the torch pin below to 2.9.0 once intel_extension_for_pytorch publishes a build for that version
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/cpu-test.in && \ cp requirements/test.in requirements/cpu-test.in && \
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
sed -i 's/^torch==.*/torch==2.8.0/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/cpu-test.txt uv pip install -r requirements/cpu-test.txt
######################### DEV IMAGE ######################### ######################### DEV IMAGE #########################
FROM vllm-build AS vllm-dev FROM vllm-build AS vllm-dev
...@@ -130,12 +141,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ ...@@ -130,12 +141,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
# install development dependencies (for testing) # install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils uv pip install -e tests/vllm_test_utils
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py develop VLLM_TARGET_DEVICE=cpu python3 setup.py develop
COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt
...@@ -160,11 +171,12 @@ ADD ./benchmarks/ ./benchmarks/ ...@@ -160,11 +171,12 @@ ADD ./benchmarks/ ./benchmarks/
ADD ./vllm/collect_env.py . ADD ./vllm/collect_env.py .
ADD ./.buildkite/ ./.buildkite/ ADD ./.buildkite/ ./.buildkite/
# Create symlink for vllm-workspace to maintain CI compatibility
RUN ln -sf /workspace /vllm-workspace
# install development dependencies (for testing) # install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils uv pip install -e tests/vllm_test_utils
ENTRYPOINT ["bash"]
######################### RELEASE IMAGE ######################### ######################### RELEASE IMAGE #########################
FROM base AS vllm-openai FROM base AS vllm-openai
...@@ -176,4 +188,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -176,4 +188,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl uv pip install dist/*.whl
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["vllm", "serve"]
...@@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2. ...@@ -246,7 +246,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
# build flashinfer for torch nightly from source around 10 mins # build flashinfer for torch nightly from source around 10 mins
# release version: v0.3.1 # release version: v0.5.2
# todo(elainewy): cache flashinfer build result for faster build # todo(elainewy): cache flashinfer build result for faster build
ENV CCACHE_DIR=/root/.cache/ccache ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \ RUN --mount=type=cache,target=/root/.cache/ccache \
...@@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ ...@@ -254,7 +254,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
echo "git clone flashinfer..." \ echo "git clone flashinfer..." \
&& git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \ && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
&& cd flashinfer \ && cd flashinfer \
&& git checkout v0.3.1 \ && git checkout v0.5.2 \
&& git submodule update --init --recursive \ && git submodule update --init --recursive \
&& echo "finish git clone flashinfer..." \ && echo "finish git clone flashinfer..." \
&& rm -rf build \ && rm -rf build \
......
ARG BASE_UBI_IMAGE_TAG=9.5-1741850109 ARG BASE_UBI_IMAGE_TAG=9.6-1754584681
############################################################### ###############################################################
# Stage to build openblas # Stage to build openblas
...@@ -7,7 +7,7 @@ ARG BASE_UBI_IMAGE_TAG=9.5-1741850109 ...@@ -7,7 +7,7 @@ ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder
ARG MAX_JOBS ARG MAX_JOBS
ARG OPENBLAS_VERSION=0.3.29 ARG OPENBLAS_VERSION=0.3.30
RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \ RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \
&& source /opt/rh/gcc-toolset-13/enable \ && source /opt/rh/gcc-toolset-13/enable \
&& wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \ && wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \
...@@ -38,7 +38,7 @@ RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel ...@@ -38,7 +38,7 @@ RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel
FROM centos-deps-builder AS base-builder FROM centos-deps-builder AS base-builder
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
ARG OPENBLAS_VERSION=0.3.29 ARG OPENBLAS_VERSION=0.3.30
# Set Environment Variables for venv, cargo & openblas # Set Environment Variables for venv, cargo & openblas
ENV VIRTUAL_ENV=/opt/vllm ENV VIRTUAL_ENV=/opt/vllm
...@@ -61,7 +61,7 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/, ...@@ -61,7 +61,7 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
pkgconfig xsimd zeromq-devel kmod findutils protobuf* \ pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \ libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \
harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \ harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \ python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip clang-devel \
&& dnf clean all \ && dnf clean all \
&& PREFIX=/usr/local make -C /openblas install \ && PREFIX=/usr/local make -C /openblas install \
&& ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \ && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
...@@ -79,9 +79,9 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/, ...@@ -79,9 +79,9 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
FROM base-builder AS torch-builder FROM base-builder AS torch-builder
ARG MAX_JOBS ARG MAX_JOBS
ARG TORCH_VERSION=2.6.0 ARG TORCH_VERSION=2.7.0
ARG _GLIBCXX_USE_CXX11_ABI=1 ARG _GLIBCXX_USE_CXX11_ABI=1
ARG OPENBLAS_VERSION=0.3.29 ARG OPENBLAS_VERSION=0.3.30
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \ source /opt/rh/gcc-toolset-13/enable && \
...@@ -93,7 +93,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -93,7 +93,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
MAX_JOBS=${MAX_JOBS:-$(nproc)} \ MAX_JOBS=${MAX_JOBS:-$(nproc)} \
PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/ PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/
ARG TORCHVISION_VERSION=0.21.0 ARG TORCHVISION_VERSION=0.22.0
ARG TORCHVISION_USE_NVJPEG=0 ARG TORCHVISION_USE_NVJPEG=0
ARG TORCHVISION_USE_FFMPEG=0 ARG TORCHVISION_USE_FFMPEG=0
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
...@@ -104,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -104,7 +104,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
BUILD_VERSION=${TORCHVISION_VERSION} \ BUILD_VERSION=${TORCHVISION_VERSION} \
uv build --wheel --out-dir /torchwheels/ --no-build-isolation uv build --wheel --out-dir /torchwheels/ --no-build-isolation
ARG TORCHAUDIO_VERSION=2.6.0 ARG TORCHAUDIO_VERSION=2.7.0
ARG BUILD_SOX=1 ARG BUILD_SOX=1
ARG BUILD_KALDI=1 ARG BUILD_KALDI=1
ARG BUILD_RNNT=1 ARG BUILD_RNNT=1
...@@ -128,7 +128,7 @@ FROM base-builder AS arrow-builder ...@@ -128,7 +128,7 @@ FROM base-builder AS arrow-builder
ARG MAX_JOBS ARG MAX_JOBS
ARG PYARROW_PARALLEL ARG PYARROW_PARALLEL
ARG PYARROW_VERSION=19.0.1 ARG PYARROW_VERSION=21.0.0
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \ source /opt/rh/gcc-toolset-13/enable && \
git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \ git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \
...@@ -145,7 +145,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -145,7 +145,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
make install -j ${MAX_JOBS:-$(nproc)} && \ make install -j ${MAX_JOBS:-$(nproc)} && \
cd ../../python/ && \ cd ../../python/ && \
uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \ uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \
pip show numpy && ls -lrt /opt/vllm/lib/python3.12/site-packages/numpy && \
PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \ PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
python setup.py build_ext \ python setup.py build_ext \
--build-type=release --bundle-arrow-cpp \ --build-type=release --bundle-arrow-cpp \
...@@ -187,6 +186,23 @@ RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_V ...@@ -187,6 +186,23 @@ RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_V
&& make -j ${MAX_JOBS:-$(nproc)} && make -j ${MAX_JOBS:-$(nproc)}
###############################################################
# Stage to build numba
###############################################################
FROM base-builder AS numba-builder
ARG MAX_JOBS
ARG NUMBA_VERSION=0.61.2
# Clone all required dependencies
RUN dnf install ninja-build llvm15 llvm15-devel -y && source /opt/rh/gcc-toolset-13/enable && export PATH=$PATH:/usr/lib64/llvm15/bin && \
git clone --recursive https://github.com/numba/numba.git -b ${NUMBA_VERSION} && \
cd ./numba && \
if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
fi && python -m build --wheel --installer=uv --outdir /numbawheels/
############################################################### ###############################################################
# Stage to build vllm - this stage builds and installs # Stage to build vllm - this stage builds and installs
# vllm, tensorizer and vllm-tgis-adapter and builds uv cache # vllm, tensorizer and vllm-tgis-adapter and builds uv cache
...@@ -199,6 +215,7 @@ COPY --from=torch-builder /tmp/control /dev/null ...@@ -199,6 +215,7 @@ COPY --from=torch-builder /tmp/control /dev/null
COPY --from=arrow-builder /tmp/control /dev/null COPY --from=arrow-builder /tmp/control /dev/null
COPY --from=cv-builder /tmp/control /dev/null COPY --from=cv-builder /tmp/control /dev/null
COPY --from=numa-builder /tmp/control /dev/null COPY --from=numa-builder /tmp/control /dev/null
COPY --from=numba-builder /tmp/control /dev/null
ARG VLLM_TARGET_DEVICE=cpu ARG VLLM_TARGET_DEVICE=cpu
ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
...@@ -206,6 +223,8 @@ ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 ...@@ -206,6 +223,8 @@ ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
# this step installs vllm and populates uv cache # this step installs vllm and populates uv cache
# with all the transitive dependencies # with all the transitive dependencies
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
dnf install llvm15 llvm15-devel -y && \
rpm -ivh --nodeps https://mirror.stream.centos.org/9-stream/CRB/ppc64le/os/Packages/protobuf-lite-devel-3.14.0-16.el9.ppc64le.rpm && \
source /opt/rh/gcc-toolset-13/enable && \ source /opt/rh/gcc-toolset-13/enable && \
git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \ git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
uv pip install maturin && \ uv pip install maturin && \
...@@ -215,15 +234,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -215,15 +234,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \ --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \ --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
--mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \ --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
--mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
--mount=type=bind,src=.,dst=/src/,rw \ --mount=type=bind,src=.,dst=/src/,rw \
source /opt/rh/gcc-toolset-13/enable && \ source /opt/rh/gcc-toolset-13/enable && \
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \ export PATH=$PATH:/usr/lib64/llvm15/bin && \
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl && \
sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \ sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \ sed -i -e 's/.*sentencepiece.*//g' /src/pyproject.toml /src/requirements/*.txt && \
uv pip install sentencepiece==0.2.0 pandas pythran nanobind pybind11 /hf_wheels/*.whl && \
make -C /numactl install && \ make -C /numactl install && \
# sentencepiece.pc is in some pkgconfig inside uv cache # sentencepiece.pc is in some pkgconfig inside uv cache
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \ export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \ nanobind_DIR=$(uv pip show nanobind | grep Location | sed 's/^Location: //;s/$/\/nanobind\/cmake/') && uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
cd /src/ && \ cd /src/ && \
uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \ uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \
uv pip install /vllmwheel/*.whl uv pip install /vllmwheel/*.whl
...@@ -250,7 +272,7 @@ RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${L ...@@ -250,7 +272,7 @@ RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${L
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
ARG OPENBLAS_VERSION=0.3.29 ARG OPENBLAS_VERSION=0.3.30
# Set Environment Variables for venv & openblas # Set Environment Variables for venv & openblas
ENV VIRTUAL_ENV=/opt/vllm ENV VIRTUAL_ENV=/opt/vllm
...@@ -268,6 +290,7 @@ COPY --from=vllmcache-builder /tmp/control /dev/null ...@@ -268,6 +290,7 @@ COPY --from=vllmcache-builder /tmp/control /dev/null
COPY --from=numa-builder /tmp/control /dev/null COPY --from=numa-builder /tmp/control /dev/null
COPY --from=lapack-builder /tmp/control /dev/null COPY --from=lapack-builder /tmp/control /dev/null
COPY --from=openblas-builder /tmp/control /dev/null COPY --from=openblas-builder /tmp/control /dev/null
COPY --from=numba-builder /tmp/control /dev/null
# install gcc-11, python, openblas, numactl, lapack # install gcc-11, python, openblas, numactl, lapack
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
...@@ -276,13 +299,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -276,13 +299,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \ --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
microdnf install --nodocs -y \ microdnf install --nodocs -y \
tar findutils openssl \ libomp tar findutils openssl llvm15 llvm15-devel \
pkgconfig xsimd g++ gcc-fortran libsndfile \ pkgconfig xsimd g++ gcc-fortran libsndfile \
libtiff libjpeg openjpeg2 zlib zeromq \ libtiff libjpeg openjpeg2 zlib zeromq \
freetype lcms2 libwebp tcl tk utf8proc \ freetype lcms2 libwebp tcl tk utf8proc \
harfbuzz fribidi libraqm libimagequant libxcb \ harfbuzz fribidi libraqm libimagequant libxcb util-linux \
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \ python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
&& microdnf clean all \ && export PATH=$PATH:/usr/lib64/llvm15/bin && microdnf clean all \
&& python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \ && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
&& python -m pip install -U pip uv --no-cache \ && python -m pip install -U pip uv --no-cache \
&& make -C /numactl install \ && make -C /numactl install \
...@@ -298,7 +321,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -298,7 +321,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \ --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
--mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \ --mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \
--mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \ --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl --mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && uv pip install sentencepiece==0.2.0 && \
HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
COPY ./ /workspace/vllm COPY ./ /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
...@@ -314,4 +340,4 @@ WORKDIR /workspace/ ...@@ -314,4 +340,4 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["vllm", "serve"]
\ No newline at end of file
...@@ -12,9 +12,20 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} ...@@ -12,9 +12,20 @@ ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
RUN apt-get update -q -y && apt-get install -q -y \ RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl apt-transport-https ca-certificates wget curl
# Remove sccache # Remove sccache
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
# Install UV
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
ARG COMMON_WORKDIR ARG COMMON_WORKDIR
WORKDIR ${COMMON_WORKDIR} WORKDIR ${COMMON_WORKDIR}
...@@ -59,13 +70,15 @@ FROM base AS test ...@@ -59,13 +70,15 @@ FROM base AS test
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
# Install vLLM # Install vLLM using uv (inherited from base stage)
# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
--mount=type=cache,target=/root/.cache/uv \
cd /install \ cd /install \
&& pip install -U -r requirements/rocm.txt \ && uv pip install --system -r requirements/rocm.txt \
&& pip install -U -r requirements/rocm-test.txt \ && uv pip install --system -r requirements/rocm-test.txt \
&& pip uninstall -y vllm \ && pip uninstall -y vllm \
&& pip install *.whl && uv pip install --system *.whl
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
ARG COMMON_WORKDIR ARG COMMON_WORKDIR
...@@ -75,7 +88,6 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace ...@@ -75,7 +88,6 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
RUN cd /vllm-workspace \ RUN cd /vllm-workspace \
&& rm -rf vllm \ && rm -rf vllm \
&& python3 -m pip install -e tests/vllm_test_utils \ && python3 -m pip install -e tests/vllm_test_utils \
&& python3 -m pip install lm-eval[api]==0.4.4 \
&& python3 -m pip install pytest-shard && python3 -m pip install pytest-shard
# ----------------------- # -----------------------
...@@ -90,14 +102,17 @@ RUN case "$(which python3)" in \ ...@@ -90,14 +102,17 @@ RUN case "$(which python3)" in \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
*) ;; esac *) ;; esac
RUN python3 -m pip install --upgrade huggingface-hub[cli] RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --upgrade huggingface-hub[cli]
# Install vLLM # Install vLLM using uv (inherited from base stage)
# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
--mount=type=cache,target=/root/.cache/uv \
cd /install \ cd /install \
&& pip install -U -r requirements/rocm.txt \ && uv pip install --system -r requirements/rocm.txt \
&& pip uninstall -y vllm \ && pip uninstall -y vllm \
&& pip install *.whl && uv pip install --system *.whl
ARG COMMON_WORKDIR ARG COMMON_WORKDIR
......
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete
ARG TRITON_BRANCH="f9e5bf54" ARG TRITON_BRANCH="57c693b6"
ARG TRITON_REPO="https://github.com/ROCm/triton.git" ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="b2fb6885" ARG PYTORCH_BRANCH="1c57644d"
ARG PYTORCH_VISION_BRANCH="v0.23.0" ARG PYTORCH_VISION_BRANCH="v0.23.0"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG FA_BRANCH="0e60e394" ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="2ab9f4cd" ARG AITER_BRANCH="59bd8ff2"
ARG AITER_REPO="https://github.com/ROCm/aiter.git" ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base
...@@ -15,10 +15,13 @@ FROM ${BASE_IMAGE} AS base ...@@ -15,10 +15,13 @@ FROM ${BASE_IMAGE} AS base
ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV ROCM_PATH=/opt/rocm ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201 ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV AITER_ROCM_ARCH=gfx942;gfx950 ENV AITER_ROCM_ARCH=gfx942;gfx950
# Required for RCCL in ROCm7.1
ENV HSA_NO_SCRATCH_RECLAIM=1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
RUN mkdir -p /app RUN mkdir -p /app
...@@ -141,4 +144,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ ...@@ -141,4 +144,4 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
&& echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt && echo "AITER_REPO: ${AITER_REPO}" >> /app/versions.txt
\ No newline at end of file
...@@ -14,7 +14,7 @@ ENV LANG=C.UTF-8 \ ...@@ -14,7 +14,7 @@ ENV LANG=C.UTF-8 \
# Install development utilities # Install development utilities
RUN microdnf install -y \ RUN microdnf install -y \
which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \ which procps findutils tar vim git gcc-toolset-14 gcc-toolset-14-libatomic-devel patch zlib-devel \
libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \ libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile \ openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile \
clang llvm-devel llvm-static clang-devel && \ clang llvm-devel llvm-static clang-devel && \
...@@ -85,40 +85,15 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ ...@@ -85,40 +85,15 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
rustup default stable && \ rustup default stable && \
rustup show rustup show
FROM python-install AS torch
ARG TORCH_VERSION=2.7.0
ENV export _GLIBCXX_USE_CXX11_ABI=1
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
WORKDIR /tmp
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
git clone https://github.com/pytorch/pytorch.git && \
cd pytorch && \
git checkout v2.7.0 && \
git submodule sync && \
git submodule update --init --recursive && \
uv pip install cmake ninja && \
uv pip install -r requirements.txt && \
python setup.py bdist_wheel
FROM python-install AS torch-vision FROM python-install AS torch-vision
# Install torchvision # Install torchvision
ARG TORCH_VERSION=2.7.0 ARG TORCH_VISION_VERSION=v0.23.0
ARG TORCH_VISION_VERSION=v0.20.1
WORKDIR /tmp WORKDIR /tmp
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
git clone https://github.com/pytorch/vision.git && \ git clone https://github.com/pytorch/vision.git && \
cd vision && \ cd vision && \
git checkout $TORCH_VISION_VERSION && \ git checkout $TORCH_VISION_VERSION && \
TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \ uv pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cpu && \
uv pip install -v $TORCH_WHL_FILE && \
python setup.py bdist_wheel python setup.py bdist_wheel
FROM python-install AS hf-xet-builder FROM python-install AS hf-xet-builder
...@@ -199,26 +174,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -199,26 +174,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \ if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \ sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
fi && python setup.py bdist_wheel fi && python setup.py bdist_wheel
# Edit aws-lc-sys to support s390x
FROM python-install AS aws-lc-sys-editor
WORKDIR /tmp
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
ARG AWS_LC_VERSION=v0.30.0
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
git clone --recursive https://github.com/aws/aws-lc-rs.git && \
cd aws-lc-rs && \
git checkout tags/aws-lc-sys/${AWS_LC_VERSION} && \
git submodule sync && \
git submodule update --init --recursive && \
cd aws-lc-sys && \
sed -i '682 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \
sed -i '712 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \
sed -i '747 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c
# Build Outlines Core # Build Outlines Core
FROM python-install AS outlines-core-builder FROM python-install AS outlines-core-builder
...@@ -226,17 +181,17 @@ WORKDIR /tmp ...@@ -226,17 +181,17 @@ WORKDIR /tmp
ENV CARGO_HOME=/root/.cargo ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
ARG OUTLINES_CORE_VERSION=0.2.10 COPY requirements/common.txt /tmp/requirements/common.txt
ARG OUTLINES_CORE_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \ --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
--mount=type=bind,from=aws-lc-sys-editor,source=/tmp/aws-lc-rs/aws-lc-sys,target=/tmp/aws-lc-sys,rw \ OUTLINES_CORE_VERSION=${OUTLINES_CORE_VERSION:-$(grep -E '^outlines_core\s*==\s*[0-9.]+' /tmp/requirements/common.txt | grep -Eo '[0-9.]+')} && \
if [ -z "${OUTLINES_CORE_VERSION}" ]; then echo "ERROR: Could not determine outlines_core version"; exit 1; fi && \
git clone https://github.com/dottxt-ai/outlines-core.git && \ git clone https://github.com/dottxt-ai/outlines-core.git && \
cd outlines-core && \ cd outlines-core && \
git checkout tags/${OUTLINES_CORE_VERSION} && \ git checkout tags/${OUTLINES_CORE_VERSION} && \
sed -i "s/version = \"0.0.0\"/version = \"${OUTLINES_CORE_VERSION}\"/" Cargo.toml && \ sed -i "s/version = \"0.0.0\"/version = \"${OUTLINES_CORE_VERSION}\"/" Cargo.toml && \
echo '[patch.crates-io]' >> Cargo.toml && \
echo 'aws-lc-sys = { path = "/tmp/aws-lc-sys" }' >> Cargo.toml && \
uv pip install maturin && \ uv pip install maturin && \
python -m maturin build --release --out dist python -m maturin build --release --out dist
...@@ -245,13 +200,15 @@ FROM python-install AS vllm-cpu ...@@ -245,13 +200,15 @@ FROM python-install AS vllm-cpu
ARG PYTHON_VERSION ARG PYTHON_VERSION
# Set correct library path for torch and numactl # Set correct library path for torch and numactl
ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:$LD_LIBRARY_PATH" ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:/opt/rh/gcc-toolset-14/root/usr/lib64:$LD_LIBRARY_PATH"
ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH" ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH"
ENV UV_LINK_MODE=copy ENV UV_LINK_MODE=copy
ENV CARGO_HOME=/root/.cargo ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
ENV PCP_DIR=/opt/rh/gcc-toolset-14/root
ENV PKG_CONFIG_PATH="/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig:/usr/local/lib/pkgconfig/"
ENV PATH="${VIRTUAL_ENV:+${VIRTUAL_ENV}/bin}:/opt/rh/gcc-toolset-14/root/usr/bin:/usr/local/bin:$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
COPY . /workspace/vllm COPY . /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
...@@ -266,7 +223,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -266,7 +223,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \ --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
--mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \ --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
--mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \ --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
--mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
--mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \ --mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \
--mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \ --mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \
--mount=type=bind,from=outlines-core-builder,source=/tmp/outlines-core/dist,target=/tmp/outlines-core/dist/ \ --mount=type=bind,from=outlines-core-builder,source=/tmp/outlines-core/dist,target=/tmp/outlines-core/dist/ \
...@@ -274,7 +230,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -274,7 +230,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \ ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \ VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \
HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl) && \ HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl) && \
TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl) && \
LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \ LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \
NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \ NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \
OUTLINES_CORE_WHL_FILE=$(ls /tmp/outlines-core/dist/*.whl) && \ OUTLINES_CORE_WHL_FILE=$(ls /tmp/outlines-core/dist/*.whl) && \
...@@ -282,7 +237,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -282,7 +237,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
$ARROW_WHL_FILE \ $ARROW_WHL_FILE \
$VISION_WHL_FILE \ $VISION_WHL_FILE \
$HF_XET_WHL_FILE \ $HF_XET_WHL_FILE \
$TORCH_WHL_FILE \
$LLVM_WHL_FILE \ $LLVM_WHL_FILE \
$NUMBA_WHL_FILE \ $NUMBA_WHL_FILE \
$OUTLINES_CORE_WHL_FILE \ $OUTLINES_CORE_WHL_FILE \
...@@ -309,4 +263,4 @@ USER 2000 ...@@ -309,4 +263,4 @@ USER 2000
WORKDIR /home/vllm WORKDIR /home/vllm
# Set the default entrypoint # Set the default entrypoint
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["vllm", "serve"]
...@@ -14,6 +14,7 @@ RUN apt clean && apt-get update -y && \ ...@@ -14,6 +14,7 @@ RUN apt clean && apt-get update -y && \
libxext6 \ libxext6 \
libgl1 \ libgl1 \
lsb-release \ lsb-release \
libaio-dev \
numactl \ numactl \
wget \ wget \
vim \ vim \
...@@ -54,7 +55,7 @@ ENV VLLM_WORKER_MULTIPROC_METHOD=spawn ...@@ -54,7 +55,7 @@ ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
python3 setup.py install pip install --no-build-isolation .
CMD ["/bin/bash"] CMD ["/bin/bash"]
...@@ -64,9 +65,14 @@ FROM vllm-base AS vllm-openai ...@@ -64,9 +65,14 @@ FROM vllm-base AS vllm-openai
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
# install nixl from source code
ENV NIXL_VERSION=0.7.0
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
pip uninstall oneccl oneccl-devel -y pip uninstall oneccl oneccl-devel -y
# install development dependencies (for testing) ENTRYPOINT ["vllm", "serve"]
RUN python3 -m pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
...@@ -46,7 +46,10 @@ nav: ...@@ -46,7 +46,10 @@ nav:
- contributing/model/multimodal.md - contributing/model/multimodal.md
- contributing/model/transcription.md - contributing/model/transcription.md
- CI: contributing/ci - CI: contributing/ci
- Design Documents: design - Design Documents:
- Plugins:
- design/*plugin*.md
- design/*
- API Reference: - API Reference:
- api/README.md - api/README.md
- api/vllm - api/vllm
......
...@@ -30,8 +30,8 @@ Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at ...@@ -30,8 +30,8 @@ Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at
Where to get started with vLLM depends on the type of user. If you are looking to: Where to get started with vLLM depends on the type of user. If you are looking to:
- Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md) - Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md)
- Build applications with vLLM, we recommend starting with the [User Guide](./usage) - Build applications with vLLM, we recommend starting with the [User Guide](./usage/README.md)
- Build vLLM, we recommend starting with [Developer Guide](./contributing) - Build vLLM, we recommend starting with [Developer Guide](./contributing/README.md)
For information about the development of vLLM, see: For information about the development of vLLM, see:
...@@ -56,7 +56,7 @@ vLLM is flexible and easy to use with: ...@@ -56,7 +56,7 @@ vLLM is flexible and easy to use with:
- Tensor, pipeline, data and expert parallelism support for distributed inference - Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs - Streaming outputs
- OpenAI-compatible API server - OpenAI-compatible API server
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend. - Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
- Prefix caching support - Prefix caching support
- Multi-LoRA support - Multi-LoRA support
......
...@@ -20,8 +20,6 @@ API documentation for vLLM's configuration classes. ...@@ -20,8 +20,6 @@ API documentation for vLLM's configuration classes.
- [vllm.config.CompilationConfig][] - [vllm.config.CompilationConfig][]
- [vllm.config.VllmConfig][] - [vllm.config.VllmConfig][]
[](){ #offline-inference-api }
## Offline Inference ## Offline Inference
LLM Class. LLM Class.
...@@ -45,18 +43,14 @@ Engine classes for offline and online inference. ...@@ -45,18 +43,14 @@ Engine classes for offline and online inference.
Inference parameters for vLLM APIs. Inference parameters for vLLM APIs.
[](){ #sampling-params }
- [vllm.SamplingParams][] - [vllm.SamplingParams][]
- [vllm.PoolingParams][] - [vllm.PoolingParams][]
[](){ #multi-modality }
## Multi-Modality ## Multi-Modality
vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package. vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models] Multi-modal inputs can be passed alongside text and token prompts to [supported models](../models/supported_models.md#list-of-multimodal-language-models)
via the `multi_modal_data` field in [vllm.inputs.PromptType][]. via the `multi_modal_data` field in [vllm.inputs.PromptType][].
Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md). Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md).
......
search: search:
boost: 0.5 exclude: true
Markdown is supported
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment