Commit 41199996 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.12.0' into v0.12.0-dev

parents 31021d81 4fd9d6a8
...@@ -23,7 +23,7 @@ ...@@ -23,7 +23,7 @@
#include <algorithm> #include <algorithm>
#include "../attention/dtype_fp8.cuh" #include "../attention/dtype_fp8.cuh"
#include "../quantization/fp8/amd/quant_utils.cuh" #include "../quantization/w8a8/fp8/amd/quant_utils.cuh"
// ROCm 6.2 compatibility: map OCP fp8 types to FNUZ variants if OCP is absent // ROCm 6.2 compatibility: map OCP fp8 types to FNUZ variants if OCP is absent
#if !defined(HIP_FP8_TYPE_OCP) #if !defined(HIP_FP8_TYPE_OCP)
...@@ -40,7 +40,8 @@ using __hip_fp8_e5m2 = __hip_fp8_e5m2_fnuz; ...@@ -40,7 +40,8 @@ using __hip_fp8_e5m2 = __hip_fp8_e5m2_fnuz;
#define __HIP__FP8MFMA__ #define __HIP__FP8MFMA__
#endif #endif
#if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__)) #if defined(__HIPCC__) && (defined(__gfx1100__) || defined(__gfx1101__) || \
defined(__gfx1150__) || defined(__gfx1151__))
#define __HIP__GFX11__ #define __HIP__GFX11__
#endif #endif
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#include "../cuda_compat.h" #include "../cuda_compat.h"
#include "dispatch_utils.h" #include "dispatch_utils.h"
#include "quantization/fp8/common.cuh" #include "quantization/w8a8/fp8/common.cuh"
#if defined(__HIPCC__) && \ #if defined(__HIPCC__) && \
(defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__)) (defined(__gfx90a__) || defined(__gfx942__) || defined(__gfx950__))
......
...@@ -44,6 +44,256 @@ __global__ void apply_repetition_penalties_kernel( ...@@ -44,6 +44,256 @@ __global__ void apply_repetition_penalties_kernel(
} }
} }
static inline __device__ uint16_t extractBinIdx(float x) {
union {
__half h;
uint16_t u16;
} tmp;
tmp.h = __float2half_rn(x);
tmp.u16 = (x < 0.f) ? (~tmp.u16 & 0xffff) : (tmp.u16 | 0x8000);
return 511 - (tmp.u16 >> 7);
}
template <int kNumThreadsPerBlock = 512, int kNumBins = 512, int kTopK = 2048>
__device__ void topKPerRowJob(const float* logits, const int rowStart,
const int rowEnd, const int rowIdx,
int* outIndices, int stride0, int stride1) {
// The number of elements per thread for the final top-k sort.
static constexpr int kNumTopKItemsPerThread = kTopK / kNumThreadsPerBlock;
// The class to sort the elements during the final top-k sort.
using TopKSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
kNumTopKItemsPerThread, int>;
// The number of slots for the final pass.
static constexpr int kNumFinalItems = 3072;
// The number of elements per thread for the final sort.
static constexpr int kNumFinalItemsPerThread =
kNumFinalItems / kNumThreadsPerBlock;
// The class to sort the elements during the final pass.
using FinalSort = cub::BlockRadixSort<float, kNumThreadsPerBlock,
kNumFinalItemsPerThread, int>;
// The class to compute the inclusive prefix-sum over the histogram.
using Scan = cub::BlockScan<int, kNumThreadsPerBlock>;
// Shared memory to compute the block scan.
__shared__ typename Scan::TempStorage smemScan;
// The structure to store the final items (for the final pass).
struct FinalItems {
// Shared memory to store the indices for the final pass.
int indices[kNumFinalItems];
// Shared memory to store the logits for the final pass.
float logits[kNumFinalItems];
};
// Shared memory to compute the block sort.
__shared__ union {
FinalItems items;
typename FinalSort::TempStorage finalSort;
typename TopKSort::TempStorage topKSort;
} smemFinal;
// Shared memory to store the histogram.
__shared__ int smemHistogram[kNumBins];
// Shared memory to store the selected indices.
__shared__ int smemIndices[kTopK];
// Shared memory to store the threshold bin.
__shared__ int smemThresholdBinIdx[1];
// Shared memory counter to register the candidates for the final phase.
__shared__ int smemFinalDstIdx[1];
// The length of the row.
int rowLen = rowEnd - rowStart;
// Shortcut if the length of the row is smaller than Top-K. Indices are not
// sorted by their corresponding logit.
if (rowLen <= kTopK) {
for (int rowIt = threadIdx.x; rowIt < rowLen;
rowIt += kNumThreadsPerBlock) {
int idx = rowStart + rowIt;
outIndices[rowIdx * kTopK + rowIt] = idx - rowStart;
}
for (int rowIt = rowLen + threadIdx.x; rowIt < kTopK;
rowIt += kNumThreadsPerBlock) {
outIndices[rowIdx * kTopK + rowIt] = -1;
}
return;
}
// Clear the histogram.
if (threadIdx.x < kNumBins) {
smemHistogram[threadIdx.x] = 0;
}
// Make sure the histogram is ready.
__syncthreads();
// Fetch elements one-by-one.
for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
rowIt += kNumThreadsPerBlock) {
uint16_t idx = extractBinIdx(logits[rowIdx * stride0 + rowIt * stride1]);
atomicAdd(&smemHistogram[idx], 1);
}
// Make sure the histogram is ready.
__syncthreads();
// Read the values from SMEM.
int binCount{0};
if (threadIdx.x < kNumBins) {
binCount = smemHistogram[threadIdx.x];
}
// Make sure each thread has read its value.
__syncthreads();
// Compute the prefix sum.
int prefixSum{0}, totalSum{0};
Scan(smemScan).ExclusiveSum(binCount, prefixSum, totalSum);
// Update the histogram with the prefix sums.
if (threadIdx.x < kNumBins) {
smemHistogram[threadIdx.x] = prefixSum;
}
// Make sure the data is in shared memory.
__syncthreads();
// Find the last valid bin.
if (threadIdx.x < kNumBins) {
int nextPrefixSum =
threadIdx.x == kNumBins - 1 ? totalSum : smemHistogram[threadIdx.x + 1];
if (prefixSum < kTopK && nextPrefixSum >= kTopK) {
smemThresholdBinIdx[0] = threadIdx.x;
}
}
// Clear the counter to store the items for the final phase.
if (threadIdx.x == 0) {
smemFinalDstIdx[0] = 0;
}
// Make sure the data is in shared memory.
__syncthreads();
// The threshold bin.
int thresholdBinIdx = smemThresholdBinIdx[0];
// Fetch elements one-by-one and populate the shared memory buffers.
for (int rowIt = rowStart + threadIdx.x; rowIt < rowEnd;
rowIt += kNumThreadsPerBlock) {
float logit = logits[rowIdx * stride0 + rowIt * stride1];
uint16_t idx = extractBinIdx(logit);
if (idx < thresholdBinIdx) {
int dstIdx = atomicAdd(&smemHistogram[idx], 1);
smemIndices[dstIdx] = rowIt;
} else if (idx == thresholdBinIdx) {
int dstIdx = atomicAdd(&smemFinalDstIdx[0], 1);
if (dstIdx < kNumFinalItems) {
smemFinal.items.logits[dstIdx] = logit;
smemFinal.items.indices[dstIdx] = rowIt;
}
}
}
// Make sure the elements are in shared memory.
__syncthreads();
// The logits of the elements to be sorted in the final pass.
float finalLogits[kNumFinalItemsPerThread];
// The indices of the elements to be sorted in the final pass.
int finalIndices[kNumFinalItemsPerThread];
// Init.
#pragma unroll
for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
finalLogits[ii] = -FLT_MAX;
}
// Read the elements from SMEM.
#pragma unroll
for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
if (srcIdx < smemFinalDstIdx[0]) {
finalLogits[ii] = smemFinal.items.logits[srcIdx];
finalIndices[ii] = smemFinal.items.indices[srcIdx];
}
}
// Make sure the shared memory has been read.
__syncthreads();
// Sort the elements.
FinalSort(smemFinal.finalSort)
.SortDescendingBlockedToStriped(finalLogits, finalIndices);
// Copy the data back to the shared memory storage.
int baseIdx = thresholdBinIdx > 0 ? smemHistogram[thresholdBinIdx - 1] : 0;
#pragma unroll
for (int ii = 0; ii < kNumFinalItemsPerThread; ++ii) {
int srcIdx = ii * kNumThreadsPerBlock + threadIdx.x;
int dstIdx = baseIdx + srcIdx;
if (dstIdx < kTopK) {
smemIndices[dstIdx] = finalIndices[ii];
}
}
// Make sure the data is in shared memory.
__syncthreads();
// Store to global memory.
#pragma unroll
for (int ii = 0; ii < kNumTopKItemsPerThread; ++ii) {
int offset = rowIdx * kTopK + ii * kNumThreadsPerBlock + threadIdx.x;
outIndices[offset] =
smemIndices[ii * kNumThreadsPerBlock + threadIdx.x] - rowStart;
}
}
template <int kNumThreadsPerBlock = 512>
static __global__ void topKPerRow(const float* logits, const int* rowStarts,
const int* rowEnds, int* outIndices,
int stride0, int stride1) {
// The number of bins in the histogram.
static constexpr int kNumBins = 512;
// The top-k width.
static constexpr int kTopK = 2048;
// The row computed by this block.
int rowIdx = blockIdx.x;
// The range of logits within the row.
int rowStart = rowStarts[rowIdx];
int rowEnd = rowEnds[rowIdx];
topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
}
template <int kNumThreadsPerBlock = 512>
static __global__ void topKPerRowDecode(const float* logits, const int* seqLens,
int* outIndices, int stride0,
int stride1, int next_n) {
// The number of bins in the histogram.
static constexpr int kNumBins = 512;
// The top-k width.
static constexpr int kTopK = 2048;
// The row computed by this block.
int rowIdx = blockIdx.x;
// The range of logits within the row.
int rowStart = 0;
int seq_len = seqLens[rowIdx / next_n];
int rowEnd = seq_len - next_n + (rowIdx % next_n) + 1;
topKPerRowJob<kNumThreadsPerBlock, kNumBins, kTopK>(
logits, rowStart, rowEnd, rowIdx, outIndices, stride0, stride1);
}
} // namespace vllm } // namespace vllm
void apply_repetition_penalties_( void apply_repetition_penalties_(
...@@ -85,4 +335,32 @@ void apply_repetition_penalties_( ...@@ -85,4 +335,32 @@ void apply_repetition_penalties_(
repetition_penalties.data_ptr<scalar_t>(), num_seqs, vocab_size, repetition_penalties.data_ptr<scalar_t>(), num_seqs, vocab_size,
tile_size); tile_size);
}); });
} }
\ No newline at end of file
void top_k_per_row_decode(const torch::Tensor& logits, int64_t next_n,
const torch::Tensor& seqLens, torch::Tensor& indices,
int64_t numRows, int64_t stride0, int64_t stride1) {
// Compute the results on the device.
constexpr int kNumThreadsPerBlock = 512;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
vllm::topKPerRowDecode<kNumThreadsPerBlock>
<<<numRows, kNumThreadsPerBlock, 0, stream>>>(
logits.data_ptr<float>(), seqLens.data_ptr<int>(),
indices.data_ptr<int>(), static_cast<int>(stride0),
static_cast<int>(stride1), static_cast<int>(next_n));
}
void top_k_per_row(const torch::Tensor& logits, const torch::Tensor& rowStarts,
const torch::Tensor& rowEnds, torch::Tensor& indices,
int64_t numRows, int64_t stride0, int64_t stride1) {
// Compute the results on the device.
constexpr int kNumThreadsPerBlock = 512;
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
vllm::topKPerRow<kNumThreadsPerBlock>
<<<numRows, kNumThreadsPerBlock, 0, stream>>>(
logits.data_ptr<float>(), rowStarts.data_ptr<int>(),
rowEnds.data_ptr<int>(), indices.data_ptr<int>(),
static_cast<int>(stride0), static_cast<int>(stride1));
}
...@@ -20,24 +20,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -20,24 +20,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops // vLLM custom ops
// //
// The default behavior in PyTorch 2.6 was changed to "requires_contiguous", ops.def(
// so we need "persistent_masked_m_silu_mul_quant(Tensor input, Tensor counts, Tensor! "
// to override this for many GEMMs with the following tag. Otherwise, "y_q, Tensor! y_s,"
// torch.compile will force all input tensors to be contiguous(), which "bool use_ue8m0) -> ()");
// will break many custom ops that require column-major weight matrices. ops.impl("persistent_masked_m_silu_mul_quant", torch::kCUDA,
// This was a bug and PyTorch 2.7 has since fixed this. &persistent_masked_m_silu_mul_quant);
#if TORCH_VERSION_MAJOR == 2 && TORCH_VERSION_MINOR == 6
#define stride_tag at::Tag::needs_fixed_stride_order
#else
#define stride_tag
#endif
// ops.def(
// "silu_mul_fp8_quant_deep_gemm_cuda(Tensor input, Tensor counts, Tensor! "
// "y_q, Tensor! y_s, int group_size, "
// "bool use_ue8m0, int num_parallel_tokens) -> ()");
// ops.impl("silu_mul_fp8_quant_deep_gemm_cuda", torch::kCUDA,
// &silu_mul_fp8_quant_deep_gemm_cuda);
ops.def("weak_ref_tensor(Tensor input) -> Tensor"); ops.def("weak_ref_tensor(Tensor input) -> Tensor");
ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor); ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
...@@ -131,102 +119,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -131,102 +119,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" int blocksparse_head_sliding_step) -> ()"); " int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v2_opt_tc", torch::kCUDA, &paged_attention_v2_opt_tc); ops.impl("paged_attention_v2_opt_tc", torch::kCUDA, &paged_attention_v2_opt_tc);
// paged_attention with atth_masks
ops.def(
"paged_attention_v1_with_mask("
" Tensor! out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step,"
" Tensor? attn_masks,"
" int attn_masks_stride) -> ()");
ops.impl("paged_attention_v1_with_mask", torch::kCUDA, &paged_attention_v1_with_mask);
// PagedAttention V2.
ops.def(
"paged_attention_v2_with_mask("
" Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
" Tensor! tmp_out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step,"
" Tensor? attn_masks,"
" int attn_masks_stride) -> ()");
ops.impl("paged_attention_v2_with_mask", torch::kCUDA, &paged_attention_v2_with_mask);
// Compute the attention between an input query and the cached
// keys/values using PagedAttention. (opt)
ops.def(
"paged_attention_v1_opt_with_mask("
" Tensor! out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step,"
" Tensor? attn_masks,"
" int attn_masks_stride) -> ()");
ops.impl("paged_attention_v1_opt_with_mask", torch::kCUDA, &paged_attention_v1_opt_with_mask);
// PagedAttention V2 (opt).
ops.def(
"paged_attention_v2_opt_with_mask("
" Tensor! out, Tensor exp_sums, Tensor max_logits,"
" Tensor tmp_out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step,"
" Tensor? attn_masks,"
" int attn_masks_stride) -> ()");
ops.impl("paged_attention_v2_opt_with_mask", torch::kCUDA, &paged_attention_v2_opt_with_mask);
// Compute the attention between an input query and the cached
// keys/values using PagedAttention. (opt)
ops.def(
"paged_attention_v1_opt_tc_with_mask("
" Tensor! out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step,"
" Tensor? attn_masks,"
" int attn_masks_stride) -> ()");
ops.impl("paged_attention_v1_opt_tc_with_mask", torch::kCUDA, &paged_attention_v1_opt_tc_with_mask);
// PagedAttention V2 (opt).
ops.def(
"paged_attention_v2_opt_tc_with_mask("
" Tensor! out, Tensor exp_sums, Tensor max_logits,"
" Tensor tmp_out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, Tensor k_scale, Tensor v_scale,"
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step,"
" Tensor? attn_masks,"
" int attn_masks_stride) -> ()");
ops.impl("paged_attention_v2_opt_tc_with_mask", torch::kCUDA, &paged_attention_v2_opt_tc_with_mask);
// Merge attn states // Merge attn states
// Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005 // Implements section 2.2 of https://www.arxiv.org/pdf/2501.01005
// can be used to combine partial attention results (in the split-KV case) // can be used to combine partial attention results (in the split-KV case)
...@@ -240,7 +132,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -240,7 +132,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor suffix_lse) -> ()"); " Tensor suffix_lse) -> ()");
ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states); ops.impl("merge_attn_states", torch::kCUDA, &merge_attn_states);
// #ifndef USE_ROCM
ops.def( ops.def(
"convert_vertical_slash_indexes(" "convert_vertical_slash_indexes("
" Tensor! block_count, Tensor! block_offset, " " Tensor! block_count, Tensor! block_offset, "
...@@ -338,11 +230,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -338,11 +230,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"float epsilon) -> ()"); "float epsilon) -> ()");
ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm); ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);
// Polynomial Normalization. // Function for fused QK Norm and RoPE
ops.def( ops.def(
"poly_norm(Tensor! out, Tensor input, Tensor weight, Tensor bias, float " "fused_qk_norm_rope(Tensor! qkv, int num_heads_q, "
"epsilon) -> ()"); "int num_heads_k, int num_heads_v, int head_dim, float eps, "
ops.impl("poly_norm", torch::kCUDA, &poly_norm); "Tensor q_weight, Tensor k_weight, Tensor cos_sin_cache, "
"bool is_neox, Tensor position_ids) -> ()");
ops.impl("fused_qk_norm_rope", torch::kCUDA, &fused_qk_norm_rope);
// Apply repetition penalties to logits in-place // Apply repetition penalties to logits in-place
ops.def( ops.def(
...@@ -351,6 +245,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -351,6 +245,19 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("apply_repetition_penalties_", torch::kCUDA, ops.impl("apply_repetition_penalties_", torch::kCUDA,
&apply_repetition_penalties_); &apply_repetition_penalties_);
// Optimized top-k per row operation
ops.def(
"top_k_per_row(Tensor logits, Tensor rowStarts, Tensor rowEnds, "
"Tensor! indices, int numRows, int stride0, "
"int stride1) -> ()");
ops.impl("top_k_per_row", torch::kCUDA, &top_k_per_row);
ops.def(
"top_k_per_row_decode(Tensor logits, int next_n, "
"Tensor seq_lens, Tensor! indices, int numRows, "
"int stride0, int stride1) -> ()");
ops.impl("top_k_per_row_decode", torch::kCUDA, &top_k_per_row_decode);
// Layernorm-quant // Layernorm-quant
// Apply Root Mean Square (RMS) Normalization to the input tensor. // Apply Root Mean Square (RMS) Normalization to the input tensor.
ops.def( ops.def(
...@@ -406,15 +313,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -406,15 +313,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Quantized GEMM for AWQ. // Quantized GEMM for AWQ.
ops.def( ops.def(
"awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, " "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, "
"Tensor _zeros, SymInt split_k_iters) -> Tensor", "Tensor _zeros, SymInt split_k_iters) -> Tensor");
{stride_tag});
ops.impl("awq_gemm", torch::kCUDA, &awq_gemm); ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
// Dequantization for AWQ. // Dequantization for AWQ.
ops.def( ops.def(
"awq_dequantize(Tensor _kernel, Tensor _scaling_factors, " "awq_dequantize(Tensor _kernel, Tensor _scaling_factors, "
"Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor", "Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor");
{stride_tag});
ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize); ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
// Note about marlin kernel 'workspace' arguments: // Note about marlin kernel 'workspace' arguments:
...@@ -436,8 +341,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -436,8 +341,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, " "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, "
"Tensor b_scales, Tensor workspace, " "Tensor b_scales, Tensor workspace, "
"int b_q_type, " "int b_q_type, "
"SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor", "SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor");
{stride_tag});
// conditionally compiled so impl in source file // conditionally compiled so impl in source file
// Machete (Dense) Optimized Mixed Precision GEMM for Hopper. // Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
...@@ -463,8 +367,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -463,8 +367,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor? channel_scales," " Tensor? channel_scales,"
" Tensor? token_scales," " Tensor? token_scales,"
" str? schedule" " str? schedule"
") -> Tensor", ") -> Tensor");
{stride_tag});
ops.def( ops.def(
"machete_prepack_B(" "machete_prepack_B("
" Tensor B," " Tensor B,"
...@@ -480,24 +383,30 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -480,24 +383,30 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// gptq_marlin Optimized Quantized GEMM for GPTQ. // gptq_marlin Optimized Quantized GEMM for GPTQ.
ops.def( ops.def(
"gptq_marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, " "gptq_marlin_gemm(Tensor a, Tensor? c_or_none, Tensor b_q_weight, "
"Tensor? b_bias_or_none," "Tensor? b_bias_or_none,Tensor b_scales, "
"Tensor b_scales, Tensor? global_scale, Tensor? b_zeros_or_none, Tensor? " "Tensor? a_scales, Tensor? global_scale, Tensor? b_zeros_or_none, "
"g_idx_or_none, Tensor? perm_or_none, Tensor workspace, int b_q_type, " "Tensor? "
"g_idx_or_none, Tensor? perm_or_none, Tensor workspace, int b_type_id, "
"SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, " "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, "
"bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor", "bool use_atomic_add, bool use_fp32_reduce, bool is_zp_float) -> Tensor");
{stride_tag});
// conditionally compiled so impl registration is in source file // conditionally compiled so impl registration is in source file
// gptq_marlin repack from GPTQ. // gptq_marlin repack from GPTQ.
ops.def( ops.def(
"gptq_marlin_repack(Tensor b_q_weight, Tensor perm, " "gptq_marlin_repack(Tensor b_q_weight, Tensor perm, "
"SymInt size_k, SymInt size_n, int num_bits) -> Tensor"); "SymInt size_k, SymInt size_n, int num_bits, bool is_a_8bit) -> Tensor");
// conditionally compiled so impl registrations are in source file // conditionally compiled so impl registrations are in source file
// awq_marlin repack from AWQ. // awq_marlin repack from AWQ.
ops.def( ops.def(
"awq_marlin_repack(Tensor b_q_weight, SymInt size_k, " "awq_marlin_repack(Tensor b_q_weight, SymInt size_k, "
"SymInt size_n, int num_bits) -> Tensor"); "SymInt size_n, int num_bits, bool is_a_8bit) -> Tensor");
// conditionally compiled so impl registrations are in source file
// preprocess W-int4A-fp8 weight for marlin kernel
ops.def(
"marlin_int4_fp8_preprocess(Tensor qweight, "
"Tensor? qzeros_or_none, bool inplace) -> Tensor");
// conditionally compiled so impl registrations are in source file // conditionally compiled so impl registrations are in source file
// CUTLASS w4a8 GEMM // CUTLASS w4a8 GEMM
...@@ -511,8 +420,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -511,8 +420,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor token_scales," " Tensor token_scales,"
" ScalarType? out_type," " ScalarType? out_type,"
" str? maybe_schedule" " str? maybe_schedule"
") -> Tensor", ") -> Tensor");
{stride_tag});
// pack scales // pack scales
ops.def("cutlass_pack_scale_fp8(Tensor scales) -> Tensor"); ops.def("cutlass_pack_scale_fp8(Tensor scales) -> Tensor");
// encode and reorder weight matrix // encode and reorder weight matrix
...@@ -559,33 +467,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -559,33 +467,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def( ops.def(
"cutlass_scaled_fp4_mm(Tensor! out, Tensor a, Tensor b," "cutlass_scaled_fp4_mm(Tensor! out, Tensor a, Tensor b,"
" Tensor block_scale_a, Tensor block_scale_b," " Tensor block_scale_a, Tensor block_scale_b,"
" Tensor alpha) -> ()", " Tensor alpha) -> ()");
{stride_tag});
ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm); ops.impl("cutlass_scaled_fp4_mm", torch::kCUDA, &cutlass_scaled_fp4_mm);
// cutlass blockwise scaledgroup GEMM // cutlass blockwise scaledgroup GEMM
ops.def( ops.def(
"cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, " "cutlass_blockwise_scaled_grouped_mm(Tensor! output, Tensor a, Tensor b, "
"Tensor scales_a, Tensor scales_b, " "Tensor scales_a, Tensor scales_b, "
"Tensor problem_sizes, Tensor expert_offsets) -> ()", "Tensor problem_sizes, Tensor expert_offsets) -> ()");
{stride_tag});
// conditionally compiled so impl registration is in source file // conditionally compiled so impl registration is in source file
// cutlass nvfp4 block scaled group GEMM // cutlass nvfp4 block scaled group GEMM
ops.def( ops.def(
"cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b," "cutlass_fp4_group_mm(Tensor! out, Tensor a, Tensor b,"
" Tensor a_blockscale, Tensor b_blockscales, Tensor alphas," " Tensor a_blockscale, Tensor b_blockscales, Tensor alphas,"
" Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()", " Tensor problem_sizes, Tensor expert_offsets, Tensor sf_offsets) -> ()");
{stride_tag}); // conditionally compiled so impl registration is in source file
ops.impl("cutlass_fp4_group_mm", torch::kCUDA, &cutlass_fp4_group_mm);
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
// quantization, as well as bias // quantization, as well as bias
ops.def( ops.def(
"cutlass_scaled_mm(Tensor! out, Tensor a," "cutlass_scaled_mm(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales," " Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor? bias) -> ()", " Tensor b_scales, Tensor? bias) -> ()");
{stride_tag});
ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm); ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm);
// CUTLASS w8a8 GEMM, supporting asymmetric per-tensor or per-row/column // CUTLASS w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
...@@ -594,8 +498,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -594,8 +498,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"cutlass_scaled_mm_azp(Tensor! out, Tensor a," "cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales," " Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor azp_adj," " Tensor b_scales, Tensor azp_adj,"
" Tensor? azp, Tensor? bias) -> ()", " Tensor? azp, Tensor? bias) -> ()");
{stride_tag});
ops.impl("cutlass_scaled_mm_azp", torch::kCUDA, &cutlass_scaled_mm_azp); ops.impl("cutlass_scaled_mm_azp", torch::kCUDA, &cutlass_scaled_mm_azp);
// Check if cutlass scaled_mm is supported for CUDA devices of the given // Check if cutlass scaled_mm is supported for CUDA devices of the given
...@@ -614,8 +517,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -614,8 +517,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor a_scales, Tensor b_scales, Tensor expert_offsets, " " Tensor a_scales, Tensor b_scales, Tensor expert_offsets, "
" Tensor problem_sizes, Tensor a_strides, " " Tensor problem_sizes, Tensor a_strides, "
" Tensor b_strides, Tensor c_strides, bool per_act_token, " " Tensor b_strides, Tensor c_strides, bool per_act_token, "
" bool per_out_ch) -> ()", " bool per_out_ch) -> ()");
{stride_tag});
ops.impl("cutlass_moe_mm", torch::kCUDA, &cutlass_moe_mm); ops.impl("cutlass_moe_mm", torch::kCUDA, &cutlass_moe_mm);
// A function that computes data required to run fused MoE with w8a8 grouped // A function that computes data required to run fused MoE with w8a8 grouped
...@@ -629,8 +531,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -629,8 +531,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor! problem_sizes1, Tensor! problem_sizes2, " " Tensor! problem_sizes1, Tensor! problem_sizes2, "
" Tensor! input_permutation, " " Tensor! input_permutation, "
" Tensor! output_permutation, int num_experts, " " Tensor! output_permutation, int num_experts, "
" int n, int k, Tensor? blockscale_offsets) -> ()", " int n, int k, Tensor? blockscale_offsets) -> "
{stride_tag}); "()");
ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data); ops.impl("get_cutlass_moe_mm_data", torch::kCUDA, &get_cutlass_moe_mm_data);
// A function that computes problem sizes for each expert's multiplication // A function that computes problem sizes for each expert's multiplication
...@@ -641,8 +543,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -641,8 +543,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor! problem_sizes1, " " Tensor! problem_sizes1, "
" Tensor! problem_sizes2, " " Tensor! problem_sizes2, "
" int num_experts, int n, int k, " " int num_experts, int n, int k, "
" Tensor? blockscale_offsets) -> ()", " Tensor? blockscale_offsets) -> ()");
{stride_tag});
ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA, ops.impl("get_cutlass_moe_mm_problem_sizes", torch::kCUDA,
&get_cutlass_moe_mm_problem_sizes); &get_cutlass_moe_mm_problem_sizes);
...@@ -657,8 +558,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -657,8 +558,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor! problem_sizes2, " " Tensor! problem_sizes2, "
" Tensor expert_num_tokens, " " Tensor expert_num_tokens, "
" int num_local_experts, int padded_m, " " int num_local_experts, int padded_m, "
" int n, int k) -> ()", " int n, int k) -> ()");
{stride_tag});
ops.impl("get_cutlass_pplx_moe_mm_data", torch::kCUDA, ops.impl("get_cutlass_pplx_moe_mm_data", torch::kCUDA,
&get_cutlass_pplx_moe_mm_data); &get_cutlass_pplx_moe_mm_data);
...@@ -682,8 +582,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -682,8 +582,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"cutlass_scaled_sparse_mm(Tensor! out, Tensor a," "cutlass_scaled_sparse_mm(Tensor! out, Tensor a,"
" Tensor bt_nzs," " Tensor bt_nzs,"
" Tensor bt_meta, Tensor a_scales," " Tensor bt_meta, Tensor a_scales,"
" Tensor b_scales, Tensor? bias) -> ()", " Tensor b_scales, Tensor? bias) -> ()");
{stride_tag});
ops.impl("cutlass_scaled_sparse_mm", torch::kCUDA, &cutlass_scaled_sparse_mm); ops.impl("cutlass_scaled_sparse_mm", torch::kCUDA, &cutlass_scaled_sparse_mm);
// CUTLASS sparse matrix compressor // CUTLASS sparse matrix compressor
...@@ -731,9 +630,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -731,9 +630,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// ops.def( // ops.def(
// "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, " // "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
// "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) " // "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, bool "
// "-> Tensor", // "use_v2_format, int bit) "
// {stride_tag}); // "-> Tensor");
// ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm); // ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
// Post processing for GPTQ. // Post processing for GPTQ.
...@@ -784,7 +683,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -784,7 +683,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"Tensor? cache_indices," "Tensor? cache_indices,"
"Tensor? has_initial_state," "Tensor? has_initial_state,"
"Tensor! ssm_states," "Tensor! ssm_states,"
"int pad_slot_id) -> ()"); "int pad_slot_id,"
"int block_size,"
"Tensor? block_idx_first_scheduled_token,"
"Tensor? block_idx_last_scheduled_token,"
"Tensor? initial_state_idx) -> ()");
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd); ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
// Hadamard transforms // Hadamard transforms
...@@ -909,7 +812,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { ...@@ -909,7 +812,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
cache_ops.def( cache_ops.def(
"gather_and_maybe_dequant_cache(Tensor src_cache, Tensor! dst, " "gather_and_maybe_dequant_cache(Tensor src_cache, Tensor! dst, "
" Tensor block_table, Tensor cu_seq_lens, " " Tensor block_table, Tensor cu_seq_lens, "
" int batch_size, " " Tensor token_to_seq, "
" int num_tokens, "
" str kv_cache_dtype, " " str kv_cache_dtype, "
" Tensor scale, Tensor? seq_starts) -> ()"); " Tensor scale, Tensor? seq_starts) -> ()");
cache_ops.impl("gather_and_maybe_dequant_cache", torch::kCUDA, cache_ops.impl("gather_and_maybe_dequant_cache", torch::kCUDA,
...@@ -926,6 +830,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { ...@@ -926,6 +830,12 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
"int quant_block_size, str kv_cache_dtype) -> ()"); "int quant_block_size, str kv_cache_dtype) -> ()");
cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA, cache_ops.impl("indexer_k_quant_and_cache", torch::kCUDA,
&indexer_k_quant_and_cache); &indexer_k_quant_and_cache);
cache_ops.def(
"cp_gather_indexer_k_quant_cache(Tensor kv_cache, Tensor! dst_k, Tensor! "
"dst_scale, Tensor block_table, Tensor cu_seq_lens) -> ()");
cache_ops.impl("cp_gather_indexer_k_quant_cache", torch::kCUDA,
&cp_gather_indexer_k_quant_cache);
} }
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) { TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
......
...@@ -29,6 +29,22 @@ struct _typeConvert { ...@@ -29,6 +29,22 @@ struct _typeConvert {
static constexpr bool exists = false; static constexpr bool exists = false;
}; };
template <>
struct _typeConvert<float> {
static constexpr bool exists = true;
using hip_type = float;
using packed_hip_type = float2;
using packed_hip_type4 = float4; // For 128-bit vectorization
__device__ static __forceinline__ float convert(hip_type x) { return x; }
__device__ static __forceinline__ float2 convert(packed_hip_type x) {
return x;
}
__device__ static __forceinline__ float4 convert(packed_hip_type4 x) {
return x;
}
};
#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)) #if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000))
// CUDA < 12.0 runs into issues with packed type conversion // CUDA < 12.0 runs into issues with packed type conversion
template <> template <>
...@@ -37,41 +53,44 @@ struct _typeConvert<c10::Half> { ...@@ -37,41 +53,44 @@ struct _typeConvert<c10::Half> {
using hip_type = __half; using hip_type = __half;
using packed_hip_type = __half2; using packed_hip_type = __half2;
__device__ static inline float convert(hip_type x) { return __half2float(x); } __device__ static __forceinline__ float convert(hip_type x) {
__device__ static inline float2 convert(packed_hip_type x) { return __half2float(x);
}
__device__ static __forceinline__ float2 convert(packed_hip_type x) {
return __half22float2(x); return __half22float2(x);
} }
__device__ static inline hip_type convert(float x) { __device__ static __forceinline__ hip_type convert(float x) {
return __float2half_rn(x); return __float2half_rn(x);
} }
__device__ static inline packed_hip_type convert(float2 x) { __device__ static __forceinline__ packed_hip_type convert(float2 x) {
return __float22half2_rn(x); return __float22half2_rn(x);
} }
}; };
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800) || defined(USE_ROCM)
// CUDA_ARCH < 800 does not have BF16 support // CUDA_ARCH < 800 does not have BF16 support
// TODO: Add in ROCm support once public headers handle bf16 maturely // ROCm 7.0+ supports bfloat16
template <> template <>
struct _typeConvert<c10::BFloat16> { struct _typeConvert<c10::BFloat16> {
static constexpr bool exists = true; static constexpr bool exists = true;
using hip_type = __nv_bfloat16; using hip_type = __nv_bfloat16;
using packed_hip_type = __nv_bfloat162; using packed_hip_type = __nv_bfloat162;
__device__ static inline float convert(hip_type x) { __device__ static __forceinline__ float convert(hip_type x) {
return __bfloat162float(x); return __bfloat162float(x);
} }
__device__ static inline float2 convert(packed_hip_type x) { __device__ static __forceinline__ float2 convert(packed_hip_type x) {
return __bfloat1622float2(x); return __bfloat1622float2(x);
} }
__device__ static inline hip_type convert(float x) { __device__ static __forceinline__ hip_type convert(float x) {
return __float2bfloat16(x); return __float2bfloat16(x);
} }
__device__ static inline packed_hip_type convert(float2 x) { __device__ static __forceinline__ packed_hip_type convert(float2 x) {
return __float22bfloat162_rn(x); return __float22bfloat162_rn(x);
} }
}; };
#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 #endif // (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800) ||
// defined(USE_ROCM)
#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= #endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >=
// 12000)) // 12000))
...@@ -95,10 +114,15 @@ struct alignas(16) _f16Vec { ...@@ -95,10 +114,15 @@ struct alignas(16) _f16Vec {
if constexpr (width % 2 == 0) { if constexpr (width % 2 == 0) {
#pragma unroll #pragma unroll
for (int i = 0; i < width; i += 2) { for (int i = 0; i < width; i += 2) {
T2 temp{data[i], data[i + 1]}; if constexpr (std::is_same_v<T2, float2>) {
temp += T2{other.data[i], other.data[i + 1]}; data[i] += other.data[i];
data[i] = temp.x; data[i + 1] += other.data[i + 1];
data[i + 1] = temp.y; } else {
T2 temp{data[i], data[i + 1]};
temp += T2{other.data[i], other.data[i + 1]};
data[i] = temp.x;
data[i + 1] = temp.y;
}
} }
} else { } else {
#pragma unroll #pragma unroll
...@@ -111,10 +135,15 @@ struct alignas(16) _f16Vec { ...@@ -111,10 +135,15 @@ struct alignas(16) _f16Vec {
if constexpr (width % 2 == 0) { if constexpr (width % 2 == 0) {
#pragma unroll #pragma unroll
for (int i = 0; i < width; i += 2) { for (int i = 0; i < width; i += 2) {
T2 temp{data[i], data[i + 1]}; if constexpr (std::is_same_v<T2, float2>) {
temp *= T2{other.data[i], other.data[i + 1]}; data[i] *= other.data[i];
data[i] = temp.x; data[i + 1] *= other.data[i + 1];
data[i + 1] = temp.y; } else {
T2 temp{data[i], data[i + 1]};
temp *= T2{other.data[i], other.data[i + 1]};
data[i] = temp.x;
data[i + 1] = temp.y;
}
} }
} else { } else {
#pragma unroll #pragma unroll
......
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
# docs/contributing/dockerfile/dockerfile.md and # docs/contributing/dockerfile/dockerfile.md and
# docs/assets/contributing/dockerfile-stages-dependency.png # docs/assets/contributing/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.8.1 ARG CUDA_VERSION=12.9.1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
# By parameterizing the base images, we allow third-party to use their own # By parameterizing the base images, we allow third-party to use their own
...@@ -15,13 +15,13 @@ ARG PYTHON_VERSION=3.12 ...@@ -15,13 +15,13 @@ ARG PYTHON_VERSION=3.12
# Example: # Example:
# docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 # docker build --build-arg BUILD_BASE_IMAGE=registry.acme.org/mirror/nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# Important: We build with an old version of Ubuntu to maintain broad # Important: We build with an old version of Ubuntu to maintain broad
# compatibility with other Linux OSes. The main reason for this is that the # compatibility with other Linux OSes. The main reason for this is that the
# glibc version is baked into the distro, and binaries built with one glibc # glibc version is baked into the distro, and binaries built with one glibc
# version are not backwards compatible with OSes that use an earlier version. # version are not backwards compatible with OSes that use an earlier version.
ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04 ARG BUILD_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu20.04
# TODO: Restore to base image after FlashInfer AOT wheel fixed # Using cuda base image with minimal dependencies necessary for JIT compilation (FlashInfer, DeepGEMM, EP kernels)
ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-devel-ubuntu22.04 ARG FINAL_BASE_IMAGE=nvidia/cuda:${CUDA_VERSION}-base-ubuntu22.04
# By parameterizing the Deadsnakes repository URL, we allow third-party to use # By parameterizing the Deadsnakes repository URL, we allow third-party to use
# their own mirror. When doing so, we don't benefit from the transparent # their own mirror. When doing so, we don't benefit from the transparent
...@@ -56,7 +56,6 @@ ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} ...@@ -56,7 +56,6 @@ ARG UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
# PyTorch provides its own indexes for standard and nightly builds # PyTorch provides its own indexes for standard and nightly builds
ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl ARG PYTORCH_CUDA_INDEX_BASE_URL=https://download.pytorch.org/whl
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL=https://download.pytorch.org/whl/nightly
# PIP supports multiple authentication schemes, including keyring # PIP supports multiple authentication schemes, including keyring
# By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to # By parameterizing the PIP_KEYRING_PROVIDER variable and setting it to
...@@ -86,7 +85,20 @@ ARG GET_PIP_URL ...@@ -86,7 +85,20 @@ ARG GET_PIP_URL
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl sudo python3-pip \ && apt-get install -y --no-install-recommends \
ccache \
software-properties-common \
git \
curl \
sudo \
python3-pip \
libibverbs-dev \
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
gcc-10 \
g++-10 \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10 \
&& rm -rf /var/lib/apt/lists/* \
&& curl -LsSf https://astral.sh/uv/install.sh | sh \ && curl -LsSf https://astral.sh/uv/install.sh | sh \
&& $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \ && $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION} \
&& rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \ && rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \
...@@ -98,7 +110,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ...@@ -98,7 +110,6 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Activate virtual environment and add uv to PATH # Activate virtual environment and add uv to PATH
...@@ -112,10 +123,6 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" ...@@ -112,10 +123,6 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts # Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy ENV UV_LINK_MODE=copy
# Upgrade to GCC 10 to avoid https://gcc.gnu.org/bugzilla/show_bug.cgi?id=92519
# as it was causing spam when compiling the CUTLASS kernels
RUN apt-get install -y gcc-10 g++-10
RUN update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-10 110 --slave /usr/bin/g++ g++ /usr/bin/g++-10
RUN <<EOF RUN <<EOF
gcc --version gcc --version
EOF EOF
...@@ -226,10 +233,32 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ ...@@ -226,10 +233,32 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \
fi fi
# Install DeepGEMM from source
ARG DEEPGEMM_GIT_REF
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
RUN --mount=type=cache,target=/root/.cache/uv \
VLLM_DOCKER_BUILD_CONTEXT=1 TORCH_CUDA_ARCH_LIST="9.0a 10.0a" /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"} --wheel-dir /tmp/deepgemm/dist
# Ensure the wheel dir exists so later-stage COPY won't fail when DeepGEMM is skipped
RUN mkdir -p /tmp/deepgemm/dist && touch /tmp/deepgemm/dist/.deepgemm_skipped
COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries.sh
# Install EP kernels(pplx-kernels and DeepEP)
ARG PPLX_COMMIT_HASH
ARG DEEPEP_COMMIT_HASH
RUN --mount=type=cache,target=/root/.cache/uv \
export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \
/tmp/install_python_libraries.sh \
--workspace /tmp/ep_kernels_workspace \
--mode wheel \
${PPLX_COMMIT_HASH:+--pplx-ref "$PPLX_COMMIT_HASH"} \
${DEEPEP_COMMIT_HASH:+--deepep-ref "$DEEPEP_COMMIT_HASH"} && \
find /tmp/ep_kernels_workspace/nvshmem -name '*.a' -delete
# Check the size of the wheel if RUN_WHEEL_CHECK is true # Check the size of the wheel if RUN_WHEEL_CHECK is true
COPY .buildkite/check-wheel-size.py check-wheel-size.py COPY .buildkite/check-wheel-size.py check-wheel-size.py
# sync the default value with .buildkite/check-wheel-size.py # sync the default value with .buildkite/check-wheel-size.py
ARG VLLM_MAX_SIZE_MB=450 ARG VLLM_MAX_SIZE_MB=500
ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB
ARG RUN_WHEEL_CHECK=true ARG RUN_WHEEL_CHECK=true
RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \
...@@ -254,7 +283,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" ...@@ -254,7 +283,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_LINK_MODE=copy ENV UV_LINK_MODE=copy
# Install libnuma-dev, required by fastsafetensors (fixes #20384) # Install libnuma-dev, required by fastsafetensors (fixes #20384)
RUN apt-get update && apt-get install -y libnuma-dev && rm -rf /var/lib/apt/lists/* RUN apt-get update && apt-get install -y --no-install-recommends libnuma-dev && rm -rf /var/lib/apt/lists/*
COPY requirements/lint.txt requirements/lint.txt COPY requirements/lint.txt requirements/lint.txt
COPY requirements/test.txt requirements/test.txt COPY requirements/test.txt requirements/test.txt
COPY requirements/dev.txt requirements/dev.txt COPY requirements/dev.txt requirements/dev.txt
...@@ -273,6 +302,7 @@ WORKDIR /vllm-workspace ...@@ -273,6 +302,7 @@ WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM ARG TARGETPLATFORM
# TODO (huydhn): There is no prebuilt gdrcopy package on 12.9 at the moment
ARG GDRCOPY_CUDA_VERSION=12.8 ARG GDRCOPY_CUDA_VERSION=12.8
# Keep in line with FINAL_BASE_IMAGE # Keep in line with FINAL_BASE_IMAGE
ARG GDRCOPY_OS_VERSION=Ubuntu22_04 ARG GDRCOPY_OS_VERSION=Ubuntu22_04
...@@ -290,8 +320,15 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ ...@@ -290,8 +320,15 @@ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \ && echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y ccache software-properties-common git curl wget sudo vim python3-pip \ && apt-get install -y --no-install-recommends \
&& apt-get install -y ffmpeg libsm6 libxext6 libgl1 \ software-properties-common \
curl \
sudo \
python3-pip \
ffmpeg \
libsm6 \
libxext6 \
libgl1 \
&& if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \ && if [ ! -z ${DEADSNAKES_MIRROR_URL} ] ; then \
if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \ if [ ! -z "${DEADSNAKES_GPGKEY_URL}" ] ; then \
mkdir -p -m 0755 /etc/apt/keyrings ; \ mkdir -p -m 0755 /etc/apt/keyrings ; \
...@@ -306,17 +343,38 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \ ...@@ -306,17 +343,38 @@ RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
done ; \ done ; \
fi \ fi \
&& apt-get update -y \ && apt-get update -y \
&& apt-get install -y python${PYTHON_VERSION} python${PYTHON_VERSION}-dev python${PYTHON_VERSION}-venv libibverbs-dev \ && apt-get install -y --no-install-recommends \
python${PYTHON_VERSION} \
python${PYTHON_VERSION}-dev \
python${PYTHON_VERSION}-venv \
libibverbs-dev \
&& rm -rf /var/lib/apt/lists/* \
&& update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \ && update-alternatives --install /usr/bin/python3 python3 /usr/bin/python${PYTHON_VERSION} 1 \
&& update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \ && update-alternatives --set python3 /usr/bin/python${PYTHON_VERSION} \
&& ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \ && ln -sf /usr/bin/python${PYTHON_VERSION}-config /usr/bin/python3-config \
&& curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \ && curl -sS ${GET_PIP_URL} | python${PYTHON_VERSION} \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
# Install CUDA development tools and build essentials for runtime JIT compilation
# (FlashInfer, DeepGEMM, EP kernels all require compilation at runtime)
RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \
apt-get update -y && \
apt-get install -y --no-install-recommends \
cuda-nvcc-${CUDA_VERSION_DASH} \
cuda-cudart-${CUDA_VERSION_DASH} \
cuda-nvrtc-${CUDA_VERSION_DASH} \
cuda-cuobjdump-${CUDA_VERSION_DASH} \
# https://github.com/vllm-project/vllm/issues/29590
libcurand-dev-${CUDA_VERSION_DASH} \
libcublas-${CUDA_VERSION_DASH} \
# Fixes nccl_allocator requiring nccl.h at runtime
# https://github.com/vllm-project/vllm/blob/1336a1ea244fa8bfd7e72751cabbdb5b68a0c11a/vllm/distributed/device_communicators/pynccl_allocator.py#L22
libnccl-dev && \
rm -rf /var/lib/apt/lists/*
ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL ARG PYTORCH_CUDA_INDEX_BASE_URL
ARG PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL
ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER ARG PIP_KEYRING_PROVIDER UV_KEYRING_PROVIDER
# Install uv for faster pip installs # Install uv for faster pip installs
...@@ -336,85 +394,20 @@ ENV UV_LINK_MODE=copy ...@@ -336,85 +394,20 @@ ENV UV_LINK_MODE=copy
# or future versions of triton. # or future versions of triton.
RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/ RUN ldconfig /usr/local/cuda-$(echo $CUDA_VERSION | cut -d. -f1,2)/compat/
# arm64 (GH200) build follows the practice of "use existing pytorch" build,
# we need to install torch and torchvision from the nightly builds first,
# pytorch will not appear as a vLLM dependency in all of the following steps
# after this step
RUN --mount=type=cache,target=/root/.cache/uv \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
uv pip install --system \
--index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
"torch==2.8.0.dev20250318+cu128" "torchvision==0.22.0.dev20250319" ; \
uv pip install --system \
--index-url ${PYTORCH_CUDA_NIGHTLY_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
--pre pytorch_triton==3.3.0+gitab727c40 ; \
fi
# Install vllm wheel first, so that torch etc will be installed. # Install vllm wheel first, so that torch etc will be installed.
RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
--mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/uv \
uv pip install --system dist/*.whl --verbose \ uv pip install --system dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# If we need to build FlashInfer wheel before its release: # Install FlashInfer pre-compiled kernel cache and binaries
# $ # Note we remove 7.0 from the arch list compared to the list below, since FlashInfer only supports sm75+ # https://docs.flashinfer.ai/installation.html
# $ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0a 10.0a 12.0' RUN --mount=type=cache,target=/root/.cache/uv \
# $ git clone https://github.com/flashinfer-ai/flashinfer.git --recursive uv pip install --system flashinfer-cubin==0.5.3 \
# $ cd flashinfer && uv pip install --system flashinfer-jit-cache==0.5.3 \
# $ git checkout v0.2.6.post1 --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') \
# $ python -m flashinfer.aot && flashinfer show-config
# $ python -m build --no-isolation --wheel
# $ ls -la dist
# -rw-rw-r-- 1 mgoin mgoin 205M Jun 9 18:03 flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# $ # upload the wheel to a public location, e.g. https://wheels.vllm.ai/flashinfer/v0.2.6.post1/flashinfer_python-0.2.6.post1-cp39-abi3-linux_x86_64.whl
# Install FlashInfer from source
ARG FLASHINFER_GIT_REPO="https://github.com/flashinfer-ai/flashinfer.git"
# Keep this in sync with "flashinfer" extra in setup.py
ARG FLASHINFER_GIT_REF="v0.3.1"
# Flag to control whether to compile FlashInfer AOT kernels
# Set to "true" to enable AOT compilation:
# docker build --build-arg FLASHINFER_AOT_COMPILE=true ...
ARG FLASHINFER_AOT_COMPILE=false
RUN --mount=type=cache,target=/root/.cache/uv bash - <<'BASH'
. /etc/environment
git clone --depth 1 --recursive --shallow-submodules \
--branch ${FLASHINFER_GIT_REF} \
${FLASHINFER_GIT_REPO} flashinfer
pushd flashinfer
if [ "${FLASHINFER_AOT_COMPILE}" = "true" ]; then
# Exclude CUDA arches for older versions (11.x and 12.0-12.7)
# TODO: Update this to allow setting TORCH_CUDA_ARCH_LIST as a build arg.
if [[ "${CUDA_VERSION}" == 11.* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9"
elif [[ "${CUDA_VERSION}" == 12.[0-7]* ]]; then
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a"
else
# CUDA 12.8+ supports 10.0a and 12.0
FI_TORCH_CUDA_ARCH_LIST="7.5 8.0 8.9 9.0a 10.0a 12.0"
fi
echo "🏗️ Installing FlashInfer with AOT compilation for arches: ${FI_TORCH_CUDA_ARCH_LIST}"
export FLASHINFER_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}"
# HACK: We need these to run flashinfer.aot before installing flashinfer, get from the package in the future
uv pip install --system cuda-python==$(echo $CUDA_VERSION | cut -d. -f1,2) pynvml==$(echo $CUDA_VERSION | cut -d. -f1) nvidia-nvshmem-cu$(echo $CUDA_VERSION | cut -d. -f1)
# Build AOT kernels
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer.aot
# Install with no-build-isolation since we already built AOT kernels
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
uv pip install --system --no-build-isolation . \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# Download pre-compiled cubins
TORCH_CUDA_ARCH_LIST="${FI_TORCH_CUDA_ARCH_LIST}" \
python3 -m flashinfer --download-cubin || echo "WARNING: Failed to download flashinfer cubins."
else
echo "🏗️ Installing FlashInfer without AOT compilation in JIT mode"
uv pip install --system . \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
fi
popd
rm -rf flashinfer
BASH
COPY examples examples COPY examples examples
COPY benchmarks benchmarks COPY benchmarks benchmarks
COPY ./vllm/collect_env.py . COPY ./vllm/collect_env.py .
...@@ -423,36 +416,38 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -423,36 +416,38 @@ RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \ . /etc/environment && \
uv pip list uv pip list
# Even when we build Flashinfer with AOT mode, there's still # Install deepgemm wheel that has been built in the `build` stage
# some issues w.r.t. JIT compilation. Therefore we need to
# install build dependencies for JIT compilation.
# TODO: Remove this once FlashInfer AOT wheel is fixed
COPY requirements/build.txt requirements/build.txt
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/build.txt \ --mount=type=bind,from=build,source=/tmp/deepgemm/dist,target=/tmp/deepgemm/dist,ro \
sh -c 'if ls /tmp/deepgemm/dist/*.whl >/dev/null 2>&1; then \
uv pip install --system /tmp/deepgemm/dist/*.whl; \
else \
echo "No DeepGEMM wheels to install; skipping."; \
fi'
# Pytorch now installs NVSHMEM, setting LD_LIBRARY_PATH (https://github.com/pytorch/pytorch/blob/d38164a545b4a4e4e0cf73ce67173f70574890b6/.ci/manywheel/build_cuda.sh#L141C14-L141C36)
ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
# Install EP kernels wheels (pplx-kernels and DeepEP) that have been built in the `build` stage
RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm-workspace/ep_kernels/dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system ep_kernels/dist/*.whl --verbose \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.')
# Install DeepGEMM from source RUN --mount=type=bind,source=tools/install_gdrcopy.sh,target=/tmp/install_gdrcopy.sh,ro \
ARG DEEPGEMM_GIT_REF set -eux; \
COPY tools/install_deepgemm.sh /tmp/install_deepgemm.sh
RUN --mount=type=cache,target=/root/.cache/uv \
VLLM_DOCKER_BUILD_CONTEXT=1 /tmp/install_deepgemm.sh --cuda-version "${CUDA_VERSION}" ${DEEPGEMM_GIT_REF:+--ref "$DEEPGEMM_GIT_REF"}
COPY tools/install_gdrcopy.sh install_gdrcopy.sh
RUN set -eux; \
case "${TARGETPLATFORM}" in \ case "${TARGETPLATFORM}" in \
linux/arm64) UUARCH="aarch64" ;; \ linux/arm64) UUARCH="aarch64" ;; \
linux/amd64) UUARCH="x64" ;; \ linux/amd64) UUARCH="x64" ;; \
*) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \ *) echo "Unsupported TARGETPLATFORM: ${TARGETPLATFORM}" >&2; exit 1 ;; \
esac; \ esac; \
./install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"; \ /tmp/install_gdrcopy.sh "${GDRCOPY_OS_VERSION}" "${GDRCOPY_CUDA_VERSION}" "${UUARCH}"
rm ./install_gdrcopy.sh
# Install EP kernels(pplx-kernels and DeepEP) # CUDA image changed from /usr/local/nvidia to /usr/local/cuda in 12.8 but will
COPY tools/ep_kernels/install_python_libraries.sh install_python_libraries.sh # return to /usr/local/nvidia in 13.0 to allow container providers to mount drivers
ENV CUDA_HOME=/usr/local/cuda # consistently from the host (see https://github.com/vllm-project/vllm/issues/18859).
RUN export TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST:-9.0a+PTX}" \ # Until then, add /usr/local/nvidia/lib64 before the image cuda path to allow override.
&& bash install_python_libraries.sh ENV LD_LIBRARY_PATH=/usr/local/nvidia/lib64:${LD_LIBRARY_PATH}
#################### vLLM installation IMAGE #################### #################### vLLM installation IMAGE ####################
...@@ -467,6 +462,7 @@ ARG PYTHON_VERSION ...@@ -467,6 +462,7 @@ ARG PYTHON_VERSION
ARG PIP_INDEX_URL UV_INDEX_URL ARG PIP_INDEX_URL UV_INDEX_URL
ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
ARG PYTORCH_CUDA_INDEX_BASE_URL
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out # This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
...@@ -475,11 +471,17 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" ...@@ -475,11 +471,17 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts # Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy ENV UV_LINK_MODE=copy
RUN echo 'tzdata tzdata/Areas select America' | debconf-set-selections \
&& echo 'tzdata tzdata/Zones/America select Los_Angeles' | debconf-set-selections \
&& apt-get update -y \
&& apt-get install -y git
# install development dependencies (for testing) # install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
if [ "$CUDA_MAJOR" -ge 12 ]; then \ if [ "$CUDA_MAJOR" -ge 12 ]; then \
uv pip install --system -r requirements/dev.txt; \ uv pip install --system -r requirements/dev.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \
fi fi
# install development dependencies (for testing) # install development dependencies (for testing)
...@@ -514,19 +516,18 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL ...@@ -514,19 +516,18 @@ ARG PIP_EXTRA_INDEX_URL UV_EXTRA_INDEX_URL
# Reference: https://github.com/astral-sh/uv/pull/1694 # Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
COPY requirements/kv_connectors.txt requirements/kv_connectors.txt
# install additional dependencies for openai api server # install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \
if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \ if [ "$INSTALL_KV_CONNECTORS" = "true" ]; then \
uv pip install --system -r requirements/kv_connectors.txt; \ uv pip install --system -r /tmp/kv_connectors.txt; \
fi; \ fi; \
if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
BITSANDBYTES_VERSION="0.42.0"; \ BITSANDBYTES_VERSION="0.42.0"; \
else \ else \
BITSANDBYTES_VERSION="0.46.1"; \ BITSANDBYTES_VERSION="0.46.1"; \
fi; \ fi; \
uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' boto3 runai-model-streamer runai-model-streamer[s3] uv pip install --system accelerate hf_transfer modelscope "bitsandbytes>=${BITSANDBYTES_VERSION}" 'timm>=1.0.17' 'runai-model-streamer[s3,gcs]>=0.15.0'
ENV VLLM_USAGE_SOURCE production-docker-image ENV VLLM_USAGE_SOURCE production-docker-image
...@@ -539,5 +540,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"] ...@@ -539,5 +540,5 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"]
FROM vllm-openai-base AS vllm-openai FROM vllm-openai-base AS vllm-openai
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["vllm", "serve"]
#################### OPENAI API SERVER #################### #################### OPENAI API SERVER ####################
...@@ -13,10 +13,11 @@ ...@@ -13,10 +13,11 @@
# vllm-dev: used for development # vllm-dev: used for development
# #
# Build arguments: # Build arguments:
# PYTHON_VERSION=3.12 (default)|3.11|3.10|3.9 # PYTHON_VERSION=3.13|3.12 (default)|3.11|3.10
# VLLM_CPU_DISABLE_AVX512=false (default)|true # VLLM_CPU_DISABLE_AVX512=false (default)|true
# VLLM_CPU_AVX512BF16=false (default)|true # VLLM_CPU_AVX512BF16=false (default)|true
# VLLM_CPU_AVX512VNNI=false (default)|true # VLLM_CPU_AVX512VNNI=false (default)|true
# VLLM_CPU_AMXBF16=false (default)|true
# #
######################### COMMON BASE IMAGE ######################### ######################### COMMON BASE IMAGE #########################
...@@ -31,11 +32,12 @@ ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" ...@@ -31,11 +32,12 @@ ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \ --mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get update -y \ apt-get update -y \
&& apt-get install -y --no-install-recommends ccache git curl wget ca-certificates \ && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \ gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh && curl -LsSf https://astral.sh/uv/install.sh | sh
ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12
ENV CCACHE_DIR=/root/.cache/ccache ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
...@@ -47,7 +49,7 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH" ...@@ -47,7 +49,7 @@ ENV PATH="$VIRTUAL_ENV/bin:$PATH"
ENV UV_HTTP_TIMEOUT=500 ENV UV_HTTP_TIMEOUT=500
# Install Python dependencies # Install Python dependencies
ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL} ENV UV_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
ENV UV_INDEX_STRATEGY="unsafe-best-match" ENV UV_INDEX_STRATEGY="unsafe-best-match"
...@@ -79,6 +81,9 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc ...@@ -79,6 +81,9 @@ RUN echo 'ulimit -c 0' >> ~/.bashrc
######################### BUILD IMAGE ######################### ######################### BUILD IMAGE #########################
FROM base AS vllm-build FROM base AS vllm-build
ARG max_jobs=32
ENV MAX_JOBS=${max_jobs}
ARG GIT_REPO_CHECK=0 ARG GIT_REPO_CHECK=0
# Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ... # Support for building with non-AVX512 vLLM: docker build --build-arg VLLM_CPU_DISABLE_AVX512="true" ...
ARG VLLM_CPU_DISABLE_AVX512=0 ARG VLLM_CPU_DISABLE_AVX512=0
...@@ -89,6 +94,9 @@ ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16} ...@@ -89,6 +94,9 @@ ENV VLLM_CPU_AVX512BF16=${VLLM_CPU_AVX512BF16}
# Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ... # Support for building with AVX512VNNI ISA: docker build --build-arg VLLM_CPU_AVX512VNNI="true" ...
ARG VLLM_CPU_AVX512VNNI=0 ARG VLLM_CPU_AVX512VNNI=0
ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI} ENV VLLM_CPU_AVX512VNNI=${VLLM_CPU_AVX512VNNI}
# Support for building with AMXBF16 ISA: docker build --build-arg VLLM_CPU_AMXBF16="true" ...
ARG VLLM_CPU_AMXBF16=0
ENV VLLM_CPU_AMXBF16=${VLLM_CPU_AMXBF16}
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
...@@ -104,20 +112,33 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -104,20 +112,33 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \ --mount=type=cache,target=/workspace/vllm/.deps,sharing=locked \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel VLLM_TARGET_DEVICE=cpu python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
######################### TEST DEPS ######################### ######################### TEST DEPS #########################
FROM base AS vllm-test-deps FROM base AS vllm-test-deps
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
# TODO: Update to 2.9.0 when there is a new build for intel_extension_for_pytorch for that version
RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \ RUN --mount=type=bind,src=requirements/test.in,target=requirements/test.in \
cp requirements/test.in requirements/cpu-test.in && \ cp requirements/test.in requirements/cpu-test.in && \
sed -i '/mamba_ssm/d' requirements/cpu-test.in && \ sed -i '/mamba_ssm/d' requirements/cpu-test.in && \
remove_packages_not_supported_on_aarch64() { \
case "$(uname -m)" in \
aarch64|arm64) \
sed -i '/decord/d' requirements/cpu-test.in; \
sed -i '/terratorch/d' requirements/cpu-test.in; \
;; \
esac; \
}; \
remove_packages_not_supported_on_aarch64 && \
sed -i 's/^torch==.*/torch==2.9.1/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -r requirements/cpu-test.txt uv pip install -r requirements/cpu-test.txt
######################### DEV IMAGE ######################### ######################### DEV IMAGE #########################
FROM vllm-build AS vllm-dev FROM vllm-build AS vllm-dev
...@@ -130,12 +151,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \ ...@@ -130,12 +151,12 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
# install development dependencies (for testing) # install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils uv pip install -e tests/vllm_test_utils
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \ --mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=cpu python3 setup.py develop VLLM_TARGET_DEVICE=cpu python3 setup.py develop
COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt COPY --from=vllm-test-deps /workspace/vllm/requirements/cpu-test.txt requirements/test.txt
...@@ -160,11 +181,12 @@ ADD ./benchmarks/ ./benchmarks/ ...@@ -160,11 +181,12 @@ ADD ./benchmarks/ ./benchmarks/
ADD ./vllm/collect_env.py . ADD ./vllm/collect_env.py .
ADD ./.buildkite/ ./.buildkite/ ADD ./.buildkite/ ./.buildkite/
# Create symlink for vllm-workspace to maintain CI compatibility
RUN ln -sf /workspace /vllm-workspace
# install development dependencies (for testing) # install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils uv pip install -e tests/vllm_test_utils
ENTRYPOINT ["bash"]
######################### RELEASE IMAGE ######################### ######################### RELEASE IMAGE #########################
FROM base AS vllm-openai FROM base AS vllm-openai
...@@ -176,4 +198,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -176,4 +198,4 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \ --mount=type=bind,from=vllm-build,src=/workspace/vllm/dist,target=dist \
uv pip install dist/*.whl uv pip install dist/*.whl
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["vllm", "serve"]
...@@ -76,34 +76,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -76,34 +76,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/common.txt uv pip install --system -r requirements/common.txt
# must put before installing xformers, so it can install the correct version of xfomrers.
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# Build xformers with cuda and torch nightly
# following official xformers guidance: https://github.com/facebookresearch/xformers#build
# todo(elainewy): cache xformers build result for faster build
ARG max_jobs=16
ENV MAX_JOBS=${max_jobs}
ARG XFORMERS_COMMIT=f2de641ef670510cadab099ce6954031f52f191c
ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \
--mount=type=cache,target=/root/.cache/uv \
echo 'git clone xformers...' \
&& git clone https://github.com/facebookresearch/xformers.git --recursive \
&& cd xformers \
&& git checkout ${XFORMERS_COMMIT} \
&& git submodule update --init --recursive \
&& echo 'finish git clone xformers...' \
&& rm -rf build \
&& python3 setup.py bdist_wheel --dist-dir=../xformers-dist --verbose \
&& cd .. \
&& rm -rf xformers
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system xformers-dist/*.whl --verbose
# build can take a long time, and the torch nightly version fetched from url can be different in next docker stage. # build can take a long time, and the torch nightly version fetched from url can be different in next docker stage.
# track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same # track the nightly torch version used in the build, when we set up runtime environment we can make sure the version is the same
RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt RUN uv pip freeze | grep -i '^torch\|^torchvision\|^torchaudio' > torch_build_versions.txt
...@@ -233,11 +205,6 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm ...@@ -233,11 +205,6 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/vllm
--mount=type=cache,target=/root/.cache/uv \ --mount=type=cache,target=/root/.cache/uv \
uv pip install --system vllm-dist/*.whl --verbose uv pip install --system vllm-dist/*.whl --verbose
# install xformers again for the new environment
RUN --mount=type=bind,from=base,src=/workspace/xformers-dist,target=/vllm-workspace/xformers-dist \
--mount=type=cache,target=/root/.cache/uv \
uv pip install --system /vllm-workspace/xformers-dist/*.whl --verbose
ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0' ARG torch_cuda_arch_list='8.0;8.6;8.9;9.0'
# install package for build flashinfer # install package for build flashinfer
...@@ -246,7 +213,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2. ...@@ -246,7 +213,7 @@ RUN pip install setuptools==75.6.0 packaging==23.2 ninja==1.11.1.3 build==1.2.2.
# build flashinfer for torch nightly from source around 10 mins # build flashinfer for torch nightly from source around 10 mins
# release version: v0.3.1 # release version: v0.5.2
# todo(elainewy): cache flashinfer build result for faster build # todo(elainewy): cache flashinfer build result for faster build
ENV CCACHE_DIR=/root/.cache/ccache ENV CCACHE_DIR=/root/.cache/ccache
RUN --mount=type=cache,target=/root/.cache/ccache \ RUN --mount=type=cache,target=/root/.cache/ccache \
...@@ -254,7 +221,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ ...@@ -254,7 +221,7 @@ RUN --mount=type=cache,target=/root/.cache/ccache \
echo "git clone flashinfer..." \ echo "git clone flashinfer..." \
&& git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \ && git clone --recursive https://github.com/flashinfer-ai/flashinfer.git \
&& cd flashinfer \ && cd flashinfer \
&& git checkout v0.3.1 \ && git checkout v0.5.2 \
&& git submodule update --init --recursive \ && git submodule update --init --recursive \
&& echo "finish git clone flashinfer..." \ && echo "finish git clone flashinfer..." \
&& rm -rf build \ && rm -rf build \
...@@ -307,7 +274,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -307,7 +274,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/nightly_torch_test.txt uv pip install --system -r requirements/nightly_torch_test.txt
# Logging to confirm the torch versions # Logging to confirm the torch versions
RUN pip freeze | grep -E 'torch|xformers|vllm|flashinfer' RUN pip freeze | grep -E 'torch|vllm|flashinfer'
# Logging to confirm all the packages are installed # Logging to confirm all the packages are installed
RUN pip freeze RUN pip freeze
......
ARG BASE_UBI_IMAGE_TAG=9.5-1741850109 ARG BASE_UBI_IMAGE_TAG=9.6-1754584681
############################################################### ###############################################################
# Stage to build openblas # Stage to build openblas
...@@ -7,9 +7,9 @@ ARG BASE_UBI_IMAGE_TAG=9.5-1741850109 ...@@ -7,9 +7,9 @@ ARG BASE_UBI_IMAGE_TAG=9.5-1741850109
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS openblas-builder
ARG MAX_JOBS ARG MAX_JOBS
ARG OPENBLAS_VERSION=0.3.29 ARG OPENBLAS_VERSION=0.3.30
RUN microdnf install -y dnf && dnf install -y gcc-toolset-13 make wget unzip \ RUN microdnf install -y dnf && dnf install -y gcc-toolset-14 make wget unzip \
&& source /opt/rh/gcc-toolset-13/enable \ && source /opt/rh/gcc-toolset-14/enable \
&& wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \ && wget https://github.com/OpenMathLib/OpenBLAS/releases/download/v$OPENBLAS_VERSION/OpenBLAS-$OPENBLAS_VERSION.zip \
&& unzip OpenBLAS-$OPENBLAS_VERSION.zip \ && unzip OpenBLAS-$OPENBLAS_VERSION.zip \
&& cd OpenBLAS-$OPENBLAS_VERSION \ && cd OpenBLAS-$OPENBLAS_VERSION \
...@@ -38,7 +38,7 @@ RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel ...@@ -38,7 +38,7 @@ RUN dnf install -y openjpeg2-devel lcms2-devel tcl-devel tk-devel fribidi-devel
FROM centos-deps-builder AS base-builder FROM centos-deps-builder AS base-builder
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
ARG OPENBLAS_VERSION=0.3.29 ARG OPENBLAS_VERSION=0.3.30
# Set Environment Variables for venv, cargo & openblas # Set Environment Variables for venv, cargo & openblas
ENV VIRTUAL_ENV=/opt/vllm ENV VIRTUAL_ENV=/opt/vllm
...@@ -57,11 +57,11 @@ COPY --from=openblas-builder /tmp/control /dev/null ...@@ -57,11 +57,11 @@ COPY --from=openblas-builder /tmp/control /dev/null
RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
dnf install -y openssl-devel \ dnf install -y openssl-devel \
&& dnf install -y \ && dnf install -y \
git tar gcc-toolset-13 automake libtool \ git tar gcc-toolset-14 automake libtool \
pkgconfig xsimd zeromq-devel kmod findutils protobuf* \ pkgconfig xsimd zeromq-devel kmod findutils protobuf* \
libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \ libtiff-devel libjpeg-devel zlib-devel freetype-devel libwebp-devel \
harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \ harfbuzz-devel libraqm-devel libimagequant-devel libxcb-devel \
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \ python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip clang-devel \
&& dnf clean all \ && dnf clean all \
&& PREFIX=/usr/local make -C /openblas install \ && PREFIX=/usr/local make -C /openblas install \
&& ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \ && ln -sf /usr/lib64/libatomic.so.1 /usr/lib64/libatomic.so \
...@@ -79,12 +79,12 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/, ...@@ -79,12 +79,12 @@ RUN --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,
FROM base-builder AS torch-builder FROM base-builder AS torch-builder
ARG MAX_JOBS ARG MAX_JOBS
ARG TORCH_VERSION=2.6.0 ARG TORCH_VERSION=2.7.0
ARG _GLIBCXX_USE_CXX11_ABI=1 ARG _GLIBCXX_USE_CXX11_ABI=1
ARG OPENBLAS_VERSION=0.3.29 ARG OPENBLAS_VERSION=0.3.30
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \ source /opt/rh/gcc-toolset-14/enable && \
git clone --recursive https://github.com/pytorch/pytorch.git -b v${TORCH_VERSION} && \ git clone --recursive https://github.com/pytorch/pytorch.git -b v${TORCH_VERSION} && \
cd pytorch && \ cd pytorch && \
uv pip install -r requirements.txt && \ uv pip install -r requirements.txt && \
...@@ -93,18 +93,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -93,18 +93,18 @@ RUN --mount=type=cache,target=/root/.cache/uv \
MAX_JOBS=${MAX_JOBS:-$(nproc)} \ MAX_JOBS=${MAX_JOBS:-$(nproc)} \
PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/ PYTORCH_BUILD_VERSION=${TORCH_VERSION} PYTORCH_BUILD_NUMBER=1 uv build --wheel --out-dir /torchwheels/
ARG TORCHVISION_VERSION=0.21.0 ARG TORCHVISION_VERSION=0.22.0
ARG TORCHVISION_USE_NVJPEG=0 ARG TORCHVISION_USE_NVJPEG=0
ARG TORCHVISION_USE_FFMPEG=0 ARG TORCHVISION_USE_FFMPEG=0
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \ source /opt/rh/gcc-toolset-14/enable && \
git clone --recursive https://github.com/pytorch/vision.git -b v${TORCHVISION_VERSION} && \ git clone --recursive https://github.com/pytorch/vision.git -b v${TORCHVISION_VERSION} && \
cd vision && \ cd vision && \
MAX_JOBS=${MAX_JOBS:-$(nproc)} \ MAX_JOBS=${MAX_JOBS:-$(nproc)} \
BUILD_VERSION=${TORCHVISION_VERSION} \ BUILD_VERSION=${TORCHVISION_VERSION} \
uv build --wheel --out-dir /torchwheels/ --no-build-isolation uv build --wheel --out-dir /torchwheels/ --no-build-isolation
ARG TORCHAUDIO_VERSION=2.6.0 ARG TORCHAUDIO_VERSION=2.7.0
ARG BUILD_SOX=1 ARG BUILD_SOX=1
ARG BUILD_KALDI=1 ARG BUILD_KALDI=1
ARG BUILD_RNNT=1 ARG BUILD_RNNT=1
...@@ -113,7 +113,7 @@ ARG USE_ROCM=0 ...@@ -113,7 +113,7 @@ ARG USE_ROCM=0
ARG USE_CUDA=0 ARG USE_CUDA=0
ARG TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_FFMPEG=1 ARG TORCHAUDIO_TEST_ALLOW_SKIP_IF_NO_FFMPEG=1
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \ source /opt/rh/gcc-toolset-14/enable && \
git clone --recursive https://github.com/pytorch/audio.git -b v${TORCHAUDIO_VERSION} && \ git clone --recursive https://github.com/pytorch/audio.git -b v${TORCHAUDIO_VERSION} && \
cd audio && \ cd audio && \
MAX_JOBS=${MAX_JOBS:-$(nproc)} \ MAX_JOBS=${MAX_JOBS:-$(nproc)} \
...@@ -128,9 +128,9 @@ FROM base-builder AS arrow-builder ...@@ -128,9 +128,9 @@ FROM base-builder AS arrow-builder
ARG MAX_JOBS ARG MAX_JOBS
ARG PYARROW_PARALLEL ARG PYARROW_PARALLEL
ARG PYARROW_VERSION=19.0.1 ARG PYARROW_VERSION=21.0.0
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \ source /opt/rh/gcc-toolset-14/enable && \
git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \ git clone --recursive https://github.com/apache/arrow.git -b apache-arrow-${PYARROW_VERSION} && \
cd arrow/cpp && \ cd arrow/cpp && \
mkdir build && cd build && \ mkdir build && cd build && \
...@@ -145,7 +145,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -145,7 +145,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
make install -j ${MAX_JOBS:-$(nproc)} && \ make install -j ${MAX_JOBS:-$(nproc)} && \
cd ../../python/ && \ cd ../../python/ && \
uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \ uv pip install -v -r requirements-build.txt && uv pip install numpy==2.1.3 && \
pip show numpy && ls -lrt /opt/vllm/lib/python3.12/site-packages/numpy && \
PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \ PYARROW_PARALLEL=${PYARROW_PARALLEL:-$(nproc)} \
python setup.py build_ext \ python setup.py build_ext \
--build-type=release --bundle-arrow-cpp \ --build-type=release --bundle-arrow-cpp \
...@@ -163,7 +162,7 @@ ARG OPENCV_VERSION=86 ...@@ -163,7 +162,7 @@ ARG OPENCV_VERSION=86
ARG OPENCV_PATCH=97f3f39 ARG OPENCV_PATCH=97f3f39
ARG ENABLE_HEADLESS=1 ARG ENABLE_HEADLESS=1
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \ source /opt/rh/gcc-toolset-14/enable && \
git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \ git clone --recursive https://github.com/opencv/opencv-python.git -b ${OPENCV_VERSION} && \
cd opencv-python && \ cd opencv-python && \
sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \ sed -i -E -e 's/"setuptools.+",/"setuptools",/g' pyproject.toml && \
...@@ -187,6 +186,23 @@ RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_V ...@@ -187,6 +186,23 @@ RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_V
&& make -j ${MAX_JOBS:-$(nproc)} && make -j ${MAX_JOBS:-$(nproc)}
###############################################################
# Stage to build numba
###############################################################
FROM base-builder AS numba-builder
ARG MAX_JOBS
ARG NUMBA_VERSION=0.61.2
# Clone all required dependencies
RUN dnf install ninja-build llvm15 llvm15-devel -y && source /opt/rh/gcc-toolset-14/enable && export PATH=$PATH:/usr/lib64/llvm15/bin && \
git clone --recursive https://github.com/numba/numba.git -b ${NUMBA_VERSION} && \
cd ./numba && \
if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
fi && python -m build --wheel --installer=uv --outdir /numbawheels/
############################################################### ###############################################################
# Stage to build vllm - this stage builds and installs # Stage to build vllm - this stage builds and installs
# vllm, tensorizer and vllm-tgis-adapter and builds uv cache # vllm, tensorizer and vllm-tgis-adapter and builds uv cache
...@@ -195,10 +211,14 @@ RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_V ...@@ -195,10 +211,14 @@ RUN git clone --recursive https://github.com/numactl/numactl.git -b v${NUMACTL_V
FROM base-builder AS vllmcache-builder FROM base-builder AS vllmcache-builder
ENV LLVM_CONFIG=/usr/lib64/llvm15/bin/llvm-config
ENV PATH=/usr/lib64/llvm15/bin:$PATH
COPY --from=torch-builder /tmp/control /dev/null COPY --from=torch-builder /tmp/control /dev/null
COPY --from=arrow-builder /tmp/control /dev/null COPY --from=arrow-builder /tmp/control /dev/null
COPY --from=cv-builder /tmp/control /dev/null COPY --from=cv-builder /tmp/control /dev/null
COPY --from=numa-builder /tmp/control /dev/null COPY --from=numa-builder /tmp/control /dev/null
COPY --from=numba-builder /tmp/control /dev/null
ARG VLLM_TARGET_DEVICE=cpu ARG VLLM_TARGET_DEVICE=cpu
ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
...@@ -206,24 +226,32 @@ ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 ...@@ -206,24 +226,32 @@ ARG GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
# this step installs vllm and populates uv cache # this step installs vllm and populates uv cache
# with all the transitive dependencies # with all the transitive dependencies
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
source /opt/rh/gcc-toolset-13/enable && \ dnf install llvm15 llvm15-devel -y && \
rpm -ivh --nodeps https://mirror.stream.centos.org/9-stream/CRB/ppc64le/os/Packages/protobuf-lite-devel-3.14.0-16.el9.ppc64le.rpm && \
source /opt/rh/gcc-toolset-14/enable && \
git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \ git clone https://github.com/huggingface/xet-core.git && cd xet-core/hf_xet/ && \
uv pip install maturin && \ uv pip install maturin && \
uv build --wheel --out-dir /hf_wheels/ uv build --wheel --out-dir /hf_wheels/
ENV CXXFLAGS="-fno-lto -Wno-error=free-nonheap-object" \
CFLAGS="-fno-lto"
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \ --mount=type=bind,from=torch-builder,source=/torchwheels/,target=/torchwheels/,ro \
--mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \ --mount=type=bind,from=arrow-builder,source=/arrowwheels/,target=/arrowwheels/,ro \
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \ --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
--mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \ --mount=type=bind,from=numa-builder,source=/numactl/,target=/numactl/,rw \
--mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
--mount=type=bind,src=.,dst=/src/,rw \ --mount=type=bind,src=.,dst=/src/,rw \
source /opt/rh/gcc-toolset-13/enable && \ source /opt/rh/gcc-toolset-14/enable && \
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl && \ export PATH=$PATH:/usr/lib64/llvm15/bin && \
uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl && \
sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \ sed -i -e 's/.*torch.*//g' /src/pyproject.toml /src/requirements/*.txt && \
uv pip install pandas pythran pybind11 /hf_wheels/*.whl && \ sed -i -e 's/.*sentencepiece.*//g' /src/pyproject.toml /src/requirements/*.txt && \
uv pip install sentencepiece==0.2.0 pandas pythran nanobind pybind11 /hf_wheels/*.whl && \
make -C /numactl install && \ make -C /numactl install && \
# sentencepiece.pc is in some pkgconfig inside uv cache # sentencepiece.pc is in some pkgconfig inside uv cache
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \ export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && \
uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \ nanobind_DIR=$(uv pip show nanobind | grep Location | sed 's/^Location: //;s/$/\/nanobind\/cmake/') && uv pip install -r /src/requirements/common.txt -r /src/requirements/cpu.txt -r /src/requirements/build.txt --no-build-isolation && \
cd /src/ && \ cd /src/ && \
uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \ uv build --wheel --out-dir /vllmwheel/ --no-build-isolation && \
uv pip install /vllmwheel/*.whl uv pip install /vllmwheel/*.whl
...@@ -238,7 +266,7 @@ FROM base-builder AS lapack-builder ...@@ -238,7 +266,7 @@ FROM base-builder AS lapack-builder
ARG MAX_JOBS ARG MAX_JOBS
ARG LAPACK_VERSION=3.12.1 ARG LAPACK_VERSION=3.12.1
RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${LAPACK_VERSION} \ RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${LAPACK_VERSION} \
&& cd lapack && source /opt/rh/gcc-toolset-13/enable \ && cd lapack && source /opt/rh/gcc-toolset-14/enable \
&& cmake -B build -S . \ && cmake -B build -S . \
&& cmake --build build -j ${MAX_JOBS:-$(nproc)} && cmake --build build -j ${MAX_JOBS:-$(nproc)}
...@@ -250,7 +278,7 @@ RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${L ...@@ -250,7 +278,7 @@ RUN git clone --recursive https://github.com/Reference-LAPACK/lapack.git -b v${L
FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} AS vllm-openai
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
ARG OPENBLAS_VERSION=0.3.29 ARG OPENBLAS_VERSION=0.3.30
# Set Environment Variables for venv & openblas # Set Environment Variables for venv & openblas
ENV VIRTUAL_ENV=/opt/vllm ENV VIRTUAL_ENV=/opt/vllm
...@@ -268,6 +296,7 @@ COPY --from=vllmcache-builder /tmp/control /dev/null ...@@ -268,6 +296,7 @@ COPY --from=vllmcache-builder /tmp/control /dev/null
COPY --from=numa-builder /tmp/control /dev/null COPY --from=numa-builder /tmp/control /dev/null
COPY --from=lapack-builder /tmp/control /dev/null COPY --from=lapack-builder /tmp/control /dev/null
COPY --from=openblas-builder /tmp/control /dev/null COPY --from=openblas-builder /tmp/control /dev/null
COPY --from=numba-builder /tmp/control /dev/null
# install gcc-11, python, openblas, numactl, lapack # install gcc-11, python, openblas, numactl, lapack
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
...@@ -276,13 +305,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -276,13 +305,13 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \ --mount=type=bind,from=openblas-builder,source=/OpenBLAS-$OPENBLAS_VERSION/,target=/openblas/,rw \
rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \ rpm -ivh https://dl.fedoraproject.org/pub/epel/epel-release-latest-9.noarch.rpm && \
microdnf install --nodocs -y \ microdnf install --nodocs -y \
tar findutils openssl \ libomp libicu tar findutils openssl llvm15 llvm15-devel \
pkgconfig xsimd g++ gcc-fortran libsndfile \ pkgconfig xsimd g++ gcc-fortran libsndfile \
libtiff libjpeg openjpeg2 zlib zeromq \ libtiff libjpeg openjpeg2 zlib zeromq \
freetype lcms2 libwebp tcl tk utf8proc \ freetype lcms2 libwebp tcl tk utf8proc \
harfbuzz fribidi libraqm libimagequant libxcb \ harfbuzz fribidi libraqm libimagequant libxcb util-linux \
python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \ python${PYTHON_VERSION}-devel python${PYTHON_VERSION}-pip \
&& microdnf clean all \ && export PATH=$PATH:/usr/lib64/llvm15/bin && microdnf clean all \
&& python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \ && python${PYTHON_VERSION} -m venv ${VIRTUAL_ENV} \
&& python -m pip install -U pip uv --no-cache \ && python -m pip install -U pip uv --no-cache \
&& make -C /numactl install \ && make -C /numactl install \
...@@ -298,7 +327,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -298,7 +327,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \ --mount=type=bind,from=cv-builder,source=/opencvwheels/,target=/opencvwheels/,ro \
--mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \ --mount=type=bind,from=vllmcache-builder,source=/hf_wheels/,target=/hf_wheels/,ro \
--mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \ --mount=type=bind,from=vllmcache-builder,source=/vllmwheel/,target=/vllmwheel/,ro \
HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl --mount=type=bind,from=numba-builder,source=/numbawheels/,target=/numbawheels/,ro \
export PKG_CONFIG_PATH=$(find / -type d -name "pkgconfig" 2>/dev/null | tr '\n' ':') && uv pip install sentencepiece==0.2.0 && \
HOME=/root uv pip install /opencvwheels/*.whl /arrowwheels/*.whl /torchwheels/*.whl /numbawheels/*.whl /hf_wheels/*.whl /vllmwheel/*.whl
COPY ./ /workspace/vllm COPY ./ /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
...@@ -314,4 +346,4 @@ WORKDIR /workspace/ ...@@ -314,4 +346,4 @@ WORKDIR /workspace/
RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["vllm", "serve"]
\ No newline at end of file
...@@ -7,14 +7,27 @@ FROM ${BASE_IMAGE} AS base ...@@ -7,14 +7,27 @@ FROM ${BASE_IMAGE} AS base
ARG ARG_PYTORCH_ROCM_ARCH ARG ARG_PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
# Install some basic utilities # Install some basic utilities
RUN apt-get update -q -y && apt-get install -q -y \ RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl apt-transport-https ca-certificates wget curl
# Remove sccache # Remove sccache
RUN python3 -m pip install --upgrade pip RUN python3 -m pip install --upgrade pip
RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)"
# Install UV
RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh
# This timeout (in seconds) is necessary when installing some dependencies via uv since it's likely to time out
# Reference: https://github.com/astral-sh/uv/pull/1694
ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
ARG COMMON_WORKDIR ARG COMMON_WORKDIR
WORKDIR ${COMMON_WORKDIR} WORKDIR ${COMMON_WORKDIR}
...@@ -52,6 +65,8 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests ...@@ -52,6 +65,8 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
# Centralized v1 package - copied to both test and final stages
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
# ----------------------- # -----------------------
# Test vLLM image # Test vLLM image
...@@ -59,13 +74,15 @@ FROM base AS test ...@@ -59,13 +74,15 @@ FROM base AS test
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
# Install vLLM # Install vLLM using uv (inherited from base stage)
# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
--mount=type=cache,target=/root/.cache/uv \
cd /install \ cd /install \
&& pip install -U -r requirements/rocm.txt \ && uv pip install --system -r requirements/rocm.txt \
&& pip install -U -r requirements/rocm-test.txt \ && uv pip install --system -r requirements/rocm-test.txt \
&& pip uninstall -y vllm \ && pip uninstall -y vllm \
&& pip install *.whl && uv pip install --system *.whl
WORKDIR /vllm-workspace WORKDIR /vllm-workspace
ARG COMMON_WORKDIR ARG COMMON_WORKDIR
...@@ -73,11 +90,22 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace ...@@ -73,11 +90,22 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
# install development dependencies (for testing) # install development dependencies (for testing)
RUN cd /vllm-workspace \ RUN cd /vllm-workspace \
&& rm -rf vllm \
&& python3 -m pip install -e tests/vllm_test_utils \ && python3 -m pip install -e tests/vllm_test_utils \
&& python3 -m pip install lm-eval[api]==0.4.4 \
&& python3 -m pip install pytest-shard && python3 -m pip install pytest-shard
# enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1
# Copy in the v1 package
COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
# Source code is used in the `python_only_compile.sh` test
# We hide it inside `src/` so that this source code
# will not be imported by other tests
RUN mkdir src && mv vllm src/vllm
# ----------------------- # -----------------------
# Final vLLM image # Final vLLM image
FROM base AS final FROM base AS final
...@@ -90,14 +118,20 @@ RUN case "$(which python3)" in \ ...@@ -90,14 +118,20 @@ RUN case "$(which python3)" in \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \ rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
*) ;; esac *) ;; esac
RUN python3 -m pip install --upgrade huggingface-hub[cli] RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --upgrade huggingface-hub[cli]
# Install vLLM # Install vLLM using uv (inherited from base stage)
# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
--mount=type=cache,target=/root/.cache/uv \
cd /install \ cd /install \
&& pip install -U -r requirements/rocm.txt \ && uv pip install --system -r requirements/rocm.txt \
&& pip uninstall -y vllm \ && pip uninstall -y vllm \
&& pip install *.whl && uv pip install --system *.whl
# Copy in the v1 package
COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
ARG COMMON_WORKDIR ARG COMMON_WORKDIR
...@@ -106,8 +140,6 @@ COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks ...@@ -106,8 +140,6 @@ COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
COPY --from=export_vllm /docker ${COMMON_WORKDIR}/vllm/docker COPY --from=export_vllm /docker ${COMMON_WORKDIR}/vllm/docker
ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1
ENV TOKENIZERS_PARALLELISM=false ENV TOKENIZERS_PARALLELISM=false
# ENV that can improve safe tensor loading, and end-to-end time # ENV that can improve safe tensor loading, and end-to-end time
......
ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.0-complete ARG BASE_IMAGE=rocm/dev-ubuntu-22.04:7.1-complete
ARG TRITON_BRANCH="f9e5bf54" ARG TRITON_BRANCH="57c693b6"
ARG TRITON_REPO="https://github.com/ROCm/triton.git" ARG TRITON_REPO="https://github.com/ROCm/triton.git"
ARG PYTORCH_BRANCH="b2fb6885" ARG PYTORCH_BRANCH="1c57644d"
ARG PYTORCH_VISION_BRANCH="v0.23.0" ARG PYTORCH_VISION_BRANCH="v0.23.0"
ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git" ARG PYTORCH_REPO="https://github.com/ROCm/pytorch.git"
ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git" ARG PYTORCH_VISION_REPO="https://github.com/pytorch/vision.git"
ARG PYTORCH_AUDIO_BRANCH="v2.9.0"
ARG PYTORCH_AUDIO_REPO="https://github.com/pytorch/audio.git"
ARG FA_BRANCH="0e60e394" ARG FA_BRANCH="0e60e394"
ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git" ARG FA_REPO="https://github.com/Dao-AILab/flash-attention.git"
ARG AITER_BRANCH="2ab9f4cd" ARG AITER_BRANCH="59bd8ff2"
ARG AITER_REPO="https://github.com/ROCm/aiter.git" ARG AITER_REPO="https://github.com/ROCm/aiter.git"
FROM ${BASE_IMAGE} AS base FROM ${BASE_IMAGE} AS base
...@@ -15,11 +17,15 @@ FROM ${BASE_IMAGE} AS base ...@@ -15,11 +17,15 @@ FROM ${BASE_IMAGE} AS base
ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV PATH=/opt/rocm/llvm/bin:/opt/rocm/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
ENV ROCM_PATH=/opt/rocm ENV ROCM_PATH=/opt/rocm
ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib: ENV LD_LIBRARY_PATH=/opt/rocm/lib:/usr/local/lib:
ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201 ARG PYTORCH_ROCM_ARCH=gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151
ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH} ENV PYTORCH_ROCM_ARCH=${PYTORCH_ROCM_ARCH}
ENV AITER_ROCM_ARCH=gfx942;gfx950 ENV AITER_ROCM_ARCH=gfx942;gfx950
# Required for RCCL in ROCm7.1
ENV HSA_NO_SCRATCH_RECLAIM=1
ARG PYTHON_VERSION=3.12 ARG PYTHON_VERSION=3.12
ENV PYTHON_VERSION=${PYTHON_VERSION}
RUN mkdir -p /app RUN mkdir -p /app
WORKDIR /app WORKDIR /app
...@@ -42,6 +48,7 @@ RUN apt-get update -y \ ...@@ -42,6 +48,7 @@ RUN apt-get update -y \
&& python3 --version && python3 -m pip --version && python3 --version && python3 -m pip --version
RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython
RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/*
FROM base AS build_triton FROM base AS build_triton
ARG TRITON_BRANCH ARG TRITON_BRANCH
...@@ -63,11 +70,14 @@ RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install ...@@ -63,11 +70,14 @@ RUN mkdir -p /app/install && cp /opt/rocm/share/amd_smi/dist/*.whl /app/install
FROM base AS build_pytorch FROM base AS build_pytorch
ARG PYTORCH_BRANCH ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_REPO ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_REPO
RUN git clone ${PYTORCH_REPO} pytorch RUN git clone ${PYTORCH_REPO} pytorch
RUN cd pytorch && git checkout ${PYTORCH_BRANCH} && \ RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \
pip install -r requirements.txt && git submodule update --init --recursive \ && pip install -r requirements.txt && git submodule update --init --recursive \
&& python3 tools/amd_build/build_amd.py \ && python3 tools/amd_build/build_amd.py \
&& CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl && pip install dist/*.whl
...@@ -75,8 +85,15 @@ RUN git clone ${PYTORCH_VISION_REPO} vision ...@@ -75,8 +85,15 @@ RUN git clone ${PYTORCH_VISION_REPO} vision
RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \
&& python3 setup.py bdist_wheel --dist-dir=dist \ && python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl && pip install dist/*.whl
RUN git clone ${PYTORCH_AUDIO_REPO} audio
RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \
&& git submodule update --init --recursive \
&& pip install -r requirements.txt \
&& python3 setup.py bdist_wheel --dist-dir=dist \
&& pip install dist/*.whl
RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \
&& cp /app/vision/dist/*.whl /app/install && cp /app/vision/dist/*.whl /app/install \
&& cp /app/audio/dist/*.whl /app/install
FROM base AS build_fa FROM base AS build_fa
ARG FA_BRANCH ARG FA_BRANCH
...@@ -127,6 +144,8 @@ ARG PYTORCH_BRANCH ...@@ -127,6 +144,8 @@ ARG PYTORCH_BRANCH
ARG PYTORCH_VISION_BRANCH ARG PYTORCH_VISION_BRANCH
ARG PYTORCH_REPO ARG PYTORCH_REPO
ARG PYTORCH_VISION_REPO ARG PYTORCH_VISION_REPO
ARG PYTORCH_AUDIO_BRANCH
ARG PYTORCH_AUDIO_REPO
ARG FA_BRANCH ARG FA_BRANCH
ARG FA_REPO ARG FA_REPO
ARG AITER_BRANCH ARG AITER_BRANCH
...@@ -138,6 +157,8 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \ ...@@ -138,6 +157,8 @@ RUN echo "BASE_IMAGE: ${BASE_IMAGE}" > /app/versions.txt \
&& echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \ && echo "PYTORCH_VISION_BRANCH: ${PYTORCH_VISION_BRANCH}" >> /app/versions.txt \
&& echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \ && echo "PYTORCH_REPO: ${PYTORCH_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \ && echo "PYTORCH_VISION_REPO: ${PYTORCH_VISION_REPO}" >> /app/versions.txt \
&& echo "PYTORCH_AUDIO_BRANCH: ${PYTORCH_AUDIO_BRANCH}" >> /app/versions.txt \
&& echo "PYTORCH_AUDIO_REPO: ${PYTORCH_AUDIO_REPO}" >> /app/versions.txt \
&& echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \ && echo "FA_BRANCH: ${FA_BRANCH}" >> /app/versions.txt \
&& echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \ && echo "FA_REPO: ${FA_REPO}" >> /app/versions.txt \
&& echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \ && echo "AITER_BRANCH: ${AITER_BRANCH}" >> /app/versions.txt \
......
...@@ -14,7 +14,7 @@ ENV LANG=C.UTF-8 \ ...@@ -14,7 +14,7 @@ ENV LANG=C.UTF-8 \
# Install development utilities # Install development utilities
RUN microdnf install -y \ RUN microdnf install -y \
which procps findutils tar vim git gcc gcc-gfortran g++ make patch zlib-devel \ which procps findutils tar vim git gcc-toolset-14 gcc-toolset-14-libatomic-devel patch zlib-devel \
libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \ libjpeg-turbo-devel libtiff-devel libpng-devel libwebp-devel freetype-devel harfbuzz-devel \
openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile \ openssl-devel openblas openblas-devel autoconf automake libtool cmake numpy libsndfile \
clang llvm-devel llvm-static clang-devel && \ clang llvm-devel llvm-static clang-devel && \
...@@ -85,40 +85,15 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \ ...@@ -85,40 +85,15 @@ RUN curl https://sh.rustup.rs -sSf | sh -s -- -y && \
rustup default stable && \ rustup default stable && \
rustup show rustup show
FROM python-install AS torch
ARG TORCH_VERSION=2.7.0
ENV export _GLIBCXX_USE_CXX11_ABI=1
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
WORKDIR /tmp
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
git clone https://github.com/pytorch/pytorch.git && \
cd pytorch && \
git checkout v2.7.0 && \
git submodule sync && \
git submodule update --init --recursive && \
uv pip install cmake ninja && \
uv pip install -r requirements.txt && \
python setup.py bdist_wheel
FROM python-install AS torch-vision FROM python-install AS torch-vision
# Install torchvision # Install torchvision
ARG TORCH_VERSION=2.7.0 ARG TORCH_VISION_VERSION=v0.23.0
ARG TORCH_VISION_VERSION=v0.20.1
WORKDIR /tmp WORKDIR /tmp
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
git clone https://github.com/pytorch/vision.git && \ git clone https://github.com/pytorch/vision.git && \
cd vision && \ cd vision && \
git checkout $TORCH_VISION_VERSION && \ git checkout $TORCH_VISION_VERSION && \
TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl | head -n 1) && \ uv pip install torch==2.8.0 --index-url https://download.pytorch.org/whl/cpu && \
uv pip install -v $TORCH_WHL_FILE && \
python setup.py bdist_wheel python setup.py bdist_wheel
FROM python-install AS hf-xet-builder FROM python-install AS hf-xet-builder
...@@ -199,26 +174,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -199,26 +174,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \ if ! grep '#include "dynamic_annotations.h"' numba/_dispatcher.cpp; then \
sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \ sed -i '/#include "internal\/pycore_atomic.h"/i\#include "dynamic_annotations.h"' numba/_dispatcher.cpp; \
fi && python setup.py bdist_wheel fi && python setup.py bdist_wheel
# Edit aws-lc-sys to support s390x
FROM python-install AS aws-lc-sys-editor
WORKDIR /tmp
ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
ARG AWS_LC_VERSION=v0.30.0
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
git clone --recursive https://github.com/aws/aws-lc-rs.git && \
cd aws-lc-rs && \
git checkout tags/aws-lc-sys/${AWS_LC_VERSION} && \
git submodule sync && \
git submodule update --init --recursive && \
cd aws-lc-sys && \
sed -i '682 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \
sed -i '712 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c && \
sed -i '747 s/strncmp(buf, "-----END ", 9)/memcmp(buf, "-----END ", 9)/' aws-lc/crypto/pem/pem_lib.c
# Build Outlines Core # Build Outlines Core
FROM python-install AS outlines-core-builder FROM python-install AS outlines-core-builder
...@@ -226,17 +181,17 @@ WORKDIR /tmp ...@@ -226,17 +181,17 @@ WORKDIR /tmp
ENV CARGO_HOME=/root/.cargo ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH" ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
ARG OUTLINES_CORE_VERSION=0.2.10 COPY requirements/common.txt /tmp/requirements/common.txt
ARG OUTLINES_CORE_VERSION
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \ --mount=type=bind,from=rust,source=/root/.cargo,target=/root/.cargo,rw \
--mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \ --mount=type=bind,from=rust,source=/root/.rustup,target=/root/.rustup,rw \
--mount=type=bind,from=aws-lc-sys-editor,source=/tmp/aws-lc-rs/aws-lc-sys,target=/tmp/aws-lc-sys,rw \ OUTLINES_CORE_VERSION=${OUTLINES_CORE_VERSION:-$(grep -E '^outlines_core\s*==\s*[0-9.]+' /tmp/requirements/common.txt | grep -Eo '[0-9.]+')} && \
if [ -z "${OUTLINES_CORE_VERSION}" ]; then echo "ERROR: Could not determine outlines_core version"; exit 1; fi && \
git clone https://github.com/dottxt-ai/outlines-core.git && \ git clone https://github.com/dottxt-ai/outlines-core.git && \
cd outlines-core && \ cd outlines-core && \
git checkout tags/${OUTLINES_CORE_VERSION} && \ git checkout tags/${OUTLINES_CORE_VERSION} && \
sed -i "s/version = \"0.0.0\"/version = \"${OUTLINES_CORE_VERSION}\"/" Cargo.toml && \ sed -i "s/version = \"0.0.0\"/version = \"${OUTLINES_CORE_VERSION}\"/" Cargo.toml && \
echo '[patch.crates-io]' >> Cargo.toml && \
echo 'aws-lc-sys = { path = "/tmp/aws-lc-sys" }' >> Cargo.toml && \
uv pip install maturin && \ uv pip install maturin && \
python -m maturin build --release --out dist python -m maturin build --release --out dist
...@@ -245,13 +200,15 @@ FROM python-install AS vllm-cpu ...@@ -245,13 +200,15 @@ FROM python-install AS vllm-cpu
ARG PYTHON_VERSION ARG PYTHON_VERSION
# Set correct library path for torch and numactl # Set correct library path for torch and numactl
ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:$LD_LIBRARY_PATH" ENV LD_LIBRARY_PATH="/opt/vllm/lib64/python${PYTHON_VERSION}/site-packages/torch/lib:/usr/local/lib:/opt/rh/gcc-toolset-14/root/usr/lib64:$LD_LIBRARY_PATH"
ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH" ENV C_INCLUDE_PATH="/usr/local/include:$C_INCLUDE_PATH"
ENV UV_LINK_MODE=copy ENV UV_LINK_MODE=copy
ENV CARGO_HOME=/root/.cargo ENV CARGO_HOME=/root/.cargo
ENV RUSTUP_HOME=/root/.rustup ENV RUSTUP_HOME=/root/.rustup
ENV PATH="$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1 ENV GRPC_PYTHON_BUILD_SYSTEM_OPENSSL=1
ENV PCP_DIR=/opt/rh/gcc-toolset-14/root
ENV PKG_CONFIG_PATH="/opt/rh/gcc-toolset-14/root/usr/lib64/pkgconfig:/usr/local/lib/pkgconfig/"
ENV PATH="${VIRTUAL_ENV:+${VIRTUAL_ENV}/bin}:/opt/rh/gcc-toolset-14/root/usr/bin:/usr/local/bin:$CARGO_HOME/bin:$RUSTUP_HOME/bin:$PATH"
COPY . /workspace/vllm COPY . /workspace/vllm
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
...@@ -266,7 +223,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -266,7 +223,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \ --mount=type=bind,from=pyarrow,source=/tmp/arrow/python/dist,target=/tmp/arrow-wheels \
--mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \ --mount=type=bind,from=torch-vision,source=/tmp/vision/dist,target=/tmp/vision-wheels/ \
--mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \ --mount=type=bind,from=hf-xet-builder,source=/tmp/hf-xet/dist,target=/tmp/hf-xet-wheels/ \
--mount=type=bind,from=torch,source=/tmp/pytorch/dist,target=/tmp/torch-wheels/ \
--mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \ --mount=type=bind,from=numba-builder,source=/tmp/llvmlite/dist,target=/tmp/llvmlite-wheels/ \
--mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \ --mount=type=bind,from=numba-builder,source=/tmp/numba/dist,target=/tmp/numba-wheels/ \
--mount=type=bind,from=outlines-core-builder,source=/tmp/outlines-core/dist,target=/tmp/outlines-core/dist/ \ --mount=type=bind,from=outlines-core-builder,source=/tmp/outlines-core/dist,target=/tmp/outlines-core/dist/ \
...@@ -274,7 +230,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -274,7 +230,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \ ARROW_WHL_FILE=$(ls /tmp/arrow-wheels/pyarrow-*.whl) && \
VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \ VISION_WHL_FILE=$(ls /tmp/vision-wheels/*.whl) && \
HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl) && \ HF_XET_WHL_FILE=$(ls /tmp/hf-xet-wheels/*.whl) && \
TORCH_WHL_FILE=$(ls /tmp/torch-wheels/*.whl) && \
LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \ LLVM_WHL_FILE=$(ls /tmp/llvmlite-wheels/*.whl) && \
NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \ NUMBA_WHL_FILE=$(ls /tmp/numba-wheels/*.whl) && \
OUTLINES_CORE_WHL_FILE=$(ls /tmp/outlines-core/dist/*.whl) && \ OUTLINES_CORE_WHL_FILE=$(ls /tmp/outlines-core/dist/*.whl) && \
...@@ -282,7 +237,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -282,7 +237,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
$ARROW_WHL_FILE \ $ARROW_WHL_FILE \
$VISION_WHL_FILE \ $VISION_WHL_FILE \
$HF_XET_WHL_FILE \ $HF_XET_WHL_FILE \
$TORCH_WHL_FILE \
$LLVM_WHL_FILE \ $LLVM_WHL_FILE \
$NUMBA_WHL_FILE \ $NUMBA_WHL_FILE \
$OUTLINES_CORE_WHL_FILE \ $OUTLINES_CORE_WHL_FILE \
...@@ -309,4 +263,4 @@ USER 2000 ...@@ -309,4 +263,4 @@ USER 2000
WORKDIR /home/vllm WORKDIR /home/vllm
# Set the default entrypoint # Set the default entrypoint
ENTRYPOINT ["python", "-m", "vllm.entrypoints.openai.api_server"] ENTRYPOINT ["vllm", "serve"]
FROM intel/deep-learning-essentials:2025.1.3-0-devel-ubuntu24.04 AS vllm-base FROM intel/deep-learning-essentials:2025.2.2-0-devel-ubuntu24.04 AS vllm-base
RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \ echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | tee /etc/apt/sources.list.d/oneAPI.list && \
...@@ -14,6 +14,7 @@ RUN apt clean && apt-get update -y && \ ...@@ -14,6 +14,7 @@ RUN apt clean && apt-get update -y && \
libxext6 \ libxext6 \
libgl1 \ libgl1 \
lsb-release \ lsb-release \
libaio-dev \
numactl \ numactl \
wget \ wget \
vim \ vim \
...@@ -24,10 +25,14 @@ RUN apt clean && apt-get update -y && \ ...@@ -24,10 +25,14 @@ RUN apt clean && apt-get update -y && \
RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1 RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.12 1
RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1 RUN update-alternatives --install /usr/bin/python python /usr/bin/python3.12 1
RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing RUN apt install -y libze1 libze-dev libze-intel-gpu1 intel-opencl-icd libze-intel-gpu-raytracing intel-ocloc
# This oneccl contains the BMG support which is not the case for default version of oneapi 2025.2.
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.6/intel-oneccl-2021.15.6.9_offline.sh
RUN bash intel-oneccl-2021.15.6.9_offline.sh -a --silent --eula accept && \
echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc && \
echo "source /opt/intel/oneapi/ccl/2021.15/env/vars.sh --force" >> /root/.bashrc
RUN wget https://github.com/uxlfoundation/oneCCL/releases/download/2021.15.4/intel-oneccl-2021.15.4.11_offline.sh
RUN bash intel-oneccl-2021.15.4.11_offline.sh -a --silent --eula accept && echo "source /opt/intel/oneapi/setvars.sh --force" >> /root/.bashrc
SHELL ["bash", "-c"] SHELL ["bash", "-c"]
CMD ["bash", "-c", "source /root/.bashrc && exec bash"] CMD ["bash", "-c", "source /root/.bashrc && exec bash"]
...@@ -54,7 +59,7 @@ ENV VLLM_WORKER_MULTIPROC_METHOD=spawn ...@@ -54,7 +59,7 @@ ENV VLLM_WORKER_MULTIPROC_METHOD=spawn
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \ --mount=type=bind,source=.git,target=.git \
python3 setup.py install pip install --no-build-isolation .
CMD ["/bin/bash"] CMD ["/bin/bash"]
...@@ -64,9 +69,15 @@ FROM vllm-base AS vllm-openai ...@@ -64,9 +69,15 @@ FROM vllm-base AS vllm-openai
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope pip install accelerate hf_transfer pytest pytest_asyncio lm_eval[api] modelscope
# install development dependencies (for testing)
RUN python3 -m pip install -e tests/vllm_test_utils
# install nixl from source code
ENV NIXL_VERSION=0.7.0
RUN python3 /workspace/vllm/tools/install_nixl_from_source_ubuntu.py
# remove torch bundled oneccl to avoid conflicts
RUN --mount=type=cache,target=/root/.cache/pip \ RUN --mount=type=cache,target=/root/.cache/pip \
pip uninstall oneccl oneccl-devel -y pip uninstall oneccl oneccl-devel -y
# install development dependencies (for testing) ENTRYPOINT ["vllm", "serve"]
RUN python3 -m pip install -e tests/vllm_test_utils
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
...@@ -24,14 +24,16 @@ nav: ...@@ -24,14 +24,16 @@ nav:
- deployment/integrations - deployment/integrations
- Training: training - Training: training
- Configuration: - Configuration:
- configuration/README.md
- configuration/* - configuration/*
- TPU: https://docs.vllm.ai/projects/tpu/en/latest/
- Models: - Models:
- models/supported_models.md - models/supported_models.md
- models/generative_models.md - models/generative_models.md
- models/pooling_models.md - models/pooling_models.md
- models/extensions - models/extensions
- Hardware Supported Models: models/hardware_supported_models - Hardware Supported Models:
- models/hardware_supported_models/*
- TPU: https://docs.vllm.ai/projects/tpu/en/latest/recommended_models_features/
- Features: features - Features: features
- Developer Guide: - Developer Guide:
- contributing/README.md - contributing/README.md
...@@ -46,7 +48,15 @@ nav: ...@@ -46,7 +48,15 @@ nav:
- contributing/model/multimodal.md - contributing/model/multimodal.md
- contributing/model/transcription.md - contributing/model/transcription.md
- CI: contributing/ci - CI: contributing/ci
- Design Documents: design - Design Documents:
- Plugins:
- design/*plugin*.md
- design/*
- Benchmarking:
- benchmarking/README.md
- benchmarking/cli.md
- benchmarking/sweeps.md
- benchmarking/dashboard.md
- API Reference: - API Reference:
- api/README.md - api/README.md
- api/vllm - api/vllm
......
...@@ -30,8 +30,8 @@ Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at ...@@ -30,8 +30,8 @@ Originally developed in the [Sky Computing Lab](https://sky.cs.berkeley.edu) at
Where to get started with vLLM depends on the type of user. If you are looking to: Where to get started with vLLM depends on the type of user. If you are looking to:
- Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md) - Run open-source models on vLLM, we recommend starting with the [Quickstart Guide](./getting_started/quickstart.md)
- Build applications with vLLM, we recommend starting with the [User Guide](./usage) - Build applications with vLLM, we recommend starting with the [User Guide](./usage/README.md)
- Build vLLM, we recommend starting with [Developer Guide](./contributing) - Build vLLM, we recommend starting with [Developer Guide](./contributing/README.md)
For information about the development of vLLM, see: For information about the development of vLLM, see:
...@@ -56,7 +56,7 @@ vLLM is flexible and easy to use with: ...@@ -56,7 +56,7 @@ vLLM is flexible and easy to use with:
- Tensor, pipeline, data and expert parallelism support for distributed inference - Tensor, pipeline, data and expert parallelism support for distributed inference
- Streaming outputs - Streaming outputs
- OpenAI-compatible API server - OpenAI-compatible API server
- Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend. - Support for NVIDIA GPUs, AMD CPUs and GPUs, Intel CPUs and GPUs, PowerPC CPUs, Arm CPUs, and TPU. Additionally, support for diverse hardware plugins such as Intel Gaudi, IBM Spyre and Huawei Ascend.
- Prefix caching support - Prefix caching support
- Multi-LoRA support - Multi-LoRA support
......
...@@ -20,8 +20,6 @@ API documentation for vLLM's configuration classes. ...@@ -20,8 +20,6 @@ API documentation for vLLM's configuration classes.
- [vllm.config.CompilationConfig][] - [vllm.config.CompilationConfig][]
- [vllm.config.VllmConfig][] - [vllm.config.VllmConfig][]
[](){ #offline-inference-api }
## Offline Inference ## Offline Inference
LLM Class. LLM Class.
...@@ -45,18 +43,14 @@ Engine classes for offline and online inference. ...@@ -45,18 +43,14 @@ Engine classes for offline and online inference.
Inference parameters for vLLM APIs. Inference parameters for vLLM APIs.
[](){ #sampling-params }
- [vllm.SamplingParams][] - [vllm.SamplingParams][]
- [vllm.PoolingParams][] - [vllm.PoolingParams][]
[](){ #multi-modality }
## Multi-Modality ## Multi-Modality
vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package. vLLM provides experimental support for multi-modal models through the [vllm.multimodal][] package.
Multi-modal inputs can be passed alongside text and token prompts to [supported models][supported-mm-models] Multi-modal inputs can be passed alongside text and token prompts to [supported models](../models/supported_models.md#list-of-multimodal-language-models)
via the `multi_modal_data` field in [vllm.inputs.PromptType][]. via the `multi_modal_data` field in [vllm.inputs.PromptType][].
Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md). Looking to add your own multi-modal model? Please follow the instructions listed [here](../contributing/model/multimodal.md).
......
search: search:
boost: 0.5 exclude: true
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment