Commit 4eabe123 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

parents 45840cd2 58738772
...@@ -35,6 +35,7 @@ from transformers import PreTrainedTokenizerBase ...@@ -35,6 +35,7 @@ from transformers import PreTrainedTokenizerBase
from vllm.lora.request import LoRARequest from vllm.lora.request import LoRARequest
from vllm.lora.utils import get_adapter_absolute_path from vllm.lora.utils import get_adapter_absolute_path
from vllm.multimodal import MultiModalDataDict from vllm.multimodal import MultiModalDataDict
from vllm.multimodal.image import convert_image_mode
from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -257,7 +258,7 @@ def process_image(image: Any) -> Mapping[str, Any]: ...@@ -257,7 +258,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
if isinstance(image, dict) and "bytes" in image: if isinstance(image, dict) and "bytes" in image:
image = Image.open(BytesIO(image["bytes"])) image = Image.open(BytesIO(image["bytes"]))
if isinstance(image, Image.Image): if isinstance(image, Image.Image):
image = image.convert("RGB") image = convert_image_mode(image, "RGB")
with io.BytesIO() as image_data: with io.BytesIO() as image_data:
image.save(image_data, format="JPEG") image.save(image_data, format="JPEG")
image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8") image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")
......
...@@ -189,5 +189,8 @@ if __name__ == "__main__": ...@@ -189,5 +189,8 @@ if __name__ == "__main__":
) )
parser = EngineArgs.add_cli_args(parser) parser = EngineArgs.add_cli_args(parser)
# V1 enables prefix caching by default which skews the latency
# numbers. We need to disable prefix caching by default.
parser.set_defaults(enable_prefix_caching=False)
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
...@@ -672,7 +672,7 @@ async def benchmark( ...@@ -672,7 +672,7 @@ async def benchmark(
def evaluate(ret, args): def evaluate(ret, args):
def _eval_correctness_json(expected, actual): def _eval_correctness_json(expected, actual):
# extract json string from string using regex # extract json string from string using regex
import re import regex as re
actual = actual.replace("\n", "").replace(" ", "").strip() actual = actual.replace("\n", "").replace(" ", "").strip()
try: try:
...@@ -687,7 +687,7 @@ def evaluate(ret, args): ...@@ -687,7 +687,7 @@ def evaluate(ret, args):
return actual in args.choice return actual in args.choice
def _eval_correctness_regex(expected, actual): def _eval_correctness_regex(expected, actual):
import re import regex as re
return re.match(args.regex, actual) is not None return re.match(args.regex, actual) is not None
......
...@@ -84,7 +84,10 @@ def main( ...@@ -84,7 +84,10 @@ def main(
if version == "v2": if version == "v2":
if current_platform.is_rocm(): if current_platform.is_rocm():
global PARTITION_SIZE global PARTITION_SIZE
PARTITION_SIZE = 1024 if not args.custom_paged_attn else PARTITION_SIZE_ROCM if not args.custom_paged_attn and not current_platform.is_navi():
PARTITION_SIZE = 1024
else:
PARTITION_SIZE = PARTITION_SIZE_ROCM
num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
tmp_output = torch.empty( tmp_output = torch.empty(
size=(num_seqs, num_query_heads, num_partitions, head_size), size=(num_seqs, num_query_heads, num_partitions, head_size),
...@@ -159,6 +162,7 @@ def main( ...@@ -159,6 +162,7 @@ def main(
scale, scale,
block_tables, block_tables,
seq_lens, seq_lens,
None,
block_size, block_size,
max_seq_len, max_seq_len,
alibi_slopes, alibi_slopes,
......
...@@ -2,11 +2,11 @@ ...@@ -2,11 +2,11 @@
import math import math
import pickle import pickle
import re
from collections import defaultdict from collections import defaultdict
import matplotlib.pyplot as plt import matplotlib.pyplot as plt
import pandas as pd import pandas as pd
import regex as re
import seaborn as sns import seaborn as sns
from torch.utils.benchmark import Measurement as TMeasurement from torch.utils.benchmark import Measurement as TMeasurement
......
...@@ -6,11 +6,6 @@ ...@@ -6,11 +6,6 @@
[tool.ruff] [tool.ruff]
line-length = 88 line-length = 88
exclude = [
# External file, leaving license intact
"examples/other/fp8/quantizer/quantize.py",
"vllm/vllm_flash_attn/flash_attn_interface.pyi"
]
[tool.ruff.lint.per-file-ignores] [tool.ruff.lint.per-file-ignores]
"vllm/third_party/**" = ["ALL"] "vllm/third_party/**" = ["ALL"]
......
...@@ -19,6 +19,7 @@ namespace vec_op { ...@@ -19,6 +19,7 @@ namespace vec_op {
#define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...) \ #define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...) \
AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \ AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__) \
AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__) AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)
#define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \
......
...@@ -15,15 +15,6 @@ ...@@ -15,15 +15,6 @@
cutlassGetStatusString(error)); \ cutlassGetStatusString(error)); \
} }
/**
* Panic wrapper for unwinding CUDA runtime errors
*/
#define CUDA_CHECK(status) \
{ \
cudaError_t error = status; \
TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \
}
inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) { inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
int max_shared_mem_per_block_opt_in = 0; int max_shared_mem_per_block_opt_in = 0;
cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in, cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,
......
...@@ -13,6 +13,10 @@ ...@@ -13,6 +13,10 @@
#include <cub/block/block_load.cuh> #include <cub/block/block_load.cuh>
#include <cub/block/block_store.cuh> #include <cub/block/block_store.cuh>
#ifdef USE_ROCM
namespace cub = hipcub;
#endif
#include "static_switch.h" #include "static_switch.h"
...@@ -501,15 +505,9 @@ void causal_conv1d_fwd_launch(ConvParamsBase &params, cudaStream_t stream) { ...@@ -501,15 +505,9 @@ void causal_conv1d_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
auto kernel = &causal_conv1d_fwd_kernel<Ktraits>; auto kernel = &causal_conv1d_fwd_kernel<Ktraits>;
if (kSmemSize >= 48 * 1024) { if (kSmemSize >= 48 * 1024) {
#ifndef USE_ROCM
C10_CUDA_CHECK(cudaFuncSetAttribute(
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
#else
// There is a slight signature discrepancy in HIP and CUDA "FuncSetAttribute" function.
C10_CUDA_CHECK(cudaFuncSetAttribute( C10_CUDA_CHECK(cudaFuncSetAttribute(
(void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
std::cerr << "Warning (causal_conv1d fwd launch): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl; std::cerr << "Warning (causal_conv1d fwd launch): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl;
#endif
} }
kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params); kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
......
...@@ -321,7 +321,7 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) { ...@@ -321,7 +321,7 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
auto kernel = &selective_scan_fwd_kernel<Ktraits>; auto kernel = &selective_scan_fwd_kernel<Ktraits>;
if (kSmemSize >= 48 * 1024) { if (kSmemSize >= 48 * 1024) {
C10_CUDA_CHECK(cudaFuncSetAttribute( C10_CUDA_CHECK(cudaFuncSetAttribute(
kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize)); (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
} }
kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params); kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
C10_CUDA_KERNEL_LAUNCH_CHECK(); C10_CUDA_KERNEL_LAUNCH_CHECK();
......
...@@ -28,4 +28,6 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output, ...@@ -28,4 +28,6 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
torch::Tensor num_tokens_post_pad, int64_t top_k, torch::Tensor num_tokens_post_pad, int64_t top_k,
int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N, int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
int64_t BLOCK_SIZE_K, int64_t bit); int64_t BLOCK_SIZE_K, int64_t bit);
#endif #endif
\ No newline at end of file
bool moe_permute_unpermute_supported();
\ No newline at end of file
...@@ -5,6 +5,9 @@ ...@@ -5,6 +5,9 @@
#include "permute_unpermute_kernels/dispatch.h" #include "permute_unpermute_kernels/dispatch.h"
#include "core/registration.h" #include "core/registration.h"
// moe_permute kernels require at least CUDA 12.0
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
void moe_permute( void moe_permute(
const torch::Tensor& input, // [n_token, hidden] const torch::Tensor& input, // [n_token, hidden]
const torch::Tensor& topk_weights, //[n_token, topk] const torch::Tensor& topk_weights, //[n_token, topk]
...@@ -127,7 +130,45 @@ void moe_unpermute( ...@@ -127,7 +130,45 @@ void moe_unpermute(
}); });
} }
#else
void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
torch::Tensor& topk_ids,
const torch::Tensor& token_expert_indicies,
const std::optional<torch::Tensor>& expert_map,
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
torch::Tensor& permuted_input,
torch::Tensor& expert_first_token_offset,
torch::Tensor& src_row_id2dst_row_id_map,
torch::Tensor& m_indices) {
TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
}
void moe_unpermute(const torch::Tensor& input,
const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
const torch::Tensor& token_expert_indicies,
const std::optional<torch::Tensor>& expert_map,
int64_t n_expert, int64_t n_local_expert, int64_t topk,
const std::optional<int64_t>& align_block_size,
torch::Tensor& permuted_input,
torch::Tensor& expert_first_token_offset,
torch::Tensor& src_row_id2dst_row_id_map,
torch::Tensor& m_indices) {
TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
}
#endif
bool moe_permute_unpermute_supported() {
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
return true;
#else
return false;
#endif
}
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) { TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("moe_permute", &moe_permute); m.impl("moe_permute", &moe_permute);
m.impl("moe_unpermute", &moe_unpermute); m.impl("moe_unpermute", &moe_unpermute);
} }
\ No newline at end of file
#include "moe_permute_unpermute_kernel.h" #include "moe_permute_unpermute_kernel.h"
// moe_permute kernels require at least CUDA 12.0
#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
// CubKeyValueSorter definition begin // CubKeyValueSorter definition begin
CubKeyValueSorter::CubKeyValueSorter() CubKeyValueSorter::CubKeyValueSorter()
: num_experts_(0), num_bits_(sizeof(int) * 8) {} : num_experts_(0), num_bits_(sizeof(int) * 8) {}
...@@ -131,9 +134,6 @@ __global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size, ...@@ -131,9 +134,6 @@ __global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size,
int num_experts) { int num_experts) {
auto tidx = threadIdx.x; auto tidx = threadIdx.x;
auto bidx = blockIdx.x; auto bidx = blockIdx.x;
auto lidx = tidx & 31;
auto widx = tidx >> 5;
auto warp_count = (blockDim.x + 31) >> 5;
auto offset = bidx * blockDim.x; auto offset = bidx * blockDim.x;
auto bound = min(offset + blockDim.x, size); auto bound = min(offset + blockDim.x, size);
extern __shared__ int smem_expert_map[]; extern __shared__ int smem_expert_map[];
...@@ -226,4 +226,6 @@ void getMIndices(int64_t* expert_first_token_offset, ...@@ -226,4 +226,6 @@ void getMIndices(int64_t* expert_first_token_offset,
expert_first_token_offset, align_expert_first_token_offset, m_indices, expert_first_token_offset, align_expert_first_token_offset, m_indices,
num_local_expert, align_block_size); num_local_expert, align_block_size);
} }
} }
\ No newline at end of file
#endif
...@@ -10,7 +10,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { ...@@ -10,7 +10,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
// Calculate the result of moe by summing up the partial results // Calculate the result of moe by summing up the partial results
// from all selected experts. // from all selected experts.
m.def("moe_sum(Tensor! input, Tensor output) -> ()"); m.def("moe_sum(Tensor input, Tensor! output) -> ()");
m.impl("moe_sum", torch::kCUDA, &moe_sum); m.impl("moe_sum", torch::kCUDA, &moe_sum);
// Aligning the number of tokens to be processed by each expert such // Aligning the number of tokens to be processed by each expert such
...@@ -77,7 +77,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { ...@@ -77,7 +77,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
"Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor " "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
"expert_first_token_offset, int n_expert, int n_local_expert,int " "expert_first_token_offset, int n_expert, int n_local_expert,int "
"topk, Tensor! hidden_states)->()"); "topk, Tensor! hidden_states)->()");
// conditionally compiled so impl registration is in source file
m.def("moe_permute_unpermute_supported() -> bool");
m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
#endif #endif
} }
......
...@@ -123,7 +123,7 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) { ...@@ -123,7 +123,7 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) {
} }
bool cutlass_group_gemm_supported(int64_t cuda_device_capability) { bool cutlass_group_gemm_supported(int64_t cuda_device_capability) {
// CUTLASS groped FP8 kernels need at least CUDA 12.3 // CUTLASS grouped FP8 kernels need at least CUDA 12.3
// and SM90 (Hopper) // and SM90 (Hopper)
#if defined CUDA_VERSION #if defined CUDA_VERSION
......
This diff is collapsed.
...@@ -8,6 +8,8 @@ ...@@ -8,6 +8,8 @@
#include <ATen/cuda/CUDAContext.h> #include <ATen/cuda/CUDAContext.h>
#include "cuda_utils.h"
#include "cutlass/cutlass.h" #include "cutlass/cutlass.h"
#include "cutlass/gemm/device/gemm_universal_adapter.h" #include "cutlass/gemm/device/gemm_universal_adapter.h"
...@@ -95,9 +97,9 @@ struct cutlass_sparse_3x_gemm { ...@@ -95,9 +97,9 @@ struct cutlass_sparse_3x_gemm {
// clang-format off // clang-format off
using CollectiveMainloop = using CollectiveMainloop =
typename cutlass::gemm::collective::CollectiveBuilder< typename cutlass::gemm::collective::CollectiveBuilder<
cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp, cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
ElementAB, cutlass::layout::RowMajor, AlignmentAB, ElementAB, cutlass::layout::RowMajor, AlignmentAB,
ElementAB, cutlass::layout::ColumnMajor, AlignmentAB, ElementAB, cutlass::layout::ColumnMajor, AlignmentAB,
ElementAcc, TileShape, ClusterShape, ElementAcc, TileShape, ClusterShape,
Stages, Stages,
KernelSchedule>::CollectiveOp; KernelSchedule>::CollectiveOp;
......
...@@ -486,41 +486,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -486,41 +486,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor page_table, float scale) -> ()"); " Tensor page_table, float scale) -> ()");
ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode); ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);
// Mamba selective scan kernel
ops.def(
"selective_scan_fwd(Tensor! u, Tensor! delta,"
"Tensor! A, Tensor! B, Tensor! C,"
"Tensor? D_, Tensor!? z_, Tensor? delta_bias_,"
"bool delta_softplus,"
"Tensor? query_start_loc,"
"Tensor? cache_indices,"
"Tensor? has_initial_state,"
"Tensor! ssm_states,"
"int pad_slot_id) -> ()");
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
ops.def(
"causal_conv1d_update(Tensor! x,"
"Tensor! conv_state,"
"Tensor! weight,"
"Tensor? bias_,"
"bool silu_activation,"
"Tensor? cache_seqlens_,"
"Tensor? conv_state_indices,"
"int pad_slot_id) -> ()");
ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);
ops.def(
"causal_conv1d_fwd(Tensor! x, Tensor! weight,"
"Tensor? bias_,"
"Tensor!? conv_states,"
"Tensor? query_start_loc,"
"Tensor? cache_indices,"
"Tensor? has_initial_state,"
"bool silu_activation,"
"int pad_slot_id) -> ()");
ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
// Compute NVFP4 block quantized tensor. // Compute NVFP4 block quantized tensor.
ops.def( ops.def(
"scaled_fp4_quant(Tensor! output, Tensor input," "scaled_fp4_quant(Tensor! output, Tensor input,"
...@@ -588,6 +553,41 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -588,6 +553,41 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("dynamic_scaled_int8_quant", torch::kCUDA, ops.impl("dynamic_scaled_int8_quant", torch::kCUDA,
&dynamic_scaled_int8_quant); &dynamic_scaled_int8_quant);
// Mamba selective scan kernel
ops.def(
"selective_scan_fwd(Tensor! u, Tensor! delta,"
"Tensor! A, Tensor! B, Tensor! C,"
"Tensor? D_, Tensor!? z_, Tensor? delta_bias_,"
"bool delta_softplus,"
"Tensor? query_start_loc,"
"Tensor? cache_indices,"
"Tensor? has_initial_state,"
"Tensor! ssm_states,"
"int pad_slot_id) -> ()");
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
ops.def(
"causal_conv1d_update(Tensor! x,"
"Tensor! conv_state,"
"Tensor! weight,"
"Tensor? bias_,"
"bool silu_activation,"
"Tensor? cache_seqlens_,"
"Tensor? conv_state_indices,"
"int pad_slot_id) -> ()");
ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);
ops.def(
"causal_conv1d_fwd(Tensor! x, Tensor! weight,"
"Tensor? bias_,"
"Tensor!? conv_states,"
"Tensor? query_start_loc,"
"Tensor? cache_indices,"
"Tensor? has_initial_state,"
"bool silu_activation,"
"int pad_slot_id) -> ()");
ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
#ifndef USE_ROCM #ifndef USE_ROCM
// reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel // reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
ops.def( ops.def(
......
...@@ -2,8 +2,8 @@ ...@@ -2,8 +2,8 @@
# to run the OpenAI compatible server. # to run the OpenAI compatible server.
# Please update any changes made here to # Please update any changes made here to
# docs/source/contributing/dockerfile/dockerfile.md and # docs/contributing/dockerfile/dockerfile.md and
# docs/source/assets/contributing/dockerfile-stages-dependency.png # docs/assets/contributing/dockerfile-stages-dependency.png
ARG CUDA_VERSION=12.8.1 ARG CUDA_VERSION=12.8.1
#################### BASE BUILD IMAGE #################### #################### BASE BUILD IMAGE ####################
...@@ -189,6 +189,8 @@ WORKDIR /vllm-workspace ...@@ -189,6 +189,8 @@ WORKDIR /vllm-workspace
ENV DEBIAN_FRONTEND=noninteractive ENV DEBIAN_FRONTEND=noninteractive
ARG TARGETPLATFORM ARG TARGETPLATFORM
SHELL ["/bin/bash", "-c"]
RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \ RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
...@@ -255,15 +257,17 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist ...@@ -255,15 +257,17 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \ . /etc/environment && \
if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \ if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
# uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.4/flashinfer_python-0.2.4+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \ # FlashInfer alreary has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
# TESTING: install FlashInfer from source to test 2.7.0 final RC
if [[ "$CUDA_VERSION" == 12.8* ]]; then \ if [[ "$CUDA_VERSION" == 12.8* ]]; then \
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'; \ uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl; \
else \ else \
export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \ export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \
fi && \ CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
export FLASHINFER_ENABLE_AOT=1; \ if [ "$CUDA_MAJOR" -lt 12 ]; then \
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \ export FLASHINFER_ENABLE_SM90=0; \
fi; \
uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
fi \
fi fi
COPY examples examples COPY examples examples
COPY benchmarks benchmarks COPY benchmarks benchmarks
...@@ -273,7 +277,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -273,7 +277,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
. /etc/environment && \ . /etc/environment && \
uv pip list uv pip list
# Although we build Flashinfer with AOT mode, there's still # Even when we build Flashinfer with AOT mode, there's still
# some issues w.r.t. JIT compilation. Therefore we need to # some issues w.r.t. JIT compilation. Therefore we need to
# install build dependencies for JIT compilation. # install build dependencies for JIT compilation.
# TODO: Remove this once FlashInfer AOT wheel is fixed # TODO: Remove this once FlashInfer AOT wheel is fixed
...@@ -301,8 +305,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -301,8 +305,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4" uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
# install development dependencies (for testing) # install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/dev.txt CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
if [ "$CUDA_MAJOR" -ge 12 ]; then \
uv pip install --system -r requirements/dev.txt; \
fi
# install development dependencies (for testing) # install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
...@@ -321,7 +328,9 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1 ...@@ -321,7 +328,9 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
# will not be imported by other tests # will not be imported by other tests
RUN mkdir test_docs RUN mkdir test_docs
RUN mv docs test_docs/ RUN mv docs test_docs/
RUN cp -r examples test_docs/
RUN mv vllm test_docs/ RUN mv vllm test_docs/
RUN mv mkdocs.yaml test_docs/
#################### TEST IMAGE #################### #################### TEST IMAGE ####################
#################### OPENAI API SERVER #################### #################### OPENAI API SERVER ####################
......
...@@ -51,9 +51,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -51,9 +51,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --upgrade pip && \ uv pip install --upgrade pip && \
uv pip install -r requirements/cpu.txt uv pip install -r requirements/cpu.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0
ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD" ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
RUN echo 'ulimit -c 0' >> ~/.bashrc RUN echo 'ulimit -c 0' >> ~/.bashrc
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment