Merge remote-tracking branch 'mirror/releases/v0.9.0' into v0.9.0-ori

4eabe123 · zhuwenwen · 45840cd2 · 58738772 · 4eabe123 · 4eabe123
Commit 4eabe123 authored May 28, 2025 by zhuwenwen
20 changed files
--- a/benchmarks/benchmark_dataset.py
+++ b/benchmarks/benchmark_dataset.py
@@ -35,6 +35,7 @@ from transformers import PreTrainedTokenizerBase
 from vllm.lora.request import LoRARequest
 from vllm.lora.utils import get_adapter_absolute_path
 from vllm.multimodal import MultiModalDataDict
+from vllm.multimodal.image import convert_image_mode
 from vllm.transformers_utils.tokenizer import AnyTokenizer, get_lora_tokenizer
 logger = logging.getLogger(__name__)
@@ -257,7 +258,7 @@ def process_image(image: Any) -> Mapping[str, Any]:
    if isinstance(image, dict) and "bytes" in image:
        image = Image.open(BytesIO(image["bytes"]))
    if isinstance(image, Image.Image):
-        image = image.convert("RGB")
+        image = convert_image_mode(image, "RGB")
        with io.BytesIO() as image_data:
            image.save(image_data, format="JPEG")
            image_base64 = base64.b64encode(image_data.getvalue()).decode("utf-8")

--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -189,5 +189,8 @@ if __name__ == "__main__":
    )
    parser = EngineArgs.add_cli_args(parser)
+    # V1 enables prefix caching by default which skews the latency
+    # numbers. We need to disable prefix caching by default.
+    parser.set_defaults(enable_prefix_caching=False)
    args = parser.parse_args()
    main(args)
--- a/benchmarks/benchmark_serving_structured_output.py
+++ b/benchmarks/benchmark_serving_structured_output.py
@@ -672,7 +672,7 @@ async def benchmark(
 def evaluate(ret, args):
    def _eval_correctness_json(expected, actual):
        # extract json string from string using regex
-        import re
+        import regex as re
        actual = actual.replace("\n", "").replace(" ", "").strip()
        try:
@@ -687,7 +687,7 @@ def evaluate(ret, args):
        return actual in args.choice
    def _eval_correctness_regex(expected, actual):
-        import re
+        import regex as re
        return re.match(args.regex, actual) is not None

--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -84,7 +84,10 @@ def main(
    if version == "v2":
        if current_platform.is_rocm():
            global PARTITION_SIZE
-            PARTITION_SIZE = 1024 if not args.custom_paged_attn else PARTITION_SIZE_ROCM
+            if not args.custom_paged_attn and not current_platform.is_navi():
+                PARTITION_SIZE = 1024
+            else:
+                PARTITION_SIZE = PARTITION_SIZE_ROCM
        num_partitions = (max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE
        tmp_output = torch.empty(
            size=(num_seqs, num_query_heads, num_partitions, head_size),
@@ -159,6 +162,7 @@ def main(
                        scale,
                        block_tables,
                        seq_lens,
+                        None,
                        block_size,
                        max_seq_len,
                        alibi_slopes,

--- a/benchmarks/kernels/graph_machete_bench.py
+++ b/benchmarks/kernels/graph_machete_bench.py
@@ -2,11 +2,11 @@
 import math
 import pickle
-import re
 from collections import defaultdict
 import matplotlib.pyplot as plt
 import pandas as pd
+import regex as re
 import seaborn as sns
 from torch.utils.benchmark import Measurement as TMeasurement

--- a/benchmarks/pyproject.toml
+++ b/benchmarks/pyproject.toml
@@ -6,11 +6,6 @@
 [tool.ruff]
 line-length = 88
-exclude = [
-    # External file, leaving license intact
-    "examples/other/fp8/quantizer/quantize.py",
-    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
-]
 [tool.ruff.lint.per-file-ignores]
 "vllm/third_party/**" = ["ALL"]

--- a/csrc/cpu/cpu_types_x86.hpp
+++ b/csrc/cpu/cpu_types_x86.hpp
@@ -19,6 +19,7 @@ namespace vec_op {
 #define VLLM_DISPATCH_CASE_FLOATING_TYPES_FP8(...)        \
  AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__)    \
  AT_DISPATCH_CASE(at::ScalarType::BFloat16, __VA_ARGS__) \
+  AT_DISPATCH_CASE(at::ScalarType::Half, __VA_ARGS__)     \
  AT_DISPATCH_CASE(at::ScalarType::Float8_e5m2, __VA_ARGS__)
 #define VLLM_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \

--- a/csrc/cutlass_extensions/common.hpp
+++ b/csrc/cutlass_extensions/common.hpp
@@ -15,15 +15,6 @@
                cutlassGetStatusString(error));     \
  }
-/**
- * Panic wrapper for unwinding CUDA runtime errors
- */
-#define CUDA_CHECK(status)                                        \
-  {                                                               \
-    cudaError_t error = status;                                   \
-    TORCH_CHECK(error == cudaSuccess, cudaGetErrorString(error)); \
-  }
 inline int get_cuda_max_shared_memory_per_block_opt_in(int const device) {
  int max_shared_mem_per_block_opt_in = 0;
  cudaDeviceGetAttribute(&max_shared_mem_per_block_opt_in,

--- a/csrc/mamba/causal_conv1d/causal_conv1d.cu
+++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu
@@ -13,6 +13,10 @@
 #include <cub/block/block_load.cuh>
 #include <cub/block/block_store.cuh>
+#ifdef USE_ROCM
+    namespace cub = hipcub;
+#endif
 #include "static_switch.h"
@@ -501,15 +505,9 @@ void causal_conv1d_fwd_launch(ConvParamsBase &params, cudaStream_t stream) {
        auto kernel = &causal_conv1d_fwd_kernel<Ktraits>;
        if (kSmemSize >= 48 * 1024) {
-            #ifndef USE_ROCM
-            C10_CUDA_CHECK(cudaFuncSetAttribute(
-                kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
-            #else
-            // There is a slight signature discrepancy in HIP and CUDA "FuncSetAttribute" function.
            C10_CUDA_CHECK(cudaFuncSetAttribute(
                (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
            std::cerr << "Warning (causal_conv1d fwd launch): attempting to set maxDynamicSharedMemorySize on an AMD GPU which is currently a non-op (in ROCm versions <= 6.1). This might lead to undefined behavior. \n" << std::endl;
-            #endif
        }
        kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);

--- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
+++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu
@@ -321,7 +321,7 @@ void selective_scan_fwd_launch(SSMParamsBase &params, cudaStream_t stream) {
            auto kernel = &selective_scan_fwd_kernel<Ktraits>;
            if (kSmemSize >= 48 * 1024) {
                C10_CUDA_CHECK(cudaFuncSetAttribute(
-                    kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
+                    (void *) kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, kSmemSize));
            }
            kernel<<<grid, Ktraits::kNThreads, kSmemSize, stream>>>(params);
            C10_CUDA_KERNEL_LAUNCH_CHECK();

--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -28,4 +28,6 @@ torch::Tensor moe_wna16_gemm(torch::Tensor input, torch::Tensor output,
                             torch::Tensor num_tokens_post_pad, int64_t top_k,
                             int64_t BLOCK_SIZE_M, int64_t BLOCK_SIZE_N,
                             int64_t BLOCK_SIZE_K, int64_t bit);
 #endif
\ No newline at end of file
+bool moe_permute_unpermute_supported();
\ No newline at end of file
--- a/csrc/moe/moe_permute_unpermute_op.cu
+++ b/csrc/moe/moe_permute_unpermute_op.cu
@@ -5,6 +5,9 @@
 #include "permute_unpermute_kernels/dispatch.h"
 #include "core/registration.h"
+// moe_permute kernels require at least CUDA 12.0
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
 void moe_permute(
    const torch::Tensor& input,                      // [n_token, hidden]
    const torch::Tensor& topk_weights,               //[n_token, topk]
@@ -127,7 +130,45 @@ void moe_unpermute(
  });
 }
+#else
+void moe_permute(const torch::Tensor& input, const torch::Tensor& topk_weights,
+                 torch::Tensor& topk_ids,
+                 const torch::Tensor& token_expert_indicies,
+                 const std::optional<torch::Tensor>& expert_map,
+                 int64_t n_expert, int64_t n_local_expert, int64_t topk,
+                 const std::optional<int64_t>& align_block_size,
+                 torch::Tensor& permuted_input,
+                 torch::Tensor& expert_first_token_offset,
+                 torch::Tensor& src_row_id2dst_row_id_map,
+                 torch::Tensor& m_indices) {
+  TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
+}
+void moe_unpermute(const torch::Tensor& input,
+                   const torch::Tensor& topk_weights, torch::Tensor& topk_ids,
+                   const torch::Tensor& token_expert_indicies,
+                   const std::optional<torch::Tensor>& expert_map,
+                   int64_t n_expert, int64_t n_local_expert, int64_t topk,
+                   const std::optional<int64_t>& align_block_size,
+                   torch::Tensor& permuted_input,
+                   torch::Tensor& expert_first_token_offset,
+                   torch::Tensor& src_row_id2dst_row_id_map,
+                   torch::Tensor& m_indices) {
+  TORCH_CHECK(false, "moe_unpermute is not supported on CUDA < 12.0");
+}
+#endif
+bool moe_permute_unpermute_supported() {
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
+  return true;
+#else
+  return false;
+#endif
+}
 TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
  m.impl("moe_permute", &moe_permute);
  m.impl("moe_unpermute", &moe_unpermute);
 }
\ No newline at end of file
--- a/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
+++ b/csrc/moe/permute_unpermute_kernels/moe_permute_unpermute_kernel.cu
 #include "moe_permute_unpermute_kernel.h"
+// moe_permute kernels require at least CUDA 12.0
+#if defined(CUDA_VERSION) && (CUDA_VERSION >= 12000)
 // CubKeyValueSorter definition begin
 CubKeyValueSorter::CubKeyValueSorter()
    : num_experts_(0), num_bits_(sizeof(int) * 8) {}
@@ -131,9 +134,6 @@ __global__ void preprocessTopkIdKernel(int* topk_id_ptr, int size,
                                       int num_experts) {
  auto tidx = threadIdx.x;
  auto bidx = blockIdx.x;
-  auto lidx = tidx & 31;
-  auto widx = tidx >> 5;
-  auto warp_count = (blockDim.x + 31) >> 5;
  auto offset = bidx * blockDim.x;
  auto bound = min(offset + blockDim.x, size);
  extern __shared__ int smem_expert_map[];
@@ -226,4 +226,6 @@ void getMIndices(int64_t* expert_first_token_offset,
        expert_first_token_offset, align_expert_first_token_offset, m_indices,
        num_local_expert, align_block_size);
  }
 }
\ No newline at end of file
+#endif
--- a/csrc/moe/torch_bindings.cpp
+++ b/csrc/moe/torch_bindings.cpp
@@ -10,7 +10,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
  // Calculate the result of moe by summing up the partial results
  // from all selected experts.
-  m.def("moe_sum(Tensor! input, Tensor output) -> ()");
+  m.def("moe_sum(Tensor input, Tensor! output) -> ()");
  m.impl("moe_sum", torch::kCUDA, &moe_sum);
  // Aligning the number of tokens to be processed by each expert such
@@ -77,7 +77,9 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
      "Tensor topk_ids,Tensor src_row_id2dst_row_id_map, Tensor "
      "expert_first_token_offset, int n_expert, int n_local_expert,int "
      "topk, Tensor! hidden_states)->()");
-  // conditionally compiled so impl registration is in source file
+  m.def("moe_permute_unpermute_supported() -> bool");
+  m.impl("moe_permute_unpermute_supported", &moe_permute_unpermute_supported);
 #endif
 }

--- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu
@@ -123,7 +123,7 @@ bool cutlass_scaled_mm_supports_block_fp8(int64_t cuda_device_capability) {
 }
 bool cutlass_group_gemm_supported(int64_t cuda_device_capability) {
-  // CUTLASS groped FP8 kernels need at least CUDA 12.3
+  // CUTLASS grouped FP8 kernels need at least CUDA 12.3
  // and SM90 (Hopper)
 #if defined CUDA_VERSION

--- a/csrc/rocm/attention.cu
+++ b/csrc/rocm/attention.cu
--- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
+++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cuh
@@ -8,6 +8,8 @@
 #include <ATen/cuda/CUDAContext.h>
+#include "cuda_utils.h"
 #include "cutlass/cutlass.h"
 #include "cutlass/gemm/device/gemm_universal_adapter.h"
@@ -95,9 +97,9 @@ struct cutlass_sparse_3x_gemm {
  // clang-format off
  using CollectiveMainloop =
      typename cutlass::gemm::collective::CollectiveBuilder<
-          cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp, 
+          cutlass::arch::Sm90, cutlass::arch::OpClassSparseTensorOp,
-          ElementAB, cutlass::layout::RowMajor, AlignmentAB, 
+          ElementAB, cutlass::layout::RowMajor, AlignmentAB,
-          ElementAB, cutlass::layout::ColumnMajor, AlignmentAB, 
+          ElementAB, cutlass::layout::ColumnMajor, AlignmentAB,
          ElementAcc, TileShape, ClusterShape,
          Stages,
          KernelSchedule>::CollectiveOp;

--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -486,41 +486,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "                   Tensor page_table, float scale) -> ()");
  ops.impl("cutlass_mla_decode", torch::kCUDA, &cutlass_mla_decode);
-  // Mamba selective scan kernel
-  ops.def(
-      "selective_scan_fwd(Tensor! u, Tensor! delta,"
-      "Tensor! A, Tensor! B, Tensor! C,"
-      "Tensor? D_, Tensor!? z_, Tensor? delta_bias_,"
-      "bool delta_softplus,"
-      "Tensor? query_start_loc,"
-      "Tensor? cache_indices,"
-      "Tensor? has_initial_state,"
-      "Tensor! ssm_states,"
-      "int pad_slot_id) -> ()");
-  ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
-  ops.def(
-      "causal_conv1d_update(Tensor! x,"
-      "Tensor! conv_state,"
-      "Tensor! weight,"
-      "Tensor? bias_,"
-      "bool silu_activation,"
-      "Tensor? cache_seqlens_,"
-      "Tensor? conv_state_indices,"
-      "int pad_slot_id) -> ()");
-  ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);
-  ops.def(
-      "causal_conv1d_fwd(Tensor! x, Tensor! weight,"
-      "Tensor? bias_,"
-      "Tensor!? conv_states,"
-      "Tensor? query_start_loc,"
-      "Tensor? cache_indices,"
-      "Tensor? has_initial_state,"
-      "bool silu_activation,"
-      "int pad_slot_id) -> ()");
-  ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
  // Compute NVFP4 block quantized tensor.
  ops.def(
      "scaled_fp4_quant(Tensor! output, Tensor input,"
@@ -588,6 +553,41 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
  ops.impl("dynamic_scaled_int8_quant", torch::kCUDA,
           &dynamic_scaled_int8_quant);
+  // Mamba selective scan kernel
+  ops.def(
+      "selective_scan_fwd(Tensor! u, Tensor! delta,"
+      "Tensor! A, Tensor! B, Tensor! C,"
+      "Tensor? D_, Tensor!? z_, Tensor? delta_bias_,"
+      "bool delta_softplus,"
+      "Tensor? query_start_loc,"
+      "Tensor? cache_indices,"
+      "Tensor? has_initial_state,"
+      "Tensor! ssm_states,"
+      "int pad_slot_id) -> ()");
+  ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
+  ops.def(
+      "causal_conv1d_update(Tensor! x,"
+      "Tensor! conv_state,"
+      "Tensor! weight,"
+      "Tensor? bias_,"
+      "bool silu_activation,"
+      "Tensor? cache_seqlens_,"
+      "Tensor? conv_state_indices,"
+      "int pad_slot_id) -> ()");
+  ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);
+  ops.def(
+      "causal_conv1d_fwd(Tensor! x, Tensor! weight,"
+      "Tensor? bias_,"
+      "Tensor!? conv_states,"
+      "Tensor? query_start_loc,"
+      "Tensor? cache_indices,"
+      "Tensor? has_initial_state,"
+      "bool silu_activation,"
+      "int pad_slot_id) -> ()");
+  ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
 #ifndef USE_ROCM
  // reorder weight for AllSpark Ampere W8A16 Fused Gemm kernel
  ops.def(

--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -2,8 +2,8 @@
 # to run the OpenAI compatible server.
 # Please update any changes made here to
-# docs/source/contributing/dockerfile/dockerfile.md and
+# docs/contributing/dockerfile/dockerfile.md and
-# docs/source/assets/contributing/dockerfile-stages-dependency.png
+# docs/assets/contributing/dockerfile-stages-dependency.png
 ARG CUDA_VERSION=12.8.1
 #################### BASE BUILD IMAGE ####################
@@ -189,6 +189,8 @@ WORKDIR /vllm-workspace
 ENV DEBIAN_FRONTEND=noninteractive
 ARG TARGETPLATFORM
+SHELL ["/bin/bash", "-c"]
 RUN PYTHON_VERSION_STR=$(echo ${PYTHON_VERSION} | sed 's/\.//g') && \
    echo "export PYTHON_VERSION_STR=${PYTHON_VERSION_STR}" >> /etc/environment
@@ -255,15 +257,17 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist
 RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
 if [ "$TARGETPLATFORM" != "linux/arm64" ]; then \
-    # uv pip install --system https://github.com/flashinfer-ai/flashinfer/releases/download/v0.2.4/flashinfer_python-0.2.4+cu124torch2.6-cp38-abi3-linux_x86_64.whl ; \
+    # FlashInfer alreary has a wheel for PyTorch 2.7.0 and CUDA 12.8. This is enough for CI use
-    # TESTING: install FlashInfer from source to test 2.7.0 final RC
    if [[ "$CUDA_VERSION" == 12.8* ]]; then \
-        export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0 10.0+PTX'; \
+        uv pip install --system https://download.pytorch.org/whl/cu128/flashinfer/flashinfer_python-0.2.5%2Bcu128torch2.7-cp38-abi3-linux_x86_64.whl; \
    else \
        export TORCH_CUDA_ARCH_LIST='7.5 8.0 8.9 9.0+PTX'; \
-    fi && \
+        CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
-    export FLASHINFER_ENABLE_AOT=1; \
+        if [ "$CUDA_MAJOR" -lt 12 ]; then \
-    uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
+            export FLASHINFER_ENABLE_SM90=0; \
+        fi; \
+        uv pip install --system --no-build-isolation "git+https://github.com/flashinfer-ai/flashinfer@21ea1d2545f74782b91eb8c08fd503ac4c0743fc" ; \
+    fi \
 fi
 COPY examples examples
 COPY benchmarks benchmarks
@@ -273,7 +277,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 . /etc/environment && \
 uv pip list
-# Although we build Flashinfer with AOT mode, there's still
+# Even when we build Flashinfer with AOT mode, there's still
 # some issues w.r.t. JIT compilation. Therefore we need to
 # install build dependencies for JIT compilation.
 # TODO: Remove this once FlashInfer AOT wheel is fixed
@@ -301,8 +305,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system --no-build-isolation "git+https://github.com/state-spaces/mamba@v2.2.4"
 # install development dependencies (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
+RUN --mount=type=cache,target=/root/.cache/uv \   
-    uv pip install --system -r requirements/dev.txt
+    CUDA_MAJOR="${CUDA_VERSION%%.*}"; \
+    if [ "$CUDA_MAJOR" -ge 12 ]; then \
+        uv pip install --system -r requirements/dev.txt; \
+    fi
 # install development dependencies (for testing)
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -321,7 +328,9 @@ COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
 # will not be imported by other tests
 RUN mkdir test_docs
 RUN mv docs test_docs/
+RUN cp -r examples test_docs/
 RUN mv vllm test_docs/
+RUN mv mkdocs.yaml test_docs/
 #################### TEST IMAGE ####################
 #################### OPENAI API SERVER ####################

--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -51,9 +51,6 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --upgrade pip && \
    uv pip install -r requirements/cpu.txt
-RUN --mount=type=cache,target=/root/.cache/uv \
-    uv pip install intel-openmp==2024.2.1 intel_extension_for_pytorch==2.6.0
 ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/opt/venv/lib/libiomp5.so:$LD_PRELOAD"
 RUN echo 'ulimit -c 0' >> ~/.bashrc