Merge tag 'v0.19.1' into v0.19.0

fc67613a · zhuwenwen · 31aec25b · b1388b1f · fc67613a · fc67613a
Commit fc67613a authored Apr 18, 2026 by zhuwenwen
20 changed files
--- a/.buildkite/scripts/hardware_ci/run-cpu-test.sh
+++ b/.buildkite/scripts/hardware_ci/run-cpu-test.sh
@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
 docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
 # Run the image, setting --shm-size=4g for tensor parallel.
-docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \
+docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 -e VLLM_CPU_ATTN_SPLIT_KV=0 --shm-size=4g "$IMAGE_NAME" \
        timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
--- a/.buildkite/test_areas/kernels.yaml
+++ b/.buildkite/test_areas/kernels.yaml
@@ -10,7 +10,20 @@ steps:
  - tests/kernels/test_top_k_per_row.py
  - tests/kernels/test_concat_mla_q.py
  commands:
-    - pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
+    - pytest -v -s kernels/core --ignore=kernels/core/test_minimax_reduce_rms.py kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
+- label: Kernels MiniMax Reduce RMS Test (2 GPUs)
+  timeout_in_minutes: 15
+  num_devices: 2
+  device: h100
+  source_file_dependencies:
+  - csrc/minimax_reduce_rms_kernel.cu
+  - csrc/minimax_reduce_rms_kernel.h
+  - vllm/model_executor/layers/mamba/linear_attn.py
+  - vllm/model_executor/layers/mamba/lamport_workspace.py
+  - tests/kernels/core/test_minimax_reduce_rms.py
+  commands:
+    - pytest -v -s kernels/core/test_minimax_reduce_rms.py
 - label: Kernels Attention Test %N
  timeout_in_minutes: 35

--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -69,3 +69,18 @@ steps:
    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
    # Whisper needs spawn method to avoid deadlock
    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
+- label: Transformers Backward Compatibility Models Test
+  working_dir: "/vllm-workspace/"
+  optional: true
+  soft_fail: true
+  commands:
+    - pip install transformers==4.57.5
+    - pytest -v -s tests/models/test_initialization.py
+    - pytest -v -s tests/models/test_transformers.py
+    - pytest -v -s tests/models/multimodal/processing/
+    - pytest -v -s tests/models/multimodal/test_mapping.py
+    - python3 examples/offline_inference/basic/chat.py
+    - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
+    # Whisper needs spawn method to avoid deadlock
+    - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -306,6 +306,8 @@ set(VLLM_EXT_SRC
  "csrc/torch_bindings.cpp")
 if(VLLM_GPU_LANG STREQUAL "CUDA")
+  list(APPEND VLLM_EXT_SRC "csrc/minimax_reduce_rms_kernel.cu")
  SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
  # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.

--- a/csrc/minimax_reduce_rms_kernel.cu
+++ b/csrc/minimax_reduce_rms_kernel.cu
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cooperative_groups.h>
+#include <cuda_runtime.h>
+#include <torch/cuda.h>
+#include <ATen/cuda/CUDAContext.h>
+#include <c10/cuda/CUDAGuard.h>
+#include "cuda_compat.h"
+#include "cuda_utils.h"
+#include "core/registration.h"
+#include "minimax_reduce_rms_kernel.h"
+#include <algorithm>
+#define FINAL_MASK 0xffffffff
+#define MINIMAX_REDUCE_RMS_WARP_SIZE 32
+namespace vllm {
+namespace tensorrt_llm {
+// Device-side view of the Lamport communication workspace used by the kernels
+// in this file.
+//
+// Workspace slots read by the constructor:
+//   workspace[2 * NRanks + r] : byte buffer for rank r, addressed as three
+//                               regions of `comm_size` bytes each
+//   workspace[3 * NRanks]     : int array; [0] = block arrival counter,
+//                               [2] = flag selecting the active region
+//   workspace[3 * NRanks + 1] : int64 array; [0] = clear size,
+//                               [1] = comm_size (bytes per region)
+template <int NRanks>
+struct LamportComm {
+  // Executed by every thread of every block. Resolves the data region
+  // (flag % 3) of each rank's buffer and the region of this rank's buffer to
+  // be cleared ((flag + 2) % 3), then thread 0 of the block bumps the arrival
+  // counter.
+  __device__ __forceinline__ LamportComm(void** workspace, int rank) {
+    counter_ptr = &reinterpret_cast<int*>(workspace[NRanks * 3])[0];
+    flag_ptr = &reinterpret_cast<int*>(workspace[NRanks * 3])[2];
+    clear_ptr = &reinterpret_cast<int64_t*>(workspace[NRanks * 3 + 1])[0];
+    flag_value = *flag_ptr;
+    auto comm_size = reinterpret_cast<int64_t*>(workspace[NRanks * 3 + 1])[1];
+    clear_size = *clear_ptr;
+    int data_offset = flag_value % 3;
+    int clear_offset = (flag_value + 2) % 3;
+    for (int r = 0; r < NRanks; ++r) {
+      data_bufs[r] = reinterpret_cast<uint8_t*>(workspace[2 * NRanks + r]) +
+                     data_offset * comm_size;
+    }
+    clear_buf = reinterpret_cast<uint8_t*>(workspace[2 * NRanks + rank]) +
+                clear_offset * comm_size;
+    // All threads of the block have read flag / clear size before thread 0
+    // signals arrival.
+    __syncthreads();
+    if (threadIdx.x == 0) {
+      atomicAdd(counter_ptr, 1);
+    }
+  }
+  // Called at the end of a kernel. Thread 0 of block 0 spins until the arrival
+  // counter equals gridDim.x (every block has run the constructor), then
+  // advances the flag modulo 3, stores `new_clear_size` for the next launch
+  // and resets the counter. All other threads return immediately.
+  __device__ __forceinline__ void update(int64_t new_clear_size) {
+    if (blockIdx.x == 0 && threadIdx.x == 0) {
+      while (*reinterpret_cast<int volatile*>(counter_ptr) != gridDim.x) {
+      }
+      *flag_ptr = (flag_value + 1) % 3;
+      *clear_ptr = new_clear_size;
+      *counter_ptr = 0;
+    }
+  }
+  int* counter_ptr;            // workspace[3 * NRanks][0]
+  int* flag_ptr;               // workspace[3 * NRanks][2]
+  int64_t* clear_ptr;          // workspace[3 * NRanks + 1][0]
+  uint8_t* data_bufs[NRanks];  // active data region of every rank's buffer
+  uint8_t* clear_buf;          // region of this rank's buffer to clear
+  int64_t clear_size;          // clear size read at construction
+  int flag_value;              // flag read at construction
+};
+// Returns true iff `v` has exactly the bit pattern 0x80000000 (-0.0f), the
+// value the kernels in this file use to mark a not-yet-written slot.
+__device__ __forceinline__ bool is_neg_zero(float v) {
+  // __float_as_uint reinterprets the bits without pointer type punning.
+  return __float_as_uint(v) == 0x80000000u;
+}
+// Vector overload: true when at least one of the four components is -0.0f.
+__device__ __forceinline__ bool is_neg_zero(float4 v) {
+  float const lanes[4] = {v.x, v.y, v.z, v.w};
+  bool any_neg_zero = false;
+#pragma unroll
+  for (int i = 0; i < 4; ++i) {
+    any_neg_zero = any_neg_zero || is_neg_zero(lanes[i]);
+  }
+  return any_neg_zero;
+}
+// Builds a float4 whose four components all carry the bit pattern 0x80000000
+// (-0.0f); this is the value written when clearing a communication buffer.
+__device__ __forceinline__ float4 get_neg_zero() {
+  // Construct the value from its bit pattern instead of storing uint32_t
+  // through a pointer into a float4 object.
+  float const neg_zero = __uint_as_float(0x80000000u);
+  return make_float4(neg_zero, neg_zero, neg_zero, neg_zero);
+}
+// Replaces the sum of squares in `v` with rsqrt(v / Dim + eps) and also
+// returns that value.
+template <int Dim>
+__device__ __forceinline__ float rms_rsqrt(float& v, float eps) {
+  constexpr float kScale = 1.0F / static_cast<float>(Dim);
+  float const inv_rms = rsqrtf((v * kScale) + eps);
+  v = inv_rms;
+  return inv_rms;
+}
+// Component-wise float4 overload: each component of `v` is replaced with
+// rsqrt(component / Dim + eps); the updated vector is also returned.
+template <int Dim>
+__device__ __forceinline__ float4 rms_rsqrt(float4& v, float eps) {
+  constexpr float kScale = 1.0F / static_cast<float>(Dim);
+  float4 const inv_rms = make_float4(
+      rsqrtf((v.x * kScale) + eps), rsqrtf((v.y * kScale) + eps),
+      rsqrtf((v.z * kScale) + eps), rsqrtf((v.w * kScale) + eps));
+  v = inv_rms;
+  return inv_rms;
+}
+// Loads four floats from global memory with a single `ld.volatile.global.v4`
+// PTX instruction. Used by the polling loops that wait for another rank's
+// data to appear, where the load must be re-issued on every iteration.
+__device__ __forceinline__ float4 ld_global_volatile(float4* addr) {
+  float4 val;
+  asm volatile("ld.volatile.global.v4.f32 {%0, %1, %2, %3}, [%4];"
+               : "=f"(val.x), "=f"(val.y), "=f"(val.z), "=f"(val.w)
+               : "l"(addr));
+  return val;
+}
+// Scalar overload: loads one float from global memory with
+// `ld.volatile.global.f32`.
+__device__ __forceinline__ float ld_global_volatile(float* addr) {
+  float val;
+  asm volatile("ld.volatile.global.f32 %0, [%1];" : "=f"(val) : "l"(addr));
+  return val;
+}
+// Used by the scalar (non-float4) kernel only.
+// Butterfly (xor-shuffle) sum across the 32 lanes of a warp for each of the
+// NUM entries of `val`; on return every lane holds the warp total in val[k].
+// The return value is always zero.
+template <typename T, int NUM>
+__inline__ __device__ T warpReduceSumV2(T* val) {
+#pragma unroll
+  for (int k = 0; k < NUM; ++k) {
+    T acc = val[k];
+#pragma unroll
+    for (int offset = 16; offset > 0; offset >>= 1) {
+      acc += __shfl_xor_sync(FINAL_MASK, acc, offset, 32);
+    }
+    val[k] = acc;
+  }
+  return static_cast<T>(0.0f);
+}
+// Block-wide sum of the NUM entries of `val`: a warp reduction, one partial
+// per warp staged in shared memory, then a second warp reduction over those
+// partials. Only threads with threadIdx.x below the number of warps pick up
+// a partial, so the full total ends up in the first warp. Contains a
+// __syncthreads(), so every thread of the block must call it. The return
+// value is always zero.
+template <typename T, int NUM>
+__inline__ __device__ T blockReduceSumV2(T* val) {
+  static __shared__ T warp_partials[NUM][33];
+  int const lane_id = threadIdx.x & 0x1f;
+  int const warp_id = threadIdx.x >> 5;
+  warpReduceSumV2<T, NUM>(val);
+  if (lane_id == 0) {
+#pragma unroll
+    for (int k = 0; k < NUM; ++k) {
+      warp_partials[k][warp_id] = val[k];
+    }
+  }
+  __syncthreads();
+  // The float division rounds the warp count up when blockDim.x is not a
+  // multiple of 32.
+  bool const has_partial = threadIdx.x < (blockDim.x / 32.f);
+#pragma unroll
+  for (int k = 0; k < NUM; ++k) {
+    if (has_partial) {
+      val[k] = warp_partials[k][lane_id];
+    } else {
+      val[k] = (T)(0.0f);
+    }
+  }
+  warpReduceSumV2<T, NUM>(val);
+  return static_cast<T>(0.0f);
+}
+// For the float4 kernel.
+// Xor-shuffle sum of each of the ArraySize entries of `value_ptr` across
+// groups of kNumThreads adjacent lanes (starting distance kNumThreads / 2).
+// `active_mask` names the lanes taking part in the shuffles.
+template <uint32_t kNumThreads, typename T, int ArraySize = 4>
+__device__ __forceinline__ void local_warp_reduce_sum_array(
+    T* value_ptr, uint32_t active_mask = 0xffffffffu) {
+  static_assert(kNumThreads >= 1 &&
+                kNumThreads <= MINIMAX_REDUCE_RMS_WARP_SIZE);
+#pragma unroll
+  for (int k = 0; k < ArraySize; ++k) {
+    T acc = value_ptr[k];
+#pragma unroll
+    for (int offset = kNumThreads / 2; offset > 0; offset >>= 1) {
+      acc += __shfl_xor_sync(active_mask, acc, offset,
+                             MINIMAX_REDUCE_RMS_WARP_SIZE);
+    }
+    value_ptr[k] = acc;
+  }
+}
+// Smallest power of two that is >= val (returns 1 for val <= 1).
+constexpr int next_pow2(int val) {
+  int pow2 = 1;
+  for (; pow2 < val; pow2 <<= 1) {
+  }
+  return pow2;
+}
+// ---------------------------------------------------------------------------
+// Computes the per-thread indices used by the scalar kernel. One "access" is
+// one vector of kElemsPerAccess<DType> elements.
+template <typename DType>
+class IndexHelper {
+ public:
+  __device__ __forceinline__ IndexHelper(MiniMaxReduceRMSParams const& params) {
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+    // SM90+: one cluster per token, thread position taken within the cluster.
+    namespace cg = cooperative_groups;
+    cg::cluster_group cluster = cg::this_cluster();
+    cg::grid_group grid = cg::this_grid();
+    token_id = grid.cluster_rank();
+    access_id_in_token = cluster.thread_rank();
+    token_stride = grid.num_clusters();
+#else
+    // Older architectures: one block per token.
+    token_id = blockIdx.x;
+    access_id_in_token = threadIdx.x;
+    token_stride = gridDim.x;
+#endif
+    access_id = token_id * params.hidden_dim / kElemsPerAccess<DType> +
+                access_id_in_token;
+    access_stride = token_stride * params.hidden_dim / kElemsPerAccess<DType>;
+    tot_access = params.size_q / kElemsPerAccess<DType>;
+  }
+  int token_id;            // first token handled by this cluster/block
+  int access_id_in_token;  // this thread's vector index within a token row
+  int token_stride;        // tokens skipped per loop iteration
+  int access_id;           // first global vector index for this thread
+  int access_stride;       // vector-index step per loop iteration
+  int tot_access;          // total number of vectors (size_q / elems per vec)
+};
+/**
+ * Fused cross-rank RMS norm for the MiniMax attention module.
+ *
+ * Input is [total_tokens, hidden_dim] per rank (hidden_dim is the per-rank
+ * slice); both the input and the RMS weight are read as DType vectors.
+ *
+ * Per token:
+ *   step 1: reduce within the block to get this rank's sum of squares
+ *           (reduce(input^2, dim=-1));
+ *   step 2: push that sum into every rank's Lamport buffer and poll this
+ *           rank's buffer until all NRanks partial sums have arrived;
+ *   step 3: scale the input by rsqrt(sum / hidden_dim / NRanks + eps) and by
+ *           the RMS weight.
+ *
+ * The launcher uses hidden_dim / kElemsPerAccess<DType> threads per block and
+ * a cluster size of 1. The original note gives the sizing rationale: the max
+ * hidden_dim is 6144 (float data), so a token needs at most
+ * 6144 / 4 / tp_size = 1536 / tp_size threads, which fits one block for
+ * tp_size >= 2.
+ */
+template <typename DType, int NRanks>
+__global__ void __launch_bounds__(1024)
+    minimax_reduce_rms_kernel_lamport(MiniMaxReduceRMSParams params) {
+  IndexHelper<DType> index_helper(params);
+  int token_id = index_helper.token_id;
+  int access_id_in_token = index_helper.access_id_in_token;
+  int token_stride = index_helper.token_stride;
+  int access_id = index_helper.access_id;
+  int access_stride = index_helper.access_stride;
+  int tot_access = index_helper.tot_access;
+  int tot_tokens = params.size_q / params.hidden_dim;
+  float4 clear_vec = get_neg_zero();
+  LamportComm<NRanks> comm(params.workspace, params.rank);
+  int clear_access = comm.clear_size / kElemsPerAccess<DType>;
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  // Programmatic dependent launch: wait for the prerequisite grid before
+  // touching the input.
+  asm volatile("griddepcontrol.wait;");
+#endif
+  for (int idx = access_id; idx < tot_access;
+       idx += access_stride, token_id += token_stride) {
+    alignas(16) DType vals[kElemsPerAccess<DType>];
+    float sum_variance = 0.F;
+    // One 16-byte load per thread.
+    *reinterpret_cast<float4*>(vals) =
+        reinterpret_cast<float4*>(params.allreduce_in)[idx];
+#pragma unroll
+    for (int i = 0; i < kElemsPerAccess<DType>; ++i) {
+      sum_variance += static_cast<float>(vals[i]) * static_cast<float>(vals[i]);
+    }
+    blockReduceSumV2<float, 1>(&sum_variance);
+    // -0.0f marks "not yet written" in the comm buffer, so never publish it.
+    if (is_neg_zero(sum_variance)) {
+      sum_variance = 0.F;
+    }
+    if (threadIdx.x == 0) {
+      // Push this rank's partial sum into every rank's buffer.
+      for (int r = 0; r < NRanks; ++r) {
+        reinterpret_cast<float*>(
+            comm.data_bufs[r])[(params.rank * tot_tokens) + token_id] =
+            (sum_variance);
+      }
+    }
+    bool done = false;
+    float vars_all_ranks[NRanks];
+    // Every thread polls this rank's buffer until no slot holds -0.0f.
+    while (!done) {
+      done = true;
+#pragma unroll
+      for (int r = 0; r < NRanks; ++r) {
+        vars_all_ranks[r] = ld_global_volatile(&reinterpret_cast<float*>(
+            comm.data_bufs[params.rank])[(r * tot_tokens) + token_id]);
+        done &= !is_neg_zero(vars_all_ranks[r]);
+      }
+    }
+    sum_variance = 0.F;
+#pragma unroll
+    for (int r = 0; r < NRanks; ++r) {
+      sum_variance += vars_all_ranks[r];
+    }
+    DType norm_weight[kElemsPerAccess<DType>];
+    *reinterpret_cast<typename ElemsPerAccess<DType>::vec_type*>(norm_weight) =
+        reinterpret_cast<typename ElemsPerAccess<DType>::vec_type*>(
+            params.rms_gamma)[access_id_in_token];
+#pragma unroll
+    for (int i = 0; i < kElemsPerAccess<DType>; ++i) {
+      vals[i] = static_cast<DType>(
+          static_cast<float>(vals[i]) *
+          rsqrtf(
+              (sum_variance / static_cast<float>(params.hidden_dim) / NRanks) +
+              params.rms_eps) *
+          static_cast<float>(norm_weight[i]));
+    }
+    reinterpret_cast<float4*>(params.rms_norm_out)[idx] =
+        *reinterpret_cast<float4*>(vals);
+  }
+  // Refill the buffer region selected for clearing with -0.0f.
+  for (int idx = access_id; idx < clear_access; idx += access_stride) {
+    reinterpret_cast<float4*>(comm.clear_buf)[idx] = clear_vec;
+  }
+  // Advance the flag and record the clear size for the next launch.
+  comm.update(params.size_q * NRanks);
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.launch_dependents;");
+#endif
+}
+/**
+ * Fused Q/K float4 variant: each loop iteration handles a group of 4 tokens
+ * and exchanges the 4 per-token sums of squares as one float4 per matrix.
+ * Sums are accumulated in float for every DType (half, bf16, float). When
+ * tot_tokens % 4 != 0 the rows past the end of the last group stay
+ * zero-initialised and are not written to the outputs.
+ *
+ * Thread layout within a block: the first NumWarpQ warps cover one Q row, the
+ * following warps cover one K row. Warp 0 and warp NumWarpQ are the Q and K
+ * leader warps that exchange sums with the other ranks.
+ *
+ * Comm-buffer layout in float4 slots: slot [rank * tot_groups * 2 + 2 * g]
+ * holds that rank's Q sums for group g; the following slot holds its K sums.
+ */
+template <typename DType, int NRanks, int OriginQDim, int OriginKDim>
+__global__ void __launch_bounds__(1024)
+    minimax_reduce_qk_rms_kernel_lamport_float4(MiniMaxReduceRMSParams params) {
+  // Compile-time per-rank dimensions
+  constexpr int RankQDim = OriginQDim / NRanks;
+  constexpr int RankKDim = OriginKDim / NRanks;
+  // Threads needed to cover one row of Q / K with float4 accesses
+  constexpr int ThreadsPerRowQ = RankQDim / kElemsPerAccess<DType>;
+  constexpr int ThreadsPerRowK = RankKDim / kElemsPerAccess<DType>;
+  // Number of warps dedicated to Q / K
+  constexpr int NumWarpQ = (ThreadsPerRowQ + MINIMAX_REDUCE_RMS_WARP_SIZE - 1) /
+                           MINIMAX_REDUCE_RMS_WARP_SIZE;
+  constexpr int NumWarpK = (ThreadsPerRowK + MINIMAX_REDUCE_RMS_WARP_SIZE - 1) /
+                           MINIMAX_REDUCE_RMS_WARP_SIZE;
+  int tot_tokens = params.size_q / RankQDim;
+  int tot_groups = (tot_tokens + 3) / 4;  // ceiling; last group may be partial
+  // Memory strides for strided qkv tensors (elements -> float4-access units)
+  int access_stride_q = (params.stride_q > 0 ? params.stride_q : RankQDim) /
+                        kElemsPerAccess<DType>;
+  int access_stride_k = (params.stride_k > 0 ? params.stride_k : RankKDim) /
+                        kElemsPerAccess<DType>;
+  // Output strides: default to contiguous (hidden_dim / hidden_dim_k)
+  int access_stride_q_out =
+      (params.stride_q_out > 0 ? params.stride_q_out : params.hidden_dim) /
+      kElemsPerAccess<DType>;
+  int access_stride_k_out =
+      (params.stride_k_out > 0 ? params.stride_k_out : params.hidden_dim_k) /
+      kElemsPerAccess<DType>;
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  namespace cg = cooperative_groups;
+  cg::cluster_group cluster = cg::this_cluster();
+  cg::grid_group grid = cg::this_grid();
+  int group_id = grid.cluster_rank();
+  int access_id_in_token = cluster.thread_rank();
+  int group_stride = grid.num_clusters();
+#else
+  int group_id = blockIdx.x;
+  int access_id_in_token = threadIdx.x;
+  int group_stride = gridDim.x;
+#endif
+  // Threads in the first NumWarpQ warps belong to Q; the rest belong to K.
+  // Threads in the warp-rounding tail of either section are "invalid": they
+  // take part in reductions with zeros but never load or store.
+  bool is_q = (access_id_in_token < NumWarpQ * MINIMAX_REDUCE_RMS_WARP_SIZE);
+  int k_thread_idx =
+      access_id_in_token - (NumWarpQ * MINIMAX_REDUCE_RMS_WARP_SIZE);
+  bool is_valid_q = (access_id_in_token < ThreadsPerRowQ);
+  bool is_valid_k = (k_thread_idx >= 0 && k_thread_idx < ThreadsPerRowK);
+  float4 clear_vec = get_neg_zero();
+  // Shared memory for two-level block reduction and scale broadcast
+  __shared__ float block_reduce_sum[4][MINIMAX_REDUCE_RMS_WARP_SIZE + 1];
+  __shared__ float global_scale_q[4];
+  __shared__ float global_scale_k[4];
+  LamportComm<NRanks> comm(params.workspace, params.rank);
+  DType norm_weight[kElemsPerAccess<DType>]{};
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.wait;");
+#endif
+  // The weight vector depends only on the thread's column, so load it once.
+  if (is_q) {
+    if (is_valid_q) {
+      *reinterpret_cast<typename ElemsPerAccess<DType>::vec_type*>(
+          norm_weight) =
+          reinterpret_cast<typename ElemsPerAccess<DType>::vec_type const*>(
+              params.rms_gamma)[access_id_in_token];
+    }
+  } else {
+    if (is_valid_k) {
+      *reinterpret_cast<typename ElemsPerAccess<DType>::vec_type*>(
+          norm_weight) =
+          reinterpret_cast<typename ElemsPerAccess<DType>::vec_type const*>(
+              params.rms_gamma_k)[k_thread_idx];
+    }
+  }
+  // Main loop: process one group of 4 tokens per iteration.
+  for (int g = group_id; g < tot_groups; g += group_stride) {
+    alignas(16) DType vals[4][kElemsPerAccess<DType>]{};
+    float warp_sum_variance[4]{0.F, 0.F, 0.F, 0.F};
+    if (is_q) {
+#pragma unroll
+      for (int row = 0; row < 4; ++row) {
+        int token_r = g * 4 + row;
+        if (token_r >= tot_tokens || !is_valid_q) {
+          continue;
+        }
+        int idx_r = token_r * access_stride_q + access_id_in_token;
+        *reinterpret_cast<float4*>(&vals[row][0]) =
+            reinterpret_cast<float4 const*>(params.allreduce_in)[idx_r];
+#pragma unroll
+        for (int i = 0; i < kElemsPerAccess<DType>; ++i) {
+          float x = static_cast<float>(vals[row][i]);
+          warp_sum_variance[row] += x * x;
+        }
+      }
+    } else {
+#pragma unroll
+      for (int row = 0; row < 4; ++row) {
+        int token_r = g * 4 + row;
+        if (token_r >= tot_tokens || !is_valid_k) {
+          continue;
+        }
+        int idx_r = token_r * access_stride_k + k_thread_idx;
+        *reinterpret_cast<float4*>(&vals[row][0]) =
+            reinterpret_cast<float4 const*>(params.allreduce_in_k)[idx_r];
+#pragma unroll
+        for (int i = 0; i < kElemsPerAccess<DType>; ++i) {
+          float x = static_cast<float>(vals[row][i]);
+          warp_sum_variance[row] += x * x;
+        }
+      }
+    }
+    local_warp_reduce_sum_array<MINIMAX_REDUCE_RMS_WARP_SIZE, float, 4>(
+        warp_sum_variance);
+    // Warp lane 0 writes its warp's partial sum to shared memory
+    int lane = threadIdx.x & (MINIMAX_REDUCE_RMS_WARP_SIZE - 1);
+    if (lane == 0) {
+#pragma unroll
+      for (int t = 0; t < 4; ++t) {
+        block_reduce_sum[t][threadIdx.x / MINIMAX_REDUCE_RMS_WARP_SIZE] =
+            warp_sum_variance[t];
+      }
+    }
+    __syncthreads();
+    int tid = threadIdx.x;
+    if (tid < MINIMAX_REDUCE_RMS_WARP_SIZE) {
+      // --- Q leader warp (warp 0) ---
+      constexpr int kNumWarpQPow2 =
+          (next_pow2(NumWarpQ) > NRanks) ? next_pow2(NumWarpQ) : NRanks;
+      float local_sum[4];
+#pragma unroll
+      for (int t = 0; t < 4; ++t) {
+        local_sum[t] = (tid < NumWarpQ) ? block_reduce_sum[t][tid] : 0.F;
+      }
+      // After this, all kNumWarpQPow2 lanes (including tid 0..NRanks-1) have
+      // the total Q sum-of-squares for all 4 tokens.
+      local_warp_reduce_sum_array<kNumWarpQPow2, float, 4>(local_sum);
+      if (tid < NRanks) {
+#pragma unroll
+        for (int t = 0; t < 4; ++t) {
+          // -0.0f is the "empty slot" marker; publish +0.0f instead.
+          if (is_neg_zero(local_sum[t])) {
+            local_sum[t] = 0.F;
+          }
+        }
+        // Parallel push: thread tid writes this rank's Q sum to rank tid's buf
+        reinterpret_cast<float4*>(
+            comm.data_bufs[tid])[(params.rank * tot_groups * 2) + (2 * g)] =
+            *reinterpret_cast<float4*>(local_sum);
+        // Parallel pull: thread tid reads rank tid's contribution from
+        // this rank's (params.rank's) buffer
+        bool done = false;
+        float4 var_all_ranks;
+        while (!done) {
+          done = true;
+          var_all_ranks = ld_global_volatile(&reinterpret_cast<float4*>(
+              comm.data_bufs[params.rank])[(tid * tot_groups * 2) + (2 * g)]);
+          done &= !is_neg_zero(var_all_ranks);
+        }
+        // Warp-level allreduce: each of the NRanks threads holds one rank's
+        // partial sum; after this all NRanks threads have the global total.
+        constexpr uint32_t kQActiveMask = (1u << NRanks) - 1u;
+        local_warp_reduce_sum_array<NRanks, float, 4>(
+            reinterpret_cast<float*>(&var_all_ranks), kQActiveMask);
+        // Thread 0 computes rsqrt with compile-time Dim and writes to smem
+        if (tid == 0) {
+          *reinterpret_cast<float4*>(global_scale_q) =
+              rms_rsqrt<OriginQDim>(var_all_ranks, params.rms_eps);
+        }
+      }
+    } else if (tid >= MINIMAX_REDUCE_RMS_WARP_SIZE * NumWarpQ &&
+               tid < MINIMAX_REDUCE_RMS_WARP_SIZE * (NumWarpQ + 1)) {
+      // --- K leader warp ---
+      constexpr int kNumWarpKPow2 =
+          (next_pow2(NumWarpK) > NRanks) ? next_pow2(NumWarpK) : NRanks;
+      float local_sum[4];
+#pragma unroll
+      for (int t = 0; t < 4; ++t) {
+        local_sum[t] = (k_thread_idx < NumWarpK)
+                           ? block_reduce_sum[t][NumWarpQ + k_thread_idx]
+                           : 0.F;
+      }
+      local_warp_reduce_sum_array<kNumWarpKPow2, float, 4>(local_sum);
+      if (k_thread_idx < NRanks) {
+#pragma unroll
+        for (int t = 0; t < 4; ++t) {
+          if (is_neg_zero(local_sum[t])) {
+            local_sum[t] = 0.F;
+          }
+        }
+        // Same push / pull as the Q leader, using the odd (K) slot.
+        reinterpret_cast<float4*>(
+            comm.data_bufs[k_thread_idx])[(params.rank * tot_groups * 2) +
+                                          (2 * g + 1)] =
+            *reinterpret_cast<float4*>(local_sum);
+        bool done = false;
+        float4 var_all_ranks;
+        while (!done) {
+          done = true;
+          var_all_ranks = ld_global_volatile(&reinterpret_cast<float4*>(
+              comm.data_bufs[params.rank])[(k_thread_idx * tot_groups * 2) +
+                                           (2 * g + 1)]);
+          done &= !is_neg_zero(var_all_ranks);
+        }
+        constexpr uint32_t kKActiveMask = (1u << NRanks) - 1u;
+        local_warp_reduce_sum_array<NRanks, float, 4>(
+            reinterpret_cast<float*>(&var_all_ranks), kKActiveMask);
+        if (k_thread_idx == 0) {
+          *reinterpret_cast<float4*>(global_scale_k) =
+              rms_rsqrt<OriginKDim>(var_all_ranks, params.rms_eps);
+        }
+      }
+    }
+    // Make the leaders' scales visible to the whole block.
+    __syncthreads();
+    if (is_q) {
+      // warp_sum_variance is reused to hold the per-token scale.
+#pragma unroll
+      for (int t = 0; t < 4; ++t) {
+        warp_sum_variance[t] = global_scale_q[t];
+      }
+#pragma unroll
+      for (int r = 0; r < 4; ++r) {
+#pragma unroll
+        for (int i = 0; i < kElemsPerAccess<DType>; ++i) {
+          vals[r][i] = static_cast<DType>(static_cast<float>(vals[r][i]) *
+                                          warp_sum_variance[r] *
+                                          static_cast<float>(norm_weight[i]));
+        }
+        int token_r = g * 4 + r;
+        if (token_r >= tot_tokens || !is_valid_q) {
+          continue;
+        }
+        int idx_out = token_r * access_stride_q_out + access_id_in_token;
+        reinterpret_cast<float4*>(params.rms_norm_out)[idx_out] =
+            *reinterpret_cast<float4*>(&vals[r][0]);
+      }
+    } else {
+#pragma unroll
+      for (int t = 0; t < 4; ++t) {
+        warp_sum_variance[t] = global_scale_k[t];
+      }
+#pragma unroll
+      for (int r = 0; r < 4; ++r) {
+#pragma unroll
+        for (int i = 0; i < kElemsPerAccess<DType>; ++i) {
+          vals[r][i] = static_cast<DType>(static_cast<float>(vals[r][i]) *
+                                          warp_sum_variance[r] *
+                                          static_cast<float>(norm_weight[i]));
+        }
+        int token_r = g * 4 + r;
+        if (token_r >= tot_tokens || !is_valid_k) {
+          continue;
+        }
+        int idx_out = token_r * access_stride_k_out + k_thread_idx;
+        reinterpret_cast<float4*>(params.rms_norm_out_k)[idx_out] =
+            *reinterpret_cast<float4*>(&vals[r][0]);
+      }
+    }
+  }  // end group loop
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.launch_dependents;");
+#endif
+  // Refill the buffer region selected for clearing with -0.0f, spreading the
+  // work over every thread of the grid.
+  int clear_access = static_cast<int>(comm.clear_size / kElemsPerAccess<DType>);
+  int clear_stride = group_stride * blockDim.x;
+  for (int idx = group_id * blockDim.x + threadIdx.x; idx < clear_access;
+       idx += clear_stride) {
+    reinterpret_cast<float4*>(comm.clear_buf)[idx] = clear_vec;
+  }
+  // Advance the flag and record the clear size for the next launch.
+  comm.update(static_cast<int64_t>(2) * tot_groups * kElemsPerAccess<DType> *
+              NRanks);
+}
+// Returns the number of streaming multiprocessors of the current CUDA device.
+// The value is queried once and cached for the lifetime of the process, so it
+// reflects the device that was current on the first call.
+int get_sm_count() {
+  static int sm_count = 0;
+  if (sm_count == 0) {
+    int device_id = -1;
+    CUDA_CHECK(cudaGetDevice(&device_id));
+    // Query only the attribute that is needed, and check the return code; the
+    // cache is left untouched if the query fails.
+    int queried_count = 0;
+    CUDA_CHECK(cudaDeviceGetAttribute(
+        &queried_count, cudaDevAttrMultiProcessorCount, device_id));
+    sm_count = queried_count;
+  }
+  return sm_count;
+}
+// Returns the compute capability of the current device as major * 10 + minor.
+// SM 12.1 is reported as 120 unless `queryRealSmArch` is true.
+inline int getSMVersion(bool queryRealSmArch = false) {
+  int current_device = -1;
+  CUDA_CHECK(cudaGetDevice(&current_device));
+  int cc_major = 0;
+  CUDA_CHECK(cudaDeviceGetAttribute(
+      &cc_major, cudaDevAttrComputeCapabilityMajor, current_device));
+  int cc_minor = 0;
+  CUDA_CHECK(cudaDeviceGetAttribute(
+      &cc_minor, cudaDevAttrComputeCapabilityMinor, current_device));
+  int const version = cc_major * 10 + cc_minor;
+  bool const fold_to_120 = (version == 121) && !queryRealSmArch;
+  return fold_to_120 ? 120 : version;
+}
+// Occupancy query: how many blocks of `block_size` threads (with
+// `dynamic_smem` bytes of dynamic shared memory) of `kernel` can be resident
+// per SM. The result is clamped to at least 1.
+template <typename KernelFunc>
+int get_max_active_blocks(KernelFunc kernel, int block_size,
+                          int dynamic_smem = 0) {
+  int blocks_per_sm = 0;
+  CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+      &blocks_per_sm, kernel, block_size, dynamic_smem));
+  if (blocks_per_sm < 1) {
+    return 1;
+  }
+  return blocks_per_sm;
+}
+// Host launcher for minimax_reduce_rms_kernel_lamport<DType, NRanks>.
+// Launches one block per token with hidden_dim / kElemsPerAccess<DType>
+// threads, caps the grid at the occupancy limit, and on SM90+ attaches the
+// programmatic-stream-serialization and cluster-dimension launch attributes.
+// Returns without launching when there are no tokens.
+template <typename DType, int NRanks>
+void minimax_reduce_rms_kernel_launcher(MiniMaxReduceRMSParams const& params) {
+  // Must match the kernel's __launch_bounds__.
+  constexpr int kMaxThreadsPerBlock = 1024;
+  TORCH_CHECK(params.hidden_dim > 0,
+              "minimax_reduce_rms: hidden_dim must be positive");
+  TORCH_CHECK(params.size_q % params.hidden_dim == 0);
+  TORCH_CHECK(params.hidden_dim % kElemsPerAccess<DType> == 0);
+  int token_num = params.size_q / params.hidden_dim;
+  if (token_num == 0) {
+    // Nothing to normalise; a zero-sized grid is an invalid launch config.
+    return;
+  }
+  int threads_per_token = params.hidden_dim / kElemsPerAccess<DType>;
+  int block_size = threads_per_token;
+  TORCH_CHECK(block_size <= kMaxThreadsPerBlock,
+              "minimax_reduce_rms: hidden_dim needs more than 1024 threads "
+              "per token");
+  static int SM = getSMVersion();
+  int sm_count = get_sm_count();
+  int cluster_size = 1;
+  int cluster_num = token_num;
+  int max_blocks_per_sm = get_max_active_blocks(
+      minimax_reduce_rms_kernel_lamport<DType, NRanks>, block_size);
+  int max_grid = max_blocks_per_sm * sm_count;
+  // Round down to a whole number of clusters.
+  int grid_size =
+      (std::min(max_grid, cluster_num * cluster_size) / cluster_size) *
+      cluster_size;
+  cudaLaunchConfig_t cfg;
+  cfg.gridDim = grid_size;
+  cfg.blockDim = block_size;
+  cfg.dynamicSmemBytes = 0;
+  cfg.stream = params.stream;
+  cudaLaunchAttribute attribute[2];
+  attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+  attribute[0].val.programmaticStreamSerializationAllowed = 1;
+  attribute[1].id = cudaLaunchAttributeClusterDimension;
+  attribute[1].val.clusterDim.x = cluster_size;
+  attribute[1].val.clusterDim.y = 1;
+  attribute[1].val.clusterDim.z = 1;
+  cfg.attrs = attribute;
+  // The launch attributes are only passed on SM90 and newer.
+  cfg.numAttrs = SM >= 90 ? 2 : 0;
+  CUDA_CHECK(cudaLaunchKernelEx(
+      &cfg, minimax_reduce_rms_kernel_lamport<DType, NRanks>, params));
+}
+// Host launcher for minimax_reduce_qk_rms_kernel_lamport_float4.
+// Validates the Q/K shapes and strides, sizes the block as the warp-rounded
+// number of vector accesses per Q row plus per K row, launches one block per
+// group of 4 tokens (capped at the occupancy limit), and on SM90+ attaches the
+// programmatic-stream-serialization and cluster-dimension launch attributes.
+// Returns without launching when there are no tokens.
+template <typename DType, int NRanks, int OriginQDim, int OriginKDim>
+void minimax_reduce_rms_kernel_launcher_float4(
+    MiniMaxReduceRMSParams const& params) {
+  // Must match the kernel's __launch_bounds__.
+  constexpr int kMaxThreadsPerBlock = 1024;
+  TORCH_CHECK(params.size_q % params.hidden_dim == 0);
+  TORCH_CHECK(params.hidden_dim % kElemsPerAccess<DType> == 0);
+  if (params.stride_q > 0) {
+    TORCH_CHECK(params.stride_q % kElemsPerAccess<DType> == 0);
+  }
+  TORCH_CHECK(params.allreduce_in_k != nullptr,
+              "float4 QK kernel requires K input");
+  TORCH_CHECK(params.rms_gamma_k != nullptr,
+              "float4 QK kernel requires K norm weight");
+  TORCH_CHECK(params.rms_norm_out_k != nullptr,
+              "float4 QK kernel requires K output");
+  TORCH_CHECK(params.hidden_dim >= params.hidden_dim_k);
+  TORCH_CHECK(params.size_k % params.hidden_dim_k == 0);
+  TORCH_CHECK(params.hidden_dim_k % kElemsPerAccess<DType> == 0);
+  TORCH_CHECK(params.size_q / params.hidden_dim ==
+              params.size_k / params.hidden_dim_k);
+  if (params.stride_k > 0) {
+    TORCH_CHECK(params.stride_k % kElemsPerAccess<DType> == 0);
+  }
+  // The kernel divides the output strides by the vector width as well.
+  if (params.stride_q_out > 0) {
+    TORCH_CHECK(params.stride_q_out % kElemsPerAccess<DType> == 0);
+  }
+  if (params.stride_k_out > 0) {
+    TORCH_CHECK(params.stride_k_out % kElemsPerAccess<DType> == 0);
+  }
+  // The kernel indexes with the compile-time dimensions, so the runtime
+  // per-rank dimensions must agree with them.
+  TORCH_CHECK(params.hidden_dim * NRanks == OriginQDim,
+              "float4 QK kernel: Q hidden_dim does not match kernel shape");
+  TORCH_CHECK(params.hidden_dim_k * NRanks == OriginKDim,
+              "float4 QK kernel: K hidden_dim does not match kernel shape");
+  int token_num = params.size_q / params.hidden_dim;
+  int tot_groups = (token_num + 3) / 4;
+  if (tot_groups == 0) {
+    return;
+  }
+  static int SM = getSMVersion();
+  int sm_count = get_sm_count();
+  int cluster_size = 1;
+  int cluster_num = tot_groups;
+  int access_per_row_q = params.hidden_dim / kElemsPerAccess<DType>;
+  int access_per_row_k = params.hidden_dim_k / kElemsPerAccess<DType>;
+  // Round each section up to a warp boundary
+  auto round_up_to = [](int value, int multiple) {
+    return (value + multiple - 1) / multiple * multiple;
+  };
+  int block_size = round_up_to(access_per_row_q, MINIMAX_REDUCE_RMS_WARP_SIZE) +
+                   round_up_to(access_per_row_k, MINIMAX_REDUCE_RMS_WARP_SIZE);
+  TORCH_CHECK(block_size <= kMaxThreadsPerBlock,
+              "float4 QK kernel: Q + K rows need more than 1024 threads");
+  auto kfn =
+      minimax_reduce_qk_rms_kernel_lamport_float4<DType, NRanks, OriginQDim,
+                                                  OriginKDim>;
+  int max_blocks_per_sm = get_max_active_blocks(kfn, block_size);
+  int max_grid = max_blocks_per_sm * sm_count;
+  // Round down to a whole number of clusters.
+  int grid_size =
+      (std::min(max_grid, cluster_num * cluster_size) / cluster_size) *
+      cluster_size;
+  cudaLaunchConfig_t cfg;
+  cfg.gridDim = grid_size;
+  cfg.blockDim = block_size;
+  cfg.dynamicSmemBytes = 0;
+  cfg.stream = params.stream;
+  cudaLaunchAttribute attribute[2];
+  attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+  attribute[0].val.programmaticStreamSerializationAllowed = 1;
+  attribute[1].id = cudaLaunchAttributeClusterDimension;
+  attribute[1].val.clusterDim.x = cluster_size;
+  attribute[1].val.clusterDim.y = 1;
+  attribute[1].val.clusterDim.z = 1;
+  cfg.attrs = attribute;
+  // The launch attributes are only passed on SM90 and newer.
+  cfg.numAttrs = SM >= 90 ? 2 : 0;
+  CUDA_CHECK(cudaLaunchKernelEx(&cfg, kfn, params));
+}
+// Selects the launcher for params.dtype (Half, BFloat16 or Float).
+// The QK float4 kernel is chosen when a K input is present and the full
+// (nranks * per-rank) dimensions are 6144 for Q and 1024 for K, i.e. the
+// MiniMax M2 shape; every other case uses the scalar kernel. Any other dtype
+// fails a TORCH_CHECK.
+template <int NRanks>
+void dispatch_dtype(MiniMaxReduceRMSParams const& params) {
+  bool const has_k_input = params.allreduce_in_k != nullptr;
+  bool const use_float4 = has_k_input &&
+                          (params.hidden_dim * params.nranks == 6144) &&
+                          (params.hidden_dim_k * params.nranks == 1024);
+  switch (params.dtype) {
+    case at::ScalarType::Half:
+      if (use_float4) {
+        minimax_reduce_rms_kernel_launcher_float4<half, NRanks, 6144, 1024>(
+            params);
+      } else {
+        minimax_reduce_rms_kernel_launcher<half, NRanks>(params);
+      }
+      break;
+    case at::ScalarType::BFloat16:
+      if (use_float4) {
+        minimax_reduce_rms_kernel_launcher_float4<__nv_bfloat16, NRanks, 6144,
+                                                  1024>(params);
+      } else {
+        minimax_reduce_rms_kernel_launcher<__nv_bfloat16, NRanks>(params);
+      }
+      break;
+    case at::ScalarType::Float:
+      if (use_float4) {
+        minimax_reduce_rms_kernel_launcher_float4<float, NRanks, 6144, 1024>(
+            params);
+      } else {
+        minimax_reduce_rms_kernel_launcher<float, NRanks>(params);
+      }
+      break;
+    default:
+      TORCH_CHECK(false, "Unsupported data type for minimax_reduce_rms_op");
+  }
+}
+// Host entry point: turns the runtime rank count into a compile-time NRanks
+// instantiation of dispatch_dtype. Only 2, 4, 8 and 16 ranks are supported;
+// any other value fails a TORCH_CHECK.
+void minimax_reduce_rms_op(MiniMaxReduceRMSParams const& params) {
+  switch (params.nranks) {
+    case 2:
+      dispatch_dtype<2>(params);
+      break;
+    case 4:
+      dispatch_dtype<4>(params);
+      break;
+    case 8:
+      dispatch_dtype<8>(params);
+      break;
+    case 16:
+      dispatch_dtype<16>(params);
+      break;
+    default:
+      TORCH_CHECK(false, "minimax_reduce_rms_op: unsupported ranks number!");
+  }
+}
+}  // namespace tensorrt_llm
+}  // namespace vllm
+// Runs minimax_reduce_rms_op on a single tensor (no K input).
+//
+// input:       tensor to process. It must be contiguous, because the kernel
+//              receives a raw pointer and a row stride equal to size(-1).
+// norm_weight: handed to the kernel as rms_gamma.
+//              NOTE(review): presumably expected to share input's dtype and
+//              device; that is not verified here - confirm against the kernel.
+// workspace:   tensor whose storage is reinterpreted as the void** workspace
+//              pointer table consumed by the kernel.
+// rank/nranks: index of this participant and number of participants.
+// eps:         RMS-norm epsilon (narrowed to float).
+//
+// Returns a newly allocated tensor shaped like `input` that the kernel writes
+// to. The launch is enqueued on the current CUDA stream of input's device.
+torch::Tensor minimax_allreduce_rms(torch::Tensor const& input,
+                                    torch::Tensor const& norm_weight,
+                                    torch::Tensor workspace, int64_t const rank,
+                                    int64_t const nranks, double const eps) {
+  // Same style of argument validation as minimax_allreduce_rms_qk: without
+  // these checks a strided input would be read with the wrong row stride and
+  // large sizes would be silently truncated by the int casts below.
+  TORCH_CHECK(input.dim() >= 1,
+              "minimax_allreduce_rms: input must have at least 1 dimension");
+  TORCH_CHECK(input.is_contiguous(),
+              "minimax_allreduce_rms: input must be contiguous");
+  TORCH_CHECK(rank >= 0 && rank < nranks,
+              "minimax_allreduce_rms: rank must be in [0, nranks)");
+  TORCH_CHECK(input.numel() <= INT32_MAX,
+              "minimax_allreduce_rms: input has too many elements for the "
+              "kernel's 32-bit sizes");
+  auto allreduce_params = vllm::tensorrt_llm::MiniMaxReduceRMSParams();
+  allreduce_params.nranks = static_cast<int>(nranks);
+  allreduce_params.rank = static_cast<int>(rank);
+  allreduce_params.dtype = input.scalar_type();
+  allreduce_params.size_q = static_cast<int>(input.numel());
+  allreduce_params.hidden_dim = static_cast<int>(input.size(-1));
+  // Contiguous input: consecutive rows are exactly hidden_dim elements apart.
+  allreduce_params.stride_q = allreduce_params.hidden_dim;
+  allreduce_params.workspace =
+      reinterpret_cast<void**>(workspace.mutable_data_ptr());
+  allreduce_params.allreduce_in = input.data_ptr();
+  allreduce_params.rms_gamma = norm_weight.data_ptr();
+  allreduce_params.rms_eps = static_cast<float>(eps);
+  allreduce_params.stream = at::cuda::getCurrentCUDAStream(input.get_device());
+  torch::Tensor rms_norm_out = torch::empty_like(input);
+  allreduce_params.rms_norm_out = rms_norm_out.mutable_data_ptr();
+  // The K-side fields keep their zero/null defaults, so dispatch_dtype never
+  // selects the float4 QK launcher for this entry point.
+  vllm::tensorrt_llm::minimax_reduce_rms_op(allreduce_params);
+  return rms_norm_out;
+}
+// Runs minimax_reduce_rms_op on the Q and K column ranges of a packed qkv
+// tensor in one launch.
+//
+// qkv:            2D contiguous tensor whose last dim is q_size + 2 * kv_size.
+//                 Q occupies columns [0, q_size) and K the next kv_size
+//                 columns; only those two ranges are handed to the kernel.
+// norm_weight_q:  passed to the kernel as rms_gamma.
+// norm_weight_k:  passed to the kernel as rms_gamma_k.
+//                 NOTE(review): weights are presumably expected to share
+//                 qkv's dtype/device; not verified here - confirm.
+// workspace:      tensor whose storage is reinterpreted as the void**
+//                 workspace pointer table consumed by the kernel.
+// q_size/kv_size: per-row widths (in elements) of Q and of K.
+// rank/nranks:    index of this participant and number of participants.
+// eps:            RMS-norm epsilon (narrowed to float).
+//
+// Returns {q_out, k_out}: newly allocated contiguous tensors of shape
+// {num_tokens, q_size} and {num_tokens, kv_size}. The launch is enqueued on
+// the current CUDA stream of qkv's device.
+std::tuple<torch::Tensor, torch::Tensor> minimax_allreduce_rms_qk(
+    torch::Tensor qkv, torch::Tensor const& norm_weight_q,
+    torch::Tensor const& norm_weight_k, torch::Tensor workspace,
+    int64_t const q_size, int64_t const kv_size, int64_t const rank,
+    int64_t const nranks, double const eps) {
+  TORCH_CHECK(qkv.dim() == 2, "minimax_allreduce_rms_qk: qkv must be 2D");
+  TORCH_CHECK(qkv.is_contiguous(),
+              "minimax_allreduce_rms_qk: qkv must be contiguous");
+  int64_t qkv_dim = qkv.size(-1);
+  TORCH_CHECK(qkv_dim == q_size + 2 * kv_size,
+              "minimax_allreduce_rms_qk: qkv last dim must equal "
+              "q_size + 2 * kv_size");
+  TORCH_CHECK(rank < nranks,
+              "minimax_allreduce_rms_qk: rank must be less than nranks");
+  // A negative rank would otherwise slip past the upper-bound check above.
+  TORCH_CHECK(rank >= 0,
+              "minimax_allreduce_rms_qk: rank must be non-negative");
+  int64_t num_tokens = qkv.size(0);
+  // Every size below is narrowed to int for the kernel parameters; reject
+  // values that would be silently truncated.
+  TORCH_CHECK(qkv_dim <= INT32_MAX && num_tokens * q_size <= INT32_MAX &&
+                  num_tokens * kv_size <= INT32_MAX,
+              "minimax_allreduce_rms_qk: tensor sizes exceed the kernel's "
+              "32-bit limits");
+  int elem_bytes = qkv.element_size();
+  torch::Tensor q_out = torch::empty({num_tokens, q_size}, qkv.options());
+  torch::Tensor k_out = torch::empty({num_tokens, kv_size}, qkv.options());
+  auto params = vllm::tensorrt_llm::MiniMaxReduceRMSParams();
+  params.nranks = static_cast<int>(nranks);
+  params.rank = static_cast<int>(rank);
+  params.dtype = qkv.scalar_type();
+  params.size_q = static_cast<int>(num_tokens * q_size);
+  params.hidden_dim = static_cast<int>(q_size);
+  params.size_k = static_cast<int>(num_tokens * kv_size);
+  params.hidden_dim_k = static_cast<int>(kv_size);
+  // Q and K are read in place from the packed tensor, so both inputs advance
+  // by a full qkv row per token.
+  params.stride_q = static_cast<int>(qkv_dim);
+  params.stride_k = static_cast<int>(qkv_dim);
+  params.stride_q_out = 0;  // q_out is contiguous; kernel uses hidden_dim
+  params.stride_k_out = 0;  // k_out is contiguous; kernel uses hidden_dim_k
+  params.workspace = reinterpret_cast<void**>(workspace.mutable_data_ptr());
+  uint8_t* base = static_cast<uint8_t*>(qkv.data_ptr());
+  params.allreduce_in = base;
+  // K starts q_size elements into each row; the offset is applied in bytes.
+  params.allreduce_in_k = base + q_size * elem_bytes;
+  params.rms_gamma = norm_weight_q.data_ptr();
+  params.rms_gamma_k = norm_weight_k.data_ptr();
+  params.rms_eps = static_cast<float>(eps);
+  params.stream = at::cuda::getCurrentCUDAStream(qkv.get_device());
+  params.rms_norm_out = q_out.mutable_data_ptr();
+  params.rms_norm_out_k = k_out.mutable_data_ptr();
+  vllm::tensorrt_llm::minimax_reduce_rms_op(params);
+  return {q_out, k_out};
+}
--- a/csrc/minimax_reduce_rms_kernel.h
+++ b/csrc/minimax_reduce_rms_kernel.h
+/*
+ * Copyright (c) 2026, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include <cuda_bf16.h>
+#include <cuda_fp16.h>
+#include <torch/types.h>
+namespace vllm {
+namespace tensorrt_llm {
+// Describes one vectorized memory access for an element type: `vec_type` is
+// the vector type moved per access and `value` is how many DType elements
+// fit in it. The primary template is only declared, so using a dtype that has
+// no specialization below fails to compile.
+template <typename DType>
+struct ElemsPerAccess;
+// half: 8 elements per float4.
+template <>
+struct ElemsPerAccess<half> {
+  using vec_type = float4;
+  static constexpr int value = sizeof(vec_type) / sizeof(half);
+};
+// bfloat16: 8 elements per float4.
+template <>
+struct ElemsPerAccess<nv_bfloat16> {
+  using vec_type = float4;
+  static constexpr int value = sizeof(vec_type) / sizeof(nv_bfloat16);
+};
+// float: 4 elements per float4.
+template <>
+struct ElemsPerAccess<float> {
+  using vec_type = float4;
+  static constexpr int value = sizeof(vec_type) / sizeof(float);
+};
+// Shorthand for ElemsPerAccess<DType>::value.
+template <typename DType>
+static constexpr int kElemsPerAccess = ElemsPerAccess<DType>::value;
+// Argument bundle for minimax_reduce_rms_op. All fields are value-initialized,
+// so anything a caller does not set is 0 / nullptr / Undefined. The *_k
+// fields describe the optional K input; a null allreduce_in_k means the op is
+// run without one.
+struct MiniMaxReduceRMSParams {
+  int nranks{};  // number of participating ranks
+  int rank{};    // index of this rank
+  at::ScalarType dtype{at::ScalarType::Undefined};  // element type of the data
+  int size_q{};        // total number of q elements (rows * hidden_dim)
+  int hidden_dim{};    // per-row width of q on this rank (elements)
+  int size_k{};        // total number of k elements (rows * hidden_dim_k)
+  int hidden_dim_k{};  // per-row width of k on this rank (elements)
+  int stride_q{};  // row stride for q input (elements); when > hidden_dim,
+                   // q is part of a wider qkv tensor
+  int stride_k{};  // row stride for k input (elements); when > hidden_dim_k,
+                   // k is part of a wider qkv tensor
+  int stride_q_out{};  // row stride for q output (elements); 0 = contiguous
+  int stride_k_out{};  // row stride for k output (elements); 0 = contiguous
+  void** workspace{};      // workspace pointer table taken from the workspace
+                           // tensor's storage
+  void* allreduce_in{};    // q input
+  void* rms_norm_out{};    // q output
+  void* rms_gamma{};       // norm weight applied to q
+  void* allreduce_in_k{};  // k input; nullptr when there is no k
+  void* rms_norm_out_k{};  // k output
+  void* rms_gamma_k{};     // norm weight applied to k
+  float rms_eps{};         // RMS-norm epsilon
+  cudaStream_t stream{};   // stream the kernel is launched on
+};
+// Launches the reduce + RMS-norm kernel described by `params` on
+// params.stream. Supported rank counts are 2, 4, 8 and 16 and supported
+// dtypes are Half, BFloat16 and Float; anything else fails a TORCH_CHECK.
+void minimax_reduce_rms_op(MiniMaxReduceRMSParams const& params);
+}  // namespace tensorrt_llm
+}  // namespace vllm
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -392,3 +392,15 @@ int64_t qr_max_size();
 void dsv3_fused_a_gemm(torch::Tensor& output, torch::Tensor const& mat_a,
                       torch::Tensor const& mat_b);
 #endif
+#ifndef USE_ROCM
+// Runs the MiniMax reduce + RMS-norm op on `input` and returns a new tensor
+// shaped like `input`. `workspace` supplies the kernel's workspace pointer
+// table; `rank` / `nranks` identify this participant.
+torch::Tensor minimax_allreduce_rms(torch::Tensor const& input,
+                                    torch::Tensor const& norm_weight,
+                                    torch::Tensor workspace, int64_t const rank,
+                                    int64_t const nranks, double const eps);
+// Q/K variant operating on a packed 2D, contiguous `qkv` tensor whose last
+// dim equals q_size + 2 * kv_size. Returns {q_out, k_out} with shapes
+// {num_tokens, q_size} and {num_tokens, kv_size}.
+std::tuple<torch::Tensor, torch::Tensor> minimax_allreduce_rms_qk(
+    torch::Tensor qkv, torch::Tensor const& norm_weight_q,
+    torch::Tensor const& norm_weight_k, torch::Tensor workspace,
+    int64_t const q_size, int64_t const kv_size, int64_t const rank,
+    int64_t const nranks, double const eps);
+#endif
\ No newline at end of file
--- a/csrc/torch_bindings.cpp
+++ b/csrc/torch_bindings.cpp
@@ -668,6 +668,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
      "Tensor? b_qzeros, "
      "SymInt n, SymInt group_size, SymInt sm_count, SymInt sm_version, SymInt "
      "CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) -> Tensor");
+  ops.def(
+      "minimax_allreduce_rms("
+      "Tensor input,"
+      "Tensor norm_weight,"
+      "Tensor workspace,"
+      "int rank,"
+      "int nranks,"
+      "float eps) -> Tensor");
+  ops.impl("minimax_allreduce_rms", torch::kCUDA, &minimax_allreduce_rms);
+  ops.def(
+      "minimax_allreduce_rms_qk("
+      "Tensor qkv,"
+      "Tensor norm_weight_q,"
+      "Tensor norm_weight_k,"
+      "Tensor workspace,"
+      "int q_size,"
+      "int kv_size,"
+      "int rank,"
+      "int nranks,"
+      "float eps) -> (Tensor, Tensor)");
+  ops.impl("minimax_allreduce_rms_qk", torch::kCUDA, &minimax_allreduce_rms_qk);
  //  conditionally compiled so impl in source file
 #endif
 }

--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -649,7 +649,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    else \
        BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
    fi; \
-    uv pip install --system accelerate hf_transfer modelscope \
+    uv pip install --system accelerate modelscope \
        "bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs,azure]${RUNAI_MODEL_STREAMER_VERSION}"
 # ============================================================
@@ -772,9 +772,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -e tests/vllm_test_utils
 # enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
+ENV HF_XET_HIGH_PERFORMANCE 1
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER 1
+# increase timeout for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT 60
 # Copy in the v1 package for testing (it isn't distributed yet)
 COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1

--- a/docker/Dockerfile.cpu
+++ b/docker/Dockerfile.cpu
@@ -140,9 +140,11 @@ RUN \
    esac; \
    }; \
    remove_packages_not_supported_on_aarch64 && \
-    sed -i 's/^torch==.*/torch==2.10.0/g' requirements/cpu-test.in && \
+    sed -i 's/^torch==.*/torch==2.11.0/g' requirements/cpu-test.in && \
    sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
    sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
+    # Related issue: https://github.com/vllm-project/vllm/pull/38800#issuecomment-4228314305
+    sed -i 's/^sentence-transformers.*/sentence-transformers==5.3.0/g' requirements/cpu-test.in && \
    uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -195,6 +197,12 @@ ADD ./.buildkite/ ./.buildkite/
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install -e tests/vllm_test_utils
+# enable fast downloads from hf (for testing)
+ENV HF_XET_HIGH_PERFORMANCE 1
+# increase timeout for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT 60
 ######################### RELEASE IMAGE #########################
 FROM base AS vllm-openai

--- a/docker/Dockerfile.nightly_torch
+++ b/docker/Dockerfile.nightly_torch
@@ -269,9 +269,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -e tests/vllm_test_utils
 # enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
+ENV HF_XET_HIGH_PERFORMANCE 1
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER 1
+# increase timeout for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT 60
 RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r requirements/nightly_torch_test.txt

--- a/docker/Dockerfile.rocm
+++ b/docker/Dockerfile.rocm
@@ -364,9 +364,10 @@ RUN cd /vllm-workspace \
    && python3 -m pip install pytest-shard
 # enable fast downloads from hf (for testing)
-RUN --mount=type=cache,target=/root/.cache/uv \
+ENV HF_XET_HIGH_PERFORMANCE=1
-    uv pip install --system hf_transfer
-ENV HF_HUB_ENABLE_HF_TRANSFER=1
+# increase timeout for hf downloads (for testing)
+ENV HF_HUB_DOWNLOAD_TIMEOUT 60
 # install audio decode package `torchcodec` from source (required due to 
 # ROCm and torch version mismatch) for tests with datasets package

--- a/docs/getting_started/installation/gpu.rocm.inc.md
+++ b/docs/getting_started/installation/gpu.rocm.inc.md
@@ -147,7 +147,7 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.15.0/rocm700
        # Install dependencies
        pip install --upgrade numba \
            scipy \
-            huggingface-hub[cli,hf_transfer] \
+            huggingface-hub[cli] \
            setuptools_scm
        pip install -r requirements/rocm.txt

--- a/examples/tool_chat_template_gemma4.jinja
+++ b/examples/tool_chat_template_gemma4.jinja
+{%- macro format_parameters(properties, required) -%}
+    {#- Renders every entry of `properties` (iterated in key order via dictsort) as `key:{...}`,
+        comma-separated. Entries whose key is listed in `standard_keys` are skipped.
+        Inside each entry the optional pieces come first (description, nullable, then enum for
+        STRING, nested properties/required for OBJECT, items for ARRAY) and `type` is always last.
+        `add_comma` tracks whether a separator is needed before the next piece.
+        NOTE(review): the `required` argument is not referenced anywhere in this macro body.
+        NOTE(review): the OBJECT and ARRAY branches emit their leading comma unconditionally and
+        do not update `add_comma` - confirm this matches the expected prompt format. -#}
+    {%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
+    {%- set ns = namespace(found_first=false) -%}
+    {%- for key, value in properties | dictsort -%}
+        {%- set add_comma = false -%}
+        {%- if key not in standard_keys -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {{ key }}:{
+            {%- if value['description'] -%}
+                description:<|"|>{{ value['description'] }}<|"|>
+                {%- set add_comma = true -%}
+            {%- endif -%}
+            {%- if value['nullable'] %}
+                {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                nullable:true
+            {%- endif -%}
+            {%- if value['type'] | upper == 'STRING' -%}
+                {%- if value['enum'] -%}
+                    {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+                    enum:{{ format_argument(value['enum']) }}
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'OBJECT' -%}
+                ,properties:{
+                {%- if value['properties'] is defined and value['properties'] is mapping -%}
+                    {{- format_parameters(value['properties'], value['required'] | default([])) -}}
+                {%- elif value is mapping -%}
+                    {{- format_parameters(value, value['required'] | default([])) -}}
+                {%- endif -%}
+                }
+                {%- if value['required'] -%}
+                    ,required:[
+                    {%- for item in value['required'] | default([]) -%}
+                        <|"|>{{- item -}}<|"|>
+                        {%- if not loop.last %},{% endif -%}
+                    {%- endfor -%}
+                    ]
+                {%- endif -%}
+            {%- elif value['type'] | upper == 'ARRAY' -%}
+                {%- if value['items'] is mapping and value['items'] -%}
+                    ,items:{
+                    {%- set ns_items = namespace(found_first=false) -%}
+                    {%- for item_key, item_value in value['items'] | dictsort -%}
+                        {%- if item_value is not none -%}
+                            {%- if ns_items.found_first %},{% endif -%}
+                            {%- set ns_items.found_first = true -%}
+                            {%- if item_key == 'properties' -%}
+                                properties:{
+                                {%- if item_value is mapping -%}
+                                    {{- format_parameters(item_value, value['items']['required'] | default([])) -}}
+                                {%- endif -%}
+                                }
+                            {%- elif item_key == 'required' -%}
+                                required:[
+                                {%- for req_item in item_value -%}
+                                    <|"|>{{- req_item -}}<|"|>
+                                    {%- if not loop.last %},{% endif -%}
+                                {%- endfor -%}
+                                ]
+                            {%- elif item_key == 'type' -%}
+                                {%- if item_value is string -%}
+                                    type:{{ format_argument(item_value | upper) }}
+                                {%- else -%}
+                                    type:{{ format_argument(item_value | map('upper') | list) }}
+                                {%- endif -%}
+                            {%- else -%}
+                                {{ item_key }}:{{ format_argument(item_value) }}
+                            {%- endif -%}
+                        {%- endif -%}
+                    {%- endfor -%}
+                    }
+                {%- endif -%}
+            {%- endif -%}
+            {%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
+            type:<|"|>{{ value['type'] | upper }}<|"|>}
+        {%- endif -%}
+    {%- endfor -%}
+{%- endmacro -%}
+{%- macro format_function_declaration(tool_data) -%}
+    {#- Renders one tool as `declaration:NAME{description:...,parameters:{...},response:{...}}`,
+        reading name, description, parameters and the optional response from tool_data['function'].
+        The parameters section lists properties (via format_parameters), then required, then type.
+        NOTE(review): the closing brace of the parameters and response sections is emitted only
+        inside their respective `type` branches - confirm inputs always carry a type. -#}
+    declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
+    {%- set params = tool_data['function']['parameters'] -%}
+    {%- if params -%}
+        ,parameters:{
+        {%- if params['properties'] -%}
+            properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
+        {%- endif -%}
+        {%- if params['required'] -%}
+            required:[
+            {%- for item in params['required'] -%}
+                <|"|>{{- item -}}<|"|>
+                {{- ',' if not loop.last -}}
+            {%- endfor -%}
+            ],
+        {%- endif -%}
+        {%- if params['type'] -%}
+            type:<|"|>{{- params['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    {%- if 'response' in tool_data['function'] -%}
+        {%- set response_declaration = tool_data['function']['response'] -%}
+        ,response:{
+        {%- if response_declaration['description'] -%}
+            description:<|"|>{{- response_declaration['description'] -}}<|"|>,
+        {%- endif -%}
+        {%- if response_declaration['type'] | upper == 'OBJECT' -%}
+            type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
+        {%- endif -%}
+    {%- endif -%}
+    }
+{%- endmacro -%}
+{%- macro format_argument(argument, escape_keys=True) -%}
+    {#- Serializes a value recursively: strings are wrapped in <|"|> markers, booleans become
+        true/false, mappings become {key:value,...} in key order (keys are wrapped in <|"|>
+        only when escape_keys is true), other sequences become [item,...], and anything else
+        is emitted as-is. The string test comes before the sequence test so that strings are
+        not expanded element by element. -#}
+    {%- if argument is string -%}
+        {{- '<|"|>' + argument + '<|"|>' -}}
+    {%- elif argument is boolean -%}
+        {{- 'true' if argument else 'false' -}}
+    {%- elif argument is mapping -%}
+        {{- '{' -}}
+        {%- set ns = namespace(found_first=false) -%}
+        {%- for key, value in argument | dictsort -%}
+            {%- if ns.found_first %},{% endif -%}
+            {%- set ns.found_first = true -%}
+            {%- if escape_keys -%}
+                {{- '<|"|>' + key + '<|"|>' -}}
+            {%- else -%}
+                {{- key -}}
+            {%- endif -%}
+            :{{- format_argument(value, escape_keys=escape_keys) -}}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- elif argument is sequence -%}
+        {{- '[' -}}
+        {%- for item in argument -%}
+            {{- format_argument(item, escape_keys=escape_keys) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- ']' -}}
+    {%- else -%}
+        {{- argument -}}
+    {%- endif -%}
+{%- endmacro -%}
+{%- macro strip_thinking(text) -%}
+    {#- Removes `<|channel>...<channel|>` spans from `text`: the text is split on the closing
+        marker, each piece is cut at its first opening marker (if any), the remaining pieces are
+        concatenated, and the result is trimmed. -#}
+    {%- set ns = namespace(result='') -%}
+    {%- for part in text.split('<channel|>') -%}
+        {%- if '<|channel>' in part -%}
+            {%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
+        {%- else -%}
+            {%- set ns.result = ns.result + part -%}
+        {%- endif -%}
+    {%- endfor -%}
+    {{- ns.result | trim -}}
+{%- endmacro -%}
+{%- macro format_tool_response_block(tool_name, response) -%}
+    {#- Emits one `<|tool_response>response:NAME{...}<tool_response|>` block. A mapping response
+        is written as key:value pairs in key order; any other response is written under the
+        single key `value`. Values go through format_argument with unescaped keys. -#}
+    {{- '<|tool_response>' -}}
+    {%- if response is mapping -%}
+        {{- 'response:' + tool_name + '{' -}}
+        {%- for key, value in response | dictsort -%}
+            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+            {%- if not loop.last %},{% endif -%}
+        {%- endfor -%}
+        {{- '}' -}}
+    {%- else -%}
+        {{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
+    {%- endif -%}
+    {{- '<tool_response|>' -}}
+{%- endmacro -%}
+{#- Template prologue. `ns.prev_message_type` records the kind of the most recently emitted
+    piece and `loop_messages` is the message list that the main loop renders (the leading
+    system/developer message is removed from it once rendered here).
+    After bos_token, a system turn is emitted when thinking is enabled, tools are supplied, or
+    the first message has role system/developer; it holds the <|think|> marker, that message's
+    trimmed content and one <|tool>...<tool|> declaration per tool, in that order. -#}
+{%- set ns = namespace(prev_message_type=None) -%}
+{%- set loop_messages = messages -%}
+{{ bos_token }}
+{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
+    {{- '<|turn>system\n' -}}
+    {%- if enable_thinking is defined and enable_thinking -%}
+        {{- '<|think|>' -}}
+        {%- set ns.prev_message_type = 'think' -%}
+    {%- endif -%}
+    {%- if messages[0]['role'] in ['system', 'developer'] -%}
+        {{- messages[0]['content'] | trim -}}
+        {%- set loop_messages = messages[1:] -%}
+    {%- endif -%}
+    {%- if tools -%}
+        {%- for tool in tools %}
+            {{- '<|tool>' -}}
+            {{- format_function_declaration(tool) | trim -}}
+            {{- '<tool|>' -}}
+        {%- endfor %}
+        {%- set ns.prev_message_type = 'tool' -%}
+    {%- endif -%}
+    {{- '<turn|>\n' -}}
+{%- endif %}
+{#- Record the index within loop_messages of the last message whose role is 'user';
+    the value stays -1 when there is no such message. -#}
+{%- set ns_turn = namespace(last_user_idx=-1) -%}
+{%- for i in range(loop_messages | length) -%}
+    {%- if loop_messages[i]['role'] == 'user' -%}
+        {%- set ns_turn.last_user_idx = i -%}
+    {%- endif -%}
+{%- endfor -%}
+{#- Render every message whose role is not 'tool'. Messages with role 'tool' are never rendered
+    on their own: they are emitted as tool-response blocks while rendering the preceding
+    message that carries tool_calls. The role 'assistant' is rendered as 'model'. -#}
+{%- for message in loop_messages -%}
+    {%- if message['role'] != 'tool' -%}
+    {%- set ns.prev_message_type = None -%}
+    {%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
+    {#- OpenAI may emit multiple assistant messages in one tool loop (user → asst → tool → asst → tool).
+        Only the first of those should open <|turn>model; later ones continue the same model turn. -#}
+    {%- set prev_nt = namespace(role=None, found=false) -%}
+    {%- if loop.index0 > 0 -%}
+        {%- for j in range(loop.index0 - 1, -1, -1) -%}
+            {%- if not prev_nt.found -%}
+                {%- if loop_messages[j]['role'] != 'tool' -%}
+                    {%- set prev_nt.role = loop_messages[j]['role'] -%}
+                    {%- set prev_nt.found = true -%}
+                {%- endif -%}
+            {%- endif -%}
+        {%- endfor -%}
+    {%- endif -%}
+    {%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
+    {%- if not continue_same_model_turn -%}
+        {{- '<|turn>' + role + '\n' }}
+    {%- endif -%}
+    {#- Reasoning is re-emitted only for messages that come after the last user message and
+        also carry tool_calls. -#}
+    {%- if message.get('reasoning') and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
+        {{- '<|channel>thought\n' + message['reasoning'] + '\n<channel|>'}}
+    {%- endif -%}
+            {%- if message['tool_calls'] -%}
+                {%- for tool_call in message['tool_calls'] -%}
+                    {%- set function = tool_call['function'] -%}
+                    {{- '<|tool_call>call:' + function['name'] + '{' -}}
+                    {%- if function['arguments'] is mapping -%}
+                        {%- set ns_args = namespace(found_first=false) -%}
+                        {%- for key, value in function['arguments'] | dictsort -%}
+                            {%- if ns_args.found_first %},{% endif -%}
+                            {%- set ns_args.found_first = true -%}
+                            {{- key -}}:{{- format_argument(value, escape_keys=False) -}}
+                        {%- endfor -%}
+                    {%- elif function['arguments'] is string -%}
+                        {{- function['arguments'] -}}
+                    {%- endif -%}
+                    {{- '}<tool_call|>' -}}
+                {%- endfor -%}
+                {%- set ns.prev_message_type = 'tool_call' -%}
+            {%- endif -%}
+            {%- set ns_tr_out = namespace(flag=false) -%}
+            {%- if message.get('tool_responses') -%}
+                {#- Legacy: tool_responses embedded on the assistant message -#}
+                {%- for tool_response in message['tool_responses'] -%}
+                    {{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
+                    {%- set ns_tr_out.flag = true -%}
+                    {%- set ns.prev_message_type = 'tool_response' -%}
+                {%- endfor -%}
+            {%- elif message.get('tool_calls') -%}
+                {#- OpenAI Chat Completions: consecutive following messages with role "tool" (no break/continue; range scan) -#}
+                {%- set ns_tool_scan = namespace(stopped=false) -%}
+                {%- for k in range(loop.index0 + 1, loop_messages | length) -%}
+                    {%- if ns_tool_scan.stopped -%}
+                    {%- elif loop_messages[k]['role'] != 'tool' -%}
+                        {%- set ns_tool_scan.stopped = true -%}
+                    {%- else -%}
+                        {%- set follow = loop_messages[k] -%}
+                        {%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
+                        {%- for tc in message['tool_calls'] -%}
+                            {%- if tc.get('id') == follow.get('tool_call_id') -%}
+                                {%- set ns_tname.name = tc['function']['name'] -%}
+                            {%- endif -%}
+                        {%- endfor -%}
+                        {%- set tool_body = follow.get('content') -%}
+                        {%- if tool_body is string -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- elif tool_body is sequence and tool_body is not string -%}
+                            {%- set ns_txt = namespace(s='') -%}
+                            {%- for part in tool_body -%}
+                                {%- if part.get('type') == 'text' -%}
+                                    {%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
+                                {%- endif -%}
+                            {%- endfor -%}
+                            {{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
+                        {%- else -%}
+                            {{- format_tool_response_block(ns_tname.name, tool_body) -}}
+                        {%- endif -%}
+                        {%- set ns_tr_out.flag = true -%}
+                        {%- set ns.prev_message_type = 'tool_response' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+            {%- if message['content'] is string -%}
+                {%- if role == 'model' -%}
+                    {{- strip_thinking(message['content']) -}}
+                {%- else -%}
+                    {{- message['content'] | trim -}}
+                {%- endif -%}
+            {%- elif message['content'] is sequence -%}
+                {%- for item in message['content'] -%}
+                    {%- if item['type'] == 'text' -%}
+                        {%- if role == 'model' -%}
+                            {{- strip_thinking(item['text']) -}}
+                        {%- else -%}
+                            {{- item['text'] | trim -}}
+                        {%- endif -%}
+                    {%- elif item['type'] == 'image' -%}
+                        {{- '\n\n<|image|>\n\n' -}}
+                        {%- set ns.prev_message_type = 'image' -%}
+                    {%- elif item['type'] == 'audio' -%}
+                        {{- '<|audio|>' -}}
+                        {%- set ns.prev_message_type = 'audio' -%}
+                    {%- elif item['type'] == 'video' -%}
+                        {{- '\n\n<|video|>\n\n' -}}
+                        {%- set ns.prev_message_type = 'video' -%}
+                    {%- endif -%}
+                {%- endfor -%}
+            {%- endif -%}
+        {#- Close the turn, except when tool responses were emitted and the message has no content. -#}
+        {%- if not (ns_tr_out.flag and not message.get('content')) -%}
+            {{- '<turn|>\n' -}}
+        {%- endif -%}
+    {%- endif -%}
+{%- endfor -%}
+{#- Generation prompt: open a model turn unless the last emitted piece was a tool response,
+    and emit an empty thought channel when enable_thinking is unset or false. -#}
+{%- if add_generation_prompt -%}
+    {%- if ns.prev_message_type != 'tool_response' -%}
+        {{- '<|turn>model\n' -}}
+    {%- endif -%}
+    {%- if not enable_thinking | default(false) -%}
+        {{- '<|channel>thought\n<channel|>' -}}
+    {%- endif -%}
+{%- endif -%}
--- a/requirements/common.txt
+++ b/requirements/common.txt
@@ -7,7 +7,7 @@ requests >= 2.26.0
 tqdm
 blake3
 py-cpuinfo
-transformers >= 4.56.0, < 5
+transformers >= 4.56.0, != 5.0.*, != 5.1.*, != 5.2.*, != 5.3.*, != 5.4.*, != 5.5.0
 tokenizers >= 0.21.1  # Required for fast incremental detokenization.
 protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
 fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
@@ -37,7 +37,7 @@ pyyaml
 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
 einops # Required for Qwen2-VL.
-compressed-tensors == 0.14.0.1 # required for compressed-tensors
+compressed-tensors == 0.15.0.1 # required for compressed-tensors
 depyf==0.20.0 # required for profiling and debugging with compilation config
 cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
 watchfiles # required for http server to monitor the updates of TLS files

--- a/requirements/kv_connectors.txt
+++ b/requirements/kv_connectors.txt
 lmcache >= 0.3.9
 nixl >= 0.7.1, < 0.10.0 # Required for disaggregated prefill
+nixl-cu12 >= 0.7.1, < 0.10.0
+nixl-cu13 >= 0.7.1, < 0.10.0
 mooncake-transfer-engine >= 0.3.8
--- a/requirements/nightly_torch_test.txt
+++ b/requirements/nightly_torch_test.txt
@@ -29,8 +29,8 @@ opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
-transformers==4.57.5
+transformers==5.5.3
-tokenizers==0.22.0
+tokenizers==0.22.2
 schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
 bitsandbytes>=0.49.2

--- a/requirements/rocm-test.in
+++ b/requirements/rocm-test.in
@@ -36,8 +36,8 @@ opencv-python-headless>=4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
-transformers==4.57.5
+transformers==5.5.3
-tokenizers==0.22.0
+tokenizers==0.22.2
 schemathesis>=3.39.15 # Required for openai schema test
 # quantization
 bitsandbytes==0.49.2
@@ -80,4 +80,3 @@ plotly # required for perf comparison html report
 rapidfuzz
 torchgeo==0.7.0
 multiprocess==0.70.16
-huggingface-hub==0.36.2
--- a/requirements/rocm-test.txt
+++ b/requirements/rocm-test.txt
@@ -232,7 +232,6 @@ filelock==3.25.2
    #   python-discovery
    #   ray
    #   torch
-    #   transformers
    #   virtualenv
 fiona==1.10.1
    # via torchgeo
@@ -318,7 +317,7 @@ h5py==3.16.0
    # via terratorch
 harfile==0.4.0
    # via schemathesis
-hf-xet==1.4.2
+hf-xet==1.4.3
    # via huggingface-hub
 hiredis==3.3.1
    # via tensorizer
@@ -332,11 +331,11 @@ httpx==0.27.2
    # via
    #   -r requirements/rocm-test.in
    #   diffusers
+    #   huggingface-hub
    #   perceptron
    #   schemathesis
-huggingface-hub==0.36.2
+huggingface-hub==1.10.2
    # via
-    #   -r requirements/rocm-test.in
    #   accelerate
    #   datasets
    #   diffusers
@@ -970,7 +969,6 @@ requests==2.32.5
    #   google-api-core
    #   google-cloud-storage
    #   gpt-oss
-    #   huggingface-hub
    #   lightly
    #   lm-eval
    #   mistral-common
@@ -983,7 +981,6 @@ requests==2.32.5
    #   starlette-testclient
    #   tacoreader
    #   tiktoken
-    #   transformers
    #   wandb
 resampy==0.4.3
    # via -r requirements/rocm-test.in
@@ -1191,7 +1188,7 @@ timm==1.0.17
    #   segmentation-models-pytorch
    #   terratorch
    #   torchgeo
-tokenizers==0.22.0
+tokenizers==0.22.2
    # via
    #   -c requirements/common.txt
    #   -r requirements/rocm-test.in
@@ -1230,7 +1227,7 @@ tqdm==4.67.3
    #   tacoreader
    #   terratorch
    #   transformers
-transformers==4.57.5
+transformers==5.5.3
    # via
    #   -c requirements/common.txt
    #   -r requirements/rocm-test.in
@@ -1252,7 +1249,9 @@ typepy==1.3.4
 typer==0.24.1
    # via
    #   fastsafetensors
+    #   huggingface-hub
    #   perceptron
+    #   transformers
 typeshed-client==2.9.0
    # via jsonargparse
 typing-extensions==4.15.0

--- a/requirements/test.in
+++ b/requirements/test.in
@@ -18,7 +18,7 @@ httpx
 librosa # required for audio tests
 vector_quantize_pytorch # required for minicpmo_26 test
 vocos # required for minicpmo_26 test
-peft>=0.15.0 # required for phi-4-mm test
+peft>=0.18.1 # required for phi-4-mm test
 pqdm
 ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
 resampy # required for audio tests
@@ -39,8 +39,8 @@ opencv-python-headless >= 4.13.0 # required for video test
 datamodel_code_generator # required for minicpm3 test
 lm-eval[api]>=0.4.11 # required for model evaluation test
 mteb[bm25s]>=2, <3 # required for mteb test
-transformers==4.57.5
+transformers==5.5.3
-tokenizers==0.22.0
+tokenizers==0.22.2
 schemathesis>=3.39.15 # Required for openai schema test.
 # quantization
 bitsandbytes==0.49.2