Commit fc67613a authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.19.1' into v0.19.0

parents 31aec25b b1388b1f
...@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image" ...@@ -16,5 +16,5 @@ echo "--- :docker: Building Docker image"
docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu . docker build --progress plain --tag "$IMAGE_NAME" --target vllm-test -f docker/Dockerfile.cpu .
# Run the image, setting --shm-size=4g for tensor parallel. # Run the image, setting --shm-size=4g for tensor parallel.
docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 --shm-size=4g "$IMAGE_NAME" \ docker run --rm --cpuset-cpus="$CORE_RANGE" --cpuset-mems="$NUMA_NODE" -v ~/.cache/huggingface:/root/.cache/huggingface --privileged=true -e HF_TOKEN -e VLLM_CPU_KVCACHE_SPACE=16 -e VLLM_CPU_CI_ENV=1 -e VLLM_CPU_SIM_MULTI_NUMA=1 -e VLLM_CPU_ATTN_SPLIT_KV=0 --shm-size=4g "$IMAGE_NAME" \
timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}" timeout "$TIMEOUT_VAL" bash -c "set -euox pipefail; echo \"--- Print packages\"; pip list; echo \"--- Running tests\"; ${TEST_COMMAND}"
...@@ -10,7 +10,20 @@ steps: ...@@ -10,7 +10,20 @@ steps:
- tests/kernels/test_top_k_per_row.py - tests/kernels/test_top_k_per_row.py
- tests/kernels/test_concat_mla_q.py - tests/kernels/test_concat_mla_q.py
commands: commands:
- pytest -v -s kernels/core kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py - pytest -v -s kernels/core --ignore=kernels/core/test_minimax_reduce_rms.py kernels/test_top_k_per_row.py kernels/test_concat_mla_q.py
- label: Kernels MiniMax Reduce RMS Test (2 GPUs)
timeout_in_minutes: 15
num_devices: 2
device: h100
source_file_dependencies:
- csrc/minimax_reduce_rms_kernel.cu
- csrc/minimax_reduce_rms_kernel.h
- vllm/model_executor/layers/mamba/linear_attn.py
- vllm/model_executor/layers/mamba/lamport_workspace.py
- tests/kernels/core/test_minimax_reduce_rms.py
commands:
- pytest -v -s kernels/core/test_minimax_reduce_rms.py
- label: Kernels Attention Test %N - label: Kernels Attention Test %N
timeout_in_minutes: 35 timeout_in_minutes: 35
......
...@@ -69,3 +69,18 @@ steps: ...@@ -69,3 +69,18 @@ steps:
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl - python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock # Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper - VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
- label: Transformers Backward Compatibility Models Test
working_dir: "/vllm-workspace/"
optional: true
soft_fail: true
commands:
- pip install transformers==4.57.5
- pytest -v -s tests/models/test_initialization.py
- pytest -v -s tests/models/test_transformers.py
- pytest -v -s tests/models/multimodal/processing/
- pytest -v -s tests/models/multimodal/test_mapping.py
- python3 examples/offline_inference/basic/chat.py
- python3 examples/offline_inference/vision_language.py --model-type qwen2_5_vl
# Whisper needs spawn method to avoid deadlock
- VLLM_WORKER_MULTIPROC_METHOD=spawn python3 examples/offline_inference/audio_language.py --model-type whisper
...@@ -306,6 +306,8 @@ set(VLLM_EXT_SRC ...@@ -306,6 +306,8 @@ set(VLLM_EXT_SRC
"csrc/torch_bindings.cpp") "csrc/torch_bindings.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA") if(VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_EXT_SRC "csrc/minimax_reduce_rms_kernel.cu")
SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library") SET(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
# Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building. # Set CUTLASS_REVISION. Used for FetchContent. Also fixes some bogus messages when building.
......
/*
* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <cooperative_groups.h>
#include <cuda_runtime.h>
#include <torch/cuda.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include "cuda_compat.h"
#include "cuda_utils.h"
#include "core/registration.h"
#include "minimax_reduce_rms_kernel.h"
#include <algorithm>
#define FINAL_MASK 0xffffffff
#define MINIMAX_REDUCE_RMS_WARP_SIZE 32
namespace vllm {
namespace tensorrt_llm {
// Per-kernel view of the communication workspace shared by NRanks ranks.
//
// Workspace layout as read by this struct:
//   workspace[2 * NRanks + r] : base of rank r's communication buffer
//   workspace[3 * NRanks]     : int control words ([0] = block arrival
//                               counter, [2] = rotating flag)
//   workspace[3 * NRanks + 1] : int64 words ([0] = size to clear,
//                               [1] = byte size of one buffer region)
// The flag selects which of three regions of each buffer holds this call's
// data (flag % 3) and which region this call must reset ((flag + 2) % 3).
template <int NRanks>
struct LamportComm {
  // Reads the control words, derives the data/clear regions, then (after a
  // block-wide barrier) lets thread 0 of every block bump the arrival counter.
  __device__ __forceinline__ LamportComm(void** workspace, int rank) {
    int* const control_words = reinterpret_cast<int*>(workspace[NRanks * 3]);
    int64_t* const size_words =
        reinterpret_cast<int64_t*>(workspace[NRanks * 3 + 1]);

    counter_ptr = control_words;
    flag_ptr = control_words + 2;
    clear_ptr = size_words;

    flag_value = *flag_ptr;
    const int64_t region_bytes = size_words[1];
    clear_size = *clear_ptr;

    const int data_region = flag_value % 3;
    const int clear_region = (flag_value + 2) % 3;

    for (int peer = 0; peer < NRanks; ++peer) {
      uint8_t* const peer_base =
          reinterpret_cast<uint8_t*>(workspace[2 * NRanks + peer]);
      data_bufs[peer] = peer_base + data_region * region_bytes;
    }
    uint8_t* const own_base =
        reinterpret_cast<uint8_t*>(workspace[2 * NRanks + rank]);
    clear_buf = own_base + clear_region * region_bytes;

    __syncthreads();
    if (threadIdx.x == 0) {
      atomicAdd(counter_ptr, 1);
    }
  }

  // Executed by every thread, but only thread 0 of block 0 acts: it spins
  // until every block of the grid has registered in the counter, then
  // advances the flag, records the size the next call must clear, and resets
  // the counter.
  __device__ __forceinline__ void update(int64_t new_clear_size) {
    const bool is_grid_leader = (blockIdx.x == 0) && (threadIdx.x == 0);
    if (!is_grid_leader) {
      return;
    }
    int volatile* const live_counter = counter_ptr;
    while (*live_counter != gridDim.x) {
      // busy-wait: the counter is re-read from memory on every pass
    }
    *flag_ptr = (flag_value + 1) % 3;
    *clear_ptr = new_clear_size;
    *counter_ptr = 0;
  }

  int* counter_ptr;            // arrival counter (control word 0)
  int* flag_ptr;               // rotating region flag (control word 2)
  int64_t* clear_ptr;          // size to clear, written for the next call
  uint8_t* data_bufs[NRanks];  // current data region of every rank's buffer
  uint8_t* clear_buf;          // region of this rank's buffer to reset
  int64_t clear_size;          // value of *clear_ptr at construction
  int flag_value;              // value of *flag_ptr at construction
};
// True when the bit pattern of `v` is exactly 0x80000000 (i.e. -0.0f), the
// sentinel that marks a communication slot as not yet written.
__device__ __forceinline__ bool is_neg_zero(float v) {
  return __float_as_uint(v) == 0x80000000u;
}
// True when at least one of the four lanes of `v` is the -0.0f sentinel.
__device__ __forceinline__ bool is_neg_zero(float4 v) {
  const bool low_pair = is_neg_zero(v.x) || is_neg_zero(v.y);
  return low_pair || is_neg_zero(v.z) || is_neg_zero(v.w);
}
// Builds a float4 whose four lanes all carry the bit pattern 0x80000000
// (-0.0f); used as the fill value when resetting communication buffers.
__device__ __forceinline__ float4 get_neg_zero() {
  float4 vec;
  vec.x = vec.y = vec.z = vec.w = __uint_as_float(0x80000000u);
  return vec;
}
// Replaces the sum of squares in `v` by rsqrt(v / Dim + eps), in place, and
// also returns the new value. Dim is a compile-time element count.
template <int Dim>
__device__ __forceinline__ float rms_rsqrt(float& v, float eps) {
  constexpr float kReciprocalDim = 1.0F / static_cast<float>(Dim);
  const float inv_rms = rsqrtf((v * kReciprocalDim) + eps);
  v = inv_rms;
  return v;
}
// Lane-wise float4 variant: each of x/y/z/w holds an independent sum of
// squares and is replaced in place by rsqrt(lane / Dim + eps). The updated
// vector is also returned by value.
template <int Dim>
__device__ __forceinline__ float4 rms_rsqrt(float4& v, float eps) {
  constexpr float kReciprocalDim = 1.0F / static_cast<float>(Dim);
  const float inv_x = rsqrtf((v.x * kReciprocalDim) + eps);
  const float inv_y = rsqrtf((v.y * kReciprocalDim) + eps);
  const float inv_z = rsqrtf((v.z * kReciprocalDim) + eps);
  const float inv_w = rsqrtf((v.w * kReciprocalDim) + eps);
  v.x = inv_x;
  v.y = inv_y;
  v.z = inv_z;
  v.w = inv_w;
  return v;
}
// Loads the four floats at `addr` with one PTX `ld.volatile.global.v4.f32`
// instruction and returns them as a float4. The volatile qualifier makes each
// call issue a real memory load, which is what the polling loops in the
// kernels below rely on when they wait for the -0.0f sentinel to be replaced.
__device__ __forceinline__ float4 ld_global_volatile(float4* addr) {
float4 val;
asm volatile("ld.volatile.global.v4.f32 {%0, %1, %2, %3}, [%4];"
: "=f"(val.x), "=f"(val.y), "=f"(val.z), "=f"(val.w)
: "l"(addr));
return val;
}
// Scalar counterpart: loads one float from `addr` with PTX
// `ld.volatile.global.f32` so that repeated calls in a polling loop re-read
// memory each time.
__device__ __forceinline__ float ld_global_volatile(float* addr) {
float val;
asm volatile("ld.volatile.global.f32 %0, [%1];" : "=f"(val) : "l"(addr));
return val;
}
// Used by the scalar (non-float4) kernel only.
// Butterfly (xor-shuffle) sum across the 32 lanes of a warp for each of the
// NUM entries of `val`; on return every lane holds the warp total in val[i].
// The result is delivered through `val`; the return value is always zero.
template <typename T, int NUM>
__inline__ __device__ T warpReduceSumV2(T* val) {
#pragma unroll
  for (int slot = 0; slot < NUM; ++slot) {
    T running = val[slot];
#pragma unroll
    for (int lane_mask = 16; lane_mask >= 1; lane_mask /= 2) {
      running += __shfl_xor_sync(FINAL_MASK, running, lane_mask, 32);
    }
    val[slot] = running;
  }
  return static_cast<T>(0.0f);
}
// Block-wide sum of each of the NUM entries of `val`.
// Stage 1: every warp reduces its own values and lane 0 parks the warp total
// in shared memory. Stage 2: after a block barrier, the first
// ceil(blockDim.x / 32) threads pick up one warp total each (all other
// threads contribute zero) and a second warp reduction combines them.
// The result is delivered through `val`; the return value is always zero.
template <typename T, int NUM>
__inline__ __device__ T blockReduceSumV2(T* val) {
  // One column per warp; the 33rd column is padding.
  static __shared__ T warp_totals[NUM][33];

  const int lane_id = threadIdx.x & 0x1f;
  const int warp_id = threadIdx.x >> 5;

  warpReduceSumV2<T, NUM>(val);

  if (lane_id == 0) {
#pragma unroll
    for (int slot = 0; slot < NUM; ++slot) {
      warp_totals[slot][warp_id] = val[slot];
    }
  }
  __syncthreads();

  // Fractional division so that a trailing partial warp is still counted.
  const bool owns_warp_total = threadIdx.x < (blockDim.x / 32.f);
#pragma unroll
  for (int slot = 0; slot < NUM; ++slot) {
    if (owns_warp_total) {
      val[slot] = warp_totals[slot][lane_id];
    } else {
      val[slot] = (T)(0.0f);
    }
  }

  warpReduceSumV2<T, NUM>(val);
  return (T)0.0f;
}
// For the float4 kernel.
// Xor-shuffle sum of an ArraySize-element array across a group of
// kNumThreads adjacent lanes (shuffle distances kNumThreads/2, .., 1).
// `active_mask` is forwarded to __shfl_xor_sync as the participating-lane
// mask. Each entry of `value_ptr` is updated in place.
template <uint32_t kNumThreads, typename T, int ArraySize = 4>
__device__ __forceinline__ void local_warp_reduce_sum_array(
    T* value_ptr, uint32_t active_mask = 0xffffffffu) {
  static_assert(kNumThreads >= 1 &&
                kNumThreads <= MINIMAX_REDUCE_RMS_WARP_SIZE);
#pragma unroll
  for (int elem = 0; elem < ArraySize; ++elem) {
    T partial = value_ptr[elem];
#pragma unroll
    for (int distance = kNumThreads / 2; distance > 0; distance >>= 1) {
      partial += __shfl_xor_sync(active_mask, partial, distance,
                                 MINIMAX_REDUCE_RMS_WARP_SIZE);
    }
    value_ptr[elem] = partial;
  }
}
// Smallest power of two that is >= val; returns 1 for val <= 1.
constexpr int next_pow2(int val) {
  int power = 1;
  for (; power < val; power *= 2) {
  }
  return power;
}
// ---------------------------------------------------------------------------
// Derives the per-thread indices used by the scalar kernel's token loop.
// All "access" quantities are counted in vectors of kElemsPerAccess<DType>
// elements.
template <typename DType>
class IndexHelper {
 public:
  __device__ __forceinline__ IndexHelper(MiniMaxReduceRMSParams const& params) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
    // SM90+: one token per thread-block cluster.
    namespace cg = cooperative_groups;
    cg::grid_group whole_grid = cg::this_grid();
    cg::cluster_group my_cluster = cg::this_cluster();
    token_stride = whole_grid.num_clusters();
    token_id = whole_grid.cluster_rank();
    access_id_in_token = my_cluster.thread_rank();
#else
    // Older architectures: one token per thread block.
    token_stride = gridDim.x;
    token_id = blockIdx.x;
    access_id_in_token = threadIdx.x;
#endif
    tot_access = params.size_q / kElemsPerAccess<DType>;
    access_stride = token_stride * params.hidden_dim / kElemsPerAccess<DType>;
    access_id = token_id * params.hidden_dim / kElemsPerAccess<DType> +
                access_id_in_token;
  }

  int token_id;            // first token handled by this block/cluster
  int access_id_in_token;  // this thread's vector index inside a token row
  int token_stride;        // tokens skipped per loop iteration
  int access_id;           // first global vector index for this thread
  int access_stride;       // vectors skipped per loop iteration
  int tot_access;          // total number of vectors in the input
};
/**
 * Fused RMS norm for the MiniMax attention module where the sum of squares of
 * each token row is combined across NRanks ranks through the Lamport
 * workspace. Each rank holds hidden_dim (= full dim / NRanks) elements per
 * token.
 *
 * Per iteration of the token loop a block:
 *   1. loads one 16-byte vector per thread and block-reduces the sum of
 *      squares of its elements (local reduce(input^2, dim=-1));
 *   2. thread 0 writes that partial sum to slot [rank * tot_tokens + token]
 *      of every rank's data buffer;
 *   3. every thread polls this rank's data buffer until none of the NRanks
 *      slots of the token still holds the -0.0f sentinel, then adds them up;
 *   4. scales the loaded values by
 *      rsqrt(sum / hidden_dim / NRanks + rms_eps) and by rms_gamma, and
 *      stores the result to rms_norm_out.
 * After the loop the grid refills the clear region with -0.0f and
 * LamportComm::update advances the workspace flag.
 *
 * Launch layout (see minimax_reduce_rms_kernel_launcher): blockDim.x is
 * hidden_dim / kElemsPerAccess<DType>, one token per block per iteration, at
 * most 1024 threads per block. The original authors note that the largest
 * hidden_dim is 6144, so with tp_size >= 2 a single block covers a token and
 * the cluster size can be assumed to be 1.
 */
template <typename DType, int NRanks>
__global__ void __launch_bounds__(1024)
    minimax_reduce_rms_kernel_lamport(MiniMaxReduceRMSParams params) {
  IndexHelper<DType> index_helper(params);
  int token_id = index_helper.token_id;
  int access_id_in_token = index_helper.access_id_in_token;
  int token_stride = index_helper.token_stride;
  int access_id = index_helper.access_id;
  int access_stride = index_helper.access_stride;
  int tot_access = index_helper.tot_access;
  int tot_tokens = params.size_q / params.hidden_dim;
  float4 clear_vec = get_neg_zero();
  LamportComm<NRanks> comm(params.workspace, params.rank);
  int clear_access = comm.clear_size / kElemsPerAccess<DType>;
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  // Programmatic dependent launch: wait for the preceding kernel's results.
  asm volatile("griddepcontrol.wait;");
#endif
  for (int idx = access_id; idx < tot_access;
       idx += access_stride, token_id += token_stride) {
    alignas(16) DType vals[kElemsPerAccess<DType>];
    float sum_variance = 0.F;
    *reinterpret_cast<float4*>(vals) =
        reinterpret_cast<float4*>(params.allreduce_in)[idx];
#pragma unroll
    for (int i = 0; i < kElemsPerAccess<DType>; ++i) {
      sum_variance += static_cast<float>(vals[i]) * static_cast<float>(vals[i]);
    }
    blockReduceSumV2<float, 1>(&sum_variance);
    // -0.0f is reserved as the "not written yet" sentinel, never publish it.
    if (is_neg_zero(sum_variance)) {
      sum_variance = 0.F;
    }
    if (threadIdx.x == 0) {
      // Push this rank's partial sum into every rank's buffer.
      for (int r = 0; r < NRanks; ++r) {
        reinterpret_cast<float*>(
            comm.data_bufs[r])[(params.rank * tot_tokens) + token_id] =
            (sum_variance);
      }
    }
    // Poll the local buffer until all ranks' partial sums have arrived.
    bool done = false;
    float vars_all_ranks[NRanks];
    while (!done) {
      done = true;
#pragma unroll
      for (int r = 0; r < NRanks; ++r) {
        vars_all_ranks[r] = ld_global_volatile(&reinterpret_cast<float*>(
            comm.data_bufs[params.rank])[(r * tot_tokens) + token_id]);
        done &= !is_neg_zero(vars_all_ranks[r]);
      }
    }
    sum_variance = 0.F;
#pragma unroll
    for (int r = 0; r < NRanks; ++r) {
      sum_variance += vars_all_ranks[r];
    }
    // 16-byte aligned like `vals`: the array is written through a vector-type
    // pointer, which requires the storage to be aligned to the vector size.
    alignas(16) DType norm_weight[kElemsPerAccess<DType>];
    *reinterpret_cast<typename ElemsPerAccess<DType>::vec_type*>(norm_weight) =
        reinterpret_cast<typename ElemsPerAccess<DType>::vec_type*>(
            params.rms_gamma)[access_id_in_token];
#pragma unroll
    for (int i = 0; i < kElemsPerAccess<DType>; ++i) {
      vals[i] = static_cast<DType>(
          static_cast<float>(vals[i]) *
          rsqrtf(
              (sum_variance / static_cast<float>(params.hidden_dim) / NRanks) +
              params.rms_eps) *
          static_cast<float>(norm_weight[i]));
    }
    reinterpret_cast<float4*>(params.rms_norm_out)[idx] =
        *reinterpret_cast<float4*>(vals);
  }
  // Reset the region that a later call will use, then rotate the flag.
  for (int idx = access_id; idx < clear_access; idx += access_stride) {
    reinterpret_cast<float4*>(comm.clear_buf)[idx] = clear_vec;
  }
  comm.update(params.size_q * NRanks);
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  asm volatile("griddepcontrol.launch_dependents;");
#endif
}
/**
 * Float4 variant: normalizes Q and K together and processes 4 token rows per
 * loop iteration, so that the per-row sums of squares of a group are
 * exchanged between ranks as a single float4. The accumulated sums are always
 * float, whatever DType is (half, bf16, float).
 *
 * Thread layout inside a block: the first NumWarpQ warps cover one Q row
 * (one 16-byte access per thread), the following NumWarpK warps cover one K
 * row. Threads past the end of a row inside the last warp of a section are
 * idle for loads/stores but still take part in the reductions with zeros.
 * Warp 0 is the "Q leader" and the first K warp is the "K leader"; their
 * first NRanks lanes push this rank's sums to every rank's buffer and pull
 * the other ranks' sums from the local buffer.
 *
 * Communication buffer layout (float4 units): slot
 * [rank * tot_groups * 2 + 2 * g] holds the Q sums of group g and slot
 * [rank * tot_groups * 2 + 2 * g + 1] holds its K sums.
 *
 * When tot_tokens % 4 != 0 the last group is padded with zero rows; padded
 * rows are never written to rms_norm_out / rms_norm_out_k.
 *
 * OriginQDim / OriginKDim are the full (all-rank) row widths; each rank holds
 * OriginQDim / NRanks and OriginKDim / NRanks elements per token.
 */
template <typename DType, int NRanks, int OriginQDim, int OriginKDim>
__global__ void __launch_bounds__(1024)
    minimax_reduce_qk_rms_kernel_lamport_float4(MiniMaxReduceRMSParams params) {
  // Compile-time per-rank dimensions
  constexpr int RankQDim = OriginQDim / NRanks;
  constexpr int RankKDim = OriginKDim / NRanks;
  // Threads needed to cover one row of Q / K with float4 accesses
  constexpr int ThreadsPerRowQ = RankQDim / kElemsPerAccess<DType>;
  constexpr int ThreadsPerRowK = RankKDim / kElemsPerAccess<DType>;
  // Number of warps dedicated to Q / K
  constexpr int NumWarpQ = (ThreadsPerRowQ + MINIMAX_REDUCE_RMS_WARP_SIZE - 1) /
                           MINIMAX_REDUCE_RMS_WARP_SIZE;
  constexpr int NumWarpK = (ThreadsPerRowK + MINIMAX_REDUCE_RMS_WARP_SIZE - 1) /
                           MINIMAX_REDUCE_RMS_WARP_SIZE;
  int tot_tokens = params.size_q / RankQDim;
  int tot_groups = (tot_tokens + 3) / 4;  // ceiling; last group may be partial
  // Memory strides for strided qkv tensors (elements -> float4-access units)
  int access_stride_q = (params.stride_q > 0 ? params.stride_q : RankQDim) /
                        kElemsPerAccess<DType>;
  int access_stride_k = (params.stride_k > 0 ? params.stride_k : RankKDim) /
                        kElemsPerAccess<DType>;
  // Output strides: default to contiguous (hidden_dim / hidden_dim_k)
  int access_stride_q_out =
      (params.stride_q_out > 0 ? params.stride_q_out : params.hidden_dim) /
      kElemsPerAccess<DType>;
  int access_stride_k_out =
      (params.stride_k_out > 0 ? params.stride_k_out : params.hidden_dim_k) /
      kElemsPerAccess<DType>;
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  namespace cg = cooperative_groups;
  cg::cluster_group cluster = cg::this_cluster();
  cg::grid_group grid = cg::this_grid();
  int group_id = grid.cluster_rank();
  int access_id_in_token = cluster.thread_rank();
  int group_stride = grid.num_clusters();
#else
  int group_id = blockIdx.x;
  int access_id_in_token = threadIdx.x;
  int group_stride = gridDim.x;
#endif
  // Section membership: Q warps come first, K warps follow.
  bool is_q = (access_id_in_token < NumWarpQ * MINIMAX_REDUCE_RMS_WARP_SIZE);
  int k_thread_idx =
      access_id_in_token - (NumWarpQ * MINIMAX_REDUCE_RMS_WARP_SIZE);
  bool is_valid_q = (access_id_in_token < ThreadsPerRowQ);
  bool is_valid_k = (k_thread_idx >= 0 && k_thread_idx < ThreadsPerRowK);
  float4 clear_vec = get_neg_zero();
  // Shared memory for two-level block reduction and scale broadcast
  __shared__ float block_reduce_sum[4][MINIMAX_REDUCE_RMS_WARP_SIZE + 1];
  __shared__ float global_scale_q[4];
  __shared__ float global_scale_k[4];
  LamportComm<NRanks> comm(params.workspace, params.rank);
  // 16-byte aligned like `vals`: the array is written through a vector-type
  // pointer, which requires the storage to be aligned to the vector size.
  // Zero-initialized so that idle threads multiply by zero weights.
  alignas(16) DType norm_weight[kElemsPerAccess<DType>]{};
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  // Programmatic dependent launch: wait for the preceding kernel's results.
  asm volatile("griddepcontrol.wait;");
#endif
  // The norm weights do not depend on the token, load them once.
  if (is_q) {
    if (is_valid_q) {
      *reinterpret_cast<typename ElemsPerAccess<DType>::vec_type*>(
          norm_weight) =
          reinterpret_cast<typename ElemsPerAccess<DType>::vec_type const*>(
              params.rms_gamma)[access_id_in_token];
    }
  } else {
    if (is_valid_k) {
      *reinterpret_cast<typename ElemsPerAccess<DType>::vec_type*>(
          norm_weight) =
          reinterpret_cast<typename ElemsPerAccess<DType>::vec_type const*>(
              params.rms_gamma_k)[k_thread_idx];
    }
  }
  // Main loop: process one group of 4 tokens per iteration.
  for (int g = group_id; g < tot_groups; g += group_stride) {
    alignas(16) DType vals[4][kElemsPerAccess<DType>]{};
    float warp_sum_variance[4]{0.F, 0.F, 0.F, 0.F};
    // Step 1: per-thread sum of squares for each of the 4 rows.
    if (is_q) {
#pragma unroll
      for (int row = 0; row < 4; ++row) {
        int token_r = g * 4 + row;
        if (token_r >= tot_tokens || !is_valid_q) {
          continue;
        }
        int idx_r = token_r * access_stride_q + access_id_in_token;
        *reinterpret_cast<float4*>(&vals[row][0]) =
            reinterpret_cast<float4 const*>(params.allreduce_in)[idx_r];
#pragma unroll
        for (int i = 0; i < kElemsPerAccess<DType>; ++i) {
          float x = static_cast<float>(vals[row][i]);
          warp_sum_variance[row] += x * x;
        }
      }
    } else {
#pragma unroll
      for (int row = 0; row < 4; ++row) {
        int token_r = g * 4 + row;
        if (token_r >= tot_tokens || !is_valid_k) {
          continue;
        }
        int idx_r = token_r * access_stride_k + k_thread_idx;
        *reinterpret_cast<float4*>(&vals[row][0]) =
            reinterpret_cast<float4 const*>(params.allreduce_in_k)[idx_r];
#pragma unroll
        for (int i = 0; i < kElemsPerAccess<DType>; ++i) {
          float x = static_cast<float>(vals[row][i]);
          warp_sum_variance[row] += x * x;
        }
      }
    }
    // Step 2: reduce inside each warp.
    local_warp_reduce_sum_array<MINIMAX_REDUCE_RMS_WARP_SIZE, float, 4>(
        warp_sum_variance);
    // Warp lane 0 writes its warp's partial sum to shared memory
    int lane = threadIdx.x & (MINIMAX_REDUCE_RMS_WARP_SIZE - 1);
    if (lane == 0) {
#pragma unroll
      for (int t = 0; t < 4; ++t) {
        block_reduce_sum[t][threadIdx.x / MINIMAX_REDUCE_RMS_WARP_SIZE] =
            warp_sum_variance[t];
      }
    }
    __syncthreads();
    // Step 3: leader warps combine warp partials and exchange across ranks.
    int tid = threadIdx.x;
    if (tid < MINIMAX_REDUCE_RMS_WARP_SIZE) {
      // --- Q leader warp ---
      constexpr int kNumWarpQPow2 =
          (next_pow2(NumWarpQ) > NRanks) ? next_pow2(NumWarpQ) : NRanks;
      float local_sum[4];
#pragma unroll
      for (int t = 0; t < 4; ++t) {
        local_sum[t] = (tid < NumWarpQ) ? block_reduce_sum[t][tid] : 0.F;
      }
      // After this, all kNumWarpQPow2 lanes (including tid 0..NRanks-1) have
      // the total Q sum-of-squares for all 4 tokens.
      local_warp_reduce_sum_array<kNumWarpQPow2, float, 4>(local_sum);
      if (tid < NRanks) {
        // -0.0f is the "not written yet" sentinel, never publish it.
#pragma unroll
        for (int t = 0; t < 4; ++t) {
          if (is_neg_zero(local_sum[t])) {
            local_sum[t] = 0.F;
          }
        }
        // Parallel push: thread tid writes this rank's Q sum to rank tid's buf
        reinterpret_cast<float4*>(
            comm.data_bufs[tid])[(params.rank * tot_groups * 2) + (2 * g)] =
            *reinterpret_cast<float4*>(local_sum);
        // Parallel pull: thread tid reads rank tid's contribution from
        // this rank's (params.rank's) buffer
        bool done = false;
        float4 var_all_ranks;
        while (!done) {
          done = true;
          var_all_ranks = ld_global_volatile(&reinterpret_cast<float4*>(
              comm.data_bufs[params.rank])[(tid * tot_groups * 2) + (2 * g)]);
          done &= !is_neg_zero(var_all_ranks);
        }
        // Warp-level allreduce: each of the NRanks threads holds one rank's
        // partial sum; after this all NRanks threads have the global total.
        constexpr uint32_t kQActiveMask = (1u << NRanks) - 1u;
        local_warp_reduce_sum_array<NRanks, float, 4>(
            reinterpret_cast<float*>(&var_all_ranks), kQActiveMask);
        // Thread 0 computes rsqrt with compile-time Dim and writes to smem
        if (tid == 0) {
          *reinterpret_cast<float4*>(global_scale_q) =
              rms_rsqrt<OriginQDim>(var_all_ranks, params.rms_eps);
        }
      }
    } else if (tid >= MINIMAX_REDUCE_RMS_WARP_SIZE * NumWarpQ &&
               tid < MINIMAX_REDUCE_RMS_WARP_SIZE * (NumWarpQ + 1)) {
      // --- K leader warp ---
      constexpr int kNumWarpKPow2 =
          (next_pow2(NumWarpK) > NRanks) ? next_pow2(NumWarpK) : NRanks;
      float local_sum[4];
#pragma unroll
      for (int t = 0; t < 4; ++t) {
        local_sum[t] = (k_thread_idx < NumWarpK)
                           ? block_reduce_sum[t][NumWarpQ + k_thread_idx]
                           : 0.F;
      }
      local_warp_reduce_sum_array<kNumWarpKPow2, float, 4>(local_sum);
      if (k_thread_idx < NRanks) {
#pragma unroll
        for (int t = 0; t < 4; ++t) {
          if (is_neg_zero(local_sum[t])) {
            local_sum[t] = 0.F;
          }
        }
        // Push this rank's K sums to rank k_thread_idx's buffer.
        reinterpret_cast<float4*>(
            comm.data_bufs[k_thread_idx])[(params.rank * tot_groups * 2) +
                                          (2 * g + 1)] =
            *reinterpret_cast<float4*>(local_sum);
        // Pull rank k_thread_idx's K sums from the local buffer.
        bool done = false;
        float4 var_all_ranks;
        while (!done) {
          done = true;
          var_all_ranks = ld_global_volatile(&reinterpret_cast<float4*>(
              comm.data_bufs[params.rank])[(k_thread_idx * tot_groups * 2) +
                                           (2 * g + 1)]);
          done &= !is_neg_zero(var_all_ranks);
        }
        constexpr uint32_t kKActiveMask = (1u << NRanks) - 1u;
        local_warp_reduce_sum_array<NRanks, float, 4>(
            reinterpret_cast<float*>(&var_all_ranks), kKActiveMask);
        if (k_thread_idx == 0) {
          *reinterpret_cast<float4*>(global_scale_k) =
              rms_rsqrt<OriginKDim>(var_all_ranks, params.rms_eps);
        }
      }
    }
    // Make the scales written by the leader warps visible to the whole block.
    __syncthreads();
    // Step 4: apply scale and weight, store the valid rows.
    if (is_q) {
#pragma unroll
      for (int t = 0; t < 4; ++t) {
        warp_sum_variance[t] = global_scale_q[t];
      }
#pragma unroll
      for (int r = 0; r < 4; ++r) {
#pragma unroll
        for (int i = 0; i < kElemsPerAccess<DType>; ++i) {
          vals[r][i] = static_cast<DType>(static_cast<float>(vals[r][i]) *
                                          warp_sum_variance[r] *
                                          static_cast<float>(norm_weight[i]));
        }
        int token_r = g * 4 + r;
        if (token_r >= tot_tokens || !is_valid_q) {
          continue;
        }
        int idx_out = token_r * access_stride_q_out + access_id_in_token;
        reinterpret_cast<float4*>(params.rms_norm_out)[idx_out] =
            *reinterpret_cast<float4*>(&vals[r][0]);
      }
    } else {
#pragma unroll
      for (int t = 0; t < 4; ++t) {
        warp_sum_variance[t] = global_scale_k[t];
      }
#pragma unroll
      for (int r = 0; r < 4; ++r) {
#pragma unroll
        for (int i = 0; i < kElemsPerAccess<DType>; ++i) {
          vals[r][i] = static_cast<DType>(static_cast<float>(vals[r][i]) *
                                          warp_sum_variance[r] *
                                          static_cast<float>(norm_weight[i]));
        }
        int token_r = g * 4 + r;
        if (token_r >= tot_tokens || !is_valid_k) {
          continue;
        }
        int idx_out = token_r * access_stride_k_out + k_thread_idx;
        reinterpret_cast<float4*>(params.rms_norm_out_k)[idx_out] =
            *reinterpret_cast<float4*>(&vals[r][0]);
      }
    }
  }  // end group loop
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
  asm volatile("griddepcontrol.launch_dependents;");
#endif
  // Reset the region that a later call will use, then rotate the flag.
  int clear_access = static_cast<int>(comm.clear_size / kElemsPerAccess<DType>);
  int clear_stride = group_stride * blockDim.x;
  for (int idx = group_id * blockDim.x + threadIdx.x; idx < clear_access;
       idx += clear_stride) {
    reinterpret_cast<float4*>(comm.clear_buf)[idx] = clear_vec;
  }
  comm.update(static_cast<int64_t>(2) * tot_groups * kElemsPerAccess<DType> *
              NRanks);
}
// Returns the number of streaming multiprocessors of the current CUDA device.
// The value is queried once (for whichever device is current on the first
// call) and cached for the lifetime of the process.
int get_sm_count() {
  static int sm_count = 0;
  if (sm_count == 0) {
    int device_id = -1;
    CUDA_CHECK(cudaGetDevice(&device_id));
    cudaDeviceProp device_prop;
    // Check the result: on failure device_prop is not filled in and a garbage
    // SM count would otherwise be cached and used to size every launch.
    CUDA_CHECK(cudaGetDeviceProperties(&device_prop, device_id));
    sm_count = device_prop.multiProcessorCount;
  }
  return sm_count;
}
// Returns the compute capability of the current device as major * 10 + minor.
// A device reporting 121 is returned as 120 unless queryRealSmArch is true.
inline int getSMVersion(bool queryRealSmArch = false) {
  int current_device{-1};
  CUDA_CHECK(cudaGetDevice(&current_device));

  int cc_major = 0;
  CUDA_CHECK(cudaDeviceGetAttribute(
      &cc_major, cudaDevAttrComputeCapabilityMajor, current_device));
  int cc_minor = 0;
  CUDA_CHECK(cudaDeviceGetAttribute(
      &cc_minor, cudaDevAttrComputeCapabilityMinor, current_device));

  const int arch = cc_major * 10 + cc_minor;
  const bool fold_to_120 = (arch == 121) && !queryRealSmArch;
  return fold_to_120 ? 120 : arch;
}
// Asks the occupancy API how many blocks of `kernel` with `block_size`
// threads and `dynamic_smem` bytes of dynamic shared memory fit on one SM.
// The result is clamped to at least 1.
template <typename KernelFunc>
int get_max_active_blocks(KernelFunc kernel, int block_size,
                          int dynamic_smem = 0) {
  int blocks_per_sm = 0;
  CUDA_CHECK(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
      &blocks_per_sm, kernel, block_size, dynamic_smem));
  if (blocks_per_sm < 1) {
    blocks_per_sm = 1;
  }
  return blocks_per_sm;
}
// Launches minimax_reduce_rms_kernel_lamport for a contiguous
// [token_num, hidden_dim] input.
//
// Launch shape: one thread per 16-byte access of a token row
// (hidden_dim / kElemsPerAccess<DType> threads per block) and one block per
// token, capped at the number of blocks that can be resident on the device
// at once. On SM90+ the launch also requests programmatic stream
// serialization and a cluster dimension of 1; on older devices no launch
// attributes are passed.
//
// Returns without launching when there are no tokens, matching the float4
// launcher; a zero-sized grid would otherwise make the launch fail.
template <typename DType, int NRanks>
void minimax_reduce_rms_kernel_launcher(MiniMaxReduceRMSParams const& params) {
  // Guard the divisions below against a zero-width row.
  TORCH_CHECK(params.hidden_dim > 0,
              "minimax_reduce_rms_op: hidden_dim must be positive");
  int token_num = params.size_q / params.hidden_dim;
  if (token_num == 0) {
    return;
  }
  static int SM = getSMVersion();
  int sm_count = get_sm_count();
  int cluster_size = 1;
  int cluster_num = token_num;
  int threads_per_token = params.hidden_dim / kElemsPerAccess<DType>;
  int block_size = threads_per_token;
  int max_blocks_per_sm = get_max_active_blocks(
      minimax_reduce_rms_kernel_lamport<DType, NRanks>, block_size);
  int max_grid = max_blocks_per_sm * sm_count;
  // Round down to a whole number of clusters.
  int grid_size =
      (std::min(max_grid, cluster_num * cluster_size) / cluster_size) *
      cluster_size;
  cudaLaunchConfig_t cfg;
  cfg.gridDim = grid_size;
  cfg.blockDim = block_size;
  cfg.dynamicSmemBytes = 0;
  cfg.stream = params.stream;
  cudaLaunchAttribute attribute[2];
  attribute[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
  attribute[0].val.programmaticStreamSerializationAllowed = 1;
  attribute[1].id = cudaLaunchAttributeClusterDimension;
  attribute[1].val.clusterDim.x = cluster_size;
  attribute[1].val.clusterDim.y = 1;
  attribute[1].val.clusterDim.z = 1;
  cfg.attrs = attribute;
  cfg.numAttrs = SM >= 90 ? 2 : 0;
  CUDA_CHECK(cudaLaunchKernelEx(
      &cfg, minimax_reduce_rms_kernel_lamport<DType, NRanks>, params));
}
// Validates the Q/K parameters and launches
// minimax_reduce_qk_rms_kernel_lamport_float4.
//
// Launch shape: each block holds the threads for one Q row followed by the
// threads for one K row, each section rounded up to whole warps; one block
// per group of 4 tokens, capped at the number of blocks that can be resident
// on the device at once. Returns without launching when there are no tokens.
template <typename DType, int NRanks, int OriginQDim, int OriginKDim>
void minimax_reduce_rms_kernel_launcher_float4(
    MiniMaxReduceRMSParams const& params) {
  // --- Q-side preconditions ---
  TORCH_CHECK(params.size_q % params.hidden_dim == 0);
  TORCH_CHECK(params.hidden_dim % kElemsPerAccess<DType> == 0);
  if (params.stride_q > 0) {
    TORCH_CHECK(params.stride_q % kElemsPerAccess<DType> == 0);
  }
  // --- K-side preconditions ---
  TORCH_CHECK(params.allreduce_in_k != nullptr,
              "float4 QK kernel requires K input");
  TORCH_CHECK(params.hidden_dim >= params.hidden_dim_k);
  TORCH_CHECK(params.size_k % params.hidden_dim_k == 0);
  TORCH_CHECK(params.hidden_dim_k % kElemsPerAccess<DType> == 0);
  TORCH_CHECK(params.size_q / params.hidden_dim ==
              params.size_k / params.hidden_dim_k);
  if (params.stride_k > 0) {
    TORCH_CHECK(params.stride_k % kElemsPerAccess<DType> == 0);
  }

  const int num_tokens = params.size_q / params.hidden_dim;
  const int num_groups = (num_tokens + 3) / 4;
  if (num_groups == 0) {
    return;
  }

  static int SM = getSMVersion();
  const int num_sms = get_sm_count();
  const int cluster_size = 1;
  const int cluster_num = num_groups;

  // Round each section up to a warp boundary
  auto round_up_to_multiple = [](int value, int multiple) {
    return (value + multiple - 1) / multiple * multiple;
  };
  const int q_accesses = params.hidden_dim / kElemsPerAccess<DType>;
  const int k_accesses = params.hidden_dim_k / kElemsPerAccess<DType>;
  const int q_threads =
      round_up_to_multiple(q_accesses, MINIMAX_REDUCE_RMS_WARP_SIZE);
  const int k_threads =
      round_up_to_multiple(k_accesses, MINIMAX_REDUCE_RMS_WARP_SIZE);
  const int block_size = q_threads + k_threads;

  auto kernel_fn =
      minimax_reduce_qk_rms_kernel_lamport_float4<DType, NRanks, OriginQDim,
                                                  OriginKDim>;
  const int resident_blocks =
      get_max_active_blocks(kernel_fn, block_size) * num_sms;
  const int wanted_blocks = cluster_num * cluster_size;
  const int grid_size =
      (std::min(resident_blocks, wanted_blocks) / cluster_size) * cluster_size;

  cudaLaunchAttribute launch_attrs[2];
  launch_attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
  launch_attrs[0].val.programmaticStreamSerializationAllowed = 1;
  launch_attrs[1].id = cudaLaunchAttributeClusterDimension;
  launch_attrs[1].val.clusterDim.x = cluster_size;
  launch_attrs[1].val.clusterDim.y = 1;
  launch_attrs[1].val.clusterDim.z = 1;

  cudaLaunchConfig_t launch_cfg;
  launch_cfg.gridDim = grid_size;
  launch_cfg.blockDim = block_size;
  launch_cfg.dynamicSmemBytes = 0;
  launch_cfg.stream = params.stream;
  launch_cfg.attrs = launch_attrs;
  // Launch attributes are only passed on SM90 and newer.
  launch_cfg.numAttrs = SM >= 90 ? 2 : 0;

  CUDA_CHECK(cudaLaunchKernelEx(&launch_cfg, kernel_fn, params));
}
// Selects the kernel instantiation for params.dtype (Half, BFloat16, Float).
//
// - Q-only requests (allreduce_in_k == nullptr) use the scalar kernel.
// - Q+K requests use the float4 kernel, which is only instantiated for the
//   MiniMax M2 shape: full Q dim 6144 and full K dim 1024
//   (per-rank dim * nranks).
//
// A Q+K request with any other shape is rejected. The scalar kernel reads Q
// as a contiguous [tokens, hidden_dim] tensor and never touches the K
// fields, so falling back to it would read Q from the wrong addresses when
// stride_q != hidden_dim and leave the K output unwritten.
template <int NRanks>
void dispatch_dtype(MiniMaxReduceRMSParams const& params) {
  const bool has_k = (params.allreduce_in_k != nullptr);
  const bool use_float4 = has_k &&
                          (params.hidden_dim * params.nranks == 6144) &&
                          (params.hidden_dim_k * params.nranks == 1024);
  TORCH_CHECK(!has_k || use_float4,
              "minimax_reduce_rms_op: fused Q/K normalization only supports "
              "a full Q dim of 6144 and a full K dim of 1024, got ",
              params.hidden_dim * params.nranks, " and ",
              params.hidden_dim_k * params.nranks);
  if (params.dtype == at::ScalarType::Half) {
    if (use_float4) {
      minimax_reduce_rms_kernel_launcher_float4<half, NRanks, 6144, 1024>(
          params);
    } else {
      minimax_reduce_rms_kernel_launcher<half, NRanks>(params);
    }
  } else if (params.dtype == at::ScalarType::BFloat16) {
    if (use_float4) {
      minimax_reduce_rms_kernel_launcher_float4<__nv_bfloat16, NRanks, 6144,
                                                1024>(params);
    } else {
      minimax_reduce_rms_kernel_launcher<__nv_bfloat16, NRanks>(params);
    }
  } else if (params.dtype == at::ScalarType::Float) {
    if (use_float4) {
      minimax_reduce_rms_kernel_launcher_float4<float, NRanks, 6144, 1024>(
          params);
    } else {
      minimax_reduce_rms_kernel_launcher<float, NRanks>(params);
    }
  } else {
    TORCH_CHECK(false, "Unsupported data type for minimax_reduce_rms_op");
  }
}
// Entry point of the fused reduce + RMS norm: maps the runtime rank count to
// the matching compile-time NRanks instantiation. Only 2, 4, 8 and 16 ranks
// are supported; any other value raises an error.
void minimax_reduce_rms_op(MiniMaxReduceRMSParams const& params) {
  switch (params.nranks) {
    case 2:
      dispatch_dtype<2>(params);
      break;
    case 4:
      dispatch_dtype<4>(params);
      break;
    case 8:
      dispatch_dtype<8>(params);
      break;
    case 16:
      dispatch_dtype<16>(params);
      break;
    default:
      TORCH_CHECK(false, "minimax_reduce_rms_op: unsupported ranks number!");
  }
}
} // namespace tensorrt_llm
} // namespace vllm
// Torch-facing wrapper for the Q-only fused reduce + RMS norm.
//
//   input       : contiguous CUDA tensor; the last dimension is this rank's
//                 slice of the hidden dimension
//   norm_weight : RMS norm weights for this rank's slice
//   workspace   : tensor holding the table of workspace pointers consumed by
//                 LamportComm
//   rank/nranks : this rank's index and the number of ranks
//   eps         : epsilon added to the mean of squares before rsqrt
//
// Returns a new tensor shaped like `input` holding the normalized values.
torch::Tensor minimax_allreduce_rms(torch::Tensor const& input,
                                    torch::Tensor const& norm_weight,
                                    torch::Tensor workspace, int64_t const rank,
                                    int64_t const nranks, double const eps) {
  TORCH_CHECK(input.is_cuda(),
              "minimax_allreduce_rms: input must be a CUDA tensor");
  // The kernel walks the input and the output as flat row-major memory.
  TORCH_CHECK(input.is_contiguous(),
              "minimax_allreduce_rms: input must be contiguous");
  // rank indexes the per-rank workspace pointers inside the kernel.
  TORCH_CHECK(rank >= 0 && rank < nranks,
              "minimax_allreduce_rms: rank must be in [0, nranks)");
  // Device queries and the kernel launch act on the current device, so make
  // the input's device current for the duration of this call.
  const c10::cuda::CUDAGuard device_guard(input.device());

  auto allreduce_params = vllm::tensorrt_llm::MiniMaxReduceRMSParams();
  allreduce_params.nranks = static_cast<int>(nranks);
  allreduce_params.rank = static_cast<int>(rank);
  allreduce_params.dtype = input.scalar_type();
  allreduce_params.size_q = static_cast<int>(input.numel());
  allreduce_params.hidden_dim = static_cast<int>(input.size(-1));
  allreduce_params.stride_q = allreduce_params.hidden_dim;
  allreduce_params.workspace =
      reinterpret_cast<void**>(workspace.mutable_data_ptr());
  allreduce_params.allreduce_in = input.data_ptr();
  allreduce_params.rms_gamma = norm_weight.data_ptr();
  allreduce_params.rms_eps = static_cast<float>(eps);
  allreduce_params.stream = at::cuda::getCurrentCUDAStream(input.get_device());
  torch::Tensor rms_norm_out = torch::empty_like(input);
  allreduce_params.rms_norm_out = rms_norm_out.mutable_data_ptr();
  vllm::tensorrt_llm::minimax_reduce_rms_op(allreduce_params);
  return rms_norm_out;
}
// Torch-facing wrapper for the fused Q+K reduce + RMS norm.
//
//   qkv            : contiguous 2D CUDA tensor [num_tokens,
//                    q_size + 2 * kv_size]; Q occupies the first q_size
//                    columns of each row and K the next kv_size columns
//   norm_weight_q/k: RMS norm weights for this rank's Q / K slice
//   workspace      : tensor holding the table of workspace pointers consumed
//                    by LamportComm
//   q_size/kv_size : per-rank Q and K row widths
//   rank/nranks    : this rank's index and the number of ranks
//   eps            : epsilon added to the mean of squares before rsqrt
//
// Returns {q_out, k_out}: new contiguous tensors of shape
// [num_tokens, q_size] and [num_tokens, kv_size].
std::tuple<torch::Tensor, torch::Tensor> minimax_allreduce_rms_qk(
    torch::Tensor qkv, torch::Tensor const& norm_weight_q,
    torch::Tensor const& norm_weight_k, torch::Tensor workspace,
    int64_t const q_size, int64_t const kv_size, int64_t const rank,
    int64_t const nranks, double const eps) {
  TORCH_CHECK(qkv.is_cuda(),
              "minimax_allreduce_rms_qk: qkv must be a CUDA tensor");
  TORCH_CHECK(qkv.dim() == 2, "minimax_allreduce_rms_qk: qkv must be 2D");
  TORCH_CHECK(qkv.is_contiguous(),
              "minimax_allreduce_rms_qk: qkv must be contiguous");
  int64_t qkv_dim = qkv.size(-1);
  TORCH_CHECK(qkv_dim == q_size + 2 * kv_size,
              "minimax_allreduce_rms_qk: qkv last dim must equal "
              "q_size + 2 * kv_size");
  // rank indexes the per-rank workspace pointers inside the kernel, so a
  // negative value must be rejected as well.
  TORCH_CHECK(rank >= 0,
              "minimax_allreduce_rms_qk: rank must be non-negative");
  TORCH_CHECK(rank < nranks,
              "minimax_allreduce_rms_qk: rank must be less than nranks");
  // Device queries and the kernel launch act on the current device, so make
  // qkv's device current for the duration of this call.
  const c10::cuda::CUDAGuard device_guard(qkv.device());

  int64_t num_tokens = qkv.size(0);
  int elem_bytes = qkv.element_size();
  torch::Tensor q_out = torch::empty({num_tokens, q_size}, qkv.options());
  torch::Tensor k_out = torch::empty({num_tokens, kv_size}, qkv.options());
  auto params = vllm::tensorrt_llm::MiniMaxReduceRMSParams();
  params.nranks = static_cast<int>(nranks);
  params.rank = static_cast<int>(rank);
  params.dtype = qkv.scalar_type();
  params.size_q = static_cast<int>(num_tokens * q_size);
  params.hidden_dim = static_cast<int>(q_size);
  params.size_k = static_cast<int>(num_tokens * kv_size);
  params.hidden_dim_k = static_cast<int>(kv_size);
  // Q and K are column slices of qkv, so both advance by a full qkv row.
  params.stride_q = static_cast<int>(qkv_dim);
  params.stride_k = static_cast<int>(qkv_dim);
  params.stride_q_out = 0;  // q_out is contiguous; kernel uses hidden_dim
  params.stride_k_out = 0;  // k_out is contiguous; kernel uses hidden_dim_k
  params.workspace = reinterpret_cast<void**>(workspace.mutable_data_ptr());
  uint8_t* base = static_cast<uint8_t*>(qkv.data_ptr());
  params.allreduce_in = base;
  // K starts q_size elements into each row.
  params.allreduce_in_k = base + q_size * elem_bytes;
  params.rms_gamma = norm_weight_q.data_ptr();
  params.rms_gamma_k = norm_weight_k.data_ptr();
  params.rms_eps = static_cast<float>(eps);
  params.stream = at::cuda::getCurrentCUDAStream(qkv.get_device());
  params.rms_norm_out = q_out.mutable_data_ptr();
  params.rms_norm_out_k = k_out.mutable_data_ptr();
  vllm::tensorrt_llm::minimax_reduce_rms_op(params);
  return {q_out, k_out};
}
/*
* Copyright (c) 2026, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#include <torch/types.h>
namespace vllm {
namespace tensorrt_llm {
template <typename DType>
struct ElemsPerAccess;
template <>
struct ElemsPerAccess<half> {
static constexpr int value = 8;
using vec_type = float4;
};
template <>
struct ElemsPerAccess<nv_bfloat16> {
static constexpr int value = 8;
using vec_type = float4;
};
template <>
struct ElemsPerAccess<float> {
static constexpr int value = 4;
using vec_type = float4;
};
template <typename DType>
static constexpr int kElemsPerAccess = ElemsPerAccess<DType>::value;
struct MiniMaxReduceRMSParams {
int nranks{};
int rank{};
at::ScalarType dtype{at::ScalarType::Undefined};
int size_q{};
int hidden_dim{};
int size_k{};
int hidden_dim_k{};
int stride_q{}; // row stride for q input (elements); when > hidden_dim,
// q is part of a wider qkv tensor
int stride_k{}; // row stride for k input (elements); when > hidden_dim_k,
// k is part of a wider qkv tensor
int stride_q_out{}; // row stride for q output (elements); 0 = contiguous
int stride_k_out{}; // row stride for k output (elements); 0 = contiguous
void** workspace{};
void* allreduce_in{};
void* rms_norm_out{};
void* rms_gamma{};
void* allreduce_in_k{};
void* rms_norm_out_k{};
void* rms_gamma_k{};
float rms_eps{};
cudaStream_t stream{};
};
// Runs the reduce + RMSNorm operation described by `params`.
// Only the declaration lives in this header; the definition is elsewhere.
// NOTE(review): presumably enqueues its work on params.stream and writes
// params.rms_norm_out (and params.rms_norm_out_k when the k fields are set)
// - confirm against the implementation.
void minimax_reduce_rms_op(MiniMaxReduceRMSParams const& params);
} // namespace tensorrt_llm
} // namespace vllm
...@@ -392,3 +392,15 @@ int64_t qr_max_size(); ...@@ -392,3 +392,15 @@ int64_t qr_max_size();
void dsv3_fused_a_gemm(torch::Tensor& output, torch::Tensor const& mat_a, void dsv3_fused_a_gemm(torch::Tensor& output, torch::Tensor const& mat_a,
torch::Tensor const& mat_b); torch::Tensor const& mat_b);
#endif #endif
// MiniMax reduce + RMSNorm bindings; only declared when USE_ROCM is not
// defined.
#ifndef USE_ROCM
// Single-tensor variant: takes the input, its norm weight, a workspace
// tensor, the participant rank / count and the RMSNorm epsilon, and returns
// one tensor.
torch::Tensor minimax_allreduce_rms(torch::Tensor const& input,
torch::Tensor const& norm_weight,
torch::Tensor workspace, int64_t const rank,
int64_t const nranks, double const eps);
// Packed-qkv variant: takes the qkv tensor plus separate q and k norm
// weights and the q / kv slice widths, and returns a (q, k) tensor pair.
std::tuple<torch::Tensor, torch::Tensor> minimax_allreduce_rms_qk(
torch::Tensor qkv, torch::Tensor const& norm_weight_q,
torch::Tensor const& norm_weight_k, torch::Tensor workspace,
int64_t const q_size, int64_t const kv_size, int64_t const rank,
int64_t const nranks, double const eps);
#endif
\ No newline at end of file
...@@ -668,6 +668,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -668,6 +668,29 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"Tensor? b_qzeros, " "Tensor? b_qzeros, "
"SymInt n, SymInt group_size, SymInt sm_count, SymInt sm_version, SymInt " "SymInt n, SymInt group_size, SymInt sm_count, SymInt sm_version, SymInt "
"CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) -> Tensor"); "CUBLAS_M_THRESHOLD, bool has_zp, bool n32k16_reorder) -> Tensor");
ops.def(
"minimax_allreduce_rms("
"Tensor input,"
"Tensor norm_weight,"
"Tensor workspace,"
"int rank,"
"int nranks,"
"float eps) -> Tensor");
ops.impl("minimax_allreduce_rms", torch::kCUDA, &minimax_allreduce_rms);
ops.def(
"minimax_allreduce_rms_qk("
"Tensor qkv,"
"Tensor norm_weight_q,"
"Tensor norm_weight_k,"
"Tensor workspace,"
"int q_size,"
"int kv_size,"
"int rank,"
"int nranks,"
"float eps) -> (Tensor, Tensor)");
ops.impl("minimax_allreduce_rms_qk", torch::kCUDA, &minimax_allreduce_rms_qk);
// conditionally compiled so impl in source file // conditionally compiled so impl in source file
#endif #endif
} }
......
...@@ -649,7 +649,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -649,7 +649,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
else \ else \
BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \ BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_X86}"; \
fi; \ fi; \
uv pip install --system accelerate hf_transfer modelscope \ uv pip install --system accelerate modelscope \
"bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs,azure]${RUNAI_MODEL_STREAMER_VERSION}" "bitsandbytes>=${BITSANDBYTES_VERSION}" "timm${TIMM_VERSION}" "runai-model-streamer[s3,gcs,azure]${RUNAI_MODEL_STREAMER_VERSION}"
# ============================================================ # ============================================================
...@@ -772,9 +772,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -772,9 +772,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils uv pip install --system -e tests/vllm_test_utils
# enable fast downloads from hf (for testing) # enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ ENV HF_XET_HIGH_PERFORMANCE 1
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1 # increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
# Copy in the v1 package for testing (it isn't distributed yet) # Copy in the v1 package for testing (it isn't distributed yet)
COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 COPY vllm/v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
......
...@@ -140,9 +140,11 @@ RUN \ ...@@ -140,9 +140,11 @@ RUN \
esac; \ esac; \
}; \ }; \
remove_packages_not_supported_on_aarch64 && \ remove_packages_not_supported_on_aarch64 && \
sed -i 's/^torch==.*/torch==2.10.0/g' requirements/cpu-test.in && \ sed -i 's/^torch==.*/torch==2.11.0/g' requirements/cpu-test.in && \
sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \ sed -i 's/torchaudio.*/torchaudio/g' requirements/cpu-test.in && \
sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \ sed -i 's/torchvision.*/torchvision/g' requirements/cpu-test.in && \
# Related issue: https://github.com/vllm-project/vllm/pull/38800#issuecomment-4228314305
sed -i 's/^sentence-transformers.*/sentence-transformers==5.3.0/g' requirements/cpu-test.in && \
uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu uv pip compile requirements/cpu-test.in -o requirements/cpu-test.txt --index-strategy unsafe-best-match --torch-backend cpu
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
...@@ -195,6 +197,12 @@ ADD ./.buildkite/ ./.buildkite/ ...@@ -195,6 +197,12 @@ ADD ./.buildkite/ ./.buildkite/
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install -e tests/vllm_test_utils uv pip install -e tests/vllm_test_utils
# enable fast downloads from hf (for testing)
ENV HF_XET_HIGH_PERFORMANCE 1
# increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
######################### RELEASE IMAGE ######################### ######################### RELEASE IMAGE #########################
FROM base AS vllm-openai FROM base AS vllm-openai
......
...@@ -269,9 +269,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \ ...@@ -269,9 +269,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -e tests/vllm_test_utils uv pip install --system -e tests/vllm_test_utils
# enable fast downloads from hf (for testing) # enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ ENV HF_XET_HIGH_PERFORMANCE 1
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER 1 # increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
RUN --mount=type=cache,target=/root/.cache/uv \ RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r requirements/nightly_torch_test.txt uv pip install --system -r requirements/nightly_torch_test.txt
......
...@@ -364,9 +364,10 @@ RUN cd /vllm-workspace \ ...@@ -364,9 +364,10 @@ RUN cd /vllm-workspace \
&& python3 -m pip install pytest-shard && python3 -m pip install pytest-shard
# enable fast downloads from hf (for testing) # enable fast downloads from hf (for testing)
RUN --mount=type=cache,target=/root/.cache/uv \ ENV HF_XET_HIGH_PERFORMANCE=1
uv pip install --system hf_transfer
ENV HF_HUB_ENABLE_HF_TRANSFER=1 # increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
# install audio decode package `torchcodec` from source (required due to # install audio decode package `torchcodec` from source (required due to
# ROCm and torch version mismatch) for tests with datasets package # ROCm and torch version mismatch) for tests with datasets package
......
...@@ -147,7 +147,7 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.15.0/rocm700 ...@@ -147,7 +147,7 @@ uv pip install vllm --extra-index-url https://wheels.vllm.ai/rocm/0.15.0/rocm700
# Install dependencies # Install dependencies
pip install --upgrade numba \ pip install --upgrade numba \
scipy \ scipy \
huggingface-hub[cli,hf_transfer] \ huggingface-hub[cli] \
setuptools_scm setuptools_scm
pip install -r requirements/rocm.txt pip install -r requirements/rocm.txt
......
{%- macro format_parameters(properties, required) -%}
{#- Renders a map of parameter schemas as comma-separated `name:{...}` entries, visiting keys in dictsort order.
Entries whose key is listed in standard_keys are skipped.
Each entry emits, in order: optional description, optional nullable, type-specific fields, then the upper-cased type and the closing brace.
The `required` argument is accepted but not read inside this macro.
NOTE(review): the exact separators and ordering are presumably part of the prompt format the model expects; confirm before changing any of them. -#}
{%- set standard_keys = ['description', 'type', 'properties', 'required', 'nullable'] -%}
{%- set ns = namespace(found_first=false) -%}
{%- for key, value in properties | dictsort -%}
{%- set add_comma = false -%}
{%- if key not in standard_keys -%}
{%- if ns.found_first %},{% endif -%}
{%- set ns.found_first = true -%}
{{ key }}:{
{%- if value['description'] -%}
description:<|"|>{{ value['description'] }}<|"|>
{%- set add_comma = true -%}
{%- endif -%}
{%- if value['nullable'] %}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
nullable:true
{%- endif -%}
{#- Type-specific fields: enum for STRING, nested properties and required for OBJECT, items for ARRAY.
NOTE(review): the OBJECT and ARRAY branches emit a leading comma without consulting add_comma and do not update it. -#}
{%- if value['type'] | upper == 'STRING' -%}
{%- if value['enum'] -%}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
enum:{{ format_argument(value['enum']) }}
{%- endif -%}
{%- elif value['type'] | upper == 'OBJECT' -%}
,properties:{
{%- if value['properties'] is defined and value['properties'] is mapping -%}
{{- format_parameters(value['properties'], value['required'] | default([])) -}}
{%- elif value is mapping -%}
{{- format_parameters(value, value['required'] | default([])) -}}
{%- endif -%}
}
{%- if value['required'] -%}
,required:[
{%- for item in value['required'] | default([]) -%}
<|"|>{{- item -}}<|"|>
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
]
{%- endif -%}
{%- elif value['type'] | upper == 'ARRAY' -%}
{%- if value['items'] is mapping and value['items'] -%}
,items:{
{%- set ns_items = namespace(found_first=false) -%}
{%- for item_key, item_value in value['items'] | dictsort -%}
{%- if item_value is not none -%}
{%- if ns_items.found_first %},{% endif -%}
{%- set ns_items.found_first = true -%}
{%- if item_key == 'properties' -%}
properties:{
{%- if item_value is mapping -%}
{{- format_parameters(item_value, value['items']['required'] | default([])) -}}
{%- endif -%}
}
{%- elif item_key == 'required' -%}
required:[
{%- for req_item in item_value -%}
<|"|>{{- req_item -}}<|"|>
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
]
{%- elif item_key == 'type' -%}
{%- if item_value is string -%}
type:{{ format_argument(item_value | upper) }}
{%- else -%}
type:{{ format_argument(item_value | map('upper') | list) }}
{%- endif -%}
{%- else -%}
{{ item_key }}:{{ format_argument(item_value) }}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
}
{%- endif -%}
{%- endif -%}
{#- The type field is always written last and carries the brace that closes this entry. -#}
{%- if add_comma %},{%- else -%} {%- set add_comma = true -%} {% endif -%}
type:<|"|>{{ value['type'] | upper }}<|"|>}
{%- endif -%}
{%- endfor -%}
{%- endmacro -%}
{%- macro format_function_declaration(tool_data) -%}
{#- Renders one tool as `declaration:NAME{description:...,parameters:{...},response:{...}}`.
Reads name, description, parameters and (optionally) response from tool_data['function'].
NOTE(review): the brace closing `parameters:{` is only written inside the params['type'] branch, and the brace closing `response:{` only when the response type is OBJECT; inputs lacking those fields would render unbalanced braces - confirm this is acceptable for the expected tool schemas. -#}
declaration:{{- tool_data['function']['name'] -}}{description:<|"|>{{- tool_data['function']['description'] -}}<|"|>
{%- set params = tool_data['function']['parameters'] -%}
{%- if params -%}
,parameters:{
{%- if params['properties'] -%}
properties:{ {{- format_parameters(params['properties'], params['required']) -}} },
{%- endif -%}
{%- if params['required'] -%}
required:[
{%- for item in params['required'] -%}
<|"|>{{- item -}}<|"|>
{{- ',' if not loop.last -}}
{%- endfor -%}
],
{%- endif -%}
{%- if params['type'] -%}
type:<|"|>{{- params['type'] | upper -}}<|"|>}
{%- endif -%}
{%- endif -%}
{%- if 'response' in tool_data['function'] -%}
{%- set response_declaration = tool_data['function']['response'] -%}
,response:{
{%- if response_declaration['description'] -%}
description:<|"|>{{- response_declaration['description'] -}}<|"|>,
{%- endif -%}
{%- if response_declaration['type'] | upper == 'OBJECT' -%}
type:<|"|>{{- response_declaration['type'] | upper -}}<|"|>}
{%- endif -%}
{%- endif -%}
}
{%- endmacro -%}
{%- macro format_argument(argument, escape_keys=True) -%}
{#- Recursively serializes a value:
strings are wrapped in the <|"|> delimiter, booleans become true/false, mappings become {k:v,...} in dictsort order, other sequences become [a,b,...], and anything else is printed as-is.
escape_keys controls whether mapping keys are also wrapped in the delimiter; it is passed down unchanged to nested values.
The string test comes before the sequence test; presumably because a string would otherwise match the sequence branch. -#}
{%- if argument is string -%}
{{- '<|"|>' + argument + '<|"|>' -}}
{%- elif argument is boolean -%}
{{- 'true' if argument else 'false' -}}
{%- elif argument is mapping -%}
{{- '{' -}}
{%- set ns = namespace(found_first=false) -%}
{%- for key, value in argument | dictsort -%}
{%- if ns.found_first %},{% endif -%}
{%- set ns.found_first = true -%}
{%- if escape_keys -%}
{{- '<|"|>' + key + '<|"|>' -}}
{%- else -%}
{{- key -}}
{%- endif -%}
:{{- format_argument(value, escape_keys=escape_keys) -}}
{%- endfor -%}
{{- '}' -}}
{%- elif argument is sequence -%}
{{- '[' -}}
{%- for item in argument -%}
{{- format_argument(item, escape_keys=escape_keys) -}}
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
{{- ']' -}}
{%- else -%}
{{- argument -}}
{%- endif -%}
{%- endmacro -%}
{%- macro strip_thinking(text) -%}
{#- Removes every span that runs from a '<|channel>' marker up to the following '<channel|>' marker.
The text is split on '<channel|>'; from each piece only the part before '<|channel>' is kept, the kept parts are concatenated, and the result is trimmed. -#}
{%- set ns = namespace(result='') -%}
{%- for part in text.split('<channel|>') -%}
{%- if '<|channel>' in part -%}
{%- set ns.result = ns.result + part.split('<|channel>')[0] -%}
{%- else -%}
{%- set ns.result = ns.result + part -%}
{%- endif -%}
{%- endfor -%}
{{- ns.result | trim -}}
{%- endmacro -%}
{%- macro format_tool_response_block(tool_name, response) -%}
{#- Wraps one tool result in '<|tool_response>' ... '<tool_response|>'.
A mapping response is rendered as response:NAME{k:v,...} with keys in dictsort order and left unescaped; any other response is rendered as response:NAME{value:...}. -#}
{{- '<|tool_response>' -}}
{%- if response is mapping -%}
{{- 'response:' + tool_name + '{' -}}
{%- for key, value in response | dictsort -%}
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
{%- if not loop.last %},{% endif -%}
{%- endfor -%}
{{- '}' -}}
{%- else -%}
{{- 'response:' + tool_name + '{value:' + format_argument(response, escape_keys=False) + '}' -}}
{%- endif -%}
{{- '<tool_response|>' -}}
{%- endmacro -%}
{#- Template body: emits bos_token, an optional system turn, one turn per non-tool message, and an optional generation prompt.
ns.prev_message_type records the kind of the last special item written; it is read at the end to decide how the generation prompt starts. -#}
{%- set ns = namespace(prev_message_type=None) -%}
{%- set loop_messages = messages -%}
{{ bos_token }}
{#- System turn: written when thinking is enabled, tools are supplied, or the first message has role system/developer.
A leading system/developer message is consumed here and dropped from loop_messages. -#}
{%- if (enable_thinking is defined and enable_thinking) or tools or messages[0]['role'] in ['system', 'developer'] -%}
{{- '<|turn>system\n' -}}
{%- if enable_thinking is defined and enable_thinking -%}
{{- '<|think|>' -}}
{%- set ns.prev_message_type = 'think' -%}
{%- endif -%}
{%- if messages[0]['role'] in ['system', 'developer'] -%}
{{- messages[0]['content'] | trim -}}
{%- set loop_messages = messages[1:] -%}
{%- endif -%}
{%- if tools -%}
{%- for tool in tools %}
{{- '<|tool>' -}}
{{- format_function_declaration(tool) | trim -}}
{{- '<tool|>' -}}
{%- endfor %}
{%- set ns.prev_message_type = 'tool' -%}
{%- endif -%}
{{- '<turn|>\n' -}}
{%- endif %}
{#- Record the index of the last user message; reasoning is only replayed for messages after it. -#}
{%- set ns_turn = namespace(last_user_idx=-1) -%}
{%- for i in range(loop_messages | length) -%}
{%- if loop_messages[i]['role'] == 'user' -%}
{%- set ns_turn.last_user_idx = i -%}
{%- endif -%}
{%- endfor -%}
{#- Messages with role 'tool' are not rendered on their own; they are picked up by the forward scan from the assistant message that issued the tool calls. -#}
{%- for message in loop_messages -%}
{%- if message['role'] != 'tool' -%}
{%- set ns.prev_message_type = None -%}
{%- set role = 'model' if message['role'] == 'assistant' else message['role'] -%}
{#- OpenAI may emit multiple assistant messages in one tool loop (user → asst → tool → asst → tool).
Only the first of those should open <|turn>model; later ones continue the same model turn. -#}
{%- set prev_nt = namespace(role=None, found=false) -%}
{%- if loop.index0 > 0 -%}
{%- for j in range(loop.index0 - 1, -1, -1) -%}
{%- if not prev_nt.found -%}
{%- if loop_messages[j]['role'] != 'tool' -%}
{%- set prev_nt.role = loop_messages[j]['role'] -%}
{%- set prev_nt.found = true -%}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{%- set continue_same_model_turn = (role == 'model' and prev_nt.role == 'assistant') -%}
{%- if not continue_same_model_turn -%}
{{- '<|turn>' + role + '\n' }}
{%- endif -%}
{#- Reasoning is written only when the message has it, comes after the last user message, and also carries tool calls. -#}
{%- if message.get('reasoning') and loop.index0 > ns_turn.last_user_idx and message.get('tool_calls') -%}
{{- '<|channel>thought\n' + message['reasoning'] + '\n<channel|>'}}
{%- endif -%}
{%- if message['tool_calls'] -%}
{%- for tool_call in message['tool_calls'] -%}
{%- set function = tool_call['function'] -%}
{{- '<|tool_call>call:' + function['name'] + '{' -}}
{%- if function['arguments'] is mapping -%}
{%- set ns_args = namespace(found_first=false) -%}
{%- for key, value in function['arguments'] | dictsort -%}
{%- if ns_args.found_first %},{% endif -%}
{%- set ns_args.found_first = true -%}
{{- key -}}:{{- format_argument(value, escape_keys=False) -}}
{%- endfor -%}
{%- elif function['arguments'] is string -%}
{{- function['arguments'] -}}
{%- endif -%}
{{- '}<tool_call|>' -}}
{%- endfor -%}
{%- set ns.prev_message_type = 'tool_call' -%}
{%- endif -%}
{%- set ns_tr_out = namespace(flag=false) -%}
{%- if message.get('tool_responses') -%}
{#- Legacy: tool_responses embedded on the assistant message -#}
{%- for tool_response in message['tool_responses'] -%}
{{- format_tool_response_block(tool_response['name'] | default('unknown'), tool_response['response']) -}}
{%- set ns_tr_out.flag = true -%}
{%- set ns.prev_message_type = 'tool_response' -%}
{%- endfor -%}
{%- elif message.get('tool_calls') -%}
{#- OpenAI Chat Completions: consecutive following messages with role "tool" (no break/continue; range scan) -#}
{%- set ns_tool_scan = namespace(stopped=false) -%}
{%- for k in range(loop.index0 + 1, loop_messages | length) -%}
{%- if ns_tool_scan.stopped -%}
{%- elif loop_messages[k]['role'] != 'tool' -%}
{%- set ns_tool_scan.stopped = true -%}
{%- else -%}
{%- set follow = loop_messages[k] -%}
{%- set ns_tname = namespace(name=follow.get('name') | default('unknown')) -%}
{%- for tc in message['tool_calls'] -%}
{%- if tc.get('id') == follow.get('tool_call_id') -%}
{%- set ns_tname.name = tc['function']['name'] -%}
{%- endif -%}
{%- endfor -%}
{%- set tool_body = follow.get('content') -%}
{%- if tool_body is string -%}
{{- format_tool_response_block(ns_tname.name, tool_body) -}}
{%- elif tool_body is sequence and tool_body is not string -%}
{%- set ns_txt = namespace(s='') -%}
{%- for part in tool_body -%}
{%- if part.get('type') == 'text' -%}
{%- set ns_txt.s = ns_txt.s + (part.get('text') | default('')) -%}
{%- endif -%}
{%- endfor -%}
{{- format_tool_response_block(ns_tname.name, ns_txt.s) -}}
{%- else -%}
{{- format_tool_response_block(ns_tname.name, tool_body) -}}
{%- endif -%}
{%- set ns_tr_out.flag = true -%}
{%- set ns.prev_message_type = 'tool_response' -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{#- Message content: plain strings or a list of typed parts. Model text goes through strip_thinking; other roles are trimmed. Image, audio and video parts become placeholder tokens. -#}
{%- if message['content'] is string -%}
{%- if role == 'model' -%}
{{- strip_thinking(message['content']) -}}
{%- else -%}
{{- message['content'] | trim -}}
{%- endif -%}
{%- elif message['content'] is sequence -%}
{%- for item in message['content'] -%}
{%- if item['type'] == 'text' -%}
{%- if role == 'model' -%}
{{- strip_thinking(item['text']) -}}
{%- else -%}
{{- item['text'] | trim -}}
{%- endif -%}
{%- elif item['type'] == 'image' -%}
{{- '\n\n<|image|>\n\n' -}}
{%- set ns.prev_message_type = 'image' -%}
{%- elif item['type'] == 'audio' -%}
{{- '<|audio|>' -}}
{%- set ns.prev_message_type = 'audio' -%}
{%- elif item['type'] == 'video' -%}
{{- '\n\n<|video|>\n\n' -}}
{%- set ns.prev_message_type = 'video' -%}
{%- endif -%}
{%- endfor -%}
{%- endif -%}
{#- The turn is left open when tool responses were written and the message has no content, so the model turn can continue after them. -#}
{%- if not (ns_tr_out.flag and not message.get('content')) -%}
{{- '<turn|>\n' -}}
{%- endif -%}
{%- endif -%}
{%- endfor -%}
{#- Generation prompt: a new model turn is opened unless the last thing written was a tool response. -#}
{%- if add_generation_prompt -%}
{%- if ns.prev_message_type != 'tool_response' -%}
{{- '<|turn>model\n' -}}
{%- endif -%}
{#- When thinking is not enabled, an empty thought channel is written up front. -#}
{%- if not enable_thinking | default(false) -%}
{{- '<|channel>thought\n<channel|>' -}}
{%- endif -%}
{%- endif -%}
...@@ -7,7 +7,7 @@ requests >= 2.26.0 ...@@ -7,7 +7,7 @@ requests >= 2.26.0
tqdm tqdm
blake3 blake3
py-cpuinfo py-cpuinfo
transformers >= 4.56.0, < 5 transformers >= 4.56.0, != 5.0.*, != 5.1.*, != 5.2.*, != 5.3.*, != 5.4.*, != 5.5.0
tokenizers >= 0.21.1 # Required for fast incremental detokenization. tokenizers >= 0.21.1 # Required for fast incremental detokenization.
protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994 protobuf >= 5.29.6, !=6.30.*, !=6.31.*, !=6.32.*, !=6.33.0.*, !=6.33.1.*, !=6.33.2.*, !=6.33.3.*, !=6.33.4.* # Required by LlamaTokenizer, gRPC. CVE-2026-0994
fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint. fastapi[standard] >= 0.115.0 # Required by FastAPI's form models in the OpenAI API server's audio transcriptions endpoint.
...@@ -37,7 +37,7 @@ pyyaml ...@@ -37,7 +37,7 @@ pyyaml
six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12
setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12 setuptools>=77.0.3,<81.0.0; python_version > '3.11' # Setuptools is used by triton, we need to ensure a modern version is installed for 3.12+ so that it does not try to import distutils, which was removed in 3.12
einops # Required for Qwen2-VL. einops # Required for Qwen2-VL.
compressed-tensors == 0.14.0.1 # required for compressed-tensors compressed-tensors == 0.15.0.1 # required for compressed-tensors
depyf==0.20.0 # required for profiling and debugging with compilation config depyf==0.20.0 # required for profiling and debugging with compilation config
cloudpickle # allows pickling lambda functions in model_executor/models/registry.py cloudpickle # allows pickling lambda functions in model_executor/models/registry.py
watchfiles # required for http server to monitor the updates of TLS files watchfiles # required for http server to monitor the updates of TLS files
......
lmcache >= 0.3.9 lmcache >= 0.3.9
nixl >= 0.7.1, < 0.10.0 # Required for disaggregated prefill nixl >= 0.7.1, < 0.10.0 # Required for disaggregated prefill
nixl-cu12 >= 0.7.1, < 0.10.0
nixl-cu13 >= 0.7.1, < 0.10.0
mooncake-transfer-engine >= 0.3.8 mooncake-transfer-engine >= 0.3.8
...@@ -29,8 +29,8 @@ opencv-python-headless >= 4.13.0 # required for video test ...@@ -29,8 +29,8 @@ opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.11 # required for model evaluation test lm-eval[api]>=0.4.11 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5 transformers==5.5.3
tokenizers==0.22.0 tokenizers==0.22.2
schemathesis>=3.39.15 # Required for openai schema test. schemathesis>=3.39.15 # Required for openai schema test.
# quantization # quantization
bitsandbytes>=0.49.2 bitsandbytes>=0.49.2
......
...@@ -36,8 +36,8 @@ opencv-python-headless>=4.13.0 # required for video test ...@@ -36,8 +36,8 @@ opencv-python-headless>=4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.11 # required for model evaluation test lm-eval[api]>=0.4.11 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5 transformers==5.5.3
tokenizers==0.22.0 tokenizers==0.22.2
schemathesis>=3.39.15 # Required for openai schema test schemathesis>=3.39.15 # Required for openai schema test
# quantization # quantization
bitsandbytes==0.49.2 bitsandbytes==0.49.2
...@@ -80,4 +80,3 @@ plotly # required for perf comparison html report ...@@ -80,4 +80,3 @@ plotly # required for perf comparison html report
rapidfuzz rapidfuzz
torchgeo==0.7.0 torchgeo==0.7.0
multiprocess==0.70.16 multiprocess==0.70.16
huggingface-hub==0.36.2
...@@ -232,7 +232,6 @@ filelock==3.25.2 ...@@ -232,7 +232,6 @@ filelock==3.25.2
# python-discovery # python-discovery
# ray # ray
# torch # torch
# transformers
# virtualenv # virtualenv
fiona==1.10.1 fiona==1.10.1
# via torchgeo # via torchgeo
...@@ -318,7 +317,7 @@ h5py==3.16.0 ...@@ -318,7 +317,7 @@ h5py==3.16.0
# via terratorch # via terratorch
harfile==0.4.0 harfile==0.4.0
# via schemathesis # via schemathesis
hf-xet==1.4.2 hf-xet==1.4.3
# via huggingface-hub # via huggingface-hub
hiredis==3.3.1 hiredis==3.3.1
# via tensorizer # via tensorizer
...@@ -332,11 +331,11 @@ httpx==0.27.2 ...@@ -332,11 +331,11 @@ httpx==0.27.2
# via # via
# -r requirements/rocm-test.in # -r requirements/rocm-test.in
# diffusers # diffusers
# huggingface-hub
# perceptron # perceptron
# schemathesis # schemathesis
huggingface-hub==0.36.2 huggingface-hub==1.10.2
# via # via
# -r requirements/rocm-test.in
# accelerate # accelerate
# datasets # datasets
# diffusers # diffusers
...@@ -970,7 +969,6 @@ requests==2.32.5 ...@@ -970,7 +969,6 @@ requests==2.32.5
# google-api-core # google-api-core
# google-cloud-storage # google-cloud-storage
# gpt-oss # gpt-oss
# huggingface-hub
# lightly # lightly
# lm-eval # lm-eval
# mistral-common # mistral-common
...@@ -983,7 +981,6 @@ requests==2.32.5 ...@@ -983,7 +981,6 @@ requests==2.32.5
# starlette-testclient # starlette-testclient
# tacoreader # tacoreader
# tiktoken # tiktoken
# transformers
# wandb # wandb
resampy==0.4.3 resampy==0.4.3
# via -r requirements/rocm-test.in # via -r requirements/rocm-test.in
...@@ -1191,7 +1188,7 @@ timm==1.0.17 ...@@ -1191,7 +1188,7 @@ timm==1.0.17
# segmentation-models-pytorch # segmentation-models-pytorch
# terratorch # terratorch
# torchgeo # torchgeo
tokenizers==0.22.0 tokenizers==0.22.2
# via # via
# -c requirements/common.txt # -c requirements/common.txt
# -r requirements/rocm-test.in # -r requirements/rocm-test.in
...@@ -1230,7 +1227,7 @@ tqdm==4.67.3 ...@@ -1230,7 +1227,7 @@ tqdm==4.67.3
# tacoreader # tacoreader
# terratorch # terratorch
# transformers # transformers
transformers==4.57.5 transformers==5.5.3
# via # via
# -c requirements/common.txt # -c requirements/common.txt
# -r requirements/rocm-test.in # -r requirements/rocm-test.in
...@@ -1252,7 +1249,9 @@ typepy==1.3.4 ...@@ -1252,7 +1249,9 @@ typepy==1.3.4
typer==0.24.1 typer==0.24.1
# via # via
# fastsafetensors # fastsafetensors
# huggingface-hub
# perceptron # perceptron
# transformers
typeshed-client==2.9.0 typeshed-client==2.9.0
# via jsonargparse # via jsonargparse
typing-extensions==4.15.0 typing-extensions==4.15.0
......
...@@ -18,7 +18,7 @@ httpx ...@@ -18,7 +18,7 @@ httpx
librosa # required for audio tests librosa # required for audio tests
vector_quantize_pytorch # required for minicpmo_26 test vector_quantize_pytorch # required for minicpmo_26 test
vocos # required for minicpmo_26 test vocos # required for minicpmo_26 test
peft>=0.15.0 # required for phi-4-mm test peft>=0.18.1 # required for phi-4-mm test
pqdm pqdm
ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests ray[cgraph,default]>=2.48.0 # Ray Compiled Graph, required by pipeline parallelism tests
resampy # required for audio tests resampy # required for audio tests
...@@ -39,8 +39,8 @@ opencv-python-headless >= 4.13.0 # required for video test ...@@ -39,8 +39,8 @@ opencv-python-headless >= 4.13.0 # required for video test
datamodel_code_generator # required for minicpm3 test datamodel_code_generator # required for minicpm3 test
lm-eval[api]>=0.4.11 # required for model evaluation test lm-eval[api]>=0.4.11 # required for model evaluation test
mteb[bm25s]>=2, <3 # required for mteb test mteb[bm25s]>=2, <3 # required for mteb test
transformers==4.57.5 transformers==5.5.3
tokenizers==0.22.0 tokenizers==0.22.2
schemathesis>=3.39.15 # Required for openai schema test. schemathesis>=3.39.15 # Required for openai schema test.
# quantization # quantization
bitsandbytes==0.49.2 bitsandbytes==0.49.2
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment