"tests/vscode:/vscode.git/clone" did not exist on "eb4205fee52dce68f916e1793faec1fa557a014d"
Commit 53076d70 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.8.2' into v0.8.2-ori

parents 322a0be6 9c5c81b0
...@@ -19,11 +19,11 @@ template <typename scalar_t> ...@@ -19,11 +19,11 @@ template <typename scalar_t>
static __global__ void quantize_q8_1(const scalar_t* __restrict__ x, static __global__ void quantize_q8_1(const scalar_t* __restrict__ x,
void* __restrict__ vy, const int kx, void* __restrict__ vy, const int kx,
const int kx_padded) { const int kx_padded) {
const int ix = blockDim.x * blockIdx.x + threadIdx.x; const auto ix = blockDim.x * blockIdx.x + threadIdx.x;
if (ix >= kx_padded) { if (ix >= kx_padded) {
return; return;
} }
const int iy = blockDim.y * blockIdx.y + threadIdx.y; const auto iy = blockDim.y * blockIdx.y + threadIdx.y;
const int i_padded = iy * kx_padded + ix; const int i_padded = iy * kx_padded + ix;
block_q8_1* y = (block_q8_1*)vy; block_q8_1* y = (block_q8_1*)vy;
......
...@@ -14,10 +14,10 @@ static __device__ __forceinline__ void mul_mat_q( ...@@ -14,10 +14,10 @@ static __device__ __forceinline__ void mul_mat_q(
const int & ncols_dst = ncols_y; const int & ncols_dst = ncols_y;
const int row_dst_0 = blockIdx.x*mmq_y; const auto row_dst_0 = blockIdx.x*mmq_y;
const int & row_x_0 = row_dst_0; const int & row_x_0 = row_dst_0;
const int col_dst_0 = blockIdx.y*mmq_x; const auto col_dst_0 = blockIdx.y*mmq_x;
const int & col_y_0 = col_dst_0; const int & col_y_0 = col_dst_0;
int * tile_x_ql = nullptr; int * tile_x_ql = nullptr;
...@@ -39,7 +39,7 @@ static __device__ __forceinline__ void mul_mat_q( ...@@ -39,7 +39,7 @@ static __device__ __forceinline__ void mul_mat_q(
#pragma unroll #pragma unroll
for (int ir = 0; ir < qr && ib0 + ir * blocks_per_warp/qr < blocks_per_row_x; ++ir) { for (int ir = 0; ir < qr && ib0 + ir * blocks_per_warp/qr < blocks_per_row_x; ++ir) {
const int kqs = ir*WARP_SIZE_GGUF + threadIdx.x; const auto kqs = ir*WARP_SIZE_GGUF + threadIdx.x;
const int kbxd = kqs / QI8_1; const int kbxd = kqs / QI8_1;
#pragma unroll #pragma unroll
...@@ -53,7 +53,7 @@ static __device__ __forceinline__ void mul_mat_q( ...@@ -53,7 +53,7 @@ static __device__ __forceinline__ void mul_mat_q(
#pragma unroll #pragma unroll
for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) { for (int ids0 = 0; ids0 < mmq_x; ids0 += nwarps * QI8_1) {
const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE_GGUF/QI8_1)) % mmq_x; const int ids = (ids0 + threadIdx.y * QI8_1 + threadIdx.x / (WARP_SIZE_GGUF/QI8_1)) % mmq_x;
const int kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1); const auto kby = threadIdx.x % (WARP_SIZE_GGUF/QI8_1);
const int col_y_eff = min(col_y_0 + ids, ncols_y-1); const int col_y_eff = min(col_y_0 + ids, ncols_y-1);
// if the sum is not needed it's faster to transform the scale to f32 ahead of time // if the sum is not needed it's faster to transform the scale to f32 ahead of time
...@@ -87,14 +87,14 @@ static __device__ __forceinline__ void mul_mat_q( ...@@ -87,14 +87,14 @@ static __device__ __forceinline__ void mul_mat_q(
#pragma unroll #pragma unroll
for (int j = 0; j < mmq_x; j += nwarps) { for (int j = 0; j < mmq_x; j += nwarps) {
const int col_dst = col_dst_0 + j + threadIdx.y; const auto col_dst = col_dst_0 + j + threadIdx.y;
if (col_dst >= ncols_dst) { if (col_dst >= ncols_dst) {
return; return;
} }
#pragma unroll #pragma unroll
for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) { for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
const int row_dst = row_dst_0 + threadIdx.x + i; const auto row_dst = row_dst_0 + threadIdx.x + i;
if (row_dst >= nrows_dst) { if (row_dst >= nrows_dst) {
continue; continue;
} }
......
// copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu // copied and adapted from https://github.com/ggerganov/llama.cpp/blob/b2899/ggml-cuda/mmvq.cu
template <typename scalar_t, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda> template <typename scalar_t, int qk, int qi, typename block_q_t, int vdr, vec_dot_q_cuda_t vec_dot_q_cuda>
static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows) { static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * __restrict__ vy, scalar_t * __restrict__ dst, const int ncols, const int nrows) {
const int row = blockIdx.x*blockDim.y + threadIdx.y; const auto row = blockIdx.x*blockDim.y + threadIdx.y;
if (row >= nrows) { if (row >= nrows) {
return; return;
...@@ -16,7 +16,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void * ...@@ -16,7 +16,7 @@ static __global__ void mul_mat_vec_q(const void * __restrict__ vx, const void *
const block_q_t * x = (const block_q_t *) vx; const block_q_t * x = (const block_q_t *) vx;
const block_q8_1 * y = (const block_q8_1 *) vy; const block_q8_1 * y = (const block_q8_1 *) vy;
for (int i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) { for (auto i = threadIdx.x / (qi/vdr); i < blocks_per_row; i += blocks_per_warp) {
const int ibx = row*blocks_per_row + i; // x block index const int ibx = row*blocks_per_row + i; // x block index
const int iby = i * (qk/QK8_1); // y block index that aligns with ibx const int iby = i * (qk/QK8_1); // y block index that aligns with ibx
......
...@@ -19,10 +19,10 @@ static __device__ __forceinline__ void moe_q( ...@@ -19,10 +19,10 @@ static __device__ __forceinline__ void moe_q(
const int ncols_dst = ncols_y * top_k; const int ncols_dst = ncols_y * top_k;
const int row_dst_0 = blockIdx.x * mmq_y; const auto row_dst_0 = blockIdx.x * mmq_y;
const int& row_x_0 = row_dst_0; const int& row_x_0 = row_dst_0;
const int col_dst_0 = blockIdx.y * mmq_x; const auto col_dst_0 = blockIdx.y * mmq_x;
int token_offs[mmq_x / nwarps]; int token_offs[mmq_x / nwarps];
for (int i = 0; i < mmq_x; i += nwarps) { for (int i = 0; i < mmq_x; i += nwarps) {
...@@ -56,7 +56,7 @@ static __device__ __forceinline__ void moe_q( ...@@ -56,7 +56,7 @@ static __device__ __forceinline__ void moe_q(
const int n_per_r = ((qk * blocks_per_warp) / qr); const int n_per_r = ((qk * blocks_per_warp) / qr);
#pragma unroll #pragma unroll
for (int ir = 0; ir < qr && ib0 * qk + ir * n_per_r < ncols_x; ++ir) { for (int ir = 0; ir < qr && ib0 * qk + ir * n_per_r < ncols_x; ++ir) {
const int kqs = ir * WARP_SIZE_GGUF + threadIdx.x; const auto kqs = ir * WARP_SIZE_GGUF + threadIdx.x;
const int kbxd = kqs / QI8_1; const int kbxd = kqs / QI8_1;
#pragma unroll #pragma unroll
...@@ -73,7 +73,7 @@ static __device__ __forceinline__ void moe_q( ...@@ -73,7 +73,7 @@ static __device__ __forceinline__ void moe_q(
} }
if (threadIdx.x < n_per_r / QK8_1) { if (threadIdx.x < n_per_r / QK8_1) {
const int kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1); const auto kby = threadIdx.x % (WARP_SIZE_GGUF / QI8_1);
const int col_y_eff = token_offs[threadIdx.y] / top_k; const int col_y_eff = token_offs[threadIdx.y] / top_k;
const int block_x = const int block_x =
ib0 * (qk / QK8_1) + ir * (WARP_SIZE_GGUF / QI8_1) + kby; ib0 * (qk / QK8_1) + ir * (WARP_SIZE_GGUF / QI8_1) + kby;
...@@ -119,7 +119,7 @@ static __device__ __forceinline__ void moe_q( ...@@ -119,7 +119,7 @@ static __device__ __forceinline__ void moe_q(
#pragma unroll #pragma unroll
for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) { for (int i = 0; i < mmq_y; i += WARP_SIZE_GGUF) {
const int row_dst = row_dst_0 + threadIdx.x + i; const auto row_dst = row_dst_0 + threadIdx.x + i;
if (row_dst >= nrows_dst) { if (row_dst >= nrows_dst) {
continue; continue;
} }
......
...@@ -203,12 +203,12 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel( ...@@ -203,12 +203,12 @@ __global__ void gemm_half_q_half_gptq_4bit_kernel(
MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int t = threadIdx.x; auto t = threadIdx.x;
// Block // Block
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
int offset_m = blockIdx.y * m_count; auto offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE; auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m); [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
...@@ -341,12 +341,12 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel( ...@@ -341,12 +341,12 @@ __global__ void gemm_half_q_half_gptq_2bit_kernel(
MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int t = threadIdx.x; auto t = threadIdx.x;
// Block // Block
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
int offset_m = blockIdx.y * m_count; auto offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE; auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m); [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
...@@ -462,12 +462,12 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel( ...@@ -462,12 +462,12 @@ __global__ void gemm_half_q_half_gptq_3bit_kernel(
MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int t = threadIdx.x; auto t = threadIdx.x;
// Block // Block
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
int offset_m = blockIdx.y * m_count; auto offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE; auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m); [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
...@@ -590,12 +590,12 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel( ...@@ -590,12 +590,12 @@ __global__ void gemm_half_q_half_gptq_8bit_kernel(
MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int t = threadIdx.x; auto t = threadIdx.x;
// Block // Block
int offset_n = blockIdx.x * BLOCK_KN_SIZE * 4; auto offset_n = blockIdx.x * BLOCK_KN_SIZE * 4;
int offset_m = blockIdx.y * m_count; auto offset_m = blockIdx.y * m_count;
int offset_k = blockIdx.z * BLOCK_KN_SIZE; auto offset_k = blockIdx.z * BLOCK_KN_SIZE;
[[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n); [[maybe_unused]] int end_n = min(offset_n + BLOCK_KN_SIZE * 4, size_n);
[[maybe_unused]] int end_m = min(offset_m + m_count, size_m); [[maybe_unused]] int end_m = min(offset_m + m_count, size_m);
...@@ -769,14 +769,14 @@ __global__ void reconstruct_exllama_8bit_kernel( ...@@ -769,14 +769,14 @@ __global__ void reconstruct_exllama_8bit_kernel(
MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_q8_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int offset_k = BLOCK_KN_SIZE * blockIdx.y; auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
// Preload remapping table // Preload remapping table
__shared__ int perm[BLOCK_KN_SIZE]; __shared__ int perm[BLOCK_KN_SIZE];
int t = threadIdx.x; auto t = threadIdx.x;
if (b_q_perm) { if (b_q_perm) {
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t]; if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
...@@ -866,14 +866,14 @@ __global__ void reconstruct_exllama_4bit_kernel( ...@@ -866,14 +866,14 @@ __global__ void reconstruct_exllama_4bit_kernel(
MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_q4_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int offset_k = BLOCK_KN_SIZE * blockIdx.y; auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
// Preload remapping table // Preload remapping table
__shared__ int perm[BLOCK_KN_SIZE]; __shared__ int perm[BLOCK_KN_SIZE];
int t = threadIdx.x; auto t = threadIdx.x;
if (b_q_perm) { if (b_q_perm) {
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t]; if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
...@@ -971,14 +971,14 @@ __global__ void reconstruct_exllama_3bit_kernel( ...@@ -971,14 +971,14 @@ __global__ void reconstruct_exllama_3bit_kernel(
MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_q3_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int offset_k = BLOCK_KN_SIZE * blockIdx.y; auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
// Preload remapping table // Preload remapping table
__shared__ int perm[BLOCK_KN_SIZE]; __shared__ int perm[BLOCK_KN_SIZE];
int t = threadIdx.x; auto t = threadIdx.x;
if (b_q_perm) { if (b_q_perm) {
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t]; if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
...@@ -1069,14 +1069,14 @@ __global__ void reconstruct_exllama_2bit_kernel( ...@@ -1069,14 +1069,14 @@ __global__ void reconstruct_exllama_2bit_kernel(
MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n); MatrixView_q2_row b_gptq_qzeros_(b_gptq_qzeros, groups, size_n);
MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n); MatrixView_half b_gptq_scales_(b_gptq_scales, groups, size_n);
int offset_k = BLOCK_KN_SIZE * blockIdx.y; auto offset_k = BLOCK_KN_SIZE * blockIdx.y;
int offset_n = BLOCK_KN_SIZE * blockIdx.x * 4; auto offset_n = BLOCK_KN_SIZE * blockIdx.x * 4;
int end_k = min(offset_k + BLOCK_KN_SIZE, size_k); int end_k = min(offset_k + BLOCK_KN_SIZE, size_k);
// Preload remapping table // Preload remapping table
__shared__ int perm[BLOCK_KN_SIZE]; __shared__ int perm[BLOCK_KN_SIZE];
int t = threadIdx.x; auto t = threadIdx.x;
if (b_q_perm) { if (b_q_perm) {
if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t]; if (offset_k + t < size_k) perm[t] = b_q_perm[offset_k + t];
...@@ -1185,11 +1185,11 @@ __global__ void gemm_half_q_half_alt_4bit_kernel( ...@@ -1185,11 +1185,11 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
int zero_width = width / 8; int zero_width = width / 8;
int vec_height = height * 4; int vec_height = height * 4;
const int blockwidth2 = BLOCK_KN_SIZE / 2; const int blockwidth2 = BLOCK_KN_SIZE / 2;
int b = blockIdx.y * BLOCK_M_SIZE_MAX; auto b = blockIdx.y * BLOCK_M_SIZE_MAX;
int b_end = min(BLOCK_M_SIZE_MAX, batch - b); int b_end = min(BLOCK_M_SIZE_MAX, batch - b);
int h = BLOCK_KN_SIZE * blockIdx.z / 8; auto h = BLOCK_KN_SIZE * blockIdx.z / 8;
int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4; int h_end = min(BLOCK_KN_SIZE / 8, height - h) * 4;
int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
__shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2]; __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
if (threadIdx.x < h_end) { if (threadIdx.x < h_end) {
...@@ -1201,8 +1201,8 @@ __global__ void gemm_half_q_half_alt_4bit_kernel( ...@@ -1201,8 +1201,8 @@ __global__ void gemm_half_q_half_alt_4bit_kernel(
} }
__shared__ half2 deq2[256][8]; __shared__ half2 deq2[256][8];
int val = threadIdx.x / 8; auto val = threadIdx.x / 8;
int off = threadIdx.x % 8; auto off = threadIdx.x % 8;
for (; val < 256; val += BLOCK_KN_SIZE / 8) { for (; val < 256; val += BLOCK_KN_SIZE / 8) {
deq2[val][off] = deq2[val][off] =
__halves2half2(__int2half_rn(val & 0xF), __int2half_rn(val >> 4)); __halves2half2(__int2half_rn(val & 0xF), __int2half_rn(val >> 4));
...@@ -1284,11 +1284,11 @@ __global__ void gemm_half_q_half_alt_8bit_kernel( ...@@ -1284,11 +1284,11 @@ __global__ void gemm_half_q_half_alt_8bit_kernel(
int zero_width = width / 4; int zero_width = width / 4;
int vec_height = height * 2; int vec_height = height * 2;
const int blockwidth2 = BLOCK_KN_SIZE / 2; const int blockwidth2 = BLOCK_KN_SIZE / 2;
int b = blockIdx.y * BLOCK_M_SIZE_MAX; auto b = blockIdx.y * BLOCK_M_SIZE_MAX;
int b_end = min(BLOCK_M_SIZE_MAX, batch - b); int b_end = min(BLOCK_M_SIZE_MAX, batch - b);
int h = BLOCK_KN_SIZE * blockIdx.z / 4; auto h = BLOCK_KN_SIZE * blockIdx.z / 4;
int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2; int h_end = min(BLOCK_KN_SIZE / 4, height - h) * 2;
int w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; auto w = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
__shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2]; __shared__ half2 blockvec[BLOCK_M_SIZE_MAX][blockwidth2];
if (threadIdx.x < h_end) { if (threadIdx.x < h_end) {
...@@ -1397,8 +1397,8 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w, ...@@ -1397,8 +1397,8 @@ __global__ void reconstruct_gptq_kernel(const uint32_t* __restrict__ w,
half* __restrict__ out) { half* __restrict__ out) {
// Start of block // Start of block
int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
int row = blockIdx.y * 32 / bit; auto row = blockIdx.y * 32 / bit;
if (column >= width) return; if (column >= width) return;
// Views // Views
...@@ -1429,8 +1429,8 @@ __global__ void reconstruct_gptq_3bit_kernel( ...@@ -1429,8 +1429,8 @@ __global__ void reconstruct_gptq_3bit_kernel(
const int height, const int width, const int group, const int height, const int width, const int group,
half* __restrict__ out) { half* __restrict__ out) {
// Start of block // Start of block
int column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x; auto column = BLOCK_KN_SIZE * blockIdx.x + threadIdx.x;
int row = blockIdx.y * 32; auto row = blockIdx.y * 32;
if (column >= width) return; if (column >= width) return;
// Views // Views
...@@ -1546,7 +1546,7 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a, ...@@ -1546,7 +1546,7 @@ void gemm_half_q_half_cuda(cublasHandle_t cublas_handle, const half* a,
__global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight, __global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight,
const int size_k, const int size_n) { const int size_k, const int size_n) {
int n = blockIdx.x * THREADS_X + threadIdx.x; auto n = blockIdx.x * THREADS_X + threadIdx.x;
if (n >= size_n) return; if (n >= size_n) return;
int k = 0; int k = 0;
uint32_t* b_ptr = b_q_weight + n; uint32_t* b_ptr = b_q_weight + n;
...@@ -1559,7 +1559,7 @@ __global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight, ...@@ -1559,7 +1559,7 @@ __global__ void shuffle_4bit_kernel(uint32_t* __restrict__ b_q_weight,
__global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight, __global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight,
const int size_k, const int size_n) { const int size_k, const int size_n) {
int n = blockIdx.x * THREADS_X + threadIdx.x; auto n = blockIdx.x * THREADS_X + threadIdx.x;
if (n >= size_n) return; if (n >= size_n) return;
int k = 0; int k = 0;
uint32_t* b_ptr = b_q_weight + n; uint32_t* b_ptr = b_q_weight + n;
...@@ -1572,7 +1572,7 @@ __global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight, ...@@ -1572,7 +1572,7 @@ __global__ void shuffle_8bit_kernel(uint32_t* __restrict__ b_q_weight,
__global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight, __global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight,
const int size_k, const int size_n) { const int size_k, const int size_n) {
int n = blockIdx.x * THREADS_X + threadIdx.x; auto n = blockIdx.x * THREADS_X + threadIdx.x;
if (n >= size_n) return; if (n >= size_n) return;
int k = 0; int k = 0;
uint32_t* b_ptr = b_q_weight + n; uint32_t* b_ptr = b_q_weight + n;
...@@ -1585,7 +1585,7 @@ __global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight, ...@@ -1585,7 +1585,7 @@ __global__ void shuffle_2bit_kernel(uint32_t* __restrict__ b_q_weight,
__global__ void shuffle_3bit_kernel(uint32_t* __restrict__ b_q_weight, __global__ void shuffle_3bit_kernel(uint32_t* __restrict__ b_q_weight,
const int size_k, const int size_n) { const int size_k, const int size_n) {
int n = blockIdx.x * THREADS_X + threadIdx.x; auto n = blockIdx.x * THREADS_X + threadIdx.x;
if (n >= size_n) return; if (n >= size_n) return;
int k = 0; int k = 0;
uint32_t* b_ptr = b_q_weight + n; uint32_t* b_ptr = b_q_weight + n;
...@@ -1603,9 +1603,9 @@ __global__ void make_sequential_4bit_kernel(const uint32_t* __restrict__ w, ...@@ -1603,9 +1603,9 @@ __global__ void make_sequential_4bit_kernel(const uint32_t* __restrict__ w,
const uint64_t* w2 = (uint64_t*)w; const uint64_t* w2 = (uint64_t*)w;
uint64_t* w_new2 = (uint64_t*)w_new; uint64_t* w_new2 = (uint64_t*)w_new;
int w2_stride = w_width >> 1; int w2_stride = w_width >> 1;
int w2_column = THREADS_X * blockIdx.x + threadIdx.x; auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
if (w2_column >= w2_stride) return; if (w2_column >= w2_stride) return;
int w_new2_row = blockIdx.y; auto w_new2_row = blockIdx.y;
int q_perm_idx = w_new2_row << 3; int q_perm_idx = w_new2_row << 3;
uint64_t dst = 0; uint64_t dst = 0;
...@@ -1634,9 +1634,9 @@ __global__ void make_sequential_2bit_kernel(const uint32_t* __restrict__ w, ...@@ -1634,9 +1634,9 @@ __global__ void make_sequential_2bit_kernel(const uint32_t* __restrict__ w,
const uint64_t* w2 = (uint64_t*)w; const uint64_t* w2 = (uint64_t*)w;
uint64_t* w_new2 = (uint64_t*)w_new; uint64_t* w_new2 = (uint64_t*)w_new;
int w2_stride = w_width >> 1; int w2_stride = w_width >> 1;
int w2_column = THREADS_X * blockIdx.x + threadIdx.x; auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
if (w2_column >= w2_stride) return; if (w2_column >= w2_stride) return;
int w_new2_row = blockIdx.y; auto w_new2_row = blockIdx.y;
int q_perm_idx = w_new2_row << 4; int q_perm_idx = w_new2_row << 4;
uint64_t dst = 0; uint64_t dst = 0;
...@@ -1662,10 +1662,10 @@ __global__ void make_sequential_3bit_kernel(const uint32_t* __restrict__ w, ...@@ -1662,10 +1662,10 @@ __global__ void make_sequential_3bit_kernel(const uint32_t* __restrict__ w,
uint32_t* __restrict__ w_new, uint32_t* __restrict__ w_new,
const int* __restrict__ q_perm, const int* __restrict__ q_perm,
const int w_width) { const int w_width) {
int w_column = THREADS_X * blockIdx.x + threadIdx.x; auto w_column = THREADS_X * blockIdx.x + threadIdx.x;
if (w_column >= w_width) return; if (w_column >= w_width) return;
int w_new_row = blockIdx.y * 3; auto w_new_row = blockIdx.y * 3;
int q_perm_idx = blockIdx.y << 5; auto q_perm_idx = blockIdx.y << 5;
uint32_t dst[3] = {0, 0, 0}; uint32_t dst[3] = {0, 0, 0};
#pragma unroll #pragma unroll
...@@ -1748,9 +1748,9 @@ __global__ void make_sequential_8bit_kernel(const uint32_t* __restrict__ w, ...@@ -1748,9 +1748,9 @@ __global__ void make_sequential_8bit_kernel(const uint32_t* __restrict__ w,
const uint64_t* w2 = (uint64_t*)w; const uint64_t* w2 = (uint64_t*)w;
uint64_t* w_new2 = (uint64_t*)w_new; uint64_t* w_new2 = (uint64_t*)w_new;
int w2_stride = w_width >> 1; int w2_stride = w_width >> 1;
int w2_column = THREADS_X * blockIdx.x + threadIdx.x; auto w2_column = THREADS_X * blockIdx.x + threadIdx.x;
if (w2_column >= w2_stride) return; if (w2_column >= w2_stride) return;
int w_new2_row = blockIdx.y; auto w_new2_row = blockIdx.y;
int q_perm_idx = w_new2_row << 2; int q_perm_idx = w_new2_row << 2;
uint64_t dst = 0; uint64_t dst = 0;
......
...@@ -55,11 +55,11 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { ...@@ -55,11 +55,11 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
this_block_B_base_ptr = params.B_ptr + blockIdx.y * Ntile * params.K + this_block_B_base_ptr = params.B_ptr + blockIdx.y * Ntile * params.K +
blockIdx.z * params.SplitK * 4; blockIdx.z * params.SplitK * 4;
const int lane_id = threadIdx.x % WARP_SIZE; const auto lane_id = threadIdx.x % WARP_SIZE;
// For matrix A, a block load/store Mtile(row) x 32(col) elements in // For matrix A, a block load/store Mtile(row) x 32(col) elements in
// multiple iters, 8x4 warp load/store 8(row) x 32(col) elements per iter // multiple iters, 8x4 warp load/store 8(row) x 32(col) elements per iter
const int Aldg_row_base_idx = threadIdx.x / 4; const auto Aldg_row_base_idx = threadIdx.x / 4;
Aldg_col_idx = (threadIdx.x % 4) * LDG_ELEMENT_CNT_A; Aldg_col_idx = (threadIdx.x % 4) * LDG_ELEMENT_CNT_A;
const int Aldg_base_offset = Aldg_row_base_idx * params.K + Aldg_col_idx; const int Aldg_base_offset = Aldg_row_base_idx * params.K + Aldg_col_idx;
...@@ -67,7 +67,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { ...@@ -67,7 +67,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
// elements of N32K16 packing in multiple iters, 4x8 warp load/store 4(row) // elements of N32K16 packing in multiple iters, 4x8 warp load/store 4(row)
// * 128(col) per iter // * 128(col) per iter
Bldg_col_idx = (threadIdx.x % 8) * LDG_ELEMENT_CNT_B; Bldg_col_idx = (threadIdx.x % 8) * LDG_ELEMENT_CNT_B;
const int Bldg_row_base_idx = threadIdx.x / 8; const auto Bldg_row_base_idx = threadIdx.x / 8;
const int Bldg_base_offset = const int Bldg_base_offset =
Bldg_row_base_idx * params.K * 4 + Bldg_col_idx; Bldg_row_base_idx * params.K * 4 + Bldg_col_idx;
...@@ -89,7 +89,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { ...@@ -89,7 +89,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
B_ldg_guard = 0; B_ldg_guard = 0;
#pragma unroll #pragma unroll
for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; ++i) { for (int i = 0; i < (Mtile + M_SIZE_ONE_LOAD - 1) / M_SIZE_ONE_LOAD; ++i) {
int m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD; auto m_idx = blockIdx.x * Mtile + Aldg_row_base_idx + i * M_SIZE_ONE_LOAD;
if (m_idx < params.M) { if (m_idx < params.M) {
A_ldg_guard |= (1u << i); A_ldg_guard |= (1u << i);
} }
...@@ -98,7 +98,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { ...@@ -98,7 +98,7 @@ struct GmemTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
const int N_padded = (params.N + 31) / 32 * 32; const int N_padded = (params.N + 31) / 32 * 32;
#pragma unroll #pragma unroll
for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; ++i) { for (int i = 0; i < (Ntile + N_SIZE_ONE_LOAD - 1) / N_SIZE_ONE_LOAD; ++i) {
int n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 + auto n_idx = blockIdx.y * Ntile + (Bldg_row_base_idx / 8) * 32 +
i * N_SIZE_ONE_LOAD; i * N_SIZE_ONE_LOAD;
if (n_idx < N_padded) { if (n_idx < N_padded) {
B_ldg_guard |= (1u << i); B_ldg_guard |= (1u << i);
...@@ -355,7 +355,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { ...@@ -355,7 +355,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
__device__ void fused_splitk_reduce() { __device__ void fused_splitk_reduce() {
// need splitk-reduce if enable splitk // need splitk-reduce if enable splitk
if (gridDim.z > 1) { if (gridDim.z > 1) {
int blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y; auto blk_red_idx = blockIdx.x * gridDim.y + blockIdx.y;
// Wait for all previous blocks in the splitk direction to accumulate the // Wait for all previous blocks in the splitk direction to accumulate the
// results into C_tmp // results into C_tmp
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
...@@ -371,7 +371,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { ...@@ -371,7 +371,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
} }
__syncthreads(); __syncthreads();
int C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4; auto C_tmp_base_offset = blk_red_idx * Mtile * Ntile + threadIdx.x * 4;
if (blockIdx.z != 0) { if (blockIdx.z != 0) {
// expecting that temporary register here reuses the previous A&B frag // expecting that temporary register here reuses the previous A&B frag
// register // register
...@@ -456,7 +456,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK { ...@@ -456,7 +456,7 @@ struct ComputeTile_W8A16_PerC_MtilexNtilex32_multistage_SM8x_SplitK {
FType* C_base_ptr = this_block_C_base_ptr + store_c_base_offset; FType* C_base_ptr = this_block_C_base_ptr + store_c_base_offset;
// C_tile lds and stg // C_tile lds and stg
int m_base_idx = store_c_row_base_idx + blockIdx.x * Mtile; auto m_base_idx = store_c_row_base_idx + blockIdx.x * Mtile;
bool n_guard = (store_c_col_idx + blockIdx.y * Ntile) < params.N; bool n_guard = (store_c_col_idx + blockIdx.y * Ntile) < params.N;
if (WARP_NTILE == 32) { if (WARP_NTILE == 32) {
int lds_c_base_offset = warp_id * Mtile * WARP_NTILE + int lds_c_base_offset = warp_id * Mtile * WARP_NTILE +
...@@ -580,7 +580,7 @@ __global__ void __launch_bounds__(BLOCK) ...@@ -580,7 +580,7 @@ __global__ void __launch_bounds__(BLOCK)
int sts_stage_idx = 0; int sts_stage_idx = 0;
int lds_stage_idx = 0; int lds_stage_idx = 0;
int tb_k_slice = blockIdx.z * params.SplitK + params.SplitK <= params.K auto tb_k_slice = blockIdx.z * params.SplitK + params.SplitK <= params.K
? params.SplitK ? params.SplitK
: params.K - blockIdx.z * params.SplitK; : params.K - blockIdx.z * params.SplitK;
int k_tiles = (tb_k_slice + 31) / 32; int k_tiles = (tb_k_slice + 31) / 32;
...@@ -777,13 +777,13 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel( ...@@ -777,13 +777,13 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
const QT* qdata, const FT* scales, const FT* zeros, FT* fdata, const QT* qdata, const FT* scales, const FT* zeros, FT* fdata,
const int N_32align, const int N, const int K) { const int N_32align, const int N, const int K) {
__shared__ FT smem[64 * 32]; __shared__ FT smem[64 * 32];
int warp_id = threadIdx.x / 32; auto warp_id = threadIdx.x / 32;
int lane_id = threadIdx.x % 32; auto lane_id = threadIdx.x % 32;
const int src_row_idx = blockIdx.x * 8 + lane_id / 4; const auto src_row_idx = blockIdx.x * 8 + lane_id / 4;
const int src_col_idx = const int src_col_idx =
blockIdx.y * 64 * 4 + warp_id * 16 * 4 + (lane_id % 4) * 16; blockIdx.y * 64 * 4 + warp_id * 16 * 4 + (lane_id % 4) * 16;
const int src_offset = src_row_idx * K * 4 + src_col_idx; const int src_offset = src_row_idx * K * 4 + src_col_idx;
int params_nidx = blockIdx.x * 32 + (lane_id / 4) * 4; auto params_nidx = blockIdx.x * 32 + (lane_id / 4) * 4;
QT qval_reg[16]; QT qval_reg[16];
const QT* pdata = qdata + src_offset; const QT* pdata = qdata + src_offset;
...@@ -829,8 +829,8 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel( ...@@ -829,8 +829,8 @@ __global__ void restore_N32_K16_dequantize_rhs_w8a16_perc_kernel(
*reinterpret_cast<uint4*>(smem + lds_base_offset + i * 32 * 32); *reinterpret_cast<uint4*>(smem + lds_base_offset + i * 32 * 32);
} }
const int dst_row_base_kidx = blockIdx.y * 64 + threadIdx.x / 4; const auto dst_row_base_kidx = blockIdx.y * 64 + threadIdx.x / 4;
const int dst_col_nidx = blockIdx.x * 32 + (threadIdx.x % 4) * 8; const auto dst_col_nidx = blockIdx.x * 32 + (threadIdx.x % 4) * 8;
#pragma unroll #pragma unroll
for (int i = 0; i < 2; ++i) { for (int i = 0; i < 2; ++i) {
int dst_row_kidx = dst_row_base_kidx + i * 32; int dst_row_kidx = dst_row_base_kidx + i * 32;
......
...@@ -13,8 +13,8 @@ __global__ void __launch_bounds__(128) ...@@ -13,8 +13,8 @@ __global__ void __launch_bounds__(128)
const uint8_t* B, const FType* B_scale, const FType* B_zero, const uint8_t* B, const FType* B_scale, const FType* B_zero,
uint8_t* B_result, FType* B_scale_result, FType* B_zero_result, uint8_t* B_result, FType* B_scale_result, FType* B_zero_result,
const int K, const int N, const int N_32align) { const int K, const int N, const int N_32align) {
const int lane_id = threadIdx.x % 32; const auto lane_id = threadIdx.x % 32;
const int warp_id = threadIdx.x / 32; const auto warp_id = threadIdx.x / 32;
if (blockIdx.x != gridDim.x - 1) { if (blockIdx.x != gridDim.x - 1) {
// Load B // Load B
...@@ -50,7 +50,7 @@ __global__ void __launch_bounds__(128) ...@@ -50,7 +50,7 @@ __global__ void __launch_bounds__(128)
} }
// Store B // Store B
const int dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8; const auto dst_row_base_idx = blockIdx.y * (128 / 4) + (lane_id / 8) * 8;
const int dst_col_idx = const int dst_col_idx =
blockIdx.x * (64 * 4) + warp_id * 64 + (lane_id % 8) * 8; blockIdx.x * (64 * 4) + warp_id * 64 + (lane_id % 8) * 8;
for (int i = 0; i < 8; ++i) { for (int i = 0; i < 8; ++i) {
...@@ -65,7 +65,7 @@ __global__ void __launch_bounds__(128) ...@@ -65,7 +65,7 @@ __global__ void __launch_bounds__(128)
} else { } else {
// Load B_scale and B_zero // Load B_scale and B_zero
FType b_scale_reg, b_zero_reg; FType b_scale_reg, b_zero_reg;
int src_offset = blockIdx.y * 128 + threadIdx.x; auto src_offset = blockIdx.y * 128 + threadIdx.x;
ldg16_cg_0(b_scale_reg, B_scale + src_offset, src_offset < N); ldg16_cg_0(b_scale_reg, B_scale + src_offset, src_offset < N);
if (B_zero != nullptr) if (B_zero != nullptr)
ldg16_cg_0(b_zero_reg, B_zero + src_offset, src_offset < N); ldg16_cg_0(b_zero_reg, B_zero + src_offset, src_offset < N);
......
...@@ -62,7 +62,7 @@ template <typename FType, int BLOCK, int N_MATRIX> ...@@ -62,7 +62,7 @@ template <typename FType, int BLOCK, int N_MATRIX>
__global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C, __global__ void f16_gemm_splitk_reduce_kernel(const FType* C_split, FType* C,
uint32_t n, uint32_t n_matrix, uint32_t n, uint32_t n_matrix,
uint32_t matrix_size) { uint32_t matrix_size) {
int idx = blockIdx.x * BLOCK + threadIdx.x; auto idx = blockIdx.x * BLOCK + threadIdx.x;
if (idx >= matrix_size) { if (idx >= matrix_size) {
return; return;
......
...@@ -124,3 +124,52 @@ nsys stats report1.nsys-rep ...@@ -124,3 +124,52 @@ nsys stats report1.nsys-rep
GUI example: GUI example:
<img width="1799" alt="Screenshot 2025-03-05 at 11 48 42 AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" /> <img width="1799" alt="Screenshot 2025-03-05 at 11 48 42 AM" src="https://github.com/user-attachments/assets/c7cff1ae-6d6f-477d-a342-bd13c4fc424c" />
## Profiling vLLM Python Code
The Python standard library includes
[cProfile](https://docs.python.org/3/library/profile.html) for profiling Python
code. vLLM includes a couple of helpers that make it easy to apply it to a section of vLLM.
Both the `vllm.utils.cprofile` and `vllm.utils.cprofile_context` functions can be
used to profile a section of code.
### Example usage - decorator
The first helper is a Python decorator that can be used to profile a function.
If a filename is specified, the profile will be saved to that file. If no filename is
specified, profile data will be printed to stdout.
```python
import vllm.utils
@vllm.utils.cprofile("expensive_function.prof")
def expensive_function():
# some expensive code
pass
```
### Example Usage - context manager
The second helper is a context manager that can be used to profile a block of
code. Similar to the decorator, the filename is optional.
```python
import vllm.utils
def another_function():
# more expensive code
pass
with vllm.utils.cprofile_context("another_function.prof"):
another_function()
```
### Analyzing Profile Results
There are multiple tools available that can help analyze the profile results.
One example is [snakeviz](https://jiffyclub.github.io/snakeviz/).
```bash
pip install snakeviz
snakeviz expensive_function.prof
```
...@@ -7,5 +7,192 @@ A major use case is for multi-host/multi-node distributed inference. ...@@ -7,5 +7,192 @@ A major use case is for multi-host/multi-node distributed inference.
vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving. vLLM can be deployed with [LWS](https://github.com/kubernetes-sigs/lws) on Kubernetes for distributed model serving.
Please see [this guide](https://github.com/kubernetes-sigs/lws/tree/main/docs/examples/vllm) for more details on ## Prerequisites
deploying vLLM on Kubernetes using LWS.
* At least two Kubernetes nodes, each with 8 GPUs, are required.
* Install LWS by following the instructions found [here](https://lws.sigs.k8s.io/docs/installation/).
## Deploy and Serve
Deploy the following yaml file `lws.yaml`
```yaml
apiVersion: leaderworkerset.x-k8s.io/v1
kind: LeaderWorkerSet
metadata:
name: vllm
spec:
replicas: 2
leaderWorkerTemplate:
size: 2
restartPolicy: RecreateGroupOnPodRestart
leaderTemplate:
metadata:
labels:
role: leader
spec:
containers:
- name: vllm-leader
image: docker.io/vllm/vllm-openai:latest
env:
- name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token>
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh leader --ray_cluster_size=$(LWS_GROUP_SIZE);
python3 -m vllm.entrypoints.openai.api_server --port 8080 --model meta-llama/Meta-Llama-3.1-405B-Instruct --tensor-parallel-size 8 --pipeline_parallel_size 2"
resources:
limits:
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
ephemeral-storage: 800Gi
cpu: 125
ports:
- containerPort: 8080
readinessProbe:
tcpSocket:
port: 8080
initialDelaySeconds: 15
periodSeconds: 10
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
workerTemplate:
spec:
containers:
- name: vllm-worker
image: docker.io/vllm/vllm-openai:latest
command:
- sh
- -c
- "bash /vllm-workspace/examples/online_serving/multi-node-serving.sh worker --ray_address=$(LWS_LEADER_ADDRESS)"
resources:
limits:
nvidia.com/gpu: "8"
memory: 1124Gi
ephemeral-storage: 800Gi
requests:
ephemeral-storage: 800Gi
cpu: 125
env:
- name: HUGGING_FACE_HUB_TOKEN
value: <your-hf-token>
volumeMounts:
- mountPath: /dev/shm
name: dshm
volumes:
- name: dshm
emptyDir:
medium: Memory
sizeLimit: 15Gi
---
apiVersion: v1
kind: Service
metadata:
name: vllm-leader
spec:
ports:
- name: http
port: 8080
protocol: TCP
targetPort: 8080
selector:
leaderworkerset.sigs.k8s.io/name: vllm
role: leader
type: ClusterIP
```
```bash
kubectl apply -f lws.yaml
```
Verify the status of the pods:
```bash
kubectl get pods
```
Should get an output similar to this:
```bash
NAME READY STATUS RESTARTS AGE
vllm-0 1/1 Running 0 2s
vllm-0-1 1/1 Running 0 2s
vllm-1 1/1 Running 0 2s
vllm-1-1 1/1 Running 0 2s
```
Verify that the distributed tensor-parallel inference works:
```bash
kubectl logs vllm-0 |grep -i "Loading model weights took"
```
Should get something similar to this:
```text
INFO 05-08 03:20:24 model_runner.py:173] Loading model weights took 0.1189 GB
(RayWorkerWrapper pid=169, ip=10.20.0.197) INFO 05-08 03:20:28 model_runner.py:173] Loading model weights took 0.1189 GB
```
## Access ClusterIP service
```bash
# Listen on port 8080 locally, forwarding to the targetPort of the service's port 8080 in a pod selected by the service
kubectl port-forward svc/vllm-leader 8080:8080
```
The output should be similar to the following:
```text
Forwarding from 127.0.0.1:8080 -> 8080
Forwarding from [::1]:8080 -> 8080
```
## Serve the model
Open another terminal and send a request
```text
curl http://localhost:8080/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"prompt": "San Francisco is a",
"max_tokens": 7,
"temperature": 0
}'
```
The output should be similar to the following
```text
{
"id": "cmpl-1bb34faba88b43f9862cfbfb2200949d",
"object": "text_completion",
"created": 1715138766,
"model": "meta-llama/Meta-Llama-3.1-405B-Instruct",
"choices": [
{
"index": 0,
"text": " top destination for foodies, with",
"logprobs": null,
"finish_reason": "length",
"stop_reason": null
}
],
"usage": {
"prompt_tokens": 5,
"total_tokens": 12,
"completion_tokens": 7
}
}
```
...@@ -25,7 +25,7 @@ import torch ...@@ -25,7 +25,7 @@ import torch
# unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint. # unsloth/tinyllama-bnb-4bit is a pre-quantized checkpoint.
model_id = "unsloth/tinyllama-bnb-4bit" model_id = "unsloth/tinyllama-bnb-4bit"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
quantization="bitsandbytes", load_format="bitsandbytes") quantization="bitsandbytes")
``` ```
## Inflight quantization: load as 4bit quantization ## Inflight quantization: load as 4bit quantization
...@@ -35,7 +35,7 @@ from vllm import LLM ...@@ -35,7 +35,7 @@ from vllm import LLM
import torch import torch
model_id = "huggyllama/llama-7b" model_id = "huggyllama/llama-7b"
llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \ llm = LLM(model=model_id, dtype=torch.bfloat16, trust_remote_code=True, \
quantization="bitsandbytes", load_format="bitsandbytes") quantization="bitsandbytes")
``` ```
## OpenAI Compatible Server ## OpenAI Compatible Server
...@@ -43,5 +43,5 @@ quantization="bitsandbytes", load_format="bitsandbytes") ...@@ -43,5 +43,5 @@ quantization="bitsandbytes", load_format="bitsandbytes")
Append the following to your 4bit model arguments: Append the following to your 4bit model arguments:
```console ```console
--quantization bitsandbytes --load-format bitsandbytes --quantization bitsandbytes
``` ```
...@@ -10,10 +10,10 @@ Reasoning models return a additional `reasoning_content` field in their outputs, ...@@ -10,10 +10,10 @@ Reasoning models return a additional `reasoning_content` field in their outputs,
vLLM currently supports the following reasoning models: vLLM currently supports the following reasoning models:
| Model Series | Parser Name | Structured Output Support | | Model Series | Parser Name | Structured Output Support | Tool Calling |
|--------------|-------------|------------------| |--------------|-------------|------------------|-------------|
| [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | | [DeepSeek R1 series](https://huggingface.co/collections/deepseek-ai/deepseek-r1-678e1e131c0169c0bc89728d) | `deepseek_r1` | `guided_json`, `guided_regex` | ❌ |
| [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | | [QwQ-32B](https://huggingface.co/Qwen/QwQ-32B) | `deepseek_r1` | `guided_json`, `guided_regex` | ✅ |
## Quickstart ## Quickstart
...@@ -170,10 +170,51 @@ print("reasoning_content: ", completion.choices[0].message.reasoning_content) ...@@ -170,10 +170,51 @@ print("reasoning_content: ", completion.choices[0].message.reasoning_content)
print("content: ", completion.choices[0].message.content) print("content: ", completion.choices[0].message.content)
``` ```
## Tool Calling
The reasoning content is also available when both tool calling and the reasoning parser are enabled. Additionally, tool calling only parses functions from the `content` field, not from the `reasoning_content`.
```python
from openai import OpenAI
client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
tools = [{
"type": "function",
"function": {
"name": "get_weather",
"description": "Get the current weather in a given location",
"parameters": {
"type": "object",
"properties": {
"location": {"type": "string", "description": "City and state, e.g., 'San Francisco, CA'"},
"unit": {"type": "string", "enum": ["celsius", "fahrenheit"]}
},
"required": ["location", "unit"]
}
}
}]
response = client.chat.completions.create(
model=client.models.list().data[0].id,
messages=[{"role": "user", "content": "What's the weather like in San Francisco?"}],
tools=tools,
tool_choice="auto"
)
print(response)
tool_call = response.choices[0].message.tool_calls[0].function
print(f"reasoning_content: {response.choices[0].message.reasoning_content}")
print(f"Function called: {tool_call.name}")
print(f"Arguments: {tool_call.arguments}")
```
For more examples, please refer to <gh-file:examples/online_serving/openai_chat_completion_tool_calls_with_reasoning.py> .
## Limitations ## Limitations
- The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`). - The reasoning content is only available for online serving's chat completion endpoint (`/v1/chat/completions`).
- It is not compatible with [`tool_calling`](#tool_calling).
## How to support a new reasoning model ## How to support a new reasoning model
......
...@@ -30,8 +30,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) ...@@ -30,8 +30,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM( llm = LLM(
model="facebook/opt-6.7b", model="facebook/opt-6.7b",
tensor_parallel_size=1, tensor_parallel_size=1,
speculative_model="facebook/opt-125m", speculative_config={
num_speculative_tokens=5, "model": "facebook/opt-125m",
"num_speculative_tokens": 5,
},
) )
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
...@@ -45,10 +47,14 @@ To perform the same with an online mode launch the server: ...@@ -45,10 +47,14 @@ To perform the same with an online mode launch the server:
```bash ```bash
python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \ python -m vllm.entrypoints.openai.api_server --host 0.0.0.0 --port 8000 --model facebook/opt-6.7b \
--seed 42 -tp 1 --speculative_model facebook/opt-125m \ --seed 42 -tp 1 --gpu_memory_utilization 0.8 \
--num_speculative_tokens 5 --gpu_memory_utilization 0.8 --speculative_config '{"model": "facebook/opt-125m", "num_speculative_tokens": 5}'
``` ```
:::{warning}
Note: Please use `--speculative_config` to set all configurations related to speculative decoding. The previous method of specifying the model through `--speculative_model` and adding related parameters (e.g., `--num_speculative_tokens`) separately will be deprecated in the next release.
:::
Then use a client: Then use a client:
```python ```python
...@@ -101,9 +107,11 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) ...@@ -101,9 +107,11 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM( llm = LLM(
model="facebook/opt-6.7b", model="facebook/opt-6.7b",
tensor_parallel_size=1, tensor_parallel_size=1,
speculative_model="[ngram]", speculative_config={
num_speculative_tokens=5, "method": "ngram",
ngram_prompt_lookup_max=4, "num_speculative_tokens": 5,
"prompt_lookup_max": 4,
},
) )
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
...@@ -131,8 +139,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) ...@@ -131,8 +139,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM( llm = LLM(
model="meta-llama/Meta-Llama-3.1-70B-Instruct", model="meta-llama/Meta-Llama-3.1-70B-Instruct",
tensor_parallel_size=4, tensor_parallel_size=4,
speculative_model="ibm-ai-platform/llama3-70b-accelerator", speculative_config={
speculative_draft_tensor_parallel_size=1, "model": "ibm-ai-platform/llama3-70b-accelerator",
"draft_tensor_parallel_size": 1,
},
) )
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
...@@ -175,8 +185,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95) ...@@ -175,8 +185,10 @@ sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM( llm = LLM(
model="meta-llama/Meta-Llama-3-8B-Instruct", model="meta-llama/Meta-Llama-3-8B-Instruct",
tensor_parallel_size=4, tensor_parallel_size=4,
speculative_model="yuhuili/EAGLE-LLaMA3-Instruct-8B", speculative_config={
speculative_draft_tensor_parallel_size=1, "model": "yuhuili/EAGLE-LLaMA3-Instruct-8B",
"draft_tensor_parallel_size": 1,
},
) )
outputs = llm.generate(prompts, sampling_params) outputs = llm.generate(prompts, sampling_params)
...@@ -194,11 +206,10 @@ A few important things to consider when using the EAGLE based draft models: ...@@ -194,11 +206,10 @@ A few important things to consider when using the EAGLE based draft models:
be able to be loaded and used directly by vLLM after [PR 12304](https://github.com/vllm-project/vllm/pull/12304). be able to be loaded and used directly by vLLM after [PR 12304](https://github.com/vllm-project/vllm/pull/12304).
If you are using vllm version before [PR 12304](https://github.com/vllm-project/vllm/pull/12304), please use the If you are using vllm version before [PR 12304](https://github.com/vllm-project/vllm/pull/12304), please use the
[script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model, [script](https://gist.github.com/abhigoyal1997/1e7a4109ccb7704fbc67f625e86b2d6d) to convert the speculative model,
and specify `speculative_model="path/to/modified/eagle/model"`. If weight-loading problems still occur when using and specify `"model": "path/to/modified/eagle/model"` in `speculative_config`. If weight-loading problems still occur when using the latest version of vLLM, please leave a comment or raise an issue.
the latest version of vLLM, please leave a comment or raise an issue.
2. The EAGLE based draft models need to be run without tensor parallelism 2. The EAGLE based draft models need to be run without tensor parallelism
(i.e. speculative_draft_tensor_parallel_size is set to 1), although (i.e. draft_tensor_parallel_size is set to 1 in `speculative_config`), although
it is possible to run the main model using tensor parallelism (see example above). it is possible to run the main model using tensor parallelism (see example above).
3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is 3. When using EAGLE-based speculators with vLLM, the observed speedup is lower than what is
......
...@@ -26,4 +26,3 @@ installation/ai_accelerator ...@@ -26,4 +26,3 @@ installation/ai_accelerator
- Google TPU - Google TPU
- Intel Gaudi - Intel Gaudi
- AWS Neuron - AWS Neuron
- OpenVINO
...@@ -36,16 +36,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you ...@@ -36,16 +36,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::: ::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ai_accelerator/openvino.inc.md
:start-after: "# Installation"
:end-before: "## Requirements"
:::
::::
::::: :::::
## Requirements ## Requirements
...@@ -83,16 +73,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you ...@@ -83,16 +73,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::: ::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ai_accelerator/openvino.inc.md
:start-after: "## Requirements"
:end-before: "## Set up using Python"
:::
::::
::::: :::::
## Configure a new environment ## Configure a new environment
...@@ -130,14 +110,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you ...@@ -130,14 +110,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::: ::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} python_env_setup.inc.md
:::
::::
::::: :::::
## Set up using Python ## Set up using Python
...@@ -177,16 +149,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you ...@@ -177,16 +149,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::: ::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ai_accelerator/openvino.inc.md
:start-after: "### Pre-built wheels"
:end-before: "### Build wheel from source"
:::
::::
::::: :::::
### Build wheel from source ### Build wheel from source
...@@ -224,16 +186,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you ...@@ -224,16 +186,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::: ::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ai_accelerator/openvino.inc.md
:start-after: "### Build wheel from source"
:end-before: "## Set up using Docker"
:::
::::
::::: :::::
## Set up using Docker ## Set up using Docker
...@@ -273,16 +225,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you ...@@ -273,16 +225,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::: ::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ai_accelerator/openvino.inc.md
:start-after: "### Pre-built images"
:end-before: "### Build image from source"
:::
::::
::::: :::::
### Build image from source ### Build image from source
...@@ -320,16 +262,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you ...@@ -320,16 +262,6 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::: ::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ai_accelerator/openvino.inc.md
:start-after: "### Build image from source"
:end-before: "## Extra information"
:::
::::
::::: :::::
## Extra information ## Extra information
...@@ -364,13 +296,4 @@ vLLM is a Python library that supports the following AI accelerators. Select you ...@@ -364,13 +296,4 @@ vLLM is a Python library that supports the following AI accelerators. Select you
:::: ::::
::::{tab-item} OpenVINO
:sync: openvino
:::{include} ai_accelerator/openvino.inc.md
:start-after: "## Extra information"
:::
::::
::::: :::::
# Installation
vLLM powered by OpenVINO supports all LLM models from [vLLM supported models list](#supported-models) and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs ([the list of supported GPUs](https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu)).
:::{attention}
There are no pre-built wheels or images for this device, so you must build vLLM from source.
:::
## Requirements
- OS: Linux
- Instruction set architecture (ISA) requirement: at least AVX2.
## Set up using Python
### Pre-built wheels
Currently, there are no pre-built OpenVINO wheels.
### Build wheel from source
First, install Python and ensure you have the latest pip. For example, on Ubuntu 22.04, you can run:
```console
sudo apt-get update -y
sudo apt-get install python3
pip install --upgrade pip
```
Second, clone vLLM and install prerequisites for the vLLM OpenVINO backend installation:
```console
git clone https://github.com/vllm-project/vllm.git
cd vllm
pip install -r requirements/build.txt --extra-index-url https://download.pytorch.org/whl/cpu
```
Finally, install vLLM with OpenVINO backend:
```console
PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
```
:::{tip}
To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: [https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html](https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html).
:::
## Set up using Docker
### Pre-built images
Currently, there are no pre-built OpenVINO images.
### Build image from source
```console
docker build -f Dockerfile.openvino -t vllm-openvino-env .
docker run -it --rm vllm-openvino-env
```
## Extra information
## Supported features
OpenVINO vLLM backend supports the following advanced vLLM features:
- Prefix caching (`--enable-prefix-caching`)
- Chunked prefill (`--enable-chunked-prefill`)
## Performance tips
### vLLM OpenVINO backend environment variables
- `VLLM_OPENVINO_DEVICE` to specify which device utilize for the inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g, `VLLM_OPENVINO_DEVICE=GPU.1`). If the value is not specified, CPU device is used by default.
- `VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON` to enable U8 weights compression during model loading stage. By default, compression is turned off. You can also export model with different compression techniques using `optimum-cli` and pass exported folder as `<model_id>`
### CPU performance tips
CPU uses the following environment variables to control behavior:
- `VLLM_OPENVINO_KVCACHE_SPACE` to specify the KV Cache size (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=40` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- `VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (`--enable-chunked-prefill`). Based on the experiments, the recommended batch size is `256` (`--max-num-batched-tokens`)
OpenVINO best known configuration for CPU is:
```console
$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
```
### GPU performance tips
GPU device implements the logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account `gpu_memory_utilization` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using `VLLM_OPENVINO_KVCACHE_SPACE` environment variable (e.g, `VLLM_OPENVINO_KVCACHE_SPACE=8` means 8 GB space for KV cache).
Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8 and 4-bit integer data types are supported) and `preemption-mode=swap`.
OpenVINO best known configuration for GPU is:
```console
$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
```
## Limitations
- LoRA serving is not supported.
- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration.
- Tensor and pipeline parallelism are not currently enabled in vLLM integration.
...@@ -58,6 +58,11 @@ from vllm import LLM, SamplingParams ...@@ -58,6 +58,11 @@ from vllm import LLM, SamplingParams
``` ```
The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params). The next section defines a list of input prompts and sampling parameters for text generation. The [sampling temperature](https://arxiv.org/html/2402.05201v1) is set to `0.8` and the [nucleus sampling probability](https://en.wikipedia.org/wiki/Top-p_sampling) is set to `0.95`. You can find more information about the sampling parameters [here](#sampling-params).
:::{important}
By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the Hugging Face model repository if it exists. In most cases, this will provide you with the best results by default if {class}`~vllm.SamplingParams` is not specified.
However, if vLLM's default sampling parameters are preferred, please set `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance.
:::
```python ```python
prompts = [ prompts = [
...@@ -76,7 +81,7 @@ llm = LLM(model="facebook/opt-125m") ...@@ -76,7 +81,7 @@ llm = LLM(model="facebook/opt-125m")
``` ```
:::{note} :::{note}
By default, vLLM downloads models from [HuggingFace](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine. By default, vLLM downloads models from [Hugging Face](https://huggingface.co/). If you would like to use models from [ModelScope](https://www.modelscope.cn), set the environment variable `VLLM_USE_MODELSCOPE` before initializing the engine.
::: :::
Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens. Now, the fun part! The outputs are generated using `llm.generate`. It adds the input prompts to the vLLM engine's waiting queue and executes the vLLM engine to generate the outputs with high throughput. The outputs are returned as a list of `RequestOutput` objects, which include all of the output tokens.
...@@ -107,6 +112,11 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct ...@@ -107,6 +112,11 @@ vllm serve Qwen/Qwen2.5-1.5B-Instruct
By default, the server uses a predefined chat template stored in the tokenizer. By default, the server uses a predefined chat template stored in the tokenizer.
You can learn about overriding it [here](#chat-template). You can learn about overriding it [here](#chat-template).
::: :::
:::{important}
By default, the server applies `generation_config.json` from the huggingface model repository if it exists. This means the default values of certain sampling parameters can be overridden by those recommended by the model creator.
To disable this behavior, please pass `--generation-config vllm` when launching the server.
:::
This server can be queried in the same format as OpenAI API. For example, to list the models: This server can be queried in the same format as OpenAI API. For example, to list the models:
......
...@@ -46,6 +46,11 @@ for output in outputs: ...@@ -46,6 +46,11 @@ for output in outputs:
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
``` ```
:::{important}
By default, vLLM will use sampling parameters recommended by model creator by applying the `generation_config.json` from the huggingface model repository if it exists. In most cases, this will provide you with the best results by default if {class}`~vllm.SamplingParams` is not specified.
However, if vLLM's default sampling parameters are preferred, please pass `generation_config="vllm"` when creating the {class}`~vllm.LLM` instance.
:::
A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py> A code example can be found here: <gh-file:examples/offline_inference/basic/basic.py>
### `LLM.beam_search` ### `LLM.beam_search`
......
...@@ -472,6 +472,11 @@ See [this page](#generative-models) for more information on how to use generativ ...@@ -472,6 +472,11 @@ See [this page](#generative-models) for more information on how to use generativ
* `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc. * `Tele-AI/TeleChat2-3B`, `Tele-AI/TeleChat2-7B`, `Tele-AI/TeleChat2-35B`, etc.
* ✅︎ * ✅︎
* ✅︎ * ✅︎
- * `TeleFLMForCausalLM`
* TeleFLM
* `CofeAI/FLM-2-52B-Instruct-2407`, `CofeAI/Tele-FLM`, etc.
* ✅︎
* ✅︎
- * `XverseForCausalLM` - * `XverseForCausalLM`
* XVERSE * XVERSE
* `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc. * `xverse/XVERSE-7B-Chat`, `xverse/XVERSE-13B-Chat`, `xverse/XVERSE-65B-Chat`, etc.
......
...@@ -83,7 +83,7 @@ Since this is a ray cluster of **containers**, all the following commands should ...@@ -83,7 +83,7 @@ Since this is a ray cluster of **containers**, all the following commands should
Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` and `ray list nodes` to check the status of the Ray cluster. You should see the right number of nodes and GPUs. Then, on any node, use `docker exec -it node /bin/bash` to enter the container, execute `ray status` and `ray list nodes` to check the status of the Ray cluster. You should see the right number of nodes and GPUs.
After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2: After that, on any node, use `docker exec -it node /bin/bash` to enter the container again. **In the container**, you can use vLLM as usual, just as you have all the GPUs on one node: vLLM will be able to leverage GPU resources of all nodes in the Ray cluster, and therefore, only run the `vllm` command on this node but not other nodes. The common practice is to set the tensor parallel size to the number of GPUs in each node, and the pipeline parallel size to the number of nodes. For example, if you have 16 GPUs in 2 nodes (8 GPUs per node), you can set the tensor parallel size to 8 and the pipeline parallel size to 2:
```console ```console
vllm serve /path/to/the/model/in/the/container \ vllm serve /path/to/the/model/in/the/container \
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment