Commit 1591c68f authored by zhuwenwen's avatar zhuwenwen
Browse files

merge v0.4.2

parents 09bcf00b c7f2cf2b
#pragma once
#include <torch/extension.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <cuda.h>
#include <cuda_fp16.h>
#include <cuda_runtime.h>
#include <iostream>
namespace gptq_marlin {
// 8 warps are a good choice since every SM has 4 schedulers and having more than 1 warp per
// schedule allows some more latency hiding. At the same time, we want relatively few warps to have
// many registers per warp and small tiles.
static constexpr int default_threads = 256;
static constexpr int pipe_stages = 4; // 4 pipeline stages fit into shared memory
static constexpr int min_thread_n = 64;
static constexpr int min_thread_k = 64;
static constexpr int tile_size = 16;
static constexpr int max_par = 16;
template <typename T, int n>
struct Vec {
T elems[n];
__device__ T& operator[](int i) { return elems[i]; }
};
using I4 = Vec<int, 4>;
constexpr int div_ceil(int a, int b) { return (a + b - 1) / b; }
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
// No support for async
#else
__device__ inline void cp_async4_pred(void* smem_ptr, const void* glob_ptr, bool pred = true) {
const int BYTES = 16;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile("{\n"
" .reg .pred p;\n"
" setp.ne.b32 p, %0, 0;\n"
" @p cp.async.cg.shared.global [%1], [%2], %3;\n"
"}\n" ::"r"((int)pred),
"r"(smem), "l"(glob_ptr), "n"(BYTES));
}
__device__ inline void cp_async4(void* smem_ptr, const void* glob_ptr) {
const int BYTES = 16;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile("{\n"
" cp.async.cg.shared.global [%0], [%1], %2;\n"
"}\n" ::"r"(smem),
"l"(glob_ptr), "n"(BYTES));
}
__device__ inline void cp_async_fence() { asm volatile("cp.async.commit_group;\n" ::); }
template <int n>
__device__ inline void cp_async_wait() {
asm volatile("cp.async.wait_group %0;\n" ::"n"(n));
}
#endif
} // namespace gptq_marlin
#include "gptq_marlin.cuh"
namespace gptq_marlin {
static constexpr int repack_stages = 8;
static constexpr int repack_threads = 256;
static constexpr int tile_k_size = tile_size;
static constexpr int tile_n_size = tile_k_size * 4;
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
template <int const num_threads, int const num_bits, bool const has_perm>
__global__ void
marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
uint32_t const *__restrict__ perm_ptr,
uint32_t *__restrict__ out_ptr, int size_k, int size_n) {}
} // namespace gptq_marlin
torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
int64_t size_k, int64_t size_n,
int64_t num_bits) {
TORCH_CHECK_NOT_IMPLEMENTED(
false, "marlin_repack_from_gptq(..) requires CUDA_ARCH >= 8.0");
return torch::empty({1, 1});
}
#else
template <int const num_threads, int const num_bits, bool const has_perm>
__global__ void
marlin_repack_kernel(uint32_t const *__restrict__ b_q_weight_ptr,
uint32_t const *__restrict__ perm_ptr,
uint32_t *__restrict__ out_ptr, int size_k, int size_n) {
constexpr int pack_factor = 32 / num_bits;
int k_tiles = size_k / tile_k_size;
int n_tiles = size_n / tile_n_size;
int block_k_tiles = div_ceil(k_tiles, gridDim.x);
int start_k_tile = blockIdx.x * block_k_tiles;
if (start_k_tile >= k_tiles) {
return;
}
int finish_k_tile = min(start_k_tile + block_k_tiles, k_tiles);
// Wait until the next thread tile has been loaded to shared memory.
auto wait_for_stage = [&]() {
// We only have `stages - 2` active fetches since we are double buffering
// and can only issue the next fetch when it is guaranteed that the previous
// shared memory load is fully complete (as it may otherwise be
// overwritten).
cp_async_wait<repack_stages - 2>();
__syncthreads();
};
extern __shared__ int4 sh[];
constexpr int perm_size = tile_k_size / 4;
int4 *sh_perm_ptr = sh;
int4 *sh_pipe_ptr = sh_perm_ptr;
if constexpr (has_perm) {
sh_pipe_ptr += perm_size;
}
constexpr int tile_ints = tile_k_size / pack_factor;
constexpr int stage_n_threads = tile_n_size / 4;
constexpr int stage_k_threads = has_perm ? tile_k_size : tile_ints;
constexpr int stage_size = stage_k_threads * stage_n_threads;
auto load_perm_to_shared = [&](int k_tile_id) {
int first_k_int4 = (k_tile_id * tile_k_size) / 4;
int4 const *perm_int4_ptr = reinterpret_cast<int4 const *>(perm_ptr);
if (threadIdx.x < perm_size) {
sh_perm_ptr[threadIdx.x] = perm_int4_ptr[first_k_int4 + threadIdx.x];
}
__syncthreads();
};
auto fetch_to_shared = [&](int pipe, int k_tile_id, int n_tile_id) {
if (n_tile_id >= n_tiles) {
cp_async_fence();
return;
}
int first_n = n_tile_id * tile_n_size;
int4 *sh_ptr = sh_pipe_ptr + stage_size * pipe;
if constexpr (has_perm) {
if (threadIdx.x < stage_size) {
int k_id = threadIdx.x / stage_n_threads;
int n_id = threadIdx.x % stage_n_threads;
uint32_t const *sh_perm_int_ptr =
reinterpret_cast<uint32_t const *>(sh_perm_ptr);
int src_k = sh_perm_int_ptr[k_id];
int src_k_packed = src_k / pack_factor;
cp_async4(
&sh_ptr[k_id * stage_n_threads + n_id],
reinterpret_cast<int4 const *>(&(
b_q_weight_ptr[src_k_packed * size_n + first_n + (n_id * 4)])));
}
} else {
if (threadIdx.x < stage_size) {
int k_id = threadIdx.x / stage_n_threads;
int n_id = threadIdx.x % stage_n_threads;
int first_k = k_tile_id * tile_k_size;
int first_k_packed = first_k / pack_factor;
cp_async4(&sh_ptr[k_id * stage_n_threads + n_id],
reinterpret_cast<int4 const *>(
&(b_q_weight_ptr[(first_k_packed + k_id) * size_n +
first_n + (n_id * 4)])));
}
}
cp_async_fence();
};
auto repack_tile = [&](int pipe, int k_tile_id, int n_tile_id) {
if (n_tile_id >= n_tiles) {
return;
}
int warp_id = threadIdx.x / 32;
int th_id = threadIdx.x % 32;
if (warp_id >= 4) {
return;
}
int tc_col = th_id / 4;
int tc_row = (th_id % 4) * 2;
constexpr int tc_offsets[4] = {0, 1, 8, 9};
int cur_n = warp_id * 16 + tc_col;
constexpr int sh_stride = 64;
constexpr uint32_t mask = (1 << num_bits) - 1;
int4 *sh_stage_ptr = sh_pipe_ptr + stage_size * pipe;
uint32_t *sh_stage_int_ptr = reinterpret_cast<uint32_t *>(sh_stage_ptr);
uint32_t *sh_perm_int_ptr = reinterpret_cast<uint32_t *>(sh_perm_ptr);
uint32_t vals[8];
if constexpr (has_perm) {
for (int i = 0; i < 4; i++) {
int k_idx = tc_row + tc_offsets[i];
uint32_t src_k = sh_perm_int_ptr[k_idx];
uint32_t src_k_pos = src_k % pack_factor;
uint32_t b1_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n];
uint32_t b1_cur_val = (b1_val >> (src_k_pos * num_bits)) & mask;
uint32_t b2_val = sh_stage_int_ptr[k_idx * sh_stride + cur_n + 8];
uint32_t b2_cur_val = (b2_val >> (src_k_pos * num_bits)) & mask;
vals[i] = b1_cur_val;
vals[4 + i] = b2_cur_val;
}
} else {
uint32_t b1_vals[tile_ints];
uint32_t b2_vals[tile_ints];
#pragma unroll
for (int i = 0; i < tile_ints; i++) {
b1_vals[i] = sh_stage_int_ptr[cur_n + sh_stride * i];
b2_vals[i] = sh_stage_int_ptr[cur_n + 8 + sh_stride * i];
}
#pragma unroll
for (int i = 0; i < 4; i++) {
int cur_elem = tc_row + tc_offsets[i];
int cur_int = cur_elem / pack_factor;
int cur_pos = cur_elem % pack_factor;
vals[i] = (b1_vals[cur_int] >> (cur_pos * num_bits)) & mask;
vals[4 + i] = (b2_vals[cur_int] >> (cur_pos * num_bits)) & mask;
}
}
constexpr int tile_size = tile_k_size * tile_n_size / pack_factor;
int out_offset = (k_tile_id * n_tiles + n_tile_id) * tile_size;
// Result of:
// https://github.com/NVIDIA/FasterTransformer/blob/main/src/fastertransformer/cutlass_extensions/include/cutlass_extensions/interleaved_numeric_conversion.h
if constexpr (num_bits == 4) {
constexpr int pack_idx[8] = {0, 2, 4, 6, 1, 3, 5, 7};
uint32_t res = 0;
#pragma unroll
for (int i = 0; i < 8; i++) {
res |= vals[pack_idx[i]] << (i * 4);
}
out_ptr[out_offset + th_id * 4 + warp_id] = res;
} else {
constexpr int pack_idx[4] = {0, 2, 1, 3};
uint32_t res1 = 0;
uint32_t res2 = 0;
#pragma unroll
for (int i = 0; i < 4; i++) {
res1 |= vals[pack_idx[i]] << (i * 8);
res2 |= vals[4 + pack_idx[i]] << (i * 8);
}
out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 0] = res1;
out_ptr[out_offset + th_id * 8 + (warp_id * 2) + 1] = res2;
}
};
auto start_pipes = [&](int k_tile_id, int n_tile_id) {
#pragma unroll
for (int pipe = 0; pipe < repack_stages - 1; pipe++) {
fetch_to_shared(pipe, k_tile_id, n_tile_id + pipe);
}
wait_for_stage();
};
#pragma unroll
for (int k_tile_id = start_k_tile; k_tile_id < finish_k_tile; k_tile_id++) {
int n_tile_id = 0;
if constexpr (has_perm) {
load_perm_to_shared(k_tile_id);
}
start_pipes(k_tile_id, n_tile_id);
while (n_tile_id < n_tiles) {
#pragma unroll
for (int pipe = 0; pipe < repack_stages; pipe++) {
fetch_to_shared((pipe + repack_stages - 1) % repack_stages, k_tile_id,
n_tile_id + pipe + repack_stages - 1);
repack_tile(pipe, k_tile_id, n_tile_id + pipe);
wait_for_stage();
}
n_tile_id += repack_stages;
}
}
}
} // namespace gptq_marlin
#define CALL_IF(NUM_BITS, HAS_PERM) \
else if (num_bits == NUM_BITS && has_perm == HAS_PERM) { \
cudaFuncSetAttribute( \
gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, \
NUM_BITS, HAS_PERM>, \
cudaFuncAttributeMaxDynamicSharedMemorySize, max_shared_mem); \
gptq_marlin::marlin_repack_kernel<gptq_marlin::repack_threads, NUM_BITS, \
HAS_PERM> \
<<<blocks, gptq_marlin::repack_threads, max_shared_mem, stream>>>( \
b_q_weight_ptr, perm_ptr, out_ptr, size_k, size_n); \
}
torch::Tensor gptq_marlin_repack(torch::Tensor &b_q_weight, torch::Tensor &perm,
int64_t size_k, int64_t size_n,
int64_t num_bits) {
// Verify compatibility with marlin tile of 16x64
TORCH_CHECK(size_k % gptq_marlin::tile_k_size == 0, "size_k = ", size_k,
" is not divisible by tile_k_size = ", gptq_marlin::tile_k_size);
TORCH_CHECK(size_n % gptq_marlin::tile_n_size == 0, "size_n = ", size_n,
" is not divisible by tile_n_size = ", gptq_marlin::tile_n_size);
TORCH_CHECK(num_bits == 4 || num_bits == 8,
"num_bits must be 4 or 8. Got = ", num_bits);
int const pack_factor = 32 / num_bits;
// Verify B
TORCH_CHECK((size_k / pack_factor) == b_q_weight.size(0),
"Shape mismatch: b_q_weight.size(0) = ", b_q_weight.size(0),
", size_k = ", size_k, ", pack_factor = ", pack_factor);
TORCH_CHECK(b_q_weight.size(1) == size_n,
"b_q_weight.size(1) = ", b_q_weight.size(1),
" is not size_n = ", size_n);
// Verify device and strides
TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
TORCH_CHECK(b_q_weight.is_contiguous(), "b_q_weight is not contiguous");
TORCH_CHECK(b_q_weight.dtype() == at::kInt, "b_q_weight type is not kInt");
TORCH_CHECK(perm.device().is_cuda(), "perm is not on GPU");
TORCH_CHECK(perm.is_contiguous(), "perm is not contiguous");
TORCH_CHECK(perm.dtype() == at::kInt, "perm type is not at::kInt");
// Alloc buffers
const at::cuda::OptionalCUDAGuard device_guard(device_of(b_q_weight));
auto options = torch::TensorOptions()
.dtype(b_q_weight.dtype())
.device(b_q_weight.device());
torch::Tensor out =
torch::empty({size_k / gptq_marlin::tile_size,
size_n * gptq_marlin::tile_size / pack_factor},
options);
// Detect if there is act_order
bool has_perm = perm.size(0) != 0;
// Get ptrs
uint32_t const *b_q_weight_ptr =
reinterpret_cast<uint32_t const *>(b_q_weight.data_ptr());
uint32_t const *perm_ptr =
reinterpret_cast<uint32_t const *>(perm.data_ptr());
uint32_t *out_ptr = reinterpret_cast<uint32_t *>(out.data_ptr());
// Get dev info
int dev = b_q_weight.get_device();
cudaStream_t stream = at::cuda::getCurrentCUDAStream(dev);
int blocks;
cudaDeviceGetAttribute(&blocks, cudaDevAttrMultiProcessorCount, dev);
int max_shared_mem = 0;
cudaDeviceGetAttribute(&max_shared_mem,
cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
TORCH_CHECK(max_shared_mem > 0);
if (false) {
}
CALL_IF(4, false)
CALL_IF(4, true)
CALL_IF(8, false)
CALL_IF(8, true)
else {
TORCH_CHECK(false, "Unsupported repack config: num_bits = ", num_bits,
", has_perm = ", has_perm);
}
return out;
}
#endif
...@@ -67,20 +67,13 @@ __device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr, ...@@ -67,20 +67,13 @@ __device__ inline void cp_async4_pred(void *smem_ptr, const void *glob_ptr,
"r"(smem), "l"(glob_ptr), "n"(BYTES)); "r"(smem), "l"(glob_ptr), "n"(BYTES));
} }
// Asynchronous global->shared copy with a cache hint indicating that the values // Asynchronous global->shared copy
// may be evicted immediately; used for quantized weights B, which are only __device__ inline void cp_async4(void *smem_ptr, const void *glob_ptr) {
// accessed precisely once and should thus not pollute the L2 cache which we
// need for inputs A and outputs C.
__device__ inline void cp_async4_stream(void *smem_ptr, const void *glob_ptr) {
const int BYTES = 16; const int BYTES = 16;
uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr)); uint32_t smem = static_cast<uint32_t>(__cvta_generic_to_shared(smem_ptr));
asm volatile( asm volatile("{\n"
"{\n" " cp.async.cg.shared.global [%0], [%1], %2;\n"
" .reg .b64 p;\n" "}\n" :: "r"(smem), "l"(glob_ptr), "n"(BYTES));
" createpolicy.fractional.L2::evict_first.b64 p, 1.0;"
" cp.async.cg.shared.global.L2::cache_hint [%0], [%1], %2, p;\n"
"}\n" ::"r"(smem),
"l"(glob_ptr), "n"(BYTES));
} }
// Async copy fence. // Async copy fence.
...@@ -448,14 +441,14 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk ...@@ -448,14 +441,14 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
int4 *sh_b_stage = sh_b + b_sh_stage * pipe; int4 *sh_b_stage = sh_b + b_sh_stage * pipe;
#pragma unroll #pragma unroll
for (int i = 0; i < b_sh_wr_iters; i++) { for (int i = 0; i < b_sh_wr_iters; i++) {
cp_async4_stream(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]); cp_async4(&sh_b_stage[b_sh_wr_delta * i + b_sh_wr], B_ptr[i]);
B_ptr[i] += b_gl_rd_delta_o; B_ptr[i] += b_gl_rd_delta_o;
} }
// Only fetch scales if this tile starts a new group // Only fetch scales if this tile starts a new group
if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) { if (group_blocks != -1 && pipe % (group_blocks / thread_k_blocks) == 0) {
int4 *sh_s_stage = sh_s + s_sh_stage * pipe; int4 *sh_s_stage = sh_s + s_sh_stage * pipe;
if (s_sh_wr_pred) if (s_sh_wr_pred)
cp_async4_stream(&sh_s_stage[s_sh_wr], &s[s_gl_rd]); cp_async4(&sh_s_stage[s_sh_wr], &s[s_gl_rd]);
s_gl_rd += s_gl_rd_delta; s_gl_rd += s_gl_rd_delta;
} }
} }
...@@ -750,7 +743,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk ...@@ -750,7 +743,7 @@ Marlin(const int4 *__restrict__ A, // fp16 input matrix of shape mxk
// write-out // write-out
if (group_blocks == -1 && last) { if (group_blocks == -1 && last) {
if (s_sh_wr_pred) if (s_sh_wr_pred)
cp_async4_stream(&sh_s[s_sh_wr], &s[s_gl_rd]); cp_async4(&sh_s[s_sh_wr], &s[s_gl_rd]);
cp_async_fence(); cp_async_fence();
} }
thread_block_reduce(); thread_block_reduce();
......
...@@ -98,9 +98,10 @@ autodoc_mock_imports = [ ...@@ -98,9 +98,10 @@ autodoc_mock_imports = [
for mock_target in autodoc_mock_imports: for mock_target in autodoc_mock_imports:
if mock_target in sys.modules: if mock_target in sys.modules:
logger.info( logger.info(
f"Potentially problematic mock target ({mock_target}) found; " "Potentially problematic mock target (%s) found; "
"autodoc_mock_imports cannot mock modules that have already " "autodoc_mock_imports cannot mock modules that have already "
"been loaded into sys.modules when the sphinx build starts.") "been loaded into sys.modules when the sphinx build starts.",
mock_target)
class MockedClassDocumenter(autodoc.ClassDocumenter): class MockedClassDocumenter(autodoc.ClassDocumenter):
......
Dockerfile
====================
See `here <https://github.com/vllm-project/vllm/blob/main/Dockerfile>`_ for the main Dockerfile to construct
the image for running an OpenAI compatible server with vLLM.
- Below is a visual representation of the multi-stage Dockerfile. The build graph contains the following nodes:
- All build stages
- The default build target (highlighted in grey)
- External images (with dashed borders)
The edges of the build graph represent:
- FROM ... dependencies (with a solid line and a full arrow head)
- COPY --from=... dependencies (with a dashed line and an empty arrow head)
- RUN --mount=(.*)from=... dependencies (with a dotted line and an empty diamond arrow head)
.. figure:: ../../assets/dev/dockerfile-stages-dependency.png
:alt: query
:width: 100%
:align: center
Made using: https://github.com/patrickhoefler/dockerfilegraph
Commands to regenerate the build graph (make sure to run it **from the `root` directory of the vLLM repository** where the dockerfile is present):
.. code:: bash
dockerfilegraph -o png --legend --dpi 200 --max-label-length 50 --filename Dockerfile
or in case you want to run it directly with the docker image:
.. code:: bash
docker run \
--rm \
--user "$(id -u):$(id -g)" \
--workdir /workspace \
--volume "$(pwd)":/workspace \
ghcr.io/patrickhoefler/dockerfilegraph:alpine \
--output png \
--dpi 200 \
--max-label-length 50 \
--filename Dockerfile \
--legend
(To run it for a different file, you can pass in a different argument to the flag `--filename`.)
\ No newline at end of file
...@@ -3,9 +3,7 @@ ...@@ -3,9 +3,7 @@
Installation with ROCm Installation with ROCm
====================== ======================
vLLM 0.2.4 onwards supports model inferencing and serving on AMD GPUs with ROCm. vLLM supports AMD GPUs with ROCm 5.7 and 6.0.
At the moment AWQ quantization is not supported in ROCm, but SqueezeLLM quantization has been ported.
Data types currently supported in ROCm are FP16 and BF16.
Requirements Requirements
------------ ------------
...@@ -13,114 +11,57 @@ Requirements ...@@ -13,114 +11,57 @@ Requirements
* OS: Linux * OS: Linux
* Python: 3.8 -- 3.11 * Python: 3.8 -- 3.11
* GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) * GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
* Pytorch 2.0.1/2.1.1/2.2 * ROCm 6.0 and ROCm 5.7
* ROCm 5.7 (Verified on python 3.10) or ROCm 6.0 (Verified on python 3.9)
Installation options: Installation options:
#. :ref:`(Recommended) Quick start with vLLM pre-installed in Docker Image <quick_start_docker_rocm>`
#. :ref:`Build from source <build_from_source_rocm>`
#. :ref:`Build from source with docker <build_from_source_docker_rocm>` #. :ref:`Build from source with docker <build_from_source_docker_rocm>`
#. :ref:`Build from source <build_from_source_rocm>`
.. _quick_start_docker_rocm: .. _build_from_source_docker_rocm:
(Recommended) Option 1: Quick start with vLLM pre-installed in Docker Image
---------------------------------------------------------------------------
This option is for ROCm 5.7 only:
.. code-block:: console
$ docker pull embeddedllminfo/vllm-rocm:vllm-v0.2.4
$ docker run -it \
--network=host \
--group-add=video \
--ipc=host \
--cap-add=SYS_PTRACE \
--security-opt seccomp=unconfined \
--device /dev/kfd \
--device /dev/dri \
-v <path/to/model>:/app/model \
embeddedllminfo/vllm-rocm \
bash
.. _build_from_source_rocm:
Option 2: Build from source
---------------------------
You can build and install vLLM from source:
Below instruction is for ROCm 5.7 only.
At the time of this documentation update, PyTorch on ROCm 6.0 wheel is not yet available on the PyTorch website.
0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
- `ROCm <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>`_
- `Pytorch <https://pytorch.org/>`_
.. code-block:: console
$ pip install torch==2.2.0.dev20231206+rocm5.7 --index-url https://download.pytorch.org/whl/nightly/rocm5.7 # tested version
1. Install `flash attention for ROCm <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm>`_
Install ROCm's flash attention (v2.0.4) following the instructions from `ROCmSoftwarePlatform/flash-attention <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_
.. note::
- If you are using rocm5.7 with pytorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly.
- If you fail to install `ROCmSoftwarePlatform/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`.
- ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
- You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
2. Setup `xformers==0.0.23` without dependencies, and apply patches to adapt for ROCm flash attention
.. code-block:: console Option 1: Build from source with docker (recommended)
-----------------------------------------------------
$ pip install xformers==0.0.23 --no-deps You can build and install vLLM from source.
$ bash patch_xformers.rocm.sh
3. Build vLLM. First, build a docker image from `Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ and launch a docker container from the image.
.. code-block:: console `Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.0 by default, but also supports ROCm 5.7.
It provides flexibility to customize the build of docker image using the following arguments:
$ cd vllm * `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`
$ pip install -U -r requirements-rocm.txt * `BUILD_FA`: specifies whether to build CK flash-attention. The default is 1. For `Radeon RX 7900 series (gfx1100) <https://rocm.docs.amd.com/projects/radeon/en/latest/index.html>`_, this should be set to 0 before flash-attention supports this target.
$ python setup.py install # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation * `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build CK flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
* `FA_BRANCH`: specifies the branch used to build the CK flash-attention in `ROCm's flash-attention repo <https://github.com/ROCmSoftwarePlatform/flash-attention>`_. The default is `ae7928c`
* `BUILD_TRITON`: specifies whether to build triton flash-attention. The default value is 1.
Their values can be passed in when running ``docker build`` with ``--build-arg`` options.
.. _build_from_source_docker_rocm:
Option 3: Build from source with docker To build vllm on ROCm 6.0 for MI200 and MI300 series, you can use the default:
-----------------------------------------------------
You can build and install vLLM from source: .. code-block:: console
Build a docker image from `Dockerfile.rocm`, and launch a docker container. $ docker build -f Dockerfile.rocm -t vllm-rocm .
The `Dockerfile.rocm` is designed to support both ROCm 5.7 and ROCm 6.0 and later versions. It provides flexibility to customize the build of docker image using the following arguments: To build vllm on ROCm 6.0 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. We have tested ROCm 5.7 and ROCm 6.0. The default is `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1` .. code-block:: console
* `FX_GFX_ARCHS`: specifies the GFX architecture that is used to build flash-attention, for example, `gfx90a;gfx942` for MI200 and MI300. The default is `gfx90a;gfx942`
* `FA_BRANCH`: specifies the branch used to build the flash-attention in `ROCmSoftwarePlatform's flash-attention repo <https://github.com/ROCmSoftwarePlatform/flash-attention>`_. The default is `3d2b6f5`
* `BUILD_FA`: specifies whether to build flash-attention. For `Radeon RX 7900 series (gfx1100) <https://rocm.docs.amd.com/projects/radeon/en/latest/index.html>`_, this should be set to 0 before flash-attention supports this target.
Their values can be passed in when running ``docker build`` with ``--build-arg`` options. $ docker build --build-arg BUILD_FA="0" -f Dockerfile.rocm -t vllm-rocm .
For example, to build docker image for vllm on ROCm 5.7, you can run: To build docker image for vllm on ROCm 5.7, you can specify ``BASE_IMAGE`` as below:
.. code-block:: console .. code-block:: console
$ docker build --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \ $ docker build --build-arg BASE_IMAGE="rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1" \
-f Dockerfile.rocm -t vllm-rocm . -f Dockerfile.rocm -t vllm-rocm .
To build vllm on ROCm 6.0, you can use the default: To run the above docker image ``vllm-rocm``, use the below command:
.. code-block:: console .. code-block:: console
$ docker build -f Dockerfile.rocm -t vllm-rocm .
$ docker run -it \ $ docker run -it \
--network=host \ --network=host \
--group-add=video \ --group-add=video \
...@@ -133,7 +74,13 @@ To build vllm on ROCm 6.0, you can use the default: ...@@ -133,7 +74,13 @@ To build vllm on ROCm 6.0, you can use the default:
vllm-rocm \ vllm-rocm \
bash bash
Alternatively, if you plan to install vLLM-ROCm on a local machine or start from a fresh docker image (e.g. rocm/pytorch), you can follow the steps below: Where the `<path/to/model>` is the location where the model is stored, for example, the weights for llama2 or llama3 models.
.. _build_from_source_rocm:
Option 2: Build from source
---------------------------
0. Install prerequisites (skip if you are already in an environment/docker with the following installed): 0. Install prerequisites (skip if you are already in an environment/docker with the following installed):
...@@ -141,32 +88,50 @@ Alternatively, if you plan to install vLLM-ROCm on a local machine or start from ...@@ -141,32 +88,50 @@ Alternatively, if you plan to install vLLM-ROCm on a local machine or start from
- `Pytorch <https://pytorch.org/>`_ - `Pytorch <https://pytorch.org/>`_
- `hipBLAS <https://rocm.docs.amd.com/projects/hipBLAS/en/latest/install.html>`_ - `hipBLAS <https://rocm.docs.amd.com/projects/hipBLAS/en/latest/install.html>`_
1. Install `flash attention for ROCm <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm>`_ For installing PyTorch, you can start from a fresh docker image, e.g, `rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2`, `rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1`, `rocm/pytorch-nightly`.
Install ROCm's flash attention (v2.0.4) following the instructions from `ROCmSoftwarePlatform/flash-attention <https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_ Alternatively, you can install pytorch using pytorch wheels. You can check Pytorch installation guild in Pytorch `Getting Started <https://pytorch.org/get-started/locally/>`_
For rocm6.0:
.. code-block:: console
$ pip3 install torch --index-url https://download.pytorch.org/whl/rocm6.0
For rocm5.7:
.. code-block:: console
$ pip install torch --index-url https://download.pytorch.org/whl/rocm5.7
1. Install `Triton flash attention for ROCm <https://github.com/ROCm/triton>`_
Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton <https://github.com/ROCm/triton/blob/triton-mlir/README.md>`_
2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm <https://github.com/ROCm/flash-attention/tree/flash_attention_for_rocm>`_
Install ROCm's flash attention (v2.0.4) following the instructions from `ROCm/flash-attention <https://github.com/ROCm/flash-attention/tree/flash_attention_for_rocm#amd-gpurocm-support>`_
.. note:: .. note::
- If you are using rocm5.7 with pytorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly. - If you are using rocm5.7 with pytorch 2.1.0 onwards, you don't need to apply the `hipify_python.patch`. You can build the ROCm flash attention directly.
- If you fail to install `ROCmSoftwarePlatform/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`. - If you fail to install `ROCm/flash-attention`, try cloning from the commit `6fd2f8e572805681cd67ef8596c7e2ce521ed3c6`.
- ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention. - ROCm's Flash-attention-2 (v2.0.4) does not support sliding windows attention.
- You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
2. Setup `xformers==0.0.23` without dependencies, and apply patches to adapt for ROCm flash attention
.. code-block:: console
$ pip install xformers==0.0.23 --no-deps
$ bash patch_xformers.rocm.sh
3. Build vLLM. 3. Build vLLM.
.. code-block:: console .. code-block:: console
$ cd vllm $ cd vllm
$ pip install -U -r requirements-rocm.txt $ pip install -U -r requirements-rocm.txt
$ python setup.py install # This may take 5-10 minutes. $ python setup.py install # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation
.. note::
- You may need to turn on the ``--enforce-eager`` flag if you experience process hang when running the `benchmark_thoughput.py` script to test your installation. .. tip::
- You may need to turn on the ``--enforce-eager`` flag if you experience process hang when running the `benchmark_thoughput.py` script to test your installation.
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
- To use CK flash-attention, please use this flag ``export VLLM_USE_FLASH_ATTN_TRITON=0`` to turn off triton flash attention.
- The ROCm version of pytorch, ideally, should match the ROCm driver version.
...@@ -53,6 +53,7 @@ You can also build and install vLLM from source: ...@@ -53,6 +53,7 @@ You can also build and install vLLM from source:
$ git clone https://github.com/vllm-project/vllm.git $ git clone https://github.com/vllm-project/vllm.git
$ cd vllm $ cd vllm
$ # export VLLM_INSTALL_PUNICA_KERNELS=1 # optionally build for multi-LoRA capability
$ pip install -e . # This may take 5-10 minutes. $ pip install -e . # This may take 5-10 minutes.
.. tip:: .. tip::
......
...@@ -75,6 +75,7 @@ Documentation ...@@ -75,6 +75,7 @@ Documentation
serving/deploying_with_docker serving/deploying_with_docker
serving/distributed_serving serving/distributed_serving
serving/metrics serving/metrics
serving/env_vars
serving/usage_stats serving/usage_stats
serving/integrations serving/integrations
...@@ -86,6 +87,7 @@ Documentation ...@@ -86,6 +87,7 @@ Documentation
models/adding_model models/adding_model
models/engine_args models/engine_args
models/lora models/lora
models/performance
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
...@@ -102,6 +104,7 @@ Documentation ...@@ -102,6 +104,7 @@ Documentation
dev/sampling_params dev/sampling_params
dev/engine/engine_index dev/engine/engine_index
dev/kernel/paged_attention dev/kernel/paged_attention
dev/dockerfile/dockerfile
Indices and tables Indices and tables
================== ==================
......
.. _performance:
Performance and Tuning
======================
Chunked Prefill
---------------
vLLM supports an experimental feature chunked prefill. Chunked prefill allows to chunk large prefills into smaller chunks and batch them together with decode requests.
You can enable the feature by specifying
.. code-block:: python
llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True)
# Set max_num_batched_tokens to tune performance.
# NOTE: 512 is the default max_num_batched_tokens for chunked prefill.
# llm = LLM(model="meta-llama/Llama-2-7b-hf", enable_chunked_prefill=True, max_num_batched_tokens=512)
By default, vLLM scheduler prioritizes prefills and doesn't batch prefill and decode to the same batch. This policy optimizes the TTFT (time to thefirst token), but incurs slower ITL (inter token latency) and inefficient GPU utilization.
Once chunked prefill is enabled, the policy is changed to
- prioritize decode requests. It batches all pending decode requests to the batch before scheduling any prefill.
- When there are available token_budget (`max_num_batched_tokens`), it schedules pending prefills. If a last pending prefill request cannot fit into `max_num_batched_tokens`, it chunks it.
This policy has two benefits.
- It improves ITL (inter token latency) and generation decode because decode requests are prioritized.
- It helps achieve better GPU utilization by locating compute-bound (prefill) and memory-bound (decode) requests to the same batch.
You can tune the performance by changing `max_num_batched_tokens`.
By default, it is set to 512, which has the best ITL on A100 in the initial benchmark.
Smaller batch size achieves better ITL because there are fewer prefills interrupting decodes.
Higher batch size achieves better TTFT as you can put more prefill to the batch.
If `max_num_batched_tokens` is the same as `max_model_len`, that's almost the equivalent to the default scheduling policy (except that it still prioritizes decodes).
Note that the default batch size (512) is optimized for ITL, and it may have lower throughput than the default scheduler. We recommend you set `max_num_batched_tokens > 2048` for throughput.
See related papers for more details (https://arxiv.org/pdf/2401.08671 or https://arxiv.org/pdf/2308.16369).
...@@ -101,7 +101,7 @@ Alongside each architecture, we include some popular models that use it. ...@@ -101,7 +101,7 @@ Alongside each architecture, we include some popular models that use it.
- -
* - :code:`OLMoForCausalLM` * - :code:`OLMoForCausalLM`
- OLMo - OLMo
- :code:`allenai/OLMo-1B`, :code:`allenai/OLMo-7B`, etc. - :code:`allenai/OLMo-1B-hf`, :code:`allenai/OLMo-7B-hf`, etc.
- -
* - :code:`OPTForCausalLM` * - :code:`OPTForCausalLM`
- OPT, OPT-IML - OPT, OPT-IML
...@@ -115,6 +115,10 @@ Alongside each architecture, we include some popular models that use it. ...@@ -115,6 +115,10 @@ Alongside each architecture, we include some popular models that use it.
- Phi - Phi
- :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc. - :code:`microsoft/phi-1_5`, :code:`microsoft/phi-2`, etc.
- -
* - :code:`Phi3ForCausalLM`
- Phi-3
- :code:`microsoft/Phi-3-mini-4k-instruct`, :code:`microsoft/Phi-3-mini-128k-instruct`, etc.
-
* - :code:`QWenLMHeadModel` * - :code:`QWenLMHeadModel`
- Qwen - Qwen
- :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc. - :code:`Qwen/Qwen-7B`, :code:`Qwen/Qwen-7B-Chat`, etc.
......
...@@ -49,3 +49,6 @@ To run vLLM: ...@@ -49,3 +49,6 @@ To run vLLM:
--env "HUGGING_FACE_HUB_TOKEN=<secret>" \ --env "HUGGING_FACE_HUB_TOKEN=<secret>" \
vllm/vllm-openai <args...> vllm/vllm-openai <args...>
.. note::
vLLM docker image is currently designed to be run under the root user (contribution welcomed for changing this!). It will try to load library at runtime under the root user's home directory, e.g. `/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` . If you are running the container under a different user, you may need to change the permissions of the library (and all the parent directories) to allow the user to access it. Then run vLLM with environment variable `VLLM_NCCL_SO_PATH=/root/.config/vllm/nccl/cu12/libnccl.so.2.18.1` .
Environment Variables
========================
vLLM uses the following environment variables to configure the system:
.. literalinclude:: ../../../vllm/envs.py
:language: python
:start-after: begin-env-vars-definition
:end-before: end-env-vars-definition
...@@ -4,7 +4,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://plat ...@@ -4,7 +4,7 @@ vLLM provides an HTTP server that implements OpenAI's [Completions](https://plat
You can start the server using Python, or using [Docker](deploying_with_docker.rst): You can start the server using Python, or using [Docker](deploying_with_docker.rst):
```bash ```bash
python -m vllm.entrypoints.openai.api_server --model mistralai/Mistral-7B-Instruct-v0.2 --dtype auto --api-key token-abc123 python -m vllm.entrypoints.openai.api_server --model NousResearch/Meta-Llama-3-8B-Instruct --dtype auto --api-key token-abc123
``` ```
To call the server, you can use the official OpenAI Python client library, or any other HTTP client. To call the server, you can use the official OpenAI Python client library, or any other HTTP client.
...@@ -16,7 +16,7 @@ client = OpenAI( ...@@ -16,7 +16,7 @@ client = OpenAI(
) )
completion = client.chat.completions.create( completion = client.chat.completions.create(
model="mistralai/Mistral-7B-Instruct-v0.2", model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[ messages=[
{"role": "user", "content": "Hello!"} {"role": "user", "content": "Hello!"}
] ]
...@@ -37,7 +37,7 @@ Or directly merge them into the JSON payload if you are using HTTP call directly ...@@ -37,7 +37,7 @@ Or directly merge them into the JSON payload if you are using HTTP call directly
```python ```python
completion = client.chat.completions.create( completion = client.chat.completions.create(
model="mistralai/Mistral-7B-Instruct-v0.2", model="NousResearch/Meta-Llama-3-8B-Instruct",
messages=[ messages=[
{"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"} {"role": "user", "content": "Classify this sentiment: vLLM is wonderful!"}
], ],
...@@ -87,7 +87,7 @@ In order for the language model to support chat protocol, vLLM requires the mode ...@@ -87,7 +87,7 @@ In order for the language model to support chat protocol, vLLM requires the mode
a chat template in its tokenizer configuration. The chat template is a Jinja2 template that a chat template in its tokenizer configuration. The chat template is a Jinja2 template that
specifies how are roles, messages, and other chat-specific tokens are encoded in the input. specifies how are roles, messages, and other chat-specific tokens are encoded in the input.
An example chat template for `mistralai/Mistral-7B-Instruct-v0.2` can be found [here](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2#instruction-format) An example chat template for `NousResearch/Meta-Llama-3-8B-Instruct` can be found [here](https://github.com/meta-llama/llama3?tab=readme-ov-file#instruction-tuned-models)
Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those model, Some models do not provide a chat template even though they are instruction/chat fine-tuned. For those model,
you can manually specify their chat template in the `--chat-template` parameter with the file path to the chat you can manually specify their chat template in the `--chat-template` parameter with the file path to the chat
......
# Logging Configuration
vLLM leverages Python's `logging.config.dictConfig` functionality to enable
robust and flexible configuration of the various loggers used by vLLM.
vLLM offers two environment variables that can be used to accommodate a range
of logging configurations that range from simple-and-inflexible to
more-complex-and-more-flexible.
- No vLLM logging (simple and inflexible)
- Set `VLLM_CONFIGURE_LOGGING=0` (leaving `VLLM_LOGGING_CONFIG_PATH` unset)
- vLLM's default logging configuration (simple and inflexible)
- Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1`
- Fine-grained custom logging configuration (more complex, more flexible)
- Leave `VLLM_CONFIGURE_LOGGING` unset or set `VLLM_CONFIGURE_LOGGING=1` and
set `VLLM_LOGGING_CONFIG_PATH=<path-to-logging-config.json>`
## Logging Configuration Environment Variables
### `VLLM_CONFIGURE_LOGGING`
`VLLM_CONFIGURE_LOGGING` controls whether or not vLLM takes any action to
configure the loggers used by vLLM. This functionality is enabled by default,
but can be disabled by setting `VLLM_CONFIGURE_LOGGING=0` when running vLLM.
If `VLLM_CONFIGURE_LOGGING` is enabled and no value is given for
`VLLM_LOGGING_CONFIG_PATH`, vLLM will use built-in default configuration to
configure the root vLLM logger. By default, no other vLLM loggers are
configured and, as such, all vLLM loggers defer to the root vLLM logger to make
all logging decisions.
If `VLLM_CONFIGURE_LOGGING` is disabled and a value is given for
`VLLM_LOGGING_CONFIG_PATH`, an error will occur while starting vLLM.
### `VLLM_LOGGING_CONFIG_PATH`
`VLLM_LOGGING_CONFIG_PATH` allows users to specify a path to a JSON file of
alternative, custom logging configuration that will be used instead of vLLM's
built-in default logging configuration. The logging configuration should be
provided in JSON format following the schema specified by Python's [logging
configuration dictionary
schema](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details).
If `VLLM_LOGGING_CONFIG_PATH` is specified, but `VLLM_CONFIGURE_LOGGING` is
disabled, an error will occur while starting vLLM.
## Examples
### Example 1: Customize vLLM root logger
For this example, we will customize the vLLM root logger to use
[`python-json-logger`](https://github.com/madzak/python-json-logger) to log to
STDOUT of the console in JSON format with a log level of `INFO`.
To begin, first, create an appropriate JSON logging configuration file:
**/path/to/logging_config.json:**
```json
{
"formatters": {
"json": {
"class": "pythonjsonlogger.jsonlogger.JsonFormatter"
}
},
"handlers": {
"console": {
"class" : "logging.StreamHandler",
"formatter": "json",
"level": "INFO",
"stream": "ext://sys.stdout"
}
},
"loggers": {
"vllm": {
"handlers": ["console"],
"level": "INFO",
"propagate": false
}
},
"version": 1
}
```
Next, install the `python-json-logger` package if it's not already installed:
```bash
pip install python-json-logger
```
Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
to the path of the custom logging configuration JSON file:
```bash
VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
python3 -m vllm.entrypoints.openai.api_server \
--max-model-len 2048 \
--model mistralai/Mistral-7B-v0.1
```
### Example 2: Silence a particular vLLM logger
To silence a particular vLLM logger, it is necessary to provide custom logging
configuration for the target logger that configures the logger so that it won't
propagate its log messages to the root vLLM logger.
When custom configuration is provided for any logger, it is also necessary to
provide configuration for the root vLLM logger since any custom logger
configuration overrides the built-in default logging configuration used by vLLM.
First, create an appropriate JSON logging configuration file that includes
configuration for the root vLLM logger and for the logger you wish to silence:
**/path/to/logging_config.json:**
```json
{
"formatters": {
"vllm": {
"class": "vllm.logging.NewLineFormatter",
"datefmt": "%m-%d %H:%M:%S",
"format": "%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s"
}
},
"handlers": {
"vllm": {
"class" : "logging.StreamHandler",
"formatter": "vllm",
"level": "INFO",
"stream": "ext://sys.stdout"
}
},
"loggers": {
"vllm": {
"handlers": ["vllm"],
"level": "DEBUG",
"propagage": false
},
"vllm.example_noisy_logger": {
"propagate": false
}
},
"version": 1
}
```
Finally, run vLLM with the `VLLM_LOGGING_CONFIG_PATH` environment variable set
to the path of the custom logging configuration JSON file:
```bash
VLLM_LOGGING_CONFIG_PATH=/path/to/logging_config.json \
python3 -m vllm.entrypoints.openai.api_server \
--max-model-len 2048 \
--model mistralai/Mistral-7B-v0.1
```
### Example 3: Disable vLLM default logging configuration
To disable vLLM's default logging configuration and silence all vLLM loggers,
simple set `VLLM_CONFIGURE_LOGGING=0` when running vLLM. This will prevent vLLM
for configuring the root vLLM logger, which in turn, silences all other vLLM
loggers.
```bash
VLLM_CONFIGURE_LOGGING=0 \
python3 -m vllm.entrypoints.openai.api_server \
--max-model-len 2048 \
--model mistralai/Mistral-7B-v0.1
```
## Additional resources
- [`logging.config` Dictionary Schema Details](https://docs.python.org/3/library/logging.config.html#dictionary-schema-details)
...@@ -873,6 +873,289 @@ ...@@ -873,6 +873,289 @@
], ],
"title": "Cache Utilization", "title": "Cache Utilization",
"type": "timeseries" "type": "timeseries"
},
{
"type": "heatmap",
"title": "Request Prompt Length",
"description": "Heatmap of request prompt length",
"gridPos": {
"x": 0,
"y": 24,
"w": 12,
"h": 8
},
"datasource": {
"uid": "prometheus",
"type": "prometheus"
},
"id": 12,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"refId": "A",
"expr": "sum by(le) (increase(vllm:request_prompt_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))",
"range": true,
"instant": false,
"editorMode": "builder",
"legendFormat": "{{le}}",
"useBackend": false,
"disableTextWrap": false,
"fullMetaSearch": false,
"includeNullMetadata": true,
"format": "heatmap"
}
],
"options": {
"calculate": false,
"yAxis": {
"axisPlacement": "left",
"reverse": false,
"unit": "none",
"axisLabel": "Prompt Length"
},
"rowsFrame": {
"layout": "auto",
"value": "Request count"
},
"color": {
"mode": "scheme",
"fill": "dark-orange",
"scale": "exponential",
"exponent": 0.5,
"scheme": "Spectral",
"steps": 64,
"reverse": false,
"min": 0
},
"cellGap": 1,
"filterValues": {
"le": 1e-9
},
"tooltip": {
"show": true,
"yHistogram": true
},
"legend": {
"show": true
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"cellValues": {
"unit": "none"
}
},
"fieldConfig": {
"defaults": {
"custom": {
"scaleDistribution": {
"type": "linear"
},
"hideFrom": {
"tooltip": false,
"viz": false,
"legend": false
}
}
},
"overrides": []
},
"pluginVersion": "10.2.0"
},
{
"datasource": {
"uid": "prometheus",
"type": "prometheus"
},
"type": "heatmap",
"title": "Request Generation Length",
"description": "Heatmap of request generation length",
"gridPos": {
"x": 12,
"y": 24,
"w": 12,
"h": 8
},
"id": 13,
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"refId": "A",
"expr": "sum by(le) (increase(vllm:request_generation_tokens_bucket{model_name=\"$model_name\"}[$__rate_interval]))",
"range": true,
"instant": false,
"editorMode": "builder",
"legendFormat": "{{le}}",
"useBackend": false,
"disableTextWrap": false,
"fullMetaSearch": false,
"includeNullMetadata": true,
"format": "heatmap"
}
],
"options": {
"calculate": false,
"yAxis": {
"axisPlacement": "left",
"reverse": false,
"unit": "none",
"axisLabel": "Generation Length"
},
"rowsFrame": {
"layout": "auto",
"value": "Request count"
},
"color": {
"mode": "scheme",
"fill": "dark-orange",
"scale": "exponential",
"exponent": 0.5,
"scheme": "Spectral",
"steps": 64,
"reverse": false,
"min": 0
},
"cellGap": 1,
"filterValues": {
"le": 1e-9
},
"tooltip": {
"show": true,
"yHistogram": true
},
"legend": {
"show": true
},
"exemplars": {
"color": "rgba(255,0,255,0.7)"
},
"cellValues": {
"unit": "none"
}
},
"fieldConfig": {
"defaults": {
"custom": {
"scaleDistribution": {
"type": "linear"
},
"hideFrom": {
"tooltip": false,
"viz": false,
"legend": false
}
}
},
"overrides": []
},
"pluginVersion": "10.2.0"
},
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"fieldConfig": {
"defaults": {
"custom": {
"drawStyle": "line",
"lineInterpolation": "linear",
"barAlignment": 0,
"lineWidth": 1,
"fillOpacity": 0,
"gradientMode": "none",
"spanNulls": false,
"insertNulls": false,
"showPoints": "auto",
"pointSize": 5,
"stacking": {
"mode": "none",
"group": "A"
},
"axisPlacement": "auto",
"axisLabel": "",
"axisColorMode": "text",
"axisBorderShow": false,
"scaleDistribution": {
"type": "linear"
},
"axisCenteredZero": false,
"hideFrom": {
"tooltip": false,
"viz": false,
"legend": false
},
"thresholdsStyle": {
"mode": "off"
}
},
"color": {
"mode": "palette-classic"
},
"mappings": [],
"thresholds": {
"mode": "absolute",
"steps": [
{
"color": "green",
"value": null
},
{
"color": "red",
"value": 80
}
]
}
},
"overrides": []
},
"gridPos": {
"h": 8,
"w": 12,
"x": 0,
"y": 32
},
"id": 11,
"options": {
"tooltip": {
"mode": "single",
"sort": "none"
},
"legend": {
"showLegend": true,
"displayMode": "list",
"placement": "bottom",
"calcs": []
}
},
"targets": [
{
"datasource": {
"type": "prometheus",
"uid": "prometheus"
},
"disableTextWrap": false,
"editorMode": "builder",
"expr": "sum by(finished_reason) (increase(vllm:request_success_total{model_name=\"$model_name\"}[$__rate_interval]))",
"fullMetaSearch": false,
"includeNullMetadata": true,
"instant": false,
"interval": "",
"legendFormat": "__auto",
"range": true,
"refId": "A",
"useBackend": false
}
],
"title": "Finish Reason",
"description": "Number of finished requests by their finish reason: either an EOS token was generated or the max sequence length was reached.",
"type": "timeseries"
} }
], ],
"refresh": "", "refresh": "",
......
...@@ -95,7 +95,7 @@ echo 'vLLM yapf: Done' ...@@ -95,7 +95,7 @@ echo 'vLLM yapf: Done'
# Run mypy # Run mypy
echo 'vLLM mypy:' echo 'vLLM mypy:'
mypy vllm/attention --config-file pyproject.toml mypy vllm/attention --config-file pyproject.toml
mypy vllm/core/*.py --follow-imports=skip --config-file pyproject.toml mypy vllm/core --config-file pyproject.toml
mypy vllm/distributed --config-file pyproject.toml mypy vllm/distributed --config-file pyproject.toml
mypy vllm/entrypoints --config-file pyproject.toml mypy vllm/entrypoints --config-file pyproject.toml
mypy vllm/executor --config-file pyproject.toml mypy vllm/executor --config-file pyproject.toml
...@@ -105,8 +105,10 @@ mypy vllm/transformers_utils --config-file pyproject.toml ...@@ -105,8 +105,10 @@ mypy vllm/transformers_utils --config-file pyproject.toml
mypy vllm/engine --config-file pyproject.toml mypy vllm/engine --config-file pyproject.toml
mypy vllm/worker --config-file pyproject.toml mypy vllm/worker --config-file pyproject.toml
mypy vllm/spec_decode --config-file pyproject.toml mypy vllm/spec_decode --config-file pyproject.toml
mypy vllm/model_executor/*.py --config-file pyproject.toml mypy vllm/model_executor --config-file pyproject.toml
# mypy vllm/lora/*.py --config-file pyproject.toml mypy vllm/lora --config-file pyproject.toml
mypy vllm/logging --config-file pyproject.toml
mypy vllm/model_executor --config-file pyproject.toml
CODESPELL_EXCLUDES=( CODESPELL_EXCLUDES=(
......
...@@ -5,7 +5,7 @@ requires = [ ...@@ -5,7 +5,7 @@ requires = [
"ninja", "ninja",
"packaging", "packaging",
"setuptools >= 49.4.0", "setuptools >= 49.4.0",
"torch == 2.2.1", "torch == 2.3.0",
"wheel", "wheel",
] ]
build-backend = "setuptools.build_meta" build-backend = "setuptools.build_meta"
...@@ -32,6 +32,7 @@ select = [ ...@@ -32,6 +32,7 @@ select = [
"SIM", "SIM",
# isort # isort
# "I", # "I",
"G",
] ]
ignore = [ ignore = [
# star imports # star imports
......
...@@ -3,5 +3,5 @@ cmake>=3.21 ...@@ -3,5 +3,5 @@ cmake>=3.21
ninja ninja
packaging packaging
setuptools>=49.4.0 setuptools>=49.4.0
torch==2.2.1 torch==2.3.0
wheel wheel
...@@ -8,9 +8,11 @@ py-cpuinfo ...@@ -8,9 +8,11 @@ py-cpuinfo
transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3. transformers >= 4.40.0 # Required for StarCoder2 & Llava, Llama 3.
tokenizers >= 0.19.1 # Required for Llama 3. tokenizers >= 0.19.1 # Required for Llama 3.
fastapi fastapi
openai
uvicorn[standard] uvicorn[standard]
pydantic >= 2.0 # Required for OpenAI server. pydantic >= 2.0 # Required for OpenAI server.
prometheus_client >= 0.18.0 prometheus_client >= 0.18.0
prometheus-fastapi-instrumentator >= 7.0.0
tiktoken == 0.6.0 # Required for DBRX tokenizer tiktoken == 0.6.0 # Required for DBRX tokenizer
lm-format-enforcer == 0.9.8 lm-format-enforcer == 0.9.8
outlines == 0.0.34 # Requires torch >= 2.1.0 outlines == 0.0.34 # Requires torch >= 2.1.0
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment