Commit ad385667 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge branch 'v0.6.3.post1-dev'

parents be0967c1 903593d3
#pragma once
#include <torch/all.h>
// Entry point of the ROCm PagedAttention custom op.
//
// The function has no return value. The op schema it is bound to marks `out`
// as mutable, so the attention result is presumably delivered through `out`.
// All tensors are taken by non-const reference.
//
// NOTE(review): exp_sums / max_logits / tmp_out look like intermediate
// buffers for a partitioned softmax reduction — confirm against the kernel.
// NOTE(review): `alibi_slopes` is optional; `kv_cache_dtype`, `k_scale` and
// `v_scale` presumably describe how the KV cache is quantized — confirm.
void paged_attention(
    torch::Tensor& out,
    torch::Tensor& exp_sums,
    torch::Tensor& max_logits,
    torch::Tensor& tmp_out,
    torch::Tensor& query,
    torch::Tensor& key_cache,
    torch::Tensor& value_cache,
    int64_t num_kv_heads,
    double scale,
    torch::Tensor& block_tables,
    torch::Tensor& context_lens,
    int64_t block_size,
    int64_t max_context_len,
    const c10::optional<torch::Tensor>& alibi_slopes,
    const std::string& kv_cache_dtype,
    double k_scale,
    double v_scale);
#include "core/registration.h"
#include "rocm/ops.h"
// Note on op signatures:
// The X_meta signatures are for the meta functions corresponding to op X.
// They must be kept in sync with the signature for X. Generally, only
// functions that return Tensors require a meta function.
//
// See the following links for detailed docs on op registration and function
// schemas.
// https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit#heading=h.ptttacy8y1u9
// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md#annotations
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, rocm_ops) {
  // vLLM custom ops for rocm

  // Custom attention op
  // Compute the attention between an input query and the cached
  // keys/values using PagedAttention.
  //
  // The op returns nothing; every tensor it writes must therefore be marked
  // mutable (`Tensor!`) in the schema so that PyTorch's alias analysis
  // (functionalization / torch.compile) knows about the mutation.
  // exp_sums, max_logits and tmp_out are marked mutable in addition to `out`,
  // matching the paged_attention_v2 schema in the main op library.
  // NOTE(review): this assumes the ROCm kernel writes these three buffers the
  // same way the v2 kernel does — confirm against the kernel source.
  rocm_ops.def(
      "paged_attention(Tensor! out, Tensor! exp_sums,"
      " Tensor! max_logits, Tensor! tmp_out,"
      " Tensor query, Tensor key_cache,"
      " Tensor value_cache, int num_kv_heads,"
      " float scale, Tensor block_tables,"
      " Tensor context_lens, int block_size,"
      " int max_context_len,"
      " Tensor? alibi_slopes,"
      " str kv_cache_dtype,"
      " float k_scale, float v_scale) -> ()");
  // Bind the implementation for the CUDA dispatch key.
  rocm_ops.impl("paged_attention", torch::kCUDA, &paged_attention);
}

// Macro provided by core/registration.h.
// NOTE(review): presumably emits the Python module entry point for this
// extension — confirm in core/registration.h.
REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
...@@ -36,8 +36,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -36,8 +36,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// PagedAttention V2. // PagedAttention V2.
ops.def( ops.def(
"paged_attention_v2(" "paged_attention_v2("
" Tensor! out, Tensor exp_sums, Tensor max_logits," " Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
" Tensor tmp_out, Tensor query, Tensor key_cache," " Tensor! tmp_out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale," " Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size," " Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes," " int max_seq_len, Tensor? alibi_slopes,"
...@@ -75,6 +75,34 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -75,6 +75,34 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" int blocksparse_head_sliding_step) -> ()"); " int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v2_opt", torch::kCUDA, &paged_attention_v2_opt); ops.impl("paged_attention_v2_opt", torch::kCUDA, &paged_attention_v2_opt);
// Compute the attention between an input query and the cached
// keys/values using PagedAttention. (opt)
ops.def(
"paged_attention_v1_opt_tc("
" Tensor! out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, float k_scale, float v_scale,"
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v1_opt_tc", torch::kCUDA, &paged_attention_v1_opt_tc);
// PagedAttention V2 (opt).
ops.def(
"paged_attention_v2_opt_tc("
" Tensor! out, Tensor exp_sums, Tensor max_logits,"
" Tensor tmp_out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes,"
" str kv_cache_dtype, float k_scale, float v_scale,"
" int tp_rank, int blocksparse_local_blocks,"
" int blocksparse_vert_stride, int blocksparse_block_size,"
" int blocksparse_head_sliding_step) -> ()");
ops.impl("paged_attention_v2_opt_tc", torch::kCUDA, &paged_attention_v2_opt_tc);
// Activation ops // Activation ops
// Activation function used in SwiGLU. // Activation function used in SwiGLU.
ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()"); ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
...@@ -113,8 +141,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -113,8 +141,23 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("gelu_quick", torch::kCUDA, &gelu_quick); ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);
// prepare_inputs advance_step // prepare_inputs advance_step
ops.def("advance_step", &advance_step); ops.def(
ops.impl("advance_step", torch::kCUDA, &advance_step); "advance_step_flashattn(int num_seqs, int num_queries, int block_size, "
"Tensor! input_tokens, Tensor sampled_token_ids, "
"Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping, "
"Tensor block_tables) -> ()");
ops.impl("advance_step_flashattn", torch::kCUDA, &advance_step_flashattn);
ops.def(
"advance_step_flashinfer("
" int num_seqs, int num_queries, int block_size,"
" Tensor! input_tokens, Tensor sampled_token_ids,"
" Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping,"
" Tensor block_tables, Tensor! paged_kv_indices,"
" Tensor! paged_kv_indptr, Tensor! paged_kv_last_page_len,"
" Tensor! block_table_bounds"
") -> ()");
ops.impl("advance_step_flashinfer", torch::kCUDA, &advance_step_flashinfer);
// Layernorm // Layernorm
// Apply Root Mean Square (RMS) Normalization to the input tensor. // Apply Root Mean Square (RMS) Normalization to the input tensor.
...@@ -175,77 +218,198 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -175,77 +218,198 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Quantization ops // Quantization ops
#ifndef USE_ROCM #ifndef USE_ROCM
// Quantized GEMM for AQLM. // Quantized GEMM for AQLM.
ops.def("aqlm_gemm", &aqlm_gemm); ops.def(
"aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, "
"Tensor scales, int[] codebook_partition_sizes, Tensor? bias) "
"-> Tensor");
ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm); ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);
// Decompression method for AQLM. // Decompression method for AQLM.
ops.def("aqlm_dequant", &aqlm_dequant); ops.def(
"aqlm_dequant(Tensor codes, Tensor codebooks, "
"int[] codebook_partition_sizes) -> Tensor");
ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant); ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);
// Quantized GEMM for AWQ. // Quantized GEMM for AWQ.
ops.def("awq_gemm", &awq_gemm); ops.def(
"awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, "
"Tensor _zeros, int split_k_iters) -> Tensor");
ops.impl("awq_gemm", torch::kCUDA, &awq_gemm); ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
// Dequantization for AWQ. // Dequantization for AWQ.
ops.def("awq_dequantize", &awq_dequantize); ops.def(
"awq_dequantize(Tensor _kernel, Tensor _scaling_factors, "
"Tensor _zeros, int split_k_iters, int thx, int thy) -> Tensor");
ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize); ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
// Note about marlin kernel 'workspace' arguments:
// Technically these should be mutable since they are modified by the kernel.
// But since they are set back to zero once the kernel is finished we can
// hand wave and say that they have no net effect.
//
// The reason to mark 'workspace' as immutable is so that they don't interfere
// with using ScalarType arguments in the ops. If they are marked as mutable,
// pytorch throws an assert in
// 'torch._higher_order_ops._register_effectful_op' that prevents these
// kernels from being torch.compile'd.
// See the following document for more info on custom types and ops that use
// custom types:
// https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA
// Marlin (Dense) Optimized Quantized GEMM for GPTQ. // Marlin (Dense) Optimized Quantized GEMM for GPTQ.
ops.def("marlin_gemm", &marlin_gemm); ops.def(
ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm); "marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
"Tensor! workspace, int size_m, int size_n, int size_k) -> Tensor");
// conditionally compiled so impl in source file
// Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ. // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm); ops.def(
ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm); "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, "
"Tensor b_scales, Tensor workspace, "
"__torch__.torch.classes._core_C.ScalarType b_q_type, "
"int size_m, int size_n, int size_k) -> Tensor");
// conditionally compiled so impl in source file
// Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
ops.def(
"machete_supported_schedules("
" __torch__.torch.classes._core_C.ScalarType btype"
") -> str[]");
ops.def(
"machete_gemm(Tensor A, Tensor B,"
" __torch__.torch.classes._core_C.ScalarType btype,"
" Tensor? scales, Tensor? zeros, int? group_size,"
" Tensor? C, float? alpha, float? beta, str? schedule)"
"-> Tensor");
ops.def(
"machete_prepack_B(Tensor B,"
" __torch__.torch.classes._core_C.ScalarType btype)"
"-> Tensor");
// conditionally compiled so impl registration is in source file
ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
ops.impl("permute_cols", torch::kCUDA, &permute_cols);
// gptq_marlin Optimized Quantized GEMM for GPTQ. // gptq_marlin Optimized Quantized GEMM for GPTQ.
ops.def("gptq_marlin_gemm", &gptq_marlin_gemm); ops.def(
ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm); "gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
"Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, "
"__torch__.torch.classes._core_C.ScalarType b_q_type, "
"int size_m, int size_n, int size_k, bool is_k_full, "
"bool has_zp, bool use_fp32_reduce) -> Tensor");
// conditionally compiled so impl registration is in source file
// gptq_marlin repack from GPTQ. // gptq_marlin repack from GPTQ.
ops.def("gptq_marlin_repack", &gptq_marlin_repack); ops.def(
ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack); "gptq_marlin_repack(Tensor b_q_weight, Tensor perm, "
"SymInt size_k, SymInt size_n, int num_bits) -> Tensor");
// conditionally compiled so impl registrations are in source file
// awq_marlin repack from AWQ. // awq_marlin repack from AWQ.
ops.def("awq_marlin_repack", &awq_marlin_repack); ops.def(
ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack); "awq_marlin_repack(Tensor b_q_weight, SymInt size_k, "
"SymInt size_n, int num_bits) -> Tensor");
// conditionally compiled so impl registrations are in source file
// Dequantization for GGML.
ops.def("ggml_dequantize(Tensor W, int type, int m, int n) -> Tensor");
ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
// mmvq kernel for GGML.
ops.def(
"ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, int row) "
"-> Tensor");
ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);
// mmq kernel for GGML.
ops.def("ggml_mul_mat_a8(Tensor W, Tensor X, int type, int row) -> Tensor");
ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
// fp8_marlin Optimized Quantized GEMM for FP8 weight-only. // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
ops.def("fp8_marlin_gemm", &fp8_marlin_gemm); ops.def(
ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm); "fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
"Tensor! workspace, int num_bits, int size_m, int size_n, "
"int size_k) -> Tensor");
// conditionally compiled so impl registration is in source file
// marlin_qqq_gemm for QQQ. // marlin_qqq_gemm for QQQ.
ops.def("marlin_qqq_gemm", &marlin_qqq_gemm); ops.def(
ops.impl("marlin_qqq_gemm", torch::kCUDA, &marlin_qqq_gemm); "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "
"Tensor s_tok, Tensor s_ch, Tensor s_group, "
"Tensor! workspace, int size_m, int size_n, "
"int size_k) -> Tensor");
// conditionally compiled so impl registration is in source file
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
// quantization. // quantization, as well as bias
ops.def( ops.def(
"cutlass_scaled_mm(Tensor! out, Tensor a," "cutlass_scaled_mm(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales," " Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor? bias) -> ()"); " Tensor b_scales, Tensor? bias) -> ()");
ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm); ops.impl("cutlass_scaled_mm", torch::kCUDA, &cutlass_scaled_mm);
// CUTLASS w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
// quantization.
ops.def(
"cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor azp_adj,"
" Tensor? azp, Tensor? bias) -> ()");
ops.impl("cutlass_scaled_mm_azp", torch::kCUDA, &cutlass_scaled_mm_azp);
// Check if cutlass scaled_mm is supported for CUDA devices of the given // Check if cutlass scaled_mm is supported for CUDA devices of the given
// capability // capability
ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8); ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA, ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
&cutlass_scaled_mm_supports_fp8);
// Mamba selective scan kernel
ops.def(
"selective_scan_fwd(Tensor! u, Tensor! delta,"
"Tensor! A, Tensor! B, Tensor! C,"
"Tensor? D_, Tensor!? z_, Tensor? delta_bias_,"
"bool delta_softplus,"
"Tensor? query_start_loc,"
"Tensor? cache_indices,"
"Tensor? has_initial_state,"
"Tensor! ssm_states,"
"int pad_slot_id) -> ()");
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
ops.def(
"causal_conv1d_update(Tensor! x,"
"Tensor! conv_state,"
"Tensor! weight,"
"Tensor? bias_,"
"bool silu_activation,"
"Tensor? cache_seqlens_,"
"Tensor? conv_state_indices,"
"int pad_slot_id) -> ()");
ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);
ops.def(
"causal_conv1d_fwd(Tensor! x, Tensor! weight,"
"Tensor? bias_,"
"Tensor!? conv_states,"
"Tensor? query_start_loc,"
"Tensor? cache_indices,"
"Tensor? has_initial_state,"
"bool silu_activation,"
"int pad_slot_id) -> ()");
ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
#endif #endif
// Quantized GEMM for GPTQ. // Quantized GEMM for GPTQ.
ops.def("gptq_gemm", &gptq_gemm); // Note: even though the C++ inferred schema is correct for this op, it seems
ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm); // to prevent the meta function registry.
// ops.def(
// "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
// "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
// "-> Tensor");
// ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
// Post processing for GPTQ. // Post processing for GPTQ.
ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()"); // ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()");
ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle); // ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);
// Quantized GEMM for SqueezeLLM.
ops.def(
"squeezellm_gemm(Tensor vec, Tensor mat, Tensor! mul, Tensor "
"lookup_table) -> ()");
ops.impl("squeezellm_gemm", torch::kCUDA, &squeezellm_gemm);
// Compute FP8 quantized tensor for given scaling factor. // Compute FP8 quantized tensor for given scaling factor.
// ops.def( // ops.def(
...@@ -261,8 +425,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -261,8 +425,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Compute dynamic-per-token FP8 quantized tensor and scaling factor. // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
// ops.def( // ops.def(
// "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! " // "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, "
// "scale, Tensor? scale_ub) -> " // "Tensor! scale, Tensor? scale_ub) -> "
// "()"); // "()");
// ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA, // ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
// &dynamic_per_token_scaled_fp8_quant); // &dynamic_per_token_scaled_fp8_quant);
...@@ -278,14 +442,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -278,14 +442,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Compute int8 quantized tensor for given scaling factor. // Compute int8 quantized tensor for given scaling factor.
ops.def( ops.def(
"static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> " "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale,"
"()"); "Tensor? azp) -> ()");
ops.impl("static_scaled_int8_quant", torch::kCUDA, &static_scaled_int8_quant); ops.impl("static_scaled_int8_quant", torch::kCUDA, &static_scaled_int8_quant);
// Compute int8 quantized tensor and scaling factor // Compute int8 quantized tensor and scaling factor
ops.def( ops.def(
"dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale) -> " "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, "
"()"); "Tensor!? azp) -> ()");
ops.impl("dynamic_scaled_int8_quant", torch::kCUDA, ops.impl("dynamic_scaled_int8_quant", torch::kCUDA,
&dynamic_scaled_int8_quant); &dynamic_scaled_int8_quant);
} }
...@@ -299,8 +463,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { ...@@ -299,8 +463,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
// Copy the cache blocks from src to dst. // Copy the cache blocks from src to dst.
cache_ops.def( cache_ops.def(
"copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor " "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
"block_mapping) -> ()"); "Tensor block_mapping) -> ()");
cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks); cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
// Reshape the key and value tensors and cache them. // Reshape the key and value tensors and cache them.
...@@ -325,8 +489,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { ...@@ -325,8 +489,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
// Convert the key and value cache to fp8 data type. // Convert the key and value cache to fp8 data type.
cache_ops.def( cache_ops.def(
"convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, str " "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
"kv_cache_dtype) -> ()"); "str kv_cache_dtype) -> ()");
cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8); cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
} }
...@@ -334,26 +498,25 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) { ...@@ -334,26 +498,25 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
// Cuda utils // Cuda utils
// Gets the specified device attribute. // Gets the specified device attribute.
cuda_utils.def("get_device_attribute", &get_device_attribute); cuda_utils.def("get_device_attribute(int attribute, int device_id) -> int");
cuda_utils.impl("get_device_attribute", torch::kCUDA, &get_device_attribute); cuda_utils.impl("get_device_attribute", &get_device_attribute);
// Gets the maximum shared memory per block device attribute. // Gets the maximum shared memory per block device attribute.
cuda_utils.def("get_max_shared_memory_per_block_device_attribute", cuda_utils.def(
&get_max_shared_memory_per_block_device_attribute); "get_max_shared_memory_per_block_device_attribute(int device_id) -> int");
cuda_utils.impl("get_max_shared_memory_per_block_device_attribute", cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
torch::kCUDA,
&get_max_shared_memory_per_block_device_attribute); &get_max_shared_memory_per_block_device_attribute);
} }
#ifndef USE_ROCM #ifndef USE_ROCM
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
// Custom all-reduce kernels // Custom all-reduce kernels
custom_ar.def("init_custom_ar", &init_custom_ar); custom_ar.def(
"init_custom_ar(Tensor meta, Tensor rank_data, "
"str[] handles, int[] offsets, int rank, "
"bool full_nvlink) -> int");
custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar); custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
custom_ar.def("should_custom_ar", &should_custom_ar);
custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar);
custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()"); custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
custom_ar.impl("all_reduce_reg", torch::kCUDA, &all_reduce_reg); custom_ar.impl("all_reduce_reg", torch::kCUDA, &all_reduce_reg);
...@@ -363,21 +526,15 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { ...@@ -363,21 +526,15 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg); custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);
custom_ar.def("dispose", &dispose); custom_ar.def("dispose", &dispose);
custom_ar.impl("dispose", torch::kCPU, &dispose);
custom_ar.def("meta_size", &meta_size); custom_ar.def("meta_size", &meta_size);
custom_ar.impl("meta_size", torch::kCPU, &meta_size);
custom_ar.def("register_buffer", &register_buffer); custom_ar.def(
"register_buffer(int fa, Tensor t, str[] handles, "
"int[] offsets) -> ()");
custom_ar.impl("register_buffer", torch::kCUDA, &register_buffer); custom_ar.impl("register_buffer", torch::kCUDA, &register_buffer);
custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta); custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
custom_ar.impl("get_graph_buffer_ipc_meta", torch::kCPU,
&get_graph_buffer_ipc_meta);
custom_ar.def("register_graph_buffers", &register_graph_buffers); custom_ar.def("register_graph_buffers", &register_graph_buffers);
custom_ar.impl("register_graph_buffers", torch::kCPU,
&register_graph_buffers);
} }
#endif #endif
......
...@@ -3,11 +3,15 @@ sphinx-book-theme==1.0.1 ...@@ -3,11 +3,15 @@ sphinx-book-theme==1.0.1
sphinx-copybutton==0.5.2 sphinx-copybutton==0.5.2
myst-parser==2.0.0 myst-parser==2.0.0
sphinx-argparse==0.4.0 sphinx-argparse==0.4.0
msgspec
cloudpickle
# packages to install to build the documentation # packages to install to build the documentation
pydantic pydantic >= 2.8
-f https://download.pytorch.org/whl/cpu -f https://download.pytorch.org/whl/cpu
torch torch
py-cpuinfo py-cpuinfo
transformers transformers
mistral_common >= 1.3.4
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
\ No newline at end of file
...@@ -5,6 +5,7 @@ vLLM Meetups ...@@ -5,6 +5,7 @@ vLLM Meetups
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
- `The sixth vLLM meetup <https://lu.ma/87q3nvnh>`__, with NVIDIA, September 9th 2024. `[Slides] <https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing>`__
- `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__ - `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__
- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__ - `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
- `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__ - `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
......
...@@ -20,6 +20,7 @@ vLLM is a community project. Our compute resources for development and testing a ...@@ -20,6 +20,7 @@ vLLM is a community project. Our compute resources for development and testing a
- Roblox - Roblox
- RunPod - RunPod
- Sequoia Capital - Sequoia Capital
- Skywork AI
- Trainy - Trainy
- UC Berkeley - UC Berkeley
- UC San Diego - UC San Diego
......
...@@ -97,13 +97,14 @@ def setup(app): ...@@ -97,13 +97,14 @@ def setup(app):
# Mock out external dependencies here, otherwise the autodoc pages may be blank. # Mock out external dependencies here, otherwise the autodoc pages may be blank.
autodoc_mock_imports = [ autodoc_mock_imports = [
"aiohttp", "aiohttp",
"compressed_tensors",
"cpuinfo", "cpuinfo",
"cv2",
"torch", "torch",
"transformers", "transformers",
"psutil", "psutil",
"prometheus_client", "prometheus_client",
"sentencepiece", "sentencepiece",
"vllm.cuda_utils",
"vllm._C", "vllm._C",
"PIL", "PIL",
"numpy", "numpy",
...@@ -112,6 +113,10 @@ autodoc_mock_imports = [ ...@@ -112,6 +113,10 @@ autodoc_mock_imports = [
"tensorizer", "tensorizer",
"pynvml", "pynvml",
"outlines", "outlines",
"librosa",
"soundfile",
"gguf",
"lark",
] ]
for mock_target in autodoc_mock_imports: for mock_target in autodoc_mock_imports:
......
...@@ -17,4 +17,4 @@ Input Processing Pipeline ...@@ -17,4 +17,4 @@ Input Processing Pipeline
6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`. 6. If the data contains multi-modal data, convert it into keyword arguments using :meth:`MULTIMODAL_REGISTRY.map_input <vllm.multimodal.MultiModalRegistry.map_input>`.
- For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision language model. - For example, convert a :class:`PIL.Image.Image` input to its pixel values for a vision model.
...@@ -25,7 +25,7 @@ Module Contents ...@@ -25,7 +25,7 @@ Module Contents
LLM Engine Inputs LLM Engine Inputs
----------------- -----------------
.. autoclass:: vllm.inputs.LLMInputs .. autoclass:: vllm.inputs.DecoderOnlyInputs
:members: :members:
:show-inheritance: :show-inheritance:
......
...@@ -8,13 +8,16 @@ Multi-Modality ...@@ -8,13 +8,16 @@ Multi-Modality
vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package. vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>` Multi-modal inputs can be passed alongside text and token prompts to :ref:`supported models <supported_vlms>`
via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptInputs`. via the ``multi_modal_data`` field in :class:`vllm.inputs.PromptType`.
Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities Currently, vLLM only has built-in support for image data. You can extend vLLM to process additional modalities
by following :ref:`this guide <adding_multimodal_plugin>`. by following :ref:`this guide <adding_multimodal_plugin>`.
Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here <enabling_multimodal_inputs>`. Looking to add your own multi-modal model? Please follow the instructions listed :ref:`here <enabling_multimodal_inputs>`.
..
TODO: Add usage of --limit-mm-per-prompt when multi-image input is officially supported
Guides Guides
++++++ ++++++
...@@ -42,8 +45,6 @@ Base Classes ...@@ -42,8 +45,6 @@ Base Classes
.. autodata:: vllm.multimodal.NestedTensors .. autodata:: vllm.multimodal.NestedTensors
.. autodata:: vllm.multimodal.BatchedTensors
.. autodata:: vllm.multimodal.BatchedTensorInputs .. autodata:: vllm.multimodal.BatchedTensorInputs
.. autoclass:: vllm.multimodal.MultiModalDataBuiltins .. autoclass:: vllm.multimodal.MultiModalDataBuiltins
......
LLM Inputs LLM Inputs
========== ==========
.. autodata:: vllm.inputs.PromptInputs .. autodata:: vllm.inputs.PromptType
.. autoclass:: vllm.inputs.TextPrompt .. autoclass:: vllm.inputs.TextPrompt
:show-inheritance: :show-inheritance:
......
Profiling vLLM
=================================
We support tracing vLLM workers using the ``torch.profiler`` module. You can enable tracing by setting the ``VLLM_TORCH_PROFILER_DIR`` environment variable to the directory where you want to save the traces: ``VLLM_TORCH_PROFILER_DIR=/mnt/traces/``
The OpenAI server also needs to be started with the ``VLLM_TORCH_PROFILER_DIR`` environment variable set.
When using ``benchmarks/benchmark_serving.py``, you can enable profiling by passing the ``--profile`` flag.
.. warning::
Only enable profiling in a development environment.
Traces can be visualized using https://ui.perfetto.dev/.
.. tip::
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, there is no need to untar the traces; they can be viewed directly.
.. tip::
When the profiler is stopped, it flushes all of the profile trace files to the directory. This takes time: for example, for about 100 requests' worth of data for a Llama 70B model, the flush takes about 10 minutes on an H100.
Set the ``VLLM_RPC_TIMEOUT`` environment variable to a large value before you start the server, for example 30 minutes:
``export VLLM_RPC_TIMEOUT=1800000``
Example commands and usage:
===========================
Offline Inference:
------------------
Refer to `examples/offline_inference_with_profiler.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py>`_ for an example.
OpenAI Server:
--------------
.. code-block:: bash
VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
benchmark_serving.py:
.. code-block:: bash
python benchmarks/benchmark_serving.py --backend vllm --model meta-llama/Meta-Llama-3-70B --dataset-name sharegpt --dataset-path sharegpt.json --profile --num-prompts 2
\ No newline at end of file
...@@ -3,15 +3,17 @@ ...@@ -3,15 +3,17 @@
Installation with ROCm Installation with ROCm
====================== ======================
vLLM supports AMD GPUs with ROCm 6.1. vLLM supports AMD GPUs with ROCm 6.2.
Requirements Requirements
------------ ------------
* OS: Linux * OS: Linux
* Python: 3.8 -- 3.11 * Python: 3.9 -- 3.12
* GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100) * GPU: MI200s (gfx90a), MI300 (gfx942), Radeon RX 7900 series (gfx1100)
* ROCm 6.1 * ROCm 6.2
Note: PyTorch 2.5+/ROCm 6.2 dropped support for Python 3.8.
Installation options: Installation options:
...@@ -26,8 +28,18 @@ Option 1: Build from source with docker (recommended) ...@@ -26,8 +28,18 @@ Option 1: Build from source with docker (recommended)
You can build and install vLLM from source. You can build and install vLLM from source.
First, build a docker image from `Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ and launch a docker container from the image. First, build a docker image from `Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ and launch a docker container from the image.
It is important to kick off the docker build using BuildKit. Either set ``DOCKER_BUILDKIT=1`` as an environment variable when calling the ``docker build`` command, or set up BuildKit in the docker daemon configuration ``/etc/docker/daemon.json`` as follows and restart the daemon:
.. code-block:: console
{
"features": {
"buildkit": true
}
}
`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.1 by default, but also supports ROCm 5.7 and 6.0 in older vLLM branches.
`Dockerfile.rocm <https://github.com/vllm-project/vllm/blob/main/Dockerfile.rocm>`_ uses ROCm 6.2 by default, but also supports ROCm 5.7, 6.0 and 6.1 in older vLLM branches.
It provides flexibility to customize the build of docker image using the following arguments: It provides flexibility to customize the build of docker image using the following arguments:
* `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image. * `BASE_IMAGE`: specifies the base image used when running ``docker build``, specifically the PyTorch on ROCm base image.
...@@ -39,13 +51,13 @@ It provides flexibility to customize the build of docker image using the followi ...@@ -39,13 +51,13 @@ It provides flexibility to customize the build of docker image using the followi
Their values can be passed in when running ``docker build`` with ``--build-arg`` options. Their values can be passed in when running ``docker build`` with ``--build-arg`` options.
To build vllm on ROCm 6.1 for MI200 and MI300 series, you can use the default: To build vllm on ROCm 6.2 for MI200 and MI300 series, you can use the default:
.. code-block:: console .. code-block:: console
$ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm . $ DOCKER_BUILDKIT=1 docker build -f Dockerfile.rocm -t vllm-rocm .
To build vllm on ROCm 6.1 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below: To build vllm on ROCm 6.2 for Radeon RX7900 series (gfx1100), you should specify ``BUILD_FA`` as below:
.. code-block:: console .. code-block:: console
...@@ -79,37 +91,55 @@ Option 2: Build from source ...@@ -79,37 +91,55 @@ Option 2: Build from source
- `ROCm <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>`_ - `ROCm <https://rocm.docs.amd.com/en/latest/deploy/linux/index.html>`_
- `PyTorch <https://pytorch.org/>`_ - `PyTorch <https://pytorch.org/>`_
- `hipBLAS <https://rocm.docs.amd.com/projects/hipBLAS/en/latest/install.html>`_
For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.1.2_ubuntu20.04_py3.9_pytorch_staging`, `rocm/pytorch-nightly`. For installing PyTorch, you can start from a fresh docker image, e.g, `rocm/pytorch:rocm6.2_ubuntu20.04_py3.9_pytorch_release_2.3.0`, `rocm/pytorch-nightly`.
Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guild in PyTorch `Getting Started <https://pytorch.org/get-started/locally/>`_ Alternatively, you can install PyTorch using PyTorch wheels. You can check PyTorch installation guide in PyTorch `Getting Started <https://pytorch.org/get-started/locally/>`_
1. Install `Triton flash attention for ROCm <https://github.com/ROCm/triton>`_ 1. Install `Triton flash attention for ROCm <https://github.com/ROCm/triton>`_
Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton <https://github.com/ROCm/triton/blob/triton-mlir/README.md>`_ Install ROCm's Triton flash attention (the default triton-mlir branch) following the instructions from `ROCm/triton <https://github.com/ROCm/triton/blob/triton-mlir/README.md>`_
.. code-block:: console
$ python3 -m pip install ninja cmake wheel pybind11
$ pip uninstall -y triton
$ git clone https://github.com/OpenAI/triton.git
$ cd triton
$ git checkout e192dba
$ cd python
$ pip3 install .
$ cd ../..
.. note::
- If you see an HTTP issue related to downloading packages while building triton, please try again, as the HTTP error is intermittent.
2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm <https://github.com/ROCm/flash-attention/tree/ck_tile>`_ 2. Optionally, if you choose to use CK flash attention, you can install `flash attention for ROCm <https://github.com/ROCm/flash-attention/tree/ck_tile>`_
Install ROCm's flash attention (v2.5.9.post1) following the instructions from `ROCm/flash-attention <https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support>`_ Install ROCm's flash attention (v2.5.9.post1) following the instructions from `ROCm/flash-attention <https://github.com/ROCm/flash-attention/tree/ck_tile#amd-gpurocm-support>`_
Alternatively, wheels intended for vLLM use can be accessed under the releases. Alternatively, wheels intended for vLLM use can be accessed under the releases.
.. note:: For example, for ROCm 6.2, suppose your gfx arch is `gfx90a`.
- You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) Note to get your gfx architecture, run `rocminfo |grep gfx`.
3. Build vLLM.
.. code-block:: console .. code-block:: console
$ cd vllm $ git clone https://github.com/ROCm/flash-attention.git
$ pip install -U -r requirements-rocm.txt $ cd flash-attention
$ python setup.py develop # This may take 5-10 minutes. Currently, `pip install .`` does not work for ROCm installation $ git checkout 3cea2fb
$ git submodule update --init
$ GPU_ARCHS="gfx90a" python3 setup.py install
$ cd ..
.. note::
- You might need to downgrade the "ninja" version to 1.10 if it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
.. tip:: 3. Build vLLM.
For example, vLLM v0.5.3 on ROCM 6.1 can be built with the following steps: For example, vLLM on ROCM 6.2 can be built with the following steps:
.. code-block:: console .. code-block:: console
...@@ -117,7 +147,7 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases. ...@@ -117,7 +147,7 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases.
$ # Install PyTorch $ # Install PyTorch
$ pip uninstall torch -y $ pip uninstall torch -y
$ pip install --no-cache-dir --pre torch==2.5.0.dev20240726 --index-url https://download.pytorch.org/whl/nightly/rocm6.1 $ pip install --no-cache-dir --pre torch==2.6.0.dev20240918 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
$ # Build & install AMD SMI $ # Build & install AMD SMI
$ pip install /opt/rocm/share/amd_smi $ pip install /opt/rocm/share/amd_smi
...@@ -127,15 +157,14 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases. ...@@ -127,15 +157,14 @@ Alternatively, wheels intended for vLLM use can be accessed under the releases.
$ pip install "numpy<2" $ pip install "numpy<2"
$ pip install -r requirements-rocm.txt $ pip install -r requirements-rocm.txt
$ # Apply the patch to ROCM 6.1 (requires root permission)
$ wget -N https://github.com/ROCm/vllm/raw/fa78403/rocm_patch/libamdhip64.so.6 -P /opt/rocm/lib
$ rm -f "$(python3 -c 'import torch; print(torch.__path__[0])')"/lib/libamdhip64.so*
$ # Build vLLM for MI210/MI250/MI300. $ # Build vLLM for MI210/MI250/MI300.
$ export PYTORCH_ROCM_ARCH="gfx90a;gfx942" $ export PYTORCH_ROCM_ARCH="gfx90a;gfx942"
$ python3 setup.py develop $ python3 setup.py develop
This may take 5-10 minutes. Currently, :code:`pip install .` does not work for ROCm installation.
.. tip:: .. tip::
- Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers. - Triton flash attention is used by default. For benchmarking purposes, it is recommended to run a warm up step before collecting perf numbers.
......
...@@ -56,7 +56,7 @@ Build from source ...@@ -56,7 +56,7 @@ Build from source
.. code-block:: console .. code-block:: console
$ pip install --upgrade pip $ pip install --upgrade pip
$ pip install wheel packaging ninja "setuptools>=49.4.0" numpy $ pip install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy
$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
- Finally, build and install vLLM CPU backend: - Finally, build and install vLLM CPU backend:
......
.. _debugging: .. _debugging:
===============
Debugging Tips Debugging Tips
=============== ===============
Debugging hang/crash issues This document outlines some debugging strategies you can consider. If you think you've discovered a bug, please `search existing issues <https://github.com/vllm-project/vllm/issues?q=is%3Aissue>`_ first to see if it has already been reported. If not, please `file a new issue <https://github.com/vllm-project/vllm/issues/new/choose>`_, providing as much relevant information as possible.
---------------------------
.. note::
Once you've debugged a problem, remember to turn off any debugging environment variables defined, or simply start a new shell to avoid being affected by lingering debugging settings. Otherwise, the system might be slow with debugging functionalities left activated.
When an vLLM instance hangs or crashes, it is very difficult to debug the issue. But wait a minute, it is also possible that vLLM is doing something that indeed takes a long time: Hangs downloading a model
----------------------------------------
If the model isn't already downloaded to disk, vLLM will download it from the internet, which can take time depending on your internet connection.
It's recommended to download the model first using the `huggingface-cli <https://huggingface.co/docs/huggingface_hub/en/guides/cli>`_ and then pass the local path of the model to vLLM. This way, you can isolate the issue.
- **Downloading a model**: Do you have the model already downloaded in your disk? If not, vLLM will download the model from the internet, which can take a long time. Be sure to check the internet connection. It would be better to download the model first using `huggingface-cli <https://huggingface.co/docs/huggingface_hub/en/guides/cli>`_ and then use the local path to the model. This way, you can isolate the issue. Hangs loading a model from disk
- **Loading the model from disk**: If the model is large, it can take a long time to load the model from disk. Please take care of the location you store the model. Some clusters have shared filesystems across nodes, e.g. distributed filesystem or network filesystem, which can be slow. It would be better to store the model in a local disk. In addition, please also watch the CPU memory usage. When the model is too large, it might take much CPU memory, which can slow down the operating system because it needs to frequently swap memory between the disk and the memory. ----------------------------------------
- **Tensor parallel inference**: If the model is too large to fit in a single GPU, you might want to use tensor parallelism to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `the provided script <https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html>`_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism. If the model is large, it can take a long time to load it from disk. Pay attention to where you store the model. Some clusters have shared filesystems across nodes, e.g. a distributed filesystem or a network filesystem, which can be slow.
It'd be better to store the model on a local disk. Additionally, have a look at the CPU memory usage: when the model is too large, it might take a lot of CPU memory, slowing down the operating system because it needs to frequently swap between disk and memory.
If you have already taken care of the above issues, but the vLLM instance still hangs, with CPU and GPU utilization at near zero, it is likely that the vLLM instance is stuck somewhere. Here are some tips to help debug the issue: Model is too large
----------------------------------------
If the model is too large to fit in a single GPU, you might want to `consider tensor parallelism <https://docs.vllm.ai/en/latest/serving/distributed_serving.html#distributed-inference-and-serving>`_ to split the model across multiple GPUs. In that case, every process will read the whole model and split it into chunks, which makes the disk reading time even longer (proportional to the size of tensor parallelism). You can convert the model checkpoint to a sharded checkpoint using `this example <https://docs.vllm.ai/en/latest/getting_started/examples/save_sharded_state.html>`_ . The conversion process might take some time, but later you can load the sharded checkpoint much faster. The model loading time should remain constant regardless of the size of tensor parallelism.
- Set the environment variable ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging. Enable more logging
- Set the environment variable ``export CUDA_LAUNCH_BLOCKING=1`` to know exactly which CUDA kernel is causing the trouble. ----------------------------------------
- Set the environment variable ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL. If other strategies don't solve the problem, it's likely that the vLLM instance is stuck somewhere. You can use the following environment variables to help debug the issue:
- Set the environment variable ``export VLLM_TRACE_FUNCTION=1``. All the function calls in vLLM will be recorded. Inspect these log files, and tell which function crashes or hangs.
With more logging, hopefully you can find the root cause of the issue. - ``export VLLM_LOGGING_LEVEL=DEBUG`` to turn on more logging.
- ``export CUDA_LAUNCH_BLOCKING=1`` to identify which CUDA kernel is causing the problem.
- ``export NCCL_DEBUG=TRACE`` to turn on more logging for NCCL.
- ``export VLLM_TRACE_FUNCTION=1`` to record all function calls for inspection in the log files to tell which function crashes or hangs.
If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error. Incorrect network setup
----------------------------------------
The vLLM instance cannot get the correct IP address if you have a complicated network config. You can find a log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl`` and the IP address should be the correct one.
If it's not, override the IP address using the environment variable ``export VLLM_HOST_IP=<your_ip_address>``.
Here are some common issues that can cause hangs: You might also need to set ``export NCCL_SOCKET_IFNAME=<your_network_interface>`` and ``export GLOO_SOCKET_IFNAME=<your_network_interface>`` to specify the network interface for the IP address.
- **Incorrect network setup**: The vLLM instance cannot get the correct IP address if you have complicated network config. You can find the log such as ``DEBUG 06-10 21:32:17 parallel_state.py:88] world_size=8 rank=0 local_rank=0 distributed_init_method=tcp://xxx.xxx.xxx.xxx:54641 backend=nccl``. The IP address should be the correct one. If not, override the IP address by setting the environment variable ``export VLLM_HOST_IP=your_ip_address``. You might also need to set ``export NCCL_SOCKET_IFNAME=your_network_interface`` and ``export GLOO_SOCKET_IFNAME=your_network_interface`` to specify the network interface for the IP address. Error near ``self.graph.replay()``
- **Incorrect hardware/driver**: GPU/CPU communication cannot be established. You can run the following sanity check script to see if the GPU/CPU communication is working correctly. ----------------------------------------
If vLLM crashes and the error trace captures it somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a CUDA error inside CUDAGraph.
To identify the particular CUDA operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class to disable the CUDAGraph optimization and isolate the exact CUDA operation that causes the error.
Incorrect hardware/driver
----------------------------------------
If GPU/CPU communication cannot be established, you can use the following Python script and follow the instructions below to confirm whether the GPU/CPU communication is working correctly.
.. code-block:: python .. code-block:: python
# Test PyTorch NCCL
import torch import torch
import torch.distributed as dist import torch.distributed as dist
dist.init_process_group(backend="nccl") dist.init_process_group(backend="nccl")
local_rank = dist.get_rank() % torch.cuda.device_count() local_rank = dist.get_rank() % torch.cuda.device_count()
data = torch.FloatTensor([1,] * 128).to(f"cuda:{local_rank}") torch.cuda.set_device(local_rank)
data = torch.FloatTensor([1,] * 128).to("cuda")
dist.all_reduce(data, op=dist.ReduceOp.SUM) dist.all_reduce(data, op=dist.ReduceOp.SUM)
torch.cuda.synchronize() torch.cuda.synchronize()
value = data.mean().item() value = data.mean().item()
world_size = dist.get_world_size() world_size = dist.get_world_size()
assert value == world_size, f"Expected {world_size}, got {value}" assert value == world_size, f"Expected {world_size}, got {value}"
print("PyTorch NCCL is successful!")
# Test PyTorch GLOO
gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo") gloo_group = dist.new_group(ranks=list(range(world_size)), backend="gloo")
cpu_data = torch.FloatTensor([1,] * 128) cpu_data = torch.FloatTensor([1,] * 128)
dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group) dist.all_reduce(cpu_data, op=dist.ReduceOp.SUM, group=gloo_group)
value = cpu_data.mean().item() value = cpu_data.mean().item()
assert value == world_size, f"Expected {world_size}, got {value}" assert value == world_size, f"Expected {world_size}, got {value}"
print("sanity check is successful!") print("PyTorch GLOO is successful!")
# Test vLLM NCCL, with cuda graph
from vllm.distributed.device_communicators.pynccl import PyNcclCommunicator
pynccl = PyNcclCommunicator(group=gloo_group, device=local_rank)
pynccl.disabled = False
s = torch.cuda.Stream()
with torch.cuda.stream(s):
data.fill_(1)
pynccl.all_reduce(data, stream=s)
value = data.mean().item()
assert value == world_size, f"Expected {world_size}, got {value}"
print("vLLM NCCL is successful!")
g = torch.cuda.CUDAGraph()
with torch.cuda.graph(cuda_graph=g, stream=s):
pynccl.all_reduce(data, stream=torch.cuda.current_stream())
data.fill_(1)
g.replay()
torch.cuda.current_stream().synchronize()
value = data.mean().item()
assert value == world_size, f"Expected {world_size}, got {value}"
print("vLLM NCCL with cuda graph is successful!")
dist.destroy_process_group(gloo_group)
dist.destroy_process_group()
If you are testing with a single node, adjust ``--nproc-per-node`` to the number of GPUs you want to use:
.. code-block:: shell
NCCL_DEBUG=TRACE torchrun --nproc-per-node=<number-of-GPUs> test.py
If you are testing with multiple nodes, adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup and set ``MASTER_ADDR`` to the correct IP address of the master node, reachable from all nodes. Then, run:
.. tip:: .. code-block:: shell
Save the script as ``test.py``. NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py
If you are testing in a single-node, run it with ``NCCL_DEBUG=TRACE torchrun --nproc-per-node=8 test.py``, adjust ``--nproc-per-node`` to the number of GPUs you want to use.
If you are testing with multi-nodes, run it with ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=$MASTER_ADDR test.py``. Adjust ``--nproc-per-node`` and ``--nnodes`` according to your setup. Make sure ``MASTER_ADDR``:
- is the correct IP address of the master node
- is reachable from all nodes
- is set before running the script.
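If the script runs successfully, you should see the message ``sanity check is successful!``. If the script runs successfully, you should see the messages ``PyTorch NCCL is successful!``, ``PyTorch GLOO is successful!``, ``vLLM NCCL is successful!`` and ``vLLM NCCL with cuda graph is successful!``.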
If the problem persists, feel free to `open an issue on GitHub <https://github.com/vllm-project/vllm/issues/new/choose>`_, with a detailed description of the issue, your environment, and the logs. .. note::
Some known issues: A multi-node environment is more complicated than a single-node one. If you see errors such as ``torch.distributed.DistNetworkError``, it is likely that the network/DNS setup is incorrect. In that case, you can manually assign node rank and specify the IP via command line arguments:
- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq <https://github.com/zeromq/pyzmq/issues/2000>`_ , which can cause hangs at a low probability (once in about 20 times, depending on the machine configuration). The solution is to upgrade to the latest version of ``vllm`` to include the `fix <https://github.com/vllm-project/vllm/pull/6759>`_ . - In the first node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 0 --master_addr $MASTER_ADDR test.py``.
- In the second node, run ``NCCL_DEBUG=TRACE torchrun --nnodes 2 --nproc-per-node=2 --node-rank 1 --master_addr $MASTER_ADDR test.py``.
.. warning:: Adjust ``--nproc-per-node``, ``--nnodes``, and ``--node-rank`` according to your setup, being sure to execute different commands (with different ``--node-rank``) on different nodes.
After you find the root cause and solve the issue, remember to turn off all the debugging environment variables defined above, or simply start a new shell to avoid being affected by the debugging settings. If you don't do this, the system might be slow because many debugging functionalities are turned on. Known Issues
----------------------------------------
- In ``v0.5.2``, ``v0.5.3``, and ``v0.5.3.post1``, there is a bug caused by `zmq <https://github.com/zeromq/pyzmq/issues/2000>`_ , which can occasionally cause vLLM to hang depending on the machine configuration. The solution is to upgrade to the latest version of ``vllm`` to include the `fix <https://github.com/vllm-project/vllm/pull/6759>`_.
.. _installation: .. _installation:
============
Installation Installation
============ ============
vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries. vLLM is a Python library that also contains pre-compiled C++ and CUDA (12.1) binaries.
Requirements Requirements
------------ ===========================
* OS: Linux * OS: Linux
* Python: 3.8 -- 3.12 * Python: 3.8 -- 3.12
* GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.) * GPU: compute capability 7.0 or higher (e.g., V100, T4, RTX20xx, A100, L4, H100, etc.)
Install with pip Install released versions
---------------- ===========================
You can install vLLM using pip: You can install vLLM using pip:
...@@ -26,6 +27,10 @@ You can install vLLM using pip: ...@@ -26,6 +27,10 @@ You can install vLLM using pip:
$ # Install vLLM with CUDA 12.1. $ # Install vLLM with CUDA 12.1.
$ pip install vllm $ pip install vllm
.. note::
Although we recommend using ``conda`` to create and manage Python environments, it is highly recommended to use ``pip`` to install vLLM. This is because ``pip`` can install ``torch`` with separate library packages like ``NCCL``, while ``conda`` installs ``torch`` with statically linked ``NCCL``. This can cause issues when vLLM tries to use ``NCCL``. See `this issue <https://github.com/vllm-project/vllm/issues/8420>`_ for more details.
.. note:: .. note::
As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default. As of now, vLLM's binaries are compiled with CUDA 12.1 and public PyTorch release versions by default.
...@@ -34,7 +39,7 @@ You can install vLLM using pip: ...@@ -34,7 +39,7 @@ You can install vLLM using pip:
.. code-block:: console .. code-block:: console
$ # Install vLLM with CUDA 11.8. $ # Install vLLM with CUDA 11.8.
$ export VLLM_VERSION=0.4.0 $ export VLLM_VERSION=0.6.1.post1
$ export PYTHON_VERSION=310 $ export PYTHON_VERSION=310
$ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118 $ pip install https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}+cu118-cp${PYTHON_VERSION}-cp${PYTHON_VERSION}-manylinux1_x86_64.whl --extra-index-url https://download.pytorch.org/whl/cu118
...@@ -42,63 +47,173 @@ You can install vLLM using pip: ...@@ -42,63 +47,173 @@ You can install vLLM using pip:
Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions. Therefore, it is recommended to install vLLM with a **fresh new** conda environment. If either you have a different CUDA version or you want to use an existing PyTorch installation, you need to build vLLM from source. See below for instructions.
.. note::
vLLM also publishes a subset of wheels (Python 3.10, 3.11 with CUDA 12) for every commit since v0.5.3. You can download them with the following command: .. _install-the-latest-code:
.. code-block:: console Install the latest code
=========================
LLM inference is a fast-evolving field, and the latest code may contain bug fixes, performance improvements, and new features that are not released yet. To allow users to try the latest code without waiting for the next release, vLLM provides wheels for Linux running on the x86 platform with CUDA 12 for every commit since v0.5.3. You can download and install the latest one with the following command:
.. code-block:: console
$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
If you want to access the wheels for previous commits, you can specify the commit hash in the URL:
.. code-block:: console
$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
Note that the wheels are built with the Python 3.8 ABI (see `PEP 425 <https://peps.python.org/pep-0425/>`_ for more details about ABI tags), so **they are compatible with Python 3.8 and later**. The version string in the wheel file name (``1.0.0.dev``) is just a placeholder to have a unified URL for the wheels. The actual versions of wheels are contained in the wheel metadata.
Another way to access the latest code is to use the docker images:
.. code-block:: console
$ export VLLM_COMMIT=33f460b17a54acb3b6cc0b03f4a17876cff5eafd # use full commit hash from the main branch
$ docker pull public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:${VLLM_COMMIT}
$ export VLLM_VERSION=0.5.4 # vLLM's main branch version is currently set to latest released tag These docker images are used for CI and testing only, and they are not intended for production use. They will be expired after several days.
$ pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl
$ # You can also access a specific commit
$ # export VLLM_COMMIT=...
$ # pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/${VLLM_COMMIT}/vllm-${VLLM_VERSION}-cp38-abi3-manylinux1_x86_64.whl
Latest code can contain bugs and may not be stable. Please use it with caution.
.. _build_from_source: .. _build_from_source:
Build from source Build from source
----------------- ==================
You can also build and install vLLM from source: .. _python-only-build:
Python-only build (without compilation)
----------------------------------------
If you only need to change Python code, you can simply build vLLM without compilation.
The first step is to install the latest vLLM wheel:
.. code-block:: console
pip install https://vllm-wheels.s3.us-west-2.amazonaws.com/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl
You can find more information about vLLM's wheels `above <#install-the-latest-code>`_.
After verifying that the installation is successful, you can use `the following script <https://github.com/vllm-project/vllm/blob/main/python_only_dev.py>`_:
.. code-block:: console .. code-block:: console
$ git clone https://github.com/vllm-project/vllm.git $ git clone https://github.com/vllm-project/vllm.git
$ cd vllm $ cd vllm
$ pip install -e . # This may take 5-10 minutes. $ python python_only_dev.py
.. tip:: The script will:
Building from source requires quite a lot compilation. If you are building from source for multiple times, it is beneficial to cache the compilation results. For example, you can install `ccache <https://github.com/ccache/ccache>`_ via either `conda install ccache` or `apt install ccache` . As long as `which ccache` command can find the `ccache` binary, it will be used automatically by the build system. After the first build, the subsequent builds will be much faster. * Find the installed vLLM package in the current environment.
* Copy built files to the current directory.
* Rename the installed vLLM package.
* Symbolically link the current directory to the installed vLLM package.
.. tip:: Now, you can edit the Python code in the current directory, and the changes will be reflected when you run vLLM.
To avoid your system being overloaded, you can limit the number of compilation jobs
to be run simultaneously, via the environment variable `MAX_JOBS`. For example:
.. code-block:: console Once you have finished editing or want to install another vLLM wheel, you should exit the development environment using `the same script <https://github.com/vllm-project/vllm/blob/main/python_only_dev.py>`_ with the ``--quit-dev`` (or ``-q`` for short) flag:
.. code-block:: console
$ python python_only_dev.py --quit-dev
The script with ``--quit-dev`` flag will:
* Remove the symbolic link from the current directory to the vLLM package.
* Restore the original vLLM package from the backup.
If you update the vLLM wheel and want to rebuild from the source and make further edits, you will need to repeat `all the steps above <#python-only-build>`_.
.. note::
$ export MAX_JOBS=6 There is a possibility that your source code may have a different commit ID compared to the latest vLLM wheel, which could potentially lead to unknown errors.
$ pip install -e . It is recommended to use the same commit ID for the source code as the vLLM wheel you have installed. Please refer to `the above section <#install-the-latest-code>`_ for instructions on how to install a specified wheel.
Full build (with compilation)
---------------------------------
If you want to modify C++ or CUDA code, you'll need to build vLLM from source. This can take several minutes:
.. code-block:: console
$ git clone https://github.com/vllm-project/vllm.git
$ cd vllm
$ pip install -e .
.. tip:: .. tip::
If you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
.. code-block:: console Building from source requires a lot of compilation. If you are building from source repeatedly, it's more efficient to cache the compilation results.
For example, you can install `ccache <https://github.com/ccache/ccache>`_ using ``conda install ccache`` or ``apt install ccache`` .
As long as ``which ccache`` command can find the ``ccache`` binary, it will be used automatically by the build system. After the first build, subsequent builds will be much faster.
$ # Use `--ipc=host` to make sure the shared memory is large enough.
$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website <https://developer.nvidia.com/cuda-toolkit-archive>`_. After installation, set the environment variable `CUDA_HOME` to the installation path of CUDA Toolkit, and make sure that the `nvcc` compiler is in your `PATH`, e.g.: Use an existing PyTorch installation
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
There are scenarios where the PyTorch dependency cannot be easily installed via pip, e.g.:
.. code-block:: console * Building vLLM with PyTorch nightly or a custom PyTorch build.
* Building vLLM with aarch64 and CUDA (GH200), where the PyTorch wheels are not available on PyPI. Currently, only the PyTorch nightly has wheels for aarch64 with CUDA. You can run ``pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124`` to `install PyTorch nightly <https://pytorch.org/get-started/locally/>`_, and then build vLLM on top of it.
$ export CUDA_HOME=/usr/local/cuda To build vLLM using an existing PyTorch installation:
$ export PATH="${CUDA_HOME}/bin:$PATH"
Here is a sanity check to verify that the CUDA Toolkit is correctly installed: .. code-block:: console
.. code-block:: console $ git clone https://github.com/vllm-project/vllm.git
$ cd vllm
$ python use_existing_torch.py
$ pip install -r requirements-build.txt
$ pip install -e . --no-build-isolation
Troubleshooting
~~~~~~~~~~~~~~~~~
To avoid your system being overloaded, you can limit the number of compilation jobs
to be run simultaneously, via the environment variable ``MAX_JOBS``. For example:
.. code-block:: console
$ export MAX_JOBS=6
$ pip install -e .
This is especially useful when you are building on less powerful machines. For example, when you use WSL it only `assigns 50% of the total memory by default <https://learn.microsoft.com/en-us/windows/wsl/wsl-config#main-wsl-settings>`_, so using ``export MAX_JOBS=1`` can avoid compiling multiple files simultaneously and running out of memory.
A side effect is a much slower build process.
Additionally, if you have trouble building vLLM, we recommend using the NVIDIA PyTorch Docker image.
.. code-block:: console
$ # Use `--ipc=host` to make sure the shared memory is large enough.
$ docker run --gpus all -it --rm --ipc=host nvcr.io/nvidia/pytorch:23.10-py3
If you don't want to use docker, it is recommended to have a full installation of CUDA Toolkit. You can download and install it from `the official website <https://developer.nvidia.com/cuda-toolkit-archive>`_. After installation, set the environment variable ``CUDA_HOME`` to the installation path of CUDA Toolkit, and make sure that the ``nvcc`` compiler is in your ``PATH``, e.g.:
.. code-block:: console
$ export CUDA_HOME=/usr/local/cuda
$ export PATH="${CUDA_HOME}/bin:$PATH"
Here is a sanity check to verify that the CUDA Toolkit is correctly installed:
.. code-block:: console
$ nvcc --version # verify that nvcc is in your PATH
$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME
Unsupported OS build
----------------------
vLLM can fully run only on Linux, but for development purposes you can still build it on other systems (for example, macOS), allowing for imports and a more convenient development environment. The binaries will not be compiled and won't work on non-Linux systems.
Simply set the ``VLLM_TARGET_DEVICE`` environment variable to ``empty`` before installing:
.. code-block:: console
$ nvcc --version # verify that nvcc is in your PATH $ export VLLM_TARGET_DEVICE=empty
$ ${CUDA_HOME}/bin/nvcc --version # verify that nvcc is in your CUDA_HOME $ pip install -e .
...@@ -3,8 +3,8 @@ ...@@ -3,8 +3,8 @@
Installation with Neuron Installation with Neuron
======================== ========================
vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK. vLLM 0.3.3 onwards supports model inferencing and serving on AWS Trainium/Inferentia with Neuron SDK with continuous batching.
At the moment Paged Attention is not supported in Neuron SDK, but naive continuous batching is supported in transformers-neuronx. Paged Attention and Chunked Prefill are currently in development and will be available soon.
Data types currently supported in Neuron SDK are FP16 and BF16. Data types currently supported in Neuron SDK are FP16 and BF16.
Requirements Requirements
...@@ -27,6 +27,10 @@ Installation steps: ...@@ -27,6 +27,10 @@ Installation steps:
.. _build_from_source_neuron: .. _build_from_source_neuron:
.. note::
The currently supported version of PyTorch for Neuron installs ``triton`` version ``2.1.0``. This is incompatible with vLLM >= 0.5.3. You may see an error ``cannot import name 'default_dump_dir...``. To work around this, run ``pip install --upgrade triton==3.0.0`` after installing the vLLM wheel.
Build from source Build from source
----------------- -----------------
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
Installation with OpenVINO Installation with OpenVINO
========================== ==========================
vLLM powered by OpenVINO supports all LLM models from :doc:`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support. OpenVINO vLLM backend supports the following advanced vLLM features: vLLM powered by OpenVINO supports all LLM models from :doc:`vLLM supported models list <../models/supported_models>` and can perform optimal model serving on all x86-64 CPUs with, at least, AVX2 support, as well as on both integrated and discrete Intel® GPUs (`the list of supported GPUs <https://docs.openvino.ai/2024/about-openvino/release-notes-openvino/system-requirements.html#gpu>`_). OpenVINO vLLM backend supports the following advanced vLLM features:
- Prefix caching (``--enable-prefix-caching``) - Prefix caching (``--enable-prefix-caching``)
- Chunked prefill (``--enable-chunked-prefill``) - Chunked prefill (``--enable-chunked-prefill``)
...@@ -53,34 +53,57 @@ Install from source ...@@ -53,34 +53,57 @@ Install from source
$ pip install --upgrade pip $ pip install --upgrade pip
$ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu $ pip install -r requirements-build.txt --extra-index-url https://download.pytorch.org/whl/cpu
- Finally, install vLLM with OpenVINO backend: - Finally, install vLLM with OpenVINO backend:
.. code-block:: console .. code-block:: console
$ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu https://storage.openvinotoolkit.org/simple/wheels/pre-release" VLLM_TARGET_DEVICE=openvino python -m pip install -v . $ PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu" VLLM_TARGET_DEVICE=openvino python -m pip install -v .
- [Optional] To use vLLM OpenVINO backend with a GPU device, ensure your system is properly set up. Follow the instructions provided here: `https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html <https://docs.openvino.ai/2024/get-started/configurations/configurations-intel-gpu.html>`_.
.. _openvino_backend_performance_tips: .. _openvino_backend_performance_tips:
Performance tips Performance tips
---------------- ----------------
vLLM OpenVINO backend uses the following environment variables to control behavior: vLLM OpenVINO backend environment variables
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- ``VLLM_OPENVINO_DEVICE`` to specify which device to utilize for inference. If there are multiple GPUs in the system, additional indexes can be used to choose the proper one (e.g., ``VLLM_OPENVINO_DEVICE=GPU.1``). If the value is not specified, the CPU device is used by default.
- ``VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON`` to enable U8 weights compression during the model loading stage. By default, compression is turned off. You can also export the model with different compression techniques using ``optimum-cli`` and pass the exported folder as ``<model_id>``.
CPU performance tips
~~~~~~~~~~~~~~~~~~~~
CPU uses the following environment variables to control behavior:
- ``VLLM_OPENVINO_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users. - ``VLLM_OPENVINO_KVCACHE_SPACE`` to specify the KV Cache size (e.g, ``VLLM_OPENVINO_KVCACHE_SPACE=40`` means 40 GB space for KV cache), larger setting will allow vLLM running more requests in parallel. This parameter should be set based on the hardware configuration and memory management pattern of users.
- ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`` to control KV cache precision. By default, FP16 / BF16 is used depending on platform. - ``VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8`` to control KV cache precision. By default, FP16 / BF16 is used depending on platform.
- ``VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON`` to enable U8 weights compression during model loading stage. By default, compression is turned off.
To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (``--enable-chunked-prefill``). Based on the experiments, the recommended batch size is ``256`` (``--max-num-batched-tokens``) To enable better TPOT / TTFT latency, you can use vLLM's chunked prefill feature (``--enable-chunked-prefill``). Based on the experiments, the recommended batch size is ``256`` (``--max-num-batched-tokens``)
OpenVINO best known configuration is: OpenVINO best known configuration for CPU is:
.. code-block:: console .. code-block:: console
$ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \ $ VLLM_OPENVINO_KVCACHE_SPACE=100 VLLM_OPENVINO_CPU_KV_CACHE_PRECISION=u8 VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256 python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json --enable-chunked-prefill --max-num-batched-tokens 256
GPU performance tips
~~~~~~~~~~~~~~~~~~~~
The GPU device implements logic for automatic detection of available GPU memory and, by default, tries to reserve as much memory as possible for the KV cache (taking into account the ``gpu_memory_utilization`` option). However, this behavior can be overridden by explicitly specifying the desired amount of memory for the KV cache using the ``VLLM_OPENVINO_KVCACHE_SPACE`` environment variable (e.g., ``VLLM_OPENVINO_KVCACHE_SPACE=8`` means 8 GB of space for the KV cache).
Currently, the best performance using GPU can be achieved with the default vLLM execution parameters for models with quantized weights (8- and 4-bit integer data types are supported) and ``preemption-mode=swap``.
OpenVINO best known configuration for GPU is:
.. code-block:: console
$ VLLM_OPENVINO_DEVICE=GPU VLLM_OPENVINO_ENABLE_QUANTIZED_WEIGHTS=ON \
python3 vllm/benchmarks/benchmark_throughput.py --model meta-llama/Llama-2-7b-chat-hf --dataset vllm/benchmarks/ShareGPT_V3_unfiltered_cleaned_split.json
.. _openvino_backend_limitations: .. _openvino_backend_limitations:
Limitations Limitations
...@@ -91,5 +114,3 @@ Limitations ...@@ -91,5 +114,3 @@ Limitations
- Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration. - Only LLM models are currently supported. LLaVa and encoder-decoder models are not currently enabled in vLLM OpenVINO integration.
- Tensor and pipeline parallelism are not currently enabled in vLLM integration. - Tensor and pipeline parallelism are not currently enabled in vLLM integration.
- Speculative sampling is not tested within vLLM integration.
...@@ -24,7 +24,9 @@ Offline Batched Inference ...@@ -24,7 +24,9 @@ Offline Batched Inference
We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts. We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts.
Import ``LLM`` and ``SamplingParams`` from vLLM. The ``LLM`` class is the main class for running offline inference with vLLM engine. The ``SamplingParams`` class specifies the parameters for the sampling process. Import :class:`~vllm.LLM` and :class:`~vllm.SamplingParams` from vLLM.
The :class:`~vllm.LLM` class is the main class for running offline inference with vLLM engine.
The :class:`~vllm.SamplingParams` class specifies the parameters for the sampling process.
.. code-block:: python .. code-block:: python
...@@ -42,7 +44,7 @@ Define the list of input prompts and the sampling parameters for generation. The ...@@ -42,7 +44,7 @@ Define the list of input prompts and the sampling parameters for generation. The
] ]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95) sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
Initialize vLLM's engine for offline inference with the ``LLM`` class and the `OPT-125M model <https://arxiv.org/abs/2205.01068>`_. The list of supported models can be found at :ref:`supported models <supported_models>`. Initialize vLLM's engine for offline inference with the :class:`~vllm.LLM` class and the `OPT-125M model <https://arxiv.org/abs/2205.01068>`_. The list of supported models can be found at :ref:`supported models <supported_models>`.
.. code-block:: python .. code-block:: python
......
...@@ -8,7 +8,7 @@ vLLM supports Google Cloud TPUs using PyTorch XLA. ...@@ -8,7 +8,7 @@ vLLM supports Google Cloud TPUs using PyTorch XLA.
Requirements Requirements
------------ ------------
* Google Cloud TPU VM (single host) * Google Cloud TPU VM (single & multi host)
* TPU versions: v5e, v5p, v4 * TPU versions: v5e, v5p, v4
* Python: 3.10 * Python: 3.10
...@@ -56,16 +56,17 @@ First, install the dependencies: ...@@ -56,16 +56,17 @@ First, install the dependencies:
$ pip uninstall torch torch-xla -y $ pip uninstall torch torch-xla -y
$ # Install PyTorch and PyTorch XLA. $ # Install PyTorch and PyTorch XLA.
$ export DATE="+20240726" $ export DATE="20240828"
$ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-nightly${DATE}-cp310-cp310-linux_x86_64.whl $ export TORCH_VERSION="2.5.0"
$ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-nightly${DATE}-cp310-cp310-linux_x86_64.whl $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl
$ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl
$ # Install JAX and Pallas. $ # Install JAX and Pallas.
$ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html $ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html
$ pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html $ pip install torch_xla[pallas] -f https://storage.googleapis.com/jax-releases/jax_nightly_releases.html -f https://storage.googleapis.com/jax-releases/jaxlib_nightly_releases.html
$ # Install other build dependencies. $ # Install other build dependencies.
$ pip install packaging aiohttp $ pip install -r requirements-tpu.txt
Next, build vLLM from source. This will only take a few seconds: Next, build vLLM from source. This will only take a few seconds:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment