Commit 4d3a2c28 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.5' into v0.6.5-dev

parents 92ec5d8e 2d1b9baa
...@@ -28,6 +28,7 @@ ...@@ -28,6 +28,7 @@
#include "common/base.h" #include "common/base.h"
#include "core/scalar_type.hpp" #include "core/scalar_type.hpp"
#include "core/registration.h"
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
...@@ -88,7 +89,7 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, ...@@ -88,7 +89,7 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
torch::Tensor& b_meta, torch::Tensor& b_meta,
torch::Tensor& b_scales, torch::Tensor& b_scales,
torch::Tensor& workspace, torch::Tensor& workspace,
vllm::ScalarTypeTorchPtr const& b_q_type, vllm::ScalarTypeId const b_q_type_id,
int64_t size_m, int64_t size_n, int64_t size_m, int64_t size_n,
int64_t size_k) { int64_t size_k) {
TORCH_CHECK_NOT_IMPLEMENTED( TORCH_CHECK_NOT_IMPLEMENTED(
...@@ -295,13 +296,9 @@ __global__ void Marlin_24( ...@@ -295,13 +296,9 @@ __global__ void Marlin_24(
// We use a different scale layout for grouped and column-wise quantization as // We use a different scale layout for grouped and column-wise quantization as
// we scale a `half2` tile in column-major layout in the former and in // we scale a `half2` tile in column-major layout in the former and in
// row-major in the latter case. // row-major in the latter case.
if (group_blocks != -1) { s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) + (threadIdx.x % 32) / 4; // Note that in the original Marlin kernel
(threadIdx.x % 32) / 4; // this is (threadIdx.x % 32) / 4
} else {
s_sh_rd = 8 * ((threadIdx.x / 32) % (thread_n_blocks / 4)) +
(threadIdx.x % 32) / 4;
}
// Precompute which thread should not read memory in which iterations; this is // Precompute which thread should not read memory in which iterations; this is
// needed if there are more threads than required for a certain tilesize or // needed if there are more threads than required for a certain tilesize or
...@@ -909,13 +906,16 @@ void marlin_cuda_2_4(const void* A, const void* B, const void* meta, void* C, ...@@ -909,13 +906,16 @@ void marlin_cuda_2_4(const void* A, const void* B, const void* meta, void* C,
// than better compute utilization // than better compute utilization
thread_k = 128; thread_k = 128;
thread_m = 128; thread_m = 128;
} else if (prob_n <= 256) { } else {
thread_k = 64; thread_k = 64;
thread_m = 256; thread_m = 256;
} else {
thread_k = 32;
thread_m = 512;
} }
// Also had
// if prob_n > 256
// thread_k = 32;
// thread_m = 512;
// but this is broken,
// TODO(Lucas, Alex M): figure out why
} }
int thread_k_blocks = thread_k / 32; // 2:4 version with m16n8k32 instruction int thread_k_blocks = thread_k / 32; // 2:4 version with m16n8k32 instruction
...@@ -1028,13 +1028,14 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, ...@@ -1028,13 +1028,14 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
torch::Tensor& b_meta, torch::Tensor& b_meta,
torch::Tensor& b_scales, torch::Tensor& b_scales,
torch::Tensor& workspace, torch::Tensor& workspace,
vllm::ScalarTypeTorchPtr const& b_q_type, vllm::ScalarTypeId const b_q_type_id,
int64_t size_m, int64_t size_n, int64_t size_m, int64_t size_n,
int64_t size_k) { int64_t size_k) {
vllm::ScalarType const b_q_type = vllm::ScalarType::from_id(b_q_type_id);
// Verify num_bits // Verify num_bits
TORCH_CHECK(*b_q_type == vllm::kU4B8 || *b_q_type == vllm::kU8B128, TORCH_CHECK(b_q_type == vllm::kU4B8 || b_q_type == vllm::kU8B128,
"num_bits must be uint4b8 or uint8b128. Got = ", b_q_type->str()); "num_bits must be uint4b8 or uint8b128. Got = ", b_q_type.str());
int pack_factor = 32 / b_q_type->size_bits(); int pack_factor = 32 / b_q_type.size_bits();
// Verify M // Verify M
TORCH_CHECK(size_m == a.size(0), TORCH_CHECK(size_m == a.size(0),
...@@ -1077,6 +1078,8 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, ...@@ -1077,6 +1078,8 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
// Verify A device and strides // Verify A device and strides
TORCH_CHECK(a.device().is_cuda(), "A is not on GPU"); TORCH_CHECK(a.device().is_cuda(), "A is not on GPU");
TORCH_CHECK(a.is_contiguous(), "A is not contiguous"); TORCH_CHECK(a.is_contiguous(), "A is not contiguous");
TORCH_CHECK(a.dtype() == torch::kFloat16,
"A is not float16, currently only float16 is supported");
// Verify B device and strides // Verify B device and strides
TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU"); TORCH_CHECK(b_q_weight.device().is_cuda(), "b_q_weight is not on GPU");
...@@ -1089,6 +1092,8 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, ...@@ -1089,6 +1092,8 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
// Verify scales device and strides // Verify scales device and strides
TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU"); TORCH_CHECK(b_scales.device().is_cuda(), "b_scales is not on GPU");
TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous"); TORCH_CHECK(b_scales.is_contiguous(), "b_scales is not contiguous");
TORCH_CHECK(b_scales.dtype() == torch::kFloat16,
"A is not float16, currently only float16 is supported");
// Alloc C matrix // Alloc C matrix
const at::cuda::OptionalCUDAGuard device_guard(device_of(a)); const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
...@@ -1129,8 +1134,12 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight, ...@@ -1129,8 +1134,12 @@ torch::Tensor gptq_marlin_24_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
marlin_24::marlin_cuda_2_4( marlin_24::marlin_cuda_2_4(
a.data_ptr(), b_q_weight.data_ptr(), b_meta.data_ptr(), c.data_ptr(), a.data_ptr(), b_q_weight.data_ptr(), b_meta.data_ptr(), c.data_ptr(),
b_scales.data_ptr(), size_n, size_m, size_k, workspace.data_ptr(), b_scales.data_ptr(), size_n, size_m, size_k, workspace.data_ptr(),
b_q_type->size_bits(), groupsize, dev, b_q_type.size_bits(), groupsize, dev, at::cuda::getCurrentCUDAStream(dev),
at::cuda::getCurrentCUDAStream(dev), thread_k, thread_m, sms, max_par); thread_k, thread_m, sms, max_par);
return c; return c;
} }
TORCH_LIBRARY_IMPL_EXPAND(TORCH_EXTENSION_NAME, CUDA, m) {
m.impl("gptq_marlin_24_gemm", &gptq_marlin_24_gemm);
}
#pragma once
/**
* __device__ datatypes vectorized by 4
*/
// Include both AMD and NVIDIA fp8 types to avoid circular import
// TODO(luka/varun) use FP8_TYPE instead after refactoring
#include <c10/util/Float8_e4m3fnuz.h>
#include <c10/util/Float8_e4m3fn.h>
namespace vllm {
// Vectorization containers
template <typename scalar_t>
struct __align__(8) vec4_t {
scalar_t x;
scalar_t y;
scalar_t z;
scalar_t w;
};
template <typename quant_type_t>
struct __align__(4) q8x4_t {
static_assert(std::is_same_v<quant_type_t, int8_t> ||
std::is_same_v<quant_type_t, c10::Float8_e4m3fn> ||
std::is_same_v<quant_type_t, c10::Float8_e4m3fnuz>);
quant_type_t x;
quant_type_t y;
quant_type_t z;
quant_type_t w;
};
} // namespace vllm
...@@ -18,6 +18,9 @@ ...@@ -18,6 +18,9 @@
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops // vLLM custom ops
ops.def("weak_ref_tensor(Tensor input) -> Tensor");
ops.impl("weak_ref_tensor", torch::kCUDA, &weak_ref_tensor);
// Attention ops // Attention ops
// Compute the attention between an input query and the cached // Compute the attention between an input query and the cached
// keys/values using PagedAttention. // keys/values using PagedAttention.
...@@ -225,6 +228,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -225,6 +228,10 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def("gelu_tanh_and_mul_opt(Tensor! out, Tensor input) -> ()"); ops.def("gelu_tanh_and_mul_opt(Tensor! out, Tensor input) -> ()");
ops.impl("gelu_tanh_and_mul_opt", torch::kCUDA, &gelu_tanh_and_mul); ops.impl("gelu_tanh_and_mul_opt", torch::kCUDA, &gelu_tanh_and_mul);
// FATReLU implementation.
ops.def("fatrelu_and_mul(Tensor! out, Tensor input, float threshold) -> ()");
ops.impl("fatrelu_and_mul", torch::kCUDA, &fatrelu_and_mul);
// GELU implementation used in GPT-2. // GELU implementation used in GPT-2.
ops.def("gelu_new(Tensor! out, Tensor input) -> ()"); ops.def("gelu_new(Tensor! out, Tensor input) -> ()");
ops.impl("gelu_new", torch::kCUDA, &gelu_new); ops.impl("gelu_new", torch::kCUDA, &gelu_new);
...@@ -259,7 +266,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -259,7 +266,7 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Layernorm // Layernorm
// Apply Root Mean Square (RMS) Normalization to the input tensor. // Apply Root Mean Square (RMS) Normalization to the input tensor.
ops.def( ops.def(
"rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> " "rms_norm(Tensor! result, Tensor input, Tensor weight, float epsilon) -> "
"()"); "()");
ops.impl("rms_norm", torch::kCUDA, &rms_norm); ops.impl("rms_norm", torch::kCUDA, &rms_norm);
...@@ -281,6 +288,31 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -281,6 +288,31 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
"float epsilon) -> ()"); "float epsilon) -> ()");
ops.impl("fused_add_rms_norm_opt", torch::kCUDA, &fused_add_rms_norm_opt); ops.impl("fused_add_rms_norm_opt", torch::kCUDA, &fused_add_rms_norm_opt);
// Layernorm-quant
// Apply Root Mean Square (RMS) Normalization to the input tensor.
// ops.def(
// "rms_norm_static_fp8_quant(Tensor! result, Tensor input, Tensor weight, "
// "Tensor scale, float epsilon) -> "
// "()");
// ops.impl("rms_norm_static_fp8_quant", torch::kCUDA,
// &rms_norm_static_fp8_quant);
// In-place fused Add and RMS Normalization.
// ops.def(
// "fused_add_rms_norm_static_fp8_quant(Tensor! result, Tensor input, "
// "Tensor! residual, Tensor weight, "
// "Tensor scale, float epsilon) -> ()");
// ops.impl("fused_add_rms_norm_static_fp8_quant", torch::kCUDA,
// &fused_add_rms_norm_static_fp8_quant);
// Fused Layernorm + Quant kernels
ops.def(
"rms_norm_dynamic_per_token_quant(Tensor! result, Tensor input, "
"Tensor weight, Tensor! scale, float epsilon, "
"Tensor? scale_ub, Tensor!? residual) -> ()");
ops.impl("rms_norm_dynamic_per_token_quant", torch::kCUDA,
&rms_norm_dynamic_per_token_quant);
// Rotary embedding // Rotary embedding
// Apply GPT-NeoX or GPT-J style rotary embedding to query and key. // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
ops.def( ops.def(
...@@ -330,13 +362,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -330,13 +362,13 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Quantized GEMM for AWQ. // Quantized GEMM for AWQ.
ops.def( ops.def(
"awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, " "awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, "
"Tensor _zeros, int split_k_iters) -> Tensor"); "Tensor _zeros, SymInt split_k_iters) -> Tensor");
ops.impl("awq_gemm", torch::kCUDA, &awq_gemm); ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
// Dequantization for AWQ. // Dequantization for AWQ.
ops.def( ops.def(
"awq_dequantize(Tensor _kernel, Tensor _scaling_factors, " "awq_dequantize(Tensor _kernel, Tensor _scaling_factors, "
"Tensor _zeros, int split_k_iters, int thx, int thy) -> Tensor"); "Tensor _zeros, SymInt split_k_iters, int thx, int thy) -> Tensor");
ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize); ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
// Note about marlin kernel 'workspace' arguments: // Note about marlin kernel 'workspace' arguments:
...@@ -356,31 +388,50 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -356,31 +388,50 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Marlin (Dense) Optimized Quantized GEMM for GPTQ. // Marlin (Dense) Optimized Quantized GEMM for GPTQ.
ops.def( ops.def(
"marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, " "marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
"Tensor! workspace, int size_m, int size_n, int size_k) -> Tensor"); "Tensor! workspace, SymInt size_m, SymInt size_n, SymInt size_k) -> "
ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm); "Tensor");
// conditionally compiled so impl in source file
// Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ. // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
ops.def( ops.def(
"gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, " "gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, "
"Tensor b_scales, Tensor workspace, " "Tensor b_scales, Tensor workspace, "
"__torch__.torch.classes._core_C.ScalarType b_q_type, " "int b_q_type, "
"int size_m, int size_n, int size_k) -> Tensor"); "SymInt size_m, SymInt size_n, SymInt size_k) -> Tensor");
ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm); // conditionally compiled so impl in source file
// Machete (Dense) Optimized Mixed Precision GEMM for Hopper. // Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
ops.def("machete_supported_schedules", &machete::supported_schedules);
ops.def( ops.def(
"machete_gemm(Tensor A, Tensor B," "machete_supported_schedules("
" __torch__.torch.classes._core_C.ScalarType btype," " ScalarType a_type,"
" Tensor? scales, Tensor? zeros, int? group_size," " int b_type,"
" Tensor? C, float? alpha, float? beta, str? schedule)" " ScalarType? maybe_group_scales_type,"
"-> Tensor"); " ScalarType? maybe_group_zeros_type,"
ops.impl("machete_gemm", torch::kCUDA, &machete::gemm); " ScalarType? maybe_channel_scales_type,"
ops.def( " ScalarType? maybe_token_scales_type,"
"machete_prepack_B(Tensor B," " ScalarType? maybe_out_type"
" __torch__.torch.classes._core_C.ScalarType btype)" ") -> str[]");
"-> Tensor"); ops.def(
ops.impl("machete_prepack_B", torch::kCUDA, &machete::prepack_B); "machete_mm("
" Tensor A,"
" Tensor B,"
" int b_type,"
" ScalarType? out_type,"
" Tensor? group_scales,"
" Tensor? group_zeros,"
" int? group_size,"
" Tensor? channel_scales,"
" Tensor? token_scales,"
" str? schedule"
") -> Tensor");
ops.def(
"machete_prepack_B("
" Tensor B,"
" ScalarType a_type,"
" int b_type,"
" ScalarType? group_scales_type"
") -> Tensor");
// conditionally compiled so impl registration is in source file
ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor"); ops.def("permute_cols(Tensor A, Tensor perm) -> Tensor");
ops.impl("permute_cols", torch::kCUDA, &permute_cols); ops.impl("permute_cols", torch::kCUDA, &permute_cols);
...@@ -389,53 +440,54 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -389,53 +440,54 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def( ops.def(
"gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, " "gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
"Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, " "Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, "
"__torch__.torch.classes._core_C.ScalarType b_q_type, " "int b_q_type, "
"int size_m, int size_n, int size_k, bool is_k_full, " "SymInt size_m, SymInt size_n, SymInt size_k, bool is_k_full, "
"bool has_zp, bool use_fp32_reduce) -> Tensor"); "bool has_zp, bool use_fp32_reduce, bool is_zp_float) -> Tensor");
ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm); // conditionally compiled so impl registration is in source file
// gptq_marlin repack from GPTQ. // gptq_marlin repack from GPTQ.
ops.def( ops.def(
"gptq_marlin_repack(Tensor b_q_weight, Tensor perm, " "gptq_marlin_repack(Tensor b_q_weight, Tensor perm, "
"SymInt size_k, SymInt size_n, int num_bits) -> Tensor"); "SymInt size_k, SymInt size_n, int num_bits) -> Tensor");
ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack); // conditionally compiled so impl registrations are in source file
ops.impl("gptq_marlin_repack", torch::kMeta, &gptq_marlin_repack_meta);
// awq_marlin repack from AWQ. // awq_marlin repack from AWQ.
ops.def( ops.def(
"awq_marlin_repack(Tensor b_q_weight, SymInt size_k, " "awq_marlin_repack(Tensor b_q_weight, SymInt size_k, "
"SymInt size_n, int num_bits) -> Tensor"); "SymInt size_n, int num_bits) -> Tensor");
ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack); // conditionally compiled so impl registrations are in source file
ops.impl("awq_marlin_repack", torch::kMeta, &awq_marlin_repack_meta); #endif
// Dequantization for GGML. // Dequantization for GGML.
ops.def("ggml_dequantize(Tensor W, int type, int m, int n) -> Tensor"); ops.def("ggml_dequantize(Tensor W, int type, SymInt m, SymInt n) -> Tensor");
ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize); ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
// mmvq kernel for GGML. // mmvq kernel for GGML.
ops.def( ops.def(
"ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, int row) " "ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, SymInt row) "
"-> Tensor"); "-> Tensor");
ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8); ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);
// mmq kernel for GGML. // mmq kernel for GGML.
ops.def("ggml_mul_mat_a8(Tensor W, Tensor X, int type, int row) -> Tensor"); ops.def(
"ggml_mul_mat_a8(Tensor W, Tensor X, int type, SymInt row) -> Tensor");
ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8); ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
#ifndef USE_ROCM
// fp8_marlin Optimized Quantized GEMM for FP8 weight-only. // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
ops.def( ops.def(
"fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, " "fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
"Tensor! workspace, int num_bits, int size_m, int size_n, " "Tensor! workspace, int num_bits, SymInt size_m, SymInt size_n, "
"int size_k) -> Tensor"); "SymInt size_k) -> Tensor");
ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm); // conditionally compiled so impl registration is in source file
// marlin_qqq_gemm for QQQ. // marlin_qqq_gemm for QQQ.
ops.def( ops.def(
"marlin_qqq_gemm(Tensor a, Tensor b_q_weight, " "marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "
"Tensor s_tok, Tensor s_ch, Tensor s_group, " "Tensor s_tok, Tensor s_ch, Tensor s_group, "
"Tensor! workspace, int size_m, int size_n, " "Tensor! workspace, SymInt size_m, SymInt size_n, "
"int size_k) -> Tensor"); "SymInt size_k) -> Tensor");
ops.impl("marlin_qqq_gemm", torch::kCUDA, &marlin_qqq_gemm); // conditionally compiled so impl registration is in source file
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
// quantization, as well as bias // quantization, as well as bias
...@@ -463,27 +515,35 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -463,27 +515,35 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.def( ops.def(
"selective_scan_fwd(Tensor! u, Tensor! delta," "selective_scan_fwd(Tensor! u, Tensor! delta,"
"Tensor! A, Tensor! B, Tensor! C," "Tensor! A, Tensor! B, Tensor! C,"
"Tensor? D_, Tensor? z_, Tensor? delta_bias_," "Tensor? D_, Tensor!? z_, Tensor? delta_bias_,"
"bool delta_softplus," "bool delta_softplus,"
"Tensor? index_, Tensor!? x) -> Tensor[]"); "Tensor? query_start_loc,"
"Tensor? cache_indices,"
"Tensor? has_initial_state,"
"Tensor! ssm_states,"
"int pad_slot_id) -> ()");
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd); ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
ops.def( ops.def(
"causal_conv1d_update(Tensor! x," "causal_conv1d_update(Tensor! x,"
"Tensor! conv_state," "Tensor! conv_state,"
"Tensor! weight," "Tensor! weight,"
"Tensor? bias," "Tensor? bias_,"
"bool silu_activation," "bool silu_activation,"
"Tensor? conv_state_indices) -> Tensor"); "Tensor? cache_seqlens_,"
"Tensor? conv_state_indices,"
"int pad_slot_id) -> ()");
ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update); ops.impl("causal_conv1d_update", torch::kCUDA, &causal_conv1d_update);
ops.def( ops.def(
"causal_conv1d_fwd(Tensor! x, Tensor! weight," "causal_conv1d_fwd(Tensor! x, Tensor! weight,"
"Tensor? bias_," "Tensor? bias_,"
"Tensor? seq_idx_," "Tensor!? conv_states,"
"Tensor? initial_states_," "Tensor? query_start_loc,"
"Tensor!? final_states_out_," "Tensor? cache_indices,"
"bool silu_activation) -> Tensor"); "Tensor? has_initial_state,"
"bool silu_activation,"
"int pad_slot_id) -> ()");
ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd); ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
#endif #endif
...@@ -502,42 +562,34 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -502,42 +562,34 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Compute FP8 quantized tensor for given scaling factor. // Compute FP8 quantized tensor for given scaling factor.
// ops.def( // ops.def(
// "static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()"); // "static_scaled_fp8_quant(Tensor! result, Tensor input, Tensor scale) -> "
// "()");
// ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant); // ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant);
// // Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
// Compute dynamic-per-tensor FP8 quantized tensor and scaling factor.
// ops.def( // ops.def(
// "dynamic_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! scale) -> " // "dynamic_scaled_fp8_quant(Tensor! result, Tensor input, Tensor! scale) "
// "-> "
// "()"); // "()");
// ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant); // ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);
// Compute dynamic-per-token FP8 quantized tensor and scaling factor. // // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
// ops.def( // ops.def(
// "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, " // "dynamic_per_token_scaled_fp8_quant(Tensor! result, Tensor input, "
// "Tensor! scale, Tensor? scale_ub) -> " // "Tensor! scale, Tensor? scale_ub) -> "
// "()"); // "()");
// ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA, // ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
// &dynamic_per_token_scaled_fp8_quant); // &dynamic_per_token_scaled_fp8_quant);
// Aligning the number of tokens to be processed by each expert such
// that it is divisible by the block size.
ops.def(
"moe_align_block_size(Tensor topk_ids, int num_experts,"
" int block_size, Tensor! sorted_token_ids,"
" Tensor! experts_ids,"
" Tensor! num_tokens_post_pad) -> ()");
ops.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
// Compute int8 quantized tensor for given scaling factor. // Compute int8 quantized tensor for given scaling factor.
ops.def( ops.def(
"static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale," "static_scaled_int8_quant(Tensor! result, Tensor input, Tensor scale,"
"Tensor? azp) -> ()"); "Tensor? azp) -> ()");
ops.impl("static_scaled_int8_quant", torch::kCUDA, &static_scaled_int8_quant); ops.impl("static_scaled_int8_quant", torch::kCUDA, &static_scaled_int8_quant);
// Compute int8 quantized tensor and scaling factor // Compute int8 quantized tensor and scaling factor
ops.def( ops.def(
"dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale, " "dynamic_scaled_int8_quant(Tensor! result, Tensor input, Tensor! scale, "
"Tensor!? azp) -> ()"); "Tensor!? azp) -> ()");
ops.impl("dynamic_scaled_int8_quant", torch::kCUDA, ops.impl("dynamic_scaled_int8_quant", torch::kCUDA,
&dynamic_scaled_int8_quant); &dynamic_scaled_int8_quant);
...@@ -617,27 +669,18 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) { ...@@ -617,27 +669,18 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
// Custom all-reduce kernels // Custom all-reduce kernels
custom_ar.def( custom_ar.def(
"init_custom_ar(Tensor meta, Tensor rank_data, " "init_custom_ar(int[] ipc_tensors, Tensor rank_data, "
"str[] handles, int[] offsets, int rank, " "int rank, bool full_nvlink) -> int");
"bool full_nvlink) -> int");
custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar); custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
custom_ar.impl("all_reduce_reg", torch::kCUDA, &all_reduce_reg);
custom_ar.def( custom_ar.def(
"all_reduce_unreg(int fa, Tensor inp, Tensor reg_buffer, Tensor! out) -> " "all_reduce(int fa, Tensor inp, Tensor! out, int reg_buffer, "
"()"); "int reg_buffer_sz_bytes) -> ()");
custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg); custom_ar.impl("all_reduce", torch::kCUDA, &all_reduce);
custom_ar.def("dispose", &dispose); custom_ar.def("dispose", &dispose);
custom_ar.def("meta_size", &meta_size); custom_ar.def("meta_size", &meta_size);
custom_ar.def( custom_ar.def("register_buffer", &register_buffer);
"register_buffer(int fa, Tensor t, str[] handles, "
"int[] offsets) -> ()");
custom_ar.impl("register_buffer", torch::kCUDA, &register_buffer);
custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta); custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
custom_ar.def("register_graph_buffers", &register_graph_buffers); custom_ar.def("register_graph_buffers", &register_graph_buffers);
} }
......
#pragma once
#include <torch/all.h>
#ifndef USE_ROCM
#include <cuda_bf16.h>
#include <cuda_fp16.h>
#else
#include <hip/hip_bf16.h>
#include <hip/hip_fp16.h>
using __nv_bfloat16 = __hip_bfloat16;
using __nv_bfloat162 = __hip_bfloat162;
#endif
namespace vllm {
/* Converter structs for the conversion from torch types to HIP/CUDA types,
and the associated type conversions within HIP/CUDA. These helpers need
to be implemented for now because the relevant type conversion
operators/constructors are not consistently implemented by HIP/CUDA, so
a generic conversion via type casts cannot be implemented.
Each struct should have the member static constexpr bool `exists`:
If false, the optimized kernel is not used for the corresponding torch type.
If true, the struct should be fully defined as shown in the examples below.
*/
template <typename torch_type>
struct _typeConvert {
static constexpr bool exists = false;
};
#if defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >= 12000))
// CUDA < 12.0 runs into issues with packed type conversion
template <>
struct _typeConvert<c10::Half> {
static constexpr bool exists = true;
using hip_type = __half;
using packed_hip_type = __half2;
__device__ static inline float convert(hip_type x) { return __half2float(x); }
__device__ static inline float2 convert(packed_hip_type x) {
return __half22float2(x);
}
__device__ static inline hip_type convert(float x) {
return __float2half_rn(x);
}
__device__ static inline packed_hip_type convert(float2 x) {
return __float22half2_rn(x);
}
};
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
// CUDA_ARCH < 800 does not have BF16 support
// TODO: Add in ROCm support once public headers handle bf16 maturely
template <>
struct _typeConvert<c10::BFloat16> {
static constexpr bool exists = true;
using hip_type = __nv_bfloat16;
using packed_hip_type = __nv_bfloat162;
__device__ static inline float convert(hip_type x) {
return __bfloat162float(x);
}
__device__ static inline float2 convert(packed_hip_type x) {
return __bfloat1622float2(x);
}
__device__ static inline hip_type convert(float x) {
return __float2bfloat16(x);
}
__device__ static inline packed_hip_type convert(float2 x) {
return __float22bfloat162_rn(x);
}
};
#endif // defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
#endif // defined(USE_ROCM) || (defined(CUDA_VERSION) && (CUDA_VERSION >=
// 12000))
/* Vector POD struct to generate vectorized and packed FP16/BF16 ops
for appropriate specializations of fused_add_rms_norm_kernel.
Only functions that are necessary in that kernel are implemented.
Alignment to 16 bytes is required to use 128-bit global memory ops.
*/
template <typename scalar_t, int width>
struct alignas(16) _f16Vec {
/* Not theoretically necessary that width is a power of 2 but should
almost always be the case for optimization purposes */
static_assert(width > 0 && (width & (width - 1)) == 0,
"Width is not a positive power of 2!");
using Converter = _typeConvert<scalar_t>;
using T1 = typename Converter::hip_type;
using T2 = typename Converter::packed_hip_type;
T1 data[width];
__device__ _f16Vec& operator+=(const _f16Vec<scalar_t, width>& other) {
if constexpr (width % 2 == 0) {
#pragma unroll
for (int i = 0; i < width; i += 2) {
T2 temp{data[i], data[i + 1]};
temp += T2{other.data[i], other.data[i + 1]};
data[i] = temp.x;
data[i + 1] = temp.y;
}
} else {
#pragma unroll
for (int i = 0; i < width; ++i) data[i] += other.data[i];
}
return *this;
}
__device__ _f16Vec& operator*=(const _f16Vec<scalar_t, width>& other) {
if constexpr (width % 2 == 0) {
#pragma unroll
for (int i = 0; i < width; i += 2) {
T2 temp{data[i], data[i + 1]};
temp *= T2{other.data[i], other.data[i + 1]};
data[i] = temp.x;
data[i + 1] = temp.y;
}
} else {
#pragma unroll
for (int i = 0; i < width; ++i) data[i] *= other.data[i];
}
return *this;
}
__device__ _f16Vec& operator*=(const float scale) {
if constexpr (width % 2 == 0) {
#pragma unroll
for (int i = 0; i < width; i += 2) {
float2 temp_f = Converter::convert(T2{data[i], data[i + 1]});
temp_f.x *= scale;
temp_f.y *= scale;
T2 temp = Converter::convert(temp_f);
data[i] = temp.x;
data[i + 1] = temp.y;
}
} else {
#pragma unroll
for (int i = 0; i < width; ++i) {
float temp = Converter::convert(data[i]) * scale;
data[i] = Converter::convert(temp);
}
}
return *this;
}
__device__ float sum_squares() const {
float result = 0.0f;
if constexpr (width % 2 == 0) {
#pragma unroll
for (int i = 0; i < width; i += 2) {
float2 z = Converter::convert(T2{data[i], data[i + 1]});
result += z.x * z.x + z.y * z.y;
}
} else {
#pragma unroll
for (int i = 0; i < width; ++i) {
float x = Converter::convert(data[i]);
result += x * x;
}
}
return result;
}
};
} // namespace vllm
\ No newline at end of file
...@@ -4,6 +4,7 @@ sphinx-copybutton==0.5.2 ...@@ -4,6 +4,7 @@ sphinx-copybutton==0.5.2
myst-parser==2.0.0 myst-parser==2.0.0
sphinx-argparse==0.4.0 sphinx-argparse==0.4.0
msgspec msgspec
cloudpickle
# packages to install to build the documentation # packages to install to build the documentation
pydantic >= 2.8 pydantic >= 2.8
...@@ -11,5 +12,10 @@ pydantic >= 2.8 ...@@ -11,5 +12,10 @@ pydantic >= 2.8
torch torch
py-cpuinfo py-cpuinfo
transformers transformers
mistral_common >= 1.3.4 mistral_common >= 1.5.0
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args aiohttp
\ No newline at end of file starlette
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
fastapi # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
partial-json-parser # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
requests
...@@ -9,6 +9,8 @@ document.addEventListener("DOMContentLoaded", function () { ...@@ -9,6 +9,8 @@ document.addEventListener("DOMContentLoaded", function () {
script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget. script.setAttribute("runllm-keyboard-shortcut", "Mod+j"); // cmd-j or ctrl-j to open the widget.
script.setAttribute("runllm-name", "vLLM"); script.setAttribute("runllm-name", "vLLM");
script.setAttribute("runllm-position", "BOTTOM_RIGHT"); script.setAttribute("runllm-position", "BOTTOM_RIGHT");
script.setAttribute("runllm-position-y", "20%");
script.setAttribute("runllm-position-x", "3%");
script.setAttribute("runllm-assistant-id", "207"); script.setAttribute("runllm-assistant-id", "207");
script.async = true; script.async = true;
......
...@@ -25,7 +25,7 @@ With this mapping, we can add another indirection in vLLM’s KV cache managemen ...@@ -25,7 +25,7 @@ With this mapping, we can add another indirection in vLLM’s KV cache managemen
This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system. This design achieves automatic prefix caching without the need of maintaining a tree structure among the KV blocks. More specifically, all of the blocks are independent of each other and can be allocated and freed by itself, which enables us to manages the KV cache as ordinary caches in operating system.
# Generalized Caching Policy ## Generalized Caching Policy
Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full. Keeping all the KV blocks in a hash table enables vLLM to cache KV blocks from earlier requests to save memory and accelerate the computation of future requests. For example, if a new request shares the system prompt with the previous request, the KV cache of the shared prompt can directly be used for the new request without recomputation. However, the total KV cache space is limited and we have to decide which KV blocks to keep or evict when the cache is full.
......
...@@ -5,6 +5,7 @@ vLLM Meetups ...@@ -5,6 +5,7 @@ vLLM Meetups
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
- `The seventh vLLM meetup <https://lu.ma/h0qvrajz>`__, with Snowflake, November 14th 2024. `[Slides] <https://docs.google.com/presentation/d/1e3CxQBV3JsfGp30SwyvS3eM_tW-ghOhJ9PAJGK6KR54/edit?usp=sharing>`__
- `The sixth vLLM meetup <https://lu.ma/87q3nvnh>`__, with NVIDIA, September 9th 2024. `[Slides] <https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing>`__ - `The sixth vLLM meetup <https://lu.ma/87q3nvnh>`__, with NVIDIA, September 9th 2024. `[Slides] <https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing>`__
- `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__ - `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__
- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__ - `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
......
...@@ -15,6 +15,7 @@ vLLM is a community project. Our compute resources for development and testing a ...@@ -15,6 +15,7 @@ vLLM is a community project. Our compute resources for development and testing a
- Dropbox - Dropbox
- Google Cloud - Google Cloud
- Lambda Lab - Lambda Lab
- Nebius
- NVIDIA - NVIDIA
- Replicate - Replicate
- Roblox - Roblox
......
...@@ -10,11 +10,13 @@ ...@@ -10,11 +10,13 @@
# add these directories to sys.path here. If the directory is relative to the # add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here. # documentation root, use os.path.abspath to make it absolute, like shown here.
import inspect
import logging import logging
import os import os
import sys import sys
from typing import List from typing import List
import requests
from sphinx.ext import autodoc from sphinx.ext import autodoc
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -34,6 +36,7 @@ author = 'the vLLM Team' ...@@ -34,6 +36,7 @@ author = 'the vLLM Team'
extensions = [ extensions = [
"sphinx.ext.napoleon", "sphinx.ext.napoleon",
"sphinx.ext.viewcode", "sphinx.ext.viewcode",
"sphinx.ext.linkcode",
"sphinx.ext.intersphinx", "sphinx.ext.intersphinx",
"sphinx_copybutton", "sphinx_copybutton",
"sphinx.ext.autodoc", "sphinx.ext.autodoc",
...@@ -94,9 +97,71 @@ def setup(app): ...@@ -94,9 +97,71 @@ def setup(app):
generate_examples() generate_examples()
_cached_base: str = ""
_cached_branch: str = ""
def get_repo_base_and_branch(pr_number):
global _cached_base, _cached_branch
if _cached_base and _cached_branch:
return _cached_base, _cached_branch
url = f"https://api.github.com/repos/vllm-project/vllm/pulls/{pr_number}"
response = requests.get(url)
if response.status_code == 200:
data = response.json()
_cached_base = data['head']['repo']['full_name']
_cached_branch = data['head']['ref']
return _cached_base, _cached_branch
else:
logger.error("Failed to fetch PR details: %s", response)
return None, None
def linkcode_resolve(domain, info):
if domain != 'py':
return None
if not info['module']:
return None
filename = info['module'].replace('.', '/')
module = info['module']
# try to determine the correct file and line number to link to
obj = sys.modules[module]
# get as specific as we can
lineno: int = 0
filename: str = ""
try:
for part in info['fullname'].split('.'):
obj = getattr(obj, part)
if not (inspect.isclass(obj) or inspect.isfunction(obj)
or inspect.ismethod(obj)):
obj = obj.__class__ # Get the class of the instance
lineno = inspect.getsourcelines(obj)[1]
filename = (inspect.getsourcefile(obj)
or f"{filename}.py").split("vllm/", 1)[1]
except Exception:
# For some things, like a class member, won't work, so
# we'll use the line number of the parent (the class)
pass
if filename.startswith("checkouts/"):
# a PR build on readthedocs
pr_number = filename.split("/")[1]
filename = filename.split("/", 2)[2]
base, branch = get_repo_base_and_branch(pr_number)
if base and branch:
return f"https://github.com/{base}/blob/{branch}/{filename}#L{lineno}"
# Otherwise, link to the source file on the main branch
return f"https://github.com/vllm-project/vllm/blob/main/{filename}#L{lineno}"
# Mock out external dependencies here, otherwise the autodoc pages may be blank. # Mock out external dependencies here, otherwise the autodoc pages may be blank.
autodoc_mock_imports = [ autodoc_mock_imports = [
"aiohttp",
"compressed_tensors", "compressed_tensors",
"cpuinfo", "cpuinfo",
"cv2", "cv2",
...@@ -113,10 +178,12 @@ autodoc_mock_imports = [ ...@@ -113,10 +178,12 @@ autodoc_mock_imports = [
"tensorizer", "tensorizer",
"pynvml", "pynvml",
"outlines", "outlines",
"xgrammar,"
"librosa", "librosa",
"soundfile", "soundfile",
"gguf", "gguf",
"lark", "lark",
"decord",
] ]
for mock_target in autodoc_mock_imports: for mock_target in autodoc_mock_imports:
...@@ -143,6 +210,7 @@ intersphinx_mapping = { ...@@ -143,6 +210,7 @@ intersphinx_mapping = {
"python": ("https://docs.python.org/3", None), "python": ("https://docs.python.org/3", None),
"typing_extensions": "typing_extensions":
("https://typing-extensions.readthedocs.io/en/latest", None), ("https://typing-extensions.readthedocs.io/en/latest", None),
"aiohttp": ("https://docs.aiohttp.org/en/stable", None),
"pillow": ("https://pillow.readthedocs.io/en/stable", None), "pillow": ("https://pillow.readthedocs.io/en/stable", None),
"numpy": ("https://numpy.org/doc/stable", None), "numpy": ("https://numpy.org/doc/stable", None),
"torch": ("https://pytorch.org/docs/stable", None), "torch": ("https://pytorch.org/docs/stable", None),
......
Contributing to vLLM
=====================
Thank you for your interest in contributing to vLLM! Our community is open to everyone and welcomes all kinds of contributions, no matter how small or large. There are several ways you can contribute to the project:
- Identify and report any issues or bugs.
- Request or add support for a new model.
- Suggest or implement new features.
- Improve documentation or contribute a how-to guide.
We also believe in the power of community support; thus, answering queries, offering PR reviews, and assisting others are also highly regarded and beneficial contributions.
Finally, one of the most impactful ways to support us is by raising awareness about vLLM. Talk about it in your blog posts and highlight how it's driving your incredible projects. Express your support on social media if you're using vLLM, or simply offer your appreciation by starring our repository!
License
-------
See `LICENSE <https://github.com/vllm-project/vllm/tree/main/LICENSE>`_.
Developing
----------
Depending on the kind of development you'd like to do (e.g. Python, CUDA), you can choose to build vLLM with or without compilation. Check out the `building from source <https://docs.vllm.ai/en/latest/getting_started/installation.html#build-from-source>`_ documentation for details.
Testing
-------
.. code-block:: bash
pip install -r requirements-dev.txt
# linting and formatting
bash format.sh
# Static type checking
mypy
# Unit tests
pytest tests/
.. note:: Currently, the repository does not pass the ``mypy`` tests.
Contribution Guidelines
=======================
Issues
------
If you encounter a bug or have a feature request, please `search existing issues <https://github.com/vllm-project/vllm/issues?q=is%3Aissue>`_ first to see if it has already been reported. If not, please `file a new issue <https://github.com/vllm-project/vllm/issues/new/choose>`_, providing as much relevant information as possible.
.. important::
If you discover a security vulnerability, please follow the instructions `here <https://github.com/vllm-project/vllm/tree/main/SECURITY.md#reporting-a-vulnerability>`_.
Pull Requests & Code Reviews
----------------------------
Thank you for your contribution to vLLM! Before submitting the pull request,
please ensure the PR meets the following criteria. This helps vLLM maintain the
code quality and improve the efficiency of the review process.
DCO and Signed-off-by
^^^^^^^^^^^^^^^^^^^^^
When contributing changes to this project, you must agree to the `DCO <https://github.com/vllm-project/vllm/tree/main/DCO>`_.
Commits must include a ``Signed-off-by:`` header which certifies agreement with
the terms of the `DCO <https://github.com/vllm-project/vllm/tree/main/DCO>`_.
Using ``-s`` with ``git commit`` will automatically add this header.
PR Title and Classification
^^^^^^^^^^^^^^^^^^^^^^^^^^^
Only specific types of PRs will be reviewed. The PR title is prefixed
appropriately to indicate the type of change. Please use one of the following:
- ``[Bugfix]`` for bug fixes.
- ``[CI/Build]`` for build or continuous integration improvements.
- ``[Doc]`` for documentation fixes and improvements.
- ``[Model]`` for adding a new model or improving an existing model. Model name
should appear in the title.
- ``[Frontend]`` For changes on the vLLM frontend (e.g., OpenAI API server,
``LLM`` class, etc.)
- ``[Kernel]`` for changes affecting CUDA kernels or other compute kernels.
- ``[Core]`` for changes in the core vLLM logic (e.g., ``LLMEngine``,
``AsyncLLMEngine``, ``Scheduler``, etc.)
- ``[Hardware][Vendor]`` for hardware-specific changes. Vendor name should
appear in the prefix (e.g., ``[Hardware][AMD]``).
- ``[Misc]`` for PRs that do not fit the above categories. Please use this
sparingly.
.. note::
If the PR spans more than one category, please include all relevant prefixes.
Code Quality
^^^^^^^^^^^^
The PR needs to meet the following code quality standards:
- We adhere to `Google Python style guide
<https://google.github.io/styleguide/pyguide.html>`_ and `Google C++ style guide
<https://google.github.io/styleguide/cppguide.html>`_.
- Pass all linter checks. Please use `format.sh
<https://github.com/vllm-project/vllm/blob/main/format.sh>`_ to format your
code.
- The code needs to be well-documented to ensure future contributors can easily
understand the code.
- Include sufficient tests to ensure the project stays correct and robust. This
includes both unit tests and integration tests.
- Please add documentation to ``docs/source/`` if the PR modifies the
user-facing behaviors of vLLM. It helps vLLM users understand and utilize the
new features or changes.
Adding or Changing Kernels
^^^^^^^^^^^^^^^^^^^^^^^^^^
Each custom kernel needs a schema and one or more implementations to be registered with PyTorch.
- Make sure custom ops are registered following PyTorch guidelines:
`Custom C++ and CUDA Operators <https://pytorch.org/tutorials/advanced/cpp_custom_ops.html#cpp-custom-ops-tutorial>`_
and `The Custom Operators Manual <https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU>`_.
- Custom operations that return ``Tensors`` require meta-functions.
Meta-functions should be implemented and registered in Python so that dynamic
dims can be handled automatically. See above documents for a description of
meta-functions.
- Use `torch.library.opcheck() <https://pytorch.org/docs/stable/library.html#torch.library.opcheck>`_
to test the function registration and meta-function for any registered ops.
See ``tests/kernels`` for examples.
- When changing the C++ signature of an existing op, the schema must be updated
to reflect the changes.
- If a new custom type is needed, see the following document:
`Custom Class Support in PT2 <https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA>`_.
Notes for Large Changes
^^^^^^^^^^^^^^^^^^^^^^^
Please keep the changes as concise as possible. For major architectural changes
(>500 LOC excluding kernel/data/config/test), we would expect a GitHub issue
(RFC) discussing the technical design and justification. Otherwise, we will tag
it with ``rfc-required`` and might not go through the PR.
What to Expect for the Reviews
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The goal of the vLLM team is to be a *transparent reviewing machine*. We would
like to make the review process transparent and efficient and make sure no
contributor feels confused or frustrated. However, the vLLM team is small, so we
need to prioritize some PRs over others. Here is what you can expect from the
review process:
- After the PR is submitted, the PR will be assigned to a reviewer. Every
reviewer will pick up the PRs based on their expertise and availability.
- After the PR is assigned, the reviewer will provide status updates every 2-3
days. If the PR is not reviewed within 7 days, please feel free to ping the
reviewer or the vLLM team.
- After the review, the reviewer will put an ``action-required`` label on the PR
if there are changes required. The contributor should address the comments and
ping the reviewer to re-review the PR.
- Please respond to all comments within a reasonable time frame. If a comment
isn't clear or you disagree with a suggestion, feel free to ask for
clarification or discuss the suggestion.
Thank You
---------
Finally, thank you for taking the time to read these guidelines and for your interest in contributing to vLLM.
All of your contributions help make vLLM a great tool and community for everyone!
Profiling vLLM ==============
================================= Profiling vLLM
==============
We support tracing vLLM workers using the ``torch.profiler`` module. You can enable tracing by setting the ``VLLM_TORCH_PROFILER_DIR`` environment variable to the directory where you want to save the traces: ``VLLM_TORCH_PROFILER_DIR=/mnt/traces/`` We support tracing vLLM workers using the ``torch.profiler`` module. You can enable tracing by setting the ``VLLM_TORCH_PROFILER_DIR`` environment variable to the directory where you want to save the traces: ``VLLM_TORCH_PROFILER_DIR=/mnt/traces/``
......
.. _arch_overview:
Architecture Overview
======================
This document provides an overview of the vLLM architecture.
.. contents:: Table of Contents
:local:
:depth: 2
Entrypoints
-----------
vLLM provides a number of entrypoints for interacting with the system. The
following diagram shows the relationship between them.
.. image:: /assets/design/arch_overview/entrypoints.excalidraw.png
:alt: Entrypoints Diagram
LLM Class
^^^^^^^^^
The LLM class provides the primary Python interface for doing offline inference,
which is interacting with a model without using a separate model inference
server.
Here is a sample of `LLM` class usage:
.. code-block:: python
from vllm import LLM, SamplingParams
# Define a list of input prompts
prompts = [
"Hello, my name is",
"The capital of France is",
"The largest ocean is",
]
# Define sampling parameters
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
# Initialize the LLM engine with the OPT-125M model
llm = LLM(model="facebook/opt-125m")
# Generate outputs for the input prompts
outputs = llm.generate(prompts, sampling_params)
# Print the generated outputs
for output in outputs:
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
More API details can be found in the :doc:`Offline Inference
</dev/offline_inference/offline_index>` section of the API docs.
The code for the `LLM` class can be found in `vllm/entrypoints/llm.py
<https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py>`_.
OpenAI-compatible API server
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
The second primary interface to vLLM is via its OpenAI-compatible API server.
This server can be started using the `vllm serve` command.
.. code-block:: bash
vllm serve <model>
The code for the `vllm` CLI can be found in `vllm/scripts.py
<https://github.com/vllm-project/vllm/blob/main/vllm/scripts.py>`_.
Sometimes you may see the API server entrypoint used directly instead of via the
`vllm` CLI command. For example:
.. code-block:: bash
python -m vllm.entrypoints.openai.api_server --model <model>
That code can be found in `vllm/entrypoints/openai/api_server.py
<https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/openai/api_server.py>`_.
More details on the API server can be found in the :doc:`OpenAI Compatible
Server </serving/openai_compatible_server>` document.
LLM Engine
----------
The `LLMEngine` and `AsyncLLMEngine` classes are central to the functioning of
the vLLM system, handling model inference and asynchronous request processing.
.. image:: /assets/design/arch_overview/llm_engine.excalidraw.png
:alt: LLMEngine Diagram
LLMEngine
^^^^^^^^^
The `LLMEngine` class is the core component of the vLLM engine. It is
responsible for receiving requests from clients and generating outputs from the
model. The `LLMEngine` includes input processing, model execution (possibly
distributed across multiple hosts and/or GPUs), scheduling, and output
processing.
- **Input Processing**: Handles tokenization of input text using the specified
tokenizer.
- **Scheduling**: Chooses which requests are processed in each step.
- **Model Execution**: Manages the execution of the language model, including
distributed execution across multiple GPUs.
- **Output Processing**: Processes the outputs generated by the model, decoding the
token IDs from a language model into human-readable text.
The code for `LLMEngine` can be found in `vllm/engine/llm_engine.py`_.
.. _vllm/engine/llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/llm_engine.py
AsyncLLMEngine
^^^^^^^^^^^^^^
The `AsyncLLMEngine` class is an asynchronous wrapper for the `LLMEngine` class.
It uses `asyncio` to create a background loop that continuously processes
incoming requests. The `AsyncLLMEngine` is designed for online serving, where it
can handle multiple concurrent requests and stream outputs to clients.
The OpenAI-compatible API server uses the `AsyncLLMEngine`. There is also a demo
API server that serves as a simpler example in
`vllm/entrypoints/api_server.py`_.
.. _vllm/entrypoints/api_server.py: https://github.com/vllm-project/vllm/tree/main/vllm/entrypoints/api_server.py
The code for `AsyncLLMEngine` can be found in `vllm/engine/async_llm_engine.py`_.
.. _vllm/engine/async_llm_engine.py: https://github.com/vllm-project/vllm/tree/main/vllm/engine/async_llm_engine.py
Worker
------
A worker is a process that runs the model inference. vLLM follows the common
practice of using one process to control one accelerator device, such as GPUs.
For example, if we use tensor parallelism of size 2 and pipeline parallelism of
size 2, we will have 4 workers in total. Workers are identified by their
``rank`` and ``local_rank``. ``rank`` is used for global orchestration, while
``local_rank`` is mainly used for assigning the accelerator device and accessing
local resources such as the file system and shared memory.
Model Runner
------------
Every worker has one model runner object, responsible for loading and running
the model. Much of the model execution logic resides here, such as preparing
input tensors and capturing cudagraphs.
Model
-----
Every model runner object has one model object, which is the actual
``torch.nn.Module`` instance. See :ref:`huggingface_integration` for how various
configurations affect the class we ultimately get.
Class Hierarchy
---------------
The following figure shows the class hierarchy of vLLM:
.. figure:: /assets/design/hierarchy.png
:alt: query
:width: 100%
:align: center
There are several important design choices behind this class hierarchy:
1. **Extensibility**: All classes in the hierarchy accept a configuration object
containing all the necessary information. The `VllmConfig
<https://github.com/vllm-project/vllm/blob/d1c6799b8870e513bf4f2305cbf6cda9fc3d773b/vllm/config.py#L2036>`__
class is the main configuration object that is passed around. The class
hierarchy is quite deep, and every class needs to read the configuration it is
interested in. By encapsulating all configurations in one object, we can easily
pass the configuration object around and access the configuration we need.
Suppose we want to add a new feature (this is often the case given how fast the
field of LLM inference is evolving) that only touches the model runner. We will
have to add a new configuration option in the `VllmConfig` class. Since we pass
the whole config object around, we only need to add the configuration option to
the `VllmConfig` class, and the model runner can access it directly. We don't
need to change the constructor of the engine, worker, or model class to pass the
new configuration option.
2. **Uniformity**: The model runner needs a unified interface to create and
initialize the model. vLLM supports more than 50 types of popular open-source
models. Each model has its own initialization logic. If the constructor
signature varies with models, the model runner does not know how to call the
constructor accordingly, without complicated and error-prone inspection logic.
By making the constructor of the model class uniform, the model runner can
easily create and initialize the model without knowing the specific model type.
This is also useful for composing models. Vision-language models often consist
of a vision model and a language model. By making the constructor uniform, we
can easily create a vision model and a language model and compose them into a
vision-language model.
.. note::
To support this change, all vLLM models' signatures have been updated to:
.. code-block:: python
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
To avoid accidentally passing incorrect arguments, the constructor is now keyword-only. This ensures that the constructor will raise an error if old configurations are passed. vLLM developers have already made this change for all models within vLLM. For out-of-tree registered models, developers need to update their models, for example by adding shim code to adapt the old constructor signature to the new one:
.. code-block:: python
class MyOldModel(nn.Module):
def __init__(
self,
config,
cache_config: Optional[CacheConfig] = None,
quant_config: Optional[QuantizationConfig] = None,
lora_config: Optional[LoRAConfig] = None,
prefix: str = "",
) -> None:
...
from vllm.config import VllmConfig
class MyNewModel(MyOldModel):
def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
config = vllm_config.model_config.hf_config
cache_config = vllm_config.cache_config
quant_config = vllm_config.quant_config
lora_config = vllm_config.lora_config
super().__init__(config, cache_config, quant_config, lora_config, prefix)
if __version__ >= "0.6.4":
MyModel = MyNewModel
else:
MyModel = MyOldModel
This way, the model can work with both old and new versions of vLLM.
3. **Sharding and Quantization at Initialization**: Certain features require
changing the model weights. For example, tensor parallelism needs to shard the
model weights, and quantization needs to quantize the model weights. There are
two possible ways to implement this feature. One way is to change the model
weights after the model is initialized. The other way is to change the model
weights during the model initialization. vLLM chooses the latter. The first
approach is not scalable to large models. Suppose we want to run a 405B model
(with roughly 810GB weights) with 16 H100 80GB GPUs. Ideally, every GPU should
only load 50GB weights. If we change the model weights after the model is
initialized, we need to load the full 810GB weights to every GPU and then shard
the weights, leading to a huge memory overhead. Instead, if we shard the weights
during the model initialization, every layer will only create a shard of the
weights it needs, leading to a much smaller memory overhead. The same idea
applies to quantization. Note that we also add an additional argument ``prefix``
to the model's constructor so that the model can initialize itself differently
based on the prefix. This is useful for non-uniform quantization, where
different parts of the model are quantized differently. The ``prefix`` is
usually an empty string for the top-level model and a string like ``"vision"``
or ``"language"`` for the sub-models. In general, it matches the name of the
module's state dict in the checkpoint file.
One disadvantage of this design is that it is hard to write unit tests for
individual components in vLLM because every component needs to be initialized by
a complete config object. We solve this problem by providing a default
initialization function that creates a default config object with all fields set
to ``None``. If the component we want to test only cares about a few fields in
the config object, we can create a default config object and set the fields we
care about. This way, we can test the component in isolation. Note that many
tests in vLLM are end-to-end tests that test the whole system, so this is not a
big problem.
In summary, the complete config object ``VllmConfig`` can be treated as an
engine-level global state that is shared among all vLLM classes.
.. _huggingface_integration:
Integration with HuggingFace
===================================
This document describes how vLLM integrates with HuggingFace libraries. We will explain step by step what happens under the hood when we run ``vllm serve``.
Let's say we want to serve the popular QWen model by running ``vllm serve Qwen/Qwen2-7B``.
1. The ``model`` argument is ``Qwen/Qwen2-7B``. vLLM determines whether this model exists by checking for the corresponding config file ``config.json``. See this `code snippet <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L162-L182>`__ for the implementation. Within this process:
- If the ``model`` argument corresponds to an existing local path, vLLM will load the config file directly from this path.
- If the ``model`` argument is a HuggingFace model ID consisting of a username and model name, vLLM will first try to use the config file from the HuggingFace local cache, using the ``model`` argument as the model name and the ``--revision`` argument as the revision. See `their website <https://huggingface.co/docs/huggingface_hub/en/package_reference/environment_variables#hfhome>`__ for more information on how the HuggingFace cache works.
- If the ``model`` argument is a HuggingFace model ID but it is not found in the cache, vLLM will download the config file from the HuggingFace model hub. Refer to `this function <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L91>`__ for the implementation. The input arguments include the ``model`` argument as the model name, the ``--revision`` argument as the revision, and the environment variable ``HF_TOKEN`` as the token to access the model hub. In our case, vLLM will download the `config.json <https://huggingface.co/Qwen/Qwen2-7B/blob/main/config.json>`__ file.
2. After confirming the existence of the model, vLLM loads its config file and converts it into a dictionary. See this `code snippet <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L185-L186>`__ for the implementation.
3. Next, vLLM `inspects <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L189>`__ the ``model_type`` field in the config dictionary to `generate <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#190-L216>`__ the config object to use. There are some ``model_type`` values that vLLM directly supports; see `here <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/transformers_utils/config.py#L48>`__ for the list. If the ``model_type`` is not in the list, vLLM will use `AutoConfig.from_pretrained <https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoConfig.from_pretrained>`__ to load the config class, with ``model``, ``--revision``, and ``--trust_remote_code`` as the arguments. Please note that:
- HuggingFace also has its own logic to determine the config class to use. It will again use the ``model_type`` field to search for the class name in the transformers library; see `here <https://github.com/huggingface/transformers/tree/main/src/transformers/models>`__ for the list of supported models. If the ``model_type`` is not found, HuggingFace will use the ``auto_map`` field from the config JSON file to determine the class name. Specifically, it is the ``AutoConfig`` field under ``auto_map``. See `DeepSeek <https://huggingface.co/deepseek-ai/DeepSeek-V2.5/blob/main/config.json>`__ for an example.
- The ``AutoConfig`` field under ``auto_map`` points to a module path in the model's repository. To create the config class, HuggingFace will import the module and use the ``from_pretrained`` method to load the config class. This can generally cause arbitrary code execution, so it is only executed when ``--trust_remote_code`` is enabled.
4. Subsequently, vLLM applies some historical patches to the config object. These are mostly related to RoPE configuration; see `here <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/config.py#L244>`__ for the implementation.
5. Finally, vLLM can reach the model class we want to initialize. vLLM uses the ``architectures`` field in the config object to determine the model class to initialize, as it maintains the mapping from architecture name to model class in `its registry <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/registry.py#L80>`__. If the architecture name is not found in the registry, it means this model architecture is not supported by vLLM. For ``Qwen/Qwen2-7B``, the ``architectures`` field is ``["Qwen2ForCausalLM"]``, which corresponds to the ``Qwen2ForCausalLM`` class in `vLLM's code <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/model_executor/models/qwen2.py#L364>`__. This class will initialize itself depending on various configs.
Beyond that, there are two more things vLLM depends on HuggingFace for.
1. **Tokenizer**: vLLM uses the tokenizer from HuggingFace to tokenize the input text. The tokenizer is loaded using `AutoTokenizer.from_pretrained <https://huggingface.co/docs/transformers/en/model_doc/auto#transformers.AutoTokenizer.from_pretrained>`__ with the ``model`` argument as the model name and the ``--revision`` argument as the revision. It is also possible to use a tokenizer from another model by specifying the ``--tokenizer`` argument in the ``vllm serve`` command. Other relevant arguments are ``--tokenizer-revision`` and ``--tokenizer-mode``. Please check HuggingFace's documentation for the meaning of these arguments. This part of the logic can be found in the `get_tokenizer <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L87>`__ function. After obtaining the tokenizer, notably, vLLM will cache some expensive attributes of the tokenizer in `get_cached_tokenizer <https://github.com/vllm-project/vllm/blob/127c07480ecea15e4c2990820c457807ff78a057/vllm/transformers_utils/tokenizer.py#L24>`__.
2. **Model weight**: vLLM downloads the model weight from the HuggingFace model hub using the ``model`` argument as the model name and the ``--revision`` argument as the revision. vLLM provides the argument ``--load-format`` to control what files to download from the model hub. By default, it will try to load the weights in the safetensors format and fall back to the PyTorch bin format if the safetensors format is not available. We can also pass ``--load-format dummy`` to skip downloading the weights.
- It is recommended to use the safetensors format, as it is efficient for loading in distributed inference and also safe from arbitrary code execution. See the `documentation <https://huggingface.co/docs/safetensors/en/index>`__ for more information on the safetensors format. This part of the logic can be found `here <https://github.com/vllm-project/vllm/blob/10b67d865d92e376956345becafc249d4c3c0ab7/vllm/model_executor/model_loader/loader.py#L385>`__. Please note that:
This completes the integration between vLLM and HuggingFace.
In summary, vLLM reads the config file ``config.json``, tokenizer, and model weight from the HuggingFace model hub or a local directory. It uses the config class from either vLLM, HuggingFace transformers, or loads the config class from the model's repository.
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment