Commit 4851c202 authored by zhuwenwen's avatar zhuwenwen
Browse files

Merge tag 'v0.6.1' into v0.6.1-dev

parents 9b902f9e 3fd2b0d2
...@@ -11,8 +11,9 @@ import uvloop ...@@ -11,8 +11,9 @@ import uvloop
from tqdm import tqdm from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer, from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase) PreTrainedTokenizerBase)
from vllm.inputs import PromptInputs from vllm.inputs import PromptInputs
from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs from vllm.engine.arg_utils import DEVICE_OPTIONS, AsyncEngineArgs, EngineArgs
from vllm.entrypoints.openai.api_server import ( from vllm.entrypoints.openai.api_server import (
build_async_engine_client_from_engine_args) build_async_engine_client_from_engine_args)
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
...@@ -504,13 +505,11 @@ if __name__ == "__main__": ...@@ -504,13 +505,11 @@ if __name__ == "__main__":
'accuracy issues. FP8_E5M2 (without scaling) is only supported on ' 'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is ' 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.') 'instead supported for common inference criteria.')
parser.add_argument( parser.add_argument("--device",
"--device", type=str,
type=str, default="auto",
default="auto", choices=DEVICE_OPTIONS,
choices=["auto", "cuda", "cpu", "openvino", "tpu", "xpu"], help='device type for vLLM execution')
help='device type for vLLM execution, supporting CUDA, OpenVINO and '
'CPU.')
parser.add_argument( parser.add_argument(
"--num-scheduler-steps", "--num-scheduler-steps",
type=int, type=int,
......
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_STANDARD 17)
# #
# Define environment variables for special configurations # Define environment variables for special configurations
...@@ -83,12 +84,7 @@ endif() ...@@ -83,12 +84,7 @@ endif()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
list(APPEND LIBS "numa") list(APPEND LIBS dnnl numa)
#
# Define extension targets
#
# #
# _C extension # _C extension
...@@ -102,6 +98,16 @@ set(VLLM_EXT_SRC ...@@ -102,6 +98,16 @@ set(VLLM_EXT_SRC
"csrc/cpu/pos_encoding.cpp" "csrc/cpu/pos_encoding.cpp"
"csrc/cpu/torch_bindings.cpp") "csrc/cpu/torch_bindings.cpp")
if (AVX512_FOUND AND NOT AVX512_DISABLED)
set(VLLM_EXT_SRC
"csrc/cpu/quant.cpp"
${VLLM_EXT_SRC})
endif()
#
# Define extension targets
#
define_gpu_extension_target( define_gpu_extension_target(
_C _C
DESTINATION vllm DESTINATION vllm
......
...@@ -355,6 +355,7 @@ function (define_gpu_extension_target GPU_MOD_NAME) ...@@ -355,6 +355,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
target_include_directories(${GPU_MOD_NAME} PRIVATE csrc target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
${GPU_INCLUDE_DIRECTORIES}) ${GPU_INCLUDE_DIRECTORIES})
# TODO: is torch_python_LIBRARY needed?
target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY} target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
${GPU_LIBRARIES}) ${GPU_LIBRARIES})
......
...@@ -24,8 +24,8 @@ namespace vec_op { ...@@ -24,8 +24,8 @@ namespace vec_op {
#define CPU_KERNEL_GUARD_OUT(NAME) #define CPU_KERNEL_GUARD_OUT(NAME)
#else #else
#define CPU_KERNEL_GUARD_IN(NAME) \ #define CPU_KERNEL_GUARD_IN(NAME) \
std::cout << #NAME << " invoked." << std::endl; RECORD_FUNCTION(#NAME, c10::ArrayRef<c10::IValue>({}));
#define CPU_KERNEL_GUARD_OUT(NAME) std::cout << #NAME << " exit." << std::endl; #define CPU_KERNEL_GUARD_OUT(NAME)
#endif #endif
#define FORCE_INLINE __attribute__((always_inline)) inline #define FORCE_INLINE __attribute__((always_inline)) inline
...@@ -106,6 +106,12 @@ struct BF16Vec16 : public Vec<BF16Vec16> { ...@@ -106,6 +106,12 @@ struct BF16Vec16 : public Vec<BF16Vec16> {
explicit BF16Vec16(const FP32Vec16 &); explicit BF16Vec16(const FP32Vec16 &);
void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; } void save(void *ptr) const { *reinterpret_cast<__m256i *>(ptr) = reg; }
void save(void* ptr, const int elem_num) const {
constexpr uint32_t M = 0xFFFFFFFF;
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
_mm256_mask_storeu_epi16(ptr, mask, reg);
}
}; };
#ifdef __AVX512F__ #ifdef __AVX512F__
...@@ -313,8 +319,28 @@ struct FP32Vec16 : public Vec<FP32Vec16> { ...@@ -313,8 +319,28 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
return FP32Vec16(_mm512_div_ps(reg, b.reg)); return FP32Vec16(_mm512_div_ps(reg, b.reg));
} }
FP32Vec16 clamp(const FP32Vec16& min, const FP32Vec16& max) const {
return FP32Vec16(_mm512_min_ps(max.reg, _mm512_max_ps(min.reg, reg)));
}
FP32Vec16 max(const FP32Vec16& b) const {
return FP32Vec16(_mm512_max_ps(reg, b.reg));
}
FP32Vec16 max(const FP32Vec16& b, const int elem_num) const {
constexpr uint32_t M = 0xFFFFFFFF;
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg));
}
FP32Vec16 abs() const {
return FP32Vec16(_mm512_abs_ps(reg));
}
float reduce_sum() const { return _mm512_reduce_add_ps(reg); } float reduce_sum() const { return _mm512_reduce_add_ps(reg); }
float reduce_max() const { return _mm512_reduce_max_ps(reg); }
template <int group_size> float reduce_sub_sum(int idx) { template <int group_size> float reduce_sub_sum(int idx) {
static_assert(VEC_ELEM_NUM % group_size == 0); static_assert(VEC_ELEM_NUM % group_size == 0);
constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
...@@ -323,6 +349,12 @@ struct FP32Vec16 : public Vec<FP32Vec16> { ...@@ -323,6 +349,12 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
} }
void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); } void save(float *ptr) const { _mm512_storeu_ps(ptr, reg); }
void save(float* ptr, const int elem_num) const {
constexpr uint32_t M = 0xFFFFFFFF;
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
_mm512_mask_storeu_ps(ptr, mask, reg);
}
}; };
#else #else
struct FP32Vec16 : public Vec<FP32Vec16> { struct FP32Vec16 : public Vec<FP32Vec16> {
...@@ -433,6 +465,32 @@ struct FP32Vec16 : public Vec<FP32Vec16> { ...@@ -433,6 +465,32 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
}; };
#endif #endif
#ifdef __AVX512F__
struct INT8Vec16: public Vec<INT8Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
union AliasReg {
__m128i reg;
int8_t values[VEC_ELEM_NUM];
};
__m128i reg;
explicit INT8Vec16(const FP32Vec16& vec) : reg(
_mm512_cvtepi32_epi8(_mm512_cvt_roundps_epi32(vec.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))
) {}
void save(int8_t* ptr) const {
_mm_storeu_epi8(ptr, reg);
}
void save(int8_t* ptr, const int elem_num) const {
constexpr uint32_t M = 0xFFFFFFFF;
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
_mm_mask_storeu_epi8(ptr, mask, reg);
}
};
#endif
template <typename T> struct VecType { using vec_type = void; }; template <typename T> struct VecType { using vec_type = void; };
template <typename T> using vec_t = typename VecType<T>::vec_type; template <typename T> using vec_t = typename VecType<T>::vec_type;
......
#ifndef DNNL_HELPER_HPP
#define DNNL_HELPER_HPP
#include <c10/util/BFloat16.h>
#include "oneapi/dnnl/dnnl.hpp"
namespace {
template <typename T>
struct DNNLType {
static constexpr dnnl::memory::data_type type =
dnnl::memory::data_type::undef;
};
template <>
struct DNNLType<int8_t> {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s8;
};
template <>
struct DNNLType<int32_t> {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::s32;
};
template <>
struct DNNLType<float> {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::f32;
};
template <>
struct DNNLType<c10::BFloat16> {
static constexpr dnnl::memory::data_type type = dnnl::memory::data_type::bf16;
};
template <typename T>
constexpr inline dnnl::memory::data_type get_dnnl_type() {
return DNNLType<std::decay_t<T>>::type;
}
}; // namespace
template <bool InputNoScale>
class DNNLPrimitiveHelper {
public:
// I8 input GEMM kernel (C = a_scales * A @ (b_scales * B^T) + bias)
// A: [M, K], row-major
// B: [K, N], column-major
// C: [M, N], row-major
// bias: [N], row-major, optional
// a_scales: [MS]
// b_scales: [NS]
// Note: Due to the limitation of oneDNN
// (https://github.com/oneapi-src/oneDNN/issues/1636), the quantized bias is
// not supported.
template <typename OutputT, typename BiasT>
static void gemm_s8s8_jit(const int8_t* a, const int8_t* b, OutputT* c,
const BiasT* bias, dnnl_dim_t M, dnnl_dim_t N,
dnnl_dim_t K, const float* a_scales,
const float* b_scales, dnnl_dim_t MS,
dnnl_dim_t NS) {
auto&& OutputType = get_dnnl_type<OutputT>();
auto&& BiasType = get_dnnl_type<BiasT>();
dnnl::memory::desc a_md({M, K}, dnnl::memory::data_type::s8, {K, 1});
dnnl::memory::desc b_md({K, N}, dnnl::memory::data_type::s8, {1, K});
dnnl::memory::desc c_md({M, N}, OutputType, {N, 1});
dnnl::primitive_attr attr;
if constexpr (!InputNoScale) {
if (MS == 1) {
// per-tensor
attr.set_scales_mask(DNNL_ARG_SRC, 0);
} else {
// per-token
TORCH_CHECK(false, "per-token quantization is unsupported.");
}
}
if (NS == 1) {
// per-tensor
attr.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
} else {
// per-channel
attr.set_scales_mask(DNNL_ARG_WEIGHTS, 2);
}
dnnl::matmul::primitive_desc matmul_pd;
if (bias) {
dnnl::memory::desc bias_md({1, N}, BiasType, {N, 1});
matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
bias_md, c_md, attr);
} else {
matmul_pd = dnnl::matmul::primitive_desc(default_engine(), a_md, b_md,
c_md, attr);
}
dnnl::matmul matmul(matmul_pd);
auto& engine = default_engine();
dnnl::memory a_m(a_md, engine, (void*)a);
dnnl::memory b_m(b_md, engine, (void*)b);
dnnl::memory c_m(c_md, engine, (void*)c);
dnnl::memory a_scales_m({{MS}, dnnl::memory::data_type::f32, {1}}, engine,
(void*)a_scales);
dnnl::memory b_scales_m({{NS}, dnnl::memory::data_type::f32, {1}}, engine,
(void*)b_scales);
auto& stream = default_stream();
if constexpr (InputNoScale) {
if (bias) {
dnnl::memory::desc bias_md({N}, BiasType, {1});
dnnl::memory bias_m(bias_md, engine, (void*)bias);
matmul.execute(
stream, {
{DNNL_ARG_SRC, a_m},
{DNNL_ARG_WEIGHTS, b_m},
{DNNL_ARG_BIAS, bias_m},
{DNNL_ARG_DST, c_m},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
});
} else {
matmul.execute(
stream, {
{DNNL_ARG_SRC, a_m},
{DNNL_ARG_WEIGHTS, b_m},
{DNNL_ARG_DST, c_m},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
});
}
} else {
if (bias) {
dnnl::memory::desc bias_md({N}, BiasType, {1});
dnnl::memory bias_m(bias_md, engine, (void*)bias);
matmul.execute(
stream, {
{DNNL_ARG_SRC, a_m},
{DNNL_ARG_WEIGHTS, b_m},
{DNNL_ARG_BIAS, bias_m},
{DNNL_ARG_DST, c_m},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
});
} else {
matmul.execute(
stream, {
{DNNL_ARG_SRC, a_m},
{DNNL_ARG_WEIGHTS, b_m},
{DNNL_ARG_DST, c_m},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, a_scales_m},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, b_scales_m},
});
}
}
stream.wait();
}
private:
static dnnl::engine& default_engine() {
static dnnl::engine engine(dnnl::engine::kind::cpu, 0);
return engine;
}
static dnnl::stream& default_stream() {
static dnnl::stream stream(default_engine());
return stream;
}
};
#endif
#include "cpu_types.hpp"
#include "dnnl_helper.hpp"
namespace {
template <typename scalar_t>
struct KernelVecType {
using load_vec_type = void;
using cvt_vec_type = void;
};
template <>
struct KernelVecType<float> {
using load_vec_type = vec_op::FP32Vec16;
using cvt_vec_type = vec_op::FP32Vec16;
};
template <>
struct KernelVecType<c10::BFloat16> {
using load_vec_type = vec_op::BF16Vec16;
using cvt_vec_type = vec_op::FP32Vec16;
};
#ifdef __AVX512F__
template <typename scalar_t>
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
const float* scale, const int num_tokens,
const int hidden_size) {
using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
constexpr float i8_min =
static_cast<float>(std::numeric_limits<int8_t>::min());
constexpr float i8_max =
static_cast<float>(std::numeric_limits<int8_t>::max());
const cvt_vec_t inv_scale(1.0 / *scale);
const cvt_vec_t i8_min_vec(i8_min);
const cvt_vec_t i8_max_vec(i8_max);
#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
int j = 0;
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
vec_op::INT8Vec16 elems_int8(elems_fp32);
elems_int8.save(output + i * hidden_size + j);
}
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
elems_fp32 = (elems_fp32 * inv_scale).clamp(i8_min_vec, i8_max_vec);
vec_op::INT8Vec16 elems_int8(elems_fp32);
if (j + vec_elem_num == hidden_size) {
elems_int8.save(output + i * hidden_size + j);
} else {
elems_int8.save(output + i * hidden_size + j, hidden_size - j);
}
}
}
template <typename scalar_t>
void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
float* scale, const int num_tokens,
const int hidden_size) {
using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
cvt_vec_t max_abs(0.0);
{
int j = 0;
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
max_abs = max_abs.max(elems_fp32.abs());
}
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
if (j + vec_elem_num == hidden_size) {
max_abs = max_abs.max(elems_fp32.abs());
} else {
max_abs = max_abs.max(elems_fp32.abs(), hidden_size - j);
}
}
float scale_val = max_abs.reduce_max() / 127.0f;
scale[i] = scale_val;
const cvt_vec_t inv_scale(1.0 / scale_val);
{
int j = 0;
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
elems_fp32 = (elems_fp32 * inv_scale);
vec_op::INT8Vec16 elems_int8(elems_fp32);
elems_int8.save(output + i * hidden_size + j);
}
load_vec_t elems(input + i * hidden_size + j);
cvt_vec_t elems_fp32(elems);
elems_fp32 = (elems_fp32 * inv_scale);
vec_op::INT8Vec16 elems_int8(elems_fp32);
if (j + vec_elem_num == hidden_size) {
elems_int8.save(output + i * hidden_size + j);
} else {
elems_int8.save(output + i * hidden_size + j, hidden_size - j);
}
}
}
}
template <bool Bias, typename scalar_t>
void dynamic_output_scale_impl(const float* input, scalar_t* output,
const float* scale, const scalar_t* bias,
const int num_tokens, const int hidden_size) {
CPU_KERNEL_GUARD_IN(dynamic_output_scale_impl)
using load_vec_t = typename KernelVecType<scalar_t>::load_vec_type;
using cvt_vec_t = typename KernelVecType<scalar_t>::cvt_vec_type;
constexpr int vec_elem_num = load_vec_t::VEC_ELEM_NUM;
#pragma omp parallel for
for (int i = 0; i < num_tokens; ++i) {
int j = 0;
cvt_vec_t token_scale_vec(scale[i]);
for (; j < hidden_size - vec_elem_num; j += vec_elem_num) {
cvt_vec_t elems_fp32(input + i * hidden_size + j);
elems_fp32 = elems_fp32 * token_scale_vec;
if constexpr (Bias) {
load_vec_t bias_vec(bias + j);
cvt_vec_t bias_vec_fp32(bias_vec);
elems_fp32 = elems_fp32 + bias_vec_fp32;
}
load_vec_t elems_out(elems_fp32);
elems_out.save(output + i * hidden_size + j);
}
cvt_vec_t elems_fp32(input + i * hidden_size + j);
elems_fp32 = elems_fp32 * token_scale_vec;
if constexpr (Bias) {
load_vec_t bias_vec(bias + j);
cvt_vec_t bias_vec_fp32(bias_vec);
elems_fp32 = elems_fp32 + bias_vec_fp32;
}
load_vec_t elems_out(elems_fp32);
if (j + vec_elem_num == hidden_size) {
elems_out.save(output + i * hidden_size + j);
} else {
elems_out.save(output + i * hidden_size + j, hidden_size - j);
}
}
}
#else
template <typename scalar_t>
void static_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
const float* scale, const int num_tokens,
const int hidden_size) {
TORCH_CHECK(false, "static_scaled_int8_quant_impl requires AVX512 support.")
}
template <typename scalar_t>
void dynamic_scaled_int8_quant_impl(const scalar_t* input, int8_t* output,
float* scale, const int num_tokens,
const int hidden_size) {
TORCH_CHECK(false, "dynamic_scaled_int8_quant_impl requires AVX512 support.")
}
template <typename scalar_t>
void dynamic_output_scale_impl() {
TORCH_CHECK(false, "dynamic_output_scale_impl requires AVX512 support.")
}
#endif
} // namespace
void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major
const torch::Tensor& a, // [M, IC], row-major
const torch::Tensor& b, // [IC, OC], column-major
const torch::Tensor& a_scales, // [1] or [M]
const torch::Tensor& b_scales, // [1] or [OC]
const c10::optional<torch::Tensor>& bias // [OC]
) {
CPU_KERNEL_GUARD_IN(cutlass_scaled_mm)
// Checks for conformality
TORCH_CHECK(a.dtype() == torch::kInt8 && b.dtype() == torch::kInt8,
"int8_scaled_mm only supports INT8 inputs.")
TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2);
TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) &&
b.size(1) == c.size(1));
TORCH_CHECK(a_scales.numel() == 1 || a_scales.numel() == a.size(0));
TORCH_CHECK(b_scales.numel() == 1 || b_scales.numel() == b.size(1));
// Check for strides and alignment
TORCH_CHECK(a.stride(1) == 1 && c.stride(1) == 1); // Row-major
TORCH_CHECK(b.stride(0) == 1); // Column-major
TORCH_CHECK(c.stride(0) % 16 == 0 &&
b.stride(1) % 16 == 0); // 16 Byte Alignment
TORCH_CHECK(a_scales.is_contiguous() && b_scales.is_contiguous());
if (bias) {
TORCH_CHECK(bias->numel() == b.size(1) && bias->is_contiguous() &&
bias->dim() == 1);
}
VLLM_DISPATCH_FLOATING_TYPES(c.scalar_type(), "cutlass_scaled_mm", [&] {
if (a_scales.numel() != 1) {
// per-token
// Note: oneDNN doesn't support per-token activation quantization
torch::Tensor tmp_fp32_out =
torch::empty_like(c, ::at::ScalarType::Float);
DNNLPrimitiveHelper<true>::gemm_s8s8_jit(
a.data_ptr<int8_t>(), b.data_ptr<int8_t>(),
tmp_fp32_out.data_ptr<float>(), (void*)(0), a.size(0), b.size(1),
a.size(1), (float*)(0), b_scales.data_ptr<float>(), 0,
b_scales.numel());
if (bias.has_value()) {
dynamic_output_scale_impl<true>(
tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
a_scales.data_ptr<float>(), bias->data_ptr<scalar_t>(), c.size(0),
c.size(1));
} else {
dynamic_output_scale_impl<false>(
tmp_fp32_out.data_ptr<float>(), c.data_ptr<scalar_t>(),
a_scales.data_ptr<float>(), (scalar_t*)(0), c.size(0), c.size(1));
}
} else {
// per-tensor
if (bias.has_value()) {
DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
bias->data_ptr<scalar_t>(), a.size(0), b.size(1), a.size(1),
a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
a_scales.numel(), b_scales.numel());
} else {
DNNLPrimitiveHelper<false>::gemm_s8s8_jit(
a.data_ptr<int8_t>(), b.data_ptr<int8_t>(), c.data_ptr<scalar_t>(),
(void*)(0), a.size(0), b.size(1), a.size(1),
a_scales.data_ptr<float>(), b_scales.data_ptr<float>(),
a_scales.numel(), b_scales.numel());
}
}
});
}
// static-per-tensor quantization.
void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size]
const torch::Tensor& input, // [..., hidden_size]
const torch::Tensor& scale) {
CPU_KERNEL_GUARD_IN(static_scaled_int8_quant)
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
TORCH_CHECK(scale.numel() == 1);
const int hidden_size = input.size(-1);
const int num_tokens = input.numel() / hidden_size;
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "static_scaled_int8_quant_impl", [&] {
static_scaled_int8_quant_impl(
input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
scale.data_ptr<float>(), num_tokens, hidden_size);
});
}
// dynamic-per-token quantization.
void dynamic_scaled_int8_quant(
torch::Tensor& out, // [..., hidden_size]
const torch::Tensor& input, // [..., hidden_size]
torch::Tensor& scale // [..., 1]
) {
CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant)
TORCH_CHECK(input.is_contiguous());
TORCH_CHECK(out.is_contiguous());
int const hidden_size = input.size(-1);
int const num_tokens = input.numel() / hidden_size;
VLLM_DISPATCH_FLOATING_TYPES(
input.scalar_type(), "dynamic_scaled_int8_quant_impl", [&] {
dynamic_scaled_int8_quant_impl(
input.data_ptr<scalar_t>(), out.data_ptr<int8_t>(),
scale.data_ptr<float>(), num_tokens, hidden_size);
});
}
...@@ -4,7 +4,12 @@ ...@@ -4,7 +4,12 @@
#include <torch/library.h> #include <torch/library.h>
void init_cpu_threads_env(const std::string& cpu_ids); std::string init_cpu_threads_env(const std::string& cpu_ids);
void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
const torch::Tensor& b, const torch::Tensor& a_scales,
const torch::Tensor& b_scales,
const c10::optional<torch::Tensor>& bias);
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops // vLLM custom ops
...@@ -27,8 +32,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -27,8 +32,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// PagedAttention V2. // PagedAttention V2.
ops.def( ops.def(
"paged_attention_v2(" "paged_attention_v2("
" Tensor! out, Tensor exp_sums, Tensor max_logits," " Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
" Tensor tmp_out, Tensor query, Tensor key_cache," " Tensor! tmp_out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale," " Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size," " Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes," " int max_seq_len, Tensor? alibi_slopes,"
...@@ -84,6 +89,28 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -84,6 +89,28 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor! key, int head_size," " Tensor! key, int head_size,"
" Tensor cos_sin_cache, bool is_neox) -> ()"); " Tensor cos_sin_cache, bool is_neox) -> ()");
ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding); ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
// Quantization
#ifdef __AVX512F__
// Compute int8 quantized tensor for given scaling factor.
ops.def(
"static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> "
"()");
ops.impl("static_scaled_int8_quant", torch::kCPU, &static_scaled_int8_quant);
// Compute int8 quantized tensor and scaling factor
ops.def(
"dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
"()");
ops.impl("dynamic_scaled_int8_quant", torch::kCPU,
&dynamic_scaled_int8_quant);
// W8A8 GEMM, supporting symmetric per-tensor or per-row/column
// quantization.
ops.def(
"cutlass_scaled_mm(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor? bias) -> ()");
ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
#endif
} }
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
...@@ -95,8 +122,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { ...@@ -95,8 +122,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
// Copy the cache blocks from src to dst. // Copy the cache blocks from src to dst.
cache_ops.def( cache_ops.def(
"copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor " "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
"block_mapping) -> ()"); "Tensor block_mapping) -> ()");
cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks); cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);
// Reshape the key and value tensors and cache them. // Reshape the key and value tensors and cache them.
...@@ -111,7 +138,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { ...@@ -111,7 +138,7 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) { TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _utils), utils) {
// CPU utils // CPU utils
utils.def("init_cpu_threads_env(str cpu_ids) -> ()", &init_cpu_threads_env); utils.def("init_cpu_threads_env(str cpu_ids) -> str", &init_cpu_threads_env);
} }
REGISTER_EXTENSION(TORCH_EXTENSION_NAME) REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
...@@ -5,7 +5,7 @@ ...@@ -5,7 +5,7 @@
#include "cpu_types.hpp" #include "cpu_types.hpp"
void init_cpu_threads_env(const std::string& cpu_ids) { std::string init_cpu_threads_env(const std::string& cpu_ids) {
bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str()); bitmask* omp_cpu_mask = numa_parse_cpustring(cpu_ids.c_str());
TORCH_CHECK(omp_cpu_mask->size > 0); TORCH_CHECK(omp_cpu_mask->size > 0);
std::vector<int> omp_cpu_ids; std::vector<int> omp_cpu_ids;
...@@ -51,15 +51,40 @@ void init_cpu_threads_env(const std::string& cpu_ids) { ...@@ -51,15 +51,40 @@ void init_cpu_threads_env(const std::string& cpu_ids) {
torch::set_num_threads((int)omp_cpu_ids.size()); torch::set_num_threads((int)omp_cpu_ids.size());
TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads()); TORCH_CHECK_EQ(omp_cpu_ids.size(), torch::get_num_threads());
TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads()); TORCH_CHECK_EQ(omp_cpu_ids.size(), omp_get_max_threads());
std::vector<std::pair<int, int>> thread_core_mapping;
thread_core_mapping.reserve(omp_cpu_ids.size());
omp_lock_t writelock;
omp_init_lock(&writelock);
#pragma omp parallel for schedule(static, 1) #pragma omp parallel for schedule(static, 1)
for (size_t i = 0; i < omp_cpu_ids.size(); ++i) { for (size_t i = 0; i < omp_cpu_ids.size(); ++i) {
cpu_set_t* mask = CPU_ALLOC(omp_cpu_mask->size); cpu_set_t mask;
size_t size = CPU_ALLOC_SIZE(omp_cpu_mask->size); CPU_ZERO(&mask);
CPU_ZERO_S(size, mask); CPU_SET(omp_cpu_ids[i], &mask);
CPU_SET_S(omp_cpu_ids[i], size, mask); int ret = sched_setaffinity(0, sizeof(cpu_set_t), &mask);
sched_setaffinity(0, sizeof(cpu_set_t), mask); if (ret == -1) {
CPU_FREE(mask); TORCH_CHECK(false,
"sched_setaffinity failed. errno: " + std::to_string(errno));
}
omp_set_lock(&writelock);
thread_core_mapping.emplace_back(gettid(), omp_cpu_ids[i]);
omp_unset_lock(&writelock);
} }
omp_destroy_lock(&writelock);
numa_free_nodemask(omp_cpu_mask); numa_free_nodemask(omp_cpu_mask);
std::stringstream ss;
ss << "OMP threads binding of Process " << getpid() << ":\n";
std::sort(thread_core_mapping.begin(), thread_core_mapping.end(),
[](auto&& a, auto&& b) { return a.second < b.second; });
for (auto&& item : thread_core_mapping) {
ss << "\t"
<< "OMP tid: " << item.first << ", core " << item.second << "\n";
}
return ss.str();
} }
...@@ -1737,4 +1737,4 @@ torch::Tensor marlin_gemm_moe( ...@@ -1737,4 +1737,4 @@ torch::Tensor marlin_gemm_moe(
moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k, moe_block_size, dev, at::cuda::getCurrentCUDAStream(dev), thread_k,
thread_n, sms, max_par, replicate_input, apply_weights); thread_n, sms, max_par, replicate_input, apply_weights);
return c; return c;
} }
\ No newline at end of file
...@@ -9,4 +9,4 @@ torch::Tensor marlin_gemm_moe( ...@@ -9,4 +9,4 @@ torch::Tensor marlin_gemm_moe(
const torch::Tensor& g_idx, const torch::Tensor& perm, const torch::Tensor& g_idx, const torch::Tensor& perm,
torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k, torch::Tensor& workspace, int64_t size_m, int64_t size_n, int64_t size_k,
bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size, bool is_k_full, int64_t num_experts, int64_t topk, int64_t moe_block_size,
bool replicate_input, bool apply_weights); bool replicate_input, bool apply_weights);
\ No newline at end of file
...@@ -16,7 +16,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) { ...@@ -16,7 +16,6 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
"g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int " "g_idx, Tensor! perm, Tensor! workspace, int size_m, int size_n, int "
"size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, " "size_k, bool is_k_full, int num_experts, int topk, int moe_block_size, "
"bool replicate_input, bool apply_weights) -> Tensor"); "bool replicate_input, bool apply_weights) -> Tensor");
m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe); m.impl("marlin_gemm_moe", torch::kCUDA, &marlin_gemm_moe);
#endif #endif
} }
......
...@@ -165,9 +165,17 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, ...@@ -165,9 +165,17 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
int64_t size_k, int64_t size_n, int64_t size_k, int64_t size_n,
int64_t num_bits); int64_t num_bits);
torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
torch::Tensor& perm, c10::SymInt size_k,
c10::SymInt size_n, int64_t num_bits);
torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k, torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
int64_t size_n, int64_t num_bits); int64_t size_n, int64_t num_bits);
torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
c10::SymInt size_k, c10::SymInt size_n,
int64_t num_bits);
torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m, torch::Tensor ggml_dequantize(torch::Tensor W, int64_t type, int64_t m,
int64_t n); int64_t n);
...@@ -212,9 +220,6 @@ void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, ...@@ -212,9 +220,6 @@ void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
torch::Tensor& scales); torch::Tensor& scales);
void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor lookup_table);
// torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, // torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
// torch::Tensor b_gptq_qzeros, // torch::Tensor b_gptq_qzeros,
// torch::Tensor b_gptq_scales, torch::Tensor b_g_idx, // torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
......
...@@ -267,3 +267,15 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k, ...@@ -267,3 +267,15 @@ torch::Tensor awq_marlin_repack(torch::Tensor& b_q_weight, int64_t size_k,
} }
#endif #endif
torch::Tensor awq_marlin_repack_meta(torch::Tensor& b_q_weight,
c10::SymInt size_k, c10::SymInt size_n,
int64_t num_bits) {
int const pack_factor = 32 / num_bits;
auto options = torch::TensorOptions()
.dtype(b_q_weight.dtype())
.device(b_q_weight.device());
return torch::empty_symint(
{size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
options);
}
...@@ -342,3 +342,15 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm, ...@@ -342,3 +342,15 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
} }
#endif #endif
torch::Tensor gptq_marlin_repack_meta(torch::Tensor& b_q_weight,
torch::Tensor& perm, c10::SymInt size_k,
c10::SymInt size_n, int64_t num_bits) {
int const pack_factor = 32 / num_bits;
auto options = torch::TensorOptions()
.dtype(b_q_weight.dtype())
.device(b_q_weight.device());
return torch::empty_symint(
{size_k / marlin::tile_size, size_n * marlin::tile_size / pack_factor},
options);
}
#include <torch/all.h>
#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_fp16.h>
// half-tensor
#include <c10/cuda/CUDAStream.h>
#include <ATen/cuda/CUDATensorMethods.cuh>
#include <c10/cuda/CUDAGuard.h>
#define BLOCKWIDTH 128
#define BLOCKHEIGHT4 16
namespace vllm {
namespace squeezellm {
__device__ inline unsigned int as_unsigned(int i) {
return *reinterpret_cast<unsigned int*>(&i);
}
// 4-bit matvec kernel (LUT-based)
__global__ void NUQ4MatMulKernel(
#ifndef USE_ROCM
const half2* __restrict__ vec,
#else
const __half2* __restrict__ vec,
#endif
const int* __restrict__ mat,
#ifndef USE_ROCM
half2* __restrict__ mul,
#else
float2* __restrict__ mul,
#endif
const __half* __restrict__ lookup_table, int height, int width, int batch,
int vec_height) {
const int blockwidth2 = BLOCKWIDTH / 2;
int row = BLOCKHEIGHT4 * blockIdx.x;
int col = BLOCKWIDTH * blockIdx.y + threadIdx.x;
#ifndef USE_ROCM
__shared__ half2 blockvec[blockwidth2];
#else
__shared__ __half2 blockvec[blockwidth2];
#endif
__shared__ __half deq2[16][BLOCKWIDTH];
int off = threadIdx.x;
int column_offset = col * 16;
for (int val = 0; val < 16; val += 1) {
int lut_index = column_offset + val;
deq2[val][off] = lookup_table[lut_index];
}
__half res;
#ifndef USE_ROCM
half2 res2;
half2 tmp2;
#else
__half2 res2;
__half2 tmp2;
#endif
int i;
int k;
unsigned int tmp1;
unsigned int lut_index1, lut_index2;
for (int b = 0; b < batch; ++b) {
i = width * row + col;
res = __int2half_rd(0);
k = 0;
__syncthreads();
if (threadIdx.x < blockwidth2)
blockvec[threadIdx.x] =
vec[b * vec_height / 2 + (row / BLOCKHEIGHT4) * blockwidth2 +
threadIdx.x];
__syncthreads();
while (k < blockwidth2) {
tmp1 = as_unsigned(mat[i]);
#ifndef USE_ROCM
res2 = {};
tmp2 = {};
#else
res2.x = __half_as_ushort(__float2half(0));
res2.y = __half_as_ushort(__float2half(0));
tmp2.x = __half_as_ushort(__float2half(0));
tmp2.y = __half_as_ushort(__float2half(0));
#endif
lut_index1 = tmp1 & 0xF;
lut_index2 = (tmp1 >> 4) & 0xF;
#ifndef USE_ROCM
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
#else
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
res2 = __hfma2(tmp2, blockvec[k + 0], res2);
lut_index1 = (tmp1 >> 8) & 0xF;
lut_index2 = (tmp1 >> 12) & 0xF;
#ifndef USE_ROCM
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
#else
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
res2 = __hfma2(tmp2, blockvec[k + 1], res2);
lut_index1 = (tmp1 >> 16) & 0xF;
lut_index2 = (tmp1 >> 20) & 0xF;
#ifndef USE_ROCM
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
#else
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
res2 = __hfma2(tmp2, blockvec[k + 2], res2);
lut_index1 = (tmp1 >> 24) & 0xF;
lut_index2 = (tmp1 >> 28) & 0xF;
#ifndef USE_ROCM
tmp2.x = deq2[lut_index1][off];
tmp2.y = deq2[lut_index2][off];
#else
tmp2.x = __half_as_ushort(deq2[lut_index1][off]);
tmp2.y = __half_as_ushort(deq2[lut_index2][off]);
#endif
res2 = __hfma2(tmp2, blockvec[k + 3], res2);
#ifndef USE_ROCM
res = __hadd(__hadd(res2.x, res2.y), res);
#else
res = __hadd(__hadd(__ushort_as_half(res2.x), __ushort_as_half(res2.y)),
res);
#endif
i += width;
k += 4;
}
// col%2 -> only set one of the two values
#ifndef USE_ROCM
half2 res3 = {};
if (col % 2 == 0) {
res3.x = res;
} else {
res3.y = res;
}
#else
__half2 res3;
res3.x = __half_as_ushort(__float2half(0));
res3.y = __half_as_ushort(__float2half(0));
if (col % 2 == 0) {
res3.x = __half_as_ushort(res);
} else {
res3.y = __half_as_ushort(res);
}
#endif
#ifndef USE_ROCM
atomicAdd(&mul[b * width / 2 + col / 2], res3);
#else
int tmp_addr = b * width / 2 + col / 2;
atomicAdd(&(mul[tmp_addr].x), __half2float(__ushort_as_half(res3.x)));
atomicAdd(&(mul[tmp_addr].y), __half2float(__ushort_as_half(res3.y)));
#endif
}
}
} // namespace squeezellm
} // namespace vllm
// 4-bit matvec kernel (LUT-based)
void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
torch::Tensor lookup_table) {
int height = mat.size(0);
int width = mat.size(1);
int batch = vec.size(0);
int vec_height = vec.size(1);
dim3 blocks((height + BLOCKHEIGHT4 - 1) / BLOCKHEIGHT4,
(width + BLOCKWIDTH - 1) / BLOCKWIDTH);
dim3 threads(BLOCKWIDTH);
const at::cuda::OptionalCUDAGuard device_guard(device_of(vec));
const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
vllm::squeezellm::NUQ4MatMulKernel<<<blocks, threads, 0, stream>>>(
#ifndef USE_ROCM
(half2*)vec.data_ptr<at::Half>(),
#else
(__half2*)vec.data_ptr<at::Half>(),
#endif
mat.data_ptr<int>(),
#ifndef USE_ROCM
(half2*)mul.data_ptr<at::Half>(),
(__half*)lookup_table.data_ptr<at::Half>(),
#else
(float2*)mul.data_ptr<float>(),
(__half*)lookup_table.data_ptr<at::Half>(),
#endif
height, width, batch, vec_height);
}
#undef BLOCKWIDTH
#undef BLOCKHEIGHT4
...@@ -36,8 +36,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -36,8 +36,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// PagedAttention V2. // PagedAttention V2.
ops.def( ops.def(
"paged_attention_v2(" "paged_attention_v2("
" Tensor! out, Tensor exp_sums, Tensor max_logits," " Tensor! out, Tensor! exp_sums, Tensor! max_logits,"
" Tensor tmp_out, Tensor query, Tensor key_cache," " Tensor! tmp_out, Tensor query, Tensor key_cache,"
" Tensor value_cache, int num_kv_heads, float scale," " Tensor value_cache, int num_kv_heads, float scale,"
" Tensor block_tables, Tensor seq_lens, int block_size," " Tensor block_tables, Tensor seq_lens, int block_size,"
" int max_seq_len, Tensor? alibi_slopes," " int max_seq_len, Tensor? alibi_slopes,"
...@@ -113,7 +113,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -113,7 +113,11 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("gelu_quick", torch::kCUDA, &gelu_quick); ops.impl("gelu_quick", torch::kCUDA, &gelu_quick);
// prepare_inputs advance_step // prepare_inputs advance_step
ops.def("advance_step", &advance_step); ops.def(
"advance_step(int num_seqs, int num_queries, int block_size, "
"Tensor! input_tokens, Tensor sampled_token_ids, "
"Tensor! input_positions, Tensor! seq_lens, Tensor! slot_mapping, "
"Tensor block_tables) -> ()");
ops.impl("advance_step", torch::kCUDA, &advance_step); ops.impl("advance_step", torch::kCUDA, &advance_step);
// Layernorm // Layernorm
...@@ -175,27 +179,56 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -175,27 +179,56 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Quantization ops // Quantization ops
#ifndef USE_ROCM #ifndef USE_ROCM
// Quantized GEMM for AQLM. // Quantized GEMM for AQLM.
ops.def("aqlm_gemm", &aqlm_gemm); ops.def(
"aqlm_gemm(Tensor input, Tensor codes, Tensor codebooks, "
"Tensor scales, int[] codebook_partition_sizes, Tensor? bias) "
"-> Tensor");
ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm); ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);
// Decompression method for AQLM. // Decompression method for AQLM.
ops.def("aqlm_dequant", &aqlm_dequant); ops.def(
"aqlm_dequant(Tensor codes, Tensor codebooks, "
"int[] codebook_partition_sizes) -> Tensor");
ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant); ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);
// Quantized GEMM for AWQ. // Quantized GEMM for AWQ.
ops.def("awq_gemm", &awq_gemm); ops.def(
"awq_gemm(Tensor _in_feats, Tensor _kernel, Tensor _scaling_factors, "
"Tensor _zeros, int split_k_iters) -> Tensor");
ops.impl("awq_gemm", torch::kCUDA, &awq_gemm); ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
// Dequantization for AWQ. // Dequantization for AWQ.
ops.def("awq_dequantize", &awq_dequantize); ops.def(
"awq_dequantize(Tensor _kernel, Tensor _scaling_factors, "
"Tensor _zeros, int split_k_iters, int thx, int thy) -> Tensor");
ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize); ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
// Note about marlin kernel 'workspace' arguments:
// Technically these should be mutable since they are modified by the kernel.
// But since they are set back to zero once the kernel is finished we can
// hand wave and say that they have no net effect.
//
// The reason to mark 'workspace' as immutable is so that they don't interfere
// with using ScalarType arguments in the ops. If they are marked as mutable,
// pytorch throws an assert in
// 'torch._higher_order_ops._register_effectful_op' that prevents these
// kernels from being torch.compile'd.
// See the following document for more info on custom types and ops that use
// custom types:
// https://docs.google.com/document/d/18fBMPuOJ0fY5ZQ6YyrHUppw9FA332CpNtgB6SOIgyuA
// Marlin (Dense) Optimized Quantized GEMM for GPTQ. // Marlin (Dense) Optimized Quantized GEMM for GPTQ.
ops.def("marlin_gemm", &marlin_gemm); ops.def(
"marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
"Tensor! workspace, int size_m, int size_n, int size_k) -> Tensor");
ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm); ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm);
// Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ. // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm); ops.def(
"gptq_marlin_24_gemm(Tensor a, Tensor b_q_weight, Tensor b_meta, "
"Tensor b_scales, Tensor workspace, "
"__torch__.torch.classes._core_C.ScalarType b_q_type, "
"int size_m, int size_n, int size_k) -> Tensor");
ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm); ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm);
// Machete (Dense) Optimized Mixed Precision GEMM for Hopper. // Machete (Dense) Optimized Mixed Precision GEMM for Hopper.
...@@ -214,35 +247,55 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -214,35 +247,55 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
ops.impl("machete_prepack_B", torch::kCUDA, &machete::prepack_B); ops.impl("machete_prepack_B", torch::kCUDA, &machete::prepack_B);
// gptq_marlin Optimized Quantized GEMM for GPTQ. // gptq_marlin Optimized Quantized GEMM for GPTQ.
ops.def("gptq_marlin_gemm", &gptq_marlin_gemm); ops.def(
"gptq_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
"Tensor b_zeros, Tensor g_idx, Tensor perm, Tensor workspace, "
"__torch__.torch.classes._core_C.ScalarType b_q_type, "
"int size_m, int size_n, int size_k, bool is_k_full, "
"bool has_zp, bool use_fp32_reduce) -> Tensor");
ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm); ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm);
// gptq_marlin repack from GPTQ. // gptq_marlin repack from GPTQ.
ops.def("gptq_marlin_repack", &gptq_marlin_repack); ops.def(
"gptq_marlin_repack(Tensor b_q_weight, Tensor perm, "
"SymInt size_k, SymInt size_n, int num_bits) -> Tensor");
ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack); ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
ops.impl("gptq_marlin_repack", torch::kMeta, &gptq_marlin_repack_meta);
// awq_marlin repack from AWQ. // awq_marlin repack from AWQ.
ops.def("awq_marlin_repack", &awq_marlin_repack); ops.def(
"awq_marlin_repack(Tensor b_q_weight, SymInt size_k, "
"SymInt size_n, int num_bits) -> Tensor");
ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack); ops.impl("awq_marlin_repack", torch::kCUDA, &awq_marlin_repack);
ops.impl("awq_marlin_repack", torch::kMeta, &awq_marlin_repack_meta);
// Dequantization for GGML. // Dequantization for GGML.
ops.def("ggml_dequantize", &ggml_dequantize); ops.def("ggml_dequantize(Tensor W, int type, int m, int n) -> Tensor");
ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize); ops.impl("ggml_dequantize", torch::kCUDA, &ggml_dequantize);
// mmvq kernel for GGML. // mmvq kernel for GGML.
ops.def("ggml_mul_mat_vec_a8", &ggml_mul_mat_vec_a8); ops.def(
"ggml_mul_mat_vec_a8(Tensor W, Tensor X, int type, int row) "
"-> Tensor");
ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8); ops.impl("ggml_mul_mat_vec_a8", torch::kCUDA, &ggml_mul_mat_vec_a8);
// mmq kernel for GGML. // mmq kernel for GGML.
ops.def("ggml_mul_mat_a8", &ggml_mul_mat_a8); ops.def("ggml_mul_mat_a8(Tensor W, Tensor X, int type, int row) -> Tensor");
ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8); ops.impl("ggml_mul_mat_a8", torch::kCUDA, &ggml_mul_mat_a8);
// fp8_marlin Optimized Quantized GEMM for FP8 weight-only. // fp8_marlin Optimized Quantized GEMM for FP8 weight-only.
ops.def("fp8_marlin_gemm", &fp8_marlin_gemm); ops.def(
"fp8_marlin_gemm(Tensor a, Tensor b_q_weight, Tensor b_scales, "
"Tensor! workspace, int num_bits, int size_m, int size_n, "
"int size_k) -> Tensor");
ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm); ops.impl("fp8_marlin_gemm", torch::kCUDA, &fp8_marlin_gemm);
// marlin_qqq_gemm for QQQ. // marlin_qqq_gemm for QQQ.
ops.def("marlin_qqq_gemm", &marlin_qqq_gemm); ops.def(
"marlin_qqq_gemm(Tensor a, Tensor b_q_weight, "
"Tensor s_tok, Tensor s_ch, Tensor s_group, "
"Tensor! workspace, int size_m, int size_n, "
"int size_k) -> Tensor");
ops.impl("marlin_qqq_gemm", torch::kCUDA, &marlin_qqq_gemm); ops.impl("marlin_qqq_gemm", torch::kCUDA, &marlin_qqq_gemm);
// CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
...@@ -264,16 +317,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -264,16 +317,16 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Check if cutlass scaled_mm is supported for CUDA devices of the given // Check if cutlass scaled_mm is supported for CUDA devices of the given
// capability // capability
ops.def("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8); ops.def("cutlass_scaled_mm_supports_fp8(int cuda_device_capability) -> bool");
ops.impl("cutlass_scaled_mm_supports_fp8", torch::kCUDA, ops.impl("cutlass_scaled_mm_supports_fp8", &cutlass_scaled_mm_supports_fp8);
&cutlass_scaled_mm_supports_fp8);
// Mamba selective scan kernel // Mamba selective scan kernel
ops.def( ops.def(
"selective_scan_fwd(Tensor! u, Tensor! delta," "selective_scan_fwd(Tensor! u, Tensor! delta,"
"Tensor! A, Tensor! B, Tensor! C," "Tensor! A, Tensor! B, Tensor! C,"
"Tensor? D_, Tensor? z_, Tensor? delta_bias_," "Tensor? D_, Tensor? z_, Tensor? delta_bias_,"
"bool delta_softplus," "bool delta_softplus,"
"Tensor? index_, Tensor? x) -> Tensor[]"); "Tensor? index_, Tensor(a! -> *)? x) -> Tensor(a)[]");
ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd); ops.impl("selective_scan_fwd", torch::kCUDA, &selective_scan_fwd);
ops.def( ops.def(
...@@ -295,19 +348,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -295,19 +348,18 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
#endif #endif
// Quantized GEMM for GPTQ. // Quantized GEMM for GPTQ.
// ops.def("gptq_gemm", &gptq_gemm); // Note: even though the C++ inferred schema is correct for this op, it seems
// to prevent the meta function registry.
// ops.def(
// "gptq_gemm(Tensor a, Tensor b_q_weight, Tensor b_gptq_qzeros, "
// "Tensor b_gptq_scales, Tensor b_g_idx, bool use_exllama, int bit) "
// "-> Tensor");
// ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm); // ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
// Post processing for GPTQ. // Post processing for GPTQ.
// ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()"); // ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()");
// ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle); // ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);
// Quantized GEMM for SqueezeLLM.
ops.def(
"squeezellm_gemm(Tensor vec, Tensor mat, Tensor! mul, Tensor "
"lookup_table) -> ()");
ops.impl("squeezellm_gemm", torch::kCUDA, &squeezellm_gemm);
// Compute FP8 quantized tensor for given scaling factor. // Compute FP8 quantized tensor for given scaling factor.
// ops.def( // ops.def(
// "static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()"); // "static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()");
...@@ -322,8 +374,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -322,8 +374,8 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// Compute dynamic-per-token FP8 quantized tensor and scaling factor. // Compute dynamic-per-token FP8 quantized tensor and scaling factor.
// ops.def( // ops.def(
// "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! " // "dynamic_per_token_scaled_fp8_quant(Tensor! out, Tensor input, "
// "scale, Tensor? scale_ub) -> " // "Tensor! scale, Tensor? scale_ub) -> "
// "()"); // "()");
// ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA, // ops.impl("dynamic_per_token_scaled_fp8_quant", torch::kCUDA,
// &dynamic_per_token_scaled_fp8_quant); // &dynamic_per_token_scaled_fp8_quant);
...@@ -360,8 +412,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { ...@@ -360,8 +412,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
// Copy the cache blocks from src to dst. // Copy the cache blocks from src to dst.
cache_ops.def( cache_ops.def(
"copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor " "copy_blocks(Tensor(a!)[] key_caches, Tensor[](b!) value_caches, "
"block_mapping) -> ()"); "Tensor block_mapping) -> ()");
cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks); cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
// Reshape the key and value tensors and cache them. // Reshape the key and value tensors and cache them.
...@@ -386,8 +438,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) { ...@@ -386,8 +438,8 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
// Convert the key and value cache to fp8 data type. // Convert the key and value cache to fp8 data type.
cache_ops.def( cache_ops.def(
"convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, str " "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, "
"kv_cache_dtype) -> ()"); "str kv_cache_dtype) -> ()");
cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8); cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
} }
...@@ -395,24 +447,28 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) { ...@@ -395,24 +447,28 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
// Cuda utils // Cuda utils
// Gets the specified device attribute. // Gets the specified device attribute.
cuda_utils.def("get_device_attribute", &get_device_attribute); cuda_utils.def("get_device_attribute(int attribute, int device_id) -> int");
cuda_utils.impl("get_device_attribute", torch::kCUDA, &get_device_attribute); cuda_utils.impl("get_device_attribute", &get_device_attribute);
// Gets the maximum shared memory per block device attribute. // Gets the maximum shared memory per block device attribute.
cuda_utils.def("get_max_shared_memory_per_block_device_attribute", cuda_utils.def(
&get_max_shared_memory_per_block_device_attribute); "get_max_shared_memory_per_block_device_attribute(int device_id) -> int");
cuda_utils.impl("get_max_shared_memory_per_block_device_attribute", cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
torch::kCUDA,
&get_max_shared_memory_per_block_device_attribute); &get_max_shared_memory_per_block_device_attribute);
} }
#ifndef USE_ROCM #ifndef USE_ROCM
TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
// Custom all-reduce kernels // Custom all-reduce kernels
custom_ar.def("init_custom_ar", &init_custom_ar); custom_ar.def(
"init_custom_ar(Tensor meta, Tensor rank_data, "
"str[] handles, int[] offsets, int rank, "
"bool full_nvlink) -> int");
custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar); custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
custom_ar.def("should_custom_ar", &should_custom_ar); custom_ar.def(
"should_custom_ar(Tensor inp, int max_size, int world_size, "
"bool full_nvlink) -> bool");
custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar); custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar);
custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()"); custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
...@@ -424,21 +480,15 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) { ...@@ -424,21 +480,15 @@ TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg); custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);
custom_ar.def("dispose", &dispose); custom_ar.def("dispose", &dispose);
custom_ar.impl("dispose", torch::kCPU, &dispose);
custom_ar.def("meta_size", &meta_size); custom_ar.def("meta_size", &meta_size);
custom_ar.impl("meta_size", torch::kCPU, &meta_size);
custom_ar.def("register_buffer", &register_buffer); custom_ar.def(
"register_buffer(int fa, Tensor t, str[] handles, "
"int[] offsets) -> ()");
custom_ar.impl("register_buffer", torch::kCUDA, &register_buffer); custom_ar.impl("register_buffer", torch::kCUDA, &register_buffer);
custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta); custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
custom_ar.impl("get_graph_buffer_ipc_meta", torch::kCPU,
&get_graph_buffer_ipc_meta);
custom_ar.def("register_graph_buffers", &register_graph_buffers); custom_ar.def("register_graph_buffers", &register_graph_buffers);
custom_ar.impl("register_graph_buffers", torch::kCPU,
&register_graph_buffers);
} }
#endif #endif
......
...@@ -11,6 +11,5 @@ pydantic >= 2.8 ...@@ -11,6 +11,5 @@ pydantic >= 2.8
torch torch
py-cpuinfo py-cpuinfo
transformers transformers
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
mistral_common >= 1.3.4 mistral_common >= 1.3.4
openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args openai # Required by docs/source/serving/openai_compatible_server.md's vllm.entrypoints.openai.cli_args
\ No newline at end of file
...@@ -5,6 +5,7 @@ vLLM Meetups ...@@ -5,6 +5,7 @@ vLLM Meetups
We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below: We host regular meetups in San Francisco Bay Area every 2 months. We will share the project updates from the vLLM team and have guest speakers from the industry to share their experience and insights. Please find the materials of our previous meetups below:
- `The sixth vLLM meetup <https://lu.ma/87q3nvnh>`__, with NVIDIA, September 9th 2024. `[Slides] <https://docs.google.com/presentation/d/1wrLGwytQfaOTd5wCGSPNhoaW3nq0E-9wqyP7ny93xRs/edit?usp=sharing>`__
- `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__ - `The fifth vLLM meetup <https://lu.ma/lp0gyjqr>`__, with AWS, July 24th 2024. `[Slides] <https://docs.google.com/presentation/d/1RgUD8aCfcHocghoP3zmXzck9vX3RCI9yfUAB2Bbcl4Y/edit?usp=sharing>`__
- `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__ - `The fourth vLLM meetup <https://lu.ma/agivllm>`__, with Cloudflare and BentoML, June 11th 2024. `[Slides] <https://docs.google.com/presentation/d/1iJ8o7V2bQEi0BFEljLTwc5G1S10_Rhv3beed5oB0NJ4/edit?usp=sharing>`__
- `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__ - `The third vLLM meetup <https://robloxandvllmmeetup2024.splashthat.com/>`__, with Roblox, April 2nd 2024. `[Slides] <https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing>`__
......
...@@ -99,6 +99,7 @@ autodoc_mock_imports = [ ...@@ -99,6 +99,7 @@ autodoc_mock_imports = [
"aiohttp", "aiohttp",
"compressed_tensors", "compressed_tensors",
"cpuinfo", "cpuinfo",
"cv2",
"torch", "torch",
"transformers", "transformers",
"psutil", "psutil",
......
...@@ -17,14 +17,28 @@ Traces can be visualized using https://ui.perfetto.dev/. ...@@ -17,14 +17,28 @@ Traces can be visualized using https://ui.perfetto.dev/.
.. tip:: .. tip::
Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly. Only send a few requests through vLLM when profiling, as the traces can get quite large. Also, no need to untar the traces, they can be viewed directly.
Example commands: .. tip::
To stop the profiler - it flushes out all the profile trace files to the directory. This takes time, for example for about 100 requests worth of data for a llama 70b, it takes about 10 minutes to flush out on a H100.
Set the env variable VLLM_RPC_GET_DATA_TIMEOUT_MS to a big number before you start the server. Say something like 30 minutes.
``export VLLM_RPC_GET_DATA_TIMEOUT_MS=1800000``
Example commands and usage:
===========================
Offline Inference:
------------------
Refer to `examples/offline_inference_with_profiler.py <https://github.com/vllm-project/vllm/blob/main/examples/offline_inference_with_profiler.py>`_ for an example.
OpenAI Server: OpenAI Server:
--------------
.. code-block:: bash .. code-block:: bash
VLLM_TORCH_PROFILER_DIR=/mnt/traces/ python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B VLLM_TORCH_PROFILER_DIR=./vllm_profile python -m vllm.entrypoints.openai.api_server --model meta-llama/Meta-Llama-3-70B
benchmark_serving.py: benchmark_serving.py:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment