Unverified Commit 5eda21e7 authored by Li, Jiang's avatar Li, Jiang Committed by GitHub
Browse files

[Hardware][CPU] compressed-tensor INT8 W8A8 AZP support (#9344)

parent 8e1cddcd
...@@ -32,10 +32,10 @@ docker exec cpu-test bash -c " ...@@ -32,10 +32,10 @@ docker exec cpu-test bash -c "
--ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported --ignore=tests/models/decoder_only/language/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported
# Run compressed-tensor test # Run compressed-tensor test
# docker exec cpu-test bash -c " docker exec cpu-test bash -c "
# pytest -s -v \ pytest -s -v \
# tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \ tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
# tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynanmic_per_token" tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test # Run AWQ test
docker exec cpu-test bash -c " docker exec cpu-test bash -c "
......
...@@ -33,19 +33,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \ ...@@ -33,19 +33,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install --upgrade pip && \ pip install --upgrade pip && \
pip install -r requirements-build.txt pip install -r requirements-build.txt
# install oneDNN
RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
RUN --mount=type=cache,target=/root/.cache/ccache \
cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
-DONEDNN_BUILD_DOC=OFF \
-DONEDNN_BUILD_EXAMPLES=OFF \
-DONEDNN_BUILD_TESTS=OFF \
-DONEDNN_BUILD_GRAPH=OFF \
-DONEDNN_ENABLE_WORKLOAD=INFERENCE \
-DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
cmake --build ./oneDNN/build --target install --config Release
FROM cpu-test-1 AS build FROM cpu-test-1 AS build
WORKDIR /workspace/vllm WORKDIR /workspace/vllm
......
include(FetchContent)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
set(CMAKE_CXX_STANDARD 17)
# #
# Define environment variables for special configurations # Define environment variables for special configurations
...@@ -82,15 +85,40 @@ else() ...@@ -82,15 +85,40 @@ else()
message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.") message(FATAL_ERROR "vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support.")
endif() endif()
#
# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 platforms)
#
if (AVX512_FOUND AND NOT AVX512_DISABLED)
FetchContent_Declare(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.5.3
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
set(ONEDNN_LIBRARY_TYPE "STATIC")
set(ONEDNN_BUILD_DOC "OFF")
set(ONEDNN_BUILD_EXAMPLES "OFF")
set(ONEDNN_BUILD_TESTS "OFF")
set(ONEDNN_ENABLE_WORKLOAD "INFERENCE")
set(ONEDNN_ENABLE_PRIMITIVE "MATMUL;REORDER")
set(ONEDNN_BUILD_GRAPH "OFF")
set(ONEDNN_ENABLE_JIT_PROFILING "OFF")
set(ONEDNN_ENABLE_ITT_TASKS "OFF")
set(ONEDNN_ENABLE_MAX_CPU_ISA "OFF")
set(ONEDNN_ENABLE_CPU_ISA_HINTS "OFF")
set(CMAKE_POLICY_DEFAULT_CMP0077 NEW)
FetchContent_MakeAvailable(oneDNN)
list(APPEND LIBS dnnl)
endif()
message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}") message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
list(APPEND LIBS numa) list(APPEND LIBS numa)
# Appending the dnnl library for the AVX2 and AVX512, as it is not utilized by Power architecture.
if (AVX2_FOUND OR AVX512_FOUND)
list(APPEND LIBS dnnl)
endif()
# #
# _C extension # _C extension
# #
......
...@@ -265,6 +265,30 @@ struct FP32Vec8 : public Vec<FP32Vec8> { ...@@ -265,6 +265,30 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); } void save(float *ptr) const { _mm256_storeu_ps(ptr, reg); }
}; };
#ifdef __AVX512F__
struct INT32Vec16: public Vec<INT32Vec16> {
constexpr static int VEC_ELEM_NUM = 16;
union AliasReg {
__m512i reg;
int32_t values[VEC_ELEM_NUM];
};
__m512i reg;
explicit INT32Vec16(const void* data_ptr) : reg(_mm512_loadu_epi32(data_ptr)) {}
void save(int32_t* ptr) const {
_mm512_storeu_epi32(ptr, reg);
}
void save(int32_t* ptr, const int elem_num) const {
constexpr uint32_t M = 0xFFFFFFFF;
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
_mm512_mask_storeu_epi32(ptr, mask, reg);
}
};
#endif
#ifdef __AVX512F__ #ifdef __AVX512F__
struct FP32Vec16 : public Vec<FP32Vec16> { struct FP32Vec16 : public Vec<FP32Vec16> {
constexpr static int VEC_ELEM_NUM = 16; constexpr static int VEC_ELEM_NUM = 16;
...@@ -283,8 +307,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> { ...@@ -283,8 +307,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
explicit FP32Vec16(__m512 data) : reg(data) {} explicit FP32Vec16(__m512 data) : reg(data) {}
explicit FP32Vec16(const FP32Vec16 &data) : reg(data.reg) {}
explicit FP32Vec16(const FP32Vec4 &data) explicit FP32Vec16(const FP32Vec4 &data)
: reg((__m512)_mm512_inserti32x4( : reg((__m512)_mm512_inserti32x4(
_mm512_inserti32x4( _mm512_inserti32x4(
...@@ -303,6 +325,9 @@ struct FP32Vec16 : public Vec<FP32Vec16> { ...@@ -303,6 +325,9 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {} explicit FP32Vec16(const BF16Vec8 &v) : FP32Vec16(FP32Vec8(v)) {}
explicit FP32Vec16(const INT32Vec16 &v)
: reg(_mm512_cvt_roundepi32_ps(v.reg, _MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC)) {}
FP32Vec16 operator*(const FP32Vec16 &b) const { FP32Vec16 operator*(const FP32Vec16 &b) const {
return FP32Vec16(_mm512_mul_ps(reg, b.reg)); return FP32Vec16(_mm512_mul_ps(reg, b.reg));
} }
...@@ -333,6 +358,16 @@ struct FP32Vec16 : public Vec<FP32Vec16> { ...@@ -333,6 +358,16 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg)); return FP32Vec16(_mm512_mask_max_ps(reg, mask, reg, b.reg));
} }
FP32Vec16 min(const FP32Vec16& b) const {
return FP32Vec16(_mm512_min_ps(reg, b.reg));
}
FP32Vec16 min(const FP32Vec16& b, const int elem_num) const {
constexpr uint32_t M = 0xFFFFFFFF;
__mmask16 mask = _cvtu32_mask16(M >> (32 - elem_num));
return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg));
}
FP32Vec16 abs() const { FP32Vec16 abs() const {
return FP32Vec16(_mm512_abs_ps(reg)); return FP32Vec16(_mm512_abs_ps(reg));
} }
...@@ -341,6 +376,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> { ...@@ -341,6 +376,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
float reduce_max() const { return _mm512_reduce_max_ps(reg); } float reduce_max() const { return _mm512_reduce_max_ps(reg); }
float reduce_min() const { return _mm512_reduce_min_ps(reg); }
template <int group_size> float reduce_sub_sum(int idx) { template <int group_size> float reduce_sub_sum(int idx) {
static_assert(VEC_ELEM_NUM % group_size == 0); static_assert(VEC_ELEM_NUM % group_size == 0);
constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size)); constexpr uint32_t base_mask = (0xFFFF >> (16 - group_size));
......
This diff is collapsed.
...@@ -11,6 +11,13 @@ void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a, ...@@ -11,6 +11,13 @@ void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
const torch::Tensor& b_scales, const torch::Tensor& b_scales,
const c10::optional<torch::Tensor>& bias); const c10::optional<torch::Tensor>& bias);
void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
const torch::Tensor& b, const torch::Tensor& a_scales,
const torch::Tensor& b_scales,
const torch::Tensor& azp_adj,
const c10::optional<torch::Tensor>& azp,
const c10::optional<torch::Tensor>& bias);
TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
// vLLM custom ops // vLLM custom ops
...@@ -111,6 +118,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { ...@@ -111,6 +118,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor b, Tensor a_scales," " Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor? bias) -> ()"); " Tensor b_scales, Tensor? bias) -> ()");
ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm); ops.impl("cutlass_scaled_mm", torch::kCPU, &int8_scaled_mm);
// w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
// quantization.
ops.def(
"cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor azp_adj,"
" Tensor? azp, Tensor? bias) -> ()");
ops.impl("cutlass_scaled_mm_azp", torch::kCPU, &int8_scaled_mm_azp);
#endif #endif
} }
......
...@@ -59,20 +59,6 @@ Build from source ...@@ -59,20 +59,6 @@ Build from source
$ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy $ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
- Third, build and install oneDNN library from source:
.. code-block:: console
$ git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
$ cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
-DONEDNN_BUILD_DOC=OFF \
-DONEDNN_BUILD_EXAMPLES=OFF \
-DONEDNN_BUILD_TESTS=OFF \
-DONEDNN_BUILD_GRAPH=OFF \
-DONEDNN_ENABLE_WORKLOAD=INFERENCE \
-DONEDNN_ENABLE_PRIMITIVE=MATMUL
$ cmake --build ./oneDNN/build --target install --config Release
- Finally, build and install vLLM CPU backend: - Finally, build and install vLLM CPU backend:
.. code-block:: console .. code-block:: console
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment