Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
OpenDAS
vllm_cscc
Commits
5eda21e7
Unverified
Commit
5eda21e7
authored
Oct 18, 2024
by
Li, Jiang
Committed by
GitHub
Oct 17, 2024
Browse files
[Hardware][CPU] compressed-tensor INT8 W8A8 AZP support (#9344)
parent
8e1cddcd
Changes
7
Expand all
Show whitespace changes
Inline
Side-by-side
Showing
7 changed files
with
452 additions
and
96 deletions
+452
-96
.buildkite/run-cpu-test.sh
.buildkite/run-cpu-test.sh
+4
-4
Dockerfile.cpu
Dockerfile.cpu
+0
-13
cmake/cpu_extension.cmake
cmake/cpu_extension.cmake
+34
-6
csrc/cpu/cpu_types_x86.hpp
csrc/cpu/cpu_types_x86.hpp
+39
-2
csrc/cpu/quant.cpp
csrc/cpu/quant.cpp
+360
-57
csrc/cpu/torch_bindings.cpp
csrc/cpu/torch_bindings.cpp
+15
-0
docs/source/getting_started/cpu-installation.rst
docs/source/getting_started/cpu-installation.rst
+0
-14
No files found.
.buildkite/run-cpu-test.sh
View file @
5eda21e7
...
...
@@ -32,10 +32,10 @@ docker exec cpu-test bash -c "
--ignore=tests/models/decoder_only/language/test_danube3_4b.py"
# Mamba and Danube3-4B on CPU is not supported
# Run compressed-tensor test
#
docker exec cpu-test bash -c "
#
pytest -s -v \
#
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup \
#
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
docker
exec
cpu-test bash
-c
"
pytest -s -v
\
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_static_setup
\
tests/quantization/test_compressed_tensors.py::test_compressed_tensors_w8a8_dynamic_per_token"
# Run AWQ test
docker
exec
cpu-test bash
-c
"
...
...
Dockerfile.cpu
View file @
5eda21e7
...
...
@@ -33,19 +33,6 @@ RUN --mount=type=cache,target=/root/.cache/pip \
pip install --upgrade pip && \
pip install -r requirements-build.txt
# install oneDNN
RUN git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
RUN --mount=type=cache,target=/root/.cache/ccache \
cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
-DONEDNN_BUILD_DOC=OFF \
-DONEDNN_BUILD_EXAMPLES=OFF \
-DONEDNN_BUILD_TESTS=OFF \
-DONEDNN_BUILD_GRAPH=OFF \
-DONEDNN_ENABLE_WORKLOAD=INFERENCE \
-DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
cmake --build ./oneDNN/build --target install --config Release
FROM cpu-test-1 AS build
WORKDIR /workspace/vllm
...
...
cmake/cpu_extension.cmake
View file @
5eda21e7
include
(
FetchContent
)
set
(
CMAKE_CXX_STANDARD_REQUIRED ON
)
set
(
CMAKE_CXX_EXTENSIONS ON
)
set
(
CMAKE_EXPORT_COMPILE_COMMANDS ON
)
set
(
CMAKE_CXX_STANDARD 17
)
#
# Define environment variables for special configurations
...
...
@@ -82,15 +85,40 @@ else()
message
(
FATAL_ERROR
"vLLM CPU backend requires AVX512 or AVX2 or Power9+ ISA support."
)
endif
()
message
(
STATUS
"CPU extension compile flags:
${
CXX_COMPILE_FLAGS
}
"
)
list
(
APPEND LIBS numa
)
#
# Build oneDNN for W8A8 GEMM kernels (only for x86-AVX512 platforms)
#
if
(
AVX512_FOUND AND NOT AVX512_DISABLED
)
FetchContent_Declare
(
oneDNN
GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
GIT_TAG v3.5.3
GIT_PROGRESS TRUE
GIT_SHALLOW TRUE
)
set
(
ONEDNN_LIBRARY_TYPE
"STATIC"
)
set
(
ONEDNN_BUILD_DOC
"OFF"
)
set
(
ONEDNN_BUILD_EXAMPLES
"OFF"
)
set
(
ONEDNN_BUILD_TESTS
"OFF"
)
set
(
ONEDNN_ENABLE_WORKLOAD
"INFERENCE"
)
set
(
ONEDNN_ENABLE_PRIMITIVE
"MATMUL;REORDER"
)
set
(
ONEDNN_BUILD_GRAPH
"OFF"
)
set
(
ONEDNN_ENABLE_JIT_PROFILING
"OFF"
)
set
(
ONEDNN_ENABLE_ITT_TASKS
"OFF"
)
set
(
ONEDNN_ENABLE_MAX_CPU_ISA
"OFF"
)
set
(
ONEDNN_ENABLE_CPU_ISA_HINTS
"OFF"
)
set
(
CMAKE_POLICY_DEFAULT_CMP0077 NEW
)
FetchContent_MakeAvailable
(
oneDNN
)
# Appending the dnnl library for the AVX2 and AVX512, as it is not utilized by Power architecture.
if
(
AVX2_FOUND OR AVX512_FOUND
)
list
(
APPEND LIBS dnnl
)
endif
()
message
(
STATUS
"CPU extension compile flags:
${
CXX_COMPILE_FLAGS
}
"
)
list
(
APPEND LIBS numa
)
#
# _C extension
#
...
...
csrc/cpu/cpu_types_x86.hpp
View file @
5eda21e7
...
...
@@ -265,6 +265,30 @@ struct FP32Vec8 : public Vec<FP32Vec8> {
void
save
(
float
*
ptr
)
const
{
_mm256_storeu_ps
(
ptr
,
reg
);
}
};
#ifdef __AVX512F__
// Vector of 16 signed 32-bit integers held in a single __m512i register.
struct INT32Vec16 : public Vec<INT32Vec16> {
  constexpr static int VEC_ELEM_NUM = 16;

  // Overlay giving array-style access to the register's 16 int32 lanes.
  union AliasReg {
    __m512i reg;
    int32_t values[VEC_ELEM_NUM];
  };

  __m512i reg;

  // Loads VEC_ELEM_NUM int32 values from data_ptr (unaligned load intrinsic).
  explicit INT32Vec16(const void* data_ptr)
      : reg(_mm512_loadu_epi32(data_ptr)) {}

  // Stores all VEC_ELEM_NUM lanes to ptr (unaligned store intrinsic).
  void save(int32_t* ptr) const { _mm512_storeu_epi32(ptr, reg); }

  // Stores only the first elem_num lanes to ptr through a write mask.
  //
  // The mask is built with explicit clamping: the previous form,
  // 0xFFFFFFFF >> (32 - elem_num), shifted a 32-bit value by 32 (or more)
  // when elem_num <= 0, which is undefined behaviour. Now elem_num <= 0
  // selects no lanes and elem_num >= VEC_ELEM_NUM selects all of them; for
  // 1..VEC_ELEM_NUM the mask is identical to the original one.
  void save(int32_t* ptr, const int elem_num) const {
    uint32_t lane_bits = 0;
    if (elem_num >= VEC_ELEM_NUM) {
      lane_bits = 0xFFFFu;
    } else if (elem_num > 0) {
      lane_bits = 0xFFFFu >> (VEC_ELEM_NUM - elem_num);
    }
    const __mmask16 mask = _cvtu32_mask16(lane_bits);
    _mm512_mask_storeu_epi32(ptr, mask, reg);
  }
};
#endif
#ifdef __AVX512F__
struct
FP32Vec16
:
public
Vec
<
FP32Vec16
>
{
constexpr
static
int
VEC_ELEM_NUM
=
16
;
...
...
@@ -283,8 +307,6 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
// Wraps an already-populated __m512 register value.
explicit FP32Vec16(__m512 data) : reg(data) {}
// Copies the register held by another FP32Vec16.
explicit FP32Vec16(const FP32Vec16& data) : reg(data.reg) {}
explicit
FP32Vec16
(
const
FP32Vec4
&
data
)
:
reg
((
__m512
)
_mm512_inserti32x4
(
_mm512_inserti32x4
(
...
...
@@ -303,6 +325,9 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
// Builds from a BF16Vec8: first constructs an FP32Vec8 from v, then
// delegates to the FP32Vec16 constructor that accepts that value.
explicit FP32Vec16(const BF16Vec8& v) : FP32Vec16(FP32Vec8(v)) {}
// Converts the int32 lanes of v to float via _mm512_cvt_roundepi32_ps,
// requesting round-to-nearest with floating-point exceptions suppressed.
explicit FP32Vec16(const INT32Vec16& v)
    : reg(_mm512_cvt_roundepi32_ps(
          v.reg, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)) {}
// Lane-wise product of this vector and b.
FP32Vec16 operator*(const FP32Vec16& b) const {
  const __m512 product = _mm512_mul_ps(reg, b.reg);
  return FP32Vec16(product);
}
...
...
@@ -333,6 +358,16 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
return
FP32Vec16
(
_mm512_mask_max_ps
(
reg
,
mask
,
reg
,
b
.
reg
));
}
// Lane-wise minimum of this vector and b across all lanes.
FP32Vec16 min(const FP32Vec16& b) const {
  const __m512 smaller = _mm512_min_ps(reg, b.reg);
  return FP32Vec16(smaller);
}
// Lane-wise minimum of this vector and b restricted to the first elem_num
// lanes; the remaining lanes come from reg, which is passed as the src
// operand of the masked intrinsic.
//
// The mask is built with explicit clamping: the previous form,
// 0xFFFFFFFF >> (32 - elem_num), shifted a 32-bit value by 32 (or more)
// when elem_num <= 0, which is undefined behaviour. Now elem_num <= 0
// selects no lanes and elem_num >= VEC_ELEM_NUM selects all of them; for
// 1..VEC_ELEM_NUM the mask is identical to the original one.
FP32Vec16 min(const FP32Vec16& b, const int elem_num) const {
  uint32_t lane_bits = 0;
  if (elem_num >= VEC_ELEM_NUM) {
    lane_bits = 0xFFFFu;
  } else if (elem_num > 0) {
    lane_bits = 0xFFFFu >> (VEC_ELEM_NUM - elem_num);
  }
  const __mmask16 mask = _cvtu32_mask16(lane_bits);
  return FP32Vec16(_mm512_mask_min_ps(reg, mask, reg, b.reg));
}
// Lane-wise absolute value of this vector.
FP32Vec16 abs() const {
  const __m512 magnitude = _mm512_abs_ps(reg);
  return FP32Vec16(magnitude);
}
...
...
@@ -341,6 +376,8 @@ struct FP32Vec16 : public Vec<FP32Vec16> {
// Horizontal reduction: returns the largest value among the lanes of reg.
float reduce_max() const {
  const float largest = _mm512_reduce_max_ps(reg);
  return largest;
}
// Horizontal reduction: returns the smallest value among the lanes of reg.
float reduce_min() const {
  const float smallest = _mm512_reduce_min_ps(reg);
  return smallest;
}
template
<
int
group_size
>
float
reduce_sub_sum
(
int
idx
)
{
static_assert
(
VEC_ELEM_NUM
%
group_size
==
0
);
constexpr
uint32_t
base_mask
=
(
0xFFFF
>>
(
16
-
group_size
));
...
...
csrc/cpu/quant.cpp
View file @
5eda21e7
This diff is collapsed.
Click to expand it.
csrc/cpu/torch_bindings.cpp
View file @
5eda21e7
...
...
@@ -11,6 +11,13 @@ void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a,
const
torch
::
Tensor
&
b_scales
,
const
c10
::
optional
<
torch
::
Tensor
>&
bias
);
// Forward declaration of the w8a8 GEMM entry point with asymmetric
// zero-point (azp) inputs. It is registered further down in this file as
// the CPU implementation of the "cutlass_scaled_mm_azp" op, whose schema
// marks `c` as the mutable output and `azp` / `bias` as optional tensors.
// The definition is not in this file.
void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a,
                        const torch::Tensor& b, const torch::Tensor& a_scales,
                        const torch::Tensor& b_scales,
                        const torch::Tensor& azp_adj,
                        const c10::optional<torch::Tensor>& azp,
                        const c10::optional<torch::Tensor>& bias);
TORCH_LIBRARY_EXPAND
(
TORCH_EXTENSION_NAME
,
ops
)
{
// vLLM custom ops
...
...
@@ -111,6 +118,14 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
" Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor? bias) -> ()"
);
ops
.
impl
(
"cutlass_scaled_mm"
,
torch
::
kCPU
,
&
int8_scaled_mm
);
// w8a8 GEMM, supporting asymmetric per-tensor or per-row/column
// quantization.
ops
.
def
(
"cutlass_scaled_mm_azp(Tensor! out, Tensor a,"
" Tensor b, Tensor a_scales,"
" Tensor b_scales, Tensor azp_adj,"
" Tensor? azp, Tensor? bias) -> ()"
);
ops
.
impl
(
"cutlass_scaled_mm_azp"
,
torch
::
kCPU
,
&
int8_scaled_mm_azp
);
#endif
}
...
...
docs/source/getting_started/cpu-installation.rst
View file @
5eda21e7
...
...
@@ -59,20 +59,6 @@ Build from source
$ pip install cmake>=3.26 wheel packaging ninja "setuptools-scm>=8" numpy
$ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
- Third, build and install oneDNN library from source:
.. code-block:: console
$ git clone -b rls-v3.5 https://github.com/oneapi-src/oneDNN.git
$ cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
-DONEDNN_BUILD_DOC=OFF \
-DONEDNN_BUILD_EXAMPLES=OFF \
-DONEDNN_BUILD_TESTS=OFF \
-DONEDNN_BUILD_GRAPH=OFF \
-DONEDNN_ENABLE_WORKLOAD=INFERENCE \
-DONEDNN_ENABLE_PRIMITIVE=MATMUL
$ cmake --build ./oneDNN/build --target install --config Release
- Finally, build and install vLLM CPU backend:
.. code-block:: console
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment