Unverified commit eb89439d authored by qinyiqun, committed by GitHub

Support Quantization (#996)



demo131 - multiple fixes covering quantization, QY support, and related cleanup

* issue/843: success per_channel_quant_int8

* issue/843: success qy quant

* issue/843: modified quant

* Add w8a8int8 performance tests

* add infinicore op linear_w8a8i8

* w8a8 linear module functional nn

* issue/843: QY-GPU Support Int8 scale_mm (#68)

* issue/843: success qy scaled_mm

* issue/843: modified kernel.cuh as per_channel_dequant_int8.cuh

* fix parallel slicing in w8

* w8: support multiple batch size

* temp: rework QuantConfig handling

* fix format and delete redundancy code

* fix format

* fix format

* fix format

* Refactor: add new API alongside legacy interfaces with deprecation warnings

* Add w4 InfiniCore support and move the quantization config into InfiniCore

* Add graph support for quantization operators

* solve cub version problem and fix code structure

* fix format

* demo131 - remove commented lines
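
In rough terms, the w8a8int8 path added here quantizes activations per token and weights per channel to int8, runs the matmul with int32 accumulation, and dequantizes with both scale vectors. A minimal PyTorch sketch of that flow, with illustrative names rather than the library's API:

    import torch

    def w8a8int8_linear_reference(x, w_q, w_scale, bias=None):
        # Per-token symmetric quantization of activations: scale = absmax / 127.
        absmax = x.float().abs().max(dim=-1, keepdim=True).values.clamp_min(1e-10)
        x_scale = absmax / 127
        x_q = torch.round(x.float() / x_scale).to(torch.int8)
        # int8 x int8 matmul accumulated in int32, as the GPU kernels do.
        acc = x_q.to(torch.int32) @ w_q.to(torch.int32).t()
        # Dequantize: per-token scales on rows, per-channel scales on columns.
        y = acc.float() * x_scale * w_scale.view(1, -1)
        return y + bias.float() if bias is not None else y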

---------
Co-authored-by: xgqdut2016 <kenan_gewei@163.com>
Co-authored-by: xgqdut2016 <140036308+xgqdut2016@users.noreply.github.com>
Co-authored-by: wooway777 <wooway777@gmail.com>
parent abab5652
@@ -140,20 +140,9 @@ void cutlass_int8_scaled_mm(
    typename Gemm::Arguments args{
        {m, n, k}, {a_ptr, lda}, {b_ptr, ldb}, {b_s_ptr, 0}, {a_s_ptr, 0}, {bias_ptr, ldc}, {o_ptr, ldd}, visitor_args};
    /* First check whether a workspace is needed */
// auto workspace = torch::empty(
// gemm_op.get_workspace_size(args), torch::TensorOptions().dtype(torch::kUInt8).device(mat_a.device()));
// auto can_implement = gemm_op.can_implement(args);
    check_cutlass_status(gemm_op.can_implement(args));
// TORCH_CHECK(
// can_implement == cutlass::Status::kSuccess,
// "gemm cannot implement, error: ",
// cutlassGetStatusString(can_implement));
    auto status = gemm_op(args, nullptr, (cudaStream_t)stream);
    check_cutlass_status(status);
// TORCH_CHECK(status == cutlass::Status::kSuccess, "gemm executioin failed, error: ", cutlassGetStatusString(status));
}
template <typename ElementOutput, typename ArchTag, typename InstructionShape>
......
#ifdef ENABLE_CUTLASS_API
#include "../../../devices/nvidia/nvidia_handle.cuh" #include "../../../devices/nvidia/nvidia_handle.cuh"
#include "../../../devices/nvidia/nvidia_kernel_common.cuh" #include "../../../devices/nvidia/nvidia_kernel_common.cuh"
#ifdef ENABLE_CUTLASS_API
#include "int8_gemm_kernel.cuh" #include "int8_gemm_kernel.cuh"
#endif
#include "../cuda/per_channel_dequant_int8.cuh"
#include "int8_gemm_nvidia.cuh" #include "int8_gemm_nvidia.cuh"
template <typename Tdata>
INFINIOP_CUDA_KERNEL postSym(
Tdata *y, int32_t *y_packed, const Tdata *bias, const int8_t *x_packed, const float *x_scale, const int8_t *w_packed, const float *w_scale, int M, int K, int N) {
postSymKernel<Tdata>(y, y_packed, bias, x_packed, x_scale, w_packed, w_scale, M, K, N);
}
template <typename Tdata>
INFINIOP_CUDA_KERNEL postSym(
Tdata *y, int32_t *y_packed, const int8_t *x_packed, const float *x_scale, const int8_t *w_packed, const float *w_scale, int M, int K, int N) {
postSymKernel<Tdata>(y, y_packed, x_packed, x_scale, w_packed, w_scale, M, K, N);
}
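// Two overloads of the postSym epilogue launcher: with and without a bias term.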
namespace op::i8gemm::nvidia {
struct Descriptor::Opaque {
@@ -14,6 +28,7 @@ Descriptor::~Descriptor() {
    delete _opaque;
}
#ifdef ENABLE_NVIDIA_API
inline int getSMVersion() {
    int device{-1};
    CHECK_CUDA(cudaGetDevice(&device));
@@ -23,6 +38,7 @@ inline int getSMVersion() {
    CHECK_CUDA(cudaDeviceGetAttribute(&sm_minor, cudaDevAttrComputeCapabilityMinor, device));
    return sm_major * 10 + sm_minor;
}
#endif
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
@@ -40,14 +56,63 @@ infiniStatus_t Descriptor::create(
    auto result = I8GemmInfo::create(out_desc, a_desc, b_desc, MatrixLayout::COL_MAJOR);
    CHECK_RESULT(result);
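    // Workspace sized to hold the intermediate int32 accumulator (y_packed)
    // of shape [M, N] produced by the int8 GEMM before dequantization.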
size_t workspace_size = out_desc->dim(0) * out_desc->dim(1) * sizeof(int32_t);
    *desc_ptr = new Descriptor(
        new Opaque{handle->internal()},
        result.take(), 0, dtype,
        result.take(), workspace_size, dtype,
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
template <unsigned int BLOCK_SIZE, typename Tdata>
infiniStatus_t Descriptor::launchKernel(const I8GemmInfo &info, Tdata *y, const Tdata *bias, const int8_t *x_packed, const float *x_scale, const int8_t *w_packed, const float *w_scale, void *stream_, void *workspace) const {
cudaStream_t stream = (cudaStream_t)stream_;
int M = (int)info.m;
int K = (int)info.k;
int N = (int)info.n;
char *workspace_ptr = reinterpret_cast<char *>(workspace);
int32_t *y_packed = reinterpret_cast<int32_t *>(workspace_ptr);
const int32_t alpha_I = 1;
const int32_t beta_I = 0;
int lda = K; // w_packed is column-major [K, N]
int ldb = K; // x_packed is row-major [M, K]
int ldc = N; // y_packed is row-major [M, N]
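    // cuBLAS is column-major, so a row-major [M, N] matrix is read as a
    // column-major [N, M] matrix. Computing y^T[N, M] = w_packed^T[N, K] *
    // x_packed^T[K, M] in column-major terms therefore leaves y_packed laid
    // out row-major as [M, N], with no explicit transpose needed.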
CHECK_STATUS(this->_opaque->internal->useCublas(
stream,
[&](cublasHandle_t handle) {
CHECK_CUBLAS(cublasGemmEx(
handle,
CUBLAS_OP_T, // A = w_packed^T : [N, K]
CUBLAS_OP_N, // B = x_packed^T viewed column-major : [K, M]
N, // m
M, // n
K, // k
&alpha_I,
w_packed, CUDA_R_8I, lda,
x_packed, CUDA_R_8I, ldb,
&beta_I,
y_packed, CUDA_R_32I, ldc,
CUBLAS_COMPUTE_32I,
CUBLAS_GEMM_DEFAULT));
return INFINI_STATUS_SUCCESS;
}));
constexpr unsigned int BLOCK_SIZE_x = 32;
constexpr unsigned int BLOCK_SIZE_y = 32;
int num_block_x = (N + BLOCK_SIZE_x - 1) / BLOCK_SIZE_x;
int num_block_y = (M + BLOCK_SIZE_y - 1) / BLOCK_SIZE_y;
dim3 block_dim(BLOCK_SIZE_x, BLOCK_SIZE_y, 1);
dim3 grid_dim(num_block_x, num_block_y, 1);
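    // Dequantization epilogue: one thread per output element, with the grid
    // tiling the [M, N] output in 32x32 blocks.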
if (bias == nullptr) {
postSym<Tdata><<<grid_dim, block_dim, 0, stream>>>(y, y_packed, x_packed, x_scale, w_packed, w_scale, M, K, N);
} else {
postSym<Tdata><<<grid_dim, block_dim, 0, stream>>>(y, y_packed, bias, x_packed, x_scale, w_packed, w_scale, M, K, N);
}
return INFINI_STATUS_SUCCESS;
}
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
@@ -58,6 +123,7 @@ infiniStatus_t Descriptor::calculate(
    const void *b,
    const void *b_scale,
    void *stream) const {
#if defined(ENABLE_NVIDIA_API) && defined(ENABLE_CUTLASS_API)
    auto sm_version = getSMVersion();
    if (sm_version >= 75 && sm_version < 80) {
        CHECK_DTYPE(this->_out_dtype, INFINI_DTYPE_F16);
@@ -111,7 +177,30 @@ infiniStatus_t Descriptor::calculate(
    } else {
        return INFINI_STATUS_NOT_IMPLEMENTED;
    }
#elif defined ENABLE_QY_API
#define CALCULATE_LINEAR(BLOCK_SIZE, TDATA) \
launchKernel<BLOCK_SIZE, TDATA>(_info, (TDATA *)out, (const TDATA *)bias, (const int8_t *)a, (const float *)a_scale, (const int8_t *)b, (const float *)b_scale, stream, workspace)
#define CALCULATE_LINEAR_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (this->_out_dtype == INFINI_DTYPE_F16) \
return CALCULATE_LINEAR(BLOCK_SIZE, half); \
else if (this->_out_dtype == INFINI_DTYPE_F32) \
return CALCULATE_LINEAR(BLOCK_SIZE, float); \
else if (this->_out_dtype == INFINI_DTYPE_BF16) \
return CALCULATE_LINEAR(BLOCK_SIZE, __nv_bfloat16); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_1024) {
CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_1024)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_512) {
CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_512)
} else if (_opaque->internal->maxThreadsPerBlock() == CUDA_BLOCK_SIZE_4096) {
CALCULATE_LINEAR_WITH_BLOCK_SIZE(CUDA_BLOCK_SIZE_4096)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
#endif
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::i8gemm::nvidia
#endif
\ No newline at end of file
@@ -2,7 +2,7 @@
#include "../../handle.h"
#include "infiniop/ops/int8_gemm.h"
#if defined(ENABLE_NVIDIA_API) && defined(ENABLE_CUTLASS_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#include "nvidia/int8_gemm_nvidia.cuh"
#endif
@@ -26,8 +26,11 @@ __C infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle,
        b_desc, \
        b_scale_desc);
    switch (handle->device) {
#if defined(ENABLE_NVIDIA_API) && defined(ENABLE_CUTLASS_API)
#if defined(ENABLE_NVIDIA_API)
        CREATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#if defined(ENABLE_QY_API)
CREATE(INFINI_DEVICE_QY, nvidia)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -41,8 +44,11 @@ __C infiniStatus_t infiniopGetI8GemmWorkspaceSize(infiniopI8GemmDescriptor_t desc,
    case CASE: \
        *size = reinterpret_cast<op::i8gemm::NAMESPACE::Descriptor *>(desc)->minWorkspaceSize(); \
        return INFINI_STATUS_SUCCESS;
#if defined(ENABLE_NVIDIA_API) && defined(ENABLE_CUTLASS_API)
#if defined(ENABLE_NVIDIA_API)
        GET(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#if defined(ENABLE_QY_API)
GET(INFINI_DEVICE_QY, nvidia)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -65,8 +71,11 @@ __C infiniStatus_t infiniopI8Gemm(infiniopI8GemmDescriptor_t desc,
        return reinterpret_cast<op::i8gemm::NAMESPACE::Descriptor *>(desc)->calculate( \
            workspace, workspace_size, out, bias, a, a_scale, b, b_scale, stream);
    switch (desc->device_type) {
#if defined(ENABLE_NVIDIA_API) && defined(ENABLE_CUTLASS_API)
#if defined(ENABLE_NVIDIA_API)
        CACULATE(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#if defined(ENABLE_QY_API)
CACULATE(INFINI_DEVICE_QY, nvidia)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
@@ -80,8 +89,11 @@ __C infiniStatus_t infiniopDestroyI8GemmDescriptor(infiniopI8GemmDescriptor_t desc)
        delete reinterpret_cast<op::i8gemm::NAMESPACE::Descriptor *>(desc); \
        return INFINI_STATUS_SUCCESS;
    switch (desc->device_type) {
#if defined(ENABLE_NVIDIA_API) && defined(ENABLE_CUTLASS_API)
#if defined(ENABLE_NVIDIA_API)
        DESTROY(INFINI_DEVICE_NVIDIA, nvidia)
#endif
#if defined(ENABLE_QY_API)
DESTROY(INFINI_DEVICE_QY, nvidia)
#endif
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
@@ -725,6 +725,41 @@ def dequantize_(lib):
]
@OpRegister.operator
def per_channel_quant_int8_(lib):
lib.infiniopCreatePerChannelQuantI8Descriptor.restype = c_int32
lib.infiniopCreatePerChannelQuantI8Descriptor.argtypes = [
infiniopHandle_t,
POINTER(infiniopOperatorDescriptor_t),
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
infiniopTensorDescriptor_t,
]
lib.infiniopGetPerChannelQuantI8WorkspaceSize.restype = c_int32
lib.infiniopGetPerChannelQuantI8WorkspaceSize.argtypes = [
infiniopOperatorDescriptor_t,
POINTER(c_size_t),
]
lib.infiniopPerChannelQuantI8.restype = c_int32
lib.infiniopPerChannelQuantI8.argtypes = [
infiniopOperatorDescriptor_t,
c_void_p,
c_size_t,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
c_void_p,
]
lib.infiniopDestroyPerChannelQuantI8Descriptor.restype = c_int32
lib.infiniopDestroyPerChannelQuantI8Descriptor.argtypes = [
infiniopOperatorDescriptor_t,
]
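    # Call order for infiniopPerChannelQuantI8, as exercised by the tests:
    #   (desc, workspace, workspace_size, x_packed, x_scale, x_zero_or_None, x, stream)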
@OpRegister.operator
def softplus_(lib):
    lib.infiniopCreateSoftplusDescriptor.restype = c_int32
......
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
profile_operation,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES = [
# x_shape, symmetric
((8, 8), True),
((128, 512), True),
((128, 128), True),
((256, 1024), False),
((256, 2048), True),
((1024, 2048), False),
]
# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16, InfiniDtype.F32]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 1e-3, "rtol": 5e-2},
InfiniDtype.BF16: {"atol": 1e-3, "rtol": 5e-2},
InfiniDtype.F32: {"atol": 3e-5, "rtol": 5e-3},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def per_token_quant_int8_torch(x, symmetric):
if symmetric:
x = x.float()
absmax = x.abs().max(dim=-1).values
absmax = absmax.clamp_min(1e-10).unsqueeze(-1)
scale_x = absmax / 127
x_q = x.mul(127 / absmax)
x_q = torch.round(x_q).to(torch.int8)
return x_q, scale_x, None
else:
w = x.float()
w_min = w.min(dim=-1, keepdim=True)[0]
w_max = w.max(dim=-1, keepdim=True)[0]
w_scale = (w_max - w_min) / 255.0
w_scale = torch.clamp(w_scale, min=1e-8)
w_zero = -w_min / w_scale - 128.0
w_q = torch.round(w / w_scale + w_zero)
w_q = torch.clamp(w_q, -128, 127)
w_packed = w_q.to(torch.int8)
return w_packed, w_scale, w_zero
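# Round-trip relations for the reference quantizer above (illustrative):
#   symmetric:  x ≈ x_q.float() * scale_x, with scale_x = absmax / 127
#   asymmetric: w ≈ (w_packed.float() - w_zero) * w_scale, w_zero being the zero point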
def test(
handle,
device,
x_shape,
symmetric,
dtype=InfiniDtype.F16,
sync=None,
):
print(
f"Testing Per Channel Quant Int8 on {InfiniDeviceNames[device]} with x_shape:{x_shape}, symmetric:{symmetric} , dtype:{InfiniDtypeNames[dtype]}"
)
M, K = x_shape
x = TestTensor(x_shape, None, dtype, device)
x_p, x_s, x_z = per_token_quant_int8_torch(x.torch_tensor(), symmetric)
x_packed = TestTensor(x_shape, None, InfiniDtype.I8, device, mode="zeros")
x_scale = TestTensor((M, 1), None, InfiniDtype.F32, device)
if symmetric:
x_zero = None
else:
x_zero = TestTensor((M, 1), None, InfiniDtype.F32, device)
if sync is not None:
sync()
descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreatePerChannelQuantI8Descriptor(
handle,
ctypes.byref(descriptor),
x_packed.descriptor,
x_scale.descriptor,
None if symmetric else x_zero.descriptor,
x.descriptor,
)
)
# Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
x_packed.destroy_desc()
x_scale.destroy_desc()
    if not symmetric:
x_zero.destroy_desc()
workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetPerChannelQuantI8WorkspaceSize(
descriptor, ctypes.byref(workspace_size)
)
)
workspace = TestWorkspace(workspace_size.value, x.device)
def lib_per_channel_quant_int8():
check_error(
LIBINFINIOP.infiniopPerChannelQuantI8(
descriptor,
workspace.data(),
workspace_size.value,
x_packed.data(),
x_scale.data(),
None if symmetric else x_zero.data(),
x.data(),
None,
)
)
lib_per_channel_quant_int8()
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(x_packed.actual_tensor(), x_p, atol=atol, rtol=rtol)
debug(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol)
        if not symmetric:
debug(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol)
if symmetric:
assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=2) and
torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol))
else:
assert (torch.allclose(x_packed.actual_tensor(), x_p, atol=2, rtol=2) and
torch.allclose(x_scale.actual_tensor(), x_s, atol=atol, rtol=rtol) and
torch.allclose(x_zero.actual_tensor(), x_z, atol=atol, rtol=rtol))
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: per_token_quant_int8_torch(x.torch_tensor(), symmetric), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(" lib", lambda: lib_per_channel_quant_int8(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
check_error(LIBINFINIOP.infiniopDestroyPerChannelQuantI8Descriptor(descriptor))
if __name__ == "__main__":
args = get_args()
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
import torch
import ctypes
from ctypes import c_uint64
from libinfiniop import (
LIBINFINIOP,
TestTensor,
get_test_devices,
check_error,
test_operator,
get_args,
debug,
get_tolerance,
TestWorkspace,
InfiniDtype,
InfiniDtypeNames,
InfiniDeviceNames,
infiniopOperatorDescriptor_t,
)
from enum import Enum, auto
# ==============================================================================
# Configuration (Internal Use Only)
# ==============================================================================
# These are not meant to be imported from other modules
_TEST_CASES_ = [
# x_shape = [M,K], w_shape = [N, K], sym, y_shape = [M, N]
((100, 3584), (10752, 3584), True, (100, 10752)),
((1000, 3584), (10752, 3584), True, (1000, 10752)),
((1, 3584), (10752, 3584), True, (1, 10752)),
((2000, 3584), (10752, 3584), True, (2000, 10752)),
]
class Inplace(Enum):
OUT_OF_PLACE = auto()
INPLACE = auto()
# Inplace options applied for each test case in _TEST_CASES_
_INPLACE = [
Inplace.INPLACE,
]
_TEST_CASES = [
test_case + (inplace_item,)
for test_case in _TEST_CASES_
for inplace_item in _INPLACE
]
# Data types used for testing
_TENSOR_DTYPES = [InfiniDtype.BF16, InfiniDtype.F16]
# Tolerance map for different data types
_TOLERANCE_MAP = {
InfiniDtype.F16: {"atol": 3e-1, "rtol": 1e-2},
InfiniDtype.BF16: {"atol": 3e-1, "rtol": 1e-2},
}
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def mm(x, w, bias, out_dtype):
    return (torch.matmul(x, w) + bias).to(out_dtype)
def scaled_mm(x, w_p, w_s, bias, out_dtype):
return (
torch.matmul(x.to(torch.float32), w_p.to(torch.float32)) * w_s.view(1, -1)
+ bias
).to(out_dtype)
def to_int8(tensor: torch.Tensor) -> torch.Tensor:
return torch.round(tensor.clamp(min=-128, max=127)).to(dtype=torch.int8)
def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias):
o = torch.matmul(a.to(torch.float32), b.to(torch.float32))
if bias is not None:
o = o.to(torch.float32) * scale_a.view(-1, 1) * scale_b.view(1, -1) + bias
else:
o = o.to(torch.float32) * scale_a.view(-1, 1) * scale_b.view(1, -1)
return o.to(out_dtype)
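# torch_scaled_mm mirrors the dequantization applied after the int8 GEMM:
#   o[i, j] = (sum_k a[i, k] * b[k, j]) * scale_a[i] * scale_b[j] (+ bias[j])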
def per_token_quant_int8_torch(x):
x = x.float()
absmax = x.abs().max(dim=-1).values
absmax = absmax.clamp_min(1e-10).unsqueeze(-1)
scale_x = absmax / 127
x_q = x.mul(127 / absmax)
x_q = torch.round(x_q).to(torch.int8)
return x_q, scale_x
def test(
handle,
device,
x_shape,
w_shape,
symmetric,
y_shape,
inplace=Inplace.OUT_OF_PLACE,
dtype=InfiniDtype.BF16,
sync=None,
):
print(
f"Testing Linear on {InfiniDeviceNames[device]} with x_shape:{x_shape}, w_shape:{w_shape}, symmetric:{symmetric}, inplace:{inplace} dtype:{InfiniDtypeNames[dtype]}"
)
M, K = x_shape
N = w_shape[0]
x = TestTensor(x_shape, None, dtype, device)
x_packed = TestTensor(x_shape, None, InfiniDtype.I8, device, mode="zeros")
x_scale = TestTensor((M, 1), None, InfiniDtype.F32, device)
dev = x.torch_tensor().device
weights_packed = to_int8(torch.randn(w_shape, device=dev).t() * 5)
weights_scale = torch.randn((N, 1), device=dev, dtype=torch.float32)
bias = (
torch.randn(
(N,),
device=dev,
dtype=torch.float16 if dtype == InfiniDtype.F16 else torch.bfloat16,
)
* 10
)
w_packed = TestTensor(
(K, N),
weights_packed.stride(),
InfiniDtype.I8,
device,
mode="manual",
set_tensor=weights_packed,
)
w_scale = TestTensor(
(N, 1),
weights_scale.stride(),
InfiniDtype.F32,
device,
mode="manual",
set_tensor=weights_scale,
)
weights = w_packed.torch_tensor() * w_scale.torch_tensor().view(1, -1)
y = TestTensor(y_shape, None, dtype, device)
bias = TestTensor(
(N,), bias.stride(), dtype, device, mode="manual", set_tensor=bias
)
x_mm = x.torch_tensor().to(
torch.float16 if dtype == InfiniDtype.F16 else torch.bfloat16
)
w_mm = weights.to(torch.float16 if dtype == InfiniDtype.F16 else torch.bfloat16)
quant_descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreatePerChannelQuantI8Descriptor(
handle,
ctypes.byref(quant_descriptor),
x_packed.descriptor,
x_scale.descriptor,
None,
x.descriptor,
)
)
quant_workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetPerChannelQuantI8WorkspaceSize(
quant_descriptor, ctypes.byref(quant_workspace_size)
)
)
quant_workspace = TestWorkspace(quant_workspace_size.value, x.device)
def lib_per_channel_quant_int8():
check_error(
LIBINFINIOP.infiniopPerChannelQuantI8(
quant_descriptor,
quant_workspace.data(),
quant_workspace_size.value,
x_packed.data(),
x_scale.data(),
None,
x.data(),
None,
)
)
scaled_mm_descriptor = infiniopOperatorDescriptor_t()
check_error(
LIBINFINIOP.infiniopCreateI8GemmDescriptor(
handle,
ctypes.byref(scaled_mm_descriptor),
y.descriptor,
bias.descriptor,
x_packed.descriptor,
x_scale.descriptor,
w_packed.descriptor,
w_scale.descriptor,
)
)
scaled_mm_workspace_size = c_uint64(0)
check_error(
LIBINFINIOP.infiniopGetI8GemmWorkspaceSize(
scaled_mm_descriptor, ctypes.byref(scaled_mm_workspace_size)
)
)
scaled_mm_workspace = TestWorkspace(scaled_mm_workspace_size.value, x_packed.device)
def lib_linear():
check_error(
LIBINFINIOP.infiniopI8Gemm(
scaled_mm_descriptor,
scaled_mm_workspace.data(),
scaled_mm_workspace_size.value,
y.data(),
bias.data(),
x_packed.data(),
x_scale.data(),
w_packed.data(),
w_scale.data(),
None,
)
)
def lib_w8a8int8_linearFunction():
lib_per_channel_quant_int8()
lib_linear()
def lib_torch_mm():
mm(
x_mm,
w_mm,
bias.torch_tensor(),
out_dtype=torch.float16 if dtype == InfiniDtype.F16 else torch.bfloat16,
)
x_p, x_s = per_token_quant_int8_torch(x.torch_tensor())
lib_w8a8int8_linearFunction()
scaled_mm_torch = torch_scaled_mm(
x_p,
w_packed.torch_tensor(),
x_s,
w_scale.torch_tensor(),
torch.float16 if dtype == InfiniDtype.F16 else torch.bfloat16,
bias=bias.torch_tensor(),
)
mm_torch = scaled_mm(
x.torch_tensor(),
w_packed.torch_tensor(),
w_scale.torch_tensor(),
bias.torch_tensor(),
out_dtype=torch.float16 if dtype == InfiniDtype.F16 else torch.bfloat16,
)
if sync is not None:
sync()
atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
if DEBUG:
debug(y.actual_tensor(), mm_torch, atol=atol, rtol=rtol)
# The quantization test did not normalize the test data, leading to large errors; the error check has been temporarily removed.
def profile_operation(name, func, device, num_prerun, num_iterations):
# Warm up
for _ in range(num_prerun):
func()
torch.cuda.synchronize()
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
start.record()
for _ in range(num_iterations):
func()
end.record()
torch.cuda.synchronize()
elapsed = start.elapsed_time(end)
print(
f"{name} took {elapsed / num_iterations:.6f} ms over {num_iterations} iterations"
)
# Profiling workflow
if PROFILE:
profile_operation(
"PyTorch mm ",
lambda: lib_torch_mm(),
device,
NUM_PRERUN,
NUM_ITERATIONS,
)
profile_operation(
"lib total ",
lambda: lib_w8a8int8_linearFunction(),
device,
NUM_PRERUN,
NUM_ITERATIONS,
)
profile_operation(
"lib quant ",
lambda: lib_per_channel_quant_int8(),
device,
NUM_PRERUN,
NUM_ITERATIONS,
)
profile_operation(
"lib scaled mm ",
lambda: lib_linear(),
device,
NUM_PRERUN,
NUM_ITERATIONS,
)
check_error(LIBINFINIOP.infiniopDestroyI8GemmDescriptor(scaled_mm_descriptor))
check_error(
LIBINFINIOP.infiniopDestroyPerChannelQuantI8Descriptor(quant_descriptor)
)
if __name__ == "__main__":
args = get_args()
# Configure testing options
DEBUG = args.debug
PROFILE = args.profile
NUM_PRERUN = args.num_prerun
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
Subproject commit 55f93686c01528224f448c19128836e7df245f72
@@ -11,6 +11,7 @@ set_encodings("utf-8")
add_includedirs("include")
add_includedirs("third_party/spdlog/include")
add_includedirs("third_party/nlohmann_json/single_include/")
if is_mode("debug") then
    add_defines("DEBUG_MODE")
@@ -330,6 +331,7 @@ target("infiniop")
if has_config("qy-gpu") then
    add_deps("infiniop-qy")
    add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/nvidia/*.cu.o", {public = true})
    add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/ops/*/*/nvidia/*.cu.o", {public = true})
    add_files("build/.objs/infiniop-qy/rules/qy.cuda/src/infiniop/devices/nvidia/*.cu.o", {public = true})
end
@@ -353,7 +355,7 @@ target("infiniop")
end
set_languages("cxx17")
add_files("src/infiniop/devices/handle.cc")
add_files("src/infiniop/ops/*/operator.cc")
add_files("src/infiniop/ops/*/operator.cc", "src/infiniop/ops/*/*/operator.cc")
add_files("src/infiniop/*.cc")
set_installdir(os.getenv("INFINI_ROOT") or (os.getenv(is_host("windows") and "HOMEPATH" or "HOME") .. "/.infini"))
......
@@ -71,7 +71,7 @@ target("infiniop-nvidia")
end
set_languages("cxx17")
add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../src/infiniop/ops/*/*/nvidia/*.cu")
if has_config("ninetoothed") then
    add_files("../build/ninetoothed/*.c", "../build/ninetoothed/*.cpp")
......
@@ -99,7 +99,7 @@ target("infiniop-qy")
add_cuflags("-Xcompiler=-Wno-error=deprecated-declarations")
set_languages("cxx17")
add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu")
add_files("../src/infiniop/devices/nvidia/*.cu", "../src/infiniop/ops/*/nvidia/*.cu", "../src/infiniop/ops/*/*/nvidia/*.cu")
if has_config("ninetoothed") then
    add_files("../build/ninetoothed/*.c", "../build/ninetoothed/*.cpp")
......