Unverified commit 5675a4af, authored by thatPepe and committed by GitHub
Browse files

Merge pull request #1018 from InfiniTensor/issue/972

Issue/972:摩尔平台基于 muDNN 的 w8a8 量化实现,并完善 scaled_mm_int8 python 测试脚本
parents 6ec2ea40 6841663b
// Public API header for the Moore (MUSA) backend of the per-channel INT8
// quantization operator. DESCRIPTOR(moore) expands the operator's shared
// Descriptor declaration (defined in the included header) into the
// backend-specific `moore` namespace.
#ifndef __PER_CHANNEL_QUANT_INT8_MOORE_API_H__
#define __PER_CHANNEL_QUANT_INT8_MOORE_API_H__
#include "../per_channel_quant_int8.h"
DESCRIPTOR(moore)
#endif // __PER_CHANNEL_QUANT_INT8_MOORE_API_H__
#include "../../../../devices/moore/moore_common.h"
#include "per_channel_quant_int8_moore.h"
#include "../../../../devices/moore/moore_kernel_common.h"
#include "../../../../reduce/cuda/reduce.cuh"
#include <cub/block/block_reduce.cuh>
#include "../cuda/kernel.cuh"
// Device entry point: asymmetric (scale + zero-point) per-channel INT8
// quantization of an M x K matrix. Launched by the host wrapper below with
// one thread block per row; this is a thin forwarder to the shared kernel
// body in ../cuda/kernel.cuh.
template <typename T, unsigned int BLOCK_SIZE>
INFINIOP_MOORE_KERNEL blockPerChannelQuantI8(
    int8_t *x_packed, float *x_scale, float *x_zero, const T *x, int M, int K) {
    blockPerChannelQuantI8Kernel<T, BLOCK_SIZE>(x_packed, x_scale, x_zero, x, M, K);
}
// Device entry point: symmetric (scale-only, no zero-point) per-channel INT8
// quantization of an M x K matrix. One thread block per row (see the host
// launcher); forwards to the shared kernel body in ../cuda/kernel.cuh.
template <typename T, unsigned int BLOCK_SIZE>
INFINIOP_MOORE_KERNEL blockPerChannelQuantI8Sym(
    int8_t *x_packed, float *x_scale, const T *x, int M, int K) {
    blockPerChannelQuantI8SymKernel<T, BLOCK_SIZE>(x_packed, x_scale, x, M, K);
}
// Device entry point: asymmetric per-channel INT8 quantization, narrow-row
// variant. Launched with BLOCK_SIZE_x x BLOCK_SIZE_y blocks — presumably one
// warp (x-dimension) per row with threadIdx.y selecting the row; confirm in
// ../cuda/kernel.cuh. Thin forwarder to the shared kernel body.
template <typename T, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y>
INFINIOP_MOORE_KERNEL warpPerChannelQuantI8(
    int8_t *x_packed, float *x_scale, float *x_zero, const T *x, int M, int K) {
    warpPerChannelQuantI8Kernel<T, BLOCK_SIZE_x, BLOCK_SIZE_y>(x_packed, x_scale, x_zero, x, M, K);
}
// Device entry point: symmetric per-channel INT8 quantization, narrow-row
// variant (no zero-point output). Same launch geometry as
// warpPerChannelQuantI8; thin forwarder to the shared kernel body.
template <typename T, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y>
INFINIOP_MOORE_KERNEL warpPerChannelQuantI8Sym(
    int8_t *x_packed, float *x_scale, const T *x, int M, int K) {
    warpPerChannelQuantI8SymKernel<T, BLOCK_SIZE_x, BLOCK_SIZE_y>(x_packed, x_scale, x, M, K);
}
namespace op::per_channel_quant_int8::moore {
// Backend-private state for the Moore descriptor: keeps the device-internal
// handle, used at launch time to query device limits (maxThreadsPerBlock).
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
// Releases the backend-private Opaque state owned by this descriptor.
Descriptor::~Descriptor() {
delete _opaque;
}
// Creates a Moore-backend descriptor for per-channel INT8 quantization.
// Shape/dtype metadata is derived from the tensor descriptors by
// createPerChannelQuantI8Info; CHECK_RESULT propagates any failure status.
// The workspace size is fixed to 0 — the launch path allocates nothing.
infiniStatus_t Descriptor::create(
infiniopHandle_t handle, Descriptor **desc_ptr,
infiniopTensorDescriptor_t x_packed_desc,
infiniopTensorDescriptor_t x_scale_desc,
infiniopTensorDescriptor_t x_zero_desc,
infiniopTensorDescriptor_t x_desc) {
auto info = PerChannelQuantI8Info::createPerChannelQuantI8Info(x_packed_desc, x_scale_desc, x_zero_desc, x_desc);
CHECK_RESULT(info);
// Opaque keeps the device-internal handle used later to pick a block size.
*desc_ptr = new Descriptor(
new Opaque{reinterpret_cast<device::moore::Handle *>(handle)->internal()},
info.take(), 0, handle->device, handle->device_id);
return INFINI_STATUS_SUCCESS;
}
// Host-side launcher that picks a kernel flavor by row width:
//  - wide rows (K >= 1024): one thread block of BLOCK_SIZE threads per row;
//  - narrow rows: 32x32 blocks, each covering 32 rows (presumably one warp
//    per row — confirm in ../cuda/kernel.cuh).
// A null x_zero selects the symmetric (no zero-point) variants.
// Always reports success; kernel launch errors are not queried here.
template <unsigned int BLOCK_SIZE, typename Tdata>
infiniStatus_t per_channel_quant_int8Kernel(const PerChannelQuantI8Info &info, int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, musaStream_t stream) {
    const int rows = (int)info.M;
    const int cols = (int)info.K;
    const bool symmetric = (x_zero == nullptr);
    if (cols >= 1024) {
        // Block-per-row path for wide rows.
        if (symmetric) {
            blockPerChannelQuantI8Sym<Tdata, BLOCK_SIZE>
                <<<rows, BLOCK_SIZE, 0, stream>>>(x_packed, x_scale, x, rows, cols);
        } else {
            blockPerChannelQuantI8<Tdata, BLOCK_SIZE>
                <<<rows, BLOCK_SIZE, 0, stream>>>(x_packed, x_scale, x_zero, x, rows, cols);
        }
    } else {
        // Warp-per-row path: 32 lanes per row, 32 rows per block.
        constexpr unsigned int LANES_PER_ROW = 32;
        constexpr unsigned int ROWS_PER_BLOCK = 32;
        const dim3 block_dim(LANES_PER_ROW, ROWS_PER_BLOCK, 1);
        const dim3 grid_dim((rows + ROWS_PER_BLOCK - 1) / ROWS_PER_BLOCK, 1, 1);
        if (symmetric) {
            warpPerChannelQuantI8Sym<Tdata, LANES_PER_ROW, ROWS_PER_BLOCK>
                <<<grid_dim, block_dim, 0, stream>>>(x_packed, x_scale, x, rows, cols);
        } else {
            warpPerChannelQuantI8<Tdata, LANES_PER_ROW, ROWS_PER_BLOCK>
                <<<grid_dim, block_dim, 0, stream>>>(x_packed, x_scale, x_zero, x, rows, cols);
        }
    }
    return INFINI_STATUS_SUCCESS;
}
// Dispatches the quantization launch according to the device's maximum
// thread-block size and the input element dtype.
// workspace / workspace_size are unused: create() fixed the workspace to 0.
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
void *x_packed, void *x_scale, void *x_zero, const void *x,
void *stream_) const {
musaStream_t stream = (musaStream_t)stream_;
// Instantiates the templated launcher for one (block size, element type) pair.
#define QUANT(BLOCK_SIZE, TDATA) \
per_channel_quant_int8Kernel<BLOCK_SIZE, TDATA>(_info, (int8_t *)x_packed, (float *)x_scale, (float *)x_zero, (const TDATA *)x, stream)
// Runtime dtype dispatch; __mt_bfloat16 is the MUSA bfloat16 element type.
#define QUANT_WITH_BLOCK_SIZE(BLOCK_SIZE) \
{ \
if (_info.dtype == INFINI_DTYPE_F16) \
return QUANT(BLOCK_SIZE, half); \
else if (_info.dtype == INFINI_DTYPE_F32) \
return QUANT(BLOCK_SIZE, float); \
else if (_info.dtype == INFINI_DTYPE_BF16) \
return QUANT(BLOCK_SIZE, __mt_bfloat16); \
else \
return INFINI_STATUS_BAD_TENSOR_DTYPE; \
}
// Only devices whose block limit is exactly 1024 or 512 threads are handled;
// anything else is reported as an unsupported architecture.
if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) {
QUANT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024)
} else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) {
QUANT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512)
} else {
return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
}
return INFINI_STATUS_SUCCESS;
}
} // namespace op::per_channel_quant_int8::moore
......@@ -5,6 +5,9 @@
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#include "nvidia/per_channel_quant_int8_nvidia.cuh"
#endif
#if defined(ENABLE_MOORE_API)
#include "moore/per_channel_quant_int8_moore.h"
#endif
__C infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t handle,
infiniopPerChannelQuantI8Descriptor_t *desc_ptr,
......@@ -27,6 +30,9 @@ __C infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t ha
#endif
#ifdef ENABLE_QY_API
CREATE(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_MOORE_API
CREATE(INFINI_DEVICE_MOORE, moore)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -45,6 +51,9 @@ __C infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQ
#endif
#ifdef ENABLE_QY_API
GET(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_MOORE_API
GET(INFINI_DEVICE_MOORE, moore)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -71,6 +80,9 @@ __C infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor
#endif
#ifdef ENABLE_QY_API
QUANT(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_MOORE_API
QUANT(INFINI_DEVICE_MOORE, moore)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -90,6 +102,9 @@ __C infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannel
#endif
#ifdef ENABLE_QY_API
DESTROY(INFINI_DEVICE_QY, nvidia)
#endif
#ifdef ENABLE_MOORE_API
DESTROY(INFINI_DEVICE_MOORE, moore)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
#ifndef __GEMM_INFO_H__
#ifndef __I8GEMM_INFO_H__
#define __I8GEMM_INFO_H__
#include "../../../utils.h"
......
......@@ -18,8 +18,8 @@
size_t workspace_size, \
infiniDtype_t out_dtype, \
infiniDevice_t device_type, int device_id) \
: InfiniopDescriptor{device_type, device_id}, _out_dtype(out_dtype), \
_opaque(opaque), _info(info), _workspace_size(workspace_size) {} \
: InfiniopDescriptor{device_type, device_id}, _opaque(opaque), \
_workspace_size(workspace_size), _info(info), _out_dtype(out_dtype) {} \
\
public: \
~Descriptor(); \
......
// Public API header for the Moore (MUSA) backend of the INT8 GEMM operator.
// DESCRIPTOR(moore) expands the shared Descriptor declaration from
// int8_gemm.h into the backend-specific `moore` namespace.
#ifndef __INT8_GEMM_MOORE_API_H__
#define __INT8_GEMM_MOORE_API_H__
#include "../int8_gemm.h"
DESCRIPTOR(moore)
#endif // __INT8_GEMM_MOORE_API_H__
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_handle.h"
#include "int8_gemm_moore.h"
namespace op::i8gemm::moore {
// Runs the INT8 batched GEMM out = (A x B) with per-row/per-column
// dequantization scales (+ optional bias) through muDNN's BatchMatMul "Lt"
// path on the given MUSA stream.
// A and B are INT8 device buffers; A_scale holds one float per output row
// (dims {batch, m, 1} below), B_scale one float per output column
// (dims {batch, 1, n}); the output is HALF or BFLOAT16 per out_dtype.
// NOTE(review): this helper returns void, so the status produced by the
// lambda / useMudnn / RunLt is discarded and launch failures are silent —
// consider propagating it to the caller.
static void moore_i8gemm_launch(
const I8GemmInfo &info,
std::shared_ptr<device::moore::Handle::Internal> &internal,
void* out,
const int8_t* A,
const int8_t* B,
const float* A_scale,
const float* B_scale,
const void* bias,
infiniDtype_t out_dtype,
musaStream_t stream)
{
internal->useMudnn(stream,
[&](::musa::dnn::Handle &mudnn_handle) -> infiniStatus_t {
// 1. Operator: batched matmul with tensor-core compute mode.
auto matmul = std::make_unique<::musa::dnn::BatchMatMul>();
matmul->SetComputeMode(::musa::dnn::BatchMatMul::ComputeMode::TENSOR);
// 2. Tensor descriptors for every operand.
::musa::dnn::Tensor out_t, a_t, b_t, bias_t;
::musa::dnn::Tensor scale_a_t, scale_b_t;
// 3. Output (and bias) dtype — create() restricts out_dtype to F16/BF16,
// so the else branch is the BF16 case.
if (out_dtype == INFINI_DTYPE_F16) {
out_t.SetType(::musa::dnn::Tensor::Type::HALF);
bias_t.SetType(::musa::dnn::Tensor::Type::HALF);
} else {
out_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16);
bias_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16);
}
// 4. Inputs are INT8.
a_t.SetType(::musa::dnn::Tensor::Type::INT8);
b_t.SetType(::musa::dnn::Tensor::Type::INT8);
// 5. Dequantization scales are FP32.
scale_a_t.SetType(::musa::dnn::Tensor::Type::FLOAT);
scale_b_t.SetType(::musa::dnn::Tensor::Type::FLOAT);
// 6. Bind device memory (muDNN takes non-const pointers).
out_t.SetAddr(out);
a_t.SetAddr(const_cast<int8_t*>(A));
b_t.SetAddr(const_cast<int8_t*>(B));
scale_a_t.SetAddr(const_cast<float*>(A_scale));
scale_b_t.SetAddr(const_cast<float*>(B_scale));
if (bias)
bias_t.SetAddr(const_cast<void*>(bias));
// 7. A layout: physical dims depend on whether A is stored transposed
// (col_stride != 1); the transpose itself is signaled in step 11.
{
std::array<int64_t,3> dims;
std::array<int64_t,3> strides;
if (info.a_matrix.col_stride != 1) {
dims = {info.batch, info.k, info.m};
} else {
dims = {info.batch, info.m, info.k};
}
strides = {
info.a_matrix.stride,
info.a_matrix.ld(),
1
};
a_t.SetNdInfo(3, dims.data(), strides.data());
}
// 8. B layout, same convention as A.
{
std::array<int64_t,3> dims;
std::array<int64_t,3> strides;
if (info.b_matrix.col_stride != 1) {
dims = {info.batch, info.n, info.k};
} else {
dims = {info.batch, info.k, info.n};
}
strides = {
info.b_matrix.stride,
info.b_matrix.ld(),
1
};
b_t.SetNdInfo(3, dims.data(), strides.data());
}
// 9. Output layout: dense (batch, m, n).
{
std::array<int64_t, 3> dims = {
info.batch,
info.m,
info.n
};
std::array<int64_t, 3> strides = {
info.m * info.n,
info.n,
1
};
out_t.SetNdInfo(3, dims.data(), strides.data());
}
// 10. Bias is a length-n vector; scale tensors broadcast per-row (A)
// and per-column (B) over the output.
if (bias) {
std::array<int64_t,1> dims = { info.n };
std::array<int64_t,1> strides = { 1 };
bias_t.SetNdInfo(1, dims.data(), strides.data());
}
{
std::array<int64_t,3> a_scale_dims = { info.batch, info.m, 1 };
std::array<int64_t,3> a_scale_strides = { info.m, 1, 1 };
scale_a_t.SetNdInfo(3, a_scale_dims.data(), a_scale_strides.data());
std::array<int64_t,3> b_scale_dims = { info.batch, 1, info.n };
std::array<int64_t,3> b_scale_strides = { info.n, 1, 1 };
scale_b_t.SetNdInfo(3, b_scale_dims.data(), b_scale_strides.data());
}
// 11. Transpose flags, derived from the stored layouts of A and B.
matmul->SetTranspose(
info.a_matrix.col_stride != 1,
info.b_matrix.col_stride != 1);
// 12. Lt param: attach the two dequantization scale tensors; the two
// empty tensors are unused scale slots.
::musa::dnn::MatMulLtParam lt_param;
lt_param.SetScale(
scale_a_t,
scale_b_t,
::musa::dnn::Tensor(),
::musa::dnn::Tensor());
// 13. alpha=1, beta=0 (standard GEMM blend); Gamma presumably weights
// the bias/epilogue term — confirm against the muDNN reference.
matmul->SetAlpha(1.0);
matmul->SetBeta(0.0);
matmul->SetGamma(1.0);
// 14. Workspace: muDNN allocates scratch on demand via this callback.
// NOTE(review): musaMalloc's return code is not checked — a failed
// allocation hands muDNN a null pointer; confirm that is tolerated.
::musa::dnn::MemoryMaintainer maintainer =
[](size_t size) {
void* ptr = nullptr;
musaMalloc(&ptr, size);
return ::musa::dnn::MemoryHandler(
ptr,
[](void* p) { if (p) musaFree(p); });
};
// 15. Run. The empty Tensor appears to be an unused operand slot, and
// bias is only passed when provided. RunLt's status is swallowed
// (see the NOTE at the top of this function).
matmul->RunLt(
mudnn_handle,
out_t,
a_t,
b_t,
::musa::dnn::Tensor(),
bias ? bias_t : ::musa::dnn::Tensor(),
lt_param,
maintainer);
return INFINI_STATUS_SUCCESS;
});
}
/* ================= Descriptor ================= */
// Backend-private state for the Moore INT8 GEMM descriptor: the device
// internals handle, used to obtain the muDNN context at launch time.
struct Descriptor::Opaque {
std::shared_ptr<device::moore::Handle::Internal> internal;
};
// Releases the backend-private Opaque state owned by this descriptor.
Descriptor::~Descriptor() {
delete _opaque;
}
// Creates a Moore-backend INT8 GEMM descriptor.
// The output dtype is restricted to F16 or BF16 here (CHECK_DTYPE).
// NOTE(review): bias_desc, a_scale_desc and b_scale_desc are accepted but
// never validated in this function; their shapes are assumed to match what
// calculate() expects — consider adding checks.
// Workspace size is fixed to 0: muDNN allocates its own scratch at run time
// via the MemoryMaintainer in moore_i8gemm_launch.
infiniStatus_t Descriptor::create(
infiniopHandle_t handle_,
Descriptor **desc_ptr,
infiniopTensorDescriptor_t out_desc,
infiniopTensorDescriptor_t bias_desc,
infiniopTensorDescriptor_t a_desc,
infiniopTensorDescriptor_t a_scale_desc,
infiniopTensorDescriptor_t b_desc,
infiniopTensorDescriptor_t b_scale_desc)
{
auto handle = reinterpret_cast<device::moore::Handle *>(handle_);
auto dtype = out_desc->dtype();
CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16);
// Layout info is derived from the out/a/b descriptors only.
auto result = I8GemmInfo::create(
out_desc, a_desc, b_desc, MatrixLayout::COL_MAJOR);
CHECK_RESULT(result);
*desc_ptr = new Descriptor(
new Opaque{handle->internal()},
result.take(),
0,
dtype,
handle->device,
handle->device_id);
return INFINI_STATUS_SUCCESS;
}
// Executes the INT8 GEMM on the given stream via moore_i8gemm_launch.
// workspace / workspace_size are unused: muDNN manages its own scratch
// memory (see the MemoryMaintainer in moore_i8gemm_launch).
// NOTE(review): moore_i8gemm_launch returns void, so this method reports
// success unconditionally even if the underlying muDNN call failed.
infiniStatus_t Descriptor::calculate(
void *workspace,
size_t workspace_size,
void *out,
const void *bias,
const void *a,
const void *a_scale,
const void *b,
const void *b_scale,
void *stream_) const
{
moore_i8gemm_launch(
_info,
_opaque->internal,
out,
static_cast<const int8_t*>(a),
static_cast<const int8_t*>(b),
static_cast<const float*>(a_scale),
static_cast<const float*>(b_scale),
bias,
_out_dtype,
reinterpret_cast<musaStream_t>(stream_));
return INFINI_STATUS_SUCCESS;
}
} // namespace op::i8gemm::moore
......@@ -6,6 +6,10 @@
#include "nvidia/int8_gemm_nvidia.cuh"
#endif
#if defined(ENABLE_MOORE_API)
#include "moore/int8_gemm_moore.h"
#endif
__C infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle,
infiniopI8GemmDescriptor_t *desc_ptr,
infiniopTensorDescriptor_t out_desc,
......@@ -31,6 +35,9 @@ __C infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle,
#endif
#if defined(ENABLE_QY_API)
CREATE(INFINI_DEVICE_QY, nvidia)
#endif
#if defined(ENABLE_MOORE_API)
CREATE(INFINI_DEVICE_MOORE, moore)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -49,6 +56,9 @@ __C infiniStatus_t infiniopGetI8GemmWorkspaceSize(infiniopI8GemmDescriptor_t des
#endif
#if defined(ENABLE_QY_API)
GET(INFINI_DEVICE_QY, nvidia)
#endif
#if defined(ENABLE_MOORE_API)
GET(INFINI_DEVICE_MOORE, moore)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -76,6 +86,9 @@ __C infiniStatus_t infiniopI8Gemm(infiniopI8GemmDescriptor_t desc,
#endif
#if defined(ENABLE_QY_API)
CACULATE(INFINI_DEVICE_QY, nvidia)
#endif
#if defined(ENABLE_MOORE_API)
CACULATE(INFINI_DEVICE_MOORE, moore)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......@@ -94,6 +107,9 @@ __C infiniStatus_t infiniopDestroyI8GemmDescriptor(infiniopI8GemmDescriptor_t de
#endif
#if defined(ENABLE_QY_API)
DESTROY(INFINI_DEVICE_QY, nvidia)
#endif
#if defined(ENABLE_MOORE_API)
DESTROY(INFINI_DEVICE_MOORE, moore)
#endif
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
......
......@@ -59,10 +59,8 @@ _TOLERANCE_MAP = {
DEBUG = False
PROFILE = False
NUM_PRERUN = 10
NUM_ITERATIONS = 1000
def to_int8(tensor: torch.Tensor) -> torch.Tensor:
    """Saturate ``tensor`` to the int8 range [-128, 127] and cast to torch.int8.

    Clamping happens before rounding, so values just outside the range
    cannot round past the int8 bounds before the cast.
    """
    clipped = tensor.clamp(min=-128, max=127)
    return clipped.round().to(torch.int8)
NUM_ITERATIONS = 100
def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias):
o = torch.matmul(a.to(torch.float32), b.to(torch.float32))
......@@ -72,6 +70,7 @@ def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias):
o = o.to(torch.float32) * scale_a.view(-1, 1) * scale_b.view(1, -1)
return o.to(out_dtype)
def test(
handle,
device,
......@@ -83,34 +82,91 @@ def test(
sync=None,
):
print(
f"Testing Linear on {InfiniDeviceNames[device]} with x_shape:{x_shape}, w_shape:{w_shape}, inplace:{inplace} dtype:{InfiniDtypeNames[dtype]}"
f"Testing scaled_mm_int8 on {InfiniDeviceNames[device]} with x_shape:{x_shape}, w_shape:{w_shape}, inplace:{inplace} dtype:{InfiniDtypeNames[dtype]}"
)
M, K = x_shape
N = w_shape[1]
x_packed = to_int8(torch.randn((M, K), device="cuda") * 5)
weights = to_int8(torch.randn((N, K), device="cuda").t() * 5)
x_scale = torch.randn((M,), device="cuda", dtype=torch.float32)
weights_scale = torch.randn((N,), device="cuda", dtype=torch.float32)
bias = torch.randn((N,), device="cuda", dtype=torch.float16 if dtype == InfiniDtype.F16 else torch.bfloat16) * 10
ans = torch_scaled_mm(x_packed, weights, x_scale, weights_scale, torch.float16 if dtype == InfiniDtype.F16 else torch.bfloat16, bias=bias)
# --- Tensor Descriptor ---
# orig: create a random int8 tensor as the reference data source
# torch: extract the torch view to adjust layout/stride
# final: wrap it back as TestTensor with explicit stride for device execution
x_packed_orig = TestTensor(
(M, K),
None,
InfiniDtype.I8,
device,
mode="randint",
randint_low=-128,
randint_high=127,
)
x_packed_torch = x_packed_orig.torch_tensor()
x_packed = TestTensor(
(M, K), x_packed.stride(), InfiniDtype.I8, device, mode="manual", set_tensor=x_packed
(M, K),
x_packed_torch.stride(),
InfiniDtype.I8,
device,
mode="manual",
set_tensor=x_packed_torch,
)
x_scale = TestTensor(
(M,), x_scale.stride(), InfiniDtype.F32, device, mode="manual", set_tensor=x_scale
weights_orig = TestTensor(
(N, K),
None,
InfiniDtype.I8,
device,
mode="randint",
randint_low=-128,
randint_high=127,
)
weights_torch = weights_orig.torch_tensor().t()
weights = TestTensor(
(K, N), weights.stride(), InfiniDtype.I8, device, mode="manual", set_tensor=weights
(K, N),
weights_torch.stride(),
InfiniDtype.I8,
device,
mode="manual",
set_tensor=weights_torch,
)
x_scale_orig = TestTensor((M,), None, InfiniDtype.F32, device, mode="random")
x_scale_torch = x_scale_orig.torch_tensor()
x_scale = TestTensor(
(M,),
x_scale_torch.stride(),
InfiniDtype.F32,
device,
mode="manual",
set_tensor=x_scale_torch,
)
weights_scale_orig = TestTensor((N,), None, InfiniDtype.F32, device, mode="random")
weights_scale_torch = weights_scale_orig.torch_tensor()
weights_scale = TestTensor(
(N,), weights_scale.stride(), InfiniDtype.F32, device, mode="manual", set_tensor=weights_scale
(N,),
weights_scale_torch.stride(),
InfiniDtype.F32,
device,
mode="manual",
set_tensor=weights_scale_torch,
)
bias_orig = TestTensor((N,), None, dtype, device, mode="random")
bias_torch = bias_orig.torch_tensor()
bias = TestTensor(
(N,), bias_torch.stride(), dtype, device, mode="manual", set_tensor=bias_torch
)
y = TestTensor(y_shape, None, dtype, device, mode="zeros")
ans = torch_scaled_mm(
x_packed.torch_tensor(),
weights.torch_tensor(),
x_scale.torch_tensor(),
weights_scale.torch_tensor(),
out_dtype=torch.float16 if dtype == InfiniDtype.F16 else torch.bfloat16,
bias=bias.torch_tensor(),
)
y = TestTensor(y_shape, None, dtype, device)
bias = TestTensor((N,), bias.stride(), dtype, device, mode="manual", set_tensor=bias)
descriptor = infiniopOperatorDescriptor_t()
check_error(
......@@ -164,7 +220,20 @@ def test(
# Profiling workflow
if PROFILE:
# fmt: off
profile_operation("PyTorch", lambda: torch_scaled_mm(x_packed, weights, x_scale, weights_scale, torch.float16 if dtype == InfiniDtype.F16 else torch.bfloat16, bias=bias), device, NUM_PRERUN, NUM_ITERATIONS)
profile_operation(
"PyTorch",
lambda: torch_scaled_mm(
x_packed.torch_tensor(),
weights.torch_tensor(),
x_scale.torch_tensor(),
weights_scale.torch_tensor(),
out_dtype=torch.float16 if dtype == InfiniDtype.F16 else torch.bfloat16,
bias=bias.torch_tensor()
),
device,
NUM_PRERUN,
NUM_ITERATIONS
)
profile_operation(" lib", lambda: lib_linear(), device, NUM_PRERUN, NUM_ITERATIONS)
# fmt: on
......@@ -181,6 +250,12 @@ if __name__ == "__main__":
NUM_ITERATIONS = args.num_iterations
for device in get_test_devices(args):
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
# muDNN(v3101): INT8 quantized multiplication → BF16 output.
# Moore backend: BF16 output only.
if args.moore == True:
_TENSOR_DTYPES_MOORE = [InfiniDtype.BF16]
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES_MOORE)
else:
test_operator(device, test, _TEST_CASES, _TENSOR_DTYPES)
print("\033[92mTest passed!\033[0m")
......@@ -48,6 +48,9 @@ target("infiniop-moore")
-- Add source files for Moore muBLAS/muDNN GEMM backends.
add_files("../src/infiniop/ops/gemm/moore/*/*.mu", {rule = "mu"})
-- Add source files for Moore per_channel_quant_int8 backends.
add_files("../src/infiniop/ops/quant/per_channel_quant_int8/moore/*.mu", {rule = "mu"})
target_end()
target("infinirt-moore")
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment