Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
5675a4af
Unverified
Commit
5675a4af
authored
Feb 12, 2026
by
thatPepe
Committed by
GitHub
Feb 12, 2026
Browse files
Merge pull request #1018 from InfiniTensor/issue/972
Issue/972:摩尔平台基于 muDNN 的 w8a8 量化实现,并完善 scaled_mm_int8 python 测试脚本
parents
6ec2ea40
6841663b
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
504 additions
and
27 deletions
+504
-27
src/infiniop/ops/quant/per_channel_quant_int8/moore/per_channel_quant_int8_moore.h
...r_channel_quant_int8/moore/per_channel_quant_int8_moore.h
+7
-0
src/infiniop/ops/quant/per_channel_quant_int8/moore/per_channel_quant_int8_moore.mu
..._channel_quant_int8/moore/per_channel_quant_int8_moore.mu
+116
-0
src/infiniop/ops/quant/per_channel_quant_int8/operator.cc
src/infiniop/ops/quant/per_channel_quant_int8/operator.cc
+15
-0
src/infiniop/ops/scaled_mm/info.h
src/infiniop/ops/scaled_mm/info.h
+1
-1
src/infiniop/ops/scaled_mm/int8_gemm.h
src/infiniop/ops/scaled_mm/int8_gemm.h
+2
-2
src/infiniop/ops/scaled_mm/moore/int8_gemm_moore.h
src/infiniop/ops/scaled_mm/moore/int8_gemm_moore.h
+7
-0
src/infiniop/ops/scaled_mm/moore/int8_gemm_moore.mu
src/infiniop/ops/scaled_mm/moore/int8_gemm_moore.mu
+238
-0
src/infiniop/ops/scaled_mm/operator.cc
src/infiniop/ops/scaled_mm/operator.cc
+16
-0
test/infiniop/scaled_mm_int8.py
test/infiniop/scaled_mm_int8.py
+99
-24
xmake/moore.lua
xmake/moore.lua
+3
-0
No files found.
src/infiniop/ops/quant/per_channel_quant_int8/moore/per_channel_quant_int8_moore.h
0 → 100644
View file @
5675a4af
#ifndef __PER_CHANNEL_QUANT_INT8_MOORE_API_H__
#define __PER_CHANNEL_QUANT_INT8_MOORE_API_H__

#include "../per_channel_quant_int8.h"

// Instantiate the per-channel INT8 quantization Descriptor declaration for the
// Moore (MUSA) backend. DESCRIPTOR is the shared class-generating macro defined
// in the included common header.
DESCRIPTOR(moore)

#endif // __PER_CHANNEL_QUANT_INT8_MOORE_API_H__
src/infiniop/ops/quant/per_channel_quant_int8/moore/per_channel_quant_int8_moore.mu
0 → 100644
View file @
5675a4af
#include "../../../../devices/moore/moore_common.h"
#include "per_channel_quant_int8_moore.h"
#include "../../../../devices/moore/moore_kernel_common.h"
#include "../../../../reduce/cuda/reduce.cuh"
#include <cub/block/block_reduce.cuh>
#include "../cuda/kernel.cuh"
// Global-kernel wrappers for the Moore (MUSA) backend. Each one forwards
// directly to the shared device-side implementation in ../cuda/kernel.cuh so
// the quantization logic stays single-sourced across backends.
// Naming convention: "block*" variants are launched one thread block per row,
// "warp*" variants use a 2-D block; "*Sym" variants are symmetric (scale only,
// no zero point).

// Asymmetric per-channel quantization, one block per row.
template <typename Tdata, unsigned int BLOCK_SIZE>
INFINIOP_MOORE_KERNEL blockPerChannelQuantI8(
    int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, int M, int K) {
    blockPerChannelQuantI8Kernel<Tdata, BLOCK_SIZE>(x_packed, x_scale, x_zero, x, M, K);
}

// Symmetric per-channel quantization (no zero point), one block per row.
template <typename Tdata, unsigned int BLOCK_SIZE>
INFINIOP_MOORE_KERNEL blockPerChannelQuantI8Sym(
    int8_t *x_packed, float *x_scale, const Tdata *x, int M, int K) {
    blockPerChannelQuantI8SymKernel<Tdata, BLOCK_SIZE>(x_packed, x_scale, x, M, K);
}

// Asymmetric per-channel quantization with a 2-D thread block
// (BLOCK_SIZE_x lanes per row, BLOCK_SIZE_y rows per block).
template <typename Tdata, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y>
INFINIOP_MOORE_KERNEL warpPerChannelQuantI8(
    int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, int M, int K) {
    warpPerChannelQuantI8Kernel<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>(x_packed, x_scale, x_zero, x, M, K);
}

// Symmetric counterpart of warpPerChannelQuantI8.
template <typename Tdata, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y>
INFINIOP_MOORE_KERNEL warpPerChannelQuantI8Sym(
    int8_t *x_packed, float *x_scale, const Tdata *x, int M, int K) {
    warpPerChannelQuantI8SymKernel<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>(x_packed, x_scale, x, M, K);
}
namespace op::per_channel_quant_int8::moore {
// Backend-private descriptor state: keeps the Moore handle internals
// (device limits, stream helpers) alive for the descriptor's lifetime.
struct Descriptor::Opaque {
    std::shared_ptr<device::moore::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    // _opaque is an owning raw pointer allocated in create().
    delete _opaque;
}
/// Builds a per-channel INT8 quantization descriptor for the Moore backend.
/// Validates the tensor descriptors, captures the launch metadata, and stores
/// the backend handle internals. No workspace is required (size 0).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle, Descriptor **desc_ptr,
    infiniopTensorDescriptor_t x_packed_desc,
    infiniopTensorDescriptor_t x_scale_desc,
    infiniopTensorDescriptor_t x_zero_desc,
    infiniopTensorDescriptor_t x_desc) {
    // Shape/dtype validation and metadata extraction happen up front; bail out
    // on any inconsistency before allocating the descriptor.
    auto info_result = PerChannelQuantI8Info::createPerChannelQuantI8Info(
        x_packed_desc, x_scale_desc, x_zero_desc, x_desc);
    CHECK_RESULT(info_result);

    auto moore_handle = reinterpret_cast<device::moore::Handle *>(handle);
    *desc_ptr = new Descriptor(
        new Opaque{moore_handle->internal()},
        info_result.take(), /*workspace_size=*/0,
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Host-side launch dispatch for per-channel INT8 quantization.
//
// Strategy (selected by row length K):
//   - K >= 1024: one thread block of BLOCK_SIZE threads per row -> grid of M
//     blocks ("block*" kernels).
//   - K <  1024: a 2-D block of 32x32 threads where each block covers
//     BLOCK_SIZE_y rows -> grid of ceil(M / 32) blocks ("warp*" kernels).
//
// x_zero == nullptr selects the symmetric (scale-only) kernel variants.
// NOTE(review): kernel launch errors are not checked here — presumably the
// caller/stream surfaces them; confirm against project convention.
template <unsigned int BLOCK_SIZE, typename Tdata>
infiniStatus_t per_channel_quant_int8Kernel(const PerChannelQuantI8Info &info, int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, musaStream_t stream) {
    int M = (int)info.M;
    int K = (int)info.K;
    if (K >= 1024) {
        if (x_zero == nullptr) {
            blockPerChannelQuantI8Sym<Tdata, BLOCK_SIZE>
                <<<M, BLOCK_SIZE, 0, stream>>>(x_packed, x_scale, x, M, K);
        } else {
            blockPerChannelQuantI8<Tdata, BLOCK_SIZE>
                <<<M, BLOCK_SIZE, 0, stream>>>(x_packed, x_scale, x_zero, x, M, K);
        }
    } else {
        constexpr unsigned int BLOCK_SIZE_x = 32;
        constexpr unsigned int BLOCK_SIZE_y = 32;
        // Each block processes BLOCK_SIZE_y rows, so the grid covers M rows.
        int num_block_x = (M + BLOCK_SIZE_y - 1) / BLOCK_SIZE_y;
        dim3 block_dim(BLOCK_SIZE_x, BLOCK_SIZE_y, 1);
        dim3 grid_dim(num_block_x, 1, 1);
        if (x_zero == nullptr) {
            warpPerChannelQuantI8Sym<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>
                <<<grid_dim, block_dim, 0, stream>>>(x_packed, x_scale, x, M, K);
        } else {
            warpPerChannelQuantI8<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>
                <<<grid_dim, block_dim, 0, stream>>>(x_packed, x_scale, x_zero, x, M, K);
        }
    }
    return INFINI_STATUS_SUCCESS;
}
/// Runs per-channel INT8 quantization on the Moore backend.
///
/// @param workspace / workspace_size  Unused; this operator needs no scratch.
/// @param x_packed  Output INT8 buffer (M x K).
/// @param x_scale   Output per-row float scales (M).
/// @param x_zero    Output per-row float zero points; nullptr => symmetric.
/// @param x         Input tensor (dtype given by _info.dtype).
/// @param stream_   MUSA stream the kernels are enqueued on.
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                     void *x_packed, void *x_scale, void *x_zero, const void *x,
                                     void *stream_) const {
    musaStream_t stream = (musaStream_t)stream_;

// QUANT instantiates the launcher for a concrete (block size, element type)
// pair; QUANT_WITH_BLOCK_SIZE selects the element type from the runtime dtype.
// Both are #undef-ed below so they do not leak into the rest of this
// translation unit.
#define QUANT(BLOCK_SIZE, TDATA) \
    per_channel_quant_int8Kernel<BLOCK_SIZE, TDATA>(_info, (int8_t *)x_packed, (float *)x_scale, (float *)x_zero, (const TDATA *)x, stream)
#define QUANT_WITH_BLOCK_SIZE(BLOCK_SIZE)                 \
    {                                                     \
        if (_info.dtype == INFINI_DTYPE_F16)              \
            return QUANT(BLOCK_SIZE, half);               \
        else if (_info.dtype == INFINI_DTYPE_F32)         \
            return QUANT(BLOCK_SIZE, float);              \
        else if (_info.dtype == INFINI_DTYPE_BF16)        \
            return QUANT(BLOCK_SIZE, __mt_bfloat16);      \
        else                                              \
            return INFINI_STATUS_BAD_TENSOR_DTYPE;        \
    }

    // Pick the launch width supported by the device.
    if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) {
        QUANT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024)
    } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) {
        QUANT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512)
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }

#undef QUANT_WITH_BLOCK_SIZE
#undef QUANT

    // Unreachable: every branch above returns; kept for defensiveness.
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::per_channel_quant_int8::moore
src/infiniop/ops/quant/per_channel_quant_int8/operator.cc
View file @
5675a4af
...
@@ -5,6 +5,9 @@
...
@@ -5,6 +5,9 @@
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#include "nvidia/per_channel_quant_int8_nvidia.cuh"
#include "nvidia/per_channel_quant_int8_nvidia.cuh"
#endif
#endif
#if defined(ENABLE_MOORE_API)
#include "moore/per_channel_quant_int8_moore.h"
#endif
__C
infiniStatus_t
infiniopCreatePerChannelQuantI8Descriptor
(
infiniopHandle_t
handle
,
__C
infiniStatus_t
infiniopCreatePerChannelQuantI8Descriptor
(
infiniopHandle_t
handle
,
infiniopPerChannelQuantI8Descriptor_t
*
desc_ptr
,
infiniopPerChannelQuantI8Descriptor_t
*
desc_ptr
,
...
@@ -27,6 +30,9 @@ __C infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t ha
...
@@ -27,6 +30,9 @@ __C infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t ha
#endif
#endif
#ifdef ENABLE_QY_API
#ifdef ENABLE_QY_API
CREATE
(
INFINI_DEVICE_QY
,
nvidia
)
CREATE
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#ifdef ENABLE_MOORE_API
CREATE
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -45,6 +51,9 @@ __C infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQ
...
@@ -45,6 +51,9 @@ __C infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQ
#endif
#endif
#ifdef ENABLE_QY_API
#ifdef ENABLE_QY_API
GET
(
INFINI_DEVICE_QY
,
nvidia
)
GET
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#ifdef ENABLE_MOORE_API
GET
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -71,6 +80,9 @@ __C infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor
...
@@ -71,6 +80,9 @@ __C infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor
#endif
#endif
#ifdef ENABLE_QY_API
#ifdef ENABLE_QY_API
QUANT
(
INFINI_DEVICE_QY
,
nvidia
)
QUANT
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#ifdef ENABLE_MOORE_API
QUANT
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -90,6 +102,9 @@ __C infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannel
...
@@ -90,6 +102,9 @@ __C infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannel
#endif
#endif
#ifdef ENABLE_QY_API
#ifdef ENABLE_QY_API
DESTROY
(
INFINI_DEVICE_QY
,
nvidia
)
DESTROY
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#ifdef ENABLE_MOORE_API
DESTROY
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/scaled_mm/info.h
View file @
5675a4af
#ifndef __GEMM_INFO_H__
#ifndef __
I8
GEMM_INFO_H__
#define __I8GEMM_INFO_H__
#define __I8GEMM_INFO_H__
#include "../../../utils.h"
#include "../../../utils.h"
...
...
src/infiniop/ops/scaled_mm/int8_gemm.h
View file @
5675a4af
...
@@ -18,8 +18,8 @@
...
@@ -18,8 +18,8 @@
size_t workspace_size, \
size_t workspace_size, \
infiniDtype_t out_dtype, \
infiniDtype_t out_dtype, \
infiniDevice_t device_type, int device_id) \
infiniDevice_t device_type, int device_id) \
: InfiniopDescriptor{device_type, device_id}, _o
ut_dtype(out_dtype),
\
: InfiniopDescriptor{device_type, device_id}, _o
paque(opaque),
\
_
opaque(opaque), _info(info), _workspace_size(workspace_size) {}
\
_
workspace_size(workspace_size), _info(info), _out_dtype(out_dtype) {}
\
\
\
public: \
public: \
~Descriptor(); \
~Descriptor(); \
...
...
src/infiniop/ops/scaled_mm/moore/int8_gemm_moore.h
0 → 100644
View file @
5675a4af
#ifndef __INT8_GEMM_MOORE_API_H__
#define __INT8_GEMM_MOORE_API_H__

#include "../int8_gemm.h"

// Instantiate the INT8 GEMM (scaled_mm) Descriptor declaration for the
// Moore (MUSA) backend via the shared DESCRIPTOR macro from the included
// common header.
DESCRIPTOR(moore)

#endif // __INT8_GEMM_MOORE_API_H__
src/infiniop/ops/scaled_mm/moore/int8_gemm_moore.mu
0 → 100644
View file @
5675a4af
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_handle.h"
#include "int8_gemm_moore.h"
namespace op::i8gemm::moore {
// Launches an INT8 x INT8 -> F16/BF16 GEMM through muDNN's BatchMatMul Lt
// path, applying per-row scales for A and per-column scales for B, with an
// optional bias added to each output row.
//
// NOTE(review): the infiniStatus_t returned by the useMudnn lambda (and by
// useMudnn itself, if any) is discarded because this function returns void —
// muDNN failures are silently dropped here; consider surfacing the status.
static void moore_i8gemm_launch(
    const I8GemmInfo &info,
    std::shared_ptr<device::moore::Handle::Internal> &internal,
    void* out,
    const int8_t* A,
    const int8_t* B,
    const float* A_scale,
    const float* B_scale,
    const void* bias,
    infiniDtype_t out_dtype,
    musaStream_t stream)
{
    internal->useMudnn(stream,
        [&](::musa::dnn::Handle &mudnn_handle) -> infiniStatus_t {
            // 1. Operator: batched matmul in tensor-core compute mode.
            auto matmul = std::make_unique<::musa::dnn::BatchMatMul>();
            matmul->SetComputeMode(::musa::dnn::BatchMatMul::ComputeMode::TENSOR);
            // 2. Tensor descriptors for operands, output, bias, and scales.
            ::musa::dnn::Tensor out_t, a_t, b_t, bias_t;
            ::musa::dnn::Tensor scale_a_t, scale_b_t;
            // 3. Output dtype: F16 when requested, otherwise BF16 (create()
            //    already restricted out_dtype to these two). Bias matches.
            if (out_dtype == INFINI_DTYPE_F16) {
                out_t.SetType(::musa::dnn::Tensor::Type::HALF);
                bias_t.SetType(::musa::dnn::Tensor::Type::HALF);
            } else {
                out_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16);
                bias_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16);
            }
            // 4. Both inputs are INT8.
            a_t.SetType(::musa::dnn::Tensor::Type::INT8);
            b_t.SetType(::musa::dnn::Tensor::Type::INT8);
            // 5. Dequantization scales are FP32.
            scale_a_t.SetType(::musa::dnn::Tensor::Type::FLOAT);
            scale_b_t.SetType(::musa::dnn::Tensor::Type::FLOAT);
            // 6. Bind device memory (muDNN takes non-const addresses, hence
            //    the const_casts; the inputs are not written).
            out_t.SetAddr(out);
            a_t.SetAddr(const_cast<int8_t*>(A));
            b_t.SetAddr(const_cast<int8_t*>(B));
            scale_a_t.SetAddr(const_cast<float*>(A_scale));
            scale_b_t.SetAddr(const_cast<float*>(B_scale));
            if (bias)
                bias_t.SetAddr(const_cast<void*>(bias));
            // 7. A layout: when A is not row-contiguous (col_stride != 1) it is
            //    described transposed (K x M) and flagged via SetTranspose below.
            {
                std::array<int64_t,3> dims;
                std::array<int64_t,3> strides;
                if (info.a_matrix.col_stride != 1) {
                    dims = {info.batch, info.k, info.m};
                } else {
                    dims = {info.batch, info.m, info.k};
                }
                strides = {
                    info.a_matrix.stride,
                    info.a_matrix.ld(),
                    1
                };
                a_t.SetNdInfo(3, dims.data(), strides.data());
            }
            // 8. B layout: same transposed-description trick as A.
            {
                std::array<int64_t,3> dims;
                std::array<int64_t,3> strides;
                if (info.b_matrix.col_stride != 1) {
                    dims = {info.batch, info.n, info.k};
                } else {
                    dims = {info.batch, info.k, info.n};
                }
                strides = {
                    info.b_matrix.stride,
                    info.b_matrix.ld(),
                    1
                };
                b_t.SetNdInfo(3, dims.data(), strides.data());
            }
            // 9. Output layout: dense row-major (batch, m, n).
            {
                std::array<int64_t, 3> dims = {
                    info.batch,
                    info.m,
                    info.n
                };
                std::array<int64_t, 3> strides = {
                    info.m * info.n,
                    info.n,
                    1
                };
                out_t.SetNdInfo(3, dims.data(), strides.data());
            }
            // 10. Bias is a length-n vector; A scales broadcast along columns
            //     (batch, m, 1) and B scales along rows (batch, 1, n).
            if (bias) {
                std::array<int64_t,1> dims = { info.n };
                std::array<int64_t,1> strides = { 1 };
                bias_t.SetNdInfo(1, dims.data(), strides.data());
            }
            {
                std::array<int64_t,3> a_scale_dims = { info.batch, info.m, 1 };
                std::array<int64_t,3> a_scale_strides = { info.m, 1, 1 };
                scale_a_t.SetNdInfo(3, a_scale_dims.data(), a_scale_strides.data());
                std::array<int64_t,3> b_scale_dims = { info.batch, 1, info.n };
                std::array<int64_t,3> b_scale_strides = { info.n, 1, 1 };
                scale_b_t.SetNdInfo(3, b_scale_dims.data(), b_scale_strides.data());
            }
            // 11. Transpose flags mirror the layout decisions in steps 7/8.
            matmul->SetTranspose(
                info.a_matrix.col_stride != 1,
                info.b_matrix.col_stride != 1);
            // 12. Lt param carries the dequantization scales (no epilogue
            //     scales are supplied — the two empty Tensors).
            ::musa::dnn::MatMulLtParam lt_param;
            lt_param.SetScale(
                scale_a_t,
                scale_b_t,
                ::musa::dnn::Tensor(),
                ::musa::dnn::Tensor());
            // 13. out = 1.0 * (A*B) + 0.0 * out, bias weighted by gamma = 1.0.
            matmul->SetAlpha(1.0);
            matmul->SetBeta(0.0);
            matmul->SetGamma(1.0);
            // 14. On-demand workspace allocator handed to muDNN; the returned
            //     MemoryHandler frees via musaFree.
            //     NOTE(review): musaMalloc's status is not checked here.
            ::musa::dnn::MemoryMaintainer maintainer =
                [](size_t size) {
                    void* ptr = nullptr;
                    musaMalloc(&ptr, size);
                    return ::musa::dnn::MemoryHandler(
                        ptr,
                        [](void* p) { if (p) musaFree(p); });
                };
            // 15. Execute; the empty Tensor slot is the unused C operand, and
            //     bias_t is passed only when a bias pointer was provided.
            matmul->RunLt(
                mudnn_handle,
                out_t,
                a_t,
                b_t,
                ::musa::dnn::Tensor(),
                bias ? bias_t : ::musa::dnn::Tensor(),
                lt_param,
                maintainer);
            return INFINI_STATUS_SUCCESS;
        });
}
/* ================= Descriptor ================= */
// Backend-private descriptor state: retains the Moore handle internals so the
// muDNN handle machinery outlives the descriptor's users.
struct Descriptor::Opaque {
    std::shared_ptr<device::moore::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    // _opaque is an owning raw pointer allocated in create().
    delete _opaque;
}
/// Builds an INT8 GEMM descriptor for the Moore backend.
/// The output dtype must be F16 or BF16; shape/layout metadata is validated
/// and captured via I8GemmInfo. No workspace is required (size 0).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t bias_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t a_scale_desc,
    infiniopTensorDescriptor_t b_desc,
    infiniopTensorDescriptor_t b_scale_desc)
{
    // Reject unsupported output precisions before any further work.
    const auto out_dtype = out_desc->dtype();
    CHECK_DTYPE(out_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16);

    auto info_result = I8GemmInfo::create(
        out_desc, a_desc, b_desc, MatrixLayout::COL_MAJOR);
    CHECK_RESULT(info_result);

    auto moore_handle = reinterpret_cast<device::moore::Handle *>(handle_);
    *desc_ptr = new Descriptor(
        new Opaque{moore_handle->internal()},
        info_result.take(),
        /*workspace_size=*/0,
        out_dtype,
        moore_handle->device,
        moore_handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
/// Runs the INT8 GEMM: out = dequant(A x B) (+ bias), F16/BF16 output.
/// This method only adapts the untyped API pointers to their concrete element
/// types; all muDNN work happens in moore_i8gemm_launch.
/// workspace / workspace_size are unused — this operator allocates any scratch
/// it needs inside the launch helper.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *out,
    const void *bias,
    const void *a,
    const void *a_scale,
    const void *b,
    const void *b_scale,
    void *stream_) const
{
    const auto stream = reinterpret_cast<musaStream_t>(stream_);
    moore_i8gemm_launch(
        _info,
        _opaque->internal,
        out,
        static_cast<const int8_t *>(a),
        static_cast<const int8_t *>(b),
        static_cast<const float *>(a_scale),
        static_cast<const float *>(b_scale),
        bias,
        _out_dtype,
        stream);
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::i8gemm::moore
src/infiniop/ops/scaled_mm/operator.cc
View file @
5675a4af
...
@@ -6,6 +6,10 @@
...
@@ -6,6 +6,10 @@
#include "nvidia/int8_gemm_nvidia.cuh"
#include "nvidia/int8_gemm_nvidia.cuh"
#endif
#endif
#if defined(ENABLE_MOORE_API)
#include "moore/int8_gemm_moore.h"
#endif
__C
infiniStatus_t
infiniopCreateI8GemmDescriptor
(
infiniopHandle_t
handle
,
__C
infiniStatus_t
infiniopCreateI8GemmDescriptor
(
infiniopHandle_t
handle
,
infiniopI8GemmDescriptor_t
*
desc_ptr
,
infiniopI8GemmDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
out_desc
,
infiniopTensorDescriptor_t
out_desc
,
...
@@ -31,6 +35,9 @@ __C infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle,
...
@@ -31,6 +35,9 @@ __C infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle,
#endif
#endif
#if defined(ENABLE_QY_API)
#if defined(ENABLE_QY_API)
CREATE
(
INFINI_DEVICE_QY
,
nvidia
)
CREATE
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#if defined(ENABLE_MOORE_API)
CREATE
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -49,6 +56,9 @@ __C infiniStatus_t infiniopGetI8GemmWorkspaceSize(infiniopI8GemmDescriptor_t des
...
@@ -49,6 +56,9 @@ __C infiniStatus_t infiniopGetI8GemmWorkspaceSize(infiniopI8GemmDescriptor_t des
#endif
#endif
#if defined(ENABLE_QY_API)
#if defined(ENABLE_QY_API)
GET
(
INFINI_DEVICE_QY
,
nvidia
)
GET
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#if defined(ENABLE_MOORE_API)
GET
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -76,6 +86,9 @@ __C infiniStatus_t infiniopI8Gemm(infiniopI8GemmDescriptor_t desc,
...
@@ -76,6 +86,9 @@ __C infiniStatus_t infiniopI8Gemm(infiniopI8GemmDescriptor_t desc,
#endif
#endif
#if defined(ENABLE_QY_API)
#if defined(ENABLE_QY_API)
CACULATE
(
INFINI_DEVICE_QY
,
nvidia
)
CACULATE
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#if defined(ENABLE_MOORE_API)
CACULATE
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -94,6 +107,9 @@ __C infiniStatus_t infiniopDestroyI8GemmDescriptor(infiniopI8GemmDescriptor_t de
...
@@ -94,6 +107,9 @@ __C infiniStatus_t infiniopDestroyI8GemmDescriptor(infiniopI8GemmDescriptor_t de
#endif
#endif
#if defined(ENABLE_QY_API)
#if defined(ENABLE_QY_API)
DESTROY
(
INFINI_DEVICE_QY
,
nvidia
)
DESTROY
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#if defined(ENABLE_MOORE_API)
DESTROY
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
test/infiniop/scaled_mm_int8.py
View file @
5675a4af
...
@@ -59,10 +59,8 @@ _TOLERANCE_MAP = {
...
@@ -59,10 +59,8 @@ _TOLERANCE_MAP = {
DEBUG
=
False
DEBUG
=
False
PROFILE
=
False
PROFILE
=
False
NUM_PRERUN
=
10
NUM_PRERUN
=
10
NUM_ITERATIONS
=
1000
NUM_ITERATIONS
=
100
def
to_int8
(
tensor
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
torch
.
round
(
tensor
.
clamp
(
min
=-
128
,
max
=
127
)).
to
(
dtype
=
torch
.
int8
)
def
torch_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
,
bias
):
def
torch_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
,
bias
):
o
=
torch
.
matmul
(
a
.
to
(
torch
.
float32
),
b
.
to
(
torch
.
float32
))
o
=
torch
.
matmul
(
a
.
to
(
torch
.
float32
),
b
.
to
(
torch
.
float32
))
...
@@ -72,6 +70,7 @@ def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias):
...
@@ -72,6 +70,7 @@ def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias):
o
=
o
.
to
(
torch
.
float32
)
*
scale_a
.
view
(
-
1
,
1
)
*
scale_b
.
view
(
1
,
-
1
)
o
=
o
.
to
(
torch
.
float32
)
*
scale_a
.
view
(
-
1
,
1
)
*
scale_b
.
view
(
1
,
-
1
)
return
o
.
to
(
out_dtype
)
return
o
.
to
(
out_dtype
)
def
test
(
def
test
(
handle
,
handle
,
device
,
device
,
...
@@ -83,34 +82,91 @@ def test(
...
@@ -83,34 +82,91 @@ def test(
sync
=
None
,
sync
=
None
,
):
):
print
(
print
(
f
"Testing
Linear
on
{
InfiniDeviceNames
[
device
]
}
with x_shape:
{
x_shape
}
, w_shape:
{
w_shape
}
, inplace:
{
inplace
}
dtype:
{
InfiniDtypeNames
[
dtype
]
}
"
f
"Testing
scaled_mm_int8
on
{
InfiniDeviceNames
[
device
]
}
with x_shape:
{
x_shape
}
, w_shape:
{
w_shape
}
, inplace:
{
inplace
}
dtype:
{
InfiniDtypeNames
[
dtype
]
}
"
)
)
M
,
K
=
x_shape
M
,
K
=
x_shape
N
=
w_shape
[
1
]
N
=
w_shape
[
1
]
x_packed
=
to_int8
(
torch
.
randn
((
M
,
K
),
device
=
"cuda"
)
*
5
)
# --- Tensor Descriptor ---
weights
=
to_int8
(
torch
.
randn
((
N
,
K
),
device
=
"cuda"
).
t
()
*
5
)
# orig: create a random int8 tensor as the reference data source
# torch: extract the torch view to adjust layout/stride
x_scale
=
torch
.
randn
((
M
,),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
# final: wrap it back as TestTensor with explicit stride for device execution
weights_scale
=
torch
.
randn
((
N
,),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
x_packed_orig
=
TestTensor
(
bias
=
torch
.
randn
((
N
,),
device
=
"cuda"
,
dtype
=
torch
.
float16
if
dtype
==
InfiniDtype
.
F16
else
torch
.
bfloat16
)
*
10
(
M
,
K
),
None
,
ans
=
torch_scaled_mm
(
x_packed
,
weights
,
x_scale
,
weights_scale
,
torch
.
float16
if
dtype
==
InfiniDtype
.
F16
else
torch
.
bfloat16
,
bias
=
bias
)
InfiniDtype
.
I8
,
device
,
mode
=
"randint"
,
randint_low
=-
128
,
randint_high
=
127
,
)
x_packed_torch
=
x_packed_orig
.
torch_tensor
()
x_packed
=
TestTensor
(
x_packed
=
TestTensor
(
(
M
,
K
),
x_packed
.
stride
(),
InfiniDtype
.
I8
,
device
,
mode
=
"manual"
,
set_tensor
=
x_packed
(
M
,
K
),
x_packed_torch
.
stride
(),
InfiniDtype
.
I8
,
device
,
mode
=
"manual"
,
set_tensor
=
x_packed_torch
,
)
)
x_scale
=
TestTensor
(
(
M
,),
x_scale
.
stride
(),
InfiniDtype
.
F32
,
device
,
mode
=
"manual"
,
set_tensor
=
x_scale
weights_orig
=
TestTensor
(
(
N
,
K
),
None
,
InfiniDtype
.
I8
,
device
,
mode
=
"randint"
,
randint_low
=-
128
,
randint_high
=
127
,
)
)
weights_torch
=
weights_orig
.
torch_tensor
().
t
()
weights
=
TestTensor
(
weights
=
TestTensor
(
(
K
,
N
),
weights
.
stride
(),
InfiniDtype
.
I8
,
device
,
mode
=
"manual"
,
set_tensor
=
weights
(
K
,
N
),
weights_torch
.
stride
(),
InfiniDtype
.
I8
,
device
,
mode
=
"manual"
,
set_tensor
=
weights_torch
,
)
x_scale_orig
=
TestTensor
((
M
,),
None
,
InfiniDtype
.
F32
,
device
,
mode
=
"random"
)
x_scale_torch
=
x_scale_orig
.
torch_tensor
()
x_scale
=
TestTensor
(
(
M
,),
x_scale_torch
.
stride
(),
InfiniDtype
.
F32
,
device
,
mode
=
"manual"
,
set_tensor
=
x_scale_torch
,
)
)
weights_scale_orig
=
TestTensor
((
N
,),
None
,
InfiniDtype
.
F32
,
device
,
mode
=
"random"
)
weights_scale_torch
=
weights_scale_orig
.
torch_tensor
()
weights_scale
=
TestTensor
(
weights_scale
=
TestTensor
(
(
N
,),
weights_scale
.
stride
(),
InfiniDtype
.
F32
,
device
,
mode
=
"manual"
,
set_tensor
=
weights_scale
(
N
,),
weights_scale_torch
.
stride
(),
InfiniDtype
.
F32
,
device
,
mode
=
"manual"
,
set_tensor
=
weights_scale_torch
,
)
bias_orig
=
TestTensor
((
N
,),
None
,
dtype
,
device
,
mode
=
"random"
)
bias_torch
=
bias_orig
.
torch_tensor
()
bias
=
TestTensor
(
(
N
,),
bias_torch
.
stride
(),
dtype
,
device
,
mode
=
"manual"
,
set_tensor
=
bias_torch
)
y
=
TestTensor
(
y_shape
,
None
,
dtype
,
device
,
mode
=
"zeros"
)
ans
=
torch_scaled_mm
(
x_packed
.
torch_tensor
(),
weights
.
torch_tensor
(),
x_scale
.
torch_tensor
(),
weights_scale
.
torch_tensor
(),
out_dtype
=
torch
.
float16
if
dtype
==
InfiniDtype
.
F16
else
torch
.
bfloat16
,
bias
=
bias
.
torch_tensor
(),
)
)
y
=
TestTensor
(
y_shape
,
None
,
dtype
,
device
)
bias
=
TestTensor
((
N
,),
bias
.
stride
(),
dtype
,
device
,
mode
=
"manual"
,
set_tensor
=
bias
)
descriptor
=
infiniopOperatorDescriptor_t
()
descriptor
=
infiniopOperatorDescriptor_t
()
check_error
(
check_error
(
...
@@ -164,7 +220,20 @@ def test(
...
@@ -164,7 +220,20 @@ def test(
# Profiling workflow
# Profiling workflow
if
PROFILE
:
if
PROFILE
:
# fmt: off
# fmt: off
profile_operation
(
"PyTorch"
,
lambda
:
torch_scaled_mm
(
x_packed
,
weights
,
x_scale
,
weights_scale
,
torch
.
float16
if
dtype
==
InfiniDtype
.
F16
else
torch
.
bfloat16
,
bias
=
bias
),
device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
profile_operation
(
"PyTorch"
,
lambda
:
torch_scaled_mm
(
x_packed
.
torch_tensor
(),
weights
.
torch_tensor
(),
x_scale
.
torch_tensor
(),
weights_scale
.
torch_tensor
(),
out_dtype
=
torch
.
float16
if
dtype
==
InfiniDtype
.
F16
else
torch
.
bfloat16
,
bias
=
bias
.
torch_tensor
()
),
device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
profile_operation
(
" lib"
,
lambda
:
lib_linear
(),
device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
profile_operation
(
" lib"
,
lambda
:
lib_linear
(),
device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
# fmt: on
# fmt: on
...
@@ -181,6 +250,12 @@ if __name__ == "__main__":
...
@@ -181,6 +250,12 @@ if __name__ == "__main__":
NUM_ITERATIONS
=
args
.
num_iterations
NUM_ITERATIONS
=
args
.
num_iterations
for
device
in
get_test_devices
(
args
):
for
device
in
get_test_devices
(
args
):
test_operator
(
device
,
test
,
_TEST_CASES
,
_TENSOR_DTYPES
)
# muDNN(v3101): INT8 quantized multiplication → BF16 output.
# Moore backend: BF16 output only.
if
args
.
moore
==
True
:
_TENSOR_DTYPES_MOORE
=
[
InfiniDtype
.
BF16
]
test_operator
(
device
,
test
,
_TEST_CASES
,
_TENSOR_DTYPES_MOORE
)
else
:
test_operator
(
device
,
test
,
_TEST_CASES
,
_TENSOR_DTYPES
)
print
(
"
\033
[92mTest passed!
\033
[0m"
)
print
(
"
\033
[92mTest passed!
\033
[0m"
)
xmake/moore.lua
View file @
5675a4af
...
@@ -48,6 +48,9 @@ target("infiniop-moore")
...
@@ -48,6 +48,9 @@ target("infiniop-moore")
-- Add source files for Moore muBLAS/muDNN GEMM backends.
-- Add source files for Moore muBLAS/muDNN GEMM backends.
add_files
(
"../src/infiniop/ops/gemm/moore/*/*.mu"
,
{
rule
=
"mu"
})
add_files
(
"../src/infiniop/ops/gemm/moore/*/*.mu"
,
{
rule
=
"mu"
})
-- Add source files for Moore per_channel_quant_int8 backends.
add_files
(
"../src/infiniop/ops/quant/per_channel_quant_int8/moore/*.mu"
,
{
rule
=
"mu"
})
target_end
()
target_end
()
target
(
"infinirt-moore"
)
target
(
"infinirt-moore"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment