Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
jerrrrry
infinicore
Commits
5675a4af
Unverified
Commit
5675a4af
authored
Feb 12, 2026
by
thatPepe
Committed by
GitHub
Feb 12, 2026
Browse files
Merge pull request #1018 from InfiniTensor/issue/972
Issue/972:摩尔平台基于 muDNN 的 w8a8 量化实现,并完善 scaled_mm_int8 python 测试脚本
parents
6ec2ea40
6841663b
Changes
10
Hide whitespace changes
Inline
Side-by-side
Showing
10 changed files
with
504 additions
and
27 deletions
+504
-27
src/infiniop/ops/quant/per_channel_quant_int8/moore/per_channel_quant_int8_moore.h
...r_channel_quant_int8/moore/per_channel_quant_int8_moore.h
+7
-0
src/infiniop/ops/quant/per_channel_quant_int8/moore/per_channel_quant_int8_moore.mu
..._channel_quant_int8/moore/per_channel_quant_int8_moore.mu
+116
-0
src/infiniop/ops/quant/per_channel_quant_int8/operator.cc
src/infiniop/ops/quant/per_channel_quant_int8/operator.cc
+15
-0
src/infiniop/ops/scaled_mm/info.h
src/infiniop/ops/scaled_mm/info.h
+1
-1
src/infiniop/ops/scaled_mm/int8_gemm.h
src/infiniop/ops/scaled_mm/int8_gemm.h
+2
-2
src/infiniop/ops/scaled_mm/moore/int8_gemm_moore.h
src/infiniop/ops/scaled_mm/moore/int8_gemm_moore.h
+7
-0
src/infiniop/ops/scaled_mm/moore/int8_gemm_moore.mu
src/infiniop/ops/scaled_mm/moore/int8_gemm_moore.mu
+238
-0
src/infiniop/ops/scaled_mm/operator.cc
src/infiniop/ops/scaled_mm/operator.cc
+16
-0
test/infiniop/scaled_mm_int8.py
test/infiniop/scaled_mm_int8.py
+99
-24
xmake/moore.lua
xmake/moore.lua
+3
-0
No files found.
src/infiniop/ops/quant/per_channel_quant_int8/moore/per_channel_quant_int8_moore.h
0 → 100644
View file @
5675a4af
#ifndef __PER_CHANNEL_QUANT_INT8_MOORE_API_H__
#define __PER_CHANNEL_QUANT_INT8_MOORE_API_H__

#include "../per_channel_quant_int8.h"

// Instantiate the per-channel INT8 quantization Descriptor declaration for the
// Moore (MUSA) backend. DESCRIPTOR is the shared class-generating macro defined
// in the included common header.
DESCRIPTOR(moore)

#endif // __PER_CHANNEL_QUANT_INT8_MOORE_API_H__
src/infiniop/ops/quant/per_channel_quant_int8/moore/per_channel_quant_int8_moore.mu
0 → 100644
View file @
5675a4af
#include "../../../../devices/moore/moore_common.h"
#include "per_channel_quant_int8_moore.h"
#include "../../../../devices/moore/moore_kernel_common.h"
#include "../../../../reduce/cuda/reduce.cuh"
#include <cub/block/block_reduce.cuh>
#include "../cuda/kernel.cuh"
// Global-kernel wrappers for the Moore (MUSA) backend. Each one forwards
// directly to the shared device-side implementation in ../cuda/kernel.cuh so
// the quantization logic stays single-sourced across backends.
// Naming convention: "block*" variants are launched one thread block per row,
// "warp*" variants use a 2-D block; "*Sym" variants are symmetric (scale only,
// no zero point).

// Asymmetric per-channel quantization, one block per row.
template <typename Tdata, unsigned int BLOCK_SIZE>
INFINIOP_MOORE_KERNEL blockPerChannelQuantI8(
    int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, int M, int K) {
    blockPerChannelQuantI8Kernel<Tdata, BLOCK_SIZE>(x_packed, x_scale, x_zero, x, M, K);
}

// Symmetric per-channel quantization (no zero point), one block per row.
template <typename Tdata, unsigned int BLOCK_SIZE>
INFINIOP_MOORE_KERNEL blockPerChannelQuantI8Sym(
    int8_t *x_packed, float *x_scale, const Tdata *x, int M, int K) {
    blockPerChannelQuantI8SymKernel<Tdata, BLOCK_SIZE>(x_packed, x_scale, x, M, K);
}

// Asymmetric per-channel quantization with a 2-D thread block
// (BLOCK_SIZE_x lanes per row, BLOCK_SIZE_y rows per block).
template <typename Tdata, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y>
INFINIOP_MOORE_KERNEL warpPerChannelQuantI8(
    int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, int M, int K) {
    warpPerChannelQuantI8Kernel<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>(x_packed, x_scale, x_zero, x, M, K);
}

// Symmetric counterpart of warpPerChannelQuantI8.
template <typename Tdata, unsigned int BLOCK_SIZE_x, unsigned int BLOCK_SIZE_y>
INFINIOP_MOORE_KERNEL warpPerChannelQuantI8Sym(
    int8_t *x_packed, float *x_scale, const Tdata *x, int M, int K) {
    warpPerChannelQuantI8SymKernel<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>(x_packed, x_scale, x, M, K);
}
namespace op::per_channel_quant_int8::moore {
// Backend-private descriptor state: keeps the Moore handle internals
// (device limits, stream helpers) alive for the descriptor's lifetime.
struct Descriptor::Opaque {
    std::shared_ptr<device::moore::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    // _opaque is an owning raw pointer allocated in create().
    delete _opaque;
}
/// Builds a per-channel INT8 quantization descriptor for the Moore backend.
/// Validates the tensor descriptors, captures the launch metadata, and stores
/// the backend handle internals. No workspace is required (size 0).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle, Descriptor **desc_ptr,
    infiniopTensorDescriptor_t x_packed_desc,
    infiniopTensorDescriptor_t x_scale_desc,
    infiniopTensorDescriptor_t x_zero_desc,
    infiniopTensorDescriptor_t x_desc) {
    // Shape/dtype validation and metadata extraction happen up front; bail out
    // on any inconsistency before allocating the descriptor.
    auto info_result = PerChannelQuantI8Info::createPerChannelQuantI8Info(
        x_packed_desc, x_scale_desc, x_zero_desc, x_desc);
    CHECK_RESULT(info_result);

    auto moore_handle = reinterpret_cast<device::moore::Handle *>(handle);
    *desc_ptr = new Descriptor(
        new Opaque{moore_handle->internal()},
        info_result.take(), /*workspace_size=*/0,
        handle->device, handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
// Host-side launch dispatch for per-channel INT8 quantization.
//
// Strategy (selected by row length K):
//   - K >= 1024: one thread block of BLOCK_SIZE threads per row -> grid of M
//     blocks ("block*" kernels).
//   - K <  1024: a 2-D block of 32x32 threads where each block covers
//     BLOCK_SIZE_y rows -> grid of ceil(M / 32) blocks ("warp*" kernels).
//
// x_zero == nullptr selects the symmetric (scale-only) kernel variants.
// NOTE(review): kernel launch errors are not checked here — presumably the
// caller/stream surfaces them; confirm against project convention.
template <unsigned int BLOCK_SIZE, typename Tdata>
infiniStatus_t per_channel_quant_int8Kernel(const PerChannelQuantI8Info &info, int8_t *x_packed, float *x_scale, float *x_zero, const Tdata *x, musaStream_t stream) {
    int M = (int)info.M;
    int K = (int)info.K;
    if (K >= 1024) {
        if (x_zero == nullptr) {
            blockPerChannelQuantI8Sym<Tdata, BLOCK_SIZE>
                <<<M, BLOCK_SIZE, 0, stream>>>(x_packed, x_scale, x, M, K);
        } else {
            blockPerChannelQuantI8<Tdata, BLOCK_SIZE>
                <<<M, BLOCK_SIZE, 0, stream>>>(x_packed, x_scale, x_zero, x, M, K);
        }
    } else {
        constexpr unsigned int BLOCK_SIZE_x = 32;
        constexpr unsigned int BLOCK_SIZE_y = 32;
        // Each block processes BLOCK_SIZE_y rows, so the grid covers M rows.
        int num_block_x = (M + BLOCK_SIZE_y - 1) / BLOCK_SIZE_y;
        dim3 block_dim(BLOCK_SIZE_x, BLOCK_SIZE_y, 1);
        dim3 grid_dim(num_block_x, 1, 1);
        if (x_zero == nullptr) {
            warpPerChannelQuantI8Sym<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>
                <<<grid_dim, block_dim, 0, stream>>>(x_packed, x_scale, x, M, K);
        } else {
            warpPerChannelQuantI8<Tdata, BLOCK_SIZE_x, BLOCK_SIZE_y>
                <<<grid_dim, block_dim, 0, stream>>>(x_packed, x_scale, x_zero, x, M, K);
        }
    }
    return INFINI_STATUS_SUCCESS;
}
/// Runs per-channel INT8 quantization on the Moore backend.
///
/// @param workspace / workspace_size  Unused; this operator needs no scratch.
/// @param x_packed  Output INT8 buffer (M x K).
/// @param x_scale   Output per-row float scales (M).
/// @param x_zero    Output per-row float zero points; nullptr => symmetric.
/// @param x         Input tensor (dtype given by _info.dtype).
/// @param stream_   MUSA stream the kernels are enqueued on.
infiniStatus_t Descriptor::calculate(void *workspace, size_t workspace_size,
                                     void *x_packed, void *x_scale, void *x_zero, const void *x,
                                     void *stream_) const {
    musaStream_t stream = (musaStream_t)stream_;

// QUANT instantiates the launcher for a concrete (block size, element type)
// pair; QUANT_WITH_BLOCK_SIZE selects the element type from the runtime dtype.
// Both are #undef-ed below so they do not leak into the rest of this
// translation unit.
#define QUANT(BLOCK_SIZE, TDATA) \
    per_channel_quant_int8Kernel<BLOCK_SIZE, TDATA>(_info, (int8_t *)x_packed, (float *)x_scale, (float *)x_zero, (const TDATA *)x, stream)
#define QUANT_WITH_BLOCK_SIZE(BLOCK_SIZE)                 \
    {                                                     \
        if (_info.dtype == INFINI_DTYPE_F16)              \
            return QUANT(BLOCK_SIZE, half);               \
        else if (_info.dtype == INFINI_DTYPE_F32)         \
            return QUANT(BLOCK_SIZE, float);              \
        else if (_info.dtype == INFINI_DTYPE_BF16)        \
            return QUANT(BLOCK_SIZE, __mt_bfloat16);      \
        else                                              \
            return INFINI_STATUS_BAD_TENSOR_DTYPE;        \
    }

    // Pick the launch width supported by the device.
    if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_1024) {
        QUANT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_1024)
    } else if (_opaque->internal->maxThreadsPerBlock() == MOORE_BLOCK_SIZE_512) {
        QUANT_WITH_BLOCK_SIZE(MOORE_BLOCK_SIZE_512)
    } else {
        return INFINI_STATUS_DEVICE_ARCHITECTURE_NOT_SUPPORTED;
    }

#undef QUANT_WITH_BLOCK_SIZE
#undef QUANT

    // Unreachable: every branch above returns; kept for defensiveness.
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::per_channel_quant_int8::moore
src/infiniop/ops/quant/per_channel_quant_int8/operator.cc
View file @
5675a4af
...
@@ -5,6 +5,9 @@
...
@@ -5,6 +5,9 @@
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#if defined(ENABLE_NVIDIA_API) || defined(ENABLE_QY_API)
#include "nvidia/per_channel_quant_int8_nvidia.cuh"
#include "nvidia/per_channel_quant_int8_nvidia.cuh"
#endif
#endif
#if defined(ENABLE_MOORE_API)
#include "moore/per_channel_quant_int8_moore.h"
#endif
__C
infiniStatus_t
infiniopCreatePerChannelQuantI8Descriptor
(
infiniopHandle_t
handle
,
__C
infiniStatus_t
infiniopCreatePerChannelQuantI8Descriptor
(
infiniopHandle_t
handle
,
infiniopPerChannelQuantI8Descriptor_t
*
desc_ptr
,
infiniopPerChannelQuantI8Descriptor_t
*
desc_ptr
,
...
@@ -27,6 +30,9 @@ __C infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t ha
...
@@ -27,6 +30,9 @@ __C infiniStatus_t infiniopCreatePerChannelQuantI8Descriptor(infiniopHandle_t ha
#endif
#endif
#ifdef ENABLE_QY_API
#ifdef ENABLE_QY_API
CREATE
(
INFINI_DEVICE_QY
,
nvidia
)
CREATE
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#ifdef ENABLE_MOORE_API
CREATE
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -45,6 +51,9 @@ __C infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQ
...
@@ -45,6 +51,9 @@ __C infiniStatus_t infiniopGetPerChannelQuantI8WorkspaceSize(infiniopPerChannelQ
#endif
#endif
#ifdef ENABLE_QY_API
#ifdef ENABLE_QY_API
GET
(
INFINI_DEVICE_QY
,
nvidia
)
GET
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#ifdef ENABLE_MOORE_API
GET
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -71,6 +80,9 @@ __C infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor
...
@@ -71,6 +80,9 @@ __C infiniStatus_t infiniopPerChannelQuantI8(infiniopPerChannelQuantI8Descriptor
#endif
#endif
#ifdef ENABLE_QY_API
#ifdef ENABLE_QY_API
QUANT
(
INFINI_DEVICE_QY
,
nvidia
)
QUANT
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#ifdef ENABLE_MOORE_API
QUANT
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -90,6 +102,9 @@ __C infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannel
...
@@ -90,6 +102,9 @@ __C infiniStatus_t infiniopDestroyPerChannelQuantI8Descriptor(infiniopPerChannel
#endif
#endif
#ifdef ENABLE_QY_API
#ifdef ENABLE_QY_API
DESTROY
(
INFINI_DEVICE_QY
,
nvidia
)
DESTROY
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#ifdef ENABLE_MOORE_API
DESTROY
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
src/infiniop/ops/scaled_mm/info.h
View file @
5675a4af
#ifndef __GEMM_INFO_H__
#ifndef __
I8
GEMM_INFO_H__
#define __I8GEMM_INFO_H__
#define __I8GEMM_INFO_H__
#include "../../../utils.h"
#include "../../../utils.h"
...
...
src/infiniop/ops/scaled_mm/int8_gemm.h
View file @
5675a4af
...
@@ -18,8 +18,8 @@
...
@@ -18,8 +18,8 @@
size_t workspace_size, \
size_t workspace_size, \
infiniDtype_t out_dtype, \
infiniDtype_t out_dtype, \
infiniDevice_t device_type, int device_id) \
infiniDevice_t device_type, int device_id) \
: InfiniopDescriptor{device_type, device_id}, _o
ut_dtype(out_dtype),
\
: InfiniopDescriptor{device_type, device_id}, _o
paque(opaque),
\
_
opaque(opaque), _info(info), _workspace_size(workspace_size) {}
\
_
workspace_size(workspace_size), _info(info), _out_dtype(out_dtype) {}
\
\
\
public: \
public: \
~Descriptor(); \
~Descriptor(); \
...
...
src/infiniop/ops/scaled_mm/moore/int8_gemm_moore.h
0 → 100644
View file @
5675a4af
#ifndef __INT8_GEMM_MOORE_API_H__
#define __INT8_GEMM_MOORE_API_H__

#include "../int8_gemm.h"

// Instantiate the INT8 GEMM (scaled_mm) Descriptor declaration for the
// Moore (MUSA) backend via the shared DESCRIPTOR macro from the included
// common header.
DESCRIPTOR(moore)

#endif // __INT8_GEMM_MOORE_API_H__
src/infiniop/ops/scaled_mm/moore/int8_gemm_moore.mu
0 → 100644
View file @
5675a4af
#include "../../../devices/moore/moore_common.h"
#include "../../../devices/moore/moore_handle.h"
#include "int8_gemm_moore.h"
namespace op::i8gemm::moore {
// Launches an INT8 x INT8 -> F16/BF16 GEMM through muDNN's BatchMatMul Lt
// path, applying per-row scales for A and per-column scales for B, with an
// optional bias added to each output row.
//
// NOTE(review): the infiniStatus_t returned by the useMudnn lambda (and by
// useMudnn itself, if any) is discarded because this function returns void —
// muDNN failures are silently dropped here; consider surfacing the status.
static void moore_i8gemm_launch(
    const I8GemmInfo &info,
    std::shared_ptr<device::moore::Handle::Internal> &internal,
    void* out,
    const int8_t* A,
    const int8_t* B,
    const float* A_scale,
    const float* B_scale,
    const void* bias,
    infiniDtype_t out_dtype,
    musaStream_t stream)
{
    internal->useMudnn(stream,
        [&](::musa::dnn::Handle &mudnn_handle) -> infiniStatus_t {
            // 1. Operator: batched matmul in tensor-core compute mode.
            auto matmul = std::make_unique<::musa::dnn::BatchMatMul>();
            matmul->SetComputeMode(::musa::dnn::BatchMatMul::ComputeMode::TENSOR);
            // 2. Tensor descriptors for operands, output, bias, and scales.
            ::musa::dnn::Tensor out_t, a_t, b_t, bias_t;
            ::musa::dnn::Tensor scale_a_t, scale_b_t;
            // 3. Output dtype: F16 when requested, otherwise BF16 (create()
            //    already restricted out_dtype to these two). Bias matches.
            if (out_dtype == INFINI_DTYPE_F16) {
                out_t.SetType(::musa::dnn::Tensor::Type::HALF);
                bias_t.SetType(::musa::dnn::Tensor::Type::HALF);
            } else {
                out_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16);
                bias_t.SetType(::musa::dnn::Tensor::Type::BFLOAT16);
            }
            // 4. Both inputs are INT8.
            a_t.SetType(::musa::dnn::Tensor::Type::INT8);
            b_t.SetType(::musa::dnn::Tensor::Type::INT8);
            // 5. Dequantization scales are FP32.
            scale_a_t.SetType(::musa::dnn::Tensor::Type::FLOAT);
            scale_b_t.SetType(::musa::dnn::Tensor::Type::FLOAT);
            // 6. Bind device memory (muDNN takes non-const addresses, hence
            //    the const_casts; the inputs are not written).
            out_t.SetAddr(out);
            a_t.SetAddr(const_cast<int8_t*>(A));
            b_t.SetAddr(const_cast<int8_t*>(B));
            scale_a_t.SetAddr(const_cast<float*>(A_scale));
            scale_b_t.SetAddr(const_cast<float*>(B_scale));
            if (bias)
                bias_t.SetAddr(const_cast<void*>(bias));
            // 7. A layout: when A is not row-contiguous (col_stride != 1) it is
            //    described transposed (K x M) and flagged via SetTranspose below.
            {
                std::array<int64_t,3> dims;
                std::array<int64_t,3> strides;
                if (info.a_matrix.col_stride != 1) {
                    dims = {info.batch, info.k, info.m};
                } else {
                    dims = {info.batch, info.m, info.k};
                }
                strides = {
                    info.a_matrix.stride,
                    info.a_matrix.ld(),
                    1
                };
                a_t.SetNdInfo(3, dims.data(), strides.data());
            }
            // 8. B layout: same transposed-description trick as A.
            {
                std::array<int64_t,3> dims;
                std::array<int64_t,3> strides;
                if (info.b_matrix.col_stride != 1) {
                    dims = {info.batch, info.n, info.k};
                } else {
                    dims = {info.batch, info.k, info.n};
                }
                strides = {
                    info.b_matrix.stride,
                    info.b_matrix.ld(),
                    1
                };
                b_t.SetNdInfo(3, dims.data(), strides.data());
            }
            // 9. Output layout: dense row-major (batch, m, n).
            {
                std::array<int64_t, 3> dims = {
                    info.batch,
                    info.m,
                    info.n
                };
                std::array<int64_t, 3> strides = {
                    info.m * info.n,
                    info.n,
                    1
                };
                out_t.SetNdInfo(3, dims.data(), strides.data());
            }
            // 10. Bias is a length-n vector; A scales broadcast along columns
            //     (batch, m, 1) and B scales along rows (batch, 1, n).
            if (bias) {
                std::array<int64_t,1> dims = { info.n };
                std::array<int64_t,1> strides = { 1 };
                bias_t.SetNdInfo(1, dims.data(), strides.data());
            }
            {
                std::array<int64_t,3> a_scale_dims = { info.batch, info.m, 1 };
                std::array<int64_t,3> a_scale_strides = { info.m, 1, 1 };
                scale_a_t.SetNdInfo(3, a_scale_dims.data(), a_scale_strides.data());
                std::array<int64_t,3> b_scale_dims = { info.batch, 1, info.n };
                std::array<int64_t,3> b_scale_strides = { info.n, 1, 1 };
                scale_b_t.SetNdInfo(3, b_scale_dims.data(), b_scale_strides.data());
            }
            // 11. Transpose flags mirror the layout decisions in steps 7/8.
            matmul->SetTranspose(
                info.a_matrix.col_stride != 1,
                info.b_matrix.col_stride != 1);
            // 12. Lt param carries the dequantization scales (no epilogue
            //     scales are supplied — the two empty Tensors).
            ::musa::dnn::MatMulLtParam lt_param;
            lt_param.SetScale(
                scale_a_t,
                scale_b_t,
                ::musa::dnn::Tensor(),
                ::musa::dnn::Tensor());
            // 13. out = 1.0 * (A*B) + 0.0 * out, bias weighted by gamma = 1.0.
            matmul->SetAlpha(1.0);
            matmul->SetBeta(0.0);
            matmul->SetGamma(1.0);
            // 14. On-demand workspace allocator handed to muDNN; the returned
            //     MemoryHandler frees via musaFree.
            //     NOTE(review): musaMalloc's status is not checked here.
            ::musa::dnn::MemoryMaintainer maintainer =
                [](size_t size) {
                    void* ptr = nullptr;
                    musaMalloc(&ptr, size);
                    return ::musa::dnn::MemoryHandler(
                        ptr,
                        [](void* p) { if (p) musaFree(p); });
                };
            // 15. Execute; the empty Tensor slot is the unused C operand, and
            //     bias_t is passed only when a bias pointer was provided.
            matmul->RunLt(
                mudnn_handle,
                out_t,
                a_t,
                b_t,
                ::musa::dnn::Tensor(),
                bias ? bias_t : ::musa::dnn::Tensor(),
                lt_param,
                maintainer);
            return INFINI_STATUS_SUCCESS;
        });
}
/* ================= Descriptor ================= */
// Backend-private descriptor state: retains the Moore handle internals so the
// muDNN handle machinery outlives the descriptor's users.
struct Descriptor::Opaque {
    std::shared_ptr<device::moore::Handle::Internal> internal;
};

Descriptor::~Descriptor() {
    // _opaque is an owning raw pointer allocated in create().
    delete _opaque;
}
/// Builds an INT8 GEMM descriptor for the Moore backend.
/// The output dtype must be F16 or BF16; shape/layout metadata is validated
/// and captured via I8GemmInfo. No workspace is required (size 0).
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    infiniopTensorDescriptor_t bias_desc,
    infiniopTensorDescriptor_t a_desc,
    infiniopTensorDescriptor_t a_scale_desc,
    infiniopTensorDescriptor_t b_desc,
    infiniopTensorDescriptor_t b_scale_desc)
{
    // Reject unsupported output precisions before any further work.
    const auto out_dtype = out_desc->dtype();
    CHECK_DTYPE(out_dtype, INFINI_DTYPE_F16, INFINI_DTYPE_BF16);

    auto info_result = I8GemmInfo::create(
        out_desc, a_desc, b_desc, MatrixLayout::COL_MAJOR);
    CHECK_RESULT(info_result);

    auto moore_handle = reinterpret_cast<device::moore::Handle *>(handle_);
    *desc_ptr = new Descriptor(
        new Opaque{moore_handle->internal()},
        info_result.take(),
        /*workspace_size=*/0,
        out_dtype,
        moore_handle->device,
        moore_handle->device_id);
    return INFINI_STATUS_SUCCESS;
}
/// Runs the INT8 GEMM: out = dequant(A x B) (+ bias), F16/BF16 output.
/// This method only adapts the untyped API pointers to their concrete element
/// types; all muDNN work happens in moore_i8gemm_launch.
/// workspace / workspace_size are unused — this operator allocates any scratch
/// it needs inside the launch helper.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *out,
    const void *bias,
    const void *a,
    const void *a_scale,
    const void *b,
    const void *b_scale,
    void *stream_) const
{
    const auto stream = reinterpret_cast<musaStream_t>(stream_);
    moore_i8gemm_launch(
        _info,
        _opaque->internal,
        out,
        static_cast<const int8_t *>(a),
        static_cast<const int8_t *>(b),
        static_cast<const float *>(a_scale),
        static_cast<const float *>(b_scale),
        bias,
        _out_dtype,
        stream);
    return INFINI_STATUS_SUCCESS;
}
} // namespace op::i8gemm::moore
src/infiniop/ops/scaled_mm/operator.cc
View file @
5675a4af
...
@@ -6,6 +6,10 @@
...
@@ -6,6 +6,10 @@
#include "nvidia/int8_gemm_nvidia.cuh"
#include "nvidia/int8_gemm_nvidia.cuh"
#endif
#endif
#if defined(ENABLE_MOORE_API)
#include "moore/int8_gemm_moore.h"
#endif
__C
infiniStatus_t
infiniopCreateI8GemmDescriptor
(
infiniopHandle_t
handle
,
__C
infiniStatus_t
infiniopCreateI8GemmDescriptor
(
infiniopHandle_t
handle
,
infiniopI8GemmDescriptor_t
*
desc_ptr
,
infiniopI8GemmDescriptor_t
*
desc_ptr
,
infiniopTensorDescriptor_t
out_desc
,
infiniopTensorDescriptor_t
out_desc
,
...
@@ -31,6 +35,9 @@ __C infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle,
...
@@ -31,6 +35,9 @@ __C infiniStatus_t infiniopCreateI8GemmDescriptor(infiniopHandle_t handle,
#endif
#endif
#if defined(ENABLE_QY_API)
#if defined(ENABLE_QY_API)
CREATE
(
INFINI_DEVICE_QY
,
nvidia
)
CREATE
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#if defined(ENABLE_MOORE_API)
CREATE
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -49,6 +56,9 @@ __C infiniStatus_t infiniopGetI8GemmWorkspaceSize(infiniopI8GemmDescriptor_t des
...
@@ -49,6 +56,9 @@ __C infiniStatus_t infiniopGetI8GemmWorkspaceSize(infiniopI8GemmDescriptor_t des
#endif
#endif
#if defined(ENABLE_QY_API)
#if defined(ENABLE_QY_API)
GET
(
INFINI_DEVICE_QY
,
nvidia
)
GET
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#if defined(ENABLE_MOORE_API)
GET
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -76,6 +86,9 @@ __C infiniStatus_t infiniopI8Gemm(infiniopI8GemmDescriptor_t desc,
...
@@ -76,6 +86,9 @@ __C infiniStatus_t infiniopI8Gemm(infiniopI8GemmDescriptor_t desc,
#endif
#endif
#if defined(ENABLE_QY_API)
#if defined(ENABLE_QY_API)
CACULATE
(
INFINI_DEVICE_QY
,
nvidia
)
CACULATE
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#if defined(ENABLE_MOORE_API)
CACULATE
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
@@ -94,6 +107,9 @@ __C infiniStatus_t infiniopDestroyI8GemmDescriptor(infiniopI8GemmDescriptor_t de
...
@@ -94,6 +107,9 @@ __C infiniStatus_t infiniopDestroyI8GemmDescriptor(infiniopI8GemmDescriptor_t de
#endif
#endif
#if defined(ENABLE_QY_API)
#if defined(ENABLE_QY_API)
DESTROY
(
INFINI_DEVICE_QY
,
nvidia
)
DESTROY
(
INFINI_DEVICE_QY
,
nvidia
)
#endif
#if defined(ENABLE_MOORE_API)
DESTROY
(
INFINI_DEVICE_MOORE
,
moore
)
#endif
#endif
default:
default:
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
return
INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
;
...
...
test/infiniop/scaled_mm_int8.py
View file @
5675a4af
...
@@ -59,10 +59,8 @@ _TOLERANCE_MAP = {
...
@@ -59,10 +59,8 @@ _TOLERANCE_MAP = {
DEBUG
=
False
DEBUG
=
False
PROFILE
=
False
PROFILE
=
False
NUM_PRERUN
=
10
NUM_PRERUN
=
10
NUM_ITERATIONS
=
1000
NUM_ITERATIONS
=
100
def
to_int8
(
tensor
:
torch
.
Tensor
)
->
torch
.
Tensor
:
return
torch
.
round
(
tensor
.
clamp
(
min
=-
128
,
max
=
127
)).
to
(
dtype
=
torch
.
int8
)
def
torch_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
,
bias
):
def
torch_scaled_mm
(
a
,
b
,
scale_a
,
scale_b
,
out_dtype
,
bias
):
o
=
torch
.
matmul
(
a
.
to
(
torch
.
float32
),
b
.
to
(
torch
.
float32
))
o
=
torch
.
matmul
(
a
.
to
(
torch
.
float32
),
b
.
to
(
torch
.
float32
))
...
@@ -72,6 +70,7 @@ def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias):
...
@@ -72,6 +70,7 @@ def torch_scaled_mm(a, b, scale_a, scale_b, out_dtype, bias):
o
=
o
.
to
(
torch
.
float32
)
*
scale_a
.
view
(
-
1
,
1
)
*
scale_b
.
view
(
1
,
-
1
)
o
=
o
.
to
(
torch
.
float32
)
*
scale_a
.
view
(
-
1
,
1
)
*
scale_b
.
view
(
1
,
-
1
)
return
o
.
to
(
out_dtype
)
return
o
.
to
(
out_dtype
)
def
test
(
def
test
(
handle
,
handle
,
device
,
device
,
...
@@ -83,34 +82,91 @@ def test(
...
@@ -83,34 +82,91 @@ def test(
sync
=
None
,
sync
=
None
,
):
):
print
(
print
(
f
"Testing
Linear
on
{
InfiniDeviceNames
[
device
]
}
with x_shape:
{
x_shape
}
, w_shape:
{
w_shape
}
, inplace:
{
inplace
}
dtype:
{
InfiniDtypeNames
[
dtype
]
}
"
f
"Testing
scaled_mm_int8
on
{
InfiniDeviceNames
[
device
]
}
with x_shape:
{
x_shape
}
, w_shape:
{
w_shape
}
, inplace:
{
inplace
}
dtype:
{
InfiniDtypeNames
[
dtype
]
}
"
)
)
M
,
K
=
x_shape
M
,
K
=
x_shape
N
=
w_shape
[
1
]
N
=
w_shape
[
1
]
x_packed
=
to_int8
(
torch
.
randn
((
M
,
K
),
device
=
"cuda"
)
*
5
)
# --- Tensor Descriptor ---
weights
=
to_int8
(
torch
.
randn
((
N
,
K
),
device
=
"cuda"
).
t
()
*
5
)
# orig: create a random int8 tensor as the reference data source
# torch: extract the torch view to adjust layout/stride
x_scale
=
torch
.
randn
((
M
,),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
# final: wrap it back as TestTensor with explicit stride for device execution
weights_scale
=
torch
.
randn
((
N
,),
device
=
"cuda"
,
dtype
=
torch
.
float32
)
x_packed_orig
=
TestTensor
(
bias
=
torch
.
randn
((
N
,),
device
=
"cuda"
,
dtype
=
torch
.
float16
if
dtype
==
InfiniDtype
.
F16
else
torch
.
bfloat16
)
*
10
(
M
,
K
),
None
,
ans
=
torch_scaled_mm
(
x_packed
,
weights
,
x_scale
,
weights_scale
,
torch
.
float16
if
dtype
==
InfiniDtype
.
F16
else
torch
.
bfloat16
,
bias
=
bias
)
InfiniDtype
.
I8
,
device
,
mode
=
"randint"
,
randint_low
=-
128
,
randint_high
=
127
,
)
x_packed_torch
=
x_packed_orig
.
torch_tensor
()
x_packed
=
TestTensor
(
x_packed
=
TestTensor
(
(
M
,
K
),
x_packed
.
stride
(),
InfiniDtype
.
I8
,
device
,
mode
=
"manual"
,
set_tensor
=
x_packed
(
M
,
K
),
x_packed_torch
.
stride
(),
InfiniDtype
.
I8
,
device
,
mode
=
"manual"
,
set_tensor
=
x_packed_torch
,
)
)
x_scale
=
TestTensor
(
(
M
,),
x_scale
.
stride
(),
InfiniDtype
.
F32
,
device
,
mode
=
"manual"
,
set_tensor
=
x_scale
weights_orig
=
TestTensor
(
(
N
,
K
),
None
,
InfiniDtype
.
I8
,
device
,
mode
=
"randint"
,
randint_low
=-
128
,
randint_high
=
127
,
)
)
weights_torch
=
weights_orig
.
torch_tensor
().
t
()
weights
=
TestTensor
(
weights
=
TestTensor
(
(
K
,
N
),
weights
.
stride
(),
InfiniDtype
.
I8
,
device
,
mode
=
"manual"
,
set_tensor
=
weights
(
K
,
N
),
weights_torch
.
stride
(),
InfiniDtype
.
I8
,
device
,
mode
=
"manual"
,
set_tensor
=
weights_torch
,
)
x_scale_orig
=
TestTensor
((
M
,),
None
,
InfiniDtype
.
F32
,
device
,
mode
=
"random"
)
x_scale_torch
=
x_scale_orig
.
torch_tensor
()
x_scale
=
TestTensor
(
(
M
,),
x_scale_torch
.
stride
(),
InfiniDtype
.
F32
,
device
,
mode
=
"manual"
,
set_tensor
=
x_scale_torch
,
)
)
weights_scale_orig
=
TestTensor
((
N
,),
None
,
InfiniDtype
.
F32
,
device
,
mode
=
"random"
)
weights_scale_torch
=
weights_scale_orig
.
torch_tensor
()
weights_scale
=
TestTensor
(
weights_scale
=
TestTensor
(
(
N
,),
weights_scale
.
stride
(),
InfiniDtype
.
F32
,
device
,
mode
=
"manual"
,
set_tensor
=
weights_scale
(
N
,),
weights_scale_torch
.
stride
(),
InfiniDtype
.
F32
,
device
,
mode
=
"manual"
,
set_tensor
=
weights_scale_torch
,
)
bias_orig
=
TestTensor
((
N
,),
None
,
dtype
,
device
,
mode
=
"random"
)
bias_torch
=
bias_orig
.
torch_tensor
()
bias
=
TestTensor
(
(
N
,),
bias_torch
.
stride
(),
dtype
,
device
,
mode
=
"manual"
,
set_tensor
=
bias_torch
)
y
=
TestTensor
(
y_shape
,
None
,
dtype
,
device
,
mode
=
"zeros"
)
ans
=
torch_scaled_mm
(
x_packed
.
torch_tensor
(),
weights
.
torch_tensor
(),
x_scale
.
torch_tensor
(),
weights_scale
.
torch_tensor
(),
out_dtype
=
torch
.
float16
if
dtype
==
InfiniDtype
.
F16
else
torch
.
bfloat16
,
bias
=
bias
.
torch_tensor
(),
)
)
y
=
TestTensor
(
y_shape
,
None
,
dtype
,
device
)
bias
=
TestTensor
((
N
,),
bias
.
stride
(),
dtype
,
device
,
mode
=
"manual"
,
set_tensor
=
bias
)
descriptor
=
infiniopOperatorDescriptor_t
()
descriptor
=
infiniopOperatorDescriptor_t
()
check_error
(
check_error
(
...
@@ -164,7 +220,20 @@ def test(
...
@@ -164,7 +220,20 @@ def test(
# Profiling workflow
# Profiling workflow
if
PROFILE
:
if
PROFILE
:
# fmt: off
# fmt: off
profile_operation
(
"PyTorch"
,
lambda
:
torch_scaled_mm
(
x_packed
,
weights
,
x_scale
,
weights_scale
,
torch
.
float16
if
dtype
==
InfiniDtype
.
F16
else
torch
.
bfloat16
,
bias
=
bias
),
device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
profile_operation
(
"PyTorch"
,
lambda
:
torch_scaled_mm
(
x_packed
.
torch_tensor
(),
weights
.
torch_tensor
(),
x_scale
.
torch_tensor
(),
weights_scale
.
torch_tensor
(),
out_dtype
=
torch
.
float16
if
dtype
==
InfiniDtype
.
F16
else
torch
.
bfloat16
,
bias
=
bias
.
torch_tensor
()
),
device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
profile_operation
(
" lib"
,
lambda
:
lib_linear
(),
device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
profile_operation
(
" lib"
,
lambda
:
lib_linear
(),
device
,
NUM_PRERUN
,
NUM_ITERATIONS
)
# fmt: on
# fmt: on
...
@@ -181,6 +250,12 @@ if __name__ == "__main__":
...
@@ -181,6 +250,12 @@ if __name__ == "__main__":
NUM_ITERATIONS
=
args
.
num_iterations
NUM_ITERATIONS
=
args
.
num_iterations
for
device
in
get_test_devices
(
args
):
for
device
in
get_test_devices
(
args
):
test_operator
(
device
,
test
,
_TEST_CASES
,
_TENSOR_DTYPES
)
# muDNN(v3101): INT8 quantized multiplication → BF16 output.
# Moore backend: BF16 output only.
if
args
.
moore
==
True
:
_TENSOR_DTYPES_MOORE
=
[
InfiniDtype
.
BF16
]
test_operator
(
device
,
test
,
_TEST_CASES
,
_TENSOR_DTYPES_MOORE
)
else
:
test_operator
(
device
,
test
,
_TEST_CASES
,
_TENSOR_DTYPES
)
print
(
"
\033
[92mTest passed!
\033
[0m"
)
print
(
"
\033
[92mTest passed!
\033
[0m"
)
xmake/moore.lua
View file @
5675a4af
...
@@ -48,6 +48,9 @@ target("infiniop-moore")
...
@@ -48,6 +48,9 @@ target("infiniop-moore")
-- Add source files for Moore muBLAS/muDNN GEMM backends.
-- Add source files for Moore muBLAS/muDNN GEMM backends.
add_files
(
"../src/infiniop/ops/gemm/moore/*/*.mu"
,
{
rule
=
"mu"
})
add_files
(
"../src/infiniop/ops/gemm/moore/*/*.mu"
,
{
rule
=
"mu"
})
-- Add source files for Moore per_channel_quant_int8 backends.
add_files
(
"../src/infiniop/ops/quant/per_channel_quant_int8/moore/*.mu"
,
{
rule
=
"mu"
})
target_end
()
target_end
()
target
(
"infinirt-moore"
)
target
(
"infinirt-moore"
)
...
...
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment