Issue/183 Mul算子CPU&CUDA

975559ee · Graylatzhou · d54ee0fb · 975559ee · 975559ee · 975559ee
Commit 975559ee authored Apr 22, 2025 by Graylatzhou
13 changed files
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -19,5 +19,6 @@
 #include "infiniop/ops/rope.h"
 #include "infiniop/ops/swiglu.h"
 #include "infiniop/tensor_descriptor.h"
+#include "infiniop/ops/mul.h"

 #endif // __INFINIOP_API_H__
--- a/include/infiniop/ops/mul.h
+++ b/include/infiniop/ops/mul.h
+#ifndef __INFINIOP_MUL_API_H__
+#define __INFINIOP_MUL_API_H__
+
+#include "../operator_descriptor.h"
+
+/// Opaque handle to a Mul (element-wise multiplication) operator descriptor.
+typedef struct InfiniopDescriptor *infiniopMulDescriptor_t;
+
+/// Creates a descriptor for the element-wise product c = a * b.
+///
+/// The CPU and CUDA backends in this change accept F16, F32 and F64 outputs
+/// and require `c`, `a` and `b` to have the same shape.
+///
+/// @param handle   Library handle; its device selects the backend.
+/// @param desc_ptr Receives the newly created descriptor.
+/// @param c        Descriptor of the output tensor.
+/// @param a        Descriptor of the first input tensor.
+/// @param b        Descriptor of the second input tensor.
+/// @return The backend's status, or INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED
+///         when no backend is compiled in for the handle's device.
+__C __export infiniStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle,
+                                                       infiniopMulDescriptor_t *desc_ptr,
+                                                       infiniopTensorDescriptor_t c,
+                                                       infiniopTensorDescriptor_t a,
+                                                       infiniopTensorDescriptor_t b);
+
+/// Writes the workspace size reported by the descriptor into `*size`.
+__C __export infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size);
+
+/// Computes c = a * b element-wise using a previously created descriptor.
+///
+/// @param desc           Descriptor created by infiniopCreateMulDescriptor.
+/// @param workspace      Scratch buffer passed through to the backend.
+/// @param workspace_size Size of `workspace`; the CUDA backend returns
+///                       INFINI_STATUS_INSUFFICIENT_WORKSPACE when it is
+///                       smaller than the descriptor's workspace size.
+/// @param c              Output data pointer.
+/// @param a              First input data pointer.
+/// @param b              Second input data pointer.
+/// @param stream         Stream handle forwarded to the backend (the tests pass nullptr).
+__C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc,
+                                        void *workspace,
+                                        size_t workspace_size,
+                                        void *c,
+                                        void const *a,
+                                        void const *b,
+                                        void *stream);
+                
+/// Destroys a descriptor created by infiniopCreateMulDescriptor.
+__C __export infiniStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc);
+
+#endif
\ No newline at end of file
--- a/src/infiniop-test/include/ops.hpp
+++ b/src/infiniop-test/include/ops.hpp
@@ -7,6 +7,7 @@
 */
 DECLARE_INFINIOP_TEST(gemm)
 DECLARE_INFINIOP_TEST(random_sample)
+DECLARE_INFINIOP_TEST(mul)

 #define REGISTER_INFINIOP_TEST(name)                      \
    {                                                     \
@@ -24,6 +25,7 @@ DECLARE_INFINIOP_TEST(random_sample)
    {                                         \
        REGISTER_INFINIOP_TEST(gemm)          \
        REGISTER_INFINIOP_TEST(random_sample) \
+        REGISTER_INFINIOP_TEST(mul)          \
    }

 namespace infiniop_test {

--- a/src/infiniop-test/src/ops/mul.cpp
+++ b/src/infiniop-test/src/ops/mul.cpp
+#include "ops.hpp"
+#include "utils.hpp"
+#include <infinirt.h>
+#include <iomanip>
+#include <iostream>
+
+namespace infiniop_test::mul {
+/// Tensors that make up one Mul test case.
+struct Test::Attributes {
+    std::shared_ptr<Tensor> a;   // first input operand
+    std::shared_ptr<Tensor> b;   // second input operand
+    std::shared_ptr<Tensor> c;   // output buffer handed to the operator
+    std::shared_ptr<Tensor> ans; // expected result that `c` is compared against
+};
+
+/// Builds a Mul test case from the tensors of one test record.
+///
+/// @param attributes Raw attribute blobs; Mul reads none of them.
+/// @param tensors    Named tensors; "a", "b", "c" and "ans" are required.
+/// @param rtol       Relative tolerance stored on the test.
+/// @param atol       Absolute tolerance stored on the test.
+/// @throws std::runtime_error if any required tensor is missing.
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol) {
+    std::shared_ptr<Test> instance(new Test(rtol, atol));
+    instance->_attributes = new Attributes();
+
+    // Reject the record as soon as one of the required tensors is absent.
+    for (const char *required : {"a", "b", "c", "ans"}) {
+        if (tensors.count(required) == 0) {
+            throw std::runtime_error("Invalid Test");
+        }
+    }
+
+    Attributes &attrs = *instance->_attributes;
+    attrs.a = tensors.at("a");
+    attrs.b = tensors.at("b");
+    attrs.c = tensors.at("c");
+    attrs.ans = tensors.at("ans");
+
+    return instance;
+}
+
+/// Runs Mul once on the target device, validates the output, then benchmarks it.
+///
+/// @param handle     infiniop handle used to create the operator descriptor.
+/// @param device     Device the input/output tensors are moved to.
+/// @param device_id  Index of that device.
+/// @param warm_ups   Requested number of warm-up runs.
+/// @param iterations Requested number of timed runs.
+/// @return TEST_FAILED(...) if descriptor creation, workspace setup, execution or
+///         the comparison against `ans` fails; otherwise TEST_PASSED with the
+///         value returned by benchmark().
+std::shared_ptr<infiniop_test::Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
+    infiniopMulDescriptor_t op_desc = nullptr;
+    void *workspace = nullptr;
+
+    // Scope guard: frees the workspace and destroys the descriptor on every exit
+    // path (early failure returns included). Both start out null, so whatever
+    // has not been acquired yet is simply skipped.
+    struct Cleanup {
+        infiniopMulDescriptor_t &desc;
+        void *&buffer;
+        ~Cleanup() {
+            if (buffer != nullptr) {
+                infinirtFree(buffer);
+            }
+            if (desc != nullptr) {
+                infiniopDestroyMulDescriptor(desc);
+            }
+        }
+    } cleanup{op_desc, workspace};
+
+    auto a = _attributes->a->to(device, device_id);
+    auto b = _attributes->b->to(device, device_id);
+    auto c = _attributes->c->to(device, device_id);
+    CHECK_OR(infiniopCreateMulDescriptor(handle, &op_desc,
+                                          c->desc(),
+                                          a->desc(),
+                                          b->desc()),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
+    size_t workspace_size;
+    CHECK_OR(infiniopGetMulWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+    CHECK_OR(infiniopMul(op_desc, workspace, workspace_size,
+                          c->data(),
+                          a->data(),
+                          b->data(),
+                          nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+
+    // allClose reports a mismatch by throwing; turn that into a failed result.
+    try {
+        allClose(c, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+
+    // NOTE(review): each timed call issues two multiplications while the warm-up
+    // and iteration counts are halved (rounded up). Confirm this is intended;
+    // the time returned by benchmark() presumably covers both launches.
+    const double elapsed_time = benchmark(
+        [=]() {
+            infiniopMul(
+                op_desc, workspace, workspace_size,
+                c->data(),
+                a->data(),
+                b->data(),
+                nullptr);
+            infiniopMul(
+                op_desc, workspace, workspace_size,
+                c->data(),
+                a->data(),
+                b->data(),
+                nullptr);
+        },
+        (warm_ups + 1) / 2, (iterations + 1) / 2);
+
+    return TEST_PASSED(elapsed_time);
+}
+
+/// Names of the scalar attributes a Mul test record must provide (none).
+std::vector<std::string> Test::attribute_names() {
+    // The MUL operation needs no special attributes (unlike GEMM, which needs alpha and beta)
+    return {};
+}
+
+/// Names of the tensors a Mul test record carries; build() requires all four.
+std::vector<std::string> Test::tensor_names() {
+    return {"a", "b", "c", "ans"};
+}
+
+/// Renders a human-readable summary: operator name, the info() string of each
+/// operand tensor, and the tolerances in scientific notation with 2 decimals.
+std::string Test::toString() const {
+    std::ostringstream text;
+    text << op_name() << '\n'
+         << "- a: " << _attributes->a->info() << '\n'
+         << "- b: " << _attributes->b->info() << '\n'
+         << "- c: " << _attributes->c->info() << '\n';
+    text << std::scientific << std::setprecision(2)
+         << "- rtol=" << _rtol << ", atol=" << _atol << '\n';
+    return text.str();
+}
+
+/// Releases the Attributes object allocated with `new` in build().
+Test::~Test() {
+    delete _attributes;
+}
+
+} // namespace infiniop_test::mul
\ No newline at end of file
--- a/src/infiniop/elementwise/cuda/elementwise_cuda.cuh
+++ b/src/infiniop/elementwise/cuda/elementwise_cuda.cuh
@@ -359,7 +359,8 @@ private:
                                     d_input_shapes, d_input_strides, stream));

        dim3 blockDims(std::min(BLOCK_SIZE, static_cast<uint32_t>(internal->maxThreadsPerBlock())));
-        dim3 gridDims(std::min(uint32_t(CEIL_DIV(output_size, blockDims.x)), static_cast<uint32_t>(internal->gridSizeX())));
+        dim3 gridDims(std::min(CEIL_DIV(output_size, blockDims.x), static_cast<size_t>(internal->gridSizeX())));
+
        size_t step = gridDims.x * blockDims.x;

        for (size_t i = 0; i < output_size; i += step) {

--- a/src/infiniop/ops/mul/cpu/mul_cpu.cc
+++ b/src/infiniop/ops/mul/cpu/mul_cpu.cc
+#include "mul_cpu.h"
+
+namespace op::mul::cpu {
+
+Descriptor::~Descriptor() = default;
+
+/// Validates the tensor descriptors and creates the CPU Mul descriptor.
+///
+/// @param handle_        Library handle, reinterpreted as the CPU device handle.
+/// @param desc_ptr       Receives the created descriptor.
+/// @param out_desc       Output tensor descriptor; its dtype selects the element type.
+/// @param input_desc_vec Input descriptors: index 0 is `a`, index 1 is `b`.
+/// @return INFINI_STATUS_SUCCESS on success, INFINI_STATUS_BAD_TENSOR_DTYPE when an
+///         input dtype differs from the output dtype, or whatever status the
+///         CHECK_* / CREATE_* macros return on failure.
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &out_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    // calculate() dispatches on a single element type taken from the output,
+    // so inputs of another dtype would be read with the wrong element type.
+    if (a_desc->dtype() != dtype || b_desc->dtype() != dtype) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    CHECK_SAME_SHAPE(out_shape, a_shape, b_shape);
+
+    // create CPU elementwise descriptor
+    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+/// Runs the element-wise multiplication on the CPU for the descriptor's dtype.
+///
+/// `workspace` and `workspace_size` are accepted for interface parity but are
+/// not read here. Returns INFINI_STATUS_BAD_TENSOR_DTYPE for any dtype other
+/// than F16, F32 or F64.
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    if (_dtype == INFINI_DTYPE_F16) {
+        return _device_info->calculate<MulOp, fp16_t>(_info, output, inputs, stream);
+    }
+    if (_dtype == INFINI_DTYPE_F32) {
+        return _device_info->calculate<MulOp, float>(_info, output, inputs, stream);
+    }
+    if (_dtype == INFINI_DTYPE_F64) {
+        return _device_info->calculate<MulOp, double>(_info, output, inputs, stream);
+    }
+    return INFINI_STATUS_BAD_TENSOR_DTYPE;
+}
+}
--- a/src/infiniop/ops/mul/cpu/mul_cpu.h
+++ b/src/infiniop/ops/mul/cpu/mul_cpu.h
+#ifndef __MUL_CPU_H__
+#define __MUL_CPU_H__
+
+#include "../../../elementwise/cpu/elementwise_cpu.h"
+
+ELEMENTWISE_DESCRIPTOR(mul, cpu)
+
+namespace op::mul::cpu {
+/// Binary functor handed to the CPU element-wise framework: returns a * b.
+struct MulOp {
+    /// Number of input operands consumed per output element.
+    static constexpr size_t num_inputs = 2;
+
+    /// Multiplies two values of the same type using that type's operator*.
+    template <typename T>
+    T operator()(const T &a, const T &b) const {
+        return a * b;
+    }
+};
+} // namespace op::mul::cpu
+
+#endif // __MUL_CPU_H__
\ No newline at end of file
--- a/src/infiniop/ops/mul/cuda/mul_cuda.cu
+++ b/src/infiniop/ops/mul/cuda/mul_cuda.cu
+#include "mul_cuda.cuh"
+#include "mul_cuda_internal.cuh"
+
+namespace op::mul::cuda {
+
+Descriptor::~Descriptor() = default;
+
+/// Validates the tensor descriptors and creates the CUDA Mul descriptor.
+///
+/// @param handle_        Library handle, reinterpreted as the CUDA device handle.
+/// @param desc_ptr       Receives the created descriptor.
+/// @param out_desc       Output tensor descriptor; its dtype selects the element type.
+/// @param input_desc_vec Input descriptors: index 0 is `a`, index 1 is `b`.
+/// @return INFINI_STATUS_SUCCESS on success, INFINI_STATUS_BAD_TENSOR_DTYPE when an
+///         input dtype differs from the output dtype, or whatever status the
+///         CHECK_* / CREATE_* macros return on failure.
+infiniStatus_t Descriptor::create(
+    infiniopHandle_t handle_,
+    Descriptor **desc_ptr,
+    infiniopTensorDescriptor_t out_desc,
+    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
+
+    auto handle = reinterpret_cast<device::cuda::Handle *>(handle_);
+    auto dtype = out_desc->dtype();
+
+    const auto &a_desc = input_desc_vec.at(0);
+    const auto &b_desc = input_desc_vec.at(1);
+    const auto &c_shape = out_desc->shape();
+    const auto &a_shape = a_desc->shape();
+    const auto &b_shape = b_desc->shape();
+
+    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
+
+    // calculate() dispatches on a single element type taken from the output,
+    // so inputs of another dtype would be read with the wrong element type.
+    if (a_desc->dtype() != dtype || b_desc->dtype() != dtype) {
+        return INFINI_STATUS_BAD_TENSOR_DTYPE;
+    }
+
+    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
+
+    // create CUDA elementwise descriptor
+    CREATE_ELEMENTWISE_CUDA_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec)
+
+    return INFINI_STATUS_SUCCESS;
+}
+
+/// Launches the element-wise multiplication on the GPU for the descriptor's dtype.
+///
+/// Returns INFINI_STATUS_INSUFFICIENT_WORKSPACE when `workspace_size` is below
+/// the size recorded in the descriptor, and INFINI_STATUS_BAD_TENSOR_DTYPE for
+/// any dtype other than F16, F32 or F64.
+infiniStatus_t Descriptor::calculate(
+    void *workspace,
+    size_t workspace_size,
+    void *output,
+    std::vector<const void *> inputs,
+    void *stream) const {
+
+    // First template argument passed to the element-wise calculate() launcher.
+    constexpr unsigned int kBlockSize = 256;
+
+    if (workspace_size < _workspace_size) {
+        return INFINI_STATUS_INSUFFICIENT_WORKSPACE;
+    }
+
+    if (_dtype == INFINI_DTYPE_F16) {
+        return _device_info->calculate<kBlockSize, MulOp, half>(_info, workspace, output, inputs, stream);
+    }
+    if (_dtype == INFINI_DTYPE_F32) {
+        return _device_info->calculate<kBlockSize, MulOp, float>(_info, workspace, output, inputs, stream);
+    }
+    if (_dtype == INFINI_DTYPE_F64) {
+        return _device_info->calculate<kBlockSize, MulOp, double>(_info, workspace, output, inputs, stream);
+    }
+    return INFINI_STATUS_BAD_TENSOR_DTYPE;
+}
+} // namespace op::mul::cuda
\ No newline at end of file
--- a/src/infiniop/ops/mul/cuda/mul_cuda.cuh
+++ b/src/infiniop/ops/mul/cuda/mul_cuda.cuh
+#ifndef __MUL_CUDA_API_H__
+#define __MUL_CUDA_API_H__
+
+#include "../../../elementwise/cuda/elementwise_cuda_api.cuh"
+
+ELEMENTWISE_DESCRIPTOR(mul, cuda)
+
+#endif // __MUL_CUDA_API_H__
\ No newline at end of file
--- a/src/infiniop/ops/mul/cuda/mul_cuda_internal.cuh
+++ b/src/infiniop/ops/mul/cuda/mul_cuda_internal.cuh
+#ifndef __MUL_CUDA_H__
+#define __MUL_CUDA_H__
+
+#include "../../../elementwise/cuda/elementwise_cuda.cuh"
+#include <cuda_fp16.h>
+
+namespace op::mul::cuda {
+/// Device-side binary functor handed to the CUDA element-wise framework.
+/// Uses the matching multiply intrinsic for float, half and half2, and the
+/// type's operator* for every other element type.
+struct MulOp {
+    /// Number of input operands consumed per output element.
+    static constexpr size_t num_inputs = 2;
+
+    template <typename T>
+    __device__ __forceinline__ T operator()(const T &a, const T &b) const {
+        // The type checks are mutually exclusive, so their order is irrelevant.
+        if constexpr (std::is_same_v<T, float>) {
+            return __fmul_rn(a, b);
+        } else if constexpr (std::is_same_v<T, half>) {
+            return __hmul(a, b);
+        } else if constexpr (std::is_same_v<T, half2>) {
+            return __hmul2(a, b);
+        } else {
+            return a * b;
+        }
+    }
+};
+
+} // namespace op::mul::cuda
+
+#endif // __MUL_CUDA_H__
\ No newline at end of file
--- a/src/infiniop/ops/mul/operator.cc
+++ b/src/infiniop/ops/mul/operator.cc
+#include "../../operator.h"
+#include "../../handle.h"
+#include "infiniop/ops/mul.h"
+
+#ifdef ENABLE_CPU_API
+#include "cpu/mul_cpu.h"
+#endif
+
+#ifdef ENABLE_CUDA_API
+#include "cuda/mul_cuda.cuh"
+#endif
+
+
+// C entry point: creates a Mul descriptor by forwarding to the backend that
+// matches handle->device. The inputs are passed as the list {a_desc, b_desc}.
+// Returns INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED when no compiled-in backend
+// matches the device.
+__C infiniStatus_t infiniopCreateMulDescriptor(
+    infiniopHandle_t handle,
+    infiniopMulDescriptor_t *desc_ptr,
+    infiniopTensorDescriptor_t c_desc,
+    infiniopTensorDescriptor_t a_desc,
+    infiniopTensorDescriptor_t b_desc) {
+
+// Expands to one `case` that returns the result of the backend's Descriptor::create.
+#define CREATE(CASE, NAMESPACE)                                            \
+    case CASE:                                                             \
+        return op::mul::NAMESPACE::Descriptor::create(                     \
+            handle,                                                        \
+            reinterpret_cast<op::mul::NAMESPACE::Descriptor **>(desc_ptr), \
+            c_desc,                                                        \
+            {a_desc,                                                       \
+             b_desc})
+
+    switch (handle->device) {
+
+#ifdef ENABLE_CPU_API
+        CREATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_CUDA_API
+        CREATE(INFINI_DEVICE_NVIDIA, cuda);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CREATE
+}
+
+// C entry point: stores the descriptor's workspaceSize() in *size.
+// Returns INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED when no compiled-in backend
+// matches desc->device_type.
+__C infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size) {
+
+// Expands to one `case` that writes the backend descriptor's workspace size and returns success.
+#define GET(CASE, NAMESPACE)                                                               \
+    case CASE:                                                                             \
+        *size = reinterpret_cast<op::mul::NAMESPACE::Descriptor *>(desc)->workspaceSize(); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+#ifdef ENABLE_CPU_API
+        GET(INFINI_DEVICE_CPU, cpu)
+#endif
+#ifdef ENABLE_CUDA_API
+        GET(INFINI_DEVICE_NVIDIA, cuda)
+#endif
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+#undef GET
+
+    // Not reached: every switch path above returns.
+    return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+}
+
+// C entry point: computes c = a * b by forwarding to the backend descriptor's
+// calculate(), with the inputs passed as the list {a, b}.
+// Returns INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED when no compiled-in backend
+// matches desc->device_type.
+__C infiniStatus_t infiniopMul(
+    infiniopMulDescriptor_t desc,
+    void *workspace,
+    size_t workspace_size,
+    void *c,
+    const void *a,
+    const void *b,
+    void *stream) {
+
+// Expands to one `case` that returns the result of the backend's calculate().
+#define CALCULATE(CASE, NAMESPACE)                                            \
+    case CASE:                                                                \
+        return reinterpret_cast<const op::mul::NAMESPACE::Descriptor *>(desc) \
+            ->calculate(workspace, workspace_size, c, {a, b}, stream)
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        CALCULATE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_CUDA_API
+        CALCULATE(INFINI_DEVICE_NVIDIA, cuda);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef CALCULATE
+}
+
+// C entry point: deletes the descriptor through its concrete backend type.
+// Returns INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED (without deleting) when no
+// compiled-in backend matches desc->device_type.
+__C infiniStatus_t
+infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc) {
+
+// Expands to one `case` that deletes the backend descriptor and returns success.
+#define DELETE(CASE, NAMESPACE)                                                \
+    case CASE:                                                                 \
+        delete reinterpret_cast<const op::mul::NAMESPACE::Descriptor *>(desc); \
+        return INFINI_STATUS_SUCCESS;
+
+    switch (desc->device_type) {
+
+#ifdef ENABLE_CPU_API
+        DELETE(INFINI_DEVICE_CPU, cpu);
+#endif
+#ifdef ENABLE_CUDA_API
+        DELETE(INFINI_DEVICE_NVIDIA, cuda);
+#endif
+
+    default:
+        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
+    }
+
+#undef DELETE
+}
\ No newline at end of file
--- a/test/infiniop-test/test_generate/testcases/mul.py
+++ b/test/infiniop-test/test_generate/testcases/mul.py
+from ast import List
+import numpy as np
+import gguf
+from typing import List
+
+from .. import InfiniopTestWriter, InfiniopTestCase, np_dtype_to_ggml, gguf_strides
+
+def mul(
+    a: np.ndarray,
+    b: np.ndarray,
+    c: np.ndarray = None,
+):
+    if c is None:
+        return np.multiply(a, b)
+    return np.multiply(a, b, out=c)
+
+def random_tensor(shape, dtype):
+    rate = 1e-3
+    var = 0.5 * rate  # 数值范围在[-5e-4, 5e-4]
+    return rate * np.random.rand(*shape).astype(dtype) - var
+
+class MulTestCase(InfiniopTestCase):
+    def __init__(
+        self,
+        a: np.ndarray,
+        stride_a: List[int] | None,
+        b: np.ndarray,
+        stride_b: List[int] | None,
+        c: np.ndarray,
+        stride_c: List[int] | None,
+    ):
+        super().__init__("mul")
+        self.a = a
+        self.stride_a = stride_a
+        self.b = b
+        self.stride_b = stride_b
+        self.c = c
+        self.stride_c = stride_c
+
+    def write_test(self, test_writer: "InfiniopTestWriter"):
+        super().write_test(test_writer)
+        if self.stride_a is not None:
+            test_writer.add_array(test_writer.gguf_key("a.strides"), self.stride_a)
+        if self.stride_b is not None:
+            test_writer.add_array(test_writer.gguf_key("b.strides"), self.stride_b)
+        if self.stride_c is not None:
+            test_writer.add_array(test_writer.gguf_key("c.strides"), self.stride_c)
+        test_writer.add_tensor(
+            test_writer.gguf_key("a"), self.a, raw_dtype=np_dtype_to_ggml(self.a.dtype)
+        )
+        test_writer.add_tensor(
+            test_writer.gguf_key("b"), self.b, raw_dtype=np_dtype_to_ggml(self.b.dtype)
+        )
+        test_writer.add_tensor(
+            test_writer.gguf_key("c"), self.c, raw_dtype=np_dtype_to_ggml(self.c.dtype)
+        )
+        ans = mul(self.a, self.b, self.c)
+        test_writer.add_tensor(
+            test_writer.gguf_key("ans"), ans, raw_dtype=np_dtype_to_ggml(ans.dtype)
+        )
+
+# Script entry point: generates "mul.gguf" with the test cases listed below.
+# Each MulTestCase takes (a, stride_a, b, stride_b, c, stride_c).
+# NOTE(review): every array passed here is a freshly generated contiguous
+# numpy array, while several cases record custom strides for it. Confirm that
+# the test runner interprets those strides against buffers of sufficient size
+# and that `ans` (written without strides) is compared consistently.
+if __name__ == '__main__':
+    test_writer = InfiniopTestWriter("mul.gguf")
+    test_cases = [
+        MulTestCase(
+            random_tensor((2, 3), np.float32),
+            gguf_strides(3, 1),
+            random_tensor((2, 3), np.float32),
+            gguf_strides(3, 1),
+            random_tensor((2, 3), np.float32),
+            gguf_strides(3, 1),
+        ),
+        MulTestCase(
+            random_tensor((2, 3), np.float16),
+            gguf_strides(3, 1),
+            random_tensor((2, 3), np.float16),
+            gguf_strides(3, 1),
+            random_tensor((2, 3), np.float16),
+            gguf_strides(3, 1),
+        ),
+        MulTestCase(
+            random_tensor((2, 3), np.float64),
+            gguf_strides(3, 1),
+            random_tensor((2, 3), np.float64),
+            gguf_strides(3, 1),
+            random_tensor((2, 3), np.float64),
+            gguf_strides(3, 1),
+        ),
+        MulTestCase(
+            random_tensor((4, 6), np.float16),
+            gguf_strides(1, 4),
+            random_tensor((4, 6), np.float16),
+            gguf_strides(1, 5),
+            random_tensor((4, 6), np.float16),
+            gguf_strides(1, 4),
+        ),
+        MulTestCase(
+            random_tensor((1, 2048), np.float16),
+            gguf_strides(1, 2048),
+            random_tensor((1, 2048), np.float16),
+            gguf_strides(1, 2048),
+            random_tensor((1, 2048), np.float16),
+            gguf_strides(1, 2048),
+        ),
+        MulTestCase(
+            random_tensor((2048, 2048), np.float32),
+            None,
+            random_tensor((2048, 2048), np.float32),
+            None,
+            random_tensor((2048, 2048), np.float32),
+            None,
+        ),
+        MulTestCase(
+            random_tensor((2, 4, 2048), np.float16),
+            None,
+            random_tensor((2, 4, 2048), np.float16),
+            None,
+            random_tensor((2, 4, 2048), np.float16),
+            None,
+        ),
+        MulTestCase(
+            random_tensor((2, 4, 2048), np.float32),
+            None,
+            random_tensor((2, 4, 2048), np.float32),
+            None,
+            random_tensor((2, 4, 2048), np.float32),
+            None,
+        ),
+        MulTestCase(
+            random_tensor((2048, 2560), np.float32),
+            gguf_strides(1, 2560),
+            random_tensor((2048, 2560), np.float32),
+            gguf_strides(1, 2560),
+            random_tensor((2048, 2560), np.float32),
+            gguf_strides(1, 2560),
+        ),
+        MulTestCase(
+            random_tensor((4, 48, 64), np.float16),
+            None,
+            random_tensor((4, 48, 64), np.float16),
+            None,
+            random_tensor((4, 48, 64), np.float16),
+            None
+        ),
+        MulTestCase(
+            random_tensor((4, 48, 64), np.float32),
+            None,
+            random_tensor((4, 48, 64), np.float32),
+            None,
+            random_tensor((4, 48, 64), np.float32),
+            None
+        ),
+
+    ]
+    test_writer.add_tests(test_cases)
+    test_writer.save()
--- a/test/infiniop/mul.py
+++ b/test/infiniop/mul.py
+import torch
+import ctypes
+from ctypes import POINTER, Structure, c_int32, c_void_p, c_uint64
+from libinfiniop import (
+    infiniopHandle_t,
+    infiniopTensorDescriptor_t,
+    open_lib,
+    to_tensor,
+    get_test_devices,
+    check_error,
+    rearrange_if_needed,
+    test_operator,
+    get_args,
+    debug,
+    get_tolerance,
+    profile_operation,
+    create_workspace,
+)
+from enum import Enum, auto
+
+# ==============================================================================
+#  Configuration (Internal Use Only)
+# ==============================================================================
+# These are not meant to be imported from other modules
+_TEST_CASES_ = [
+    # shape, a_stride, b_stride, c_stride
+    ((13, 4), None, None, None),
+    ((13, 4), (10, 1), (10, 1), (10, 1)),
+    ((13, 4), (0, 1), None, None),
+    ((13, 4, 4), None, None, None),
+    ((13, 4, 4), (20, 4, 1), (20, 4, 1), (20, 4, 1)),
+    ((13, 4, 4), (4, 0, 1), (0, 4, 1), None),
+    ((16, 5632), None, None, None),
+    ((16, 5632), (13312, 1), (13312, 1), (13312, 1)),
+    ((4, 4, 5632), None, None, None),
+    ((4, 4, 5632), (45056, 5632, 1), (45056, 5632, 1), (45056, 5632, 1)),
+]
+
+
+class Inplace(Enum):
+    """Selects which tensor receives the result of the multiplication."""
+
+    OUT_OF_PLACE = auto()  # write into the separate output tensor c
+    INPLACE_A = auto()  # write the result over input a
+    INPLACE_B = auto()  # write the result over input b
+
+
+# Inplace options applied for each test case in _TEST_CASES_
+_INPLACE = [
+    Inplace.OUT_OF_PLACE,
+    Inplace.INPLACE_A,
+    Inplace.INPLACE_B,
+]
+
+# Form the test cases by appending each element of _INPLACE to each tuple in _TEST_CASES_
+_TEST_CASES = [
+    test_case + (inplace_item,)
+    for test_case in _TEST_CASES_
+    for inplace_item in _INPLACE
+]
+
+# Data types used for testing
+_TENSOR_DTYPES = [torch.float16, torch.float32]
+
+# Tolerance map for different data types
+_TOLERANCE_MAP = {
+    torch.float16: {"atol": 1e-4, "rtol": 1e-2},
+}
+
+DEBUG = False
+PROFILE = False
+NUM_PRERUN = 10
+NUM_ITERATIONS = 1000
+
+
+class MulDescriptor(Structure):
+    """ctypes structure used as the pointee type of infiniopMulDescriptor_t."""
+
+    # Only the leading int32 field is declared on the Python side.
+    _fields_ = [("device", c_int32)]
+
+
+infiniopMulDescriptor_t = POINTER(MulDescriptor)
+
+
+def mul(x, y):
+    return torch.mul(x, y)
+
+
+def process_tensors(c, c_strides, a, a_stride, b, b_stride, inplace):
+    """
+    Rearrange the tensors to the requested strides and apply the inplace config.
+
+    Tensors whose requested strides contain a 0 (broadcast) are re-viewed over
+    their existing storage; all others go through rearrange_if_needed.
+    For INPLACE_A / INPLACE_B the returned `c` is `a` / `b` itself. If that
+    output ends up with a 0 stride, it is reset to the original output strides
+    so the result is not written through a broadcasted view.
+
+    Returns the tuple (a, b, c).
+    """
+    # Strides the output should have when it must not be broadcasted.
+    original_c_strides = c_strides if c_strides else c.stride()
+
+    def _rearrange(tensor, strides):
+        # Broadcasted strides: re-view the tensor's own storage in place.
+        if strides and 0 in strides:
+            tensor.set_(tensor.untyped_storage(), 0, tensor.shape, strides)
+            return tensor
+        else:
+            return rearrange_if_needed(tensor, strides)
+
+    a, b, c = [
+        _rearrange(tensor, stride)
+        for tensor, stride in zip([a, b, c], [a_stride, b_stride, c_strides])
+    ]
+    # Pick the tensor that will receive the result.
+    c = (
+        c
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a if inplace == Inplace.INPLACE_A else b)
+    )
+    # if inplace is true and c has broadcasted config, reset it to the original unbroadcasted strides
+    if 0 in c.stride():
+        c.set_(c.untyped_storage(), 0, c.shape, original_c_strides)
+
+    return a, b, c
+
+
+def test(
+    lib,
+    handle,
+    torch_device,
+    shape,
+    a_stride=None,
+    b_stride=None,
+    c_stride=None,
+    inplace=Inplace.OUT_OF_PLACE,
+    dtype=torch.float16,
+    sync=None,
+):
+    """Run one Mul test case through the library and compare it with torch.mul.
+
+    Builds random operands, creates the operator descriptor, runs infiniopMul
+    once, asserts the output is close to the PyTorch reference, optionally
+    profiles both implementations, and finally destroys the descriptor.
+    """
+    print(
+        f"Testing Mul on {torch_device} with shape:{shape} a_stride:{a_stride} b_stride:{b_stride} c_stride:{c_stride} "
+        f"dtype:{dtype} inplace:{inplace}"
+    )
+
+    a = torch.rand(shape, dtype=dtype).to(torch_device)
+    b = torch.rand(shape, dtype=dtype).to(torch_device)
+    c = torch.rand(shape, dtype=dtype).to(torch_device)
+    a, b, c = process_tensors(c, c_stride, a, a_stride, b, b_stride, inplace)
+
+    # Reference result, computed before the library call can overwrite a or b in place.
+    ans = mul(a, b)
+
+    a_tensor, b_tensor = [to_tensor(tensor, lib) for tensor in [a, b]]
+    # For in-place runs the output reuses the wrapper of the aliased input.
+    c_tensor = (
+        to_tensor(c, lib)
+        if inplace == Inplace.OUT_OF_PLACE
+        else (a_tensor if inplace == Inplace.INPLACE_A else b_tensor)
+    )
+    if sync is not None:
+        sync()
+
+    descriptor = infiniopMulDescriptor_t()
+    check_error(
+        lib.infiniopCreateMulDescriptor(
+            handle,
+            ctypes.byref(descriptor),
+            c_tensor.descriptor,
+            a_tensor.descriptor,
+            b_tensor.descriptor,
+        )
+    )
+
+    # Invalidate the shape and strides in the descriptor to prevent them from being directly used by the kernel
+    for tensor in [a_tensor, b_tensor, c_tensor]:
+        tensor.destroyDesc(lib)
+
+    workspace_size = c_uint64(0)
+    check_error(
+        lib.infiniopGetMulWorkspaceSize(descriptor, ctypes.byref(workspace_size))
+    )
+    workspace = create_workspace(workspace_size.value, c.device)
+
+    def lib_mul():
+        # One library invocation; a null workspace pointer is passed when none was created.
+        check_error(
+            lib.infiniopMul(
+                descriptor,
+                workspace.data_ptr() if workspace is not None else None,
+                workspace_size.value,
+                c_tensor.data,
+                a_tensor.data,
+                b_tensor.data,
+                None,
+            )
+        )
+
+    lib_mul()
+
+    atol, rtol = get_tolerance(_TOLERANCE_MAP, dtype)
+    if DEBUG:
+        debug(c, ans, atol=atol, rtol=rtol)
+    assert torch.allclose(c, ans, atol=atol, rtol=rtol)
+
+    # Profiling workflow
+    if PROFILE:
+        # fmt: off
+        profile_operation("PyTorch", lambda: mul(a, b), torch_device, NUM_PRERUN, NUM_ITERATIONS)
+        profile_operation("    lib", lambda: lib_mul(), torch_device, NUM_PRERUN, NUM_ITERATIONS)
+        # fmt: on
+    check_error(lib.infiniopDestroyMulDescriptor(descriptor))
+
+
+# Script entry point: declares the ctypes signatures of the four Mul API
+# functions, applies the command-line options, and runs every test case on
+# each selected device.
+if __name__ == "__main__":
+    args = get_args()
+    lib = open_lib()
+
+    # (handle, &descriptor, c_desc, a_desc, b_desc)
+    lib.infiniopCreateMulDescriptor.restype = c_int32
+    lib.infiniopCreateMulDescriptor.argtypes = [
+        infiniopHandle_t,
+        POINTER(infiniopMulDescriptor_t),
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+        infiniopTensorDescriptor_t,
+    ]
+
+    # (descriptor, &size)
+    lib.infiniopGetMulWorkspaceSize.restype = c_int32
+    lib.infiniopGetMulWorkspaceSize.argtypes = [
+        infiniopMulDescriptor_t,
+        POINTER(c_uint64),
+    ]
+
+    # (descriptor, workspace, workspace_size, c, a, b, stream)
+    lib.infiniopMul.restype = c_int32
+    lib.infiniopMul.argtypes = [
+        infiniopMulDescriptor_t,
+        c_void_p,
+        c_uint64,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+        c_void_p,
+    ]
+
+    lib.infiniopDestroyMulDescriptor.restype = c_int32
+    lib.infiniopDestroyMulDescriptor.argtypes = [
+        infiniopMulDescriptor_t,
+    ]
+
+    # Configure testing options
+    DEBUG = args.debug
+    PROFILE = args.profile
+    NUM_PRERUN = args.num_prerun
+    NUM_ITERATIONS = args.num_iterations
+
+    for device in get_test_devices(args):
+        test_operator(lib, device, test, _TEST_CASES, _TENSOR_DTYPES)
+
+    print("\033[92mTest passed!\033[0m")
\ No newline at end of file