Unverified Commit 4e4d3415 authored by Catheriany's avatar Catheriany Committed by GitHub
Browse files

Merge branch 'main' into issue/150

parents d1c46889 1a4cfb99
# InfiniCore
[![Doc](https://img.shields.io/badge/Document-ready-blue)](https://github.com/InfiniTensor/InfiniCore-Documentation)
[![CI](https://github.com/InfiniTensor/InfiniCore/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/InfiniTensor/InfiniCore/actions)
[![license](https://img.shields.io/github/license/InfiniTensor/InfiniCore)](https://mit-license.org/)
![GitHub repo size](https://img.shields.io/github/repo-size/InfiniTensor/InfiniCore)
![GitHub code size in bytes](https://img.shields.io/github/languages/code-size/InfiniTensor/InfiniCore)
[![GitHub Issues](https://img.shields.io/github/issues/InfiniTensor/InfiniCore)](https://github.com/InfiniTensor/InfiniCore/issues)
[![GitHub Pull Requests](https://img.shields.io/github/issues-pr/InfiniTensor/InfiniCore)](https://github.com/InfiniTensor/InfiniCore/pulls)
![GitHub contributors](https://img.shields.io/github/contributors/InfiniTensor/InfiniCore)
![GitHub commit activity](https://img.shields.io/github/commit-activity/m/InfiniTensor/InfiniCore)
InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功能(包括计算、运行时、通信等)提供统一 C 语言接口。目前支持的硬件和后端包括:
- CPU;
...@@ -15,22 +26,6 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功 ...@@ -15,22 +26,6 @@ InfiniCore 是一个跨平台统一编程工具集,为不同芯片平台的功
## 配置和使用
### 软件依赖
- XMake编译器
XMake配置选项(`XMAKE_CONFIG_FLAGS`)以及含义
- `--omp=[y|n]` 是否使用OpenMP,默认开启
- `--cpu=[y|n]` 是否编译CPU接口实现,默认开启
- `--nv-gpu=[y|n]` 是否编译英伟达GPU接口实现
- `--ascend-npu=[y|n]` 是否编译昇腾NPU接口实现
- `--cambricon-mlu=[y|n]` 是否编译寒武纪MLU接口实现
- `--metax-gpu=[y|n]` 是否编译沐曦GPU接口实现
- `--moore-gpu=[y|n]` 是否编译摩尔线程GPU接口实现
- `--sugon-dcu=[y|n]` 是否编译曙光DCU接口实现
- `--kunlun-xpu=[y|n]` 是否编译昆仑XPU接口实现
### 一键安装

`script/` 目录中提供了 `install.py` 安装脚本。使用方式如下:
...@@ -41,6 +36,21 @@ cd InfiniCore ...@@ -41,6 +36,21 @@ cd InfiniCore
python scripts/install.py [XMAKE_CONFIG_FLAGS]
```
参数 `XMAKE_CONFIG_FLAGS` 是 xmake 构建配置,可配置下列可选项:
| 选项 | 功能 | 默认值
|--------------------------|-------------------------------|:-:
| `--omp=[y\|n]` | 是否使用 OpenMP | y
| `--cpu=[y\|n]` | 是否编译 CPU 接口实现 | y
| `--nv-gpu=[y\|n]` | 是否编译英伟达 GPU 接口实现 | n
| `--ascend-npu=[y\|n]` | 是否编译昇腾 NPU 接口实现 | n
| `--cambricon-mlu=[y\|n]` | 是否编译寒武纪 MLU 接口实现 | n
| `--metax-gpu=[y\|n]` | 是否编译沐曦 GPU 接口实现 | n
| `--moore-gpu=[y\|n]` | 是否编译摩尔线程 GPU 接口实现 | n
| `--sugon-dcu=[y\|n]` | 是否编译曙光 DCU 接口实现 | n
| `--kunlun-xpu=[y\|n]` | 是否编译昆仑 XPU 接口实现 | n
| `--ccl=[y\|n]` | 是否编译 InfiniCCL 通信库接口实现 | n
### 手动安装

1. 项目配置
...@@ -101,6 +111,18 @@ python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend] ...@@ -101,6 +111,18 @@ python scripts/python_test.py [--cpu | --nvidia | --cambricon | --ascend]
详见 `test/infiniop-test` 目录
#### 通信库(InfiniCCL)测试
编译(需要先安装InfiniCCL):
```shell
xmake build infiniccl-test
```
在英伟达平台运行测试(会自动使用所有可见的卡):
```shell
infiniccl-test --nvidia
```
## 开发指南

### 代码格式化
...@@ -137,3 +159,22 @@ options: ...@@ -137,3 +159,22 @@ options:
- 若设置 `--check`,将检查代码是否需要修改格式,不修改文件内容; - 若设置 `--check`,将检查代码是否需要修改格式,不修改文件内容;
- 通过 `--c` 指定 c/c++ 格式化器,默认为 `clang-format-16` - 通过 `--c` 指定 c/c++ 格式化器,默认为 `clang-format-16`
- 通过 `--python` 指定 python 格式化器 `black` - 通过 `--python` 指定 python 格式化器 `black`
### vscode 开发配置
基本配置见 [xmake 官方文档](https://xmake.io/#/zh-cn/plugin/more_plugins?id=%e9%85%8d%e7%bd%ae-intellsence)
- TL;DR
- clangd
打开 *xmake.lua*,保存一次以触发编译命令生成,将在工作路径下自动生成 *.vscode/compile_commands.json* 文件。然后在这个文件夹下创建 *settings.json*,填入:
> .vscode/settings.json
```json
{
"clangd.arguments": [
"--compile-commands-dir=.vscode"
]
}
```
#ifndef __INFINICCL_API_H__
#define __INFINICCL_API_H__

#include "infinirt.h"

// Reduction operators supported by collective communication calls.
typedef enum {
    INFINICCL_SUM = 0,
    INFINICCL_PROD = 1,
    INFINICCL_MAX = 2,
    INFINICCL_MIN = 3,
    INFINICCL_AVG = 4,
} infinicclReduceOp_t;

// Opaque communicator handle; one per participating device.
struct InfinicclComm;
typedef struct InfinicclComm *infinicclComm_t;

// Creates one communicator per device in `device_ids` (length `ndevice`)
// for the given backend, writing them into `comms`.
__C __export infiniStatus_t infinicclCommInitAll(
    infiniDevice_t device_type,
    infinicclComm_t *comms,
    int ndevice,
    const int *device_ids);

// Destroys a communicator created by infinicclCommInitAll.
__C __export infiniStatus_t infinicclCommDestroy(infinicclComm_t comm);

// Reduces `count` elements of `sendbuf` across all ranks of `comm` with `op`,
// writing the result to `recvbuf` (in-place allowed). `stream` may be null
// (default stream). Note: parameter renamed from misspelled "dataype".
__C __export infiniStatus_t infinicclAllReduce(
    void *sendbuf,
    void *recvbuf,
    size_t count,
    infiniDtype_t datatype,
    infinicclReduceOp_t op,
    infinicclComm_t comm,
    infinirtStream_t stream);

#endif
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include "infiniop/ops/global_avg_pool.h" #include "infiniop/ops/global_avg_pool.h"
#include "infiniop/ops/max_pool.h" #include "infiniop/ops/max_pool.h"
#include "infiniop/ops/mlp.h" #include "infiniop/ops/mlp.h"
#include "infiniop/ops/mul.h"
#include "infiniop/ops/random_sample.h" #include "infiniop/ops/random_sample.h"
#include "infiniop/ops/rearrange.h" #include "infiniop/ops/rearrange.h"
#include "infiniop/ops/relu.h" #include "infiniop/ops/relu.h"
......
...@@ -11,7 +11,11 @@ __C __export infiniStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle, ...@@ -11,7 +11,11 @@ __C __export infiniStatus_t infiniopCreateAddDescriptor(infiniopHandle_t handle,
infiniopTensorDescriptor_t a, infiniopTensorDescriptor_t a,
infiniopTensorDescriptor_t b); infiniopTensorDescriptor_t b);
__C __export infiniStatus_t infiniopGetAddWorkspaceSize(infiniopAddDescriptor_t desc, size_t *size);
__C __export infiniStatus_t infiniopAdd(infiniopAddDescriptor_t desc, __C __export infiniStatus_t infiniopAdd(infiniopAddDescriptor_t desc,
void *workspace,
size_t workspace_size,
void *c, void *c,
void const *a, void const *a,
void const *b, void const *b,
......
#ifndef __INFINIOP_MUL_API_H__
#define __INFINIOP_MUL_API_H__
#include "../operator_descriptor.h"
// Opaque descriptor handle for the elementwise multiplication operator.
typedef struct InfiniopDescriptor *infiniopMulDescriptor_t;
// Creates a Mul descriptor for output tensor c and input tensors a, b.
__C __export infiniStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle,
                                                        infiniopMulDescriptor_t *desc_ptr,
                                                        infiniopTensorDescriptor_t c,
                                                        infiniopTensorDescriptor_t a,
                                                        infiniopTensorDescriptor_t b);
// Queries the workspace size (bytes) required by infiniopMul for this descriptor.
__C __export infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size);
// Executes the multiplication into c using the provided workspace.
// NOTE(review): callers in this repo pass stream == nullptr; presumably that
// selects the default stream — confirm against the runtime docs.
__C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc,
                                        void *workspace,
                                        size_t workspace_size,
                                        void *c,
                                        const void *a,
                                        const void *b,
                                        void *stream);
// Releases a descriptor created by infiniopCreateMulDescriptor.
__C __export infiniStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc);
#endif
...@@ -12,7 +12,7 @@ os.chdir(PROJECT_DIR) ...@@ -12,7 +12,7 @@ os.chdir(PROJECT_DIR)
def run_tests(args): def run_tests(args):
failed = [] failed = []
for test in [ for test in [
"causal_softmax.py", "add.py",
"gemm.py", "gemm.py",
"random_sample.py", "random_sample.py",
"rms_norm.py", "rms_norm.py",
......
#include "infiniccl_test.hpp"
#include <chrono>
#include <cstring>
#include <iostream>
#include <numeric>
#include <pthread.h>
#include <vector>
// Bail out of the calling int-returning function with 1 on any API failure.
#define TEST_INFINI(API__) CHECK_API_OR(API__, INFINI_STATUS_SUCCESS, return 1)
// Same, but for pthread entry points, which must return a void pointer.
#define TEST_INFINI_THREAD(API__) CHECK_API_OR(API__, INFINI_STATUS_SUCCESS, return nullptr)
// Largest element count exercised; also sizes the shared host buffers.
const size_t MAX_COUNT = 100ULL * 1024 * 1024;
// Element counts tested for each dtype.
const size_t TEST_COUNTS[] = {
    128,
    1024,
    4 * 1024,
    MAX_COUNT,
};
// Element types exercised by the AllReduce test.
const infiniDtype_t TEST_DTYPES[] = {INFINI_DTYPE_F32, INFINI_DTYPE_F16};
// Untimed warm-up iterations before the benchmark loop.
const size_t WARM_UPS = 10;
// Timed iterations averaged into the reported latency.
const size_t ITERATIONS = 100;
// Per-rank inputs/outputs for one AllReduce worker thread.
struct ThreadArgs {
    int rank;                   // logical rank index in [0, ndevice)
    int device_id;              // physical device this thread binds to
    infinicclComm_t comm;       // communicator for this rank
    infiniDevice_t device_type; // hardware backend under test
    infiniDtype_t dtype;        // element type of the reduced buffer
    size_t count;               // number of elements to reduce
    const void *data;           // host input buffer (shared, read-only)
    const void *ans;            // host expected-result buffer (shared, read-only)
    int *result;                // out: 0 on success, nonzero on failure
    double *time;               // out: average latency in milliseconds
};
// Fills `count` elements of `data` with `val`, encoded according to `dtype`.
// Unsupported dtypes abort the process.
void setData(infiniDtype_t dtype, void *data, size_t count, float val) {
    if (dtype == INFINI_DTYPE_F32) {
        float *dst = (float *)data;
        for (size_t idx = 0; idx < count; ++idx) {
            dst[idx] = val;
        }
    } else if (dtype == INFINI_DTYPE_F16) {
        fp16_t *dst = (fp16_t *)data;
        for (size_t idx = 0; idx < count; ++idx) {
            dst[idx] = utils::cast<fp16_t>(val);
        }
    } else {
        std::abort();
    }
}
// Counts elementwise mismatches between `actual_` and `expected_` using an
// absolute tolerance of 1e-4; fp16 values are widened to float before comparing.
template <typename T>
int checkData(const T *actual_, const T *expected_, size_t count) {
    int mismatched = 0;
    for (size_t idx = 0; idx < count; ++idx) {
        if constexpr (std::is_same<T, fp16_t>::value) {
            const float lhs = utils::cast<float>(actual_[idx]);
            const float rhs = utils::cast<float>(expected_[idx]);
            mismatched += (std::abs(lhs - rhs) > 1e-4) ? 1 : 0;
        } else {
            mismatched += (std::abs(actual_[idx] - expected_[idx]) > 1e-4) ? 1 : 0;
        }
    }
    return mismatched;
}
// Type-erased entry point: dispatches to the typed checker for `dtype`.
int checkData(const void *actual, const void *expected, infiniDtype_t dtype, size_t count) {
    if (dtype == INFINI_DTYPE_F32) {
        return checkData((const float *)actual, (const float *)expected, count);
    }
    if (dtype == INFINI_DTYPE_F16) {
        return checkData((const fp16_t *)actual, (const fp16_t *)expected, count);
    }
    std::abort();
    return 1;
}
// Per-rank worker: verifies AllReduce correctness against args->ans, then
// benchmarks it. Sets *args->result to 0 on success (1 otherwise) and writes
// the average per-call latency to *args->time.
void *testAllReduceThread(void *arg) {
    ThreadArgs *args = (ThreadArgs *)arg;
    *(args->result) = 1; // pessimistic default; cleared only on full success

    TEST_INFINI_THREAD(infinirtSetDevice(args->device_type, args->device_id));

    const size_t nbytes = args->count * infiniSizeOf(args->dtype);

    // RAII guard for the device buffer: the original leaked both the host and
    // device allocations on every TEST_INFINI_THREAD early return.
    struct DeviceBuf {
        void *p = nullptr;
        ~DeviceBuf() {
            if (p != nullptr) {
                infinirtFree(p);
            }
        }
    } buf;

    // Host-side result buffer, zero-initialized (replaces unchecked malloc+memset).
    std::vector<char> output(nbytes, 0);

    TEST_INFINI_THREAD(infinirtMalloc(&buf.p, nbytes));
    TEST_INFINI_THREAD(infinirtMemcpy(buf.p, args->data, nbytes, INFINIRT_MEMCPY_H2D));
    TEST_INFINI_THREAD(infinicclAllReduce(buf.p, buf.p, args->count, args->dtype, INFINICCL_SUM, args->comm, NULL));
    TEST_INFINI_THREAD(infinirtDeviceSynchronize());
    TEST_INFINI_THREAD(infinirtMemcpy(output.data(), buf.p, nbytes, INFINIRT_MEMCPY_D2H));
    if (checkData(output.data(), args->ans, args->dtype, args->count) != 0) {
        return nullptr; // *result stays 1 -> reported as incorrect by the caller
    }

    for (size_t i = 0; i < WARM_UPS; i++) {
        TEST_INFINI_THREAD(infinicclAllReduce(buf.p, buf.p, args->count, args->dtype, INFINICCL_SUM, args->comm, NULL));
    }
    TEST_INFINI_THREAD(infinirtDeviceSynchronize());

    // Timed section: average latency over ITERATIONS calls.
    auto start = std::chrono::high_resolution_clock::now();
    for (size_t i = 0; i < ITERATIONS; i++) {
        TEST_INFINI_THREAD(infinicclAllReduce(buf.p, buf.p, args->count, args->dtype, INFINICCL_SUM, args->comm, NULL));
    }
    TEST_INFINI_THREAD(infinirtDeviceSynchronize());
    auto end = std::chrono::high_resolution_clock::now();

    *args->time = std::chrono::duration<double, std::milli>(end - start).count() / ITERATIONS;
    *args->result = 0;
    return nullptr;
}
// Drives one AllReduce pass for every (dtype, count) combination, spawning one
// pthread per rank. Returns 0 when all combinations pass, 1 on first failure.
int testAllReduce(infiniDevice_t device_type, int ndevice) {
    std::vector<ThreadArgs> thread_args(ndevice);
    std::vector<infinicclComm_t> comms(ndevice);
    std::vector<pthread_t> threads(ndevice);
    std::vector<int> device_ids(ndevice);
    std::vector<int> results(ndevice);
    std::vector<double> times(ndevice);

    // Host buffers sized for the widest dtype (float). std::vector replaces the
    // original raw malloc/free, which leaked when comm init failed below.
    std::vector<float> data(MAX_COUNT);
    std::vector<float> ans(MAX_COUNT);

    for (int i = 0; i < ndevice; i++) {
        device_ids[i] = i;
    }
    // NOTE(review): the communicators are never destroyed (no infinicclCommDestroy
    // call); acceptable for a test binary but worth confirming.
    TEST_INFINI(infinicclCommInitAll(device_type, comms.data(), ndevice, device_ids.data()));
    for (infiniDtype_t dtype : TEST_DTYPES) {
        // With INFINICCL_SUM and every element set to 1 on each rank, the
        // expected result is `ndevice` in every position.
        setData(dtype, data.data(), MAX_COUNT, 1.0f);
        setData(dtype, ans.data(), MAX_COUNT, 1.0f * ndevice);
        for (size_t count : TEST_COUNTS) {
            std::cout << "Testing AllReduce with " << count << " elements of " << infiniDtypeToString(dtype) << std::endl;
            for (int rank = 0; rank < ndevice; rank++) {
                thread_args[rank] = {rank, device_ids[rank], comms[rank], device_type, dtype, count, data.data(), ans.data(), &results[rank], &times[rank]};
                pthread_create(&threads[rank], NULL, testAllReduceThread, &thread_args[rank]);
            }
            for (int rank = 0; rank < ndevice; rank++) {
                pthread_join(threads[rank], NULL);
            }
            int failed = std::accumulate(results.begin(), results.end(), 0);
            for (int rank = 0; rank < ndevice; rank++) {
                if (results[rank] != 0) {
                    std::cout << "Rank " << rank << ": incorrect results." << std::endl;
                } else {
                    std::cout << "Rank " << rank << ": " << times[rank] << " ms." << std::endl;
                }
            }
            if (failed > 0) {
                std::cout << "Failed with " << failed << " errors." << std::endl
                          << std::endl;
                return 1;
            }
            std::cout << std::endl;
        }
    }
    return 0;
}
#ifndef INFINICCL_TEST_HPP
#define INFINICCL_TEST_HPP
#include <infiniccl.h>
#include "../utils.h"
// Runs the AllReduce correctness + latency test on `ndevice` devices of
// `device_type`. Returns 0 on success, nonzero on failure.
int testAllReduce(infiniDevice_t device_type, int ndevice);
#endif // INFINICCL_TEST_HPP
#include "infiniccl_test.hpp"
#include <iostream>
// Parsed command-line options: only the target device backend is configurable.
struct ParsedArgs {
    infiniDevice_t device_type;
};
// Prints command-line usage and terminates the process with a nonzero code.
void printUsage() {
    std::cout << "Usage:" << std::endl
              << std::endl;
    std::cout << "infiniccl-test --<device>" << std::endl
              << std::endl;
    std::cout << "    --<device>" << std::endl;
    std::cout << "        Specify the device type --(nvidia|cambricon|ascend|metax|moore|iluvatar|kunlun|sugon)." << std::endl
              << std::endl;
    std::cout << "The program will run tests on all visible devices of the specified device type."
              // Fixed typo: was "CUDA_VSIBLE_DEVICES", which would mislead users.
              << " Use environment variables such as CUDA_VISIBLE_DEVICES to limit visible device IDs.";
    exit(-1);
}
// Expands to an `if` mapping a CLI flag string to its device enum; the call
// site chains invocations with `else` to form an if/else ladder.
#define PARSE_DEVICE(FLAG, DEVICE) \
    if (arg == FLAG) {             \
        args.device_type = DEVICE; \
    }
// Parses argv into ParsedArgs; any malformed input routes to printUsage(),
// which terminates the process.
ParsedArgs parseArgs(int argc, char *argv[]) {
    if (argc != 2) {
        printUsage();
    }
    if (std::string(argv[1]) == "--help" || std::string(argv[1]) == "-h") {
        printUsage();
    }
    ParsedArgs args;
    try {
        const std::string flag = argv[1];
        // Explicit ladder equivalent to the PARSE_DEVICE macro chain.
        if (flag == "--nvidia") {
            args.device_type = INFINI_DEVICE_NVIDIA;
        } else if (flag == "--cambricon") {
            args.device_type = INFINI_DEVICE_CAMBRICON;
        } else if (flag == "--ascend") {
            args.device_type = INFINI_DEVICE_ASCEND;
        } else if (flag == "--metax") {
            args.device_type = INFINI_DEVICE_METAX;
        } else if (flag == "--moore") {
            args.device_type = INFINI_DEVICE_MOORE;
        } else if (flag == "--iluvatar") {
            args.device_type = INFINI_DEVICE_ILUVATAR;
        } else if (flag == "--kunlun") {
            args.device_type = INFINI_DEVICE_KUNLUN;
        } else if (flag == "--sugon") {
            args.device_type = INFINI_DEVICE_SUGON;
        } else {
            printUsage();
        }
    } catch (const std::exception &) {
        printUsage();
    }
    return args;
}
int main(int argc, char *argv[]) {
ParsedArgs args = parseArgs(argc, argv);
int ndevice = 0;
if (infinirtGetDeviceCount(args.device_type, &ndevice) != INFINI_STATUS_SUCCESS) {
std::cout << "Failed to get device count" << std::endl;
return -1;
}
if (ndevice == 0) {
std::cout << "No devices found. Tests skipped." << std::endl;
return 0;
} else {
std::cout << "Found " << ndevice << " devices. Running tests..." << std::endl;
}
int failed = 0;
failed += testAllReduce(args.device_type, ndevice);
return failed;
}
#include "infiniccl_cuda.h"
#include <cuda_runtime.h>
#include <iostream>
#include <nccl.h>
#include <vector>
#include "../../utils.h"
#define CHECK_NCCL(API__) CHECK_INTERNAL(API__, ncclSuccess)
// Maps an infinirt stream handle to a CUDA stream; a null handle selects the
// default (legacy) CUDA stream.
inline cudaStream_t getCudaStream(infinirtStream_t stream) {
    return (stream == nullptr) ? (cudaStream_t)0 : static_cast<cudaStream_t>(stream);
}
// Translates the supported infini dtypes to NCCL dtypes; anything else aborts.
inline ncclDataType_t getNcclDtype(infiniDtype_t datatype) {
    if (datatype == INFINI_DTYPE_F32) {
        return ncclFloat;
    }
    if (datatype == INFINI_DTYPE_F16) {
        return ncclHalf;
    }
    std::abort();
    return ncclHalf;
}
// Translates the infiniccl reduction operator to its NCCL counterpart;
// unknown operators abort.
inline ncclRedOp_t getNcclRedOp(infinicclReduceOp_t op) {
    if (op == INFINICCL_SUM) {
        return ncclSum;
    }
    if (op == INFINICCL_PROD) {
        return ncclProd;
    }
    if (op == INFINICCL_MAX) {
        return ncclMax;
    }
    if (op == INFINICCL_MIN) {
        return ncclMin;
    }
    if (op == INFINICCL_AVG) {
        return ncclAvg;
    }
    std::abort();
    return ncclSum;
}
// Unwraps the backend-agnostic handle to the underlying ncclComm_t.
inline ncclComm_t getNcclComm(infinicclComm_t comm) {
    return static_cast<ncclComm_t>(comm->comm);
}
namespace infiniccl::cuda {
// Initializes one NCCL communicator per device and wraps each in the
// backend-agnostic InfinicclComm handle (heap-allocated; released by commDestroy).
infiniStatus_t commInitAll(
    infinicclComm_t *comms,
    int ndevice,
    const int *device_ids) {
    std::vector<ncclComm_t> nccl_comms(ndevice);
    // `device_ids` is already `const int *`; the original C-style cast was a no-op.
    CHECK_NCCL(ncclCommInitAll(nccl_comms.data(), ndevice, device_ids));
    for (int i = 0; i < ndevice; i++) {
        comms[i] = new InfinicclComm{INFINI_DEVICE_NVIDIA, device_ids[i], static_cast<void *>(nccl_comms[i])};
    }
    return INFINI_STATUS_SUCCESS;
}
// Destroys the NCCL communicator, then frees the wrapper allocated by commInitAll.
infiniStatus_t commDestroy(infinicclComm_t comm) {
    CHECK_NCCL(ncclCommDestroy(getNcclComm(comm)));
    delete comm;
    return INFINI_STATUS_SUCCESS;
}
// CUDA/NCCL AllReduce. Only F32 and F16 payloads are accepted; other dtypes
// are rejected with BAD_PARAM before touching NCCL.
infiniStatus_t allReduce(
    void *sendbuf,
    void *recvbuf,
    size_t count,
    infiniDtype_t datatype,
    infinicclReduceOp_t op,
    infinicclComm_t comm,
    infinirtStream_t stream) {
    const bool supported = (datatype == INFINI_DTYPE_F32) || (datatype == INFINI_DTYPE_F16);
    if (!supported) {
        return INFINI_STATUS_BAD_PARAM;
    }
    CHECK_NCCL(ncclAllReduce(sendbuf, recvbuf, count, getNcclDtype(datatype),
                             getNcclRedOp(op), getNcclComm(comm), getCudaStream(stream)));
    return INFINI_STATUS_SUCCESS;
}
} // namespace infiniccl::cuda
#ifndef INFINICCL_CUDA_H_
#define INFINICCL_CUDA_H_
#include "../infiniccl_impl.h"
// Windows does not support CUDA
// Declare the real backend API only when both the CUDA API and CCL are
// enabled; otherwise declare no-op stubs reporting DEVICE_TYPE_NOT_SUPPORTED.
#if defined(ENABLE_CUDA_API) && defined(ENABLE_CCL) && !defined(_WIN32)
INFINICCL_DEVICE_API_IMPL(cuda)
#else
INFINICCL_DEVICE_API_NOOP(cuda)
#endif
#endif /* INFINICCL_CUDA_H_ */
#include "infiniccl.h"
#include "./cuda/infiniccl_cuda.h"
// Public entry point: dispatches communicator creation to the backend that
// matches `device_type`.
__C infiniStatus_t infinicclCommInitAll(
    infiniDevice_t device_type,
    infinicclComm_t *comms,
    int ndevice,
    const int *device_ids) {
    switch (device_type) {
    case INFINI_DEVICE_NVIDIA:
        return infiniccl::cuda::commInitAll(comms, ndevice, device_ids);
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}
// Public entry point: destroying a null communicator is a harmless no-op;
// otherwise dispatch on the backend recorded in the handle.
__C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
    if (comm == nullptr) {
        return INFINI_STATUS_SUCCESS;
    }
    switch (comm->device_type) {
    case INFINI_DEVICE_NVIDIA:
        return infiniccl::cuda::commDestroy(comm);
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}
// Public entry point: validates the communicator and dispatches AllReduce to
// the backend recorded in the handle.
// Fixed typo: parameter was misspelled "dataype" (callers are unaffected by a
// C parameter rename).
__C infiniStatus_t infinicclAllReduce(
    void *sendbuf,
    void *recvbuf,
    size_t count,
    infiniDtype_t datatype,
    infinicclReduceOp_t op,
    infinicclComm_t comm,
    infinirtStream_t stream) {
    if (comm == nullptr) {
        return INFINI_STATUS_NULL_POINTER;
    }
    switch (comm->device_type) {
    case INFINI_DEVICE_NVIDIA:
        return infiniccl::cuda::allReduce(sendbuf, recvbuf, count, datatype, op, comm, stream);
    default:
        return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
    }
}
#ifndef INFINICCL_IMPL_H
#define INFINICCL_IMPL_H

#include "infiniccl.h"

// Internal communicator representation shared by all backends.
struct InfinicclComm {
    infiniDevice_t device_type;
    int device_id; // the actual device ID, not rank number
    void *comm;    // the actual communicator (e.g. ncclComm_t for CUDA)
};

// Declares the per-backend API inside namespace infiniccl::<NAMESPACE_>.
// IMPL_ is empty for real implementations (declarations only) or a stub body
// for disabled backends. Fixed typo: parameter was spelled "NAMSPACE".
#define INFINICCL_DEVICE_API(NAMESPACE_, IMPL_)             \
    namespace infiniccl::NAMESPACE_ {                       \
    infiniStatus_t commInitAll(                             \
        infinicclComm_t *comms,                             \
        int ndevice,                                        \
        const int *device_ids) IMPL_;                       \
                                                            \
    infiniStatus_t commDestroy(infinicclComm_t comm) IMPL_; \
                                                            \
    infiniStatus_t allReduce(                               \
        void *sendbuf,                                      \
        void *recvbuf,                                      \
        size_t count,                                       \
        infiniDtype_t datatype,                             \
        infinicclReduceOp_t op,                             \
        infinicclComm_t comm,                               \
        infinirtStream_t stream) IMPL_;                     \
    };

// Real backend: declarations only; definitions live in the backend source.
#define INFINICCL_DEVICE_API_IMPL(NAMESPACE_) \
    INFINICCL_DEVICE_API(NAMESPACE_, )

// Disabled backend: every function is a stub returning NOT_SUPPORTED.
#define INFINICCL_DEVICE_API_NOOP(NAMESPACE_) \
    INFINICCL_DEVICE_API(NAMESPACE_, { return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; })

#endif // INFINICCL_IMPL_H
...@@ -7,6 +7,7 @@ ...@@ -7,6 +7,7 @@
*/ */
DECLARE_INFINIOP_TEST(gemm) DECLARE_INFINIOP_TEST(gemm)
DECLARE_INFINIOP_TEST(random_sample) DECLARE_INFINIOP_TEST(random_sample)
DECLARE_INFINIOP_TEST(mul)
DECLARE_INFINIOP_TEST(rope) DECLARE_INFINIOP_TEST(rope)
#define REGISTER_INFINIOP_TEST(name) \ #define REGISTER_INFINIOP_TEST(name) \
...@@ -25,7 +26,8 @@ DECLARE_INFINIOP_TEST(rope) ...@@ -25,7 +26,8 @@ DECLARE_INFINIOP_TEST(rope)
{ \ { \
REGISTER_INFINIOP_TEST(gemm) \ REGISTER_INFINIOP_TEST(gemm) \
REGISTER_INFINIOP_TEST(random_sample) \ REGISTER_INFINIOP_TEST(random_sample) \
REGISTER_INFINIOP_TEST(rope) \ REGISTER_INFINIOP_TEST(mul) \
REGISTER_INFINIOP_TEST(rope) \
} }
namespace infiniop_test { namespace infiniop_test {
......
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::mul {
// Tensors backing one mul test case: inputs a/b, output c, reference answer ans.
struct Test::Attributes {
    std::shared_ptr<Tensor> a;
    std::shared_ptr<Tensor> b;
    std::shared_ptr<Tensor> c;
    std::shared_ptr<Tensor> ans;
};
// Constructs a mul test case from the deserialized tensor map; throws if any
// required tensor ("a", "b", "c", "ans") is missing.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    for (const char *required : {"a", "b", "c", "ans"}) {
        if (tensors.find(required) == tensors.end()) {
            throw std::runtime_error("Invalid Test");
        }
    }
    test->_attributes->a = tensors["a"];
    test->_attributes->b = tensors["b"];
    test->_attributes->c = tensors["c"];
    test->_attributes->ans = tensors["ans"];
    return test;
}
// Executes the mul operator on `device`, validates the result against `ans`,
// then benchmarks it. Returns a passed/failed Result.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopMulDescriptor_t op_desc = nullptr;
    void *workspace = nullptr;
    // RAII guard: the original leaked both the descriptor and the workspace on
    // every return path; this releases whatever was actually acquired.
    struct Guard {
        infiniopMulDescriptor_t *desc;
        void **ws;
        ~Guard() {
            if (*ws != nullptr) {
                infinirtFree(*ws);
            }
            if (*desc != nullptr) {
                infiniopDestroyMulDescriptor(*desc);
            }
        }
    } guard{&op_desc, &workspace};

    // Move test tensors onto the target device.
    auto a = _attributes->a->to(device, device_id);
    auto b = _attributes->b->to(device, device_id);
    auto c = _attributes->c->to(device, device_id);
    CHECK_OR(infiniopCreateMulDescriptor(handle, &op_desc,
                                         c->desc(),
                                         a->desc(),
                                         b->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetMulWorkspaceSize(op_desc, &workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));

    // Correctness: run once and compare with the precomputed answer tensor.
    CHECK_OR(infiniopMul(op_desc, workspace, workspace_size,
                         c->data(),
                         a->data(),
                         b->data(),
                         nullptr),
             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
    try {
        allClose(c, _attributes->ans, _rtol, _atol);
    } catch (const std::exception &e) {
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }

    // Benchmark: warm_ups/iterations handled inside benchmark().
    double elapsed_time = benchmark(
        [=]() {
            infiniopMul(
                op_desc, workspace, workspace_size,
                c->data(),
                a->data(),
                b->data(),
                nullptr);
        },
        warm_ups, iterations);
    return TEST_PASSED(elapsed_time);
}
// Mul has no scalar attributes beyond its tensors.
std::vector<std::string> Test::attribute_names() {
    return {};
}
// Tensors every mul test case file must provide.
std::vector<std::string> Test::tensor_names() {
    return {"a", "b", "c", "ans"};
}
// Human-readable summary of the test case: operator name, tensor shapes/dtypes,
// and the comparison tolerances in scientific notation.
std::string Test::toString() const {
    std::ostringstream repr;
    repr << op_name() << std::endl;
    repr << "- a: " << _attributes->a->info() << std::endl;
    repr << "- b: " << _attributes->b->info() << std::endl;
    repr << "- c: " << _attributes->c->info() << std::endl;
    repr << std::scientific << std::setprecision(2);
    repr << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return repr.str();
}
// _attributes is a raw owning pointer allocated in build(); release it here.
Test::~Test() {
    delete _attributes;
}
} // namespace infiniop_test::mul
...@@ -102,7 +102,7 @@ struct DeviceImpl::Opaque {}; ...@@ -102,7 +102,7 @@ struct DeviceImpl::Opaque {};
template <typename... Args> template <typename... Args>
utils::Result<DeviceImpl> DeviceImpl::create(Args &&...args) { utils::Result<DeviceImpl> DeviceImpl::create(Args &&...args) {
return utils::Result<DeviceImpl>(nullptr); return INFINI_STATUS_NOT_IMPLEMENTED;
} }
// Perform elementwise operation for different input types // Perform elementwise operation for different input types
......
...@@ -208,7 +208,7 @@ struct DeviceImpl::Opaque { ...@@ -208,7 +208,7 @@ struct DeviceImpl::Opaque {
* @param args Additional arguments forwarded to the operation. * @param args Additional arguments forwarded to the operation.
* @return infiniStatus_t Returns success or failure status. * @return infiniStatus_t Returns success or failure status.
*/ */
template <size_t BLOCK_SIZE, size_t N, typename Op, typename Tdata, typename... Args> template <uint32_t BLOCK_SIZE, size_t N, typename Op, typename Tdata, typename... Args>
infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info, infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info,
void *workspace, void *workspace,
void *output, void *output,
...@@ -241,7 +241,7 @@ struct DeviceImpl::Opaque { ...@@ -241,7 +241,7 @@ struct DeviceImpl::Opaque {
* @param args Additional arguments forwarded to the operation. * @param args Additional arguments forwarded to the operation.
* @return infiniStatus_t Returns success or failure status. * @return infiniStatus_t Returns success or failure status.
*/ */
template <size_t BLOCK_SIZE, size_t N, typename Op, typename Tout, typename... Tin, typename... Args, template <uint32_t BLOCK_SIZE, size_t N, typename Op, typename Tout, typename... Tin, typename... Args,
std::enable_if_t<(sizeof...(Tin) == Op::num_inputs), int> = 0> std::enable_if_t<(sizeof...(Tin) == Op::num_inputs), int> = 0>
infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info, infiniStatus_t calculateImpl(const op::elementwise::ElementwiseInfo &info,
void *workspace, void *workspace,
...@@ -329,7 +329,7 @@ private: ...@@ -329,7 +329,7 @@ private:
* @param args Additional arguments passed to the kernel. * @param args Additional arguments passed to the kernel.
* @return infiniStatus_t Status code indicating success or failure. * @return infiniStatus_t Status code indicating success or failure.
*/ */
template <size_t BLOCK_SIZE, size_t N, typename KernelFunc, typename Tout, typename... Args> template <uint32_t BLOCK_SIZE, size_t N, typename KernelFunc, typename Tout, typename... Args>
infiniStatus_t launchElementwiseKernel( infiniStatus_t launchElementwiseKernel(
const op::elementwise::ElementwiseInfo &info, const op::elementwise::ElementwiseInfo &info,
void *workspace, void *workspace,
...@@ -358,8 +358,8 @@ private: ...@@ -358,8 +358,8 @@ private:
d_output_shape, d_output_strides, d_output_shape, d_output_strides,
d_input_shapes, d_input_strides, stream)); d_input_shapes, d_input_strides, stream));
dim3 blockDims(std::min(BLOCK_SIZE, static_cast<size_t>(internal->maxThreadsPerBlock()))); dim3 blockDims(std::min(BLOCK_SIZE, static_cast<uint32_t>(internal->maxThreadsPerBlock())));
dim3 gridDims(std::min(CEIL_DIV(output_size, blockDims.x), static_cast<size_t>(internal->gridSizeX()))); dim3 gridDims(std::min(uint32_t(CEIL_DIV(output_size, blockDims.x)), static_cast<uint32_t>(internal->gridSizeX())));
size_t step = gridDims.x * blockDims.x; size_t step = gridDims.x * blockDims.x;
for (size_t i = 0; i < output_size; i += step) { for (size_t i = 0; i < output_size; i += step) {
......
...@@ -84,8 +84,9 @@ private: ...@@ -84,8 +84,9 @@ private:
_output_contiguous(output_contiguous) {} _output_contiguous(output_contiguous) {}
public: public:
// Get the Memory size of the meta data in bytes
inline size_t getMetaMemSize() const { inline size_t getMetaMemSize() const {
return _meta.size(); return _meta.size() * sizeof(size_t);
} }
inline const int8_t *getMetaStart() const { inline const int8_t *getMetaStart() const {
return reinterpret_cast<const int8_t *>(_meta.data()); return reinterpret_cast<const int8_t *>(_meta.data());
...@@ -167,7 +168,7 @@ public: ...@@ -167,7 +168,7 @@ public:
+ input_size * ndim * sizeof(shape_unit) + input_size * ndim * sizeof(shape_unit)
+ input_size * ndim * sizeof(stride_unit) + input_size * ndim * sizeof(stride_unit)
+ 2 * input_size * sizeof(bool); + 2 * input_size * sizeof(bool);
std::vector<size_t> meta(meta_mem_size); std::vector<size_t> meta(CEIL_DIV(meta_mem_size, sizeof(size_t)));
int8_t *meta_ptr = reinterpret_cast<int8_t *>(meta.data()); int8_t *meta_ptr = reinterpret_cast<int8_t *>(meta.data());
const auto output_shape = output_desc->shape(); const auto output_shape = output_desc->shape();
......
#include "add_cpu.h"
namespace op::add::cpu {
Descriptor::~Descriptor() = default;
// Validates tensor metadata and builds the CPU elementwise descriptor for
// c = a + b. Requires identical shapes for a, b, c and a float dtype.
infiniStatus_t Descriptor::create(
    infiniopHandle_t handle_,
    Descriptor **desc_ptr,
    infiniopTensorDescriptor_t out_desc,
    std::vector<infiniopTensorDescriptor_t> input_desc_vec) {
    auto handle = reinterpret_cast<device::cpu::Handle *>(handle_);
    auto dtype = out_desc->dtype();
    const auto &a_desc = input_desc_vec.at(0);
    const auto &b_desc = input_desc_vec.at(1);
    const auto &c_shape = out_desc->shape();
    const auto &a_shape = a_desc->shape();
    const auto &b_shape = b_desc->shape();
    // Only shapes are compared here; NOTE(review): input dtypes are not checked
    // against the output dtype — presumably validated downstream; confirm.
    CHECK_DTYPE(dtype, INFINI_DTYPE_F16, INFINI_DTYPE_F32, INFINI_DTYPE_F64);
    CHECK_SAME_SHAPE(c_shape, a_shape, b_shape);
    // create CPU elementwise descriptor
    CREATE_ELEMENTWISE_CPU_DESCRIPTOR(handle, dtype, out_desc, input_desc_vec);
    return INFINI_STATUS_SUCCESS;
}
// Runs the elementwise addition, dispatching on the dtype fixed at descriptor
// creation. `workspace`/`workspace_size` are part of the uniform operator API
// but unused by the CPU add path.
infiniStatus_t Descriptor::calculate(
    void *workspace,
    size_t workspace_size,
    void *output,
    std::vector<const void *> inputs,
    void *stream) const {
    switch (_dtype) {
    case INFINI_DTYPE_F16:
        return _device_info->calculate<AddOp, fp16_t>(_info, output, inputs, stream);
    case INFINI_DTYPE_F32:
        return _device_info->calculate<AddOp, float>(_info, output, inputs, stream);
    case INFINI_DTYPE_F64:
        return _device_info->calculate<AddOp, double>(_info, output, inputs, stream);
    default:
        return INFINI_STATUS_BAD_TENSOR_DTYPE;
    }
    // Unreachable `return INFINI_STATUS_SUCCESS;` after the switch removed:
    // every case above already returns.
}
} // namespace op::add::cpu
#ifndef __ADD_CPU_H__
#define __ADD_CPU_H__
#include "../../../elementwise/cpu/elementwise_cpu.h"
ELEMENTWISE_DESCRIPTOR(add, cpu)
namespace op::add::cpu {

// Elementwise addition functor consumed by the generic elementwise engine.
// Replaced the C-style `typedef struct ... AddOp;` with a plain struct — the
// typedef is redundant in C++ and the `public:` label was a no-op in a struct.
struct AddOp {
    // Number of input tensors the elementwise framework must supply.
    static constexpr size_t num_inputs = 2;

    // Returns a + b for any type supporting operator+.
    template <typename T>
    T operator()(const T &a, const T &b) const {
        return a + b;
    }
};

} // namespace op::add::cpu
#endif // __ADD_CPU_H__
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment