Commit c2e87202 authored by Catheriany

Merge remote-tracking branch 'origin/main' into issue/142

parents 41818f84 c203635b
#include "infiniccl.h"
#include "./cuda/infiniccl_cuda.h"
__C infiniStatus_t infinicclCommInitAll(
infiniDevice_t device_type,
infinicclComm_t *comms,
int ndevice,
const int *device_ids) {
#define COMM_INIT_ALL(CASE_, NAMESPACE_) \
case CASE_: \
return infiniccl::NAMESPACE_::commInitAll(comms, ndevice, device_ids);
switch (device_type) {
COMM_INIT_ALL(INFINI_DEVICE_NVIDIA, cuda)
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef COMM_INIT_ALL
}
__C infiniStatus_t infinicclCommDestroy(infinicclComm_t comm) {
if (comm == nullptr) {
return INFINI_STATUS_SUCCESS;
}
#define COMM_DESTROY(CASE_, NAMESPACE_) \
case CASE_: \
return infiniccl::NAMESPACE_::commDestroy(comm);
switch (comm->device_type) {
COMM_DESTROY(INFINI_DEVICE_NVIDIA, cuda)
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef COMM_DESTROY
}
__C infiniStatus_t infinicclAllReduce(
void *sendbuf,
void *recvbuf,
size_t count,
infiniDtype_t dataype,
infinicclReduceOp_t op,
infinicclComm_t comm,
infinirtStream_t stream) {
if (comm == nullptr) {
return INFINI_STATUS_NULL_POINTER;
}
#define ALL_REDUCE(CASE_, NAMESPACE_) \
case CASE_: \
return infiniccl::NAMESPACE_::allReduce(sendbuf, recvbuf, count, dataype, op, comm, stream);
switch (comm->device_type) {
ALL_REDUCE(INFINI_DEVICE_NVIDIA, cuda)
default:
return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED;
}
#undef ALL_REDUCE
}
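For context, a caller drives this dispatch layer roughly as follows. This is a hypothetical sketch, not part of the commit: the dtype and reduce-op enum names (INFINI_DTYPE_F32, INFINICCL_SUM) are assumptions taken from the public headers, which are not shown here.

#include "infiniccl.h"

// Hypothetical usage sketch (not part of this commit): one communicator per
// NVIDIA device, then a float32 sum all-reduce across both.
infiniStatus_t demoAllReduce(void *sendbufs[2], void *recvbufs[2], size_t count,
                             infinirtStream_t streams[2]) {
    const int device_ids[2] = {0, 1};
    infinicclComm_t comms[2] = {nullptr, nullptr};

    infiniStatus_t status = infinicclCommInitAll(INFINI_DEVICE_NVIDIA, comms, 2, device_ids);
    if (status != INFINI_STATUS_SUCCESS) {
        return status;
    }

    // Every communicator in the clique must join the collective, each on its
    // own stream; INFINI_DTYPE_F32 and INFINICCL_SUM are assumed enum names.
    for (int i = 0; i < 2 && status == INFINI_STATUS_SUCCESS; ++i) {
        status = infinicclAllReduce(sendbufs[i], recvbufs[i], count,
                                    INFINI_DTYPE_F32, INFINICCL_SUM,
                                    comms[i], streams[i]);
    }

    infinicclCommDestroy(comms[0]);
    infinicclCommDestroy(comms[1]);
    return status;
}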
#ifndef INFINICCL_IMPL_H
#define INFINICCL_IMPL_H

#include "infiniccl.h"

struct InfinicclComm {
    infiniDevice_t device_type;
    int device_id; // the actual device ID, not the rank number
    void *comm;    // the actual communicator
};

#define INFINICCL_DEVICE_API(NAMESPACE, IMPL)              \
    namespace infiniccl::NAMESPACE {                       \
    infiniStatus_t commInitAll(                            \
        infinicclComm_t *comms,                            \
        int ndevice,                                       \
        const int *device_ids) IMPL;                       \
                                                           \
    infiniStatus_t commDestroy(infinicclComm_t comm) IMPL; \
                                                           \
    infiniStatus_t allReduce(                              \
        void *sendbuf,                                     \
        void *recvbuf,                                     \
        size_t count,                                      \
        infiniDtype_t datatype,                            \
        infinicclReduceOp_t op,                            \
        infinicclComm_t comm,                              \
        infinirtStream_t stream) IMPL;                     \
    };

#define INFINICCL_DEVICE_API_IMPL(NAMESPACE) \
    INFINICCL_DEVICE_API(NAMESPACE, )

#define INFINICCL_DEVICE_API_NOOP(NAMESPACE) \
    INFINICCL_DEVICE_API(NAMESPACE, { return INFINI_STATUS_DEVICE_TYPE_NOT_SUPPORTED; })

#endif // INFINICCL_IMPL_H
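Each backend then instantiates these declarations once: the IMPL variant emits plain declarations (definitions live in the backend sources), while the NOOP variant emits inline stubs that report the device as unsupported. A minimal sketch of the intended wiring, assuming the header file name (infiniccl_impl.h, inferred from the include guard) and reusing the ENABLE_CUDA_API flag seen elsewhere in this commit:

// Hypothetical backend wiring (sketch, not part of this commit).
#include "infiniccl_impl.h"

#ifdef ENABLE_CUDA_API
INFINICCL_DEVICE_API_IMPL(cuda) // declarations only; definitions in the backend's .cc/.cu
#else
INFINICCL_DEVICE_API_NOOP(cuda) // every entry point returns DEVICE_TYPE_NOT_SUPPORTED
#endif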
@@ -8,6 +8,10 @@
 DECLARE_INFINIOP_TEST(gemm)
 DECLARE_INFINIOP_TEST(random_sample)
 DECLARE_INFINIOP_TEST(rms_norm)
+DECLARE_INFINIOP_TEST(mul)
+DECLARE_INFINIOP_TEST(clip)
+DECLARE_INFINIOP_TEST(swiglu)
+DECLARE_INFINIOP_TEST(add)
 
 #define REGISTER_INFINIOP_TEST(name) \
     {                                \
@@ -16,6 +20,7 @@ DECLARE_INFINIOP_TEST(rms_norm)
         infiniop_test::name::Test::build,             \
         infiniop_test::name::Test::attribute_names(), \
         infiniop_test::name::Test::tensor_names(),    \
+        infiniop_test::name::Test::output_names(),    \
     }},
 
 /*
@@ -25,6 +30,10 @@ DECLARE_INFINIOP_TEST(rms_norm)
     {                                         \
         REGISTER_INFINIOP_TEST(gemm)          \
         REGISTER_INFINIOP_TEST(random_sample) \
+        REGISTER_INFINIOP_TEST(add)           \
+        REGISTER_INFINIOP_TEST(mul)           \
+        REGISTER_INFINIOP_TEST(clip)          \
+        REGISTER_INFINIOP_TEST(swiglu)        \
         REGISTER_INFINIOP_TEST(rms_norm)      \
     }
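The registry pattern this macro feeds is worth seeing in isolation. The following is a self-contained analogue with invented stand-in types (the real BuilderFunc, Test, and Result types are richer and not fully shown in this diff); it only illustrates how the new output_names field travels with each registered op:

// Compilable analogue of the TestBuilder registry (sketch, invented types).
#include <functional>
#include <map>
#include <string>
#include <vector>

struct DemoTestBuilder {
    std::function<void()> build; // stands in for the real BuilderFunc
    std::vector<std::string> attribute_names;
    std::vector<std::string> tensor_names;
    std::vector<std::string> output_names; // the field this commit adds
};

int main() {
    std::map<std::string, DemoTestBuilder> registry{
        {"add", {[] {}, {}, {"a", "b", "c", "ans"}, {"c"}}},
    };
    // runTest() later consults output_names to decide which tensors get the
    // shape_meta/isOutput handling (see the runTest hunk further below).
    return registry.at("add").output_names.front() == "c" ? 0 : 1;
}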
...
@@ -58,7 +58,9 @@ private:
 public:
     Tensor(const GGUFTensorInfo *info,
            const void *ggml_ptr,
-           const GGUFKeyValue *strides_meta = nullptr);
+           const GGUFKeyValue *shape_meta = nullptr,
+           const GGUFKeyValue *strides_meta = nullptr,
+           bool isOutput = false);
     Tensor(std::shared_ptr<Memory> memory, size_t offset,
            const std::vector<size_t> &shape,
            const std::vector<ptrdiff_t> &strides,
...
@@ -92,6 +92,7 @@ public:
                                                         \
     static std::vector<std::string> attribute_names();  \
     static std::vector<std::string> tensor_names();     \
+    static std::vector<std::string> output_names();     \
                                                         \
     std::shared_ptr<infiniop_test::Result> run(         \
         infiniopHandle_t handle, infiniDevice_t device, int device_id, \
@@ -121,6 +122,7 @@ struct TestBuilder {
     BuilderFunc build;
     std::vector<std::string> attribute_names;
     std::vector<std::string> tensor_names;
+    std::vector<std::string> output_names;
 };
 
 } // namespace infiniop_test
...
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::add {
struct Test::Attributes {
std::shared_ptr<Tensor> a;
std::shared_ptr<Tensor> b;
std::shared_ptr<Tensor> c;
std::shared_ptr<Tensor> ans;
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("a") == tensors.end()
|| tensors.find("b") == tensors.end()
|| tensors.find("c") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}
test->_attributes->a = tensors["a"];
test->_attributes->b = tensors["b"];
test->_attributes->c = tensors["c"];
test->_attributes->ans = tensors["ans"];
return test;
}
std::shared_ptr<infiniop_test::Result> Test::run(
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
infiniopAddDescriptor_t op_desc;
auto a = _attributes->a->to(device, device_id);
auto b = _attributes->b->to(device, device_id);
auto c = _attributes->c->to(device, device_id);
CHECK_OR(infiniopCreateAddDescriptor(handle, &op_desc,
c->desc(),
a->desc(),
b->desc()),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
size_t workspace_size;
CHECK_OR(infiniopGetAddWorkspaceSize(op_desc, &workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
void *workspace;
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
CHECK_OR(infiniopAdd(op_desc, workspace, workspace_size,
c->data(),
a->data(),
b->data(),
nullptr),
return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
try {
allClose(c, _attributes->ans, _rtol, _atol);
} catch (const std::exception &e) {
return TEST_FAILED(RESULT_INCORRECT, e.what());
}
double elapsed_time = 0.;
elapsed_time = benchmark(
[=]() {
infiniopAdd(
op_desc, workspace, workspace_size,
c->data(),
a->data(),
b->data(),
nullptr);
},
warm_ups, iterations);
return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
return {};
}
std::vector<std::string> Test::tensor_names() {
return {"a", "b", "c", "ans"};
}
std::vector<std::string> Test::output_names() {
return {"c"};
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- a: " << _attributes->a->info() << std::endl;
oss << "- b: " << _attributes->b->info() << std::endl;
oss << "- c: " << _attributes->c->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::add
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::clip {
struct Test::Attributes {
std::shared_ptr<Tensor> x;
std::shared_ptr<Tensor> min_val;
std::shared_ptr<Tensor> max_val;
std::shared_ptr<Tensor> y;
std::shared_ptr<Tensor> ans;
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("x") == tensors.end()
|| tensors.find("min_val") == tensors.end()
|| tensors.find("max_val") == tensors.end()
|| tensors.find("y") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}
test->_attributes->x = tensors["x"];
test->_attributes->min_val = tensors["min_val"];
test->_attributes->max_val = tensors["max_val"];
test->_attributes->y = tensors["y"];
test->_attributes->ans = tensors["ans"];
return test;
}
std::shared_ptr<infiniop_test::Result> Test::run(
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
infiniopClipDescriptor_t op_desc;
auto x = _attributes->x->to(device, device_id);
auto min_val = _attributes->min_val->to(device, device_id);
auto max_val = _attributes->max_val->to(device, device_id);
auto y = _attributes->y->to(device, device_id);
CHECK_OR(infiniopCreateClipDescriptor(handle, &op_desc,
y->desc(),
x->desc(),
min_val->desc(),
max_val->desc()),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create clip descriptor."));
size_t workspace_size;
CHECK_OR(infiniopGetClipWorkspaceSize(op_desc, &workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
void *workspace;
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
CHECK_OR(infiniopClip(op_desc, workspace, workspace_size,
y->data(),
x->data(),
min_val->data(),
max_val->data(),
nullptr),
return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
try {
allClose(y, _attributes->ans, _rtol, _atol);
} catch (const std::exception &e) {
return TEST_FAILED(RESULT_INCORRECT, e.what());
}
double elapsed_time = 0.;
elapsed_time = benchmark(
[=]() {
infiniopClip(
op_desc, workspace, workspace_size,
y->data(),
x->data(),
min_val->data(),
max_val->data(),
nullptr);
},
warm_ups, iterations);
infiniopDestroyClipDescriptor(op_desc);
infinirtFree(workspace);
return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
return {};
}
std::vector<std::string> Test::tensor_names() {
return {"x", "min_val", "max_val", "y", "ans"};
}
std::vector<std::string> Test::output_names() {
return {"y"};
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- x: " << _attributes->x->info() << std::endl;
oss << "- min_val: " << _attributes->min_val->info() << std::endl;
oss << "- max_val: " << _attributes->max_val->info() << std::endl;
oss << "- y: " << _attributes->y->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::clip
@@ -113,6 +113,10 @@ std::vector<std::string> Test::tensor_names() {
     return {"a", "b", "c", "ans"};
 }
 
+std::vector<std::string> Test::output_names() {
+    return {};
+}
+
 std::string Test::toString() const {
     std::ostringstream oss;
     oss << op_name() << std::endl;
...
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::mul {
struct Test::Attributes {
std::shared_ptr<Tensor> a;
std::shared_ptr<Tensor> b;
std::shared_ptr<Tensor> c;
std::shared_ptr<Tensor> ans;
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("a") == tensors.end()
|| tensors.find("b") == tensors.end()
|| tensors.find("c") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}
test->_attributes->a = tensors["a"];
test->_attributes->b = tensors["b"];
test->_attributes->c = tensors["c"];
test->_attributes->ans = tensors["ans"];
return test;
}
std::shared_ptr<infiniop_test::Result> Test::run(
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
infiniopMulDescriptor_t op_desc;
auto a = _attributes->a->to(device, device_id);
auto b = _attributes->b->to(device, device_id);
auto c = _attributes->c->to(device, device_id);
CHECK_OR(infiniopCreateMulDescriptor(handle, &op_desc,
c->desc(),
a->desc(),
b->desc()),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
size_t workspace_size;
CHECK_OR(infiniopGetMulWorkspaceSize(op_desc, &workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
void *workspace;
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
CHECK_OR(infiniopMul(op_desc, workspace, workspace_size,
c->data(),
a->data(),
b->data(),
nullptr),
return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
try {
allClose(c, _attributes->ans, _rtol, _atol);
} catch (const std::exception &e) {
return TEST_FAILED(RESULT_INCORRECT, e.what());
}
double elapsed_time = 0.;
elapsed_time = benchmark(
[=]() {
infiniopMul(
op_desc, workspace, workspace_size,
c->data(),
a->data(),
b->data(),
nullptr);
},
warm_ups, iterations);
return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
return {};
}
std::vector<std::string> Test::tensor_names() {
return {"a", "b", "c", "ans"};
}
std::vector<std::string> Test::output_names() {
return {"c"};
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- a: " << _attributes->a->info() << std::endl;
oss << "- b: " << _attributes->b->info() << std::endl;
oss << "- c: " << _attributes->c->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::mul
@@ -109,6 +109,10 @@ std::vector<std::string> Test::tensor_names() {
     return {"data", "ans", "result"};
 }
 
+std::vector<std::string> Test::output_names() {
+    return {"result"};
+}
+
 std::string Test::toString() const {
     std::ostringstream oss;
     oss << op_name() << std::endl;
...
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::swiglu {
struct Test::Attributes {
std::shared_ptr<Tensor> a;
std::shared_ptr<Tensor> b;
std::shared_ptr<Tensor> ans;
std::shared_ptr<Tensor> c;
};
std::shared_ptr<Test> Test::build(
std::unordered_map<std::string, std::vector<uint8_t>> attributes,
std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
double rtol, double atol) {
auto test = std::shared_ptr<Test>(new Test(rtol, atol));
test->_attributes = new Attributes();
if (tensors.find("a") == tensors.end()
|| tensors.find("b") == tensors.end()
|| tensors.find("c") == tensors.end()
|| tensors.find("ans") == tensors.end()) {
throw std::runtime_error("Invalid Test");
}
test->_attributes->a = tensors["a"];
test->_attributes->b = tensors["b"];
test->_attributes->c = tensors["c"];
test->_attributes->ans = tensors["ans"];
return test;
}
std::shared_ptr<infiniop_test::Result> Test::run(
infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
infiniopSwiGLUDescriptor_t op_desc;
auto a = _attributes->a->to(device, device_id);
auto b = _attributes->b->to(device, device_id);
auto c = _attributes->c->to(device, device_id);
CHECK_OR(infiniopCreateSwiGLUDescriptor(handle, &op_desc,
c->desc(),
a->desc(),
b->desc()),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
size_t workspace_size;
CHECK_OR(infiniopGetSwiGLUWorkspaceSize(op_desc, &workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
void *workspace;
CHECK_OR(infinirtMalloc(&workspace, workspace_size),
return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
CHECK_OR(infiniopSwiGLU(op_desc, workspace, workspace_size, c->data(), a->data(), b->data(), nullptr),
return TEST_FAILED(OP_CREATION_FAILED, "Failed during execution."));
try {
allClose(c, _attributes->ans, _rtol, _atol);
} catch (const std::exception &e) {
return TEST_FAILED(RESULT_INCORRECT, e.what());
}
double elapsed_time = 0.;
elapsed_time = benchmark(
[=]() {
infiniopSwiGLU(
op_desc,
workspace,
workspace_size,
c->data(),
a->data(),
b->data(),
nullptr);
},
warm_ups, iterations);
return TEST_PASSED(elapsed_time);
}
std::vector<std::string> Test::attribute_names() {
return {};
}
std::vector<std::string> Test::tensor_names() {
return {"a", "b", "c", "ans"};
}
std::vector<std::string> Test::output_names() {
return {"c"};
}
std::string Test::toString() const {
std::ostringstream oss;
oss << op_name() << std::endl;
oss << "- a: " << _attributes->a->info() << std::endl;
oss << "- b: " << _attributes->b->info() << std::endl;
oss << "- c: " << _attributes->c->info() << std::endl;
oss << std::scientific << std::setprecision(2);
oss << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
return oss.str();
}
Test::~Test() {
delete _attributes;
}
} // namespace infiniop_test::swiglu
@@ -98,20 +98,28 @@ void *Tensor::data() const {
 Tensor::Tensor(const GGUFTensorInfo *info,
                const void *ggml_ptr,
-               const GGUFKeyValue *strides_meta) {
+               const GGUFKeyValue *shape_meta,
+               const GGUFKeyValue *strides_meta,
+               bool isOutput) {
     _ggml_type = info->ggml_type;
     _offset = 0;
     size_t ndim = static_cast<size_t>(info->ndim);
+    // _shape stores the true tensor shape (from shape_meta); temp_shape stores the shape used for rearrange and for sizing the memory buffer
     _shape = std::vector<size_t>(ndim);
+    std::vector<size_t> temp_shape(ndim);
     _strides = std::vector<ptrdiff_t>(ndim);
     std::vector<ptrdiff_t> contiguous_strides(ndim);
     for (size_t i = 0; i < ndim; i++) {
-        _shape[i] = static_cast<size_t>(info->shape[ndim - 1 - i]);
+        temp_shape[i] = static_cast<size_t>(info->shape[ndim - 1 - i]);
         if (i == 0) {
             contiguous_strides[ndim - 1] = (ptrdiff_t)1;
         } else {
             contiguous_strides[ndim - 1 - i] = (ptrdiff_t)info->shape[i - 1] * contiguous_strides[ndim - i];
         }
+        if (isOutput) {
+            contiguous_strides[i] = (ptrdiff_t)0;
+        }
     }
 
     if (strides_meta == nullptr) {
@@ -120,7 +128,6 @@ Tensor::Tensor(const GGUFTensorInfo *info,
         }
     } else {
         for (size_t i = 0; i < ndim; i++) {
-            _shape[i] = static_cast<size_t>(info->shape[ndim - 1 - i]);
             if (strides_meta->gguf_type == GGUF_TYPE_INT64) {
                 _strides[i] = (ptrdiff_t)(reinterpret_cast<const int64_t *>(
                     strides_meta->value.data())[ndim - 1 - i]);
@@ -133,18 +140,62 @@ Tensor::Tensor(const GGUFTensorInfo *info,
         }
     }
 
-    infiniopCreateTensorDescriptor(&_desc, ndim, _shape.data(), _strides.data(), ggmlTypeToInfiniType(_ggml_type));
+    if (isOutput) {
+        if (shape_meta == nullptr) {
+            throw std::runtime_error("Error Creating Tensor: shape_meta cannot be null for output tensor");
+        }
+        for (size_t i = 0; i < ndim; i++) {
+            if (shape_meta->gguf_type == GGUF_TYPE_INT64) {
+                int64_t val = reinterpret_cast<const int64_t *>(shape_meta->value.data())[i];
+                if (val < 0) {
+                    throw std::runtime_error("Shape must be non-negative");
+                }
+                temp_shape[i] = static_cast<size_t>(val);
+            } else if (shape_meta->gguf_type == GGUF_TYPE_INT32) {
+                int32_t val = reinterpret_cast<const int32_t *>(shape_meta->value.data())[i];
+                if (val < 0) {
+                    throw std::runtime_error("Shape must be non-negative");
+                }
+                temp_shape[i] = static_cast<size_t>(val);
+            } else {
+                throw std::runtime_error("Error Creating Tensor: Unsupported shape type");
+            }
+        }
+    }
+
+    infiniopCreateTensorDescriptor(&_desc, ndim, temp_shape.data(), _strides.data(), ggmlTypeToInfiniType(_ggml_type));
     size_t size;
-    calculateTensorMemory(size, _offset, _shape, _strides, ggmlTypeSize(_ggml_type));
+    calculateTensorMemory(size, _offset, temp_shape, _strides, ggmlTypeSize(_ggml_type));
     _memory = std::make_shared<Memory>(size, INFINI_DEVICE_CPU, 0);
     utils::rearrange(
         (char *)_memory->ptr() + _offset,
         (char *)ggml_ptr + info->data_offset,
-        _shape.data(),
+        temp_shape.data(),
         _strides.data(),
         contiguous_strides.data(),
         ndim,
         ggmlTypeSize(_ggml_type));
+
+    if (shape_meta == nullptr) {
+        _shape = temp_shape;
+    } else {
+        for (size_t i = 0; i < ndim; i++) {
+            if (shape_meta->gguf_type == GGUF_TYPE_INT64) {
+                int64_t val = reinterpret_cast<const int64_t *>(shape_meta->value.data())[i];
+                if (val < 0) {
+                    throw std::runtime_error("Shape must be non-negative");
+                }
+                _shape[i] = static_cast<size_t>(val);
+            } else if (shape_meta->gguf_type == GGUF_TYPE_INT32) {
+                int32_t val = reinterpret_cast<const int32_t *>(shape_meta->value.data())[i];
+                if (val < 0) {
+                    throw std::runtime_error("Shape must be non-negative");
+                }
+                _shape[i] = static_cast<size_t>(val);
+            } else {
+                throw std::runtime_error("Error Creating Tensor: Unsupported shape type");
+            }
+        }
+    }
 }
 
 Tensor::Tensor(std::shared_ptr<Memory> memory, size_t offset,
...
@@ -90,14 +90,19 @@ std::shared_ptr<Result> runTest(const GGUFFileReader &gguf_reader,
             attrs[attr_name] = attr->second->value;
         }
     }
 
     for (auto tensor_name : builder.tensor_names) {
         auto info = tensor_info.find("test." + std::to_string(test_id) + "." + tensor_name);
         if (info != tensor_info.end()) {
+            auto shape = meta.find("test." + std::to_string(test_id) + "." + tensor_name + ".shape");
             auto strides = meta.find("test." + std::to_string(test_id) + "." + tensor_name + ".strides");
+            bool is_output = std::find(builder.output_names.begin(), builder.output_names.end(), tensor_name) != builder.output_names.end();
             tensors[tensor_name] = std::make_shared<Tensor>(
                 info->second.get(),
                 gguf_reader.getGgmlStart(),
-                strides != meta.end() ? strides->second.get() : nullptr);
+                shape != meta.end() ? shape->second.get() : nullptr,
+                strides != meta.end() ? strides->second.get() : nullptr,
+                is_output);
         }
     }
 
     std::shared_ptr<infiniop_test::base::Test> test;
...
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.16.0)
 
 # project information
 project(Ascend_C)
 set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type")
-set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME} CACHE PATH "ASCEND CANN package installation directory")
+set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH "ASCEND CANN package installation directory")
 set(RUN_MODE "npu" CACHE STRING "run mode: npu")
 set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
 set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
@@ -19,10 +19,13 @@ else()
 endif()
 
 include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
 
+include_directories(
+    ${CMAKE_SOURCE_DIR}/../../../../include/infiniop/
+)
+
 ascendc_library(ascend_kernels STATIC
-    ../../ops/swiglu/ascend/swiglu_kernel.cpp
-    ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp
-    ../../ops/random_sample/ascend/random_sample_kernel.cpp
+    ../../ops/swiglu/ascend/swiglu_ascend_kernel.cpp
+    ../../ops/rope/ascend/rope_ascend_kernel.cpp
+    # ../../ops/random_sample/ascend/random_sample_kernel.cpp
 )
#ifndef __INFINIOP_ASCEND_KERNEL_COMMON_H__
#define __INFINIOP_ASCEND_KERNEL_COMMON_H__

#include "../../../../include/infinicore.h"
#include "kernel_operator.h"

constexpr size_t BLOCK_NUM = 8;
constexpr size_t BUFFER_NUM = 2;
constexpr size_t BYTE_ALIGN = 32;

// Round a tile length up so that its size in bytes is a multiple of byte_align.
template <typename T>
__aicore__ inline size_t alignTileLen(size_t tile_len, size_t byte_align) {
    size_t bytes = tile_len * sizeof(T);
    size_t aligned_bytes = (bytes % byte_align == 0)
                             ? bytes
                             : (bytes + (byte_align - bytes % byte_align));
    return aligned_bytes / sizeof(T);
}

#endif
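A quick worked example of the rounding above: with T = float and the 32-byte alignment, a tile of 10 elements occupies 40 bytes, is padded to 64 bytes, and comes back as 16 elements. The host-side mirror below is illustrative only (the device version is identical apart from the __aicore__ qualifier):

// Host-side mirror of alignTileLen, for illustration (sketch, not project code).
#include <cstddef>
#include <cstdio>

template <typename T>
size_t alignTileLenHost(size_t tile_len, size_t byte_align) {
    size_t bytes = tile_len * sizeof(T);
    size_t aligned = (bytes % byte_align == 0) ? bytes
                                               : bytes + (byte_align - bytes % byte_align);
    return aligned / sizeof(T);
}

int main() {
    // 10 floats = 40 bytes -> padded to 64 bytes -> 16 floats
    std::printf("%zu\n", alignTileLenHost<float>(10, 32)); // prints 16
}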
#include "common_ascend.h" #include "common_ascend.h"
std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) { std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
auto index = std::max_element(strides.begin(), strides.end()); if (shape.size() != strides.size()) {
uint64_t max_stride_index = std::distance(strides.begin(), index); throw std::invalid_argument("Shape and strides must have the same length.");
auto storageShape = std::vector<int64_t>({shape[max_stride_index] * strides[max_stride_index]}); }
int64_t max_offset = 0;
for (size_t i = 0; i < shape.size(); ++i) {
max_offset += (shape[i] - 1) * strides[i];
}
return storageShape; // storage shape is 1D buffer that must cover all accessed elements
return {max_offset + 1};
} }
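The max-offset rule matters for transposed views. Take shape {3, 2} with strides {1, 2}: the old max-stride heuristic yields 2 * 2 = 4 elements, but element (2, 1) lives at offset 2*1 + 1*2 = 4, so the buffer must hold 5 elements. A standalone sketch of the computation (not project code):

// Why the max-offset rule is needed (illustrative sketch).
#include <cassert>
#include <cstdint>
#include <vector>

int64_t storageElems(const std::vector<int64_t> &shape, const std::vector<int64_t> &strides) {
    int64_t max_offset = 0;
    for (size_t i = 0; i < shape.size(); ++i) {
        max_offset += (shape[i] - 1) * strides[i];
    }
    return max_offset + 1; // highest touched offset, plus one
}

int main() {
    assert(storageElems({3, 2}, {1, 2}) == 5); // the old heuristic would say 4
}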
 size_t aclnnTensorDescriptor::numel() const {
@@ -18,7 +24,7 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, vo
     this->strides = std::vector<int64_t>(ndim);
     for (uint64_t i = 0; i < ndim; ++i) {
         this->shape[i] = static_cast<int64_t>(desc->dim(i));
-        this->strides[i] = desc->stride(i);
+        this->strides[i] = static_cast<int64_t>(desc->stride(i));
     }
     this->storageShape = inferStorageShape(this->shape, this->strides);
     this->dataType = toAclDataType(desc->dtype());
@@ -41,7 +47,12 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(aclDataType dtype, const std::vecto
     this->strides = strides;
     this->dataType = dtype;
     this->format = aclFormat::ACL_FORMAT_ND;
-    this->storageShape = inferStorageShape(this->shape, this->strides);
+    if (this->ndim != 0) {
+        this->storageShape = inferStorageShape(this->shape, this->strides);
+    } else {
+        // 0-dim (scalar) tensors have no strides to infer from
+        this->storageShape = shape;
+        this->storageNdim = 0;
+    }
     this->tensor = aclCreateTensor(this->shape.data(),
                                    this->ndim,
                                    this->dataType,
...
 #ifndef __INFINIOP_CUDA_COMMON_CUH__
 #define __INFINIOP_CUDA_COMMON_CUH__
 
+#include "../../reduce/cuda/reduce.cuh"
 #include "cuda_handle.cuh"
 #include "infinicore.h"
 
-#ifdef ENABLE_SUGON_CUDA_API
-#define INFINIOP_CUDA_KERNEL __launch_bounds__(512) __global__ void
-#else
-#define INFINIOP_CUDA_KERNEL __global__ void
-#endif
-
-// Possible maximum number of threads per block for CUDA architectures
-// Used for picking correct kernel launch configuration
-#define CUDA_BLOCK_SIZE_1024 1024
-#define CUDA_BLOCK_SIZE_512 512
-
 namespace device::cuda {
 
 cudnnDataType_t getCudnnDtype(infiniDtype_t dt);
 
-// return the memory offset of original tensor, given the flattened index of broadcasted tensor
-__forceinline__ __device__ __host__ size_t
-indexToReducedOffset(
-    size_t flat_index,
-    size_t ndim,
-    const ptrdiff_t *broadcasted_strides,
-    const ptrdiff_t *target_strides) {
-    size_t res = 0;
-    for (size_t i = 0; i < ndim; ++i) {
-        res += flat_index / broadcasted_strides[i] * target_strides[i];
-        flat_index %= broadcasted_strides[i];
-    }
-    return res;
-}
-
-// get the memory offset of the given element in a tensor given its flat index
-__forceinline__ __device__ __host__ size_t
-indexToOffset(
-    size_t flat_index,
-    size_t ndim,
-    const size_t *shape,
-    const ptrdiff_t *strides) {
-    size_t res = 0;
-    for (size_t i = ndim; i-- > 0;) {
-        res += (flat_index % shape[i]) * strides[i];
-        flat_index /= shape[i];
-    }
-    return res;
-}
-
 } // namespace device::cuda
 
 #endif // __INFINIOP_CUDA_COMMON_CUH__
#ifdef ENABLE_SUGON_CUDA_API
#define INFINIOP_CUDA_KERNEL __launch_bounds__(512) __global__ void
#else
#define INFINIOP_CUDA_KERNEL __global__ void
#endif

// Possible maximum numbers of threads per block for CUDA architectures.
// Used for picking the correct kernel launch configuration.
#define CUDA_BLOCK_SIZE_1024 1024
#define CUDA_BLOCK_SIZE_512 512

#define CHECK_CUDA(API) CHECK_INTERNAL(API, cudaSuccess)

namespace device::cuda {

// Return the memory offset in the original tensor, given the flat index
// into the broadcast tensor.
__forceinline__ __device__ __host__ size_t
indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const ptrdiff_t *broadcasted_strides,
    const ptrdiff_t *target_strides) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        res += flat_index / broadcasted_strides[i] * target_strides[i];
        flat_index %= broadcasted_strides[i];
    }
    return res;
}

// Get the memory offset of an element in a (possibly strided) tensor,
// given its flat index.
__forceinline__ __device__ __host__ size_t
indexToOffset(
    size_t flat_index,
    size_t ndim,
    const size_t *shape,
    const ptrdiff_t *strides) {
    size_t res = 0;
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i]) * strides[i];
        flat_index /= shape[i];
    }
    return res;
}

} // namespace device::cuda

#ifdef ENABLE_CUDA_API
#include <cuda_fp16.h>

__forceinline__ __device__ float
exp_(const float val) {
    return expf(val);
}

__forceinline__ __device__ long double
exp_(const long double val) {
    return expl(val);
}

__forceinline__ __device__ double
exp_(const double val) {
    return exp(val);
}

__forceinline__ __device__ __half
exp_(const __half x) {
    return hexp(x);
}
#endif
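To make the broadcast-offset helper above concrete: the "broadcasted_strides" are the contiguous (row-major) strides of the broadcast output shape, and the "target_strides" are the input's strides, with 0 on broadcast axes. A plain host-side illustration (sketch, no CUDA required):

// Host-side illustration of indexToReducedOffset (not project code).
// Output shape {2, 3} has contiguous strides {3, 1}; an input of shape {3}
// broadcast along axis 0 has strides {0, 1}. Flat output index 4 maps to
// coordinates (1, 1), which maps to input offset 1.
#include <cassert>
#include <cstddef>

size_t indexToReducedOffsetHost(size_t flat_index, size_t ndim,
                                const ptrdiff_t *broadcasted_strides,
                                const ptrdiff_t *target_strides) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        res += flat_index / broadcasted_strides[i] * target_strides[i];
        flat_index %= broadcasted_strides[i];
    }
    return res;
}

int main() {
    const ptrdiff_t out_strides[] = {3, 1}; // contiguous strides of shape {2, 3}
    const ptrdiff_t in_strides[] = {0, 1};  // stride 0 repeats the input along axis 0
    assert(indexToReducedOffsetHost(4, 2, out_strides, in_strides) == 1);
}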
@@ -16,7 +16,7 @@ typedef XPUStream kunlunStream_t;
 typedef XPUEvent kunlunEvent_t;
 typedef xdnn::Context *xdnnHandle_t;
 
-#define CHECK_XDNN(API) CHECK_INTERNAL(API, XPU_SUCCESS)
+#define CHECK_KUNLUN(API) CHECK_INTERNAL(API, XPU_SUCCESS)
 
 namespace device::kunlun {
...
#ifndef __INFINIOP_KUNLUN_KERNEL_COMMON_H__
#define __INFINIOP_KUNLUN_KERNEL_COMMON_H__

// This header is only included by .xpu files.
#include "kunlun_kernel_dtype.h"
#include "xpu/kernel/xtdk.h"
#include "xpu/kernel/xtdk_math.h"
#include "xpu/kernel/xtdk_simd.h"
#include "xpu/runtime.h"

namespace device::kunlun::kernel {

// Build a mask for KunLun XPU 512-bit register computation. When the data
// does not fill 512 bits, it is zero-padded and the mask marks the valid
// lanes: bits 0..i are set to 1 and all others to 0
// (e.g. i = 3 gives (1 << 4) - 1 = 0b1111).
inline __device__ float lowerBitMask(int i) {
    return (1 << (i + 1)) - 1;
}

// Atomic add on shared memory, used for reductions: read the current value
// into a register, add, and loop based on REG2SM_atomic's return flag.
inline __device__ void atomicAddF32(__shared_ptr__ float *ptr, float value) {
    int success = 1;
    while (success) {
        // SM2REG reads 32-bit data from shared memory into a register
        float a = SM2REG_atomic(ptr);
        a = a + value;
        success = REG2SM_atomic(ptr, a);
    }
}
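// The retry pattern above is the classic load/modify/conditional-store loop.
// For readers unfamiliar with the SM2REG/REG2SM intrinsics, here is the same
// idea expressed with standard C++ atomics (illustrative sketch only, not
// usable in XPU device code):
//
//     #include <atomic>
//
//     inline void atomicAddF32Host(std::atomic<float> &target, float value) {
//         float expected = target.load(std::memory_order_relaxed);
//         // retry until no other thread modified the value between the
//         // load and the store (compare_exchange_weak reloads `expected`
//         // with the current value on failure)
//         while (!target.compare_exchange_weak(expected, expected + value)) {
//         }
//     }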
inline __device__ size_t indexToReducedOffset(
    size_t flat_index,
    size_t ndim,
    const _ptrdiff_t *broadcasted_strides,
    const _ptrdiff_t *target_strides) {
    size_t res = 0;
    for (size_t i = 0; i < ndim; ++i) {
        res += flat_index / broadcasted_strides[i].value * target_strides[i].value;
        flat_index %= broadcasted_strides[i].value;
        mfence();
    }
    return res;
}

inline __device__ size_t indexToOffset(
    size_t flat_index,
    size_t ndim,
    const _size_t *shape,
    const _ptrdiff_t *strides) {
    size_t res = 0;
    for (size_t i = ndim; i-- > 0;) {
        res += (flat_index % shape[i].value) * strides[i].value;
        flat_index /= shape[i].value;
        mfence();
    }
    return res;
}

} // namespace device::kunlun::kernel

// TODO: atomicAddF16
// TODO: atomicAddI8

#endif