Merge remote-tracking branch 'origin/main' into issue/150

9b32b4b1 · Catheriany · 15bcbdfc · 4799ddbf · 9b32b4b1 · 9b32b4b1
Commit 9b32b4b1 authored Jun 04, 2025 by Catheriany
20 changed files
--- a/README.md
+++ b/README.md
@@ -175,6 +175,10 @@ options:
    {
        "clangd.arguments": [
            "--compile-commands-dir=.vscode"
-        ]
+        ],
+        "xmake.additionalConfigArguments": [
+            // 在这里配置 XMAKE_CONFIG_FLAGS
+            "--nv-gpu=y"
+        ],
    }
    ```
--- a/include/infiniop.h
+++ b/include/infiniop.h
@@ -6,6 +6,7 @@
 #include "infiniop/ops/attention.h"
 #include "infiniop/ops/avg_pool.h"
 #include "infiniop/ops/causal_softmax.h"
+#include "infiniop/ops/clip.h"
 #include "infiniop/ops/conv.h"
 #include "infiniop/ops/expand.h"
 #include "infiniop/ops/gemm.h"

--- a/include/infiniop/ops/clip.h
+++ b/include/infiniop/ops/clip.h
+#ifndef __INFINIOP_CLIP_API_H__
+#define __INFINIOP_CLIP_API_H__
+#include "../operator_descriptor.h"
+// Opaque handle to a Clip operator descriptor.
+typedef struct InfiniopDescriptor *infiniopClipDescriptor_t;
+// Creates a Clip descriptor for `handle` and stores it in `*desc_ptr`.
+// `y`, `x`, `min_val` and `max_val` are the tensor descriptors of the buffers
+// that are later passed, in the same order, to infiniopClip.
+// NOTE(review): presumably y = x limited element-wise to [min_val, max_val] —
+// confirm against the operator implementation.
+__C __export infiniStatus_t infiniopCreateClipDescriptor(infiniopHandle_t handle,
+                                                         infiniopClipDescriptor_t *desc_ptr,
+                                                         infiniopTensorDescriptor_t y,
+                                                         infiniopTensorDescriptor_t x,
+                                                         infiniopTensorDescriptor_t min_val,
+                                                         infiniopTensorDescriptor_t max_val);
+// Writes the workspace size required by `desc` to `*size`.
+__C __export infiniStatus_t infiniopGetClipWorkspaceSize(infiniopClipDescriptor_t desc, size_t *size);
+// Runs the Clip operator described by `desc`.
+// `workspace` / `workspace_size` describe the scratch buffer, `y` receives the
+// result, and `x`, `min_val`, `max_val` are read-only inputs.
+// NOTE(review): `stream` is presumably a device-specific execution stream —
+// confirm which values (e.g. null) are accepted.
+__C __export infiniStatus_t infiniopClip(infiniopClipDescriptor_t desc,
+                                         void *workspace,
+                                         size_t workspace_size,
+                                         void *y,
+                                         const void *x,
+                                         const void *min_val,
+                                         const void *max_val,
+                                         void *stream);
+// Destroys a descriptor obtained from infiniopCreateClipDescriptor.
+__C __export infiniStatus_t infiniopDestroyClipDescriptor(infiniopClipDescriptor_t desc);
+#endif
--- a/include/infiniop/ops/mul.h
+++ b/include/infiniop/ops/mul.h
@@ -6,10 +6,10 @@
 typedef struct InfiniopDescriptor *infiniopMulDescriptor_t;
 __C __export infiniStatus_t infiniopCreateMulDescriptor(infiniopHandle_t handle,
-                                                       infiniopMulDescriptor_t *desc_ptr,
+                                                        infiniopMulDescriptor_t *desc_ptr,
-                                                       infiniopTensorDescriptor_t c,
+                                                        infiniopTensorDescriptor_t c,
-                                                       infiniopTensorDescriptor_t a,
+                                                        infiniopTensorDescriptor_t a,
-                                                       infiniopTensorDescriptor_t b);
+                                                        infiniopTensorDescriptor_t b);
 __C __export infiniStatus_t infiniopGetMulWorkspaceSize(infiniopMulDescriptor_t desc, size_t *size);
@@ -20,7 +20,7 @@ __C __export infiniStatus_t infiniopMul(infiniopMulDescriptor_t desc,
                                        const void *a,
                                        const void *b,
                                        void *stream);
 __C __export infiniStatus_t infiniopDestroyMulDescriptor(infiniopMulDescriptor_t desc);
 #endif
--- a/scripts/python_test.py
+++ b/scripts/python_test.py
@@ -18,6 +18,7 @@ def run_tests(args):
        "rms_norm.py",
        "rope.py",
        "swiglu.py",
+        "attention.py",
    ]:
        result = subprocess.run(
            f"python {test} {args}", text=True, encoding="utf-8", shell=True

--- a/src/infiniop-test/include/ops.hpp
+++ b/src/infiniop-test/include/ops.hpp
@@ -9,6 +9,9 @@ DECLARE_INFINIOP_TEST(gemm)
 DECLARE_INFINIOP_TEST(random_sample)
 DECLARE_INFINIOP_TEST(mul)
 DECLARE_INFINIOP_TEST(rope)
+DECLARE_INFINIOP_TEST(clip)
+DECLARE_INFINIOP_TEST(swiglu)
+DECLARE_INFINIOP_TEST(add)
 #define REGISTER_INFINIOP_TEST(name)                      \
    {                                                     \
@@ -17,6 +20,7 @@ DECLARE_INFINIOP_TEST(rope)
            infiniop_test::name::Test::build,             \
            infiniop_test::name::Test::attribute_names(), \
            infiniop_test::name::Test::tensor_names(),    \
+            infiniop_test::name::Test::output_names(),    \
        }},
 /*
@@ -26,7 +30,10 @@ DECLARE_INFINIOP_TEST(rope)
    {                                         \
        REGISTER_INFINIOP_TEST(gemm)          \
        REGISTER_INFINIOP_TEST(random_sample) \
+        REGISTER_INFINIOP_TEST(add)           \
        REGISTER_INFINIOP_TEST(mul)           \
+        REGISTER_INFINIOP_TEST(clip)          \
+        REGISTER_INFINIOP_TEST(swiglu)        \
        REGISTER_INFINIOP_TEST(rope)          \
    }

--- a/src/infiniop-test/include/tensor.hpp
+++ b/src/infiniop-test/include/tensor.hpp
@@ -58,7 +58,9 @@ private:
 public:
    Tensor(const GGUFTensorInfo *info,
           const void *ggml_ptr,
-           const GGUFKeyValue *strides_meta = nullptr);
+           const GGUFKeyValue *shape_meta = nullptr,
+           const GGUFKeyValue *strides_meta = nullptr,
+           bool isOutput = false);
    Tensor(std::shared_ptr<Memory> memory, size_t offset,
           const std::vector<size_t> &shape,
           const std::vector<ptrdiff_t> &strides,

--- a/src/infiniop-test/include/test.hpp
+++ b/src/infiniop-test/include/test.hpp
@@ -92,6 +92,7 @@ public:
                                                                              \
        static std::vector<std::string> attribute_names();                    \
        static std::vector<std::string> tensor_names();                       \
+        static std::vector<std::string> output_names();                       \
                                                                              \
        std::shared_ptr<infiniop_test::Result> run(                           \
            infiniopHandle_t handle, infiniDevice_t device, int device_id,    \
@@ -121,6 +122,7 @@ struct TestBuilder {
    BuilderFunc build;
    std::vector<std::string> attribute_names;
    std::vector<std::string> tensor_names;
+    std::vector<std::string> output_names;
 };
 } // namespace infiniop_test

--- a/src/infiniop-test/src/ops/add.cpp
+++ b/src/infiniop-test/src/ops/add.cpp
+#include "ops.hpp"
+#include "utils.hpp"
+#include <infinirt.h>
+#include <iomanip>
+#include <iostream>
+namespace infiniop_test::add {
+// Tensors used by one add test case: inputs `a` and `b`, the output buffer
+// `c`, and the expected result `ans` that `c` is compared against.
+struct Test::Attributes {
+    std::shared_ptr<Tensor> a;
+    std::shared_ptr<Tensor> b;
+    std::shared_ptr<Tensor> c;
+    std::shared_ptr<Tensor> ans;
+};
+// Builds an add test from the tensors loaded for one test case.
+// Throws std::runtime_error("Invalid Test") when any tensor listed by
+// tensor_names() is missing. `attributes` is not read: add has no attributes.
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol) {
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
+    test->_attributes = new Attributes();
+    for (const auto &name : tensor_names()) {
+        if (tensors.find(name) == tensors.end()) {
+            throw std::runtime_error("Invalid Test");
+        }
+    }
+    test->_attributes->a = tensors["a"];
+    test->_attributes->b = tensors["b"];
+    test->_attributes->c = tensors["c"];
+    test->_attributes->ans = tensors["ans"];
+    return test;
+}
+// Runs the add operator once, checks `c` against `ans` within rtol/atol, then
+// benchmarks the operator with `warm_ups` warm-up runs and `iterations` timed
+// runs. Returns TEST_FAILED with the failing stage, or TEST_PASSED with the
+// time reported by benchmark().
+std::shared_ptr<infiniop_test::Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
+    auto a = _attributes->a->to(device, device_id);
+    auto b = _attributes->b->to(device, device_id);
+    auto c = _attributes->c->to(device, device_id);
+    infiniopAddDescriptor_t op_desc = nullptr;
+    void *workspace = nullptr;
+    // Releases the descriptor and the workspace on every exit path, including
+    // the early TEST_FAILED returns below. It holds references so that it sees
+    // the values assigned later; the benchmark lambda copies only the raw
+    // handles, never this guard.
+    struct Releaser {
+        infiniopAddDescriptor_t &desc;
+        void *&buffer;
+        ~Releaser() {
+            if (desc != nullptr) {
+                infiniopDestroyAddDescriptor(desc);
+            }
+            if (buffer != nullptr) {
+                infinirtFree(buffer);
+            }
+        }
+    } releaser{op_desc, workspace};
+    CHECK_OR(infiniopCreateAddDescriptor(handle, &op_desc,
+                                         c->desc(),
+                                         a->desc(),
+                                         b->desc()),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
+    size_t workspace_size = 0;
+    CHECK_OR(infiniopGetAddWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+    CHECK_OR(infiniopAdd(op_desc, workspace, workspace_size,
+                         c->data(),
+                         a->data(),
+                         b->data(),
+                         nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+    // allClose reports a mismatch by throwing; turn that into a test result.
+    try {
+        allClose(c, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+    double elapsed_time = benchmark(
+        [=]() {
+            infiniopAdd(
+                op_desc, workspace, workspace_size,
+                c->data(),
+                a->data(),
+                b->data(),
+                nullptr);
+        },
+        warm_ups, iterations);
+    return TEST_PASSED(elapsed_time);
+}
+// The add test reads no scalar attributes.
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+// Every tensor a test case must provide.
+std::vector<std::string> Test::tensor_names() {
+    return {"a", "b", "c", "ans"};
+}
+// Tensors that runTest loads with isOutput = true.
+std::vector<std::string> Test::output_names() {
+    return {"c"};
+}
+// Human-readable summary: operator name, tensor infos and tolerances.
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << '\n';
+    oss << "- a: " << _attributes->a->info() << '\n';
+    oss << "- b: " << _attributes->b->info() << '\n';
+    oss << "- c: " << _attributes->c->info() << '\n';
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << '\n';
+    return oss.str();
+}
+// Frees the Attributes allocated in build().
+Test::~Test() {
+    delete _attributes;
+}
+} // namespace infiniop_test::add
--- a/src/infiniop-test/src/ops/clip.cpp
+++ b/src/infiniop-test/src/ops/clip.cpp
+#include "ops.hpp"
+#include "utils.hpp"
+#include <infinirt.h>
+#include <iomanip>
+#include <iostream>
+namespace infiniop_test::clip {
+// Tensors used by one clip test case: input `x`, the bounds `min_val` and
+// `max_val`, the output buffer `y`, and the expected result `ans` that `y` is
+// compared against.
+struct Test::Attributes {
+    std::shared_ptr<Tensor> x;
+    std::shared_ptr<Tensor> min_val;
+    std::shared_ptr<Tensor> max_val;
+    std::shared_ptr<Tensor> y;
+    std::shared_ptr<Tensor> ans;
+};
+// Builds a clip test from the tensors loaded for one test case.
+// Throws std::runtime_error("Invalid Test") when any tensor listed by
+// tensor_names() is missing. `attributes` is not read: clip has no attributes.
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol) {
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
+    test->_attributes = new Attributes();
+    for (const auto &name : tensor_names()) {
+        if (tensors.find(name) == tensors.end()) {
+            throw std::runtime_error("Invalid Test");
+        }
+    }
+    test->_attributes->x = tensors["x"];
+    test->_attributes->min_val = tensors["min_val"];
+    test->_attributes->max_val = tensors["max_val"];
+    test->_attributes->y = tensors["y"];
+    test->_attributes->ans = tensors["ans"];
+    return test;
+}
+// Runs the clip operator once, checks `y` against `ans` within rtol/atol, then
+// benchmarks the operator with `warm_ups` warm-up runs and `iterations` timed
+// runs. Returns TEST_FAILED with the failing stage, or TEST_PASSED with the
+// time reported by benchmark().
+std::shared_ptr<infiniop_test::Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
+    auto x = _attributes->x->to(device, device_id);
+    auto min_val = _attributes->min_val->to(device, device_id);
+    auto max_val = _attributes->max_val->to(device, device_id);
+    auto y = _attributes->y->to(device, device_id);
+    infiniopClipDescriptor_t op_desc = nullptr;
+    void *workspace = nullptr;
+    // Releases the descriptor and the workspace on every exit path, including
+    // the early TEST_FAILED returns below. It holds references so that it sees
+    // the values assigned later; the benchmark lambda copies only the raw
+    // handles, never this guard.
+    struct Releaser {
+        infiniopClipDescriptor_t &desc;
+        void *&buffer;
+        ~Releaser() {
+            if (desc != nullptr) {
+                infiniopDestroyClipDescriptor(desc);
+            }
+            if (buffer != nullptr) {
+                infinirtFree(buffer);
+            }
+        }
+    } releaser{op_desc, workspace};
+    CHECK_OR(infiniopCreateClipDescriptor(handle, &op_desc,
+                                          y->desc(),
+                                          x->desc(),
+                                          min_val->desc(),
+                                          max_val->desc()),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create clip descriptor."));
+    size_t workspace_size = 0;
+    CHECK_OR(infiniopGetClipWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+    CHECK_OR(infiniopClip(op_desc, workspace, workspace_size,
+                          y->data(),
+                          x->data(),
+                          min_val->data(),
+                          max_val->data(),
+                          nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+    // allClose reports a mismatch by throwing; turn that into a test result.
+    try {
+        allClose(y, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+    double elapsed_time = benchmark(
+        [=]() {
+            infiniopClip(
+                op_desc, workspace, workspace_size,
+                y->data(),
+                x->data(),
+                min_val->data(),
+                max_val->data(),
+                nullptr);
+        },
+        warm_ups, iterations);
+    return TEST_PASSED(elapsed_time);
+}
+// The clip test reads no scalar attributes.
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+// Every tensor a test case must provide.
+std::vector<std::string> Test::tensor_names() {
+    return {"x", "min_val", "max_val", "y", "ans"};
+}
+// Tensors that runTest loads with isOutput = true.
+std::vector<std::string> Test::output_names() {
+    return {"y"};
+}
+// Human-readable summary: operator name, tensor infos and tolerances.
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << '\n';
+    oss << "- x: " << _attributes->x->info() << '\n';
+    oss << "- min_val: " << _attributes->min_val->info() << '\n';
+    oss << "- max_val: " << _attributes->max_val->info() << '\n';
+    oss << "- y: " << _attributes->y->info() << '\n';
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << '\n';
+    return oss.str();
+}
+// Frees the Attributes allocated in build().
+Test::~Test() {
+    delete _attributes;
+}
+} // namespace infiniop_test::clip
--- a/src/infiniop-test/src/ops/gemm.cpp
+++ b/src/infiniop-test/src/ops/gemm.cpp
@@ -113,6 +113,10 @@ std::vector<std::string> Test::tensor_names() {
    return {"a", "b", "c", "ans"};
 }
+// Names of the tensors that runTest constructs with isOutput = true.
+// The list is empty here, so every gemm tensor is loaded as a regular tensor.
+std::vector<std::string> Test::output_names() {
+    return {};
+}
 std::string Test::toString() const {
    std::ostringstream oss;
    oss << op_name() << std::endl;

--- a/src/infiniop-test/src/ops/mul.cpp
+++ b/src/infiniop-test/src/ops/mul.cpp
@@ -87,6 +87,10 @@ std::vector<std::string> Test::tensor_names() {
    return {"a", "b", "c", "ans"};
 }
+// Names of the tensors that runTest constructs with isOutput = true.
+std::vector<std::string> Test::output_names() {
+    return {"c"};
+}
 std::string Test::toString() const {
    std::ostringstream oss;
    oss << op_name() << std::endl;

--- a/src/infiniop-test/src/ops/random_sample.cpp
+++ b/src/infiniop-test/src/ops/random_sample.cpp
@@ -109,6 +109,10 @@ std::vector<std::string> Test::tensor_names() {
    return {"data", "ans", "result"};
 }
+// Names of the tensors that runTest constructs with isOutput = true.
+std::vector<std::string> Test::output_names() {
+    return {"result"};
+}
 std::string Test::toString() const {
    std::ostringstream oss;
    oss << op_name() << std::endl;

--- a/src/infiniop-test/src/ops/swiglu.cpp
+++ b/src/infiniop-test/src/ops/swiglu.cpp
+#include "ops.hpp"
+#include "utils.hpp"
+#include <infinirt.h>
+#include <iomanip>
+#include <iostream>
+namespace infiniop_test::swiglu {
+// Tensors used by one swiglu test case: inputs `a` and `b`, the expected
+// result `ans`, and the output buffer `c` that is compared against `ans`.
+struct Test::Attributes {
+    std::shared_ptr<Tensor> a;
+    std::shared_ptr<Tensor> b;
+    std::shared_ptr<Tensor> ans;
+    std::shared_ptr<Tensor> c;
+};
+// Builds a swiglu test from the tensors loaded for one test case.
+// Throws std::runtime_error("Invalid Test") when any tensor listed by
+// tensor_names() is missing. `attributes` is not read: swiglu has no
+// attributes.
+std::shared_ptr<Test> Test::build(
+    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
+    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
+    double rtol, double atol) {
+    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
+    test->_attributes = new Attributes();
+    for (const auto &name : tensor_names()) {
+        if (tensors.find(name) == tensors.end()) {
+            throw std::runtime_error("Invalid Test");
+        }
+    }
+    test->_attributes->a = tensors["a"];
+    test->_attributes->b = tensors["b"];
+    test->_attributes->c = tensors["c"];
+    test->_attributes->ans = tensors["ans"];
+    return test;
+}
+// Runs the SwiGLU operator once, checks `c` against `ans` within rtol/atol,
+// then benchmarks the operator with `warm_ups` warm-up runs and `iterations`
+// timed runs. Returns TEST_FAILED with the failing stage, or TEST_PASSED with
+// the time reported by benchmark().
+std::shared_ptr<infiniop_test::Result> Test::run(
+    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
+    auto a = _attributes->a->to(device, device_id);
+    auto b = _attributes->b->to(device, device_id);
+    auto c = _attributes->c->to(device, device_id);
+    infiniopSwiGLUDescriptor_t op_desc = nullptr;
+    void *workspace = nullptr;
+    // Releases the descriptor and the workspace on every exit path, including
+    // the early TEST_FAILED returns below. It holds references so that it sees
+    // the values assigned later; the benchmark lambda copies only the raw
+    // handles, never this guard.
+    struct Releaser {
+        infiniopSwiGLUDescriptor_t &desc;
+        void *&buffer;
+        ~Releaser() {
+            if (desc != nullptr) {
+                infiniopDestroySwiGLUDescriptor(desc);
+            }
+            if (buffer != nullptr) {
+                infinirtFree(buffer);
+            }
+        }
+    } releaser{op_desc, workspace};
+    CHECK_OR(infiniopCreateSwiGLUDescriptor(handle, &op_desc,
+                                            c->desc(),
+                                            a->desc(),
+                                            b->desc()),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
+    size_t workspace_size = 0;
+    CHECK_OR(infiniopGetSwiGLUWorkspaceSize(op_desc, &workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
+    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
+             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
+    // A failure here is an execution failure, matching the add and clip tests.
+    CHECK_OR(infiniopSwiGLU(op_desc, workspace, workspace_size, c->data(), a->data(), b->data(), nullptr),
+             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
+    // allClose reports a mismatch by throwing; turn that into a test result.
+    try {
+        allClose(c, _attributes->ans, _rtol, _atol);
+    } catch (const std::exception &e) {
+        return TEST_FAILED(RESULT_INCORRECT, e.what());
+    }
+    double elapsed_time = benchmark(
+        [=]() {
+            infiniopSwiGLU(
+                op_desc,
+                workspace,
+                workspace_size,
+                c->data(),
+                a->data(),
+                b->data(),
+                nullptr);
+        },
+        warm_ups, iterations);
+    return TEST_PASSED(elapsed_time);
+}
+// The swiglu test reads no scalar attributes.
+std::vector<std::string> Test::attribute_names() {
+    return {};
+}
+// Every tensor a test case must provide.
+std::vector<std::string> Test::tensor_names() {
+    return {"a", "b", "c", "ans"};
+}
+// Tensors that runTest loads with isOutput = true.
+std::vector<std::string> Test::output_names() {
+    return {"c"};
+}
+// Human-readable summary: operator name, tensor infos and tolerances.
+std::string Test::toString() const {
+    std::ostringstream oss;
+    oss << op_name() << '\n';
+    oss << "- a: " << _attributes->a->info() << '\n';
+    oss << "- b: " << _attributes->b->info() << '\n';
+    oss << "- c: " << _attributes->c->info() << '\n';
+    oss << std::scientific << std::setprecision(2);
+    oss << "- rtol=" << _rtol << ", atol=" << _atol << '\n';
+    return oss.str();
+}
+// Frees the Attributes allocated in build().
+Test::~Test() {
+    delete _attributes;
+}
+} // namespace infiniop_test::swiglu
--- a/src/infiniop-test/src/tensor.cpp
+++ b/src/infiniop-test/src/tensor.cpp
@@ -98,20 +98,28 @@ void *Tensor::data() const {
 Tensor::Tensor(const GGUFTensorInfo *info,
               const void *ggml_ptr,
-               const GGUFKeyValue *strides_meta) {
+               const GGUFKeyValue *shape_meta,
+               const GGUFKeyValue *strides_meta,
+               bool isOutput) {
    _ggml_type = info->ggml_type;
    _offset = 0;
    size_t ndim = static_cast<size_t>(info->ndim);
+    // `_shape`存储真实的tensor形状（来自shape_meta），`temp_shape`存储用于rearrange和计算内存的tensor形状
    _shape = std::vector<size_t>(ndim);
+    std::vector<size_t> temp_shape(ndim);
    _strides = std::vector<ptrdiff_t>(ndim);
    std::vector<ptrdiff_t> contiguous_strides(ndim);
    for (size_t i = 0; i < ndim; i++) {
-        _shape[i] = static_cast<size_t>(info->shape[ndim - 1 - i]);
+        temp_shape[i] = static_cast<size_t>(info->shape[ndim - 1 - i]);
        if (i == 0) {
            contiguous_strides[ndim - 1] = (ptrdiff_t)1;
        } else {
            contiguous_strides[ndim - 1 - i] = (ptrdiff_t)info->shape[i - 1] * contiguous_strides[ndim - i];
        }
+        if (isOutput) {
+            contiguous_strides[i] = (ptrdiff_t)0;
+        }
    }
    if (strides_meta == nullptr) {
@@ -120,7 +128,6 @@ Tensor::Tensor(const GGUFTensorInfo *info,
        }
    } else {
        for (size_t i = 0; i < ndim; i++) {
-            _shape[i] = static_cast<size_t>(info->shape[ndim - 1 - i]);
            if (strides_meta->gguf_type == GGUF_TYPE_INT64) {
                _strides[i] = (ptrdiff_t)(reinterpret_cast<const int64_t *>(
                    strides_meta->value.data())[ndim - 1 - i]);
@@ -133,18 +140,62 @@ Tensor::Tensor(const GGUFTensorInfo *info,
        }
    }
-    infiniopCreateTensorDescriptor(&_desc, ndim, _shape.data(), _strides.data(), ggmlTypeToInfiniType(_ggml_type));
+    if (isOutput) {
+        if (shape_meta == nullptr) {
+            throw std::runtime_error("Error Creating Tensor: shape_meta cannot be null for output tensor");
+        }
+        for (size_t i = 0; i < ndim; i++) {
+            if (shape_meta->gguf_type == GGUF_TYPE_INT64) {
+                int64_t val = reinterpret_cast<const int64_t *>(shape_meta->value.data())[i];
+                if (val < 0) {
+                    throw std::runtime_error("Shape must be non-negative");
+                }
+                temp_shape[i] = static_cast<size_t>(val);
+            } else if (shape_meta->gguf_type == GGUF_TYPE_INT32) {
+                int32_t val = reinterpret_cast<const int32_t *>(shape_meta->value.data())[i];
+                if (val < 0) {
+                    throw std::runtime_error("Shape must be non-negative");
+                }
+                temp_shape[i] = static_cast<size_t>(val);
+            } else {
+                throw std::runtime_error("Error Creating Tensor: Unsupported shape type");
+            }
+        }
+    }
+    infiniopCreateTensorDescriptor(&_desc, ndim, temp_shape.data(), _strides.data(), ggmlTypeToInfiniType(_ggml_type));
    size_t size;
-    calculateTensorMemory(size, _offset, _shape, _strides, ggmlTypeSize(_ggml_type));
+    calculateTensorMemory(size, _offset, temp_shape, _strides, ggmlTypeSize(_ggml_type));
    _memory = std::make_shared<Memory>(size, INFINI_DEVICE_CPU, 0);
    utils::rearrange(
        (char *)_memory->ptr() + _offset,
        (char *)ggml_ptr + info->data_offset,
-        _shape.data(),
+        temp_shape.data(),
        _strides.data(),
        contiguous_strides.data(),
        ndim,
        ggmlTypeSize(_ggml_type));
+    if (shape_meta == nullptr) {
+        _shape = temp_shape;
+    } else {
+        for (size_t i = 0; i < ndim; i++) {
+            if (shape_meta->gguf_type == GGUF_TYPE_INT64) {
+                int64_t val = reinterpret_cast<const int64_t *>(shape_meta->value.data())[i];
+                if (val < 0) {
+                    throw std::runtime_error("Shape must be non-negative");
+                }
+                _shape[i] = static_cast<size_t>(val);
+            } else if (shape_meta->gguf_type == GGUF_TYPE_INT32) {
+                int32_t val = reinterpret_cast<const int32_t *>(shape_meta->value.data())[i];
+                if (val < 0) {
+                    throw std::runtime_error("Shape must be non-negative");
+                }
+                _shape[i] = static_cast<size_t>(val);
+            } else {
+                throw std::runtime_error("Error Creating Tensor: Unsupported shape type");
+            }
+        }
+    }
 }
 Tensor::Tensor(std::shared_ptr<Memory> memory, size_t offset,

--- a/src/infiniop-test/src/test.cpp
+++ b/src/infiniop-test/src/test.cpp
@@ -90,14 +90,19 @@ std::shared_ptr<Result> runTest(const GGUFFileReader &gguf_reader,
                attrs[attr_name] = attr->second->value;
            }
        }
        for (auto tensor_name : builder.tensor_names) {
            auto info = tensor_info.find("test." + std::to_string(test_id) + "." + tensor_name);
            if (info != tensor_info.end()) {
+                auto shape = meta.find("test." + std::to_string(test_id) + "." + tensor_name + ".shape");
                auto strides = meta.find("test." + std::to_string(test_id) + "." + tensor_name + ".strides");
+                bool is_output = std::find(builder.output_names.begin(), builder.output_names.end(), tensor_name) != builder.output_names.end();
                tensors[tensor_name] = std::make_shared<Tensor>(
                    info->second.get(),
                    gguf_reader.getGgmlStart(),
-                    strides != meta.end() ? strides->second.get() : nullptr);
+                    shape != meta.end() ? shape->second.get() : nullptr,
+                    strides != meta.end() ? strides->second.get() : nullptr,
+                    is_output);
            }
        }
        std::shared_ptr<infiniop_test::base::Test> test;

--- a/src/infiniop/devices/ascend/CMakeLists.txt
+++ b/src/infiniop/devices/ascend/CMakeLists.txt
@@ -3,7 +3,7 @@ cmake_minimum_required(VERSION 3.16.0)
 # project information
 project(Ascend_C)
 set(SOC_VERSION "Ascend910B3" CACHE STRING "system on chip type")
-set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_HOME} CACHE PATH "ASCEND CANN package installation directory")
+set(ASCEND_CANN_PACKAGE_PATH $ENV{ASCEND_TOOLKIT_HOME} CACHE PATH "ASCEND CANN package installation directory")
 set(RUN_MODE "npu" CACHE STRING "run mode: npu")
 set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Build type Release/Debug (default Debug)" FORCE)
 set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE)
@@ -19,10 +19,13 @@ else()
 endif()
 include(${ASCENDC_CMAKE_DIR}/ascendc.cmake)
+include_directories(
+    ${CMAKE_SOURCE_DIR}/../../../../include/infiniop/ 
+)
 ascendc_library(ascend_kernels STATIC
-    ../../ops/swiglu/ascend/swiglu_kernel.cpp
+    ../../ops/swiglu/ascend/swiglu_ascend_kernel.cpp
-    ../../ops/rotary_embedding/ascend/rotary_embedding_kernel.cpp
+    ../../ops/rope/ascend/rope_ascend_kernel.cpp
-    ../../ops/random_sample/ascend/random_sample_kernel.cpp
+    # ../../ops/random_sample/ascend/random_sample_kernel.cpp
 )
--- a/src/infiniop/devices/ascend/ascend_kernel_common.h
+++ b/src/infiniop/devices/ascend/ascend_kernel_common.h
+#ifndef __INFINIOP_ASCEND_KERNEL_COMMON_H__
+#define __INFINIOP_ASCEND_KERNEL_COMMON_H__
+#include "../../../../include/infinicore.h"
+#include "kernel_operator.h"
+// NOTE(review): presumably the number of blocks a kernel is launched with — confirm against the kernel launch code.
+constexpr size_t BLOCK_NUM = 8;
+// NOTE(review): presumably the number of buffers per queue (2 would mean double buffering) — confirm.
+constexpr size_t BUFFER_NUM = 2;
+// NOTE(review): presumably the byte alignment passed to alignTileLen — confirm against callers.
+constexpr size_t BYTE_ALIGN = 32;
+// Pads a tile of `tile_len` elements of type T so that its size in bytes is a
+// multiple of `byte_align`, and returns the padded length in elements
+// (padded byte count divided by sizeof(T), using integer division).
+template <typename T>
+__aicore__ inline size_t alignTileLen(size_t tile_len, size_t byte_align) {
+    const size_t raw_bytes = tile_len * sizeof(T);
+    const size_t remainder = raw_bytes % byte_align;
+    // Already aligned: no padding; otherwise pad up to the next multiple.
+    const size_t padding = (remainder == 0) ? 0 : (byte_align - remainder);
+    return (raw_bytes + padding) / sizeof(T);
+}
+#endif
--- a/src/infiniop/devices/ascend/common_ascend.cc
+++ b/src/infiniop/devices/ascend/common_ascend.cc
 #include "common_ascend.h"
 std::vector<int64_t> inferStorageShape(std::vector<int64_t> shape, std::vector<int64_t> strides) {
-    auto index = std::max_element(strides.begin(), strides.end());
+    if (shape.size() != strides.size()) {
-    uint64_t max_stride_index = std::distance(strides.begin(), index);
+        throw std::invalid_argument("Shape and strides must have the same length.");
-    auto storageShape = std::vector<int64_t>({shape[max_stride_index] * strides[max_stride_index]});
+    }
+    int64_t max_offset = 0;
+    for (size_t i = 0; i < shape.size(); ++i) {
+        max_offset += (shape[i] - 1) * strides[i];
+    }
-    return storageShape;
+    // storage shape is 1D buffer that must cover all accessed elements
+    return {max_offset + 1};
 }
 size_t aclnnTensorDescriptor::numel() const {
@@ -18,7 +24,7 @@ aclnnTensorDescriptor::aclnnTensorDescriptor(infiniopTensorDescriptor_t desc, vo
    this->strides = std::vector<int64_t>(ndim);
    for (uint64_t i = 0; i < ndim; ++i) {
        this->shape[i] = static_cast<int64_t>(desc->dim(i));
-        this->strides[i] = desc->stride(i);
+        this->strides[i] = static_cast<int64_t>(desc->stride(i));
    }
    this->storageShape = inferStorageShape(this->shape, this->strides);
    this->dataType = toAclDataType(desc->dtype());

--- a/src/infiniop/devices/kunlun/kunlun_handle.h
+++ b/src/infiniop/devices/kunlun/kunlun_handle.h
@@ -16,7 +16,7 @@ typedef XPUStream kunlunStream_t;
 typedef XPUEvent kunlunEvent_t;
 typedef xdnn::Context *xdnnHandle_t;
-#define CHECK_XDNN(API) CHECK_INTERNAL(API, XPU_SUCCESS)
+#define CHECK_KUNLUN(API) CHECK_INTERNAL(API, XPU_SUCCESS)
 namespace device::kunlun {