".github/git@developer.sourcefind.cn:tsoc/superbenchmark.git" did not exist on "81a4146bc1c5f10f1f8cd6862db4f9524966d705"
Unverified Commit e950d3fd authored by PanZezhong1725's avatar PanZezhong1725 Committed by GitHub
Browse files

Merge pull request #94 from PanZezhong1725/issue/68

issue/68 测试框架,matmul测例生成 (test framework; matmul test-case generation)
parents b394e3d6 b03d744c
...@@ -41,6 +41,10 @@ jobs: ...@@ -41,6 +41,10 @@ jobs:
if: matrix.os != 'windows-latest' if: matrix.os != 'windows-latest'
run: xmake install run: xmake install
- name: build infiniop-test
if: matrix.os != 'windows-latest'
run: xmake build infiniop-test
- name: python test - name: python test
if: matrix.os != 'windows-latest' if: matrix.os != 'windows-latest'
run: | run: |
......
...@@ -19,3 +19,6 @@ cache/ ...@@ -19,3 +19,6 @@ cache/
# JSON # JSON
*.json *.json
#GGUF
*.gguf
#ifndef __INFINIOPTEST_FILE_MAPPING_HPP__
#define __INFINIOPTEST_FILE_MAPPING_HPP__
#ifdef _WIN32 // windows
#include <windows.h>
#else // linux
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
#include <cstddef>
#include <memory>
#include <string>
// RAII wrapper that memory-maps a file read-only.
// Uses CreateFileMapping/MapViewOfFile on Windows and mmap on POSIX;
// the mapping and any handles are released in the destructor.
class FileMapping {
private:
    void *_ptr;   // start of the mapped view
    size_t _size; // mapped size in bytes
#ifdef _WIN32
    HANDLE _file_handle = NULL;  // file handle, closed in dtor
    HANDLE _file_mapping = NULL; // mapping-object handle, closed in dtor
#endif

public:
    // Maps `filepath` read-only; throws std::runtime_error on failure.
    FileMapping(const std::string &filepath);
    ~FileMapping();
    void *ptr() const;   // base address of the mapping
    size_t size() const; // size of the mapping in bytes
};
#endif // __INFINIOPTEST_FILE_MAPPING_HPP__
#ifndef __INFINIOPTEST_GGUF_HPP__
#define __INFINIOPTEST_GGUF_HPP__
#include "file_mapping.hpp"
#include <cstdint>
#include <memory>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
#ifdef _WIN32
#include <windows.h>
#endif
// Value types used for GGUF metadata key/value records.
// Numeric values follow the GGUF specification and must not change.
typedef enum {
    GGUF_TYPE_UINT8 = 0,
    GGUF_TYPE_INT8 = 1,
    GGUF_TYPE_UINT16 = 2,
    GGUF_TYPE_INT16 = 3,
    GGUF_TYPE_UINT32 = 4,
    GGUF_TYPE_INT32 = 5,
    GGUF_TYPE_FLOAT32 = 6,
    GGUF_TYPE_BOOL = 7,
    GGUF_TYPE_STRING = 8,
    GGUF_TYPE_ARRAY = 9,
    GGUF_TYPE_UINT64 = 10,
    GGUF_TYPE_INT64 = 11,
    GGUF_TYPE_FLOAT64 = 12,
    GGUF_TYPE_COUNT, // marks the end of the enum
} GGUF_TYPE;
// Display names for GGUF_TYPE values, indexed by the enum value.
// Must stay in sync with GGUF_TYPE above.
constexpr const char *GGUF_TYPE_NAME[GGUF_TYPE_COUNT] = {
    "GGUF_TYPE_UINT8",
    "GGUF_TYPE_INT8",
    "GGUF_TYPE_UINT16",
    "GGUF_TYPE_INT16",
    "GGUF_TYPE_UINT32",
    "GGUF_TYPE_INT32",
    "GGUF_TYPE_FLOAT32",
    "GGUF_TYPE_BOOL",
    "GGUF_TYPE_STRING",
    "GGUF_TYPE_ARRAY",
    "GGUF_TYPE_UINT64",
    "GGUF_TYPE_INT64",
    "GGUF_TYPE_FLOAT64",
};
// On-disk layout of a GGUF string: 64-bit length followed by a data
// pointer. Used here only for sizeof() in GGUF_TYPE_SIZE.
struct gguf_str {
    uint64_t n;  // byte length (string is NOT nul-terminated)
    char *data;
};
// Size in bytes of one value of each GGUF_TYPE, indexed by enum value.
// GGUF_TYPE_ARRAY has no fixed element size (marked 0).
static const size_t GGUF_TYPE_SIZE[GGUF_TYPE_COUNT] = {
    sizeof(uint8_t),  // GGUF_TYPE_UINT8
    sizeof(int8_t),   // GGUF_TYPE_INT8
    sizeof(uint16_t), // GGUF_TYPE_UINT16
    sizeof(int16_t),  // GGUF_TYPE_INT16
    sizeof(uint32_t), // GGUF_TYPE_UINT32
    sizeof(int32_t),  // GGUF_TYPE_INT32
    sizeof(float),    // GGUF_TYPE_FLOAT32
    sizeof(bool),     // GGUF_TYPE_BOOL
    sizeof(gguf_str), // GGUF_TYPE_STRING
    0,                // GGUF_TYPE_ARRAY (undefined)
    sizeof(uint64_t), // GGUF_TYPE_UINT64
    sizeof(int64_t),  // GGUF_TYPE_INT64
    sizeof(double),   // GGUF_TYPE_FLOAT64
};
// Render a single fixed-size GGUF value at `data` as decimal text.
// GGUF_TYPE_ARRAY is rejected (callers must iterate elements themselves);
// unknown types yield a placeholder string. Note: for GGUF_TYPE_STRING
// this prints the numeric value of the first byte, matching the
// fixed-size treatment of the other cases.
inline std::string ggufDataToString(const uint8_t *data, GGUF_TYPE gguf_type) {
    switch (gguf_type) {
    case GGUF_TYPE_UINT8:
        return std::to_string(*reinterpret_cast<const uint8_t *>(data));
    case GGUF_TYPE_INT8:
        return std::to_string(*reinterpret_cast<const int8_t *>(data));
    case GGUF_TYPE_UINT16:
        return std::to_string(*reinterpret_cast<const uint16_t *>(data));
    case GGUF_TYPE_INT16:
        return std::to_string(*reinterpret_cast<const int16_t *>(data));
    case GGUF_TYPE_UINT32:
        return std::to_string(*reinterpret_cast<const uint32_t *>(data));
    case GGUF_TYPE_INT32:
        return std::to_string(*reinterpret_cast<const int32_t *>(data));
    case GGUF_TYPE_FLOAT32:
        return std::to_string(*reinterpret_cast<const float *>(data));
    case GGUF_TYPE_BOOL:
        return std::to_string(*reinterpret_cast<const bool *>(data));
    case GGUF_TYPE_UINT64:
        return std::to_string(*reinterpret_cast<const uint64_t *>(data));
    case GGUF_TYPE_INT64:
        return std::to_string(*reinterpret_cast<const int64_t *>(data));
    case GGUF_TYPE_FLOAT64:
        return std::to_string(*reinterpret_cast<const double *>(data));
    case GGUF_TYPE_STRING:
        return std::to_string(*reinterpret_cast<const char *>(data));
    case GGUF_TYPE_ARRAY:
        throw std::runtime_error("GGUF_TYPE_ARRAY should be processed element by element");
    default:
        return "GGUF_TYPE_UNKNOWN";
    }
}
// One parsed GGUF metadata record. For arrays, `gguf_type` holds the
// ELEMENT type and `value` holds all elements back-to-back (see
// GGUFFileReader::readMetaKVs).
struct GGUFKeyValue {
    std::string key;
    GGUF_TYPE gguf_type;        // value (or array-element) type
    std::vector<uint8_t> value; // raw bytes of the value(s)
    std::string toString() const;
};
// Tensor element types as defined by ggml. Numeric values follow the
// ggml/GGUF specification and must not change. Note the gaps at 4-5 and
// 31-33: those values are unassigned.
typedef enum {
    GGML_TYPE_F32 = 0,
    GGML_TYPE_F16 = 1,
    GGML_TYPE_Q4_0 = 2,
    GGML_TYPE_Q4_1 = 3,
    GGML_TYPE_Q5_0 = 6,
    GGML_TYPE_Q5_1 = 7,
    GGML_TYPE_Q8_0 = 8,
    GGML_TYPE_Q8_1 = 9,
    GGML_TYPE_Q2_K = 10,
    GGML_TYPE_Q3_K = 11,
    GGML_TYPE_Q4_K = 12,
    GGML_TYPE_Q5_K = 13,
    GGML_TYPE_Q6_K = 14,
    GGML_TYPE_Q8_K = 15,
    GGML_TYPE_IQ2_XXS = 16,
    GGML_TYPE_IQ2_XS = 17,
    GGML_TYPE_IQ3_XXS = 18,
    GGML_TYPE_IQ1_S = 19,
    GGML_TYPE_IQ4_NL = 20,
    GGML_TYPE_IQ3_S = 21,
    GGML_TYPE_IQ2_S = 22,
    GGML_TYPE_IQ4_XS = 23,
    GGML_TYPE_I8 = 24,
    GGML_TYPE_I16 = 25,
    GGML_TYPE_I32 = 26,
    GGML_TYPE_I64 = 27,
    GGML_TYPE_F64 = 28,
    GGML_TYPE_IQ1_M = 29,
    GGML_TYPE_BF16 = 30,
    GGML_TYPE_TQ1_0 = 34,
    GGML_TYPE_TQ2_0 = 35,
    GGML_TYPE_COUNT = 36,
} GGML_TYPE;
/// Size in bytes of a single element of the given GGML type.
/// Only plain (non-quantized) types are supported.
/// @throws std::runtime_error for quantized/unsupported types.
inline size_t ggmlTypeSize(GGML_TYPE ggml_type) {
    switch (ggml_type) {
    case GGML_TYPE_I8:
        return 1;
    case GGML_TYPE_F16:
    case GGML_TYPE_BF16:
    case GGML_TYPE_I16:
        return 2;
    case GGML_TYPE_F32:
    case GGML_TYPE_I32:
        return 4;
    case GGML_TYPE_F64:
    case GGML_TYPE_I64:
        return 8;
    default:
        throw std::runtime_error("GGML_TYPE_SIZE: Unsupported GGML_TYPE");
    }
    // (dead `return 0;` after the exhaustive switch removed)
}
// Display names for GGML_TYPE, indexed by enum value. Unassigned enum
// values (4-5, 31-33) map to nullptr — callers must check before
// streaming/printing an entry.
constexpr const char *GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
    "F32",
    "F16",
    "Q4_0",
    "Q4_1",
    nullptr, // 4 (gap)
    nullptr, // 5 (gap)
    "Q5_0",
    "Q5_1",
    "Q8_0",
    "Q8_1",
    "Q2_K",
    "Q3_K",
    "Q4_K",
    "Q5_K",
    "Q6_K",
    "Q8_K",
    "IQ2_XXS",
    "IQ2_XS",
    "IQ3_XXS",
    "IQ1_S",
    "IQ4_NL",
    "IQ3_S",
    "IQ2_S",
    "IQ4_XS",
    "I8",
    "I16",
    "I32",
    "I64",
    "F64",
    "IQ1_M",
    "BF16",
    nullptr, // 31 (gap)
    nullptr, // 32 (gap)
    nullptr, // 33 (gap)
    "TQ1_0",
    "TQ2_0",
};
// Metadata for one tensor as read from the GGUF tensor-info section.
// `shape` is stored in GGUF order (innermost dimension first);
// `data_offset` is relative to the start of the tensor-data region.
struct GGUFTensorInfo {
    std::string name;
    uint32_t ndim;
    std::vector<int64_t> shape;
    GGML_TYPE ggml_type;
    uint64_t data_offset;
    std::string toString() const;
};
// Parses a memory-mapped GGUF file: header, metadata key/values and
// tensor infos. After construction getGgmlStart() points at the aligned
// start of the tensor-data region.
class GGUFFileReader {
public:
    // Opens and fully parses `filepath`; throws on I/O or format errors.
    GGUFFileReader(const std::string &filepath);
    ~GGUFFileReader() = default;
    // Human-readable dump of header, KVs and tensor infos.
    std::string toString() const;
    const std::unordered_map<std::string, std::shared_ptr<GGUFKeyValue>> &getAttributeMap() const;
    const std::unordered_map<std::string, std::shared_ptr<GGUFTensorInfo>> &getTensorInfoMap() const;
    std::shared_ptr<FileMapping> getFileMapping() const { return _file; }
    // Start of the tensor-data region inside the mapping.
    void *getGgmlStart() const { return _cursor; }

private:
    void readHeader();
    void readMetaKVs();
    void readTensorInfos();
    std::string readString();
    // Reads a little-endian POD value at the cursor and advances it.
    template <typename T>
    T read();
    std::shared_ptr<FileMapping> _file; // keeps the mapping alive
    void *_data = nullptr;              // base of the mapping
    uint8_t *_cursor = nullptr;         // current parse position
    uint32_t _version;
    int64_t _num_tensors;
    int64_t _num_meta_kvs;
    std::vector<std::shared_ptr<GGUFKeyValue>> _meta_kvs;        // in file order
    std::vector<std::shared_ptr<GGUFTensorInfo>> _tensor_infos;  // in file order
    std::unordered_map<std::string, std::shared_ptr<GGUFKeyValue>> _attributes_map;    // by key
    std::unordered_map<std::string, std::shared_ptr<GGUFTensorInfo>> _tensors_info_map; // by name
};
#endif
#ifndef __INFINIOPTEST_OPS_HPP__
#define __INFINIOPTEST_OPS_HPP__
#include "test.hpp"
/*
 * Declare all the tests here
 */
// Expands to the per-operator Test class declaration in
// namespace infiniop_test::matmul (see DECLARE_INFINIOP_TEST in test.hpp).
DECLARE_INFINIOP_TEST(matmul)

// Builds one {op_name, TestBuilder} map entry from an op's Test class.
#define REGISTER_INFINIOP_TEST(name) \
    { \
        #name, \
        { infiniop_test::name::Test::build, \
          infiniop_test::name::Test::attribute_names(), \
          infiniop_test::name::Test::tensor_names() } \
    }

/*
 * Register all the tests here
 */
// Initializer list used to populate TEST_BUILDERS.
#define TEST_BUILDER_MAPPINGS \
    { \
        REGISTER_INFINIOP_TEST(matmul), \
    }

namespace infiniop_test {
// Global variable for {op_name: builder} mappings
extern std::unordered_map<std::string, const TestBuilder> TEST_BUILDERS;
} // namespace infiniop_test
#endif
#ifndef __INFINIOPTEST_TENSOR_HPP__
#define __INFINIOPTEST_TENSOR_HPP__
#include "file_mapping.hpp"
#include "gguf.hpp"
#include <infiniop.h>
// Translate a plain (non-quantized) GGML element type into the
// corresponding infiniop dtype; throws for every other type.
inline infiniDtype_t ggmlTypeToInfiniType(GGML_TYPE type) {
    switch (type) {
    // floating-point types
    case GGML_TYPE_F16:
        return INFINI_DTYPE_F16;
    case GGML_TYPE_BF16:
        return INFINI_DTYPE_BF16;
    case GGML_TYPE_F32:
        return INFINI_DTYPE_F32;
    case GGML_TYPE_F64:
        return INFINI_DTYPE_F64;
    // integer types
    case GGML_TYPE_I8:
        return INFINI_DTYPE_I8;
    case GGML_TYPE_I16:
        return INFINI_DTYPE_I16;
    case GGML_TYPE_I32:
        return INFINI_DTYPE_I32;
    case GGML_TYPE_I64:
        return INFINI_DTYPE_I64;
    default:
        throw std::runtime_error("Unsupported GGML type");
    }
}
namespace infiniop_test {
// Owns a buffer that lives either on a device (allocated via infinirt),
// on the CPU heap, or inside a memory-mapped file (in which case the
// FileMapping shared_ptr keeps the bytes alive and nothing is freed).
class Memory {
private:
    void *_ptr;
    size_t _size;
    infiniDevice_t _device;
    int _device_id;
    std::shared_ptr<FileMapping> _file_mapping; // non-null => file-backed

public:
    // Allocate `size` bytes on the given device.
    Memory(size_t size, infiniDevice_t device, int device_id);
    // Wrap an existing CPU pointer backed by a file mapping (not owned).
    Memory(const std::shared_ptr<FileMapping> &file_mapping, void *ptr, size_t size);
    ~Memory();
    void *ptr() const { return _ptr; }
    size_t size() const { return _size; }
    infiniDevice_t device() const { return _device; }
    int device_id() const { return _device_id; }
};
// A typed, strided view over a Memory buffer, paired with an infiniop
// tensor descriptor. Strides are in elements; `_offset` is in bytes.
class Tensor {
private:
    infiniopTensorDescriptor_t _desc;
    std::shared_ptr<Memory> _memory;
    std::vector<size_t> _shape;
    std::vector<ptrdiff_t> _strides;
    size_t _offset;        // byte offset of the first element in _memory
    GGML_TYPE _ggml_type;

public:
    // Build a CPU tensor from GGUF tensor info, copying data out of the
    // mapped file; `strides_meta` (I32/I64 array) overrides contiguous strides.
    Tensor(const GGUFTensorInfo *info,
           const void *ggml_ptr,
           const GGUFKeyValue *strides_meta = nullptr);
    // Wrap existing memory with explicit shape/strides/type.
    Tensor(std::shared_ptr<Memory> memory, size_t offset,
           const std::vector<size_t> &shape,
           const std::vector<ptrdiff_t> &strides,
           GGML_TYPE dtype);
    ~Tensor();
    infiniopTensorDescriptor_t desc() const { return _desc; }
    std::vector<size_t> shape() const { return std::vector<size_t>(_shape); }
    std::vector<ptrdiff_t> strides() const { return std::vector<ptrdiff_t>(_strides); }
    GGML_TYPE ggml_type() const { return _ggml_type; }
    // Pointer to the first element (memory base + offset).
    void *data() const;
    // Copy (or re-wrap) this tensor onto the given device.
    std::shared_ptr<Tensor> to(infiniDevice_t device, int device_id = 0) const;
    std::string info() const;
    void debug() const;
};
} // namespace infiniop_test
#endif
#ifndef __INFINIOPTEST_HPP__
#define __INFINIOPTEST_HPP__
#include "gguf.hpp"
#include "tensor.hpp"
#include <functional>
#include <sstream>
#include <unordered_map>
#include <vector>
// ANSI terminal color escape codes for test-result output.
#define RESET "\033[0m"
#define GREEN "\033[32m"
#define RED "\033[31m"
#define YELLOW "\033[33m"
namespace infiniop_test {
// Outcome categories for a single testcase run.
enum class TestStatus {
    PASS,
    TEST_INIT_FAILED,   // testcase could not be constructed from the file
    OP_CREATION_FAILED, // descriptor/workspace setup failed
    OP_EXECUTION_FAILED,
    RESULT_INCORRECT,   // output differed from the reference answer
};
// Result of a testcase
// Immutable record of one testcase outcome: status, elapsed time,
// description of the case, and an error message when it failed.
class Result {
private:
    TestStatus _status;
    double _time = 0.;          // measured time for the passing run
    std::string _description;   // human-readable testcase description
    std::string _error_message; // populated only on failure

public:
    Result(TestStatus status_, double time_, const std::string &description_, const std::string &error_message_)
        : _status(status_), _time(time_), _description(description_), _error_message(error_message_) {}
    bool isPassed() const { return _status == TestStatus::PASS; }
    // Formatted report; defined out of line.
    std::string toString() const;
};
// Quick macro for creating a test result
// These expand inside a Test member function: they call the enclosing
// object's toString() for the description.
#define TEST_PASSED(delay) std::make_shared<infiniop_test::Result>(infiniop_test::TestStatus::PASS, delay, toString(), "")
#define TEST_FAILED(reason, msg) std::make_shared<infiniop_test::Result>(infiniop_test::TestStatus::reason, 0., toString(), msg)
#define TEST_INIT_FAILED(op_name) std::make_shared<infiniop_test::Result>(infiniop_test::TestStatus::TEST_INIT_FAILED, 0., "Invalid " + std::string(op_name), "")
// Run all tests read from a GGUF file
std::vector<std::shared_ptr<Result>> runAllTests(
    const GGUFFileReader &,
    infiniDevice_t device, int device_id,
    size_t warm_ups, size_t iterations,
    double rtol, double atol);

// Run a single test read from a GGUF file
// `test_id` selects which testcase in the file to run.
std::shared_ptr<Result> runTest(
    const GGUFFileReader &,
    infiniDevice_t device, int device_id,
    size_t warm_ups, size_t iterations,
    double rtol, double atol,
    size_t test_id);

// Check if two tensors are close within given tolerance
// Throws (rather than returns) on mismatch.
void allClose(std::shared_ptr<Tensor> actual, std::shared_ptr<Tensor> expected, double rtol = 1e-3, double atol = 1e-3);

// Helper function for benchmarking a function
// Returns the measured time over `iterations` runs after `warmups` runs.
double benchmark(std::function<void()> func, size_t warmups, size_t iterations);
} // namespace infiniop_test
} // namespace infiniop_test
namespace infiniop_test::base {
// Base class for a testcase, each operator test should inherit from this class
// NOTE(review): polymorphic base without a virtual destructor — deleting
// a derived test through a base pointer would be UB; confirm tests are
// only ever held as shared_ptr<Derived>.
class Test {
public:
    // Execute the testcase and report its Result.
    virtual std::shared_ptr<infiniop_test::Result> run(
        infiniopHandle_t handle, infiniDevice_t device, int device_id,
        size_t warm_ups, size_t iterations)
        = 0;
    virtual std::string toString() const = 0;
};
} // namespace infiniop_test::base
// Quick macro for declaring a new testcase
// Declares namespace infiniop_test::<name>::Test with the standard
// factory (build), metadata queries (attribute_names / tensor_names),
// and run/toString overrides; the definitions live in the op's .cpp.
#define DECLARE_INFINIOP_TEST(name) \
    namespace infiniop_test::name { \
    class Test : public infiniop_test::base::Test { \
        double _rtol, _atol; \
 \
    public: \
        static std::string op_name() { return #name; } \
        static std::shared_ptr<Test> build( \
            std::unordered_map<std::string, std::vector<uint8_t>> attributes, \
            std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors, \
            double, double); \
 \
        static std::vector<std::string> attribute_names(); \
        static std::vector<std::string> tensor_names(); \
 \
        std::shared_ptr<infiniop_test::Result> run( \
            infiniopHandle_t handle, infiniDevice_t device, int device_id, \
            size_t warm_ups, size_t iterations) override; \
 \
        std::string toString() const override; \
 \
        ~Test(); \
 \
    private: \
        struct Attributes; \
        Attributes *_attributes; \
        Test() = delete; \
        Test(double rtol, double atol) : _rtol(rtol), _atol(atol) {} \
    }; \
    }
namespace infiniop_test {
// Factory signature shared by every op test: builds a Test from raw
// attribute bytes, named tensors, and (rtol, atol).
using BuilderFunc = std::function<std::shared_ptr<infiniop_test::base::Test>(
    std::unordered_map<std::string, std::vector<uint8_t>>,
    std::unordered_map<std::string, std::shared_ptr<Tensor>>,
    double, double)>;

// Testcase Registry
// Each testcase should provide a formatted builder, attribute names, and tensor names
struct TestBuilder {
    BuilderFunc build;
    std::vector<std::string> attribute_names;
    std::vector<std::string> tensor_names;
};
} // namespace infiniop_test
#endif
#ifndef __INFINIOPTEST_UTILS_HPP__
#define __INFINIOPTEST_UTILS_HPP__
#include "../../utils.h"
#include "gguf.hpp"
#include <cstring>
#include <iostream>
// Run `cmd` and execute `action` on any non-success status.
#define CHECK_OR(cmd, action) CHECK_API_OR(cmd, INFINI_STATUS_SUCCESS, action)

/// Read one scalar at `ptr`, interpreted as `ggml_type`, widened to double.
/// Only plain (non-quantized) element types are supported.
/// @throws std::runtime_error for any other type.
inline double getVal(void *ptr, GGML_TYPE ggml_type) {
    // Named casts instead of C-style casts; static_cast is the correct
    // conversion from void* to a typed pointer.
    switch (ggml_type) {
    case GGML_TYPE_F16:
        // fp16 has no native C++ type; widen via the project helper.
        return utils::cast<double>(*static_cast<fp16_t *>(ptr));
    case GGML_TYPE_F32:
        return *static_cast<float *>(ptr);
    case GGML_TYPE_F64:
        return *static_cast<double *>(ptr);
    case GGML_TYPE_I8:
        return *static_cast<int8_t *>(ptr);
    case GGML_TYPE_I16:
        return *static_cast<int16_t *>(ptr);
    case GGML_TYPE_I32:
        return *static_cast<int32_t *>(ptr);
    case GGML_TYPE_I64:
        return static_cast<double>(*static_cast<int64_t *>(ptr));
    default:
        throw std::runtime_error("Unsupported data type");
    }
}
#endif
#include "file_mapping.hpp"
#include <stdexcept>
// Map `filepath` into memory read-only; throws std::runtime_error on
// any failure. On success ptr()/size() describe the whole file.
FileMapping::FileMapping(const std::string &filepath) {
#ifdef _WIN32
    _file_handle = CreateFile(filepath.c_str(), GENERIC_READ, FILE_SHARE_READ, NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
    if (_file_handle == INVALID_HANDLE_VALUE) {
        throw std::runtime_error("Failed to open GGUF file");
    }
    // Query the size as 64-bit: GetFileSize() returns a DWORD and would
    // silently truncate files larger than 4 GiB (common for GGUF models).
    LARGE_INTEGER file_size;
    if (!GetFileSizeEx(_file_handle, &file_size)) {
        CloseHandle(_file_handle);
        throw std::runtime_error("Failed to get file size");
    }
    _size = static_cast<size_t>(file_size.QuadPart);
    _file_mapping = CreateFileMapping(_file_handle, NULL, PAGE_READONLY, 0, 0, NULL);
    if (!_file_mapping) {
        CloseHandle(_file_handle);
        throw std::runtime_error("Failed to create file mapping");
    }
    _ptr = MapViewOfFile(_file_mapping, FILE_MAP_READ, 0, 0, 0);
    if (!_ptr) {
        CloseHandle(_file_mapping);
        CloseHandle(_file_handle);
        throw std::runtime_error("Failed to map view of file");
    }
#else
    int fd = open(filepath.c_str(), O_RDONLY);
    if (fd == -1) {
        throw std::runtime_error("Failed to open GGUF file");
    }
    struct stat sb;
    if (fstat(fd, &sb) == -1) {
        close(fd);
        throw std::runtime_error("Failed to get file size");
    }
    _size = sb.st_size;
    _ptr = mmap(NULL, _size, PROT_READ, MAP_PRIVATE, fd, 0);
    // The mapping stays valid after closing the descriptor (POSIX).
    close(fd);
    if (_ptr == MAP_FAILED) {
        throw std::runtime_error("Failed to mmap file");
    }
#endif
}
// Release the mapping. On Windows the view must be unmapped before the
// mapping and file handles are closed; on POSIX a single munmap suffices.
FileMapping::~FileMapping() {
#ifdef _WIN32
    if (_ptr) {
        UnmapViewOfFile(_ptr);
    }
    if (_file_mapping) {
        CloseHandle(_file_mapping);
    }
    if (_file_handle) {
        CloseHandle(_file_handle);
    }
#else
    if (_ptr) {
        munmap(_ptr, _size);
    }
#endif
}
// Base address of the mapped file.
void *FileMapping::ptr() const {
    return _ptr;
}

// Size of the mapping in bytes.
size_t FileMapping::size() const {
    return _size;
}
#include "gguf.hpp"
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <stdexcept>
#ifdef _WIN32
#include <windows.h>
#else
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
// Render this record as "Key: <k>, Type: <t>, Value: <v>".
// Strings print verbatim; multi-element (array) values print as
// "[v0, v1, ...]"; scalars print directly.
std::string GGUFKeyValue::toString() const {
    std::ostringstream out;
    out << "Key: " << key << ", Type: " << GGUF_TYPE_NAME[gguf_type] << ", Value: ";
    if (gguf_type == GGUF_TYPE_STRING) {
        out << std::string(value.begin(), value.end());
    } else if (value.size() > GGUF_TYPE_SIZE[gguf_type]) {
        // More bytes than one element: this value was parsed from an array.
        const size_t elem_size = GGUF_TYPE_SIZE[gguf_type];
        const size_t count = value.size() / elem_size;
        out << "[";
        for (size_t idx = 0; idx < count; ++idx) {
            if (idx > 0) {
                out << ", ";
            }
            out << ggufDataToString(value.data() + idx * elem_size, gguf_type);
        }
        out << "]";
    } else {
        out << ggufDataToString(value.data(), gguf_type);
    }
    return out.str();
}
std::string GGUFTensorInfo::toString() const {
std::ostringstream oss;
oss << "Name: " << name << ", NDims: " << ndim << ", Shape: [";
for (size_t i = 0; i < shape.size(); ++i) {
oss << shape[i];
if (i < shape.size() - 1) {
oss << ", ";
}
}
oss << "], DataType: " << GGML_TYPE_NAME[ggml_type] << ", DataOffset: " << data_offset;
return oss.str();
}
// Open and fully parse a GGUF file: header, metadata KVs, tensor infos.
// After construction the cursor points at the 32-byte-aligned start of
// the tensor-data region.
GGUFFileReader::GGUFFileReader(const std::string &filepath) {
    // Let FileMapping's exception propagate unchanged. The previous
    // `try { ... } catch (const std::exception &e) { throw e; }` rethrew
    // a COPY sliced down to std::exception, losing the derived type and
    // its what() message.
    _file = std::make_shared<FileMapping>(filepath);
    _data = _file->ptr();
    _cursor = reinterpret_cast<uint8_t *>(_data);
    readHeader();
    readMetaKVs();
    readTensorInfos();
    // Tensor data begins at the next 32-byte boundary after the infos.
    size_t padding = (size_t)(32 - ((char *)_cursor - (char *)_data) % 32) % 32;
    _cursor += padding;
}
// Metadata key -> KV record, as parsed from the file.
const std::unordered_map<std::string, std::shared_ptr<GGUFKeyValue>> &
GGUFFileReader::getAttributeMap() const {
    return _attributes_map;
}

// Tensor name -> tensor-info record, as parsed from the file.
const std::unordered_map<std::string, std::shared_ptr<GGUFTensorInfo>> &
GGUFFileReader::getTensorInfoMap() const {
    return _tensors_info_map;
}
// Parse the GGUF header at the cursor: "GGUF" magic, version, tensor
// count and metadata KV count. Leaves the cursor just past the header.
void GGUFFileReader::readHeader() {
    if (std::memcmp(_cursor, "GGUF", 4) != 0) {
        throw std::runtime_error("Invalid GGUF magic");
    }
    _cursor += 4;
    _version = read<uint32_t>();
    _num_tensors = read<int64_t>();
    _num_meta_kvs = read<int64_t>();
    // Reset the lookup maps; they are filled by readMetaKVs / readTensorInfos.
    _attributes_map = std::unordered_map<std::string, std::shared_ptr<GGUFKeyValue>>();
    _tensors_info_map = std::unordered_map<std::string, std::shared_ptr<GGUFTensorInfo>>();
}
// Parse `_num_meta_kvs` key/value records at the cursor. Array values
// are flattened: `gguf_type` is replaced by the ELEMENT type and all
// elements are copied back-to-back into `value`.
void GGUFFileReader::readMetaKVs() {
    for (int64_t i = 0; i < _num_meta_kvs; ++i) {
        auto kv = std::make_shared<GGUFKeyValue>();
        kv->key = readString();
        kv->gguf_type = read<GGUF_TYPE>();
        if (kv->gguf_type == GGUF_TYPE_ARRAY) {
            GGUF_TYPE array_type = read<GGUF_TYPE>();
            uint64_t array_size = read<uint64_t>();
            // NOTE(review): this assumes fixed-size elements. Arrays of
            // strings (element size taken as sizeof(gguf_str)) or nested
            // arrays (element size 0) would be misparsed — confirm the
            // test files never contain them.
            kv->value.resize(array_size * GGUF_TYPE_SIZE[array_type]);
            kv->gguf_type = array_type;
            std::memcpy(kv->value.data(), _cursor, kv->value.size());
            _cursor += kv->value.size();
        } else if (kv->gguf_type == GGUF_TYPE_STRING) {
            // Strings are length-prefixed, unterminated byte runs.
            uint64_t str_size = read<uint64_t>();
            kv->value.resize(str_size);
            std::memcpy(kv->value.data(), _cursor, str_size);
            _cursor += str_size;
        } else {
            // Fixed-size scalar value.
            kv->value.resize(GGUF_TYPE_SIZE[kv->gguf_type]);
            std::memcpy(kv->value.data(), _cursor, kv->value.size());
            _cursor += kv->value.size();
        }
        _meta_kvs.push_back(kv);
        _attributes_map.emplace(kv->key, kv);
    }
}
// Parse `_num_tensors` tensor-info records (name, ndim, shape in GGUF
// order, element type, data offset) and index them by name.
void GGUFFileReader::readTensorInfos() {
    for (int64_t idx = 0; idx < _num_tensors; ++idx) {
        auto info = std::make_shared<GGUFTensorInfo>();
        info->name = readString();
        info->ndim = read<uint32_t>();
        info->shape.resize(info->ndim);
        for (uint32_t d = 0; d < info->ndim; ++d) {
            info->shape[d] = read<int64_t>();
        }
        info->ggml_type = read<GGML_TYPE>();
        info->data_offset = read<uint64_t>();
        _tensor_infos.push_back(info);
        _tensors_info_map.emplace(info->name, info);
    }
}
// Read a GGUF string at the cursor: uint64 byte length followed by raw
// (unterminated) bytes; advances the cursor past it.
std::string GGUFFileReader::readString() {
    const uint64_t len = read<uint64_t>();
    std::string result(reinterpret_cast<const char *>(_cursor), len);
    _cursor += len;
    return result;
}
// Read a POD value of type T at the cursor and advance past it.
// memcpy (rather than a cast + deref) keeps unaligned reads well-defined.
template <typename T>
T GGUFFileReader::read() {
    T value;
    std::memcpy(&value, _cursor, sizeof(T));
    _cursor += sizeof(T);
    return value;
}
// Human-readable dump of the parsed file: header counts, every metadata
// KV, and every tensor-info record.
std::string GGUFFileReader::toString() const {
    std::ostringstream out;
    out << "GGUF File Contents: " << "\n"
        << "Version: " << _version << "\n"
        << "Number of Meta KVs: " << _num_meta_kvs << "\n"
        << "Number of Tensors: " << _num_tensors << "\n\n";
    out << "Meta KVs: " << "\n";
    for (const auto &kv : _meta_kvs) {
        out << kv->toString() << "\n";
    }
    out << "\n" << "Tensor INFOs: " << "\n";
    for (const auto &info : _tensor_infos) {
        out << info->toString() << "\n";
    }
    return out.str();
}
#include "gguf.hpp"
#include "test.hpp"
#include <infinirt.h>
#include <iostream>
// Command-line options for the test runner (see printUsage for the CLI).
struct ParsedArgs {
    std::string file_path;                          // Mandatory argument: test.gguf file path
    infiniDevice_t device_type = INFINI_DEVICE_CPU; // Default to CPU
    int device_id = 0;                              // Device ID on the selected device type (if specified)
    int warmups = 0;                                // Default to 0 if not given
    int iterations = 0;                             // Default to 0 if not given
    double atol = 0.001;                            // Default absolute tolerance
    double rtol = 0.001;                            // Default relative tolerance
};
// Print CLI usage to stdout and terminate the process (exit code -1).
// Called for --help and for any malformed argument list.
void printUsage() {
    std::cout << "Usage:" << std::endl
              << std::endl;
    std::cout << "infiniop-test <test.gguf> [--<device>[:id]] [--warmup <warmups>] [--run <iterations>] [--atol <atol>] [--rtol <rtol>]" << std::endl
              << std::endl;
    // Fixed typo: was "<test.gguf>>" (stray '>').
    std::cout << "    <test.gguf>" << std::endl;
    std::cout << "        Path to the test gguf file" << std::endl
              << std::endl;
    std::cout << "    --<device>[:id]" << std::endl;
    std::cout << "        (Optional) Specify the device type --(cpu|nvidia|cambricon|ascend|metax|moore|iluvatar|kunlun|sugon) and device ID (optional). CPU by default." << std::endl
              << std::endl;
    std::cout << "    --warmup <warmups>" << std::endl;
    std::cout << "        (Optional) Number of warmups to perform before timing. Default to 0." << std::endl
              << std::endl;
    std::cout << "    --run <iterations>" << std::endl;
    std::cout << "        (Optional) Number of iterations to perform for timing. Default to 0." << std::endl
              << std::endl;
    std::cout << "    --atol <absolute_tolerance>" << std::endl;
    std::cout << "        (Optional) Absolute tolerance for correctness check. Default to 0.001" << std::endl
              << std::endl;
    std::cout << "    --rtol <relative_tolerance>" << std::endl;
    std::cout << "        (Optional) Relative tolerance for correctness check. Default to 0.001" << std::endl
              << std::endl;
    exit(-1);
}
// Continues an if/else-if chain: matches an argument that STARTS with
// FLAG, sets the device type, and parses an optional ":<id>" suffix.
#define PARSE_DEVICE(FLAG, DEVICE) \
    else if (arg.find(FLAG) == 0) { \
        size_t colon_pos = arg.find(':'); \
        args.device_type = DEVICE; \
        if (colon_pos != std::string::npos) { \
            args.device_id = std::stoi(arg.substr(colon_pos + 1)); \
        } else { \
            args.device_id = 0; \
        } \
    }
// Parse the command line into ParsedArgs. Any malformed input (missing
// file, unknown flag, non-numeric option value) falls through to
// printUsage(), which exits the process.
ParsedArgs parseArgs(int argc, char *argv[]) {
    if (argc < 2) {
        printUsage();
    }
    if (std::string(argv[1]) == "--help" || std::string(argv[1]) == "-h") {
        printUsage();
    }
    ParsedArgs args;
    args.file_path = argv[1]; // First argument is always the test.gguf file
    std::unordered_map<std::string, std::string> options;
    try {
        for (int i = 2; i < argc; ++i) {
            std::string arg = argv[i];
            // NOTE(review): device flags are matched by PREFIX
            // (arg.find(FLAG) == 0), so e.g. "--cpufoo" also matches
            // "--cpu" — confirm exact matching isn't required.
            if (arg.find("--cpu") == 0) {
                args.device_id = 0;
            }
            // Each PARSE_DEVICE expands to an `else if` in this chain.
            PARSE_DEVICE("--nvidia", INFINI_DEVICE_NVIDIA)
            PARSE_DEVICE("--cambricon", INFINI_DEVICE_CAMBRICON)
            PARSE_DEVICE("--ascend", INFINI_DEVICE_ASCEND)
            PARSE_DEVICE("--metax", INFINI_DEVICE_METAX)
            PARSE_DEVICE("--moore", INFINI_DEVICE_MOORE)
            PARSE_DEVICE("--iluvatar", INFINI_DEVICE_ILUVATAR)
            PARSE_DEVICE("--kunlun", INFINI_DEVICE_KUNLUN)
            PARSE_DEVICE("--sugon", INFINI_DEVICE_SUGON)
            else if (arg == "--warmup" && i + 1 < argc) {
                args.warmups = std::stoi(argv[++i]);
            }
            else if (arg == "--run" && i + 1 < argc) {
                args.iterations = std::stoi(argv[++i]);
            }
            else if (arg == "--atol" && i + 1 < argc) {
                args.atol = std::stod(argv[++i]);
            }
            else if (arg == "--rtol" && i + 1 < argc) {
                args.rtol = std::stod(argv[++i]);
            }
            else {
                printUsage();
            }
        }
    } catch (const std::exception &) {
        // std::stoi/std::stod throw on malformed numbers.
        printUsage();
    }
    return args;
}
// Entry point: parse the CLI, load the GGUF test file, run every test
// and print per-test results. Exit code: 0 when all tests pass, the
// number of failed tests otherwise, and -1 on initialization/load error.
int main(int argc, char *argv[]) {
    ParsedArgs args = parseArgs(argc, argv);
    int failed = 0;
    try {
        std::cout << args.file_path << std::endl;
        GGUFFileReader reader = GGUFFileReader(args.file_path);
        // std::cout << reader.toString() << std::endl;
        if (infinirtInit() != INFINI_STATUS_SUCCESS) {
            std::cerr << "Error: Failed to initialize InfiniRT" << std::endl;
            return -1;
        }
        auto results = infiniop_test::runAllTests(
            reader,
            (infiniDevice_t)args.device_type, args.device_id,
            args.warmups, args.iterations,
            args.rtol, args.atol);
        std::cout << "=====================================" << std::endl;
        for (const auto &result : results) {
            if (!result->isPassed()) {
                failed++;
            }
            std::cout << result->toString() << std::endl;
            std::cout << "=====================================" << std::endl;
        }
        if (failed == 0) {
            std::cout << GREEN << "All tests passed" << RESET << std::endl;
        } else {
            std::cout << RED << failed << " of " << results.size() << " tests failed" << RESET << std::endl;
        }
    } catch (const std::exception &e) {
        std::cerr << "Error: " << e.what() << std::endl;
        // Bug fix: previously fell through and returned `failed` (still 0),
        // so a crash during loading reported SUCCESS to the caller.
        return -1;
    }
    return failed;
}
#include "ops.hpp"
#include "utils.hpp"
#include <infinirt.h>
#include <iomanip>
#include <iostream>
namespace infiniop_test::matmul {
// Inputs of one matmul testcase: c = alpha * (a @ b) + beta * c,
// with `ans` holding the precomputed reference result.
struct Test::Attributes {
    float alpha;
    float beta;
    std::shared_ptr<Tensor> a;
    std::shared_ptr<Tensor> b;
    std::shared_ptr<Tensor> c;   // in/out operand
    std::shared_ptr<Tensor> ans; // expected result for c
};
// Factory: construct a matmul testcase from raw GGUF attributes and
// named tensors. Throws std::runtime_error when a required attribute
// ("alpha", "beta") or tensor ("a", "b", "c", "ans") is missing.
std::shared_ptr<Test> Test::build(
    std::unordered_map<std::string, std::vector<uint8_t>> attributes,
    std::unordered_map<std::string, std::shared_ptr<Tensor>> tensors,
    double rtol, double atol) {
    auto test = std::shared_ptr<Test>(new Test(rtol, atol));
    test->_attributes = new Attributes();
    if (attributes.find("alpha") == attributes.end()
        || attributes.find("beta") == attributes.end()
        || tensors.find("a") == tensors.end()
        || tensors.find("b") == tensors.end()
        || tensors.find("c") == tensors.end()
        || tensors.find("ans") == tensors.end()) {
        throw std::runtime_error("Invalid Test");
    }
    // NOTE(review): assumes each attribute blob holds at least one float
    // (no size check before the reinterpret) — confirm the generator
    // always writes 4-byte alpha/beta.
    test->_attributes->alpha = *reinterpret_cast<float *>(attributes["alpha"].data());
    test->_attributes->beta = *reinterpret_cast<float *>(attributes["beta"].data());
    test->_attributes->a = tensors["a"];
    test->_attributes->b = tensors["b"];
    test->_attributes->c = tensors["c"];
    test->_attributes->ans = tensors["ans"];
    return test;
}
// Execute the matmul testcase: move operands to the target device,
// create the op descriptor and workspace, run once, verify against the
// reference answer, then benchmark repeated executions.
std::shared_ptr<infiniop_test::Result> Test::run(
    infiniopHandle_t handle, infiniDevice_t device, int device_id, size_t warm_ups, size_t iterations) {
    infiniopMatmulDescriptor_t op_desc;
    auto alpha = _attributes->alpha;
    auto beta = _attributes->beta;
    // Device copies of the operands (c is written in place).
    auto a = _attributes->a->to(device, device_id);
    auto b = _attributes->b->to(device, device_id);
    auto c = _attributes->c->to(device, device_id);
    CHECK_OR(infiniopCreateMatmulDescriptor(handle, &op_desc,
                                            c->desc(),
                                            a->desc(),
                                            b->desc()),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to create op descriptor."));
    size_t workspace_size;
    CHECK_OR(infiniopGetMatmulWorkspaceSize(op_desc, &workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to get workspace size."));
    void *workspace;
    // NOTE(review): `workspace` and `op_desc` are never released on any
    // path through this function — looks like a leak per testcase;
    // confirm whether process lifetime makes this acceptable.
    CHECK_OR(infinirtMalloc(&workspace, workspace_size),
             return TEST_FAILED(OP_CREATION_FAILED, "Failed to allocate workspace."));
    CHECK_OR(infiniopMatmul(op_desc, workspace, workspace_size,
                            c->data(),
                            a->data(),
                            b->data(),
                            alpha,
                            beta,
                            nullptr),
             return TEST_FAILED(OP_EXECUTION_FAILED, "Failed during execution."));
    // allClose throws on mismatch; translate into a RESULT_INCORRECT result.
    try {
        allClose(c, _attributes->ans);
    } catch (const std::exception &e) {
        return TEST_FAILED(RESULT_INCORRECT, e.what());
    }
    double elapsed_time = 0.;
    // add and subtract to avoid overflow
    // NOTE(review): each benchmark iteration runs the matmul twice, the
    // second time with (alpha_, beta_) intended to keep c bounded across
    // iterations; the pair is an exact inverse only when alpha == 1 —
    // confirm intent.
    float beta_ = beta == .0f ? .0f : 1.f / beta;
    float alpha_ = beta == .0f ? alpha : -beta_;
    elapsed_time = benchmark(
        [=]() {
            infiniopMatmul(
                op_desc, workspace, workspace_size,
                c->data(),
                a->data(),
                b->data(),
                alpha,
                beta,
                nullptr);
            infiniopMatmul(
                op_desc, workspace, workspace_size,
                c->data(),
                a->data(),
                b->data(),
                alpha_,
                beta_,
                nullptr);
        },
        // Halve the counts because the lambda runs the op twice.
        (warm_ups + 1) / 2, (iterations + 1) / 2);
    return TEST_PASSED(elapsed_time);
}
// Attribute keys this testcase reads from the GGUF file.
std::vector<std::string> Test::attribute_names() {
    return {"alpha", "beta"};
}

// Tensor names this testcase reads from the GGUF file.
std::vector<std::string> Test::tensor_names() {
    return {"a", "b", "c", "ans"};
}
// Multi-line description of the testcase: op name, scalars, operand
// infos and tolerances (tolerances in scientific notation).
std::string Test::toString() const {
    std::ostringstream repr;
    repr << op_name() << std::endl;
    repr << "- alpha=" << _attributes->alpha << ", beta=" << _attributes->beta << std::endl;
    repr << "- a: " << _attributes->a->info() << std::endl;
    repr << "- b: " << _attributes->b->info() << std::endl;
    repr << "- c: " << _attributes->c->info() << std::endl;
    repr << std::scientific << std::setprecision(2)
         << "- rtol=" << _rtol << ", atol=" << _atol << std::endl;
    return repr.str();
}
// Free the heap-allocated Attributes (raw new in build()).
Test::~Test() {
    delete _attributes;
}
} // namespace infiniop_test::matmul
#include "tensor.hpp"
#include "utils.hpp"
#include <cstring>
#include <infinirt.h>
#include <sstream>
// Recursively print to stdout an N-D array at `data` with the given
// shape and element-count strides: innermost dimension on one line,
// blank lines separating slices of outer dimensions.
template <typename T>
void printData(const T *data, const std::vector<size_t> &shape, const std::vector<ptrdiff_t> &strides, size_t dim) {
    // Guard: with an empty shape, `shape.size() - 1` underflows to
    // SIZE_MAX and the loop below would read shape[0] out of bounds.
    if (shape.empty()) {
        return;
    }
    if (dim == shape.size() - 1) {
        for (size_t i = 0; i < shape[dim]; i++) {
            std::cout << *(data + i * strides[dim]) << " ";
        }
        std::cout << std::endl;
    } else if (dim < shape.size() - 1) {
        for (size_t i = 0; i < shape[dim]; i++) {
            printData(data + i * strides[dim], shape, strides, dim + 1);
            std::cout << std::endl;
        }
    }
}
// fp16 specialization: values are widened to float for printing.
// (Unlike the generic version, the innermost row prints no trailing
// newline — behavior kept as-is.)
template <>
void printData(const fp16_t *data, const std::vector<size_t> &shape,
               const std::vector<ptrdiff_t> &strides, size_t dim) {
    // Guard: with an empty shape, `shape.size() - 1` underflows to
    // SIZE_MAX and the loop below would read shape[0] out of bounds.
    if (shape.empty()) {
        return;
    }
    if (dim == shape.size() - 1) {
        for (size_t i = 0; i < shape[dim]; i++) {
            std::cout << utils::cast<float>(*(data + i * strides[dim])) << " ";
        }
    } else if (dim < shape.size() - 1) {
        for (size_t i = 0; i < shape[dim]; i++) {
            printData(data + i * strides[dim], shape, strides, dim + 1);
            std::cout << std::endl;
        }
    }
}
// Calculate memory size & offset given shape & strides
// Calculate memory size & offset given shape & strides.
// `offset` is the byte offset of the logical first element (non-zero
// only when some stride is negative); `size` is the total byte span
// needed to hold every addressed element. A zero-length dimension
// yields size == offset == 0.
// Shape/strides are now taken by const reference (they were copied by
// value on every call).
inline void calculateTensorMemory(size_t &size, size_t &offset,
                                  const std::vector<size_t> &shape,
                                  const std::vector<ptrdiff_t> &strides,
                                  size_t data_size) {
    size_t ndim = shape.size();
    offset = 0;
    size = 0;
    for (size_t i = 0; i < ndim; i++) {
        if (shape[i] == 0) {
            // Empty tensor: nothing to store.
            offset = 0;
            size = 0;
            return;
        }
        if (strides[i] > 0) {
            size += (shape[i] - 1) * strides[i] * data_size;
        } else if (strides[i] < 0) {
            // Negative stride: first element sits past the start of the span.
            offset += (shape[i] - 1) * (size_t)(-strides[i]) * data_size;
        }
        // stride == 0 (broadcast dim) contributes nothing.
    }
    size = offset + size + data_size;
}
namespace infiniop_test {
// Allocate `size` bytes on the given device: plain malloc for CPU,
// infinirtMalloc otherwise. Throws std::runtime_error on failure.
Memory::Memory(size_t size, infiniDevice_t device, int device_id) {
    _file_mapping = nullptr;
    _device = device;
    _device_id = device_id;
    _size = size;
    if (device == INFINI_DEVICE_CPU) {
        _ptr = std::malloc(size);
        // Previously unchecked: a null result would surface later as a
        // crash in whatever code wrote through the pointer.
        if (_ptr == nullptr && size != 0) {
            throw std::runtime_error("Error Creating Memory: malloc");
        }
    } else {
        CHECK_OR(infinirtSetDevice(_device, _device_id), throw std::runtime_error("Error Creating Memory: set device"));
        CHECK_OR(infinirtMalloc(&_ptr, _size), throw std::runtime_error("Error Creating Memory: malloc"));
    }
}
// Wrap an existing CPU pointer backed by `file_mapping`; holding the
// shared_ptr keeps the mapped bytes alive for this object's lifetime.
// Nothing is freed in the destructor for file-backed memory.
Memory::Memory(const std::shared_ptr<FileMapping> &file_mapping, void *ptr, size_t size)
    : _ptr(ptr),
      _size(size),
      _device(INFINI_DEVICE_CPU),
      _device_id(0),
      _file_mapping(file_mapping) {}
// Release owned memory. File-backed memory is left alone: the
// FileMapping shared_ptr releases the mapping when its refcount drops.
Memory::~Memory() {
    // if memory does not map to a file, free it manually
    if (_file_mapping == nullptr) {
        if (_device == INFINI_DEVICE_CPU) {
            std::free(_ptr);
        } else {
            // Select the owning device before freeing device memory.
            infinirtSetDevice(_device, _device_id);
            infinirtFree(_ptr);
        }
    }
}
// Pointer to the logical first element: buffer base plus byte offset
// (non-zero when strides are negative).
void *Tensor::data() const {
    return (char *)(_memory->ptr()) + _offset;
}
// Build a CPU tensor from GGUF tensor info. GGUF stores dimensions
// innermost-first, so the shape is reversed into row-major order;
// strides come from `strides_meta` (an I32/I64 array, also reversed)
// when given, otherwise contiguous row-major strides are derived.
// The element data is copied out of the mapped file (contiguous ggml
// layout) into a freshly allocated buffer with the requested strides.
Tensor::Tensor(const GGUFTensorInfo *info,
               const void *ggml_ptr,
               const GGUFKeyValue *strides_meta) {
    _ggml_type = info->ggml_type;
    _offset = 0;
    size_t ndim = static_cast<size_t>(info->ndim);
    _shape = std::vector<size_t>(ndim);
    _strides = std::vector<ptrdiff_t>(ndim);
    std::vector<ptrdiff_t> contiguous_strides(ndim);
    // Fill the whole shape FIRST, then derive strides. The previous
    // single-loop version read _shape[ndim - i] before that entry was
    // assigned, which for ndim >= 3 left it at its zero-initialized
    // value and produced all-zero contiguous strides.
    for (size_t i = 0; i < ndim; i++) {
        _shape[i] = static_cast<size_t>(info->shape[ndim - 1 - i]);
    }
    // Row-major contiguous strides, innermost dimension last.
    for (size_t i = ndim; i-- > 0;) {
        if (i == ndim - 1) {
            contiguous_strides[i] = (ptrdiff_t)1;
        } else {
            contiguous_strides[i] = (ptrdiff_t)_shape[i + 1] * contiguous_strides[i + 1];
        }
    }
    if (strides_meta == nullptr) {
        for (size_t i = 0; i < ndim; i++) {
            _strides[i] = contiguous_strides[i];
        }
    } else {
        for (size_t i = 0; i < ndim; i++) {
            if (strides_meta->gguf_type == GGUF_TYPE_INT64) {
                _strides[i] = (ptrdiff_t)(reinterpret_cast<const int64_t *>(
                    strides_meta->value.data())[ndim - 1 - i]);
            } else if (strides_meta->gguf_type == GGUF_TYPE_INT32) {
                _strides[i] = (ptrdiff_t)(reinterpret_cast<const int32_t *>(
                    strides_meta->value.data())[ndim - 1 - i]);
            } else {
                throw std::runtime_error("Error Creating Tensor: Unsupported strides type");
            }
        }
    }
    infiniopCreateTensorDescriptor(&_desc, ndim, _shape.data(), _strides.data(), ggmlTypeToInfiniType(_ggml_type));
    size_t size;
    calculateTensorMemory(size, _offset, _shape, _strides, ggmlTypeSize(_ggml_type));
    _memory = std::make_shared<Memory>(size, INFINI_DEVICE_CPU, 0);
    // Repack from the file's contiguous layout into the strided buffer.
    utils::rearrange(
        (char *)_memory->ptr() + _offset,
        (char *)ggml_ptr + info->data_offset,
        _shape.data(),
        _strides.data(),
        contiguous_strides.data(),
        ndim,
        ggmlTypeSize(_ggml_type));
}
// Wrap existing memory with explicit shape/strides/type and create the
// matching infiniop descriptor. Shares ownership of `memory`.
Tensor::Tensor(std::shared_ptr<Memory> memory, size_t offset,
               const std::vector<size_t> &shape,
               const std::vector<ptrdiff_t> &strides,
               GGML_TYPE dtype) : _memory(memory), _shape(shape), _strides(strides), _offset(offset), _ggml_type(dtype) {
    infiniopCreateTensorDescriptor(&_desc, shape.size(), shape.data(), strides.data(), ggmlTypeToInfiniType(dtype));
}
// Return a tensor whose data resides on `device`/`device_id`, copying only
// when necessary. Device-to-device moves are staged through host memory.
std::shared_ptr<Tensor> Tensor::to(infiniDevice_t device, int device_id) const {
    // Already resident on the requested device (all CPU ids are treated as
    // one): share the same Memory instead of copying.
    if (device == _memory->device() && (device_id == _memory->device_id() || device == INFINI_DEVICE_CPU)) {
        return std::make_shared<Tensor>(_memory, _offset, _shape, _strides, _ggml_type);
    }
    std::shared_ptr<Memory> memory;
    if (device == INFINI_DEVICE_CPU) {
        // Device -> host copy; source device must be selected first.
        memory = std::make_shared<Memory>(_memory->size(), INFINI_DEVICE_CPU, 0);
        CHECK_OR(infinirtSetDevice(_memory->device(), _memory->device_id()), throw std::runtime_error("Error Tensor::to: set device"));
        CHECK_OR(infinirtMemcpy(memory->ptr(), _memory->ptr(), _memory->size(), INFINIRT_MEMCPY_D2H), throw std::runtime_error("Error Tensor::to: cpy"));
    } else if (_memory->device() == INFINI_DEVICE_CPU) {
        // Host -> device copy.
        memory = std::make_shared<Memory>(_memory->size(), device, device_id);
        CHECK_OR(infinirtMemcpy(memory->ptr(), _memory->ptr(), _memory->size(), INFINIRT_MEMCPY_H2D), throw std::runtime_error("Error Tensor::to: cpy"));
    } else {
        // Device -> device: bounce through the CPU.
        return to(INFINI_DEVICE_CPU, 0)->to(device, device_id);
    }
    return std::make_shared<Tensor>(memory, _offset, _shape, _strides, _ggml_type);
}
// Print the tensor's metadata and full contents to stdout. Data is first
// staged to host memory so device-resident tensors can be inspected too.
void Tensor::debug() const {
    auto host = to(INFINI_DEVICE_CPU, 0);
    std::cout << "Tensor: " << host->info() << std::endl;
    void *raw = host->data();
    if (_ggml_type == GGML_TYPE_F16) {
        printData((fp16_t *)raw, _shape, _strides, 0);
    } else if (_ggml_type == GGML_TYPE_F32) {
        printData((float *)raw, _shape, _strides, 0);
    } else if (_ggml_type == GGML_TYPE_F64) {
        printData((double *)raw, _shape, _strides, 0);
    } else if (_ggml_type == GGML_TYPE_I8) {
        printData((int8_t *)raw, _shape, _strides, 0);
    } else if (_ggml_type == GGML_TYPE_I16) {
        printData((int16_t *)raw, _shape, _strides, 0);
    } else if (_ggml_type == GGML_TYPE_I32) {
        printData((int32_t *)raw, _shape, _strides, 0);
    } else {
        std::cout << "Unsupported GGML type" << std::endl;
    }
}
// One-line human-readable summary: "Shape: [...], Strides: [...], Type: ...".
std::string Tensor::info() const {
    std::ostringstream oss;
    // Stream a comma-separated list of values enclosed in brackets.
    auto append_list = [&oss](const auto &values) {
        oss << "[";
        for (size_t i = 0; i < values.size(); ++i) {
            if (i > 0) {
                oss << ", ";
            }
            oss << values[i];
        }
        oss << "]";
    };
    oss << "Shape: ";
    append_list(_shape);
    oss << ", Strides: ";
    append_list(_strides);
    oss << ", Type: " << GGML_TYPE_NAME[_ggml_type];
    return oss.str();
}
// Release the infiniop descriptor; the underlying Memory is freed by its own
// destructor once the shared_ptr reference count drops to zero.
Tensor::~Tensor() {
    infiniopDestroyTensorDescriptor(_desc);
}
} // namespace infiniop_test
#include "ops.hpp"
#include "tensor.hpp"
#include "utils.hpp"
#include <chrono>
#include <cmath>
#include <infinirt.h>
#include <iostream>
#include <numeric>
namespace infiniop_test {
// Registry mapping operator names (e.g. "matmul") to their test builders,
// initialized from the TEST_BUILDER_MAPPINGS initializer defined elsewhere.
std::unordered_map<std::string, const TestBuilder> TEST_BUILDERS = TEST_BUILDER_MAPPINGS;
// Render this result as a colorized, human-readable multi-line report:
// status, description, timing (if measured), and error message (if any).
std::string Result::toString() const {
    std::ostringstream oss;
    oss << "Status: ";
    if (_status == TestStatus::PASS) {
        oss << GREEN << "PASS" << RESET;
    } else if (_status == TestStatus::TEST_INIT_FAILED) {
        oss << RED << "INVALID TEST" << RESET;
    } else if (_status == TestStatus::OP_CREATION_FAILED) {
        oss << RED << "OP CREATION FAILED" << RESET;
    } else if (_status == TestStatus::OP_EXECUTION_FAILED) {
        oss << RED << "EXECUTION FAILED" << RESET;
    } else if (_status == TestStatus::RESULT_INCORRECT) {
        oss << RED << "WRONG ANSWER" << RESET;
    } else {
        // Any other status is reported as skipped.
        oss << YELLOW << "SKIPPED" << RESET;
    }
    oss << std::endl;
    oss << "Description: " << _description << std::endl;
    if (_time > 0.) {
        oss << "Time: " << _time << " us" << std::endl;
    } else {
        oss << "Time: N/A" << std::endl;
    }
    if (!_error_message.empty()) {
        oss << "Error: " << _error_message << std::endl;
    }
    return oss.str();
}
// Run every test described in the GGUF file and collect one Result per test.
// Requires a `test_count` meta attribute; tests are numbered 0..test_count-1.
std::vector<std::shared_ptr<Result>> runAllTests(const GGUFFileReader &gguf_reader,
                                                 infiniDevice_t device, int device_id,
                                                 size_t warm_ups, size_t iterations,
                                                 double rtol, double atol) {
    auto meta = gguf_reader.getAttributeMap();
    auto count_meta = meta.find("test_count");
    if (count_meta == meta.end()) {
        throw std::runtime_error("Invalid GGUF file: missing test_count attribute");
    }
    // NOTE(review): reinterprets the raw attribute bytes as size_t — assumes
    // the writer stored test_count as a native-endian 64-bit integer; confirm
    // against the GGUF writer.
    size_t count = *(size_t *)(count_meta->second->value.data());
    std::cout << "Found " << count << " tests" << std::endl;
    auto results = std::vector<std::shared_ptr<Result>>(count);
    try {
        for (size_t i = 0; i < count; i++) {
            results[i] = runTest(gguf_reader, device, device_id, warm_ups, iterations, rtol, atol, i);
        }
    } catch (const std::exception &e) {
        // A throwing test aborts the remaining tests; entries after it stay
        // null, so callers must tolerate null Result pointers.
        std::cerr << "Error: " << e.what() << std::endl;
    }
    return results;
}
std::shared_ptr<Result> runTest(const GGUFFileReader &gguf_reader,
infiniDevice_t device, int device_id,
size_t warm_ups, size_t iterations,
double rtol, double atol, size_t test_id) {
auto meta = gguf_reader.getAttributeMap();
auto tensor_info = gguf_reader.getTensorInfoMap();
auto name_meta = meta.find("test." + std::to_string(test_id) + ".op_name");
if (name_meta != meta.end()) {
std::string op_name(name_meta->second->value.begin(), name_meta->second->value.end());
auto builder = TEST_BUILDERS.find(op_name)->second;
auto attrs = std::unordered_map<std::string, std::vector<uint8_t>>();
auto tensors = std::unordered_map<std::string, std::shared_ptr<Tensor>>();
infiniopHandle_t handle;
CHECK_OR(infinirtSetDevice(device, device_id), throw std::runtime_error("Failed to set device"));
CHECK_OR(infiniopCreateHandle(&handle), throw std::runtime_error("Failed to create handle"));
for (auto attr_name : builder.attribute_names) {
auto attr = meta.find("test." + std::to_string(test_id) + "." + attr_name);
if (attr != meta.end()) {
attrs[attr_name] = attr->second->value;
}
}
for (auto tensor_name : builder.tensor_names) {
auto info = tensor_info.find("test." + std::to_string(test_id) + "." + tensor_name);
if (info != tensor_info.end()) {
auto strides = meta.find("test." + std::to_string(test_id) + "." + tensor_name + ".strides");
tensors[tensor_name] = std::make_shared<Tensor>(
info->second.get(),
gguf_reader.getGgmlStart(),
strides != meta.end() ? strides->second.get() : nullptr);
}
}
std::shared_ptr<infiniop_test::base::Test> test;
try {
test = builder.build(attrs, tensors, rtol, atol);
} catch (const std::exception &e) {
return TEST_INIT_FAILED(op_name + "/n" + e.what());
}
std::shared_ptr<Result> result;
try {
result = test->run(handle, device, device_id, warm_ups, iterations);
} catch (const std::exception &e) {
return TEST_INIT_FAILED(op_name + "/n" + e.what());
}
CHECK_OR(infiniopDestroyHandle(handle), throw std::runtime_error("Failed to destroy handle"));
return result;
}
return TEST_INIT_FAILED("");
}
// Advance a multi-dimensional counter by one element in row-major order,
// updating two byte offsets that follow independent stride layouts.
// When a dimension wraps, its contribution is rewound and the carry moves to
// the next-outer dimension.
void incrementOffset(ptrdiff_t &offset_1, const std::vector<ptrdiff_t> &strides_1, size_t data_size_1,
                     ptrdiff_t &offset_2, const std::vector<ptrdiff_t> &strides_2, size_t data_size_2,
                     std::vector<size_t> &counter, const std::vector<size_t> &shape) {
    ptrdiff_t dim = (ptrdiff_t)shape.size() - 1;
    while (dim >= 0) {
        counter[dim] += 1;
        offset_1 += strides_1[dim] * data_size_1;
        offset_2 += strides_2[dim] * data_size_2;
        if (counter[dim] < shape[dim]) {
            return;
        }
        // This dimension wrapped: undo its accumulated offset and carry.
        counter[dim] = 0;
        offset_1 -= shape[dim] * strides_1[dim] * data_size_1;
        offset_2 -= shape[dim] * strides_2[dim] * data_size_2;
        --dim;
    }
}
// Element-wise comparison of two tensors after staging both to host memory.
// Throws std::runtime_error on shape mismatch or when any element fails the
// tolerance check. Note the semantics differ from numpy.allclose
// (|a-e| <= atol + rtol*|e|): here an element fails if |a-e| exceeds atol OR
// exceeds rtol*max(|a|,|e|), i.e. it must satisfy BOTH tolerances.
void allClose(std::shared_ptr<Tensor> actual_, std::shared_ptr<Tensor> expected_, double rtol, double atol) {
    // to(CPU) with one argument — presumably device_id defaults in the header.
    auto actual = actual_->to(INFINI_DEVICE_CPU);
    auto expected = expected_->to(INFINI_DEVICE_CPU);
    auto shape = actual->shape();
    if (shape != expected->shape()) {
        throw std::runtime_error("Shape mismatch.");
    }
    auto ndim = shape.size();
    // Total element count; strided iteration below visits each exactly once.
    size_t total = std::accumulate(shape.begin(), shape.end(), (size_t)1, std::multiplies<size_t>());
    auto counter = std::vector<size_t>(ndim, 0);
    ptrdiff_t actual_offset = 0,
              expected_offset = 0;
    size_t num_failed = 0;
    std::string first_failed_msg;
    for (size_t i = 0; i < total; i++) {
        // Values are widened to double regardless of the tensors' ggml types.
        double a_ = getVal((char *)actual->data() + actual_offset, actual->ggml_type());
        double e_ = getVal((char *)expected->data() + expected_offset, expected->ggml_type());
        if (std::fabs(a_ - e_) > atol || std::fabs(a_ - e_) > rtol * std::fmax(std::fabs(a_), std::fabs(e_))) {
            if (num_failed == 0) {
                first_failed_msg = "First failed at index " + std::to_string(i) + " with value " + std::to_string(a_) + " but should be " + std::to_string(e_) + ".";
            }
            num_failed++;
        }
        incrementOffset(actual_offset, actual->strides(), ggmlTypeSize(actual->ggml_type()),
                        expected_offset, expected->strides(), ggmlTypeSize(expected->ggml_type()),
                        counter, shape);
    }
    if (num_failed > 0) {
        throw std::runtime_error(std::to_string(num_failed) + " out of " + std::to_string(total) + " values failed. " + first_failed_msg);
    }
}
// Time `func` averaged over `iterations` runs (after `warmups` untimed runs)
// and return the mean latency in microseconds. Returns 0.0 when iterations
// is 0. Device work is fenced with infinirtDeviceSynchronize around the
// timed region so asynchronous kernels are fully accounted for.
double benchmark(std::function<void()> func, size_t warmups, size_t iterations) {
    if (iterations == 0) {
        return 0.0;
    }
    for (size_t i = 0; i < warmups; ++i) {
        func();
    }
    infinirtDeviceSynchronize();
    auto start = std::chrono::high_resolution_clock::now();
    for (size_t i = 0; i < iterations; ++i) {
        func();
    }
    infinirtDeviceSynchronize();
    auto end = std::chrono::high_resolution_clock::now();
    auto duration = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
    // BUGFIX: convert to double BEFORE dividing; the previous integer division
    // by `iterations` truncated sub-nanosecond precision per iteration.
    double average_time = static_cast<double>(duration.count()) / static_cast<double>(iterations) / 1e3; // average in us
    return average_time;
}
} // namespace infiniop_test
...@@ -45,28 +45,26 @@ size_t check_equal( ...@@ -45,28 +45,26 @@ size_t check_equal(
return fails; return fails;
} }
int test_transpose_2d() { int test_transpose_any(size_t index, std::vector<size_t> shape, std::vector<ptrdiff_t> strides_a, std::vector<ptrdiff_t> strides_b) {
std::vector<size_t> shape = {3, 5};
std::vector<ptrdiff_t> strides_a = {5, 1};
std::vector<ptrdiff_t> strides_b = {1, 3};
auto numel = std::accumulate(shape.begin(), shape.end(), (size_t)1, std::multiplies<size_t>()); auto numel = std::accumulate(shape.begin(), shape.end(), (size_t)1, std::multiplies<size_t>());
std::vector<float> a(numel); std::vector<float> a(numel);
std::vector<float> b(numel); std::vector<float> b(numel);
for (size_t i = 0; i < numel; i++) { for (size_t i = 0; i < numel; i++) {
a[i] = i / numel; a[i] = (float)i / numel;
} }
utils::rearrange(b.data(), a.data(), shape.data(), strides_b.data(), strides_a.data(), 2, sizeof(float)); utils::rearrange(b.data(), a.data(), shape.data(), strides_b.data(), strides_a.data(), shape.size(), sizeof(float));
if (check_equal<float>(a.data(), b.data(), shape, strides_a, strides_b)) { auto fails = check_equal<float>(a.data(), b.data(), shape, strides_a, strides_b);
if (fails > 0) {
std::cout << "test_transpose " << index << " failed" << std::endl;
return 1; return 1;
} else { } else {
std::cout << "test_transpose_2d passed" << std::endl; std::cout << "test_transpose " << index << " passed" << std::endl;
return 0;
} }
return 0;
} }
int test_rearrange() { int test_rearrange() {
return test_transpose_2d(); return test_transpose_any(1, {3, 5}, {5, 1}, {1, 3})
+ test_transpose_any(2, {1, 2048}, {2048, 1}, {2048, 1});
} }
...@@ -46,6 +46,7 @@ std::optional<RearrangeMeta> RearrangeMeta::create( ...@@ -46,6 +46,7 @@ std::optional<RearrangeMeta> RearrangeMeta::create(
} }
return std::abs(a.dst) > std::abs(b.dst); return std::abs(a.dst) > std::abs(b.dst);
}); });
ndim = dims.size();
// # 合并连续维度 // # 合并连续维度
// ## 合并末尾连续维度到 unit // ## 合并末尾连续维度到 unit
for (auto it = dims.rbegin(); it != dims.rend(); ++it) { for (auto it = dims.rbegin(); it != dims.rend(); ++it) {
......
# InfiniOP 测例生成
## 介绍
使用 python 脚本生成包含测例的 `.gguf` 文件,并使用 `infiniop-test` 程序进行测试。
## 运行方式
- 编译 `infiniop-test` 程序
```bash
xmake build infiniop-test
```
- 生成测例
在 `/test/infiniop-test/` 目录下执行矩阵乘测例生成脚本,执行结束以后会在该目录下生成 `matmul.gguf` 测例文件。
```bash
cd /test/infiniop-test/
python -m test_generate.testcases.matmul
```
- 测试测例
打印测试程序用法
```bash
infiniop-test --help
```
示例:在CPU上测试`matmul.gguf`测例文件,预热20次,测试1000次。
```bash
infiniop-test matmul.gguf --cpu --warmup 20 --run 1000
```
## 自定义测例
### GGUF文件格式
```text
GGUF File Contents:
Version: 3
Number of Meta KVs: 8
Number of Tensors: 4
Meta KVs:
Key: general.architecture, Type: GGUF_TYPE_STRING, Value: infiniop-test
Key: test_count, Type: GGUF_TYPE_UINT64, Value: 1
Key: test.0.op_name, Type: GGUF_TYPE_STRING, Value: matmul
Key: test.0.a.strides, Type: GGUF_TYPE_INT32, Value: [1, 5]
Key: test.0.b.strides, Type: GGUF_TYPE_INT32, Value: [1, 6]
Key: test.0.c.strides, Type: GGUF_TYPE_INT32, Value: [1, 6]
Key: test.0.alpha, Type: GGUF_TYPE_FLOAT32, Value: 1.000000
Key: test.0.beta, Type: GGUF_TYPE_FLOAT32, Value: 0.000000
Tensor INFOs:
Name: test.0.a, NDims: 2, Shape: [5, 4], DataType: F32, DataOffset: 0
Name: test.0.b, NDims: 2, Shape: [6, 5], DataType: F32, DataOffset: 96
Name: test.0.c, NDims: 2, Shape: [6, 4], DataType: F32, DataOffset: 224
Name: test.0.ans, NDims: 2, Shape: [6, 4], DataType: F64, DataOffset: 320
```
- `Meta` 中必须包含 `test_count` ,表示测例数量。
- 每个测例的 `Meta` 和 `Tensor` 名字以 `test.[id].` 开头,后接具体信息名称。数字 `[id]` 表示测例编号,编号必须为 0 到 test_count-1。
- `Tensor` 名字后接 `.strides` 的 `Meta` 键表示该张量的步长,若没有对应键则默认该张量为连续存储。
from .infiniop_test import InfiniopTestCase, InfiniopTestWriter, np_dtype_to_ggml, gguf_strides
import gguf
from typing import List
import numpy as np
from gguf import GGMLQuantizationType
def np_dtype_to_ggml(tensor_dtype: np.dtype):
    """Translate a numpy dtype into the matching GGMLQuantizationType.

    Raises ValueError for any dtype outside the supported set
    (F16, F32, F64, I8, I16, I32, I64).
    """
    # Pairs are compared with == rather than used as dict keys because
    # np.dtype objects compare equal to the scalar types without sharing
    # a hash bucket (e.g. np.dtype('float32') == np.float32).
    supported = (
        (np.float16, GGMLQuantizationType.F16),
        (np.float32, GGMLQuantizationType.F32),
        (np.float64, GGMLQuantizationType.F64),
        (np.int8, GGMLQuantizationType.I8),
        (np.int16, GGMLQuantizationType.I16),
        (np.int32, GGMLQuantizationType.I32),
        (np.int64, GGMLQuantizationType.I64),
    )
    for np_type, ggml_type in supported:
        if tensor_dtype == np_type:
            return ggml_type
    raise ValueError(
        "Only F16, F32, F64, I8, I16, I32, I64 tensors are supported for now"
    )
def gguf_strides(*args: int) -> list[int] | None:
return list(args)[::-1] if args else None
class InfiniopTestCase:
    """Base class for one test case serialized into a GGUF test file.

    Subclasses override write_test() to add their attributes and tensors;
    this base implementation records only the operator name.
    """

    # Operator name stored under the `test.[id].op_name` GGUF key.
    op_name: str

    def __init__(self, op_name: str):
        self.op_name = op_name

    def write_test(self, test_writer: "InfiniopTestWriter"):
        # Record the op name under this test's namespaced key.
        key = test_writer.gguf_key("op_name")
        test_writer.add_string(key, self.op_name)
class InfiniopTestWriter(gguf.GGUFWriter):
    """GGUF writer that queues InfiniopTestCase objects and serializes them.

    Cases are buffered by add_test()/add_tests() and written only when
    save() is called; gguf_key() namespaces keys for the test currently
    being written.
    """

    _test_cases: List["InfiniopTestCase"]
    _written_tests = 0

    def __init__(self, filepath):
        super().__init__(filepath, "infiniop-test")
        self._test_cases = []
        self._written_tests = 0

    def add_test(self, test_case: "InfiniopTestCase"):
        # Queue one case; nothing is written until save().
        self._test_cases.append(test_case)

    def add_tests(self, test_cases: List["InfiniopTestCase"]):
        # Queue several cases at once.
        self._test_cases.extend(test_cases)

    def gguf_key(self, name: str) -> str:
        # Key namespaced to the test currently being written, e.g. "test.3.a".
        return f"test.{self._written_tests}.{name}"

    def save(self):
        """Serialize every queued case and finish writing the GGUF file."""
        super().add_uint64("test_count", len(self._test_cases))
        for case in self._test_cases:
            # write_test() reads _written_tests via gguf_key(), so the
            # counter must advance only after each case is written.
            case.write_test(self)
            self._written_tests += 1
        super().write_header_to_file()
        super().write_kv_data_to_file()
        super().write_tensors_to_file()
        super().close()
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment