Unverified commit 9a05446f authored by PanZezhong1725, committed by GitHub

issue/461 InfiniCore inference runtime


Co-authored-by: Jiacheng Huang <huangjiacheng0709@outlook.com>
Co-authored-by: wooway777 <wooway777@gmail.com>
parent 37411f6d
@@ -19,6 +19,9 @@ jobs:
      - name: checkout code
        uses: actions/checkout@v4
        with:
          submodules: recursive
          fetch-depth: 0
      - name: Check Format
        run: |
......
@@ -10,6 +10,7 @@ build/
# Python
__pycache__/
*.egg-info/
# Log
*.log
@@ -22,3 +23,8 @@ cache/
#GGUF
*.gguf
# Compressed
*.gz
*.zip
*.tar
[submodule "third_party/spdlog"]
path = third_party/spdlog
url = https://github.com/gabime/spdlog.git
@@ -28,6 +28,20 @@ API definitions and usage are described in the [`InfiniCore documentation`](https://github.com/Infin
## Configuration and Usage
### Submodules
Because this repository contains submodules, add `--recursive` or `--recurse-submodules` when cloning, for example:
```shell
git clone --recursive https://github.com/InfiniTensor/InfiniCore.git
```
Alternatively, after a plain clone, initialize and update the submodules:
```shell
git submodule update --init --recursive
```
### One-Step Installation
The `script/` directory provides an `install.py` installation script, used as follows:
......
#pragma once
#include "infinicore/ops.hpp"
#include "infinicore/tensor.hpp"
#pragma once
#include <cstddef>
#include <functional>
#include <iostream>
#include <limits>
#include <list>
#include <optional>
#include <stdexcept>
#include <typeinfo>
#include <unordered_map>
#include <utility>
namespace infinicore::common {
template <typename Key, typename Value>
class LRUCache {
public:
using KeyValuePair = std::pair<Key, Value>;
using ListIt = typename std::list<KeyValuePair>::iterator;
using Destructor = std::function<void(Value &)>;
explicit LRUCache(size_t capacity = 100, Destructor destructor = nullptr)
: capacity_(capacity), destructor_(destructor) {
if (capacity == 0) {
// Treat capacity 0 as effectively unbounded.
capacity_ = std::numeric_limits<size_t>::max();
}
}
~LRUCache() {
cleanup();
}
bool contains(const Key &key) const {
return map_.find(key) != map_.end();
}
void put(const Key &key, const Value &value) {
auto it = map_.find(key);
if (it != map_.end()) {
if (destructor_) {
destructor_(it->second->second);
}
it->second->second = value;
touch(it);
} else {
// insert new
if (list_.size() >= capacity_) {
evictLRU();
}
list_.emplace_front(key, value);
map_[key] = list_.begin();
}
}
std::optional<Value> get(const Key &key) {
auto it = map_.find(key);
if (it == map_.end()) {
return std::nullopt;
}
touch(it);
return it->second->second;
}
std::optional<Value> get(const Key &key) const {
auto it = map_.find(key);
if (it == map_.end()) {
return std::nullopt;
}
// Note: can't touch in const context
return it->second->second;
}
void setDestructor(Destructor destructor) {
destructor_ = destructor;
}
void setCapacity(size_t capacity) {
capacity_ = capacity;
while (list_.size() > capacity_) {
evictLRU();
}
}
void clear() {
if (destructor_) {
for (auto &item : list_) {
safeDestruct(item.second);
}
}
list_.clear();
map_.clear();
}
const std::list<KeyValuePair> &getAllItems() const {
return list_;
}
protected:
std::list<KeyValuePair> list_; // front = most recent, back = least
private:
void touch(typename std::unordered_map<Key, ListIt>::iterator it) {
// move this key to front (most recent)
list_.splice(list_.begin(), list_, it->second);
it->second = list_.begin();
}
void safeDestruct(Value &value) {
if (!destructor_) {
return;
}
try {
destructor_(value);
} catch (const std::exception &e) {
// Report and swallow exceptions thrown by the user-provided destructor
std::cerr << "Cache destructor error (type: " << typeid(Value).name()
<< "): " << e.what() << std::endl;
}
}
void evictLRU() {
if (!list_.empty()) {
auto &kv = list_.back();
safeDestruct(kv.second);
map_.erase(kv.first);
list_.pop_back();
}
}
void cleanup() {
clear();
}
size_t capacity_;
std::unordered_map<Key, ListIt> map_;
Destructor destructor_;
};
} // namespace infinicore::common
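To make the eviction behavior concrete, here is a minimal usage sketch assuming only the header above; the keys, values, `main` scaffolding, and include path are illustrative, not part of the commit:

```cpp
#include "LRUCache.hpp" // include path assumed
#include <iostream>
#include <string>

int main() {
    // Capacity 2: inserting a third key evicts the least-recently-used entry
    // and runs the user-supplied destructor on its value.
    infinicore::common::LRUCache<std::string, int> cache(
        2, [](int &v) { std::cout << "evicting value " << v << "\n"; });

    cache.put("a", 1);
    cache.put("b", 2);
    cache.get("a");    // touches "a", so "b" becomes least recently used
    cache.put("c", 3); // evicts "b", printing "evicting value 2"

    std::cout << std::boolalpha
              << cache.contains("a") << "\n"  // true
              << cache.contains("b") << "\n"; // false (evicted)
    return 0;
}
```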
#pragma once
#include "../tensor.hpp"
#include <type_traits>
namespace infinicore {
// Base hash_combine for arithmetic types
template <typename T>
std::enable_if_t<std::is_arithmetic_v<T>, void>
hash_combine(size_t &seed, const T &value) {
seed ^= std::hash<T>{}(value) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
}
// Overload for Tensor: hashes dtype, shape, and strides
inline void hash_combine(size_t &seed, Tensor tensor) {
hash_combine(seed, static_cast<size_t>(tensor->dtype()));
for (Size shape : tensor->shape()) {
hash_combine(seed, shape);
}
for (Stride stride : tensor->strides()) {
hash_combine(seed, static_cast<size_t>(stride));
}
}
// Overload for std::string
inline void hash_combine(size_t &seed, const std::string &str) {
hash_combine(seed, std::hash<std::string>{}(str));
}
// Overload for const char*, routed through std::string
inline void hash_combine(size_t &seed, const char *str) {
hash_combine(seed, std::string(str));
}
// Variadic template for multiple arguments
template <typename First, typename... Rest>
void hash_combine(size_t &seed, const First &first, const Rest &...rest) {
hash_combine(seed, first);
hash_combine(seed, rest...);
}
// Base case for variadic template
inline void hash_combine(size_t &seed) {
// Base case - do nothing
}
// Convenience function to hash multiple values
template <typename... Types>
size_t hash_combine(const Types &...values) {
size_t seed = 0;
hash_combine(seed, values...);
return seed;
}
} // namespace infinicore
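To show how the overloads compose, a brief sketch assuming only this header; the sample values and include path are illustrative:

```cpp
#include "hash.hpp" // include path assumed
#include <iostream>

int main() {
    using infinicore::hash_combine;

    // The convenience overload seeds with 0 and folds the values left to right.
    size_t key = hash_combine(size_t{2}, size_t{3}, "matmul");

    // Equivalent explicit seed-threading form.
    size_t seed = 0;
    hash_combine(seed, size_t{2});
    hash_combine(seed, size_t{3});
    hash_combine(seed, "matmul"); // routed through the const char* overload
    std::cout << (seed == key) << "\n"; // 1
    return 0;
}
```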
#pragma once
#include "../device.hpp"
#include "../memory.hpp"
#include <infiniop.h>
#include <infinirt.h>
#include <memory>
namespace infinicore {
namespace context {
void setDevice(Device device);
Device getDevice();
size_t getDeviceCount(Device::Type type);
infinirtStream_t getStream();
infiniopHandle_t getInfiniopHandle();
void syncStream();
void syncDevice();
std::shared_ptr<Memory> allocateMemory(size_t size);
std::shared_ptr<Memory> allocateHostMemory(size_t size);
std::shared_ptr<Memory> allocatePinnedHostMemory(size_t size);
void memcpyH2D(void *dst, const void *src, size_t size);
void memcpyD2H(void *dst, const void *src, size_t size);
void memcpyD2D(void *dst, const void *src, size_t size);
void memcpyH2H(void *dst, const void *src, size_t size);
} // namespace context
} // namespace infinicore
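For orientation, a minimal sketch of driving this context API; the include path and buffer size are assumptions, not part of the commit:

```cpp
#include "infinicore/context/context.hpp" // include path assumed
#include <vector>

using namespace infinicore;

int main() {
    // Select a device; CPU index 0 is assumed to exist.
    context::setDevice(Device(Device::Type::CPU, 0));

    // Allocate memory through the context and copy a host buffer in.
    std::vector<float> host(1024, 1.0f);
    size_t bytes = host.size() * sizeof(float);
    auto mem = context::allocateMemory(bytes);
    context::memcpyH2D(mem->data(), host.data(), bytes);

    // Copies are issued on the current stream; synchronize before reading back.
    context::syncStream();
    return 0;
}
```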
#pragma once
#include <cstdint>
#include <string>
#include "infinicore.h"
namespace infinicore {
class Device {
@@ -11,20 +12,31 @@ public:
using Index = std::size_t;
enum class Type {
CPU = INFINI_DEVICE_CPU,
NVIDIA = INFINI_DEVICE_NVIDIA,
CAMBRICON = INFINI_DEVICE_CAMBRICON,
ASCEND = INFINI_DEVICE_ASCEND,
METAX = INFINI_DEVICE_METAX,
MOORE = INFINI_DEVICE_MOORE,
ILUVATAR = INFINI_DEVICE_ILUVATAR,
KUNLUN = INFINI_DEVICE_KUNLUN,
SUGON = INFINI_DEVICE_SUGON,
COUNT = INFINI_DEVICE_TYPE_COUNT,
};
Device(const Type &type = Type::CPU, const Index &index = 0);
const Type &getType() const;
const Index &getIndex() const;
std::string toString() const;
static std::string toString(const Type &type);
bool operator==(const Device &other) const;
bool operator!=(const Device &other) const;
private:
Type type_;
@@ -33,5 +45,3 @@ private:
};
} // namespace infinicore
#pragma once
#include <infinicore.h>
#include <string>
namespace infinicore {
enum class DataType {
BYTE = INFINI_DTYPE_BYTE,
BOOL = INFINI_DTYPE_BOOL,
I8 = INFINI_DTYPE_I8,
I16 = INFINI_DTYPE_I16,
I32 = INFINI_DTYPE_I32,
I64 = INFINI_DTYPE_I64,
U8 = INFINI_DTYPE_U8,
U16 = INFINI_DTYPE_U16,
U32 = INFINI_DTYPE_U32,
U64 = INFINI_DTYPE_U64,
F8 = INFINI_DTYPE_F8,
F16 = INFINI_DTYPE_F16,
F32 = INFINI_DTYPE_F32,
F64 = INFINI_DTYPE_F64,
C16 = INFINI_DTYPE_C16,
C32 = INFINI_DTYPE_C32,
C64 = INFINI_DTYPE_C64,
C128 = INFINI_DTYPE_C128,
BF16 = INFINI_DTYPE_BF16,
};
std::string toString(const DataType &dtype);
size_t dsize(const DataType &dtype);
} // namespace infinicore
#pragma once
#include "device.hpp"
#include <cstddef>
#include <functional>
namespace infinicore {
class Memory {
public:
using Deleter = std::function<void(std::byte *)>;
Memory(std::byte *data, size_t size, Device device, Deleter deleter, bool pin_memory = false);
~Memory();
std::byte *data();
Device device() const;
size_t size() const;
bool is_pinned() const;
private:
std::byte *data_;
size_t size_;
Device device_;
Deleter deleter_;
bool is_pinned_;
};
} // namespace infinicore
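As a sketch of the ownership model, one way to wrap an existing host allocation in a `Memory` object; the `malloc` buffer and include path are illustrative:

```cpp
#include "infinicore/memory.hpp" // include path assumed
#include <cstdlib>
#include <memory>

using namespace infinicore;

int main() {
    // Wrap a raw host buffer; the deleter runs when the last owner releases it.
    size_t size = 4096;
    auto *buf = static_cast<std::byte *>(std::malloc(size));

    auto mem = std::make_shared<Memory>(
        buf, size, Device(Device::Type::CPU, 0),
        [](std::byte *p) { std::free(p); });

    // The buffer stays alive as long as any shared_ptr to `mem` does.
    return 0;
}
```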
#pragma once
#include "../../common/LRUCache.hpp"
#include "../../context/context.hpp"
#include <array>
#include <functional>
#include <memory>
#include <vector>
namespace infinicore::op::common {
template <typename Key, typename Value>
class OpCache {
private:
using BaseCache = infinicore::common::LRUCache<Key, Value>;
using Destructor = typename BaseCache::Destructor;
using CacheVector = std::vector<BaseCache>;
public:
explicit OpCache(size_t capacity = 100, Destructor destructor = nullptr)
: capacity_(capacity), destructor_(destructor) {}
~OpCache() {
clear();
}
BaseCache &getCache(Device::Type device_type, size_t device_index) {
auto &cache_vector = caches_[static_cast<size_t>(device_type)];
if (cache_vector.size() <= device_index) {
cache_vector.resize(device_index + 1, BaseCache(capacity_, destructor_));
} else {
cache_vector[device_index].setDestructor(destructor_);
}
return cache_vector[device_index];
}
void setCapacity(size_t capacity) {
capacity_ = capacity;
for (auto &vec : caches_) {
for (auto &cache : vec) {
cache.setCapacity(capacity);
}
}
}
void clear() {
Device current_device = context::getDevice();
for (size_t type_idx = 0; type_idx < caches_.size(); ++type_idx) {
auto &vec = caches_[type_idx];
for (size_t dev_idx = 0; dev_idx < vec.size(); ++dev_idx) {
Device target_device(static_cast<Device::Type>(type_idx), dev_idx);
if (current_device != target_device) {
context::setDevice(target_device);
}
vec[dev_idx].clear();
if (current_device != target_device) {
context::setDevice(current_device);
}
}
vec.clear();
}
caches_ = {};
}
private:
size_t capacity_;
Destructor destructor_;
std::array<CacheVector, static_cast<size_t>(Device::Type::COUNT)> caches_ = {};
};
} // namespace infinicore::op::common
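A rough sketch of how `OpCache` and `hash_combine` might cooperate to memoize per-device state; the include paths, the `get_descriptor` helper, and the `void *` descriptor type are hypothetical:

```cpp
#include "cache.hpp" // this header; include path assumed
#include "hash.hpp"  // hash_combine; include path assumed

using namespace infinicore;

// One LRU cache per (device type, device index), keyed by a shape hash.
static op::common::OpCache<size_t, void *> descriptor_cache(
    /*capacity=*/100,
    /*destructor=*/[](void *&desc) { /* release the descriptor here */ });

void *get_descriptor(Device device, size_t m, size_t n) {
    auto &cache = descriptor_cache.getCache(device.getType(), device.getIndex());
    size_t key = hash_combine(m, n);
    if (auto hit = cache.get(key)) {
        return *hit; // reuse a previously built descriptor
    }
    void *desc = nullptr; // build the real descriptor here
    cache.put(key, desc);
    return desc;
}
```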
#pragma once
#include "../../device.hpp"
#include <array>
#include <cstddef>
#include <initializer_list>
namespace infinicore::op::common {
template <typename Fn>
class OpDispatcher {
public:
void registerDevice(Device::Type device_type, Fn fn, bool override_existing = true) {
if (table_[static_cast<size_t>(device_type)] == nullptr || override_existing) {
table_[static_cast<size_t>(device_type)] = fn;
}
}
void registerDevice(std::initializer_list<Device::Type> device_types, Fn fn, bool override_existing = true) {
for (auto device_type : device_types) {
registerDevice(device_type, fn, override_existing);
}
}
void registerAll(Fn fn, bool override_existing = true) {
for (size_t device_type = 0; device_type < static_cast<size_t>(Device::Type::COUNT); ++device_type) {
registerDevice(static_cast<Device::Type>(device_type), fn, override_existing);
}
}
Fn lookup(Device::Type device_type) const {
return table_.at(static_cast<size_t>(device_type));
}
private:
std::array<Fn, static_cast<size_t>(Device::Type::COUNT)> table_ = {}; // value-initialize so unregistered entries read as nullptr
};
} // namespace infinicore::op::common
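A minimal sketch of registering and dispatching a kernel; the include path and kernel functions are hypothetical:

```cpp
#include "dispatcher.hpp" // this header; include path assumed
#include <iostream>

using namespace infinicore;

using kernel_t = void (*)(int);

void cpu_kernel(int n) { std::cout << "cpu kernel, n=" << n << "\n"; }
void generic_kernel(int n) { std::cout << "generic kernel, n=" << n << "\n"; }

int main() {
    op::common::OpDispatcher<kernel_t> dispatcher;

    // Register a fallback for every device type, then specialize the CPU entry;
    // pass override_existing=false to keep entries that are already registered.
    dispatcher.registerAll(generic_kernel);
    dispatcher.registerDevice(Device::Type::CPU, cpu_kernel);

    dispatcher.lookup(Device::Type::CPU)(42);    // cpu kernel
    dispatcher.lookup(Device::Type::NVIDIA)(42); // generic fallback
    return 0;
}
```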
#pragma once
#include "../../context/context.hpp"
#include "../../tensor.hpp"
#include "dispatcher.hpp"
#pragma once
#include "../device.hpp"
#include "common/op.hpp"
namespace infinicore::op {
class Matmul {
public:
using schema = void (*)(Tensor, Tensor, Tensor);
static void execute(Tensor c, Tensor a, Tensor b);
static common::OpDispatcher<schema> &dispatcher();
};
Tensor matmul(Tensor a, Tensor b);
void matmul_(Tensor c, Tensor a, Tensor b);
} // namespace infinicore::op
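The two free functions wrap `Matmul::execute`; a sketch of the intended call pattern, assuming the tensor factories from `tensor.hpp` (the umbrella include and shapes are illustrative):

```cpp
#include "infinicore.hpp" // umbrella header; include path assumed

using namespace infinicore;

int main() {
    Device dev(Device::Type::CPU, 0);

    // c = a x b with a: [2, 3] and b: [3, 4], giving c: [2, 4].
    Tensor a = Tensor::ones({2, 3}, DataType::F32, dev);
    Tensor b = Tensor::ones({3, 4}, DataType::F32, dev);

    Tensor c = op::matmul(a, b); // allocating form
    op::matmul_(c, a, b);        // in-place form writes into an existing tensor
    return 0;
}
```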
#pragma once
#include "common/op.hpp"
namespace infinicore::op {
class Ones {
public:
using schema = void (*)(Tensor);
static void execute(Tensor output);
static common::OpDispatcher<schema> &dispatcher();
};
Tensor ones();
void ones_(Tensor output);
} // namespace infinicore::op
#pragma once
#include "../device.hpp"
#include "common/op.hpp"
namespace infinicore::op {
class Rearrange {
public:
using schema = void (*)(Tensor, Tensor);
static void execute(Tensor y, Tensor x);
static common::OpDispatcher<schema> &dispatcher();
};
Tensor rearrange(Tensor x);
void rearrange_(Tensor y, Tensor x);
} // namespace infinicore::op
#pragma once
#include "op/matmul.hpp"
#include "op/ones.hpp"
#include "op/rearrange.hpp"
#pragma once
#include "device.hpp"
#include "dtype.hpp"
#include "memory.hpp"
#include <algorithm>
#include <cassert>
#include <functional>
#include <memory>
#include <vector>
#include <infiniop.h>
namespace infinicore {
using Size = std::size_t;
using Stride = std::ptrdiff_t;
using Shape = std::vector<Size>;
using Strides = std::vector<Stride>;
class TensorImpl;
struct TensorMetaData {
Shape shape;
Strides strides;
DataType dtype;
infiniopTensorDescriptor_t desc;
TensorMetaData(const Shape &shape, const Strides &strides, const DataType &dtype);
};
struct TensorData {
size_t offset;
std::shared_ptr<Memory> memory;
};
struct TensorSliceParams {
size_t dim;
size_t start;
Size len;
};
class Tensor {
public:
static Tensor empty(const Shape &shape,
const DataType &dtype,
const Device &device,
bool pin_memory = false);
static Tensor strided_empty(const Shape &shape,
const Strides &strides,
const DataType &dtype,
const Device &device,
bool pin_memory = false);
static Tensor zeros(const Shape &shape,
const DataType &dtype,
const Device &device,
bool pin_memory = false);
static Tensor ones(const Shape &shape,
const DataType &dtype,
const Device &device,
bool pin_memory = false);
static Tensor from_blob(void *raw_ptr,
const Shape &shape,
const DataType &dtype,
const Device &device);
static Tensor strided_from_blob(void *raw_ptr,
const Shape &shape,
const Strides &strides,
const DataType &dtype,
const Device &device);
Tensor(const Tensor &) = default;
Tensor(Tensor &&) = default;
Tensor &operator=(const Tensor &) = default;
Tensor &operator=(Tensor &&) = default;
TensorImpl *operator->();
const TensorImpl *operator->() const;
protected:
explicit Tensor(std::shared_ptr<TensorImpl> impl) : impl_(std::move(impl)) {}
std::shared_ptr<TensorImpl> impl_;
friend class TensorImpl;
};
class TensorImpl : public std::enable_shared_from_this<TensorImpl> {
public:
TensorImpl(const Shape &shape, const DataType &dtype);
TensorImpl(const Shape &shape, const Strides &strides, const DataType &dtype);
std::byte *data();
const std::byte *data() const;
const Shape &shape() const;
const Strides &strides() const;
bool is_contiguous() const;
Size ndim() const;
Size numel() const;
Size size(size_t dim) const;
Stride stride(size_t dim) const;
DataType dtype() const;
Device device() const;
infiniopTensorDescriptor_t desc() const;
bool is_pinned() const;
std::string info() const;
///
/// Data Transfer APIs
///
/**
* Returns a new tensor with the same data on a different device.
* If the new device passed is same as the current device, the original tensor is returned.
*
* @param device The device of the new tensor
*
* @return A new tensor with the same data on the specified device
*/
Tensor to(Device device) const;
/**
* Copies data from another tensor into this tensor.
* Currently, only contiguous tensors of the same dtype and shape are supported.
*
* @param src The source tensor to copy from
*/
void copy_from(Tensor src);
/**
* Returns a tensor with the same data as the current tensor in a contiguous arrangement.
* If this tensor is already contiguous, the original tensor is returned.
*
* @return A contiguous tensor with the same data
*/
Tensor contiguous() const;
///
/// View APIs
///
/**
* Returns a new tensor that is a narrowed version of the current tensor.
* The returned tensor shares the same underlying storage with the original tensor.
*
* @param slices A vector of slice parameters specifying the dimension, start index,
* and length for each dimension to narrow
* @return A new tensor with narrowed dimensions
*
* Example:
* // Narrow dimension 0 from index 2 to 5 (length 3)
* // and dimension 1 from index 1 to 3 (length 2)
* tensor.narrow({{0, 2, 3}, {1, 1, 2}});
*/
Tensor narrow(const std::vector<TensorSliceParams> &slices) const;
/**
* Returns a new tensor with the dimensions permuted (reordered) according to the given order.
* The returned tensor shares the same underlying storage with the original tensor.
*
* @param order The desired ordering of dimensions
* @return A new tensor with permuted dimensions
*
* Example:
* // For a 3D tensor with shape [2, 3, 4], permute to [2, 0, 1]
* // This swaps the dimensions: dim0->dim2, dim1->dim0, dim2->dim1
* tensor->permute({2, 0, 1});
*/
Tensor permute(const Shape &order) const;
/**
* Returns a new tensor with the same data but a different shape.
* The returned tensor shares the same underlying storage with the original tensor.
* The tensor is rearranged if the new shape is not compatible with the current shape.
*
* @param new_shape The desired new shape
* @return A new tensor with the specified shape
*
* Example:
* // Reshape a 2x3 tensor (6 elements) to a 3x2 tensor
* tensor->view({3, 2});
*/
Tensor view(const Shape &new_shape) const;
/**
* Returns a new tensor with the specified shape and strides, without checking them against the underlying storage (unsafe).
* The returned tensor shares the same underlying storage with the original tensor.
*
* @param new_shape The desired new shape
* @param new_strides The desired new strides
* @return A new tensor with the specified shape and strides
*
* Example:
* // Create a non-contiguous view with custom strides
* tensor->as_strided({2, 3}, {6, 2}); // Stride of 6 for dim0, 2 for dim1
*/
Tensor as_strided(const Shape &new_shape, const Strides &new_strides) const;
protected:
static std::shared_ptr<TensorImpl> empty(
const Shape &shape,
const DataType &dtype,
const Device &device,
bool pin_memory = false);
static std::shared_ptr<TensorImpl> strided_empty(
const Shape &shape,
const Strides &strides,
const DataType &dtype,
const Device &device,
bool pin_memory = false);
static std::shared_ptr<TensorImpl> zeros(
const Shape &shape,
const DataType &dtype,
const Device &device,
bool pin_memory = false);
static std::shared_ptr<TensorImpl> ones(
const Shape &shape,
const DataType &dtype,
const Device &device,
bool pin_memory = false);
static std::shared_ptr<TensorImpl> from_blob(
void *raw_ptr,
const Shape &shape,
const DataType &dtype,
const Device &device);
static std::shared_ptr<TensorImpl> strided_from_blob(
void *raw_ptr,
const Shape &shape,
const Strides &strides,
const DataType &dtype,
const Device &device);
friend class Tensor;
private:
TensorMetaData meta_;
TensorData data_;
};
} // namespace infinicore
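A short sketch exercising the factory and view APIs documented above; the include path and shapes are illustrative:

```cpp
#include "infinicore/tensor.hpp" // include path assumed
#include <iostream>

using namespace infinicore;

int main() {
    Device dev(Device::Type::CPU, 0);
    Tensor t = Tensor::zeros({2, 3, 4}, DataType::F32, dev);

    // Views share storage with the original tensor.
    Tensor p = t->permute({2, 0, 1});  // shape [4, 2, 3], non-contiguous
    Tensor v = t->view({6, 4});        // shape [6, 4]
    Tensor n = t->narrow({{0, 0, 1}}); // dim 0, start 0, length 1 -> [1, 3, 4]

    // Materialize a contiguous copy of a strided view.
    Tensor pc = p->contiguous();
    std::cout << pc->info() << "\n";
    return 0;
}
```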
from infinicore.device import device
from infinicore.dtype import (
bfloat16,
bool,
cdouble,
cfloat,
chalf,
complex32,
complex64,
complex128,
double,
float,
float16,
float32,
float64,
half,
int,
int8,
int16,
int32,
int64,
long,
short,
uint8,
)
from infinicore.ops.matmul import matmul
from infinicore.ops.rearrange import rearrange
from infinicore.tensor import (
empty,
from_blob,
ones,
strided_empty,
strided_from_blob,
zeros,
)
__all__ = [
# Classes.
"device",
# Data Types.
"bfloat16",
"bool",
"cdouble",
"cfloat",
"chalf",
"complex32",
"complex64",
"complex128",
"double",
"float",
"float16",
"float32",
"float64",
"half",
"int",
"int8",
"int16",
"int32",
"int64",
"long",
"short",
"uint8",
# Operations.
"matmul",
"rearrange",
"empty",
"from_blob",
"ones",
"strided_empty",
"strided_from_blob",
"zeros",
]