Merge pull request #22 from InfiniTensor/issue/21

Issue/21 - Inference Process Modularization

Merge pull request #22 from InfiniTensor/issue/21
Issue/21 - Inference Process Modularization
07aa6990 · PanZezhong1725 · GitHub · be0e66ef · bfae3bbb · 07aa6990
Unverified Commit 07aa6990 authored Aug 11, 2025 by PanZezhong1725 Committed by GitHub Aug 11, 2025
10 changed files
--- a/src/models/cache_manager.hpp
+++ b/src/models/cache_manager.hpp
+#ifndef CACHE_MANAGER_HPP
+#define CACHE_MANAGER_HPP
+
+#include <functional>
+#include <memory>
+#include <unordered_map>
+#include <vector>
+
+#include "../tensor.hpp"
+#include "../utils.hpp"
+#include "infinicore_infer.h"
+
+/// Type-erased interface for releasing a descriptor held by a cache.
+class IDescriptorDestroyer {
+public:
+    /// Release the descriptor identified by `descriptor`; how the opaque pointer
+    /// is interpreted is left to the implementation.
+    virtual void destroy(void *descriptor) = 0;
+
+    virtual ~IDescriptorDestroyer() = default;
+};
+
+/// Adapts a plain `infiniStatus_t destroy(DescriptorType)` function to the
+/// IDescriptorDestroyer interface.
+template <typename DescriptorType>
+class DescriptorDestroyer : public IDescriptorDestroyer {
+    using DestroyFunc = infiniStatus_t (*)(DescriptorType);
+
+    // Function invoked for every descriptor handed to destroy().
+    DestroyFunc destroyFunc;
+
+public:
+    DescriptorDestroyer(DestroyFunc func) : destroyFunc(func) {}
+
+    /// `descriptor` must point at a DescriptorType. The pointee is passed by
+    /// value to the wrapped function and the returned status is discarded.
+    void destroy(void *descriptor) override {
+        auto *typed = static_cast<DescriptorType *>(descriptor);
+        destroyFunc(*typed);
+    }
+};
+
+/**
+ * Fixed-capacity, least-recently-used cache from a size_t key to a descriptor.
+ *
+ * Entries live in a doubly linked list bracketed by two sentinel nodes (`head`
+ * and `tail`): the node right after `head` is the most recently used one, the
+ * node right before `tail` is the next eviction victim. `cache` maps each key
+ * to its list node. Whenever an entry is evicted, overwritten, or the cache is
+ * destroyed, the stored descriptor is handed to `destroyer`.
+ *
+ * The class performs no locking of its own and cannot be copied.
+ */
+template <typename DescriptorType>
+class LRUDescriptorCache {
+private:
+    struct CacheNode {
+        size_t key;
+        DescriptorType desc;
+        CacheNode *prev;
+        CacheNode *next;
+
+        CacheNode() : key(0), desc(), prev(nullptr), next(nullptr) {}
+        CacheNode(size_t k, const DescriptorType &d) : key(k), desc(d), prev(nullptr), next(nullptr) {}
+    };
+
+    std::unordered_map<size_t, CacheNode *> cache;
+    CacheNode *head; // sentinel in front of the most recently used entry
+    CacheNode *tail; // sentinel behind the least recently used entry
+    const size_t capacity;
+    size_t size;
+    std::unique_ptr<IDescriptorDestroyer> destroyer;
+
+    // Detach `node` from the list; the map and the descriptor are left alone.
+    static void unlink(CacheNode *node) {
+        node->prev->next = node->next;
+        node->next->prev = node->prev;
+    }
+
+    // Splice a detached `node` in directly behind the head sentinel.
+    void linkAtTop(CacheNode *node) {
+        node->next = head->next;
+        node->prev = head;
+        head->next->prev = node;
+        head->next = node;
+    }
+
+    // Drop an entry completely: unlink it, destroy its descriptor, forget its key.
+    // Must never be called on a sentinel (their prev/next are partly null).
+    void removeNode(CacheNode *node) {
+        unlink(node);
+        if (destroyer) {
+            destroyer->destroy(&node->desc);
+        }
+        cache.erase(node->key);
+        delete node;
+        --size;
+    }
+
+    // Register a brand-new node as the most recently used entry.
+    // The caller is responsible for making room first.
+    void addToTop(CacheNode *node) {
+        linkAtTop(node);
+        cache[node->key] = node;
+        ++size;
+    }
+
+    // Mark an existing entry as most recently used.
+    void moveToTop(CacheNode *node) {
+        unlink(node);
+        linkAtTop(node);
+    }
+
+public:
+    /**
+     * @param c           Maximum number of cached descriptors. A value of 0 is
+     *                    raised to 1: with no room at all, put() would pick the
+     *                    head sentinel as eviction victim and dereference its
+     *                    null `prev` pointer.
+     * @param destroyFunc Function called with each descriptor that leaves the cache.
+     */
+    template <typename DestroyFunc>
+    LRUDescriptorCache(size_t c, DestroyFunc destroyFunc)
+        : capacity(c > 0 ? c : 1), size(0), destroyer(std::make_unique<DescriptorDestroyer<DescriptorType>>(destroyFunc)) {
+        head = new CacheNode();
+        tail = new CacheNode();
+        head->next = tail;
+        tail->prev = head;
+    }
+
+    // Destroys every descriptor still cached, then frees the sentinels.
+    ~LRUDescriptorCache() {
+        while (head->next != tail) {
+            removeNode(head->next);
+        }
+        delete head;
+        delete tail;
+    }
+
+    /**
+     * Look up `key`. On a hit the entry becomes the most recently used one, its
+     * descriptor is copied into `out_desc` and true is returned. On a miss
+     * `out_desc` is left untouched and false is returned.
+     */
+    bool get(size_t key, DescriptorType &out_desc) {
+        auto it = cache.find(key);
+        if (it == cache.end()) {
+            return false;
+        }
+
+        CacheNode *node = it->second;
+        moveToTop(node);
+        out_desc = node->desc;
+        return true;
+    }
+
+    /**
+     * Store `descriptor` under `key` as the most recently used entry.
+     * If the key is already present the previous descriptor is destroyed and
+     * replaced; otherwise the least recently used entry is evicted (and its
+     * descriptor destroyed) when the cache is full.
+     */
+    void put(size_t key, const DescriptorType &descriptor) {
+        auto it = cache.find(key);
+        if (it != cache.end()) {
+            // Key already exists, update the descriptor
+            CacheNode *node = it->second;
+            if (destroyer) {
+                destroyer->destroy(&node->desc);
+            }
+            node->desc = descriptor;
+            moveToTop(node);
+            return;
+        }
+
+        // Make room before inserting. capacity >= 1 guarantees that tail->prev
+        // is a real entry here, never the head sentinel.
+        if (size >= capacity) {
+            removeNode(tail->prev);
+        }
+
+        addToTop(new CacheNode(key, descriptor));
+    }
+
+    LRUDescriptorCache(const LRUDescriptorCache &) = delete;
+    LRUDescriptorCache &operator=(const LRUDescriptorCache &) = delete;
+};
+
+/**
+ * Bundles one LRUDescriptorCache per operator kind and exposes typed
+ * get/put accessors for each of them.
+ *
+ * Every get*Descriptor(key, desc) returns true and fills `desc` on a cache hit,
+ * false on a miss. Every put*Descriptor(key, desc) stores `desc` under `key`;
+ * the cache then calls the matching infiniopDestroy*Descriptor function when
+ * the entry is evicted, overwritten, or the manager is destroyed.
+ */
+class CacheManager {
+private:
+    // Single source of truth for the default per-operator capacity. The value
+    // matches the default the constructor has always used.
+    static constexpr size_t DEFAULT_CACHE_CAPACITY = 100;
+
+    LRUDescriptorCache<infiniopAddDescriptor_t> add_cache;
+    LRUDescriptorCache<infiniopRMSNormDescriptor_t> rms_norm_cache;
+    LRUDescriptorCache<infiniopGemmDescriptor_t> gemm_cache;
+    LRUDescriptorCache<infiniopRoPEDescriptor_t> rope_cache;
+    LRUDescriptorCache<infiniopRearrangeDescriptor_t> rearrange_cache;
+    LRUDescriptorCache<infiniopCausalSoftmaxDescriptor_t> causal_softmax_cache;
+    LRUDescriptorCache<infiniopSwiGLUDescriptor_t> swiglu_cache;
+    LRUDescriptorCache<infiniopRandomSampleDescriptor_t> random_sample_cache;
+
+public:
+    /// @param capacity Maximum number of descriptors kept per operator kind.
+    CacheManager(size_t capacity = DEFAULT_CACHE_CAPACITY)
+        : add_cache(capacity, infiniopDestroyAddDescriptor),
+          rms_norm_cache(capacity, infiniopDestroyRMSNormDescriptor),
+          gemm_cache(capacity, infiniopDestroyGemmDescriptor),
+          rope_cache(capacity, infiniopDestroyRoPEDescriptor),
+          rearrange_cache(capacity, infiniopDestroyRearrangeDescriptor),
+          causal_softmax_cache(capacity, infiniopDestroyCausalSoftmaxDescriptor),
+          swiglu_cache(capacity, infiniopDestroySwiGLUDescriptor),
+          random_sample_cache(capacity, infiniopDestroyRandomSampleDescriptor) {}
+
+    // Add operations
+    bool getAddDescriptor(size_t key, infiniopAddDescriptor_t &desc) {
+        return add_cache.get(key, desc);
+    }
+
+    void putAddDescriptor(size_t key, const infiniopAddDescriptor_t &desc) {
+        add_cache.put(key, desc);
+    }
+
+    // RMSNorm operations
+    bool getRMSNormDescriptor(size_t key, infiniopRMSNormDescriptor_t &desc) {
+        return rms_norm_cache.get(key, desc);
+    }
+
+    void putRMSNormDescriptor(size_t key, const infiniopRMSNormDescriptor_t &desc) {
+        rms_norm_cache.put(key, desc);
+    }
+
+    // GEMM operations
+    bool getGemmDescriptor(size_t key, infiniopGemmDescriptor_t &desc) {
+        return gemm_cache.get(key, desc);
+    }
+
+    void putGemmDescriptor(size_t key, const infiniopGemmDescriptor_t &desc) {
+        gemm_cache.put(key, desc);
+    }
+
+    // RoPE operations
+    bool getRoPEDescriptor(size_t key, infiniopRoPEDescriptor_t &desc) {
+        return rope_cache.get(key, desc);
+    }
+
+    void putRoPEDescriptor(size_t key, const infiniopRoPEDescriptor_t &desc) {
+        rope_cache.put(key, desc);
+    }
+
+    // Rearrange operations
+    bool getRearrangeDescriptor(size_t key, infiniopRearrangeDescriptor_t &desc) {
+        return rearrange_cache.get(key, desc);
+    }
+
+    void putRearrangeDescriptor(size_t key, const infiniopRearrangeDescriptor_t &desc) {
+        rearrange_cache.put(key, desc);
+    }
+
+    // Softmax operations
+    bool getCausalSoftmaxDescriptor(size_t key, infiniopCausalSoftmaxDescriptor_t &desc) {
+        return causal_softmax_cache.get(key, desc);
+    }
+
+    void putCausalSoftmaxDescriptor(size_t key, const infiniopCausalSoftmaxDescriptor_t &desc) {
+        causal_softmax_cache.put(key, desc);
+    }
+
+    // SwiGLU operations
+    bool getSwiGLUDescriptor(size_t key, infiniopSwiGLUDescriptor_t &desc) {
+        return swiglu_cache.get(key, desc);
+    }
+
+    void putSwiGLUDescriptor(size_t key, const infiniopSwiGLUDescriptor_t &desc) {
+        swiglu_cache.put(key, desc);
+    }
+
+    // Random Sample operations
+    bool getRandomSampleDescriptor(size_t key, infiniopRandomSampleDescriptor_t &desc) {
+        return random_sample_cache.get(key, desc);
+    }
+
+    void putRandomSampleDescriptor(size_t key, const infiniopRandomSampleDescriptor_t &desc) {
+        random_sample_cache.put(key, desc);
+    }
+
+    /**
+     * Fold the seed() of every non-null tensor, in argument order, into one key.
+     * Null arguments contribute nothing. The arguments are taken by const
+     * reference so building a key does not copy any smart pointer.
+     */
+    template <typename... Tensors>
+    static size_t createDescriptorKey(const Tensors &...tensors) {
+        size_t seed = 0;
+        (..., (tensors ? hash_combine(seed, tensors->seed()) : (void)0));
+        return seed;
+    }
+};
+
+#endif // CACHE_MANAGER_HPP
--- a/src/models/inference_context.cpp
+++ b/src/models/inference_context.cpp
+#include "inference_context.hpp"
+#include "../tensor.hpp"
+#include "../utils.hpp"
+
+// Stores the device resource, descriptor cache and stream exactly as given;
+// the workspace is allocated later, on first use.
+InferenceContext::InferenceContext(DeviceResource *rsrc,
+                                   CacheManager *cache_manager,
+                                   infinirtStream_t stream)
+    : rsrc(rsrc),
+      cache_manager(cache_manager),
+      stream(stream) {
+}
+
+// Guarantee that workspace_storage exists and holds at least `required_size`
+// bytes. An existing buffer that is already large enough is kept as is.
+void InferenceContext::ensure_workspace(size_t required_size) {
+    const bool large_enough = workspace_storage && required_size <= current_workspace_size;
+    if (large_enough) {
+        return;
+    }
+
+    workspace_storage = Storage::createFromPool(required_size, rsrc->memory_pool);
+    current_workspace_size = required_size;
+}
+
+// Run the Add operator on (c, a, b) on this context's stream, reusing a cached
+// descriptor when one exists for the same tensor layouts.
+void InferenceContext::add(std::shared_ptr<Tensor> c,
+                           std::shared_ptr<Tensor> a,
+                           std::shared_ptr<Tensor> b) {
+    const size_t key = CacheManager::createDescriptorKey(c, a, b);
+
+    infiniopAddDescriptor_t desc;
+    const bool cached = cache_manager->getAddDescriptor(key, desc);
+    if (!cached) {
+        // Cache miss: build the descriptor once and hand ownership to the cache.
+        RUN_INFINI(infiniopCreateAddDescriptor(rsrc->handle, &desc, c->desc(), a->desc(), b->desc()));
+        cache_manager->putAddDescriptor(key, desc);
+    }
+
+    size_t workspace_size = 0;
+    RUN_INFINI(infiniopGetAddWorkspaceSize(desc, &workspace_size));
+    ensure_workspace(workspace_size);
+
+    RUN_INFINI(infiniopAdd(
+        desc, workspace_storage->memory(), workspace_size,
+        c->data(), a->data(), b->data(), stream));
+}
+
+// Run the RMSNorm operator on (y, x, w) with the given epsilon on this
+// context's stream, reusing a cached descriptor when possible.
+void InferenceContext::rmsnorm(std::shared_ptr<Tensor> y,
+                               std::shared_ptr<Tensor> x,
+                               std::shared_ptr<Tensor> w,
+                               float epsilon) {
+    size_t key = CacheManager::createDescriptorKey(y, x, w);
+    // epsilon is baked into the descriptor at creation time, so it has to be
+    // part of the key; otherwise a later call with the same layouts but a
+    // different epsilon would silently reuse the stale descriptor.
+    hash_combine(key, std::hash<float>{}(epsilon));
+
+    infiniopRMSNormDescriptor_t desc;
+    if (!cache_manager->getRMSNormDescriptor(key, desc)) {
+        RUN_INFINI(infiniopCreateRMSNormDescriptor(
+            rsrc->handle, &desc, y->desc(), x->desc(), w->desc(), epsilon));
+        cache_manager->putRMSNormDescriptor(key, desc);
+    }
+
+    size_t workspace_size = 0;
+    RUN_INFINI(infiniopGetRMSNormWorkspaceSize(desc, &workspace_size));
+    ensure_workspace(workspace_size);
+    void *workspace = workspace_storage->memory();
+
+    RUN_INFINI(infiniopRMSNorm(
+        desc, workspace, workspace_size,
+        y->data(), x->data(), w->data(), stream));
+}
+
+// Run the GEMM operator on (c, a, b) with scaling factors alpha and beta on
+// this context's stream, reusing a cached descriptor when possible.
+void InferenceContext::gemm(std::shared_ptr<Tensor> c,
+                            std::shared_ptr<Tensor> a,
+                            std::shared_ptr<Tensor> b,
+                            float alpha, float beta) {
+    const size_t key = CacheManager::createDescriptorKey(c, a, b);
+
+    infiniopGemmDescriptor_t desc;
+    const bool cached = cache_manager->getGemmDescriptor(key, desc);
+    if (!cached) {
+        // Cache miss: build the descriptor once and hand ownership to the cache.
+        RUN_INFINI(infiniopCreateGemmDescriptor(rsrc->handle, &desc, c->desc(), a->desc(), b->desc()));
+        cache_manager->putGemmDescriptor(key, desc);
+    }
+
+    size_t workspace_size = 0;
+    RUN_INFINI(infiniopGetGemmWorkspaceSize(desc, &workspace_size));
+    ensure_workspace(workspace_size);
+
+    RUN_INFINI(infiniopGemm(
+        desc, workspace_storage->memory(), workspace_size,
+        c->data(), a->data(), b->data(), alpha, beta, stream));
+}
+
+// Run the Rearrange operator from `src` into `dst` on this context's stream.
+// No workspace is requested for this operator.
+void InferenceContext::rearrange(std::shared_ptr<Tensor> dst,
+                                 std::shared_ptr<Tensor> src) {
+    const size_t key = CacheManager::createDescriptorKey(dst, src);
+
+    infiniopRearrangeDescriptor_t desc;
+    const bool cached = cache_manager->getRearrangeDescriptor(key, desc);
+    if (!cached) {
+        // Cache miss: build the descriptor once and hand ownership to the cache.
+        RUN_INFINI(infiniopCreateRearrangeDescriptor(rsrc->handle, &desc, dst->desc(), src->desc()));
+        cache_manager->putRearrangeDescriptor(key, desc);
+    }
+
+    RUN_INFINI(infiniopRearrange(desc, dst->data(), src->data(), stream));
+}
+
+// Run the RoPE operator on (q, k, pos, sin, cos) on this context's stream,
+// reusing a cached descriptor when possible.
+void InferenceContext::rope(std::shared_ptr<Tensor> q,
+                            std::shared_ptr<Tensor> k,
+                            std::shared_ptr<Tensor> pos,
+                            std::shared_ptr<Tensor> sin,
+                            std::shared_ptr<Tensor> cos) {
+    const size_t key = CacheManager::createDescriptorKey(q, k, pos, sin, cos);
+
+    infiniopRoPEDescriptor_t desc;
+    const bool cached = cache_manager->getRoPEDescriptor(key, desc);
+    if (!cached) {
+        // Cache miss: build the descriptor once and hand ownership to the cache.
+        RUN_INFINI(infiniopCreateRoPEDescriptor(
+            rsrc->handle, &desc, q->desc(), k->desc(),
+            pos->desc(), sin->desc(), cos->desc()));
+        cache_manager->putRoPEDescriptor(key, desc);
+    }
+
+    size_t workspace_size = 0;
+    RUN_INFINI(infiniopGetRoPEWorkspaceSize(desc, &workspace_size));
+    ensure_workspace(workspace_size);
+
+    RUN_INFINI(infiniopRoPE(
+        desc, workspace_storage->memory(), workspace_size,
+        q->data(), k->data(), pos->data(),
+        sin->data(), cos->data(), stream));
+}
+
+// Run the CausalSoftmax operator from `x` into `y` on this context's stream,
+// reusing a cached descriptor when possible.
+void InferenceContext::causalSoftmax(std::shared_ptr<Tensor> y,
+                                     std::shared_ptr<Tensor> x) {
+    const size_t key = CacheManager::createDescriptorKey(y, x);
+
+    infiniopCausalSoftmaxDescriptor_t desc;
+    const bool cached = cache_manager->getCausalSoftmaxDescriptor(key, desc);
+    if (!cached) {
+        // Cache miss: build the descriptor once and hand ownership to the cache.
+        RUN_INFINI(infiniopCreateCausalSoftmaxDescriptor(
+            rsrc->handle, &desc, y->desc(), x->desc()));
+        cache_manager->putCausalSoftmaxDescriptor(key, desc);
+    }
+
+    size_t workspace_size = 0;
+    RUN_INFINI(infiniopGetCausalSoftmaxWorkspaceSize(desc, &workspace_size));
+    ensure_workspace(workspace_size);
+
+    RUN_INFINI(infiniopCausalSoftmax(desc, workspace_storage->memory(), workspace_size,
+                                     y->data(), x->data(), stream));
+}
+
+// Run the SwiGLU operator on (out, up, gate) on this context's stream,
+// reusing a cached descriptor when possible.
+void InferenceContext::swiglu(std::shared_ptr<Tensor> out,
+                              std::shared_ptr<Tensor> up,
+                              std::shared_ptr<Tensor> gate) {
+    const size_t key = CacheManager::createDescriptorKey(out, up, gate);
+
+    infiniopSwiGLUDescriptor_t desc;
+    const bool cached = cache_manager->getSwiGLUDescriptor(key, desc);
+    if (!cached) {
+        // Cache miss: build the descriptor once and hand ownership to the cache.
+        RUN_INFINI(infiniopCreateSwiGLUDescriptor(
+            rsrc->handle, &desc, out->desc(), up->desc(), gate->desc()));
+        cache_manager->putSwiGLUDescriptor(key, desc);
+    }
+
+    size_t workspace_size = 0;
+    RUN_INFINI(infiniopGetSwiGLUWorkspaceSize(desc, &workspace_size));
+    ensure_workspace(workspace_size);
+
+    RUN_INFINI(infiniopSwiGLU(desc, workspace_storage->memory(), workspace_size,
+                              out->data(), up->data(), gate->data(), stream));
+}
+
+// Run the RandomSample operator from `prob` into `out` on this context's
+// stream. The sampling parameters are forwarded unchanged to the operator;
+// only the tensor layouts take part in the descriptor cache key.
+void InferenceContext::randomSample(std::shared_ptr<Tensor> out,
+                                    std::shared_ptr<Tensor> prob,
+                                    float random_val, float top_p, uint32_t top_k, float temperature) {
+    const size_t key = CacheManager::createDescriptorKey(out, prob);
+
+    infiniopRandomSampleDescriptor_t desc;
+    const bool cached = cache_manager->getRandomSampleDescriptor(key, desc);
+    if (!cached) {
+        // Cache miss: build the descriptor once and hand ownership to the cache.
+        RUN_INFINI(infiniopCreateRandomSampleDescriptor(
+            rsrc->handle, &desc, out->desc(), prob->desc()));
+        cache_manager->putRandomSampleDescriptor(key, desc);
+    }
+
+    size_t workspace_size = 0;
+    RUN_INFINI(infiniopGetRandomSampleWorkspaceSize(desc, &workspace_size));
+    ensure_workspace(workspace_size);
+
+    RUN_INFINI(infiniopRandomSample(
+        desc, workspace_storage->memory(), workspace_size,
+        out->data(), prob->data(),
+        random_val, top_p, top_k, temperature,
+        stream));
+}
+
+// Linear layer: c = alpha * (a x b) + beta * c [+ residual] [+ bias].
+//
+// `residual` and `bias` may each be null. `bias` must be 1-D with as many
+// elements as the last dimension of `c`; it is broadcast over all leading
+// dimensions through a zero-stride view. `residual` may alias `c`.
+void InferenceContext::linear(std::shared_ptr<Tensor> c,
+                              std::shared_ptr<Tensor> a,
+                              std::shared_ptr<Tensor> b,
+                              float alpha, float beta,
+                              std::shared_ptr<Tensor> residual,
+                              std::shared_ptr<Tensor> bias) {
+    // Build the broadcast view of the bias once; both code paths below use it.
+    std::shared_ptr<Tensor> bias_view;
+    if (bias) {
+        int ndim_diff = c->ndim() - 1;
+        ASSERT_EQ(bias->ndim(), 1);
+        ASSERT_EQ(bias->shape()[0], c->shape()[ndim_diff]);
+        std::vector<ptrdiff_t> strides(ndim_diff, 0);
+        strides.push_back(bias->strides()[0]);
+        bias_view = bias->view_as(c->shape(), strides);
+    }
+
+    if (!residual) {
+        if (bias_view && beta == 0.0) {
+            // The old contents of c are not needed, so seed c with the bias
+            // and let the GEMM accumulate on top of it (saves one add).
+            rearrange(c, bias_view);
+            gemm(c, a, b, alpha, 1.0);
+        } else {
+            // beta != 0 needs the original c inside the GEMM, so the bias must
+            // be added afterwards. Seeding c with the bias first would replace
+            // beta * c by beta * bias.
+            gemm(c, a, b, alpha, beta);
+            if (bias_view) {
+                add(c, c, bias_view);
+            }
+        }
+        return;
+    }
+
+    if (residual->data() == c->data()) {
+        if (beta == 0.0) {
+            // The residual already sits in c: accumulate the product onto it.
+            gemm(c, a, b, alpha, 1.0);
+        } else {
+            // c acts both as GEMM accumulator and as residual, so keep a copy
+            // of the residual before the GEMM overwrites c.
+            auto c_copy = Tensor::buffer(c->dtype(), c->shape(), rsrc->memory_pool);
+            c_copy->copyFrom(c, rsrc->handle, stream);
+            gemm(c, a, b, alpha, beta);
+            add(c, c, c_copy);
+        }
+    } else {
+        gemm(c, a, b, alpha, beta);
+        add(c, c, residual);
+    }
+
+    if (bias_view) {
+        add(c, c, bias_view);
+    }
+}
--- a/src/models/inference_context.hpp
+++ b/src/models/inference_context.hpp
+#pragma once
+
+#include "cache_manager.hpp"
+#include "jiuge/jiuge_impl.hpp"
+#include "jiuge/jiuge_weight.hpp"
+#include <cassert>
+
+// Bundles what an operator launch needs: the device resource, the descriptor
+// cache, the stream to launch on, and a workspace buffer shared by all
+// operators issued through this context.
+struct InferenceContext {
+    DeviceResource *rsrc;
+    CacheManager *cache_manager;
+    infinirtStream_t stream;
+    // Scratch buffer handed to operators; replaced when a larger one is needed.
+    std::shared_ptr<Storage> workspace_storage;
+    // Size in bytes last requested for workspace_storage.
+    size_t current_workspace_size = 0;
+
+    InferenceContext(DeviceResource *rsrc, CacheManager *cache_manager, infinirtStream_t stream);
+
+    // Make sure workspace_storage exists and can hold `required_size` bytes.
+    void ensure_workspace(size_t required_size);
+
+    // Operator launchers: each obtains a descriptor through cache_manager and
+    // launches the operator on `stream`.
+    void add(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a, std::shared_ptr<Tensor> b);
+    void rmsnorm(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x,
+                 std::shared_ptr<Tensor> w, float epsilon);
+    void gemm(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a,
+              std::shared_ptr<Tensor> b, float alpha, float beta);
+    void rearrange(std::shared_ptr<Tensor> dst, std::shared_ptr<Tensor> src);
+    void rope(std::shared_ptr<Tensor> q, std::shared_ptr<Tensor> k,
+              std::shared_ptr<Tensor> pos, std::shared_ptr<Tensor> sin,
+              std::shared_ptr<Tensor> cos);
+    void causalSoftmax(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x);
+    void swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up,
+                std::shared_ptr<Tensor> gate);
+    void randomSample(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> prob,
+                      float random_val, float top_p, uint32_t top_k, float temperature);
+
+    // Composite operator: a GEMM combined with optional residual and bias
+    // additions; `residual` and `bias` may be null.
+    void linear(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a,
+                std::shared_ptr<Tensor> b, float alpha, float beta,
+                std::shared_ptr<Tensor> residual, std::shared_ptr<Tensor> bias);
+};
+
+// Per-thread pointer to the InferenceContext used by the free-function
+// operator wrappers in this header.
+//
+// This is a C++17 inline variable on purpose: an anonymous namespace in a
+// header would give every translation unit its own private copy, so a context
+// installed from one .cpp file could be invisible to code compiled in another,
+// and the inline accessors that reference it would violate the ODR.
+inline thread_local InferenceContext *tls_inference_context = nullptr;
+
+// Returns the context installed for the calling thread. Asserts that
+// setInferenceContext() has stored a non-null context on this thread.
+inline InferenceContext &getInferenceContext() {
+    assert(tls_inference_context != nullptr && "InferenceContext not set for this thread");
+    InferenceContext *active = tls_inference_context;
+    return *active;
+}
+
+// Installs `ctx` as the calling thread's active context. Only the pointer is
+// stored; passing nullptr clears it.
+inline void setInferenceContext(InferenceContext *ctx) {
+    tls_inference_context = ctx;
+    return;
+}
+
+// Forwards to InferenceContext::add on the calling thread's active context.
+inline void add(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a, std::shared_ptr<Tensor> b) {
+    InferenceContext &ctx = getInferenceContext();
+    ctx.add(c, a, b);
+}
+
+// Forwards to InferenceContext::rmsnorm on the calling thread's active context.
+inline void rmsnorm(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x,
+                    std::shared_ptr<Tensor> w, float epsilon) {
+    InferenceContext &ctx = getInferenceContext();
+    ctx.rmsnorm(y, x, w, epsilon);
+}
+
+// Forwards to InferenceContext::gemm on the calling thread's active context.
+inline void gemm(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a,
+                 std::shared_ptr<Tensor> b, float alpha, float beta) {
+    InferenceContext &ctx = getInferenceContext();
+    ctx.gemm(c, a, b, alpha, beta);
+}
+
+// Forwards to InferenceContext::rearrange on the calling thread's active context.
+inline void rearrange(std::shared_ptr<Tensor> dst, std::shared_ptr<Tensor> src) {
+    InferenceContext &ctx = getInferenceContext();
+    ctx.rearrange(dst, src);
+}
+
+// Forwards to InferenceContext::rope on the calling thread's active context.
+inline void rope(std::shared_ptr<Tensor> q, std::shared_ptr<Tensor> k,
+                 std::shared_ptr<Tensor> pos, std::shared_ptr<Tensor> sin,
+                 std::shared_ptr<Tensor> cos) {
+    InferenceContext &ctx = getInferenceContext();
+    ctx.rope(q, k, pos, sin, cos);
+}
+
+// Forwards to InferenceContext::causalSoftmax on the calling thread's active context.
+inline void causalSoftmax(std::shared_ptr<Tensor> y, std::shared_ptr<Tensor> x) {
+    InferenceContext &ctx = getInferenceContext();
+    ctx.causalSoftmax(y, x);
+}
+
+// Forwards to InferenceContext::swiglu on the calling thread's active context.
+inline void swiglu(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> up,
+                   std::shared_ptr<Tensor> gate) {
+    InferenceContext &ctx = getInferenceContext();
+    ctx.swiglu(out, up, gate);
+}
+
+// Forwards to InferenceContext::randomSample on the calling thread's active context.
+inline void randomSample(std::shared_ptr<Tensor> out, std::shared_ptr<Tensor> prob,
+                         float random_val, float top_p, uint32_t top_k, float temperature) {
+    InferenceContext &ctx = getInferenceContext();
+    ctx.randomSample(out, prob, random_val, top_p, top_k, temperature);
+}
+
+// Forwards to InferenceContext::linear on the calling thread's active context.
+inline void linear(std::shared_ptr<Tensor> c, std::shared_ptr<Tensor> a,
+                   std::shared_ptr<Tensor> b, float alpha, float beta,
+                   std::shared_ptr<Tensor> residual, std::shared_ptr<Tensor> bias) {
+    InferenceContext &ctx = getInferenceContext();
+    ctx.linear(c, a, b, alpha, beta, residual, bias);
+}
--- a/src/models/jiuge/jiuge.cpp
+++ b/src/models/jiuge/jiuge.cpp
--- a/src/tensor.hpp
+++ b/src/tensor.hpp
@@ -51,10 +51,12 @@ private:
    std::vector<size_t> _shape;
    std::vector<ptrdiff_t> _strides;
    infiniopTensorDescriptor_t _desc;
+    size_t _seed;

    TensorDesc(infiniDtype_t dtype, const std::vector<size_t> &shape,
-               const std::vector<ptrdiff_t> &strides) : _dtype(dtype), _shape(shape), _strides(strides), _desc(nullptr) {}
+               const std::vector<ptrdiff_t> &strides) : _dtype(dtype), _shape(shape), _strides(strides), _desc(nullptr) { computeTensorDesHash(); }
    void resetDesc();
+    void computeTensorDesHash();

 public:
    ~TensorDesc();
@@ -74,6 +76,7 @@ public:
    infiniopTensorDescriptor_t desc() const;
    bool isContigous() const;
    std::string info() const;
+    size_t seed() const { return _seed; }

    void dimMerge(size_t dim_start, size_t dim_end);
    void dimSplit(size_t dim, const std::vector<size_t> &dims);
@@ -83,7 +86,7 @@ public:
 class Tensor : public std::enable_shared_from_this<Tensor> {
 private:
    std::shared_ptr<Storage> _storage;
-    std::shared_ptr<TensorDesc> _desc;
+    std::shared_ptr<const TensorDesc> _desc;

    ptrdiff_t _offset;

@@ -127,6 +130,11 @@ public:
    void debug(const std::string &filename) const;
    void debug() const;
    std::string info() const;
+    size_t seed() const;
+
+    std::shared_ptr<Tensor> view(const std::vector<size_t> &new_shape) const;
+    std::shared_ptr<Tensor> view_as(const std::vector<size_t> &new_shape) const;
+    std::shared_ptr<Tensor> view_as(const std::vector<size_t> &new_shape, const std::vector<ptrdiff_t> &new_strides) const;

    ~Tensor();
 };

--- a/src/tensor/strorage.cpp
+++ b/src/tensor/strorage.cpp
--- a/src/tensor/tensor.cpp
+++ b/src/tensor/tensor.cpp
@@ -62,6 +62,16 @@ void TensorDesc::resetDesc() {
    }
 }

+// Recompute _seed from this descriptor's dtype, shape and strides.
+// The seed feeds the operator-descriptor cache keys, so everything that
+// distinguishes two tensor layouts has to be mixed in. The dtype is included
+// because tensors of equal shape and strides but different element types must
+// not end up sharing one cached operator descriptor.
+void TensorDesc::computeTensorDesHash() {
+    _seed = 0;
+    hash_combine(_seed, static_cast<size_t>(this->dtype()));
+    for (auto dim : this->shape()) {
+        hash_combine(_seed, dim);
+    }
+    for (auto stride : this->strides()) {
+        hash_combine(_seed, static_cast<size_t>(stride));
+    }
+}
+
 bool TensorDesc::isContigous() const {
    auto ndim = this->ndim();
    auto shape = this->shape();
@@ -258,6 +268,86 @@ std::string Tensor::info() const {
    return this->_desc->info();
 }

+// Layout hash of this tensor, taken from its descriptor.
+size_t Tensor::seed() const {
+    const size_t layout_seed = _desc->seed();
+    return layout_seed;
+}
+
+// Return a tensor that shares this tensor's storage and offset but exposes it
+// with `new_shape`. No data is copied, so the new shape must be expressible
+// through strides over the existing layout; otherwise an assertion fails.
+//
+// Both shapes must have the same number of elements, and `new_shape` must not
+// contain a zero-sized dimension.
+std::shared_ptr<Tensor> Tensor::view(const std::vector<size_t> &new_shape) const {
+    const std::vector<size_t> &old_shape = this->_desc->shape();
+    const std::vector<ptrdiff_t> &old_strides = this->_desc->strides();
+
+    // Step 1: Validate total size
+    size_t numel = 1;
+    for (size_t dim : old_shape) {
+        numel *= dim;
+    }
+
+    size_t new_numel = 1;
+    for (size_t dim : new_shape) {
+        new_numel *= dim;
+    }
+
+    ASSERT_EQ(numel, new_numel);
+
+    // Step 2: Merge neighbouring dimensions that are contiguous with each
+    // other (outer stride == inner stride * inner extent) into larger blocks.
+    std::vector<size_t> merged_shape;
+    std::vector<ptrdiff_t> merged_strides;
+
+    for (size_t i = 0; i < old_shape.size(); ++i) {
+        const bool mergeable = !merged_shape.empty()
+                            && old_strides[i] * static_cast<ptrdiff_t>(old_shape[i]) == merged_strides.back();
+        if (mergeable) {
+            merged_shape.back() *= old_shape[i];
+            merged_strides.back() = old_strides[i];
+        } else {
+            merged_shape.push_back(old_shape[i]);
+            merged_strides.push_back(old_strides[i]);
+        }
+    }
+
+    // A 0-dim tensor has nothing to merge. Treat it as one block holding a
+    // single element so the splitting step below never indexes an empty vector.
+    if (merged_shape.empty()) {
+        merged_shape.push_back(1);
+        merged_strides.push_back(1);
+    }
+
+    // Step 3: Compute new strides by splitting merged dimensions
+    std::vector<ptrdiff_t> new_strides(new_shape.size());
+    size_t merged_idx = 0;
+    ptrdiff_t current_stride = merged_strides[0];
+    size_t remaining_size = merged_shape[0];
+
+    for (size_t i = 0; i < new_shape.size(); ++i) {
+        // Guard the modulo and division below against a zero extent.
+        ASSERT(new_shape[i] != 0);
+
+        // Find which merged dimension contains this new dimension
+        while (new_shape[i] > remaining_size) {
+            // Advance outside the assertion so the step does not depend on
+            // how the ASSERT macro evaluates its argument.
+            ++merged_idx;
+            ASSERT(merged_idx < merged_shape.size());
+            current_stride = merged_strides[merged_idx];
+            remaining_size = merged_shape[merged_idx];
+        }
+
+        ASSERT_EQ(remaining_size % new_shape[i], 0);
+
+        new_strides[i] = current_stride * (remaining_size / new_shape[i]);
+        remaining_size /= new_shape[i];
+    }
+
+    return this->view_as(new_shape, new_strides);
+}
+
+// Build a tensor over the same storage and offset whose descriptor is created
+// from this tensor's dtype and `new_shape` alone; the current strides are not
+// carried over.
+std::shared_ptr<Tensor> Tensor::view_as(const std::vector<size_t> &new_shape) const {
+    auto result = std::make_shared<Tensor>();
+    result->_offset = this->_offset;
+    result->_storage = this->_storage;
+    result->_desc = TensorDesc::create(this->dtype(), new_shape);
+    return result;
+}
+
+// Build a tensor over the same storage and offset with an explicitly given
+// shape and strides. Neither is validated here.
+std::shared_ptr<Tensor> Tensor::view_as(const std::vector<size_t> &new_shape, const std::vector<ptrdiff_t> &new_strides) const {
+    auto result = std::make_shared<Tensor>();
+    result->_offset = this->_offset;
+    result->_storage = this->_storage;
+    result->_desc = TensorDesc::create(this->dtype(), new_shape, new_strides);
+    return result;
+}
+
 void Tensor::debug(const std::string &filename) const {
    RUN_INFINI(infinirtDeviceSynchronize());


--- a/src/tensor/transform.cpp
+++ b/src/tensor/transform.cpp
@@ -63,11 +63,18 @@ void TensorDesc::dimMerge(size_t dim_start, size_t dim_end) {
    this->_shape = new_shape;
    this->_strides = new_strides;
    this->resetDesc();
+    this->computeTensorDesHash();
 }

 std::shared_ptr<Tensor> Tensor::dimMerge(size_t dim_start, size_t dim_end) {
-    this->_desc->dimMerge(dim_start, dim_end);
-    return shared_from_this();
+    // Transform a private copy of the descriptor so this tensor's own layout
+    // stays untouched.
+    auto merged_desc = TensorDesc::create(_desc->dtype(), _desc->shape(), _desc->strides());
+    merged_desc->dimMerge(dim_start, dim_end);
+
+    // The result shares storage and offset with this tensor.
+    auto result = std::make_shared<Tensor>();
+    result->_offset = _offset;
+    result->_storage = _storage;
+    result->_desc = merged_desc;
+    return result;
 }

 void TensorDesc::dimSplit(size_t dim, const std::vector<size_t> &dims) {
@@ -89,11 +96,18 @@ void TensorDesc::dimSplit(size_t dim, const std::vector<size_t> &dims) {
    this->_shape = new_shape;
    this->_strides = new_strides;
    this->resetDesc();
+    this->computeTensorDesHash();
 }

 std::shared_ptr<Tensor> Tensor::dimSplit(size_t dim, const std::vector<size_t> &dims) {
-    this->_desc->dimSplit(dim, dims);
-    return shared_from_this();
+    // Transform a private copy of the descriptor so this tensor's own layout
+    // stays untouched.
+    auto split_desc = TensorDesc::create(_desc->dtype(), _desc->shape(), _desc->strides());
+    split_desc->dimSplit(dim, dims);
+
+    // The result shares storage and offset with this tensor.
+    auto result = std::make_shared<Tensor>();
+    result->_offset = _offset;
+    result->_storage = _storage;
+    result->_desc = split_desc;
+    return result;
 }

 void TensorDesc::permute(const std::vector<size_t> &order) {
@@ -108,9 +122,16 @@ void TensorDesc::permute(const std::vector<size_t> &order) {
    this->_shape = new_shape;
    this->_strides = new_strides;
    this->resetDesc();
+    this->computeTensorDesHash();
 }

 std::shared_ptr<Tensor> Tensor::permute(const std::vector<size_t> &order) {
-    this->_desc->permute(order);
-    return shared_from_this();
+    // Transform a private copy of the descriptor so this tensor's own layout
+    // stays untouched.
+    auto permuted_desc = TensorDesc::create(_desc->dtype(), _desc->shape(), _desc->strides());
+    permuted_desc->permute(order);
+
+    // The result shares storage and offset with this tensor.
+    auto result = std::make_shared<Tensor>();
+    result->_offset = _offset;
+    result->_storage = _storage;
+    result->_desc = permuted_desc;
+    return result;
 }
--- a/src/utils.hpp
+++ b/src/utils.hpp
@@ -119,4 +119,9 @@ inline uint16_t f32_to_bf16(float val) {
    return bf16_bits;
 }

+// Mixes `value` into the running hash `seed`, in place
+// (same mixing formula as boost::hash_combine).
+inline void hash_combine(size_t &seed, size_t value) {
+    const size_t mixed = value + 0x9e3779b9 + (seed << 6) + (seed >> 2);
+    seed = seed ^ mixed;
+}
+
 #endif
--- a/xmake.lua
+++ b/xmake.lua
@@ -12,6 +12,7 @@ target("infinicore_infer")
    set_languages("cxx17")
    set_warnings("all", "error")

+    add_files("src/models/*.cpp")
    add_files("src/models/*/*.cpp")
    add_files("src/tensor/*.cpp")
    add_files("src/allocator/*.cpp")