Unverified commit 784139b9 authored by thatPepe, committed by GitHub

Merge pull request #990 from InfiniTensor/demo131

Demo-131 CUDA graph with optimized paged attention
parents 3c8fb3c0 1d6527cb
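This merge replaces each operator's eager `calculate` path with a plan/run/cleanup triple so the ops can be captured into and replayed from a CUDA graph. As orientation for the hunks below, every operator now follows roughly this shape (`SomeOp` and `infiniopSomeOp` are placeholders, not operators in this diff; the macros and `graph::GraphTensor` are the ones defined in the changed files):

struct PlannedMeta {
    std::shared_ptr<Descriptor> descriptor; // cached infiniop descriptor (RAII wrapper)
    graph::GraphTensor workspace, out, in;  // tensors pinned for graph replay
};

void *plan(Tensor out, const Tensor &in) {
    size_t seed = hash_combine(out, in);
    // Fetch or create the descriptor keyed by the tensors' metadata.
    INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(Descriptor, descriptor, SomeOp, seed, out->desc(), in->desc());
    // Allocate the workspace up front so nothing is allocated while the graph replays.
    INFINIOP_WORKSPACE_TENSOR(workspace, SomeOp, descriptor);
    return new PlannedMeta{descriptor, graph::GraphTensor(workspace), graph::GraphTensor(out), graph::GraphTensor(in)};
}

void run(void *planned_meta) {
    auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
    // infiniopSomeOp stands in for the op's actual infiniop entry point.
    INFINICORE_CHECK_ERROR(infiniopSomeOp(
        p->descriptor->desc, p->workspace->data(), p->workspace->numel(),
        p->out->data(), p->in->data(), context::getStream()));
}

void cleanup(void **planned_meta_ptr) {
    delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
    *planned_meta_ptr = nullptr;
}

INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(SomeOp, &plan, &run, &cleanup);

On the dispatch side, `INFINICORE_GRAPH_OP_RECORD_OR_RUN` either enqueues the op into the graph being recorded or runs it immediately, as spelled out in the Rearrange implementation further down.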
@@ -11,7 +11,7 @@ struct PlannedMeta {
float alpha, beta;
};
void *plan(Tensor c, Tensor a, Tensor b, float alpha, float beta) {
void *plan(Tensor c, const Tensor &a, const Tensor &b, float alpha, float beta) {
size_t seed = hash_combine(c, a, b);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
......
@@ -5,23 +5,46 @@
#include "infinicore/ops/common/cache.hpp"
#include <infiniop.h>
#define INFINIOP_CACHABLE_DESCRIPTOR(__DESC_TYPE__, __OP_NAME__, __SIZE__) \
struct __DESC_TYPE__ { \
infiniop##__OP_NAME__##Descriptor_t desc; \
Descriptor(infiniop##__OP_NAME__##Descriptor_t desc) : desc(desc) {} \
~Descriptor() { \
if (desc != nullptr) { \
infiniopDestroy##__OP_NAME__##Descriptor(desc); \
desc = nullptr; \
} \
} \
}; \
\
thread_local common::OpCache<size_t, std::shared_ptr<__DESC_TYPE__>> \
caches( \
__SIZE__, \
[](std::shared_ptr<__DESC_TYPE__> &desc) { \
desc = nullptr; \
#define INFINIOP_CACHABLE_DESCRIPTOR(__DESC_TYPE__, __OP_NAME__, __SIZE__) \
struct __DESC_TYPE__ { \
infiniop##__OP_NAME__##Descriptor_t desc = nullptr; \
\
explicit __DESC_TYPE__(infiniop##__OP_NAME__##Descriptor_t d) \
: desc(d) {} \
\
/* non-copyable */ \
__DESC_TYPE__(const __DESC_TYPE__ &) = delete; \
__DESC_TYPE__ &operator=(const __DESC_TYPE__ &) = delete; \
\
/* movable */ \
__DESC_TYPE__(__DESC_TYPE__ &&other) noexcept \
: desc(other.desc) { \
other.desc = nullptr; \
} \
\
__DESC_TYPE__ &operator=(__DESC_TYPE__ &&other) noexcept { \
if (this != &other) { \
if (desc != nullptr) { \
infiniopDestroy##__OP_NAME__##Descriptor(desc); \
} \
desc = other.desc; \
other.desc = nullptr; \
} \
return *this; \
} \
\
~__DESC_TYPE__() { \
if (desc != nullptr) { \
infiniopDestroy##__OP_NAME__##Descriptor(desc); \
} \
} \
}; \
\
thread_local common::OpCache<size_t, std::shared_ptr<__DESC_TYPE__>> \
caches( \
__SIZE__, \
[](std::shared_ptr<__DESC_TYPE__> &desc) { \
desc = nullptr; \
});
#define INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(__DESC_TYPE__, __DESC_NAME__, __INFINIOP_NAME__, __HASH_KEY__, ...) \
......
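For reference, `INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Mul, 100);` (as used by the mul implementation further down) expands to roughly the following; the move constructor and move assignment are elided in this sketch:

struct Descriptor {
    infiniopMulDescriptor_t desc = nullptr;

    explicit Descriptor(infiniopMulDescriptor_t d) : desc(d) {}

    // non-copyable; move operations elided here
    Descriptor(const Descriptor &) = delete;
    Descriptor &operator=(const Descriptor &) = delete;

    ~Descriptor() {
        if (desc != nullptr) {
            infiniopDestroyMulDescriptor(desc);
        }
    }
};

// Thread-local cache of up to 100 descriptors keyed by a hash of the tensor
// metadata; eviction just drops the shared_ptr, so the RAII destructor runs
// once the last holder releases it.
thread_local common::OpCache<size_t, std::shared_ptr<Descriptor>> caches(
    100, [](std::shared_ptr<Descriptor> &desc) { desc = nullptr; });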
#include "infinicore/ops/kv_caching.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(KVCaching);
KVCaching::KVCaching(Tensor k_cache,
Tensor v_cache,
const Tensor &k,
const Tensor &v,
const Tensor &past_kv_lengths) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(k_cache, v_cache, k, v, past_kv_lengths);
INFINICORE_GRAPH_OP_DISPATCH(k_cache->device().getType(),
k_cache,
v_cache,
k,
v,
past_kv_lengths);
}
void KVCaching::execute(Tensor k_cache,
Tensor v_cache,
const Tensor &k,
const Tensor &v,
const Tensor &past_kv_lengths) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(KVCaching,
k_cache,
v_cache,
k,
v,
past_kv_lengths);
}
void kv_caching_(Tensor k_cache,
Tensor v_cache,
const Tensor &k,
const Tensor &v,
const Tensor &past_kv_lengths) {
KVCaching::execute(k_cache, v_cache, k, v, past_kv_lengths);
}
} // namespace infinicore::op
#include "../infiniop_impl.hpp"
#include "infinicore/ops/kv_caching.hpp"
namespace infinicore::op::kv_caching_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, KVCaching, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, k_cache, v_cache, k, v, past_kv_lengths;
};
void *plan(Tensor k_cache,
Tensor v_cache,
const Tensor &k,
const Tensor &v,
const Tensor &past_kv_lengths) {
size_t seed = hash_combine(k_cache, v_cache, k, v, past_kv_lengths);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, KVCaching,
seed, k_cache->desc(), v_cache->desc(),
k->desc(), v->desc(), past_kv_lengths->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, KVCaching, descriptor);
auto planned = new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(k_cache),
graph::GraphTensor(v_cache),
graph::GraphTensor(k),
graph::GraphTensor(v),
graph::GraphTensor(past_kv_lengths)};
return planned;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopKVCaching(
planned->descriptor->desc,
nullptr, 0,
planned->k_cache->data(),
planned->v_cache->data(),
planned->k->data(),
planned->v->data(),
planned->past_kv_lengths->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(KVCaching, &plan, &run, cleanup);
} // namespace infinicore::op::kv_caching_impl::infiniop
#include "infinicore/ops/linear_w4a16_awq.hpp"
#include "infinicore/ops/dequantize_awq.hpp"
#include "infinicore/ops/gemm.hpp"
namespace infinicore::op {
Tensor linear_w4a16_awq(Tensor input,
Tensor weight_packed,
Tensor weight_scale,
Tensor weight_zeros,
std::optional<Tensor> bias) {
// Input is of shape [M, K], weight_packed is of shape [N, K], strides [N, 1]
Size ndim = input->ndim();
Size out_features = weight_packed->shape()[0];
// Allocate the output tensor
auto output_shape = input->shape();
output_shape[ndim - 1] = out_features;
auto out = Tensor::empty(output_shape, input->dtype(), input->device());
// Compute in place into `out`
linear_w4a16_awq_(out, input, weight_packed, weight_scale, weight_zeros, bias);
return out;
}
void linear_w4a16_awq_(Tensor out,
Tensor input,
Tensor weight_packed,
Tensor weight_scale,
Tensor weight_zeros,
std::optional<Tensor> bias) {
auto weight_packed_shape = weight_packed->shape();
Size out_features = weight_packed_shape[0];
Size in_features = weight_packed_shape[1];
Size ndim = input->ndim();
assert(out->ndim() == ndim);
Size N = 1;
auto input_shape = input->shape();
for (size_t i = 0; i < ndim - 1; ++i) {
N *= input_shape[i];
}
auto weight = Tensor::empty(
{out_features, in_features},
out->dtype(),
weight_packed->device());
float alpha = 1.0f;
float beta = 0.0f;
op::dequantize_awq_(weight, weight_packed, weight_scale, weight_zeros);
if (bias.has_value()) {
bias = std::make_optional(bias.value()->as_strided({N, out_features}, {0, 1}));
}
gemm_(out->view({N, out_features}),
input->view({N, in_features}),
weight->permute({1, 0}), alpha, beta);
}
} // namespace infinicore::op
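A hypothetical call site for the new AWQ linear op; the shapes follow the comment at the top of `linear_w4a16_awq`, while the concrete tensor names, dtypes, and sizes are illustrative assumptions:

// Illustrative only: tensors are assumed to already live on the target device.
// input:         [M, K]   activation
// weight_packed: [N, K]   AWQ 4-bit packed weight (dequantized internally via dequantize_awq_)
// weight_scale / weight_zeros: AWQ quantization parameters matching weight_packed
Tensor out = op::linear_w4a16_awq(input, weight_packed, weight_scale, weight_zeros,
                                  /*bias=*/std::nullopt);
// out: [M, N]; under the hood this is dequantize_awq_ into a temporary weight,
// followed by gemm_ against the flattened input.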
#include "infinicore/ops/linear_w8a8i8.hpp"
#include "infinicore/ops/per_channel_quant_i8.hpp"
#include "infinicore/ops/scaled_mm_i8.hpp"
namespace infinicore::op {
Tensor linear_w8a8i8(Tensor input,
Tensor weight_packed,
Tensor weight_scale,
std::optional<Tensor> bias) {
// Input is of shape [M, K], weight_packed is of shape [N, K], strides [N, 1]
Size ndim = input->ndim();
Size out_features = weight_packed->shape()[0];
// Allocate the output tensor
auto output_shape = input->shape();
output_shape[ndim - 1] = out_features;
auto out = Tensor::empty(output_shape, input->dtype(), input->device());
// Compute in place into `out`
linear_w8a8i8_(out, input, weight_packed, weight_scale, bias);
return out;
}
void linear_w8a8i8_(Tensor out,
Tensor input,
Tensor weight_packed,
Tensor weight_scale,
std::optional<Tensor> bias) {
auto weight_packed_shape = weight_packed->shape();
Size out_features = weight_packed_shape[0];
Size in_features = weight_packed_shape[1];
Size ndim = input->ndim();
assert(out->ndim() == ndim);
Size N = 1;
auto input_shape = input->shape();
for (size_t i = 0; i < ndim - 1; ++i) {
N *= input_shape[i];
}
auto input_packed = Tensor::empty(
{N, input_shape[ndim - 1]},
DataType::I8,
input->device());
auto input_scale = Tensor::empty(
{N, 1},
DataType::F32,
input->device());
op::per_channel_quant_i8_(input->view({N, in_features}), input_packed, input_scale);
if (bias.has_value()) {
bias = std::make_optional(bias.value()->as_strided({N, out_features}, {0, 1}));
}
op::scaled_mm_i8_(
out->view({N, out_features}),
input_packed,
input_scale,
weight_packed->permute({1, 0}),
weight_scale,
bias);
}
} // namespace infinicore::op
#include "infinicore/ops/mul.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
common::OpDispatcher<Mul::schema> &Mul::dispatcher() {
static common::OpDispatcher<Mul::schema> dispatcher_;
return dispatcher_;
};
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Mul);
void Mul::execute(Tensor c, Tensor a, Tensor b) {
Mul::Mul(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
infinicore::context::setDevice(c->device());
dispatcher().lookup(c->device().getType())(c, a, b);
INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b);
}
void Mul::execute(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(Mul, c, a, b);
}
Tensor mul(Tensor a, Tensor b) {
Tensor mul(const Tensor &a, const Tensor &b) {
auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
mul_(c, a, b);
return c;
}
void mul_(Tensor c, Tensor a, Tensor b) {
void mul_(Tensor c, const Tensor &a, const Tensor &b) {
Mul::execute(c, a, b);
}
......
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/mul.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::mul_impl::infiniop {
thread_local common::OpCache<size_t, infiniopMulDescriptor_t> caches(
100, // capacity
[](infiniopMulDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyMulDescriptor(desc));
desc = nullptr;
}
});
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Mul, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, c, a, b;
};
void calculate(Tensor c, Tensor a, Tensor b) {
void *plan(Tensor c, const Tensor &a, const Tensor &b) {
size_t seed = hash_combine(c, b, a);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, Mul,
seed, c->desc(), a->desc(), b->desc());
auto desc_opt = cache.get(seed);
infiniopMulDescriptor_t desc = nullptr;
INFINIOP_WORKSPACE_TENSOR(workspace, Mul, descriptor);
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateMulDescriptor(
context::getInfiniopHandle(device), &desc,
c->desc(), a->desc(), b->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(c),
graph::GraphTensor(a),
graph::GraphTensor(b)};
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetMulWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopMul(
desc, workspace->data(), workspace_size,
c->data(), a->data(), b->data(), context::getStream()));
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->c->data(),
planned->a->data(),
planned->b->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
Mul::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Mul, &plan, &run, &cleanup);
} // namespace infinicore::op::mul_impl::infiniop
#include "infinicore/ops/paged_attention.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
common::OpDispatcher<PagedAttention::schema> &PagedAttention::dispatcher() {
static common::OpDispatcher<PagedAttention::schema> dispatcher_;
return dispatcher_;
};
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PagedAttention);
void PagedAttention::execute(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor kv_lens, std::optional<Tensor> alibi_slopes, float scale) {
PagedAttention::PagedAttention(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &kv_lens,
std::optional<Tensor> alibi_slopes, float scale) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, q, k_cache, v_cache, block_tables, kv_lens);
infinicore::context::setDevice(out->device());
dispatcher().lookup(out->device().getType())(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(),
out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
}
void PagedAttention::execute(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &kv_lens,
std::optional<Tensor> alibi_slopes, float scale) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(
PagedAttention,
out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
}
Tensor paged_attention(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor kv_lens, std::optional<Tensor> alibi_slopes, float scale) {
Tensor paged_attention(const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &kv_lens,
std::optional<Tensor> alibi_slopes, float scale) {
auto out = Tensor::empty(q->shape(), q->dtype(), q->device());
paged_attention_(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
return out;
}
void paged_attention_(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor kv_lens, std::optional<Tensor> alibi_slopes, float scale) {
void paged_attention_(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &kv_lens,
std::optional<Tensor> alibi_slopes, float scale) {
PagedAttention::execute(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
}
......
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/paged_attention.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::paged_attention_impl::infiniop {
thread_local common::OpCache<size_t, infiniopPagedAttentionDescriptor_t> caches(
100, // capacity
[](infiniopPagedAttentionDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyPagedAttentionDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor out, Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_tables, Tensor kv_lens, std::optional<Tensor> alibi_slopes, float scale) {
size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, kv_lens, alibi_slopes, scale);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopPagedAttentionDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreatePagedAttentionDescriptor(
context::getInfiniopHandle(device), &desc,
out->desc(), q->desc(), k_cache->desc(), v_cache->desc(), block_tables->desc(), kv_lens->desc(),
alibi_slopes.has_value() ? alibi_slopes.value()->desc() : nullptr,
scale));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetPagedAttentionWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopPagedAttention(
desc, workspace->data(), workspace_size,
out->data(), q->data(), k_cache->data(), v_cache->data(), block_tables->data(), kv_lens->data(),
alibi_slopes.has_value() ? alibi_slopes.value()->data() : nullptr,
context::getStream()));
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PagedAttention, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, out, q, k_cache, v_cache, block_tables, cache_lens;
std::optional<graph::GraphTensor> alibi_slopes;
float scale;
};
void *plan(Tensor out, const Tensor &q, const Tensor &k_cache, const Tensor &v_cache,
const Tensor &block_tables, const Tensor &cache_lens,
std::optional<Tensor> alibi_slopes, float scale) {
size_t seed = hash_combine(out, q, k_cache, v_cache, block_tables, cache_lens, alibi_slopes);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, PagedAttention,
seed,
out->desc(), q->desc(), k_cache->desc(), v_cache->desc(),
block_tables->desc(), cache_lens->desc(),
alibi_slopes ? alibi_slopes.value()->desc() : nullptr,
scale);
INFINIOP_WORKSPACE_TENSOR(workspace, PagedAttention, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(out),
graph::GraphTensor(q),
graph::GraphTensor(k_cache),
graph::GraphTensor(v_cache),
graph::GraphTensor(block_tables),
graph::GraphTensor(cache_lens),
alibi_slopes ? std::optional<graph::GraphTensor>(graph::GraphTensor(*alibi_slopes)) : std::nullopt,
scale};
}
void run(void *planned_meta) {
auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(
infiniopPagedAttention(
p->descriptor->desc,
p->workspace->data(),
p->workspace->numel(),
p->out->data(),
p->q->data(),
p->k_cache->data(),
p->v_cache->data(),
p->block_tables->data(),
p->cache_lens->data(),
p->alibi_slopes.has_value() ? p->alibi_slopes.value()->data() : nullptr,
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
PagedAttention::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PagedAttention, &plan, &run, &cleanup);
} // namespace infinicore::op::paged_attention_impl::infiniop
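`paged_attention` allocates `out` with q's shape and forwards everything to the cached descriptor. A hypothetical decode-step call might look like this; the tensor layouts and the 1/sqrt(head_dim) scale are the usual conventions, not something this diff specifies:

// Hypothetical usage; only the argument order and out's shape come from the code above.
float scale = 1.0f / std::sqrt(static_cast<float>(head_dim));
Tensor out = op::paged_attention(q,                 // queries for the current decode step
                                 k_cache, v_cache,  // paged KV block pools
                                 block_tables,      // per-sequence block indices into the pools
                                 kv_lens,           // current KV length of each sequence
                                 /*alibi_slopes=*/std::nullopt,
                                 scale);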
#include "infinicore/ops/paged_caching.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
common::OpDispatcher<PagedCaching::schema> &PagedCaching::dispatcher() {
static common::OpDispatcher<PagedCaching::schema> dispatcher_;
return dispatcher_;
};
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PagedCaching);
void PagedCaching::execute(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping) {
PagedCaching::PagedCaching(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v, const Tensor &slot_mapping) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(k_cache, v_cache, k, v, slot_mapping);
infinicore::context::setDevice(k_cache->device());
dispatcher().lookup(k_cache->device().getType())(k_cache, v_cache, k, v, slot_mapping);
INFINICORE_GRAPH_OP_DISPATCH(k->device().getType(), k_cache, v_cache, k, v, slot_mapping);
}
void PagedCaching::execute(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v, const Tensor &slot_mapping) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(PagedCaching, k_cache, v_cache, k, v, slot_mapping);
}
void paged_caching_(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping) {
void paged_caching_(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v, const Tensor &slot_mapping) {
PagedCaching::execute(k_cache, v_cache, k, v, slot_mapping);
}
......
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/paged_caching.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::paged_caching_impl::infiniop {
thread_local common::OpCache<size_t, infiniopPagedCachingDescriptor_t> caches(
100, // capacity
[](infiniopPagedCachingDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyPagedCachingDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor k_cache, Tensor v_cache, Tensor k, Tensor v, Tensor slot_mapping) {
size_t seed = hash_combine(k_cache, v_cache, k, v, slot_mapping);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopPagedCachingDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreatePagedCachingDescriptor(
context::getInfiniopHandle(device), &desc,
k_cache->desc(), v_cache->desc(), k->desc(), v->desc(), slot_mapping->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetPagedCachingWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopPagedCaching(
desc, workspace->data(), workspace_size,
k_cache->data(), v_cache->data(), k->data(), v->data(), slot_mapping->data(), context::getStream()));
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PagedCaching, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, k_cache, v_cache, k, v, slot_mapping;
};
void *plan(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v, const Tensor &slot_mapping) {
size_t key = hash_combine(k_cache, v_cache, k, v, slot_mapping);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, PagedCaching,
key, k_cache->desc(), v_cache->desc(), k->desc(), v->desc(), slot_mapping->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, PagedCaching, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(k_cache),
graph::GraphTensor(v_cache),
graph::GraphTensor(k),
graph::GraphTensor(v),
graph::GraphTensor(slot_mapping)};
}
void run(void *planned_meta) {
auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(
infiniopPagedCaching(
p->descriptor->desc,
p->workspace->data(),
p->workspace->numel(),
p->k_cache->data(),
p->v_cache->data(),
p->k->data(),
p->v->data(),
p->slot_mapping->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
PagedCaching::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PagedCaching, &plan, &run, &cleanup);
} // namespace infinicore::op::paged_caching_impl::infiniop
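`paged_caching_` scatters the new k/v vectors into the block pools at the positions given by `slot_mapping`. The diff does not define how slots are computed; the helper below shows the usual paged-KV convention as an assumption, not this repository's actual layout:

#include <cstdint>
#include <vector>

// Hypothetical helper following the common paged-KV convention: the token at
// position `pos` of sequence `seq` occupies physical slot
//   block_tables[seq][pos / block_size] * block_size + pos % block_size.
int64_t slot_for(const std::vector<std::vector<int64_t>> &block_tables,
                 int64_t seq, int64_t pos, int64_t block_size) {
    return block_tables[seq][pos / block_size] * block_size + pos % block_size;
}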
#include "infinicore/ops/per_channel_quant_i8.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PerChannelQuantI8);
PerChannelQuantI8::PerChannelQuantI8(const Tensor &x, Tensor x_packed, Tensor x_scale) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale);
INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale);
}
void PerChannelQuantI8::execute(const Tensor &x, Tensor x_packed, Tensor x_scale) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(PerChannelQuantI8, x, x_packed, x_scale);
}
void per_channel_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale) {
PerChannelQuantI8::execute(x, x_packed, x_scale);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/per_channel_quant_i8.hpp"
#include <infiniop.h>
namespace infinicore::op::per_channel_quant_i8_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PerChannelQuantI8, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, x, x_packed, x_scale;
};
void *plan(const Tensor &x, Tensor x_packed, Tensor x_scale) {
size_t seed = hash_combine(x, x_packed, x_scale);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, PerChannelQuantI8,
seed,
x_packed->desc(), x_scale->desc(), nullptr, x->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, PerChannelQuantI8, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(x),
graph::GraphTensor(x_packed),
graph::GraphTensor(x_scale)};
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopPerChannelQuantI8(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->x_packed->data(),
planned->x_scale->data(),
nullptr,
planned->x->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PerChannelQuantI8, &plan, &run, &cleanup);
} // namespace infinicore::op::per_channel_quant_i8_impl::infiniop
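As orientation for what `infiniopPerChannelQuantI8` presumably computes on device, here is a self-contained host-side reference of per-row symmetric int8 quantization; the rounding mode and zero-row handling are assumptions. `linear_w8a8i8_` quantizes the activation this way and then feeds both scale vectors to `scaled_mm_i8_`.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <vector>

// x is rows x cols (row-major); x_packed receives int8 values, x_scale one float per row.
void per_row_quant_i8_ref(const std::vector<float> &x, size_t rows, size_t cols,
                          std::vector<int8_t> &x_packed, std::vector<float> &x_scale) {
    x_packed.resize(rows * cols);
    x_scale.resize(rows);
    for (size_t i = 0; i < rows; ++i) {
        float amax = 0.0f;
        for (size_t j = 0; j < cols; ++j) {
            amax = std::max(amax, std::fabs(x[i * cols + j]));
        }
        float scale = amax > 0.0f ? amax / 127.0f : 1.0f;
        x_scale[i] = scale;
        for (size_t j = 0; j < cols; ++j) {
            x_packed[i * cols + j] = static_cast<int8_t>(std::lround(x[i * cols + j] / scale));
        }
    }
}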
@@ -3,24 +3,30 @@
namespace infinicore::op {
common::OpDispatcher<Rearrange::schema> &Rearrange::dispatcher() {
static common::OpDispatcher<Rearrange::schema> dispatcher_;
return dispatcher_;
};
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Rearrange);
void Rearrange::execute(Tensor y, Tensor x) {
Rearrange::Rearrange(Tensor y, const Tensor &x) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
infinicore::context::setDevice(y->device());
dispatcher().lookup(y->device().getType())(y, x);
INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), y, x);
}
Tensor rearrange(Tensor x) {
void Rearrange::execute(Tensor y, const Tensor &x) {
auto op = std::make_shared<Rearrange>(y, x);
if (context::isGraphRecording()) {
context::addGraphOperator(op);
} else {
op->run();
}
}
Tensor rearrange(const Tensor &x) {
auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
rearrange_(y, x);
return y;
}
void rearrange_(Tensor y, Tensor x) {
void rearrange_(Tensor y, const Tensor &x) {
Rearrange::execute(y, x);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/rearrange.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::rearrange_impl::infiniop {
thread_local common::OpCache<size_t, infiniopRearrangeDescriptor_t> caches(
100, // capacity
[](infiniopRearrangeDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyRearrangeDescriptor(desc));
desc = nullptr;
}
});
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Rearrange, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor y, x;
};
void calculate(Tensor y, Tensor x) {
void *plan(Tensor y, const Tensor &x) {
size_t seed = hash_combine(y, x);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, Rearrange,
seed, y->desc(),
x->desc());
auto desc_opt = cache.get(seed);
infiniopRearrangeDescriptor_t desc = nullptr;
return new PlannedMeta{
descriptor,
graph::GraphTensor(y),
graph::GraphTensor(x)};
}
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateRearrangeDescriptor(context::getInfiniopHandle(device), &desc, y->desc(), x->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(
infiniopRearrange(
desc,
y->data(),
x->data(),
planned->descriptor->desc,
planned->y->data(),
planned->x->data(),
context::getStream()));
}
static bool registered = []() {
Rearrange::dispatcher().registerAll(&calculate, false);
return true;
}();
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Rearrange, &plan, &run, &cleanup);
} // namespace infinicore::op::rearrange_impl::infiniop
#include "infinicore/ops/rms_norm.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(RMSNorm);
common::OpDispatcher<RMSNorm::schema> &RMSNorm::dispatcher() {
static common::OpDispatcher<RMSNorm::schema> dispatcher_;
return dispatcher_;
};
void RMSNorm::execute(Tensor y, Tensor x, Tensor weight, float epsilon) {
RMSNorm::RMSNorm(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x, weight);
infinicore::context::setDevice(y->device());
dispatcher().lookup(y->device().getType())(y, x, weight, epsilon);
INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), y, x, weight, epsilon);
}
void RMSNorm::execute(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(RMSNorm, y, x, weight, epsilon);
}
Tensor rms_norm(Tensor x, Tensor weight, float epsilon) {
Tensor rms_norm(const Tensor &x, const Tensor &weight, float epsilon) {
auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
rms_norm_(y, x, weight, epsilon);
return y;
}
void rms_norm_(Tensor y, Tensor x, Tensor weight, float epsilon) {
void rms_norm_(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
RMSNorm::execute(y, x, weight, epsilon);
}
......
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/rms_norm.hpp"
#include <infiniop.h>
namespace infinicore::op::rms_norm_impl::infiniop {
#include "../infiniop_impl.hpp"
thread_local common::OpCache<size_t, infiniopRMSNormDescriptor_t> caches(
100, // capacity
[](infiniopRMSNormDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyRMSNormDescriptor(desc));
desc = nullptr;
}
});
namespace infinicore::op::rms_norm_impl::infiniop {
void calculate(Tensor y, Tensor x, Tensor weight, float epsilon) {
size_t seed = hash_combine(y, x, weight, epsilon);
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, RMSNorm, 100);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, y, x, weight;
};
auto desc_opt = cache.get(seed);
infiniopRMSNormDescriptor_t desc = nullptr;
void *plan(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
size_t seed = hash_combine(y, x, weight, epsilon);
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateRMSNormDescriptor(
context::getInfiniopHandle(device), &desc,
y->desc(), x->desc(), weight->desc(), epsilon));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, RMSNorm,
seed, y->desc(),
x->desc(),
weight->desc(),
epsilon);
INFINIOP_WORKSPACE_TENSOR(workspace, RMSNorm, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(y),
graph::GraphTensor(x),
graph::GraphTensor(weight)};
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetRMSNormWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(
infiniopRMSNorm(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->y->data(),
planned->x->data(),
planned->weight->data(),
context::getStream()));
}
INFINICORE_CHECK_ERROR(infiniopRMSNorm(
desc, workspace->data(), workspace_size,
y->data(), x->data(), weight->data(), context::getStream()));
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
RMSNorm::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(RMSNorm, &plan, &run, &cleanup);
} // namespace infinicore::op::rms_norm_impl::infiniop
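For reference, the conventional RMSNorm computation this op presumably implements; the formula is the standard one and is not restated anywhere in this diff:

#include <cmath>
#include <cstddef>
#include <vector>

// y[i] = weight[i] * x[i] / sqrt(mean(x^2) + epsilon), applied along the last dimension.
void rms_norm_ref(const std::vector<float> &x, const std::vector<float> &weight,
                  std::vector<float> &y, float epsilon) {
    float sum_sq = 0.0f;
    for (float v : x) {
        sum_sq += v * v;
    }
    float inv_rms = 1.0f / std::sqrt(sum_sq / static_cast<float>(x.size()) + epsilon);
    y.resize(x.size());
    for (size_t i = 0; i < x.size(); ++i) {
        y[i] = weight[i] * x[i] * inv_rms;
    }
}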
#include "infinicore/ops/rope.hpp"
#include "../../utils.hpp"
#include "infinicore/context/context.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<RoPE::schema> &RoPE::dispatcher() {
static common::OpDispatcher<RoPE::schema> dispatcher_;
return dispatcher_;
};
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(RoPE);
void RoPE::execute(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo) {
RoPE::RoPE(Tensor x_out,
const Tensor &x,
const Tensor &pos,
const Tensor &sin_table,
const Tensor &cos_table,
infinicore::nn::RoPE::Algo algo) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x_out, x, pos, sin_table, cos_table);
infinicore::context::setDevice(x_out->device());
auto device_type = x_out->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error("No RoPE implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
INFINICORE_GRAPH_OP_DISPATCH(x_out->device().getType(), x_out, x, pos, sin_table, cos_table, algo);
}
func(x_out, x, pos, sin_table, cos_table, algo);
void RoPE::execute(Tensor x_out,
const Tensor &x,
const Tensor &pos,
const Tensor &sin_table,
const Tensor &cos_table,
infinicore::nn::RoPE::Algo algo) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(RoPE, x_out, x, pos, sin_table, cos_table, algo);
}
void rope_(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo) {
void rope_(Tensor x_out,
const Tensor &x,
const Tensor &pos,
const Tensor &sin_table,
const Tensor &cos_table,
infinicore::nn::RoPE::Algo algo) {
RoPE::execute(x_out, x, pos, sin_table, cos_table, algo);
}
Tensor rope(const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo) {
Shape shape = x->shape();
auto x_out = Tensor::empty(shape, x->dtype(), x->device());
Tensor rope(const Tensor &x,
const Tensor &pos,
const Tensor &sin_table,
const Tensor &cos_table,
infinicore::nn::RoPE::Algo algo) {
auto x_out = Tensor::empty(x->shape(), x->dtype(), x->device());
rope_(x_out, x, pos, sin_table, cos_table, algo);
return x_out;
}
......
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/rope.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::rope_impl::infiniop {
thread_local common::OpCache<size_t, infiniopRoPEDescriptor_t> caches(
100, // capacity
[](infiniopRoPEDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyRoPEDescriptor(desc));
desc = nullptr;
}
});
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, RoPE, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace;
graph::GraphTensor x_out;
graph::GraphTensor x;
graph::GraphTensor pos;
graph::GraphTensor sin;
graph::GraphTensor cos;
};
void calculate(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo) {
// Convert infinicore::nn::RoPE::Algo to infiniopRoPEAlgo_t
infiniopRoPEAlgo_t infiniop_algo;
static infiniopRoPEAlgo_t to_infiniop_algo(infinicore::nn::RoPE::Algo algo) {
switch (algo) {
case infinicore::nn::RoPE::Algo::GPT_J:
infiniop_algo = INFINIOP_ROPE_ALGO_GPT_J;
break;
return INFINIOP_ROPE_ALGO_GPT_J;
case infinicore::nn::RoPE::Algo::GPT_NEOX:
infiniop_algo = INFINIOP_ROPE_ALGO_GPT_NEOX;
break;
return INFINIOP_ROPE_ALGO_GPT_NEOX;
default:
throw std::runtime_error("Unsupported RoPE algorithm: " + std::to_string(static_cast<int>(algo)));
throw std::runtime_error("Unsupported RoPE algorithm");
}
}
// Create hash key for descriptor caching
size_t key = hash_combine(x_out, x, pos, sin_cache, cos_cache);
hash_combine(key, std::hash<int>()(static_cast<int>(infiniop_algo)));
void *plan(Tensor x_out,
const Tensor &x,
const Tensor &pos,
const Tensor &sin,
const Tensor &cos,
infinicore::nn::RoPE::Algo algo) {
auto infiniop_algo = to_infiniop_algo(algo);
size_t key = hash_combine(x_out, x, pos, sin, cos, static_cast<int>(infiniop_algo));
auto device = context::getDevice();
auto &cache = caches.getCache(device);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, RoPE, key, x_out->desc(),
x->desc(),
pos->desc(),
sin->desc(),
cos->desc(),
infiniop_algo);
auto desc_opt = cache.get(key);
infiniopRoPEDescriptor_t desc = nullptr;
INFINIOP_WORKSPACE_TENSOR(workspace, RoPE, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(x_out),
graph::GraphTensor(x),
graph::GraphTensor(pos),
graph::GraphTensor(sin),
graph::GraphTensor(cos)};
}
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateRoPEDescriptor(
context::getInfiniopHandle(device), &desc,
x_out->desc(), x->desc(),
pos->desc(), sin_cache->desc(), cos_cache->desc(),
infiniop_algo));
cache.put(key, desc);
} else {
desc = *desc_opt;
}
void run(void *planned_meta) {
auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetRoPEWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(
infiniopRoPE(
p->descriptor->desc,
p->workspace->data(),
p->workspace->numel(),
p->x_out->data(),
p->x->data(),
p->pos->data(),
p->sin->data(),
p->cos->data(),
context::getStream()));
}
// InfiniOP reads from x and writes to x_out (handles copying internally)
INFINICORE_CHECK_ERROR(infiniopRoPE(
desc, workspace->data(), workspace_size,
x_out->data(), x->data(), pos->data(),
sin_cache->data(), cos_cache->data(), context::getStream()));
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
RoPE::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(RoPE, &plan, &run, &cleanup);
} // namespace infinicore::op::rope_impl::infiniop