Unverified Commit 8d09630a authored by gongchensu, committed by GitHub

Merge branch 'demo131' into Issue/862

parents ab52dead 012df56c
#include "infinicore/ops/paged_caching.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PagedCaching);
PagedCaching::PagedCaching(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v, const Tensor &slot_mapping) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(k_cache, v_cache, k, v, slot_mapping);
INFINICORE_GRAPH_OP_DISPATCH(k->device().getType(), k_cache, v_cache, k, v, slot_mapping);
}
void PagedCaching::execute(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v, const Tensor &slot_mapping) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(PagedCaching, k_cache, v_cache, k, v, slot_mapping);
}
void paged_caching_(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v, const Tensor &slot_mapping) {
PagedCaching::execute(k_cache, v_cache, k, v, slot_mapping);
}
} // namespace infinicore::op
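// Minimal usage sketch (not part of this commit): the free function either runs the
// kernel immediately or, when a graph is being recorded, appends the op to the current
// graph. The calling context and tensor contents are assumptions for illustration only.
#include "infinicore/ops/paged_caching.hpp"
void cache_new_kv(infinicore::Tensor k_cache, infinicore::Tensor v_cache,
                  const infinicore::Tensor &k, const infinicore::Tensor &v,
                  const infinicore::Tensor &slot_mapping) {
    // k/v hold freshly computed keys/values; slot_mapping tells the kernel which
    // physical cache slot each token should be written to.
    infinicore::op::paged_caching_(k_cache, v_cache, k, v, slot_mapping);
}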
#include "infinicore/ops/paged_caching.hpp"
#include "../infiniop_impl.hpp"
namespace infinicore::op::paged_caching_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PagedCaching, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, k_cache, v_cache, k, v, slot_mapping;
};
void *plan(Tensor k_cache, Tensor v_cache, const Tensor &k, const Tensor &v, const Tensor &slot_mapping) {
size_t key = hash_combine(k_cache, v_cache, k, v, slot_mapping);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, PagedCaching,
key, k_cache->desc(), v_cache->desc(), k->desc(), v->desc(), slot_mapping->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, PagedCaching, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(k_cache),
graph::GraphTensor(v_cache),
graph::GraphTensor(k),
graph::GraphTensor(v),
graph::GraphTensor(slot_mapping)};
}
void run(void *planned_meta) {
auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(
infiniopPagedCaching(
p->descriptor->desc,
p->workspace->data(),
p->workspace->numel(),
p->k_cache->data(),
p->v_cache->data(),
p->k->data(),
p->v->data(),
p->slot_mapping->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PagedCaching, &plan, &run, &cleanup);
} // namespace infinicore::op::paged_caching_impl::infiniop
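// Sketch of how a recorded graph node is assumed to drive the registered
// plan/run/cleanup callbacks; GraphNode, replay and destroy are illustrative names,
// not part of the InfiniCore API.
struct GraphNode {
    void *planned_meta = nullptr;      // produced once by plan(...) at record time
    void (*run_fn)(void *) = nullptr;
    void (*cleanup_fn)(void **) = nullptr;
};
inline void replay(GraphNode &node) {
    node.run_fn(node.planned_meta);    // re-launch with the cached descriptor/workspace
}
inline void destroy(GraphNode &node) {
    node.cleanup_fn(&node.planned_meta); // frees PlannedMeta and nulls the pointer
}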
#include "infinicore/ops/per_channel_quant_i8.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(PerChannelQuantI8);
PerChannelQuantI8::PerChannelQuantI8(const Tensor &x, Tensor x_packed, Tensor x_scale) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x, x_packed, x_scale);
INFINICORE_GRAPH_OP_DISPATCH(x->device().getType(), x, x_packed, x_scale);
}
void PerChannelQuantI8::execute(const Tensor &x, Tensor x_packed, Tensor x_scale) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(PerChannelQuantI8, x, x_packed, x_scale);
}
void per_channel_quant_i8_(const Tensor &x, Tensor x_packed, Tensor x_scale) {
PerChannelQuantI8::execute(x, x_packed, x_scale);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/per_channel_quant_i8.hpp"
#include <infiniop.h>
namespace infinicore::op::per_channel_quant_i8_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, PerChannelQuantI8, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, x, x_packed, x_scale;
};
void *plan(const Tensor &x, Tensor x_packed, Tensor x_scale) {
size_t seed = hash_combine(x, x_packed, x_scale);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, PerChannelQuantI8,
seed,
x_packed->desc(), x_scale->desc(), nullptr, x->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, PerChannelQuantI8, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(x),
graph::GraphTensor(x_packed),
graph::GraphTensor(x_scale)};
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopPerChannelQuantI8(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->x_packed->data(),
planned->x_scale->data(),
nullptr,
planned->x->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(PerChannelQuantI8, &plan, &run, &cleanup);
} // namespace infinicore::op::per_channel_quant_i8_impl::infiniop
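// Reference-semantics sketch for per-channel int8 quantization, assuming per-row
// symmetric scales (scale = max(|row|)/127, x_packed = round(x/scale)). The real
// kernel's layout, rounding mode, and scale placement may differ; illustrative only.
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <vector>
void per_channel_quant_i8_ref(const std::vector<float> &x, size_t rows, size_t cols,
                              std::vector<int8_t> &x_packed, std::vector<float> &x_scale) {
    x_packed.resize(rows * cols);
    x_scale.resize(rows);
    for (size_t r = 0; r < rows; ++r) {
        float amax = 0.f;
        for (size_t c = 0; c < cols; ++c) {
            amax = std::max(amax, std::fabs(x[r * cols + c]));
        }
        float scale = amax > 0.f ? amax / 127.f : 1.f;
        x_scale[r] = scale;
        for (size_t c = 0; c < cols; ++c) {
            x_packed[r * cols + c] = static_cast<int8_t>(std::lround(x[r * cols + c] / scale));
        }
    }
}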
@@ -3,24 +3,30 @@
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(Rearrange);
Rearrange::Rearrange(Tensor y, const Tensor &x) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x);
INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), y, x);
}
void Rearrange::execute(Tensor y, const Tensor &x) {
auto op = std::make_shared<Rearrange>(y, x);
if (context::isGraphRecording()) {
context::addGraphOperator(op);
} else {
op->run();
}
}
Tensor rearrange(const Tensor &x) {
auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
rearrange_(y, x);
return y;
}
void rearrange_(Tensor y, const Tensor &x) {
Rearrange::execute(y, x);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/rearrange.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::rearrange_impl::infiniop {
thread_local common::OpCache<size_t, infiniopRearrangeDescriptor_t> caches(
100, // capacity
[](infiniopRearrangeDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyRearrangeDescriptor(desc));
desc = nullptr;
}
});
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, Rearrange, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor y, x;
};
void calculate(Tensor y, Tensor x) {
void *plan(Tensor y, const Tensor &x) {
size_t seed = hash_combine(y, x);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, Rearrange,
seed, y->desc(),
x->desc());
auto desc_opt = cache.get(seed);
infiniopRearrangeDescriptor_t desc = nullptr;
return new PlannedMeta{
descriptor,
graph::GraphTensor(y),
graph::GraphTensor(x)};
}
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateRearrangeDescriptor(context::getInfiniopHandle(device), &desc, y->desc(), x->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(
infiniopRearrange(
desc,
y->data(),
x->data(),
planned->descriptor->desc,
planned->y->data(),
planned->x->data(),
context::getStream()));
}
static bool registered = []() {
Rearrange::dispatcher().registerAll(&calculate, false);
return true;
}();
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(Rearrange, &plan, &run, &cleanup);
} // namespace infinicore::op::rearrange_impl::infiniop
#include "infinicore/ops/rms_norm.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(RMSNorm);
common::OpDispatcher<RMSNorm::schema> &RMSNorm::dispatcher() {
static common::OpDispatcher<RMSNorm::schema> dispatcher_;
return dispatcher_;
};
void RMSNorm::execute(Tensor y, Tensor x, Tensor weight, float epsilon) {
RMSNorm::RMSNorm(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(y, x, weight);
infinicore::context::setDevice(y->device());
dispatcher().lookup(y->device().getType())(y, x, weight, epsilon);
INFINICORE_GRAPH_OP_DISPATCH(y->device().getType(), y, x, weight, epsilon);
}
void RMSNorm::execute(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(RMSNorm, y, x, weight, epsilon);
}
Tensor rms_norm(Tensor x, Tensor weight, float epsilon) {
Tensor rms_norm(const Tensor &x, const Tensor &weight, float epsilon) {
auto y = Tensor::empty(x->shape(), x->dtype(), x->device());
rms_norm_(y, x, weight, epsilon);
return y;
}
void rms_norm_(Tensor y, Tensor x, Tensor weight, float epsilon) {
void rms_norm_(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
RMSNorm::execute(y, x, weight, epsilon);
}
} // namespace infinicore::op
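// Reference-semantics sketch of what rms_norm is assumed to compute, using the
// standard definition y = x / sqrt(mean(x^2) + epsilon) * weight over the last
// dimension; the actual kernel may differ in dtype handling.
#include <cmath>
#include <cstddef>
void rms_norm_ref(float *y, const float *x, const float *weight,
                  size_t rows, size_t dim, float epsilon) {
    for (size_t r = 0; r < rows; ++r) {
        float sum_sq = 0.f;
        for (size_t i = 0; i < dim; ++i) {
            sum_sq += x[r * dim + i] * x[r * dim + i];
        }
        float inv_rms = 1.f / std::sqrt(sum_sq / static_cast<float>(dim) + epsilon);
        for (size_t i = 0; i < dim; ++i) {
            y[r * dim + i] = x[r * dim + i] * inv_rms * weight[i];
        }
    }
}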
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/rms_norm.hpp"
#include <infiniop.h>
namespace infinicore::op::rms_norm_impl::infiniop {
#include "../infiniop_impl.hpp"
thread_local common::OpCache<size_t, infiniopRMSNormDescriptor_t> caches(
100, // capacity
[](infiniopRMSNormDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyRMSNormDescriptor(desc));
desc = nullptr;
}
});
namespace infinicore::op::rms_norm_impl::infiniop {
void calculate(Tensor y, Tensor x, Tensor weight, float epsilon) {
size_t seed = hash_combine(y, x, weight, epsilon);
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, RMSNorm, 100);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, y, x, weight;
};
auto desc_opt = cache.get(seed);
infiniopRMSNormDescriptor_t desc = nullptr;
void *plan(Tensor y, const Tensor &x, const Tensor &weight, float epsilon) {
size_t seed = hash_combine(y, x, weight, epsilon);
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateRMSNormDescriptor(
context::getInfiniopHandle(device), &desc,
y->desc(), x->desc(), weight->desc(), epsilon));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, RMSNorm,
seed, y->desc(),
x->desc(),
weight->desc(),
epsilon);
INFINIOP_WORKSPACE_TENSOR(workspace, RMSNorm, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(y),
graph::GraphTensor(x),
graph::GraphTensor(weight)};
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetRMSNormWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(
infiniopRMSNorm(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->y->data(),
planned->x->data(),
planned->weight->data(),
context::getStream()));
}
INFINICORE_CHECK_ERROR(infiniopRMSNorm(
desc, workspace->data(), workspace_size,
y->data(), x->data(), weight->data(), context::getStream()));
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
RMSNorm::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(RMSNorm, &plan, &run, &cleanup);
} // namespace infinicore::op::rms_norm_impl::infiniop
#include "infinicore/ops/rope.hpp"
#include "../../utils.hpp"
#include "infinicore/context/context.hpp"
#include <stdexcept>
namespace infinicore::op {
common::OpDispatcher<RoPE::schema> &RoPE::dispatcher() {
static common::OpDispatcher<RoPE::schema> dispatcher_;
return dispatcher_;
};
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(RoPE);
void RoPE::execute(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo) {
RoPE::RoPE(Tensor x_out,
const Tensor &x,
const Tensor &pos,
const Tensor &sin_table,
const Tensor &cos_table,
infinicore::nn::RoPE::Algo algo) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(x_out, x, pos, sin_table, cos_table);
infinicore::context::setDevice(x_out->device());
auto device_type = x_out->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error("No RoPE implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
INFINICORE_GRAPH_OP_DISPATCH(x_out->device().getType(), x_out, x, pos, sin_table, cos_table, algo);
}
func(x_out, x, pos, sin_table, cos_table, algo);
void RoPE::execute(Tensor x_out,
const Tensor &x,
const Tensor &pos,
const Tensor &sin_table,
const Tensor &cos_table,
infinicore::nn::RoPE::Algo algo) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(RoPE, x_out, x, pos, sin_table, cos_table, algo);
}
void rope_(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo) {
void rope_(Tensor x_out,
const Tensor &x,
const Tensor &pos,
const Tensor &sin_table,
const Tensor &cos_table,
infinicore::nn::RoPE::Algo algo) {
RoPE::execute(x_out, x, pos, sin_table, cos_table, algo);
}
Tensor rope(const Tensor &x, const Tensor &pos, const Tensor &sin_table, const Tensor &cos_table, infinicore::nn::RoPE::Algo algo) {
Shape shape = x->shape();
auto x_out = Tensor::empty(shape, x->dtype(), x->device());
Tensor rope(const Tensor &x,
const Tensor &pos,
const Tensor &sin_table,
const Tensor &cos_table,
infinicore::nn::RoPE::Algo algo) {
auto x_out = Tensor::empty(x->shape(), x->dtype(), x->device());
rope_(x_out, x, pos, sin_table, cos_table, algo);
return x_out;
}
} // namespace infinicore::op
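// Sketch of the two pairing conventions behind RoPE::Algo, assuming the usual
// definitions: GPT_J rotates adjacent element pairs (2i, 2i+1) while GPT_NEOX
// rotates pairs split across halves (i, i + d/2). Table layout and indexing
// here are illustrative, not the library's actual memory layout.
#include <cstddef>
void rope_gptj_ref(float *x, size_t d, const float *sin_t, const float *cos_t) {
    for (size_t i = 0; i < d / 2; ++i) {
        float a = x[2 * i], b = x[2 * i + 1];
        x[2 * i]     = a * cos_t[i] - b * sin_t[i];
        x[2 * i + 1] = a * sin_t[i] + b * cos_t[i];
    }
}
void rope_gptneox_ref(float *x, size_t d, const float *sin_t, const float *cos_t) {
    for (size_t i = 0; i < d / 2; ++i) {
        float a = x[i], b = x[i + d / 2];
        x[i]         = a * cos_t[i] - b * sin_t[i];
        x[i + d / 2] = a * sin_t[i] + b * cos_t[i];
    }
}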
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/rope.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::rope_impl::infiniop {
thread_local common::OpCache<size_t, infiniopRoPEDescriptor_t> caches(
100, // capacity
[](infiniopRoPEDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroyRoPEDescriptor(desc));
desc = nullptr;
}
});
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, RoPE, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace;
graph::GraphTensor x_out;
graph::GraphTensor x;
graph::GraphTensor pos;
graph::GraphTensor sin;
graph::GraphTensor cos;
};
void calculate(Tensor x_out, const Tensor &x, const Tensor &pos, const Tensor &sin_cache, const Tensor &cos_cache, infinicore::nn::RoPE::Algo algo) {
// Convert infinicore::nn::RoPE::Algo to infiniopRoPEAlgo_t
infiniopRoPEAlgo_t infiniop_algo;
static infiniopRoPEAlgo_t to_infiniop_algo(infinicore::nn::RoPE::Algo algo) {
switch (algo) {
case infinicore::nn::RoPE::Algo::GPT_J:
infiniop_algo = INFINIOP_ROPE_ALGO_GPT_J;
break;
return INFINIOP_ROPE_ALGO_GPT_J;
case infinicore::nn::RoPE::Algo::GPT_NEOX:
infiniop_algo = INFINIOP_ROPE_ALGO_GPT_NEOX;
break;
return INFINIOP_ROPE_ALGO_GPT_NEOX;
default:
throw std::runtime_error("Unsupported RoPE algorithm: " + std::to_string(static_cast<int>(algo)));
throw std::runtime_error("Unsupported RoPE algorithm");
}
}
// Create hash key for descriptor caching
size_t key = hash_combine(x_out, x, pos, sin_cache, cos_cache);
hash_combine(key, std::hash<int>()(static_cast<int>(infiniop_algo)));
void *plan(Tensor x_out,
const Tensor &x,
const Tensor &pos,
const Tensor &sin,
const Tensor &cos,
infinicore::nn::RoPE::Algo algo) {
auto infiniop_algo = to_infiniop_algo(algo);
size_t key = hash_combine(x_out, x, pos, sin, cos, static_cast<int>(infiniop_algo));
auto device = context::getDevice();
auto &cache = caches.getCache(device);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, RoPE, key, x_out->desc(),
x->desc(),
pos->desc(),
sin->desc(),
cos->desc(),
infiniop_algo);
auto desc_opt = cache.get(key);
infiniopRoPEDescriptor_t desc = nullptr;
INFINIOP_WORKSPACE_TENSOR(workspace, RoPE, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(x_out),
graph::GraphTensor(x),
graph::GraphTensor(pos),
graph::GraphTensor(sin),
graph::GraphTensor(cos)};
}
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateRoPEDescriptor(
context::getInfiniopHandle(device), &desc,
x_out->desc(), x->desc(),
pos->desc(), sin_cache->desc(), cos_cache->desc(),
infiniop_algo));
cache.put(key, desc);
} else {
desc = *desc_opt;
}
void run(void *planned_meta) {
auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetRoPEWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(
infiniopRoPE(
p->descriptor->desc,
p->workspace->data(),
p->workspace->numel(),
p->x_out->data(),
p->x->data(),
p->pos->data(),
p->sin->data(),
p->cos->data(),
context::getStream()));
}
// InfiniOP reads from x and writes to x_out (handles copying internally)
INFINICORE_CHECK_ERROR(infiniopRoPE(
desc, workspace->data(), workspace_size,
x_out->data(), x->data(), pos->data(),
sin_cache->data(), cos_cache->data(), context::getStream()));
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
RoPE::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(RoPE, &plan, &run, &cleanup);
} // namespace infinicore::op::rope_impl::infiniop
#include "infinicore/ops/scaled_mm_i8.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(I8Gemm);
I8Gemm::I8Gemm(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional<Tensor> bias) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a_p, a_s, b_p, b_s);
INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a_p, a_s, b_p, b_s, bias);
}
void I8Gemm::execute(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional<Tensor> bias) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(I8Gemm, c, a_p, a_s, b_p, b_s, bias);
}
void scaled_mm_i8_(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional<Tensor> bias) {
I8Gemm::execute(c, a_p, a_s, b_p, b_s, bias);
}
} // namespace infinicore::op
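// Reference-semantics sketch for scaled_mm_i8_, assuming symmetric per-channel
// scales so that c = (a_p * a_s) @ (b_p * b_s)^T + bias: accumulate in int32 and
// dequantize with the two scale vectors. Operand layouts here are assumptions.
#include <cstddef>
#include <cstdint>
void scaled_mm_i8_ref(float *c, const int8_t *a_p, const float *a_s,
                      const int8_t *b_p, const float *b_s, const float *bias,
                      size_t m, size_t n, size_t k) {
    for (size_t i = 0; i < m; ++i) {
        for (size_t j = 0; j < n; ++j) {
            int32_t acc = 0;
            for (size_t kk = 0; kk < k; ++kk) {
                acc += static_cast<int32_t>(a_p[i * k + kk]) * static_cast<int32_t>(b_p[j * k + kk]);
            }
            float val = static_cast<float>(acc) * a_s[i] * b_s[j];
            c[i * n + j] = bias ? val + bias[j] : val;
        }
    }
}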
#include "../../utils.hpp"
#include "../infiniop_impl.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/scaled_mm_i8.hpp"
#include <infiniop.h>
namespace infinicore::op::scaled_mm_i8_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, I8Gemm, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, c, a_p, a_s, b_p, b_s;
std::optional<graph::GraphTensor> bias;
};
void *plan(Tensor c, const Tensor &a_p, const Tensor &a_s, const Tensor &b_p, const Tensor &b_s, std::optional<Tensor> bias) {
size_t seed = hash_combine(c, a_p, a_s, b_p, b_s);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, I8Gemm,
seed,
c->desc(), bias.has_value() ? bias.value()->desc() : nullptr,
a_p->desc(), a_s->desc(), b_p->desc(), b_s->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, I8Gemm, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(c),
graph::GraphTensor(a_p),
graph::GraphTensor(a_s),
graph::GraphTensor(b_p),
graph::GraphTensor(b_s),
bias ? std::optional<graph::GraphTensor>(graph::GraphTensor(*bias)) : std::nullopt};
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopI8Gemm(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->c->data(),
planned->bias.has_value() ? planned->bias.value()->data() : nullptr,
planned->a_p->data(),
planned->a_s->data(),
planned->b_p->data(),
planned->b_s->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(I8Gemm, &plan, &run, &cleanup);
} // namespace infinicore::op::scaled_mm_i8_impl::infiniop
#include "infinicore/ops/silu_and_mul.hpp"
#include "../../utils.hpp"
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(SiluAndMul);
SiluAndMul::SiluAndMul(Tensor out, const Tensor &x) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(out, x);
INFINICORE_GRAPH_OP_DISPATCH(out->device().getType(), out, x);
}
void SiluAndMul::execute(Tensor out, const Tensor &x) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(SiluAndMul, out, x);
}
Tensor silu_and_mul(const Tensor &x) {
Shape shape = x->shape();
size_t ndim = x->ndim();
if (shape[ndim - 1] % 2 != 0) {
throw std::runtime_error("SiluAndMul input last dim must be even.");
}
shape[ndim - 1] /= 2;
auto out = Tensor::empty(shape, x->dtype(), x->device());
silu_and_mul_(out, x);
return out;
}
void silu_and_mul_(Tensor out, const Tensor &x) {
SiluAndMul::execute(out, x);
}
} // namespace infinicore::op
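// Reference-semantics sketch for silu_and_mul, consistent with the shape logic
// above (output last dim = input last dim / 2) and assuming the common convention
// out = silu(x[..., :d]) * x[..., d:]; the ordering of the two halves is an
// assumption about the kernel.
#include <cmath>
#include <cstddef>
void silu_and_mul_ref(float *out, const float *x, size_t rows, size_t d) {
    for (size_t r = 0; r < rows; ++r) {
        for (size_t i = 0; i < d; ++i) {
            float gate = x[r * 2 * d + i];
            float up   = x[r * 2 * d + d + i];
            out[r * d + i] = gate / (1.f + std::exp(-gate)) * up; // silu(gate) * up
        }
    }
}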
#include "../infiniop_impl.hpp"
#include "infinicore/ops/silu_and_mul.hpp"
namespace infinicore::op::silu_and_mul_impl::infiniop {
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, SiluAndMul, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace, output, input;
};
void *plan(Tensor output, const Tensor &input) {
size_t seed = hash_combine(output, input);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, SiluAndMul,
seed, output->desc(), input->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, SiluAndMul, descriptor);
auto planned = new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(output),
graph::GraphTensor(input)};
return planned;
}
void run(void *planned_meta) {
auto planned = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(infiniopSiluAndMul(
planned->descriptor->desc,
planned->workspace->data(),
planned->workspace->numel(),
planned->output->data(),
planned->input->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(SiluAndMul, &plan, &run, &cleanup);
} // namespace infinicore::op::silu_and_mul_impl::infiniop
#include "infinicore/ops/swiglu.hpp"
#include "../../utils.hpp"
#include <stdexcept>
namespace infinicore::op {
INFINICORE_GRAPH_OP_DISPATCHERS_IMPL(SwiGLU);
common::OpDispatcher<SwiGLU::schema> &SwiGLU::dispatcher() {
static common::OpDispatcher<SwiGLU::schema> dispatcher_;
return dispatcher_;
};
void SwiGLU::execute(Tensor c, Tensor a, Tensor b) {
SwiGLU::SwiGLU(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_ASSERT_TENSORS_SAME_DEVICE(c, a, b);
infinicore::context::setDevice(c->device());
auto device_type = c->device().getType();
auto func = dispatcher().lookup(device_type);
if (func == nullptr) {
throw std::runtime_error("No SwiGLU implementation found for device type: " + std::to_string(static_cast<int>(device_type)));
}
INFINICORE_GRAPH_OP_DISPATCH(c->device().getType(), c, a, b);
}
func(c, a, b);
void SwiGLU::execute(Tensor c, const Tensor &a, const Tensor &b) {
INFINICORE_GRAPH_OP_RECORD_OR_RUN(SwiGLU, c, a, b);
}
Tensor swiglu(Tensor a, Tensor b) {
Shape shape = a->shape();
auto c = Tensor::empty(shape, a->dtype(), a->device());
Tensor swiglu(const Tensor &a, const Tensor &b) {
auto c = Tensor::empty(a->shape(), a->dtype(), a->device());
swiglu_(c, a, b);
return c;
}
void swiglu_(Tensor c, Tensor a, Tensor b) {
void swiglu_(Tensor c, const Tensor &a, const Tensor &b) {
SwiGLU::execute(c, a, b);
}
} // namespace infinicore::op
#include "../../utils.hpp"
#include "infinicore/common/hash.hpp"
#include "infinicore/ops/common/cache.hpp"
#include "infinicore/ops/swiglu.hpp"
#include <infiniop.h>
#include "../infiniop_impl.hpp"
namespace infinicore::op::swiglu_impl::infiniop {
thread_local common::OpCache<size_t, infiniopSwiGLUDescriptor_t> caches(
100, // capacity
[](infiniopSwiGLUDescriptor_t &desc) {
if (desc != nullptr) {
INFINICORE_CHECK_ERROR(infiniopDestroySwiGLUDescriptor(desc));
desc = nullptr;
}
});
void calculate(Tensor c, Tensor a, Tensor b) {
size_t seed = hash_combine(c, b, a);
auto device = context::getDevice();
auto &cache = caches.getCache(device);
auto desc_opt = cache.get(seed);
infiniopSwiGLUDescriptor_t desc = nullptr;
if (!desc_opt) {
INFINICORE_CHECK_ERROR(infiniopCreateSwiGLUDescriptor(
context::getInfiniopHandle(device), &desc,
c->desc(), a->desc(), b->desc()));
cache.put(seed, desc);
} else {
desc = *desc_opt;
}
size_t workspace_size = 0;
INFINICORE_CHECK_ERROR(infiniopGetSwiGLUWorkspaceSize(desc, &workspace_size));
std::shared_ptr<Memory> workspace = context::allocateMemory(workspace_size);
INFINICORE_CHECK_ERROR(infiniopSwiGLU(
desc, workspace->data(), workspace_size,
c->data(), a->data(), b->data(), context::getStream()));
INFINIOP_CACHABLE_DESCRIPTOR(Descriptor, SwiGLU, 100);
struct PlannedMeta {
std::shared_ptr<Descriptor> descriptor;
graph::GraphTensor workspace;
graph::GraphTensor c;
graph::GraphTensor a;
graph::GraphTensor b;
};
void *plan(Tensor c, const Tensor &a, const Tensor &b) {
size_t key = hash_combine(c, a, b);
INFINIOP_CACHABLE_DESCRIPTOR_GET_OR_CREATE(
Descriptor, descriptor, SwiGLU,
key, c->desc(), a->desc(), b->desc());
INFINIOP_WORKSPACE_TENSOR(workspace, SwiGLU, descriptor);
return new PlannedMeta{
descriptor,
graph::GraphTensor(workspace),
graph::GraphTensor(c),
graph::GraphTensor(a),
graph::GraphTensor(b)};
}
void run(void *planned_meta) {
auto *p = reinterpret_cast<PlannedMeta *>(planned_meta);
INFINICORE_CHECK_ERROR(
infiniopSwiGLU(
p->descriptor->desc,
p->workspace->data(),
p->workspace->numel(),
p->c->data(),
p->a->data(),
p->b->data(),
context::getStream()));
}
void cleanup(void **planned_meta_ptr) {
delete *reinterpret_cast<PlannedMeta **>(planned_meta_ptr);
*planned_meta_ptr = nullptr;
}
static bool registered = []() {
SwiGLU::dispatcher().registerAll(&calculate, false);
return true;
}();
INFINICORE_GRAPH_OP_REGISTER_ALLDEVICE(SwiGLU, &plan, &run, &cleanup);
} // namespace infinicore::op::swiglu_impl::infiniop
@@ -24,6 +24,11 @@ inline void bind(py::module &m) {
// Synchronization
m.def("sync_stream", &syncStream, "Synchronize the current stream");
m.def("sync_device", &syncDevice, "Synchronize the current device");
// Graph
m.def("is_graph_recording", &isGraphRecording, "Check if graph recording is turned on");
m.def("start_graph_recording", &startGraphRecording, "Start graph recording");
m.def("stop_graph_recording", &stopGraphRecording, "Stop graph recording and return the graph");
}
} // namespace infinicore::context
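// Usage sketch for the graph API exposed above, assuming the C++ functions mirror
// these bindings and that stopGraphRecording() returns a runnable Graph handle
// (as the binding text and Graph::run() below suggest).
#include "infinicore/context/context.hpp"
#include "infinicore/ops/rms_norm.hpp"
void record_and_replay(infinicore::Tensor y, const infinicore::Tensor &x,
                       const infinicore::Tensor &w, float eps) {
    namespace ctx = infinicore::context;
    ctx::startGraphRecording();
    infinicore::op::rms_norm_(y, x, w, eps); // recorded instead of executed immediately
    auto graph = ctx::stopGraphRecording();  // assumed to return the captured graph
    graph->run();                            // replay the planned operators
}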
@@ -22,6 +22,7 @@ inline void bind(py::module &m) {
.value("QY", Device::Type::QY)
.value("KUNLUN", Device::Type::KUNLUN)
.value("HYGON", Device::Type::HYGON)
.value("ALI", Device::Type::ALI)
.value("COUNT", Device::Type::COUNT);
device
......
#pragma once
#include <pybind11/pybind11.h>
#include <pybind11/stl.h>
#include "infinicore.hpp"
namespace py = pybind11;
namespace infinicore::graph {
inline void bind(py::module_ &m) {
py::class_<infinicore::graph::Graph,
std::shared_ptr<infinicore::graph::Graph>>(m, "Graph")
.def(py::init<>()) // allow construction
.def("run", &infinicore::graph::Graph::run);
}
} // namespace infinicore::graph
@@ -6,6 +6,7 @@
#include "device.hpp"
#include "device_event.hpp"
#include "dtype.hpp"
#include "graph.hpp"
#include "ops.hpp"
#include "tensor.hpp"
@@ -18,6 +19,7 @@ PYBIND11_MODULE(_infinicore, m) {
dtype::bind(m);
ops::bind(m);
tensor::bind(m);
graph::bind(m);
}
} // namespace infinicore